Merge "Allocate tile data adaptively to accommodate the frame size increase"

diff --git a/build/make/Android.mk b/build/make/Android.mk
index 0add523..e971c9d 100644
--- a/build/make/Android.mk
+++ b/build/make/Android.mk

@@ -163,6 +163,7 @@
 endif
 
 # Add a dependency to force generation of the RTCD files.
+define rtcd_dep_template
 ifeq ($(CONFIG_VP8), yes)
 $(foreach file, $(LOCAL_SRC_FILES), $(LOCAL_PATH)/$(file)): vp8_rtcd.h
 endif
@@ -175,6 +176,9 @@
 ifeq ($(TARGET_ARCH_ABI),x86)
 $(foreach file, $(LOCAL_SRC_FILES), $(LOCAL_PATH)/$(file)): vpx_config.asm
 endif
+endef
+
+$(eval $(call rtcd_dep_template))
 
 .PHONY: clean
 clean:

diff --git a/libs.mk b/libs.mk
index 0f87a8a..0ca8379 100644
--- a/libs.mk
+++ b/libs.mk

@@ -373,6 +373,7 @@
 
 TEST_INTRA_PRED_SPEED_BIN=./test_intra_pred_speed$(EXE_SFX)
 TEST_INTRA_PRED_SPEED_SRCS=$(addprefix test/,$(call enabled,TEST_INTRA_PRED_SPEED_SRCS))
+TEST_INTRA_PRED_SPEED_OBJS := $(sort $(call objs,$(TEST_INTRA_PRED_SPEED_SRCS)))
 
 libvpx_test_srcs.txt:
 	@echo "    [CREATE] $@"
@@ -486,7 +487,6 @@
               $(LIBVPX_TEST_OBJS) \
               -L. -lvpx -lgtest $(extralibs) -lm))
 
-TEST_INTRA_PRED_SPEED_OBJS := $(sort $(call objs,$(TEST_INTRA_PRED_SPEED_SRCS)))
 ifneq ($(strip $(TEST_INTRA_PRED_SPEED_OBJS)),)
 $(TEST_INTRA_PRED_SPEED_OBJS) $(TEST_INTRA_PRED_SPEED_OBJS:.o=.d): CXXFLAGS += $(GTEST_INCLUDES)
 OBJS-yes += $(TEST_INTRA_PRED_SPEED_OBJS)

diff --git a/test/android/Android.mk b/test/android/Android.mk
index af85634..48872a2 100644
--- a/test/android/Android.mk
+++ b/test/android/Android.mk

@@ -51,4 +51,6 @@
 LOCAL_C_INCLUDES := $(BINDINGS_DIR)
 FILTERED_SRC := $(sort $(filter %.cc %.c, $(LIBVPX_TEST_SRCS-yes)))
 LOCAL_SRC_FILES := $(addprefix ./test/, $(FILTERED_SRC))
+# some test files depend on *_rtcd.h, ensure they're generated first.
+$(eval $(call rtcd_dep_template))
 include $(BUILD_EXECUTABLE)

diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index a684ea4..d2687b2 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc

@@ -16,6 +16,7 @@
 
 #include "./vp9_rtcd.h"
 #include "test/acm_random.h"
+#include "test/clear_system_state.h"
 #include "test/md5_helper.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"
@@ -66,6 +67,7 @@
     for (int num_tests = 0; num_tests < kNumTests; ++num_tests) {
       pred_funcs[k](src, kBPS, above, left);
     }
+    libvpx_test::ClearSystemState();
     vpx_usec_timer_mark(&timer);
     const int elapsed_time =
         static_cast<int>(vpx_usec_timer_elapsed(&timer) / 1000);
@@ -211,6 +213,14 @@
                 NULL, NULL, NULL, NULL, vp9_tm_predictor_4x4_neon)
 #endif  // HAVE_NEON
 
+#if HAVE_MSA
+INTRA_PRED_TEST(MSA, TestIntraPred4, vp9_dc_predictor_4x4_msa,
+                vp9_dc_left_predictor_4x4_msa, vp9_dc_top_predictor_4x4_msa,
+                vp9_dc_128_predictor_4x4_msa, vp9_v_predictor_4x4_msa,
+                vp9_h_predictor_4x4_msa, NULL, NULL, NULL, NULL, NULL,
+                NULL, vp9_tm_predictor_4x4_msa)
+#endif  // HAVE_MSA
+
 // -----------------------------------------------------------------------------
 // 8x8
 
@@ -256,6 +266,14 @@
 
 #endif  // HAVE_NEON
 
+#if HAVE_MSA
+INTRA_PRED_TEST(MSA, TestIntraPred8, vp9_dc_predictor_8x8_msa,
+                vp9_dc_left_predictor_8x8_msa, vp9_dc_top_predictor_8x8_msa,
+                vp9_dc_128_predictor_8x8_msa, vp9_v_predictor_8x8_msa,
+                vp9_h_predictor_8x8_msa, NULL, NULL, NULL, NULL, NULL,
+                NULL, vp9_tm_predictor_8x8_msa)
+#endif  // HAVE_MSA
+
 // -----------------------------------------------------------------------------
 // 16x16
 
@@ -299,6 +317,14 @@
                 vp9_tm_predictor_16x16_neon)
 #endif  // HAVE_NEON
 
+#if HAVE_MSA
+INTRA_PRED_TEST(MSA, TestIntraPred16, vp9_dc_predictor_16x16_msa,
+                vp9_dc_left_predictor_16x16_msa, vp9_dc_top_predictor_16x16_msa,
+                vp9_dc_128_predictor_16x16_msa, vp9_v_predictor_16x16_msa,
+                vp9_h_predictor_16x16_msa, NULL, NULL, NULL, NULL, NULL,
+                NULL, vp9_tm_predictor_16x16_msa)
+#endif  // HAVE_MSA
+
 // -----------------------------------------------------------------------------
 // 32x32
 
@@ -311,17 +337,20 @@
                 vp9_d63_predictor_32x32_c, vp9_tm_predictor_32x32_c)
 
 #if HAVE_SSE2
+#if ARCH_X86_64
 INTRA_PRED_TEST(SSE2, TestIntraPred32, vp9_dc_predictor_32x32_sse2,
                 vp9_dc_left_predictor_32x32_sse2,
                 vp9_dc_top_predictor_32x32_sse2,
                 vp9_dc_128_predictor_32x32_sse2, vp9_v_predictor_32x32_sse2,
                 NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-#if ARCH_X86_64
-                vp9_tm_predictor_32x32_sse2
+                vp9_tm_predictor_32x32_sse2)
 #else
-                NULL
-#endif
-               )
+INTRA_PRED_TEST(SSE2, TestIntraPred32, vp9_dc_predictor_32x32_sse2,
+                vp9_dc_left_predictor_32x32_sse2,
+                vp9_dc_top_predictor_32x32_sse2,
+                vp9_dc_128_predictor_32x32_sse2, vp9_v_predictor_32x32_sse2,
+                NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)
+#endif  // ARCH_X86_64
 #endif  // HAVE_SSE2
 
 #if HAVE_SSSE3
@@ -337,4 +366,12 @@
                 NULL, NULL, NULL, NULL, NULL, vp9_tm_predictor_32x32_neon)
 #endif  // HAVE_NEON
 
+#if HAVE_MSA
+INTRA_PRED_TEST(MSA, TestIntraPred32, vp9_dc_predictor_32x32_msa,
+                vp9_dc_left_predictor_32x32_msa, vp9_dc_top_predictor_32x32_msa,
+                vp9_dc_128_predictor_32x32_msa, vp9_v_predictor_32x32_msa,
+                vp9_h_predictor_32x32_msa, NULL, NULL, NULL, NULL, NULL,
+                NULL, vp9_tm_predictor_32x32_msa)
+#endif  // HAVE_MSA
+
 #include "test/test_libvpx.cc"

diff --git a/vp9/common/mips/msa/vp9_intra_predict_msa.c b/vp9/common/mips/msa/vp9_intra_predict_msa.c
new file mode 100644
index 0000000..2fc6105
--- /dev/null
+++ b/vp9/common/mips/msa/vp9_intra_predict_msa.c

@@ -0,0 +1,737 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/mips/msa/vp9_macros_msa.h"
+
+#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) {  \
+  out0 = __msa_subs_u_h(out0, in0);                \
+  out1 = __msa_subs_u_h(out1, in1);                \
+}
+
+static void intra_predict_vert_4x4_msa(const uint8_t *src, uint8_t *dst,
+                                       int32_t dst_stride) {
+  uint32_t src_data;
+
+  src_data = LW(src);
+
+  SW4(src_data, src_data, src_data, src_data, dst, dst_stride);
+}
+
+static void intra_predict_vert_8x8_msa(const uint8_t *src, uint8_t *dst,
+                                       int32_t dst_stride) {
+  uint32_t row;
+  uint32_t src_data1, src_data2;
+
+  src_data1 = LW(src);
+  src_data2 = LW(src + 4);
+
+  for (row = 8; row--;) {
+    SW(src_data1, dst);
+    SW(src_data2, (dst + 4));
+    dst += dst_stride;
+  }
+}
+
+static void intra_predict_vert_16x16_msa(const uint8_t *src, uint8_t *dst,
+                                         int32_t dst_stride) {
+  uint32_t row;
+  v16u8 src0;
+
+  src0 = LD_UB(src);
+
+  for (row = 16; row--;) {
+    ST_UB(src0, dst);
+    dst += dst_stride;
+  }
+}
+
+static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst,
+                                         int32_t dst_stride) {
+  uint32_t row;
+  v16u8 src1, src2;
+
+  src1 = LD_UB(src);
+  src2 = LD_UB(src + 16);
+
+  for (row = 32; row--;) {
+    ST_UB2(src1, src2, dst, 16);
+    dst += dst_stride;
+  }
+}
+
+static void intra_predict_horiz_4x4_msa(const uint8_t *src, uint8_t *dst,
+                                        int32_t dst_stride) {
+  uint32_t out0, out1, out2, out3;
+
+  out0 = src[0] * 0x01010101;
+  out1 = src[1] * 0x01010101;
+  out2 = src[2] * 0x01010101;
+  out3 = src[3] * 0x01010101;
+
+  SW4(out0, out1, out2, out3, dst, dst_stride);
+}
+
+static void intra_predict_horiz_8x8_msa(const uint8_t *src, uint8_t *dst,
+                                        int32_t dst_stride) {
+  uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+  out0 = src[0] * 0x0101010101010101ull;
+  out1 = src[1] * 0x0101010101010101ull;
+  out2 = src[2] * 0x0101010101010101ull;
+  out3 = src[3] * 0x0101010101010101ull;
+  out4 = src[4] * 0x0101010101010101ull;
+  out5 = src[5] * 0x0101010101010101ull;
+  out6 = src[6] * 0x0101010101010101ull;
+  out7 = src[7] * 0x0101010101010101ull;
+
+  SD4(out0, out1, out2, out3, dst, dst_stride);
+  dst += (4 * dst_stride);
+  SD4(out4, out5, out6, out7, dst, dst_stride);
+}
+
+static void intra_predict_horiz_16x16_msa(const uint8_t *src, uint8_t *dst,
+                                          int32_t dst_stride) {
+  uint32_t row;
+  uint8_t inp0, inp1, inp2, inp3;
+  v16u8 src0, src1, src2, src3;
+
+  for (row = 4; row--;) {
+    inp0 = src[0];
+    inp1 = src[1];
+    inp2 = src[2];
+    inp3 = src[3];
+    src += 4;
+
+    src0 = (v16u8)__msa_fill_b(inp0);
+    src1 = (v16u8)__msa_fill_b(inp1);
+    src2 = (v16u8)__msa_fill_b(inp2);
+    src3 = (v16u8)__msa_fill_b(inp3);
+
+    ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+    dst += (4 * dst_stride);
+  }
+}
+
+static void intra_predict_horiz_32x32_msa(const uint8_t *src, uint8_t *dst,
+                                          int32_t dst_stride) {
+  uint32_t row;
+  uint8_t inp0, inp1, inp2, inp3;
+  v16u8 src0, src1, src2, src3;
+
+  for (row = 8; row--;) {
+    inp0 = src[0];
+    inp1 = src[1];
+    inp2 = src[2];
+    inp3 = src[3];
+    src += 4;
+
+    src0 = (v16u8)__msa_fill_b(inp0);
+    src1 = (v16u8)__msa_fill_b(inp1);
+    src2 = (v16u8)__msa_fill_b(inp2);
+    src3 = (v16u8)__msa_fill_b(inp3);
+
+    ST_UB2(src0, src0, dst, 16);
+    dst += dst_stride;
+    ST_UB2(src1, src1, dst, 16);
+    dst += dst_stride;
+    ST_UB2(src2, src2, dst, 16);
+    dst += dst_stride;
+    ST_UB2(src3, src3, dst, 16);
+    dst += dst_stride;
+  }
+}
+
+static void intra_predict_dc_4x4_msa(const uint8_t *src_top,
+                                     const uint8_t *src_left,
+                                     uint8_t *dst, int32_t dst_stride) {
+  uint32_t val0, val1;
+  v16i8 store, src = { 0 };
+  v8u16 sum_h;
+  v4u32 sum_w;
+  v2u64 sum_d;
+
+  val0 = LW(src_top);
+  val1 = LW(src_left);
+  INSERT_W2_SB(val0, val1, src);
+  sum_h = __msa_hadd_u_h((v16u8)src, (v16u8)src);
+  sum_w = __msa_hadd_u_w(sum_h, sum_h);
+  sum_d = __msa_hadd_u_d(sum_w, sum_w);
+  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3);
+  store = __msa_splati_b((v16i8)sum_w, 0);
+  val0 = __msa_copy_u_w((v4i32)store, 0);
+
+  SW4(val0, val0, val0, val0, dst, dst_stride);
+}
+
+static void intra_predict_dc_tl_4x4_msa(const uint8_t *src, uint8_t *dst,
+                                        int32_t dst_stride) {
+  uint32_t val0;
+  v16i8 store, data = { 0 };
+  v8u16 sum_h;
+  v4u32 sum_w;
+
+  val0 = LW(src);
+  data = (v16i8)__msa_insert_w((v4i32)data, 0, val0);
+  sum_h = __msa_hadd_u_h((v16u8)data, (v16u8)data);
+  sum_w = __msa_hadd_u_w(sum_h, sum_h);
+  sum_w = (v4u32)__msa_srari_w((v4i32)sum_w, 2);
+  store = __msa_splati_b((v16i8)sum_w, 0);
+  val0 = __msa_copy_u_w((v4i32)store, 0);
+
+  SW4(val0, val0, val0, val0, dst, dst_stride);
+}
+
+static void intra_predict_128dc_4x4_msa(uint8_t *dst, int32_t dst_stride) {
+  uint32_t out;
+  const v16i8 store = __msa_ldi_b(128);
+
+  out = __msa_copy_u_w((v4i32)store, 0);
+
+  SW4(out, out, out, out, dst, dst_stride);
+}
+
+static void intra_predict_dc_8x8_msa(const uint8_t *src_top,
+                                     const uint8_t *src_left,
+                                     uint8_t *dst, int32_t dst_stride) {
+  uint64_t val0, val1;
+  v16i8 store;
+  v16u8 src = { 0 };
+  v8u16 sum_h;
+  v4u32 sum_w;
+  v2u64 sum_d;
+
+  val0 = LD(src_top);
+  val1 = LD(src_left);
+  INSERT_D2_UB(val0, val1, src);
+  sum_h = __msa_hadd_u_h(src, src);
+  sum_w = __msa_hadd_u_w(sum_h, sum_h);
+  sum_d = __msa_hadd_u_d(sum_w, sum_w);
+  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
+  sum_d = __msa_hadd_u_d(sum_w, sum_w);
+  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4);
+  store = __msa_splati_b((v16i8)sum_w, 0);
+  val0 = __msa_copy_u_d((v2i64)store, 0);
+
+  SD4(val0, val0, val0, val0, dst, dst_stride);
+  dst += (4 * dst_stride);
+  SD4(val0, val0, val0, val0, dst, dst_stride);
+}
+
+static void intra_predict_dc_tl_8x8_msa(const uint8_t *src, uint8_t *dst,
+                                        int32_t dst_stride) {
+  uint64_t val0;
+  v16i8 store;
+  v16u8 data = { 0 };
+  v8u16 sum_h;
+  v4u32 sum_w;
+  v2u64 sum_d;
+
+  val0 = LD(src);
+  data = (v16u8)__msa_insert_d((v2i64)data, 0, val0);
+  sum_h = __msa_hadd_u_h(data, data);
+  sum_w = __msa_hadd_u_w(sum_h, sum_h);
+  sum_d = __msa_hadd_u_d(sum_w, sum_w);
+  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3);
+  store = __msa_splati_b((v16i8)sum_w, 0);
+  val0 = __msa_copy_u_d((v2i64)store, 0);
+
+  SD4(val0, val0, val0, val0, dst, dst_stride);
+  dst += (4 * dst_stride);
+  SD4(val0, val0, val0, val0, dst, dst_stride);
+}
+
+static void intra_predict_128dc_8x8_msa(uint8_t *dst, int32_t dst_stride) {
+  uint64_t out;
+  const v16i8 store = __msa_ldi_b(128);
+
+  out = __msa_copy_u_d((v2i64)store, 0);
+
+  SD4(out, out, out, out, dst, dst_stride);
+  dst += (4 * dst_stride);
+  SD4(out, out, out, out, dst, dst_stride);
+}
+
+static void intra_predict_dc_16x16_msa(const uint8_t *src_top,
+                                       const uint8_t *src_left,
+                                       uint8_t *dst, int32_t dst_stride) {
+  v16u8 top, left, out;
+  v8u16 sum_h, sum_top, sum_left;
+  v4u32 sum_w;
+  v2u64 sum_d;
+
+  top = LD_UB(src_top);
+  left = LD_UB(src_left);
+  HADD_UB2_UH(top, left, sum_top, sum_left);
+  sum_h = sum_top + sum_left;
+  sum_w = __msa_hadd_u_w(sum_h, sum_h);
+  sum_d = __msa_hadd_u_d(sum_w, sum_w);
+  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
+  sum_d = __msa_hadd_u_d(sum_w, sum_w);
+  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5);
+  out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);
+
+  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+  dst += (8 * dst_stride);
+  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+}
+
+static void intra_predict_dc_tl_16x16_msa(const uint8_t *src, uint8_t *dst,
+                                          int32_t dst_stride) {
+  v16u8 data, out;
+  v8u16 sum_h;
+  v4u32 sum_w;
+  v2u64 sum_d;
+
+  data = LD_UB(src);
+  sum_h = __msa_hadd_u_h(data, data);
+  sum_w = __msa_hadd_u_w(sum_h, sum_h);
+  sum_d = __msa_hadd_u_d(sum_w, sum_w);
+  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
+  sum_d = __msa_hadd_u_d(sum_w, sum_w);
+  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4);
+  out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);
+
+  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+  dst += (8 * dst_stride);
+  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+}
+
+static void intra_predict_128dc_16x16_msa(uint8_t *dst, int32_t dst_stride) {
+  const v16u8 out = (v16u8)__msa_ldi_b(128);
+
+  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+  dst += (8 * dst_stride);
+  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+}
+
+static void intra_predict_dc_32x32_msa(const uint8_t *src_top,
+                                       const uint8_t *src_left,
+                                       uint8_t *dst, int32_t dst_stride) {
+  uint32_t row;
+  v16u8 top0, top1, left0, left1, out;
+  v8u16 sum_h, sum_top0, sum_top1, sum_left0, sum_left1;
+  v4u32 sum_w;
+  v2u64 sum_d;
+
+  LD_UB2(src_top, 16, top0, top1);
+  LD_UB2(src_left, 16, left0, left1);
+  HADD_UB2_UH(top0, top1, sum_top0, sum_top1);
+  HADD_UB2_UH(left0, left1, sum_left0, sum_left1);
+  sum_h = sum_top0 + sum_top1;
+  sum_h += sum_left0 + sum_left1;
+  sum_w = __msa_hadd_u_w(sum_h, sum_h);
+  sum_d = __msa_hadd_u_d(sum_w, sum_w);
+  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
+  sum_d = __msa_hadd_u_d(sum_w, sum_w);
+  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 6);
+  out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);
+
+  for (row = 16; row--;) {
+    ST_UB2(out, out, dst, 16);
+    dst += dst_stride;
+    ST_UB2(out, out, dst, 16);
+    dst += dst_stride;
+  }
+}
+
+static void intra_predict_dc_tl_32x32_msa(const uint8_t *src, uint8_t *dst,
+                                          int32_t dst_stride) {
+  uint32_t row;
+  v16u8 data0, data1, out;
+  v8u16 sum_h, sum_data0, sum_data1;
+  v4u32 sum_w;
+  v2u64 sum_d;
+
+  LD_UB2(src, 16, data0, data1);
+  HADD_UB2_UH(data0, data1, sum_data0, sum_data1);
+  sum_h = sum_data0 + sum_data1;
+  sum_w = __msa_hadd_u_w(sum_h, sum_h);
+  sum_d = __msa_hadd_u_d(sum_w, sum_w);
+  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
+  sum_d = __msa_hadd_u_d(sum_w, sum_w);
+  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5);
+  out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);
+
+  for (row = 16; row--;) {
+    ST_UB2(out, out, dst, 16);
+    dst += dst_stride;
+    ST_UB2(out, out, dst, 16);
+    dst += dst_stride;
+  }
+}
+
+static void intra_predict_128dc_32x32_msa(uint8_t *dst, int32_t dst_stride) {
+  uint32_t row;
+  const v16u8 out = (v16u8)__msa_ldi_b(128);
+
+  for (row = 16; row--;) {
+    ST_UB2(out, out, dst, 16);
+    dst += dst_stride;
+    ST_UB2(out, out, dst, 16);
+    dst += dst_stride;
+  }
+}
+
+static void intra_predict_tm_4x4_msa(const uint8_t *src_top_ptr,
+                                     const uint8_t *src_left,
+                                     uint8_t *dst, int32_t dst_stride) {
+  uint32_t val;
+  uint8_t top_left = src_top_ptr[-1];
+  v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 };
+  v16u8 src0, src1, src2, src3;
+  v8u16 src_top_left, vec0, vec1, vec2, vec3;
+
+  src_top_left = (v8u16)__msa_fill_h(top_left);
+  val = LW(src_top_ptr);
+  src_top = (v16i8)__msa_insert_w((v4i32)src_top, 0, val);
+
+  src_left0 = __msa_fill_b(src_left[0]);
+  src_left1 = __msa_fill_b(src_left[1]);
+  src_left2 = __msa_fill_b(src_left[2]);
+  src_left3 = __msa_fill_b(src_left[3]);
+
+  ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
+             src_left3, src_top, src0, src1, src2, src3);
+  HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
+  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
+  SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
+  PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
+  ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride);
+}
+
+static void intra_predict_tm_8x8_msa(const uint8_t *src_top_ptr,
+                                     const uint8_t *src_left,
+                                     uint8_t *dst, int32_t dst_stride) {
+  uint64_t val;
+  uint8_t top_left = src_top_ptr[-1];
+  uint32_t loop_cnt;
+  v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 };
+  v8u16 src_top_left, vec0, vec1, vec2, vec3;
+  v16u8 src0, src1, src2, src3;
+
+  val = LD(src_top_ptr);
+  src_top = (v16i8)__msa_insert_d((v2i64)src_top, 0, val);
+  src_top_left = (v8u16)__msa_fill_h(top_left);
+
+  for (loop_cnt = 2; loop_cnt--;) {
+    src_left0 = __msa_fill_b(src_left[0]);
+    src_left1 = __msa_fill_b(src_left[1]);
+    src_left2 = __msa_fill_b(src_left[2]);
+    src_left3 = __msa_fill_b(src_left[3]);
+    src_left += 4;
+
+    ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
+               src_left3, src_top, src0, src1, src2, src3);
+    HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
+    SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
+    PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
+    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+    dst += (4 * dst_stride);
+  }
+}
+
+static void intra_predict_tm_16x16_msa(const uint8_t *src_top_ptr,
+                                       const uint8_t *src_left,
+                                       uint8_t *dst, int32_t dst_stride) {
+  uint8_t top_left = src_top_ptr[-1];
+  uint32_t loop_cnt;
+  v16i8 src_top, src_left0, src_left1, src_left2, src_left3;
+  v8u16 src_top_left, res_r, res_l;
+
+  src_top = LD_SB(src_top_ptr);
+  src_top_left = (v8u16)__msa_fill_h(top_left);
+
+  for (loop_cnt = 4; loop_cnt--;) {
+    src_left0 = __msa_fill_b(src_left[0]);
+    src_left1 = __msa_fill_b(src_left[1]);
+    src_left2 = __msa_fill_b(src_left[2]);
+    src_left3 = __msa_fill_b(src_left[3]);
+    src_left += 4;
+
+    ILVRL_B2_UH(src_left0, src_top, res_r, res_l);
+    HADD_UB2_UH(res_r, res_l, res_r, res_l);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
+
+    SAT_UH2_UH(res_r, res_l, 7);
+    PCKEV_ST_SB(res_r, res_l, dst);
+    dst += dst_stride;
+
+    ILVRL_B2_UH(src_left1, src_top, res_r, res_l);
+    HADD_UB2_UH(res_r, res_l, res_r, res_l);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
+    SAT_UH2_UH(res_r, res_l, 7);
+    PCKEV_ST_SB(res_r, res_l, dst);
+    dst += dst_stride;
+
+    ILVRL_B2_UH(src_left2, src_top, res_r, res_l);
+    HADD_UB2_UH(res_r, res_l, res_r, res_l);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
+    SAT_UH2_UH(res_r, res_l, 7);
+    PCKEV_ST_SB(res_r, res_l, dst);
+    dst += dst_stride;
+
+    ILVRL_B2_UH(src_left3, src_top, res_r, res_l);
+    HADD_UB2_UH(res_r, res_l, res_r, res_l);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
+    SAT_UH2_UH(res_r, res_l, 7);
+    PCKEV_ST_SB(res_r, res_l, dst);
+    dst += dst_stride;
+  }
+}
+
+static void intra_predict_tm_32x32_msa(const uint8_t *src_top,
+                                       const uint8_t *src_left,
+                                       uint8_t *dst, int32_t dst_stride) {
+  uint8_t top_left = src_top[-1];
+  uint32_t loop_cnt;
+  v16i8 src_top0, src_top1, src_left0, src_left1, src_left2, src_left3;
+  v8u16 src_top_left, res_r0, res_r1, res_l0, res_l1;
+
+  LD_SB2(src_top, 16, src_top0, src_top1);
+  src_top_left = (v8u16)__msa_fill_h(top_left);
+
+  for (loop_cnt = 8; loop_cnt--;) {
+    src_left0 = __msa_fill_b(src_left[0]);
+    src_left1 = __msa_fill_b(src_left[1]);
+    src_left2 = __msa_fill_b(src_left[2]);
+    src_left3 = __msa_fill_b(src_left[3]);
+    src_left += 4;
+
+    ILVR_B2_UH(src_left0, src_top0, src_left0, src_top1, res_r0, res_r1);
+    ILVL_B2_UH(src_left0, src_top0, src_left0, src_top1, res_l0, res_l1);
+    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
+    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
+    PCKEV_ST_SB(res_r0, res_l0, dst);
+    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
+    dst += dst_stride;
+
+    ILVR_B2_UH(src_left1, src_top0, src_left1, src_top1, res_r0, res_r1);
+    ILVL_B2_UH(src_left1, src_top0, src_left1, src_top1, res_l0, res_l1);
+    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
+    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
+    PCKEV_ST_SB(res_r0, res_l0, dst);
+    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
+    dst += dst_stride;
+
+    ILVR_B2_UH(src_left2, src_top0, src_left2, src_top1, res_r0, res_r1);
+    ILVL_B2_UH(src_left2, src_top0, src_left2, src_top1, res_l0, res_l1);
+    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
+    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
+    PCKEV_ST_SB(res_r0, res_l0, dst);
+    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
+    dst += dst_stride;
+
+    ILVR_B2_UH(src_left3, src_top0, src_left3, src_top1, res_r0, res_r1);
+    ILVL_B2_UH(src_left3, src_top0, src_left3, src_top1, res_l0, res_l1);
+    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
+    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
+    PCKEV_ST_SB(res_r0, res_l0, dst);
+    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
+    dst += dst_stride;
+  }
+}
+
+void vp9_v_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+                             const uint8_t *above, const uint8_t *left) {
+  (void)left;
+
+  intra_predict_vert_4x4_msa(above, dst, y_stride);
+}
+
+void vp9_v_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+                             const uint8_t *above, const uint8_t *left) {
+  (void)left;
+
+  intra_predict_vert_8x8_msa(above, dst, y_stride);
+}
+
+void vp9_v_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+                               const uint8_t *above, const uint8_t *left) {
+  (void)left;
+
+  intra_predict_vert_16x16_msa(above, dst, y_stride);
+}
+
+void vp9_v_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+                               const uint8_t *above, const uint8_t *left) {
+  (void)left;
+
+  intra_predict_vert_32x32_msa(above, dst, y_stride);
+}
+
+void vp9_h_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+                             const uint8_t *above, const uint8_t *left) {
+  (void)above;
+
+  intra_predict_horiz_4x4_msa(left, dst, y_stride);
+}
+
+void vp9_h_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+                             const uint8_t *above, const uint8_t *left) {
+  (void)above;
+
+  intra_predict_horiz_8x8_msa(left, dst, y_stride);
+}
+
+void vp9_h_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+                               const uint8_t *above, const uint8_t *left) {
+  (void)above;
+
+  intra_predict_horiz_16x16_msa(left, dst, y_stride);
+}
+
+void vp9_h_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+                               const uint8_t *above, const uint8_t *left) {
+  (void)above;
+
+  intra_predict_horiz_32x32_msa(left, dst, y_stride);
+}
+
+void vp9_dc_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+                              const uint8_t *above, const uint8_t *left) {
+  intra_predict_dc_4x4_msa(above, left, dst, y_stride);
+}
+
+void vp9_dc_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+                              const uint8_t *above, const uint8_t *left) {
+  intra_predict_dc_8x8_msa(above, left, dst, y_stride);
+}
+
+void vp9_dc_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                const uint8_t *above, const uint8_t *left) {
+  intra_predict_dc_16x16_msa(above, left, dst, y_stride);
+}
+
+void vp9_dc_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                const uint8_t *above, const uint8_t *left) {
+  intra_predict_dc_32x32_msa(above, left, dst, y_stride);
+}
+
+void vp9_dc_top_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                  const uint8_t *above, const uint8_t *left) {
+  (void)left;
+
+  intra_predict_dc_tl_4x4_msa(above, dst, y_stride);
+}
+
+void vp9_dc_top_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                  const uint8_t *above, const uint8_t *left) {
+  (void)left;
+
+  intra_predict_dc_tl_8x8_msa(above, dst, y_stride);
+}
+
+void vp9_dc_top_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)left;
+
+  intra_predict_dc_tl_16x16_msa(above, dst, y_stride);
+}
+
+void vp9_dc_top_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)left;
+
+  intra_predict_dc_tl_32x32_msa(above, dst, y_stride);
+}
+
+void vp9_dc_left_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  (void)above;
+
+  intra_predict_dc_tl_4x4_msa(left, dst, y_stride);
+}
+
+void vp9_dc_left_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  (void)above;
+
+  intra_predict_dc_tl_8x8_msa(left, dst, y_stride);
+}
+
+void vp9_dc_left_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+
+  intra_predict_dc_tl_16x16_msa(left, dst, y_stride);
+}
+
+void vp9_dc_left_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+
+  intra_predict_dc_tl_32x32_msa(left, dst, y_stride);
+}
+
+void vp9_dc_128_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                  const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  (void)left;
+
+  intra_predict_128dc_4x4_msa(dst, y_stride);
+}
+
+void vp9_dc_128_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                  const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  (void)left;
+
+  intra_predict_128dc_8x8_msa(dst, y_stride);
+}
+
+void vp9_dc_128_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  (void)left;
+
+  intra_predict_128dc_16x16_msa(dst, y_stride);
+}
+
+void vp9_dc_128_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  (void)left;
+
+  intra_predict_128dc_32x32_msa(dst, y_stride);
+}
+
+void vp9_tm_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+                              const uint8_t *above, const uint8_t *left) {
+  intra_predict_tm_4x4_msa(above, left, dst, y_stride);
+}
+
+void vp9_tm_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+                              const uint8_t *above, const uint8_t *left) {
+  intra_predict_tm_8x8_msa(above, left, dst, y_stride);
+}
+
+void vp9_tm_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                const uint8_t *above, const uint8_t *left) {
+  intra_predict_tm_16x16_msa(above, left, dst, y_stride);
+}
+
+void vp9_tm_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                const uint8_t *above, const uint8_t *left) {
+  intra_predict_tm_32x32_msa(above, left, dst, y_stride);
+}

diff --git a/vp9/common/mips/msa/vp9_macros_msa.h b/vp9/common/mips/msa/vp9_macros_msa.h
index 3751e35..f1217d5a 100644
--- a/vp9/common/mips/msa/vp9_macros_msa.h
+++ b/vp9/common/mips/msa/vp9_macros_msa.h

@@ -743,6 +743,26 @@
   CLIP_SH2_0_255(in2, in3);                   \
 }
 
+/* Description : Horizontal addition of unsigned byte vector elements
+   Arguments   : Inputs  - in0, in1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Each unsigned odd byte element from 'in0' is added to
+                 even unsigned byte element from 'in0' (pairwise) and the
+                 halfword result is stored in 'out0'
+*/
+#define HADD_UB2(RTYPE, in0, in1, out0, out1) {          \
+  out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0);  \
+  out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1);  \
+}
+#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)
+
+#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) {  \
+  HADD_UB2(RTYPE, in0, in1, out0, out1);                               \
+  HADD_UB2(RTYPE, in2, in3, out2, out3);                               \
+}
+#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
+
 /* Description : Insert specified word elements from input vectors to 1
                  destination vector
    Arguments   : Inputs  - in0, in1, in2, in3 (4 input vectors)
@@ -755,6 +775,19 @@
 }
 #define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)
 
+/* Description : Insert specified double word elements from input vectors to 1
+                 destination vector
+   Arguments   : Inputs  - in0, in1      (2 input vectors)
+                 Outputs - out           (output vector)
+                 Return Type - as per RTYPE
+*/
+#define INSERT_D2(RTYPE, in0, in1, out) {           \
+  out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0);  \
+  out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1);  \
+}
+#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
+#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
+
 /* Description : Interleave even byte elements from vectors
    Arguments   : Inputs  - in0, in1, in2, in3
                  Outputs - out0, out1

diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index e53e15d..53ae921 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h

@@ -50,16 +50,16 @@
 } b_mode_info;
 
 // Note that the rate-distortion optimization loop, bit-stream writer, and
-// decoder implementation modules critically rely on the enum entry values
+// decoder implementation modules critically rely on the defined entry values
 // specified herein. They should be refactored concurrently.
-typedef enum {
-  NONE = -1,
-  INTRA_FRAME = 0,
-  LAST_FRAME = 1,
-  GOLDEN_FRAME = 2,
-  ALTREF_FRAME = 3,
-  MAX_REF_FRAMES = 4
-} MV_REFERENCE_FRAME;
+
+#define NONE           -1
+#define INTRA_FRAME     0
+#define LAST_FRAME      1
+#define GOLDEN_FRAME    2
+#define ALTREF_FRAME    3
+#define MAX_REF_FRAMES  4
+typedef int8_t MV_REFERENCE_FRAME;
 
 // This structure now relates to 8x8 block regions.
 typedef struct {
@@ -75,12 +75,17 @@
   PREDICTION_MODE uv_mode;
 
   // Only for INTER blocks
+  INTERP_FILTER interp_filter;
   MV_REFERENCE_FRAME ref_frame[2];
+
+  // TODO(slavarnway): Delete and use bmi[3].as_mv[] instead.
   int_mv mv[2];
+
+#if CONFIG_VP9_ENCODER
+  // TODO(slavarnway): Move to encoder
   int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
   uint8_t mode_context[MAX_REF_FRAMES];
-  INTERP_FILTER interp_filter;
-
+#endif
 } MB_MODE_INFO;
 
 typedef struct MODE_INFO {

diff --git a/vp9/common/vp9_enums.h b/vp9/common/vp9_enums.h
index 0482025..d089f23 100644
--- a/vp9/common/vp9_enums.h
+++ b/vp9/common/vp9_enums.h

@@ -12,6 +12,7 @@
 #define VP9_COMMON_VP9_ENUMS_H_
 
 #include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -40,23 +41,22 @@
   MAX_PROFILES
 } BITSTREAM_PROFILE;
 
-typedef enum BLOCK_SIZE {
-  BLOCK_4X4,
-  BLOCK_4X8,
-  BLOCK_8X4,
-  BLOCK_8X8,
-  BLOCK_8X16,
-  BLOCK_16X8,
-  BLOCK_16X16,
-  BLOCK_16X32,
-  BLOCK_32X16,
-  BLOCK_32X32,
-  BLOCK_32X64,
-  BLOCK_64X32,
-  BLOCK_64X64,
-  BLOCK_SIZES,
-  BLOCK_INVALID = BLOCK_SIZES
-} BLOCK_SIZE;
+#define BLOCK_4X4     0
+#define BLOCK_4X8     1
+#define BLOCK_8X4     2
+#define BLOCK_8X8     3
+#define BLOCK_8X16    4
+#define BLOCK_16X8    5
+#define BLOCK_16X16   6
+#define BLOCK_16X32   7
+#define BLOCK_32X16   8
+#define BLOCK_32X32   9
+#define BLOCK_32X64  10
+#define BLOCK_64X32  11
+#define BLOCK_64X64  12
+#define BLOCK_SIZES  13
+#define BLOCK_INVALID BLOCK_SIZES
+typedef uint8_t BLOCK_SIZE;
 
 typedef enum PARTITION_TYPE {
   PARTITION_NONE,
@@ -72,13 +72,12 @@
 #define PARTITION_CONTEXTS (4 * PARTITION_PLOFFSET)
 
 // block transform size
-typedef enum {
-  TX_4X4 = 0,                      // 4x4 transform
-  TX_8X8 = 1,                      // 8x8 transform
-  TX_16X16 = 2,                    // 16x16 transform
-  TX_32X32 = 3,                    // 32x32 transform
-  TX_SIZES
-} TX_SIZE;
+typedef uint8_t TX_SIZE;
+#define TX_4X4   ((TX_SIZE)0)   // 4x4 transform
+#define TX_8X8   ((TX_SIZE)1)   // 8x8 transform
+#define TX_16X16 ((TX_SIZE)2)   // 16x16 transform
+#define TX_32X32 ((TX_SIZE)3)   // 32x32 transform
+#define TX_SIZES ((TX_SIZE)4)
 
 // frame transform mode
 typedef enum {
@@ -110,23 +109,22 @@
   PLANE_TYPES
 } PLANE_TYPE;
 
-typedef enum {
-  DC_PRED,         // Average of above and left pixels
-  V_PRED,          // Vertical
-  H_PRED,          // Horizontal
-  D45_PRED,        // Directional 45  deg = round(arctan(1/1) * 180/pi)
-  D135_PRED,       // Directional 135 deg = 180 - 45
-  D117_PRED,       // Directional 117 deg = 180 - 63
-  D153_PRED,       // Directional 153 deg = 180 - 27
-  D207_PRED,       // Directional 207 deg = 180 + 27
-  D63_PRED,        // Directional 63  deg = round(arctan(2/1) * 180/pi)
-  TM_PRED,         // True-motion
-  NEARESTMV,
-  NEARMV,
-  ZEROMV,
-  NEWMV,
-  MB_MODE_COUNT
-} PREDICTION_MODE;
+#define DC_PRED    0       // Average of above and left pixels
+#define V_PRED     1       // Vertical
+#define H_PRED     2       // Horizontal
+#define D45_PRED   3       // Directional 45  deg = round(arctan(1/1) * 180/pi)
+#define D135_PRED  4       // Directional 135 deg = 180 - 45
+#define D117_PRED  5       // Directional 117 deg = 180 - 63
+#define D153_PRED  6       // Directional 153 deg = 180 - 27
+#define D207_PRED  7       // Directional 207 deg = 180 + 27
+#define D63_PRED   8       // Directional 63  deg = round(arctan(2/1) * 180/pi)
+#define TM_PRED    9       // True-motion
+#define NEARESTMV 10
+#define NEARMV    11
+#define ZEROMV    12
+#define NEWMV     13
+#define MB_MODE_COUNT 14
+typedef uint8_t PREDICTION_MODE;
 
 #define INTRA_MODES (TM_PRED + 1)
 

diff --git a/vp9/common/vp9_filter.c b/vp9/common/vp9_filter.c
index afcdf22..b256d4a 100644
--- a/vp9/common/vp9_filter.c
+++ b/vp9/common/vp9_filter.c

@@ -12,7 +12,8 @@
 
 #include "vp9/common/vp9_filter.h"
 
-const InterpKernel vp9_bilinear_filters[SUBPEL_SHIFTS] = {
+DECLARE_ALIGNED(256, const InterpKernel,
+                vp9_bilinear_filters[SUBPEL_SHIFTS]) = {
   { 0, 0, 0, 128,   0, 0, 0, 0 },
   { 0, 0, 0, 120,   8, 0, 0, 0 },
   { 0, 0, 0, 112,  16, 0, 0, 0 },

diff --git a/vp9/common/vp9_filter.h b/vp9/common/vp9_filter.h
index 808a270..13d38af 100644
--- a/vp9/common/vp9_filter.h
+++ b/vp9/common/vp9_filter.h

@@ -27,17 +27,16 @@
 #define SUBPEL_SHIFTS (1 << SUBPEL_BITS)
 #define SUBPEL_TAPS 8
 
-typedef enum {
-  EIGHTTAP = 0,
-  EIGHTTAP_SMOOTH = 1,
-  EIGHTTAP_SHARP = 2,
-  SWITCHABLE_FILTERS = 3, /* Number of switchable filters */
-  BILINEAR = 3,
-  // The codec can operate in four possible inter prediction filter mode:
-  // 8-tap, 8-tap-smooth, 8-tap-sharp, and switching between the three.
-  SWITCHABLE_FILTER_CONTEXTS = SWITCHABLE_FILTERS + 1,
-  SWITCHABLE = 4  /* should be the last one */
-} INTERP_FILTER;
+#define EIGHTTAP            0
+#define EIGHTTAP_SMOOTH     1
+#define EIGHTTAP_SHARP      2
+#define SWITCHABLE_FILTERS  3 /* Number of switchable filters */
+#define BILINEAR            3
+// The codec can operate in four possible inter prediction filter mode:
+// 8-tap, 8-tap-smooth, 8-tap-sharp, and switching between the three.
+#define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1)
+#define SWITCHABLE 4 /* should be the last one */
+typedef uint8_t INTERP_FILTER;
 
 typedef int16_t InterpKernel[SUBPEL_TAPS];
 

diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 484e457..9816728 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c

@@ -267,8 +267,8 @@
 
   for (seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) {
     int lvl_seg = default_filt_lvl;
-    if (vp9_segfeature_active(seg, seg_id, SEG_LVL_ALT_LF)) {
-      const int data = vp9_get_segdata(seg, seg_id, SEG_LVL_ALT_LF);
+    if (segfeature_active(seg, seg_id, SEG_LVL_ALT_LF)) {
+      const int data = get_segdata(seg, seg_id, SEG_LVL_ALT_LF);
       lvl_seg = clamp(seg->abs_delta == SEGMENT_ABSDATA ?
                       data : default_filt_lvl + data,
                       0, MAX_LOOP_FILTER);

diff --git a/vp9/common/vp9_mvref_common.c b/vp9/common/vp9_mvref_common.c
index ce69527..5f8ee0f 100644
--- a/vp9/common/vp9_mvref_common.c
+++ b/vp9/common/vp9_mvref_common.c

@@ -18,7 +18,8 @@
                              MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
                              int_mv *mv_ref_list,
                              int block, int mi_row, int mi_col,
-                             find_mv_refs_sync sync, void *const data) {
+                             find_mv_refs_sync sync, void *const data,
+                             uint8_t *mode_context) {
   const int *ref_sign_bias = cm->ref_frame_sign_bias;
   int i, refmv_count = 0;
   const POSITION *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type];
@@ -138,7 +139,7 @@
 
  Done:
 
-  mi->mbmi.mode_context[ref_frame] = counter_to_context[context_counter];
+  mode_context[ref_frame] = counter_to_context[context_counter];
 
   // Clamp vectors
   for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i)
@@ -150,9 +151,10 @@
                       MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
                       int_mv *mv_ref_list,
                       int mi_row, int mi_col,
-                      find_mv_refs_sync sync, void *const data) {
+                      find_mv_refs_sync sync, void *const data,
+                      uint8_t *mode_context) {
   find_mv_refs_idx(cm, xd, tile, mi, ref_frame, mv_ref_list, -1,
-                   mi_row, mi_col, sync, data);
+                   mi_row, mi_col, sync, data, mode_context);
 }
 
 static void lower_mv_precision(MV *mv, int allow_hp) {
@@ -181,7 +183,8 @@
 void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
                                    const TileInfo *const tile,
                                    int block, int ref, int mi_row, int mi_col,
-                                   int_mv *nearest_mv, int_mv *near_mv) {
+                                   int_mv *nearest_mv, int_mv *near_mv,
+                                   uint8_t *mode_context) {
   int_mv mv_list[MAX_MV_REF_CANDIDATES];
   MODE_INFO *const mi = xd->mi[0];
   b_mode_info *bmi = mi->bmi;
@@ -190,7 +193,7 @@
   assert(MAX_MV_REF_CANDIDATES == 2);
 
   find_mv_refs_idx(cm, xd, tile, mi, mi->mbmi.ref_frame[ref], mv_list, block,
-                   mi_row, mi_col, NULL, NULL);
+                   mi_row, mi_col, NULL, NULL, mode_context);
 
   near_mv->as_int = 0;
   switch (block) {

diff --git a/vp9/common/vp9_mvref_common.h b/vp9/common/vp9_mvref_common.h
index f1df521..621dc14 100644
--- a/vp9/common/vp9_mvref_common.h
+++ b/vp9/common/vp9_mvref_common.h

@@ -212,7 +212,8 @@
                       const TileInfo *const tile,
                       MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
                       int_mv *mv_ref_list, int mi_row, int mi_col,
-                      find_mv_refs_sync sync, void *const data);
+                      find_mv_refs_sync sync, void *const data,
+                      uint8_t *mode_context);
 
 // check a list of motion vectors by sad score using a number rows of pixels
 // above and a number cols of pixels in the left to select the one with best
@@ -223,7 +224,8 @@
 void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
                                    const TileInfo *const tile,
                                    int block, int ref, int mi_row, int mi_col,
-                                   int_mv *nearest_mv, int_mv *near_mv);
+                                   int_mv *nearest_mv, int_mv *near_mv,
+                                   uint8_t *mode_context);
 
 #ifdef __cplusplus
 }  // extern "C"

diff --git a/vp9/common/vp9_quant_common.c b/vp9/common/vp9_quant_common.c
index 564a3eb..d83f3c1 100644
--- a/vp9/common/vp9_quant_common.c
+++ b/vp9/common/vp9_quant_common.c

@@ -266,8 +266,8 @@
 
 int vp9_get_qindex(const struct segmentation *seg, int segment_id,
                    int base_qindex) {
-  if (vp9_segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) {
-    const int data = vp9_get_segdata(seg, segment_id, SEG_LVL_ALT_Q);
+  if (segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) {
+    const int data = get_segdata(seg, segment_id, SEG_LVL_ALT_Q);
     const int seg_qindex = seg->abs_delta == SEGMENT_ABSDATA ?
         data : base_qindex + data;
     return clamp(seg_qindex, 0, MAXQ);

diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c
index 3312f29..650f4ad 100644
--- a/vp9/common/vp9_reconintra.c
+++ b/vp9/common/vp9_reconintra.c

@@ -464,15 +464,17 @@
 static INLINE void d45_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
                                  const uint8_t *above, const uint8_t *left) {
   const uint8_t above_right = above[bs - 1];
+  const uint8_t *const dst_row0 = dst;
   int x, size;
-  uint8_t avg[31];  // TODO(jzern): this could be block size specific
   (void)left;
 
   for (x = 0; x < bs - 1; ++x) {
-    avg[x] = AVG3(above[x], above[x + 1], above[x + 2]);
+    dst[x] = AVG3(above[x], above[x + 1], above[x + 2]);
   }
-  for (x = 0, size = bs - 1; x < bs; ++x, --size) {
-    memcpy(dst, avg + x, size);
+  dst[bs - 1] = above_right;
+  dst += stride;
+  for (x = 1, size = bs - 2; x < bs; ++x, --size) {
+    memcpy(dst, dst_row0 + x, size);
     memset(dst + size, above_right, x + 1);
     dst += stride;
   }

diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index bbe200d..5035126 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl

@@ -66,7 +66,7 @@
 specialize qw/vp9_d63_predictor_4x4/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_h_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_h_predictor_4x4 neon dspr2/, "$ssse3_x86inc";
+specialize qw/vp9_h_predictor_4x4 neon dspr2 msa/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vp9_d117_predictor_4x4/;
@@ -78,22 +78,22 @@
 specialize qw/vp9_d153_predictor_4x4/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_v_predictor_4x4 neon/, "$sse_x86inc";
+specialize qw/vp9_v_predictor_4x4 neon msa/, "$sse_x86inc";
 
 add_proto qw/void vp9_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_tm_predictor_4x4 neon dspr2/, "$sse_x86inc";
+specialize qw/vp9_tm_predictor_4x4 neon dspr2 msa/, "$sse_x86inc";
 
 add_proto qw/void vp9_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_predictor_4x4 dspr2/, "$sse_x86inc";
+specialize qw/vp9_dc_predictor_4x4 dspr2 msa/, "$sse_x86inc";
 
 add_proto qw/void vp9_dc_top_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_top_predictor_4x4/, "$sse_x86inc";
+specialize qw/vp9_dc_top_predictor_4x4 msa/, "$sse_x86inc";
 
 add_proto qw/void vp9_dc_left_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_left_predictor_4x4/, "$sse_x86inc";
+specialize qw/vp9_dc_left_predictor_4x4 msa/, "$sse_x86inc";
 
 add_proto qw/void vp9_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_128_predictor_4x4/, "$sse_x86inc";
+specialize qw/vp9_dc_128_predictor_4x4 msa/, "$sse_x86inc";
 
 add_proto qw/void vp9_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vp9_d207_predictor_8x8/, "$ssse3_x86inc";
@@ -105,7 +105,7 @@
 specialize qw/vp9_d63_predictor_8x8/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_h_predictor_8x8 neon dspr2/, "$ssse3_x86inc";
+specialize qw/vp9_h_predictor_8x8 neon dspr2 msa/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vp9_d117_predictor_8x8/;
@@ -117,22 +117,22 @@
 specialize qw/vp9_d153_predictor_8x8/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_v_predictor_8x8 neon/, "$sse_x86inc";
+specialize qw/vp9_v_predictor_8x8 neon msa/, "$sse_x86inc";
 
 add_proto qw/void vp9_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_tm_predictor_8x8 neon dspr2/, "$sse2_x86inc";
+specialize qw/vp9_tm_predictor_8x8 neon dspr2 msa/, "$sse2_x86inc";
 
 add_proto qw/void vp9_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_predictor_8x8 dspr2 neon/, "$sse_x86inc";
+specialize qw/vp9_dc_predictor_8x8 dspr2 neon msa/, "$sse_x86inc";
 
 add_proto qw/void vp9_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_top_predictor_8x8 neon/, "$sse_x86inc";
+specialize qw/vp9_dc_top_predictor_8x8 neon msa/, "$sse_x86inc";
 
 add_proto qw/void vp9_dc_left_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_left_predictor_8x8 neon/, "$sse_x86inc";
+specialize qw/vp9_dc_left_predictor_8x8 neon msa/, "$sse_x86inc";
 
 add_proto qw/void vp9_dc_128_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_128_predictor_8x8 neon/, "$sse_x86inc";
+specialize qw/vp9_dc_128_predictor_8x8 neon msa/, "$sse_x86inc";
 
 add_proto qw/void vp9_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vp9_d207_predictor_16x16/, "$ssse3_x86inc";
@@ -144,7 +144,7 @@
 specialize qw/vp9_d63_predictor_16x16/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_h_predictor_16x16 neon dspr2/, "$ssse3_x86inc";
+specialize qw/vp9_h_predictor_16x16 neon dspr2 msa/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vp9_d117_predictor_16x16/;
@@ -156,22 +156,22 @@
 specialize qw/vp9_d153_predictor_16x16/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_v_predictor_16x16 neon/, "$sse2_x86inc";
+specialize qw/vp9_v_predictor_16x16 neon msa/, "$sse2_x86inc";
 
 add_proto qw/void vp9_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_tm_predictor_16x16 neon/, "$sse2_x86inc";
+specialize qw/vp9_tm_predictor_16x16 neon msa/, "$sse2_x86inc";
 
 add_proto qw/void vp9_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_predictor_16x16 dspr2 neon/, "$sse2_x86inc";
+specialize qw/vp9_dc_predictor_16x16 dspr2 neon msa/, "$sse2_x86inc";
 
 add_proto qw/void vp9_dc_top_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_top_predictor_16x16 neon/, "$sse2_x86inc";
+specialize qw/vp9_dc_top_predictor_16x16 neon msa/, "$sse2_x86inc";
 
 add_proto qw/void vp9_dc_left_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_left_predictor_16x16 neon/, "$sse2_x86inc";
+specialize qw/vp9_dc_left_predictor_16x16 neon msa/, "$sse2_x86inc";
 
 add_proto qw/void vp9_dc_128_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_128_predictor_16x16 neon/, "$sse2_x86inc";
+specialize qw/vp9_dc_128_predictor_16x16 neon msa/, "$sse2_x86inc";
 
 add_proto qw/void vp9_d207_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vp9_d207_predictor_32x32/, "$ssse3_x86inc";
@@ -183,7 +183,7 @@
 specialize qw/vp9_d63_predictor_32x32/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_h_predictor_32x32 neon/, "$ssse3_x86inc";
+specialize qw/vp9_h_predictor_32x32 neon msa/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vp9_d117_predictor_32x32/;
@@ -195,22 +195,22 @@
 specialize qw/vp9_d153_predictor_32x32/;
 
 add_proto qw/void vp9_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_v_predictor_32x32 neon/, "$sse2_x86inc";
+specialize qw/vp9_v_predictor_32x32 neon msa/, "$sse2_x86inc";
 
 add_proto qw/void vp9_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_tm_predictor_32x32 neon/, "$sse2_x86_64";
+specialize qw/vp9_tm_predictor_32x32 neon msa/, "$sse2_x86_64";
 
 add_proto qw/void vp9_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_predictor_32x32/, "$sse2_x86inc";
+specialize qw/vp9_dc_predictor_32x32 msa/, "$sse2_x86inc";
 
 add_proto qw/void vp9_dc_top_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_top_predictor_32x32/, "$sse2_x86inc";
+specialize qw/vp9_dc_top_predictor_32x32 msa/, "$sse2_x86inc";
 
 add_proto qw/void vp9_dc_left_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_left_predictor_32x32/, "$sse2_x86inc";
+specialize qw/vp9_dc_left_predictor_32x32 msa/, "$sse2_x86inc";
 
 add_proto qw/void vp9_dc_128_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_128_predictor_32x32/, "$sse2_x86inc";
+specialize qw/vp9_dc_128_predictor_32x32 msa/, "$sse2_x86inc";
 
 #
 # Loopfilter

diff --git a/vp9/common/vp9_seg_common.c b/vp9/common/vp9_seg_common.c
index 910200e..471e238 100644
--- a/vp9/common/vp9_seg_common.c
+++ b/vp9/common/vp9_seg_common.c

@@ -25,12 +25,6 @@
 // the coding mechanism is still subject to change so these provide a
 // convenient single point of change.
 
-int vp9_segfeature_active(const struct segmentation *seg, int segment_id,
-                          SEG_LVL_FEATURES feature_id) {
-  return seg->enabled &&
-         (seg->feature_mask[segment_id] & (1 << feature_id));
-}
-
 void vp9_clearall_segfeatures(struct segmentation *seg) {
   vp9_zero(seg->feature_data);
   vp9_zero(seg->feature_mask);
@@ -60,12 +54,6 @@
   seg->feature_data[segment_id][feature_id] = seg_data;
 }
 
-int vp9_get_segdata(const struct segmentation *seg, int segment_id,
-                    SEG_LVL_FEATURES feature_id) {
-  return seg->feature_data[segment_id][feature_id];
-}
-
-
 const vp9_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)] = {
   2,  4,  6,  8, 10, 12,
   0, -1, -2, -3, -4, -5, -6, -7

diff --git a/vp9/common/vp9_seg_common.h b/vp9/common/vp9_seg_common.h
index ff2d66a..95c9918 100644
--- a/vp9/common/vp9_seg_common.h
+++ b/vp9/common/vp9_seg_common.h

@@ -49,9 +49,12 @@
   unsigned int feature_mask[MAX_SEGMENTS];
 };
 
-int vp9_segfeature_active(const struct segmentation *seg,
-                          int segment_id,
-                          SEG_LVL_FEATURES feature_id);
+static INLINE int segfeature_active(const struct segmentation *seg,
+                                    int segment_id,
+                                    SEG_LVL_FEATURES feature_id) {
+  return seg->enabled &&
+         (seg->feature_mask[segment_id] & (1 << feature_id));
+}
 
 void vp9_clearall_segfeatures(struct segmentation *seg);
 
@@ -68,9 +71,10 @@
                      SEG_LVL_FEATURES feature_id,
                      int seg_data);
 
-int vp9_get_segdata(const struct segmentation *seg,
-                    int segment_id,
-                    SEG_LVL_FEATURES feature_id);
+static INLINE int get_segdata(const struct segmentation *seg, int segment_id,
+                              SEG_LVL_FEATURES feature_id) {
+  return seg->feature_data[segment_id][feature_id];
+}
 
 extern const vp9_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)];
 

diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 7ce3389..d34926d 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c

@@ -177,7 +177,7 @@
 
 static int read_skip(VP9_COMMON *cm, const MACROBLOCKD *xd,
                      int segment_id, vp9_reader *r) {
-  if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
     return 1;
   } else {
     const int ctx = vp9_get_skip_context(xd);
@@ -307,9 +307,9 @@
   FRAME_CONTEXT *const fc = cm->fc;
   FRAME_COUNTS *counts = xd->counts;
 
-  if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
-    ref_frame[0] = (MV_REFERENCE_FRAME)vp9_get_segdata(&cm->seg, segment_id,
-                                                       SEG_LVL_REF_FRAME);
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
+    ref_frame[0] = (MV_REFERENCE_FRAME)get_segdata(&cm->seg, segment_id,
+                                                   SEG_LVL_REF_FRAME);
     ref_frame[1] = NONE;
   } else {
     const REFERENCE_MODE mode = read_block_reference_mode(cm, xd, r);
@@ -444,9 +444,8 @@
 
 static int read_is_inter_block(VP9_COMMON *const cm, MACROBLOCKD *const xd,
                                int segment_id, vp9_reader *r) {
-  if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
-    return vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME) !=
-           INTRA_FRAME;
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
+    return get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME) != INTRA_FRAME;
   } else {
     const int ctx = vp9_get_intra_inter_context(xd);
     const int is_inter = vp9_read(r, cm->fc->intra_inter_prob[ctx]);
@@ -473,7 +472,9 @@
   const BLOCK_SIZE bsize = mbmi->sb_type;
   const int allow_hp = cm->allow_high_precision_mv;
   int_mv nearestmv[2], nearmv[2];
-  int inter_mode_ctx, ref, is_compound;
+  int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
+  int ref, is_compound;
+  uint8_t inter_mode_ctx[MAX_REF_FRAMES];
 
   read_ref_frames(cm, xd, r, mbmi->segment_id, mbmi->ref_frame);
   is_compound = has_second_ref(mbmi);
@@ -487,13 +488,11 @@
                          "Reference frame has invalid dimensions");
     vp9_setup_pre_planes(xd, ref, ref_buf->buf, mi_row, mi_col,
                          &ref_buf->sf);
-    vp9_find_mv_refs(cm, xd, tile, mi, frame, mbmi->ref_mvs[frame],
-                     mi_row, mi_col, fpm_sync, (void *)pbi);
+    vp9_find_mv_refs(cm, xd, tile, mi, frame, ref_mvs[frame],
+                     mi_row, mi_col, fpm_sync, (void *)pbi, inter_mode_ctx);
   }
 
-  inter_mode_ctx = mbmi->mode_context[mbmi->ref_frame[0]];
-
-  if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+  if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
     mbmi->mode = ZEROMV;
     if (bsize < BLOCK_8X8) {
         vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM,
@@ -502,12 +501,13 @@
     }
   } else {
     if (bsize >= BLOCK_8X8)
-      mbmi->mode = read_inter_mode(cm, xd, r, inter_mode_ctx);
+      mbmi->mode = read_inter_mode(cm, xd, r,
+                                   inter_mode_ctx[mbmi->ref_frame[0]]);
   }
 
   if (bsize < BLOCK_8X8 || mbmi->mode != ZEROMV) {
     for (ref = 0; ref < 1 + is_compound; ++ref) {
-      vp9_find_best_ref_mvs(xd, allow_hp, mbmi->ref_mvs[mbmi->ref_frame[ref]],
+      vp9_find_best_ref_mvs(xd, allow_hp, ref_mvs[mbmi->ref_frame[ref]],
                             &nearestmv[ref], &nearmv[ref]);
     }
   }
@@ -526,13 +526,16 @@
       for (idx = 0; idx < 2; idx += num_4x4_w) {
         int_mv block[2];
         const int j = idy * 2 + idx;
-        b_mode = read_inter_mode(cm, xd, r, inter_mode_ctx);
+        b_mode = read_inter_mode(cm, xd, r, inter_mode_ctx[mbmi->ref_frame[0]]);
 
-        if (b_mode == NEARESTMV || b_mode == NEARMV)
+        if (b_mode == NEARESTMV || b_mode == NEARMV) {
+          uint8_t dummy_mode_ctx[MAX_REF_FRAMES];
           for (ref = 0; ref < 1 + is_compound; ++ref)
             vp9_append_sub8x8_mvs_for_idx(cm, xd, tile, j, ref, mi_row, mi_col,
                                           &nearest_sub8x8[ref],
-                                          &near_sub8x8[ref]);
+                                          &near_sub8x8[ref],
+                                          dummy_mode_ctx);
+        }
 
         if (!assign_mv(cm, xd, b_mode, block, nearestmv,
                        nearest_sub8x8, near_sub8x8,

diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c
index 71c1e0b..df70d48 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.c

@@ -95,19 +95,6 @@
     return 1;
 }
 
-static void adjust_cyclic_refresh_parameters(VP9_COMP *const cpi) {
-  const VP9_COMMON *const cm = &cpi->common;
-  const RATE_CONTROL *const rc = &cpi->rc;
-  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
-  // Adjust some parameters, currently only for low resolutions at low bitrates.
-  if (cm->width <= 352 &&
-      cm->height <= 288 &&
-      rc->avg_frame_bandwidth < 3400) {
-      cr->motion_thresh = 4;
-      cr->rate_boost_fac = 1.25;
-  }
-}
-
 // Check if this coding block, of size bsize, should be considered for refresh
 // (lower-qp coding). Decision can be based on various factors, such as
 // size of the coding block (i.e., below min_block size rejected), coding
@@ -435,18 +422,30 @@
   cr->sb_index = i;
 }
 
-// Set/update global/frame level cyclic refresh parameters.
+// Set cyclic refresh parameters.
 void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) {
   const RATE_CONTROL *const rc = &cpi->rc;
+  const VP9_COMMON *const cm = &cpi->common;
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   cr->percent_refresh = 10;
+  cr->max_qdelta_perc = 50;
+  cr->time_for_refresh = 0;
   // Use larger delta-qp (increase rate_ratio_qdelta) for first few (~4)
-  // periods of the refresh cycle, after a key frame. This corresponds to ~40
-  // frames with cr->percent_refresh = 10.
-  if (rc->frames_since_key <  40)
+  // periods of the refresh cycle, after a key frame.
+  if (rc->frames_since_key <  4 * cr->percent_refresh)
     cr->rate_ratio_qdelta = 3.0;
   else
     cr->rate_ratio_qdelta = 2.0;
+  // Adjust some parameters for low resolutions at low bitrates.
+  if (cm->width <= 352 &&
+      cm->height <= 288 &&
+      rc->avg_frame_bandwidth < 3400) {
+    cr->motion_thresh = 4;
+    cr->rate_boost_fac = 1.25;
+  } else {
+    cr->motion_thresh = 32;
+    cr->rate_boost_fac = 1.7;
+  }
 }
 
 // Setup cyclic background refresh: set delta q and segmentation map.
@@ -475,9 +474,6 @@
     int qindex2;
     const double q = vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth);
     vp9_clear_system_state();
-    cr->max_qdelta_perc = 50;
-    cr->time_for_refresh = 0;
-    cr->rate_boost_fac = 1.7;
     // Set rate threshold to some multiple (set to 2 for now) of the target
     // rate (target is given by sb64_target_rate and scaled by 256).
     cr->thresh_rate_sb = ((int64_t)(rc->sb64_target_rate) << 8) << 2;
@@ -485,9 +481,6 @@
     // q will not exceed 457, so (q * q) is within 32bit; see:
     // vp9_convert_qindex_to_q(), vp9_ac_quant(), ac_qlookup*[].
     cr->thresh_dist_sb = ((int64_t)(q * q)) << 2;
-    cr->motion_thresh = 32;
-
-    adjust_cyclic_refresh_parameters(cpi);
 
     // Set up segmentation.
     // Clear down the segment map.

diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index d20e067..092d265 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c

@@ -93,7 +93,7 @@
 
 static int write_skip(const VP9_COMMON *cm, const MACROBLOCKD *xd,
                       int segment_id, const MODE_INFO *mi, vp9_writer *w) {
-  if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
     return 1;
   } else {
     const int skip = mi->mbmi.skip;
@@ -207,10 +207,10 @@
 
   // If segment level coding of this signal is disabled...
   // or the segment allows multiple reference frame options
-  if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
     assert(!is_compound);
     assert(mbmi->ref_frame[0] ==
-               vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME));
+               get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME));
   } else {
     // does the feature use compound prediction or not
     // (if not specified at the frame/segment level)
@@ -264,7 +264,7 @@
 
   skip = write_skip(cm, xd, segment_id, mi, w);
 
-  if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
+  if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
     vp9_write(w, is_inter, vp9_get_intra_inter_prob(cm, xd));
 
   if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT &&
@@ -293,7 +293,7 @@
     write_ref_frames(cm, xd, w);
 
     // If segment skip is not enabled code the mode.
-    if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
+    if (!segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
       if (bsize >= BLOCK_8X8) {
         write_inter_mode(w, mode, inter_probs);
       }
@@ -787,10 +787,10 @@
 
     for (i = 0; i < MAX_SEGMENTS; i++) {
       for (j = 0; j < SEG_LVL_MAX; j++) {
-        const int active = vp9_segfeature_active(seg, i, j);
+        const int active = segfeature_active(seg, i, j);
         vp9_wb_write_bit(wb, active);
         if (active) {
-          const int data = vp9_get_segdata(seg, i, j);
+          const int data = get_segdata(seg, i, j);
           const int data_max = vp9_seg_feature_data_max(j);
 
           if (vp9_is_segfeature_signed(j)) {

diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 7c91da1..f5e3e98 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c

@@ -370,6 +370,7 @@
 
 static void fill_variance_tree(void *data, BLOCK_SIZE bsize) {
   variance_node node;
+  memset(&node, 0, sizeof(node));
   tree_to_node(data, bsize, &node);
   sum_2_variances(node.split[0], node.split[1], &node.part_variances->horz[0]);
   sum_2_variances(node.split[2], node.split[3], &node.part_variances->horz[1]);
@@ -1050,7 +1051,7 @@
   if (!output_enabled)
     return;
 
-  if (!vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+  if (!segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
     for (i = 0; i < TX_MODES; i++)
       rdc->tx_select_diff[i] += ctx->tx_rd_diff[i];
   }
@@ -1247,7 +1248,7 @@
     vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, best_rd);
   } else {
     if (bsize >= BLOCK_8X8) {
-      if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
+      if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
         vp9_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, rd_cost, bsize,
                                            ctx, best_rd);
       else
@@ -1290,8 +1291,8 @@
   if (!frame_is_intra_only(cm)) {
     FRAME_COUNTS *const counts = td->counts;
     const int inter_block = is_inter_block(mbmi);
-    const int seg_ref_active = vp9_segfeature_active(&cm->seg, mbmi->segment_id,
-                                                     SEG_LVL_REF_FRAME);
+    const int seg_ref_active = segfeature_active(&cm->seg, mbmi->segment_id,
+                                                 SEG_LVL_REF_FRAME);
     if (!seg_ref_active) {
       counts->intra_inter[vp9_get_intra_inter_context(xd)][inter_block]++;
       // If the segment reference feature is enabled we have only a single
@@ -1316,7 +1317,7 @@
       }
     }
     if (inter_block &&
-        !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+        !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
       const int mode_ctx = mbmi->mode_context[mbmi->ref_frame[0]];
       if (bsize >= BLOCK_8X8) {
         const PREDICTION_MODE mode = mbmi->mode;
@@ -2848,7 +2849,7 @@
       const uint8_t *const map = seg->update_map ? cpi->segmentation_map
                                                  : cm->last_frame_seg_map;
       int segment_id = vp9_get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col);
-      seg_skip = vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP);
+      seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP);
     }
 
     x->source_variance = UINT_MAX;
@@ -2908,7 +2909,7 @@
 static int check_dual_ref_flags(VP9_COMP *cpi) {
   const int ref_flags = cpi->ref_frame_flags;
 
-  if (vp9_segfeature_active(&cpi->common.seg, 1, SEG_LVL_REF_FRAME)) {
+  if (segfeature_active(&cpi->common.seg, 1, SEG_LVL_REF_FRAME)) {
     return 0;
   } else {
     return (!!(ref_flags & VP9_GOLD_FLAG) + !!(ref_flags & VP9_LAST_FLAG)
@@ -2983,7 +2984,7 @@
 
   if (cm->frame_type == KEY_FRAME)
     hybrid_intra_mode_search(cpi, x, rd_cost, bsize, ctx);
-  else if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
+  else if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
     set_mode_info_seg_skip(x, cm->tx_mode, rd_cost, bsize);
   else if (bsize >= BLOCK_8X8)
     vp9_pick_inter_mode(cpi, x, tile_data, mi_row, mi_col,
@@ -3598,7 +3599,7 @@
       const uint8_t *const map = seg->update_map ? cpi->segmentation_map
                                                  : cm->last_frame_seg_map;
       int segment_id = vp9_get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col);
-      seg_skip = vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP);
+      seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP);
       if (seg_skip) {
         partition_search_type = FIXED_PARTITION;
       }
@@ -4160,8 +4161,8 @@
   MODE_INFO **mi_8x8 = xd->mi;
   MODE_INFO *mi = mi_8x8[0];
   MB_MODE_INFO *mbmi = &mi->mbmi;
-  const int seg_skip = vp9_segfeature_active(&cm->seg, mbmi->segment_id,
-                                             SEG_LVL_SKIP);
+  const int seg_skip = segfeature_active(&cm->seg, mbmi->segment_id,
+                                         SEG_LVL_SKIP);
   const int mis = cm->mi_stride;
   const int mi_width = num_8x8_blocks_wide_lookup[bsize];
   const int mi_height = num_8x8_blocks_high_lookup[bsize];

diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index f801851..3d7843e 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c

@@ -124,8 +124,8 @@
             stats->pcnt_motion,
             stats->pcnt_second_ref,
             stats->pcnt_neutral,
-            stats->ul_intra_pct,
-            stats->image_start_row,
+            stats->intra_skip_pct,
+            stats->inactive_zone_rows,
             stats->MVr,
             stats->mvr_abs,
             stats->MVc,
@@ -162,8 +162,8 @@
   section->pcnt_motion  = 0.0;
   section->pcnt_second_ref = 0.0;
   section->pcnt_neutral = 0.0;
-  section->ul_intra_pct = 0.0;
-  section->image_start_row = 0.0;
+  section->intra_skip_pct = 0.0;
+  section->inactive_zone_rows = 0.0;
   section->MVr = 0.0;
   section->mvr_abs     = 0.0;
   section->MVc        = 0.0;
@@ -189,8 +189,8 @@
   section->pcnt_motion += frame->pcnt_motion;
   section->pcnt_second_ref += frame->pcnt_second_ref;
   section->pcnt_neutral += frame->pcnt_neutral;
-  section->ul_intra_pct += frame->ul_intra_pct;
-  section->image_start_row += frame->image_start_row;
+  section->intra_skip_pct += frame->intra_skip_pct;
+  section->inactive_zone_rows += frame->inactive_zone_rows;
   section->MVr += frame->MVr;
   section->mvr_abs     += frame->mvr_abs;
   section->MVc        += frame->MVc;
@@ -214,8 +214,8 @@
   section->pcnt_motion -= frame->pcnt_motion;
   section->pcnt_second_ref -= frame->pcnt_second_ref;
   section->pcnt_neutral -= frame->pcnt_neutral;
-  section->ul_intra_pct -= frame->ul_intra_pct;
-  section->image_start_row -= frame->image_start_row;
+  section->intra_skip_pct -= frame->intra_skip_pct;
+  section->inactive_zone_rows -= frame->inactive_zone_rows;
   section->MVr -= frame->MVr;
   section->mvr_abs     -= frame->mvr_abs;
   section->MVc        -= frame->MVc;
@@ -487,7 +487,7 @@
   int second_ref_count = 0;
   const int intrapenalty = INTRA_MODE_PENALTY;
   double neutral_count;
-  int ul_intra_count = 0;
+  int intra_skip_count = 0;
   int image_data_start_row = INVALID_ROW;
   int new_mv_count = 0;
   int sum_in_vectors = 0;
@@ -655,7 +655,7 @@
       // common in animations, graphics and screen content, so may be used
       // as a signal to detect these types of content.
       if (this_error < UL_INTRA_THRESH) {
-        ++ul_intra_count;
+        ++intra_skip_count;
       } else if ((mb_col > 0) && (image_data_start_row == INVALID_ROW)) {
         image_data_start_row = mb_row;
       }
@@ -995,8 +995,8 @@
   }
   // Exclude any image dead zone
   if (image_data_start_row > 0) {
-    ul_intra_count =
-      MAX(0, ul_intra_count - (image_data_start_row * cm->mb_cols * 2));
+    intra_skip_count =
+      MAX(0, intra_skip_count - (image_data_start_row * cm->mb_cols * 2));
   }
 
   {
@@ -1023,8 +1023,8 @@
     fps.pcnt_inter = (double)intercount / num_mbs;
     fps.pcnt_second_ref = (double)second_ref_count / num_mbs;
     fps.pcnt_neutral = (double)neutral_count / num_mbs;
-    fps.ul_intra_pct = (double)ul_intra_count / num_mbs;
-    fps.image_start_row = (double)image_data_start_row;
+    fps.intra_skip_pct = (double)intra_skip_count / num_mbs;
+    fps.inactive_zone_rows = (double)image_data_start_row;
 
     if (mvcount > 0) {
       fps.MVr = (double)sum_mvr / mvcount;
@@ -1146,21 +1146,25 @@
 
 static int get_twopass_worst_quality(const VP9_COMP *cpi,
                                      const double section_err,
+                                     double inactive_zone,
                                      int section_target_bandwidth,
                                      double group_weight_factor) {
   const RATE_CONTROL *const rc = &cpi->rc;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
 
+  inactive_zone = fclamp(inactive_zone, 0.0, 1.0);
+
   if (section_target_bandwidth <= 0) {
     return rc->worst_quality;  // Highest value allowed
   } else {
     const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
                         ? cpi->initial_mbs : cpi->common.MBs;
-    const double err_per_mb = section_err / num_mbs;
+    const int active_mbs = MAX(1, num_mbs - (int)(num_mbs * inactive_zone));
+    const double av_err_per_mb = section_err / active_mbs;
     const double speed_term = 1.0 + 0.04 * oxcf->speed;
-    const double ediv_size_correction = num_mbs / EDIV_SIZE_FACTOR;
+    const double ediv_size_correction = (double)num_mbs / EDIV_SIZE_FACTOR;
     const int target_norm_bits_per_mb = ((uint64_t)section_target_bandwidth <<
-                                         BPER_MB_NORMBITS) / num_mbs;
+                                         BPER_MB_NORMBITS) / active_mbs;
 
     int q;
     int is_svc_upper_layer = 0;
@@ -1173,7 +1177,7 @@
     // content at the given rate.
     for (q = rc->best_quality; q < rc->worst_quality; ++q) {
       const double factor =
-          calc_correction_factor(err_per_mb,
+          calc_correction_factor(av_err_per_mb,
                                  ERR_DIVISOR - ediv_size_correction,
                                  is_svc_upper_layer ? SVC_FACTOR_PT_LOW :
                                  FACTOR_PT_LOW, FACTOR_PT_HIGH, q,
@@ -1452,6 +1456,8 @@
   const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
                       ? cpi->initial_mbs : cpi->common.MBs;
 
+  // TODO(paulwilkins): correct for dead zone
+
   // Underlying boost factor is based on inter error ratio.
   frame_boost = (BASELINE_ERR_PER_MB * num_mbs) /
                 DOUBLE_DIVIDE_CHECK(this_frame->coded_error);
@@ -1817,6 +1823,8 @@
 #if GROUP_ADAPTIVE_MAXQ
   double gf_group_raw_error = 0.0;
 #endif
+  double gf_group_skip_pct = 0.0;
+  double gf_group_inactive_zone_rows = 0.0;
   double gf_first_frame_err = 0.0;
   double mod_frame_err = 0.0;
 
@@ -1866,6 +1874,8 @@
 #if GROUP_ADAPTIVE_MAXQ
     gf_group_raw_error -= this_frame->coded_error;
 #endif
+    gf_group_skip_pct -= this_frame->intra_skip_pct;
+    gf_group_inactive_zone_rows -= this_frame->inactive_zone_rows;
   }
 
   // Motion breakout threshold for loop below depends on image size.
@@ -1910,6 +1920,8 @@
 #if GROUP_ADAPTIVE_MAXQ
     gf_group_raw_error += this_frame->coded_error;
 #endif
+    gf_group_skip_pct += this_frame->intra_skip_pct;
+    gf_group_inactive_zone_rows += this_frame->inactive_zone_rows;
 
     if (EOF == input_stats(twopass, &next_frame))
       break;
@@ -2012,6 +2024,8 @@
 #if GROUP_ADAPTIVE_MAXQ
       gf_group_raw_error += this_frame->coded_error;
 #endif
+      gf_group_skip_pct += this_frame->intra_skip_pct;
+      gf_group_inactive_zone_rows += this_frame->inactive_zone_rows;
     }
     rc->baseline_gf_interval = new_gf_interval;
   }
@@ -2034,6 +2048,12 @@
     const int vbr_group_bits_per_frame =
       (int)(gf_group_bits / rc->baseline_gf_interval);
     const double group_av_err = gf_group_raw_error  / rc->baseline_gf_interval;
+    const double group_av_skip_pct =
+      gf_group_skip_pct / rc->baseline_gf_interval;
+    const double group_av_inactive_zone =
+      ((gf_group_inactive_zone_rows * 2) /
+       (rc->baseline_gf_interval * (double)cm->mb_rows));
+
     int tmp_q;
     // rc factor is a weight factor that corrects for local rate control drift.
     double rc_factor = 1.0;
@@ -2045,7 +2065,9 @@
                       (double)(100 - rc->rate_error_estimate) / 100.0);
     }
     tmp_q =
-      get_twopass_worst_quality(cpi, group_av_err, vbr_group_bits_per_frame,
+      get_twopass_worst_quality(cpi, group_av_err,
+                                (group_av_skip_pct + group_av_inactive_zone),
+                                vbr_group_bits_per_frame,
                                 twopass->kfgroup_inter_fraction * rc_factor);
     twopass->active_worst_quality =
       MAX(tmp_q, twopass->active_worst_quality >> 1);
@@ -2584,10 +2606,17 @@
     // Special case code for first frame.
     const int section_target_bandwidth = (int)(twopass->bits_left /
                                                frames_left);
+    const double section_length = twopass->total_left_stats.count;
     const double section_error =
-      twopass->total_left_stats.coded_error / twopass->total_left_stats.count;
+      twopass->total_left_stats.coded_error / section_length;
+    const double section_intra_skip =
+      twopass->total_left_stats.intra_skip_pct / section_length;
+    const double section_inactive_zone =
+      (twopass->total_left_stats.inactive_zone_rows * 2) /
+      ((double)cm->mb_rows * section_length);
     const int tmp_q =
       get_twopass_worst_quality(cpi, section_error,
+                                section_intra_skip + section_inactive_zone,
                                 section_target_bandwidth, DEFAULT_GRP_WEIGHT);
 
     twopass->active_worst_quality = tmp_q;
@@ -2604,7 +2633,7 @@
     return;
 
   // Set the frame content type flag.
-  if (this_frame.ul_intra_pct >= FC_ANIMATION_THRESH)
+  if (this_frame.intra_skip_pct >= FC_ANIMATION_THRESH)
     twopass->fr_content_type = FC_GRAPHICS_ANIMATION;
   else
     twopass->fr_content_type = FC_NORMAL;

diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h
index 45f1132..0047932 100644
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h

@@ -51,8 +51,8 @@
   double pcnt_motion;
   double pcnt_second_ref;
   double pcnt_neutral;
-  double ul_intra_pct;
-  double image_start_row;
+  double intra_skip_pct;
+  double inactive_zone_rows;  // Image mask rows top and bottom.
   double MVr;
   double mvr_abs;
   double MVc;

diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 2c78831..2479b6e 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c

@@ -1179,7 +1179,8 @@
 
       if (cm->use_prev_frame_mvs)
         vp9_find_mv_refs(cm, xd, tile_info, xd->mi[0], ref_frame,
-                         candidates, mi_row, mi_col, NULL, NULL);
+                         candidates, mi_row, mi_col, NULL, NULL,
+                         xd->mi[0]->mbmi.mode_context);
       else
         const_motion[ref_frame] = mv_refs_rt(cm, xd, tile_info,
                                              xd->mi[0],
@@ -1658,7 +1659,8 @@
       vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col,
                            sf, sf);
       vp9_find_mv_refs(cm, xd, tile_info, xd->mi[0], ref_frame,
-                       candidates, mi_row, mi_col, NULL, NULL);
+                       candidates, mi_row, mi_col, NULL, NULL,
+                       xd->mi[0]->mbmi.mode_context);
 
       vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates,
                             &dummy_mv[0], &dummy_mv[1]);
@@ -1690,8 +1692,8 @@
 
     // If the segment reference frame feature is enabled....
     // then do nothing if the current ref frame is not allowed..
-    if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
-        vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame)
+    if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+        get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame)
       continue;
 
     mbmi->ref_frame[0] = ref_frame;
@@ -1732,7 +1734,8 @@
         b_mv[NEWMV].as_int = INVALID_MV;
         vp9_append_sub8x8_mvs_for_idx(cm, xd, tile_info, i, 0, mi_row, mi_col,
                                       &b_mv[NEARESTMV],
-                                      &b_mv[NEARMV]);
+                                      &b_mv[NEARMV],
+                                      xd->mi[0]->mbmi.mode_context);
 
         for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
           int b_rate = 0;

diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index db5460b..e6e17c0 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c

@@ -678,7 +678,7 @@
     x->plane[i].quant_thred[1] = x->plane[i].zbin[1] * x->plane[i].zbin[1];
   }
 
-  x->skip_block = vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP);
+  x->skip_block = segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP);
   x->q_index = qindex;
 
   x->errorperbit = rdmult >> 6;

diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 9fa258c..162d4de 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c

@@ -1804,7 +1804,8 @@
         frame_mv[ZEROMV][frame].as_int = 0;
         vp9_append_sub8x8_mvs_for_idx(cm, xd, tile, i, ref, mi_row, mi_col,
                                       &frame_mv[NEARESTMV][frame],
-                                      &frame_mv[NEARMV][frame]);
+                                      &frame_mv[NEARMV][frame],
+                                      xd->mi[0]->mbmi.mode_context);
       }
 
       // search for the best motion vector on this segment
@@ -2119,8 +2120,8 @@
                                      unsigned int *ref_costs_single,
                                      unsigned int *ref_costs_comp,
                                      vp9_prob *comp_mode_p) {
-  int seg_ref_active = vp9_segfeature_active(&cm->seg, segment_id,
-                                             SEG_LVL_REF_FRAME);
+  int seg_ref_active = segfeature_active(&cm->seg, segment_id,
+                                         SEG_LVL_REF_FRAME);
   if (seg_ref_active) {
     memset(ref_costs_single, 0, MAX_REF_FRAMES * sizeof(*ref_costs_single));
     memset(ref_costs_comp,   0, MAX_REF_FRAMES * sizeof(*ref_costs_comp));
@@ -2220,7 +2221,7 @@
 
   // Gets an initial list of candidate vectors from neighbours and orders them
   vp9_find_mv_refs(cm, xd, tile, mi, ref_frame, candidates, mi_row, mi_col,
-                   NULL, NULL);
+                   NULL, NULL, xd->mi[0]->mbmi.mode_context);
 
   // Candidate refinement carried out at encoder and decoder
   vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates,
@@ -3006,8 +3007,8 @@
     }
     // If the segment reference frame feature is enabled....
     // then do nothing if the current ref frame is not allowed..
-    if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
-        vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
+    if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+        get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
       ref_frame_skip_mask[0] |= (1 << ref_frame);
       ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
     }
@@ -3016,7 +3017,7 @@
   // Disable this drop out case if the ref frame
   // segment level feature is enabled for this segment. This is to
   // prevent the possibility that we end up unable to pick any mode.
-  if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
+  if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
     // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
     // unless ARNR filtering is enabled in which case we want
     // an unfiltered alternative. We allow near/nearest as well
@@ -3195,7 +3196,7 @@
 
       // Do not allow compound prediction if the segment level reference frame
       // feature is in use as in this case there can only be one reference.
-      if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
+      if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
         continue;
 
       if ((mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
@@ -3637,7 +3638,7 @@
 
   rd_cost->rate = INT_MAX;
 
-  assert(vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP));
+  assert(segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP));
 
   mbmi->mode = ZEROMV;
   mbmi->uv_mode = DC_PRED;
@@ -3849,7 +3850,7 @@
         continue;
       // Do not allow compound prediction if the segment level reference frame
       // feature is in use as in this case there can only be one reference.
-      if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
+      if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
         continue;
 
       if ((sf->mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
@@ -3874,13 +3875,13 @@
 
     // If the segment reference frame feature is enabled....
     // then do nothing if the current ref frame is not allowed..
-    if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
-        vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
+    if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+        get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
       continue;
     // Disable this drop out case if the ref frame
     // segment level feature is enabled for this segment. This is to
     // prevent the possibility that we end up unable to pick any mode.
-    } else if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
+    } else if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
       // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
       // unless ARNR filtering is enabled in which case we want
       // an unfiltered alternative. We allow near/nearest as well

diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 3592031..181a99c 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c

@@ -484,7 +484,7 @@
 static INLINE int get_tx_eob(const struct segmentation *seg, int segment_id,
                              TX_SIZE tx_size) {
   const int eob_max = 16 << (tx_size << 1);
-  return vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
+  return segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
 }
 
 static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
@@ -615,8 +615,8 @@
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   const int ctx = vp9_get_skip_context(xd);
-  const int skip_inc = !vp9_segfeature_active(&cm->seg, mbmi->segment_id,
-                                              SEG_LVL_SKIP);
+  const int skip_inc = !segfeature_active(&cm->seg, mbmi->segment_id,
+                                          SEG_LVL_SKIP);
   struct tokenize_b_args arg = {cpi, td, t};
   if (mbmi->skip) {
     if (!dry_run)

diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index d018699..b01fdd1 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk

@@ -146,6 +146,7 @@
 VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct16x16_msa.c
 VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct32x32_msa.c
 VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct_msa.h
+VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_intra_predict_msa.c
 VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_loopfilter_4_msa.c
 VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_loopfilter_8_msa.c
 VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_loopfilter_16_msa.c