Merge "mips msa vpx_dsp variance optimization"
diff --git a/test/vpxenc.sh b/test/vpxenc.sh
index bf551a8..e899499 100755
--- a/test/vpxenc.sh
+++ b/test/vpxenc.sh
@@ -290,6 +290,35 @@
   fi
 }
 
+vpxenc_vp9_webm_rt_multithread_tiled_frameparallel() {
+  if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
+     [ "$(webm_io_available)" = "yes" ]; then
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_rt_mt_t_fp.webm"
+    local readonly tilethread_min=2
+    local readonly tilethread_max=4
+    local readonly num_threads="$(seq ${tilethread_min} ${tilethread_max})"
+    local readonly num_tile_cols="$(seq ${tilethread_min} ${tilethread_max})"
+
+    for threads in ${num_threads}; do
+      for tile_cols in ${num_tile_cols}; do
+        vpxenc $(y4m_input_720p) \
+          $(vpxenc_rt_params vp9) \
+          --threads=${threads} \
+          --tile-columns=${tile_cols} \
+          --frame-parallel=1 \
+          --output="${output}"
+      done
+    done
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
+
+    rm "${output}"
+  fi
+}
+
 vpxenc_vp9_webm_2pass() {
   if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
      [ "$(webm_io_available)" = "yes" ]; then
@@ -390,6 +419,7 @@
               vpxenc_vp9_webm
               vpxenc_vp9_webm_rt
               vpxenc_vp9_webm_rt_multithread_tiled
+              vpxenc_vp9_webm_rt_multithread_tiled_frameparallel
               vpxenc_vp9_webm_2pass
               vpxenc_vp9_ivf_lossless
               vpxenc_vp9_ivf_minq0_maxq0
diff --git a/vp8/common/arm/neon/idct_dequant_0_2x_neon.c b/vp8/common/arm/neon/idct_dequant_0_2x_neon.c
index 967c322..e6f862f 100644
--- a/vp8/common/arm/neon/idct_dequant_0_2x_neon.c
+++ b/vp8/common/arm/neon/idct_dequant_0_2x_neon.c
@@ -18,7 +18,8 @@
     unsigned char *dst0;
     int i, a0, a1;
     int16x8x2_t q2Add;
-    int32x2_t d2s32, d4s32;
+    int32x2_t d2s32 = vdup_n_s32(0),
+              d4s32 = vdup_n_s32(0);
     uint8x8_t d2u8, d4u8;
     uint16x8_t q1u16, q2u16;
 
diff --git a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c
index e1c8609..06a87b6 100644
--- a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c
+++ b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c
@@ -153,6 +153,7 @@
 #else
 static INLINE
 uint8x8x4_t read_4x8(unsigned char *src, int pitch, uint8x8x4_t x) {
+    x.val[0] = x.val[1] = x.val[2] = x.val[3] = vdup_n_u8(0);
     x = vld4_lane_u8(src, x, 0);
     src += pitch;
     x = vld4_lane_u8(src, x, 1);
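Both NEON changes above apply the same fix: lane-load intrinsics (vld1_lane_*, vld4_lane_*) merge new data into an existing vector, so the destination must have a defined value before the first lane load or the compiler can report a use of an uninitialized variable. A minimal sketch of the pattern, assuming an ARM NEON target with arm_neon.h; the helper name and layout are illustrative, not part of the patch:

    #include <arm_neon.h>

    /* Load four strided bytes into lanes 0..3 of a NEON vector. Starting
     * from vdup_n_u8(0) gives every lane a defined value before the lane
     * loads, mirroring the d2s32/d4s32 and x.val[0..3] initializations
     * added above. */
    static uint8x8_t load_4_strided_bytes(const unsigned char *src,
                                          int pitch) {
      uint8x8_t v = vdup_n_u8(0);
      v = vld1_lane_u8(src + 0 * pitch, v, 0);
      v = vld1_lane_u8(src + 1 * pitch, v, 1);
      v = vld1_lane_u8(src + 2 * pitch, v, 2);
      v = vld1_lane_u8(src + 3 * pitch, v, 3);
      return v;
    }
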
diff --git a/vp9/common/mips/msa/vp9_macros_msa.h b/vp9/common/mips/msa/vp9_macros_msa.h
index 4385075..e008eaf 100644
--- a/vp9/common/mips/msa/vp9_macros_msa.h
+++ b/vp9/common/mips/msa/vp9_macros_msa.h
@@ -229,13 +229,12 @@
 #endif  // (__mips_isa_rev >= 6)
 
 /* Description : Load 4 words with stride
-   Arguments   : Inputs  - psrc    (source pointer to load from)
-                         - stride
+   Arguments   : Inputs  - psrc, stride
                  Outputs - out0, out1, out2, out3
-   Details     : Loads word in 'out0' from (psrc)
-                 Loads word in 'out1' from (psrc + stride)
-                 Loads word in 'out2' from (psrc + 2 * stride)
-                 Loads word in 'out3' from (psrc + 3 * stride)
+   Details     : Load word in 'out0' from (psrc)
+                 Load word in 'out1' from (psrc + stride)
+                 Load word in 'out2' from (psrc + 2 * stride)
+                 Load word in 'out3' from (psrc + 3 * stride)
 */
 #define LW4(psrc, stride, out0, out1, out2, out3) {  \
   out0 = LW((psrc));                                 \
@@ -245,11 +244,10 @@
 }
 
 /* Description : Load double words with stride
-   Arguments   : Inputs  - psrc    (source pointer to load from)
-                         - stride
+   Arguments   : Inputs  - psrc, stride
                  Outputs - out0, out1
-   Details     : Loads double word in 'out0' from (psrc)
-                 Loads double word in 'out1' from (psrc + stride)
+   Details     : Load double word in 'out0' from (psrc)
+                 Load double word in 'out1' from (psrc + stride)
 */
 #define LD2(psrc, stride, out0, out1) {  \
   out0 = LD((psrc));                     \
@@ -261,11 +259,11 @@
 }
 
 /* Description : Store 4 words with stride
-   Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
-   Details     : Stores word from 'in0' to (pdst)
-                 Stores word from 'in1' to (pdst + stride)
-                 Stores word from 'in2' to (pdst + 2 * stride)
-                 Stores word from 'in3' to (pdst + 3 * stride)
+   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
+   Details     : Store word from 'in0' to (pdst)
+                 Store word from 'in1' to (pdst + stride)
+                 Store word from 'in2' to (pdst + 2 * stride)
+                 Store word from 'in3' to (pdst + 3 * stride)
 */
 #define SW4(in0, in1, in2, in3, pdst, stride) {  \
   SW(in0, (pdst))                                \
@@ -275,11 +273,11 @@
 }
 
 /* Description : Store 4 double words with stride
-   Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
-   Details     : Stores double word from 'in0' to (pdst)
-                 Stores double word from 'in1' to (pdst + stride)
-                 Stores double word from 'in2' to (pdst + 2 * stride)
-                 Stores double word from 'in3' to (pdst + 3 * stride)
+   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
+   Details     : Store double word from 'in0' to (pdst)
+                 Store double word from 'in1' to (pdst + stride)
+                 Store double word from 'in2' to (pdst + 2 * stride)
+                 Store double word from 'in3' to (pdst + 3 * stride)
 */
 #define SD4(in0, in1, in2, in3, pdst, stride) {  \
   SD(in0, (pdst))                                \
@@ -289,12 +287,11 @@
 }
 
 /* Description : Load vectors with 16 byte elements with stride
-   Arguments   : Inputs  - psrc    (source pointer to load from)
-                         - stride
+   Arguments   : Inputs  - psrc, stride
                  Outputs - out0, out1
                  Return Type - as per RTYPE
-   Details     : Loads 16 byte elements in 'out0' from (psrc)
-                 Loads 16 byte elements in 'out1' from (psrc + stride)
+   Details     : Load 16 byte elements in 'out0' from (psrc)
+                 Load 16 byte elements in 'out1' from (psrc + stride)
 */
 #define LD_B2(RTYPE, psrc, stride, out0, out1) {  \
   out0 = LD_B(RTYPE, (psrc));                     \
@@ -333,11 +330,10 @@
 #define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
 
 /* Description : Load vectors with 8 halfword elements with stride
-   Arguments   : Inputs  - psrc    (source pointer to load from)
-                         - stride
+   Arguments   : Inputs  - psrc, stride
                  Outputs - out0, out1
-   Details     : Loads 8 halfword elements in 'out0' from (psrc)
-                 Loads 8 halfword elements in 'out1' from (psrc + stride)
+   Details     : Load 8 halfword elements in 'out0' from (psrc)
+                 Load 8 halfword elements in 'out1' from (psrc + stride)
 */
 #define LD_H2(RTYPE, psrc, stride, out0, out1) {  \
   out0 = LD_H(RTYPE, (psrc));                     \
@@ -368,9 +364,9 @@
 }
 #define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__)
 
-/* Description : Load as 4x4 block of signed halfword elements from 1D source
+/* Description : Load 4x4 block of signed halfword elements from 1D source
                  data into 4 vectors (Each vector with 4 signed halfwords)
-   Arguments   : Inputs  - psrc
+   Arguments   : Input   - psrc
                  Outputs - out0, out1, out2, out3
 */
 #define LD4x4_SH(psrc, out0, out1, out2, out3) {         \
@@ -381,8 +377,7 @@
 }
 
 /* Description : Load 2 vectors of signed word elements with stride
-   Arguments   : Inputs  - psrc    (source pointer to load from)
-                         - stride
+   Arguments   : Inputs  - psrc, stride
                  Outputs - out0, out1
                  Return Type - signed word
 */
@@ -392,10 +387,9 @@
 }
 
 /* Description : Store vectors of 16 byte elements with stride
-   Arguments   : Inputs  - in0, in1, stride
-                 Outputs - pdst    (destination pointer to store to)
-   Details     : Stores 16 byte elements from 'in0' to (pdst)
-                 Stores 16 byte elements from 'in1' to (pdst + stride)
+   Arguments   : Inputs - in0, in1, pdst, stride
+   Details     : Store 16 byte elements from 'in0' to (pdst)
+                 Store 16 byte elements from 'in1' to (pdst + stride)
 */
 #define ST_B2(RTYPE, in0, in1, pdst, stride) {  \
   ST_B(RTYPE, in0, (pdst));                     \
@@ -417,10 +411,9 @@
 #define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
 
 /* Description : Store vectors of 8 halfword elements with stride
-   Arguments   : Inputs  - in0, in1, stride
-                 Outputs - pdst    (destination pointer to store to)
-   Details     : Stores 8 halfword elements from 'in0' to (pdst)
-                 Stores 8 halfword elements from 'in1' to (pdst + stride)
+   Arguments   : Inputs - in0, in1, pdst, stride
+   Details     : Store 8 halfword elements from 'in0' to (pdst)
+                 Store 8 halfword elements from 'in1' to (pdst + stride)
 */
 #define ST_H2(RTYPE, in0, in1, pdst, stride) {  \
   ST_H(RTYPE, in0, (pdst));                     \
@@ -441,8 +434,7 @@
 #define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)
 
 /* Description : Store vectors of word elements with stride
-   Arguments   : Inputs  - in0, in1, stride
-                         - pdst    (destination pointer to store to)
+   Arguments   : Inputs - in0, in1, pdst, stride
    Details     : Store 4 word elements from 'in0' to (pdst)
                  Store 4 word elements from 'in1' to (pdst + stride)
 */
@@ -451,17 +443,16 @@
   ST_SW(in1, (pdst) + stride);            \
 }
 
-/* Description : Store as 2x4 byte block to destination memory from input vector
-   Arguments   : Inputs  - in, stidx, pdst, stride
-                 Return Type - unsigned byte
-   Details     : Index stidx halfword element from 'in' vector is copied and
-                 stored on first line
-                 Index stidx+1 halfword element from 'in' vector is copied and
-                 stored on second line
-                 Index stidx+2 halfword element from 'in' vector is copied and
-                 stored on third line
-                 Index stidx+3 halfword element from 'in' vector is copied and
-                 stored on fourth line
+/* Description : Store 2x4 byte block to destination memory from input vector
+   Arguments   : Inputs - in, stidx, pdst, stride
+   Details     : Index 'stidx' halfword element from 'in' vector is copied to
+                 the GP register and stored to (pdst)
+                 Index 'stidx+1' halfword element from 'in' vector is copied to
+                 the GP register and stored to (pdst + stride)
+                 Index 'stidx+2' halfword element from 'in' vector is copied to
+                 the GP register and stored to (pdst + 2 * stride)
+                 Index 'stidx+3' halfword element from 'in' vector is copied to
+                 the GP register and stored to (pdst + 3 * stride)
 */
 #define ST2x4_UB(in, stidx, pdst, stride) {         \
   uint16_t out0_m, out1_m, out2_m, out3_m;          \
@@ -479,10 +470,10 @@
 }
 
 /* Description : Store 4x2 byte block to destination memory from input vector
-   Arguments   : Inputs  - in, pdst, stride
-   Details     : Index 0 word element from 'in' vector is copied to a GP
+   Arguments   : Inputs - in, pdst, stride
+   Details     : Index 0 word element from 'in' vector is copied to the GP
                  register and stored to (pdst)
-                 Index 1 word element from 'in' vector is copied to a GP
+                 Index 1 word element from 'in' vector is copied to the GP
                  register and stored to (pdst + stride)
 */
 #define ST4x2_UB(in, pdst, stride) {        \
@@ -496,17 +487,16 @@
   SW(out1_m, pblk_4x2_m + stride);          \
 }
 
-/* Description : Store as 4x4 byte block to destination memory from input vector
-   Arguments   : Inputs  - in0, in1, pdst, stride
-                 Return Type - unsigned byte
-   Details     : Idx0 word element from input vector 'in0' is copied and stored
-                 on first line
-                 Idx1 word element from input vector 'in0' is copied and stored
-                 on second line
-                 Idx2 word element from input vector 'in1' is copied and stored
-                 on third line
-                 Idx3 word element from input vector 'in1' is copied and stored
-                 on fourth line
+/* Description : Store 4x4 byte block to destination memory from input vector
+   Arguments   : Inputs - in0, in1, pdst, stride
+   Details     : 'Idx0' word element from input vector 'in0' is copied to the
+                 GP register and stored to (pdst)
+                 'Idx1' word element from input vector 'in0' is copied to the
+                 GP register and stored to (pdst + stride)
+                 'Idx2' word element from input vector 'in1' is copied to the
+                 GP register and stored to (pdst + 2 * stride)
+                 'Idx3' word element from input vector 'in1' is copied to the
+                 GP register and stored to (pdst + 3 * stride)
 */
 #define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) {  \
   uint32_t out0_m, out1_m, out2_m, out3_m;                          \
@@ -526,10 +516,10 @@
   ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride);  \
 }
 
-/* Description : Store as 8x1 byte block to destination memory from input vector
-   Arguments   : Inputs  - in, pdst
-   Details     : Index 0 double word element from input vector 'in' is copied
-                 and stored to destination memory at (pdst)
+/* Description : Store 8x1 byte block to destination memory from input vector
+   Arguments   : Inputs - in, pdst
+   Details     : Index 0 double word element from 'in' vector is copied to the
+                 GP register and stored to (pdst)
 */
 #define ST8x1_UB(in, pdst) {              \
   uint64_t out0_m;                        \
@@ -538,12 +528,12 @@
   SD(out0_m, pdst);                       \
 }
 
-/* Description : Store as 8x2 byte block to destination memory from input vector
-   Arguments   : Inputs  - in, pdst, stride
-   Details     : Index 0 double word element from input vector 'in' is copied
-                 and stored to destination memory at (pdst)
-                 Index 1 double word element from input vector 'in' is copied
-                 and stored to destination memory at (pdst + stride)
+/* Description : Store 8x2 byte block to destination memory from input vector
+   Arguments   : Inputs - in, pdst, stride
+   Details     : Index 0 double word element from 'in' vector is copied to the
+                 GP register and stored to (pdst)
+                 Index 1 double word element from 'in' vector is copied to the
+                 GP register and stored to (pdst + stride)
 */
 #define ST8x2_UB(in, pdst, stride) {        \
   uint64_t out0_m, out1_m;                  \
@@ -556,17 +546,17 @@
   SD(out1_m, pblk_8x2_m + stride);          \
 }
 
-/* Description : Store as 8x4 byte block to destination memory from input
+/* Description : Store 8x4 byte block to destination memory from input
                  vectors
-   Arguments   : Inputs  - in0, in1, pdst, stride
-   Details     : Index 0 double word element from input vector 'in0' is copied
-                 and stored to destination memory at (pblk_8x4_m)
-                 Index 1 double word element from input vector 'in0' is copied
-                 and stored to destination memory at (pblk_8x4_m + stride)
-                 Index 0 double word element from input vector 'in1' is copied
-                 and stored to destination memory at (pblk_8x4_m + 2 * stride)
-                 Index 1 double word element from input vector 'in1' is copied
-                 and stored to destination memory at (pblk_8x4_m + 3 * stride)
+   Arguments   : Inputs - in0, in1, pdst, stride
+   Details     : Index 0 double word element from 'in0' vector is copied to the
+                 GP register and stored to (pdst)
+                 Index 1 double word element from 'in0' vector is copied to the
+                 GP register and stored to (pdst + stride)
+                 Index 0 double word element from 'in1' vector is copied to the
+                 GP register and stored to (pdst + 2 * stride)
+                 Index 1 double word element from 'in1' vector is copied to the
+                 GP register and stored to (pdst + 3 * stride)
 */
 #define ST8x4_UB(in0, in1, pdst, stride) {                  \
   uint64_t out0_m, out1_m, out2_m, out3_m;                  \
@@ -583,14 +573,10 @@
 /* Description : average with rounding (in0 + in1 + 1) / 2.
    Arguments   : Inputs  - in0, in1, in2, in3,
                  Outputs - out0, out1
-                 Return Type - signed byte
-   Details     : Each byte element from 'in0' vector is added with each byte
-                 element from 'in1' vector. The addition of the elements plus 1
-                (for rounding) is done unsigned with full precision,
-                i.e. the result has one extra bit. Unsigned division by 2
-                (or logical shift right by one bit) is performed before writing
-                the result to vector 'out0'
-                Similar for the pair of 'in2' and 'in3'
+                 Return Type - as per RTYPE
+   Details     : Each unsigned byte element from 'in0' vector is added to the
+                 corresponding unsigned byte element from 'in1' vector. The
+                 rounded average is then calculated and written to 'out0'
 */
 #define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
   out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1);    \
@@ -605,12 +591,12 @@
 }
 #define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
 
-/* Description : Immediate number of columns to slide with zero
+/* Description : Immediate number of elements to slide with zero
    Arguments   : Inputs  - in0, in1, slide_val
                  Outputs - out0, out1
                  Return Type - as per RTYPE
    Details     : Byte elements from 'zero_m' vector are slide into 'in0' by
-                 number of elements specified by 'slide_val'
+                 the value specified in 'slide_val'
 */
 #define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) {          \
   v16i8 zero_m = { 0 };                                              \
@@ -626,12 +612,12 @@
 }
 #define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__)
 
-/* Description : Immediate number of columns to slide
+/* Description : Immediate number of elements to slide
    Arguments   : Inputs  - in0_0, in0_1, in1_0, in1_1, slide_val
                  Outputs - out0, out1
                  Return Type - as per RTYPE
    Details     : Byte elements from 'in0_0' vector are slide into 'in1_0' by
-                 number of elements specified by 'slide_val'
+                 the value specified in 'slide_val'
 */
 #define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) {  \
   out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val);         \
@@ -651,10 +637,8 @@
    Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                  Outputs - out0, out1
                  Return Type - as per RTYPE
-   Details     : Selective byte elements from in0 & in1 are copied to out0 as
-                 per control vector mask0
-                 Selective byte elements from in2 & in3 are copied to out1 as
-                 per control vector mask1
+   Details     : Byte elements from 'in0' & 'in1' are copied selectively to
+                 'out0' as per control vector 'mask0'
 */
 #define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) {  \
   out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0);     \
@@ -673,16 +657,14 @@
 #define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)
 
 /* Description : Dot product of byte vector elements
-   Arguments   : Inputs  - mult0, mult1
-                           cnst0, cnst1
+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                  Outputs - out0, out1
-                 Return Type - unsigned halfword
-   Details     : Unsigned byte elements from mult0 are multiplied with
-                 unsigned byte elements from cnst0 producing a result
+                 Return Type - as per RTYPE
+   Details     : Unsigned byte elements from 'mult0' are multiplied with
+                 unsigned byte elements from 'cnst0' producing a result
                  twice the size of input i.e. unsigned halfword.
-                 Then this multiplication results of adjacent odd-even elements
-                 are added together and stored to the out vector
-                 (2 unsigned halfword results)
+                 The multiplication results of adjacent odd-even elements
+                 are added together and written to the 'out0' vector
 */
 #define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {  \
   out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0);        \
@@ -699,16 +681,14 @@
 #define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
 
 /* Description : Dot product of byte vector elements
-   Arguments   : Inputs  - mult0, mult1
-                           cnst0, cnst1
+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                  Outputs - out0, out1
-                 Return Type - signed halfword
-   Details     : Signed byte elements from mult0 are multiplied with
-                 signed byte elements from cnst0 producing a result
+                 Return Type - as per RTYPE
+   Details     : Signed byte elements from 'mult0' are multiplied with
+                 signed byte elements from 'cnst0' producing a result
                  twice the size of input i.e. signed halfword.
-                 Then this multiplication results of adjacent odd-even elements
-                 are added together and stored to the out vector
-                 (2 signed halfword results)
+                 The multiplication results of adjacent odd-even elements
+                 are added together and written to the 'out0' vector
 */
 #define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {  \
   out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0);        \
@@ -724,16 +704,14 @@
 #define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
 
 /* Description : Dot product of halfword vector elements
-   Arguments   : Inputs  - mult0, mult1
-                           cnst0, cnst1
+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                  Outputs - out0, out1
-                 Return Type - signed word
-   Details     : Signed halfword elements from mult0 are multiplied with
-                 signed halfword elements from cnst0 producing a result
+                 Return Type - as per RTYPE
+   Details     : Signed halfword elements from 'mult0' are multiplied with
+                 signed halfword elements from 'cnst0' producing a result
                  twice the size of input i.e. signed word.
-                 Then this multiplication results of adjacent odd-even elements
-                 are added together and stored to the out vector
-                 (2 signed word results)
+                 The multiplication results of adjacent odd-even elements
+                 are added together and written to the 'out0' vector
 */
 #define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {  \
   out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0);        \
@@ -750,16 +728,14 @@
 #define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
 
 /* Description : Dot product of word vector elements
-   Arguments   : Inputs  - mult0, mult1
-                           cnst0, cnst1
+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                  Outputs - out0, out1
-                 Return Type - signed word
-   Details     : Signed word elements from mult0 are multiplied with
-                 signed word elements from cnst0 producing a result
+                 Return Type - as per RTYPE
+   Details     : Signed word elements from 'mult0' are multiplied with
+                 signed word elements from 'cnst0' producing a result
                  twice the size of input i.e. signed double word.
-                 Then this multiplication results of adjacent odd-even elements
-                 are added together and stored to the out vector
-                 (2 signed double word results)
+                 The multiplication results of adjacent odd-even elements
+                 are added together and written to the 'out0' vector
 */
 #define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {  \
   out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0);        \
@@ -768,16 +744,14 @@
 #define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__)
 
 /* Description : Dot product & addition of byte vector elements
-   Arguments   : Inputs  - mult0, mult1
-                           cnst0, cnst1
+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                  Outputs - out0, out1
-                 Return Type - signed halfword
-   Details     : Signed byte elements from mult0 are multiplied with
-                 signed byte elements from cnst0 producing a result
+                 Return Type - as per RTYPE
+   Details     : Signed byte elements from 'mult0' are multiplied with
+                 signed byte elements from 'cnst0' producing a result
                  twice the size of input i.e. signed halfword.
-                 Then this multiplication results of adjacent odd-even elements
-                 are added to the out vector
-                 (2 signed halfword results)
+                 The multiplication results of adjacent odd-even elements
+                 are added to the 'out0' vector
 */
 #define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {         \
   out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0);  \
@@ -793,8 +767,7 @@
 #define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
 
 /* Description : Dot product & addition of halfword vector elements
-   Arguments   : Inputs  - mult0, mult1
-                           cnst0, cnst1
+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                  Outputs - out0, out1
                  Return Type - as per RTYPE
    Details     : Signed halfword elements from 'mult0' are multiplied with
@@ -828,10 +801,10 @@
 /* Description : Minimum values between unsigned elements of
                  either vector are copied to the output vector
    Arguments   : Inputs  - in0, in1, min_vec
-                 Outputs - in0, in1, (in place)
-                 Return Type - unsigned halfword
+                 Outputs - in place operation
+                 Return Type - as per RTYPE
    Details     : Minimum of unsigned halfword element values from 'in0' and
-                 'min_value' are written to output vector 'in0'
+                 'min_vec' are written to output vector 'in0'
 */
 #define MIN_UH2(RTYPE, in0, in1, min_vec) {         \
   in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec);  \
@@ -847,8 +820,8 @@
 
 /* Description : Clips all signed halfword elements of input vector
                  between 0 & 255
-   Arguments   : Inputs  - in       (input vector)
-                 Outputs - out_m    (output vector with clipped elements)
+   Arguments   : Input  - in
+                 Output - out_m
                  Return Type - signed halfword
 */
 #define CLIP_SH_0_255(in) ({                          \
@@ -868,12 +841,12 @@
   CLIP_SH2_0_255(in2, in3);                   \
 }
 
-/* Description : Addition of 4 signed word elements
-                 4 signed word elements of input vector are added together and
+/* Description : Horizontal addition of 4 signed word elements of input vector
+   Arguments   : Input  - in       (signed word vector)
+                 Output - sum_m    (i32 sum)
+                 Return Type - signed word (GP)
+   Details     : 4 signed word elements of 'in' vector are added together and
                  the resulting integer sum is returned
-   Arguments   : Inputs  - in       (signed word vector)
-                 Outputs - sum_m    (i32 sum)
-                 Return Type - signed word
 */
 #define HADD_SW_S32(in) ({                        \
   v2i64 res0_m, res1_m;                           \
@@ -892,7 +865,7 @@
                  Return Type - as per RTYPE
    Details     : Each unsigned odd byte element from 'in0' is added to
                  even unsigned byte element from 'in0' (pairwise) and the
-                 halfword result is stored in 'out0'
+                 halfword result is written to 'out0'
 */
 #define HADD_UB2(RTYPE, in0, in1, out0, out1) {          \
   out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0);  \
@@ -934,11 +907,11 @@
 }
 #define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__)
 
-/* Description : Insert specified word elements from input vectors to 1
-                 destination vector
-   Arguments   : Inputs  - in0, in1, in2, in3 (4 input vectors)
-                 Outputs - out                (output vector)
+/* Description : Set element n of input vector to GPR value
+   Arguments   : Inputs - in0, in1, in2, in3
+                 Output - out
                  Return Type - as per RTYPE
+   Details     : Set element 0 in vector 'out' to the value specified in 'in0'
 */
 #define INSERT_W2(RTYPE, in0, in1, out) {           \
   out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0);  \
@@ -955,12 +928,6 @@
 #define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
 #define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
 
-/* Description : Insert specified double word elements from input vectors to 1
-                 destination vector
-   Arguments   : Inputs  - in0, in1      (2 input vectors)
-                 Outputs - out           (output vector)
-                 Return Type - as per RTYPE
-*/
 #define INSERT_D2(RTYPE, in0, in1, out) {           \
   out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0);  \
   out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1);  \
@@ -972,10 +939,8 @@
    Arguments   : Inputs  - in0, in1, in2, in3
                  Outputs - out0, out1
                  Return Type - as per RTYPE
-   Details     : Even byte elements of 'in0' and even byte
-                 elements of 'in1' are interleaved and copied to 'out0'
-                 Even byte elements of 'in2' and even byte
-                 elements of 'in3' are interleaved and copied to 'out1'
+   Details     : Even byte elements of 'in0' and 'in1' are interleaved
+                 and written to 'out0'
 */
 #define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
   out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0);     \
@@ -988,10 +953,8 @@
    Arguments   : Inputs  - in0, in1, in2, in3
                  Outputs - out0, out1
                  Return Type - as per RTYPE
-   Details     : Even halfword elements of 'in0' and even halfword
-                 elements of 'in1' are interleaved and copied to 'out0'
-                 Even halfword elements of 'in2' and even halfword
-                 elements of 'in3' are interleaved and copied to 'out1'
+   Details     : Even halfword elements of 'in0' and 'in1' are interleaved
+                 and written to 'out0'
 */
 #define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
   out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0);     \
@@ -1018,10 +981,8 @@
    Arguments   : Inputs  - in0, in1, in2, in3
                  Outputs - out0, out1
                  Return Type - as per RTYPE
-   Details     : Even double word elements of 'in0' and even double word
-                 elements of 'in1' are interleaved and copied to 'out0'
-                 Even double word elements of 'in2' and even double word
-                 elements of 'in3' are interleaved and copied to 'out1'
+   Details     : Even double word elements of 'in0' and 'in1' are interleaved
+                 and written to 'out0'
 */
 #define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
   out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0);     \
@@ -1033,10 +994,8 @@
    Arguments   : Inputs  - in0, in1, in2, in3
                  Outputs - out0, out1
                  Return Type - as per RTYPE
-   Details     : Left half of byte elements of in0 and left half of byte
-                 elements of in1 are interleaved and copied to out0.
-                 Left half of byte elements of in2 and left half of byte
-                 elements of in3 are interleaved and copied to out1.
+   Details     : Left half of byte elements of 'in0' and 'in1' are interleaved
+                 and written to 'out0'.
 */
 #define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
   out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1);     \
@@ -1059,10 +1018,8 @@
    Arguments   : Inputs  - in0, in1, in2, in3
                  Outputs - out0, out1
                  Return Type - as per RTYPE
-   Details     : Left half of halfword elements of in0 and left half of halfword
-                 elements of in1 are interleaved and copied to out0.
-                 Left half of halfword elements of in2 and left half of halfword
-                 elements of in3 are interleaved and copied to out1.
+   Details     : Left half of halfword elements of 'in0' and 'in1' are
+                 interleaved and written to 'out0'.
 */
 #define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
   out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1);     \
@@ -1074,10 +1031,8 @@
    Arguments   : Inputs  - in0, in1, in2, in3
                  Outputs - out0, out1
                  Return Type - as per RTYPE
-   Details     : Left half of word elements of in0 and left half of word
-                 elements of in1 are interleaved and copied to out0.
-                 Left half of word elements of in2 and left half of word
-                 elements of in3 are interleaved and copied to out1.
+   Details     : Left half of word elements of 'in0' and 'in1' are interleaved
+                 and written to 'out0'.
 */
 #define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
   out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1);     \
@@ -1087,14 +1042,11 @@
 #define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
 
 /* Description : Interleave right half of byte elements from vectors
-   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
-                 Outputs - out0, out1, out2, out3
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
                  Return Type - as per RTYPE
-   Details     : Right half of byte elements of in0 and right half of byte
-                 elements of in1 are interleaved and copied to out0.
-                 Right half of byte elements of in2 and right half of byte
-                 elements of in3 are interleaved and copied to out1.
-                 Similar for other pairs
+   Details     : Right half of byte elements of 'in0' and 'in1' are interleaved
+                 and written to 'out0'.
 */
 #define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
   out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1);     \
@@ -1126,14 +1078,11 @@
 #define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
 
 /* Description : Interleave right half of halfword elements from vectors
-   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
-                 Outputs - out0, out1, out2, out3
-                 Return Type - signed halfword
-   Details     : Right half of halfword elements of in0 and right half of
-                 halfword elements of in1 are interleaved and copied to out0.
-                 Right half of halfword elements of in2 and right half of
-                 halfword elements of in3 are interleaved and copied to out1.
-                 Similar for other pairs
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Right half of halfword elements of 'in0' and 'in1' are
+                 interleaved and written to 'out0'.
 */
 #define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
   out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1);     \
@@ -1163,13 +1112,11 @@
 #define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)
 
 /* Description : Interleave right half of double word elements from vectors
-   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
-                 Outputs - out0, out1, out2, out3
-                 Return Type - unsigned double word
-   Details     : Right half of double word elements of in0 and right half of
-                 double word elements of in1 are interleaved and copied to out0.
-                 Right half of double word elements of in2 and right half of
-                 double word elements of in3 are interleaved and copied to out1.
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Right half of double word elements of 'in0' and 'in1' are
+                 interleaved and written to 'out0'.
 */
 #define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) {   \
   out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1));  \
@@ -1198,9 +1145,7 @@
                  Outputs - out0, out1
                  Return Type - as per RTYPE
    Details     : Right half of byte elements from 'in0' and 'in1' are
-                 interleaved and stored to 'out0'
-                 Left half of byte elements from 'in0' and 'in1' are
-                 interleaved and stored to 'out1'
+                 interleaved and written to 'out0'
 */
 #define ILVRL_B2(RTYPE, in0, in1, out0, out1) {        \
   out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1);  \
@@ -1226,14 +1171,14 @@
 #define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
 
 /* Description : Saturate the halfword element values to the max
-                 unsigned value of (sat_val+1 bits)
+                 unsigned value of (sat_val + 1) bits
                  The element data width remains unchanged
-   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
-                 Outputs - in0, in1, in2, in3 (in place)
-                 Return Type - unsigned halfword
+   Arguments   : Inputs  - in0, in1, sat_val
+                 Outputs - in place operation
+                 Return Type - as per RTYPE
    Details     : Each unsigned halfword element from 'in0' is saturated to the
-                 value generated with (sat_val+1) bit range.
-                 The results are stored in place
+                 value generated with (sat_val + 1) bit range.
+                 The results are written in place
 */
 #define SAT_UH2(RTYPE, in0, in1, sat_val) {         \
   in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val);  \
@@ -1248,14 +1193,14 @@
 #define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
 
 /* Description : Saturate the halfword element values to the max
-                 unsigned value of (sat_val+1 bits)
+                 unsigned value of (sat_val + 1) bits
                  The element data width remains unchanged
-   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
-                 Outputs - in0, in1, in2, in3 (in place)
-                 Return Type - unsigned halfword
+   Arguments   : Inputs  - in0, in1, sat_val
+                 Outputs - in place operation
+                 Return Type - as per RTYPE
    Details     : Each unsigned halfword element from 'in0' is saturated to the
-                 value generated with (sat_val+1) bit range
-                 The results are stored in place
+                 value generated with (sat_val + 1) bit range
+                 The results are written in place
 */
 #define SAT_SH2(RTYPE, in0, in1, sat_val) {         \
   in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val);  \
@@ -1296,12 +1241,9 @@
    Arguments   : Inputs  - in0, in1, in2, in3
                  Outputs - out0, out1
                  Return Type - as per RTYPE
-   Details     : Even byte elements of in0 are copied to the left half of
-                 out0 & even byte elements of in1 are copied to the right
-                 half of out0.
-                 Even byte elements of in2 are copied to the left half of
-                 out1 & even byte elements of in3 are copied to the right
-                 half of out1.
+   Details     : Even byte elements of 'in0' are copied to the left half of
+                 'out0' & even byte elements of 'in1' are copied to the right
+                 half of 'out0'.
 */
 #define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
   out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1);     \
@@ -1324,12 +1266,9 @@
    Arguments   : Inputs  - in0, in1, in2, in3
                  Outputs - out0, out1
                  Return Type - as per RTYPE
-   Details     : Even halfword elements of in0 are copied to the left half of
-                 out0 & even halfword elements of in1 are copied to the right
-                 half of out0.
-                 Even halfword elements of in2 are copied to the left half of
-                 out1 & even halfword elements of in3 are copied to the right
-                 half of out1.
+   Details     : Even halfword elements of 'in0' are copied to the left half of
+                 'out0' & even halfword elements of 'in1' are copied to the
+                 right half of 'out0'.
 */
 #define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
   out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1);     \
@@ -1348,13 +1287,10 @@
 /* Description : Pack even double word elements of vector pairs
    Arguments   : Inputs  - in0, in1, in2, in3
                  Outputs - out0, out1
-                 Return Type - unsigned byte
-   Details     : Even double elements of in0 are copied to the left half of
-                 out0 & even double elements of in1 are copied to the right
-                 half of out0.
-                 Even double elements of in2 are copied to the left half of
-                 out1 & even double elements of in3 are copied to the right
-                 half of out1.
+                 Return Type - as per RTYPE
+   Details     : Even double elements of 'in0' are copied to the left half of
+                 'out0' & even double elements of 'in1' are copied to the right
+                 half of 'out0'.
 */
 #define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
   out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1);     \
@@ -1372,15 +1308,10 @@
 
 /* Description : Each byte element is logically xor'ed with immediate 128
    Arguments   : Inputs  - in0, in1
-                 Outputs - in0, in1 (in-place)
+                 Outputs - in place operation
                  Return Type - as per RTYPE
    Details     : Each unsigned byte element from input vector 'in0' is
-                 logically xor'ed with 128 and the result is in-place stored in
-                 'in0' vector
-                 Each unsigned byte element from input vector 'in1' is
-                 logically xor'ed with 128 and the result is in-place stored in
-                 'in1' vector
-                 Similar for other pairs
+                 logically xor'ed with 128 and the result is stored in-place.
 */
 #define XORI_B2_128(RTYPE, in0, in1) {         \
   in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128);  \
@@ -1432,8 +1363,7 @@
                  Return Type - as per RTYPE
    Details     : Signed halfword elements from 'in0' are added to signed
                  halfword elements of 'in1'. The result is then signed saturated
-                 between -32768 to +32767 (as per halfword data type)
-                 Similar for other pairs
+                 to the halfword data type range
 */
 #define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
   out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1);    \
@@ -1450,11 +1380,10 @@
 
 /* Description : Shift left all elements of vector (generic for all data types)
    Arguments   : Inputs  - in0, in1, in2, in3, shift
-                 Outputs - in0, in1, in2, in3 (in place)
+                 Outputs - in place operation
                  Return Type - as per input vector RTYPE
    Details     : Each element of vector 'in0' is left shifted by 'shift' and
-                 the result is in place written to 'in0'
-                 Similar for other pairs
+                 the result is written in-place.
 */
 #define SLLI_4V(in0, in1, in2, in3, shift) {  \
   in0 = in0 << shift;                         \
@@ -1466,12 +1395,10 @@
 /* Description : Arithmetic shift right all elements of vector
                  (generic for all data types)
    Arguments   : Inputs  - in0, in1, in2, in3, shift
-                 Outputs - in0, in1, in2, in3 (in place)
+                 Outputs - in place operation
                  Return Type - as per input vector RTYPE
    Details     : Each element of vector 'in0' is right shifted by 'shift' and
-                 the result is in place written to 'in0'
-                 Here, 'shift' is GP variable passed in
-                 Similar for other pairs
+                 the result is written in-place. 'shift' is a GP variable.
 */
 #define SRA_4V(in0, in1, in2, in3, shift) {  \
   in0 = in0 >> shift;                        \
@@ -1502,14 +1429,13 @@
 #define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
 
 /* Description : Shift right arithmetic rounded (immediate)
-   Arguments   : Inputs  - in0, in1, in2, in3, shift
-                 Outputs - in0, in1, in2, in3 (in place)
+   Arguments   : Inputs  - in0, in1, shift
+                 Outputs - in place operation
                  Return Type - as per RTYPE
-   Details     : Each element of vector 'in0' is shifted right arithmetic by
-                 value in 'shift'.
-                 The last discarded bit is added to shifted value for rounding
-                 and the result is in place written to 'in0'
-                 Similar for other pairs
+   Details     : Each element of vector 'in0' is shifted right arithmetically by
+                 the value in 'shift'. The last discarded bit is added to the
+                 shifted value for rounding and the result is written in-place.
+                 'shift' is an immediate value.
 */
 #define SRARI_H2(RTYPE, in0, in1, shift) {        \
   in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift);  \
@@ -1525,16 +1451,6 @@
 #define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
 #define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
 
-/* Description : Shift right arithmetic rounded (immediate)
-   Arguments   : Inputs  - in0, in1, shift
-                 Outputs - in0, in1     (in place)
-                 Return Type - as per RTYPE
-   Details     : Each element of vector 'in0' is shifted right arithmetic by
-                 value in 'shift'.
-                 The last discarded bit is added to shifted value for rounding
-                 and the result is in place written to 'in0'
-                 Similar for other pairs
-*/
 #define SRARI_W2(RTYPE, in0, in1, shift) {        \
   in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift);  \
   in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift);  \
@@ -1581,8 +1497,8 @@
 /* Description : Addition of 2 pairs of vectors
    Arguments   : Inputs  - in0, in1, in2, in3
                  Outputs - out0, out1
-   Details     : Each element from 2 pairs vectors is added and 2 results are
-                 produced
+   Details     : Each element in 'in0' is added to the corresponding element
+                 in 'in1' and the result is written to 'out0'.
 */
 #define ADD2(in0, in1, in2, in3, out0, out1) {  \
   out0 = in0 + in1;                             \
@@ -1597,8 +1513,8 @@
 /* Description : Subtraction of 2 pairs of vectors
    Arguments   : Inputs  - in0, in1, in2, in3
                  Outputs - out0, out1
-   Details     : Each element from 2 pairs vectors is subtracted and 2 results
-                 are produced
+   Details     : Each element in 'in1' is subtracted from the corresponding
+                 element in 'in0' and the result is written to 'out0'.
 */
 #define SUB2(in0, in1, in2, in3, out0, out1) {  \
   out0 = in0 - in1;                             \
@@ -1613,8 +1529,8 @@
 }
 
 /* Description : Sign extend halfword elements from right half of the vector
-   Arguments   : Inputs  - in    (input halfword vector)
-                 Outputs - out   (sign extended word vectors)
+   Arguments   : Input  - in    (halfword vector)
+                 Output - out   (sign extended word vector)
                  Return Type - signed word
    Details     : Sign bit of halfword elements from input vector 'in' is
                  extracted and interleaved with same vector 'in0' to generate
@@ -1628,8 +1544,8 @@
 }
 
 /* Description : Zero extend unsigned byte elements to halfword elements
-   Arguments   : Inputs  - in           (1 input unsigned byte vector)
-                 Outputs - out0, out1   (unsigned 2 halfword vectors)
+   Arguments   : Input   - in          (unsigned byte vector)
+                 Outputs - out0, out1  (unsigned halfword vectors)
                  Return Type - signed halfword
    Details     : Zero extended right half of vector is returned in 'out0'
                  Zero extended left half of vector is returned in 'out1'
@@ -1641,9 +1557,9 @@
 }
 
 /* Description : Sign extend halfword elements from input vector and return
-                 result in pair of vectors
-   Arguments   : Inputs  - in           (1 input halfword vector)
-                 Outputs - out0, out1   (sign extended 2 word vectors)
+                 the result in a pair of vectors
+   Arguments   : Input   - in            (halfword vector)
+                 Outputs - out0, out1   (sign extended word vectors)
                  Return Type - signed word
    Details     : Sign bit of halfword elements from input vector 'in' is
                  extracted and interleaved right with same vector 'in0' to
@@ -1717,13 +1633,10 @@
   out15 = in0 - in15;                                                         \
 }
 
-/* Description : Transposes input 8x8 byte block
+/* Description : Transpose input 8x8 byte block
    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
-                           (input 8x8 byte block)
                  Outputs - out0, out1, out2, out3, out4, out5, out6, out7
-                           (output 8x8 byte block)
-                 Return Type - unsigned byte
-   Details     :
+                 Return Type - as per RTYPE
 */
 #define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,     \
                         out0, out1, out2, out3, out4, out5, out6, out7) {  \
@@ -1741,12 +1654,11 @@
 }
 #define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
 
-/* Description : Transposes 16x8 block into 8x16 with byte elements in vectors
+/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                            in8, in9, in10, in11, in12, in13, in14, in15
                  Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                  Return Type - unsigned byte
-   Details     :
 */
 #define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,            \
                             in8, in9, in10, in11, in12, in13, in14, in15,      \
@@ -1789,11 +1701,10 @@
   out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
 }
 
-/* Description : Transposes 4x4 block with half word elements in vectors
+/* Description : Transpose 4x4 block with half word elements in vectors
    Arguments   : Inputs  - in0, in1, in2, in3
                  Outputs - out0, out1, out2, out3
                  Return Type - signed halfword
-   Details     :
 */
 #define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) {  \
   v8i16 s0_m, s1_m;                                                       \
@@ -1804,11 +1715,10 @@
   out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2);                   \
 }
 
-/* Description : Transposes 4x8 block with half word elements in vectors
+/* Description : Transpose 4x8 block with half word elements in vectors
    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                  Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                  Return Type - signed halfword
-   Details     :
 */
 #define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,            \
                            out0, out1, out2, out3, out4, out5, out6, out7) {  \
@@ -1832,11 +1742,10 @@
   out7 = zero_m;                                                              \
 }
 
-/* Description : Transposes 8x4 block with half word elements in vectors
+/* Description : Transpose 8x4 block with half word elements in vectors
    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                  Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                  Return Type - signed halfword
-   Details     :
 */
 #define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) {  \
   v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
@@ -1847,11 +1756,10 @@
   ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3);                 \
 }
 
-/* Description : Transposes 8x8 block with half word elements in vectors
+/* Description : Transpose 8x8 block with half word elements in vectors
    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                  Outputs - out0, out1, out2, out3, out4, out5, out6, out7
-                 Return Type - signed halfword
-   Details     :
+                 Return Type - as per RTYPE
 */
 #define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,     \
                        out0, out1, out2, out3, out4, out5, out6, out7) {  \
@@ -1876,11 +1784,10 @@
 }
 #define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
 
-/* Description : Transposes 4x4 block with word elements in vectors
+/* Description : Transpose 4x4 block with word elements in vectors
    Arguments   : Inputs  - in0, in1, in2, in3
                  Outputs - out0, out1, out2, out3
                  Return Type - signed word
-   Details     :
 */
 #define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) {  \
   v4i32 s0_m, s1_m, s2_m, s3_m;                                           \
@@ -1895,15 +1802,12 @@
 }
 
 /* Description : Add block 4x4
-   Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
-                 Outputs -
-                 Return Type - unsigned bytes
+   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
    Details     : Least significant 4 bytes from each input vector are added to
-                 the destination bytes, clipped between 0-255 and then stored.
+                 the destination bytes, clipped between 0-255 and stored.
 */
 #define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) {     \
   uint32_t src0_m, src1_m, src2_m, src3_m;                      \
-  uint32_t out0_m, out1_m, out2_m, out3_m;                      \
   v8i16 inp0_m, inp1_m, res0_m, res1_m;                         \
   v16i8 dst0_m = { 0 };                                         \
   v16i8 dst1_m = { 0 };                                         \
@@ -1917,17 +1821,12 @@
   ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m);         \
   CLIP_SH2_0_255(res0_m, res1_m);                               \
   PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m);  \
-                                                                \
-  out0_m = __msa_copy_u_w((v4i32)dst0_m, 0);                    \
-  out1_m = __msa_copy_u_w((v4i32)dst0_m, 1);                    \
-  out2_m = __msa_copy_u_w((v4i32)dst1_m, 0);                    \
-  out3_m = __msa_copy_u_w((v4i32)dst1_m, 1);                    \
-  SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride);            \
+  ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride);           \
 }
 
 /* Description : Pack even elements of input vectors & xor with 128
-   Arguments   : Inputs  - in0, in1
-                 Outputs - out_m
+   Arguments   : Inputs - in0, in1
+                 Output - out_m
                  Return Type - unsigned byte
    Details     : Signed byte even elements from 'in0' and 'in1' are packed
                  together in one vector and the resulting vector is xor'ed with
@@ -1943,8 +1842,8 @@
 
 /* Description : Converts inputs to unsigned bytes, interleave, average & store
                  as 8x4 unsigned byte block
-   Arguments   : Inputs  - in0, in1, in2, in3, dst0, dst1, dst2, dst3,
-                           pdst, stride
+   Arguments   : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3,
+                          pdst, stride
 */
 #define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3,                      \
                                 dst0, dst1, dst2, dst3, pdst, stride) {  \
@@ -1960,7 +1859,7 @@
 
 /* Description : Pack even byte elements and store byte vector in destination
                  memory
-   Arguments   : Inputs  - in0, in1, pdst
+   Arguments   : Inputs - in0, in1, pdst
 */
 #define PCKEV_ST_SB(in0, in1, pdst) {             \
   v16i8 tmp_m;                                    \
@@ -1970,7 +1869,7 @@
 }
 
 /* Description : Horizontal 2 tap filter kernel code
-   Arguments   : Inputs  - in0, in1, mask, coeff, shift
+   Arguments   : Inputs - in0, in1, mask, coeff, shift
 */
 #define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) ({    \
   v16i8 tmp0_m;                                                \
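
Per pixel, a horizontal 2-tap filter of this kind reduces to a weighted sum of two
neighbouring samples followed by a rounding right shift; a minimal scalar sketch,
assuming the rounding behaviour of SRARI and purely illustrative names:

static uint16_t horiz_2tap_pixel_sketch(const uint8_t *src, int c0, int c1,
                                        int shift) {
  /* Weighted sum of two adjacent pixels, rounded and shifted down. */
  const int round = (shift > 0) ? (1 << (shift - 1)) : 0;
  return (uint16_t)((src[0] * c0 + src[1] * c1 + round) >> shift);
}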
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 64d379c..d26048c 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -23,6 +23,7 @@
 #include "vp9/common/vp9_mv.h"
 #include "vp9/common/vp9_scale.h"
 #include "vp9/common/vp9_seg_common.h"
+#include "vp9/common/vp9_tile_common.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -149,7 +150,10 @@
 
 typedef struct macroblockd {
   struct macroblockd_plane plane[MAX_MB_PLANE];
+
   FRAME_COUNTS *counts;
+  TileInfo tile;
+
   int mi_stride;
 
   MODE_INFO **mi;
diff --git a/vp9/common/vp9_common_data.c b/vp9/common/vp9_common_data.c
index 2aaa009..0bf7cbc 100644
--- a/vp9/common/vp9_common_data.c
+++ b/vp9/common/vp9_common_data.c
@@ -11,27 +11,27 @@
 #include "vp9/common/vp9_common_data.h"
 
 // Log 2 conversion lookup tables for block width and height
-const int b_width_log2_lookup[BLOCK_SIZES] =
+const uint8_t b_width_log2_lookup[BLOCK_SIZES] =
   {0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4};
-const int b_height_log2_lookup[BLOCK_SIZES] =
+const uint8_t b_height_log2_lookup[BLOCK_SIZES] =
   {0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4};
-const int num_4x4_blocks_wide_lookup[BLOCK_SIZES] =
+const uint8_t num_4x4_blocks_wide_lookup[BLOCK_SIZES] =
   {1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16};
-const int num_4x4_blocks_high_lookup[BLOCK_SIZES] =
+const uint8_t num_4x4_blocks_high_lookup[BLOCK_SIZES] =
   {1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16};
 // Log 2 conversion lookup tables for modeinfo width and height
-const int mi_width_log2_lookup[BLOCK_SIZES] =
+const uint8_t mi_width_log2_lookup[BLOCK_SIZES] =
   {0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3};
-const int num_8x8_blocks_wide_lookup[BLOCK_SIZES] =
+const uint8_t num_8x8_blocks_wide_lookup[BLOCK_SIZES] =
   {1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8};
-const int num_8x8_blocks_high_lookup[BLOCK_SIZES] =
+const uint8_t num_8x8_blocks_high_lookup[BLOCK_SIZES] =
   {1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8};
 
 // MIN(3, MIN(b_width_log2(bsize), b_height_log2(bsize)))
-const int size_group_lookup[BLOCK_SIZES] =
+const uint8_t size_group_lookup[BLOCK_SIZES] =
   {0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3};
 
-const int num_pels_log2_lookup[BLOCK_SIZES] =
+const uint8_t num_pels_log2_lookup[BLOCK_SIZES] =
   {4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12};
 
 const PARTITION_TYPE partition_lookup[][BLOCK_SIZES] = {
diff --git a/vp9/common/vp9_common_data.h b/vp9/common/vp9_common_data.h
index a06c9be..95a1179 100644
--- a/vp9/common/vp9_common_data.h
+++ b/vp9/common/vp9_common_data.h
@@ -12,20 +12,21 @@
 #define VP9_COMMON_VP9_COMMON_DATA_H_
 
 #include "vp9/common/vp9_enums.h"
+#include "vpx/vpx_integer.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-extern const int b_width_log2_lookup[BLOCK_SIZES];
-extern const int b_height_log2_lookup[BLOCK_SIZES];
-extern const int mi_width_log2_lookup[BLOCK_SIZES];
-extern const int num_8x8_blocks_wide_lookup[BLOCK_SIZES];
-extern const int num_8x8_blocks_high_lookup[BLOCK_SIZES];
-extern const int num_4x4_blocks_high_lookup[BLOCK_SIZES];
-extern const int num_4x4_blocks_wide_lookup[BLOCK_SIZES];
-extern const int size_group_lookup[BLOCK_SIZES];
-extern const int num_pels_log2_lookup[BLOCK_SIZES];
+extern const uint8_t b_width_log2_lookup[BLOCK_SIZES];
+extern const uint8_t b_height_log2_lookup[BLOCK_SIZES];
+extern const uint8_t mi_width_log2_lookup[BLOCK_SIZES];
+extern const uint8_t num_8x8_blocks_wide_lookup[BLOCK_SIZES];
+extern const uint8_t num_8x8_blocks_high_lookup[BLOCK_SIZES];
+extern const uint8_t num_4x4_blocks_high_lookup[BLOCK_SIZES];
+extern const uint8_t num_4x4_blocks_wide_lookup[BLOCK_SIZES];
+extern const uint8_t size_group_lookup[BLOCK_SIZES];
+extern const uint8_t num_pels_log2_lookup[BLOCK_SIZES];
 extern const PARTITION_TYPE partition_lookup[][BLOCK_SIZES];
 extern const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES];
 extern const TX_SIZE max_txsize_lookup[BLOCK_SIZES];
diff --git a/vp9/common/vp9_mvref_common.c b/vp9/common/vp9_mvref_common.c
index 5f8ee0f..77d1ff4 100644
--- a/vp9/common/vp9_mvref_common.c
+++ b/vp9/common/vp9_mvref_common.c
@@ -14,7 +14,6 @@
 // This function searches the neighbourhood of a given MB/SB
 // to try and find candidate reference vectors.
 static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
-                             const TileInfo *const tile,
                              MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
                              int_mv *mv_ref_list,
                              int block, int mi_row, int mi_col,
@@ -27,6 +26,7 @@
   int context_counter = 0;
   const MV_REF *const  prev_frame_mvs = cm->use_prev_frame_mvs ?
       cm->prev_frame->mvs + mi_row * cm->mi_cols + mi_col : NULL;
+  const TileInfo *const tile = &xd->tile;
 
   // Blank the reference vector list
   memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES);
@@ -147,13 +147,12 @@
 }
 
 void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
-                      const TileInfo *const tile,
                       MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
                       int_mv *mv_ref_list,
                       int mi_row, int mi_col,
                       find_mv_refs_sync sync, void *const data,
                       uint8_t *mode_context) {
-  find_mv_refs_idx(cm, xd, tile, mi, ref_frame, mv_ref_list, -1,
+  find_mv_refs_idx(cm, xd, mi, ref_frame, mv_ref_list, -1,
                    mi_row, mi_col, sync, data, mode_context);
 }
 
@@ -181,7 +180,6 @@
 }
 
 void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
-                                   const TileInfo *const tile,
                                    int block, int ref, int mi_row, int mi_col,
                                    int_mv *nearest_mv, int_mv *near_mv,
                                    uint8_t *mode_context) {
@@ -192,7 +190,7 @@
 
   assert(MAX_MV_REF_CANDIDATES == 2);
 
-  find_mv_refs_idx(cm, xd, tile, mi, mi->mbmi.ref_frame[ref], mv_list, block,
+  find_mv_refs_idx(cm, xd, mi, mi->mbmi.ref_frame[ref], mv_list, block,
                    mi_row, mi_col, NULL, NULL, mode_context);
 
   near_mv->as_int = 0;
diff --git a/vp9/common/vp9_mvref_common.h b/vp9/common/vp9_mvref_common.h
index 621dc14..bd216d4 100644
--- a/vp9/common/vp9_mvref_common.h
+++ b/vp9/common/vp9_mvref_common.h
@@ -209,7 +209,6 @@
 
 typedef void (*find_mv_refs_sync)(void *const data, int mi_row);
 void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
-                      const TileInfo *const tile,
                       MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
                       int_mv *mv_ref_list, int mi_row, int mi_col,
                       find_mv_refs_sync sync, void *const data,
@@ -222,7 +221,6 @@
                            int_mv *mvlist, int_mv *nearest_mv, int_mv *near_mv);
 
 void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
-                                   const TileInfo *const tile,
                                    int block, int ref, int mi_row, int mi_col,
                                    int_mv *nearest_mv, int_mv *near_mv,
                                    uint8_t *mode_context);
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index c97b5d7..22a5efd 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -1077,7 +1077,7 @@
 specialize qw/vp9_full_range_search/;
 
 add_proto qw/void vp9_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
-specialize qw/vp9_temporal_filter_apply sse2/;
+specialize qw/vp9_temporal_filter_apply sse2 msa/;
 
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
 
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 9311d8d..659b848 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -698,7 +698,6 @@
 }
 
 static MB_MODE_INFO *set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd,
-                                 const TileInfo *const tile,
                                  BLOCK_SIZE bsize, int mi_row, int mi_col) {
   const int bw = num_8x8_blocks_wide_lookup[bsize];
   const int bh = num_8x8_blocks_high_lookup[bsize];
@@ -706,6 +705,7 @@
   const int y_mis = MIN(bh, cm->mi_rows - mi_row);
   const int offset = mi_row * cm->mi_stride + mi_col;
   int x, y;
+  const TileInfo *const tile = &xd->tile;
 
   xd->mi = cm->mi_grid_visible + offset;
   xd->mi[0] = &cm->mi[offset];
@@ -726,12 +726,11 @@
 }
 
 static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd,
-                         const TileInfo *const tile,
                          int mi_row, int mi_col,
                          vp9_reader *r, BLOCK_SIZE bsize) {
   VP9_COMMON *const cm = &pbi->common;
   const int less8x8 = bsize < BLOCK_8X8;
-  MB_MODE_INFO *mbmi = set_offsets(cm, xd, tile, bsize, mi_row, mi_col);
+  MB_MODE_INFO *mbmi = set_offsets(cm, xd, bsize, mi_row, mi_col);
 
   if (bsize >= BLOCK_8X8 && (cm->subsampling_x || cm->subsampling_y)) {
     const BLOCK_SIZE uv_subsize =
@@ -741,7 +740,7 @@
                          VPX_CODEC_CORRUPT_FRAME, "Invalid block size.");
   }
 
-  vp9_read_mode_info(pbi, xd, tile, mi_row, mi_col, r);
+  vp9_read_mode_info(pbi, xd, mi_row, mi_col, r);
 
   if (less8x8)
     bsize = BLOCK_8X8;
@@ -795,7 +794,6 @@
 }
 
 static void decode_partition(VP9Decoder *const pbi, MACROBLOCKD *const xd,
-                             const TileInfo *const tile,
                              int mi_row, int mi_col,
                              vp9_reader* r, BLOCK_SIZE bsize) {
   VP9_COMMON *const cm = &pbi->common;
@@ -811,27 +809,27 @@
   partition = read_partition(xd, mi_row, mi_col, bsize, r, has_rows, has_cols);
   subsize = get_subsize(bsize, partition);
   if (bsize == BLOCK_8X8) {
-    decode_block(pbi, xd, tile, mi_row, mi_col, r, subsize);
+    decode_block(pbi, xd, mi_row, mi_col, r, subsize);
   } else {
     switch (partition) {
       case PARTITION_NONE:
-        decode_block(pbi, xd, tile, mi_row, mi_col, r, subsize);
+        decode_block(pbi, xd, mi_row, mi_col, r, subsize);
         break;
       case PARTITION_HORZ:
-        decode_block(pbi, xd, tile, mi_row, mi_col, r, subsize);
+        decode_block(pbi, xd, mi_row, mi_col, r, subsize);
         if (has_rows)
-          decode_block(pbi, xd, tile, mi_row + hbs, mi_col, r, subsize);
+          decode_block(pbi, xd, mi_row + hbs, mi_col, r, subsize);
         break;
       case PARTITION_VERT:
-        decode_block(pbi, xd, tile, mi_row, mi_col, r, subsize);
+        decode_block(pbi, xd, mi_row, mi_col, r, subsize);
         if (has_cols)
-          decode_block(pbi, xd, tile, mi_row, mi_col + hbs, r, subsize);
+          decode_block(pbi, xd, mi_row, mi_col + hbs, r, subsize);
         break;
       case PARTITION_SPLIT:
-        decode_partition(pbi, xd, tile, mi_row, mi_col, r, subsize);
-        decode_partition(pbi, xd, tile, mi_row, mi_col + hbs, r, subsize);
-        decode_partition(pbi, xd, tile, mi_row + hbs, mi_col, r, subsize);
-        decode_partition(pbi, xd, tile, mi_row + hbs, mi_col + hbs, r, subsize);
+        decode_partition(pbi, xd, mi_row, mi_col, r, subsize);
+        decode_partition(pbi, xd, mi_row, mi_col + hbs, r, subsize);
+        decode_partition(pbi, xd, mi_row + hbs, mi_col, r, subsize);
+        decode_partition(pbi, xd, mi_row + hbs, mi_col + hbs, r, subsize);
         break;
       default:
         assert(0 && "Invalid partition type");
@@ -1315,7 +1313,6 @@
   // Load all tile information into tile_data.
   for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
     for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
-      TileInfo tile;
       const TileBuffer *const buf = &tile_buffers[tile_row][tile_col];
       tile_data = pbi->tile_data + tile_cols * tile_row + tile_col;
       tile_data->cm = cm;
@@ -1323,7 +1320,7 @@
       tile_data->xd.corrupted = 0;
       tile_data->xd.counts = cm->frame_parallel_decoding_mode ?
                              NULL : &cm->counts;
-      vp9_tile_init(&tile, tile_data->cm, tile_row, tile_col);
+      vp9_tile_init(&tile_data->xd.tile, tile_data->cm, tile_row, tile_col);
       setup_token_decoder(buf->data, data_end, buf->size, &cm->error,
                           &tile_data->bit_reader, pbi->decrypt_cb,
                           pbi->decrypt_state);
@@ -1345,8 +1342,8 @@
         vp9_zero(tile_data->xd.left_seg_context);
         for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end;
              mi_col += MI_BLOCK_SIZE) {
-          decode_partition(pbi, &tile_data->xd, &tile, mi_row,
-                           mi_col, &tile_data->bit_reader, BLOCK_64X64);
+          decode_partition(pbi, &tile_data->xd, mi_row, mi_col,
+                           &tile_data->bit_reader, BLOCK_64X64);
         }
         pbi->mb.corrupted |= tile_data->xd.corrupted;
         if (pbi->mb.corrupted)
@@ -1419,7 +1416,7 @@
     for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
          mi_col += MI_BLOCK_SIZE) {
       decode_partition(tile_data->pbi, &tile_data->xd,
-                       tile, mi_row, mi_col, &tile_data->bit_reader,
+                       mi_row, mi_col, &tile_data->bit_reader,
                        BLOCK_64X64);
     }
   }
@@ -1543,6 +1540,7 @@
       tile_data->xd.counts = cm->frame_parallel_decoding_mode ?
                              0 : &tile_data->counts;
       vp9_tile_init(tile, cm, 0, buf->col);
+      vp9_tile_init(&tile_data->xd.tile, cm, 0, buf->col);
       setup_token_decoder(buf->data, data_end, buf->size, &cm->error,
                           &tile_data->bit_reader, pbi->decrypt_cb,
                           pbi->decrypt_state);
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 8a8d8dd..cd20c84 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -464,7 +464,6 @@
 
 static void read_inter_block_mode_info(VP9Decoder *const pbi,
                                        MACROBLOCKD *const xd,
-                                       const TileInfo *const tile,
                                        MODE_INFO *const mi,
                                        int mi_row, int mi_col, vp9_reader *r) {
   VP9_COMMON *const cm = &pbi->common;
@@ -482,13 +481,14 @@
   for (ref = 0; ref < 1 + is_compound; ++ref) {
     const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
     RefBuffer *ref_buf = &cm->frame_refs[frame - LAST_FRAME];
+
     xd->block_refs[ref] = ref_buf;
     if ((!vp9_is_valid_scale(&ref_buf->sf)))
       vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM,
                          "Reference frame has invalid dimensions");
     vp9_setup_pre_planes(xd, ref, ref_buf->buf, mi_row, mi_col,
                          &ref_buf->sf);
-    vp9_find_mv_refs(cm, xd, tile, mi, frame, ref_mvs[frame],
+    vp9_find_mv_refs(cm, xd, mi, frame, ref_mvs[frame],
                      mi_row, mi_col, fpm_sync, (void *)pbi, inter_mode_ctx);
   }
 
@@ -531,7 +531,7 @@
         if (b_mode == NEARESTMV || b_mode == NEARMV) {
           uint8_t dummy_mode_ctx[MAX_REF_FRAMES];
           for (ref = 0; ref < 1 + is_compound; ++ref)
-            vp9_append_sub8x8_mvs_for_idx(cm, xd, tile, j, ref, mi_row, mi_col,
+            vp9_append_sub8x8_mvs_for_idx(cm, xd, j, ref, mi_row, mi_col,
                                           &nearest_sub8x8[ref],
                                           &near_sub8x8[ref],
                                           dummy_mode_ctx);
@@ -567,7 +567,6 @@
 
 static void read_inter_frame_mode_info(VP9Decoder *const pbi,
                                        MACROBLOCKD *const xd,
-                                       const TileInfo *const tile,
                                        int mi_row, int mi_col, vp9_reader *r) {
   VP9_COMMON *const cm = &pbi->common;
   MODE_INFO *const mi = xd->mi[0];
@@ -582,13 +581,12 @@
   mbmi->tx_size = read_tx_size(cm, xd, !mbmi->skip || !inter_block, r);
 
   if (inter_block)
-    read_inter_block_mode_info(pbi, xd, tile, mi, mi_row, mi_col, r);
+    read_inter_block_mode_info(pbi, xd, mi, mi_row, mi_col, r);
   else
     read_intra_block_mode_info(cm, xd, mi, r);
 }
 
 void vp9_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd,
-                        const TileInfo *const tile,
                         int mi_row, int mi_col, vp9_reader *r) {
   VP9_COMMON *const cm = &pbi->common;
   MODE_INFO *const mi = xd->mi[0];
@@ -602,7 +600,7 @@
   if (frame_is_intra_only(cm)) {
     read_intra_frame_mode_info(cm, xd, mi_row, mi_col, r);
   } else {
-    read_inter_frame_mode_info(pbi, xd, tile, mi_row, mi_col, r);
+    read_inter_frame_mode_info(pbi, xd, mi_row, mi_col, r);
 
     for (h = 0; h < y_mis; ++h) {
       MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
diff --git a/vp9/decoder/vp9_decodemv.h b/vp9/decoder/vp9_decodemv.h
index dd97d8d..db57b40 100644
--- a/vp9/decoder/vp9_decodemv.h
+++ b/vp9/decoder/vp9_decodemv.h
@@ -18,10 +18,7 @@
 extern "C" {
 #endif
 
-struct TileInfo;
-
 void vp9_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd,
-                        const struct TileInfo *const tile,
                         int mi_row, int mi_col, vp9_reader *r);
 
 #ifdef __cplusplus
diff --git a/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c b/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c
new file mode 100644
index 0000000..4053bff
--- /dev/null
+++ b/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c
@@ -0,0 +1,289 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/mips/msa/vp9_macros_msa.h"
+
+static void temporal_filter_apply_8size_msa(uint8_t *frm1_ptr,
+                                            uint32_t stride,
+                                            uint8_t *frm2_ptr,
+                                            int32_t filt_sth,
+                                            int32_t filt_wgt,
+                                            uint32_t *acc,
+                                            uint16_t *cnt) {
+  uint32_t row;
+  uint64_t f0, f1, f2, f3;
+  v16i8 frm2, frm1 = { 0 };
+  v16i8 frm4, frm3 = { 0 };
+  v16u8 frm_r, frm_l;
+  v8i16 frm2_r, frm2_l;
+  v8i16 diff0, diff1, mod0_h, mod1_h;
+  v4i32 cnst3, cnst16, filt_wt, strength;
+  v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
+  v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
+  v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
+  v4i32 acc0, acc1, acc2, acc3;
+  v8i16 cnt0, cnt1;
+
+  filt_wt = __msa_fill_w(filt_wgt);
+  strength = __msa_fill_w(filt_sth);
+  cnst3 = __msa_ldi_w(3);
+  cnst16 = __msa_ldi_w(16);
+
+  for (row = 2; row--;) {
+    LD4(frm1_ptr, stride, f0, f1, f2, f3);
+    frm1_ptr += (4 * stride);
+
+    LD_SB2(frm2_ptr, 16, frm2, frm4);
+    frm2_ptr += 32;
+
+    LD_SW2(acc, 4, acc0, acc1);
+    LD_SW2(acc + 8, 4, acc2, acc3);
+    LD_SH2(cnt, 8, cnt0, cnt1);
+
+    INSERT_D2_SB(f0, f1, frm1);
+    INSERT_D2_SB(f2, f3, frm3);
+    ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
+    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
+    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
+         diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
+    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+
+    diff0_r = (mod0_w < cnst16);
+    diff0_l = (mod1_w < cnst16);
+    diff1_r = (mod2_w < cnst16);
+    diff1_l = (mod3_w < cnst16);
+
+    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+
+    mod0_w = diff0_r & mod0_w;
+    mod1_w = diff0_l & mod1_w;
+    mod2_w = diff1_r & mod2_w;
+    mod3_w = diff1_l & mod3_w;
+
+    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+    ST_SH2(mod0_h, mod1_h, cnt, 8);
+    cnt += 16;
+
+    UNPCK_UB_SH(frm2, frm2_r, frm2_l);
+    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
+    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
+    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+
+    ST_SW2(mod0_w, mod1_w, acc, 4);
+    acc += 8;
+    ST_SW2(mod2_w, mod3_w, acc, 4);
+    acc += 8;
+
+    LD_SW2(acc, 4, acc0, acc1);
+    LD_SW2(acc + 8, 4, acc2, acc3);
+    LD_SH2(cnt, 8, cnt0, cnt1);
+
+    ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
+    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
+    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
+         diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
+    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+
+    diff0_r = (mod0_w < cnst16);
+    diff0_l = (mod1_w < cnst16);
+    diff1_r = (mod2_w < cnst16);
+    diff1_l = (mod3_w < cnst16);
+
+    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+
+    mod0_w = diff0_r & mod0_w;
+    mod1_w = diff0_l & mod1_w;
+    mod2_w = diff1_r & mod2_w;
+    mod3_w = diff1_l & mod3_w;
+
+    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+    ST_SH2(mod0_h, mod1_h, cnt, 8);
+    cnt += 16;
+    UNPCK_UB_SH(frm4, frm2_r, frm2_l);
+    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
+    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
+    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+
+    ST_SW2(mod0_w, mod1_w, acc, 4);
+    acc += 8;
+    ST_SW2(mod2_w, mod3_w, acc, 4);
+    acc += 8;
+  }
+}
+
+static void temporal_filter_apply_16size_msa(uint8_t *frm1_ptr,
+                                             uint32_t stride,
+                                             uint8_t *frm2_ptr,
+                                             int32_t filt_sth,
+                                             int32_t filt_wgt,
+                                             uint32_t *acc,
+                                             uint16_t *cnt) {
+  uint32_t row;
+  v16i8 frm1, frm2, frm3, frm4;
+  v16u8 frm_r, frm_l;
+  v16i8 zero = { 0 };
+  v8u16 frm2_r, frm2_l;
+  v8i16 diff0, diff1, mod0_h, mod1_h;
+  v4i32 cnst3, cnst16, filt_wt, strength;
+  v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
+  v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
+  v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
+  v4i32 acc0, acc1, acc2, acc3;
+  v8i16 cnt0, cnt1;
+
+  filt_wt = __msa_fill_w(filt_wgt);
+  strength = __msa_fill_w(filt_sth);
+  cnst3 = __msa_ldi_w(3);
+  cnst16 = __msa_ldi_w(16);
+
+  for (row = 8; row--;) {
+    LD_SB2(frm1_ptr, stride, frm1, frm3);
+    frm1_ptr += stride;
+
+    LD_SB2(frm2_ptr, 16, frm2, frm4);
+    frm2_ptr += 16;
+
+    LD_SW2(acc, 4, acc0, acc1);
+    LD_SW2(acc + 8, 4, acc2, acc3);
+    LD_SH2(cnt, 8, cnt0, cnt1);
+
+    ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
+    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
+    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+
+    diff0_r = (mod0_w < cnst16);
+    diff0_l = (mod1_w < cnst16);
+    diff1_r = (mod2_w < cnst16);
+    diff1_l = (mod3_w < cnst16);
+
+    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+
+    mod0_w = diff0_r & mod0_w;
+    mod1_w = diff0_l & mod1_w;
+    mod2_w = diff1_r & mod2_w;
+    mod3_w = diff1_l & mod3_w;
+
+    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+    ST_SH2(mod0_h, mod1_h, cnt, 8);
+    cnt += 16;
+
+    ILVRL_B2_UH(zero, frm2, frm2_r, frm2_l);
+    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
+    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
+    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+
+    ST_SW2(mod0_w, mod1_w, acc, 4);
+    acc += 8;
+    ST_SW2(mod2_w, mod3_w, acc, 4);
+    acc += 8;
+
+    LD_SW2(acc, 4, acc0, acc1);
+    LD_SW2(acc + 8, 4, acc2, acc3);
+    LD_SH2(cnt, 8, cnt0, cnt1);
+
+    ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
+    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
+    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+
+    diff0_r = (mod0_w < cnst16);
+    diff0_l = (mod1_w < cnst16);
+    diff1_r = (mod2_w < cnst16);
+    diff1_l = (mod3_w < cnst16);
+
+    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+
+    mod0_w = diff0_r & mod0_w;
+    mod1_w = diff0_l & mod1_w;
+    mod2_w = diff1_r & mod2_w;
+    mod3_w = diff1_l & mod3_w;
+
+    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+    ST_SH2(mod0_h, mod1_h, cnt, 8);
+    cnt += 16;
+
+    ILVRL_B2_UH(zero, frm4, frm2_r, frm2_l);
+    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
+    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
+    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    ST_SW2(mod0_w, mod1_w, acc, 4);
+    acc += 8;
+    ST_SW2(mod2_w, mod3_w, acc, 4);
+    acc += 8;
+
+    frm1_ptr += stride;
+    frm2_ptr += 16;
+  }
+}
+
+void vp9_temporal_filter_apply_msa(uint8_t *frame1_ptr, uint32_t stride,
+                                   uint8_t *frame2_ptr, uint32_t blk_w,
+                                   uint32_t blk_h, int32_t strength,
+                                   int32_t filt_wgt, uint32_t *accu,
+                                   uint16_t *cnt) {
+  if ((8 * 8) == (blk_w * blk_h)) {
+    temporal_filter_apply_8size_msa(frame1_ptr, stride, frame2_ptr,
+                                    strength, filt_wgt, accu, cnt);
+  } else if ((16 * 16) == (blk_w * blk_h)) {
+    temporal_filter_apply_16size_msa(frame1_ptr, stride, frame2_ptr,
+                                     strength, filt_wgt, accu, cnt);
+  } else {
+    vp9_temporal_filter_apply_c(frame1_ptr, stride, frame2_ptr, blk_w, blk_h,
+                                strength, filt_wgt, accu, cnt);
+  }
+}
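
Both MSA kernels above vectorize the same per-pixel update as the C reference
vp9_temporal_filter_apply_c: square the prediction error, scale by 3, apply a
rounding shift by the strength, clamp to 16, invert, weight, and accumulate. A
rough scalar sketch of that per-pixel step, meant for reading the vector code
rather than as a verbatim copy of the reference:

static void temporal_filter_pixel_sketch(int src, int pred, int strength,
                                         int filt_wgt, uint32_t *acc,
                                         uint16_t *cnt) {
  int mod = (src - pred) * (src - pred) * 3;
  /* Rounding right shift, as SRAR_W4_SW does above. */
  mod = (mod + ((strength > 0) ? (1 << (strength - 1)) : 0)) >> strength;
  /* The (mod < cnst16) mask combined with (16 - mod) clamps the modifier. */
  if (mod > 16) mod = 16;
  mod = (16 - mod) * filt_wgt;
  *cnt += (uint16_t)mod;            /* per-pixel filter count   */
  *acc += (uint32_t)(mod * pred);   /* weighted predictor value */
}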
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index b405975..cd8c4e1 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -230,6 +230,9 @@
     mbmi->segment_id = 0;
     x->encode_breakout = cpi->encode_breakout;
   }
+
+  // required by vp9_append_sub8x8_mvs_for_idx() and vp9_find_best_ref_mvs()
+  xd->tile = *tile;
 }
 
 static void duplicate_mode_info_in_sb(VP9_COMMON *cm, MACROBLOCKD *xd,
@@ -2929,7 +2932,7 @@
     vp9_pick_inter_mode(cpi, x, tile_data, mi_row, mi_col,
                         rd_cost, bsize, ctx);
   else
-    vp9_pick_inter_mode_sub8x8(cpi, x, tile_data, mi_row, mi_col,
+    vp9_pick_inter_mode_sub8x8(cpi, x, mi_row, mi_col,
                                rd_cost, bsize, ctx);
 
   duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize);
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 3d7843e..16640fe 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -1031,8 +1031,10 @@
       fps.mvr_abs = (double)sum_mvr_abs / mvcount;
       fps.MVc = (double)sum_mvc / mvcount;
       fps.mvc_abs = (double)sum_mvc_abs / mvcount;
-      fps.MVrv = ((double)sum_mvrs - (fps.MVr * fps.MVr / mvcount)) / mvcount;
-      fps.MVcv = ((double)sum_mvcs - (fps.MVc * fps.MVc / mvcount)) / mvcount;
+      fps.MVrv = ((double)sum_mvrs -
+                  ((double)sum_mvr * sum_mvr / mvcount)) / mvcount;
+      fps.MVcv = ((double)sum_mvcs -
+                  ((double)sum_mvc * sum_mvc / mvcount)) / mvcount;
       fps.mv_in_out_count = (double)sum_in_vectors / (mvcount * 2);
       fps.new_mv_count = new_mv_count;
       fps.pcnt_motion = (double)mvcount / num_mbs;
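
This corrects the second-moment term in the motion-vector variance: fps.MVr is already
a mean (sum_mvr / mvcount, computed the same way as fps.MVc above), so the old
expression fps.MVr * fps.MVr / mvcount subtracted only (sum_mvr)^2 / mvcount^3 rather
than the (sum_mvr)^2 / mvcount required by the standard form, written in LaTeX below;
MVcv is fixed the same way for the column component.

\mathrm{MVrv} = \frac{\sum_i \mathrm{mvr}_i^2 - \bigl(\sum_i \mathrm{mvr}_i\bigr)^2 / n}{n}, \qquad n = \mathrm{mvcount}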
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 3eaa990..a627136 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -1179,7 +1179,7 @@
                            sf, sf);
 
       if (cm->use_prev_frame_mvs)
-        vp9_find_mv_refs(cm, xd, tile_info, xd->mi[0], ref_frame,
+        vp9_find_mv_refs(cm, xd, xd->mi[0], ref_frame,
                          candidates, mi_row, mi_col, NULL, NULL,
                          xd->mi[0]->mbmi.mode_context);
       else
@@ -1623,11 +1623,9 @@
 }
 
 void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
-                                TileDataEnc *tile_data,
                                 int mi_row, int mi_col, RD_COST *rd_cost,
                                 BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
   VP9_COMMON *const cm = &cpi->common;
-  TileInfo *const tile_info = &tile_data->tile_info;
   SPEED_FEATURES *const sf = &cpi->sf;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
@@ -1659,7 +1657,7 @@
                              &cm->frame_refs[ref_frame - 1].sf;
       vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col,
                            sf, sf);
-      vp9_find_mv_refs(cm, xd, tile_info, xd->mi[0], ref_frame,
+      vp9_find_mv_refs(cm, xd, xd->mi[0], ref_frame,
                        candidates, mi_row, mi_col, NULL, NULL,
                        xd->mi[0]->mbmi.mode_context);
 
@@ -1733,7 +1731,7 @@
 
         b_mv[ZEROMV].as_int = 0;
         b_mv[NEWMV].as_int = INVALID_MV;
-        vp9_append_sub8x8_mvs_for_idx(cm, xd, tile_info, i, 0, mi_row, mi_col,
+        vp9_append_sub8x8_mvs_for_idx(cm, xd, i, 0, mi_row, mi_col,
                                       &b_mv[NEARESTMV],
                                       &b_mv[NEARMV],
                                       xd->mi[0]->mbmi.mode_context);
diff --git a/vp9/encoder/vp9_pickmode.h b/vp9/encoder/vp9_pickmode.h
index 11f4409..a43bb81 100644
--- a/vp9/encoder/vp9_pickmode.h
+++ b/vp9/encoder/vp9_pickmode.h
@@ -27,7 +27,6 @@
                          PICK_MODE_CONTEXT *ctx);
 
 void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
-                                TileDataEnc *tile_data,
                                 int mi_row, int mi_col, RD_COST *rd_cost,
                                 BLOCK_SIZE bsize,
                                 PICK_MODE_CONTEXT *ctx);
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 162d4de..3f9b2eb 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -1732,7 +1732,6 @@
 }
 
 static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
-                                        const TileInfo * const tile,
                                         int_mv *best_ref_mv,
                                         int_mv *second_best_ref_mv,
                                         int64_t best_rd, int *returntotrate,
@@ -1802,7 +1801,7 @@
       for (ref = 0; ref < 1 + has_second_rf; ++ref) {
         const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
         frame_mv[ZEROMV][frame].as_int = 0;
-        vp9_append_sub8x8_mvs_for_idx(cm, xd, tile, i, ref, mi_row, mi_col,
+        vp9_append_sub8x8_mvs_for_idx(cm, xd, i, ref, mi_row, mi_col,
                                       &frame_mv[NEARESTMV][frame],
                                       &frame_mv[NEARMV][frame],
                                       xd->mi[0]->mbmi.mode_context);
@@ -2199,7 +2198,6 @@
 }
 
 static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
-                               const TileInfo *const tile,
                                MV_REFERENCE_FRAME ref_frame,
                                BLOCK_SIZE block_size,
                                int mi_row, int mi_col,
@@ -2220,7 +2218,7 @@
   vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf);
 
   // Gets an initial list of candidate vectors from neighbours and orders them
-  vp9_find_mv_refs(cm, xd, tile, mi, ref_frame, candidates, mi_row, mi_col,
+  vp9_find_mv_refs(cm, xd, mi, ref_frame, candidates, mi_row, mi_col,
                    NULL, NULL, xd->mi[0]->mbmi.mode_context);
 
   // Candidate refinement carried out at encoder and decoder
@@ -2982,7 +2980,7 @@
     x->pred_mv_sad[ref_frame] = INT_MAX;
     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
       assert(get_ref_frame_buffer(cpi, ref_frame) != NULL);
-      setup_buffer_inter(cpi, x, tile_info, ref_frame, bsize, mi_row, mi_col,
+      setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col,
                          frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
     }
     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
@@ -3714,7 +3712,6 @@
                                    PICK_MODE_CONTEXT *ctx,
                                    int64_t best_rd_so_far) {
   VP9_COMMON *const cm = &cpi->common;
-  TileInfo *const tile_info = &tile_data->tile_info;
   RD_OPT *const rd_opt = &cpi->rd;
   SPEED_FEATURES *const sf = &cpi->sf;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -3778,8 +3775,7 @@
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
-      setup_buffer_inter(cpi, x, tile_info,
-                         ref_frame, bsize, mi_row, mi_col,
+      setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col,
                          frame_mv[NEARESTMV], frame_mv[NEARMV],
                          yv12_mb);
     } else {
@@ -3971,7 +3967,7 @@
             int newbest, rs;
             int64_t rs_rd;
             mbmi->interp_filter = switchable_filter_index;
-            tmp_rd = rd_pick_best_sub8x8_mode(cpi, x, tile_info,
+            tmp_rd = rd_pick_best_sub8x8_mode(cpi, x,
                                               &mbmi->ref_mvs[ref_frame][0],
                                               second_ref, best_yrd, &rate,
                                               &rate_y, &distortion,
@@ -4037,7 +4033,7 @@
       if (!pred_exists) {
         // Handles the special case when a filter that is not in the
         // switchable list (bilinear, 6-tap) is indicated at the frame level
-        tmp_rd = rd_pick_best_sub8x8_mode(cpi, x, tile_info,
+        tmp_rd = rd_pick_best_sub8x8_mode(cpi, x,
                                           &mbmi->ref_mvs[ref_frame][0],
                                           second_ref, best_yrd, &rate, &rate_y,
                                           &distortion, &skippable, &total_sse,
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 96422d7..e78c111 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -160,5 +160,6 @@
 VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct32x32_msa.c
 VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h
 VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_subtract_msa.c
+VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_temporal_filter_msa.c
 
 VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))