Merge "mips msa vp8 loop filter optimization"
diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index 6294af1..4ee4ad4 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -494,6 +494,14 @@
         make_tuple(&vp9_fwht4x4_mmx, &vp9_iwht4x4_16_add_c, 0, VPX_BITS_8)));
 #endif
 
+#if CONFIG_USE_X86INC && HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && \
+    !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+    SSE2, Trans4x4WHT,
+    ::testing::Values(
+        make_tuple(&vp9_fwht4x4_c, &vp9_iwht4x4_16_add_sse2, 0, VPX_BITS_8)));
+#endif
+
 #if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans4x4DCT,
diff --git a/tools_common.c b/tools_common.c
index 901734e..8d356af 100644
--- a/tools_common.c
+++ b/tools_common.c
@@ -392,7 +392,7 @@
           (uint16_t *)(src->planes[plane] + y * src->stride[plane]);
       uint8_t *p_dst = dst->planes[plane] + y * dst->stride[plane];
       for (x = 0; x < w; x++) {
-        *p_dst++ = *p_src++;
+        *p_dst++ = (uint8_t)(*p_src++);
       }
     }
   }
diff --git a/vp8/encoder/x86/quantize_ssse3.c b/vp8/encoder/x86/quantize_ssse3.c
index 448217f..14282db 100644
--- a/vp8/encoder/x86/quantize_ssse3.c
+++ b/vp8/encoder/x86/quantize_ssse3.c
@@ -17,7 +17,7 @@
 #include <intrin.h>
 #pragma intrinsic(_BitScanReverse)
 static int bsr(int mask) {
-  int eob;
+  unsigned long eob;
   _BitScanReverse(&eob, mask);
   eob++;
   if (mask == 0)
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index fd8fe3f..8054cdf 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -128,6 +128,11 @@
   ENTROPY_CONTEXT *left_context;
   int16_t seg_dequant[MAX_SEGMENTS][2];
 
+  // number of 4x4s in current block
+  uint16_t n4_w, n4_h;
+  // log2 of n4_w, n4_h
+  uint8_t n4_wl, n4_hl;
+
   // encoder
   const int16_t *dequant;
 };
@@ -144,6 +149,8 @@
 
 typedef struct macroblockd {
   struct macroblockd_plane plane[MAX_MB_PLANE];
+  uint8_t bmode_blocks_wl;
+  uint8_t bmode_blocks_hl;
 
   FRAME_COUNTS *counts;
   TileInfo tile;
@@ -187,12 +194,6 @@
   int bd;
 #endif
 
-  /* dqcoeff are shared by all the planes. So planes must be decoded serially */
-#if CONFIG_VP9_ENCODER
-  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[64 * 64]);
-#else
-  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);
-#endif
   int lossless;
   int corrupted;
 
diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h
index cee1682..cbce2dd 100644
--- a/vp9/common/vp9_idct.h
+++ b/vp9/common/vp9_idct.h
@@ -37,6 +37,10 @@
   _mm_set_epi16((int16_t)(b), (int16_t)(b), (int16_t)(b), (int16_t)(b), \
                 (int16_t)(a), (int16_t)(a), (int16_t)(a), (int16_t)(a))
 
+#define octa_set_epi16(a, b, c, d, e, f, g, h) \
+  _mm_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(c), (int16_t)(d), \
+                 (int16_t)(e), (int16_t)(f), (int16_t)(g), (int16_t)(h))
+
 // Constants:
 //  for (int i = 1; i< 32; ++i)
 //    printf("static const int cospi_%d_64 = %.0f;\n", i,
@@ -158,7 +162,7 @@
 // bd of x uses trans_low with 8+x bits, need to remove 24-x bits
 #define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd))
 #else
-#define WRAPLOW(x, bd) (x)
+#define WRAPLOW(x, bd) ((int32_t)(x))
 #endif  // CONFIG_EMULATE_HARDWARE
 
 void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index 8f1240a..ca0dfc8 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -348,11 +348,12 @@
           (const vp9_prob (*)[PARTITION_TYPES - 1])cm->fc->partition_prob;
 }
 
-static INLINE void vp9_init_macroblockd(VP9_COMMON *cm, MACROBLOCKD *xd) {
+static INLINE void vp9_init_macroblockd(VP9_COMMON *cm, MACROBLOCKD *xd,
+                                        tran_low_t *dqcoeff) {
   int i;
 
   for (i = 0; i < MAX_MB_PLANE; ++i) {
-    xd->plane[i].dqcoeff = xd->dqcoeff;
+    xd->plane[i].dqcoeff = dqcoeff;
     xd->above_context[i] = cm->above_context +
         i * sizeof(*cm->above_context) * 2 * mi_cols_aligned_to_sb(cm->mi_cols);
 
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 524e79f..538f1ed 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -475,7 +475,7 @@
     specialize qw/vp9_iwht4x4_1_add msa/;
 
     add_proto qw/void vp9_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vp9_iwht4x4_16_add msa/;
+    specialize qw/vp9_iwht4x4_16_add msa/, "$sse2_x86inc";
   }
 }
 
diff --git a/vp9/common/x86/vp9_idct_sse2.asm b/vp9/common/x86/vp9_idct_sse2.asm
new file mode 100644
index 0000000..9619e37
--- /dev/null
+++ b/vp9/common/x86/vp9_idct_sse2.asm
@@ -0,0 +1,88 @@
+;
+;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro TRANSFORM_COLS 0
+  paddw           m0,        m2
+  psubw           m3,        m1
+  psubw           m4,        m0,        m3
+  psraw           m4,        1
+  psubw           m5,        m4,        m1 ;b1
+  psubw           m4,        m2 ;c1
+  psubw           m0,        m5
+  paddw           m3,        m4
+                                ; m0 a0
+  SWAP            1,         4  ; m1 c1
+  SWAP            2,         3  ; m2 d1
+  SWAP            3,         5  ; m3 b1
+%endmacro
+
+%macro TRANSPOSE_4X4 0
+  SWAP            4, 0
+  SWAP            5, 2
+  punpcklwd       m4,        m1
+  pshufd          m0,        m4, 0x0e
+  punpcklwd       m5,        m3
+  pshufd          m2,        m5, 0x0e
+  SWAP            1, 4
+  SWAP            3, 0
+  punpckldq       m1,        m5
+  pshufd          m4,        m1, 0x0e
+  punpckldq       m3,        m2
+  pshufd          m0,        m3, 0x0e
+  SWAP            2, 3, 0, 1, 4
+%endmacro
+
+; transposes a 4x4 int16 matrix in xmm0 and xmm1 to the bottom half of xmm0-xmm3
+%macro TRANSPOSE_4X4_WIDE 0
+  mova            m3, m0
+  punpcklwd       m0, m1
+  punpckhwd       m3, m1
+  mova            m2, m0
+  punpcklwd       m0, m3
+  punpckhwd       m2, m3
+  pshufd          m1, m0, 0x0e
+  pshufd          m3, m2, 0x0e
+%endmacro
+
+%macro ADD_STORE_4P_2X 5  ; src1, src2, tmp1, tmp2, zero
+  movq            m%3,       [outputq]
+  movq            m%4,       [outputq + strideq]
+  punpcklbw       m%3,       m%5
+  punpcklbw       m%4,       m%5
+  paddw           m%1,       m%3
+  paddw           m%2,       m%4
+  packuswb        m%1,       m%5
+  packuswb        m%2,       m%5
+  movd            [outputq], m%1
+  movd            [outputq + strideq], m%2
+%endmacro
+
+INIT_XMM sse2
+cglobal iwht4x4_16_add, 3, 4, 7, input, output, stride
+  mova            m0,        [inputq +  0] ;a1
+  mova            m1,        [inputq + 16] ;c1
+
+  psraw           m0,        2
+  psraw           m1,        2
+
+  TRANSPOSE_4X4_WIDE
+  TRANSFORM_COLS
+  TRANSPOSE_4X4
+  TRANSFORM_COLS
+
+  pxor            m4, m4
+  ADD_STORE_4P_2X  0, 1, 5, 6, 4
+  lea             outputq, [outputq + 2 * strideq]
+  ADD_STORE_4P_2X  2, 3, 5, 6, 4
+
+  RET
diff --git a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
index e321dbe..fe8af54 100644
--- a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
+++ b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
@@ -1333,43 +1333,47 @@
   __m128i x0, x1, x2, x3, x4, x5, x6, x7;
   __m128i x8, x9, x10, x11, x12, x13, x14, x15;
 
-  // Read in 16 lines
-  x0 = _mm_loadl_epi64((__m128i *)in0);
-  x8 = _mm_loadl_epi64((__m128i *)in1);
-  x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));
-  x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p));
-  x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p));
-  x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p));
-  x3 = _mm_loadl_epi64((__m128i *)(in0 + 3*in_p));
-  x11 = _mm_loadl_epi64((__m128i *)(in1 + 3*in_p));
-  x4 = _mm_loadl_epi64((__m128i *)(in0 + 4*in_p));
-  x12 = _mm_loadl_epi64((__m128i *)(in1 + 4*in_p));
-  x5 = _mm_loadl_epi64((__m128i *)(in0 + 5*in_p));
-  x13 = _mm_loadl_epi64((__m128i *)(in1 + 5*in_p));
-  x6 = _mm_loadl_epi64((__m128i *)(in0 + 6*in_p));
-  x14 = _mm_loadl_epi64((__m128i *)(in1 + 6*in_p));
-  x7 = _mm_loadl_epi64((__m128i *)(in0 + 7*in_p));
-  x15 = _mm_loadl_epi64((__m128i *)(in1 + 7*in_p));
+  // 2-way interleave w/hoisting of unpacks
+  x0 = _mm_loadl_epi64((__m128i *)in0);  // 1
+  x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));  // 3
+  x0 = _mm_unpacklo_epi8(x0, x1);  // 1
 
-  x0 = _mm_unpacklo_epi8(x0, x1);
-  x1 = _mm_unpacklo_epi8(x2, x3);
-  x2 = _mm_unpacklo_epi8(x4, x5);
-  x3 = _mm_unpacklo_epi8(x6, x7);
+  x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p));  // 5
+  x3 = _mm_loadl_epi64((__m128i *)(in0 + 3*in_p));  // 7
+  x1 = _mm_unpacklo_epi8(x2, x3);  // 2
 
-  x8 = _mm_unpacklo_epi8(x8, x9);
-  x9 = _mm_unpacklo_epi8(x10, x11);
-  x10 = _mm_unpacklo_epi8(x12, x13);
-  x11 = _mm_unpacklo_epi8(x14, x15);
+  x4 = _mm_loadl_epi64((__m128i *)(in0 + 4*in_p));  // 9
+  x5 = _mm_loadl_epi64((__m128i *)(in0 + 5*in_p));  // 11
+  x2 = _mm_unpacklo_epi8(x4, x5);  // 3
 
-  x4 = _mm_unpacklo_epi16(x0, x1);
-  x5 = _mm_unpacklo_epi16(x2, x3);
-  x12 = _mm_unpacklo_epi16(x8, x9);
-  x13 = _mm_unpacklo_epi16(x10, x11);
+  x6 = _mm_loadl_epi64((__m128i *)(in0 + 6*in_p));  // 13
+  x7 = _mm_loadl_epi64((__m128i *)(in0 + 7*in_p));  // 15
+  x3 = _mm_unpacklo_epi8(x6, x7);  // 4
+  x4 = _mm_unpacklo_epi16(x0, x1);  // 9
 
-  x6 = _mm_unpacklo_epi32(x4, x5);
-  x7 = _mm_unpackhi_epi32(x4, x5);
-  x14 = _mm_unpacklo_epi32(x12, x13);
-  x15 = _mm_unpackhi_epi32(x12, x13);
+  x8 = _mm_loadl_epi64((__m128i *)in1);  // 2
+  x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p));  // 4
+  x8 = _mm_unpacklo_epi8(x8, x9);  // 5
+  x5 = _mm_unpacklo_epi16(x2, x3);  // 10
+
+  x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p));  // 6
+  x11 = _mm_loadl_epi64((__m128i *)(in1 + 3*in_p));  // 8
+  x9 = _mm_unpacklo_epi8(x10, x11);  // 6
+
+  x12 = _mm_loadl_epi64((__m128i *)(in1 + 4*in_p));  // 10
+  x13 = _mm_loadl_epi64((__m128i *)(in1 + 5*in_p));  // 12
+  x10 = _mm_unpacklo_epi8(x12, x13);  // 7
+  x12 = _mm_unpacklo_epi16(x8, x9);  // 11
+
+  x14 = _mm_loadl_epi64((__m128i *)(in1 + 6*in_p));  // 14
+  x15 = _mm_loadl_epi64((__m128i *)(in1 + 7*in_p));  // 16
+  x11 = _mm_unpacklo_epi8(x14, x15);  // 8
+  x13 = _mm_unpacklo_epi16(x10, x11);  // 12
+
+  x6 = _mm_unpacklo_epi32(x4, x5);  // 13
+  x7 = _mm_unpackhi_epi32(x4, x5);  // 14
+  x14 = _mm_unpacklo_epi32(x12, x13);  // 15
+  x15 = _mm_unpackhi_epi32(x12, x13);  // 16
 
   // Store first 4-line result
   _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
@@ -1405,33 +1409,36 @@
 
     x0 = _mm_loadl_epi64((__m128i *)(in + 0*in_p));  // 00 01 02 03 04 05 06 07
     x1 = _mm_loadl_epi64((__m128i *)(in + 1*in_p));  // 10 11 12 13 14 15 16 17
-    x2 = _mm_loadl_epi64((__m128i *)(in + 2*in_p));  // 20 21 22 23 24 25 26 27
-    x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p));  // 30 31 32 33 34 35 36 37
-    x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p));  // 40 41 42 43 44 45 46 47
-    x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p));  // 50 51 52 53 54 55 56 57
-    x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p));  // 60 61 62 63 64 65 66 67
-    x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p));  // 70 71 72 73 74 75 76 77
     // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
     x0 = _mm_unpacklo_epi8(x0, x1);
+
+    x2 = _mm_loadl_epi64((__m128i *)(in + 2*in_p));  // 20 21 22 23 24 25 26 27
+    x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p));  // 30 31 32 33 34 35 36 37
     // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
     x1 = _mm_unpacklo_epi8(x2, x3);
+
+    x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p));  // 40 41 42 43 44 45 46 47
+    x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p));  // 50 51 52 53 54 55 56 57
     // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
     x2 = _mm_unpacklo_epi8(x4, x5);
+
+    x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p));  // 60 61 62 63 64 65 66 67
+    x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p));  // 70 71 72 73 74 75 76 77
     // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
     x3 = _mm_unpacklo_epi8(x6, x7);
+
     // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
     x4 = _mm_unpacklo_epi16(x0, x1);
     // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
     x5 = _mm_unpacklo_epi16(x2, x3);
     // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
     x6 = _mm_unpacklo_epi32(x4, x5);
-    // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
-    x7 = _mm_unpackhi_epi32(x4, x5);
-
     _mm_storel_pd((double *)(out + 0*out_p),
                   _mm_castsi128_pd(x6));  // 00 10 20 30 40 50 60 70
     _mm_storeh_pd((double *)(out + 1*out_p),
                   _mm_castsi128_pd(x6));  // 01 11 21 31 41 51 61 71
+    // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+    x7 = _mm_unpackhi_epi32(x4, x5);
     _mm_storel_pd((double *)(out + 2*out_p),
                   _mm_castsi128_pd(x7));  // 02 12 22 32 42 52 62 72
     _mm_storeh_pd((double *)(out + 3*out_p),
@@ -1443,13 +1450,13 @@
     x5 = _mm_unpackhi_epi16(x2, x3);
     // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
     x6 = _mm_unpacklo_epi32(x4, x5);
-    // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
-    x7 = _mm_unpackhi_epi32(x4, x5);
-
     _mm_storel_pd((double *)(out + 4*out_p),
                   _mm_castsi128_pd(x6));  // 04 14 24 34 44 54 64 74
     _mm_storeh_pd((double *)(out + 5*out_p),
                   _mm_castsi128_pd(x6));  // 05 15 25 35 45 55 65 75
+    // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+    x7 = _mm_unpackhi_epi32(x4, x5);
+
     _mm_storel_pd((double *)(out + 6*out_p),
                   _mm_castsi128_pd(x7));  // 06 16 26 36 46 56 66 76
     _mm_storeh_pd((double *)(out + 7*out_p),
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 6c64540..b3cf3fd 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -182,35 +182,29 @@
   }
 }
 
-static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block,
-                                    TX_SIZE tx_size, uint8_t *dst, int stride,
-                                    int eob) {
+static void inverse_transform_block_inter(MACROBLOCKD* xd, int plane,
+                                          const TX_SIZE tx_size,
+                                          uint8_t *dst, int stride,
+                                          int eob) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
   if (eob > 0) {
-    TX_TYPE tx_type = DCT_DCT;
     tran_low_t *const dqcoeff = pd->dqcoeff;
 #if CONFIG_VP9_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
       if (xd->lossless) {
-        tx_type = DCT_DCT;
         vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd);
       } else {
-        const PLANE_TYPE plane_type = pd->plane_type;
         switch (tx_size) {
           case TX_4X4:
-            tx_type = get_tx_type_4x4(plane_type, xd, block);
-            vp9_highbd_iht4x4_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+            vp9_highbd_idct4x4_add(dqcoeff, dst, stride, eob, xd->bd);
             break;
           case TX_8X8:
-            tx_type = get_tx_type(plane_type, xd);
-            vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+            vp9_highbd_idct8x8_add(dqcoeff, dst, stride, eob, xd->bd);
             break;
           case TX_16X16:
-            tx_type = get_tx_type(plane_type, xd);
-            vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+            vp9_highbd_idct16x16_add(dqcoeff, dst, stride, eob, xd->bd);
             break;
           case TX_32X32:
-            tx_type = DCT_DCT;
             vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd);
             break;
           default:
@@ -219,25 +213,19 @@
       }
     } else {
       if (xd->lossless) {
-        tx_type = DCT_DCT;
         vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
       } else {
-        const PLANE_TYPE plane_type = pd->plane_type;
         switch (tx_size) {
           case TX_4X4:
-            tx_type = get_tx_type_4x4(plane_type, xd, block);
-            vp9_iht4x4_add(tx_type, dqcoeff, dst, stride, eob);
+            vp9_idct4x4_add(dqcoeff, dst, stride, eob);
             break;
           case TX_8X8:
-            tx_type = get_tx_type(plane_type, xd);
-            vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob);
+            vp9_idct8x8_add(dqcoeff, dst, stride, eob);
             break;
           case TX_16X16:
-            tx_type = get_tx_type(plane_type, xd);
-            vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob);
+            vp9_idct16x16_add(dqcoeff, dst, stride, eob);
             break;
           case TX_32X32:
-            tx_type = DCT_DCT;
             vp9_idct32x32_add(dqcoeff, dst, stride, eob);
             break;
           default:
@@ -248,25 +236,19 @@
     }
 #else
     if (xd->lossless) {
-      tx_type = DCT_DCT;
       vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
     } else {
-      const PLANE_TYPE plane_type = pd->plane_type;
       switch (tx_size) {
         case TX_4X4:
-          tx_type = get_tx_type_4x4(plane_type, xd, block);
-          vp9_iht4x4_add(tx_type, dqcoeff, dst, stride, eob);
+          vp9_idct4x4_add(dqcoeff, dst, stride, eob);
           break;
         case TX_8X8:
-          tx_type = get_tx_type(plane_type, xd);
-          vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob);
+          vp9_idct8x8_add(dqcoeff, dst, stride, eob);
           break;
         case TX_16X16:
-          tx_type = get_tx_type(plane_type, xd);
-          vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob);
+          vp9_idct16x16_add(dqcoeff, dst, stride, eob);
           break;
         case TX_32X32:
-          tx_type = DCT_DCT;
           vp9_idct32x32_add(dqcoeff, dst, stride, eob);
           break;
         default:
@@ -277,7 +259,97 @@
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
     if (eob == 1) {
-      memset(dqcoeff, 0, 2 * sizeof(dqcoeff[0]));
+      dqcoeff[0] = 0;
+    } else {
+      if (tx_size <= TX_16X16 && eob <= 10)
+        memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));
+      else if (tx_size == TX_32X32 && eob <= 34)
+        memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0]));
+      else
+        memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0]));
+    }
+  }
+}
+
+static void inverse_transform_block_intra(MACROBLOCKD* xd, int plane,
+                                          const TX_TYPE tx_type,
+                                          const TX_SIZE tx_size,
+                                          uint8_t *dst, int stride,
+                                          int eob) {
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  if (eob > 0) {
+    tran_low_t *const dqcoeff = pd->dqcoeff;
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      if (xd->lossless) {
+        vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd);
+      } else {
+        switch (tx_size) {
+          case TX_4X4:
+            vp9_highbd_iht4x4_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+            break;
+          case TX_8X8:
+            vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+            break;
+          case TX_16X16:
+            vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+            break;
+          case TX_32X32:
+            vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd);
+            break;
+          default:
+            assert(0 && "Invalid transform size");
+        }
+      }
+    } else {
+      if (xd->lossless) {
+        vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
+      } else {
+        switch (tx_size) {
+          case TX_4X4:
+            vp9_iht4x4_add(tx_type, dqcoeff, dst, stride, eob);
+            break;
+          case TX_8X8:
+            vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob);
+            break;
+          case TX_16X16:
+            vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob);
+            break;
+          case TX_32X32:
+            vp9_idct32x32_add(dqcoeff, dst, stride, eob);
+            break;
+          default:
+            assert(0 && "Invalid transform size");
+            return;
+        }
+      }
+    }
+#else
+    if (xd->lossless) {
+      vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
+    } else {
+      switch (tx_size) {
+        case TX_4X4:
+          vp9_iht4x4_add(tx_type, dqcoeff, dst, stride, eob);
+          break;
+        case TX_8X8:
+          vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob);
+          break;
+        case TX_16X16:
+          vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob);
+          break;
+        case TX_32X32:
+          vp9_idct32x32_add(dqcoeff, dst, stride, eob);
+          break;
+        default:
+          assert(0 && "Invalid transform size");
+          return;
+      }
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    if (eob == 1) {
+      dqcoeff[0] = 0;
     } else {
       if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10)
         memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));
@@ -289,6 +361,16 @@
   }
 }
 
+static INLINE void dec_txfrm_block_to_raster_xy(int bwl,
+                                                TX_SIZE tx_size, int block,
+                                                int *x, int *y) {
+  const int tx_cols_log2 = bwl - tx_size;
+  const int tx_cols = 1 << tx_cols_log2;
+  const int raster_mb = block >> (tx_size << 1);
+  *x = (raster_mb & (tx_cols - 1)) << tx_size;
+  *y = (raster_mb >> tx_cols_log2) << tx_size;
+}
+
 struct intra_args {
   MACROBLOCKD *xd;
   vp9_reader *r;
@@ -296,7 +378,6 @@
 };
 
 static void predict_and_reconstruct_intra_block(int plane, int block,
-                                                BLOCK_SIZE plane_bsize,
                                                 TX_SIZE tx_size, void *arg) {
   struct intra_args *const args = (struct intra_args *)arg;
   MACROBLOCKD *const xd = args->xd;
@@ -306,48 +387,39 @@
                                             : mi->mbmi.uv_mode;
   int x, y;
   uint8_t *dst;
-  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y);
+  const int bwl = pd->n4_wl;
+  dec_txfrm_block_to_raster_xy(bwl, tx_size, block, &x, &y);
   dst = &pd->dst.buf[4 * y * pd->dst.stride + 4 * x];
 
   vp9_predict_intra_block(xd, block >> (tx_size << 1),
-                          b_width_log2_lookup[plane_bsize], tx_size, mode,
+                          bwl, tx_size, mode,
                           dst, pd->dst.stride, dst, pd->dst.stride,
                           x, y, plane);
 
   if (!mi->mbmi.skip) {
+    const TX_TYPE tx_type = (plane || xd->lossless) ?
+        DCT_DCT : intra_mode_to_tx_type_lookup[mode];
     const scan_order *sc = (plane || xd->lossless) ?
-        &vp9_default_scan_orders[tx_size] :
-        &vp9_scan_orders[tx_size][intra_mode_to_tx_type_lookup[mode]];
-    const int eob = vp9_decode_block_tokens(xd, plane, sc,
-                                            plane_bsize, x, y, tx_size,
+        &vp9_default_scan_orders[tx_size] : &vp9_scan_orders[tx_size][tx_type];
+    const int eob = vp9_decode_block_tokens(xd, plane, sc, x, y, tx_size,
                                             args->r, args->seg_id);
-    inverse_transform_block(xd, plane, block, tx_size, dst, pd->dst.stride,
-                            eob);
+    inverse_transform_block_intra(xd, plane, tx_type, tx_size,
+                                  dst, pd->dst.stride, eob);
   }
 }
 
-struct inter_args {
-  MACROBLOCKD *xd;
-  vp9_reader *r;
-  int *eobtotal;
-  int seg_id;
-};
-
-static void reconstruct_inter_block(int plane, int block,
-                                    BLOCK_SIZE plane_bsize,
-                                    TX_SIZE tx_size, void *arg) {
-  struct inter_args *args = (struct inter_args *)arg;
-  MACROBLOCKD *const xd = args->xd;
+static int reconstruct_inter_block(MACROBLOCKD *const xd, vp9_reader *r,
+                                   MB_MODE_INFO *const mbmi, int plane,
+                                   int row, int col, TX_SIZE tx_size) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  int x, y, eob;
   const scan_order *sc = &vp9_default_scan_orders[tx_size];
-  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y);
-  eob = vp9_decode_block_tokens(xd, plane, sc, plane_bsize,
-                                x, y, tx_size, args->r, args->seg_id);
-  inverse_transform_block(xd, plane, block, tx_size,
-                          &pd->dst.buf[4 * y * pd->dst.stride + 4 * x],
-                          pd->dst.stride, eob);
-  *args->eobtotal += eob;
+  const int eob = vp9_decode_block_tokens(xd, plane, sc, col, row, tx_size, r,
+                                          mbmi->segment_id);
+
+  inverse_transform_block_inter(xd, plane, tx_size,
+                            &pd->dst.buf[4 * row * pd->dst.stride + 4 * col],
+                            pd->dst.stride, eob);
+  return eob;
 }
 
 static void build_mc_border(const uint8_t *src, int src_stride,
@@ -648,8 +720,7 @@
 
 static void dec_build_inter_predictors_sb(VP9Decoder *const pbi,
                                           MACROBLOCKD *xd,
-                                          int mi_row, int mi_col,
-                                          BLOCK_SIZE bsize) {
+                                          int mi_row, int mi_col) {
   int plane;
   const int mi_x = mi_col * MI_SIZE;
   const int mi_y = mi_row * MI_SIZE;
@@ -659,15 +730,13 @@
   const int is_compound = has_second_ref(&mi->mbmi);
 
   for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
-    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize,
-                                                        &xd->plane[plane]);
     struct macroblockd_plane *const pd = &xd->plane[plane];
     struct buf_2d *const dst_buf = &pd->dst;
-    const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
-    const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
+    const int num_4x4_w = pd->n4_w;
+    const int num_4x4_h = pd->n4_h;
 
-    const int bw = 4 * num_4x4_w;
-    const int bh = 4 * num_4x4_h;
+    const int n4w_x4 = 4 * num_4x4_w;
+    const int n4h_x4 = 4 * num_4x4_h;
     int ref;
 
     for (ref = 0; ref < 1 + is_compound; ++ref) {
@@ -680,11 +749,10 @@
 
       if (sb_type < BLOCK_8X8) {
         int i = 0, x, y;
-        assert(bsize == BLOCK_8X8);
         for (y = 0; y < num_4x4_h; ++y) {
           for (x = 0; x < num_4x4_w; ++x) {
             const MV mv = average_split_mvs(pd, mi, ref, i++);
-            dec_build_inter_predictors(pbi, xd, plane, bw, bh,
+            dec_build_inter_predictors(pbi, xd, plane, n4w_x4, n4h_x4,
                                        4 * x, 4 * y, 4, 4, mi_x, mi_y, kernel,
                                        sf, pre_buf, dst_buf, &mv,
                                        ref_frame_buf, is_scaled, ref);
@@ -692,8 +760,8 @@
         }
       } else {
         const MV mv = mi->mbmi.mv[ref].as_mv;
-        dec_build_inter_predictors(pbi, xd, plane, bw, bh,
-                                   0, 0, bw, bh, mi_x, mi_y, kernel,
+        dec_build_inter_predictors(pbi, xd, plane, n4w_x4, n4h_x4,
+                                   0, 0, n4w_x4, n4h_x4, mi_x, mi_y, kernel,
                                    sf, pre_buf, dst_buf, &mv, ref_frame_buf,
                                    is_scaled, ref);
       }
@@ -701,24 +769,105 @@
   }
 }
 
+static INLINE TX_SIZE dec_get_uv_tx_size(const MB_MODE_INFO *mbmi,
+                                         int n4_wl, int n4_hl) {
+  // get minimum log2 num4x4s dimension
+  const int x = MIN(n4_wl, n4_hl);
+  return MIN(mbmi->tx_size,  x);
+}
+
+// TODO(slavarnway): Eliminate the foreach_ functions in future commits.
+// NOTE: Jingning removed the foreach_ for recon inter in a previous commit.
+
+typedef void (*dec_foreach_transformed_block_visitor)(int plane, int block,
+                                                      TX_SIZE tx_size,
+                                                      void *arg);
+
+static void dec_foreach_transformed_block_in_plane(
+    const MACROBLOCKD *const xd,
+    int plane,
+    dec_foreach_transformed_block_visitor visit, void *arg) {
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const MB_MODE_INFO* mbmi = &xd->mi[0]->mbmi;
+  // block and transform sizes, in number of 4x4 blocks log 2 ("*_b")
+  // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
+  // transform size varies per plane, look it up in a common way.
+  const TX_SIZE tx_size =
+      plane ? dec_get_uv_tx_size(mbmi, pd->n4_wl, pd->n4_hl)
+              : mbmi->tx_size;
+  const int num_4x4_w = pd->n4_w;
+  const int num_4x4_h = pd->n4_h;
+  const int step = 1 << (tx_size << 1);
+  int i = 0, r, c;
+
+  // If mb_to_right_edge is < 0 we are in a situation in which
+  // the current block size extends into the UMV and we won't
+  // visit the sub blocks that are wholly within the UMV.
+  const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 :
+      xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+  const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 :
+      xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+
+  // Keep track of the row and column of the blocks we use so that we know
+  // if we are in the unrestricted motion border.
+  for (r = 0; r < max_blocks_high; r += (1 << tx_size)) {
+    for (c = 0; c < num_4x4_w; c += (1 << tx_size)) {
+      // Skip visiting the sub blocks that are wholly within the UMV.
+      if (c < max_blocks_wide)
+        visit(plane, i, tx_size, arg);
+      i += step;
+    }
+  }
+}
+
+static void dec_foreach_transformed_block(const MACROBLOCKD* const xd,
+    dec_foreach_transformed_block_visitor visit, void *arg) {
+  int plane;
+
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane)
+    dec_foreach_transformed_block_in_plane(xd, plane, visit, arg);
+}
+
+static INLINE void dec_reset_skip_context(MACROBLOCKD *xd) {
+  int i;
+  for (i = 0; i < MAX_MB_PLANE; i++) {
+    struct macroblockd_plane *const pd = &xd->plane[i];
+    memset(pd->above_context, 0, sizeof(ENTROPY_CONTEXT) * pd->n4_w);
+    memset(pd->left_context, 0, sizeof(ENTROPY_CONTEXT) * pd->n4_h);
+  }
+}
+
+static void set_plane_n4(MACROBLOCKD *const xd, int bw, int bh, int bwl,
+                         int bhl) {
+  int i;
+  for (i = 0; i < MAX_MB_PLANE; i++) {
+    xd->plane[i].n4_w = (bw << 1) >> xd->plane[i].subsampling_x;
+    xd->plane[i].n4_h = (bh << 1) >> xd->plane[i].subsampling_y;
+    xd->plane[i].n4_wl = bwl - xd->plane[i].subsampling_x;
+    xd->plane[i].n4_hl = bhl - xd->plane[i].subsampling_y;
+  }
+}
+
 static MB_MODE_INFO *set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd,
-                                 BLOCK_SIZE bsize, int mi_row, int mi_col) {
-  const int bw = num_8x8_blocks_wide_lookup[bsize];
-  const int bh = num_8x8_blocks_high_lookup[bsize];
-  const int x_mis = MIN(bw, cm->mi_cols - mi_col);
-  const int y_mis = MIN(bh, cm->mi_rows - mi_row);
+                                 BLOCK_SIZE bsize, int mi_row, int mi_col,
+                                 int bw, int bh, int x_mis, int y_mis,
+                                 int bwl, int bhl) {
   const int offset = mi_row * cm->mi_stride + mi_col;
   int x, y;
   const TileInfo *const tile = &xd->tile;
 
   xd->mi = cm->mi_grid_visible + offset;
   xd->mi[0] = &cm->mi[offset];
+  // TODO(slavarnway): Generate sb_type based on bwl and bhl, instead of
+  // passing bsize from decode_partition().
   xd->mi[0]->mbmi.sb_type = bsize;
   for (y = 0; y < y_mis; ++y)
     for (x = !y; x < x_mis; ++x) {
       xd->mi[y * cm->mi_stride + x] = xd->mi[0];
     }
 
+  set_plane_n4(xd, bw, bh, bwl, bhl);
+
   set_skip_context(xd, mi_row, mi_col);
 
   // Distance of Mb to the various image edges. These are specified to 8th pel
@@ -731,10 +880,17 @@
 
 static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd,
                          int mi_row, int mi_col,
-                         vp9_reader *r, BLOCK_SIZE bsize) {
+                         vp9_reader *r, BLOCK_SIZE bsize,
+                         int bwl, int bhl) {
   VP9_COMMON *const cm = &pbi->common;
   const int less8x8 = bsize < BLOCK_8X8;
-  MB_MODE_INFO *mbmi = set_offsets(cm, xd, bsize, mi_row, mi_col);
+  const int bw = 1 << (bwl - 1);
+  const int bh = 1 << (bhl - 1);
+  const int x_mis = MIN(bw, cm->mi_cols - mi_col);
+  const int y_mis = MIN(bh, cm->mi_rows - mi_row);
+
+  MB_MODE_INFO *mbmi = set_offsets(cm, xd, bsize, mi_row, mi_col,
+                                   bw, bh, x_mis, y_mis, bwl, bhl);
 
   if (bsize >= BLOCK_8X8 && (cm->subsampling_x || cm->subsampling_y)) {
     const BLOCK_SIZE uv_subsize =
@@ -744,28 +900,45 @@
                          VPX_CODEC_CORRUPT_FRAME, "Invalid block size.");
   }
 
-  vp9_read_mode_info(pbi, xd, mi_row, mi_col, r);
-
-  if (less8x8)
-    bsize = BLOCK_8X8;
+  vp9_read_mode_info(pbi, xd, mi_row, mi_col, r, x_mis, y_mis);
 
   if (mbmi->skip) {
-    reset_skip_context(xd, bsize);
+    dec_reset_skip_context(xd);
   }
 
   if (!is_inter_block(mbmi)) {
     struct intra_args arg = {xd, r, mbmi->segment_id};
-    vp9_foreach_transformed_block(xd, bsize,
+    dec_foreach_transformed_block(xd,
                                   predict_and_reconstruct_intra_block, &arg);
   } else {
     // Prediction
-    dec_build_inter_predictors_sb(pbi, xd, mi_row, mi_col, bsize);
+    dec_build_inter_predictors_sb(pbi, xd, mi_row, mi_col);
 
     // Reconstruction
     if (!mbmi->skip) {
       int eobtotal = 0;
-      struct inter_args arg = {xd, r, &eobtotal, mbmi->segment_id};
-      vp9_foreach_transformed_block(xd, bsize, reconstruct_inter_block, &arg);
+      int plane;
+
+      for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+        const struct macroblockd_plane *const pd = &xd->plane[plane];
+        const TX_SIZE tx_size =
+            plane ? dec_get_uv_tx_size(mbmi, pd->n4_wl, pd->n4_hl)
+                    : mbmi->tx_size;
+        const int num_4x4_w = pd->n4_w;
+        const int num_4x4_h = pd->n4_h;
+        const int step = (1 << tx_size);
+        int row, col;
+        const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ?
+            0 : xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+        const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ?
+            0 : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+
+        for (row = 0; row < max_blocks_high; row += step)
+          for (col = 0; col < max_blocks_wide; col += step)
+            eobtotal += reconstruct_inter_block(xd, r, mbmi, plane, row, col,
+                                                tx_size);
+      }
+
       if (!less8x8 && eobtotal == 0)
         mbmi->skip = 1;  // skip loopfilter
     }
@@ -774,10 +947,36 @@
   xd->corrupted |= vp9_reader_has_error(r);
 }
 
+static INLINE int dec_partition_plane_context(const MACROBLOCKD *xd,
+                                              int mi_row, int mi_col,
+                                              int bsl) {
+  const PARTITION_CONTEXT *above_ctx = xd->above_seg_context + mi_col;
+  const PARTITION_CONTEXT *left_ctx = xd->left_seg_context + (mi_row & MI_MASK);
+  int above = (*above_ctx >> bsl) & 1 , left = (*left_ctx >> bsl) & 1;
+
+//  assert(bsl >= 0);
+
+  return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
+}
+
+static INLINE void dec_update_partition_context(MACROBLOCKD *xd,
+                                                int mi_row, int mi_col,
+                                                BLOCK_SIZE subsize,
+                                                int bw) {
+  PARTITION_CONTEXT *const above_ctx = xd->above_seg_context + mi_col;
+  PARTITION_CONTEXT *const left_ctx = xd->left_seg_context + (mi_row & MI_MASK);
+
+  // update the partition context at the end notes. set partition bits
+  // of block sizes larger than the current one to be one, and partition
+  // bits of smaller block sizes to be zero.
+  memset(above_ctx, partition_context_lookup[subsize].above, bw);
+  memset(left_ctx, partition_context_lookup[subsize].left, bw);
+}
+
 static PARTITION_TYPE read_partition(MACROBLOCKD *xd, int mi_row, int mi_col,
-                                     BLOCK_SIZE bsize, vp9_reader *r,
-                                     int has_rows, int has_cols) {
-  const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
+                                     vp9_reader *r,
+                                     int has_rows, int has_cols, int bsl) {
+  const int ctx = dec_partition_plane_context(xd, mi_row, mi_col, bsl);
   const vp9_prob *const probs = get_partition_probs(xd, ctx);
   FRAME_COUNTS *counts = xd->counts;
   PARTITION_TYPE p;
@@ -797,11 +996,14 @@
   return p;
 }
 
+// TODO(slavarnway): eliminate bsize and subsize in future commits
 static void decode_partition(VP9Decoder *const pbi, MACROBLOCKD *const xd,
                              int mi_row, int mi_col,
-                             vp9_reader* r, BLOCK_SIZE bsize) {
+                             vp9_reader* r, BLOCK_SIZE bsize, int n4x4_l2) {
   VP9_COMMON *const cm = &pbi->common;
-  const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
+  const int n8x8_l2 = n4x4_l2 - 1;
+  const int num_8x8_wh = 1 << n8x8_l2;
+  const int hbs = num_8x8_wh >> 1;
   PARTITION_TYPE partition;
   BLOCK_SIZE subsize;
   const int has_rows = (mi_row + hbs) < cm->mi_rows;
@@ -810,30 +1012,37 @@
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
-  partition = read_partition(xd, mi_row, mi_col, bsize, r, has_rows, has_cols);
-  subsize = get_subsize(bsize, partition);
-  if (bsize == BLOCK_8X8) {
-    decode_block(pbi, xd, mi_row, mi_col, r, subsize);
+  partition = read_partition(xd, mi_row, mi_col, r, has_rows, has_cols,
+                             n8x8_l2);
+  subsize = subsize_lookup[partition][bsize];  // get_subsize(bsize, partition);
+  if (!hbs) {
+    // calculate bmode block dimensions (log 2)
+    xd->bmode_blocks_wl = 1 >> !!(partition & PARTITION_VERT);
+    xd->bmode_blocks_hl = 1 >> !!(partition & PARTITION_HORZ);
+    decode_block(pbi, xd, mi_row, mi_col, r, subsize, 1, 1);
   } else {
     switch (partition) {
       case PARTITION_NONE:
-        decode_block(pbi, xd, mi_row, mi_col, r, subsize);
+        decode_block(pbi, xd, mi_row, mi_col, r, subsize, n4x4_l2, n4x4_l2);
         break;
       case PARTITION_HORZ:
-        decode_block(pbi, xd, mi_row, mi_col, r, subsize);
+        decode_block(pbi, xd, mi_row, mi_col, r, subsize, n4x4_l2, n8x8_l2);
         if (has_rows)
-          decode_block(pbi, xd, mi_row + hbs, mi_col, r, subsize);
+          decode_block(pbi, xd, mi_row + hbs, mi_col, r, subsize, n4x4_l2,
+                       n8x8_l2);
         break;
       case PARTITION_VERT:
-        decode_block(pbi, xd, mi_row, mi_col, r, subsize);
+        decode_block(pbi, xd, mi_row, mi_col, r, subsize, n8x8_l2, n4x4_l2);
         if (has_cols)
-          decode_block(pbi, xd, mi_row, mi_col + hbs, r, subsize);
+          decode_block(pbi, xd, mi_row, mi_col + hbs, r, subsize, n8x8_l2,
+                       n4x4_l2);
         break;
       case PARTITION_SPLIT:
-        decode_partition(pbi, xd, mi_row, mi_col, r, subsize);
-        decode_partition(pbi, xd, mi_row, mi_col + hbs, r, subsize);
-        decode_partition(pbi, xd, mi_row + hbs, mi_col, r, subsize);
-        decode_partition(pbi, xd, mi_row + hbs, mi_col + hbs, r, subsize);
+        decode_partition(pbi, xd, mi_row, mi_col, r, subsize, n8x8_l2);
+        decode_partition(pbi, xd, mi_row, mi_col + hbs, r, subsize, n8x8_l2);
+        decode_partition(pbi, xd, mi_row + hbs, mi_col, r, subsize, n8x8_l2);
+        decode_partition(pbi, xd, mi_row + hbs, mi_col + hbs, r, subsize,
+                         n8x8_l2);
         break;
       default:
         assert(0 && "Invalid partition type");
@@ -843,7 +1052,7 @@
   // update partition context
   if (bsize >= BLOCK_8X8 &&
       (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
-    update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+    dec_update_partition_context(xd, mi_row, mi_col, subsize, num_8x8_wh);
 }
 
 static void setup_token_decoder(const uint8_t *data,
@@ -1324,11 +1533,12 @@
       tile_data->xd.corrupted = 0;
       tile_data->xd.counts = cm->frame_parallel_decoding_mode ?
                              NULL : &cm->counts;
+      vp9_zero(tile_data->dqcoeff);
       vp9_tile_init(&tile_data->xd.tile, tile_data->cm, tile_row, tile_col);
       setup_token_decoder(buf->data, data_end, buf->size, &cm->error,
                           &tile_data->bit_reader, pbi->decrypt_cb,
                           pbi->decrypt_state);
-      vp9_init_macroblockd(cm, &tile_data->xd);
+      vp9_init_macroblockd(cm, &tile_data->xd, tile_data->dqcoeff);
     }
   }
 
@@ -1346,8 +1556,8 @@
         vp9_zero(tile_data->xd.left_seg_context);
         for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end;
              mi_col += MI_BLOCK_SIZE) {
-          decode_partition(pbi, &tile_data->xd, mi_row, mi_col,
-                           &tile_data->bit_reader, BLOCK_64X64);
+          decode_partition(pbi, &tile_data->xd, mi_row,
+                           mi_col, &tile_data->bit_reader, BLOCK_64X64, 4);
         }
         pbi->mb.corrupted |= tile_data->xd.corrupted;
         if (pbi->mb.corrupted)
@@ -1421,7 +1631,7 @@
          mi_col += MI_BLOCK_SIZE) {
       decode_partition(tile_data->pbi, &tile_data->xd,
                        mi_row, mi_col, &tile_data->bit_reader,
-                       BLOCK_64X64);
+                       BLOCK_64X64, 4);
     }
   }
   return !tile_data->xd.corrupted;
@@ -1543,12 +1753,13 @@
       tile_data->xd.corrupted = 0;
       tile_data->xd.counts = cm->frame_parallel_decoding_mode ?
                              0 : &tile_data->counts;
+      vp9_zero(tile_data->dqcoeff);
       vp9_tile_init(tile, cm, 0, buf->col);
       vp9_tile_init(&tile_data->xd.tile, cm, 0, buf->col);
       setup_token_decoder(buf->data, data_end, buf->size, &cm->error,
                           &tile_data->bit_reader, pbi->decrypt_cb,
                           pbi->decrypt_state);
-      vp9_init_macroblockd(cm, &tile_data->xd);
+      vp9_init_macroblockd(cm, &tile_data->xd, tile_data->dqcoeff);
 
       worker->had_error = 0;
       if (i == num_workers - 1 || n == tile_cols - 1) {
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 0d08f8c..d42a654 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -91,41 +91,44 @@
     return MIN(max_tx_size, tx_mode_to_biggest_tx_size[tx_mode]);
 }
 
-static void set_segment_id(VP9_COMMON *cm, BLOCK_SIZE bsize,
-                           int mi_row, int mi_col, int segment_id) {
-  const int mi_offset = mi_row * cm->mi_cols + mi_col;
-  const int bw = num_8x8_blocks_wide_lookup[bsize];
-  const int bh = num_8x8_blocks_high_lookup[bsize];
-  const int xmis = MIN(cm->mi_cols - mi_col, bw);
-  const int ymis = MIN(cm->mi_rows - mi_row, bh);
+static int dec_get_segment_id(const VP9_COMMON *cm, const uint8_t *segment_ids,
+                              int mi_offset, int x_mis, int y_mis) {
+  int x, y, segment_id = INT_MAX;
+
+  for (y = 0; y < y_mis; y++)
+    for (x = 0; x < x_mis; x++)
+      segment_id = MIN(segment_id,
+                       segment_ids[mi_offset + y * cm->mi_cols + x]);
+
+  assert(segment_id >= 0 && segment_id < MAX_SEGMENTS);
+  return segment_id;
+}
+
+static void set_segment_id(VP9_COMMON *cm, int mi_offset,
+                           int x_mis, int y_mis, int segment_id) {
   int x, y;
 
   assert(segment_id >= 0 && segment_id < MAX_SEGMENTS);
 
-  for (y = 0; y < ymis; y++)
-    for (x = 0; x < xmis; x++)
+  for (y = 0; y < y_mis; y++)
+    for (x = 0; x < x_mis; x++)
       cm->current_frame_seg_map[mi_offset + y * cm->mi_cols + x] = segment_id;
 }
 
 static void copy_segment_id(const VP9_COMMON *cm,
                            const uint8_t *last_segment_ids,
                            uint8_t *current_segment_ids,
-                           BLOCK_SIZE bsize, int mi_row, int mi_col) {
-  const int mi_offset = mi_row * cm->mi_cols + mi_col;
-  const int bw = num_8x8_blocks_wide_lookup[bsize];
-  const int bh = num_8x8_blocks_high_lookup[bsize];
-  const int xmis = MIN(cm->mi_cols - mi_col, bw);
-  const int ymis = MIN(cm->mi_rows - mi_row, bh);
+                           int mi_offset, int x_mis, int y_mis) {
   int x, y;
 
-  for (y = 0; y < ymis; y++)
-    for (x = 0; x < xmis; x++)
+  for (y = 0; y < y_mis; y++)
+    for (x = 0; x < x_mis; x++)
       current_segment_ids[mi_offset + y * cm->mi_cols + x] =  last_segment_ids ?
           last_segment_ids[mi_offset + y * cm->mi_cols + x] : 0;
 }
 
-static int read_intra_segment_id(VP9_COMMON *const cm, BLOCK_SIZE bsize,
-                                 int mi_row, int mi_col,
+static int read_intra_segment_id(VP9_COMMON *const cm, int mi_offset,
+                                 int x_mis, int y_mis,
                                  vp9_reader *r) {
   struct segmentation *const seg = &cm->seg;
   int segment_id;
@@ -135,12 +138,12 @@
 
   if (!seg->update_map) {
     copy_segment_id(cm, cm->last_frame_seg_map, cm->current_frame_seg_map,
-                    bsize, mi_row, mi_col);
+                    mi_offset, x_mis, y_mis);
     return 0;
   }
 
   segment_id = read_segment_id(r, seg);
-  set_segment_id(cm, bsize, mi_row, mi_col, segment_id);
+  set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id);
   return segment_id;
 }
 
@@ -148,18 +151,25 @@
                                  int mi_row, int mi_col, vp9_reader *r) {
   struct segmentation *const seg = &cm->seg;
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
   int predicted_segment_id, segment_id;
+  const int mi_offset = mi_row * cm->mi_cols + mi_col;
+  const int bw = xd->plane[0].n4_w >> 1;
+  const int bh = xd->plane[0].n4_h >> 1;
+
+  // TODO(slavarnway): move x_mis, y_mis into xd ?????
+  const int x_mis = MIN(cm->mi_cols - mi_col, bw);
+  const int y_mis = MIN(cm->mi_rows - mi_row, bh);
 
   if (!seg->enabled)
     return 0;  // Default for disabled segmentation
 
   predicted_segment_id = cm->last_frame_seg_map ?
-      get_segment_id(cm, cm->last_frame_seg_map, bsize, mi_row, mi_col) : 0;
+      dec_get_segment_id(cm, cm->last_frame_seg_map, mi_offset, x_mis, y_mis) :
+      0;
 
   if (!seg->update_map) {
     copy_segment_id(cm, cm->last_frame_seg_map, cm->current_frame_seg_map,
-                    bsize, mi_row, mi_col);
+                    mi_offset, x_mis, y_mis);
     return predicted_segment_id;
   }
 
@@ -171,7 +181,7 @@
   } else {
     segment_id = read_segment_id(r, seg);
   }
-  set_segment_id(cm, bsize, mi_row, mi_col, segment_id);
+  set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id);
   return segment_id;
 }
 
@@ -198,8 +208,15 @@
   const MODE_INFO *left_mi  = xd->left_mi;
   const BLOCK_SIZE bsize = mbmi->sb_type;
   int i;
+  const int mi_offset = mi_row * cm->mi_cols + mi_col;
+  const int bw = xd->plane[0].n4_w >> 1;
+  const int bh = xd->plane[0].n4_h >> 1;
 
-  mbmi->segment_id = read_intra_segment_id(cm, bsize, mi_row, mi_col, r);
+  // TODO(slavarnway): move x_mis, y_mis into xd ?????
+  const int x_mis = MIN(cm->mi_cols - mi_col, bw);
+  const int y_mis = MIN(cm->mi_rows - mi_row, bh);
+
+  mbmi->segment_id = read_intra_segment_id(cm, mi_offset, x_mis, y_mis, r);
   mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r);
   mbmi->tx_size = read_tx_size(cm, xd, 1, r);
   mbmi->ref_frame[0] = INTRA_FRAME;
@@ -519,8 +536,8 @@
                       : cm->interp_filter;
 
   if (bsize < BLOCK_8X8) {
-    const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];  // 1 or 2
-    const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];  // 1 or 2
+    const int num_4x4_w = 1 << xd->bmode_blocks_wl;
+    const int num_4x4_h = 1 << xd->bmode_blocks_hl;
     int idx, idy;
     PREDICTION_MODE b_mode;
     int_mv nearest_sub8x8[2], near_sub8x8[2];
@@ -589,13 +606,10 @@
 }
 
 void vp9_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd,
-                        int mi_row, int mi_col, vp9_reader *r) {
+                        int mi_row, int mi_col, vp9_reader *r,
+                        int x_mis, int y_mis) {
   VP9_COMMON *const cm = &pbi->common;
   MODE_INFO *const mi = xd->mi[0];
-  const int bw = num_8x8_blocks_wide_lookup[mi->mbmi.sb_type];
-  const int bh = num_8x8_blocks_high_lookup[mi->mbmi.sb_type];
-  const int x_mis = MIN(bw, cm->mi_cols - mi_col);
-  const int y_mis = MIN(bh, cm->mi_rows - mi_row);
   MV_REF* frame_mvs = cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
   int w, h;
 
diff --git a/vp9/decoder/vp9_decodemv.h b/vp9/decoder/vp9_decodemv.h
index db57b40..53bac8c 100644
--- a/vp9/decoder/vp9_decodemv.h
+++ b/vp9/decoder/vp9_decodemv.h
@@ -19,7 +19,8 @@
 #endif
 
 void vp9_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd,
-                        int mi_row, int mi_col, vp9_reader *r);
+                        int mi_row, int mi_col, vp9_reader *r,
+                        int x_mis, int y_mis);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h
index 76a8746..af47f85 100644
--- a/vp9/decoder/vp9_decoder.h
+++ b/vp9/decoder/vp9_decoder.h
@@ -32,6 +32,8 @@
   VP9_COMMON *cm;
   vp9_reader bit_reader;
   DECLARE_ALIGNED(16, MACROBLOCKD, xd);
+  /* dqcoeff are shared by all the planes. So planes must be decoded serially */
+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);
 } TileData;
 
 typedef struct TileWorkerData {
@@ -39,6 +41,8 @@
   vp9_reader bit_reader;
   FRAME_COUNTS counts;
   DECLARE_ALIGNED(16, MACROBLOCKD, xd);
+  /* dqcoeff are shared by all the planes. So planes must be decoded serially */
+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);
   struct vpx_internal_error_info error_info;
 } TileWorkerData;
 
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index 4e3ab82..5596c9e 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -205,9 +205,54 @@
   return c;
 }
 
+// TODO(slavarnway): Decode version of vp9_set_context.  Modify vp9_set_context
+// after testing is complete, then delete this version.
+static
+void dec_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
+                      TX_SIZE tx_size, int has_eob,
+                      int aoff, int loff) {
+  ENTROPY_CONTEXT *const a = pd->above_context + aoff;
+  ENTROPY_CONTEXT *const l = pd->left_context + loff;
+  const int tx_size_in_blocks = 1 << tx_size;
+
+  // above
+  if (has_eob && xd->mb_to_right_edge < 0) {
+    int i;
+    const int blocks_wide = pd->n4_w +
+                            (xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+    int above_contexts = tx_size_in_blocks;
+    if (above_contexts + aoff > blocks_wide)
+      above_contexts = blocks_wide - aoff;
+
+    for (i = 0; i < above_contexts; ++i)
+      a[i] = has_eob;
+    for (i = above_contexts; i < tx_size_in_blocks; ++i)
+      a[i] = 0;
+  } else {
+    memset(a, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
+  }
+
+  // left
+  if (has_eob && xd->mb_to_bottom_edge < 0) {
+    int i;
+    const int blocks_high = pd->n4_h +
+                            (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+    int left_contexts = tx_size_in_blocks;
+    if (left_contexts + loff > blocks_high)
+      left_contexts = blocks_high - loff;
+
+    for (i = 0; i < left_contexts; ++i)
+      l[i] = has_eob;
+    for (i = left_contexts; i < tx_size_in_blocks; ++i)
+      l[i] = 0;
+  } else {
+    memset(l, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
+  }
+}
+
 int vp9_decode_block_tokens(MACROBLOCKD *xd,
                             int plane, const scan_order *sc,
-                            BLOCK_SIZE plane_bsize, int x, int y,
+                            int x, int y,
                             TX_SIZE tx_size, vp9_reader *r,
                             int seg_id) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
@@ -217,7 +262,7 @@
   const int eob = decode_coefs(xd, pd->plane_type,
                                pd->dqcoeff, tx_size,
                                dequant, ctx, sc->scan, sc->neighbors, r);
-  vp9_set_contexts(xd, pd, plane_bsize, tx_size, eob > 0, x, y);
+  dec_set_contexts(xd, pd, tx_size, eob > 0, x, y);
   return eob;
 }
 
diff --git a/vp9/decoder/vp9_detokenize.h b/vp9/decoder/vp9_detokenize.h
index ed826a4..cf0e48a 100644
--- a/vp9/decoder/vp9_detokenize.h
+++ b/vp9/decoder/vp9_detokenize.h
@@ -22,7 +22,7 @@
 
 int vp9_decode_block_tokens(MACROBLOCKD *xd,
                             int plane, const scan_order *sc,
-                            BLOCK_SIZE plane_bsize, int x, int y,
+                            int x, int y,
                             TX_SIZE tx_size, vp9_reader *r,
                             int seg_id);
 
diff --git a/vp9/decoder/vp9_reader.c b/vp9/decoder/vp9_reader.c
index 2c96f74..9a46cd7 100644
--- a/vp9/decoder/vp9_reader.c
+++ b/vp9/decoder/vp9_reader.c
@@ -7,12 +7,24 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+#include <stdlib.h>
 
 #include "vpx_ports/mem.h"
 #include "vpx_mem/vpx_mem.h"
 
+#include "./vpx_config.h"
 #include "vp9/decoder/vp9_reader.h"
 
+#include "vpx_util/endian_inl.h"
+
+#if CONFIG_BIG_ENDIAN
+#define BIGENDIFY64(X) (X)
+#define BIGENDIFY32(X) (X)
+#else
+#define BIGENDIFY64(X) BSwap64(X)
+#define BIGENDIFY32(X) BSwap32(X)
+#endif
+
 int vp9_reader_init(vp9_reader *r,
                     const uint8_t *buffer,
                     size_t size,
@@ -39,11 +51,9 @@
   const uint8_t *buffer_start = buffer;
   BD_VALUE value = r->value;
   int count = r->count;
-  int shift = BD_VALUE_SIZE - CHAR_BIT - (count + CHAR_BIT);
-  int loop_end = 0;
   const size_t bytes_left = buffer_end - buffer;
   const size_t bits_left = bytes_left * CHAR_BIT;
-  const int x = (int)(shift + CHAR_BIT - bits_left);
+  int shift = BD_VALUE_SIZE - CHAR_BIT - (count + CHAR_BIT);
 
   if (r->decrypt_cb) {
     size_t n = MIN(sizeof(r->clear_buffer), bytes_left);
@@ -51,17 +61,31 @@
     buffer = r->clear_buffer;
     buffer_start = r->clear_buffer;
   }
+  if (bits_left > BD_VALUE_SIZE) {
+#if UINTPTR_MAX == 0xffffffffffffffff
+      BD_VALUE big_endian_values = BIGENDIFY64(*((const BD_VALUE *) buffer));
+#else
+      BD_VALUE big_endian_values = BIGENDIFY32(*((const BD_VALUE *) buffer));
+#endif
+      const int bits = (shift & 0xfffffff8) + CHAR_BIT;
+      const BD_VALUE nv = big_endian_values >> (BD_VALUE_SIZE - bits);
+      count += bits;
+      buffer += (bits >> 3);
+      value = r->value | (nv << (shift & 0x7));
+  } else {
+    const int bits_over = (int)(shift + CHAR_BIT - bits_left);
+    int loop_end = 0;
+    if (bits_over >= 0) {
+      count += LOTS_OF_BITS;
+      loop_end = bits_over;
+    }
 
-  if (x >= 0) {
-    count += LOTS_OF_BITS;
-    loop_end = x;
-  }
-
-  if (x < 0 || bits_left) {
-    while (shift >= loop_end) {
-      count += CHAR_BIT;
-      value |= (BD_VALUE)*buffer++ << shift;
-      shift -= CHAR_BIT;
+    if (bits_over < 0 || bits_left) {
+      while (shift >= loop_end) {
+        count += CHAR_BIT;
+        value |= (BD_VALUE)*buffer++ << shift;
+        shift -= CHAR_BIT;
+      }
     }
   }
 
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 02d986e..a450945 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -194,7 +194,6 @@
 
   set_mode_info_offsets(cm, x, xd, mi_row, mi_col);
 
-
   mbmi = &xd->mi[0]->mbmi;
 
   // Set up destination pointers.
@@ -2123,38 +2122,6 @@
   BLOCK_64X64
 };
 
-// Checks to see if a macro block is at the edge of the active image.
-// In most cases this is the "real" edge unless there are formatting
-// bars embedded in the stream.
-static int active_edge_sb(VP9_COMP *cpi,
-                          int mi_row, int mi_col) {
-  int is_active_edge = 0;
-  int top_edge = 0;
-  int bottom_edge = cpi->common.mi_rows;
-  int left_edge = 0;
-  int right_edge = cpi->common.mi_cols;
-
-  // For two pass account for any formatting bars detected.
-  if (cpi->oxcf.pass == 2) {
-    TWO_PASS *twopass = &cpi->twopass;
-
-    // The inactive region is specified in MBs not mi units.
-    // The image edge is in the following MB row.
-    top_edge += (int)(twopass->this_frame_stats.inactive_zone_rows * 2);
-
-    bottom_edge -= (int)(twopass->this_frame_stats.inactive_zone_rows * 2);
-    bottom_edge = MAX(top_edge, bottom_edge);
-  }
-
-  if (((top_edge >= mi_row) && (top_edge < (mi_row + MI_BLOCK_SIZE))) ||
-      ((bottom_edge >= mi_row) && (bottom_edge < (mi_row + MI_BLOCK_SIZE))) ||
-      ((left_edge >= mi_col) && (left_edge < (mi_col + MI_BLOCK_SIZE))) ||
-      ((right_edge >= mi_col) && (right_edge < (mi_col + MI_BLOCK_SIZE)))) {
-    is_active_edge = 1;
-  }
-
-  return is_active_edge;
-}
 
 // Look at all the mode_info entries for blocks that are part of this
 // partition and find the min and max values for sb_type.
@@ -2253,7 +2220,7 @@
   // Test for blocks at the edge of the active image.
   // This may be the actual edge of the image or where there are formatting
   // bars.
-  if (active_edge_sb(cpi, mi_row, mi_col)) {
+  if (vp9_active_edge_sb(cpi, mi_row, mi_col)) {
     min_size = BLOCK_4X4;
   } else {
     min_size = MIN(cpi->sf.rd_auto_partition_min_limit,
@@ -2674,8 +2641,9 @@
   }
 
   // PARTITION_HORZ
-  if (partition_horz_allowed && do_rect) {
-    subsize = get_subsize(bsize, PARTITION_HORZ);
+  if (partition_horz_allowed &&
+      (do_rect || vp9_active_h_edge(cpi, mi_row, mi_step))) {
+      subsize = get_subsize(bsize, PARTITION_HORZ);
     if (cpi->sf.adaptive_motion_search)
       load_pred_mv(x, ctx);
     if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
@@ -2721,8 +2689,9 @@
     restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
   }
   // PARTITION_VERT
-  if (partition_vert_allowed && do_rect) {
-    subsize = get_subsize(bsize, PARTITION_VERT);
+  if (partition_vert_allowed &&
+      (do_rect || vp9_active_v_edge(cpi, mi_col, mi_step))) {
+      subsize = get_subsize(bsize, PARTITION_VERT);
 
     if (cpi->sf.adaptive_motion_search)
       load_pred_mv(x, ctx);
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 3130941..659ce72 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -763,7 +763,7 @@
 }
 
 void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
-                               TX_SIZE tx_size, void *arg) {
+                            TX_SIZE tx_size, void *arg) {
   struct encode_b_args* const args = arg;
   MACROBLOCK *const x = args->x;
   MACROBLOCKD *const xd = &x->e_mbd;
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index f27f57a..6c595b7 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -732,7 +732,7 @@
 
   vp9_set_mb_mi(cm, cm->width, cm->height);
   vp9_init_context_buffers(cm);
-  vp9_init_macroblockd(cm, xd);
+  vp9_init_macroblockd(cm, xd, NULL);
   cpi->td.mb.mbmi_ext_base = cpi->mbmi_ext_base;
   memset(cpi->mbmi_ext_base, 0,
          cm->mi_rows * cm->mi_cols * sizeof(*cpi->mbmi_ext_base));
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index a85b70b..61279f8 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -126,6 +126,7 @@
             stats->pcnt_neutral,
             stats->intra_skip_pct,
             stats->inactive_zone_rows,
+            stats->inactive_zone_cols,
             stats->MVr,
             stats->mvr_abs,
             stats->MVc,
@@ -164,6 +165,7 @@
   section->pcnt_neutral = 0.0;
   section->intra_skip_pct = 0.0;
   section->inactive_zone_rows = 0.0;
+  section->inactive_zone_cols = 0.0;
   section->MVr = 0.0;
   section->mvr_abs     = 0.0;
   section->MVc        = 0.0;
@@ -191,6 +193,7 @@
   section->pcnt_neutral += frame->pcnt_neutral;
   section->intra_skip_pct += frame->intra_skip_pct;
   section->inactive_zone_rows += frame->inactive_zone_rows;
+  section->inactive_zone_cols += frame->inactive_zone_cols;
   section->MVr += frame->MVr;
   section->mvr_abs     += frame->mvr_abs;
   section->MVc        += frame->MVc;
@@ -216,6 +219,7 @@
   section->pcnt_neutral -= frame->pcnt_neutral;
   section->intra_skip_pct -= frame->intra_skip_pct;
   section->inactive_zone_rows -= frame->inactive_zone_rows;
+  section->inactive_zone_cols -= frame->inactive_zone_cols;
   section->MVr -= frame->MVr;
   section->mvr_abs     -= frame->mvr_abs;
   section->MVc        -= frame->MVc;
@@ -1050,6 +1054,7 @@
     fps.pcnt_neutral = (double)neutral_count / num_mbs;
     fps.intra_skip_pct = (double)intra_skip_count / num_mbs;
     fps.inactive_zone_rows = (double)image_data_start_row;
+    fps.inactive_zone_cols = (double)0;  // TODO(paulwilkins): fix
 
     if (mvcount > 0) {
       fps.MVr = (double)sum_mvr / mvcount;
diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h
index 0047932..49f9da3 100644
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h
@@ -53,6 +53,7 @@
   double pcnt_neutral;
   double intra_skip_pct;
   double inactive_zone_rows;  // Image mask rows top and bottom.
+  double inactive_zone_cols;  // Image mask columns at left and right edges.
   double MVr;
   double mvr_abs;
   double MVc;
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index e6e17c0..32c1f76 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -57,17 +57,14 @@
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
 
   if (!skip_block) {
-    const int rc = 0;
-    const int coeff = coeff_ptr[rc];
+    const int coeff = coeff_ptr[0];
     const int coeff_sign = (coeff >> 31);
     const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-
-    const int64_t tmp =
-        (clamp(abs_coeff + round_ptr[rc != 0], INT32_MIN, INT32_MAX) *
-         quant) >> 16;
-    qcoeff_ptr[rc] = (tran_low_t)((tmp ^ coeff_sign) - coeff_sign);
-    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr;
-    if (tmp)
+    const int64_t tmp = abs_coeff + round_ptr[0];
+    const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 16);
+    qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr;
+    if (abs_qcoeff)
       eob = 0;
   }
   *eob_ptr = eob + 1;
@@ -89,7 +86,6 @@
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
 
   if (!skip_block) {
-
     tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
                 INT16_MIN, INT16_MAX);
     tmp = (tmp * quant) >> 15;
@@ -117,17 +113,14 @@
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
 
   if (!skip_block) {
-    const int rc = 0;
-    const int coeff = coeff_ptr[rc];
+    const int coeff = coeff_ptr[0];
     const int coeff_sign = (coeff >> 31);
     const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-
-    const int64_t tmp =
-        (clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
-               INT32_MIN, INT32_MAX) * quant) >> 15;
-    qcoeff_ptr[rc] = (tran_low_t)((tmp ^ coeff_sign) - coeff_sign);
-    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2;
-    if (tmp)
+    const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 1);
+    const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 15);
+    qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / 2;
+    if (abs_qcoeff)
       eob = 0;
   }
   *eob_ptr = eob + 1;
@@ -207,15 +200,11 @@
       const int coeff = coeff_ptr[rc];
       const int coeff_sign = (coeff >> 31);
       const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-
-      const int64_t tmp =
-          (clamp(abs_coeff + round_ptr[rc != 0], INT32_MIN, INT32_MAX) *
-           quant_ptr[rc != 0]) >> 16;
-
-      qcoeff_ptr[rc] = (tran_low_t)((tmp ^ coeff_sign) - coeff_sign);
+      const int64_t tmp = abs_coeff + round_ptr[rc != 0];
+      const uint32_t abs_qcoeff = (uint32_t)((tmp * quant_ptr[rc != 0]) >> 16);
+      qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
       dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
-
-      if (tmp)
+      if (abs_qcoeff)
         eob = i;
     }
   }
@@ -287,21 +276,21 @@
 
   if (!skip_block) {
     for (i = 0; i < n_coeffs; i++) {
+      uint32_t abs_qcoeff = 0;
       const int rc = scan[i];
       const int coeff = coeff_ptr[rc];
       const int coeff_sign = (coeff >> 31);
-      int64_t tmp = 0;
       const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
 
       if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) {
-        tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
-                    INT32_MIN, INT32_MAX);
-        tmp = (tmp * quant_ptr[rc != 0]) >> 15;
-        qcoeff_ptr[rc] = (tran_low_t)((tmp ^ coeff_sign) - coeff_sign);
+        const int64_t tmp = abs_coeff
+                           + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+        abs_qcoeff = (uint32_t) ((tmp * quant_ptr[rc != 0]) >> 15);
+        qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
         dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
       }
 
-      if (tmp)
+      if (abs_qcoeff)
         eob = i;
     }
   }
@@ -398,14 +387,13 @@
       const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
 
       if (abs_coeff >= zbins[rc != 0]) {
-        int64_t tmp = clamp(abs_coeff + round_ptr[rc != 0],
-                            INT32_MIN, INT32_MAX);
-        tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
-                  quant_shift_ptr[rc != 0]) >> 16;  // quantization
-        qcoeff_ptr[rc]  = (tran_low_t)((tmp ^ coeff_sign) - coeff_sign);
+        const int64_t tmp1 = abs_coeff + round_ptr[rc != 0];
+        const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+        const uint32_t abs_qcoeff =
+            (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 16);
+        qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
         dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
-
-        if (tmp)
+        if (abs_qcoeff)
           eob = i;
       }
     }
@@ -513,16 +501,14 @@
       const int coeff = coeff_ptr[rc];
       const int coeff_sign = (coeff >> 31);
       const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-      int64_t tmp = clamp(abs_coeff +
-                          ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
-                          INT32_MIN, INT32_MAX);
-      tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
-               quant_shift_ptr[rc != 0]) >> 15;
-
-      qcoeff_ptr[rc] = (tran_low_t)((tmp ^ coeff_sign) - coeff_sign);
+      const int64_t tmp1 = abs_coeff
+                         + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+      const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+      const uint32_t abs_qcoeff =
+          (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
+      qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
       dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
-
-      if (tmp)
+      if (abs_qcoeff)
         eob = idx_arr[i];
     }
   }
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 3a27e89..9985f89 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -2898,6 +2898,77 @@
   *this_rd += (*this_rd * var_factor) / 100;
 }
 
+
+// Do we have an internal image edge (e.g. formatting bars).
+int vp9_internal_image_edge(VP9_COMP *cpi) {
+  return (cpi->oxcf.pass == 2) &&
+    ((cpi->twopass.this_frame_stats.inactive_zone_rows > 0) ||
+    (cpi->twopass.this_frame_stats.inactive_zone_cols > 0));
+}
+
+// Checks to see if a super block is on a horizontal image edge.
+// In most cases this is the "real" edge unless there are formatting
+// bars embedded in the stream.
+int vp9_active_h_edge(VP9_COMP *cpi, int mi_row, int mi_step) {
+  int top_edge = 0;
+  int bottom_edge = cpi->common.mi_rows;
+  int is_active_h_edge = 0;
+
+  // For two pass account for any formatting bars detected.
+  if (cpi->oxcf.pass == 2) {
+    TWO_PASS *twopass = &cpi->twopass;
+
+    // The inactive region is specified in MBs not mi units.
+    // The image edge is in the following MB row.
+    top_edge += (int)(twopass->this_frame_stats.inactive_zone_rows * 2);
+
+    bottom_edge -= (int)(twopass->this_frame_stats.inactive_zone_rows * 2);
+    bottom_edge = MAX(top_edge, bottom_edge);
+  }
+
+  if (((top_edge >= mi_row) && (top_edge < (mi_row + mi_step))) ||
+      ((bottom_edge >= mi_row) && (bottom_edge < (mi_row + mi_step)))) {
+    is_active_h_edge = 1;
+  }
+  return is_active_h_edge;
+}
+
+// Checks to see if a super block is on a vertical image edge.
+// In most cases this is the "real" edge unless there are formatting
+// bars embedded in the stream.
+int vp9_active_v_edge(VP9_COMP *cpi, int mi_col, int mi_step) {
+  int left_edge = 0;
+  int right_edge = cpi->common.mi_cols;
+  int is_active_v_edge = 0;
+
+  // For two pass account for any formatting bars detected.
+  if (cpi->oxcf.pass == 2) {
+    TWO_PASS *twopass = &cpi->twopass;
+
+    // The inactive region is specified in MBs not mi units.
+    // The image edge is in the following MB row.
+    left_edge += (int)(twopass->this_frame_stats.inactive_zone_cols * 2);
+
+    right_edge -= (int)(twopass->this_frame_stats.inactive_zone_cols * 2);
+    right_edge = MAX(left_edge, right_edge);
+  }
+
+  if (((left_edge >= mi_col) && (left_edge < (mi_col + mi_step))) ||
+      ((right_edge >= mi_col) && (right_edge < (mi_col + mi_step)))) {
+    is_active_v_edge = 1;
+  }
+  return is_active_v_edge;
+}
+
+// Checks to see if a super block is at the edge of the active image.
+// In most cases this is the "real" edge unless there are formatting
+// bars embedded in the stream.
+int vp9_active_edge_sb(VP9_COMP *cpi,
+                       int mi_row, int mi_col) {
+  return vp9_active_h_edge(cpi, mi_row, MI_BLOCK_SIZE) ||
+         vp9_active_v_edge(cpi, mi_col, MI_BLOCK_SIZE);
+}
+
 void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi,
                                TileDataEnc *tile_data,
                                MACROBLOCK *x,
@@ -3751,13 +3822,15 @@
   int skip_uv;
   PREDICTION_MODE mode_uv = DC_PRED;
   const int intra_cost_penalty = vp9_get_intra_cost_penalty(
-      cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
+    cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
   int_mv seg_mvs[4][MAX_REF_FRAMES];
   b_mode_info best_bmodes[4];
   int best_skip2 = 0;
   int ref_frame_skip_mask[2] = { 0 };
   int64_t mask_filter = 0;
   int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS];
+  int internal_active_edge =
+    vp9_active_edge_sb(cpi, mi_row, mi_col) && vp9_internal_image_edge(cpi);
 
   x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
   memset(x->zcoeff_blk[TX_4X4], 0, 4);
@@ -3843,7 +3916,8 @@
       continue;
 
     // Test best rd so far against threshold for trying this mode.
-    if (rd_less_than_thresh(best_rd,
+    if (!internal_active_edge &&
+        rd_less_than_thresh(best_rd,
                             rd_opt->threshes[segment_id][bsize][ref_index],
                             tile_data->thresh_freq_fact[bsize][ref_index]))
       continue;
diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h
index 459b032..00ee55c 100644
--- a/vp9/encoder/vp9_rdopt.h
+++ b/vp9/encoder/vp9_rdopt.h
@@ -54,6 +54,11 @@
                                         PICK_MODE_CONTEXT *ctx,
                                         int64_t best_rd_so_far);
 
+int vp9_internal_image_edge(struct VP9_COMP *cpi);
+int vp9_active_h_edge(struct VP9_COMP *cpi, int mi_row, int mi_step);
+int vp9_active_v_edge(struct VP9_COMP *cpi, int mi_col, int mi_step);
+int vp9_active_edge_sb(struct VP9_COMP *cpi, int mi_row, int mi_col);
+
 void vp9_rd_pick_inter_mode_sub8x8(struct VP9_COMP *cpi,
                                    struct TileDataEnc *tile_data,
                                    struct macroblock *x,
@@ -61,6 +66,7 @@
                                    struct RD_COST *rd_cost,
                                    BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
                                    int64_t best_rd_so_far);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 6c6c4ed..e544f9b 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -12,6 +12,8 @@
 
 #include "vp9/encoder/vp9_encoder.h"
 #include "vp9/encoder/vp9_speed_features.h"
+#include "vp9/encoder/vp9_rdopt.h"
+
 
 // Intra only frames, golden frames (except alt ref overlays) and
 // alt ref frames tend to be coded at a higher than ambient quality
@@ -89,8 +91,10 @@
 
   // If this is a two pass clip that fits the criteria for animated or
   // graphics content then reset disable_split_mask for speeds 1-4.
+  // Also if the image edge is internal to the coded area.
   if ((speed >= 1) && (cpi->oxcf.pass == 2) &&
-      (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION)) {
+      ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ||
+       (vp9_internal_image_edge(cpi)))) {
     sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
   }
 
@@ -112,7 +116,12 @@
   sf->allow_skip_recode = 1;
 
   if (speed >= 1) {
-    sf->use_square_partition_only = !frame_is_intra_only(cm);
+    if ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ||
+        vp9_internal_image_edge(cpi)) {
+      sf->use_square_partition_only = frame_is_boosted(cpi);
+    } else {
+      sf->use_square_partition_only = !frame_is_intra_only(cm);
+    }
     sf->less_rectangular_check  = 1;
 
     sf->use_rd_breakout = 1;
@@ -152,6 +161,7 @@
   }
 
   if (speed >= 3) {
+    sf->use_square_partition_only = !frame_is_intra_only(cm);
     sf->tx_size_search_method = frame_is_intra_only(cm) ? USE_FULL_RD
                                                         : USE_LARGESTALL;
     sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED;
diff --git a/vp9/encoder/x86/vp9_dct32x32_sse2_impl.h b/vp9/encoder/x86/vp9_dct32x32_sse2_impl.h
index 003ebd1..5074d31 100644
--- a/vp9/encoder/x86/vp9_dct32x32_sse2_impl.h
+++ b/vp9/encoder/x86/vp9_dct32x32_sse2_impl.h
@@ -28,7 +28,8 @@
         temp_in[j] = intermediate[j * 32 + i];
       vp9_fdct32(temp_in, temp_out, 0);
       for (j = 0; j < 32; ++j)
-        out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
+        out[j + i * 32] =
+            (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
     }
 }
   #define HIGH_FDCT32x32_2D_C vp9_highbd_fdct32x32_c
@@ -42,7 +43,7 @@
         temp_in[j] = intermediate[j * 32 + i];
       vp9_fdct32(temp_in, temp_out, 1);
       for (j = 0; j < 32; ++j)
-        out[j + i * 32] = temp_out[j];
+        out[j + i * 32] = (tran_low_t)temp_out[j];
     }
 }
   #define HIGH_FDCT32x32_2D_C vp9_highbd_fdct32x32_rd_c
diff --git a/vp9/encoder/x86/vp9_dct_sse2_impl.h b/vp9/encoder/x86/vp9_dct_sse2_impl.h
index 11bf5a2..86e9ecf 100644
--- a/vp9/encoder/x86/vp9_dct_sse2_impl.h
+++ b/vp9/encoder/x86/vp9_dct_sse2_impl.h
@@ -40,35 +40,35 @@
   // These are the coefficients used for the multiplies.
   // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64),
   // where cospi_N_64 = cos(N pi /64)
-  const __m128i k__cospi_A = _mm_setr_epi16(cospi_16_64, cospi_16_64,
+  const __m128i k__cospi_A = octa_set_epi16(cospi_16_64, cospi_16_64,
                                             cospi_16_64, cospi_16_64,
                                             cospi_16_64, -cospi_16_64,
                                             cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_B = _mm_setr_epi16(cospi_16_64, -cospi_16_64,
+  const __m128i k__cospi_B = octa_set_epi16(cospi_16_64, -cospi_16_64,
                                             cospi_16_64, -cospi_16_64,
                                             cospi_16_64, cospi_16_64,
                                             cospi_16_64, cospi_16_64);
-  const __m128i k__cospi_C = _mm_setr_epi16(cospi_8_64, cospi_24_64,
+  const __m128i k__cospi_C = octa_set_epi16(cospi_8_64, cospi_24_64,
                                             cospi_8_64, cospi_24_64,
                                             cospi_24_64, -cospi_8_64,
                                             cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_D = _mm_setr_epi16(cospi_24_64, -cospi_8_64,
+  const __m128i k__cospi_D = octa_set_epi16(cospi_24_64, -cospi_8_64,
                                             cospi_24_64, -cospi_8_64,
                                             cospi_8_64, cospi_24_64,
                                             cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_E = _mm_setr_epi16(cospi_16_64, cospi_16_64,
+  const __m128i k__cospi_E = octa_set_epi16(cospi_16_64, cospi_16_64,
                                             cospi_16_64, cospi_16_64,
                                             cospi_16_64, cospi_16_64,
                                             cospi_16_64, cospi_16_64);
-  const __m128i k__cospi_F = _mm_setr_epi16(cospi_16_64, -cospi_16_64,
+  const __m128i k__cospi_F = octa_set_epi16(cospi_16_64, -cospi_16_64,
                                             cospi_16_64, -cospi_16_64,
                                             cospi_16_64, -cospi_16_64,
                                             cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_G = _mm_setr_epi16(cospi_8_64, cospi_24_64,
+  const __m128i k__cospi_G = octa_set_epi16(cospi_8_64, cospi_24_64,
                                             cospi_8_64, cospi_24_64,
                                             -cospi_8_64, -cospi_24_64,
                                             -cospi_8_64, -cospi_24_64);
-  const __m128i k__cospi_H = _mm_setr_epi16(cospi_24_64, -cospi_8_64,
+  const __m128i k__cospi_H = octa_set_epi16(cospi_24_64, -cospi_8_64,
                                             cospi_24_64, -cospi_8_64,
                                             -cospi_24_64, cospi_8_64,
                                             -cospi_24_64, cospi_8_64);
@@ -267,7 +267,7 @@
   //    When we use them, in one case, they are all the same. In all others
   //    it's a pair of them that we need to repeat four times. This is done
   //    by constructing the 32 bit constant corresponding to that pair.
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
   const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
@@ -588,7 +588,7 @@
   //    When we use them, in one case, they are all the same. In all others
   //    it's a pair of them that we need to repeat four times. This is done
   //    by constructing the 32 bit constant corresponding to that pair.
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
   const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
diff --git a/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c b/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c
index cbdd1c9..0174cfe 100644
--- a/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c
+++ b/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c
@@ -85,13 +85,13 @@
       for (j = 0; j < 4; j++) {
         if (test & (1 << (4 * j))) {
           int k = 4 * i + j;
-          int64_t tmp = clamp(abs_coeff[j] + round_ptr[k != 0],
-                              INT32_MIN, INT32_MAX);
-          tmp = ((((tmp * quant_ptr[k != 0]) >> 16) + tmp) *
-                    quant_shift_ptr[k != 0]) >> 16;  // quantization
-          qcoeff_ptr[k] = (tmp ^ coeff_sign[j]) - coeff_sign[j];
+          const int64_t tmp1 = abs_coeff[j] + round_ptr[k != 0];
+          const int64_t tmp2 = ((tmp1 * quant_ptr[k != 0]) >> 16) + tmp1;
+          const uint32_t abs_qcoeff =
+              (uint32_t)((tmp2 * quant_shift_ptr[k != 0]) >> 16);
+          qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];
           dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
-          if (tmp)
+          if (abs_qcoeff)
             eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
         }
       }
@@ -162,17 +162,15 @@
       const int rc = idx_arr[i];
       const int coeff = coeff_ptr[rc];
       const int coeff_sign = (coeff >> 31);
-      int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-      int64_t tmp = clamp(abs_coeff +
-                          ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
-                          INT32_MIN, INT32_MAX);
-      tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
-               quant_shift_ptr[rc != 0]) >> 15;
-
-      qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      const int64_t tmp1 = abs_coeff
+                         + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+      const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+      const uint32_t abs_qcoeff =
+          (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
+      qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;
       dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
-
-      if (tmp)
+      if (abs_qcoeff)
         eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
     }
   }
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 2e43c27..bcab558 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -75,6 +75,7 @@
 VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c
 VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.h
 VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.c
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_8t_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_bilinear_sse2.asm
diff --git a/vpx_dsp/x86/highbd_variance_sse2.c b/vpx_dsp/x86/highbd_variance_sse2.c
index fe35c1e..b45331c 100644
--- a/vpx_dsp/x86/highbd_variance_sse2.c
+++ b/vpx_dsp/x86/highbd_variance_sse2.c
@@ -50,7 +50,7 @@
                                     high_variance_fn_t var_fn, int block_size) {
   int i, j;
   uint64_t sse_long = 0;
-  int64_t sum_long = 0;
+  int32_t sum_long = 0;
 
   for (i = 0; i < h; i += block_size) {
     for (j = 0; j < w; j += block_size) {
@@ -63,7 +63,7 @@
     }
   }
   *sum = ROUND_POWER_OF_TWO(sum_long, 2);
-  *sse = ROUND_POWER_OF_TWO(sse_long, 4);
+  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
 }
 
 static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
@@ -72,7 +72,7 @@
                                     high_variance_fn_t var_fn, int block_size) {
   int i, j;
   uint64_t sse_long = 0;
-  int64_t sum_long = 0;
+  int32_t sum_long = 0;
 
   for (i = 0; i < h; i += block_size) {
     for (j = 0; j < w; j += block_size) {
@@ -85,7 +85,7 @@
     }
   }
   *sum = ROUND_POWER_OF_TWO(sum_long, 4);
-  *sse = ROUND_POWER_OF_TWO(sse_long, 8);
+  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
 }
 
 
@@ -386,7 +386,7 @@
     } \
   } \
   se = ROUND_POWER_OF_TWO(se, 4); \
-  sse = ROUND_POWER_OF_TWO(long_sse, 8); \
+  sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \
   *sse_ptr = sse; \
   return sse - ((cast se * se) >> (wlog2 + hlog2)); \
 }
@@ -555,7 +555,7 @@
     } \
   } \
   se = ROUND_POWER_OF_TWO(se, 4); \
-  sse = ROUND_POWER_OF_TWO(long_sse, 8); \
+  sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \
   *sse_ptr = sse; \
   return sse - ((cast se * se) >> (wlog2 + hlog2)); \
 }
diff --git a/vpx_util/endian_inl.h b/vpx_util/endian_inl.h
new file mode 100644
index 0000000..91753a6
--- /dev/null
+++ b/vpx_util/endian_inl.h
@@ -0,0 +1,113 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Endian related functions.
+
+#ifndef VPX_UTIL_ENDIAN_INL_H_
+#define VPX_UTIL_ENDIAN_INL_H_
+
+#include <stdlib.h>
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+#if defined(__GNUC__)
+# define LOCAL_GCC_VERSION ((__GNUC__ << 8) | __GNUC_MINOR__)
+# define LOCAL_GCC_PREREQ(maj, min) \
+    (LOCAL_GCC_VERSION >= (((maj) << 8) | (min)))
+#else
+# define LOCAL_GCC_VERSION 0
+# define LOCAL_GCC_PREREQ(maj, min) 0
+#endif
+
+#ifdef __clang__
+# define LOCAL_CLANG_VERSION ((__clang_major__ << 8) | __clang_minor__)
+# define LOCAL_CLANG_PREREQ(maj, min) \
+    (LOCAL_CLANG_VERSION >= (((maj) << 8) | (min)))
+#else
+# define LOCAL_CLANG_VERSION 0
+# define LOCAL_CLANG_PREREQ(maj, min) 0
+#endif  // __clang__
+
+// some endian fix (e.g.: mips-gcc doesn't define __BIG_ENDIAN__)
+#if !defined(WORDS_BIGENDIAN) && \
+    (defined(__BIG_ENDIAN__) || defined(_M_PPC) || \
+     (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)))
+#define WORDS_BIGENDIAN
+#endif
+
+#if defined(WORDS_BIGENDIAN)
+#define HToLE32 BSwap32
+#define HToLE16 BSwap16
+#else
+#define HToLE32(x) (x)
+#define HToLE16(x) (x)
+#endif
+
+// clang-3.3 and gcc-4.3 have builtin functions for swap32/swap64
+#if LOCAL_GCC_PREREQ(4, 3) || LOCAL_CLANG_PREREQ(3, 3)
+#define HAVE_BUILTIN_BSWAP32
+#define HAVE_BUILTIN_BSWAP64
+#endif
+// clang-3.3 and gcc-4.8 have a builtin function for swap16
+#if LOCAL_GCC_PREREQ(4, 8) || LOCAL_CLANG_PREREQ(3, 3)
+#define HAVE_BUILTIN_BSWAP16
+#endif
+
+static INLINE uint16_t BSwap16(uint16_t x) {
+#if defined(HAVE_BUILTIN_BSWAP16)
+  return __builtin_bswap16(x);
+#elif defined(_MSC_VER)
+  return _byteswap_ushort(x);
+#else
+  // gcc will recognize a 'rorw $8, ...' here:
+  return (x >> 8) | ((x & 0xff) << 8);
+#endif  // HAVE_BUILTIN_BSWAP16
+}
+
+static INLINE uint32_t BSwap32(uint32_t x) {
+#if HAVE_MIPS32
+  uint32_t ret;
+  __asm__ volatile (
+    "wsbh   %[ret], %[x]          \n\t"
+    "rotr   %[ret], %[ret],  16   \n\t"
+    : [ret]"=r"(ret)
+    : [x]"r"(x)
+  );
+  return ret;
+#elif defined(HAVE_BUILTIN_BSWAP32)
+  return __builtin_bswap32(x);
+#elif defined(__i386__) || defined(__x86_64__)
+  uint32_t swapped_bytes;
+  __asm__ volatile("bswap %0" : "=r"(swapped_bytes) : "0"(x));
+  return swapped_bytes;
+#elif defined(_MSC_VER)
+  return (uint32_t)_byteswap_ulong(x);
+#else
+  return (x >> 24) | ((x >> 8) & 0xff00) | ((x << 8) & 0xff0000) | (x << 24);
+#endif  // HAVE_BUILTIN_BSWAP32
+}
+
+static INLINE uint64_t BSwap64(uint64_t x) {
+#if defined(HAVE_BUILTIN_BSWAP64)
+  return __builtin_bswap64(x);
+#elif defined(__x86_64__)
+  uint64_t swapped_bytes;
+  __asm__ volatile("bswapq %0" : "=r"(swapped_bytes) : "0"(x));
+  return swapped_bytes;
+#elif defined(_MSC_VER)
+  return (uint64_t)_byteswap_uint64(x);
+#else  // generic code for swapping 64-bit values (suggested by bdb@)
+  x = ((x & 0xffffffff00000000ull) >> 32) | ((x & 0x00000000ffffffffull) << 32);
+  x = ((x & 0xffff0000ffff0000ull) >> 16) | ((x & 0x0000ffff0000ffffull) << 16);
+  x = ((x & 0xff00ff00ff00ff00ull) >>  8) | ((x & 0x00ff00ff00ff00ffull) <<  8);
+  return x;
+#endif  // HAVE_BUILTIN_BSWAP64
+}
+
+#endif  // VPX_UTIL_ENDIAN_INL_H_
diff --git a/vpx_util/vpx_util.mk b/vpx_util/vpx_util.mk
index 1161125..c0ef8d3 100644
--- a/vpx_util/vpx_util.mk
+++ b/vpx_util/vpx_util.mk
@@ -11,3 +11,4 @@
 UTIL_SRCS-yes += vpx_util.mk
 UTIL_SRCS-yes += vpx_thread.c
 UTIL_SRCS-yes += vpx_thread.h
+UTIL_SRCS-yes += endian_inl.h