Merge "Fix intermediate height in convolve_c"
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index 89453bc..0743f35 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -75,7 +75,7 @@
 static const double C14 = 0.195090322016128;
 static const double C15 = 0.098017140329561;
 
-static void butterfly_16x16_dct_1d(double input[16], double output[16]) {
+void butterfly_16x16_dct_1d(double input[16], double output[16]) {
   double step[16];
   double intermediate[16];
   double temp1, temp2;
@@ -287,37 +287,37 @@
   vp9_short_iht16x16_add_c(out, dst, stride >> 1, tx_type);
 }
 
-class FwdTrans16x16Test : public ::testing::TestWithParam<int> {
+class Trans16x16Test : public ::testing::TestWithParam<int> {
  public:
-  virtual ~FwdTrans16x16Test() {}
+  virtual ~Trans16x16Test() {}
 
   virtual void SetUp() {
     tx_type_ = GetParam();
     if (tx_type_ == 0) {
-      fwd_txfm = fdct16x16;
-      inv_txfm = idct16x16_add;
+      fwd_txfm_ = fdct16x16;
+      inv_txfm_ = idct16x16_add;
     } else {
-      fwd_txfm = fht16x16;
-      inv_txfm = iht16x16_add;
+      fwd_txfm_ = fht16x16;
+      inv_txfm_ = iht16x16_add;
     }
   }
 
  protected:
   void RunFwdTxfm(int16_t *in, int16_t *out, uint8_t *dst,
                   int stride, int tx_type) {
-    (*fwd_txfm)(in, out, dst, stride, tx_type);
+    (*fwd_txfm_)(in, out, dst, stride, tx_type);
   }
   void RunInvTxfm(int16_t *in, int16_t *out, uint8_t *dst,
                   int stride, int tx_type) {
-    (*inv_txfm)(in, out, dst, stride, tx_type);
+    (*inv_txfm_)(in, out, dst, stride, tx_type);
   }
 
   int tx_type_;
-  void (*fwd_txfm)(int16_t*, int16_t*, uint8_t*, int, int);
-  void (*inv_txfm)(int16_t*, int16_t*, uint8_t*, int, int);
+  void (*fwd_txfm_)(int16_t*, int16_t*, uint8_t*, int, int);
+  void (*inv_txfm_)(int16_t*, int16_t*, uint8_t*, int, int);
 };
 
-TEST_P(FwdTrans16x16Test, AccuracyCheck) {
+TEST_P(Trans16x16Test, AccuracyCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   int max_error = 0;
   int total_error = 0;
@@ -355,7 +355,7 @@
       << "Error: 16x16 FHT/IHT has average round trip error > 1 per block";
 }
 
-TEST_P(FwdTrans16x16Test, CoeffSizeCheck) {
+TEST_P(Trans16x16Test, CoeffSizeCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   const int count_test_block = 1000;
   for (int i = 0; i < count_test_block; ++i) {
@@ -389,14 +389,19 @@
   }
 }
 
-INSTANTIATE_TEST_CASE_P(VP9, FwdTrans16x16Test, ::testing::Range(0, 4));
-
-TEST(VP9Idct16x16Test, AccuracyCheck) {
+TEST_P(Trans16x16Test, InvAccuracyCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   const int count_test_block = 1000;
+  // TODO(jingning): Is this unit test necessary? If so, we need to add
+  // check sets for the inverse hybrid transforms too.
+  if (tx_type_ != DCT_DCT)
+    return;
+
   for (int i = 0; i < count_test_block; ++i) {
-    int16_t in[256], coeff[256];
-    uint8_t dst[256], src[256];
+    DECLARE_ALIGNED_ARRAY(16, int16_t, in, 256);
+    DECLARE_ALIGNED_ARRAY(16, int16_t, coeff, 256);
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 256);
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 256);
     double out_r[256];
 
     for (int j = 0; j < 256; ++j) {
@@ -410,7 +415,10 @@
     reference_16x16_dct_2d(in, out_r);
     for (int j = 0; j < 256; j++)
       coeff[j] = round(out_r[j]);
-    vp9_short_idct16x16_add_c(coeff, dst, 16);
+
+    const int pitch = 32;
+    RunInvTxfm(coeff, coeff, dst, pitch, tx_type_);
+
     for (int j = 0; j < 256; ++j) {
       const int diff = dst[j] - src[j];
       const int error = diff * diff;
@@ -421,4 +429,5 @@
   }
 }
 
+INSTANTIATE_TEST_CASE_P(VP9, Trans16x16Test, ::testing::Range(0, 4));
 }  // namespace
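
The switch from plain stack arrays to DECLARE_ALIGNED_ARRAY matters here because RunInvTxfm may dispatch to SIMD implementations that require 16-byte-aligned buffers. As a minimal sketch, this is the over-allocate-and-round-up idiom such a macro typically expands to; ALIGNED_ARRAY_16 and the backing-array name are illustrative, not the actual libvpx expansion:

#include <stdint.h>

/* Sketch of a 16-byte alignment idiom: over-allocate a backing array and
 * round the pointer up to the next 16-byte boundary. */
#define ALIGNED_ARRAY_16(type, name, n)                                   \
  type name##_backing[(n) + 16 / sizeof(type)];                           \
  type *name = (type *)(((uintptr_t)name##_backing + 15) & ~(uintptr_t)15)

With this, ALIGNED_ARRAY_16(int16_t, coeff, 256); behaves like an int16_t coeff[256] whose address is a multiple of 16, which is safe to hand to SSE2 or NEON code paths.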
diff --git a/test/test-data.sha1 b/test/test-data.sha1
index 0ac4905..579d7e2 100644
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@@ -520,3 +520,5 @@
 83c6d8f2969b759e10e5c6542baca1265c874c29  vp90-2-03-size-226x224.webm.md5
 fe0af2ee47b1e5f6a66db369e2d7e9d870b38dce  vp90-2-03-size-226x226.webm
 94ad19b8b699cea105e2ff18f0df2afd7242bcf7  vp90-2-03-size-226x226.webm.md5
+495256cfd123fe777b2c0406862ed8468a1f4677  vp91-2-04-yv444.webm
+65e3a7ffef61ab340d9140f335ecc49125970c2c  vp91-2-04-yv444.webm.md5
diff --git a/test/test.mk b/test/test.mk
index 25e05b9..2042c86 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -629,3 +629,5 @@
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x224.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm.md5
diff --git a/test/test_vector_test.cc b/test/test_vector_test.cc
index 9b0e9d5..4cd356d 100644
--- a/test/test_vector_test.cc
+++ b/test/test_vector_test.cc
@@ -159,7 +159,10 @@
   "vp90-2-03-size-226x198.webm", "vp90-2-03-size-226x200.webm",
   "vp90-2-03-size-226x202.webm", "vp90-2-03-size-226x208.webm",
   "vp90-2-03-size-226x210.webm", "vp90-2-03-size-226x224.webm",
-  "vp90-2-03-size-226x226.webm"
+  "vp90-2-03-size-226x226.webm",
+#if CONFIG_NON420
+  "vp91-2-04-yv444.webm"
+#endif
 };
 #endif
 
diff --git a/test/vp9_subtract_test.cc b/test/vp9_subtract_test.cc
index a66a450..332a839 100644
--- a/test/vp9_subtract_test.cc
+++ b/test/vp9_subtract_test.cc
@@ -39,8 +39,8 @@
   ACMRandom rnd(ACMRandom::DeterministicSeed());
 
   // FIXME(rbultje) split in its own file
-  for (BLOCK_SIZE_TYPE bsize = BLOCK_4X4; bsize < BLOCK_SIZES;
-       bsize = static_cast<BLOCK_SIZE_TYPE>(static_cast<int>(bsize) + 1)) {
+  for (BLOCK_SIZE bsize = BLOCK_4X4; bsize < BLOCK_SIZES;
+       bsize = static_cast<BLOCK_SIZE>(static_cast<int>(bsize) + 1)) {
     const int block_width  = 4 << b_width_log2(bsize);
     const int block_height = 4 << b_height_log2(bsize);
     int16_t *diff = reinterpret_cast<int16_t *>(
diff --git a/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm b/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm
new file mode 100644
index 0000000..cf5c8f7
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm
@@ -0,0 +1,198 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_short_idct16x16_1_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vp9_short_idct16x16_1_add_neon(int16_t *input, uint8_t *dest,
+;                                    int dest_stride)
+;
+; r0  int16_t *input
+; r1  uint8_t *dest
+; r2  int dest_stride
+
+|vp9_short_idct16x16_1_add_neon| PROC
+    ldrsh            r0, [r0]
+
+    ; generate cospi_16_64 = 11585
+    mov              r12, #0x2d00
+    add              r12, #0x41
+
+    ; out = dct_const_round_shift(input[0] * cospi_16_64)
+    mul              r0, r0, r12               ; input[0] * cospi_16_64
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; out = dct_const_round_shift(out * cospi_16_64)
+    mul              r0, r0, r12               ; out * cospi_16_64
+    mov              r12, r1                   ; save dest
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; a1 = ROUND_POWER_OF_TWO(out, 6)
+    add              r0, r0, #32               ; + (1 <<((6) - 1))
+    asr              r0, r0, #6                ; >> 6
+
+    vdup.s16         q0, r0                    ; duplicate a1
+    mov              r0, #8
+    sub              r2, #8
+
+    ; load destination data row0 - row3
+    vld1.64          {d2}, [r1], r0
+    vld1.64          {d3}, [r1], r2
+    vld1.64          {d4}, [r1], r0
+    vld1.64          {d5}, [r1], r2
+    vld1.64          {d6}, [r1], r0
+    vld1.64          {d7}, [r1], r2
+    vld1.64          {d16}, [r1], r0
+    vld1.64          {d17}, [r1], r2
+
+    vaddw.u8         q9, q0, d2                ; dest[x] + a1
+    vaddw.u8         q10, q0, d3               ; dest[x] + a1
+    vaddw.u8         q11, q0, d4               ; dest[x] + a1
+    vaddw.u8         q12, q0, d5               ; dest[x] + a1
+    vqmovun.s16      d2, q9                    ; clip_pixel
+    vqmovun.s16      d3, q10                   ; clip_pixel
+    vqmovun.s16      d30, q11                  ; clip_pixel
+    vqmovun.s16      d31, q12                  ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
+    vaddw.u8         q10, q0, d7                ; dest[x] + a1
+    vaddw.u8         q11, q0, d16               ; dest[x] + a1
+    vaddw.u8         q12, q0, d17               ; dest[x] + a1
+    vqmovun.s16      d2, q9                     ; clip_pixel
+    vqmovun.s16      d3, q10                    ; clip_pixel
+    vqmovun.s16      d30, q11                   ; clip_pixel
+    vqmovun.s16      d31, q12                   ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    ; load destination data row4 - row7
+    vld1.64          {d2}, [r1], r0
+    vld1.64          {d3}, [r1], r2
+    vld1.64          {d4}, [r1], r0
+    vld1.64          {d5}, [r1], r2
+    vld1.64          {d6}, [r1], r0
+    vld1.64          {d7}, [r1], r2
+    vld1.64          {d16}, [r1], r0
+    vld1.64          {d17}, [r1], r2
+
+    vaddw.u8         q9, q0, d2                ; dest[x] + a1
+    vaddw.u8         q10, q0, d3               ; dest[x] + a1
+    vaddw.u8         q11, q0, d4               ; dest[x] + a1
+    vaddw.u8         q12, q0, d5               ; dest[x] + a1
+    vqmovun.s16      d2, q9                    ; clip_pixel
+    vqmovun.s16      d3, q10                   ; clip_pixel
+    vqmovun.s16      d30, q11                  ; clip_pixel
+    vqmovun.s16      d31, q12                  ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
+    vaddw.u8         q10, q0, d7                ; dest[x] + a1
+    vaddw.u8         q11, q0, d16               ; dest[x] + a1
+    vaddw.u8         q12, q0, d17               ; dest[x] + a1
+    vqmovun.s16      d2, q9                     ; clip_pixel
+    vqmovun.s16      d3, q10                    ; clip_pixel
+    vqmovun.s16      d30, q11                   ; clip_pixel
+    vqmovun.s16      d31, q12                   ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    ; load destination data row8 - row11
+    vld1.64          {d2}, [r1], r0
+    vld1.64          {d3}, [r1], r2
+    vld1.64          {d4}, [r1], r0
+    vld1.64          {d5}, [r1], r2
+    vld1.64          {d6}, [r1], r0
+    vld1.64          {d7}, [r1], r2
+    vld1.64          {d16}, [r1], r0
+    vld1.64          {d17}, [r1], r2
+
+    vaddw.u8         q9, q0, d2                ; dest[x] + a1
+    vaddw.u8         q10, q0, d3               ; dest[x] + a1
+    vaddw.u8         q11, q0, d4               ; dest[x] + a1
+    vaddw.u8         q12, q0, d5               ; dest[x] + a1
+    vqmovun.s16      d2, q9                    ; clip_pixel
+    vqmovun.s16      d3, q10                   ; clip_pixel
+    vqmovun.s16      d30, q11                  ; clip_pixel
+    vqmovun.s16      d31, q12                  ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
+    vaddw.u8         q10, q0, d7                ; dest[x] + a1
+    vaddw.u8         q11, q0, d16               ; dest[x] + a1
+    vaddw.u8         q12, q0, d17               ; dest[x] + a1
+    vqmovun.s16      d2, q9                     ; clip_pixel
+    vqmovun.s16      d3, q10                    ; clip_pixel
+    vqmovun.s16      d30, q11                   ; clip_pixel
+    vqmovun.s16      d31, q12                   ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    ; load destination data row12 - row15
+    vld1.64          {d2}, [r1], r0
+    vld1.64          {d3}, [r1], r2
+    vld1.64          {d4}, [r1], r0
+    vld1.64          {d5}, [r1], r2
+    vld1.64          {d6}, [r1], r0
+    vld1.64          {d7}, [r1], r2
+    vld1.64          {d16}, [r1], r0
+    vld1.64          {d17}, [r1], r2
+
+    vaddw.u8         q9, q0, d2                ; dest[x] + a1
+    vaddw.u8         q10, q0, d3               ; dest[x] + a1
+    vaddw.u8         q11, q0, d4               ; dest[x] + a1
+    vaddw.u8         q12, q0, d5               ; dest[x] + a1
+    vqmovun.s16      d2, q9                    ; clip_pixel
+    vqmovun.s16      d3, q10                   ; clip_pixel
+    vqmovun.s16      d30, q11                  ; clip_pixel
+    vqmovun.s16      d31, q12                  ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
+    vaddw.u8         q10, q0, d7                ; dest[x] + a1
+    vaddw.u8         q11, q0, d16               ; dest[x] + a1
+    vaddw.u8         q12, q0, d17               ; dest[x] + a1
+    vqmovun.s16      d2, q9                     ; clip_pixel
+    vqmovun.s16      d3, q10                    ; clip_pixel
+    vqmovun.s16      d30, q11                   ; clip_pixel
+    vqmovun.s16      d31, q12                   ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    bx               lr
+    ENDP             ; |vp9_short_idct16x16_1_add_neon|
+
+    END
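
For reference, this routine implements the DC-only inverse transform path spelled out in its comments: when only input[0] is nonzero, every output pixel receives the same offset a1. A scalar C sketch of the same arithmetic, with the clipping written out inline (helper names are illustrative):

#include <stdint.h>

#define DCT_CONST_BITS 14
static const int cospi_16_64 = 11585;

static int dct_const_round_shift(int input) {
  return (input + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS;
}

/* Scalar equivalent of the DC-only 16x16 inverse transform plus add. */
static void idct16x16_1_add_ref(const int16_t *input, uint8_t *dest,
                                int dest_stride) {
  int out = dct_const_round_shift(input[0] * cospi_16_64);
  int a1, r, c;
  out = dct_const_round_shift(out * cospi_16_64);
  a1 = (out + 32) >> 6;  /* ROUND_POWER_OF_TWO(out, 6) */
  for (r = 0; r < 16; ++r) {
    for (c = 0; c < 16; ++c) {
      const int v = dest[c] + a1;
      dest[c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));  /* clip_pixel */
    }
    dest += dest_stride;
  }
}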
diff --git a/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm b/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm
new file mode 100644
index 0000000..923804f
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm
@@ -0,0 +1,88 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_short_idct8x8_1_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vp9_short_idct8x8_1_add_neon(int16_t *input, uint8_t *dest,
+;                                  int dest_stride)
+;
+; r0  int16_t *input
+; r1  uint8_t *dest
+; r2  int dest_stride
+
+|vp9_short_idct8x8_1_add_neon| PROC
+    ldrsh            r0, [r0]
+
+    ; generate cospi_16_64 = 11585
+    mov              r12, #0x2d00
+    add              r12, #0x41
+
+    ; out = dct_const_round_shift(input[0] * cospi_16_64)
+    mul              r0, r0, r12               ; input[0] * cospi_16_64
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; out = dct_const_round_shift(out * cospi_16_64)
+    mul              r0, r0, r12               ; out * cospi_16_64
+    mov              r12, r1                   ; save dest
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; a1 = ROUND_POWER_OF_TWO(out, 5)
+    add              r0, r0, #16               ; + (1 <<((5) - 1))
+    asr              r0, r0, #5                ; >> 5
+
+    vdup.s16         q0, r0                    ; duplicate a1
+
+    ; load destination data
+    vld1.64          {d2}, [r1], r2
+    vld1.64          {d3}, [r1], r2
+    vld1.64          {d4}, [r1], r2
+    vld1.64          {d5}, [r1], r2
+    vld1.64          {d6}, [r1], r2
+    vld1.64          {d7}, [r1], r2
+    vld1.64          {d16}, [r1], r2
+    vld1.64          {d17}, [r1]
+
+    vaddw.u8         q9, q0, d2                ; dest[x] + a1
+    vaddw.u8         q10, q0, d3               ; dest[x] + a1
+    vaddw.u8         q11, q0, d4               ; dest[x] + a1
+    vaddw.u8         q12, q0, d5               ; dest[x] + a1
+    vqmovun.s16      d2, q9                    ; clip_pixel
+    vqmovun.s16      d3, q10                   ; clip_pixel
+    vqmovun.s16      d30, q11                  ; clip_pixel
+    vqmovun.s16      d31, q12                  ; clip_pixel
+    vst1.64          {d2}, [r12], r2
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r2
+    vst1.64          {d31}, [r12], r2
+
+    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
+    vaddw.u8         q10, q0, d7                ; dest[x] + a1
+    vaddw.u8         q11, q0, d16               ; dest[x] + a1
+    vaddw.u8         q12, q0, d17               ; dest[x] + a1
+    vqmovun.s16      d2, q9                     ; clip_pixel
+    vqmovun.s16      d3, q10                    ; clip_pixel
+    vqmovun.s16      d30, q11                   ; clip_pixel
+    vqmovun.s16      d31, q12                   ; clip_pixel
+    vst1.64          {d2}, [r12], r2
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r2
+    vst1.64          {d31}, [r12], r2
+
+    bx               lr
+    ENDP             ; |vp9_short_idct8x8_1_add_neon|
+
+    END
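
The 8x8 variant mirrors the 16x16 routine earlier: the same two dct_const_round_shift multiplies by cospi_16_64 produce the DC offset, but the final rounding is ROUND_POWER_OF_TWO(out, 5) rather than 6, matching the smaller transform's output scaling, and since each row is only 8 pixels wide the destination stride can be applied directly instead of the 8-byte/stride-minus-8 split used for 16-wide rows.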
diff --git a/vp9/common/generic/vp9_systemdependent.c b/vp9/common/generic/vp9_systemdependent.c
index 79092cd..f1447211 100644
--- a/vp9/common/generic/vp9_systemdependent.c
+++ b/vp9/common/generic/vp9_systemdependent.c
@@ -13,6 +13,7 @@
 #include "vp9_rtcd.h"
 #include "vp9/common/vp9_onyxc_int.h"
 
-void vp9_machine_specific_config(VP9_COMMON *ctx) {
+void vp9_machine_specific_config(VP9_COMMON *cm) {
+  (void)cm;
   vp9_rtcd();
 }
diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c
index b68a052..e89fea8 100644
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -47,26 +47,26 @@
   }
 }
 
-void vp9_free_frame_buffers(VP9_COMMON *oci) {
+void vp9_free_frame_buffers(VP9_COMMON *cm) {
   int i;
 
   for (i = 0; i < NUM_YV12_BUFFERS; i++)
-    vp9_free_frame_buffer(&oci->yv12_fb[i]);
+    vp9_free_frame_buffer(&cm->yv12_fb[i]);
 
-  vp9_free_frame_buffer(&oci->post_proc_buffer);
+  vp9_free_frame_buffer(&cm->post_proc_buffer);
 
-  vpx_free(oci->mip);
-  vpx_free(oci->prev_mip);
-  vpx_free(oci->above_seg_context);
-  vpx_free(oci->last_frame_seg_map);
+  vpx_free(cm->mip);
+  vpx_free(cm->prev_mip);
+  vpx_free(cm->above_seg_context);
+  vpx_free(cm->last_frame_seg_map);
 
-  vpx_free(oci->above_context[0]);
+  vpx_free(cm->above_context[0]);
   for (i = 0; i < MAX_MB_PLANE; i++)
-    oci->above_context[i] = 0;
-  oci->mip = NULL;
-  oci->prev_mip = NULL;
-  oci->above_seg_context = NULL;
-  oci->last_frame_seg_map = NULL;
+    cm->above_context[i] = 0;
+  cm->mip = NULL;
+  cm->prev_mip = NULL;
+  cm->above_seg_context = NULL;
+  cm->last_frame_seg_map = NULL;
 }
 
 static void set_mb_mi(VP9_COMMON *cm, int aligned_width, int aligned_height) {
@@ -93,95 +93,95 @@
   vp9_update_mode_info_in_image(cm, cm->prev_mi);
 }
 
-int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) {
+int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) {
   int i, mi_cols;
 
   const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2);
   const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2);
-  const int ss_x = oci->subsampling_x;
-  const int ss_y = oci->subsampling_y;
+  const int ss_x = cm->subsampling_x;
+  const int ss_y = cm->subsampling_y;
   int mi_size;
 
-  vp9_free_frame_buffers(oci);
+  vp9_free_frame_buffers(cm);
 
   for (i = 0; i < NUM_YV12_BUFFERS; i++) {
-    oci->fb_idx_ref_cnt[i] = 0;
-    if (vp9_alloc_frame_buffer(&oci->yv12_fb[i], width, height, ss_x, ss_y,
+    cm->fb_idx_ref_cnt[i] = 0;
+    if (vp9_alloc_frame_buffer(&cm->yv12_fb[i], width, height, ss_x, ss_y,
                                VP9BORDERINPIXELS) < 0)
       goto fail;
   }
 
-  oci->new_fb_idx = NUM_YV12_BUFFERS - 1;
-  oci->fb_idx_ref_cnt[oci->new_fb_idx] = 1;
+  cm->new_fb_idx = NUM_YV12_BUFFERS - 1;
+  cm->fb_idx_ref_cnt[cm->new_fb_idx] = 1;
 
   for (i = 0; i < ALLOWED_REFS_PER_FRAME; i++)
-    oci->active_ref_idx[i] = i;
+    cm->active_ref_idx[i] = i;
 
   for (i = 0; i < NUM_REF_FRAMES; i++) {
-    oci->ref_frame_map[i] = i;
-    oci->fb_idx_ref_cnt[i] = 1;
+    cm->ref_frame_map[i] = i;
+    cm->fb_idx_ref_cnt[i] = 1;
   }
 
-  if (vp9_alloc_frame_buffer(&oci->post_proc_buffer, width, height, ss_x, ss_y,
+  if (vp9_alloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y,
                              VP9BORDERINPIXELS) < 0)
     goto fail;
 
-  set_mb_mi(oci, aligned_width, aligned_height);
+  set_mb_mi(cm, aligned_width, aligned_height);
 
   // Allocation
-  mi_size = oci->mode_info_stride * (oci->mi_rows + MI_BLOCK_SIZE);
+  mi_size = cm->mode_info_stride * (cm->mi_rows + MI_BLOCK_SIZE);
 
-  oci->mip = vpx_calloc(mi_size, sizeof(MODE_INFO));
-  if (!oci->mip)
+  cm->mip = vpx_calloc(mi_size, sizeof(MODE_INFO));
+  if (!cm->mip)
     goto fail;
 
-  oci->prev_mip = vpx_calloc(mi_size, sizeof(MODE_INFO));
-  if (!oci->prev_mip)
+  cm->prev_mip = vpx_calloc(mi_size, sizeof(MODE_INFO));
+  if (!cm->prev_mip)
     goto fail;
 
-  setup_mi(oci);
+  setup_mi(cm);
 
   // FIXME(jkoleszar): allocate subsampled arrays for U/V once subsampling
   // information is exposed at this level
-  mi_cols = mi_cols_aligned_to_sb(oci->mi_cols);
+  mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
 
   // 2 contexts per 'mi unit', so that we have one context per 4x4 txfm
   // block where mi unit size is 8x8.
-  oci->above_context[0] = vpx_calloc(sizeof(ENTROPY_CONTEXT) * MAX_MB_PLANE *
+  cm->above_context[0] = vpx_calloc(sizeof(ENTROPY_CONTEXT) * MAX_MB_PLANE *
                                          (2 * mi_cols), 1);
-  if (!oci->above_context[0])
+  if (!cm->above_context[0])
     goto fail;
 
-  oci->above_seg_context = vpx_calloc(sizeof(PARTITION_CONTEXT) * mi_cols, 1);
-  if (!oci->above_seg_context)
+  cm->above_seg_context = vpx_calloc(sizeof(PARTITION_CONTEXT) * mi_cols, 1);
+  if (!cm->above_seg_context)
     goto fail;
 
   // Create the segmentation map structure and set to 0.
-  oci->last_frame_seg_map = vpx_calloc(oci->mi_rows * oci->mi_cols, 1);
-  if (!oci->last_frame_seg_map)
+  cm->last_frame_seg_map = vpx_calloc(cm->mi_rows * cm->mi_cols, 1);
+  if (!cm->last_frame_seg_map)
     goto fail;
 
   return 0;
 
  fail:
-  vp9_free_frame_buffers(oci);
+  vp9_free_frame_buffers(cm);
   return 1;
 }
 
-void vp9_create_common(VP9_COMMON *oci) {
-  vp9_machine_specific_config(oci);
+void vp9_create_common(VP9_COMMON *cm) {
+  vp9_machine_specific_config(cm);
 
-  vp9_init_mbmode_probs(oci);
+  vp9_init_mbmode_probs(cm);
 
-  oci->tx_mode = ONLY_4X4;
-  oci->comp_pred_mode = HYBRID_PREDICTION;
+  cm->tx_mode = ONLY_4X4;
+  cm->comp_pred_mode = HYBRID_PREDICTION;
 
   // Initialize reference frame sign bias structure to defaults
-  vpx_memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias));
+  vpx_memset(cm->ref_frame_sign_bias, 0, sizeof(cm->ref_frame_sign_bias));
 }
 
-void vp9_remove_common(VP9_COMMON *oci) {
-  vp9_free_frame_buffers(oci);
+void vp9_remove_common(VP9_COMMON *cm) {
+  vp9_free_frame_buffers(cm);
 }
 
 void vp9_initialize_common() {
diff --git a/vp9/common/vp9_alloccommon.h b/vp9/common/vp9_alloccommon.h
index 8bf5ed1..b7d7eba 100644
--- a/vp9/common/vp9_alloccommon.h
+++ b/vp9/common/vp9_alloccommon.h
@@ -16,14 +16,14 @@
 
 void vp9_initialize_common();
 
-void vp9_update_mode_info_border(VP9_COMMON *cpi, MODE_INFO *mi);
-void vp9_update_mode_info_in_image(VP9_COMMON *cpi, MODE_INFO *mi);
+void vp9_update_mode_info_border(VP9_COMMON *cm, MODE_INFO *mi);
+void vp9_update_mode_info_in_image(VP9_COMMON *cm, MODE_INFO *mi);
 
-void vp9_create_common(VP9_COMMON *oci);
-void vp9_remove_common(VP9_COMMON *oci);
+void vp9_create_common(VP9_COMMON *cm);
+void vp9_remove_common(VP9_COMMON *cm);
 
-int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height);
-void vp9_free_frame_buffers(VP9_COMMON *oci);
+int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height);
+void vp9_free_frame_buffers(VP9_COMMON *cm);
 
 
 void vp9_update_frame_size(VP9_COMMON *cm);
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 6e57259..5ba7846 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -71,7 +71,7 @@
   D135_PRED,       // Directional 135 deg = 180 - 45
   D117_PRED,       // Directional 117 deg = 180 - 63
   D153_PRED,       // Directional 153 deg = 180 - 27
-  D27_PRED,        // Directional 27  deg = round(arctan(1/2) * 180/pi)
+  D207_PRED,       // Directional 207 deg = 180 + 27
   D63_PRED,        // Directional 63  deg = round(arctan(2/1) * 180/pi)
   TM_PRED,         // True-motion
   NEARESTMV,
@@ -115,18 +115,18 @@
   MAX_REF_FRAMES = 4
 } MV_REFERENCE_FRAME;
 
-static INLINE int b_width_log2(BLOCK_SIZE_TYPE sb_type) {
+static INLINE int b_width_log2(BLOCK_SIZE sb_type) {
   return b_width_log2_lookup[sb_type];
 }
-static INLINE int b_height_log2(BLOCK_SIZE_TYPE sb_type) {
+static INLINE int b_height_log2(BLOCK_SIZE sb_type) {
   return b_height_log2_lookup[sb_type];
 }
 
-static INLINE int mi_width_log2(BLOCK_SIZE_TYPE sb_type) {
+static INLINE int mi_width_log2(BLOCK_SIZE sb_type) {
   return mi_width_log2_lookup[sb_type];
 }
 
-static INLINE int mi_height_log2(BLOCK_SIZE_TYPE sb_type) {
+static INLINE int mi_height_log2(BLOCK_SIZE sb_type) {
   return mi_height_log2_lookup[sb_type];
 }
 
@@ -134,7 +134,7 @@
 typedef struct {
   MB_PREDICTION_MODE mode, uv_mode;
   MV_REFERENCE_FRAME ref_frame[2];
-  TX_SIZE txfm_size;
+  TX_SIZE tx_size;
   int_mv mv[2];                // for each reference frame used
   int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
   int_mv best_mv, best_second_mv;
@@ -153,7 +153,7 @@
 
   INTERPOLATIONFILTERTYPE interp_filter;
 
-  BLOCK_SIZE_TYPE sb_type;
+  BLOCK_SIZE sb_type;
 } MB_MODE_INFO;
 
 typedef struct {
@@ -245,7 +245,7 @@
 
 } MACROBLOCKD;
 
-static INLINE unsigned char *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE_TYPE subsize) {
+static INLINE unsigned char *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE subsize) {
   switch (subsize) {
     case BLOCK_64X64:
     case BLOCK_64X32:
@@ -270,9 +270,8 @@
   }
 }
 
-static INLINE void update_partition_context(MACROBLOCKD *xd,
-                                            BLOCK_SIZE_TYPE sb_type,
-                                            BLOCK_SIZE_TYPE sb_size) {
+static INLINE void update_partition_context(MACROBLOCKD *xd, BLOCK_SIZE sb_type,
+                                            BLOCK_SIZE sb_size) {
   const int bsl = b_width_log2(sb_size), bs = (1 << bsl) / 2;
   const int bwl = b_width_log2(sb_type);
   const int bhl = b_height_log2(sb_type);
@@ -290,8 +289,7 @@
   vpx_memset(xd->left_seg_context, pcvalue[bhl == bsl], bs);
 }
 
-static INLINE int partition_plane_context(MACROBLOCKD *xd,
-                                          BLOCK_SIZE_TYPE sb_type) {
+static INLINE int partition_plane_context(MACROBLOCKD *xd, BLOCK_SIZE sb_type) {
   int bsl = mi_width_log2(sb_type), bs = 1 << bsl;
   int above = 0, left = 0, i;
   int boffset = mi_width_log2(BLOCK_64X64) - bsl;
@@ -311,9 +309,8 @@
   return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
 }
 
-static BLOCK_SIZE_TYPE get_subsize(BLOCK_SIZE_TYPE bsize,
-                                   PARTITION_TYPE partition) {
-  const BLOCK_SIZE_TYPE subsize = subsize_lookup[partition][bsize];
+static BLOCK_SIZE get_subsize(BLOCK_SIZE bsize, PARTITION_TYPE partition) {
+  const BLOCK_SIZE subsize = subsize_lookup[partition][bsize];
   assert(subsize < BLOCK_SIZES);
   return subsize;
 }
@@ -363,34 +360,33 @@
 
 
 static INLINE TX_SIZE get_uv_tx_size(const MB_MODE_INFO *mbmi) {
-  return MIN(mbmi->txfm_size, max_uv_txsize_lookup[mbmi->sb_type]);
+  return MIN(mbmi->tx_size, max_uv_txsize_lookup[mbmi->sb_type]);
 }
 
-static BLOCK_SIZE_TYPE get_plane_block_size(BLOCK_SIZE_TYPE bsize,
-                           const struct macroblockd_plane *pd) {
-  BLOCK_SIZE_TYPE bs = ss_size_lookup[bsize]
-                                     [pd->subsampling_x][pd->subsampling_y];
+static BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize,
+                                       const struct macroblockd_plane *pd) {
+  BLOCK_SIZE bs = ss_size_lookup[bsize][pd->subsampling_x][pd->subsampling_y];
   assert(bs < BLOCK_SIZES);
   return bs;
 }
 
-static INLINE int plane_block_width(BLOCK_SIZE_TYPE bsize,
+static INLINE int plane_block_width(BLOCK_SIZE bsize,
                                     const struct macroblockd_plane* plane) {
   return 4 << (b_width_log2(bsize) - plane->subsampling_x);
 }
 
-static INLINE int plane_block_height(BLOCK_SIZE_TYPE bsize,
+static INLINE int plane_block_height(BLOCK_SIZE bsize,
                                      const struct macroblockd_plane* plane) {
   return 4 << (b_height_log2(bsize) - plane->subsampling_y);
 }
 
 typedef void (*foreach_transformed_block_visitor)(int plane, int block,
-                                                  BLOCK_SIZE_TYPE plane_bsize,
+                                                  BLOCK_SIZE plane_bsize,
                                                   TX_SIZE tx_size,
                                                   void *arg);
 
 static INLINE void foreach_transformed_block_in_plane(
-    const MACROBLOCKD *const xd, BLOCK_SIZE_TYPE bsize, int plane,
+    const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
     foreach_transformed_block_visitor visit, void *arg) {
   const struct macroblockd_plane *const pd = &xd->plane[plane];
   const MB_MODE_INFO* mbmi = &xd->mode_info_context->mbmi;
@@ -398,8 +394,8 @@
   // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
   // transform size varies per plane, look it up in a common way.
   const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi)
-                                : mbmi->txfm_size;
-  const BLOCK_SIZE_TYPE plane_bsize = get_plane_block_size(bsize, pd);
+                                : mbmi->tx_size;
+  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
   const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
   const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
   const int step = 1 << (tx_size << 1);
@@ -440,7 +436,7 @@
 }
 
 static INLINE void foreach_transformed_block(
-    const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize,
+    const MACROBLOCKD* const xd, BLOCK_SIZE bsize,
     foreach_transformed_block_visitor visit, void *arg) {
   int plane;
 
@@ -449,7 +445,7 @@
 }
 
 static INLINE void foreach_transformed_block_uv(
-    const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize,
+    const MACROBLOCKD* const xd, BLOCK_SIZE bsize,
     foreach_transformed_block_visitor visit, void *arg) {
   int plane;
 
@@ -457,25 +453,25 @@
     foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg);
 }
 
-static int raster_block_offset(BLOCK_SIZE_TYPE plane_bsize,
+static int raster_block_offset(BLOCK_SIZE plane_bsize,
                                int raster_block, int stride) {
   const int bw = b_width_log2(plane_bsize);
   const int y = 4 * (raster_block >> bw);
   const int x = 4 * (raster_block & ((1 << bw) - 1));
   return y * stride + x;
 }
-static int16_t* raster_block_offset_int16(BLOCK_SIZE_TYPE plane_bsize,
+static int16_t* raster_block_offset_int16(BLOCK_SIZE plane_bsize,
                                           int raster_block, int16_t *base) {
   const int stride = 4 << b_width_log2(plane_bsize);
   return base + raster_block_offset(plane_bsize, raster_block, stride);
 }
-static uint8_t* raster_block_offset_uint8(BLOCK_SIZE_TYPE plane_bsize,
+static uint8_t* raster_block_offset_uint8(BLOCK_SIZE plane_bsize,
                                           int raster_block, uint8_t *base,
                                           int stride) {
   return base + raster_block_offset(plane_bsize, raster_block, stride);
 }
 
-static int txfrm_block_to_raster_block(BLOCK_SIZE_TYPE plane_bsize,
+static int txfrm_block_to_raster_block(BLOCK_SIZE plane_bsize,
                                        TX_SIZE tx_size, int block) {
   const int bwl = b_width_log2(plane_bsize);
   const int tx_cols_log2 = bwl - tx_size;
@@ -486,7 +482,7 @@
   return x + (y << bwl);
 }
 
-static void txfrm_block_to_raster_xy(BLOCK_SIZE_TYPE plane_bsize,
+static void txfrm_block_to_raster_xy(BLOCK_SIZE plane_bsize,
                                      TX_SIZE tx_size, int block,
                                      int *x, int *y) {
   const int bwl = b_width_log2(plane_bsize);
@@ -497,7 +493,7 @@
   *y = (raster_mb >> tx_cols_log2) << tx_size;
 }
 
-static void extend_for_intra(MACROBLOCKD* const xd, BLOCK_SIZE_TYPE plane_bsize,
+static void extend_for_intra(MACROBLOCKD* const xd, BLOCK_SIZE plane_bsize,
                              int plane, int block, TX_SIZE tx_size) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
   uint8_t *const buf = pd->dst.buf;
@@ -536,7 +532,7 @@
 }
 static void set_contexts_on_border(MACROBLOCKD *xd,
                                    struct macroblockd_plane *pd,
-                                   BLOCK_SIZE_TYPE plane_bsize,
+                                   BLOCK_SIZE plane_bsize,
                                    int tx_size_in_blocks, int has_eob,
                                    int aoff, int loff,
                                    ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L) {
@@ -573,7 +569,7 @@
 }
 
 static void set_contexts(MACROBLOCKD *xd, struct macroblockd_plane *pd,
-                         BLOCK_SIZE_TYPE plane_bsize, TX_SIZE tx_size,
+                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
                          int has_eob, int aoff, int loff) {
   ENTROPY_CONTEXT *const A = pd->above_context + aoff;
   ENTROPY_CONTEXT *const L = pd->left_context + loff;
@@ -588,4 +584,10 @@
   }
 }
 
+static int get_tx_eob(struct segmentation *seg, int segment_id,
+                      TX_SIZE tx_size) {
+  const int eob_max = 16 << (tx_size << 1);
+  return vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
+}
+
 #endif  // VP9_COMMON_VP9_BLOCKD_H_
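
The new get_tx_eob helper in this header caps the end-of-block index at the coefficient count of the transform: eob_max = 16 << (tx_size << 1) quadruples with each transform size, and segments with SEG_LVL_SKIP active get an eob of 0. A quick check of the arithmetic, assuming TX_4X4 through TX_32X32 carry the enum values 0 through 3:

#include <assert.h>

static void check_eob_max(void) {
  assert((16 << (0 << 1)) == 16);    /* TX_4X4:   4 * 4   */
  assert((16 << (1 << 1)) == 64);    /* TX_8X8:   8 * 8   */
  assert((16 << (2 << 1)) == 256);   /* TX_16X16: 16 * 16 */
  assert((16 << (3 << 1)) == 1024);  /* TX_32X32: 32 * 32 */
}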
diff --git a/vp9/common/vp9_common_data.c b/vp9/common/vp9_common_data.c
index 310a667..dc41efd 100644
--- a/vp9/common/vp9_common_data.c
+++ b/vp9/common/vp9_common_data.c
@@ -74,7 +74,7 @@
   }
 };
 
-const BLOCK_SIZE_TYPE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES] = {
+const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES] = {
   {     // PARTITION_NONE
     BLOCK_4X4,   BLOCK_4X8,   BLOCK_8X4,
     BLOCK_8X8,   BLOCK_8X16,  BLOCK_16X8,
@@ -115,7 +115,7 @@
   TX_16X16, TX_16X16, TX_16X16, TX_32X32
 };
 
-const BLOCK_SIZE_TYPE ss_size_lookup[BLOCK_SIZES][2][2] = {
+const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2] = {
 //  ss_x == 0    ss_x == 0        ss_x == 1      ss_x == 1
 //  ss_y == 0    ss_y == 1        ss_y == 0      ss_y == 1
   {{BLOCK_4X4,   BLOCK_INVALID}, {BLOCK_INVALID, BLOCK_INVALID}},
diff --git a/vp9/common/vp9_common_data.h b/vp9/common/vp9_common_data.h
index 808b9ed..3822bfc 100644
--- a/vp9/common/vp9_common_data.h
+++ b/vp9/common/vp9_common_data.h
@@ -24,9 +24,9 @@
 extern const int size_group_lookup[BLOCK_SIZES];
 extern const int num_pels_log2_lookup[BLOCK_SIZES];
 extern const PARTITION_TYPE partition_lookup[][BLOCK_SIZES];
-extern const BLOCK_SIZE_TYPE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES];
+extern const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES];
 extern const TX_SIZE max_txsize_lookup[BLOCK_SIZES];
 extern const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZES];
-extern const BLOCK_SIZE_TYPE ss_size_lookup[BLOCK_SIZES][2][2];
+extern const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2];
 
 #endif    // VP9_COMMON_VP9_COMMON_DATA_H
diff --git a/vp9/common/vp9_debugmodes.c b/vp9/common/vp9_debugmodes.c
index f2def5c..4de50aa 100644
--- a/vp9/common/vp9_debugmodes.c
+++ b/vp9/common/vp9_debugmodes.c
@@ -22,17 +22,17 @@
  * and uses the passed in member offset to print out the value of an integer
  * for each mbmi member value in the mi structure.
  */
-static void print_mi_data(VP9_COMMON *common, FILE *file, char *descriptor,
+static void print_mi_data(VP9_COMMON *cm, FILE *file, char *descriptor,
                           size_t member_offset) {
   int mi_row;
   int mi_col;
   int mi_index = 0;
-  MODE_INFO *mi = common->mi;
-  int rows = common->mi_rows;
-  int cols = common->mi_cols;
+  MODE_INFO *mi = cm->mi;
+  int rows = cm->mi_rows;
+  int cols = cm->mi_cols;
   char prefix = descriptor[0];
 
-  log_frame_info(common, descriptor, file);
+  log_frame_info(cm, descriptor, file);
   mi_index = 0;
   for (mi_row = 0; mi_row < rows; mi_row++) {
     fprintf(file, "%c ", prefix);
@@ -59,7 +59,7 @@
   print_mi_data(cm, mvs, "Modes:", offsetof(MB_MODE_INFO, mode));
   print_mi_data(cm, mvs, "Skips:", offsetof(MB_MODE_INFO, skip_coeff));
   print_mi_data(cm, mvs, "Ref frame:", offsetof(MB_MODE_INFO, ref_frame[0]));
-  print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, txfm_size));
+  print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, tx_size));
   print_mi_data(cm, mvs, "UV Modes:", offsetof(MB_MODE_INFO, uv_mode));
 
   log_frame_info(cm, "Vectors ",mvs);
diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c
index 21e0e04..32d9e0c 100644
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c
@@ -436,11 +436,11 @@
 
 #include "vp9/common/vp9_default_coef_probs.h"
 
-void vp9_default_coef_probs(VP9_COMMON *pc) {
-  vp9_copy(pc->fc.coef_probs[TX_4X4], default_coef_probs_4x4);
-  vp9_copy(pc->fc.coef_probs[TX_8X8], default_coef_probs_8x8);
-  vp9_copy(pc->fc.coef_probs[TX_16X16], default_coef_probs_16x16);
-  vp9_copy(pc->fc.coef_probs[TX_32X32], default_coef_probs_32x32);
+void vp9_default_coef_probs(VP9_COMMON *cm) {
+  vp9_copy(cm->fc.coef_probs[TX_4X4], default_coef_probs_4x4);
+  vp9_copy(cm->fc.coef_probs[TX_8X8], default_coef_probs_8x8);
+  vp9_copy(cm->fc.coef_probs[TX_16X16], default_coef_probs_16x16);
+  vp9_copy(cm->fc.coef_probs[TX_32X32], default_coef_probs_32x32);
 }
 
 // Neighborhood 5-tuples for various scans and blocksizes,
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index 82cbfd3..699b44a 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -95,7 +95,7 @@
 #define MODULUS_PARAM               13  /* Modulus parameter */
 
 struct VP9Common;
-void vp9_default_coef_probs(struct VP9Common *);
+void vp9_default_coef_probs(struct VP9Common *cm);
 extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]);
 
 extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]);
@@ -154,13 +154,13 @@
                        vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]);
 
 void vp9_coef_tree_initialize(void);
-void vp9_adapt_coef_probs(struct VP9Common *);
+void vp9_adapt_coef_probs(struct VP9Common *cm);
 
-static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
+static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) {
   int i;
   for (i = 0; i < MAX_MB_PLANE; i++) {
     struct macroblockd_plane *const pd = &xd->plane[i];
-    const BLOCK_SIZE_TYPE plane_bsize = get_plane_block_size(bsize, pd);
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
     vpx_memset(pd->above_context, 0, sizeof(ENTROPY_CONTEXT) *
                    num_4x4_blocks_wide_lookup[plane_bsize]);
     vpx_memset(pd->left_context, 0, sizeof(ENTROPY_CONTEXT) *
@@ -336,6 +336,45 @@
   }
 }
 
+static int get_entropy_context(const MACROBLOCKD *xd, TX_SIZE tx_size,
+                               PLANE_TYPE type, int block_idx,
+                               ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L,
+                               const int16_t **scan,
+                               const uint8_t **band_translate) {
+  ENTROPY_CONTEXT above_ec, left_ec;
+
+  switch (tx_size) {
+    case TX_4X4:
+      *scan = get_scan_4x4(get_tx_type_4x4(type, xd, block_idx));
+      *band_translate = vp9_coefband_trans_4x4;
+      above_ec = A[0] != 0;
+      left_ec = L[0] != 0;
+      break;
+    case TX_8X8:
+      *scan = get_scan_8x8(get_tx_type_8x8(type, xd));
+      *band_translate = vp9_coefband_trans_8x8plus;
+      above_ec = !!*(uint16_t *)A;
+      left_ec  = !!*(uint16_t *)L;
+      break;
+    case TX_16X16:
+      *scan = get_scan_16x16(get_tx_type_16x16(type, xd));
+      *band_translate = vp9_coefband_trans_8x8plus;
+      above_ec = !!*(uint32_t *)A;
+      left_ec  = !!*(uint32_t *)L;
+      break;
+    case TX_32X32:
+      *scan = vp9_default_scan_32x32;
+      *band_translate = vp9_coefband_trans_8x8plus;
+      above_ec = !!*(uint64_t *)A;
+      left_ec  = !!*(uint64_t *)L;
+      break;
+    default:
+      assert(!"Invalid transform size.");
+  }
+
+  return combine_entropy_contexts(above_ec, left_ec);
+}
+
 enum { VP9_COEF_UPDATE_PROB = 252 };
 
 #endif  // VP9_COMMON_VP9_ENTROPY_H_
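
In get_entropy_context above, the width of the pointer cast tracks how many 4x4 context bytes the transform spans: TX_8X8 covers two 4x4 columns, so reading A as a uint16_t tests both bytes at once, TX_16X16 covers four (uint32_t), and TX_32X32 eight (uint64_t). A byte-wise sketch of the same test for one case, assuming ENTROPY_CONTEXT is the one-byte type the casts imply:

#include <stdint.h>

typedef int8_t ENTROPY_CONTEXT;  /* assumption: one-byte context type */

/* Equivalent of `above_ec = !!*(uint32_t *)A` for TX_16X16: nonzero if any
 * of the four 4x4 context bytes spanned by the transform is set. */
static int any_nonzero_ctx4(const ENTROPY_CONTEXT *a) {
  int i, ec = 0;
  for (i = 0; i < 4; ++i)
    ec |= (a[i] != 0);
  return ec;
}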
diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c
index 3bc8de1..a75d1a9 100644
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -235,7 +235,7 @@
   -D135_PRED, -D117_PRED,           /* 5 = D135_NODE */
   -D45_PRED, 14,                    /* 6 = D45_NODE */
   -D63_PRED, 16,                    /* 7 = D63_NODE */
-  -D153_PRED, -D27_PRED             /* 8 = D153_NODE */
+  -D153_PRED, -D207_PRED             /* 8 = D153_NODE */
 };
 
 const vp9_tree_index vp9_inter_mode_tree[6] = {
diff --git a/vp9/common/vp9_entropymode.h b/vp9/common/vp9_entropymode.h
index 2f8085d..4cf4c03 100644
--- a/vp9/common/vp9_entropymode.h
+++ b/vp9/common/vp9_entropymode.h
@@ -58,9 +58,9 @@
 
 void vp9_setup_past_independence(struct VP9Common *cm);
 
-void vp9_init_mbmode_probs(struct VP9Common *x);
+void vp9_init_mbmode_probs(struct VP9Common *cm);
 
-void vp9_adapt_mode_probs(struct VP9Common *);
+void vp9_adapt_mode_probs(struct VP9Common *cm);
 
 void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p,
                                       unsigned int (*ct_32x32p)[2]);
diff --git a/vp9/common/vp9_enums.h b/vp9/common/vp9_enums.h
index 9fab56f..1bf0742 100644
--- a/vp9/common/vp9_enums.h
+++ b/vp9/common/vp9_enums.h
@@ -22,7 +22,7 @@
 #define MI_MASK (MI_BLOCK_SIZE - 1)
 
 
-typedef enum BLOCK_SIZE_TYPE {
+typedef enum BLOCK_SIZE {
   BLOCK_4X4,
   BLOCK_4X8,
   BLOCK_8X4,
@@ -38,7 +38,7 @@
   BLOCK_64X64,
   BLOCK_SIZES,
   BLOCK_INVALID = BLOCK_SIZES
-} BLOCK_SIZE_TYPE;
+} BLOCK_SIZE;
 
 typedef enum PARTITION_TYPE {
   PARTITION_NONE,
diff --git a/vp9/common/vp9_findnearmv.h b/vp9/common/vp9_findnearmv.h
index 178ad87..72572df 100644
--- a/vp9/common/vp9_findnearmv.h
+++ b/vp9/common/vp9_findnearmv.h
@@ -36,7 +36,7 @@
                xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
 }
 
-void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *pc,
+void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm,
                                    MACROBLOCKD *xd,
                                    int_mv *dst_nearest,
                                    int_mv *dst_near,
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index fc52747..cfa61c2 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -22,13 +22,217 @@
   const uint8_t *hev_thr;
 };
 
+// This structure holds bit masks for all 8x8 blocks in a 64x64 region.
+// Each 1 bit represents a position in which we want to apply the loop filter.
+// Left_ entries refer to whether we apply a filter on the border to the
+// left of the block.  Above_ entries refer to whether or not to apply a
+// filter on the above border.  Int_ entries refer to whether or not to
+// apply a filter on the 4x4 edges within the 8x8 block that each bit
+// represents.
+// Since each transform is accompanied by a potentially different type of
+// loop filter there is a different entry in the array for each transform size.
+typedef struct {
+  uint64_t left_y[TX_SIZES];
+  uint64_t above_y[TX_SIZES];
+  uint64_t int_4x4_y;
+  uint16_t left_uv[TX_SIZES];
+  uint16_t above_uv[TX_SIZES];
+  uint16_t int_4x4_uv;
+} LOOP_FILTER_MASK;
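+// In the 64 bit y masks, bit (row * 8 + col) corresponds to the 8x8 luma
+// block at (row, col) of the 64x64 region.  The 16 bit uv masks cover the
+// corresponding 4x4 grid of chroma blocks (for 4:2:0), so there the bit
+// index is (row * 4 + col).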
+
+// 64 bit masks for left transform size.  Each 1 represents a position where
+// we should apply a loop filter across the left border of an 8x8 block
+// boundary.
+//
+// In the case of TX_16X16 (low-order byte first) we end up with
+// a mask that looks like this:
+//
+//    10101010
+//    10101010
+//    10101010
+//    10101010
+//    10101010
+//    10101010
+//    10101010
+//    10101010
+//
+// A loop filter should be applied to every other 8x8 column horizontally.
+static const uint64_t left_64x64_txform_mask[TX_SIZES] = {
+    0xffffffffffffffff,  // TX_4X4
+    0xffffffffffffffff,  // TX_8x8
+    0x5555555555555555,  // TX_16x16
+    0x1111111111111111,  // TX_32x32
+};
+
+// 64 bit masks for above transform size.  Each 1 represents a position where
+// we should apply a loop filter across the top border of an 8x8 block
+// boundary.
+//
+// In the case of TX_32X32 (low-order byte first) we end up with
+// a mask that looks like this:
+//
+//    11111111
+//    00000000
+//    00000000
+//    00000000
+//    11111111
+//    00000000
+//    00000000
+//    00000000
+//
+// A loop filter should be applied to every fourth row vertically.
+static const uint64_t above_64x64_txform_mask[TX_SIZES] = {
+    0xffffffffffffffff,  // TX_4X4
+    0xffffffffffffffff,  // TX_8x8
+    0x00ff00ff00ff00ff,  // TX_16x16
+    0x000000ff000000ff,  // TX_32x32
+};
+
+// 64 bit masks for prediction sizes (left).  Each 1 represents a position
+// where we want to filter the left border of an 8x8 block.  These are
+// aligned to the rightmost appropriate bit, and then shifted into place.
+//
+// In the case of BLOCK_16X32 (low-order byte first) we end up with
+// a mask that looks like this:
+//
+//  10000000
+//  10000000
+//  10000000
+//  10000000
+//  00000000
+//  00000000
+//  00000000
+//  00000000
+static const uint64_t left_prediction_mask[BLOCK_SIZES] = {
+    0x0000000000000001,  // BLOCK_4X4,
+    0x0000000000000001,  // BLOCK_4X8,
+    0x0000000000000001,  // BLOCK_8X4,
+    0x0000000000000001,  // BLOCK_8X8,
+    0x0000000000000101,  // BLOCK_8X16,
+    0x0000000000000001,  // BLOCK_16X8,
+    0x0000000000000101,  // BLOCK_16X16,
+    0x0000000001010101,  // BLOCK_16X32,
+    0x0000000000000101,  // BLOCK_32X16,
+    0x0000000001010101,  // BLOCK_32X32,
+    0x0101010101010101,  // BLOCK_32X64,
+    0x0000000001010101,  // BLOCK_64X32,
+    0x0101010101010101,  // BLOCK_64X64
+};
+
+// 64 bit mask to shift and set for each prediction size.
+static const uint64_t above_prediction_mask[BLOCK_SIZES] = {
+    0x0000000000000001,  // BLOCK_4X4
+    0x0000000000000001,  // BLOCK_4X8
+    0x0000000000000001,  // BLOCK_8X4
+    0x0000000000000001,  // BLOCK_8X8
+    0x0000000000000001,  // BLOCK_8X16,
+    0x0000000000000003,  // BLOCK_16X8
+    0x0000000000000003,  // BLOCK_16X16
+    0x0000000000000003,  // BLOCK_16X32,
+    0x000000000000000f,  // BLOCK_32X16,
+    0x000000000000000f,  // BLOCK_32X32,
+    0x000000000000000f,  // BLOCK_32X64,
+    0x00000000000000ff,  // BLOCK_64X32,
+    0x00000000000000ff,  // BLOCK_64X64
+};
+// 64 bit mask to shift and set for each prediction size.  A bit is set for
+// each 8x8 block that the given block size would cover, anchored at the
+// top left of the 64x64 block.
+static const uint64_t size_mask[BLOCK_SIZES] = {
+    0x0000000000000001,  // BLOCK_4X4
+    0x0000000000000001,  // BLOCK_4X8
+    0x0000000000000001,  // BLOCK_8X4
+    0x0000000000000001,  // BLOCK_8X8
+    0x0000000000000101,  // BLOCK_8X16,
+    0x0000000000000003,  // BLOCK_16X8
+    0x0000000000000303,  // BLOCK_16X16
+    0x0000000003030303,  // BLOCK_16X32,
+    0x0000000000000f0f,  // BLOCK_32X16,
+    0x000000000f0f0f0f,  // BLOCK_32X32,
+    0x0f0f0f0f0f0f0f0f,  // BLOCK_32X64,
+    0x00000000ffffffff,  // BLOCK_64X32,
+    0xffffffffffffffff,  // BLOCK_64X64
+};
+
+// These are used for masking the left and above borders.
+static const uint64_t left_border =  0x1111111111111111;
+static const uint64_t above_border = 0x000000ff000000ff;
+
+// 16 bit masks for uv transform sizes.
+static const uint16_t left_64x64_txform_mask_uv[TX_SIZES] = {
+    0xffff,  // TX_4X4
+    0xffff,  // TX_8x8
+    0x5555,  // TX_16x16
+    0x1111,  // TX_32x32
+};
+
+static const uint16_t above_64x64_txform_mask_uv[TX_SIZES] = {
+    0xffff,  // TX_4X4
+    0xffff,  // TX_8x8
+    0x0f0f,  // TX_16x16
+    0x000f,  // TX_32x32
+};
+
+// 16 bit left mask to shift and set for each uv prediction size.
+static const uint16_t left_prediction_mask_uv[BLOCK_SIZES] = {
+    0x0001,  // BLOCK_4X4,
+    0x0001,  // BLOCK_4X8,
+    0x0001,  // BLOCK_8X4,
+    0x0001,  // BLOCK_8X8,
+    0x0001,  // BLOCK_8X16,
+    0x0001,  // BLOCK_16X8,
+    0x0001,  // BLOCK_16X16,
+    0x0011,  // BLOCK_16X32,
+    0x0001,  // BLOCK_32X16,
+    0x0011,  // BLOCK_32X32,
+    0x1111,  // BLOCK_32X64
+    0x0011,  // BLOCK_64X32,
+    0x1111,  // BLOCK_64X64
+};
+// 16 bit above mask to shift and set for each uv prediction size.
+static const uint16_t above_prediction_mask_uv[BLOCK_SIZES] = {
+    0x0001,  // BLOCK_4X4
+    0x0001,  // BLOCK_4X8
+    0x0001,  // BLOCK_8X4
+    0x0001,  // BLOCK_8X8
+    0x0001,  // BLOCK_8X16,
+    0x0001,  // BLOCK_16X8
+    0x0001,  // BLOCK_16X16
+    0x0001,  // BLOCK_16X32,
+    0x0003,  // BLOCK_32X16,
+    0x0003,  // BLOCK_32X32,
+    0x0003,  // BLOCK_32X64,
+    0x000f,  // BLOCK_64X32,
+    0x000f,  // BLOCK_64X64
+};
+
+// 16 bit mask to shift and set for each uv prediction size.
+static const uint16_t size_mask_uv[BLOCK_SIZES] = {
+    0x0001,  // BLOCK_4X4
+    0x0001,  // BLOCK_4X8
+    0x0001,  // BLOCK_8X4
+    0x0001,  // BLOCK_8X8
+    0x0001,  // BLOCK_8X16,
+    0x0001,  // BLOCK_16X8
+    0x0001,  // BLOCK_16X16
+    0x0011,  // BLOCK_16X32,
+    0x0003,  // BLOCK_32X16,
+    0x0033,  // BLOCK_32X32,
+    0x3333,  // BLOCK_32X64,
+    0x00ff,  // BLOCK_64X32,
+    0xffff,  // BLOCK_64X64
+};
+static const uint16_t left_border_uv =  0x1111;
+static const uint16_t above_border_uv = 0x000f;
+
+
 static void lf_init_lut(loop_filter_info_n *lfi) {
   lfi->mode_lf_lut[DC_PRED] = 0;
   lfi->mode_lf_lut[D45_PRED] = 0;
   lfi->mode_lf_lut[D135_PRED] = 0;
   lfi->mode_lf_lut[D117_PRED] = 0;
   lfi->mode_lf_lut[D153_PRED] = 0;
-  lfi->mode_lf_lut[D27_PRED] = 0;
+  lfi->mode_lf_lut[D207_PRED] = 0;
   lfi->mode_lf_lut[D63_PRED] = 0;
   lfi->mode_lf_lut[V_PRED] = 0;
   lfi->mode_lf_lut[H_PRED] = 0;
@@ -39,7 +243,7 @@
   lfi->mode_lf_lut[NEWMV] = 1;
 }
 
-static void update_sharpness(loop_filter_info_n *const lfi, int sharpness_lvl) {
+static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) {
   int lvl;
 
   // For each possible value for the loop filter fill out limits
@@ -78,7 +282,7 @@
     vpx_memset(lfi->hev_thr[i], i, SIMD_WIDTH);
 }
 
-void vp9_loop_filter_frame_init(VP9_COMMON *const cm, int default_filt_lvl) {
+void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) {
   int seg_id;
   // n_shift is a multiplier for lf_deltas
   // the multiplier is 1 when filter_lvl is between 0 and 31;
@@ -124,9 +328,9 @@
   }
 }
 
-static int build_lfi(const loop_filter_info_n *const lfi_n,
-                     const MB_MODE_INFO *const mbmi,
-                     struct loop_filter_info *const lfi) {
+static int build_lfi(const loop_filter_info_n *lfi_n,
+                     const MB_MODE_INFO *mbmi,
+                     struct loop_filter_info *lfi) {
   const int seg = mbmi->segment_id;
   const int ref = mbmi->ref_frame[0];
   const int mode = lfi_n->mode_lf_lut[mbmi->mode];
@@ -236,10 +440,347 @@
   }
 }
 
-static void filter_block_plane(VP9_COMMON *const cm,
-                               struct macroblockd_plane *const plane,
-                               const MODE_INFO *mi,
-                               int mi_row, int mi_col) {
+// This function ORs into the current lfm structure where to apply loop
+// filters for the specific mi we are looking at.  It uses information
+// including the block size (32x16, 32x32, etc.), the transform size,
+// whether there were any coefficients encoded, and the loop filter
+// strength of the block we are currently looking at.  The shift arguments
+// are used to position the 1s we produce.
+// TODO(JBB): Need another function for color planes at other resolutions.
+static void build_masks(const loop_filter_info_n *const lfi_n,
+                        const MODE_INFO *mi, const int shift_y,
+                        const int shift_uv,
+                        LOOP_FILTER_MASK *lfm) {
+  const BLOCK_SIZE block_size = mi->mbmi.sb_type;
+  const TX_SIZE tx_size_y = mi->mbmi.tx_size;
+  const TX_SIZE tx_size_uv = get_uv_tx_size(&mi->mbmi);
+  const int skip = mi->mbmi.skip_coeff;
+  const int seg = mi->mbmi.segment_id;
+  const int ref = mi->mbmi.ref_frame[0];
+  const int mode = lfi_n->mode_lf_lut[mi->mbmi.mode];
+  const int filter_level = lfi_n->lvl[seg][ref][mode];
+  uint64_t *left_y = &lfm->left_y[tx_size_y];
+  uint64_t *above_y = &lfm->above_y[tx_size_y];
+  uint64_t *int_4x4_y = &lfm->int_4x4_y;
+  uint16_t *left_uv = &lfm->left_uv[tx_size_uv];
+  uint16_t *above_uv = &lfm->above_uv[tx_size_uv];
+  uint16_t *int_4x4_uv = &lfm->int_4x4_uv;
+
+  // If filter level is 0 we don't loop filter.
+  if (!filter_level)
+    return;
+
+  // These set a 1 in the current block size for the block size edges.
+  // For instance, if the block size is 32x16, we'll set:
+  //    above =   1111
+  //              0000
+  //    and
+  //    left  =   1000
+  //              1000
+  // NOTE: In this example the low bit is leftmost, so (1000) is stored as
+  //       1, not 8.
+  //
+  // The U and V masks set things the same way on a 16 bit scale.
+  //
+  *above_y |= above_prediction_mask[block_size] << shift_y;
+  *above_uv |= above_prediction_mask_uv[block_size] << shift_uv;
+  *left_y |= left_prediction_mask[block_size] << shift_y;
+  *left_uv |= left_prediction_mask_uv[block_size] << shift_uv;
+
+  // If the block has no coefficients and is not intra we skip applying
+  // the loop filter on block edges.
+  if (skip && ref > INTRA_FRAME)
+    return;
+
+  // Here we are adding a mask for the transform size.  The transform
+  // size mask is set to be correct for a 64x64 prediction block size. We
+  // mask to match the size of the block we are working on and then shift it
+  // into place.
+  *above_y |= (size_mask[block_size] &
+               above_64x64_txform_mask[tx_size_y]) << shift_y;
+  *above_uv |= (size_mask_uv[block_size] &
+                above_64x64_txform_mask_uv[tx_size_uv]) << shift_uv;
+
+  *left_y |= (size_mask[block_size] &
+              left_64x64_txform_mask[tx_size_y]) << shift_y;
+  *left_uv |= (size_mask_uv[block_size] &
+               left_64x64_txform_mask_uv[tx_size_uv]) << shift_uv;
+
+  // Here we are trying to determine what to do with the internal 4x4 block
+  // boundaries.  These differ from the 4x4 boundaries on the outside edge of
+  // an 8x8 in that the internal ones can be skipped and don't depend on
+  // the prediction block size.
+  if (tx_size_y == TX_4X4) {
+    *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffff) << shift_y;
+  }
+  if (tx_size_uv == TX_4X4) {
+    *int_4x4_uv |= (size_mask_uv[block_size] & 0xffff) << shift_uv;
+  }
+}
+
+// This function does the same thing as the one above with the exception that
+// it only affects the y masks.  It exists because for blocks < 16x16 in size,
+// we only update u and v masks on the first block.
+static void build_y_mask(const loop_filter_info_n *const lfi_n,
+                         const MODE_INFO *mi, const int shift_y,
+                         LOOP_FILTER_MASK *lfm) {
+  const BLOCK_SIZE block_size = mi->mbmi.sb_type;
+  const TX_SIZE tx_size_y = mi->mbmi.tx_size;
+  const int skip = mi->mbmi.skip_coeff;
+  const int seg = mi->mbmi.segment_id;
+  const int ref = mi->mbmi.ref_frame[0];
+  const int mode = lfi_n->mode_lf_lut[mi->mbmi.mode];
+  const int filter_level = lfi_n->lvl[seg][ref][mode];
+  uint64_t *left_y = &lfm->left_y[tx_size_y];
+  uint64_t *above_y = &lfm->above_y[tx_size_y];
+  uint64_t *int_4x4_y = &lfm->int_4x4_y;
+
+  if (!filter_level)
+    return;
+
+  *above_y |= above_prediction_mask[block_size] << shift_y;
+  *left_y |= left_prediction_mask[block_size] << shift_y;
+
+  if (skip && ref > INTRA_FRAME)
+    return;
+
+  *above_y |= (size_mask[block_size] &
+               above_64x64_txform_mask[tx_size_y]) << shift_y;
+
+  *left_y |= (size_mask[block_size] &
+              left_64x64_txform_mask[tx_size_y]) << shift_y;
+
+  if (tx_size_y == TX_4X4) {
+    *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffff) << shift_y;
+  }
+}
+
+// This function sets up the bit masks for the entire 64x64 region represented
+// by mi_row, mi_col.
+// TODO(JBB): This function only works for yv12.
+static void setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
+                       const MODE_INFO *mi, const int mode_info_stride,
+                       LOOP_FILTER_MASK *lfm) {
+  int idx_32, idx_16, idx_8;
+  const loop_filter_info_n *const lfi_n = &cm->lf_info;
+  const MODE_INFO *mip = mi;
+  const MODE_INFO *mip2 = mi;
+
+  // These are offsets to the next mi in the 64x64 block. They are what gets
+  // added to the mi pointer as we go through each loop. They let us avoid
+  // setting up special row and column counters for each index. The last step
+  // brings us back to the starting position.
+  const int offset_32[] = {4, (mode_info_stride << 2) - 4, 4,
+                           -(mode_info_stride << 2) - 4};
+  const int offset_16[] = {2, (mode_info_stride << 1) - 2, 2,
+                           -(mode_info_stride << 1) - 2};
+  const int offset[] = {1, mode_info_stride - 1, 1, -mode_info_stride - 1};
+
+  // The following arrays hold shifts that position the current block's
+  // mask over the appropriate block. For example, a left shift of 36
+  // (4 * 8 + 4) moves the bits for the final 32x32 block in the 64x64
+  // down 4 rows and right 4 columns into the appropriate spot.
+  const int shift_32_y[] = {0, 4, 32, 36};
+  const int shift_16_y[] = {0, 2, 16, 18};
+  const int shift_8_y[] = {0, 1, 8, 9};
+  const int shift_32_uv[] = {0, 2, 8, 10};
+  const int shift_16_uv[] = {0, 1, 4, 5};
+  int i;
+  const int max_rows = (mi_row + MI_BLOCK_SIZE > cm->mi_rows ?
+                        cm->mi_rows - mi_row : MI_BLOCK_SIZE);
+  const int max_cols = (mi_col + MI_BLOCK_SIZE > cm->mi_cols ?
+                        cm->mi_cols - mi_col : MI_BLOCK_SIZE);
+
+  vp9_zero(*lfm);
+
+  // TODO(jimbankoski): Try moving most of the following code into decode
+  // loop and storing lfm in the mbmi structure so that we don't have to go
+  // through the recursive loop structure multiple times.
+  switch (mip->mbmi.sb_type) {
+    case BLOCK_64X64:
+      build_masks(lfi_n, mip, 0, 0, lfm);
+      break;
+    case BLOCK_64X32:
+      build_masks(lfi_n, mip, 0, 0, lfm);
+      mip2 = mip + mode_info_stride * 4;
+      build_masks(lfi_n, mip2, 32, 8, lfm);
+      break;
+    case BLOCK_32X64:
+      build_masks(lfi_n, mip, 0, 0, lfm);
+      mip2 = mip + 4;
+      build_masks(lfi_n, mip2, 4, 2, lfm);
+      break;
+    default:
+      for (idx_32 = 0; idx_32 < 4; mip += offset_32[idx_32], ++idx_32) {
+        const int shift_y = shift_32_y[idx_32];
+        const int shift_uv = shift_32_uv[idx_32];
+        const int mi_32_col_offset = ((idx_32 & 1) << 2);
+        const int mi_32_row_offset = ((idx_32 >> 1) << 2);
+        if (mi_32_col_offset >= max_cols || mi_32_row_offset >= max_rows)
+          continue;
+        switch (mip->mbmi.sb_type) {
+          case BLOCK_32X32:
+            build_masks(lfi_n, mip, shift_y, shift_uv, lfm);
+            break;
+          case BLOCK_32X16:
+            build_masks(lfi_n, mip, shift_y, shift_uv, lfm);
+            mip2 = mip + mode_info_stride * 2;
+            build_masks(lfi_n, mip2, shift_y + 16, shift_uv + 4, lfm);
+            break;
+          case BLOCK_16X32:
+            build_masks(lfi_n, mip, shift_y, shift_uv, lfm);
+            mip2 = mip + 2;
+            build_masks(lfi_n, mip2, shift_y + 2, shift_uv + 1, lfm);
+            break;
+          default:
+            for (idx_16 = 0; idx_16 < 4; mip += offset_16[idx_16], ++idx_16) {
+              const int shift_y = shift_32_y[idx_32] + shift_16_y[idx_16];
+              const int shift_uv = shift_32_uv[idx_32] + shift_16_uv[idx_16];
+              const int mi_16_col_offset = mi_32_col_offset +
+                  ((idx_16 & 1) << 1);
+              const int mi_16_row_offset = mi_32_row_offset +
+                  ((idx_16 >> 1) << 1);
+
+              if (mi_16_col_offset >= max_cols || mi_16_row_offset >= max_rows)
+                continue;
+
+              switch (mip->mbmi.sb_type) {
+                case BLOCK_16X16:
+                  build_masks(lfi_n, mip, shift_y, shift_uv, lfm);
+                  break;
+                case BLOCK_16X8:
+                  build_masks(lfi_n, mip, shift_y, shift_uv, lfm);
+                  mip2 = mip + mode_info_stride;
+                  build_y_mask(lfi_n, mip2, shift_y + 8, lfm);
+                  break;
+                case BLOCK_8X16:
+                  build_masks(lfi_n, mip, shift_y, shift_uv, lfm);
+                  mip2 = mip + 1;
+                  build_y_mask(lfi_n, mip2, shift_y + 1, lfm);
+                  break;
+                default: {
+                  const int shift_y = shift_32_y[idx_32] +
+                                      shift_16_y[idx_16] +
+                                      shift_8_y[0];
+                  build_masks(lfi_n, mip, shift_y, shift_uv, lfm);
+                  mip += offset[0];
+                  for (idx_8 = 1; idx_8 < 4; mip += offset[idx_8], ++idx_8) {
+                    const int shift_y = shift_32_y[idx_32] +
+                                        shift_16_y[idx_16] +
+                                        shift_8_y[idx_8];
+                    const int mi_8_col_offset = mi_16_col_offset +
+                        ((idx_8 & 1));
+                    const int mi_8_row_offset = mi_16_row_offset +
+                        ((idx_8 >> 1));
+
+                    if (mi_8_col_offset >= max_cols ||
+                        mi_8_row_offset >= max_rows)
+                      continue;
+                    build_y_mask(lfi_n, mip, shift_y, lfm);
+                  }
+                  break;
+                }
+              }
+            }
+            break;
+        }
+      }
+      break;
+  }
+  // The largest loopfilter we have is 16x16, so we use the 16x16 mask
+  // for 32x32 transforms as well.
+  lfm->left_y[TX_16X16] |= lfm->left_y[TX_32X32];
+  lfm->above_y[TX_16X16] |= lfm->above_y[TX_32X32];
+  lfm->left_uv[TX_16X16] |= lfm->left_uv[TX_32X32];
+  lfm->above_uv[TX_16X16] |= lfm->above_uv[TX_32X32];
+
+  // We apply at least an 8-tap filter on every 32x32 edge even if the
+  // transform size is 4x4. So if the 4x4 bit is set on a border pixel, add
+  // it to the 8x8 mask and remove it from the 4x4 mask.
+  lfm->left_y[TX_8X8] |= lfm->left_y[TX_4X4] & left_border;
+  lfm->left_y[TX_4X4] &= ~left_border;
+  lfm->above_y[TX_8X8] |= lfm->above_y[TX_4X4] & above_border;
+  lfm->above_y[TX_4X4] &= ~above_border;
+  lfm->left_uv[TX_8X8] |= lfm->left_uv[TX_4X4] & left_border_uv;
+  lfm->left_uv[TX_4X4] &= ~left_border_uv;
+  lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_4X4] & above_border_uv;
+  lfm->above_uv[TX_4X4] &= ~above_border_uv;
+
+  // We do some special edge handling.
+  if (mi_row + MI_BLOCK_SIZE > cm->mi_rows) {
+    const uint64_t rows = cm->mi_rows - mi_row;
+
+    // Each pixel inside the border gets a 1.
+    const uint64_t mask_y = (((uint64_t) 1 << (rows << 3)) - 1);
+    const uint16_t mask_uv = (((uint16_t) 1 << (((rows + 1) >> 1) << 2)) - 1);
+
+    // Remove values completely outside our border.
+    for (i = 0; i < TX_32X32; i++) {
+      lfm->left_y[i] &= mask_y;
+      lfm->above_y[i] &= mask_y;
+      lfm->left_uv[i] &= mask_uv;
+      lfm->above_uv[i] &= mask_uv;
+    }
+    lfm->int_4x4_y &= mask_y;
+    lfm->int_4x4_uv &= mask_uv;
+
+    // We don't apply a wide loop filter on the last uv block row. If it is
+    // set, apply the shorter one instead.
+    if (rows == 1) {
+      lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_16X16];
+      lfm->above_uv[TX_16X16] = 0;
+    }
+    if (rows == 5) {
+      lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_16X16] & 0xff00;
+      lfm->above_uv[TX_16X16] &= ~(lfm->above_uv[TX_16X16] & 0xff00);
+    }
+  }
+
+  if (mi_col + MI_BLOCK_SIZE > cm->mi_cols) {
+    const uint64_t columns = cm->mi_cols - mi_col;
+
+    // Each pixel inside the border gets a 1; the multiply copies that
+    // single-row pattern to every row where we need it.
+    const uint64_t mask_y  = ((1 << columns) - 1) * 0x0101010101010101ULL;
+    const uint16_t mask_uv = ((1 << ((columns + 1) >> 1)) - 1) * 0x1111;
+
+    // Internal edges are not filtered on the last column of the image, so
+    // we mask off one more column for the internal edges.
+    const uint16_t mask_uv_int = ((1 << (columns >> 1)) - 1) * 0x1111;
+
+    // Remove the bits outside the image edge.
+    for (i = 0; i < TX_32X32; i++) {
+      lfm->left_y[i] &= mask_y;
+      lfm->above_y[i] &= mask_y;
+      lfm->left_uv[i] &= mask_uv;
+      lfm->above_uv[i] &= mask_uv;
+    }
+    lfm->int_4x4_y &= mask_y;
+    lfm->int_4x4_uv &= mask_uv_int;
+
+    // We don't apply a wide loop filter on the last uv column. If it is
+    // set, apply the shorter one instead.
+    if (columns == 1) {
+      lfm->left_uv[TX_8X8] |= lfm->left_uv[TX_16X16];
+      lfm->left_uv[TX_16X16] = 0;
+    }
+    if (columns == 5) {
+      lfm->left_uv[TX_8X8] |= (lfm->left_uv[TX_16X16] & 0xcccc);
+      lfm->left_uv[TX_16X16] &= ~(lfm->left_uv[TX_16X16] & 0xcccc);
+    }
+  }
+  // We don't apply a loop filter to the first column in the image.
+  // Mask that out.
+  if (mi_col == 0) {
+    for (i = 0; i < TX_32X32; i++) {
+      lfm->left_y[i] &= 0xfefefefefefefefe;
+      lfm->left_uv[i] &= 0xeeee;
+    }
+  }
+}
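
A standalone sketch of the border-trimming arithmetic used in the two edge-handling blocks above, assuming the same 8-bits-per-row mask layout (not part of the patch):

```c
#include <inttypes.h>
#include <stdio.h>

int main(void) {
  /* Suppose only 3 MI rows and 5 MI columns of the 64x64 region fall
   * inside the image. */
  const int rows = 3, columns = 5;

  /* Rows: each mask row is 8 bits, so keep the low rows * 8 bits. */
  const uint64_t mask_rows = ((uint64_t)1 << (rows << 3)) - 1;

  /* Columns: build one row's worth of 1s, then replicate it into all
   * eight rows at once; multiplying by 0x0101010101010101 copies the
   * low byte into every byte. */
  const uint64_t mask_cols =
      (uint64_t)((1 << columns) - 1) * 0x0101010101010101ULL;

  printf("row mask = 0x%016" PRIx64 "\n", mask_rows);  /* 0x0000000000ffffff */
  printf("col mask = 0x%016" PRIx64 "\n", mask_cols);  /* 0x1f1f1f1f1f1f1f1f */
  return 0;
}
```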
+static void filter_block_plane_non420(VP9_COMMON *cm,
+                                      struct macroblockd_plane *plane,
+                                      const MODE_INFO *mi,
+                                      int mi_row, int mi_col) {
   const int ss_x = plane->subsampling_x;
   const int ss_y = plane->subsampling_y;
   const int row_step = 1 << ss_x;
@@ -274,7 +815,7 @@
       const int skip_this_r = skip_this && !block_edge_above;
       const TX_SIZE tx_size = (plane->plane_type == PLANE_TYPE_UV)
                             ? get_uv_tx_size(&mi[c].mbmi)
-                            : mi[c].mbmi.txfm_size;
+                            : mi[c].mbmi.tx_size;
       const int skip_border_4x4_c = ss_x && mi_col + c == cm->mi_cols - 1;
       const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
 
@@ -356,11 +897,92 @@
   }
 }
 
+static void filter_block_plane(VP9_COMMON *const cm,
+                               struct macroblockd_plane *const plane,
+                               const MODE_INFO *mi,
+                               int mi_row, int mi_col,
+                               LOOP_FILTER_MASK *lfm) {
+  const int ss_x = plane->subsampling_x;
+  const int ss_y = plane->subsampling_y;
+  const int row_step = 1 << ss_x;
+  const int col_step = 1 << ss_y;
+  const int row_step_stride = cm->mode_info_stride * row_step;
+  struct buf_2d *const dst = &plane->dst;
+  uint8_t* const dst0 = dst->buf;
+  unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0};
+  struct loop_filter_info lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE];
+  int r, c;
+  int row_shift = 3 - ss_x;
+  int row_mask = 0xff >> (ss_x << 2);
+
+#define MASK_ROW(value) (((value) >> (r_sampled << row_shift)) & row_mask)
+
+  for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
+    int r_sampled = r >> ss_x;
+
+    // Determine the vertical edges that need filtering
+    for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) {
+      if (!build_lfi(&cm->lf_info, &mi[c].mbmi, lfi[r] + (c >> ss_x)))
+        continue;
+    }
+    if (!plane->plane_type) {
+      mask_4x4_int[r] = MASK_ROW(lfm->int_4x4_y);
+      // Disable filtering on the leftmost column
+      filter_selectively_vert(dst->buf, dst->stride,
+                              MASK_ROW(lfm->left_y[TX_16X16]),
+                              MASK_ROW(lfm->left_y[TX_8X8]),
+                              MASK_ROW(lfm->left_y[TX_4X4]),
+                              MASK_ROW(lfm->int_4x4_y),
+                              lfi[r]);
+    } else {
+      mask_4x4_int[r] = MASK_ROW(lfm->int_4x4_uv);
+      // Disable filtering on the leftmost column
+      filter_selectively_vert(dst->buf, dst->stride,
+                              MASK_ROW(lfm->left_uv[TX_16X16]),
+                              MASK_ROW(lfm->left_uv[TX_8X8]),
+                              MASK_ROW(lfm->left_uv[TX_4X4]),
+                              MASK_ROW(lfm->int_4x4_uv),
+                              lfi[r]);
+    }
+    dst->buf += 8 * dst->stride;
+    mi += row_step_stride;
+  }
+
+  // Now do horizontal pass
+  dst->buf = dst0;
+  for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
+    const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
+    const unsigned int mask_4x4_int_r = skip_border_4x4_r ? 0 : mask_4x4_int[r];
+    int r_sampled = r >> ss_x;
+
+    if (!plane->plane_type) {
+      filter_selectively_horiz(dst->buf, dst->stride,
+                               MASK_ROW(lfm->above_y[TX_16X16]),
+                               MASK_ROW(lfm->above_y[TX_8X8]),
+                               MASK_ROW(lfm->above_y[TX_4X4]),
+                               MASK_ROW(lfm->int_4x4_y),
+                               mi_row + r == 0, lfi[r]);
+    } else {
+      filter_selectively_horiz(dst->buf, dst->stride,
+                               MASK_ROW(lfm->above_uv[TX_16X16]),
+                               MASK_ROW(lfm->above_uv[TX_8X8]),
+                               MASK_ROW(lfm->above_uv[TX_4X4]),
+                               mask_4x4_int_r,
+                               mi_row + r == 0, lfi[r]);
+    }
+    dst->buf += 8 * dst->stride;
+  }
+#undef MASK_ROW
+}
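
A standalone sketch of the MASK_ROW extraction above, assuming the same packed layouts (64-bit, 8 bits per row for Y; 16-bit, 4 bits per row for 4:2:0 UV); the helper is hypothetical:

```c
#include <inttypes.h>
#include <stdio.h>

/* Hypothetical stand-alone version of MASK_ROW: pull one row of filter
 * bits out of a packed mask.  For luma (ss = 0) a row is 8 bits of the
 * 64-bit mask; for 4:2:0 chroma (ss = 1) it is 4 bits of the 16-bit
 * mask. */
static unsigned mask_row(uint64_t mask, int r_sampled, int ss) {
  const int row_shift = 3 - ss;                 /* 8 or 4 bits per row */
  const unsigned row_mask = 0xff >> (ss << 2);  /* 0xff or 0x0f */
  return (unsigned)(mask >> (r_sampled << row_shift)) & row_mask;
}

int main(void) {
  /* A Y mask with the left column set in every row. */
  const uint64_t left_y = 0x0101010101010101ULL;
  int r;
  for (r = 0; r < 8; ++r)
    printf("y row %d: 0x%02x\n", r, mask_row(left_y, r, 0));  /* 0x01 */
  /* A UV mask (4 bits per row): row 1 of 0x1111 is 0x1. */
  printf("uv row 1: 0x%x\n", mask_row(0x1111, 1, 1));
  return 0;
}
```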
+
 void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer,
                           VP9_COMMON *cm, MACROBLOCKD *xd,
                           int start, int stop, int y_only) {
   const int num_planes = y_only ? 1 : MAX_MB_PLANE;
   int mi_row, mi_col;
+  LOOP_FILTER_MASK lfm;
+  int use_420 = y_only || (xd->plane[1].subsampling_y == 1 &&
+      xd->plane[1].subsampling_x == 1);
 
   for (mi_row = start; mi_row < stop; mi_row += MI_BLOCK_SIZE) {
     MODE_INFO* const mi = cm->mi + mi_row * cm->mode_info_stride;
@@ -369,8 +991,18 @@
       int plane;
 
       setup_dst_planes(xd, frame_buffer, mi_row, mi_col);
+
+      // TODO(JBB): Make setup_mask work for non-420.
+      if (use_420)
+        setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mode_info_stride, &lfm);
+
       for (plane = 0; plane < num_planes; ++plane) {
-        filter_block_plane(cm, &xd->plane[plane], mi + mi_col, mi_row, mi_col);
+        if (use_420)
+          filter_block_plane(cm, &xd->plane[plane], mi + mi_col, mi_row, mi_col,
+                             &lfm);
+        else
+          filter_block_plane_non420(cm, &xd->plane[plane], mi + mi_col,
+                                    mi_row, mi_col);
       }
     }
   }
diff --git a/vp9/common/vp9_mvref_common.c b/vp9/common/vp9_mvref_common.c
index e27ba43..d8381ec 100644
--- a/vp9/common/vp9_mvref_common.c
+++ b/vp9/common/vp9_mvref_common.c
@@ -36,7 +36,7 @@
   9,  // D135_PRED
   9,  // D117_PRED
   9,  // D153_PRED
-  9,  // D27_PRED
+  9,  // D207_PRED
   9,  // D63_PRED
   9,  // TM_PRED
   0,  // NEARESTMV
diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c
index 859c99e..48d3d2d 100644
--- a/vp9/common/vp9_postproc.c
+++ b/vp9/common/vp9_postproc.c
@@ -630,21 +630,21 @@
   }
 }
 
-int vp9_post_proc_frame(struct VP9Common *oci,
+int vp9_post_proc_frame(struct VP9Common *cm,
                         YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *ppflags) {
-  int q = oci->lf.filter_level * 10 / 6;
+  int q = cm->lf.filter_level * 10 / 6;
   int flags = ppflags->post_proc_flag;
   int deblock_level = ppflags->deblocking_level;
   int noise_level = ppflags->noise_level;
 
-  if (!oci->frame_to_show)
+  if (!cm->frame_to_show)
     return -1;
 
   if (q > 63)
     q = 63;
 
   if (!flags) {
-    *dest = *oci->frame_to_show;
+    *dest = *cm->frame_to_show;
     return 0;
   }
 
@@ -653,52 +653,52 @@
 #endif
 
   if (flags & VP9D_DEMACROBLOCK) {
-    deblock_and_de_macro_block(oci->frame_to_show, &oci->post_proc_buffer,
+    deblock_and_de_macro_block(cm->frame_to_show, &cm->post_proc_buffer,
                                q + (deblock_level - 5) * 10, 1, 0);
   } else if (flags & VP9D_DEBLOCK) {
-    vp9_deblock(oci->frame_to_show, &oci->post_proc_buffer, q);
+    vp9_deblock(cm->frame_to_show, &cm->post_proc_buffer, q);
   } else {
-    vp8_yv12_copy_frame(oci->frame_to_show, &oci->post_proc_buffer);
+    vp8_yv12_copy_frame(cm->frame_to_show, &cm->post_proc_buffer);
   }
 
   if (flags & VP9D_ADDNOISE) {
-    if (oci->postproc_state.last_q != q
-        || oci->postproc_state.last_noise != noise_level) {
-      fillrd(&oci->postproc_state, 63 - q, noise_level);
+    if (cm->postproc_state.last_q != q
+        || cm->postproc_state.last_noise != noise_level) {
+      fillrd(&cm->postproc_state, 63 - q, noise_level);
     }
 
-    vp9_plane_add_noise(oci->post_proc_buffer.y_buffer,
-                        oci->postproc_state.noise,
-                        oci->postproc_state.blackclamp,
-                        oci->postproc_state.whiteclamp,
-                        oci->postproc_state.bothclamp,
-                        oci->post_proc_buffer.y_width,
-                        oci->post_proc_buffer.y_height,
-                        oci->post_proc_buffer.y_stride);
+    vp9_plane_add_noise(cm->post_proc_buffer.y_buffer,
+                        cm->postproc_state.noise,
+                        cm->postproc_state.blackclamp,
+                        cm->postproc_state.whiteclamp,
+                        cm->postproc_state.bothclamp,
+                        cm->post_proc_buffer.y_width,
+                        cm->post_proc_buffer.y_height,
+                        cm->post_proc_buffer.y_stride);
   }
 
 #if 0 && CONFIG_POSTPROC_VISUALIZER
   if (flags & VP9D_DEBUG_TXT_FRAME_INFO) {
     char message[512];
     sprintf(message, "F%1dG%1dQ%3dF%3dP%d_s%dx%d",
-            (oci->frame_type == KEY_FRAME),
-            oci->refresh_golden_frame,
-            oci->base_qindex,
-            oci->filter_level,
+            (cm->frame_type == KEY_FRAME),
+            cm->refresh_golden_frame,
+            cm->base_qindex,
+            cm->filter_level,
             flags,
-            oci->mb_cols, oci->mb_rows);
-    vp9_blit_text(message, oci->post_proc_buffer.y_buffer,
-                  oci->post_proc_buffer.y_stride);
+            cm->mb_cols, cm->mb_rows);
+    vp9_blit_text(message, cm->post_proc_buffer.y_buffer,
+                  cm->post_proc_buffer.y_stride);
   }
 
   if (flags & VP9D_DEBUG_TXT_MBLK_MODES) {
     int i, j;
     uint8_t *y_ptr;
-    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+    YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer;
     int mb_rows = post->y_height >> 4;
     int mb_cols = post->y_width  >> 4;
     int mb_index = 0;
-    MODE_INFO *mi = oci->mi;
+    MODE_INFO *mi = cm->mi;
 
     y_ptr = post->y_buffer + 4 * post->y_stride + 4;
 
@@ -723,11 +723,11 @@
   if (flags & VP9D_DEBUG_TXT_DC_DIFF) {
     int i, j;
     uint8_t *y_ptr;
-    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+    YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer;
     int mb_rows = post->y_height >> 4;
     int mb_cols = post->y_width  >> 4;
     int mb_index = 0;
-    MODE_INFO *mi = oci->mi;
+    MODE_INFO *mi = cm->mi;
 
     y_ptr = post->y_buffer + 4 * post->y_stride + 4;
 
@@ -739,7 +739,7 @@
                         mi[mb_index].mbmi.mode != SPLITMV &&
                         mi[mb_index].mbmi.skip_coeff);
 
-        if (oci->frame_type == KEY_FRAME)
+        if (cm->frame_type == KEY_FRAME)
           sprintf(zz, "a");
         else
           sprintf(zz, "%c", dc_diff + '0');
@@ -759,19 +759,19 @@
     char message[512];
     snprintf(message, sizeof(message),
              "Bitrate: %10.2f framerate: %10.2f ",
-             oci->bitrate, oci->framerate);
-    vp9_blit_text(message, oci->post_proc_buffer.y_buffer,
-                  oci->post_proc_buffer.y_stride);
+             cm->bitrate, cm->framerate);
+    vp9_blit_text(message, cm->post_proc_buffer.y_buffer,
+                  cm->post_proc_buffer.y_stride);
   }
 
   /* Draw motion vectors */
   if ((flags & VP9D_DEBUG_DRAW_MV) && ppflags->display_mv_flag) {
-    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+    YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer;
     int width  = post->y_width;
     int height = post->y_height;
-    uint8_t *y_buffer = oci->post_proc_buffer.y_buffer;
-    int y_stride = oci->post_proc_buffer.y_stride;
-    MODE_INFO *mi = oci->mi;
+    uint8_t *y_buffer = cm->post_proc_buffer.y_buffer;
+    int y_stride = cm->post_proc_buffer.y_stride;
+    MODE_INFO *mi = cm->mi;
     int x0, y0;
 
     for (y0 = 0; y0 < height; y0 += 16) {
@@ -880,7 +880,7 @@
               }
             }
           }
-        } else if (mi->mbmi.mode >= NEARESTMV) {
+        } else if (is_inter_mode(mi->mbmi.mode)) {
           MV *mv = &mi->mbmi.mv.as_mv;
           const int lx0 = x0 + 8;
           const int ly0 = y0 + 8;
@@ -908,14 +908,14 @@
   if ((flags & VP9D_DEBUG_CLR_BLK_MODES)
       && (ppflags->display_mb_modes_flag || ppflags->display_b_modes_flag)) {
     int y, x;
-    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+    YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer;
     int width  = post->y_width;
     int height = post->y_height;
-    uint8_t *y_ptr = oci->post_proc_buffer.y_buffer;
-    uint8_t *u_ptr = oci->post_proc_buffer.u_buffer;
-    uint8_t *v_ptr = oci->post_proc_buffer.v_buffer;
-    int y_stride = oci->post_proc_buffer.y_stride;
-    MODE_INFO *mi = oci->mi;
+    uint8_t *y_ptr = cm->post_proc_buffer.y_buffer;
+    uint8_t *u_ptr = cm->post_proc_buffer.u_buffer;
+    uint8_t *v_ptr = cm->post_proc_buffer.v_buffer;
+    int y_stride = cm->post_proc_buffer.y_stride;
+    MODE_INFO *mi = cm->mi;
 
     for (y = 0; y < height; y += 16) {
       for (x = 0; x < width; x += 16) {
@@ -973,14 +973,14 @@
   if ((flags & VP9D_DEBUG_CLR_FRM_REF_BLKS) &&
       ppflags->display_ref_frame_flag) {
     int y, x;
-    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+    YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer;
     int width  = post->y_width;
     int height = post->y_height;
-    uint8_t *y_ptr = oci->post_proc_buffer.y_buffer;
-    uint8_t *u_ptr = oci->post_proc_buffer.u_buffer;
-    uint8_t *v_ptr = oci->post_proc_buffer.v_buffer;
-    int y_stride = oci->post_proc_buffer.y_stride;
-    MODE_INFO *mi = oci->mi;
+    uint8_t *y_ptr = cm->post_proc_buffer.y_buffer;
+    uint8_t *u_ptr = cm->post_proc_buffer.u_buffer;
+    uint8_t *v_ptr = cm->post_proc_buffer.v_buffer;
+    int y_stride = cm->post_proc_buffer.y_stride;
+    MODE_INFO *mi = cm->mi;
 
     for (y = 0; y < height; y += 16) {
       for (x = 0; x < width; x += 16) {
@@ -1006,11 +1006,11 @@
   }
 #endif
 
-  *dest = oci->post_proc_buffer;
+  *dest = cm->post_proc_buffer;
 
   /* handle problem with extending borders */
-  dest->y_width = oci->width;
-  dest->y_height = oci->height;
+  dest->y_width = cm->width;
+  dest->y_height = cm->height;
   dest->uv_height = dest->y_height / 2;
 
   return 0;
diff --git a/vp9/common/vp9_postproc.h b/vp9/common/vp9_postproc.h
index 759855f..c63beae 100644
--- a/vp9/common/vp9_postproc.h
+++ b/vp9/common/vp9_postproc.h
@@ -26,7 +26,7 @@
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_ppflags.h"
 
-int vp9_post_proc_frame(struct VP9Common *oci,
+int vp9_post_proc_frame(struct VP9Common *cm,
                         YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *flags);
 
 void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q);
diff --git a/vp9/common/vp9_pred_common.c b/vp9/common/vp9_pred_common.c
index 97ccb13..494cea7 100644
--- a/vp9/common/vp9_pred_common.c
+++ b/vp9/common/vp9_pred_common.c
@@ -90,25 +90,24 @@
   // left of the entries corresponding to real macroblocks.
   // The prediction flags in these dummy entries are initialised to 0.
   if (above_in_image && left_in_image) {  // both edges available
-    if (above_mbmi->ref_frame[1] <= INTRA_FRAME &&
-        left_mbmi->ref_frame[1] <= INTRA_FRAME)
+    if (!has_second_ref(above_mbmi) && !has_second_ref(left_mbmi))
       // neither edge uses comp pred (0/1)
       pred_context = (above_mbmi->ref_frame[0] == cm->comp_fixed_ref) ^
                      (left_mbmi->ref_frame[0] == cm->comp_fixed_ref);
-    else if (above_mbmi->ref_frame[1] <= INTRA_FRAME)
+    else if (!has_second_ref(above_mbmi))
       // one of two edges uses comp pred (2/3)
       pred_context = 2 + (above_mbmi->ref_frame[0] == cm->comp_fixed_ref ||
-                          above_mbmi->ref_frame[0] == INTRA_FRAME);
-    else if (left_mbmi->ref_frame[1] <= INTRA_FRAME)
+                          !is_inter_block(above_mbmi));
+    else if (!has_second_ref(left_mbmi))
       // one of two edges uses comp pred (2/3)
       pred_context = 2 + (left_mbmi->ref_frame[0] == cm->comp_fixed_ref ||
-                          left_mbmi->ref_frame[0] == INTRA_FRAME);
+                          !is_inter_block(left_mbmi));
     else  // both edges use comp pred (4)
       pred_context = 4;
   } else if (above_in_image || left_in_image) {  // one edge available
     const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
 
-    if (edge_mbmi->ref_frame[1] <= INTRA_FRAME)
+    if (!has_second_ref(edge_mbmi))
       // edge does not use comp pred (0/1)
       pred_context = edge_mbmi->ref_frame[0] == cm->comp_fixed_ref;
     else
@@ -146,14 +145,14 @@
     } else if (above_intra || left_intra) {  // intra/inter
       const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
 
-      if (edge_mbmi->ref_frame[1] <= INTRA_FRAME)  // single pred (1/3)
+      if (!has_second_ref(edge_mbmi))  // single pred (1/3)
         pred_context = 1 + 2 * (edge_mbmi->ref_frame[0] != cm->comp_var_ref[1]);
       else  // comp pred (1/3)
         pred_context = 1 + 2 * (edge_mbmi->ref_frame[var_ref_idx]
                                     != cm->comp_var_ref[1]);
     } else {  // inter/inter
-      int l_sg = left_mbmi->ref_frame[1] <= INTRA_FRAME;
-      int a_sg = above_mbmi->ref_frame[1] <= INTRA_FRAME;
+      const int l_sg = !has_second_ref(left_mbmi);
+      const int a_sg = !has_second_ref(above_mbmi);
       MV_REFERENCE_FRAME vrfa = a_sg ? above_mbmi->ref_frame[0]
                                      : above_mbmi->ref_frame[var_ref_idx];
       MV_REFERENCE_FRAME vrfl = l_sg ? left_mbmi->ref_frame[0]
@@ -187,13 +186,15 @@
   } else if (above_in_image || left_in_image) {  // one edge available
     const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
 
-    if (edge_mbmi->ref_frame[0] == INTRA_FRAME)
+    if (!is_inter_block(edge_mbmi)) {
       pred_context = 2;
-    else if (edge_mbmi->ref_frame[1] > INTRA_FRAME)
-      pred_context = 4 * (edge_mbmi->ref_frame[var_ref_idx]
+    } else {
+      if (has_second_ref(edge_mbmi))
+        pred_context = 4 * (edge_mbmi->ref_frame[var_ref_idx]
                               != cm->comp_var_ref[1]);
-    else
-      pred_context = 3 * (edge_mbmi->ref_frame[0] != cm->comp_var_ref[1]);
+      else
+        pred_context = 3 * (edge_mbmi->ref_frame[0] != cm->comp_var_ref[1]);
+    }
   } else {  // no edges available (2)
     pred_context = 2;
   }
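
This hunk swaps raw `ref_frame[1] <= INTRA_FRAME` comparisons for the `has_second_ref()` and `is_inter_block()` helpers. A standalone sketch of the semantics those helpers are assumed to have (simplified stand-in types, matching the comparisons they replace):

```c
#include <stdio.h>

/* Simplified stand-ins for the vp9 types, for illustration only. */
typedef enum {
  NONE = -1, INTRA_FRAME = 0, LAST_FRAME = 1,
  GOLDEN_FRAME = 2, ALTREF_FRAME = 3
} MV_REFERENCE_FRAME;

typedef struct { MV_REFERENCE_FRAME ref_frame[2]; } MB_MODE_INFO;

/* Assumed helper semantics, mirroring the comparisons they replace. */
static int is_inter_block(const MB_MODE_INFO *mbmi) {
  return mbmi->ref_frame[0] > INTRA_FRAME;
}
static int has_second_ref(const MB_MODE_INFO *mbmi) {
  return mbmi->ref_frame[1] > INTRA_FRAME;
}

int main(void) {
  const MB_MODE_INFO intra = {{INTRA_FRAME, NONE}};
  const MB_MODE_INFO single = {{LAST_FRAME, NONE}};
  const MB_MODE_INFO comp = {{LAST_FRAME, ALTREF_FRAME}};
  printf("intra:  inter=%d comp=%d\n", is_inter_block(&intra),
         has_second_ref(&intra));    /* 0 0 */
  printf("single: inter=%d comp=%d\n", is_inter_block(&single),
         has_second_ref(&single));   /* 1 0 */
  printf("comp:   inter=%d comp=%d\n", is_inter_block(&comp),
         has_second_ref(&comp));     /* 1 1 */
  return 0;
}
```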
@@ -368,11 +369,11 @@
 
   if (above_in_image)
     above_context = above_mbmi->skip_coeff ? max_tx_size
-                                              : above_mbmi->txfm_size;
+                                           : above_mbmi->tx_size;
 
   if (left_in_image)
     left_context = left_mbmi->skip_coeff ? max_tx_size
-                                            : left_mbmi->txfm_size;
+                                         : left_mbmi->tx_size;
 
   if (!left_in_image)
     left_context = above_context;
@@ -383,7 +384,7 @@
   return above_context + left_context > max_tx_size;
 }
 
-void vp9_set_pred_flag_seg_id(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize,
+void vp9_set_pred_flag_seg_id(VP9_COMMON *cm, BLOCK_SIZE bsize,
                               int mi_row, int mi_col, uint8_t pred_flag) {
   MODE_INFO *mi = &cm->mi[mi_row * cm->mode_info_stride + mi_col];
   const int bw = 1 << mi_width_log2(bsize);
@@ -397,7 +398,7 @@
       mi[y * cm->mode_info_stride + x].mbmi.seg_id_predicted = pred_flag;
 }
 
-void vp9_set_pred_flag_mbskip(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize,
+void vp9_set_pred_flag_mbskip(VP9_COMMON *cm, BLOCK_SIZE bsize,
                               int mi_row, int mi_col, uint8_t pred_flag) {
   MODE_INFO *mi = &cm->mi[mi_row * cm->mode_info_stride + mi_col];
   const int bw = 1 << mi_width_log2(bsize);
@@ -412,7 +413,7 @@
 }
 
 int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids,
-                       BLOCK_SIZE_TYPE bsize, int mi_row, int mi_col) {
+                       BLOCK_SIZE bsize, int mi_row, int mi_col) {
   const int mi_offset = mi_row * cm->mi_cols + mi_col;
   const int bw = 1 << mi_width_log2(bsize);
   const int bh = 1 << mi_height_log2(bsize);
diff --git a/vp9/common/vp9_pred_common.h b/vp9/common/vp9_pred_common.h
index c01d394..89e1356 100644
--- a/vp9/common/vp9_pred_common.h
+++ b/vp9/common/vp9_pred_common.h
@@ -15,7 +15,7 @@
 #include "vp9/common/vp9_onyxc_int.h"
 
 int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids,
-                       BLOCK_SIZE_TYPE bsize, int mi_row, int mi_col);
+                       BLOCK_SIZE bsize, int mi_row, int mi_col);
 
 
 static INLINE int vp9_get_pred_context_seg_id(const MACROBLOCKD *xd) {
@@ -32,7 +32,7 @@
   return seg->pred_probs[vp9_get_pred_context_seg_id(xd)];
 }
 
-void vp9_set_pred_flag_seg_id(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize,
+void vp9_set_pred_flag_seg_id(VP9_COMMON *cm, BLOCK_SIZE bsize,
                               int mi_row, int mi_col, uint8_t pred_flag);
 
 static INLINE int vp9_get_pred_context_mbskip(const MACROBLOCKD *xd) {
@@ -53,7 +53,7 @@
   return xd->mode_info_context->mbmi.skip_coeff;
 }
 
-void vp9_set_pred_flag_mbskip(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize,
+void vp9_set_pred_flag_mbskip(VP9_COMMON *cm, BLOCK_SIZE bsize,
                               int mi_row, int mi_col, uint8_t pred_flag);
 
 unsigned char vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd);
@@ -103,7 +103,7 @@
 
 unsigned char vp9_get_pred_context_tx_size(const MACROBLOCKD *xd);
 
-static const vp9_prob *get_tx_probs(BLOCK_SIZE_TYPE bsize, uint8_t context,
+static const vp9_prob *get_tx_probs(BLOCK_SIZE bsize, uint8_t context,
                                     const struct tx_probs *tx_probs) {
   if (bsize < BLOCK_16X16)
     return tx_probs->p8x8[context];
@@ -115,12 +115,12 @@
 
 static const vp9_prob *get_tx_probs2(const MACROBLOCKD *xd,
                                      const struct tx_probs *tx_probs) {
-  const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
+  const BLOCK_SIZE bsize = xd->mode_info_context->mbmi.sb_type;
   const int context = vp9_get_pred_context_tx_size(xd);
   return get_tx_probs(bsize, context, tx_probs);
 }
 
-static void update_tx_counts(BLOCK_SIZE_TYPE bsize, uint8_t context,
+static void update_tx_counts(BLOCK_SIZE bsize, uint8_t context,
                              TX_SIZE tx_size, struct tx_counts *tx_counts) {
   if (bsize >= BLOCK_32X32)
     tx_counts->p32x32[context][tx_size]++;
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index 89c2aa8..88bba3a 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -117,8 +117,7 @@
   int x, y;
 };
 
-static void build_inter_predictors(int plane, int block,
-                                   BLOCK_SIZE_TYPE bsize,
+static void build_inter_predictors(int plane, int block, BLOCK_SIZE bsize,
                                    int pred_w, int pred_h,
                                    void *argv) {
   const struct build_inter_predictors_args* const arg = argv;
@@ -174,14 +173,14 @@
 }
 
 // TODO(jkoleszar): In principle, pred_w, pred_h are unnecessary, as we could
-// calculate the subsampled BLOCK_SIZE_TYPE, but that type isn't defined for
+// calculate the subsampled BLOCK_SIZE, but that type isn't defined for
 // sizes smaller than 16x16 yet.
 typedef void (*foreach_predicted_block_visitor)(int plane, int block,
-                                                BLOCK_SIZE_TYPE bsize,
+                                                BLOCK_SIZE bsize,
                                                 int pred_w, int pred_h,
                                                 void *arg);
 static INLINE void foreach_predicted_block_in_plane(
-    const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize, int plane,
+    const MACROBLOCKD* const xd, BLOCK_SIZE bsize, int plane,
     foreach_predicted_block_visitor visit, void *arg) {
   int i, x, y;
 
@@ -216,8 +215,7 @@
   }
 }
 
-static void build_inter_predictors_for_planes(MACROBLOCKD *xd,
-                                              BLOCK_SIZE_TYPE bsize,
+static void build_inter_predictors_for_planes(MACROBLOCKD *xd, BLOCK_SIZE bsize,
                                               int mi_row, int mi_col,
                                               int plane_from, int plane_to) {
   int plane;
@@ -231,16 +229,16 @@
 }
 
 void vp9_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col,
-                                    BLOCK_SIZE_TYPE bsize) {
+                                    BLOCK_SIZE bsize) {
   build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 0, 0);
 }
 void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col,
-                                     BLOCK_SIZE_TYPE bsize) {
+                                     BLOCK_SIZE bsize) {
   build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 1,
                                     MAX_MB_PLANE - 1);
 }
 void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
-                                   BLOCK_SIZE_TYPE bsize) {
+                                   BLOCK_SIZE bsize) {
   build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 0,
                                     MAX_MB_PLANE - 1);
 }
diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h
index 82c0796..504b793 100644
--- a/vp9/common/vp9_reconinter.h
+++ b/vp9/common/vp9_reconinter.h
@@ -15,19 +15,14 @@
 #include "vp9/common/vp9_onyxc_int.h"
 
 struct subpix_fn_table;
-void vp9_build_inter_predictors_sby(MACROBLOCKD *xd,
-                                    int mb_row,
-                                    int mb_col,
-                                    BLOCK_SIZE_TYPE bsize);
+void vp9_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                    BLOCK_SIZE bsize);
 
-void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd,
-                                     int mb_row,
-                                     int mb_col,
-                                     BLOCK_SIZE_TYPE bsize);
+void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                     BLOCK_SIZE bsize);
 
-void vp9_build_inter_predictors_sb(MACROBLOCKD *mb,
-                                   int mb_row, int mb_col,
-                                   BLOCK_SIZE_TYPE bsize);
+void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                   BLOCK_SIZE bsize);
 
 void vp9_setup_interp_filters(MACROBLOCKD *xd,
                               INTERPOLATIONFILTERTYPE filter,
diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c
index 2a1bf5c..4a451b9 100644
--- a/vp9/common/vp9_reconintra.c
+++ b/vp9/common/vp9_reconintra.c
@@ -26,7 +26,7 @@
     ADST_ADST,  // D135
     ADST_DCT,   // D117
     DCT_ADST,   // D153
-    DCT_ADST,   // D27
+    DCT_ADST,   // D207
     ADST_DCT,   // D63
     ADST_ADST,  // TM
     DCT_DCT,    // NEARESTMV
@@ -297,7 +297,7 @@
 
   intra_pred_allsizes(pred[V_PRED], v);
   intra_pred_allsizes(pred[H_PRED], h);
-  intra_pred_allsizes(pred[D27_PRED], d207);
+  intra_pred_allsizes(pred[D207_PRED], d207);
   intra_pred_allsizes(pred[D45_PRED], d45);
   intra_pred_allsizes(pred[D63_PRED], d63);
   intra_pred_allsizes(pred[D117_PRED], d117);
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 30c1b26..104db6a 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -301,7 +301,7 @@
 specialize vp9_short_idct4x4_add sse2 neon
 
 prototype void vp9_short_idct8x8_1_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct8x8_1_add sse2
+specialize vp9_short_idct8x8_1_add sse2 neon
 
 prototype void vp9_short_idct8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_short_idct8x8_add sse2 neon
@@ -310,7 +310,7 @@
 specialize vp9_short_idct10_8x8_add sse2 neon
 
 prototype void vp9_short_idct16x16_1_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct16x16_1_add sse2
+specialize vp9_short_idct16x16_1_add sse2 neon
 
 prototype void vp9_short_idct16x16_add "int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_short_idct16x16_add sse2 neon
@@ -701,7 +701,7 @@
 specialize vp9_quantize_b $ssse3_x86_64
 
 prototype void vp9_quantize_b_32x32 "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
-specialize vp9_quantize_b_32x32
+specialize vp9_quantize_b_32x32 # $ssse3_x86_64 FIXME(jingning): need a unit test on this before enabling
 
 #
 # Structured Similarity (SSIM)
diff --git a/vp9/common/vp9_systemdependent.h b/vp9/common/vp9_systemdependent.h
index 1b9147e..cc909e2 100644
--- a/vp9/common/vp9_systemdependent.h
+++ b/vp9/common/vp9_systemdependent.h
@@ -34,6 +34,6 @@
 #endif
 
 struct VP9Common;
-void vp9_machine_specific_config(struct VP9Common *);
+void vp9_machine_specific_config(struct VP9Common *cm);
 
 #endif  // VP9_COMMON_VP9_SYSTEMDEPENDENT_H_
diff --git a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
index 4af4f94..fa4dd9b 100644
--- a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
+++ b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
@@ -17,23 +17,11 @@
                                             const unsigned char *_blimit,
                                             const unsigned char *_limit,
                                             const unsigned char *_thresh) {
-  DECLARE_ALIGNED(16, unsigned char, flat2_op[7][8]);
-  DECLARE_ALIGNED(16, unsigned char, flat2_oq[7][8]);
-
-  DECLARE_ALIGNED(16, unsigned char, flat_op[3][8]);
-  DECLARE_ALIGNED(16, unsigned char, flat_oq[3][8]);
-
-  DECLARE_ALIGNED(16, unsigned char, ap[8][8]);
-  DECLARE_ALIGNED(16, unsigned char, aq[8][8]);
-
-
   __m128i mask, hev, flat, flat2;
   const __m128i zero = _mm_set1_epi16(0);
   const __m128i one = _mm_set1_epi8(1);
-  __m128i p7, p6, p5;
-  __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
-  __m128i q5, q6, q7;
-  int i = 0;
+  __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
+  __m128i abs_p1p0;
   const unsigned int extended_thresh = _thresh[0] * 0x01010101u;
   const unsigned int extended_limit  = _limit[0]  * 0x01010101u;
   const unsigned int extended_blimit = _blimit[0] * 0x01010101u;
@@ -44,41 +32,35 @@
   const __m128i blimit =
       _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0);
 
-  p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
-  p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
-  p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
-  p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
-  p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
-  q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
-  q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
-  q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
-  q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
-  q4 = _mm_loadl_epi64((__m128i *)(s + 4 * p));
-
-  _mm_storel_epi64((__m128i *)ap[4], p4);
-  _mm_storel_epi64((__m128i *)ap[3], p3);
-  _mm_storel_epi64((__m128i *)ap[2], p2);
-  _mm_storel_epi64((__m128i *)ap[1], p1);
-  _mm_storel_epi64((__m128i *)ap[0], p0);
-  _mm_storel_epi64((__m128i *)aq[4], q4);
-  _mm_storel_epi64((__m128i *)aq[3], q3);
-  _mm_storel_epi64((__m128i *)aq[2], q2);
-  _mm_storel_epi64((__m128i *)aq[1], q1);
-  _mm_storel_epi64((__m128i *)aq[0], q0);
-
+  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
+  q4p4 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q4p4),
+                                       (__m64 *)(s + 4 * p)));
+  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
+  q3p3 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q3p3),
+                                       (__m64 *)(s + 3 * p)));
+  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+  q2p2 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q2p2),
+                                       (__m64 *)(s + 2 * p)));
+  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+  q1p1 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q1p1),
+                                       (__m64 *)(s + 1 * p)));
+  p1q1 = _mm_shuffle_epi32(q1p1, 78);
+  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+  q0p0 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q0p0),
+                                       (__m64 *)(s - 0 * p)));
+  p0q0 = _mm_shuffle_epi32(q0p0, 78);
 
   {
-    const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
-                                          _mm_subs_epu8(p0, p1));
-    const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
-                                          _mm_subs_epu8(q0, q1));
-    const __m128i fe = _mm_set1_epi8(0xfe);
-    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
-    __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
-                                    _mm_subs_epu8(q0, p0));
-    __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
-                                    _mm_subs_epu8(q1, p1));
-    __m128i work;
+    __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
+    abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0),
+                            _mm_subs_epu8(q0p0, q1p1));
+    abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
+    fe = _mm_set1_epi8(0xfe);
+    ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+    abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0),
+                            _mm_subs_epu8(p0q0, q0p0));
+    abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1),
+                            _mm_subs_epu8(p1q1, q1p1));
     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
     hev = _mm_subs_epu8(flat, thresh);
     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
@@ -88,19 +70,16 @@
     mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
-    mask = _mm_max_epu8(flat, mask);
+    mask = _mm_max_epu8(abs_p1p0, mask);
     // mask |= (abs(p1 - p0) > limit) * -1;
     // mask |= (abs(q1 - q0) > limit) * -1;
-    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
-                                     _mm_subs_epu8(p1, p2)),
-                         _mm_or_si128(_mm_subs_epu8(p3, p2),
-                                      _mm_subs_epu8(p2, p3)));
+
+    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q1p1),
+                                     _mm_subs_epu8(q1p1, q2p2)),
+                        _mm_or_si128(_mm_subs_epu8(q3p3, q2p2),
+                                     _mm_subs_epu8(q2p2, q3p3)));
     mask = _mm_max_epu8(work, mask);
-    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
-                                     _mm_subs_epu8(q1, q2)),
-                         _mm_or_si128(_mm_subs_epu8(q3, q2),
-                                      _mm_subs_epu8(q2, q3)));
-    mask = _mm_max_epu8(work, mask);
+    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
     mask = _mm_subs_epu8(mask, limit);
     mask = _mm_cmpeq_epi8(mask, zero);
   }
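
The key idea of this SSE2 rewrite is that the eight p-side pixels and eight q-side pixels of a row share one 128-bit register (low half = p, high half = q), so each saturating-difference pair serves both sides of the edge at once and the old scratch arrays become unnecessary. A minimal standalone sketch of the load/compute pattern (not from the patch; pixel values are arbitrary):

```c
#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  /* Two 8-pixel rows on either side of the filtered edge. */
  const uint8_t p1[8] = {10, 10, 10, 10, 10, 10, 10, 10};
  const uint8_t p0[8] = {12, 12, 12, 12, 12, 12, 12, 12};
  const uint8_t q0[8] = {48, 48, 48, 48, 48, 48, 48, 48};
  const uint8_t q1[8] = {50, 50, 50, 50, 50, 50, 50, 50};
  uint8_t out[16];

  /* Pack the p row into the low 64 bits and the q row into the high
   * 64 bits, exactly like q1p1/q0p0 above. */
  __m128i q1p1 = _mm_loadl_epi64((const __m128i *)p1);
  q1p1 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q1p1), (const __m64 *)q1));
  __m128i q0p0 = _mm_loadl_epi64((const __m128i *)p0);
  q0p0 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q0p0), (const __m64 *)q0));

  /* One saturating-difference pair now yields |p1 - p0| in the low
   * half and |q1 - q0| in the high half simultaneously ... */
  const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0),
                                        _mm_subs_epu8(q0p0, q1p1));
  /* ... and the q-side result is just the high half shifted down. */
  const __m128i abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);

  _mm_storeu_si128((__m128i *)out, abs_p1p0);
  printf("|p1-p0| = %u, |q1-q0| = %u\n", out[0], out[8]);  /* 2 and 2 */
  _mm_storeu_si128((__m128i *)out, abs_q1q0);
  printf("folded  = %u\n", out[0]);                        /* 2 */
  return 0;
}
```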
@@ -110,21 +89,19 @@
     const __m128i t4 = _mm_set1_epi8(4);
     const __m128i t3 = _mm_set1_epi8(3);
     const __m128i t80 = _mm_set1_epi8(0x80);
-    const __m128i te0 = _mm_set1_epi8(0xe0);
-    const __m128i t1f = _mm_set1_epi8(0x1f);
-    const __m128i t1 = _mm_set1_epi8(0x1);
-    const __m128i t7f = _mm_set1_epi8(0x7f);
-
-    __m128i ps1 = _mm_xor_si128(p1, t80);
-    __m128i ps0 = _mm_xor_si128(p0, t80);
-    __m128i qs0 = _mm_xor_si128(q0, t80);
-    __m128i qs1 = _mm_xor_si128(q1, t80);
+    const __m128i t1 = _mm_set1_epi16(0x1);
+    __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
+    __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
+    __m128i qs0 = _mm_xor_si128(p0q0, t80);
+    __m128i qs1 = _mm_xor_si128(p1q1, t80);
     __m128i filt;
     __m128i work_a;
     __m128i filter1, filter2;
+    __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
+    __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
 
-    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
-    work_a = _mm_subs_epi8(qs0, ps0);
+    filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
+    work_a = _mm_subs_epi8(qs0, qs0ps0);
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
@@ -134,82 +111,60 @@
     filter1 = _mm_adds_epi8(filt, t4);
     filter2 = _mm_adds_epi8(filt, t3);
 
-    /* Filter1 >> 3 */
-    work_a = _mm_cmpgt_epi8(zero, filter1);
-    filter1 = _mm_srli_epi16(filter1, 3);
-    work_a = _mm_and_si128(work_a, te0);
-    filter1 = _mm_and_si128(filter1, t1f);
-    filter1 = _mm_or_si128(filter1, work_a);
-    qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+    filter1 = _mm_unpacklo_epi8(zero, filter1);
+    filter1 = _mm_srai_epi16(filter1, 0xB);
+    filter2 = _mm_unpacklo_epi8(zero, filter2);
+    filter2 = _mm_srai_epi16(filter2, 0xB);
 
-    /* Filter2 >> 3 */
-    work_a = _mm_cmpgt_epi8(zero, filter2);
-    filter2 = _mm_srli_epi16(filter2, 3);
-    work_a = _mm_and_si128(work_a, te0);
-    filter2 = _mm_and_si128(filter2, t1f);
-    filter2 = _mm_or_si128(filter2, work_a);
-    ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+    /* Filter1 >> 3 */
+    filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
+    qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);
 
     /* filt >> 1 */
-    filt = _mm_adds_epi8(filter1, t1);
-    work_a = _mm_cmpgt_epi8(zero, filt);
-    filt = _mm_srli_epi16(filt, 1);
-    work_a = _mm_and_si128(work_a, t80);
-    filt = _mm_and_si128(filt, t7f);
-    filt = _mm_or_si128(filt, work_a);
-    filt = _mm_andnot_si128(hev, filt);
-    ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
-    qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+    filt = _mm_adds_epi16(filter1, t1);
+    filt = _mm_srai_epi16(filt, 1);
+    filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
+                            filt);
+    filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
+    qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
     // loopfilter done
 
     {
       __m128i work;
-      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
-                                       _mm_subs_epu8(p0, p2)),
-                           _mm_or_si128(_mm_subs_epu8(q2, q0),
-                                        _mm_subs_epu8(q0, q2)));
-      flat = _mm_max_epu8(work, flat);
-      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
-                                       _mm_subs_epu8(p0, p3)),
-                           _mm_or_si128(_mm_subs_epu8(q3, q0),
-                                        _mm_subs_epu8(q0, q3)));
-      flat = _mm_max_epu8(work, flat);
-      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0),
-                                       _mm_subs_epu8(p0, p4)),
-                           _mm_or_si128(_mm_subs_epu8(q4, q0),
-                                        _mm_subs_epu8(q0, q4)));
+      flat = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q0p0),
+                                       _mm_subs_epu8(q0p0, q2p2)),
+                          _mm_or_si128(_mm_subs_epu8(q3p3, q0p0),
+                                       _mm_subs_epu8(q0p0, q3p3)));
+      flat = _mm_max_epu8(abs_p1p0, flat);
+      flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
       flat = _mm_subs_epu8(flat, one);
       flat = _mm_cmpeq_epi8(flat, zero);
       flat = _mm_and_si128(flat, mask);
 
-      p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
-      q5 = _mm_loadl_epi64((__m128i *)(s + 5 * p));
-      flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p5, p0),
-                                       _mm_subs_epu8(p0, p5)),
-                           _mm_or_si128(_mm_subs_epu8(q5, q0),
-                                        _mm_subs_epu8(q0, q5)));
-      _mm_storel_epi64((__m128i *)ap[5], p5);
-      _mm_storel_epi64((__m128i *)aq[5], q5);
-      flat2 = _mm_max_epu8(work, flat2);
-      p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
-      q6 = _mm_loadl_epi64((__m128i *)(s + 6 * p));
-      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p6, p0),
-                                       _mm_subs_epu8(p0, p6)),
-                           _mm_or_si128(_mm_subs_epu8(q6, q0),
-                                        _mm_subs_epu8(q0, q6)));
-      _mm_storel_epi64((__m128i *)ap[6], p6);
-      _mm_storel_epi64((__m128i *)aq[6], q6);
-      flat2 = _mm_max_epu8(work, flat2);
+      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
+      q5p5 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q5p5),
+                                           (__m64 *)(s + 5 * p)));
 
-      p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
-      q7 = _mm_loadl_epi64((__m128i *)(s + 7 * p));
-      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p7, p0),
-                                       _mm_subs_epu8(p0, p7)),
-                           _mm_or_si128(_mm_subs_epu8(q7, q0),
-                                        _mm_subs_epu8(q0, q7)));
-      _mm_storel_epi64((__m128i *)ap[7], p7);
-      _mm_storel_epi64((__m128i *)aq[7], q7);
+      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
+      q6p6 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q6p6),
+                                           (__m64 *)(s + 6 * p)));
+
+      flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q4p4, q0p0),
+                                        _mm_subs_epu8(q0p0, q4p4)),
+                           _mm_or_si128(_mm_subs_epu8(q5p5, q0p0),
+                                        _mm_subs_epu8(q0p0, q5p5)));
+
+      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
+      q7p7 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q7p7),
+                                           (__m64 *)(s + 7 * p)));
+
+      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q6p6, q0p0),
+                                       _mm_subs_epu8(q0p0, q6p6)),
+                          _mm_or_si128(_mm_subs_epu8(q7p7, q0p0),
+                                       _mm_subs_epu8(q0p0, q7p7)));
+
       flat2 = _mm_max_epu8(work, flat2);
+      flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
       flat2 = _mm_subs_epu8(flat2, one);
       flat2 = _mm_cmpeq_epi8(flat2, zero);
       flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
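
The rewritten filter also replaces the old mask-and-or emulation of a signed byte shift with an unpack-then-arithmetic-shift sequence: widening each byte into the high half of a 16-bit lane and shifting right by 11 gives the same result as an arithmetic >> 3 on the original signed byte. A quick exhaustive scalar check of that identity (a standalone sketch, not part of the patch):

```c
#include <stdint.h>
#include <stdio.h>

int main(void) {
  int v;
  for (v = -128; v <= 127; ++v) {
    /* _mm_unpacklo_epi8(zero, x) puts each byte in the high half of a
     * 16-bit lane, i.e. computes v * 256 with v's sign bit landing in
     * bit 15 of the lane. */
    const int16_t widened = (int16_t)(v * 256);
    /* An arithmetic shift right by 11 = 8 + 3 is then v >> 3. */
    const int16_t shifted = (int16_t)(widened >> 11);
    /* v >> 3 on a negative int is arithmetic on all mainstream ABIs. */
    if (shifted != (v >> 3)) {
      printf("mismatch at %d\n", v);
      return 1;
    }
  }
  printf("unpack + srai(11) == arithmetic >> 3 for all int8 values\n");
  return 0;
}
```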
@@ -220,260 +175,198 @@
     {
       const __m128i eight = _mm_set1_epi16(8);
       const __m128i four = _mm_set1_epi16(4);
-      {
-        __m128i workp_shft;
-        __m128i a, b, c;
+      __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
+      __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
+      __m128i pixelFilter_p, pixelFilter_q;
+      __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
+      __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
 
-        p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[7])), zero);
-        p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[6])), zero);
-        p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[5])), zero);
-        p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[4])), zero);
-        p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[3])), zero);
-        p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[2])), zero);
-        p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[1])), zero);
-        p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[0])), zero);
-        q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[0])), zero);
-        q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[1])), zero);
-        q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[2])), zero);
-        q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[3])), zero);
-        q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[4])), zero);
-        q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[5])), zero);
-        q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[6])), zero);
-        q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[7])), zero);
+      p7_16 = _mm_unpacklo_epi8(q7p7, zero);
+      p6_16 = _mm_unpacklo_epi8(q6p6, zero);
+      p5_16 = _mm_unpacklo_epi8(q5p5, zero);
+      p4_16 = _mm_unpacklo_epi8(q4p4, zero);
+      p3_16 = _mm_unpacklo_epi8(q3p3, zero);
+      p2_16 = _mm_unpacklo_epi8(q2p2, zero);
+      p1_16 = _mm_unpacklo_epi8(q1p1, zero);
+      p0_16 = _mm_unpacklo_epi8(q0p0, zero);
+      q0_16 = _mm_unpackhi_epi8(q0p0, zero);
+      q1_16 = _mm_unpackhi_epi8(q1p1, zero);
+      q2_16 = _mm_unpackhi_epi8(q2p2, zero);
+      q3_16 = _mm_unpackhi_epi8(q3p3, zero);
+      q4_16 = _mm_unpackhi_epi8(q4p4, zero);
+      q5_16 = _mm_unpackhi_epi8(q5p5, zero);
+      q6_16 = _mm_unpackhi_epi8(q6p6, zero);
+      q7_16 = _mm_unpackhi_epi8(q7p7, zero);
 
-        c = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7);  // p7 * 7
-        c = _mm_add_epi16(_mm_slli_epi16(p6, 1), _mm_add_epi16(p4, c));
+      pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
+                                    _mm_add_epi16(p4_16, p3_16));
+      pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
+                                    _mm_add_epi16(q4_16, q3_16));
 
-        b = _mm_add_epi16(_mm_add_epi16(p3, four), _mm_add_epi16(p3, p2));
-        a = _mm_add_epi16(p3, _mm_add_epi16(p2, p1));
-        a = _mm_add_epi16(_mm_add_epi16(p0, q0), a);
+      pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
+      pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
 
-        _mm_storel_epi64((__m128i *)&flat_op[2][i*8],
-                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
-                                          , b));
+      pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
+      pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+      pixelFilter_p = _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p,
+                                                         pixelFilter_q));
+      pixetFilter_p2p1p0 = _mm_add_epi16(four,
+                                         _mm_add_epi16(pixetFilter_p2p1p0,
+                                                       pixetFilter_q2q1q0));
+      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                                           _mm_add_epi16(p7_16, p0_16)), 4);
+      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                                           _mm_add_epi16(q7_16, q0_16)), 4);
+      flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
+      res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
+                                           _mm_add_epi16(p3_16, p0_16)), 3);
+      res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
+                                           _mm_add_epi16(q3_16, q0_16)), 3);
 
-        c = _mm_add_epi16(_mm_add_epi16(p5, eight), c);
-        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_op[6][i*8],
-                         _mm_packus_epi16(workp_shft, workp_shft));
+      flat_q0p0 = _mm_packus_epi16(res_p, res_q);
 
-        a = _mm_add_epi16(q1, a);
-        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p2)), p1);
-        _mm_storel_epi64((__m128i *)&flat_op[1][i*8],
-                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
-                                          , b));
+      sum_p7 = _mm_add_epi16(p7_16, p7_16);
+      sum_q7 = _mm_add_epi16(q7_16, q7_16);
+      sum_p3 = _mm_add_epi16(p3_16, p3_16);
+      sum_q3 = _mm_add_epi16(q3_16, q3_16);
 
-        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p6)), p5);
-        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_op[5][i*8],
-                         _mm_packus_epi16(workp_shft, workp_shft));
+      pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
+      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
+      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                             _mm_add_epi16(sum_p7, p1_16)), 4);
+      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+                             _mm_add_epi16(sum_q7, q1_16)), 4);
+      flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
 
-        a = _mm_add_epi16(q2, a);
-        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p1)), p0);
-        _mm_storel_epi64((__m128i *)&flat_op[0][i*8],
-                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
-                                          , b));
+      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
+      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
+      res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
+                             _mm_add_epi16(sum_p3, p1_16)), 3);
+      res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0,
+                             _mm_add_epi16(sum_q3, q1_16)), 3);
+      flat_q1p1 = _mm_packus_epi16(res_p, res_q);
 
-        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p5)), p4);
-        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_op[4][i*8],
-                         _mm_packus_epi16(workp_shft, workp_shft));
+      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+      sum_p3 = _mm_add_epi16(sum_p3, p3_16);
+      sum_q3 = _mm_add_epi16(sum_q3, q3_16);
 
-        a = _mm_add_epi16(q3, a);
-        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p0)), q0);
-        _mm_storel_epi64((__m128i *)&flat_oq[0][i*8],
-                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
-                                          , b));
+      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
+      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
+      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                             _mm_add_epi16(sum_p7, p2_16)), 4);
+      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+                             _mm_add_epi16(sum_q7, q2_16)), 4);
+      flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
 
-        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p4)), p3);
-        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_op[3][i*8],
-                         _mm_packus_epi16(workp_shft, workp_shft));
+      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
+      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
 
-        b = _mm_add_epi16(q3, b);
-        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p2, q0)), q1);
-        _mm_storel_epi64((__m128i *)&flat_oq[1][i*8],
-                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
-                                          , b));
+      res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
+                                           _mm_add_epi16(sum_p3, p2_16)), 3);
+      res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0,
+                                           _mm_add_epi16(sum_q3, q2_16)), 3);
+      flat_q2p2 = _mm_packus_epi16(res_p, res_q);
 
-        c = _mm_add_epi16(q4, c);
-        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p3)), p2);
-        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_op[2][i*8],
-                         _mm_packus_epi16(workp_shft, workp_shft));
+      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
+      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
+      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                             _mm_add_epi16(sum_p7, p3_16)), 4);
+      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+                             _mm_add_epi16(sum_q7, q3_16)), 4);
+      flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
 
-        b = _mm_add_epi16(q3, b);
-        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p1, q1)), q2);
-        _mm_storel_epi64((__m128i *)&flat_oq[2][i*8],
-                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
-                                          , b));
-        a = _mm_add_epi16(q5, a);
-        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p2)), p1);
-        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_op[1][i*8],
-                         _mm_packus_epi16(workp_shft, workp_shft));
+      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
+      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
+      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                             _mm_add_epi16(sum_p7, p4_16)), 4);
+      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+                             _mm_add_epi16(sum_q7, q4_16)), 4);
+      flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
 
-        a = _mm_add_epi16(q6, a);
-        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p1)), p0);
-        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_op[0][i*8],
-                         _mm_packus_epi16(workp_shft, workp_shft));
+      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
+      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
+      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                             _mm_add_epi16(sum_p7, p5_16)), 4);
+      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+                             _mm_add_epi16(sum_q7, q5_16)), 4);
+      flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
 
-        a = _mm_add_epi16(q7, a);
-        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p0)), q0);
-        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_oq[0][i*8],
-                         _mm_packus_epi16(workp_shft, workp_shft));
-
-        a = _mm_add_epi16(q7, a);
-        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p6, q0)), q1);
-        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_oq[1][i*8],
-                         _mm_packus_epi16(workp_shft, workp_shft));
-
-        a = _mm_add_epi16(q7, a);
-        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p5, q1)), q2);
-        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_oq[2][i*8],
-                         _mm_packus_epi16(workp_shft, workp_shft));
-
-        a = _mm_add_epi16(q7, a);
-        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p4, q2)), q3);
-        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_oq[3][i*8],
-                         _mm_packus_epi16(workp_shft, workp_shft));
-
-        a = _mm_add_epi16(q7, a);
-        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p3, q3)), q4);
-        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_oq[4][i*8],
-                         _mm_packus_epi16(workp_shft, workp_shft));
-
-        a = _mm_add_epi16(q7, a);
-        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p2, q4)), q5);
-        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_oq[5][i*8],
-                         _mm_packus_epi16(workp_shft, workp_shft));
-
-        a = _mm_add_epi16(q7, a);
-        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p1, q5)), q6);
-        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_oq[6][i*8],
-                         _mm_packus_epi16(workp_shft, workp_shft));
-      }
+      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
+      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
+      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                             _mm_add_epi16(sum_p7, p6_16)), 4);
+      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+                             _mm_add_epi16(sum_q7, q6_16)), 4);
+      flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
     }
     // wide flat
     // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-    work_a = _mm_loadl_epi64((__m128i *)ap[2]);
-    p2 = _mm_loadl_epi64((__m128i *)flat_op[2]);
-    work_a = _mm_andnot_si128(flat, work_a);
-    p2 = _mm_and_si128(flat, p2);
-    p2 = _mm_or_si128(work_a, p2);
-    _mm_storel_epi64((__m128i *)flat_op[2], p2);
+    flat = _mm_shuffle_epi32(flat, 68);    // 68 == 0x44: duplicate the
+    flat2 = _mm_shuffle_epi32(flat2, 68);  // low 64 bits into the high half
 
-    p1 = _mm_loadl_epi64((__m128i *)flat_op[1]);
-    work_a = _mm_andnot_si128(flat, ps1);
-    p1 = _mm_and_si128(flat, p1);
-    p1 = _mm_or_si128(work_a, p1);
-    _mm_storel_epi64((__m128i *)flat_op[1], p1);
+    q2p2 = _mm_andnot_si128(flat, q2p2);
+    flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
+    q2p2 = _mm_or_si128(q2p2, flat_q2p2);
 
-    p0 = _mm_loadl_epi64((__m128i *)flat_op[0]);
-    work_a = _mm_andnot_si128(flat, ps0);
-    p0 = _mm_and_si128(flat, p0);
-    p0 = _mm_or_si128(work_a, p0);
-    _mm_storel_epi64((__m128i *)flat_op[0], p0);
+    qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
+    flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
+    q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
 
-    q0 = _mm_loadl_epi64((__m128i *)flat_oq[0]);
-    work_a = _mm_andnot_si128(flat, qs0);
-    q0 = _mm_and_si128(flat, q0);
-    q0 = _mm_or_si128(work_a, q0);
-    _mm_storel_epi64((__m128i *)flat_oq[0], q0);
+    qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
+    flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
+    q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
 
-    q1 = _mm_loadl_epi64((__m128i *)flat_oq[1]);
-    work_a = _mm_andnot_si128(flat, qs1);
-    q1 = _mm_and_si128(flat, q1);
-    q1 = _mm_or_si128(work_a, q1);
-    _mm_storel_epi64((__m128i *)flat_oq[1], q1);
+    q6p6 = _mm_andnot_si128(flat2, q6p6);
+    flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
+    q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
+    _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);  // low half: p6 row
+    _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));  // q6 row
 
-    work_a = _mm_loadl_epi64((__m128i *)aq[2]);
-    q2 = _mm_loadl_epi64((__m128i *)flat_oq[2]);
-    work_a = _mm_andnot_si128(flat, work_a);
-    q2 = _mm_and_si128(flat, q2);
-    q2 = _mm_or_si128(work_a, q2);
-    _mm_storel_epi64((__m128i *)flat_oq[2], q2);
+    q5p5 = _mm_andnot_si128(flat2, q5p5);
+    flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
+    q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
+    _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
+    _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));
 
-    // write out op6 - op3
-    {
-      unsigned char *dst = (s - 7 * p);
-      for (i = 6; i > 2; i--) {
-        __m128i flat2_output;
-        work_a = _mm_loadl_epi64((__m128i *)ap[i]);
-        flat2_output = _mm_loadl_epi64((__m128i *)flat2_op[i]);
-        work_a = _mm_andnot_si128(flat2, work_a);
-        flat2_output = _mm_and_si128(flat2, flat2_output);
-        work_a = _mm_or_si128(work_a, flat2_output);
-        _mm_storel_epi64((__m128i *)dst, work_a);
-        dst += p;
-      }
-    }
+    q4p4 = _mm_andnot_si128(flat2, q4p4);
+    flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
+    q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
+    _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
+    _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));
 
-    work_a = _mm_loadl_epi64((__m128i *)flat_op[2]);
-    p2 = _mm_loadl_epi64((__m128i *)flat2_op[2]);
-    work_a = _mm_andnot_si128(flat2, work_a);
-    p2 = _mm_and_si128(flat2, p2);
-    p2 = _mm_or_si128(work_a, p2);
-    _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
+    q3p3 = _mm_andnot_si128(flat2, q3p3);
+    flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
+    q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
+    _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
+    _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));
 
-    work_a = _mm_loadl_epi64((__m128i *)flat_op[1]);
-    p1 = _mm_loadl_epi64((__m128i *)flat2_op[1]);
-    work_a = _mm_andnot_si128(flat2, work_a);
-    p1 = _mm_and_si128(flat2, p1);
-    p1 = _mm_or_si128(work_a, p1);
-    _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
+    q2p2 = _mm_andnot_si128(flat2, q2p2);
+    flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
+    q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
+    _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
+    _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));
 
-    work_a = _mm_loadl_epi64((__m128i *)flat_op[0]);
-    p0 = _mm_loadl_epi64((__m128i *)flat2_op[0]);
-    work_a = _mm_andnot_si128(flat2, work_a);
-    p0 = _mm_and_si128(flat2, p0);
-    p0 = _mm_or_si128(work_a, p0);
-    _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
+    q1p1 = _mm_andnot_si128(flat2, q1p1);
+    flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
+    q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
+    _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
+    _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));
 
-    work_a = _mm_loadl_epi64((__m128i *)flat_oq[0]);
-    q0 = _mm_loadl_epi64((__m128i *)flat2_oq[0]);
-    work_a = _mm_andnot_si128(flat2, work_a);
-    q0 = _mm_and_si128(flat2, q0);
-    q0 = _mm_or_si128(work_a, q0);
-    _mm_storel_epi64((__m128i *)(s - 0 * p), q0);
-
-    work_a = _mm_loadl_epi64((__m128i *)flat_oq[1]);
-    q1 = _mm_loadl_epi64((__m128i *)flat2_oq[1]);
-    work_a = _mm_andnot_si128(flat2, work_a);
-    q1 = _mm_and_si128(flat2, q1);
-    q1 = _mm_or_si128(work_a, q1);
-    _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
-
-    work_a = _mm_loadl_epi64((__m128i *)flat_oq[2]);
-    q2 = _mm_loadl_epi64((__m128i *)flat2_oq[2]);
-    work_a = _mm_andnot_si128(flat2, work_a);
-    q2 = _mm_and_si128(flat2, q2);
-    q2 = _mm_or_si128(work_a, q2);
-    _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
-
-    // write out oq3 - oq7
-    {
-      unsigned char *dst = (s + 3 * p);
-      for (i = 3; i < 7; i++) {
-        __m128i flat2_output;
-        work_a = _mm_loadl_epi64((__m128i *)aq[i]);
-        flat2_output = _mm_loadl_epi64((__m128i *)flat2_oq[i]);
-        work_a = _mm_andnot_si128(flat2, work_a);
-        flat2_output = _mm_and_si128(flat2, flat2_output);
-        work_a = _mm_or_si128(work_a, flat2_output);
-        _mm_storel_epi64((__m128i *)dst, work_a);
-        dst += p;
-      }
-    }
+    q0p0 = _mm_andnot_si128(flat2, q0p0);
+    flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
+    q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
+    _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
+    _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
   }
 }
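
Note: the flat/flat2 blends above replace the old scratch-array round trips
with a branchless per-byte select. A minimal sketch of the idiom; the helper
name mask_select is illustrative and not part of this patch:

    #include <emmintrin.h>  /* SSE2 */

    /* Per-byte select: where a mask byte is 0xff take 'filtered',
     * where it is 0x00 keep 'original'. This is the andnot/and/or
     * triplet used for both the flat and the flat2 blends. */
    static __m128i mask_select(__m128i mask, __m128i filtered,
                               __m128i original) {
      return _mm_or_si128(_mm_andnot_si128(mask, original),
                          _mm_and_si128(mask, filtered));
    }

Because each blended register carries a q row in its high half and the
matching p row in its low half, one _mm_storel_epi64/_mm_storeh_pi pair
writes both rows, which is what removes the old per-row stores.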
 
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index d1c59c3..8dfb22c 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -43,7 +43,7 @@
 }
 
 static TX_SIZE read_selected_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd,
-                                     BLOCK_SIZE_TYPE bsize, vp9_reader *r) {
+                                     BLOCK_SIZE bsize, vp9_reader *r) {
   const uint8_t context = vp9_get_pred_context_tx_size(xd);
   const vp9_prob *tx_probs = get_tx_probs(bsize, context, &cm->fc.tx_probs);
   TX_SIZE tx_size = vp9_read(r, tx_probs[0]);
@@ -58,7 +58,7 @@
 }
 
 static TX_SIZE read_tx_size(VP9D_COMP *pbi, TX_MODE tx_mode,
-                            BLOCK_SIZE_TYPE bsize, int allow_select,
+                            BLOCK_SIZE bsize, int allow_select,
                             vp9_reader *r) {
   VP9_COMMON *const cm = &pbi->common;
   MACROBLOCKD *const xd = &pbi->mb;
@@ -75,7 +75,7 @@
     return TX_4X4;
 }
 
-static void set_segment_id(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize,
+static void set_segment_id(VP9_COMMON *cm, BLOCK_SIZE bsize,
                            int mi_row, int mi_col, int segment_id) {
   const int mi_offset = mi_row * cm->mi_cols + mi_col;
   const int bw = 1 << mi_width_log2(bsize);
@@ -95,7 +95,7 @@
                                  vp9_reader *r) {
   MACROBLOCKD *const xd = &pbi->mb;
   struct segmentation *const seg = &pbi->common.seg;
-  const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
+  const BLOCK_SIZE bsize = xd->mode_info_context->mbmi.sb_type;
   int segment_id;
 
   if (!seg->enabled)
@@ -114,7 +114,7 @@
   VP9_COMMON *const cm = &pbi->common;
   MACROBLOCKD *const xd = &pbi->mb;
   struct segmentation *const seg = &cm->seg;
-  const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
+  const BLOCK_SIZE bsize = xd->mode_info_context->mbmi.sb_type;
   int pred_segment_id, segment_id;
 
   if (!seg->enabled)
@@ -155,12 +155,12 @@
   VP9_COMMON *const cm = &pbi->common;
   MACROBLOCKD *const xd = &pbi->mb;
   MB_MODE_INFO *const mbmi = &m->mbmi;
-  const BLOCK_SIZE_TYPE bsize = mbmi->sb_type;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
   const int mis = cm->mode_info_stride;
 
   mbmi->segment_id = read_intra_segment_id(pbi, mi_row, mi_col, r);
   mbmi->skip_coeff = read_skip_coeff(pbi, mbmi->segment_id, r);
-  mbmi->txfm_size = read_tx_size(pbi, cm->tx_mode, bsize, 1, r);
+  mbmi->tx_size = read_tx_size(pbi, cm->tx_mode, bsize, 1, r);
   mbmi->ref_frame[0] = INTRA_FRAME;
   mbmi->ref_frame[1] = NONE;
 
@@ -381,7 +381,7 @@
                                   vp9_reader *r) {
   VP9_COMMON *const cm = &pbi->common;
   MB_MODE_INFO *const mbmi = &mi->mbmi;
-  const BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type;
+  const BLOCK_SIZE bsize = mi->mbmi.sb_type;
 
   mbmi->ref_frame[0] = INTRA_FRAME;
   mbmi->ref_frame[1] = NONE;
@@ -439,31 +439,31 @@
   MB_MODE_INFO *const mbmi = &mi->mbmi;
   int_mv *const mv0 = &mbmi->mv[0];
   int_mv *const mv1 = &mbmi->mv[1];
-  const BLOCK_SIZE_TYPE bsize = mbmi->sb_type;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
   const int allow_hp = xd->allow_high_precision_mv;
 
   int_mv nearest, nearby, best_mv;
   int_mv nearest_second, nearby_second, best_mv_second;
   uint8_t inter_mode_ctx;
-  MV_REFERENCE_FRAME ref0, ref1;
+  MV_REFERENCE_FRAME ref0;
   int is_compound;
 
+  mbmi->uv_mode = DC_PRED;
   read_ref_frames(pbi, r, mbmi->segment_id, mbmi->ref_frame);
   ref0 = mbmi->ref_frame[0];
-  ref1 = mbmi->ref_frame[1];
-  is_compound = ref1 > INTRA_FRAME;
+  is_compound = has_second_ref(mbmi);
 
   vp9_find_mv_refs(cm, xd, mi, xd->prev_mode_info_context,
                    ref0, mbmi->ref_mvs[ref0], mi_row, mi_col);
 
   inter_mode_ctx = mbmi->mode_context[ref0];
 
-  if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
+  if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
     mbmi->mode = ZEROMV;
-  else if (bsize >= BLOCK_8X8)
-    mbmi->mode = read_inter_mode(cm, r, inter_mode_ctx);
-
-  mbmi->uv_mode = DC_PRED;
+  } else {
+    if (bsize >= BLOCK_8X8)
+      mbmi->mode = read_inter_mode(cm, r, inter_mode_ctx);
+  }
 
   // nearest, nearby
   if (bsize < BLOCK_8X8 || mbmi->mode != ZEROMV) {
@@ -471,11 +471,8 @@
     best_mv.as_int = mbmi->ref_mvs[ref0][0].as_int;
   }
 
-  mbmi->interp_filter = cm->mcomp_filter_type == SWITCHABLE
-                            ? read_switchable_filter_type(pbi, r)
-                            : cm->mcomp_filter_type;
-
   if (is_compound) {
+    const MV_REFERENCE_FRAME ref1 = mbmi->ref_frame[1];
     vp9_find_mv_refs(cm, xd, mi, xd->prev_mode_info_context,
                      ref1, mbmi->ref_mvs[ref1], mi_row, mi_col);
 
@@ -486,6 +483,10 @@
     }
   }
 
+  mbmi->interp_filter = cm->mcomp_filter_type == SWITCHABLE
+                            ? read_switchable_filter_type(pbi, r)
+                            : cm->mcomp_filter_type;
+
   if (bsize < BLOCK_8X8) {
     const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];  // 1 or 2
     const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];  // 1 or 2
@@ -590,8 +591,8 @@
   mbmi->segment_id = read_inter_segment_id(pbi, mi_row, mi_col, r);
   mbmi->skip_coeff = read_skip_coeff(pbi, mbmi->segment_id, r);
   inter_block = read_is_inter_block(pbi, mbmi->segment_id, r);
-  mbmi->txfm_size = read_tx_size(pbi, cm->tx_mode, mbmi->sb_type,
-                                 !mbmi->skip_coeff || !inter_block, r);
+  mbmi->tx_size = read_tx_size(pbi, cm->tx_mode, mbmi->sb_type,
+                               !mbmi->skip_coeff || !inter_block, r);
 
   if (inter_block)
     read_inter_block_mode_info(pbi, mi, mi_row, mi_col, r);
@@ -668,7 +669,7 @@
   VP9_COMMON *const cm = &pbi->common;
   MACROBLOCKD *const xd = &pbi->mb;
   MODE_INFO *mi = xd->mode_info_context;
-  const BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type;
+  const BLOCK_SIZE bsize = mi->mbmi.sb_type;
   const int bw = 1 << mi_width_log2(bsize);
   const int bh = 1 << mi_height_log2(bsize);
   const int y_mis = MIN(bh, cm->mi_rows - mi_row);
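
Note: has_second_ref() replaces the open-coded ref_frame[1] > INTRA_FRAME
test above; its definition is not part of this diff. A sketch of the
predicate the new call sites assume (stand-in, not the decoder's header):

    static INLINE int has_second_ref(const MB_MODE_INFO *mbmi) {
      return mbmi->ref_frame[1] > INTRA_FRAME;
    }

The hunk also hoists mbmi->uv_mode = DC_PRED to the top of the function and
defers the switchable-filter read until after the second reference's MV refs
have been found.
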
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index fd88b6e..41e406d 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -87,7 +87,7 @@
     xd->plane[i].dequant = cm->uv_dequant[xd->q_index];
 }
 
-static void decode_block(int plane, int block, BLOCK_SIZE_TYPE plane_bsize,
+static void decode_block(int plane, int block, BLOCK_SIZE plane_bsize,
                          TX_SIZE tx_size, void *arg) {
   MACROBLOCKD* const xd = arg;
   struct macroblockd_plane *const pd = &xd->plane[plane];
@@ -123,8 +123,7 @@
   }
 }
 
-static void decode_block_intra(int plane, int block,
-                               BLOCK_SIZE_TYPE plane_bsize,
+static void decode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
                                TX_SIZE tx_size, void *arg) {
   MACROBLOCKD* const xd = arg;
   struct macroblockd_plane *const pd = &xd->plane[plane];
@@ -133,32 +132,23 @@
                                                        block);
   uint8_t* const dst = raster_block_offset_uint8(plane_bsize, raster_block,
                                                  pd->dst.buf, pd->dst.stride);
-  int b_mode;
-  const int tx_ib = raster_block >> tx_size;
-  const int mode = (plane == 0) ? mi->mbmi.mode : mi->mbmi.uv_mode;
-
-  if (plane == 0 && mi->mbmi.sb_type < BLOCK_8X8) {
-    assert(plane_bsize == BLOCK_8X8);
-    b_mode = mi->bmi[raster_block].as_mode;
-  } else {
-    b_mode = mode;
-  }
+  const MB_PREDICTION_MODE mode = (plane == 0)
+        ? ((mi->mbmi.sb_type < BLOCK_8X8) ? mi->bmi[raster_block].as_mode
+                                          : mi->mbmi.mode)
+        : mi->mbmi.uv_mode;
 
   if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0)
     extend_for_intra(xd, plane_bsize, plane, block, tx_size);
 
-  vp9_predict_intra_block(xd, tx_ib, b_width_log2(plane_bsize), tx_size, b_mode,
-                          dst, pd->dst.stride,
-                          dst, pd->dst.stride);
+  vp9_predict_intra_block(xd, raster_block >> tx_size,
+                          b_width_log2(plane_bsize), tx_size, mode,
+                          dst, pd->dst.stride, dst, pd->dst.stride);
 
-  // Early exit if there are no coefficients
-  if (mi->mbmi.skip_coeff)
-    return;
-
-  decode_block(plane, block, plane_bsize, tx_size, arg);
+  if (!mi->mbmi.skip_coeff)
+    decode_block(plane, block, plane_bsize, tx_size, arg);
 }
 
-static int decode_tokens(VP9D_COMP *pbi, BLOCK_SIZE_TYPE bsize, vp9_reader *r) {
+static int decode_tokens(VP9D_COMP *pbi, BLOCK_SIZE bsize, vp9_reader *r) {
   MACROBLOCKD *const xd = &pbi->mb;
 
   if (xd->mode_info_context->mbmi.skip_coeff) {
@@ -173,20 +163,19 @@
   }
 }
 
-static void set_offsets(VP9D_COMP *pbi, BLOCK_SIZE_TYPE bsize,
+static void set_offsets(VP9D_COMP *pbi, BLOCK_SIZE bsize,
                         int mi_row, int mi_col) {
   VP9_COMMON *const cm = &pbi->common;
   MACROBLOCKD *const xd = &pbi->mb;
-  const int bh = 1 << mi_height_log2(bsize);
-  const int bw = 1 << mi_width_log2(bsize);
-  const int mi_idx = mi_row * cm->mode_info_stride + mi_col;
+  const int bh = num_8x8_blocks_high_lookup[bsize];
+  const int bw = num_8x8_blocks_wide_lookup[bsize];
+  const int offset = mi_row * cm->mode_info_stride + mi_col;
 
-  xd->mode_info_context = cm->mi + mi_idx;
+  xd->mode_info_context = cm->mi + offset;
   xd->mode_info_context->mbmi.sb_type = bsize;
   // Special case: if prev_mi is NULL, the previous mode info context
   // cannot be used.
-  xd->prev_mode_info_context = cm->prev_mi ? cm->prev_mi + mi_idx : NULL;
-
+  xd->prev_mode_info_context = cm->prev_mi ? cm->prev_mi + offset : NULL;
 
   set_skip_context(cm, xd, mi_row, mi_col);
   set_partition_seg_context(cm, xd, mi_row, mi_col);
@@ -215,7 +204,7 @@
 }
 
 static void decode_modes_b(VP9D_COMP *pbi, int mi_row, int mi_col,
-                           vp9_reader *r, BLOCK_SIZE_TYPE bsize) {
+                           vp9_reader *r, BLOCK_SIZE bsize) {
   VP9_COMMON *const cm = &pbi->common;
   MACROBLOCKD *const xd = &pbi->mb;
   const int less8x8 = bsize < BLOCK_8X8;
@@ -243,7 +232,7 @@
     int eobtotal;
 
     set_ref(pbi, 0, mi_row, mi_col);
-    if (mbmi->ref_frame[1] > INTRA_FRAME)
+    if (has_second_ref(mbmi))
       set_ref(pbi, 1, mi_row, mi_col);
 
     vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
@@ -265,14 +254,14 @@
 }
 
 static void decode_modes_sb(VP9D_COMP *pbi, int mi_row, int mi_col,
-                            vp9_reader* r, BLOCK_SIZE_TYPE bsize) {
-  VP9_COMMON *const pc = &pbi->common;
+                            vp9_reader* r, BLOCK_SIZE bsize) {
+  VP9_COMMON *const cm = &pbi->common;
   MACROBLOCKD *const xd = &pbi->mb;
-  const int bs = (1 << mi_width_log2(bsize)) / 2;
+  const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
   PARTITION_TYPE partition = PARTITION_NONE;
-  BLOCK_SIZE_TYPE subsize;
+  BLOCK_SIZE subsize;
 
-  if (mi_row >= pc->mi_rows || mi_col >= pc->mi_cols)
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
   if (bsize < BLOCK_8X8) {
@@ -280,25 +269,25 @@
       return;
   } else {
     int pl;
-    const int idx = check_bsize_coverage(bs, pc->mi_rows, pc->mi_cols,
+    const int idx = check_bsize_coverage(hbs, cm->mi_rows, cm->mi_cols,
                                          mi_row, mi_col);
-    set_partition_seg_context(pc, xd, mi_row, mi_col);
+    set_partition_seg_context(cm, xd, mi_row, mi_col);
     pl = partition_plane_context(xd, bsize);
 
     if (idx == 0)
       partition = treed_read(r, vp9_partition_tree,
-                             pc->fc.partition_prob[pc->frame_type][pl]);
+                             cm->fc.partition_prob[cm->frame_type][pl]);
     else if (idx > 0 &&
-        !vp9_read(r, pc->fc.partition_prob[pc->frame_type][pl][idx]))
+        !vp9_read(r, cm->fc.partition_prob[cm->frame_type][pl][idx]))
       partition = (idx == 1) ? PARTITION_HORZ : PARTITION_VERT;
     else
       partition = PARTITION_SPLIT;
 
-    pc->counts.partition[pl][partition]++;
+    cm->counts.partition[pl][partition]++;
   }
 
   subsize = get_subsize(bsize, partition);
-  *(get_sb_index(xd, subsize)) = 0;
+  *get_sb_index(xd, subsize) = 0;
 
   switch (partition) {
     case PARTITION_NONE:
@@ -306,22 +295,22 @@
       break;
     case PARTITION_HORZ:
       decode_modes_b(pbi, mi_row, mi_col, r, subsize);
-      *(get_sb_index(xd, subsize)) = 1;
-      if (mi_row + bs < pc->mi_rows)
-        decode_modes_b(pbi, mi_row + bs, mi_col, r, subsize);
+      *get_sb_index(xd, subsize) = 1;
+      if (mi_row + hbs < cm->mi_rows)
+        decode_modes_b(pbi, mi_row + hbs, mi_col, r, subsize);
       break;
     case PARTITION_VERT:
       decode_modes_b(pbi, mi_row, mi_col, r, subsize);
-      *(get_sb_index(xd, subsize)) = 1;
-      if (mi_col + bs < pc->mi_cols)
-        decode_modes_b(pbi, mi_row, mi_col + bs, r, subsize);
+      *get_sb_index(xd, subsize) = 1;
+      if (mi_col + hbs < cm->mi_cols)
+        decode_modes_b(pbi, mi_row, mi_col + hbs, r, subsize);
       break;
     case PARTITION_SPLIT: {
       int n;
       for (n = 0; n < 4; n++) {
         const int j = n >> 1, i = n & 1;
-        *(get_sb_index(xd, subsize)) = n;
-        decode_modes_sb(pbi, mi_row + j * bs, mi_col + i * bs, r, subsize);
+        *get_sb_index(xd, subsize) = n;
+        decode_modes_sb(pbi, mi_row + j * hbs, mi_col + i * hbs, r, subsize);
       }
     } break;
     default:
@@ -331,7 +320,7 @@
   // update partition context
   if (bsize >= BLOCK_8X8 &&
       (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT)) {
-    set_partition_seg_context(pc, xd, mi_row, mi_col);
+    set_partition_seg_context(cm, xd, mi_row, mi_col);
     update_partition_context(xd, subsize, bsize);
   }
 }
@@ -339,18 +328,18 @@
 static void setup_token_decoder(VP9D_COMP *pbi,
                                 const uint8_t *data, size_t read_size,
                                 vp9_reader *r) {
-  VP9_COMMON *pc = &pbi->common;
+  VP9_COMMON *cm = &pbi->common;
   const uint8_t *data_end = pbi->source + pbi->source_sz;
 
   // Validate the calculated partition length. If the buffer
   // described by the partition can't be fully read, then restrict
   // it to the portion that can be (for EC mode) or throw an error.
   if (!read_is_valid(data, read_size, data_end))
-    vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
                        "Truncated packet or corrupt tile length");
 
   if (vp9_reader_init(r, data, read_size))
-    vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
+    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate bool decoder %d", 1);
 }
 
@@ -582,31 +571,30 @@
 
 static void decode_tile(VP9D_COMP *pbi, vp9_reader *r) {
   const int num_threads = pbi->oxcf.max_threads;
-  VP9_COMMON *const pc = &pbi->common;
+  VP9_COMMON *const cm = &pbi->common;
   int mi_row, mi_col;
-  YV12_BUFFER_CONFIG *const fb = &pc->yv12_fb[pc->new_fb_idx];
+  YV12_BUFFER_CONFIG *const fb = &cm->yv12_fb[cm->new_fb_idx];
 
   if (pbi->do_loopfilter_inline) {
     if (num_threads > 1) {
       LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
       lf_data->frame_buffer = fb;
-      lf_data->cm = pc;
+      lf_data->cm = cm;
       lf_data->xd = pbi->mb;
       lf_data->stop = 0;
       lf_data->y_only = 0;
     }
-    vp9_loop_filter_frame_init(pc, pc->lf.filter_level);
+    vp9_loop_filter_frame_init(cm, cm->lf.filter_level);
   }
 
-  for (mi_row = pc->cur_tile_mi_row_start; mi_row < pc->cur_tile_mi_row_end;
+  for (mi_row = cm->cur_tile_mi_row_start; mi_row < cm->cur_tile_mi_row_end;
        mi_row += MI_BLOCK_SIZE) {
     // For a SB there are 2 left contexts, each pertaining to a MB row within
-    vp9_zero(pc->left_context);
-    vp9_zero(pc->left_seg_context);
-    for (mi_col = pc->cur_tile_mi_col_start; mi_col < pc->cur_tile_mi_col_end;
-         mi_col += MI_BLOCK_SIZE) {
+    vp9_zero(cm->left_context);
+    vp9_zero(cm->left_seg_context);
+    for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end;
+         mi_col += MI_BLOCK_SIZE)
       decode_modes_sb(pbi, mi_row, mi_col, r, BLOCK_64X64);
-    }
 
     if (pbi->do_loopfilter_inline) {
       // delay the loopfilter by 1 macroblock row.
@@ -617,7 +605,7 @@
         LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
 
         // decoding has completed: finish up the loop filter in this thread.
-        if (mi_row + MI_BLOCK_SIZE >= pc->cur_tile_mi_row_end) continue;
+        if (mi_row + MI_BLOCK_SIZE >= cm->cur_tile_mi_row_end) continue;
 
         vp9_worker_sync(&pbi->lf_worker);
         lf_data->start = lf_start;
@@ -625,7 +613,7 @@
         pbi->lf_worker.hook = vp9_loop_filter_worker;
         vp9_worker_launch(&pbi->lf_worker);
       } else {
-        vp9_loop_filter_rows(fb, pc, &pbi->mb, lf_start, mi_row, 0);
+        vp9_loop_filter_rows(fb, cm, &pbi->mb, lf_start, mi_row, 0);
       }
     }
   }
@@ -640,8 +628,8 @@
     } else {
       lf_start = mi_row - MI_BLOCK_SIZE;
     }
-    vp9_loop_filter_rows(fb, pc, &pbi->mb,
-                         lf_start, pc->mi_rows, 0);
+    vp9_loop_filter_rows(fb, cm, &pbi->mb,
+                         lf_start, cm->mi_rows, 0);
   }
 }
 
@@ -664,20 +652,20 @@
 static const uint8_t *decode_tiles(VP9D_COMP *pbi, const uint8_t *data) {
   vp9_reader residual_bc;
 
-  VP9_COMMON *const pc = &pbi->common;
+  VP9_COMMON *const cm = &pbi->common;
 
   const uint8_t *const data_end = pbi->source + pbi->source_sz;
-  const int aligned_mi_cols = mi_cols_aligned_to_sb(pc->mi_cols);
-  const int tile_cols = 1 << pc->log2_tile_cols;
-  const int tile_rows = 1 << pc->log2_tile_rows;
+  const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const int tile_rows = 1 << cm->log2_tile_rows;
   int tile_row, tile_col;
 
   // Note: this memset assumes above_context[0], [1] and [2]
   // are allocated as part of the same buffer.
-  vpx_memset(pc->above_context[0], 0,
+  vpx_memset(cm->above_context[0], 0,
              sizeof(ENTROPY_CONTEXT) * MAX_MB_PLANE * (2 * aligned_mi_cols));
 
-  vpx_memset(pc->above_seg_context, 0,
+  vpx_memset(cm->above_seg_context, 0,
              sizeof(PARTITION_CONTEXT) * aligned_mi_cols);
 
   if (pbi->oxcf.inv_tile_order) {
@@ -702,9 +690,9 @@
     }
 
     for (tile_row = 0; tile_row < tile_rows; tile_row++) {
-      vp9_get_tile_row_offsets(pc, tile_row);
+      vp9_get_tile_row_offsets(cm, tile_row);
       for (tile_col = tile_cols - 1; tile_col >= 0; tile_col--) {
-        vp9_get_tile_col_offsets(pc, tile_col);
+        vp9_get_tile_col_offsets(cm, tile_col);
         setup_token_decoder(pbi, data_ptr2[tile_row][tile_col],
                             data_end - data_ptr2[tile_row][tile_col],
                             &residual_bc);
@@ -718,16 +706,16 @@
     int has_more;
 
     for (tile_row = 0; tile_row < tile_rows; tile_row++) {
-      vp9_get_tile_row_offsets(pc, tile_row);
+      vp9_get_tile_row_offsets(cm, tile_row);
       for (tile_col = 0; tile_col < tile_cols; tile_col++) {
         size_t size;
 
-        vp9_get_tile_col_offsets(pc, tile_col);
+        vp9_get_tile_col_offsets(cm, tile_col);
 
         has_more = tile_col < tile_cols - 1 || tile_row < tile_rows - 1;
         if (has_more) {
           if (!read_is_valid(data, 4, data_end))
-            vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+            vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
                          "Truncated packet or corrupt tile length");
 
           size = read_be32(data);
@@ -940,17 +928,17 @@
 
 int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) {
   int i;
-  VP9_COMMON *const pc = &pbi->common;
+  VP9_COMMON *const cm = &pbi->common;
   MACROBLOCKD *const xd = &pbi->mb;
 
   const uint8_t *data = pbi->source;
   const uint8_t *data_end = pbi->source + pbi->source_sz;
 
   struct vp9_read_bit_buffer rb = { data, data_end, 0,
-                                    pc, error_handler };
+                                    cm, error_handler };
   const size_t first_partition_size = read_uncompressed_header(pbi, &rb);
-  const int keyframe = pc->frame_type == KEY_FRAME;
-  YV12_BUFFER_CONFIG *new_fb = &pc->yv12_fb[pc->new_fb_idx];
+  const int keyframe = cm->frame_type == KEY_FRAME;
+  YV12_BUFFER_CONFIG *new_fb = &cm->yv12_fb[cm->new_fb_idx];
 
   if (!first_partition_size) {
     // showing a frame directly
@@ -961,42 +949,39 @@
   xd->corrupted = 0;
   new_fb->corrupted = 0;
   pbi->do_loopfilter_inline =
-      (pc->log2_tile_rows | pc->log2_tile_cols) == 0 && pc->lf.filter_level;
+      (cm->log2_tile_rows | cm->log2_tile_cols) == 0 && cm->lf.filter_level;
 
   if (!pbi->decoded_key_frame && !keyframe)
     return -1;
 
   if (!read_is_valid(data, first_partition_size, data_end))
-    vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
                        "Truncated packet or corrupt header length");
 
-  xd->mode_info_context = pc->mi;
-  xd->prev_mode_info_context = pc->prev_mi;
-  xd->mode_info_stride = pc->mode_info_stride;
+  xd->mode_info_context = cm->mi;
+  xd->prev_mode_info_context = cm->prev_mi;
+  xd->mode_info_stride = cm->mode_info_stride;
 
-  init_dequantizer(pc, &pbi->mb);
+  init_dequantizer(cm, &pbi->mb);
 
-  if (!keyframe)
-    vp9_setup_interp_filters(xd, pc->mcomp_filter_type, pc);
+  cm->fc = cm->frame_contexts[cm->frame_context_idx];
 
-  pc->fc = pc->frame_contexts[pc->frame_context_idx];
-
-  vp9_zero(pc->counts);
+  vp9_zero(cm->counts);
 
   new_fb->corrupted |= read_compressed_header(pbi, data, first_partition_size);
 
-  setup_block_dptrs(xd, pc->subsampling_x, pc->subsampling_y);
+  setup_block_dptrs(xd, cm->subsampling_x, cm->subsampling_y);
 
   // clear out the coeff buffer
   for (i = 0; i < MAX_MB_PLANE; ++i)
     vp9_zero(xd->plane[i].qcoeff);
 
-  set_prev_mi(pc);
+  set_prev_mi(cm);
 
   *p_data_end = decode_tiles(pbi, data + first_partition_size);
 
-  pc->last_width = pc->width;
-  pc->last_height = pc->height;
+  cm->last_width = cm->width;
+  cm->last_height = cm->height;
 
   new_fb->corrupted |= xd->corrupted;
 
@@ -1004,21 +989,21 @@
     if (keyframe && !new_fb->corrupted)
       pbi->decoded_key_frame = 1;
     else
-      vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+      vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
                          "A stream must start with a complete key frame");
   }
 
-  if (!pc->error_resilient_mode && !pc->frame_parallel_decoding_mode) {
-    vp9_adapt_coef_probs(pc);
+  if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) {
+    vp9_adapt_coef_probs(cm);
 
-    if (!keyframe && !pc->intra_only) {
-      vp9_adapt_mode_probs(pc);
-      vp9_adapt_mv_probs(pc, xd->allow_high_precision_mv);
+    if (!keyframe && !cm->intra_only) {
+      vp9_adapt_mode_probs(cm);
+      vp9_adapt_mv_probs(cm, xd->allow_high_precision_mv);
     }
   }
 
-  if (pc->refresh_frame_context)
-    pc->frame_contexts[pc->frame_context_idx] = pc->fc;
+  if (cm->refresh_frame_context)
+    cm->frame_contexts[cm->frame_context_idx] = cm->fc;
 
   return 0;
 }
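
Note: decode_modes_sb() now steps by hbs, half the block width in 8x8 MI
units. A toy model of the quadrant arithmetic used in the SPLIT case
(simplified: recursion stops at 8x8 instead of reading a partition type):

    /* Quadrant n of a SPLIT block bs8 MI-units wide lands at
     * (row + (n >> 1) * hbs, col + (n & 1) * hbs), hbs = bs8 / 2. */
    static void walk_partition(int row, int col, int bs8) {
      const int hbs = bs8 / 2;
      int n;
      if (hbs == 0)
        return;  /* 8x8: no further split */
      for (n = 0; n < 4; n++)
        walk_partition(row + (n >> 1) * hbs, col + (n & 1) * hbs, hbs);
    }
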
diff --git a/vp9/decoder/vp9_decodframe.h b/vp9/decoder/vp9_decodframe.h
index 00b6d67..c665f6f 100644
--- a/vp9/decoder/vp9_decodframe.h
+++ b/vp9/decoder/vp9_decodframe.h
@@ -15,7 +15,7 @@
 struct VP9Common;
 struct VP9Decompressor;
 
-void vp9_init_dequantizer(struct VP9Common *pc);
+void vp9_init_dequantizer(struct VP9Common *cm);
 int vp9_decode_frame(struct VP9Decompressor *cpi, const uint8_t **p_data_end);
 
 #endif  // VP9_DECODER_VP9_DECODFRAME_H_
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index 3a62bba..c119093 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -94,9 +94,8 @@
                         ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L) {
   FRAME_CONTEXT *const fc = &cm->fc;
   FRAME_COUNTS *const counts = &cm->counts;
-  ENTROPY_CONTEXT above_ec, left_ec;
   const int ref = is_inter_block(&xd->mode_info_context->mbmi);
-  int band, pt, c = 0;
+  int band, c = 0;
   vp9_prob (*coef_probs)[PREV_COEF_CONTEXTS][UNCONSTRAINED_NODES] =
       fc->coef_probs[tx_size][type][ref];
   vp9_prob coef_probs_full[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
@@ -104,38 +103,10 @@
   vp9_prob *prob;
   vp9_coeff_count_model *coef_counts = counts->coef[tx_size];
   const int16_t *scan, *nb;
-  uint8_t token_cache[1024];
   const uint8_t *band_translate;
-
-  switch (tx_size) {
-    default:
-    case TX_4X4:
-      scan = get_scan_4x4(get_tx_type_4x4(type, xd, block_idx));
-      above_ec = A[0] != 0;
-      left_ec = L[0] != 0;
-      band_translate = vp9_coefband_trans_4x4;
-      break;
-    case TX_8X8:
-      scan = get_scan_8x8(get_tx_type_8x8(type, xd));
-      above_ec = !!*(uint16_t *)A;
-      left_ec  = !!*(uint16_t *)L;
-      band_translate = vp9_coefband_trans_8x8plus;
-      break;
-    case TX_16X16:
-      scan = get_scan_16x16(get_tx_type_16x16(type, xd));
-      above_ec = !!*(uint32_t *)A;
-      left_ec  = !!*(uint32_t *)L;
-      band_translate = vp9_coefband_trans_8x8plus;
-      break;
-    case TX_32X32:
-      scan = vp9_default_scan_32x32;
-      above_ec = !!*(uint64_t *)A;
-      left_ec  = !!*(uint64_t *)L;
-      band_translate = vp9_coefband_trans_8x8plus;
-      break;
-  }
-
-  pt = combine_entropy_contexts(above_ec, left_ec);
+  uint8_t token_cache[1024];
+  int pt = get_entropy_context(xd, tx_size, type, block_idx, A, L,
+                               &scan, &band_translate);
   nb = vp9_get_coef_neighbors_handle(scan);
 
   while (1) {
@@ -239,17 +210,13 @@
   return c;
 }
 
-static int get_eob(struct segmentation *seg, int segment_id, int eob_max) {
-  return vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
-}
-
 struct decode_block_args {
   VP9D_COMP *pbi;
   vp9_reader *r;
   int *eobtotal;
 };
 
-static void decode_block(int plane, int block, BLOCK_SIZE_TYPE plane_bsize,
+static void decode_block(int plane, int block, BLOCK_SIZE plane_bsize,
                          TX_SIZE tx_size, void *argv) {
   const struct decode_block_args* const arg = argv;
 
@@ -258,8 +225,7 @@
   struct segmentation *seg = &arg->pbi->common.seg;
   struct macroblockd_plane* pd = &xd->plane[plane];
   const int segment_id = xd->mode_info_context->mbmi.segment_id;
-  const int ss_txfrm_size = tx_size << 1;
-  const int seg_eob = get_eob(seg, segment_id, 16 << ss_txfrm_size);
+  const int seg_eob = get_tx_eob(seg, segment_id, tx_size);
   int aoff, loff, eob;
 
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff);
@@ -275,7 +241,7 @@
   *arg->eobtotal += eob;
 }
 
-int vp9_decode_tokens(VP9D_COMP *pbi, vp9_reader *r, BLOCK_SIZE_TYPE bsize) {
+int vp9_decode_tokens(VP9D_COMP *pbi, vp9_reader *r, BLOCK_SIZE bsize) {
   int eobtotal = 0;
   struct decode_block_args args = {pbi, r, &eobtotal};
   foreach_transformed_block(&pbi->mb, bsize, decode_block, &args);
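
Note: get_entropy_context() absorbs the per-tx-size switch removed above,
including the widening casts on the above/left context pointers. A byte-wise
model of what those casts computed (sketch only, not the helper's real
signature):

    #include <stdint.h>

    typedef uint8_t ENTROPY_CONTEXT;

    /* !!*(uint16_t *)A for TX_8X8, !!*(uint32_t *)A for TX_16X16 and
     * !!*(uint64_t *)A for TX_32X32 all reduce to "is any of the
     * 1 << tx_size context bytes non-zero". */
    static int entropy_context_model(int tx_size, /* 0=TX_4X4..3=TX_32X32 */
                                     const ENTROPY_CONTEXT *a,
                                     const ENTROPY_CONTEXT *l) {
      int above_ec = 0, left_ec = 0, i;
      for (i = 0; i < (1 << tx_size); i++) {
        above_ec |= a[i] != 0;
        left_ec |= l[i] != 0;
      }
      /* assuming combine_entropy_contexts() adds the two flags */
      return above_ec + left_ec;
    }
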
diff --git a/vp9/decoder/vp9_detokenize.h b/vp9/decoder/vp9_detokenize.h
index f98fe8d..cf07c56 100644
--- a/vp9/decoder/vp9_detokenize.h
+++ b/vp9/decoder/vp9_detokenize.h
@@ -15,6 +15,6 @@
 #include "vp9/decoder/vp9_onyxd_int.h"
 #include "vp9/decoder/vp9_dboolhuff.h"
 
-int vp9_decode_tokens(VP9D_COMP* pbi, vp9_reader *r, BLOCK_SIZE_TYPE bsize);
+int vp9_decode_tokens(VP9D_COMP* pbi, vp9_reader *r, BLOCK_SIZE bsize);
 
 #endif  // VP9_DECODER_VP9_DETOKENIZE_H_
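
Note: in vp9_detokenize.c above, the removed get_eob() and the inlined
16 << ss_txfrm_size computation are folded into get_tx_eob(), defined
elsewhere. A sketch of the behaviour the call site assumes:

    /* TX_4X4..TX_32X32 hold 16 << (tx_size << 1) = 16/64/256/1024
     * coefficients; a SEG_LVL_SKIP segment codes none of them. */
    static int get_tx_eob_model(const struct segmentation *seg,
                                int segment_id, int tx_size) {
      const int eob_max = 16 << (tx_size << 1);
      return vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)
                 ? 0 : eob_max;
    }
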
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index d7c73b6..f3bbc17 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -199,7 +199,7 @@
 }
 
 static void write_selected_tx_size(const VP9_COMP *cpi, TX_SIZE tx_size,
-                                   BLOCK_SIZE_TYPE bsize, vp9_writer *w) {
+                                   BLOCK_SIZE bsize, vp9_writer *w) {
   const MACROBLOCKD *const xd = &cpi->mb.e_mbd;
   const vp9_prob *tx_probs = get_tx_probs2(xd, &cpi->common.fc.tx_probs);
   vp9_write(w, tx_size != TX_4X4, tx_probs[0]);
@@ -237,7 +237,7 @@
 
 static void update_switchable_interp_probs(VP9_COMP *const cpi,
                                            vp9_writer* const bc) {
-  VP9_COMMON *const pc = &cpi->common;
+  VP9_COMMON *const cm = &cpi->common;
   unsigned int branch_ct[SWITCHABLE_FILTERS + 1]
                         [SWITCHABLE_FILTERS - 1][2];
   vp9_prob new_prob[SWITCHABLE_FILTERS + 1][SWITCHABLE_FILTERS - 1];
@@ -246,21 +246,21 @@
     vp9_tree_probs_from_distribution(
         vp9_switchable_interp_tree,
         new_prob[j], branch_ct[j],
-        pc->counts.switchable_interp[j], 0);
+        cm->counts.switchable_interp[j], 0);
   }
   for (j = 0; j <= SWITCHABLE_FILTERS; ++j) {
     for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i) {
-      vp9_cond_prob_diff_update(bc, &pc->fc.switchable_interp_prob[j][i],
+      vp9_cond_prob_diff_update(bc, &cm->fc.switchable_interp_prob[j][i],
                                 MODE_UPDATE_PROB, branch_ct[j][i]);
     }
   }
 #ifdef MODE_STATS
   if (!cpi->dummy_packing)
-    update_switchable_interp_stats(pc);
+    update_switchable_interp_stats(cm);
 #endif
 }
 
-static void update_inter_mode_probs(VP9_COMMON *pc, vp9_writer* const bc) {
+static void update_inter_mode_probs(VP9_COMMON *cm, vp9_writer* const bc) {
   int i, j;
 
   for (i = 0; i < INTER_MODE_CONTEXTS; ++i) {
@@ -269,10 +269,10 @@
 
     vp9_tree_probs_from_distribution(vp9_inter_mode_tree,
                                      new_prob, branch_ct,
-                                     pc->counts.inter_mode[i], NEARESTMV);
+                                     cm->counts.inter_mode[i], NEARESTMV);
 
     for (j = 0; j < INTER_MODES - 1; ++j)
-      vp9_cond_prob_diff_update(bc, &pc->fc.inter_mode_probs[i][j],
+      vp9_cond_prob_diff_update(bc, &cm->fc.inter_mode_probs[i][j],
                                 MODE_UPDATE_PROB, branch_ct[j]);
   }
 }
@@ -356,39 +356,39 @@
 
 // This function encodes the reference frame
 static void encode_ref_frame(VP9_COMP *cpi, vp9_writer *bc) {
-  VP9_COMMON *const pc = &cpi->common;
+  VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *mi = &xd->mode_info_context->mbmi;
   const int segment_id = mi->segment_id;
-  int seg_ref_active = vp9_segfeature_active(&pc->seg, segment_id,
+  int seg_ref_active = vp9_segfeature_active(&cm->seg, segment_id,
                                              SEG_LVL_REF_FRAME);
   // If segment level coding of this signal is disabled...
   // or the segment allows multiple reference frame options
   if (!seg_ref_active) {
     // does the feature use compound prediction or not
     // (if not specified at the frame/segment level)
-    if (pc->comp_pred_mode == HYBRID_PREDICTION) {
+    if (cm->comp_pred_mode == HYBRID_PREDICTION) {
       vp9_write(bc, mi->ref_frame[1] > INTRA_FRAME,
-                vp9_get_pred_prob_comp_inter_inter(pc, xd));
+                vp9_get_pred_prob_comp_inter_inter(cm, xd));
     } else {
       assert((mi->ref_frame[1] <= INTRA_FRAME) ==
-                 (pc->comp_pred_mode == SINGLE_PREDICTION_ONLY));
+                 (cm->comp_pred_mode == SINGLE_PREDICTION_ONLY));
     }
 
     if (mi->ref_frame[1] > INTRA_FRAME) {
       vp9_write(bc, mi->ref_frame[0] == GOLDEN_FRAME,
-                vp9_get_pred_prob_comp_ref_p(pc, xd));
+                vp9_get_pred_prob_comp_ref_p(cm, xd));
     } else {
       vp9_write(bc, mi->ref_frame[0] != LAST_FRAME,
-                vp9_get_pred_prob_single_ref_p1(pc, xd));
+                vp9_get_pred_prob_single_ref_p1(cm, xd));
       if (mi->ref_frame[0] != LAST_FRAME)
         vp9_write(bc, mi->ref_frame[0] != GOLDEN_FRAME,
-                  vp9_get_pred_prob_single_ref_p2(pc, xd));
+                  vp9_get_pred_prob_single_ref_p2(cm, xd));
     }
   } else {
     assert(mi->ref_frame[1] <= INTRA_FRAME);
-    assert(vp9_get_segdata(&pc->seg, segment_id, SEG_LVL_REF_FRAME) ==
+    assert(vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME) ==
            mi->ref_frame[0]);
   }
 
@@ -397,20 +397,20 @@
 }
 
 static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) {
-  VP9_COMMON *const pc = &cpi->common;
-  const nmv_context *nmvc = &pc->fc.nmvc;
+  VP9_COMMON *const cm = &cpi->common;
+  const nmv_context *nmvc = &cm->fc.nmvc;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  struct segmentation *seg = &pc->seg;
+  struct segmentation *seg = &cm->seg;
   MB_MODE_INFO *const mi = &m->mbmi;
   const MV_REFERENCE_FRAME rf = mi->ref_frame[0];
   const MB_PREDICTION_MODE mode = mi->mode;
   const int segment_id = mi->segment_id;
   int skip_coeff;
-  const BLOCK_SIZE_TYPE bsize = mi->sb_type;
+  const BLOCK_SIZE bsize = mi->sb_type;
   const int allow_hp = xd->allow_high_precision_mv;
 
-  x->partition_info = x->pi + (m - pc->mi);
+  x->partition_info = x->pi + (m - cm->mi);
 
 #ifdef ENTROPY_STATS
   active_section = 9;
@@ -432,12 +432,12 @@
 
   if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
     vp9_write(bc, rf != INTRA_FRAME,
-              vp9_get_pred_prob_intra_inter(pc, xd));
+              vp9_get_pred_prob_intra_inter(cm, xd));
 
-  if (bsize >= BLOCK_8X8 && pc->tx_mode == TX_MODE_SELECT &&
+  if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT &&
       !(rf != INTRA_FRAME &&
         (skip_coeff || vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)))) {
-    write_selected_tx_size(cpi, mi->txfm_size, bsize, bc);
+    write_selected_tx_size(cpi, mi->tx_size, bsize, bc);
   }
 
   if (rf == INTRA_FRAME) {
@@ -446,7 +446,7 @@
 #endif
 
     if (bsize >= BLOCK_8X8) {
-      write_intra_mode(bc, mode, pc->fc.y_mode_prob[size_group_lookup[bsize]]);
+      write_intra_mode(bc, mode, cm->fc.y_mode_prob[size_group_lookup[bsize]]);
     } else {
       int idx, idy;
       const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
@@ -454,11 +454,11 @@
       for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
         for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
           const MB_PREDICTION_MODE bm = m->bmi[idy * 2 + idx].as_mode;
-          write_intra_mode(bc, bm, pc->fc.y_mode_prob[0]);
+          write_intra_mode(bc, bm, cm->fc.y_mode_prob[0]);
         }
       }
     }
-    write_intra_mode(bc, mi->uv_mode, pc->fc.uv_mode_prob[mode]);
+    write_intra_mode(bc, mi->uv_mode, cm->fc.uv_mode_prob[mode]);
   } else {
     vp9_prob *mv_ref_p;
     encode_ref_frame(cpi, bc);
@@ -472,18 +472,18 @@
     if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
       if (bsize >= BLOCK_8X8) {
         write_sb_mv_ref(bc, mode, mv_ref_p);
-        ++pc->counts.inter_mode[mi->mode_context[rf]]
+        ++cm->counts.inter_mode[mi->mode_context[rf]]
                                [inter_mode_offset(mode)];
       }
     }
 
-    if (pc->mcomp_filter_type == SWITCHABLE) {
+    if (cm->mcomp_filter_type == SWITCHABLE) {
       const int ctx = vp9_get_pred_context_switchable_interp(xd);
       write_token(bc, vp9_switchable_interp_tree,
-                  pc->fc.switchable_interp_prob[ctx],
+                  cm->fc.switchable_interp_prob[ctx],
                   &vp9_switchable_interp_encodings[mi->interp_filter]);
     } else {
-      assert(mi->interp_filter == pc->mcomp_filter_type);
+      assert(mi->interp_filter == cm->mcomp_filter_type);
     }
 
     if (bsize < BLOCK_8X8) {
@@ -499,7 +499,7 @@
           blockmode = x->partition_info->bmi[j].mode;
           blockmv = m->bmi[j].as_mv[0];
           write_sb_mv_ref(bc, blockmode, mv_ref_p);
-          ++pc->counts.inter_mode[mi->mode_context[rf]]
+          ++cm->counts.inter_mode[mi->mode_context[rf]]
                                  [inter_mode_offset(blockmode)];
 
           if (blockmode == NEWMV) {
@@ -533,11 +533,11 @@
 
 static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO *m,
                               vp9_writer *bc) {
-  const VP9_COMMON *const c = &cpi->common;
+  const VP9_COMMON *const cm = &cpi->common;
   const MACROBLOCKD *const xd = &cpi->mb.e_mbd;
-  const struct segmentation *const seg = &c->seg;
+  const struct segmentation *const seg = &cm->seg;
   const int ym = m->mbmi.mode;
-  const int mis = c->mode_info_stride;
+  const int mis = cm->mode_info_stride;
   const int segment_id = m->mbmi.segment_id;
 
   if (seg->update_map)
@@ -545,8 +545,8 @@
 
   write_skip_coeff(cpi, segment_id, m, bc);
 
-  if (m->mbmi.sb_type >= BLOCK_8X8 && c->tx_mode == TX_MODE_SELECT)
-    write_selected_tx_size(cpi, m->mbmi.txfm_size, m->mbmi.sb_type, bc);
+  if (m->mbmi.sb_type >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT)
+    write_selected_tx_size(cpi, m->mbmi.tx_size, m->mbmi.sb_type, bc);
 
   if (m->mbmi.sb_type >= BLOCK_8X8) {
     const MB_PREDICTION_MODE A = above_block_mode(m, 0, mis);
@@ -584,11 +584,13 @@
   if (m->mbmi.sb_type < BLOCK_8X8)
     if (xd->ab_index > 0)
       return;
+
   xd->mode_info_context = m;
-  set_mi_row_col(&cpi->common, xd, mi_row,
-                 1 << mi_height_log2(m->mbmi.sb_type),
-                 mi_col, 1 << mi_width_log2(m->mbmi.sb_type));
-  if ((cm->frame_type == KEY_FRAME) || cm->intra_only) {
+  set_mi_row_col(&cpi->common, xd,
+                 mi_row, num_8x8_blocks_high_lookup[m->mbmi.sb_type],
+                 mi_col, num_8x8_blocks_wide_lookup[m->mbmi.sb_type]);
+
+  if (cm->frame_type == KEY_FRAME || cm->intra_only) {
     write_mb_modes_kf(cpi, m, bc);
 #ifdef ENTROPY_STATS
     active_section = 8;
@@ -606,8 +608,7 @@
 
 static void write_modes_sb(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc,
                            TOKENEXTRA **tok, TOKENEXTRA *tok_end,
-                           int mi_row, int mi_col,
-                           BLOCK_SIZE_TYPE bsize) {
+                           int mi_row, int mi_col, BLOCK_SIZE bsize) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *xd = &cpi->mb.e_mbd;
   const int mis = cm->mode_info_stride;
@@ -615,7 +616,7 @@
   int bs = (1 << bsl) / 4;  // mode_info step for subsize
   int n;
   PARTITION_TYPE partition = PARTITION_NONE;
-  BLOCK_SIZE_TYPE subsize;
+  BLOCK_SIZE subsize;
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
@@ -683,18 +684,18 @@
 
 static void write_modes(VP9_COMP *cpi, vp9_writer* const bc,
                         TOKENEXTRA **tok, TOKENEXTRA *tok_end) {
-  VP9_COMMON *const c = &cpi->common;
-  const int mis = c->mode_info_stride;
-  MODE_INFO *m, *m_ptr = c->mi;
+  VP9_COMMON *const cm = &cpi->common;
+  const int mis = cm->mode_info_stride;
+  MODE_INFO *m, *m_ptr = cm->mi;
   int mi_row, mi_col;
 
-  m_ptr += c->cur_tile_mi_col_start + c->cur_tile_mi_row_start * mis;
+  m_ptr += cm->cur_tile_mi_col_start + cm->cur_tile_mi_row_start * mis;
 
-  for (mi_row = c->cur_tile_mi_row_start; mi_row < c->cur_tile_mi_row_end;
+  for (mi_row = cm->cur_tile_mi_row_start; mi_row < cm->cur_tile_mi_row_end;
        mi_row += 8, m_ptr += 8 * mis) {
     m = m_ptr;
-    vp9_zero(c->left_seg_context);
-    for (mi_col = c->cur_tile_mi_col_start; mi_col < c->cur_tile_mi_col_end;
+    vp9_zero(cm->left_seg_context);
+    for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end;
          mi_col += MI_BLOCK_SIZE, m += MI_BLOCK_SIZE)
       write_modes_sb(cpi, m, bc, tok, tok_end, mi_row, mi_col, BLOCK_64X64);
   }
@@ -783,94 +784,170 @@
   vp9_coeff_probs_model *old_frame_coef_probs =
       cpi->common.fc.coef_probs[tx_size];
   vp9_coeff_stats *frame_branch_ct = cpi->frame_branch_ct[tx_size];
-  int i, j, k, l, t;
-  int update[2] = {0, 0};
-  int savings;
-
+  const vp9_prob upd = VP9_COEF_UPDATE_PROB;
   const int entropy_nodes_update = UNCONSTRAINED_NODES;
+  int i, j, k, l, t;
+  switch (cpi->sf.use_fast_coef_updates) {
+    case 0: {
+      /* dry run to see if there is any update at all needed */
+      int savings = 0;
+      int update[2] = {0, 0};
+      for (i = 0; i < BLOCK_TYPES; ++i) {
+        for (j = 0; j < REF_TYPES; ++j) {
+          for (k = 0; k < COEF_BANDS; ++k) {
+            for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
+              for (t = 0; t < entropy_nodes_update; ++t) {
+                vp9_prob newp = new_frame_coef_probs[i][j][k][l][t];
+                const vp9_prob oldp = old_frame_coef_probs[i][j][k][l][t];
+                int s;
+                int u = 0;
 
-  const int tstart = 0;
-  /* dry run to see if there is any udpate at all needed */
-  savings = 0;
-  for (i = 0; i < BLOCK_TYPES; ++i) {
-    for (j = 0; j < REF_TYPES; ++j) {
-      for (k = 0; k < COEF_BANDS; ++k) {
-        // int prev_coef_savings[ENTROPY_NODES] = {0};
-        for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
-          for (t = tstart; t < entropy_nodes_update; ++t) {
-            vp9_prob newp = new_frame_coef_probs[i][j][k][l][t];
-            const vp9_prob oldp = old_frame_coef_probs[i][j][k][l][t];
-            const vp9_prob upd = VP9_COEF_UPDATE_PROB;
-            int s;
-            int u = 0;
-
-            if (l >= 3 && k == 0)
-              continue;
-            if (t == PIVOT_NODE)
-              s = vp9_prob_diff_update_savings_search_model(
-                  frame_branch_ct[i][j][k][l][0],
-                  old_frame_coef_probs[i][j][k][l], &newp, upd, i, j);
-            else
-              s = vp9_prob_diff_update_savings_search(
-                  frame_branch_ct[i][j][k][l][t], oldp, &newp, upd);
-            if (s > 0 && newp != oldp)
-              u = 1;
-            if (u)
-              savings += s - (int)(vp9_cost_zero(upd));
-            else
-              savings -= (int)(vp9_cost_zero(upd));
-            update[u]++;
-          }
-        }
-      }
-    }
-  }
-
-  // printf("Update %d %d, savings %d\n", update[0], update[1], savings);
-  /* Is coef updated at all */
-  if (update[1] == 0 || savings < 0) {
-    vp9_write_bit(bc, 0);
-    return;
-  }
-  vp9_write_bit(bc, 1);
-  for (i = 0; i < BLOCK_TYPES; ++i) {
-    for (j = 0; j < REF_TYPES; ++j) {
-      for (k = 0; k < COEF_BANDS; ++k) {
-        // int prev_coef_savings[ENTROPY_NODES] = {0};
-        for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
-          // calc probs and branch cts for this frame only
-          for (t = tstart; t < entropy_nodes_update; ++t) {
-            vp9_prob newp = new_frame_coef_probs[i][j][k][l][t];
-            vp9_prob *oldp = old_frame_coef_probs[i][j][k][l] + t;
-            const vp9_prob upd = VP9_COEF_UPDATE_PROB;
-            int s;
-            int u = 0;
-            if (l >= 3 && k == 0)
-              continue;
-            if (t == PIVOT_NODE)
-              s = vp9_prob_diff_update_savings_search_model(
-                  frame_branch_ct[i][j][k][l][0],
-                  old_frame_coef_probs[i][j][k][l], &newp, upd, i, j);
-            else
-              s = vp9_prob_diff_update_savings_search(
-                  frame_branch_ct[i][j][k][l][t],
-                  *oldp, &newp, upd);
-            if (s > 0 && newp != *oldp)
-              u = 1;
-            vp9_write(bc, u, upd);
-#ifdef ENTROPY_STATS
-            if (!cpi->dummy_packing)
-              ++tree_update_hist[tx_size][i][j][k][l][t][u];
-#endif
-            if (u) {
-              /* send/use new probability */
-              vp9_write_prob_diff_update(bc, newp, *oldp);
-              *oldp = newp;
+                if (l >= 3 && k == 0)
+                  continue;
+                if (t == PIVOT_NODE)
+                  s = vp9_prob_diff_update_savings_search_model(
+                      frame_branch_ct[i][j][k][l][0],
+                      old_frame_coef_probs[i][j][k][l], &newp, upd, i, j);
+                else
+                  s = vp9_prob_diff_update_savings_search(
+                      frame_branch_ct[i][j][k][l][t], oldp, &newp, upd);
+                if (s > 0 && newp != oldp)
+                  u = 1;
+                if (u)
+                  savings += s - (int)(vp9_cost_zero(upd));
+                else
+                  savings -= (int)(vp9_cost_zero(upd));
+                update[u]++;
+              }
             }
           }
         }
       }
+
+      // printf("Update %d %d, savings %d\n", update[0], update[1], savings);
+      /* Is coef updated at all */
+      if (update[1] == 0 || savings < 0) {
+        vp9_write_bit(bc, 0);
+        return;
+      }
+      vp9_write_bit(bc, 1);
+      for (i = 0; i < BLOCK_TYPES; ++i) {
+        for (j = 0; j < REF_TYPES; ++j) {
+          for (k = 0; k < COEF_BANDS; ++k) {
+            for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
+              // calc probs and branch cts for this frame only
+              for (t = 0; t < entropy_nodes_update; ++t) {
+                vp9_prob newp = new_frame_coef_probs[i][j][k][l][t];
+                vp9_prob *oldp = old_frame_coef_probs[i][j][k][l] + t;
+                const vp9_prob upd = VP9_COEF_UPDATE_PROB;
+                int s;
+                int u = 0;
+                if (l >= 3 && k == 0)
+                  continue;
+                if (t == PIVOT_NODE)
+                  s = vp9_prob_diff_update_savings_search_model(
+                      frame_branch_ct[i][j][k][l][0],
+                      old_frame_coef_probs[i][j][k][l], &newp, upd, i, j);
+                else
+                  s = vp9_prob_diff_update_savings_search(
+                      frame_branch_ct[i][j][k][l][t],
+                      *oldp, &newp, upd);
+                if (s > 0 && newp != *oldp)
+                  u = 1;
+                vp9_write(bc, u, upd);
+#ifdef ENTROPY_STATS
+                if (!cpi->dummy_packing)
+                  ++tree_update_hist[tx_size][i][j][k][l][t][u];
+#endif
+                if (u) {
+                  /* send/use new probability */
+                  vp9_write_prob_diff_update(bc, newp, *oldp);
+                  *oldp = newp;
+                }
+              }
+            }
+          }
+        }
+      }
+      return;
     }
+
+    case 1:
+    case 2: {
+      const int prev_coef_contexts_to_update =
+          (cpi->sf.use_fast_coef_updates == 2 ?
+           PREV_COEF_CONTEXTS >> 1 : PREV_COEF_CONTEXTS);
+      const int coef_band_to_update =
+          (cpi->sf.use_fast_coef_updates == 2 ?
+           COEF_BANDS >> 1 : COEF_BANDS);
+      int updates = 0;
+      int noupdates_before_first = 0;
+      for (i = 0; i < BLOCK_TYPES; ++i) {
+        for (j = 0; j < REF_TYPES; ++j) {
+          for (k = 0; k < COEF_BANDS; ++k) {
+            for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
+              // calc probs and branch cts for this frame only
+              for (t = 0; t < entropy_nodes_update; ++t) {
+                vp9_prob newp = new_frame_coef_probs[i][j][k][l][t];
+                vp9_prob *oldp = old_frame_coef_probs[i][j][k][l] + t;
+                int s;
+                int u = 0;
+                if (l >= 3 && k == 0)
+                  continue;
+                if (l >= prev_coef_contexts_to_update ||
+                    k >= coef_band_to_update) {
+                  u = 0;
+                } else {
+                  if (t == PIVOT_NODE)
+                    s = vp9_prob_diff_update_savings_search_model(
+                        frame_branch_ct[i][j][k][l][0],
+                        old_frame_coef_probs[i][j][k][l], &newp, upd, i, j);
+                  else
+                    s = vp9_prob_diff_update_savings_search(
+                        frame_branch_ct[i][j][k][l][t],
+                        *oldp, &newp, upd);
+                  if (s > 0 && newp != *oldp)
+                    u = 1;
+                }
+                updates += u;
+                if (u == 0 && updates == 0) {
+                  noupdates_before_first++;
+#ifdef ENTROPY_STATS
+                  if (!cpi->dummy_packing)
+                    ++tree_update_hist[tx_size][i][j][k][l][t][u];
+#endif
+                  continue;
+                }
+                if (u == 1 && updates == 1) {
+                  int v;
+                  // first update
+                  vp9_write_bit(bc, 1);
+                  for (v = 0; v < noupdates_before_first; ++v)
+                    vp9_write(bc, 0, upd);
+                }
+                vp9_write(bc, u, upd);
+#ifdef ENTROPY_STATS
+                if (!cpi->dummy_packing)
+                  ++tree_update_hist[tx_size][i][j][k][l][t][u];
+#endif
+                if (u) {
+                  /* send/use new probability */
+                  vp9_write_prob_diff_update(bc, newp, *oldp);
+                  *oldp = newp;
+                }
+              }
+            }
+          }
+        }
+      }
+      if (updates == 0) {
+        vp9_write_bit(bc, 0);  // no updates
+      }
+      return;
+    }
+
+    default:
+      assert(0);
   }
 }
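
For reference, the deferred-flag bookkeeping in the fast paths (cases 1 and 2) can be distilled into a standalone sketch. write_bit() and write_prob() below are toy tracing stand-ins for vp9_write_bit() and vp9_write(); only the control flow mirrors the code above.

#include <stdio.h>

/* Toy tracing stand-ins, not the libvpx writer API. */
static void write_bit(int b)  { printf("frame flag: %d\n", b); }
static void write_prob(int u) { printf("node update flag: %d\n", u); }

/* One-pass update: the frame-level "any updates?" bit is deferred until
 * the first node that actually updates, then the skipped zero flags are
 * back-filled, as in cases 1 and 2 above. */
static void one_pass_update(const int *want_update, int n) {
  int updates = 0, noupdates_before_first = 0, i, v;
  for (i = 0; i < n; ++i) {
    const int u = want_update[i];
    updates += u;
    if (u == 0 && updates == 0) {
      ++noupdates_before_first;  /* nothing written yet; just count */
      continue;
    }
    if (u == 1 && updates == 1) {
      write_bit(1);              /* first update: emit the frame flag */
      for (v = 0; v < noupdates_before_first; ++v)
        write_prob(0);           /* back-fill the deferred zero flags */
    }
    write_prob(u);
  }
  if (updates == 0)
    write_bit(0);                /* no node updated at all */
}

int main(void) {
  const int want_update[] = { 0, 0, 1, 0, 1 };
  one_pass_update(want_update, 5);
  return 0;
}
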
 
@@ -1457,7 +1534,7 @@
   vp9_compute_update_table();
 
 #ifdef ENTROPY_STATS
-  if (pc->frame_type == INTER_FRAME)
+  if (cm->frame_type == INTER_FRAME)
     active_section = 0;
   else
     active_section = 7;
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 9426f44..7b2dd11 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -170,10 +170,10 @@
   PICK_MODE_CONTEXT sb64_context;
   int partition_cost[NUM_PARTITION_CONTEXTS][PARTITION_TYPES];
 
-  BLOCK_SIZE_TYPE b_partitioning[4][4][4];
-  BLOCK_SIZE_TYPE mb_partitioning[4][4];
-  BLOCK_SIZE_TYPE sb_partitioning[4];
-  BLOCK_SIZE_TYPE sb64_partitioning;
+  BLOCK_SIZE b_partitioning[4][4][4];
+  BLOCK_SIZE mb_partitioning[4][4];
+  BLOCK_SIZE sb_partitioning[4];
+  BLOCK_SIZE sb64_partitioning;
 
   void (*fwd_txm4x4)(int16_t *input, int16_t *output, int pitch);
   void (*fwd_txm8x4)(int16_t *input, int16_t *output, int pitch);
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index 3112dad..4f4ad04 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -1077,6 +1077,44 @@
   output[30] = step[30];
   output[31] = step[31];
 
+  // damp the magnitude by 4, hence the intermediate values are within
+  // the range of 16 bits.
+  if (round) {
+    output[0] = half_round_shift(output[0]);
+    output[1] = half_round_shift(output[1]);
+    output[2] = half_round_shift(output[2]);
+    output[3] = half_round_shift(output[3]);
+    output[4] = half_round_shift(output[4]);
+    output[5] = half_round_shift(output[5]);
+    output[6] = half_round_shift(output[6]);
+    output[7] = half_round_shift(output[7]);
+    output[8] = half_round_shift(output[8]);
+    output[9] = half_round_shift(output[9]);
+    output[10] = half_round_shift(output[10]);
+    output[11] = half_round_shift(output[11]);
+    output[12] = half_round_shift(output[12]);
+    output[13] = half_round_shift(output[13]);
+    output[14] = half_round_shift(output[14]);
+    output[15] = half_round_shift(output[15]);
+
+    output[16] = half_round_shift(output[16]);
+    output[17] = half_round_shift(output[17]);
+    output[18] = half_round_shift(output[18]);
+    output[19] = half_round_shift(output[19]);
+    output[20] = half_round_shift(output[20]);
+    output[21] = half_round_shift(output[21]);
+    output[22] = half_round_shift(output[22]);
+    output[23] = half_round_shift(output[23]);
+    output[24] = half_round_shift(output[24]);
+    output[25] = half_round_shift(output[25]);
+    output[26] = half_round_shift(output[26]);
+    output[27] = half_round_shift(output[27]);
+    output[28] = half_round_shift(output[28]);
+    output[29] = half_round_shift(output[29]);
+    output[30] = half_round_shift(output[30]);
+    output[31] = half_round_shift(output[31]);
+  }
+
   // Stage 3
   step[0] = output[0] + output[(8 - 1)];
   step[1] = output[1] + output[(8 - 2)];
@@ -1112,44 +1150,6 @@
   step[30] = output[30] + output[25];
   step[31] = output[31] + output[24];
 
-  // dump the magnitude by half, hence the intermediate values are within
-  // the range of 16 bits.
-  if (round) {
-    step[0] = half_round_shift(step[0]);
-    step[1] = half_round_shift(step[1]);
-    step[2] = half_round_shift(step[2]);
-    step[3] = half_round_shift(step[3]);
-    step[4] = half_round_shift(step[4]);
-    step[5] = half_round_shift(step[5]);
-    step[6] = half_round_shift(step[6]);
-    step[7] = half_round_shift(step[7]);
-    step[8] = half_round_shift(step[8]);
-    step[9] = half_round_shift(step[9]);
-    step[10] = half_round_shift(step[10]);
-    step[11] = half_round_shift(step[11]);
-    step[12] = half_round_shift(step[12]);
-    step[13] = half_round_shift(step[13]);
-    step[14] = half_round_shift(step[14]);
-    step[15] = half_round_shift(step[15]);
-
-    step[16] = half_round_shift(step[16]);
-    step[17] = half_round_shift(step[17]);
-    step[18] = half_round_shift(step[18]);
-    step[19] = half_round_shift(step[19]);
-    step[20] = half_round_shift(step[20]);
-    step[21] = half_round_shift(step[21]);
-    step[22] = half_round_shift(step[22]);
-    step[23] = half_round_shift(step[23]);
-    step[24] = half_round_shift(step[24]);
-    step[25] = half_round_shift(step[25]);
-    step[26] = half_round_shift(step[26]);
-    step[27] = half_round_shift(step[27]);
-    step[28] = half_round_shift(step[28]);
-    step[29] = half_round_shift(step[29]);
-    step[30] = half_round_shift(step[30]);
-    step[31] = half_round_shift(step[31]);
-  }
-
   // Stage 4
   output[0] = step[0] + step[3];
   output[1] = step[1] + step[2];
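
For context, half_round_shift() scales its argument down with rounding that is symmetric about zero; a minimal sketch matching the "by 4" comment above, assuming the definition currently found in vp9_dct.c (the exact form at the time of this change may differ):

/* Divide by 4, rounding halves toward zero symmetrically; applied once
 * per pass when `round` is set, keeping stage outputs within 16 bits. */
static int half_round_shift(int input) {
  return (input + 1 + (input < 0)) >> 2;
}
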
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index a0a3ace..eb83903 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -47,7 +47,7 @@
 #endif
 
 static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
-                              int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize);
+                              int mi_row, int mi_col, BLOCK_SIZE bsize);
 
 static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x);
 
@@ -78,7 +78,7 @@
 };
 
 static unsigned int get_sby_perpixel_variance(VP9_COMP *cpi, MACROBLOCK *x,
-                                              BLOCK_SIZE_TYPE bs) {
+                                              BLOCK_SIZE bs) {
   unsigned int var, sse;
   var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf,
                            x->plane[0].src.stride,
@@ -336,7 +336,7 @@
 }
 
 static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
-                         BLOCK_SIZE_TYPE bsize, int output_enabled) {
+                         BLOCK_SIZE bsize, int output_enabled) {
   int i, x_idx, y;
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
@@ -404,7 +404,7 @@
       THR_D135_PRED /*D135_PRED*/,
       THR_D117_PRED /*D117_PRED*/,
       THR_D153_PRED /*D153_PRED*/,
-      THR_D27_PRED /*D27_PRED*/,
+      THR_D207_PRED /*D207_PRED*/,
       THR_D63_PRED /*D63_PRED*/,
       THR_TM /*TM_PRED*/,
       THR_B_PRED /*I4X4_PRED*/,
@@ -469,10 +469,10 @@
 }
 
 static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col,
-                        BLOCK_SIZE_TYPE bsize) {
-  MACROBLOCK * const x = &cpi->mb;
-  VP9_COMMON * const cm = &cpi->common;
-  MACROBLOCKD * const xd = &x->e_mbd;
+                        BLOCK_SIZE bsize) {
+  MACROBLOCK *const x = &cpi->mb;
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *mbmi;
   const int dst_fb_idx = cm->new_fb_idx;
   const int idx_str = xd->mode_info_stride * mi_row + mi_col;
@@ -553,7 +553,7 @@
 
 static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col,
                           int *totalrate, int64_t *totaldist,
-                          BLOCK_SIZE_TYPE bsize, PICK_MODE_CONTEXT *ctx,
+                          BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
                           int64_t best_rd) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
@@ -637,9 +637,8 @@
 // TODO(jingning): the variables used here are little complicated. need further
 // refactoring on organizing the temporary buffers, when recursive
 // partition down to 4x4 block size is enabled.
-static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x,
-                                            BLOCK_SIZE_TYPE bsize) {
-  MACROBLOCKD * const xd = &x->e_mbd;
+static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x, BLOCK_SIZE bsize) {
+  MACROBLOCKD *const xd = &x->e_mbd;
 
   switch (bsize) {
     case BLOCK_64X64:
@@ -674,9 +673,8 @@
   }
 }
 
-static BLOCK_SIZE_TYPE *get_sb_partitioning(MACROBLOCK *x,
-                                            BLOCK_SIZE_TYPE bsize) {
-  MACROBLOCKD *xd = &x->e_mbd;
+static BLOCK_SIZE *get_sb_partitioning(MACROBLOCK *x, BLOCK_SIZE bsize) {
+  MACROBLOCKD *const xd = &x->e_mbd;
   switch (bsize) {
     case BLOCK_64X64:
       return &x->sb64_partitioning;
@@ -696,7 +694,7 @@
                             ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
                             ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
                             PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8],
-                            BLOCK_SIZE_TYPE bsize) {
+                            BLOCK_SIZE bsize) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -727,7 +725,7 @@
                          ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
                          ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
                          PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8],
-                         BLOCK_SIZE_TYPE bsize) {
+                         BLOCK_SIZE bsize) {
   const VP9_COMMON *const cm = &cpi->common;
   const MACROBLOCK *const x = &cpi->mb;
   const MACROBLOCKD *const xd = &x->e_mbd;
@@ -758,7 +756,7 @@
 }
 
 static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col,
-                     int output_enabled, BLOCK_SIZE_TYPE bsize, int sub_index) {
+                     int output_enabled, BLOCK_SIZE bsize, int sub_index) {
   VP9_COMMON * const cm = &cpi->common;
   MACROBLOCK * const x = &cpi->mb;
   MACROBLOCKD * const xd = &x->e_mbd;
@@ -767,7 +765,7 @@
     return;
 
   if (sub_index != -1)
-    *(get_sb_index(xd, bsize)) = sub_index;
+    *get_sb_index(xd, bsize) = sub_index;
 
   if (bsize < BLOCK_8X8) {
     // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0
@@ -788,15 +786,15 @@
 }
 
 static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col,
-                      int output_enabled, BLOCK_SIZE_TYPE bsize) {
+                      int output_enabled, BLOCK_SIZE bsize) {
   VP9_COMMON * const cm = &cpi->common;
   MACROBLOCK * const x = &cpi->mb;
   MACROBLOCKD * const xd = &x->e_mbd;
-  BLOCK_SIZE_TYPE c1 = BLOCK_8X8;
+  BLOCK_SIZE c1 = BLOCK_8X8;
   const int bsl = b_width_log2(bsize), bs = (1 << bsl) / 4;
   int UNINITIALIZED_IS_SAFE(pl);
   PARTITION_TYPE partition;
-  BLOCK_SIZE_TYPE subsize;
+  BLOCK_SIZE subsize;
   int i;
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
@@ -837,7 +835,7 @@
       for (i = 0; i < 4; i++) {
         const int x_idx = i & 1, y_idx = i >> 1;
 
-        *(get_sb_index(xd, subsize)) = i;
+        *get_sb_index(xd, subsize) = i;
         encode_sb(cpi, tp, mi_row + y_idx * bs, mi_col + x_idx * bs,
                   output_enabled, subsize);
       }
@@ -853,8 +851,7 @@
   }
 }
 
-static void set_partitioning(VP9_COMP *cpi, MODE_INFO *m,
-                             BLOCK_SIZE_TYPE bsize) {
+static void set_partitioning(VP9_COMP *cpi, MODE_INFO *m, BLOCK_SIZE bsize) {
   VP9_COMMON *const cm = &cpi->common;
   const int mis = cm->mode_info_stride;
   int block_row, block_col;
@@ -876,23 +873,17 @@
   }
 }
 
-static void set_block_size(VP9_COMMON * const cm, MODE_INFO *m,
-                           BLOCK_SIZE_TYPE bsize, int mis, int mi_row,
+static void set_block_size(VP9_COMMON * const cm, MODE_INFO *mi,
+                           BLOCK_SIZE bsize, int mis, int mi_row,
                            int mi_col) {
-  int row, col;
-  int bwl = b_width_log2(bsize);
-  int bhl = b_height_log2(bsize);
-  int bsl = (bwl > bhl ? bwl : bhl);
-
-  int bs = (1 << bsl) / 2;  // Block size in units of 8 pels.
-  MODE_INFO *m2 = m + mi_row * mis + mi_col;
-  for (row = 0; row < bs; row++) {
-    for (col = 0; col < bs; col++) {
-      if (mi_row + row >= cm->mi_rows || mi_col + col >= cm->mi_cols)
-        continue;
-      m2[row * mis + col].mbmi.sb_type = bsize;
-    }
-  }
+  int r, c;
+  const int bs = MAX(num_8x8_blocks_wide_lookup[bsize],
+                     num_8x8_blocks_high_lookup[bsize]);
+  MODE_INFO *const mi2 = &mi[mi_row * mis + mi_col];
+  for (r = 0; r < bs; r++)
+    for (c = 0; c < bs; c++)
+      if (mi_row + r < cm->mi_rows && mi_col + c < cm->mi_cols)
+        mi2[r * mis + c].mbmi.sb_type = bsize;
 }
 
 typedef struct {
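
The rewritten set_block_size() takes its fill dimensions from num_8x8_blocks_wide_lookup/num_8x8_blocks_high_lookup, which give a block's size in 8x8 mode-info units, instead of re-deriving them from log2 widths. A minimal self-check of the equivalence, with illustrative lookup values assumed:

#include <assert.h>

/* Old derivation: bs = (1 << max(bwl, bhl)) / 2, where bwl/bhl are the
 * b_width_log2()/b_height_log2() values (log2 size in 4x4 blocks). */
static int old_bs(int bwl, int bhl) {
  const int bsl = bwl > bhl ? bwl : bhl;
  return (1 << bsl) / 2;
}

int main(void) {
  /* BLOCK_32X16: log2 dims 3/2 in 4x4 units; assumed lookups 4 wide,
   * 2 high in 8x8 units, so MAX() gives the same square fill of 4. */
  assert(old_bs(3, 2) == 4);
  /* BLOCK_64X64: log2 dims 4/4; assumed lookups 8/8. */
  assert(old_bs(4, 4) == 8);
  return 0;
}
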
@@ -929,9 +920,9 @@
   V64X64,
 } TREE_LEVEL;
 
-static void tree_to_node(void *data, BLOCK_SIZE_TYPE block_size, vt_node *node) {
+static void tree_to_node(void *data, BLOCK_SIZE bsize, vt_node *node) {
   int i;
-  switch (block_size) {
+  switch (bsize) {
     case BLOCK_64X64: {
       v64x64 *vt = (v64x64 *) data;
       node->vt = &vt->vt;
@@ -988,9 +979,9 @@
                 a->sum_error + b->sum_error, a->count + b->count);
 }
 
-static void fill_variance_tree(void *data, BLOCK_SIZE_TYPE block_size) {
+static void fill_variance_tree(void *data, BLOCK_SIZE bsize) {
   vt_node node;
-  tree_to_node(data, block_size, &node);
+  tree_to_node(data, bsize, &node);
   sum_2_variances(&node.vt->horz[0], node.split[0], node.split[1]);
   sum_2_variances(&node.vt->horz[1], node.split[2], node.split[3]);
   sum_2_variances(&node.vt->vert[0], node.split[0], node.split[2]);
@@ -1000,7 +991,7 @@
 
 #if PERFORM_RANDOM_PARTITIONING
 static int set_vt_partitioning(VP9_COMP *cpi, void *data, MODE_INFO *m,
-    BLOCK_SIZE_TYPE block_size, int mi_row,
+    BLOCK_SIZE block_size, int mi_row,
     int mi_col, int mi_size) {
   VP9_COMMON * const cm = &cpi->common;
   vt_node vt;
@@ -1039,27 +1030,27 @@
 #else  // !PERFORM_RANDOM_PARTITIONING
 
 static int set_vt_partitioning(VP9_COMP *cpi, void *data, MODE_INFO *m,
-                               BLOCK_SIZE_TYPE block_size, int mi_row,
+                               BLOCK_SIZE bsize, int mi_row,
                                int mi_col, int mi_size) {
   VP9_COMMON * const cm = &cpi->common;
   vt_node vt;
   const int mis = cm->mode_info_stride;
   int64_t threshold = 50 * cpi->common.base_qindex;
 
-  tree_to_node(data, block_size, &vt);
+  tree_to_node(data, bsize, &vt);
 
   // split none is available only if we have more than half a block size
   // in width and height inside the visible image
   if (mi_col + mi_size < cm->mi_cols && mi_row + mi_size < cm->mi_rows
       && vt.vt->none.variance < threshold) {
-    set_block_size(cm, m, block_size, mis, mi_row, mi_col);
+    set_block_size(cm, m, bsize, mis, mi_row, mi_col);
     return 1;
   }
 
   // vertical split is available on all but the bottom border
   if (mi_row + mi_size < cm->mi_rows && vt.vt->vert[0].variance < threshold
       && vt.vt->vert[1].variance < threshold) {
-    set_block_size(cm, m, get_subsize(block_size, PARTITION_VERT), mis, mi_row,
+    set_block_size(cm, m, get_subsize(bsize, PARTITION_VERT), mis, mi_row,
                    mi_col);
     return 1;
   }
@@ -1067,7 +1058,7 @@
   // horizontal split is available on all but the right border
   if (mi_col + mi_size < cm->mi_cols && vt.vt->horz[0].variance < threshold
       && vt.vt->horz[1].variance < threshold) {
-    set_block_size(cm, m, get_subsize(block_size, PARTITION_HORZ), mis, mi_row,
+    set_block_size(cm, m, get_subsize(bsize, PARTITION_HORZ), mis, mi_row,
                    mi_col);
     return 1;
   }
@@ -1192,7 +1183,7 @@
 }
 
 static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
-                             int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize,
+                             int mi_row, int mi_col, BLOCK_SIZE bsize,
                              int *rate, int64_t *dist, int do_recon) {
   VP9_COMMON * const cm = &cpi->common;
   MACROBLOCK * const x = &cpi->mb;
@@ -1206,7 +1197,7 @@
   int bss = (1 << bsl) / 4;
   int i, pl;
   PARTITION_TYPE partition = PARTITION_NONE;
-  BLOCK_SIZE_TYPE subsize;
+  BLOCK_SIZE subsize;
   ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
   PARTITION_CONTEXT sl[8], sa[8];
   int last_part_rate = INT_MAX;
@@ -1217,9 +1208,9 @@
   int64_t none_dist = INT_MAX;
   int chosen_rate = INT_MAX;
   int64_t chosen_dist = INT_MAX;
-  BLOCK_SIZE_TYPE sub_subsize = BLOCK_4X4;
+  BLOCK_SIZE sub_subsize = BLOCK_4X4;
   int splits_below = 0;
-  BLOCK_SIZE_TYPE bs_type = m->mbmi.sb_type;
+  BLOCK_SIZE bs_type = m->mbmi.sb_type;
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
@@ -1283,7 +1274,7 @@
                     bsize, get_block_context(x, bsize), INT64_MAX);
       break;
     case PARTITION_HORZ:
-      *(get_sb_index(xd, subsize)) = 0;
+      *get_sb_index(xd, subsize) = 0;
       pick_sb_modes(cpi, mi_row, mi_col, &last_part_rate, &last_part_dist,
                     subsize, get_block_context(x, subsize), INT64_MAX);
       if (last_part_rate != INT_MAX &&
@@ -1292,7 +1283,7 @@
         int64_t dt = 0;
         update_state(cpi, get_block_context(x, subsize), subsize, 0);
         encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
-        *(get_sb_index(xd, subsize)) = 1;
+        *get_sb_index(xd, subsize) = 1;
         pick_sb_modes(cpi, mi_row + (ms >> 1), mi_col, &rt, &dt, subsize,
                       get_block_context(x, subsize), INT64_MAX);
         if (rt == INT_MAX || dt == INT_MAX) {
@@ -1306,7 +1297,7 @@
       }
       break;
     case PARTITION_VERT:
-      *(get_sb_index(xd, subsize)) = 0;
+      *get_sb_index(xd, subsize) = 0;
       pick_sb_modes(cpi, mi_row, mi_col, &last_part_rate, &last_part_dist,
                     subsize, get_block_context(x, subsize), INT64_MAX);
       if (last_part_rate != INT_MAX &&
@@ -1315,7 +1306,7 @@
         int64_t dt = 0;
         update_state(cpi, get_block_context(x, subsize), subsize, 0);
         encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
-        *(get_sb_index(xd, subsize)) = 1;
+        *get_sb_index(xd, subsize) = 1;
         pick_sb_modes(cpi, mi_row, mi_col + (ms >> 1), &rt, &dt, subsize,
                       get_block_context(x, subsize), INT64_MAX);
         if (rt == INT_MAX || dt == INT_MAX) {
@@ -1341,7 +1332,7 @@
         if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
           continue;
 
-        *(get_sb_index(xd, subsize)) = i;
+        *get_sb_index(xd, subsize) = i;
 
         rd_use_partition(cpi, m + jj * bss * mis + ii * bss, tp, mi_row + y_idx,
                          mi_col + x_idx, subsize, &rt, &dt, i != 3);
@@ -1366,7 +1357,7 @@
       && partition != PARTITION_SPLIT && bsize > BLOCK_8X8
       && (mi_row + ms < cm->mi_rows || mi_row + (ms >> 1) == cm->mi_rows)
       && (mi_col + ms < cm->mi_cols || mi_col + (ms >> 1) == cm->mi_cols)) {
-    BLOCK_SIZE_TYPE split_subsize = get_subsize(bsize, PARTITION_SPLIT);
+    BLOCK_SIZE split_subsize = get_subsize(bsize, PARTITION_SPLIT);
     split_rate = 0;
     split_dist = 0;
     restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
@@ -1384,9 +1375,9 @@
           || (mi_col + x_idx >= cm->mi_cols))
         continue;
 
-      *(get_sb_index(xd, split_subsize)) = i;
-      *(get_sb_partitioning(x, bsize)) = split_subsize;
-      *(get_sb_partitioning(x, split_subsize)) = split_subsize;
+      *get_sb_index(xd, split_subsize) = i;
+      *get_sb_partitioning(x, bsize) = split_subsize;
+      *get_sb_partitioning(x, split_subsize) = split_subsize;
 
       save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
 
@@ -1454,14 +1445,17 @@
   *dist = chosen_dist;
 }
 
-static const BLOCK_SIZE_TYPE min_partition_size[BLOCK_SIZES] =
-  { BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, BLOCK_4X4,
-    BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, BLOCK_8X8,
-    BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16 };
-static const BLOCK_SIZE_TYPE max_partition_size[BLOCK_SIZES] =
-  { BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16,
-    BLOCK_32X32, BLOCK_32X32, BLOCK_32X32, BLOCK_64X64,
-    BLOCK_64X64, BLOCK_64X64, BLOCK_64X64, BLOCK_64X64, BLOCK_64X64 };
+static const BLOCK_SIZE min_partition_size[BLOCK_SIZES] = {
+  BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, BLOCK_4X4,
+  BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, BLOCK_8X8,
+  BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16
+};
+
+static const BLOCK_SIZE max_partition_size[BLOCK_SIZES] = {
+  BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16,
+  BLOCK_32X32, BLOCK_32X32, BLOCK_32X32, BLOCK_64X64,
+  BLOCK_64X64, BLOCK_64X64, BLOCK_64X64, BLOCK_64X64, BLOCK_64X64
+};
 
 // Look at all the mode_info entries for blocks that are part of this
 // partition and find the min and max values for sb_type.
@@ -1471,8 +1465,8 @@
 // The min and max are assumed to have been initialized prior to calling this
 // function so repeat calls can accumulate a min and max of more than one sb64.
 static void get_sb_partition_size_range(VP9_COMP *cpi, MODE_INFO * mi,
-                                        BLOCK_SIZE_TYPE * min_block_size,
-                                        BLOCK_SIZE_TYPE * max_block_size ) {
+                                        BLOCK_SIZE *min_block_size,
+                                        BLOCK_SIZE *max_block_size) {
   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
   int sb_width_in_blocks = MI_BLOCK_SIZE;
   int sb_height_in_blocks  = MI_BLOCK_SIZE;
@@ -1482,8 +1476,8 @@
   // Check the sb_type for each block that belongs to this region.
   for (i = 0; i < sb_height_in_blocks; ++i) {
     for (j = 0; j < sb_width_in_blocks; ++j) {
-      *min_block_size = MIN(*min_block_size, mi[index+j].mbmi.sb_type);
-      *max_block_size = MAX(*max_block_size, mi[index+j].mbmi.sb_type);
+      *min_block_size = MIN(*min_block_size, mi[index + j].mbmi.sb_type);
+      *max_block_size = MAX(*max_block_size, mi[index + j].mbmi.sb_type);
     }
     index += xd->mode_info_stride;
   }
@@ -1492,8 +1486,8 @@
 // Look at neighboring blocks and set a min and max partition size based on
 // what they chose.
 static void rd_auto_partition_range(VP9_COMP *cpi,
-                                    BLOCK_SIZE_TYPE * min_block_size,
-                                    BLOCK_SIZE_TYPE * max_block_size) {
+                                    BLOCK_SIZE *min_block_size,
+                                    BLOCK_SIZE *max_block_size) {
   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
   MODE_INFO *mi = xd->mode_info_context;
   MODE_INFO *above_sb64_mi;
@@ -1544,8 +1538,7 @@
   }
 }
 
-static void compute_fast_motion_search_level(VP9_COMP *const cpi,
-                                             const BLOCK_SIZE_TYPE bsize) {
+static void compute_fast_motion_search_level(VP9_COMP *cpi, BLOCK_SIZE bsize) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -1644,26 +1637,25 @@
 // unlikely to be selected depending on previous rate-distortion optimization
 // results, for encoding speed-up.
 static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
-                              int mi_col, BLOCK_SIZE_TYPE bsize, int *rate,
+                              int mi_col, BLOCK_SIZE bsize, int *rate,
                               int64_t *dist, int do_recon, int64_t best_rd) {
   VP9_COMMON * const cm = &cpi->common;
   MACROBLOCK * const x = &cpi->mb;
   MACROBLOCKD * const xd = &x->e_mbd;
-  int bsl = b_width_log2(bsize), bs = 1 << bsl;
-  int ms = bs / 2;
+  const int ms = num_8x8_blocks_wide_lookup[bsize] / 2;
   ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
   PARTITION_CONTEXT sl[8], sa[8];
   TOKENEXTRA *tp_orig = *tp;
   int i, pl;
-  BLOCK_SIZE_TYPE subsize;
+  BLOCK_SIZE subsize;
   int this_rate, sum_rate = 0, best_rate = INT_MAX;
   int64_t this_dist, sum_dist = 0, best_dist = INT64_MAX;
   int64_t sum_rd = 0;
   int do_split = bsize >= BLOCK_8X8;
   int do_rect = 1;
   // Override skipping rectangular partition operations for edge blocks
-  const int force_horz_split = (mi_row + (ms >> 1) >= cm->mi_rows);
-  const int force_vert_split = (mi_col + (ms >> 1) >= cm->mi_cols);
+  const int force_horz_split = (mi_row + ms >= cm->mi_rows);
+  const int force_vert_split = (mi_col + ms >= cm->mi_cols);
 
   int partition_none_allowed = !force_horz_split && !force_vert_split;
   int partition_horz_allowed = !force_vert_split && bsize >= BLOCK_8X8;
@@ -1743,14 +1735,13 @@
   if (do_split) {
     subsize = get_subsize(bsize, PARTITION_SPLIT);
     for (i = 0; i < 4 && sum_rd < best_rd; ++i) {
-      int x_idx = (i & 1) * (ms >> 1);
-      int y_idx = (i >> 1) * (ms >> 1);
+      const int x_idx = (i & 1) * ms;
+      const int y_idx = (i >> 1) * ms;
 
-      if ((mi_row + y_idx >= cm->mi_rows) ||
-          (mi_col + x_idx >= cm->mi_cols))
+      if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
         continue;
 
-      *(get_sb_index(xd, subsize)) = i;
+      *get_sb_index(xd, subsize) = i;
 
       rd_pick_partition(cpi, tp, mi_row + y_idx, mi_col + x_idx, subsize,
                         &this_rate, &this_dist, i != 3, best_rd - sum_rd);
@@ -1796,17 +1787,17 @@
   // PARTITION_HORZ
   if (partition_horz_allowed && do_rect) {
     subsize = get_subsize(bsize, PARTITION_HORZ);
-    *(get_sb_index(xd, subsize)) = 0;
+    *get_sb_index(xd, subsize) = 0;
     pick_sb_modes(cpi, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
                   get_block_context(x, subsize), best_rd);
     sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
 
-    if (sum_rd < best_rd && mi_row + (ms >> 1) < cm->mi_rows) {
+    if (sum_rd < best_rd && mi_row + ms < cm->mi_rows) {
       update_state(cpi, get_block_context(x, subsize), subsize, 0);
       encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
 
-      *(get_sb_index(xd, subsize)) = 1;
-      pick_sb_modes(cpi, mi_row + (ms >> 1), mi_col, &this_rate,
+      *get_sb_index(xd, subsize) = 1;
+      pick_sb_modes(cpi, mi_row + ms, mi_col, &this_rate,
                     &this_dist, subsize, get_block_context(x, subsize),
                     best_rd - sum_rd);
       if (this_rate == INT_MAX) {
@@ -1836,16 +1827,16 @@
   if (partition_vert_allowed && do_rect) {
     subsize = get_subsize(bsize, PARTITION_VERT);
 
-    *(get_sb_index(xd, subsize)) = 0;
+    *get_sb_index(xd, subsize) = 0;
     pick_sb_modes(cpi, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
                   get_block_context(x, subsize), best_rd);
     sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
-    if (sum_rd < best_rd && mi_col + (ms >> 1) < cm->mi_cols) {
+    if (sum_rd < best_rd && mi_col + ms < cm->mi_cols) {
       update_state(cpi, get_block_context(x, subsize), subsize, 0);
       encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
 
-      *(get_sb_index(xd, subsize)) = 1;
-      pick_sb_modes(cpi, mi_row, mi_col + (ms >> 1), &this_rate,
+      *get_sb_index(xd, subsize) = 1;
+      pick_sb_modes(cpi, mi_row, mi_col + ms, &this_rate,
                     &this_dist, subsize, get_block_context(x, subsize),
                     best_rd - sum_rd);
       if (this_rate == INT_MAX) {
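
Note the unit change in rd_pick_partition() above: the old code kept ms in 4x4 blocks and halved it (ms >> 1) wherever a mode-info offset was needed, while the new code computes ms directly in 8x8 mode-info units. A toy check for BLOCK_64X64, with the lookup value of 8 assumed:

#include <assert.h>

int main(void) {
  const int bs_4x4 = 1 << 4;              /* b_width_log2(BLOCK_64X64) == 4 */
  const int old_step = (bs_4x4 / 2) >> 1; /* old ms >> 1, in mi units */
  const int new_step = 8 / 2;             /* num_8x8_blocks_wide / 2 */
  assert(old_step == new_step);           /* same quadrant offset either way */
  return 0;
}
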
@@ -2218,25 +2209,25 @@
 }
 
 static void set_txfm_flag(MODE_INFO *mi, int mis, int ymbs, int xmbs,
-                          TX_SIZE txfm_size) {
+                          TX_SIZE tx_size) {
   int x, y;
 
   for (y = 0; y < ymbs; y++) {
     for (x = 0; x < xmbs; x++)
-      mi[y * mis + x].mbmi.txfm_size = txfm_size;
+      mi[y * mis + x].mbmi.tx_size = tx_size;
   }
 }
 
 static void reset_skip_txfm_size_b(VP9_COMP *cpi, MODE_INFO *mi, int mis,
-                                   TX_SIZE txfm_max, int bw, int bh, int mi_row,
-                                   int mi_col, BLOCK_SIZE_TYPE bsize) {
-  VP9_COMMON * const cm = &cpi->common;
-  MB_MODE_INFO * const mbmi = &mi->mbmi;
+                                   TX_SIZE max_tx_size, int bw, int bh,
+                                   int mi_row, int mi_col, BLOCK_SIZE bsize) {
+  VP9_COMMON *const cm = &cpi->common;
+  MB_MODE_INFO *const mbmi = &mi->mbmi;
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
-  if (mbmi->txfm_size > txfm_max) {
+  if (mbmi->tx_size > max_tx_size) {
     MACROBLOCK * const x = &cpi->mb;
     MACROBLOCKD * const xd = &x->e_mbd;
     const int ymbs = MIN(bh, cm->mi_rows - mi_row);
@@ -2245,57 +2236,49 @@
     xd->mode_info_context = mi;
     assert(vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) ||
            get_skip_flag(mi, mis, ymbs, xmbs));
-    set_txfm_flag(mi, mis, ymbs, xmbs, txfm_max);
+    set_txfm_flag(mi, mis, ymbs, xmbs, max_tx_size);
   }
 }
 
 static void reset_skip_txfm_size_sb(VP9_COMP *cpi, MODE_INFO *mi,
-                                    TX_SIZE txfm_max, int mi_row, int mi_col,
-                                    BLOCK_SIZE_TYPE bsize) {
-  VP9_COMMON * const cm = &cpi->common;
+                                    TX_SIZE max_tx_size, int mi_row, int mi_col,
+                                    BLOCK_SIZE bsize) {
+  const VP9_COMMON *const cm = &cpi->common;
   const int mis = cm->mode_info_stride;
-  int bwl, bhl;
-  const int bsl = mi_width_log2(bsize), bs = 1 << (bsl - 1);
+  int bw, bh;
+  const int bs = num_8x8_blocks_wide_lookup[bsize], hbs = bs / 2;
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
-  bwl = mi_width_log2(mi->mbmi.sb_type);
-  bhl = mi_height_log2(mi->mbmi.sb_type);
+  bw = num_8x8_blocks_wide_lookup[mi->mbmi.sb_type];
+  bh = num_8x8_blocks_high_lookup[mi->mbmi.sb_type];
 
-  if (bwl == bsl && bhl == bsl) {
-    reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, 1 << bsl, 1 << bsl, mi_row,
+  if (bw == bs && bh == bs) {
+    reset_skip_txfm_size_b(cpi, mi, mis, max_tx_size, bs, bs, mi_row,
                            mi_col, bsize);
-  } else if (bwl == bsl && bhl < bsl) {
-    reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, 1 << bsl, bs, mi_row, mi_col,
+  } else if (bw == bs && bh < bs) {
+    reset_skip_txfm_size_b(cpi, mi, mis, max_tx_size, bs, hbs, mi_row, mi_col,
                            bsize);
-    reset_skip_txfm_size_b(cpi, mi + bs * mis, mis, txfm_max, 1 << bsl, bs,
-                           mi_row + bs, mi_col, bsize);
-  } else if (bwl < bsl && bhl == bsl) {
-    reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, bs, 1 << bsl, mi_row, mi_col,
+    reset_skip_txfm_size_b(cpi, mi + hbs * mis, mis, max_tx_size, bs, hbs,
+                           mi_row + hbs, mi_col, bsize);
+  } else if (bw < bs && bh == bs) {
+    reset_skip_txfm_size_b(cpi, mi, mis, max_tx_size, hbs, bs, mi_row, mi_col,
                            bsize);
-    reset_skip_txfm_size_b(cpi, mi + bs, mis, txfm_max, bs, 1 << bsl, mi_row,
-                           mi_col + bs, bsize);
+    reset_skip_txfm_size_b(cpi, mi + hbs, mis, max_tx_size, hbs, bs, mi_row,
+                           mi_col + hbs, bsize);
   } else {
-    BLOCK_SIZE_TYPE subsize;
+    const BLOCK_SIZE subsize = subsize_lookup[PARTITION_SPLIT][bsize];
     int n;
 
-    assert(bwl < bsl && bhl < bsl);
-    if (bsize == BLOCK_64X64) {
-      subsize = BLOCK_32X32;
-    } else if (bsize == BLOCK_32X32) {
-      subsize = BLOCK_16X16;
-    } else {
-      assert(bsize == BLOCK_16X16);
-      subsize = BLOCK_8X8;
-    }
+    assert(bw < bs && bh < bs);
 
     for (n = 0; n < 4; n++) {
-      const int y_idx = n >> 1, x_idx = n & 0x01;
+      const int mi_dc = hbs * (n & 1);
+      const int mi_dr = hbs * (n >> 1);
 
-      reset_skip_txfm_size_sb(cpi, mi + y_idx * bs * mis + x_idx * bs, txfm_max,
-                              mi_row + y_idx * bs, mi_col + x_idx * bs,
-                              subsize);
+      reset_skip_txfm_size_sb(cpi, &mi[mi_dr * mis + mi_dc], max_tx_size,
+                              mi_row + mi_dr, mi_col + mi_dc, subsize);
     }
   }
 }
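
The if/else chain removed above mapped each square size to its quarter size; subsize_lookup[PARTITION_SPLIT][bsize] is assumed to encode the same mapping in table form. A standalone sketch with hypothetical enum values:

#include <assert.h>

typedef enum { B8X8, B16X16, B32X32, B64X64, NBS } SQUARE_BS;

/* Hypothetical split table: each square maps to the next smaller square,
 * mirroring the removed if/else chain (B8X8 entry unused here). */
static const SQUARE_BS split_lookup[NBS] = { B8X8, B8X8, B16X16, B32X32 };

int main(void) {
  assert(split_lookup[B64X64] == B32X32);
  assert(split_lookup[B32X32] == B16X16);
  assert(split_lookup[B16X16] == B8X8);
  return 0;
}
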
@@ -2524,7 +2507,7 @@
 static void sum_intra_stats(VP9_COMP *cpi, const MODE_INFO *mi) {
   const MB_PREDICTION_MODE y_mode = mi->mbmi.mode;
   const MB_PREDICTION_MODE uv_mode = mi->mbmi.uv_mode;
-  const BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type;
+  const BLOCK_SIZE bsize = mi->mbmi.sb_type;
 
   ++cpi->y_uv_mode_count[y_mode][uv_mode];
 
@@ -2562,7 +2545,7 @@
 }
 
 static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
-                              int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize) {
+                              int mi_row, int mi_col, BLOCK_SIZE bsize) {
   VP9_COMMON * const cm = &cpi->common;
   MACROBLOCK * const x = &cpi->mb;
   MACROBLOCKD * const xd = &x->e_mbd;
@@ -2665,7 +2648,7 @@
             (mbmi->skip_coeff ||
              vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)))) {
       const uint8_t context = vp9_get_pred_context_tx_size(xd);
-      update_tx_counts(bsize, context, mbmi->txfm_size, &cm->counts.tx);
+      update_tx_counts(bsize, context, mbmi->tx_size, &cm->counts.tx);
     } else {
       int x, y;
       TX_SIZE sz = (cm->tx_mode == TX_MODE_SELECT) ? TX_32X32 : cm->tx_mode;
@@ -2678,18 +2661,15 @@
         if (sz == TX_8X8 && bsize < BLOCK_8X8)
           sz = TX_4X4;
       } else if (bsize >= BLOCK_8X8) {
-        sz = mbmi->txfm_size;
+        sz = mbmi->tx_size;
       } else {
         sz = TX_4X4;
       }
 
-      for (y = 0; y < mi_height; y++) {
-        for (x = 0; x < mi_width; x++) {
-          if (mi_col + x < cm->mi_cols && mi_row + y < cm->mi_rows) {
-            mi[mis * y + x].mbmi.txfm_size = sz;
-          }
-        }
-      }
+      for (y = 0; y < mi_height; y++)
+        for (x = 0; x < mi_width; x++)
+          if (mi_col + x < cm->mi_cols && mi_row + y < cm->mi_rows)
+            mi[mis * y + x].mbmi.tx_size = sz;
     }
   }
 }
diff --git a/vp9/encoder/vp9_encodeintra.c b/vp9/encoder/vp9_encodeintra.c
index 7393974..588b774 100644
--- a/vp9/encoder/vp9_encodeintra.c
+++ b/vp9/encoder/vp9_encodeintra.c
@@ -20,8 +20,8 @@
   x->skip_encode = 0;
   mbmi->mode = DC_PRED;
   mbmi->ref_frame[0] = INTRA_FRAME;
-  mbmi->txfm_size = use_16x16_pred ? (mbmi->sb_type >= BLOCK_16X16 ? TX_16X16
-                                                                   : TX_8X8)
+  mbmi->tx_size = use_16x16_pred ? (mbmi->sb_type >= BLOCK_16X16 ? TX_16X16
+                                                                 : TX_8X8)
                                    : TX_4X4;
   vp9_encode_intra_block_y(x, mbmi->sb_type);
   return vp9_get_mb_ss(x->plane[0].src_diff);
diff --git a/vp9/encoder/vp9_encodeintra.h b/vp9/encoder/vp9_encodeintra.h
index 496f421..e217924 100644
--- a/vp9/encoder/vp9_encodeintra.h
+++ b/vp9/encoder/vp9_encodeintra.h
@@ -14,7 +14,7 @@
 #include "vp9/encoder/vp9_onyx_int.h"
 
 int vp9_encode_intra(MACROBLOCK *x, int use_16x16_pred);
-void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE plane_bsize,
+void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
                             TX_SIZE tx_size, void *arg);
 
 #endif  // VP9_ENCODER_VP9_ENCODEINTRA_H_
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 13aee3e..da9a3bd 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -69,7 +69,7 @@
     vp9_short_idct16x16_add(dqcoeff, dest, stride);
 }
 
-static void subtract_plane(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int plane) {
+static void subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
   struct macroblock_plane *const p = &x->plane[plane];
   const MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblockd_plane *const pd = &xd->plane[plane];
@@ -81,18 +81,18 @@
                      pd->dst.buf, pd->dst.stride);
 }
 
-void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
+void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE bsize) {
   subtract_plane(x, bsize, 0);
 }
 
-void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
+void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE bsize) {
   int i;
 
   for (i = 1; i < MAX_MB_PLANE; i++)
     subtract_plane(x, bsize, i);
 }
 
-void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
+void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
   vp9_subtract_sby(x, bsize);
   vp9_subtract_sbuv(x, bsize);
 }
@@ -142,7 +142,7 @@
 }
 
 static void optimize_b(MACROBLOCK *mb,
-                       int plane, int block, BLOCK_SIZE_TYPE plane_bsize,
+                       int plane, int block, BLOCK_SIZE plane_bsize,
                        ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
                        TX_SIZE tx_size) {
   MACROBLOCKD *const xd = &mb->e_mbd;
@@ -370,7 +370,7 @@
   *a = *l = (final_eob > 0);
 }
 
-void vp9_optimize_b(int plane, int block, BLOCK_SIZE_TYPE plane_bsize,
+void vp9_optimize_b(int plane, int block, BLOCK_SIZE plane_bsize,
                     TX_SIZE tx_size, MACROBLOCK *mb, struct optimize_ctx *ctx) {
   int x, y;
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y);
@@ -378,15 +378,15 @@
              &ctx->ta[plane][x], &ctx->tl[plane][y], tx_size);
 }
 
-static void optimize_init_b(int plane, BLOCK_SIZE_TYPE bsize,
+static void optimize_init_b(int plane, BLOCK_SIZE bsize,
                             struct encode_b_args *args) {
   const MACROBLOCKD *xd = &args->x->e_mbd;
   const struct macroblockd_plane* const pd = &xd->plane[plane];
-  const BLOCK_SIZE_TYPE plane_bsize = get_plane_block_size(bsize, pd);
+  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
   const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
   const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
   const MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
-  const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi) : mbmi->txfm_size;
+  const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi) : mbmi->tx_size;
   int i;
 
   switch (tx_size) {
@@ -419,7 +419,7 @@
   }
 }
 
-void vp9_xform_quant(int plane, int block, BLOCK_SIZE_TYPE plane_bsize,
+void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize,
                      TX_SIZE tx_size, void *arg) {
   struct encode_b_args* const args = arg;
   MACROBLOCK* const x = args->x;
@@ -492,7 +492,7 @@
   }
 }
 
-static void encode_block(int plane, int block, BLOCK_SIZE_TYPE plane_bsize,
+static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
                          TX_SIZE tx_size, void *arg) {
   struct encode_b_args *const args = arg;
   MACROBLOCK *const x = args->x;
@@ -536,7 +536,7 @@
   }
 }
 
-void vp9_encode_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
+void vp9_encode_sby(MACROBLOCK *x, BLOCK_SIZE bsize) {
   MACROBLOCKD *const xd = &x->e_mbd;
   struct optimize_ctx ctx;
   struct encode_b_args arg = {x, &ctx};
@@ -548,7 +548,7 @@
   foreach_transformed_block_in_plane(xd, bsize, 0, encode_block, &arg);
 }
 
-void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
+void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
   MACROBLOCKD *const xd = &x->e_mbd;
   struct optimize_ctx ctx;
   struct encode_b_args arg = {x, &ctx};
@@ -564,7 +564,7 @@
   foreach_transformed_block(xd, bsize, encode_block, &arg);
 }
 
-void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE plane_bsize,
+void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
                             TX_SIZE tx_size, void *arg) {
   struct encode_b_args* const args = arg;
   MACROBLOCK *const x = args->x;
@@ -714,7 +714,7 @@
   }
 }
 
-void vp9_encode_intra_block_y(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
+void vp9_encode_intra_block_y(MACROBLOCK *x, BLOCK_SIZE bsize) {
   MACROBLOCKD* const xd = &x->e_mbd;
   struct optimize_ctx ctx;
   struct encode_b_args arg = {x, &ctx};
@@ -722,7 +722,7 @@
   foreach_transformed_block_in_plane(xd, bsize, 0, vp9_encode_block_intra,
                                      &arg);
 }
-void vp9_encode_intra_block_uv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
+void vp9_encode_intra_block_uv(MACROBLOCK *x, BLOCK_SIZE bsize) {
   MACROBLOCKD* const xd = &x->e_mbd;
   struct optimize_ctx ctx;
   struct encode_b_args arg = {x, &ctx};
diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h
index 1db15c3..2aa4188 100644
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h
@@ -32,18 +32,18 @@
   struct optimize_ctx *ctx;
 };
 
-void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_encode_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
+void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize);
+void vp9_encode_sby(MACROBLOCK *x, BLOCK_SIZE bsize);
 
-void vp9_xform_quant(int plane, int block, BLOCK_SIZE_TYPE plane_bsize,
+void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize,
                      TX_SIZE tx_size, void *arg);
 
-void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
+void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE bsize);
+void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE bsize);
+void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE bsize);
 
-void vp9_encode_intra_block_y(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_encode_intra_block_uv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
+void vp9_encode_intra_block_y(MACROBLOCK *x, BLOCK_SIZE bsize);
+void vp9_encode_intra_block_uv(MACROBLOCK *x, BLOCK_SIZE bsize);
 
 
 #endif  // VP9_ENCODER_VP9_ENCODEMB_H_
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 6b37cc9..92485f9 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -661,7 +661,7 @@
           mv.as_mv.col <<= 3;
           this_error = motion_error;
           vp9_set_mbmode_and_mvs(x, NEWMV, &mv);
-          xd->mode_info_context->mbmi.txfm_size = TX_4X4;
+          xd->mode_info_context->mbmi.tx_size = TX_4X4;
           xd->mode_info_context->mbmi.ref_frame[0] = LAST_FRAME;
           xd->mode_info_context->mbmi.ref_frame[1] = NONE;
           vp9_build_inter_predictors_sby(xd, mb_row << 1,
diff --git a/vp9/encoder/vp9_modecosts.c b/vp9/encoder/vp9_modecosts.c
index 5b23653..a5dfaed 100644
--- a/vp9/encoder/vp9_modecosts.c
+++ b/vp9/encoder/vp9_modecosts.c
@@ -16,7 +16,7 @@
 
 
 void vp9_init_mode_costs(VP9_COMP *c) {
-  VP9_COMMON *x = &c->common;
+  VP9_COMMON *const cm = &c->common;
   const vp9_tree_p KT = vp9_intra_mode_tree;
   int i, j;
 
@@ -28,16 +28,16 @@
   }
 
   // TODO(rbultje) separate tables for superblock costing?
-  vp9_cost_tokens(c->mb.mbmode_cost, x->fc.y_mode_prob[1],
+  vp9_cost_tokens(c->mb.mbmode_cost, cm->fc.y_mode_prob[1],
                   vp9_intra_mode_tree);
   vp9_cost_tokens(c->mb.intra_uv_mode_cost[1],
-                  x->fc.uv_mode_prob[INTRA_MODES - 1], vp9_intra_mode_tree);
+                  cm->fc.uv_mode_prob[INTRA_MODES - 1], vp9_intra_mode_tree);
   vp9_cost_tokens(c->mb.intra_uv_mode_cost[0],
                   vp9_kf_uv_mode_prob[INTRA_MODES - 1],
                   vp9_intra_mode_tree);
 
   for (i = 0; i <= SWITCHABLE_FILTERS; ++i)
     vp9_cost_tokens((int *)c->mb.switchable_interp_costs[i],
-                    x->fc.switchable_interp_prob[i],
+                    cm->fc.switchable_interp_prob[i],
                     vp9_switchable_interp_tree);
 }
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index e15c44d..e9c214f 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -608,7 +608,7 @@
   sf->thresh_mult[THR_D135_PRED] += speed_multiplier * 2500;
   sf->thresh_mult[THR_D117_PRED] += speed_multiplier * 2500;
   sf->thresh_mult[THR_D153_PRED] += speed_multiplier * 2500;
-  sf->thresh_mult[THR_D27_PRED] += speed_multiplier * 2500;
+  sf->thresh_mult[THR_D207_PRED] += speed_multiplier * 2500;
   sf->thresh_mult[THR_D63_PRED] += speed_multiplier * 2500;
 
   if (cpi->sf.skip_lots_of_modes) {
@@ -741,6 +741,7 @@
   sf->skip_encode_sb = 0;
   sf->use_uv_intra_rd_estimate = 0;
   sf->use_fast_lpf_pick = 0;
+  sf->use_fast_coef_updates = 0;
   sf->using_small_partition_info = 0;
   // Skip any mode not chosen at size < X for all sizes > X
   // Hence BLOCK_64X64 (skip is off)
@@ -802,6 +803,7 @@
 
         sf->intra_y_mode_mask = INTRA_DC_TM_H_V;
         sf->intra_uv_mode_mask = INTRA_DC_TM_H_V;
+        sf->use_fast_coef_updates = 1;
       }
       if (speed == 2) {
         sf->adjust_thresholds_by_speed = 1;
@@ -840,6 +842,7 @@
         sf->auto_min_max_partition_interval = 2;
         sf->disable_split_var_thresh = 32;
         sf->disable_filter_search_var_thresh = 32;
+        sf->use_fast_coef_updates = 2;
       }
       if (speed == 3) {
         sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
@@ -866,6 +869,7 @@
         sf->disable_filter_search_var_thresh = 64;
         sf->intra_y_mode_mask = INTRA_DC_ONLY;
         sf->intra_uv_mode_mask = INTRA_DC_ONLY;
+        sf->use_fast_coef_updates = 2;
       }
       if (speed == 4) {
         sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
@@ -894,6 +898,7 @@
         sf->subpel_iters_per_step = 1;
         sf->disable_split_var_thresh = 64;
         sf->disable_filter_search_var_thresh = 96;
+        sf->use_fast_coef_updates = 2;
       }
       /*
       if (speed == 2) {
@@ -1720,6 +1725,10 @@
 
   vp9_zero(cpi->y_uv_mode_count)
 
+#ifdef MODE_TEST_HIT_STATS
+  vp9_zero(cpi->mode_test_hits)
+#endif
+
   return (VP9_PTR) cpi;
 }
 
@@ -1801,6 +1810,34 @@
 
 #endif
 
+#ifdef MODE_TEST_HIT_STATS
+    if (cpi->pass != 1) {
+      double norm_per_pixel_mode_tests = 0;
+      double norm_counts[BLOCK_SIZES];
+      int i;
+      int sb64_per_frame;
+      int norm_factors[BLOCK_SIZES] =
+        {256, 128, 128, 64, 32, 32, 16, 8, 8, 4, 2, 2, 1};
+      FILE *f = fopen("mode_hit_stats.stt", "a");
+
+      // On average, how many mode tests do we do per block size?
+      for (i = 0; i < BLOCK_SIZES; ++i) {
+        norm_counts[i] = (double)cpi->mode_test_hits[i] /
+                         (double)norm_factors[i];
+        norm_per_pixel_mode_tests += norm_counts[i];
+      }
+      // Convert to a number per 64x64 and per frame
+      sb64_per_frame = ((cpi->common.height + 63) / 64) *
+                       ((cpi->common.width + 63) / 64);
+      norm_per_pixel_mode_tests =
+        norm_per_pixel_mode_tests /
+        (double)(cpi->common.current_video_frame * sb64_per_frame);
+
+      fprintf(f, "%6.4f\n", norm_per_pixel_mode_tests);
+      fclose(f);
+    }
+#endif
+
 #ifdef ENTROPY_STATS
     {
       int i, j, k;
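
Each norm_factors entry is the number of blocks of that size that tile one 64x64 superblock, assuming the usual BLOCK_SIZES order (4X4, 4X8, 8X4, 8X8, ..., 64X64); dividing the raw hit counts by these puts every size in per-superblock units before the per-frame average is taken. A quick sanity check of the table:

#include <assert.h>

int main(void) {
  const int factors[13] = {256, 128, 128, 64, 32, 32, 16, 8, 8, 4, 2, 2, 1};
  const int w[13] = {4, 4, 8, 8, 8, 16, 16, 16, 32, 32, 32, 64, 64};
  const int h[13] = {4, 8, 4, 8, 16, 8, 16, 32, 16, 32, 64, 32, 64};
  int i;
  for (i = 0; i < 13; i++)
    assert(factors[i] == (64 / w[i]) * (64 / h[i]));  /* blocks per SB64 */
  return 0;
}
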
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 8328374..252e982 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -36,6 +36,8 @@
 #define DISABLE_RC_LONG_TERM_MEM 0
 #endif
 
+// #define MODE_TEST_HIT_STATS
+
 // #define SPEEDSTATS 1
 #if CONFIG_MULTIPLE_ARF
 // Set MIN_GF_INTERVAL to 1 for the full decomposition.
@@ -182,7 +184,7 @@
   THR_H_PRED,
   THR_V_PRED,
   THR_D135_PRED,
-  THR_D27_PRED,
+  THR_D207_PRED,
   THR_D153_PRED,
   THR_D63_PRED,
   THR_D117_PRED,
@@ -273,12 +275,12 @@
   int use_square_partition_only;
   int unused_mode_skip_lvl;
   int reference_masking;
-  BLOCK_SIZE_TYPE always_this_block_size;
+  BLOCK_SIZE always_this_block_size;
   int auto_min_max_partition_size;
   int auto_min_max_partition_interval;
   int auto_min_max_partition_count;
-  BLOCK_SIZE_TYPE min_partition_size;
-  BLOCK_SIZE_TYPE max_partition_size;
+  BLOCK_SIZE min_partition_size;
+  BLOCK_SIZE max_partition_size;
   int adjust_partitioning_from_last_frame;
   int last_partitioning_redo_frequency;
   int disable_splitmv;
@@ -298,6 +300,7 @@
   int use_rd_breakout;
   int use_uv_intra_rd_estimate;
   int use_fast_lpf_pick;
+  int use_fast_coef_updates;  // 0: 2-loop, 1: 1-loop, 2: 1-loop reduced
 } SPEED_FEATURES;
 
 typedef struct VP9_COMP {
@@ -663,6 +666,12 @@
 #ifdef ENTROPY_STATS
   int64_t mv_ref_stats[INTER_MODE_CONTEXTS][INTER_MODES - 1][2];
 #endif
+
+
+#ifdef MODE_TEST_HIT_STATS
+  // Debug / test stats
+  int64_t mode_test_hits[BLOCK_SIZES];
+#endif
 } VP9_COMP;
 
 static int get_ref_frame_idx(VP9_COMP *cpi, MV_REFERENCE_FRAME ref_frame) {
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index 02c0685..fb0e470 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -84,7 +84,6 @@
   *eob_ptr = eob + 1;
 }
 
-// This function works well for large transform size.
 void vp9_quantize_b_32x32_c(int16_t *coeff_ptr, intptr_t n_coeffs,
                             int skip_block,
                             int16_t *zbin_ptr, int16_t *round_ptr,
@@ -105,8 +104,8 @@
   eob = -1;
 
   // Base ZBIN
-  zbins[0] = zbin_ptr[0] + zbin_oq_value;
-  zbins[1] = zbin_ptr[1] + zbin_oq_value;
+  zbins[0] = ROUND_POWER_OF_TWO(zbin_ptr[0] + zbin_oq_value, 1);
+  zbins[1] = ROUND_POWER_OF_TWO(zbin_ptr[1] + zbin_oq_value, 1);
   nzbins[0] = zbins[0] * -1;
   nzbins[1] = zbins[1] * -1;
 
@@ -114,7 +113,7 @@
     // Pre-scan pass
     for (i = 0; i < n_coeffs; i++) {
       rc = scan[i];
-      z = coeff_ptr[rc] * 2;
+      z = coeff_ptr[rc];
 
       // If the coefficient is out of the base ZBIN range, keep it for
       // quantization.
@@ -130,14 +129,14 @@
       // Calculate ZBIN
       zbin = (zbins[rc != 0]);
 
-      z = coeff_ptr[rc] * 2;
+      z = coeff_ptr[rc];
       sz = (z >> 31);                               // sign of z
       x  = (z ^ sz) - sz;                           // x = abs(z)
 
       if (x >= zbin) {
-        x += (round_ptr[rc != 0]);
+        x += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
         y  = (((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x)) *
-              quant_shift_ptr[rc != 0]) >> 16;      // quantize (x)
+              quant_shift_ptr[rc != 0]) >> 15;      // quantize (x)
 
         x  = (y ^ sz) - sz;                         // get the sign back
         qcoeff_ptr[rc]  = x;                        // write to destination
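The vp9_quantize_b_32x32_c change above folds the old `coeff * 2` pre-scaling into the constants instead: zbin and round are halved with rounding, and the final shift drops from 16 to 15, which should be numerically equivalent while saving a multiply per coefficient. A scalar sketch of one quantization step after the rescale (ROUND_POWER_OF_TWO as defined in vpx; the helper name is hypothetical):

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

/* Hypothetical scalar sketch of one 32x32 quantization step after the
 * rescale; z is the raw coefficient (no longer pre-doubled). */
static int quantize_one_32x32(int z, int zbin, int round,
                              int quant, int quant_shift) {
  const int half_zbin = ROUND_POWER_OF_TWO(zbin, 1);
  const int sz = z >> 31;                 /* sign of z: 0 or -1 */
  int x = (z ^ sz) - sz;                  /* abs(z) */
  int y = 0;
  if (x >= half_zbin) {
    x += ROUND_POWER_OF_TWO(round, 1);
    y = ((((x * quant) >> 16) + x) * quant_shift) >> 15;  /* was >> 16 */
  }
  return (y ^ sz) - sz;                   /* restore the sign */
}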
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index ee21957..39b6544 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -90,7 +90,7 @@
   {H_PRED,    INTRA_FRAME,  NONE},
   {V_PRED,    INTRA_FRAME,  NONE},
   {D135_PRED, INTRA_FRAME,  NONE},
-  {D27_PRED,  INTRA_FRAME,  NONE},
+  {D207_PRED,  INTRA_FRAME,  NONE},
   {D153_PRED, INTRA_FRAME,  NONE},
   {D63_PRED,  INTRA_FRAME,  NONE},
   {D117_PRED, INTRA_FRAME,  NONE},
@@ -364,7 +364,7 @@
   vp9_clear_system_state();
 }
 
-static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
+static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize,
                             MACROBLOCK *x, MACROBLOCKD *xd,
                             int *out_rate_sum, int64_t *out_dist_sum) {
   // Note our transform coeffs are 8 times an orthogonal transform.
@@ -375,7 +375,7 @@
   for (i = 0; i < MAX_MB_PLANE; ++i) {
     struct macroblock_plane *const p = &x->plane[i];
     struct macroblockd_plane *const pd = &xd->plane[i];
-    const BLOCK_SIZE_TYPE bs = get_plane_block_size(bsize, pd);
+    const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
     unsigned int sse;
     int rate;
     int64_t dist;
@@ -393,55 +393,52 @@
   *out_dist_sum = dist_sum << 4;
 }
 
-static void model_rd_for_sb_y_tx(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
+static void model_rd_for_sb_y_tx(VP9_COMP *cpi, BLOCK_SIZE bsize,
                                  TX_SIZE tx_size,
                                  MACROBLOCK *x, MACROBLOCKD *xd,
                                  int *out_rate_sum, int64_t *out_dist_sum,
                                  int *out_skip) {
-  int t = 4, j, k;
-  BLOCK_SIZE_TYPE bs = BLOCK_4X4;
+  int j, k;
+  BLOCK_SIZE bs;
   struct macroblock_plane *const p = &x->plane[0];
   struct macroblockd_plane *const pd = &xd->plane[0];
-  const int width = plane_block_width(bsize, pd);
-  const int height = plane_block_height(bsize, pd);
+  const int width = 4 << num_4x4_blocks_wide_lookup[bsize];
+  const int height = 4 << num_4x4_blocks_high_lookup[bsize];
   int rate_sum = 0;
   int64_t dist_sum = 0;
+  const int t = 4 << tx_size;
 
   if (tx_size == TX_4X4) {
     bs = BLOCK_4X4;
-    t = 4;
   } else if (tx_size == TX_8X8) {
     bs = BLOCK_8X8;
-    t = 8;
   } else if (tx_size == TX_16X16) {
     bs = BLOCK_16X16;
-    t = 16;
   } else if (tx_size == TX_32X32) {
     bs = BLOCK_32X32;
-    t = 32;
   } else {
     assert(0);
   }
+
   *out_skip = 1;
   for (j = 0; j < height; j += t) {
     for (k = 0; k < width; k += t) {
       int rate;
       int64_t dist;
       unsigned int sse;
-      (void) cpi->fn_ptr[bs].vf(p->src.buf + j * p->src.stride + k,
-                                p->src.stride,
-                                pd->dst.buf + j * pd->dst.stride + k,
-                                pd->dst.stride, &sse);
+      cpi->fn_ptr[bs].vf(&p->src.buf[j * p->src.stride + k], p->src.stride,
+                         &pd->dst.buf[j * pd->dst.stride + k], pd->dst.stride,
+                         &sse);
       // sse works better than var, since there is no dc prediction used
-      model_rd_from_var_lapndz(sse, t * t, pd->dequant[1] >> 3,
-                               &rate, &dist);
+      model_rd_from_var_lapndz(sse, t * t, pd->dequant[1] >> 3, &rate, &dist);
       rate_sum += rate;
       dist_sum += dist;
       *out_skip &= (rate < 1024);
     }
   }
+
   *out_rate_sum = rate_sum;
-  *out_dist_sum = (dist_sum << 4);
+  *out_dist_sum = dist_sum << 4;
 }
 
 int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff,
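The `t = 4 << tx_size` simplification above relies on the TX_SIZE enum being ordered TX_4X4 = 0 through TX_32X32 = 3, so the transform dimension in pixels is a single shift; the remaining if/else ladder only maps tx_size to the matching BLOCK_SIZE. A one-line sketch of that assumption:

/* Sketch: transform-block pixel size from TX_SIZE, assuming the ordering
 * TX_4X4 = 0, TX_8X8 = 1, TX_16X16 = 2, TX_32X32 = 3. */
static int tx_size_in_pixels(int tx_size) {
  return 4 << tx_size;  /* 4, 8, 16, 32 */
}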
@@ -492,7 +489,7 @@
   int c, cost;
 
   // Check for consistency of tx_size with mode info
-  assert(type == PLANE_TYPE_Y_WITH_DC ? mbmi->txfm_size == tx_size
+  assert(type == PLANE_TYPE_Y_WITH_DC ? mbmi->tx_size == tx_size
                                       : get_uv_tx_size(mbmi) == tx_size);
 
   if (eob == 0) {
@@ -579,7 +576,7 @@
   }
 }
 
-static void rate_block(int plane, int block, BLOCK_SIZE_TYPE plane_bsize,
+static void rate_block(int plane, int block, BLOCK_SIZE plane_bsize,
                        TX_SIZE tx_size, void *arg) {
   struct rdcost_block_args* args = arg;
 
@@ -592,7 +589,7 @@
                             args->scan, args->nb);
 }
 
-static void block_yrd_txfm(int plane, int block, BLOCK_SIZE_TYPE plane_bsize,
+static void block_yrd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
                            TX_SIZE tx_size, void *arg) {
   struct rdcost_block_args *args = arg;
   MACROBLOCK *const x = args->x;
@@ -626,10 +623,10 @@
                              int *rate, int64_t *distortion,
                              int *skippable, int64_t *sse,
                              int64_t ref_best_rd, int plane,
-                             BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
+                             BLOCK_SIZE bsize, TX_SIZE tx_size) {
   MACROBLOCKD *const xd = &x->e_mbd;
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  const BLOCK_SIZE_TYPE bs = get_plane_block_size(bsize, pd);
+  const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bs];
   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bs];
   int i;
@@ -637,7 +634,7 @@
                                     num_4x4_blocks_wide, num_4x4_blocks_high,
                                     0, 0, 0, ref_best_rd, 0 };
   if (plane == 0)
-    xd->mode_info_context->mbmi.txfm_size = tx_size;
+    xd->mode_info_context->mbmi.tx_size = tx_size;
 
   switch (tx_size) {
     case TX_4X4:
@@ -687,7 +684,7 @@
                                      int *rate, int64_t *distortion,
                                      int *skip, int64_t *sse,
                                      int64_t ref_best_rd,
-                                     BLOCK_SIZE_TYPE bs) {
+                                     BLOCK_SIZE bs) {
   const TX_SIZE max_txfm_size = max_txsize_lookup[bs];
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -695,20 +692,20 @@
   if (max_txfm_size == TX_32X32 &&
       (cm->tx_mode == ALLOW_32X32 ||
        cm->tx_mode == TX_MODE_SELECT)) {
-    mbmi->txfm_size = TX_32X32;
+    mbmi->tx_size = TX_32X32;
   } else if (max_txfm_size >= TX_16X16 &&
              (cm->tx_mode == ALLOW_16X16 ||
               cm->tx_mode == ALLOW_32X32 ||
               cm->tx_mode == TX_MODE_SELECT)) {
-    mbmi->txfm_size = TX_16X16;
+    mbmi->tx_size = TX_16X16;
   } else if (cm->tx_mode != ONLY_4X4) {
-    mbmi->txfm_size = TX_8X8;
+    mbmi->tx_size = TX_8X8;
   } else {
-    mbmi->txfm_size = TX_4X4;
+    mbmi->tx_size = TX_4X4;
   }
   txfm_rd_in_plane(x, rate, distortion, skip,
-                   &sse[mbmi->txfm_size], ref_best_rd, 0, bs,
-                   mbmi->txfm_size);
+                   &sse[mbmi->tx_size], ref_best_rd, 0, bs,
+                   mbmi->tx_size);
   cpi->txfm_stepdown_count[0]++;
 }
 
@@ -717,7 +714,7 @@
                                      int64_t *d, int64_t *distortion,
                                      int *s, int *skip,
                                      int64_t tx_cache[TX_MODES],
-                                     BLOCK_SIZE_TYPE bs) {
+                                     BLOCK_SIZE bs) {
   const TX_SIZE max_tx_size = max_txsize_lookup[bs];
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -763,26 +760,26 @@
        (cm->tx_mode == TX_MODE_SELECT &&
         rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] &&
         rd[TX_32X32][1] < rd[TX_4X4][1]))) {
-    mbmi->txfm_size = TX_32X32;
+    mbmi->tx_size = TX_32X32;
   } else if (max_tx_size >= TX_16X16 &&
              (cm->tx_mode == ALLOW_16X16 ||
               cm->tx_mode == ALLOW_32X32 ||
               (cm->tx_mode == TX_MODE_SELECT &&
                rd[TX_16X16][1] < rd[TX_8X8][1] &&
                rd[TX_16X16][1] < rd[TX_4X4][1]))) {
-    mbmi->txfm_size = TX_16X16;
+    mbmi->tx_size = TX_16X16;
   } else if (cm->tx_mode == ALLOW_8X8 ||
              cm->tx_mode == ALLOW_16X16 ||
              cm->tx_mode == ALLOW_32X32 ||
            (cm->tx_mode == TX_MODE_SELECT && rd[TX_8X8][1] < rd[TX_4X4][1])) {
-    mbmi->txfm_size = TX_8X8;
+    mbmi->tx_size = TX_8X8;
   } else {
-    mbmi->txfm_size = TX_4X4;
+    mbmi->tx_size = TX_4X4;
   }
 
-  *distortion = d[mbmi->txfm_size];
-  *rate       = r[mbmi->txfm_size][cm->tx_mode == TX_MODE_SELECT];
-  *skip       = s[mbmi->txfm_size];
+  *distortion = d[mbmi->tx_size];
+  *rate       = r[mbmi->tx_size][cm->tx_mode == TX_MODE_SELECT];
+  *skip       = s[mbmi->tx_size];
 
   tx_cache[ONLY_4X4] = rd[TX_4X4][0];
   tx_cache[ALLOW_8X8] = rd[TX_8X8][0];
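For orientation, the renamed selection logic above keeps the same cascade: under TX_MODE_SELECT a larger transform is chosen only when its RD cost beats every smaller candidate. A condensed, hypothetical sketch of that decision (the real code also honors the cm->tx_mode allowances):

#include <stdint.h>

/* Condensed sketch of the tx_size cascade; rd[tx][1] is the RD cost of
 * coding the block with transform tx (0 = TX_4X4 ... 3 = TX_32X32). */
static int pick_tx_size(const int64_t rd[4][2], int max_tx_size) {
  if (max_tx_size >= 3 &&
      rd[3][1] < rd[2][1] && rd[3][1] < rd[1][1] && rd[3][1] < rd[0][1])
    return 3;                       /* TX_32X32 */
  if (max_tx_size >= 2 && rd[2][1] < rd[1][1] && rd[2][1] < rd[0][1])
    return 2;                       /* TX_16X16 */
  if (rd[1][1] < rd[0][1])
    return 1;                       /* TX_8X8 */
  return 0;                         /* TX_4X4 */
}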
@@ -820,7 +817,7 @@
                                           int64_t *d, int64_t *distortion,
                                           int *s, int *skip, int64_t *sse,
                                           int64_t ref_best_rd,
-                                          BLOCK_SIZE_TYPE bs) {
+                                          BLOCK_SIZE bs) {
   const TX_SIZE max_txfm_size = max_txsize_lookup[bs];
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -870,28 +867,28 @@
         rd[TX_32X32][1] <= rd[TX_16X16][1] &&
         rd[TX_32X32][1] <= rd[TX_8X8][1] &&
         rd[TX_32X32][1] <= rd[TX_4X4][1]))) {
-    mbmi->txfm_size = TX_32X32;
+    mbmi->tx_size = TX_32X32;
   } else if (max_txfm_size >= TX_16X16 &&
              (cm->tx_mode == ALLOW_16X16 ||
               cm->tx_mode == ALLOW_32X32 ||
               (cm->tx_mode == TX_MODE_SELECT &&
                rd[TX_16X16][1] <= rd[TX_8X8][1] &&
                rd[TX_16X16][1] <= rd[TX_4X4][1]))) {
-    mbmi->txfm_size = TX_16X16;
+    mbmi->tx_size = TX_16X16;
   } else if (cm->tx_mode == ALLOW_8X8 ||
              cm->tx_mode == ALLOW_16X16 ||
              cm->tx_mode == ALLOW_32X32 ||
            (cm->tx_mode == TX_MODE_SELECT &&
             rd[TX_8X8][1] <= rd[TX_4X4][1])) {
-    mbmi->txfm_size = TX_8X8;
+    mbmi->tx_size = TX_8X8;
   } else {
-    mbmi->txfm_size = TX_4X4;
+    mbmi->tx_size = TX_4X4;
   }
 
   // Actually encode using the chosen mode if a model was used, but do not
   // update the r, d costs
-  txfm_rd_in_plane(x, rate, distortion, skip, &sse[mbmi->txfm_size],
-                   ref_best_rd, 0, bs, mbmi->txfm_size);
+  txfm_rd_in_plane(x, rate, distortion, skip, &sse[mbmi->tx_size],
+                   ref_best_rd, 0, bs, mbmi->tx_size);
 
   if (max_txfm_size == TX_32X32 &&
       rd[TX_32X32][1] <= rd[TX_16X16][1] &&
@@ -911,7 +908,7 @@
 
 static void super_block_yrd(VP9_COMP *cpi,
                             MACROBLOCK *x, int *rate, int64_t *distortion,
-                            int *skip, int64_t *psse, BLOCK_SIZE_TYPE bs,
+                            int *skip, int64_t *psse, BLOCK_SIZE bs,
                             int64_t txfm_cache[TX_MODES],
                             int64_t ref_best_rd) {
   int r[TX_SIZES][2], s[TX_SIZES];
@@ -930,7 +927,7 @@
     choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse,
                              ref_best_rd, bs);
     if (psse)
-      *psse = sse[mbmi->txfm_size];
+      *psse = sse[mbmi->tx_size];
     return;
   }
 
@@ -966,7 +963,7 @@
                              skip, txfm_cache, bs);
   }
   if (psse)
-    *psse = sse[mbmi->txfm_size];
+    *psse = sse[mbmi->tx_size];
 }
 
 static int conditional_skipintra(MB_PREDICTION_MODE mode,
@@ -979,7 +976,7 @@
       best_intra_mode != V_PRED &&
       best_intra_mode != D45_PRED)
     return 1;
-  if (mode == D27_PRED &&
+  if (mode == D207_PRED &&
       best_intra_mode != H_PRED &&
       best_intra_mode != D45_PRED)
     return 1;
@@ -996,8 +993,7 @@
                                      ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
                                      int *bestrate, int *bestratey,
                                      int64_t *bestdistortion,
-                                     BLOCK_SIZE_TYPE bsize,
-                                     int64_t rd_thresh) {
+                                     BLOCK_SIZE bsize, int64_t rd_thresh) {
   MB_PREDICTION_MODE mode;
   MACROBLOCKD *xd = &x->e_mbd;
   int64_t best_rd = rd_thresh;
@@ -1025,7 +1021,7 @@
 
   vpx_memcpy(ta, a, sizeof(ta));
   vpx_memcpy(tl, l, sizeof(tl));
-  xd->mode_info_context->mbmi.txfm_size = TX_4X4;
+  xd->mode_info_context->mbmi.tx_size = TX_4X4;
 
   for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
     int64_t this_rd;
@@ -1131,7 +1127,7 @@
                                             int64_t best_rd) {
   int i, j;
   MACROBLOCKD *const xd = &mb->e_mbd;
-  BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
+  const BLOCK_SIZE bsize = xd->mode_info_context->mbmi.sb_type;
   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
   int idx, idy;
@@ -1197,7 +1193,7 @@
 static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                       int *rate, int *rate_tokenonly,
                                       int64_t *distortion, int *skippable,
-                                      BLOCK_SIZE_TYPE bsize,
+                                      BLOCK_SIZE bsize,
                                       int64_t tx_cache[TX_MODES],
                                       int64_t best_rd) {
   MB_PREDICTION_MODE mode;
@@ -1243,7 +1239,7 @@
     if (this_rd < best_rd) {
       mode_selected   = mode;
       best_rd         = this_rd;
-      best_tx         = mic->mbmi.txfm_size;
+      best_tx         = mic->mbmi.tx_size;
       *rate           = this_rate;
       *rate_tokenonly = this_rate_tokenonly;
       *distortion     = this_distortion;
@@ -1251,7 +1247,7 @@
     }
 
     if (cpi->sf.tx_size_search_method == USE_FULL_RD && this_rd < INT64_MAX) {
-      for (i = 0; i < TX_MODES; i++) {
+      for (i = 0; i < TX_MODES && local_tx_cache[i] < INT64_MAX; i++) {
         const int64_t adj_rd = this_rd + local_tx_cache[i] -
             local_tx_cache[cpi->common.tx_mode];
         if (adj_rd < tx_cache[i]) {
@@ -1262,14 +1258,14 @@
   }
 
   mic->mbmi.mode = mode_selected;
-  mic->mbmi.txfm_size = best_tx;
+  mic->mbmi.tx_size = best_tx;
 
   return best_rd;
 }
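The added `local_tx_cache[i] < INT64_MAX` loop guard above (and the matching one in the inter-mode loop further down) stops the adjustment as soon as an unset sentinel entry is reached; without it, adding INT64_MAX to a valid RD cost wraps around in signed 64-bit arithmetic. A sketch of the hazard:

#include <stdint.h>

/* Sketch: an RD cache entry still at its INT64_MAX sentinel must be
 * skipped; this_rd + INT64_MAX overflows int64_t. */
static int64_t adjusted_rd(int64_t this_rd, int64_t cache_i, int64_t base) {
  if (cache_i == INT64_MAX)
    return INT64_MAX;               /* no data: leave the cost unusable */
  return this_rd + cache_i - base;  /* safe once the entry is populated */
}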
 
 static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x,
                              int *rate, int64_t *distortion, int *skippable,
-                             int64_t *sse, BLOCK_SIZE_TYPE bsize,
+                             int64_t *sse, BLOCK_SIZE bsize,
                              int64_t ref_best_rd) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
@@ -1312,7 +1308,7 @@
 static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                        int *rate, int *rate_tokenonly,
                                        int64_t *distortion, int *skippable,
-                                       BLOCK_SIZE_TYPE bsize) {
+                                       BLOCK_SIZE bsize) {
   MB_PREDICTION_MODE mode;
   MB_PREDICTION_MODE mode_selected = DC_PRED;
   int64_t best_rd = INT64_MAX, this_rd;
@@ -1354,7 +1350,7 @@
 static int64_t rd_sbuv_dcpred(VP9_COMP *cpi, MACROBLOCK *x,
                               int *rate, int *rate_tokenonly,
                               int64_t *distortion, int *skippable,
-                              BLOCK_SIZE_TYPE bsize) {
+                              BLOCK_SIZE bsize) {
   int64_t this_rd;
   int64_t this_sse;
 
@@ -1368,7 +1364,7 @@
   return this_rd;
 }
 
-static void choose_intra_uv_mode(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
+static void choose_intra_uv_mode(VP9_COMP *cpi, BLOCK_SIZE bsize,
                                  int *rate_uv, int *rate_uv_tokenonly,
                                  int64_t *dist_uv, int *skip_uv,
                                  MB_PREDICTION_MODE *mode_uv) {
@@ -1410,13 +1406,13 @@
 }
 
 static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
-                                BLOCK_SIZE_TYPE bsize,
+                                BLOCK_SIZE bsize,
                                 int_mv *frame_mv,
                                 int mi_row, int mi_col,
                                 int_mv single_newmv[MAX_REF_FRAMES],
                                 int *rate_mv);
 static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
-                                 BLOCK_SIZE_TYPE bsize,
+                                 BLOCK_SIZE bsize,
                                  int mi_row, int mi_col,
                                  int_mv *tmp_mv, int *rate_mv);
 
@@ -1504,7 +1500,7 @@
   MACROBLOCKD *xd = &x->e_mbd;
   struct macroblockd_plane *const pd = &xd->plane[0];
   MODE_INFO *const mi = xd->mode_info_context;
-  const BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type;
+  const BLOCK_SIZE bsize = mi->mbmi.sb_type;
   const int width = plane_block_width(bsize, pd);
   const int height = plane_block_height(bsize, pd);
   int idx, idy;
@@ -1515,28 +1511,20 @@
   int16_t* src_diff = raster_block_offset_int16(BLOCK_8X8, i,
                                                 x->plane[0].src_diff);
   int16_t* coeff = BLOCK_OFFSET(x->plane[0].coeff, i);
-  uint8_t* const pre = raster_block_offset_uint8(BLOCK_8X8, i,
-                                                 pd->pre[0].buf,
-                                                 pd->pre[0].stride);
   uint8_t* const dst = raster_block_offset_uint8(BLOCK_8X8, i,
                                                  pd->dst.buf, pd->dst.stride);
   int64_t thisdistortion = 0, thissse = 0;
   int thisrate = 0;
+  int ref, second_ref = has_second_ref(&mi->mbmi);
 
-  vp9_build_inter_predictor(pre, pd->pre[0].stride,
-                            dst, pd->dst.stride,
-                            &mi->bmi[i].as_mv[0].as_mv,
-                            &xd->scale_factor[0],
-                            width, height, 0, &xd->subpix, MV_PRECISION_Q3);
-
-  if (mi->mbmi.ref_frame[1] > 0) {
-    uint8_t* const second_pre =
-    raster_block_offset_uint8(BLOCK_8X8, 0, pd->pre[1].buf, pd->pre[1].stride);
-    vp9_build_inter_predictor(second_pre, pd->pre[1].stride,
+  for (ref = 0; ref < 1 + second_ref; ++ref) {
+    const uint8_t *pre = raster_block_offset_uint8(BLOCK_8X8, i,
+                                     pd->pre[ref].buf, pd->pre[ref].stride);
+    vp9_build_inter_predictor(pre, pd->pre[ref].stride,
                               dst, pd->dst.stride,
-                              &mi->bmi[i].as_mv[1].as_mv,
-                              &xd->scale_factor[1],
-                              width, height, 1, &xd->subpix, MV_PRECISION_Q3);
+                              &mi->bmi[i].as_mv[ref].as_mv,
+                              &xd->scale_factor[ref],
+                              width, height, ref, &xd->subpix, MV_PRECISION_Q3);
   }
 
   vp9_subtract_block(height, width, src_diff, 8, src, src_stride,
@@ -1647,7 +1635,7 @@
   int64_t this_segment_rd = 0;
   int label_mv_thresh;
   int segmentyrate = 0;
-  BLOCK_SIZE_TYPE bsize = mbmi->sb_type;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
   vp9_variance_fn_ptr_t *v_fn_ptr;
@@ -2072,7 +2060,7 @@
 
 static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
                     uint8_t *ref_y_buffer, int ref_y_stride,
-                    int ref_frame, BLOCK_SIZE_TYPE block_size ) {
+                    int ref_frame, BLOCK_SIZE block_size) {
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
   int_mv this_mv;
@@ -2241,7 +2229,7 @@
 
 static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
                                int idx, MV_REFERENCE_FRAME frame_type,
-                               BLOCK_SIZE_TYPE block_size,
+                               BLOCK_SIZE block_size,
                                int mi_row, int mi_col,
                                int_mv frame_nearest_mv[MAX_REF_FRAMES],
                                int_mv frame_near_mv[MAX_REF_FRAMES],
@@ -2304,7 +2292,7 @@
 }
 
 static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
-                                 BLOCK_SIZE_TYPE bsize,
+                                 BLOCK_SIZE bsize,
                                  int mi_row, int mi_col,
                                  int_mv *tmp_mv, int *rate_mv) {
   MACROBLOCKD *xd = &x->e_mbd;
@@ -2317,7 +2305,7 @@
   int_mv mvp_full;
   int ref = mbmi->ref_frame[0];
   int_mv ref_mv = mbmi->ref_mvs[ref][0];
-  const BLOCK_SIZE_TYPE block_size = get_plane_block_size(bsize, &xd->plane[0]);
+  const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]);
 
   int tmp_col_min = x->mv_col_min;
   int tmp_col_max = x->mv_col_max;
@@ -2430,7 +2418,7 @@
 }
 
 static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
-                                BLOCK_SIZE_TYPE bsize,
+                                BLOCK_SIZE bsize,
                                 int_mv *frame_mv,
                                 int mi_row, int mi_col,
                                 int_mv single_newmv[MAX_REF_FRAMES],
@@ -2441,7 +2429,7 @@
   int refs[2] = { mbmi->ref_frame[0],
     (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
   int_mv ref_mv[2];
-  const BLOCK_SIZE_TYPE block_size = get_plane_block_size(bsize, &xd->plane[0]);
+  const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]);
   int ite;
   // Prediction buffer from second frame.
   uint8_t *second_pred = vpx_memalign(16, pw * ph * sizeof(uint8_t));
@@ -2587,7 +2575,7 @@
 }
 
 static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
-                                 BLOCK_SIZE_TYPE bsize,
+                                 BLOCK_SIZE bsize,
                                  int64_t txfm_cache[],
                                  int *rate2, int64_t *distortion,
                                  int *skippable,
@@ -2871,9 +2859,8 @@
     if (cpi->active_map_enabled && x->active_ptr[0] == 0)
       x->skip = 1;
     else if (x->encode_breakout) {
-      const BLOCK_SIZE_TYPE y_size = get_plane_block_size(bsize, &xd->plane[0]);
-      const BLOCK_SIZE_TYPE uv_size = get_plane_block_size(bsize,
-                                                           &xd->plane[1]);
+      const BLOCK_SIZE y_size = get_plane_block_size(bsize, &xd->plane[0]);
+      const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]);
       unsigned int var, sse;
       // Skipping threshold for ac.
       unsigned int thresh_ac;
@@ -2999,7 +2986,7 @@
 
 void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                int *returnrate, int64_t *returndist,
-                               BLOCK_SIZE_TYPE bsize,
+                               BLOCK_SIZE bsize,
                                PICK_MODE_CONTEXT *ctx, int64_t best_rd) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -3051,14 +3038,14 @@
                                   int mi_row, int mi_col,
                                   int *returnrate,
                                   int64_t *returndistortion,
-                                  BLOCK_SIZE_TYPE bsize,
+                                  BLOCK_SIZE bsize,
                                   PICK_MODE_CONTEXT *ctx,
                                   int64_t best_rd_so_far) {
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
   const struct segmentation *seg = &cm->seg;
-  const BLOCK_SIZE_TYPE block_size = get_plane_block_size(bsize, &xd->plane[0]);
+  const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]);
   MB_PREDICTION_MODE this_mode;
   MV_REFERENCE_FRAME ref_frame, second_ref_frame;
   unsigned char segment_id = xd->mode_info_context->mbmi.segment_id;
@@ -3105,14 +3092,11 @@
   int_mv seg_mvs[4][MAX_REF_FRAMES];
   union b_mode_info best_bmodes[4];
   PARTITION_INFO best_partition;
-  int bwsl = b_width_log2(bsize);
-  int bws = (1 << bwsl) / 4;  // mode_info step for subsize
-  int bhsl = b_height_log2(bsize);
-  int bhs = (1 << bhsl) / 4;  // mode_info step for subsize
+  const int bws = num_8x8_blocks_wide_lookup[bsize] / 2;
+  const int bhs = num_8x8_blocks_high_lookup[bsize] / 2;
   int best_skip2 = 0;
 
-  x->skip_encode = (cpi->sf.skip_encode_frame &&
-                    xd->q_index < QIDX_SKIP_THRESH);
+  x->skip_encode = cpi->sf.skip_encode_frame && xd->q_index < QIDX_SKIP_THRESH;
 
   for (i = 0; i < 4; i++) {
     int j;
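The bws/bhs rewrite above swaps the log2 arithmetic for the 8x8-block lookup tables; both yield half the block width (or height) in 8x8 mode_info units. A quick equivalence check for BLOCK_64X64, with the table values written out by hand:

/* BLOCK_64X64 is 16 4x4 blocks wide (log2 = 4), or 8 8x8 blocks wide. */
const int old_bws = (1 << 4) / 4;  /* (1 << b_width_log2) / 4 -> 4 */
const int new_bws = 8 / 2;         /* num_8x8_blocks_wide / 2 -> 4 */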
@@ -3357,6 +3341,12 @@
       continue;
     }
 
+#ifdef MODE_TEST_HIT_STATS
+    // TEST/DEBUG CODE
+    // Keep a record of the number of test hits at each size
+    cpi->mode_test_hits[bsize]++;
+#endif
+
     if (this_mode == I4X4_PRED) {
       int rate;
 
@@ -3367,7 +3357,7 @@
         */
 
       // I4X4_PRED is only considered for block sizes less than 8x8.
-      mbmi->txfm_size = TX_4X4;
+      mbmi->tx_size = TX_4X4;
       if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate, &rate_y,
                                        &distortion_y, best_rd) >= best_rd)
         continue;
@@ -3418,7 +3408,7 @@
       if (rate_y == INT_MAX)
         continue;
 
-      uv_tx = MIN(mbmi->txfm_size, max_uv_txsize_lookup[bsize]);
+      uv_tx = MIN(mbmi->tx_size, max_uv_txsize_lookup[bsize]);
       if (rate_uv_intra[uv_tx] == INT_MAX) {
         choose_intra_uv_mode(cpi, bsize, &rate_uv_intra[uv_tx],
                              &rate_uv_tokenonly[uv_tx],
@@ -3468,7 +3458,7 @@
           cpi->rd_threshes[bsize][THR_NEWA];
       this_rd_thresh = (ref_frame == GOLDEN_FRAME) ?
           cpi->rd_threshes[bsize][THR_NEWG] : this_rd_thresh;
-      xd->mode_info_context->mbmi.txfm_size = TX_4X4;
+      xd->mode_info_context->mbmi.tx_size = TX_4X4;
 
       cpi->rd_filter_cache[SWITCHABLE_FILTERS] = INT64_MAX;
       if (cm->mcomp_filter_type != BILINEAR) {
@@ -3832,7 +3822,7 @@
       tx_cache[ALLOW_32X32] = tx_cache[ALLOW_16X16];
     }
     if (!mode_excluded && this_rd != INT64_MAX) {
-      for (i = 0; i < TX_MODES; i++) {
+      for (i = 0; i < TX_MODES && tx_cache[i] < INT64_MAX; i++) {
         int64_t adj_rd = INT64_MAX;
         if (this_mode != I4X4_PRED) {
           adj_rd = this_rd + tx_cache[i] - tx_cache[cm->tx_mode];
diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h
index b7f56e5..eba7df9 100644
--- a/vp9/encoder/vp9_rdopt.h
+++ b/vp9/encoder/vp9_rdopt.h
@@ -20,12 +20,12 @@
 void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex);
 
 void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
-                               int *r, int64_t *d, BLOCK_SIZE_TYPE bsize,
+                               int *r, int64_t *d, BLOCK_SIZE bsize,
                                PICK_MODE_CONTEXT *ctx, int64_t best_rd);
 
 int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                   int mi_row, int mi_col,
-                                  int *r, int64_t *d, BLOCK_SIZE_TYPE bsize,
+                                  int *r, int64_t *d, BLOCK_SIZE bsize,
                                   PICK_MODE_CONTEXT *ctx, int64_t best_rd);
 
 void vp9_init_me_luts();
diff --git a/vp9/encoder/vp9_segmentation.c b/vp9/encoder/vp9_segmentation.c
index 8ff608b..0a6d2ab 100644
--- a/vp9/encoder/vp9_segmentation.c
+++ b/vp9/encoder/vp9_segmentation.c
@@ -138,7 +138,7 @@
 
   // Temporal prediction not allowed on key frames
   if (cm->frame_type != KEY_FRAME) {
-    const BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type;
+    const BLOCK_SIZE bsize = mi->mbmi.sb_type;
     // Test to see if the segment id matches the predicted value.
     const int pred_segment_id = vp9_get_segment_id(cm, cm->last_frame_seg_map,
                                                    bsize, mi_row, mi_col);
@@ -161,52 +161,45 @@
                           int (*temporal_predictor_count)[2],
                           int *t_unpred_seg_counts,
                           int mi_row, int mi_col,
-                          BLOCK_SIZE_TYPE bsize) {
-  VP9_COMMON *const cm = &cpi->common;
+                          BLOCK_SIZE bsize) {
+  const VP9_COMMON *const cm = &cpi->common;
   const int mis = cm->mode_info_stride;
-  int bwl, bhl;
-  const int bsl = mi_width_log2(bsize), bs = 1 << (bsl - 1);
+  int bw, bh;
+  const int bs = num_8x8_blocks_wide_lookup[bsize], hbs = bs / 2;
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
-  bwl = mi_width_log2(mi->mbmi.sb_type);
-  bhl = mi_height_log2(mi->mbmi.sb_type);
+  bw = num_8x8_blocks_wide_lookup[mi->mbmi.sb_type];
+  bh = num_8x8_blocks_high_lookup[mi->mbmi.sb_type];
 
-  if (bwl == bsl && bhl == bsl) {
+  if (bw == bs && bh == bs) {
     count_segs(cpi, mi, no_pred_segcounts, temporal_predictor_count,
-               t_unpred_seg_counts, 1 << bsl, 1 << bsl, mi_row, mi_col);
-  } else if (bwl == bsl && bhl < bsl) {
+               t_unpred_seg_counts, bs, bs, mi_row, mi_col);
+  } else if (bw == bs && bh < bs) {
     count_segs(cpi, mi, no_pred_segcounts, temporal_predictor_count,
-               t_unpred_seg_counts, 1 << bsl, bs, mi_row, mi_col);
-    count_segs(cpi, mi + bs * mis, no_pred_segcounts, temporal_predictor_count,
-               t_unpred_seg_counts, 1 << bsl, bs, mi_row + bs, mi_col);
-  } else if (bwl < bsl && bhl == bsl) {
+               t_unpred_seg_counts, bs, hbs, mi_row, mi_col);
+    count_segs(cpi, mi + hbs * mis, no_pred_segcounts, temporal_predictor_count,
+               t_unpred_seg_counts, bs, hbs, mi_row + hbs, mi_col);
+  } else if (bw < bs && bh == bs) {
     count_segs(cpi, mi, no_pred_segcounts, temporal_predictor_count,
-               t_unpred_seg_counts, bs, 1 << bsl, mi_row, mi_col);
-    count_segs(cpi, mi + bs, no_pred_segcounts, temporal_predictor_count,
-               t_unpred_seg_counts, bs, 1 << bsl, mi_row, mi_col + bs);
+               t_unpred_seg_counts, hbs, bs, mi_row, mi_col);
+    count_segs(cpi, mi + hbs, no_pred_segcounts, temporal_predictor_count,
+               t_unpred_seg_counts, hbs, bs, mi_row, mi_col + hbs);
   } else {
-    BLOCK_SIZE_TYPE subsize;
+    const BLOCK_SIZE subsize = subsize_lookup[PARTITION_SPLIT][bsize];
     int n;
 
-    assert(bwl < bsl && bhl < bsl);
-    if (bsize == BLOCK_64X64) {
-      subsize = BLOCK_32X32;
-    } else if (bsize == BLOCK_32X32) {
-      subsize = BLOCK_16X16;
-    } else {
-      assert(bsize == BLOCK_16X16);
-      subsize = BLOCK_8X8;
-    }
+    assert(bw < bs && bh < bs);
 
     for (n = 0; n < 4; n++) {
-      const int y_idx = n >> 1, x_idx = n & 0x01;
+      const int mi_dc = hbs * (n & 1);
+      const int mi_dr = hbs * (n >> 1);
 
-      count_segs_sb(cpi, mi + y_idx * bs * mis + x_idx * bs,
+      count_segs_sb(cpi, &mi[mi_dr * mis + mi_dc],
                     no_pred_segcounts, temporal_predictor_count,
                     t_unpred_seg_counts,
-                    mi_row + y_idx * bs, mi_col + x_idx * bs, subsize);
+                    mi_row + mi_dr, mi_col + mi_dc, subsize);
     }
   }
 }
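The split branch above visits the four quadrants with offsets derived from the loop index: bit 0 selects the column half and bit 1 the row half. A standalone sketch of the walk (the hbs value is hypothetical):

/* Quadrant walk for PARTITION_SPLIT; hbs is half the parent size in
 * 8x8 mode_info units (e.g. 4 for a 64x64 parent). */
static void walk_quadrants(int mi_row, int mi_col, int hbs) {
  int n;
  for (n = 0; n < 4; n++) {
    const int mi_dc = hbs * (n & 1);   /* column offset: 0, hbs, 0, hbs */
    const int mi_dr = hbs * (n >> 1);  /* row offset:    0, 0, hbs, hbs */
    /* recurse at (mi_row + mi_dr, mi_col + mi_dc) with the split subsize */
    (void)(mi_row + mi_dr);
    (void)(mi_col + mi_dc);
  }
}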
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index b4270d5..03bf147 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -97,8 +97,7 @@
   TX_SIZE tx_size;
 };
 
-static void set_entropy_context_b(int plane, int block,
-                                  BLOCK_SIZE_TYPE plane_bsize,
+static void set_entropy_context_b(int plane, int block, BLOCK_SIZE plane_bsize,
                                   TX_SIZE tx_size, void *arg) {
   struct tokenize_b_args* const args = arg;
   MACROBLOCKD *const xd = args->xd;
@@ -108,7 +107,7 @@
   set_contexts(xd, pd, plane_bsize, tx_size, pd->eobs[block] > 0, aoff, loff);
 }
 
-static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE plane_bsize,
+static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
                        TX_SIZE tx_size, void *arg) {
   struct tokenize_b_args* const args = arg;
   VP9_COMP *cpi = args->cpi;
@@ -122,16 +121,16 @@
   const int eob = pd->eobs[block];
   const PLANE_TYPE type = pd->plane_type;
   const int16_t *qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block);
-  int seg_eob;
+
   const int segment_id = mbmi->segment_id;
   const int16_t *scan, *nb;
   vp9_coeff_count *const counts = cpi->coef_counts[tx_size];
   vp9_coeff_probs_model *const coef_probs = cpi->common.fc.coef_probs[tx_size];
   const int ref = is_inter_block(mbmi);
-  ENTROPY_CONTEXT above_ec, left_ec;
   uint8_t token_cache[1024];
   const uint8_t *band_translate;
   ENTROPY_CONTEXT *A, *L;
+  const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size);
   int aoff, loff;
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff);
 
@@ -140,45 +139,9 @@
 
   assert((!type && !plane) || (type && plane));
 
-  switch (tx_size) {
-    case TX_4X4:
-      above_ec = A[0] != 0;
-      left_ec = L[0] != 0;
-      seg_eob = 16;
-      scan = get_scan_4x4(get_tx_type_4x4(type, xd, block));
-      band_translate = vp9_coefband_trans_4x4;
-      break;
-    case TX_8X8:
-      above_ec = !!*(uint16_t *)A;
-      left_ec  = !!*(uint16_t *)L;
-      seg_eob = 64;
-      scan = get_scan_8x8(get_tx_type_8x8(type, xd));
-      band_translate = vp9_coefband_trans_8x8plus;
-      break;
-    case TX_16X16:
-      above_ec = !!*(uint32_t *)A;
-      left_ec  = !!*(uint32_t *)L;
-      seg_eob = 256;
-      scan = get_scan_16x16(get_tx_type_16x16(type, xd));
-      band_translate = vp9_coefband_trans_8x8plus;
-      break;
-    case TX_32X32:
-      above_ec = !!*(uint64_t *)A;
-      left_ec  = !!*(uint64_t *)L;
-      seg_eob = 1024;
-      scan = vp9_default_scan_32x32;
-      band_translate = vp9_coefband_trans_8x8plus;
-      break;
-    default:
-      assert(!"Invalid transform size");
-  }
-
-  pt = combine_entropy_contexts(above_ec, left_ec);
+  pt = get_entropy_context(xd, tx_size, type, block, A, L,
+                           &scan, &band_translate);
   nb = vp9_get_coef_neighbors_handle(scan);
-
-  if (vp9_segfeature_active(&cpi->common.seg, segment_id, SEG_LVL_SKIP))
-    seg_eob = 0;
-
   c = 0;
   do {
     const int band = get_coef_band(band_translate, c);
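The deleted switch above is now centralized in get_entropy_context(), which reads the per-tx_size above/left coefficient contexts and also hands back the scan order and band table. Assuming the conventional VP9 combine rule, the context fold it performs is equivalent to:

/* Hedged sketch of the context combine, assuming the usual rule:
 * pt counts which neighbours (above, left) have non-zero coefficients. */
static int combine_entropy_contexts_sketch(int above_ec, int left_ec) {
  return (above_ec != 0) + (left_ec != 0);  /* pt in {0, 1, 2} */
}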
@@ -222,20 +185,20 @@
 };
 
 static void is_skippable(int plane, int block,
-                         BLOCK_SIZE_TYPE plane_bsize, TX_SIZE tx_size,
+                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
                          void *argv) {
   struct is_skippable_args *args = argv;
   args->skippable[0] &= (!args->xd->plane[plane].eobs[block]);
 }
 
-int vp9_sb_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
+int vp9_sb_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE bsize) {
   int result = 1;
   struct is_skippable_args args = {xd, &result};
   foreach_transformed_block(xd, bsize, is_skippable, &args);
   return result;
 }
 
-int vp9_is_skippable_in_plane(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize,
+int vp9_is_skippable_in_plane(MACROBLOCKD *xd, BLOCK_SIZE bsize,
                               int plane) {
   int result = 1;
   struct is_skippable_args args = {xd, &result};
@@ -244,7 +207,7 @@
 }
 
 void vp9_tokenize_sb(VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
-                     BLOCK_SIZE_TYPE bsize) {
+                     BLOCK_SIZE bsize) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
@@ -252,7 +215,7 @@
   const int mb_skip_context = vp9_get_pred_context_mbskip(xd);
   const int skip_inc = !vp9_segfeature_active(&cm->seg, mbmi->segment_id,
                                               SEG_LVL_SKIP);
-  struct tokenize_b_args arg = {cpi, xd, t, mbmi->txfm_size};
+  struct tokenize_b_args arg = {cpi, xd, t, mbmi->tx_size};
 
   mbmi->skip_coeff = vp9_sb_is_skippable(xd, bsize);
   if (mbmi->skip_coeff) {
diff --git a/vp9/encoder/vp9_tokenize.h b/vp9/encoder/vp9_tokenize.h
index 2afb748..b78e100 100644
--- a/vp9/encoder/vp9_tokenize.h
+++ b/vp9/encoder/vp9_tokenize.h
@@ -31,13 +31,13 @@
 typedef int64_t vp9_coeff_accum[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
                                [MAX_ENTROPY_TOKENS + 1];
 
-int vp9_sb_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize);
-int vp9_is_skippable_in_plane(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize,
+int vp9_sb_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE bsize);
+int vp9_is_skippable_in_plane(MACROBLOCKD *xd, BLOCK_SIZE bsize,
                               int plane);
 struct VP9_COMP;
 
 void vp9_tokenize_sb(struct VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
-                     BLOCK_SIZE_TYPE bsize);
+                     BLOCK_SIZE bsize);
 
 #ifdef ENTROPY_STATS
 void init_context_counters();
diff --git a/vp9/encoder/x86/vp9_dct32x32_sse2.c b/vp9/encoder/x86/vp9_dct32x32_sse2.c
index ce65859..95ae266 100644
--- a/vp9/encoder/x86/vp9_dct32x32_sse2.c
+++ b/vp9/encoder/x86/vp9_dct32x32_sse2.c
@@ -403,6 +403,148 @@
         step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7);
         step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7);
       }
+
+#if !FDCT32x32_HIGH_PRECISION
+      // scale the intermediate magnitudes down so that the values stay
+      // within the range of 16 bits.
+      if (1 == pass) {
+        __m128i s3_00_0 = _mm_cmplt_epi16(step2[ 0], kZero);
+        __m128i s3_01_0 = _mm_cmplt_epi16(step2[ 1], kZero);
+        __m128i s3_02_0 = _mm_cmplt_epi16(step2[ 2], kZero);
+        __m128i s3_03_0 = _mm_cmplt_epi16(step2[ 3], kZero);
+        __m128i s3_04_0 = _mm_cmplt_epi16(step2[ 4], kZero);
+        __m128i s3_05_0 = _mm_cmplt_epi16(step2[ 5], kZero);
+        __m128i s3_06_0 = _mm_cmplt_epi16(step2[ 6], kZero);
+        __m128i s3_07_0 = _mm_cmplt_epi16(step2[ 7], kZero);
+        __m128i s2_08_0 = _mm_cmplt_epi16(step2[ 8], kZero);
+        __m128i s2_09_0 = _mm_cmplt_epi16(step2[ 9], kZero);
+        __m128i s3_10_0 = _mm_cmplt_epi16(step2[10], kZero);
+        __m128i s3_11_0 = _mm_cmplt_epi16(step2[11], kZero);
+        __m128i s3_12_0 = _mm_cmplt_epi16(step2[12], kZero);
+        __m128i s3_13_0 = _mm_cmplt_epi16(step2[13], kZero);
+        __m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero);
+        __m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero);
+        __m128i s3_16_0 = _mm_cmplt_epi16(step1[16], kZero);
+        __m128i s3_17_0 = _mm_cmplt_epi16(step1[17], kZero);
+        __m128i s3_18_0 = _mm_cmplt_epi16(step1[18], kZero);
+        __m128i s3_19_0 = _mm_cmplt_epi16(step1[19], kZero);
+        __m128i s3_20_0 = _mm_cmplt_epi16(step2[20], kZero);
+        __m128i s3_21_0 = _mm_cmplt_epi16(step2[21], kZero);
+        __m128i s3_22_0 = _mm_cmplt_epi16(step2[22], kZero);
+        __m128i s3_23_0 = _mm_cmplt_epi16(step2[23], kZero);
+        __m128i s3_24_0 = _mm_cmplt_epi16(step2[24], kZero);
+        __m128i s3_25_0 = _mm_cmplt_epi16(step2[25], kZero);
+        __m128i s3_26_0 = _mm_cmplt_epi16(step2[26], kZero);
+        __m128i s3_27_0 = _mm_cmplt_epi16(step2[27], kZero);
+        __m128i s3_28_0 = _mm_cmplt_epi16(step1[28], kZero);
+        __m128i s3_29_0 = _mm_cmplt_epi16(step1[29], kZero);
+        __m128i s3_30_0 = _mm_cmplt_epi16(step1[30], kZero);
+        __m128i s3_31_0 = _mm_cmplt_epi16(step1[31], kZero);
+
+        step2[ 0] = _mm_sub_epi16(step2[ 0], s3_00_0);
+        step2[ 1] = _mm_sub_epi16(step2[ 1], s3_01_0);
+        step2[ 2] = _mm_sub_epi16(step2[ 2], s3_02_0);
+        step2[ 3] = _mm_sub_epi16(step2[ 3], s3_03_0);
+        step2[ 4] = _mm_sub_epi16(step2[ 4], s3_04_0);
+        step2[ 5] = _mm_sub_epi16(step2[ 5], s3_05_0);
+        step2[ 6] = _mm_sub_epi16(step2[ 6], s3_06_0);
+        step2[ 7] = _mm_sub_epi16(step2[ 7], s3_07_0);
+        step2[ 8] = _mm_sub_epi16(step2[ 8], s2_08_0);
+        step2[ 9] = _mm_sub_epi16(step2[ 9], s2_09_0);
+        step2[10] = _mm_sub_epi16(step2[10], s3_10_0);
+        step2[11] = _mm_sub_epi16(step2[11], s3_11_0);
+        step2[12] = _mm_sub_epi16(step2[12], s3_12_0);
+        step2[13] = _mm_sub_epi16(step2[13], s3_13_0);
+        step2[14] = _mm_sub_epi16(step2[14], s2_14_0);
+        step2[15] = _mm_sub_epi16(step2[15], s2_15_0);
+        step1[16] = _mm_sub_epi16(step1[16], s3_16_0);
+        step1[17] = _mm_sub_epi16(step1[17], s3_17_0);
+        step1[18] = _mm_sub_epi16(step1[18], s3_18_0);
+        step1[19] = _mm_sub_epi16(step1[19], s3_19_0);
+        step2[20] = _mm_sub_epi16(step2[20], s3_20_0);
+        step2[21] = _mm_sub_epi16(step2[21], s3_21_0);
+        step2[22] = _mm_sub_epi16(step2[22], s3_22_0);
+        step2[23] = _mm_sub_epi16(step2[23], s3_23_0);
+        step2[24] = _mm_sub_epi16(step2[24], s3_24_0);
+        step2[25] = _mm_sub_epi16(step2[25], s3_25_0);
+        step2[26] = _mm_sub_epi16(step2[26], s3_26_0);
+        step2[27] = _mm_sub_epi16(step2[27], s3_27_0);
+        step1[28] = _mm_sub_epi16(step1[28], s3_28_0);
+        step1[29] = _mm_sub_epi16(step1[29], s3_29_0);
+        step1[30] = _mm_sub_epi16(step1[30], s3_30_0);
+        step1[31] = _mm_sub_epi16(step1[31], s3_31_0);
+
+        step2[ 0] = _mm_add_epi16(step2[ 0], kOne);
+        step2[ 1] = _mm_add_epi16(step2[ 1], kOne);
+        step2[ 2] = _mm_add_epi16(step2[ 2], kOne);
+        step2[ 3] = _mm_add_epi16(step2[ 3], kOne);
+        step2[ 4] = _mm_add_epi16(step2[ 4], kOne);
+        step2[ 5] = _mm_add_epi16(step2[ 5], kOne);
+        step2[ 6] = _mm_add_epi16(step2[ 6], kOne);
+        step2[ 7] = _mm_add_epi16(step2[ 7], kOne);
+        step2[ 8] = _mm_add_epi16(step2[ 8], kOne);
+        step2[ 9] = _mm_add_epi16(step2[ 9], kOne);
+        step2[10] = _mm_add_epi16(step2[10], kOne);
+        step2[11] = _mm_add_epi16(step2[11], kOne);
+        step2[12] = _mm_add_epi16(step2[12], kOne);
+        step2[13] = _mm_add_epi16(step2[13], kOne);
+        step2[14] = _mm_add_epi16(step2[14], kOne);
+        step2[15] = _mm_add_epi16(step2[15], kOne);
+        step1[16] = _mm_add_epi16(step1[16], kOne);
+        step1[17] = _mm_add_epi16(step1[17], kOne);
+        step1[18] = _mm_add_epi16(step1[18], kOne);
+        step1[19] = _mm_add_epi16(step1[19], kOne);
+        step2[20] = _mm_add_epi16(step2[20], kOne);
+        step2[21] = _mm_add_epi16(step2[21], kOne);
+        step2[22] = _mm_add_epi16(step2[22], kOne);
+        step2[23] = _mm_add_epi16(step2[23], kOne);
+        step2[24] = _mm_add_epi16(step2[24], kOne);
+        step2[25] = _mm_add_epi16(step2[25], kOne);
+        step2[26] = _mm_add_epi16(step2[26], kOne);
+        step2[27] = _mm_add_epi16(step2[27], kOne);
+        step1[28] = _mm_add_epi16(step1[28], kOne);
+        step1[29] = _mm_add_epi16(step1[29], kOne);
+        step1[30] = _mm_add_epi16(step1[30], kOne);
+        step1[31] = _mm_add_epi16(step1[31], kOne);
+
+        step2[ 0] = _mm_srai_epi16(step2[ 0], 2);
+        step2[ 1] = _mm_srai_epi16(step2[ 1], 2);
+        step2[ 2] = _mm_srai_epi16(step2[ 2], 2);
+        step2[ 3] = _mm_srai_epi16(step2[ 3], 2);
+        step2[ 4] = _mm_srai_epi16(step2[ 4], 2);
+        step2[ 5] = _mm_srai_epi16(step2[ 5], 2);
+        step2[ 6] = _mm_srai_epi16(step2[ 6], 2);
+        step2[ 7] = _mm_srai_epi16(step2[ 7], 2);
+        step2[ 8] = _mm_srai_epi16(step2[ 8], 2);
+        step2[ 9] = _mm_srai_epi16(step2[ 9], 2);
+        step2[10] = _mm_srai_epi16(step2[10], 2);
+        step2[11] = _mm_srai_epi16(step2[11], 2);
+        step2[12] = _mm_srai_epi16(step2[12], 2);
+        step2[13] = _mm_srai_epi16(step2[13], 2);
+        step2[14] = _mm_srai_epi16(step2[14], 2);
+        step2[15] = _mm_srai_epi16(step2[15], 2);
+        step1[16] = _mm_srai_epi16(step1[16], 2);
+        step1[17] = _mm_srai_epi16(step1[17], 2);
+        step1[18] = _mm_srai_epi16(step1[18], 2);
+        step1[19] = _mm_srai_epi16(step1[19], 2);
+        step2[20] = _mm_srai_epi16(step2[20], 2);
+        step2[21] = _mm_srai_epi16(step2[21], 2);
+        step2[22] = _mm_srai_epi16(step2[22], 2);
+        step2[23] = _mm_srai_epi16(step2[23], 2);
+        step2[24] = _mm_srai_epi16(step2[24], 2);
+        step2[25] = _mm_srai_epi16(step2[25], 2);
+        step2[26] = _mm_srai_epi16(step2[26], 2);
+        step2[27] = _mm_srai_epi16(step2[27], 2);
+        step1[28] = _mm_srai_epi16(step1[28], 2);
+        step1[29] = _mm_srai_epi16(step1[29], 2);
+        step1[30] = _mm_srai_epi16(step1[30], 2);
+        step1[31] = _mm_srai_epi16(step1[31], 2);
+      }
+#endif
+
+#if FDCT32x32_HIGH_PRECISION
+      if (pass == 0) {
+#endif
       // Stage 3
       {
         step3[0] = _mm_add_epi16(step2[(8 - 1)], step2[0]);
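The relocated low-precision block above vectorizes the scalar rounding `(x + 1 + (x < 0)) >> 2`: `_mm_cmplt_epi16` produces an all-ones (-1) mask for negative lanes, subtracting that mask adds 1 to exactly those lanes, then every lane gets +1 (kOne) and an arithmetic shift right by 2. A scalar cross-check of one lane (a sketch, not the production path):

#include <stdint.h>

/* Scalar equivalent of the second-pass magnitude scaling: negatives
 * round as (x + 2) >> 2, non-negatives as (x + 1) >> 2. */
static int16_t scale_lane(int16_t x) {
  const int16_t mask = (int16_t)((x < 0) ? -1 : 0);  /* _mm_cmplt_epi16 */
  int v = x - mask;                                  /* +1 if negative */
  v = v + 1;                                         /* kOne */
  return (int16_t)(v >> 2);                          /* _mm_srai_epi16 */
}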
@@ -469,144 +611,6 @@
         step3[31] = _mm_add_epi16(step2[24], step1[31]);
       }
 
-#if !FDCT32x32_HIGH_PRECISION
-      // dump the magnitude by half, hence the intermediate values are within
-      // the range of 16 bits.
-      if (1 == pass) {
-        __m128i s3_00_0 = _mm_cmplt_epi16(step3[ 0], kZero);
-        __m128i s3_01_0 = _mm_cmplt_epi16(step3[ 1], kZero);
-        __m128i s3_02_0 = _mm_cmplt_epi16(step3[ 2], kZero);
-        __m128i s3_03_0 = _mm_cmplt_epi16(step3[ 3], kZero);
-        __m128i s3_04_0 = _mm_cmplt_epi16(step3[ 4], kZero);
-        __m128i s3_05_0 = _mm_cmplt_epi16(step3[ 5], kZero);
-        __m128i s3_06_0 = _mm_cmplt_epi16(step3[ 6], kZero);
-        __m128i s3_07_0 = _mm_cmplt_epi16(step3[ 7], kZero);
-        __m128i s2_08_0 = _mm_cmplt_epi16(step2[ 8], kZero);
-        __m128i s2_09_0 = _mm_cmplt_epi16(step2[ 9], kZero);
-        __m128i s3_10_0 = _mm_cmplt_epi16(step3[10], kZero);
-        __m128i s3_11_0 = _mm_cmplt_epi16(step3[11], kZero);
-        __m128i s3_12_0 = _mm_cmplt_epi16(step3[12], kZero);
-        __m128i s3_13_0 = _mm_cmplt_epi16(step3[13], kZero);
-        __m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero);
-        __m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero);
-        __m128i s3_16_0 = _mm_cmplt_epi16(step3[16], kZero);
-        __m128i s3_17_0 = _mm_cmplt_epi16(step3[17], kZero);
-        __m128i s3_18_0 = _mm_cmplt_epi16(step3[18], kZero);
-        __m128i s3_19_0 = _mm_cmplt_epi16(step3[19], kZero);
-        __m128i s3_20_0 = _mm_cmplt_epi16(step3[20], kZero);
-        __m128i s3_21_0 = _mm_cmplt_epi16(step3[21], kZero);
-        __m128i s3_22_0 = _mm_cmplt_epi16(step3[22], kZero);
-        __m128i s3_23_0 = _mm_cmplt_epi16(step3[23], kZero);
-        __m128i s3_24_0 = _mm_cmplt_epi16(step3[24], kZero);
-        __m128i s3_25_0 = _mm_cmplt_epi16(step3[25], kZero);
-        __m128i s3_26_0 = _mm_cmplt_epi16(step3[26], kZero);
-        __m128i s3_27_0 = _mm_cmplt_epi16(step3[27], kZero);
-        __m128i s3_28_0 = _mm_cmplt_epi16(step3[28], kZero);
-        __m128i s3_29_0 = _mm_cmplt_epi16(step3[29], kZero);
-        __m128i s3_30_0 = _mm_cmplt_epi16(step3[30], kZero);
-        __m128i s3_31_0 = _mm_cmplt_epi16(step3[31], kZero);
-        step3[ 0] = _mm_sub_epi16(step3[ 0], s3_00_0);
-        step3[ 1] = _mm_sub_epi16(step3[ 1], s3_01_0);
-        step3[ 2] = _mm_sub_epi16(step3[ 2], s3_02_0);
-        step3[ 3] = _mm_sub_epi16(step3[ 3], s3_03_0);
-        step3[ 4] = _mm_sub_epi16(step3[ 4], s3_04_0);
-        step3[ 5] = _mm_sub_epi16(step3[ 5], s3_05_0);
-        step3[ 6] = _mm_sub_epi16(step3[ 6], s3_06_0);
-        step3[ 7] = _mm_sub_epi16(step3[ 7], s3_07_0);
-        step2[ 8] = _mm_sub_epi16(step2[ 8], s2_08_0);
-        step2[ 9] = _mm_sub_epi16(step2[ 9], s2_09_0);
-        step3[10] = _mm_sub_epi16(step3[10], s3_10_0);
-        step3[11] = _mm_sub_epi16(step3[11], s3_11_0);
-        step3[12] = _mm_sub_epi16(step3[12], s3_12_0);
-        step3[13] = _mm_sub_epi16(step3[13], s3_13_0);
-        step2[14] = _mm_sub_epi16(step2[14], s2_14_0);
-        step2[15] = _mm_sub_epi16(step2[15], s2_15_0);
-        step3[16] = _mm_sub_epi16(step3[16], s3_16_0);
-        step3[17] = _mm_sub_epi16(step3[17], s3_17_0);
-        step3[18] = _mm_sub_epi16(step3[18], s3_18_0);
-        step3[19] = _mm_sub_epi16(step3[19], s3_19_0);
-        step3[20] = _mm_sub_epi16(step3[20], s3_20_0);
-        step3[21] = _mm_sub_epi16(step3[21], s3_21_0);
-        step3[22] = _mm_sub_epi16(step3[22], s3_22_0);
-        step3[23] = _mm_sub_epi16(step3[23], s3_23_0);
-        step3[24] = _mm_sub_epi16(step3[24], s3_24_0);
-        step3[25] = _mm_sub_epi16(step3[25], s3_25_0);
-        step3[26] = _mm_sub_epi16(step3[26], s3_26_0);
-        step3[27] = _mm_sub_epi16(step3[27], s3_27_0);
-        step3[28] = _mm_sub_epi16(step3[28], s3_28_0);
-        step3[29] = _mm_sub_epi16(step3[29], s3_29_0);
-        step3[30] = _mm_sub_epi16(step3[30], s3_30_0);
-        step3[31] = _mm_sub_epi16(step3[31], s3_31_0);
-        step3[ 0] = _mm_add_epi16(step3[ 0], kOne);
-        step3[ 1] = _mm_add_epi16(step3[ 1], kOne);
-        step3[ 2] = _mm_add_epi16(step3[ 2], kOne);
-        step3[ 3] = _mm_add_epi16(step3[ 3], kOne);
-        step3[ 4] = _mm_add_epi16(step3[ 4], kOne);
-        step3[ 5] = _mm_add_epi16(step3[ 5], kOne);
-        step3[ 6] = _mm_add_epi16(step3[ 6], kOne);
-        step3[ 7] = _mm_add_epi16(step3[ 7], kOne);
-        step2[ 8] = _mm_add_epi16(step2[ 8], kOne);
-        step2[ 9] = _mm_add_epi16(step2[ 9], kOne);
-        step3[10] = _mm_add_epi16(step3[10], kOne);
-        step3[11] = _mm_add_epi16(step3[11], kOne);
-        step3[12] = _mm_add_epi16(step3[12], kOne);
-        step3[13] = _mm_add_epi16(step3[13], kOne);
-        step2[14] = _mm_add_epi16(step2[14], kOne);
-        step2[15] = _mm_add_epi16(step2[15], kOne);
-        step3[16] = _mm_add_epi16(step3[16], kOne);
-        step3[17] = _mm_add_epi16(step3[17], kOne);
-        step3[18] = _mm_add_epi16(step3[18], kOne);
-        step3[19] = _mm_add_epi16(step3[19], kOne);
-        step3[20] = _mm_add_epi16(step3[20], kOne);
-        step3[21] = _mm_add_epi16(step3[21], kOne);
-        step3[22] = _mm_add_epi16(step3[22], kOne);
-        step3[23] = _mm_add_epi16(step3[23], kOne);
-        step3[24] = _mm_add_epi16(step3[24], kOne);
-        step3[25] = _mm_add_epi16(step3[25], kOne);
-        step3[26] = _mm_add_epi16(step3[26], kOne);
-        step3[27] = _mm_add_epi16(step3[27], kOne);
-        step3[28] = _mm_add_epi16(step3[28], kOne);
-        step3[29] = _mm_add_epi16(step3[29], kOne);
-        step3[30] = _mm_add_epi16(step3[30], kOne);
-        step3[31] = _mm_add_epi16(step3[31], kOne);
-        step3[ 0] = _mm_srai_epi16(step3[ 0], 2);
-        step3[ 1] = _mm_srai_epi16(step3[ 1], 2);
-        step3[ 2] = _mm_srai_epi16(step3[ 2], 2);
-        step3[ 3] = _mm_srai_epi16(step3[ 3], 2);
-        step3[ 4] = _mm_srai_epi16(step3[ 4], 2);
-        step3[ 5] = _mm_srai_epi16(step3[ 5], 2);
-        step3[ 6] = _mm_srai_epi16(step3[ 6], 2);
-        step3[ 7] = _mm_srai_epi16(step3[ 7], 2);
-        step2[ 8] = _mm_srai_epi16(step2[ 8], 2);
-        step2[ 9] = _mm_srai_epi16(step2[ 9], 2);
-        step3[10] = _mm_srai_epi16(step3[10], 2);
-        step3[11] = _mm_srai_epi16(step3[11], 2);
-        step3[12] = _mm_srai_epi16(step3[12], 2);
-        step3[13] = _mm_srai_epi16(step3[13], 2);
-        step2[14] = _mm_srai_epi16(step2[14], 2);
-        step2[15] = _mm_srai_epi16(step2[15], 2);
-        step3[16] = _mm_srai_epi16(step3[16], 2);
-        step3[17] = _mm_srai_epi16(step3[17], 2);
-        step3[18] = _mm_srai_epi16(step3[18], 2);
-        step3[19] = _mm_srai_epi16(step3[19], 2);
-        step3[20] = _mm_srai_epi16(step3[20], 2);
-        step3[21] = _mm_srai_epi16(step3[21], 2);
-        step3[22] = _mm_srai_epi16(step3[22], 2);
-        step3[23] = _mm_srai_epi16(step3[23], 2);
-        step3[24] = _mm_srai_epi16(step3[24], 2);
-        step3[25] = _mm_srai_epi16(step3[25], 2);
-        step3[26] = _mm_srai_epi16(step3[26], 2);
-        step3[27] = _mm_srai_epi16(step3[27], 2);
-        step3[28] = _mm_srai_epi16(step3[28], 2);
-        step3[29] = _mm_srai_epi16(step3[29], 2);
-        step3[30] = _mm_srai_epi16(step3[30], 2);
-        step3[31] = _mm_srai_epi16(step3[31], 2);
-      }
-#endif
-
-#if FDCT32x32_HIGH_PRECISION
-      if (pass == 0) {
-#endif
       // Stage 4
       {
         step1[ 0] = _mm_add_epi16(step3[ 3], step3[ 0]);
@@ -1158,25 +1162,146 @@
         const __m128i mask16 = _mm_set1_epi32(0x80008000);
         const __m128i K32One = _mm_set_epi32(1, 1, 1, 1);
         // start using 32-bit operations
+        // Stage 3
+        {
+          // expanding to 32-bit length priori to addition operations
+          lstep2[ 0] = k_cvtlo_epi16(step2[ 0], mask16, kZero);
+          lstep2[ 1] = k_cvthi_epi16(step2[ 0], mask16, kZero);
+          lstep2[ 2] = k_cvtlo_epi16(step2[ 1], mask16, kZero);
+          lstep2[ 3] = k_cvthi_epi16(step2[ 1], mask16, kZero);
+          lstep2[ 4] = k_cvtlo_epi16(step2[ 2], mask16, kZero);
+          lstep2[ 5] = k_cvthi_epi16(step2[ 2], mask16, kZero);
+          lstep2[ 6] = k_cvtlo_epi16(step2[ 3], mask16, kZero);
+          lstep2[ 7] = k_cvthi_epi16(step2[ 3], mask16, kZero);
+          lstep2[ 8] = k_cvtlo_epi16(step2[ 4], mask16, kZero);
+          lstep2[ 9] = k_cvthi_epi16(step2[ 4], mask16, kZero);
+          lstep2[10] = k_cvtlo_epi16(step2[ 5], mask16, kZero);
+          lstep2[11] = k_cvthi_epi16(step2[ 5], mask16, kZero);
+          lstep2[12] = k_cvtlo_epi16(step2[ 6], mask16, kZero);
+          lstep2[13] = k_cvthi_epi16(step2[ 6], mask16, kZero);
+          lstep2[14] = k_cvtlo_epi16(step2[ 7], mask16, kZero);
+          lstep2[15] = k_cvthi_epi16(step2[ 7], mask16, kZero);
+
+          lstep3[ 0] = _mm_add_epi32(lstep2[14], lstep2[ 0]);
+          lstep3[ 1] = _mm_add_epi32(lstep2[15], lstep2[ 1]);
+          lstep3[ 2] = _mm_add_epi32(lstep2[12], lstep2[ 2]);
+          lstep3[ 3] = _mm_add_epi32(lstep2[13], lstep2[ 3]);
+          lstep3[ 4] = _mm_add_epi32(lstep2[10], lstep2[ 4]);
+          lstep3[ 5] = _mm_add_epi32(lstep2[11], lstep2[ 5]);
+          lstep3[ 6] = _mm_add_epi32(lstep2[ 8], lstep2[ 6]);
+          lstep3[ 7] = _mm_add_epi32(lstep2[ 9], lstep2[ 7]);
+          lstep3[ 8] = _mm_sub_epi32(lstep2[ 6], lstep2[ 8]);
+          lstep3[ 9] = _mm_sub_epi32(lstep2[ 7], lstep2[ 9]);
+          lstep3[10] = _mm_sub_epi32(lstep2[ 4], lstep2[10]);
+          lstep3[11] = _mm_sub_epi32(lstep2[ 5], lstep2[11]);
+          lstep3[12] = _mm_sub_epi32(lstep2[ 2], lstep2[12]);
+          lstep3[13] = _mm_sub_epi32(lstep2[ 3], lstep2[13]);
+          lstep3[14] = _mm_sub_epi32(lstep2[ 0], lstep2[14]);
+          lstep3[15] = _mm_sub_epi32(lstep2[ 1], lstep2[15]);
+        }
+        {
+          const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
+          const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
+          const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
+          const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
+          const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
+          const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
+          const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
+          const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
+          const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
+          const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
+          const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
+          const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
+          // dct_const_round_shift
+          const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
+          const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
+          const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
+          const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
+          const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
+          const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
+          const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
+          const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
+          lstep3[20] = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
+          lstep3[21] = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
+          lstep3[22] = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
+          lstep3[23] = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
+          lstep3[24] = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
+          lstep3[25] = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
+          lstep3[26] = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
+          lstep3[27] = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
+        }
+        {
+          lstep2[40] = k_cvtlo_epi16(step2[20], mask16, kZero);
+          lstep2[41] = k_cvthi_epi16(step2[20], mask16, kZero);
+          lstep2[42] = k_cvtlo_epi16(step2[21], mask16, kZero);
+          lstep2[43] = k_cvthi_epi16(step2[21], mask16, kZero);
+          lstep2[44] = k_cvtlo_epi16(step2[22], mask16, kZero);
+          lstep2[45] = k_cvthi_epi16(step2[22], mask16, kZero);
+          lstep2[46] = k_cvtlo_epi16(step2[23], mask16, kZero);
+          lstep2[47] = k_cvthi_epi16(step2[23], mask16, kZero);
+          lstep2[48] = k_cvtlo_epi16(step2[24], mask16, kZero);
+          lstep2[49] = k_cvthi_epi16(step2[24], mask16, kZero);
+          lstep2[50] = k_cvtlo_epi16(step2[25], mask16, kZero);
+          lstep2[51] = k_cvthi_epi16(step2[25], mask16, kZero);
+          lstep2[52] = k_cvtlo_epi16(step2[26], mask16, kZero);
+          lstep2[53] = k_cvthi_epi16(step2[26], mask16, kZero);
+          lstep2[54] = k_cvtlo_epi16(step2[27], mask16, kZero);
+          lstep2[55] = k_cvthi_epi16(step2[27], mask16, kZero);
+
+          lstep1[32] = k_cvtlo_epi16(step1[16], mask16, kZero);
+          lstep1[33] = k_cvthi_epi16(step1[16], mask16, kZero);
+          lstep1[34] = k_cvtlo_epi16(step1[17], mask16, kZero);
+          lstep1[35] = k_cvthi_epi16(step1[17], mask16, kZero);
+          lstep1[36] = k_cvtlo_epi16(step1[18], mask16, kZero);
+          lstep1[37] = k_cvthi_epi16(step1[18], mask16, kZero);
+          lstep1[38] = k_cvtlo_epi16(step1[19], mask16, kZero);
+          lstep1[39] = k_cvthi_epi16(step1[19], mask16, kZero);
+          lstep1[56] = k_cvtlo_epi16(step1[28], mask16, kZero);
+          lstep1[57] = k_cvthi_epi16(step1[28], mask16, kZero);
+          lstep1[58] = k_cvtlo_epi16(step1[29], mask16, kZero);
+          lstep1[59] = k_cvthi_epi16(step1[29], mask16, kZero);
+          lstep1[60] = k_cvtlo_epi16(step1[30], mask16, kZero);
+          lstep1[61] = k_cvthi_epi16(step1[30], mask16, kZero);
+          lstep1[62] = k_cvtlo_epi16(step1[31], mask16, kZero);
+          lstep1[63] = k_cvthi_epi16(step1[31], mask16, kZero);
+
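+          // butterflies pairing the widened step1[16..31] and step2[20..27]
+          // halves.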
+          lstep3[32] = _mm_add_epi32(lstep2[46], lstep1[32]);
+          lstep3[33] = _mm_add_epi32(lstep2[47], lstep1[33]);
+          lstep3[34] = _mm_add_epi32(lstep2[44], lstep1[34]);
+          lstep3[35] = _mm_add_epi32(lstep2[45], lstep1[35]);
+          lstep3[36] = _mm_add_epi32(lstep2[42], lstep1[36]);
+          lstep3[37] = _mm_add_epi32(lstep2[43], lstep1[37]);
+          lstep3[38] = _mm_add_epi32(lstep2[40], lstep1[38]);
+          lstep3[39] = _mm_add_epi32(lstep2[41], lstep1[39]);
+          lstep3[40] = _mm_sub_epi32(lstep1[38], lstep2[40]);
+          lstep3[41] = _mm_sub_epi32(lstep1[39], lstep2[41]);
+          lstep3[42] = _mm_sub_epi32(lstep1[36], lstep2[42]);
+          lstep3[43] = _mm_sub_epi32(lstep1[37], lstep2[43]);
+          lstep3[44] = _mm_sub_epi32(lstep1[34], lstep2[44]);
+          lstep3[45] = _mm_sub_epi32(lstep1[35], lstep2[45]);
+          lstep3[46] = _mm_sub_epi32(lstep1[32], lstep2[46]);
+          lstep3[47] = _mm_sub_epi32(lstep1[33], lstep2[47]);
+          lstep3[48] = _mm_sub_epi32(lstep1[62], lstep2[48]);
+          lstep3[49] = _mm_sub_epi32(lstep1[63], lstep2[49]);
+          lstep3[50] = _mm_sub_epi32(lstep1[60], lstep2[50]);
+          lstep3[51] = _mm_sub_epi32(lstep1[61], lstep2[51]);
+          lstep3[52] = _mm_sub_epi32(lstep1[58], lstep2[52]);
+          lstep3[53] = _mm_sub_epi32(lstep1[59], lstep2[53]);
+          lstep3[54] = _mm_sub_epi32(lstep1[56], lstep2[54]);
+          lstep3[55] = _mm_sub_epi32(lstep1[57], lstep2[55]);
+          lstep3[56] = _mm_add_epi32(lstep2[54], lstep1[56]);
+          lstep3[57] = _mm_add_epi32(lstep2[55], lstep1[57]);
+          lstep3[58] = _mm_add_epi32(lstep2[52], lstep1[58]);
+          lstep3[59] = _mm_add_epi32(lstep2[53], lstep1[59]);
+          lstep3[60] = _mm_add_epi32(lstep2[50], lstep1[60]);
+          lstep3[61] = _mm_add_epi32(lstep2[51], lstep1[61]);
+          lstep3[62] = _mm_add_epi32(lstep2[48], lstep1[62]);
+          lstep3[63] = _mm_add_epi32(lstep2[49], lstep1[63]);
+        }
+
         // stage 4
         {
          // expanding to 32-bit length prior to addition operations
-          lstep3[ 0] = k_cvtlo_epi16(step3[ 0], mask16, kZero);
-          lstep3[ 1] = k_cvthi_epi16(step3[ 0], mask16, kZero);
-          lstep3[ 2] = k_cvtlo_epi16(step3[ 1], mask16, kZero);
-          lstep3[ 3] = k_cvthi_epi16(step3[ 1], mask16, kZero);
-          lstep3[ 4] = k_cvtlo_epi16(step3[ 2], mask16, kZero);
-          lstep3[ 5] = k_cvthi_epi16(step3[ 2], mask16, kZero);
-          lstep3[ 6] = k_cvtlo_epi16(step3[ 3], mask16, kZero);
-          lstep3[ 7] = k_cvthi_epi16(step3[ 3], mask16, kZero);
-          lstep3[20] = k_cvtlo_epi16(step3[10], mask16, kZero);
-          lstep3[21] = k_cvthi_epi16(step3[10], mask16, kZero);
-          lstep3[22] = k_cvtlo_epi16(step3[11], mask16, kZero);
-          lstep3[23] = k_cvthi_epi16(step3[11], mask16, kZero);
-          lstep3[24] = k_cvtlo_epi16(step3[12], mask16, kZero);
-          lstep3[25] = k_cvthi_epi16(step3[12], mask16, kZero);
-          lstep3[26] = k_cvtlo_epi16(step3[13], mask16, kZero);
-          lstep3[27] = k_cvthi_epi16(step3[13], mask16, kZero);
           lstep2[16] = k_cvtlo_epi16(step2[ 8], mask16, kZero);
           lstep2[17] = k_cvthi_epi16(step2[ 8], mask16, kZero);
           lstep2[18] = k_cvtlo_epi16(step2[ 9], mask16, kZero);
@@ -1212,88 +1337,150 @@
           lstep1[31] = _mm_add_epi32(lstep3[25], lstep2[31]);
         }
         {
-          const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]);
-          const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]);
-          const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16);
-          const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16);
-          const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16);
-          const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16);
-          // dct_const_round_shift
-          const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
-          lstep1[10] = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS);
-          lstep1[11] = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS);
-          lstep1[12] = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS);
-          lstep1[13] = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS);
+          // 32-bit version of the cospi_16_64 butterfly: computes
+          // lstep1[10..13] from the widened lstep3[10..13].
+          const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
+          const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
+
+          u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]);
+          u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]);
+          u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]);
+          u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]);
+
+          // TODO(jingning): manually inline k_madd_epi32 to further hide
+          // instruction latency.
+          v[ 0] = k_madd_epi32(u[0], k32_p16_m16);
+          v[ 1] = k_madd_epi32(u[1], k32_p16_m16);
+          v[ 2] = k_madd_epi32(u[2], k32_p16_m16);
+          v[ 3] = k_madd_epi32(u[3], k32_p16_m16);
+          v[ 4] = k_madd_epi32(u[0], k32_p16_p16);
+          v[ 5] = k_madd_epi32(u[1], k32_p16_p16);
+          v[ 6] = k_madd_epi32(u[2], k32_p16_p16);
+          v[ 7] = k_madd_epi32(u[3], k32_p16_p16);
+
+          u[0] = k_packs_epi64(v[0], v[1]);
+          u[1] = k_packs_epi64(v[2], v[3]);
+          u[2] = k_packs_epi64(v[4], v[5]);
+          u[3] = k_packs_epi64(v[6], v[7]);
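+
+          // pack the 64-bit multiply-add sums back into 32-bit lanes.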
+
+          v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+          v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+          v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+          v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+
+          lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+          lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+          lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+          lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
         }
         {
-          const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]);
-          const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]);
-          const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]);
-          const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]);
-          const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]);
-          const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]);
-          const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]);
-          const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]);
-          const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24);
-          const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24);
-          const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24);
-          const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24);
-          const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08);
-          const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08);
-          const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08);
-          const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08);
-          const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24);
-          const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24);
-          const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24);
-          const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24);
-          const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08);
-          const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08);
-          const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08);
-          const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08);
-          // dct_const_round_shift
-          const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
-          lstep1[36] = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS);
-          lstep1[37] = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS);
-          lstep1[38] = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS);
-          lstep1[39] = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS);
-          lstep1[40] = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS);
-          lstep1[41] = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS);
-          lstep1[42] = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS);
-          lstep1[43] = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS);
-          lstep1[52] = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS);
-          lstep1[53] = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS);
-          lstep1[54] = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS);
-          lstep1[55] = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS);
-          lstep1[56] = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS);
-          lstep1[57] = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS);
-          lstep1[58] = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS);
-          lstep1[59] = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS);
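+          // 32-bit counterparts of the 16-bit cospi pair constants removed
+          // above.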
+          const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
+          const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64);
+          const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
+
+          u[ 0] = _mm_unpacklo_epi32(lstep3[36], lstep3[58]);
+          u[ 1] = _mm_unpackhi_epi32(lstep3[36], lstep3[58]);
+          u[ 2] = _mm_unpacklo_epi32(lstep3[37], lstep3[59]);
+          u[ 3] = _mm_unpackhi_epi32(lstep3[37], lstep3[59]);
+          u[ 4] = _mm_unpacklo_epi32(lstep3[38], lstep3[56]);
+          u[ 5] = _mm_unpackhi_epi32(lstep3[38], lstep3[56]);
+          u[ 6] = _mm_unpacklo_epi32(lstep3[39], lstep3[57]);
+          u[ 7] = _mm_unpackhi_epi32(lstep3[39], lstep3[57]);
+          u[ 8] = _mm_unpacklo_epi32(lstep3[40], lstep3[54]);
+          u[ 9] = _mm_unpackhi_epi32(lstep3[40], lstep3[54]);
+          u[10] = _mm_unpacklo_epi32(lstep3[41], lstep3[55]);
+          u[11] = _mm_unpackhi_epi32(lstep3[41], lstep3[55]);
+          u[12] = _mm_unpacklo_epi32(lstep3[42], lstep3[52]);
+          u[13] = _mm_unpackhi_epi32(lstep3[42], lstep3[52]);
+          u[14] = _mm_unpacklo_epi32(lstep3[43], lstep3[53]);
+          u[15] = _mm_unpackhi_epi32(lstep3[43], lstep3[53]);
+
+          v[ 0] = k_madd_epi32(u[ 0], k32_m08_p24);
+          v[ 1] = k_madd_epi32(u[ 1], k32_m08_p24);
+          v[ 2] = k_madd_epi32(u[ 2], k32_m08_p24);
+          v[ 3] = k_madd_epi32(u[ 3], k32_m08_p24);
+          v[ 4] = k_madd_epi32(u[ 4], k32_m08_p24);
+          v[ 5] = k_madd_epi32(u[ 5], k32_m08_p24);
+          v[ 6] = k_madd_epi32(u[ 6], k32_m08_p24);
+          v[ 7] = k_madd_epi32(u[ 7], k32_m08_p24);
+          v[ 8] = k_madd_epi32(u[ 8], k32_m24_m08);
+          v[ 9] = k_madd_epi32(u[ 9], k32_m24_m08);
+          v[10] = k_madd_epi32(u[10], k32_m24_m08);
+          v[11] = k_madd_epi32(u[11], k32_m24_m08);
+          v[12] = k_madd_epi32(u[12], k32_m24_m08);
+          v[13] = k_madd_epi32(u[13], k32_m24_m08);
+          v[14] = k_madd_epi32(u[14], k32_m24_m08);
+          v[15] = k_madd_epi32(u[15], k32_m24_m08);
+          v[16] = k_madd_epi32(u[12], k32_m08_p24);
+          v[17] = k_madd_epi32(u[13], k32_m08_p24);
+          v[18] = k_madd_epi32(u[14], k32_m08_p24);
+          v[19] = k_madd_epi32(u[15], k32_m08_p24);
+          v[20] = k_madd_epi32(u[ 8], k32_m08_p24);
+          v[21] = k_madd_epi32(u[ 9], k32_m08_p24);
+          v[22] = k_madd_epi32(u[10], k32_m08_p24);
+          v[23] = k_madd_epi32(u[11], k32_m08_p24);
+          v[24] = k_madd_epi32(u[ 4], k32_p24_p08);
+          v[25] = k_madd_epi32(u[ 5], k32_p24_p08);
+          v[26] = k_madd_epi32(u[ 6], k32_p24_p08);
+          v[27] = k_madd_epi32(u[ 7], k32_p24_p08);
+          v[28] = k_madd_epi32(u[ 0], k32_p24_p08);
+          v[29] = k_madd_epi32(u[ 1], k32_p24_p08);
+          v[30] = k_madd_epi32(u[ 2], k32_p24_p08);
+          v[31] = k_madd_epi32(u[ 3], k32_p24_p08);
+
+          u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
+          u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
+          u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
+          u[ 3] = k_packs_epi64(v[ 6], v[ 7]);
+          u[ 4] = k_packs_epi64(v[ 8], v[ 9]);
+          u[ 5] = k_packs_epi64(v[10], v[11]);
+          u[ 6] = k_packs_epi64(v[12], v[13]);
+          u[ 7] = k_packs_epi64(v[14], v[15]);
+          u[ 8] = k_packs_epi64(v[16], v[17]);
+          u[ 9] = k_packs_epi64(v[18], v[19]);
+          u[10] = k_packs_epi64(v[20], v[21]);
+          u[11] = k_packs_epi64(v[22], v[23]);
+          u[12] = k_packs_epi64(v[24], v[25]);
+          u[13] = k_packs_epi64(v[26], v[27]);
+          u[14] = k_packs_epi64(v[28], v[29]);
+          u[15] = k_packs_epi64(v[30], v[31]);
+
+          v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+          v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+          v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+          v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+          v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+          v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+          v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+          v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+          v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+          v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+          v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+          v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+          v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+          v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+          v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+          v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+          lstep1[36] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS);
+          lstep1[37] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS);
+          lstep1[38] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS);
+          lstep1[39] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS);
+          lstep1[40] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS);
+          lstep1[41] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS);
+          lstep1[42] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS);
+          lstep1[43] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS);
+          lstep1[52] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS);
+          lstep1[53] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS);
+          lstep1[54] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+          lstep1[55] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+          lstep1[56] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+          lstep1[57] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+          lstep1[58] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+          lstep1[59] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
         }
         // stage 5
         {
-          lstep3[ 8] = k_cvtlo_epi16(step3[4], mask16, kZero);
-          lstep3[ 9] = k_cvthi_epi16(step3[4], mask16, kZero);
-          lstep3[14] = k_cvtlo_epi16(step3[7], mask16, kZero);
-          lstep3[15] = k_cvthi_epi16(step3[7], mask16, kZero);
-
           lstep2[ 8] = _mm_add_epi32(lstep1[10], lstep3[ 8]);
           lstep2[ 9] = _mm_add_epi32(lstep1[11], lstep3[ 9]);
           lstep2[10] = _mm_sub_epi32(lstep3[ 8], lstep1[10]);
@@ -1465,23 +1652,6 @@
           lstep2[29] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
         }
         {
-          lstep3[32] = k_cvtlo_epi16(step3[16], mask16, kZero);
-          lstep3[33] = k_cvthi_epi16(step3[16], mask16, kZero);
-          lstep3[34] = k_cvtlo_epi16(step3[17], mask16, kZero);
-          lstep3[35] = k_cvthi_epi16(step3[17], mask16, kZero);
-          lstep3[44] = k_cvtlo_epi16(step3[22], mask16, kZero);
-          lstep3[45] = k_cvthi_epi16(step3[22], mask16, kZero);
-          lstep3[46] = k_cvtlo_epi16(step3[23], mask16, kZero);
-          lstep3[47] = k_cvthi_epi16(step3[23], mask16, kZero);
-          lstep3[48] = k_cvtlo_epi16(step3[24], mask16, kZero);
-          lstep3[49] = k_cvthi_epi16(step3[24], mask16, kZero);
-          lstep3[50] = k_cvtlo_epi16(step3[25], mask16, kZero);
-          lstep3[51] = k_cvthi_epi16(step3[25], mask16, kZero);
-          lstep3[60] = k_cvtlo_epi16(step3[30], mask16, kZero);
-          lstep3[61] = k_cvthi_epi16(step3[30], mask16, kZero);
-          lstep3[62] = k_cvtlo_epi16(step3[31], mask16, kZero);
-          lstep3[63] = k_cvthi_epi16(step3[31], mask16, kZero);
-
           lstep2[32] = _mm_add_epi32(lstep1[38], lstep3[32]);
           lstep2[33] = _mm_add_epi32(lstep1[39], lstep3[33]);
           lstep2[34] = _mm_add_epi32(lstep1[36], lstep3[34]);
diff --git a/vp9/encoder/x86/vp9_quantize_ssse3.asm b/vp9/encoder/x86/vp9_quantize_ssse3.asm
index 60f7991..7deb981 100644
--- a/vp9/encoder/x86/vp9_quantize_ssse3.asm
+++ b/vp9/encoder/x86/vp9_quantize_ssse3.asm
@@ -36,6 +36,14 @@
   pshufd                          m4, m4, 0
   mova                            m2, [quantq]             ; m2 = quant
   paddw                           m0, m4                   ; m0 = zbin + zbin_oq
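+  ; 32x32 blocks: halve zbin/round (rounding up) and double the shift later,
+  ; instead of doubling every coefficient in the quantize loops.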
+%ifidn %1, b_32x32
+  pcmpeqw                         m5, m5                   ; m5 = all ones
+  psrlw                           m5, 15                   ; m5 = 1 in each word
+  paddw                           m0, m5
+  paddw                           m1, m5
+  psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2
+  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
+%endif
   mova                            m3, [r2q]                ; m3 = dequant
   psubw                           m0, [pw_1]
   mov                             r2, shiftmp
@@ -43,6 +51,9 @@
   mova                            m4, [r2]                 ; m4 = shift
   mov                             r4, dqcoeffmp
   mov                             r5, iscanmp
+%ifidn %1, b_32x32
+  psllw                           m4, 1                    ; m4 = shift * 2
+%endif
   pxor                            m5, m5                   ; m5 = dedicated zero
   DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob
   lea                         coeffq, [  coeffq+ncoeffq*2]
@@ -56,10 +67,6 @@
   mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
   pabsw                           m6, m9                   ; m6 = abs(m9)
   pabsw                          m11, m10                  ; m11 = abs(m10)
-%ifidn %1, b_32x32
-  paddw                           m6, m6
-  paddw                          m11, m11
-%endif
   pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
   punpckhqdq                      m0, m0
   pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
@@ -112,10 +119,6 @@
   mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
   pabsw                           m6, m9                   ; m6 = abs(m9)
   pabsw                          m11, m10                  ; m11 = abs(m10)
-%ifidn %1, b_32x32
-  paddw                           m6, m6
-  paddw                          m11, m11
-%endif
   pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
   pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
 %ifidn %1, b_32x32
@@ -164,6 +167,7 @@
   pmaxsw                          m8, m13
   add                        ncoeffq, mmsize
   jl .ac_only_loop
+
 %ifidn %1, b_32x32
   jmp .accumulate_eob
 .skip_iter:
diff --git a/vp9/encoder/x86/vp9_subpel_variance.asm b/vp9/encoder/x86/vp9_subpel_variance.asm
index 19e2feb..533456b 100644
--- a/vp9/encoder/x86/vp9_subpel_variance.asm
+++ b/vp9/encoder/x86/vp9_subpel_variance.asm
@@ -270,8 +270,13 @@
 %if mmsize == 16
   movhps               m2, [srcq+src_strideq*2]
 %else ; mmsize == 8
+%if %1 == 4
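+  ; 4-wide blocks: load the second row into a scratch register, then
+  ; interleave.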
+  movh                 m1, [srcq+src_strideq*2]
+  punpckldq            m2, m1
+%else
   punpckldq            m2, [srcq+src_strideq*2]
 %endif
+%endif
   movh                 m1, [dstq]
 %if mmsize == 16
   movlhps              m0, m2
@@ -542,9 +547,16 @@
   movhps               m2, [srcq+src_strideq]
   movhps               m3, [srcq+src_strideq+1]
 %else
+%if %1 == 4
+  movh                 m1, [srcq+src_strideq]
+  punpckldq            m2, m1
+  movh                 m1, [srcq+src_strideq+1]
+  punpckldq            m3, m1
+%else
   punpckldq            m2, [srcq+src_strideq]
   punpckldq            m3, [srcq+src_strideq+1]
 %endif
+%endif
   pavgb                m2, m3
 %if mmsize == 16
   movlhps              m0, m2
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index d5692ef..fb302ab 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -98,7 +98,9 @@
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_dc_only_idct_add_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct4x4_1_add_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct4x4_add_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct8x8_1_add_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct8x8_add_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct16x16_1_add_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct16x16_add_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_mb_lpf_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_copy_neon$(ASM)