Merge "Reduced y_dequant, uv_dequant size" into experimental

diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index a8139cb..151a38b 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc

@@ -8,6 +8,10 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
 
 extern "C" {
 #include "./vpx_config.h"
@@ -16,10 +20,6 @@
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
 }
-#include "third_party/googletest/src/include/gtest/gtest.h"
-#include "test/acm_random.h"
-#include "test/register_state_check.h"
-#include "test/util.h"
 
 namespace {
 typedef void (*convolve_fn_t)(const uint8_t *src, int src_stride,
@@ -46,27 +46,27 @@
 // Reference 8-tap subpixel filter, slightly modified to fit into this test.
 #define VP9_FILTER_WEIGHT 128
 #define VP9_FILTER_SHIFT 7
-static uint8_t clip_pixel(int x) {
+uint8_t clip_pixel(int x) {
   return x < 0 ? 0 :
          x > 255 ? 255 :
          x;
 }
 
-static void filter_block2d_8_c(const uint8_t *src_ptr,
-                               const unsigned int src_stride,
-                               const int16_t *HFilter,
-                               const int16_t *VFilter,
-                               uint8_t *dst_ptr,
-                               unsigned int dst_stride,
-                               unsigned int output_width,
-                               unsigned int output_height) {
+void filter_block2d_8_c(const uint8_t *src_ptr,
+                        const unsigned int src_stride,
+                        const int16_t *HFilter,
+                        const int16_t *VFilter,
+                        uint8_t *dst_ptr,
+                        unsigned int dst_stride,
+                        unsigned int output_width,
+                        unsigned int output_height) {
   // Between passes, we use an intermediate buffer whose height is extended to
   // have enough horizontally filtered values as input for the vertical pass.
   // This buffer is allocated to be big enough for the largest block type we
   // support.
   const int kInterp_Extend = 4;
   const unsigned int intermediate_height =
-    (kInterp_Extend - 1) + output_height + kInterp_Extend;
+      (kInterp_Extend - 1) + output_height + kInterp_Extend;
 
   /* Size of intermediate_buffer is max_intermediate_height * filter_max_width,
    * where max_intermediate_height = (kInterp_Extend - 1) + filter_max_height
@@ -87,15 +87,15 @@
     for (i = 0; i < intermediate_height; ++i) {
       for (j = 0; j < output_width; ++j) {
         // Apply filter...
-        int temp = ((int)src_ptr[0] * HFilter[0]) +
-                   ((int)src_ptr[1] * HFilter[1]) +
-                   ((int)src_ptr[2] * HFilter[2]) +
-                   ((int)src_ptr[3] * HFilter[3]) +
-                   ((int)src_ptr[4] * HFilter[4]) +
-                   ((int)src_ptr[5] * HFilter[5]) +
-                   ((int)src_ptr[6] * HFilter[6]) +
-                   ((int)src_ptr[7] * HFilter[7]) +
-                   (VP9_FILTER_WEIGHT >> 1);  // Rounding
+        const int temp = (src_ptr[0] * HFilter[0]) +
+                         (src_ptr[1] * HFilter[1]) +
+                         (src_ptr[2] * HFilter[2]) +
+                         (src_ptr[3] * HFilter[3]) +
+                         (src_ptr[4] * HFilter[4]) +
+                         (src_ptr[5] * HFilter[5]) +
+                         (src_ptr[6] * HFilter[6]) +
+                         (src_ptr[7] * HFilter[7]) +
+                         (VP9_FILTER_WEIGHT >> 1);  // Rounding
 
         // Normalize back to 0-255...
         *output_ptr = clip_pixel(temp >> VP9_FILTER_SHIFT);
@@ -115,15 +115,15 @@
     for (i = 0; i < output_height; ++i) {
       for (j = 0; j < output_width; ++j) {
         // Apply filter...
-        int temp = ((int)src_ptr[0] * VFilter[0]) +
-                   ((int)src_ptr[1] * VFilter[1]) +
-                   ((int)src_ptr[2] * VFilter[2]) +
-                   ((int)src_ptr[3] * VFilter[3]) +
-                   ((int)src_ptr[4] * VFilter[4]) +
-                   ((int)src_ptr[5] * VFilter[5]) +
-                   ((int)src_ptr[6] * VFilter[6]) +
-                   ((int)src_ptr[7] * VFilter[7]) +
-                   (VP9_FILTER_WEIGHT >> 1);  // Rounding
+        const int temp = (src_ptr[0] * VFilter[0]) +
+                         (src_ptr[1] * VFilter[1]) +
+                         (src_ptr[2] * VFilter[2]) +
+                         (src_ptr[3] * VFilter[3]) +
+                         (src_ptr[4] * VFilter[4]) +
+                         (src_ptr[5] * VFilter[5]) +
+                         (src_ptr[6] * VFilter[6]) +
+                         (src_ptr[7] * VFilter[7]) +
+                         (VP9_FILTER_WEIGHT >> 1);  // Rounding
 
         // Normalize back to 0-255...
         *dst_ptr++ = clip_pixel(temp >> VP9_FILTER_SHIFT);
@@ -135,12 +135,12 @@
   }
 }
 
-static void block2d_average_c(uint8_t *src,
-                              unsigned int src_stride,
-                              uint8_t *output_ptr,
-                              unsigned int output_stride,
-                              unsigned int output_width,
-                              unsigned int output_height) {
+void block2d_average_c(uint8_t *src,
+                       unsigned int src_stride,
+                       uint8_t *output_ptr,
+                       unsigned int output_stride,
+                       unsigned int output_width,
+                       unsigned int output_height) {
   unsigned int i, j;
   for (i = 0; i < output_height; ++i) {
     for (j = 0; j < output_width; ++j) {
@@ -150,15 +150,15 @@
   }
 }
 
-static void filter_average_block2d_8_c(const uint8_t *src_ptr,
-                                       const unsigned int src_stride,
-                                       const int16_t *HFilter,
-                                       const int16_t *VFilter,
-                                       uint8_t *dst_ptr,
-                                       unsigned int dst_stride,
-                                       unsigned int output_width,
-                                       unsigned int output_height) {
-  uint8_t tmp[64*64];
+void filter_average_block2d_8_c(const uint8_t *src_ptr,
+                                const unsigned int src_stride,
+                                const int16_t *HFilter,
+                                const int16_t *VFilter,
+                                uint8_t *dst_ptr,
+                                unsigned int dst_stride,
+                                unsigned int output_width,
+                                unsigned int output_height) {
+  uint8_t tmp[64 * 64];
 
   assert(output_width <= 64);
   assert(output_height <= 64);
@@ -173,10 +173,9 @@
   static void SetUpTestCase() {
     // Force input_ to be unaligned, output to be 16 byte aligned.
     input_ = reinterpret_cast<uint8_t*>(
-        vpx_memalign(kDataAlignment, kOuterBlockSize * kOuterBlockSize + 1))
-        + 1;
+        vpx_memalign(kDataAlignment, kInputBufferSize + 1)) + 1;
     output_ = reinterpret_cast<uint8_t*>(
-        vpx_memalign(kDataAlignment, kOuterBlockSize * kOuterBlockSize));
+        vpx_memalign(kDataAlignment, kOutputBufferSize));
   }
 
   static void TearDownTestCase() {
@@ -186,62 +185,63 @@
     output_ = NULL;
   }
 
-  protected:
-    static const int kDataAlignment = 16;
-    static const int kOuterBlockSize = 128;
-    static const int kInputStride = kOuterBlockSize;
-    static const int kOutputStride = kOuterBlockSize;
-    static const int kMaxDimension = 64;
+ protected:
+  static const int kDataAlignment = 16;
+  static const int kOuterBlockSize = 128;
+  static const int kInputStride = kOuterBlockSize;
+  static const int kOutputStride = kOuterBlockSize;
+  static const int kMaxDimension = 64;
+  static const int kInputBufferSize = kOuterBlockSize * kOuterBlockSize;
+  static const int kOutputBufferSize = kOuterBlockSize * kOuterBlockSize;
 
-    int Width() const { return GET_PARAM(0); }
-    int Height() const { return GET_PARAM(1); }
-    int BorderLeft() const {
-      const int center = (kOuterBlockSize - Width()) / 2;
-      return (center + (kDataAlignment - 1)) & ~(kDataAlignment - 1);
-    }
-    int BorderTop() const { return (kOuterBlockSize - Height()) / 2; }
+  int Width() const { return GET_PARAM(0); }
+  int Height() const { return GET_PARAM(1); }
+  int BorderLeft() const {
+    const int center = (kOuterBlockSize - Width()) / 2;
+    return (center + (kDataAlignment - 1)) & ~(kDataAlignment - 1);
+  }
+  int BorderTop() const { return (kOuterBlockSize - Height()) / 2; }
 
-    bool IsIndexInBorder(int i) {
-      return (i < BorderTop() * kOuterBlockSize ||
-              i >= (BorderTop() + Height()) * kOuterBlockSize ||
-              i % kOuterBlockSize < BorderLeft() ||
-              i % kOuterBlockSize >= (BorderLeft() + Width()));
+  bool IsIndexInBorder(int i) {
+    return (i < BorderTop() * kOuterBlockSize ||
+            i >= (BorderTop() + Height()) * kOuterBlockSize ||
+            i % kOuterBlockSize < BorderLeft() ||
+            i % kOuterBlockSize >= (BorderLeft() + Width()));
+  }
+
+  virtual void SetUp() {
+    UUT_ = GET_PARAM(2);
+    /* Set up guard blocks for an inner block centered in the outer block */
+    for (int i = 0; i < kOutputBufferSize; ++i) {
+      if (IsIndexInBorder(i))
+        output_[i] = 255;
+      else
+        output_[i] = 0;
     }
 
-    virtual void SetUp() {
-      UUT_ = GET_PARAM(2);
-      memset(input_, 0, sizeof(input_));
-      /* Set up guard blocks for an inner block cetered in the outer block */
-      for (int i = 0; i < kOuterBlockSize * kOuterBlockSize; ++i) {
-        if (IsIndexInBorder(i))
-          output_[i] = 255;
-        else
-          output_[i] = 0;
-      }
+    ::libvpx_test::ACMRandom prng;
+    for (int i = 0; i < kInputBufferSize; ++i)
+      input_[i] = prng.Rand8Extremes();
+  }
 
-      ::libvpx_test::ACMRandom prng;
-      for (int i = 0; i < kOuterBlockSize * kOuterBlockSize; ++i)
-        input_[i] = prng.Rand8Extremes();
+  void CheckGuardBlocks() {
+    for (int i = 0; i < kOutputBufferSize; ++i) {
+      if (IsIndexInBorder(i))
+        EXPECT_EQ(255, output_[i]);
     }
+  }
 
-    void CheckGuardBlocks() {
-      for (int i = 0; i < kOuterBlockSize * kOuterBlockSize; ++i) {
-        if (IsIndexInBorder(i))
-          EXPECT_EQ(255, output_[i]);
-      }
-    }
+  uint8_t* input() const {
+    return input_ + BorderTop() * kOuterBlockSize + BorderLeft();
+  }
 
-    uint8_t* input() {
-      return input_ + BorderTop() * kOuterBlockSize + BorderLeft();
-    }
+  uint8_t* output() const {
+    return output_ + BorderTop() * kOuterBlockSize + BorderLeft();
+  }
 
-    uint8_t* output() {
-      return output_ + BorderTop() * kOuterBlockSize + BorderLeft();
-    }
-
-    const ConvolveFunctions* UUT_;
-    static uint8_t* input_;
-    static uint8_t* output_;
+  const ConvolveFunctions* UUT_;
+  static uint8_t* input_;
+  static uint8_t* output_;
 };
 uint8_t* ConvolveTest::input_ = NULL;
 uint8_t* ConvolveTest::output_ = NULL;
@@ -309,7 +309,7 @@
   vp9_sub_pel_filters_8lp
 };
 const int kNumFilterBanks = sizeof(kTestFilterList) /
-    sizeof(kTestFilterList[0]);
+                            sizeof(kTestFilterList[0]);
 const int kNumFilters = 16;
 
 TEST(ConvolveTest, FiltersWontSaturateWhenAddedPairwise) {
@@ -525,7 +525,6 @@
     make_tuple(64, 32, &convolve8_c),
     make_tuple(32, 64, &convolve8_c),
     make_tuple(64, 64, &convolve8_c)));
-}
 
 #if HAVE_SSSE3
 const ConvolveFunctions convolve8_ssse3(
@@ -548,3 +547,4 @@
     make_tuple(32, 64, &convolve8_ssse3),
     make_tuple(64, 64, &convolve8_ssse3)));
 #endif
+}  // namespace

diff --git a/test/superframe_test.cc b/test/superframe_test.cc
index 52faddb..062ec6c 100644
--- a/test/superframe_test.cc
+++ b/test/superframe_test.cc

@@ -30,7 +30,7 @@
   }
 
   virtual void TearDown() {
-    delete modified_buf_;
+    delete[] modified_buf_;
   }
 
   virtual bool Continue() const {
@@ -59,7 +59,7 @@
         buffer[pkt->data.frame.sz - index_sz] == marker) {
       // frame is a superframe. strip off the index.
       if (modified_buf_)
-        delete modified_buf_;
+        delete[] modified_buf_;
       modified_buf_ = new uint8_t[pkt->data.frame.sz - index_sz];
       memcpy(modified_buf_, pkt->data.frame.buf,
              pkt->data.frame.sz - index_sz);

diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 426699e..7d323eed 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h

@@ -453,7 +453,7 @@
 static INLINE void update_partition_context(MACROBLOCKD *xd,
                                             BLOCK_SIZE_TYPE sb_type,
                                             BLOCK_SIZE_TYPE sb_size) {
-  int bsl = mi_width_log2(sb_size), bs = 1 << bsl;
+  int bsl = mi_width_log2(sb_size), bs;
   int bwl = mi_width_log2(sb_type);
   int bhl = mi_height_log2(sb_type);
   int boffset = mi_width_log2(BLOCK_SIZE_SB64X64) - bsl;
@@ -462,6 +462,12 @@
   if (bsl == 0)
     return;
 
+#if CONFIG_SB8X8
+  bs = 1 << (bsl - 1);
+#else
+  bs = 1 << bsl;
+#endif
+
   // update the partition context at the end notes. set partition bits
   // of block sizes larger than the current one to be one, and partition
   // bits of smaller block sizes to be zero.
@@ -492,14 +498,22 @@
 
 static INLINE int partition_plane_context(MACROBLOCKD *xd,
                                           BLOCK_SIZE_TYPE sb_type) {
-  int bsl = mi_width_log2(sb_type), bs = 1 << bsl;
+  int bsl = mi_width_log2(sb_type), bs;
   int above = 0, left = 0, i;
   int boffset = mi_width_log2(BLOCK_SIZE_SB64X64) - bsl;
 
   assert(mi_width_log2(sb_type) == mi_height_log2(sb_type));
   assert(bsl >= 0);
   assert(boffset >= 0);
 
+  // Iteration bound for the context loops below. Computed exactly once, and
+  // only after the sanity asserts on bsl/boffset have run.
+#if CONFIG_SB8X8
+  bs = 1 << (bsl - 1);
+#else
+  bs = 1 << bsl;
+#endif
+
   for (i = 0; i < bs; i++)
     above |= (xd->above_seg_context[i] & (1 << boffset));
   for (i = 0; i < bs; i++)
@@ -511,6 +529,60 @@
   return (left * 2 + above) + (bsl - 1) * PARTITION_PLOFFSET;
 }
 
+// Returns the block size that results from applying |partition| to a block
+// of size |bsize|. PARTITION_NONE yields |bsize| itself. For a
+// (bsize, partition) pair not handled below, the function asserts in debug
+// builds and otherwise returns |bsize| unchanged.
+static INLINE BLOCK_SIZE_TYPE get_subsize(BLOCK_SIZE_TYPE bsize,
+                                          PARTITION_TYPE partition) {
+  // Initialized so the result is defined even when assert() is compiled out.
+  BLOCK_SIZE_TYPE subsize = bsize;
+  switch (partition) {
+    case PARTITION_NONE:
+      subsize = bsize;
+      break;
+    case PARTITION_HORZ:
+      if (bsize == BLOCK_SIZE_SB64X64)
+        subsize = BLOCK_SIZE_SB64X32;
+      else if (bsize == BLOCK_SIZE_SB32X32)
+        subsize = BLOCK_SIZE_SB32X16;
+#if CONFIG_SB8X8
+      else if (bsize == BLOCK_SIZE_MB16X16)
+        subsize = BLOCK_SIZE_SB16X8;
+#endif
+      else
+        assert(0);
+      break;
+    case PARTITION_VERT:
+      if (bsize == BLOCK_SIZE_SB64X64)
+        subsize = BLOCK_SIZE_SB32X64;
+      else if (bsize == BLOCK_SIZE_SB32X32)
+        subsize = BLOCK_SIZE_SB16X32;
+#if CONFIG_SB8X8
+      else if (bsize == BLOCK_SIZE_MB16X16)
+        subsize = BLOCK_SIZE_SB8X16;
+#endif
+      else
+        assert(0);
+      break;
+    case PARTITION_SPLIT:
+      if (bsize == BLOCK_SIZE_SB64X64)
+        subsize = BLOCK_SIZE_SB32X32;
+      else if (bsize == BLOCK_SIZE_SB32X32)
+        subsize = BLOCK_SIZE_MB16X16;
+#if CONFIG_SB8X8
+      else if (bsize == BLOCK_SIZE_MB16X16)
+        subsize = BLOCK_SIZE_SB8X8;
+#endif
+      else
+        assert(0);
+      break;
+    default:
+      assert(0);
+  }
+  return subsize;
+}
+
 #define ACTIVE_HT   110                // quantization stepsize threshold
 
 #define ACTIVE_HT8  300

diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index c8dd6eb..87879ea 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h

@@ -18,6 +18,7 @@
 #include "vp9/common/vp9_entropymv.h"
 #include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_quant_common.h"
 
 #if CONFIG_POSTPROC
 #include "vp9/common/vp9_postproc.h"
@@ -31,13 +32,6 @@
 
 void vp9_initialize_common(void);
 
-#define MINQ 0
-
-#define MAXQ 255
-#define QINDEX_BITS 8
-
-#define QINDEX_RANGE (MAXQ + 1)
-
 #if CONFIG_MULTIPLE_ARF
 #define NUM_REF_FRAMES 8
 #define NUM_REF_FRAMES_LG2 3

diff --git a/vp9/common/vp9_quant_common.c b/vp9/common/vp9_quant_common.c
index 2e9e4ca..5907b4f 100644
--- a/vp9/common/vp9_quant_common.c
+++ b/vp9/common/vp9_quant_common.c

@@ -10,6 +10,7 @@
 
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_seg_common.h"
 
 static int16_t dc_qlookup[QINDEX_RANGE];
 static int16_t ac_qlookup[QINDEX_RANGE];
@@ -44,3 +45,21 @@
 int16_t vp9_ac_quant(int qindex, int delta) {
   return ac_qlookup[clamp(qindex + delta, 0, MAXQ)];
 }
+
+// Returns the quantizer index for |segment_id|. When the segment has
+// SEG_LVL_ALT_Q active, its segment data is either returned as-is
+// (SEGMENT_ABSDATA) or added to |base_qindex| and passed through clamp()
+// with bounds 0 and MAXQ; otherwise |base_qindex| is returned unchanged.
+int vp9_get_qindex(MACROBLOCKD *xd, int segment_id, int base_qindex) {
+  int seg_q;
+
+  if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_ALT_Q))
+    return base_qindex;
+
+  seg_q = vp9_get_segdata(xd, segment_id, SEG_LVL_ALT_Q);
+  if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA)
+    return seg_q;  // Abs value
+
+  return clamp(base_qindex + seg_q, 0, MAXQ);  // Delta value
+}
+

diff --git a/vp9/common/vp9_quant_common.h b/vp9/common/vp9_quant_common.h
index 7daf15d..ded9426 100644
--- a/vp9/common/vp9_quant_common.h
+++ b/vp9/common/vp9_quant_common.h

@@ -12,11 +12,17 @@
 #define VP9_COMMON_VP9_QUANT_COMMON_H_
 
 #include "vp9/common/vp9_blockd.h"
-#include "vp9/common/vp9_onyxc_int.h"
+
+#define MINQ 0
+#define MAXQ 255
+#define QINDEX_RANGE (MAXQ - MINQ + 1)
+#define QINDEX_BITS 8
 
 void vp9_init_quant_tables();
 
 int16_t vp9_dc_quant(int qindex, int delta);
 int16_t vp9_ac_quant(int qindex, int delta);
 
+int vp9_get_qindex(MACROBLOCKD *mb, int segment_id, int base_qindex);
+
 #endif  // VP9_COMMON_VP9_QUANT_COMMON_H_

diff --git a/vp9/common/vp9_recon.c b/vp9/common/vp9_recon.c
index 4119450..4ab4f39 100644
--- a/vp9/common/vp9_recon.c
+++ b/vp9/common/vp9_recon.c

@@ -31,9 +31,10 @@
 void vp9_recon_b_c(uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr,
                    int stride) {
   assert(pred_ptr == dst_ptr);
-  recon(4, 4, diff_ptr, 16, dst_ptr, stride);
+  recon(4, 4, diff_ptr, 16 >> CONFIG_SB8X8, dst_ptr, stride);
 }
 
+#if !CONFIG_SB8X8
 void vp9_recon_uv_b_c(uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr,
                       int stride) {
   assert(pred_ptr == dst_ptr);
@@ -51,6 +52,7 @@
   assert(pred_ptr == dst_ptr);
   recon(4, 8, diff_ptr, 8, dst_ptr, stride);
 }
+#endif
 
 static void recon_plane(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize, int plane) {
   const int bw = 4 << (b_width_log2(bsize) - xd->plane[plane].subsampling_x);

diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c
index a0155d9..daeb6b5 100644
--- a/vp9/common/vp9_reconintra.c
+++ b/vp9/common/vp9_reconintra.c

@@ -545,34 +545,35 @@
 
 void vp9_build_intra_predictors_sby_s(MACROBLOCKD *xd,
                                       BLOCK_SIZE_TYPE bsize) {
-  const int bwl = mi_width_log2(bsize),  bw = MI_SIZE << bwl;
-  const int bhl = mi_height_log2(bsize), bh = MI_SIZE << bhl;
+  const int bwl = b_width_log2(bsize),  bw = 4 << bwl;
+  const int bhl = b_height_log2(bsize), bh = 4 << bhl;
 
   vp9_build_intra_predictors(xd->plane[0].dst.buf, xd->plane[0].dst.stride,
                              xd->plane[0].dst.buf, xd->plane[0].dst.stride,
                              xd->mode_info_context->mbmi.mode,
                              bw, bh,
                              xd->up_available, xd->left_available,
-                             xd->right_available);
+                             0 /*xd->right_available*/);
 }
 
 void vp9_build_intra_predictors_sbuv_s(MACROBLOCKD *xd,
                                        BLOCK_SIZE_TYPE bsize) {
-  const int bwl = mi_width_log2(bsize), bw = MI_UV_SIZE << bwl;
-  const int bhl = mi_height_log2(bsize), bh = MI_UV_SIZE << bhl;
+  const int bwl = b_width_log2(bsize), bw = 2 << bwl;
+  const int bhl = b_height_log2(bsize), bh = 2 << bhl;
 
   vp9_build_intra_predictors(xd->plane[1].dst.buf, xd->plane[1].dst.stride,
                              xd->plane[1].dst.buf, xd->plane[1].dst.stride,
                              xd->mode_info_context->mbmi.uv_mode,
                              bw, bh, xd->up_available,
-                             xd->left_available, xd->right_available);
+                             xd->left_available, 0 /*xd->right_available*/);
   vp9_build_intra_predictors(xd->plane[2].dst.buf, xd->plane[1].dst.stride,
                              xd->plane[2].dst.buf, xd->plane[1].dst.stride,
                              xd->mode_info_context->mbmi.uv_mode,
                              bw, bh, xd->up_available,
-                             xd->left_available, xd->right_available);
+                             xd->left_available, 0 /*xd->right_available*/);
 }
 
+#if !CONFIG_SB8X8
 void vp9_intra8x8_predict(MACROBLOCKD *xd,
                           int block4x4_idx,
                           int mode,
@@ -587,14 +588,18 @@
                              mode, 8, 8, have_top, have_left,
                              have_right);
 }
+#endif
 #if !CONFIG_NEWBINTRAMODES
 void vp9_intra4x4_predict(MACROBLOCKD *xd,
                           int block_idx,
                           int mode,
                           uint8_t *predictor, int pre_stride) {
-  const int have_top = (block_idx >> 2) || xd->up_available;
-  const int have_left = (block_idx & 3) || xd->left_available;
-  const int have_right = ((block_idx & 3) != 3);
+  const int have_top =
+      (block_idx >> (2 >> CONFIG_SB8X8)) || xd->up_available;
+  const int have_left =
+      (block_idx & (3 >> CONFIG_SB8X8)) || xd->left_available;
+  const int have_right =
+      ((block_idx & (3 >> CONFIG_SB8X8)) != (3 >> CONFIG_SB8X8));
 
   vp9_build_intra_predictors(predictor, pre_stride,
                              predictor, pre_stride,
@@ -602,6 +607,7 @@
                              have_right);
 }
 #endif
+#if !CONFIG_SB8X8
 void vp9_intra_uv4x4_predict(MACROBLOCKD *xd,
                              int block4x4_idx,
                              int mode,
@@ -616,3 +622,4 @@
                              mode, 4, 4, have_top, have_left,
                              have_right);
 }
+#endif

diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 745cc69..b1acc04 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh

@@ -63,6 +63,8 @@
 prototype void vp9_recon_b "uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr, int stride"
 specialize vp9_recon_b
 
+if [ "$CONFIG_SB8X8" != "yes" ]; then
+
 prototype void vp9_recon_uv_b "uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr, int stride"
 specialize vp9_recon_uv_b
 
@@ -76,6 +78,8 @@
 specialize vp9_recon4b
 # specialize vp9_recon4b sse2
 
+fi
+
 prototype void vp9_recon_sb "struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize"
 specialize vp9_recon_sb
 
@@ -97,12 +101,16 @@
 prototype void vp9_intra4x4_predict "struct macroblockd *xd, int block, int b_mode, uint8_t *predictor, int pre_stride"
 specialize vp9_intra4x4_predict;
 
+if [ "$CONFIG_SB8X8" != "yes" ]; then
+
 prototype void vp9_intra8x8_predict "struct macroblockd *xd, int block, int b_mode, uint8_t *predictor, int pre_stride"
 specialize vp9_intra8x8_predict;
 
 prototype void vp9_intra_uv4x4_predict "struct macroblockd *xd, int block, int b_mode, uint8_t *predictor, int pre_stride"
 specialize vp9_intra_uv4x4_predict;
 
+fi
+
 if [ "$CONFIG_VP9_DECODER" = "yes" ]; then
 prototype void vp9_add_residual_4x4 "const int16_t *diff, uint8_t *dest, int stride"
 specialize vp9_add_residual_4x4 sse2
@@ -342,6 +350,12 @@
 vp9_variance8x8_sse2=vp9_variance8x8_wmt
 vp9_variance8x8_mmx=vp9_variance8x8_mmx
 
+prototype unsigned int vp9_variance8x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance8x4
+
+prototype unsigned int vp9_variance4x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance4x8
+
 prototype unsigned int vp9_variance4x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_variance4x4 mmx sse2
 vp9_variance4x4_sse2=vp9_variance4x4_wmt

diff --git a/vp9/common/vp9_seg_common.c b/vp9/common/vp9_seg_common.c
index 9ed3e2d..02b7053 100644
--- a/vp9/common/vp9_seg_common.c
+++ b/vp9/common/vp9_seg_common.c

@@ -107,4 +107,10 @@
 }
 #endif
 
+const vp9_tree_index vp9_segment_tree[14] = {  // 14 entries = 7 pairs
+  2,  4,  6,  8, 10, 12,          // presumably child-pair indices; confirm
+  0, -1, -2, -3, -4, -5, -6, -7   // presumably leaves: -(segment id); confirm
+};
+
+
 // TBD? Functions to read and write segment data with range / validity checking

diff --git a/vp9/common/vp9_seg_common.h b/vp9/common/vp9_seg_common.h
index 4550dd1..53d22a3 100644
--- a/vp9/common/vp9_seg_common.h
+++ b/vp9/common/vp9_seg_common.h

@@ -59,5 +59,8 @@
 void vp9_implicit_segment_map_update(VP9_COMMON * cm);
 #endif
 
+
+extern const vp9_tree_index vp9_segment_tree[14];
+
 #endif  // VP9_COMMON_VP9_SEG_COMMON_H_
 

diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 474250c..f1b214d 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c

@@ -76,23 +76,7 @@
 }
 
 static int read_mb_segid(vp9_reader *r, MACROBLOCKD *xd) {
-  const vp9_prob *const p = xd->mb_segment_tree_probs;
-  int ret_val;
-
-  if (vp9_read(r, p[0])) {
-    if (vp9_read(r, p[4])) {
-      ret_val = 6 + vp9_read(r, p[6]);
-    } else {
-      ret_val = 4 + vp9_read(r, p[5]);
-    }
-  } else {
-    if (vp9_read(r, p[1])) {
-      ret_val = 2 + vp9_read(r, p[3]);
-    } else {
-      ret_val = vp9_read(r, p[2]);
-    }
-  }
-  return ret_val;
+  return treed_read(r, vp9_segment_tree, xd->mb_segment_tree_probs);
 }
 
 static void set_segment_id(VP9_COMMON *cm, MB_MODE_INFO *mbmi,
@@ -146,7 +130,11 @@
     m->mbmi.mb_skip_coeff = vp9_read(r, vp9_get_pred_prob(cm, xd, PRED_MBSKIP));
 
   // luma mode
+#if CONFIG_SB8X8
+  m->mbmi.mode = m->mbmi.sb_type > BLOCK_SIZE_SB8X8 ?
+#else
   m->mbmi.mode = m->mbmi.sb_type > BLOCK_SIZE_MB16X16 ?
+#endif
       read_kf_sb_ymode(r, cm->sb_kf_ymode_prob[cm->kf_ymode_probs_index]):
       read_kf_mb_ymode(r, cm->kf_ymode_prob[cm->kf_ymode_probs_index]);
 
@@ -154,11 +142,10 @@
 
   if (m->mbmi.mode == I4X4_PRED) {
     int i;
-    for (i = 0; i < 16; ++i) {
+    for (i = 0; i < (16 >> (2 * CONFIG_SB8X8)); ++i) {
       const B_PREDICTION_MODE a = above_block_mode(m, i, mis);
       const B_PREDICTION_MODE l = xd->left_available || (i & 3) ?
                                   left_block_mode(m, i) : B_DC_PRED;
-
       m->bmi[i].as_mode.first = read_kf_bmode(r, cm->kf_bmode_prob[a][l]);
     }
   }
@@ -202,7 +189,11 @@
   } else if (cm->txfm_mode >= ALLOW_32X32 &&
              m->mbmi.sb_type >= BLOCK_SIZE_SB32X32) {
     m->mbmi.txfm_size = TX_32X32;
-  } else if (cm->txfm_mode >= ALLOW_16X16 && m->mbmi.mode <= TM_PRED) {
+  } else if (cm->txfm_mode >= ALLOW_16X16 &&
+#if CONFIG_SB8X8
+             m->mbmi.sb_type >= BLOCK_SIZE_MB16X16 &&
+#endif
+             m->mbmi.mode <= TM_PRED) {
     m->mbmi.txfm_size = TX_16X16;
   } else if (cm->txfm_mode >= ALLOW_8X8 && m->mbmi.mode != I4X4_PRED) {
     m->mbmi.txfm_size = TX_8X8;
@@ -685,7 +676,12 @@
       if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_SKIP)) {
         mbmi->mode = ZEROMV;
       } else {
-        mbmi->mode = mbmi->sb_type > BLOCK_SIZE_MB16X16 ?
+        mbmi->mode =
+#if CONFIG_SB8X8
+                     mbmi->sb_type > BLOCK_SIZE_SB8X8 ?
+#else
+                     mbmi->sb_type > BLOCK_SIZE_MB16X16 ?
+#endif
                                      read_sb_mv_ref(r, mv_ref_p)
                                    : read_mv_ref(r, mv_ref_p);
         vp9_accum_mv_refs(cm, mbmi->mode, mbmi->mb_mode_context[ref_frame]);
@@ -875,7 +871,11 @@
           }
           */
 
-#if !CONFIG_SB8X8
+#if CONFIG_SB8X8
+          mi->bmi[j].as_mv[0].as_int = blockmv.as_int;
+          if (mbmi->second_ref_frame > 0)
+            mi->bmi[j].as_mv[1].as_int = secondmv.as_int;
+#else
           {
             /* Fill (uniform) modes, mvs of jth subset.
              Must do it here because ensuing subsets can
@@ -961,7 +961,12 @@
     // required for left and above block mv
     mv0->as_int = 0;
 
-    if (mbmi->sb_type > BLOCK_SIZE_MB16X16) {
+#if CONFIG_SB8X8
+    if (mbmi->sb_type > BLOCK_SIZE_SB8X8)
+#else
+    if (mbmi->sb_type > BLOCK_SIZE_MB16X16)
+#endif
+    {
       mbmi->mode = read_sb_ymode(r, cm->fc.sb_ymode_prob);
       cm->fc.sb_ymode_counts[mbmi->mode]++;
     } else {
@@ -979,7 +984,7 @@
         if (m == B_CONTEXT_PRED) m -= CONTEXT_PRED_REPLACEMENTS;
 #endif
         cm->fc.bmode_counts[m]++;
-      } while (++j < 16);
+      } while (++j < (16 >> (2 * CONFIG_SB8X8)));
     }
 
 #if !CONFIG_SB8X8

diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index 522ea9f..c9b65b6 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c

@@ -177,22 +177,10 @@
   }
 }
 
-static int get_qindex(MACROBLOCKD *mb, int segment_id, int base_qindex) {
-  // Set the Q baseline allowing for any segment level adjustment
-  if (vp9_segfeature_active(mb, segment_id, SEG_LVL_ALT_Q)) {
-    const int data = vp9_get_segdata(mb, segment_id, SEG_LVL_ALT_Q);
-    return mb->mb_segment_abs_delta == SEGMENT_ABSDATA ?
-               data :  // Abs value
-               clamp(base_qindex + data, 0, MAXQ);  // Delta value
-  } else {
-    return base_qindex;
-  }
-}
-
 static void mb_init_dequantizer(VP9_COMMON *pc, MACROBLOCKD *xd) {
   int i;
   const int segment_id = xd->mode_info_context->mbmi.segment_id;
-  xd->q_index = get_qindex(xd, segment_id, pc->base_qindex);
+  xd->q_index = vp9_get_qindex(xd, segment_id, pc->base_qindex);
 
   xd->plane[0].dequant = pc->y_dequant[xd->q_index];
   for (i = 1; i < MAX_MB_PLANE; i++)
@@ -251,9 +239,10 @@
 }
 #endif
 
-static INLINE void dequant_add_y(MACROBLOCKD *xd, TX_TYPE tx_type, int idx) {
+static INLINE void dequant_add_y(MACROBLOCKD *xd, TX_TYPE tx_type, int idx,
+                                 BLOCK_SIZE_TYPE bsize) {
   struct macroblockd_plane *const y = &xd->plane[0];
-  uint8_t* const dst = raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, idx,
+  uint8_t* const dst = raster_block_offset_uint8(xd, bsize, 0, idx,
                                                  xd->plane[0].dst.buf,
                                                  xd->plane[0].dst.stride);
   if (tx_type != DCT_DCT) {
@@ -284,7 +273,7 @@
     vp9_intra8x8_predict(xd, ib, i8x8mode, dst, xd->plane[0].dst.stride);
     for (j = 0; j < 4; j++) {
       tx_type = get_tx_type_4x4(xd, ib + iblock[j]);
-      dequant_add_y(xd, tx_type, ib + iblock[j]);
+      dequant_add_y(xd, tx_type, ib + iblock[j], BLOCK_SIZE_MB16X16);
     }
     dst = raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 1, i,
                                     xd->plane[1].dst.buf,
@@ -368,7 +357,7 @@
     vp9_intra4x4_predict(xd, i, b_mode, dst, xd->plane[0].dst.stride);
     // TODO(jingning): refactor to use foreach_transformed_block_in_plane_
     tx_type = get_tx_type_4x4(xd, i);
-    dequant_add_y(xd, tx_type, i);
+    dequant_add_y(xd, tx_type, i, bsize);
   }
 #if CONFIG_NEWBINTRAMODES
   if (!xd->mode_info_context->mbmi.mb_skip_coeff)
@@ -572,10 +561,12 @@
   set_refs(pbi, mi_row, mi_col);
 
 #if CONFIG_SB8X8
-  if (bsize >= BLOCK_SIZE_SB8X8)
-    decode_sb(pbi, xd, mi_row, mi_col, r, bsize);
+  if (bsize == BLOCK_SIZE_SB8X8 &&
+      (xd->mode_info_context->mbmi.mode == SPLITMV ||
+       xd->mode_info_context->mbmi.mode == I4X4_PRED))
+    decode_atom(pbi, xd, mi_row, mi_col, r, bsize);
   else
-    decode_atom(pbi, xd, mi_row, mi_col, r, BLOCK_SIZE_SB8X8);
+    decode_sb(pbi, xd, mi_row, mi_col, r, bsize);
 #else
   // TODO(jingning): merge decode_sb_ and decode_mb_
   if (bsize > BLOCK_SIZE_MB16X16) {
@@ -612,7 +603,11 @@
   if (mi_row >= pc->mi_rows || mi_col >= pc->mi_cols)
     return;
 
+#if CONFIG_SB8X8
+  if (bsize > BLOCK_SIZE_SB8X8) {
+#else
   if (bsize > BLOCK_SIZE_MB16X16) {
+#endif
     int pl;
     // read the partition information
     xd->left_seg_context =
@@ -624,34 +619,35 @@
     pc->fc.partition_counts[pl][partition]++;
   }
 
+  subsize = get_subsize(bsize, partition);
   switch (partition) {
     case PARTITION_NONE:
-      subsize = bsize;
       decode_modes_b(pbi, mi_row, mi_col, r, subsize);
       break;
     case PARTITION_HORZ:
-      subsize = (bsize == BLOCK_SIZE_SB64X64) ? BLOCK_SIZE_SB64X32 :
-                                                BLOCK_SIZE_SB32X16;
       decode_modes_b(pbi, mi_row, mi_col, r, subsize);
       if ((mi_row + bs) < pc->mi_rows)
         decode_modes_b(pbi, mi_row + bs, mi_col, r, subsize);
       break;
     case PARTITION_VERT:
-      subsize = (bsize == BLOCK_SIZE_SB64X64) ? BLOCK_SIZE_SB32X64 :
-                                                BLOCK_SIZE_SB16X32;
       decode_modes_b(pbi, mi_row, mi_col, r, subsize);
       if ((mi_col + bs) < pc->mi_cols)
         decode_modes_b(pbi, mi_row, mi_col + bs, r, subsize);
       break;
     case PARTITION_SPLIT:
-      subsize = (bsize == BLOCK_SIZE_SB64X64) ? BLOCK_SIZE_SB32X32 :
-                                                BLOCK_SIZE_MB16X16;
       for (n = 0; n < 4; n++) {
         int j = n >> 1, i = n & 0x01;
         if (subsize == BLOCK_SIZE_SB32X32)
           xd->sb_index = n;
+#if CONFIG_SB8X8
+        else if (subsize == BLOCK_SIZE_MB16X16)
+          xd->mb_index = n;
+        else
+          xd->b_index = n;
+#else
         else
           xd->mb_index = n;
+#endif
         decode_modes_sb(pbi, mi_row + j * bs, mi_col + i * bs, r, subsize);
       }
       break;
@@ -659,7 +655,11 @@
       assert(0);
   }
   // update partition context
+#if CONFIG_SB8X8
+  if ((partition == PARTITION_SPLIT) && (bsize > BLOCK_SIZE_MB16X16))
+#else
   if ((partition == PARTITION_SPLIT) && (bsize > BLOCK_SIZE_SB32X32))
+#endif
     return;
 
   xd->left_seg_context = pc->left_seg_context + ((mi_row >> CONFIG_SB8X8) & 3);

diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 3c0bab2..7eb35da 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c

@@ -512,60 +512,9 @@
 // It should only be called if a segment map update is indicated.
 static void write_mb_segid(vp9_writer *bc,
                            const MB_MODE_INFO *mi, const MACROBLOCKD *xd) {
-  // Encode the MB segment id.
-  int seg_id = mi->segment_id;
-
-  if (xd->segmentation_enabled && xd->update_mb_segmentation_map) {
-    switch (seg_id) {
-      case 0:
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[0]);
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[1]);
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[2]);
-        break;
-      case 1:
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[0]);
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[1]);
-        vp9_write(bc, 1, xd->mb_segment_tree_probs[2]);
-        break;
-      case 2:
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[0]);
-        vp9_write(bc, 1, xd->mb_segment_tree_probs[1]);
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[3]);
-        break;
-      case 3:
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[0]);
-        vp9_write(bc, 1, xd->mb_segment_tree_probs[1]);
-        vp9_write(bc, 1, xd->mb_segment_tree_probs[3]);
-        break;
-      case 4:
-        vp9_write(bc, 1, xd->mb_segment_tree_probs[0]);
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[4]);
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[5]);
-        break;
-      case 5:
-        vp9_write(bc, 1, xd->mb_segment_tree_probs[0]);
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[4]);
-        vp9_write(bc, 1, xd->mb_segment_tree_probs[5]);
-        break;
-      case 6:
-        vp9_write(bc, 1, xd->mb_segment_tree_probs[0]);
-        vp9_write(bc, 1, xd->mb_segment_tree_probs[4]);
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[6]);
-        break;
-      case 7:
-        vp9_write(bc, 1, xd->mb_segment_tree_probs[0]);
-        vp9_write(bc, 1, xd->mb_segment_tree_probs[4]);
-        vp9_write(bc, 1, xd->mb_segment_tree_probs[6]);
-        break;
-
-        // TRAP.. This should not happen
-      default:
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[0]);
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[1]);
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[2]);
-        break;
-    }
-  }
+  if (xd->segmentation_enabled && xd->update_mb_segmentation_map)  // else no-op
+    treed_write(bc, vp9_segment_tree, xd->mb_segment_tree_probs,  // segment id
+                mi->segment_id, 3);  // 3: presumably bits per id; TODO confirm
 }
 
 // This function encodes the reference frame
@@ -722,7 +671,11 @@
     active_section = 6;
 #endif
 
+#if CONFIG_SB8X8
+    if (m->mbmi.sb_type > BLOCK_SIZE_SB8X8)
+#else
     if (m->mbmi.sb_type > BLOCK_SIZE_MB16X16)
+#endif
       write_sb_ymode(bc, mode, pc->fc.sb_ymode_prob);
     else
       write_ymode(bc, mode, pc->fc.ymode_prob);
@@ -761,7 +714,11 @@
 
     // If segment skip is not enabled code the mode.
     if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {
+#if CONFIG_SB8X8
+      if (mi->sb_type > BLOCK_SIZE_SB8X8) {
+#else
       if (mi->sb_type > BLOCK_SIZE_MB16X16) {
+#endif
         write_sb_mv_ref(bc, mode, mv_ref_p);
       } else {
         write_mv_ref(bc, mode, mv_ref_p);
@@ -944,7 +901,11 @@
     vp9_write(bc, skip_coeff, vp9_get_pred_prob(c, xd, PRED_MBSKIP));
   }
 
+#if CONFIG_SB8X8
+  if (m->mbmi.sb_type > BLOCK_SIZE_SB8X8)
+#else
   if (m->mbmi.sb_type > BLOCK_SIZE_MB16X16)
+#endif
     sb_kfwrite_ymode(bc, ym, c->sb_kf_ymode_prob[c->kf_ymode_probs_index]);
   else
     kfwrite_ymode(bc, ym, c->kf_ymode_prob[c->kf_ymode_probs_index]);
@@ -960,7 +921,6 @@
 #ifdef ENTROPY_STATS
       ++intra_mode_stats [A] [L] [bm];
 #endif
-
       write_kf_bmode(bc, bm, c->kf_bmode_prob[a][l]);
     } while (++i < (16 >> (CONFIG_SB8X8 * 2)));
   }
@@ -1210,7 +1170,11 @@
   else
     assert(0);
 
+#if CONFIG_SB8X8
+  if (bsize > BLOCK_SIZE_SB8X8) {
+#else
   if (bsize > BLOCK_SIZE_MB16X16) {
+#endif
     int pl;
     xd->left_seg_context =
         cm->left_seg_context + ((mi_row >> CONFIG_SB8X8) & 3);
@@ -1221,34 +1185,23 @@
                 vp9_partition_encodings + partition);
   }
 
+  subsize = get_subsize(bsize, partition);
+
   switch (partition) {
     case PARTITION_NONE:
-      subsize = bsize;
       write_modes_b(cpi, m, bc, tok, tok_end, mi_row, mi_col);
       break;
     case PARTITION_HORZ:
-      subsize = (bsize == BLOCK_SIZE_SB64X64) ? BLOCK_SIZE_SB64X32 :
-                                                BLOCK_SIZE_SB32X16;
       write_modes_b(cpi, m, bc, tok, tok_end, mi_row, mi_col);
       if ((mi_row + bh) < cm->mi_rows)
         write_modes_b(cpi, m + bh * mis, bc, tok, tok_end, mi_row + bh, mi_col);
       break;
     case PARTITION_VERT:
-      subsize = (bsize == BLOCK_SIZE_SB64X64) ? BLOCK_SIZE_SB32X64 :
-                                                BLOCK_SIZE_SB16X32;
       write_modes_b(cpi, m, bc, tok, tok_end, mi_row, mi_col);
       if ((mi_col + bw) < cm->mi_cols)
         write_modes_b(cpi, m + bw, bc, tok, tok_end, mi_row, mi_col + bw);
       break;
     case PARTITION_SPLIT:
-      // TODO(jingning): support recursive partitioning down to 16x16 as for
-      // now. need to merge in 16x8, 8x16, 8x8, and smaller partitions.
-      if (bsize == BLOCK_SIZE_SB64X64)
-        subsize = BLOCK_SIZE_SB32X32;
-      else if (bsize == BLOCK_SIZE_SB32X32)
-        subsize = BLOCK_SIZE_MB16X16;
-      else
-        assert(0);
       for (n = 0; n < 4; n++) {
         int j = n >> 1, i = n & 0x01;
         write_modes_sb(cpi, m + j * bs * mis + i * bs, bc, tok, tok_end,
@@ -1260,7 +1213,11 @@
   }
 
   // update partition context
+#if CONFIG_SB8X8
+  if ((partition == PARTITION_SPLIT) && (bsize > BLOCK_SIZE_MB16X16))
+#else
   if ((partition == PARTITION_SPLIT) && (bsize > BLOCK_SIZE_SB32X32))
+#endif
     return;
 
   xd->left_seg_context = cm->left_seg_context + ((mi_row >> CONFIG_SB8X8) & 3);

diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 40ad680..83c1102 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h

@@ -29,7 +29,7 @@
     B_PREDICTION_MODE mode;
     int_mv mv;
     int_mv second_mv;
-  } bmi[16];
+  } bmi[16 >> (2 * CONFIG_SB8X8)];
 } PARTITION_INFO;
 
 // Structure to hold snapshot of coding context during the mode picking process

diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 95bba21..3537e27 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c

@@ -395,17 +395,6 @@
         x->partition_info->bmi[15 >> (CONFIG_SB8X8 * 2)].mv.as_int;
     mbmi->mv[1].as_int =
         x->partition_info->bmi[15 >> (CONFIG_SB8X8 * 2)].second_mv.as_int;
-#if CONFIG_SB8X8
-    vpx_memcpy(x->partition_info + mis, &ctx->partition_info,
-               sizeof(PARTITION_INFO));
-    vpx_memcpy(x->partition_info + 1, &ctx->partition_info,
-               sizeof(PARTITION_INFO));
-    vpx_memcpy(x->partition_info + mis + 1, &ctx->partition_info,
-               sizeof(PARTITION_INFO));
-    xd->mode_info_context[1].mbmi =
-    xd->mode_info_context[mis].mbmi =
-    xd->mode_info_context[1 + mis].mbmi = *mbmi;
-#endif
   }
 
   x->skip = ctx->skip;
@@ -499,12 +488,15 @@
       mbmi->best_mv.as_int = best_mv.as_int;
       mbmi->best_second_mv.as_int = best_second_mv.as_int;
       vp9_update_nmv_count(cpi, x, &best_mv, &best_second_mv);
-#if CONFIG_SB8X8
-      xd->mode_info_context[1].mbmi =
-      xd->mode_info_context[mis].mbmi =
-      xd->mode_info_context[1 + mis].mbmi = *mbmi;
-#endif
     }
+#if CONFIG_SB8X8
+    if (bsize > BLOCK_SIZE_SB8X8 && mbmi->mode == NEWMV) {
+      int i, j;
+      for (j = 0; j < bh; ++j)
+        for (i = 0; i < bw; ++i)
+          xd->mode_info_context[mis * j + i].mbmi = *mbmi;
+    }
+#endif
 #if CONFIG_COMP_INTERINTRA_PRED
     if (mbmi->mode >= NEARESTMV && mbmi->mode < SPLITMV &&
         mbmi->second_ref_frame <= INTRA_FRAME) {
@@ -1024,6 +1016,7 @@
       int j;
       ENTROPY_CONTEXT l2[8 * MAX_MB_PLANE], a2[8 * MAX_MB_PLANE];
 
+      sb_partitioning[i] = BLOCK_SIZE_MB16X16;
       if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
         continue;
 
@@ -1044,7 +1037,6 @@
       }
 
       /* Encode MBs in raster order within the SB */
-      sb_partitioning[i] = BLOCK_SIZE_MB16X16;
       for (j = 0; j < 4; j++) {
         const int x_idx_m = x_idx + ((j & 1) << CONFIG_SB8X8);
         const int y_idx_m = y_idx + ((j >> 1) << CONFIG_SB8X8);
@@ -1052,6 +1044,8 @@
 #if CONFIG_SB8X8
         int r2, d2, mb16_rate = 0, mb16_dist = 0, k;
         ENTROPY_CONTEXT l3[4 * MAX_MB_PLANE], a3[4 * MAX_MB_PLANE];
+
+        mb_partitioning[i][j] = BLOCK_SIZE_SB8X8;
 #endif
 
         if (mi_row + y_idx_m >= cm->mi_rows ||
@@ -1077,13 +1071,12 @@
                      sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x);
         }
 
-        mb_partitioning[i][j] = BLOCK_SIZE_SB8X8;
         for (k = 0; k < 4; k++) {
           xd->b_index = k;
 
           // try 8x8 coding
-          pick_sb_modes(cpi, mi_row + y_idx_m + (k & 1),
-                        mi_col + x_idx_m + (k >> 1),
+          pick_sb_modes(cpi, mi_row + y_idx_m + (k >> 1),
+                        mi_col + x_idx_m + (k & 1),
                         tp, &r, &d, BLOCK_SIZE_SB8X8,
                         &x->sb8_context[xd->sb_index][xd->mb_index]
                                        [xd->b_index]);
@@ -1093,7 +1086,8 @@
                                            [xd->b_index],
                        BLOCK_SIZE_SB8X8, 0);
           encode_superblock(cpi, tp,
-                            0, mi_row + y_idx_m, mi_col + x_idx_m,
+                            0, mi_row + y_idx_m + (k >> 1),
+                            mi_col + x_idx_m + (k & 1),
                             BLOCK_SIZE_SB8X8);
         }
         set_partition_seg_context(cpi, mi_row + y_idx_m, mi_col + x_idx_m);
@@ -1140,8 +1134,8 @@
         r2 += x->partition_cost[pl][PARTITION_VERT];
         if (RDCOST(x->rdmult, x->rddiv, r2, d2) <
                 RDCOST(x->rdmult, x->rddiv, mb16_rate, mb16_dist)) {
-          mb16_rate = r;
-          mb16_dist = d;
+          mb16_rate = r2;
+          mb16_dist = d2;
           mb_partitioning[i][j] = BLOCK_SIZE_SB8X16;
         }
         for (p = 0; p < MAX_MB_PLANE; p++) {
@@ -1185,8 +1179,8 @@
         r2 += x->partition_cost[pl][PARTITION_HORZ];
         if (RDCOST(x->rdmult, x->rddiv, r2, d2) <
                 RDCOST(x->rdmult, x->rddiv, mb16_rate, mb16_dist)) {
-          mb16_rate = r;
-          mb16_dist = d;
+          mb16_rate = r2;
+          mb16_dist = d2;
           mb_partitioning[i][j] = BLOCK_SIZE_SB16X8;
         }
         for (p = 0; p < MAX_MB_PLANE; p++) {
@@ -1226,7 +1220,7 @@
 
         // Dummy encode, do not do the tokenization
 #if CONFIG_SB8X8
-        encode_sb(cpi, tp, mi_row + y_idx, mi_col + x_idx, 0,
+        encode_sb(cpi, tp, mi_row + y_idx_m, mi_col + x_idx_m, 0,
                   BLOCK_SIZE_MB16X16, mb_partitioning[i][j], NULL, NULL);
 #else
         encode_macroblock(cpi, tp, 0, mi_row + y_idx_m,
@@ -2082,7 +2076,12 @@
 #endif
 #endif
 
-  if (xd->mode_info_context->mbmi.sb_type > BLOCK_SIZE_MB16X16) {
+#if CONFIG_SB8X8
+  if (xd->mode_info_context->mbmi.sb_type > BLOCK_SIZE_SB8X8)
+#else
+  if (xd->mode_info_context->mbmi.sb_type > BLOCK_SIZE_MB16X16)
+#endif
+  {
     ++cpi->sb_ymode_count[m];
   } else {
     ++cpi->ymode_count[m];
@@ -2441,7 +2440,8 @@
     vp9_subtract_sbuv(x, bsize);
     vp9_transform_sbuv_4x4(x, bsize);
     vp9_quantize_sbuv_4x4(x, bsize);
-    vp9_optimize_sbuv(cm, x, bsize);
+    if (x->optimize)
+      vp9_optimize_sbuv(cm, x, bsize);
     vp9_inverse_transform_sbuv_4x4(xd, bsize);
     vp9_recon_sbuv(xd, bsize);
 
@@ -2614,6 +2614,9 @@
         sz = TX_16X16;
       if (sz == TX_16X16 && bsize < BLOCK_SIZE_MB16X16)
         sz = TX_8X8;
+      if (sz == TX_8X8 && (xd->mode_info_context->mbmi.mode == SPLITMV ||
+                           xd->mode_info_context->mbmi.mode == I4X4_PRED))
+        sz = TX_4X4;
 
       for (y = 0; y < bh; y++) {
         for (x = 0; x < bw; x++) {

diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index ceca60d..738d6e6 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c

@@ -1647,6 +1647,12 @@
   BFP(BLOCK_8X8, vp9_sad8x8, vp9_variance8x8, vp9_sub_pixel_variance8x8,
       NULL, NULL, NULL, vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d)
 
+#if CONFIG_SB8X8
+  BFP(BLOCK_4X8, NULL, vp9_variance4x8, NULL,
+      NULL, NULL, NULL, NULL, NULL, NULL)
+  BFP(BLOCK_8X4, NULL, vp9_variance8x4, NULL,
+      NULL, NULL, NULL, NULL, NULL, NULL)
+#endif
   BFP(BLOCK_4X4, vp9_sad4x4, vp9_variance4x4, vp9_sub_pixel_variance4x4,
       NULL, NULL, NULL, vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
 

diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 541127e..9d1e984 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h

@@ -283,6 +283,8 @@
 enum BlockSize {
 #if CONFIG_SB8X8
   BLOCK_4X4,
+  BLOCK_4X8,
+  BLOCK_8X4,
   BLOCK_8X8,
   BLOCK_8X16,
   BLOCK_16X8,

diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index 5b401d7..6815289 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c

@@ -320,27 +320,10 @@
 
 void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) {
   int i;
-  int qindex;
   MACROBLOCKD *xd = &x->e_mbd;
   int zbin_extra;
   int segment_id = xd->mode_info_context->mbmi.segment_id;
-
-  // Select the baseline MB Q index allowing for any segment level change.
-  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_ALT_Q)) {
-    if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA) {
-      // Abs Value
-      qindex = vp9_get_segdata(xd, segment_id, SEG_LVL_ALT_Q);
-    } else {
-      // Delta Value
-      qindex = cpi->common.base_qindex +
-                 vp9_get_segdata(xd, segment_id, SEG_LVL_ALT_Q);
-
-      // Clamp to valid range
-      qindex = clamp(qindex, 0, MAXQ);
-    }
-  } else {
-    qindex = cpi->common.base_qindex;
-  }
+  const int qindex = vp9_get_qindex(xd, segment_id, cpi->common.base_qindex);
 
   // Y
   zbin_extra = (cpi->common.y_dequant[qindex][1] *

diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h
index fd7a4bb..fb74cbd 100644
--- a/vp9/encoder/vp9_quantize.h
+++ b/vp9/encoder/vp9_quantize.h

@@ -22,10 +22,6 @@
 #define prototype_quantize_mb(sym) \
   void (sym)(MACROBLOCK *x)
 
-#if ARCH_X86 || ARCH_X86_64
-#include "x86/vp9_quantize_x86.h"
-#endif
-
 void vp9_regular_quantize_b_4x4_pair(MACROBLOCK *mb, int b_idx1, int b_idx2,
                                      int y_blocks);
 void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,

diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index da78be1..f971d91 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c

@@ -13,8 +13,8 @@
 #include <math.h>
 #include <limits.h>
 #include <assert.h>
-#include "vp9/common/vp9_pragmas.h"
 
+#include "vp9/common/vp9_pragmas.h"
 #include "vp9/encoder/vp9_tokenize.h"
 #include "vp9/encoder/vp9_treewriter.h"
 #include "vp9/encoder/vp9_onyx_int.h"
@@ -34,7 +34,6 @@
 #include "vpx_mem/vpx_mem.h"
 #include "vp9/common/vp9_systemdependent.h"
 #include "vp9/encoder/vp9_encodemv.h"
-
 #include "vp9/common/vp9_seg_common.h"
 #include "vp9/common/vp9_pred_common.h"
 #include "vp9/common/vp9_entropy.h"
@@ -42,8 +41,6 @@
 #include "vp9/common/vp9_mvref_common.h"
 #include "vp9/common/vp9_common.h"
 
-#define MAXF(a,b)            (((a) > (b)) ? (a) : (b))
-
 #define INVALID_MV 0x80008000
 
 /* Factor to weigh the rate for switchable interp filters */
@@ -157,11 +154,9 @@
   for (i = 0; i < BLOCK_TYPES; i++)
     for (j = 0; j < REF_TYPES; j++)
       for (k = 0; k < COEF_BANDS; k++)
-        for (l = 0; l < PREV_COEF_CONTEXTS; l++) {
-          vp9_cost_tokens_skip((int *)(c[i][j][k][l]),
-                               p[i][j][k][l],
+        for (l = 0; l < PREV_COEF_CONTEXTS; l++)
+          vp9_cost_tokens_skip((int *)c[i][j][k][l], p[i][j][k][l],
                                vp9_coef_tree);
-        }
 }
 
 static int rd_iifactor[32] =  { 4, 4, 3, 2, 1, 0, 0, 0,
@@ -184,7 +179,7 @@
   for (i = 0; i < QINDEX_RANGE; i++) {
     sad_per_bit16lut[i] =
       (int)((0.0418 * vp9_convert_qindex_to_q(i)) + 2.4107);
-    sad_per_bit4lut[i] = (int)((0.063 * vp9_convert_qindex_to_q(i)) + 2.742);
+    sad_per_bit4lut[i] = (int)(0.063 * vp9_convert_qindex_to_q(i) + 2.742);
   }
 }
 
@@ -208,7 +203,7 @@
   // for key frames, golden frames and arf frames.
   // if (cpi->common.refresh_golden_frame ||
   //     cpi->common.refresh_alt_ref_frame)
-  qindex = (qindex < 0) ? 0 : ((qindex > MAXQ) ? MAXQ : qindex);
+  qindex = clamp(qindex, 0, MAXQ);
 
   cpi->RDMULT = compute_rd_mult(qindex);
   if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
@@ -1874,7 +1869,7 @@
           raster_block_offset_uint8(&x->e_mbd, BLOCK_SIZE_SB8X8, 0, n,
                                     x->plane[0].src.buf,
                                     x->plane[0].src.stride);
-          assert(((intptr_t)x->e_mbd.plane[0].pre[0].buf & 0xf) == 0);
+          assert(((intptr_t)x->e_mbd.plane[0].pre[0].buf & 0x7) == 0);
           x->e_mbd.plane[0].pre[0].buf =
           raster_block_offset_uint8(&x->e_mbd, BLOCK_SIZE_SB8X8, 0, n,
                                     x->e_mbd.plane[0].pre[0].buf,
@@ -2890,24 +2885,24 @@
       /* Use 8x8 result as 16x8/8x16's predictor MV. Adjust search range
        * according to the closeness of 2 MV. */
       /* block 8X16 */
-      sr = MAXF((abs(bsi.sv_mvp[0].as_mv.row - bsi.sv_mvp[2].as_mv.row)) >> 3,
-                (abs(bsi.sv_mvp[0].as_mv.col - bsi.sv_mvp[2].as_mv.col)) >> 3);
+      sr = MAX(abs(bsi.sv_mvp[0].as_mv.row - bsi.sv_mvp[2].as_mv.row) >> 3,
+               abs(bsi.sv_mvp[0].as_mv.col - bsi.sv_mvp[2].as_mv.col) >> 3);
       cal_step_param(sr, &bsi.sv_istep[0]);
 
-      sr = MAXF((abs(bsi.sv_mvp[1].as_mv.row - bsi.sv_mvp[3].as_mv.row)) >> 3,
-                (abs(bsi.sv_mvp[1].as_mv.col - bsi.sv_mvp[3].as_mv.col)) >> 3);
+      sr = MAX(abs(bsi.sv_mvp[1].as_mv.row - bsi.sv_mvp[3].as_mv.row) >> 3,
+               abs(bsi.sv_mvp[1].as_mv.col - bsi.sv_mvp[3].as_mv.col) >> 3);
       cal_step_param(sr, &bsi.sv_istep[1]);
 
       rd_check_segment(cpi, x, &bsi, PARTITIONING_8X16,
                        seg_mvs[PARTITIONING_8X16], txfm_cache);
 
       /* block 16X8 */
-      sr = MAXF((abs(bsi.sv_mvp[0].as_mv.row - bsi.sv_mvp[1].as_mv.row)) >> 3,
-                (abs(bsi.sv_mvp[0].as_mv.col - bsi.sv_mvp[1].as_mv.col)) >> 3);
+      sr = MAX(abs(bsi.sv_mvp[0].as_mv.row - bsi.sv_mvp[1].as_mv.row) >> 3,
+               abs(bsi.sv_mvp[0].as_mv.col - bsi.sv_mvp[1].as_mv.col) >> 3);
       cal_step_param(sr, &bsi.sv_istep[0]);
 
-      sr = MAXF((abs(bsi.sv_mvp[2].as_mv.row - bsi.sv_mvp[3].as_mv.row)) >> 3,
-                (abs(bsi.sv_mvp[2].as_mv.col - bsi.sv_mvp[3].as_mv.col)) >> 3);
+      sr = MAX(abs(bsi.sv_mvp[2].as_mv.row - bsi.sv_mvp[3].as_mv.row) >> 3,
+               abs(bsi.sv_mvp[2].as_mv.col - bsi.sv_mvp[3].as_mv.col) >> 3);
       cal_step_param(sr, &bsi.sv_istep[1]);
 
       rd_check_segment(cpi, x, &bsi, PARTITIONING_16X8,
@@ -3295,6 +3290,11 @@
     case BLOCK_32X16: return BLOCK_16X8;
     case BLOCK_16X32: return BLOCK_8X16;
     case BLOCK_16X16: return BLOCK_8X8;
+#if CONFIG_SB8X8
+    case BLOCK_16X8:  return BLOCK_8X4;
+    case BLOCK_8X16:  return BLOCK_4X8;
+    case BLOCK_8X8:   return BLOCK_4X4;
+#endif
     default:
       assert(0);
       return -1;
@@ -3310,6 +3310,11 @@
     case BLOCK_SIZE_SB32X16: return BLOCK_32X16;
     case BLOCK_SIZE_SB16X32: return BLOCK_16X32;
     case BLOCK_SIZE_MB16X16: return BLOCK_16X16;
+#if CONFIG_SB8X8
+    case BLOCK_SIZE_SB16X8:  return BLOCK_16X8;
+    case BLOCK_SIZE_SB8X16:  return BLOCK_8X16;
+    case BLOCK_SIZE_SB8X8:   return BLOCK_8X8;
+#endif
     default:
       assert(0);
       return -1;
@@ -4860,6 +4865,8 @@
                                              cpi->common.y_dc_delta_q);
 #if CONFIG_SB8X8
   int_mv seg_mvs[4][MAX_REF_FRAMES - 1];
+  union b_mode_info best_bmodes[4];
+  PARTITION_INFO best_partition;
 #endif
 
 #if CONFIG_SB8X8
@@ -4927,7 +4934,9 @@
   if (cpi->Speed == 0
       || (cpi->Speed > 0 && (ref_frame_mask & (1 << INTRA_FRAME)))) {
     mbmi->mode = DC_PRED;
-    for (i = 0; i <= ((bsize < BLOCK_SIZE_SB64X64) ? TX_16X16 : TX_32X32);
+    for (i = 0; i <= (bsize < BLOCK_SIZE_MB16X16 ? TX_4X4 :
+                      (bsize < BLOCK_SIZE_SB32X32 ? TX_8X8 :
+                       (bsize < BLOCK_SIZE_SB64X64 ? TX_16X16 : TX_32X32)));
          i++) {
       mbmi->txfm_size = i;
       rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_intra[i], &rate_uv_tokenonly[i],
@@ -5097,6 +5106,8 @@
                       bsize, txfm_cache);
 
       uv_tx = mbmi->txfm_size;
+      if (bsize < BLOCK_SIZE_MB16X16 && uv_tx == TX_8X8)
+        uv_tx = TX_4X4;
       if (bsize < BLOCK_SIZE_SB32X32 && uv_tx == TX_16X16)
         uv_tx = TX_8X8;
       else if (bsize < BLOCK_SIZE_SB64X64 && uv_tx == TX_32X32)
@@ -5397,6 +5408,15 @@
         *returndistortion = distortion2;
         best_rd = this_rd;
         vpx_memcpy(&best_mbmode, mbmi, sizeof(MB_MODE_INFO));
+#if CONFIG_SB8X8
+        vpx_memcpy(&best_partition, x->partition_info, sizeof(PARTITION_INFO));
+
+        if (this_mode == I4X4_PRED || this_mode == SPLITMV) {
+          for (i = 0; i < 4; i++) {
+            best_bmodes[i] = xd->mode_info_context->bmi[i];
+          }
+        }
+#endif
       }
 #if 0
       // Testing this mode gave rise to an improvement in best error score.
@@ -5563,7 +5583,28 @@
 
   // macroblock modes
   vpx_memcpy(mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
+#if CONFIG_SB8X8
+  if (best_mbmode.mode == I4X4_PRED) {
+    for (i = 0; i < 4; i++) {
+      xd->mode_info_context->bmi[i].as_mode = best_bmodes[i].as_mode;
+    }
+  }
 
+  if (best_mbmode.mode == SPLITMV) {
+    for (i = 0; i < 4; i++)
+      xd->mode_info_context->bmi[i].as_mv[0].as_int =
+          best_bmodes[i].as_mv[0].as_int;
+    if (mbmi->second_ref_frame > 0)
+      for (i = 0; i < 4; i++)
+        xd->mode_info_context->bmi[i].as_mv[1].as_int =
+            best_bmodes[i].as_mv[1].as_int;
+
+    vpx_memcpy(x->partition_info, &best_partition, sizeof(PARTITION_INFO));
+
+    mbmi->mv[0].as_int = x->partition_info->bmi[3].mv.as_int;
+    mbmi->mv[1].as_int = x->partition_info->bmi[3].second_mv.as_int;
+  }
+#endif
   for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
     if (best_pred_rd[i] == INT64_MAX)
       best_pred_diff[i] = INT_MIN;
@@ -5585,7 +5626,12 @@
  end:
   set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,
                     scale_factor);
-  store_coding_context(x, ctx, best_mode_index, NULL,
+  store_coding_context(x, ctx, best_mode_index,
+#if CONFIG_SB8X8
+                       &best_partition,
+#else
+                       NULL,
+#endif
                        &mbmi->ref_mvs[mbmi->ref_frame][0],
                        &mbmi->ref_mvs[mbmi->second_ref_frame < 0 ? 0 :
                                       mbmi->second_ref_frame][0],

diff --git a/vp9/encoder/vp9_segmentation.c b/vp9/encoder/vp9_segmentation.c
index e04980c..86af268 100644
--- a/vp9/encoder/vp9_segmentation.c
+++ b/vp9/encoder/vp9_segmentation.c

@@ -57,61 +57,57 @@
 }
 
 // Based on set of segment counts calculate a probability tree
-static void calc_segtree_probs(MACROBLOCKD *xd,
-                               int *segcounts,
+static void calc_segtree_probs(MACROBLOCKD *xd, int *segcounts,  // xd unused
                                vp9_prob *segment_tree_probs) {
   // Work out probabilities of each segment
-  segment_tree_probs[0] =
-    get_binary_prob(segcounts[0] + segcounts[1] + segcounts[2] + segcounts[3],
-                    segcounts[4] + segcounts[5] + segcounts[6] + segcounts[7]);
-  segment_tree_probs[1] =
-    get_binary_prob(segcounts[0] + segcounts[1], segcounts[2] + segcounts[3]);
-  segment_tree_probs[2] = get_binary_prob(segcounts[0], segcounts[1]);
-  segment_tree_probs[3] = get_binary_prob(segcounts[2], segcounts[3]);
-  segment_tree_probs[4] =
-    get_binary_prob(segcounts[4] + segcounts[5], segcounts[6] + segcounts[7]);
+  const int c01 = segcounts[0] + segcounts[1];  // pairwise count sums,
+  const int c23 = segcounts[2] + segcounts[3];  // each reused by the
+  const int c45 = segcounts[4] + segcounts[5];  // upper-level entries
+  const int c67 = segcounts[6] + segcounts[7];  // computed below
+  // [0]: ids 0-3 vs 4-7; [1], [2]: pair vs pair; [3]..[6]: within each pair.
+  segment_tree_probs[0] = get_binary_prob(c01 + c23, c45 + c67);
+  segment_tree_probs[1] = get_binary_prob(c01, c23);
+  segment_tree_probs[2] = get_binary_prob(c45, c67);
+  segment_tree_probs[3] = get_binary_prob(segcounts[0], segcounts[1]);
+  segment_tree_probs[4] = get_binary_prob(segcounts[2], segcounts[3]);
   segment_tree_probs[5] = get_binary_prob(segcounts[4], segcounts[5]);
   segment_tree_probs[6] = get_binary_prob(segcounts[6], segcounts[7]);
 }
 
 // Based on set of segment counts and probabilities calculate a cost estimate
-static int cost_segmap(MACROBLOCKD *xd,
-                       int *segcounts,
-                       vp9_prob *probs) {
-  int cost;
-  int count1, count2;
+static int cost_segmap(MACROBLOCKD *xd, int *segcounts, vp9_prob *probs) {
+  const int c01 = segcounts[0] + segcounts[1];
+  const int c23 = segcounts[2] + segcounts[3];
+  const int c45 = segcounts[4] + segcounts[5];
+  const int c67 = segcounts[6] + segcounts[7];
+  const int c0123 = c01 + c23;
+  const int c4567 = c45 + c67;
 
   // Cost the top node of the tree
-  count1 = segcounts[0] + segcounts[1] + segcounts[2] + segcounts[3];
-  count2 = segcounts[3] + segcounts[4] + segcounts[5] + segcounts[6];
-  cost = count1 * vp9_cost_zero(probs[0]) +
-         count2 * vp9_cost_one(probs[0]);
+  int cost = c0123 * vp9_cost_zero(probs[0]) +
+             c4567 * vp9_cost_one(probs[0]);
 
   // Cost subsequent levels
-  if (count1 > 0) {
-    count1 = segcounts[0] + segcounts[1];
-    count2 = segcounts[2] + segcounts[3];
-    cost += count1 * vp9_cost_zero(probs[1]) +
-            count2 * vp9_cost_one(probs[1]);
+  if (c0123 > 0) {
+    cost += c01 * vp9_cost_zero(probs[1]) +
+            c23 * vp9_cost_one(probs[1]);
 
-    if (count1 > 0)
-      cost += segcounts[0] * vp9_cost_zero(probs[2]) +
-              segcounts[1] * vp9_cost_one(probs[2]);
-    if (count2 > 0)
-      cost += segcounts[2] * vp9_cost_zero(probs[3]) +
-              segcounts[3] * vp9_cost_one(probs[3]);
+    if (c01 > 0)
+      cost += segcounts[0] * vp9_cost_zero(probs[3]) +
+              segcounts[1] * vp9_cost_one(probs[3]);
+    if (c23 > 0)
+      cost += segcounts[2] * vp9_cost_zero(probs[4]) +
+              segcounts[3] * vp9_cost_one(probs[4]);
   }
 
-  if (count2 > 0) {
-    count1 = segcounts[4] + segcounts[5];
-    count2 = segcounts[6] + segcounts[7];
-    cost += count1 * vp9_cost_zero(probs[4]) +
-            count2 * vp9_cost_one(probs[4]);
+  if (c4567 > 0) {
+    cost += c45 * vp9_cost_zero(probs[2]) +
+            c67 * vp9_cost_one(probs[2]);
 
-    if (count1 > 0)
+    if (c45 > 0)
       cost += segcounts[4] * vp9_cost_zero(probs[5]) +
               segcounts[5] * vp9_cost_one(probs[5]);
-    if (count2 > 0)
+    if (c67 > 0)
       cost += segcounts[6] * vp9_cost_zero(probs[6]) +
               segcounts[7] * vp9_cost_one(probs[6]);
   }

diff --git a/vp9/encoder/vp9_variance_c.c b/vp9/encoder/vp9_variance_c.c
index c4c70df..c2a6004 100644
--- a/vp9/encoder/vp9_variance_c.c
+++ b/vp9/encoder/vp9_variance_c.c

@@ -239,6 +239,32 @@
   return (var - (((unsigned int)avg * avg) >> 6));
 }
 
+unsigned int vp9_variance8x4_c(const uint8_t *src_ptr,
+                               int  source_stride,
+                               const uint8_t *ref_ptr,
+                               int  recon_stride,
+                               unsigned int *sse) {  // out: receives var
+  unsigned int var;  // output slot handed to variance()
+  int avg;           // output slot handed to variance()
+  // Pass everything through with 8, 4 (presumably width, height - confirm).
+  variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 4, &var, &avg);
+  *sse = var;
+  return (var - (((unsigned int)avg * avg) >> 5));  // >> 5: / 32 (= 8 * 4)
+}
+
+unsigned int vp9_variance4x8_c(const uint8_t *src_ptr,
+                               int  source_stride,
+                               const uint8_t *ref_ptr,
+                               int  recon_stride,
+                               unsigned int *sse) {  // out: receives var
+  unsigned int var;  // output slot handed to variance()
+  int avg;           // output slot handed to variance()
+  // Pass everything through with 4, 8 (presumably width, height - confirm).
+  variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 8, &var, &avg);
+  *sse = var;
+  return (var - (((unsigned int)avg * avg) >> 5));  // >> 5: / 32 (= 4 * 8)
+}
+
 unsigned int vp9_variance4x4_c(const uint8_t *src_ptr,
                                int  source_stride,
                                const uint8_t *ref_ptr,

diff --git a/vp9/encoder/x86/vp9_quantize_mmx.asm b/vp9/encoder/x86/vp9_quantize_mmx.asm
deleted file mode 100644
index 22e2356..0000000
--- a/vp9/encoder/x86/vp9_quantize_mmx.asm
+++ /dev/null

@@ -1,286 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;int vp9_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
-;                           short *qcoeff_ptr,short *dequant_ptr,
-;                           short *scan_mask, short *round_ptr,
-;                           short *quant_ptr, short *dqcoeff_ptr);
-global sym(vp9_fast_quantize_b_impl_mmx) PRIVATE
-sym(vp9_fast_quantize_b_impl_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    push rsi
-    push rdi
-    ; end prolog
-
-
-        mov             rsi,        arg(0) ;coeff_ptr
-        movq            mm0,        [rsi]
-
-        mov             rax,        arg(1) ;zbin_ptr
-        movq            mm1,        [rax]
-
-        movq            mm3,        mm0
-        psraw           mm0,        15
-
-        pxor            mm3,        mm0
-        psubw           mm3,        mm0         ; abs
-
-        movq            mm2,        mm3
-        pcmpgtw         mm1,        mm2
-
-        pandn           mm1,        mm2
-        movq            mm3,        mm1
-
-        mov             rdx,        arg(6) ;quant_ptr
-        movq            mm1,        [rdx]
-
-        mov             rcx,        arg(5) ;round_ptr
-        movq            mm2,        [rcx]
-
-        paddw           mm3,        mm2
-        pmulhuw         mm3,        mm1
-
-        pxor            mm3,        mm0
-        psubw           mm3,        mm0     ;gain the sign back
-
-        mov             rdi,        arg(2) ;qcoeff_ptr
-        movq            mm0,        mm3
-
-        movq            [rdi],      mm3
-
-        mov             rax,        arg(3) ;dequant_ptr
-        movq            mm2,        [rax]
-
-        pmullw          mm3,        mm2
-        mov             rax,        arg(7) ;dqcoeff_ptr
-
-        movq            [rax],      mm3
-
-        ; next 8
-        movq            mm4,        [rsi+8]
-
-        mov             rax,        arg(1) ;zbin_ptr
-        movq            mm5,        [rax+8]
-
-        movq            mm7,        mm4
-        psraw           mm4,        15
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4         ; abs
-
-        movq            mm6,        mm7
-        pcmpgtw         mm5,        mm6
-
-        pandn           mm5,        mm6
-        movq            mm7,        mm5
-
-        movq            mm5,        [rdx+8]
-        movq            mm6,        [rcx+8]
-
-        paddw           mm7,        mm6
-        pmulhuw         mm7,        mm5
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4;gain the sign back
-
-        mov             rdi,        arg(2) ;qcoeff_ptr
-
-        movq            mm1,        mm7
-        movq            [rdi+8],    mm7
-
-        mov             rax,        arg(3) ;dequant_ptr
-        movq            mm6,        [rax+8]
-
-        pmullw          mm7,        mm6
-        mov             rax,        arg(7) ;dqcoeff_ptr
-
-        movq            [rax+8],    mm7
-
-
-                ; next 8
-        movq            mm4,        [rsi+16]
-
-        mov             rax,        arg(1) ;zbin_ptr
-        movq            mm5,        [rax+16]
-
-        movq            mm7,        mm4
-        psraw           mm4,        15
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4         ; abs
-
-        movq            mm6,        mm7
-        pcmpgtw         mm5,        mm6
-
-        pandn           mm5,        mm6
-        movq            mm7,        mm5
-
-        movq            mm5,        [rdx+16]
-        movq            mm6,        [rcx+16]
-
-        paddw           mm7,        mm6
-        pmulhuw         mm7,        mm5
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4;gain the sign back
-
-        mov             rdi,        arg(2) ;qcoeff_ptr
-
-        movq            mm1,        mm7
-        movq            [rdi+16],   mm7
-
-        mov             rax,        arg(3) ;dequant_ptr
-        movq            mm6,        [rax+16]
-
-        pmullw          mm7,        mm6
-        mov             rax,        arg(7) ;dqcoeff_ptr
-
-        movq            [rax+16],   mm7
-
-
-                ; next 8
-        movq            mm4,        [rsi+24]
-
-        mov             rax,        arg(1) ;zbin_ptr
-        movq            mm5,        [rax+24]
-
-        movq            mm7,        mm4
-        psraw           mm4,        15
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4         ; abs
-
-        movq            mm6,        mm7
-        pcmpgtw         mm5,        mm6
-
-        pandn           mm5,        mm6
-        movq            mm7,        mm5
-
-        movq            mm5,        [rdx+24]
-        movq            mm6,        [rcx+24]
-
-        paddw           mm7,        mm6
-        pmulhuw         mm7,        mm5
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4;gain the sign back
-
-        mov             rdi,        arg(2) ;qcoeff_ptr
-
-        movq            mm1,        mm7
-        movq            [rdi+24],   mm7
-
-        mov             rax,        arg(3) ;dequant_ptr
-        movq            mm6,        [rax+24]
-
-        pmullw          mm7,        mm6
-        mov             rax,        arg(7) ;dqcoeff_ptr
-
-        movq            [rax+24],   mm7
-
-
-
-        mov             rdi,        arg(4) ;scan_mask
-        mov             rsi,        arg(2) ;qcoeff_ptr
-
-        pxor            mm5,        mm5
-        pxor            mm7,        mm7
-
-        movq            mm0,        [rsi]
-        movq            mm1,        [rsi+8]
-
-        movq            mm2,        [rdi]
-        movq            mm3,        [rdi+8];
-
-        pcmpeqw         mm0,        mm7
-        pcmpeqw         mm1,        mm7
-
-        pcmpeqw         mm6,        mm6
-        pxor            mm0,        mm6
-
-        pxor            mm1,        mm6
-        psrlw           mm0,        15
-
-        psrlw           mm1,        15
-        pmaddwd         mm0,        mm2
-
-        pmaddwd         mm1,        mm3
-        movq            mm5,        mm0
-
-        paddd           mm5,        mm1
-
-        movq            mm0,        [rsi+16]
-        movq            mm1,        [rsi+24]
-
-        movq            mm2,        [rdi+16]
-        movq            mm3,        [rdi+24];
-
-        pcmpeqw         mm0,        mm7
-        pcmpeqw         mm1,        mm7
-
-        pcmpeqw         mm6,        mm6
-        pxor            mm0,        mm6
-
-        pxor            mm1,        mm6
-        psrlw           mm0,        15
-
-        psrlw           mm1,        15
-        pmaddwd         mm0,        mm2
-
-        pmaddwd         mm1,        mm3
-        paddd           mm5,        mm0
-
-        paddd           mm5,        mm1
-        movq            mm0,        mm5
-
-        psrlq           mm5,        32
-        paddd           mm0,        mm5
-
-        ; eob adjustment begins here
-        movq            rcx,        mm0
-        and             rcx,        0xffff
-
-        xor             rdx,        rdx
-        sub             rdx,        rcx ; rdx=-rcx
-
-        bsr             rax,        rcx
-        inc             rax
-
-        sar             rdx,        31
-        and             rax,        rdx
-        ; Substitute the sse assembly for the old mmx mixed assembly/C. The
-        ; following is kept as reference
-        ;    movq            rcx,        mm0
-        ;    bsr             rax,        rcx
-        ;
-        ;    mov             eob,        rax
-        ;    mov             eee,        rcx
-        ;
-        ;if(eee==0)
-        ;{
-        ;    eob=-1;
-        ;}
-        ;else if(eee<0)
-        ;{
-        ;    eob=15;
-        ;}
-        ;d->eob = eob+1;
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret

diff --git a/vp9/encoder/x86/vp9_quantize_sse2.asm b/vp9/encoder/x86/vp9_quantize_sse2.asm
deleted file mode 100644
index 700e64b..0000000
--- a/vp9/encoder/x86/vp9_quantize_sse2.asm
+++ /dev/null

@@ -1,379 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-
-; void vp9_regular_quantize_b_sse2 | arg
-;  (BLOCK  *b,                     |  0
-;   BLOCKD *d)                     |  1
-
-global sym(vp9_regular_quantize_b_sse2) PRIVATE
-sym(vp9_regular_quantize_b_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SAVE_XMM 7
-    GET_GOT     rbx
-
-%if ABI_IS_32BIT
-    push        rdi
-    push        rsi
-%else
-  %if LIBVPX_YASM_WIN64
-    push        rdi
-    push        rsi
-  %endif
-%endif
-
-    ALIGN_STACK 16, rax
-    %define zrun_zbin_boost   0  ;  8
-    %define abs_minus_zbin    8  ; 32
-    %define temp_qcoeff       40 ; 32
-    %define qcoeff            72 ; 32
-    %define stack_size        104
-    sub         rsp, stack_size
-    ; end prolog
-
-%if ABI_IS_32BIT
-    mov         rdi, arg(0)                 ; BLOCK *b
-    mov         rsi, arg(1)                 ; BLOCKD *d
-%else
-  %if LIBVPX_YASM_WIN64
-    mov         rdi, rcx                    ; BLOCK *b
-    mov         rsi, rdx                    ; BLOCKD *d
-  %else
-    ;mov         rdi, rdi                    ; BLOCK *b
-    ;mov         rsi, rsi                    ; BLOCKD *d
-  %endif
-%endif
-
-    mov         rdx, [rdi + vp9_block_coeff] ; coeff_ptr
-    mov         rcx, [rdi + vp9_block_zbin] ; zbin_ptr
-    movd        xmm7, [rdi + vp9_block_zbin_extra] ; zbin_oq_value
-
-    ; z
-    movdqa      xmm0, [rdx]
-    movdqa      xmm4, [rdx + 16]
-    mov         rdx, [rdi + vp9_block_round] ; round_ptr
-
-    pshuflw     xmm7, xmm7, 0
-    punpcklwd   xmm7, xmm7                  ; duplicated zbin_oq_value
-
-    movdqa      xmm1, xmm0
-    movdqa      xmm5, xmm4
-
-    ; sz
-    psraw       xmm0, 15
-    psraw       xmm4, 15
-
-    ; (z ^ sz)
-    pxor        xmm1, xmm0
-    pxor        xmm5, xmm4
-
-    ; x = abs(z)
-    psubw       xmm1, xmm0
-    psubw       xmm5, xmm4
-
-    movdqa      xmm2, [rcx]
-    movdqa      xmm3, [rcx + 16]
-    mov         rcx, [rdi + vp9_block_quant] ; quant_ptr
-
-    ; *zbin_ptr + zbin_oq_value
-    paddw       xmm2, xmm7
-    paddw       xmm3, xmm7
-
-    ; x - (*zbin_ptr + zbin_oq_value)
-    psubw       xmm1, xmm2
-    psubw       xmm5, xmm3
-    movdqa      [rsp + abs_minus_zbin], xmm1
-    movdqa      [rsp + abs_minus_zbin + 16], xmm5
-
-    ; add (zbin_ptr + zbin_oq_value) back
-    paddw       xmm1, xmm2
-    paddw       xmm5, xmm3
-
-    movdqa      xmm2, [rdx]
-    movdqa      xmm6, [rdx + 16]
-
-    movdqa      xmm3, [rcx]
-    movdqa      xmm7, [rcx + 16]
-
-    ; x + round
-    paddw       xmm1, xmm2
-    paddw       xmm5, xmm6
-
-    ; y = x * quant_ptr >> 16
-    pmulhw      xmm3, xmm1
-    pmulhw      xmm7, xmm5
-
-    ; y += x
-    paddw       xmm1, xmm3
-    paddw       xmm5, xmm7
-
-    movdqa      [rsp + temp_qcoeff], xmm1
-    movdqa      [rsp + temp_qcoeff + 16], xmm5
-
-    pxor        xmm6, xmm6
-    ; zero qcoeff
-    movdqa      [rsp + qcoeff], xmm6
-    movdqa      [rsp + qcoeff + 16], xmm6
-
-    mov         rdx, [rdi + vp9_block_zrun_zbin_boost] ; zbin_boost_ptr
-    mov         rax, [rdi + vp9_block_quant_shift] ; quant_shift_ptr
-    mov         [rsp + zrun_zbin_boost], rdx
-
-%macro ZIGZAG_LOOP 1
-    ; x
-    movsx       ecx, WORD PTR[rsp + abs_minus_zbin + %1 * 2]
-
-    ; if (x >= zbin)
-    sub         cx, WORD PTR[rdx]           ; x - zbin
-    lea         rdx, [rdx + 2]              ; zbin_boost_ptr++
-    jl          .rq_zigzag_loop_%1           ; x < zbin
-
-    movsx       edi, WORD PTR[rsp + temp_qcoeff + %1 * 2]
-
-    ; downshift by quant_shift[rc]
-    movsx       cx, BYTE PTR[rax + %1]      ; quant_shift_ptr[rc]
-    sar         edi, cl                     ; also sets Z bit
-    je          .rq_zigzag_loop_%1           ; !y
-    mov         WORD PTR[rsp + qcoeff + %1 * 2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc]
-    mov         rdx, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost
-.rq_zigzag_loop_%1:
-%endmacro
-; in vp9_default_zig_zag1d order: see vp9/common/vp9_entropy.c
-ZIGZAG_LOOP  0
-ZIGZAG_LOOP  1
-ZIGZAG_LOOP  4
-ZIGZAG_LOOP  8
-ZIGZAG_LOOP  5
-ZIGZAG_LOOP  2
-ZIGZAG_LOOP  3
-ZIGZAG_LOOP  6
-ZIGZAG_LOOP  9
-ZIGZAG_LOOP 12
-ZIGZAG_LOOP 13
-ZIGZAG_LOOP 10
-ZIGZAG_LOOP  7
-ZIGZAG_LOOP 11
-ZIGZAG_LOOP 14
-ZIGZAG_LOOP 15
-
-    movdqa      xmm2, [rsp + qcoeff]
-    movdqa      xmm3, [rsp + qcoeff + 16]
-
-    mov         rcx, [rsi + vp9_blockd_dequant] ; dequant_ptr
-    mov         rdi, [rsi + vp9_blockd_dqcoeff] ; dqcoeff_ptr
-
-    ; y ^ sz
-    pxor        xmm2, xmm0
-    pxor        xmm3, xmm4
-    ; x = (y ^ sz) - sz
-    psubw       xmm2, xmm0
-    psubw       xmm3, xmm4
-
-    ; dequant
-    movdqa      xmm0, [rcx]
-    movdqa      xmm1, [rcx + 16]
-
-    mov         rcx, [rsi + vp9_blockd_qcoeff] ; qcoeff_ptr
-
-    pmullw      xmm0, xmm2
-    pmullw      xmm1, xmm3
-
-    movdqa      [rcx], xmm2        ; store qcoeff
-    movdqa      [rcx + 16], xmm3
-    movdqa      [rdi], xmm0        ; store dqcoeff
-    movdqa      [rdi + 16], xmm1
-
-    ; select the last value (in zig_zag order) for EOB
-    pcmpeqw     xmm2, xmm6
-    pcmpeqw     xmm3, xmm6
-    ; !
-    pcmpeqw     xmm6, xmm6
-    pxor        xmm2, xmm6
-    pxor        xmm3, xmm6
-    ; mask inv_zig_zag
-    pand        xmm2, [GLOBAL(inv_zig_zag)]
-    pand        xmm3, [GLOBAL(inv_zig_zag + 16)]
-    ; select the max value
-    pmaxsw      xmm2, xmm3
-    pshufd      xmm3, xmm2, 00001110b
-    pmaxsw      xmm2, xmm3
-    pshuflw     xmm3, xmm2, 00001110b
-    pmaxsw      xmm2, xmm3
-    pshuflw     xmm3, xmm2, 00000001b
-    pmaxsw      xmm2, xmm3
-    movd        eax, xmm2
-    and         eax, 0xff
-    mov         [rsi + vp9_blockd_eob], eax
-
-    ; begin epilog
-    add         rsp, stack_size
-    pop         rsp
-%if ABI_IS_32BIT
-    pop         rsi
-    pop         rdi
-%else
-  %if LIBVPX_YASM_WIN64
-    pop         rsi
-    pop         rdi
-  %endif
-%endif
-    RESTORE_GOT
-    RESTORE_XMM
-    pop         rbp
-    ret
-
-; void vp9_fast_quantize_b_sse2 | arg
-;  (BLOCK  *b,                  |  0
-;   BLOCKD *d)                  |  1
-
-global sym(vp9_fast_quantize_b_sse2) PRIVATE
-sym(vp9_fast_quantize_b_sse2):
-    push        rbp
-    mov         rbp, rsp
-    GET_GOT     rbx
-
-%if ABI_IS_32BIT
-    push        rdi
-    push        rsi
-%else
-  %if LIBVPX_YASM_WIN64
-    push        rdi
-    push        rsi
-  %else
-    ; these registers are used for passing arguments
-  %endif
-%endif
-
-    ; end prolog
-
-%if ABI_IS_32BIT
-    mov         rdi, arg(0)                 ; BLOCK *b
-    mov         rsi, arg(1)                 ; BLOCKD *d
-%else
-  %if LIBVPX_YASM_WIN64
-    mov         rdi, rcx                    ; BLOCK *b
-    mov         rsi, rdx                    ; BLOCKD *d
-  %else
-    ;mov         rdi, rdi                    ; BLOCK *b
-    ;mov         rsi, rsi                    ; BLOCKD *d
-  %endif
-%endif
-
-    mov         rax, [rdi + vp9_block_coeff]
-    mov         rcx, [rdi + vp9_block_round]
-    mov         rdx, [rdi + vp9_block_quant_fast]
-
-    ; z = coeff
-    movdqa      xmm0, [rax]
-    movdqa      xmm4, [rax + 16]
-
-    ; dup z so we can save sz
-    movdqa      xmm1, xmm0
-    movdqa      xmm5, xmm4
-
-    ; sz = z >> 15
-    psraw       xmm0, 15
-    psraw       xmm4, 15
-
-    ; x = abs(z) = (z ^ sz) - sz
-    pxor        xmm1, xmm0
-    pxor        xmm5, xmm4
-    psubw       xmm1, xmm0
-    psubw       xmm5, xmm4
-
-    ; x += round
-    paddw       xmm1, [rcx]
-    paddw       xmm5, [rcx + 16]
-
-    mov         rax, [rsi + vp9_blockd_qcoeff]
-    mov         rcx, [rsi + vp9_blockd_dequant]
-    mov         rdi, [rsi + vp9_blockd_dqcoeff]
-
-    ; y = x * quant >> 16
-    pmulhw      xmm1, [rdx]
-    pmulhw      xmm5, [rdx + 16]
-
-    ; x = (y ^ sz) - sz
-    pxor        xmm1, xmm0
-    pxor        xmm5, xmm4
-    psubw       xmm1, xmm0
-    psubw       xmm5, xmm4
-
-    ; qcoeff = x
-    movdqa      [rax], xmm1
-    movdqa      [rax + 16], xmm5
-
-    ; x * dequant
-    movdqa      xmm2, xmm1
-    movdqa      xmm3, xmm5
-    pmullw      xmm2, [rcx]
-    pmullw      xmm3, [rcx + 16]
-
-    ; dqcoeff = x * dequant
-    movdqa      [rdi], xmm2
-    movdqa      [rdi + 16], xmm3
-
-    pxor        xmm4, xmm4                  ;clear all bits
-    pcmpeqw     xmm1, xmm4
-    pcmpeqw     xmm5, xmm4
-
-    pcmpeqw     xmm4, xmm4                  ;set all bits
-    pxor        xmm1, xmm4
-    pxor        xmm5, xmm4
-
-    pand        xmm1, [GLOBAL(inv_zig_zag)]
-    pand        xmm5, [GLOBAL(inv_zig_zag + 16)]
-
-    pmaxsw      xmm1, xmm5
-
-    ; now down to 8
-    pshufd      xmm5, xmm1, 00001110b
-
-    pmaxsw      xmm1, xmm5
-
-    ; only 4 left
-    pshuflw     xmm5, xmm1, 00001110b
-
-    pmaxsw      xmm1, xmm5
-
-    ; okay, just 2!
-    pshuflw     xmm5, xmm1, 00000001b
-
-    pmaxsw      xmm1, xmm5
-
-    movd        eax, xmm1
-    and         eax, 0xff
-    mov         [rsi + vp9_blockd_eob], eax
-
-    ; begin epilog
-%if ABI_IS_32BIT
-    pop         rsi
-    pop         rdi
-%else
-  %if LIBVPX_YASM_WIN64
-    pop         rsi
-    pop         rdi
-  %endif
-%endif
-
-    RESTORE_GOT
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-inv_zig_zag:
-  dw 0x0001, 0x0002, 0x0006, 0x0007
-  dw 0x0003, 0x0005, 0x0008, 0x000d
-  dw 0x0004, 0x0009, 0x000c, 0x000e
-  dw 0x000a, 0x000b, 0x000f, 0x0010

diff --git a/vp9/encoder/x86/vp9_quantize_sse4.asm b/vp9/encoder/x86/vp9_quantize_sse4.asm
deleted file mode 100644
index 4c14e5f..0000000
--- a/vp9/encoder/x86/vp9_quantize_sse4.asm
+++ /dev/null

@@ -1,253 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-
-; void vp9_regular_quantize_b_sse4 | arg
-;  (BLOCK  *b,                     |  0
-;   BLOCKD *d)                     |  1
-
-global sym(vp9_regular_quantize_b_sse4) PRIVATE
-sym(vp9_regular_quantize_b_sse4):
-
-%if ABI_IS_32BIT
-    push        rbp
-    mov         rbp, rsp
-    GET_GOT     rbx
-    push        rdi
-    push        rsi
-
-    ALIGN_STACK 16, rax
-    %define qcoeff      0 ; 32
-    %define stack_size 32
-    sub         rsp, stack_size
-%else
-  %if LIBVPX_YASM_WIN64
-    SAVE_XMM 8, u
-    push        rdi
-    push        rsi
-  %endif
-%endif
-    ; end prolog
-
-%if ABI_IS_32BIT
-    mov         rdi, arg(0)                 ; BLOCK *b
-    mov         rsi, arg(1)                 ; BLOCKD *d
-%else
-  %if LIBVPX_YASM_WIN64
-    mov         rdi, rcx                    ; BLOCK *b
-    mov         rsi, rdx                    ; BLOCKD *d
-  %else
-    ;mov         rdi, rdi                    ; BLOCK *b
-    ;mov         rsi, rsi                    ; BLOCKD *d
-  %endif
-%endif
-
-    mov         rax, [rdi + vp9_block_coeff]
-    mov         rcx, [rdi + vp9_block_zbin]
-    mov         rdx, [rdi + vp9_block_round]
-    movd        xmm7, [rdi + vp9_block_zbin_extra]
-
-    ; z
-    movdqa      xmm0, [rax]
-    movdqa      xmm1, [rax + 16]
-
-    ; duplicate zbin_oq_value
-    pshuflw     xmm7, xmm7, 0
-    punpcklwd   xmm7, xmm7
-
-    movdqa      xmm2, xmm0
-    movdqa      xmm3, xmm1
-
-    ; sz
-    psraw       xmm0, 15
-    psraw       xmm1, 15
-
-    ; (z ^ sz)
-    pxor        xmm2, xmm0
-    pxor        xmm3, xmm1
-
-    ; x = abs(z)
-    psubw       xmm2, xmm0
-    psubw       xmm3, xmm1
-
-    ; zbin
-    movdqa      xmm4, [rcx]
-    movdqa      xmm5, [rcx + 16]
-
-    ; *zbin_ptr + zbin_oq_value
-    paddw       xmm4, xmm7
-    paddw       xmm5, xmm7
-
-    movdqa      xmm6, xmm2
-    movdqa      xmm7, xmm3
-
-    ; x - (*zbin_ptr + zbin_oq_value)
-    psubw       xmm6, xmm4
-    psubw       xmm7, xmm5
-
-    ; round
-    movdqa      xmm4, [rdx]
-    movdqa      xmm5, [rdx + 16]
-
-    mov         rax, [rdi + vp9_block_quant_shift]
-    mov         rcx, [rdi + vp9_block_quant]
-    mov         rdx, [rdi + vp9_block_zrun_zbin_boost]
-
-    ; x + round
-    paddw       xmm2, xmm4
-    paddw       xmm3, xmm5
-
-    ; quant
-    movdqa      xmm4, [rcx]
-    movdqa      xmm5, [rcx + 16]
-
-    ; y = x * quant_ptr >> 16
-    pmulhw      xmm4, xmm2
-    pmulhw      xmm5, xmm3
-
-    ; y += x
-    paddw       xmm2, xmm4
-    paddw       xmm3, xmm5
-
-    pxor        xmm4, xmm4
-%if ABI_IS_32BIT
-    movdqa      [rsp + qcoeff], xmm4
-    movdqa      [rsp + qcoeff + 16], xmm4
-%else
-    pxor        xmm8, xmm8
-%endif
-
-    ; quant_shift
-    movdqa      xmm5, [rax]
-
-    ; zrun_zbin_boost
-    mov         rax, rdx
-
-%macro ZIGZAG_LOOP 5
-    ; x
-    pextrw      ecx, %4, %2
-
-    ; if (x >= zbin)
-    sub         cx, WORD PTR[rdx]           ; x - zbin
-    lea         rdx, [rdx + 2]              ; zbin_boost_ptr++
-    jl          .rq_zigzag_loop_%1          ; x < zbin
-
-    pextrw      edi, %3, %2                 ; y
-
-    ; downshift by quant_shift[rc]
-    pextrb      ecx, xmm5, %1               ; quant_shift[rc]
-    sar         edi, cl                     ; also sets Z bit
-    je          .rq_zigzag_loop_%1          ; !y
-%if ABI_IS_32BIT
-    mov         WORD PTR[rsp + qcoeff + %1 *2], di
-%else
-    pinsrw      %5, edi, %2                 ; qcoeff[rc]
-%endif
-    mov         rdx, rax                    ; reset to b->zrun_zbin_boost
-.rq_zigzag_loop_%1:
-%endmacro
-; in vp9_default_zig_zag1d order: see vp9/common/vp9_entropy.c
-ZIGZAG_LOOP  0, 0, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  1, 1, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  4, 4, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  8, 0, xmm3, xmm7, xmm8
-ZIGZAG_LOOP  5, 5, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  2, 2, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  3, 3, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  6, 6, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  9, 1, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 12, 4, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 13, 5, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 10, 2, xmm3, xmm7, xmm8
-ZIGZAG_LOOP  7, 7, xmm2, xmm6, xmm4
-ZIGZAG_LOOP 11, 3, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 14, 6, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 15, 7, xmm3, xmm7, xmm8
-
-    mov         rcx, [rsi + vp9_blockd_dequant]
-    mov         rdi, [rsi + vp9_blockd_dqcoeff]
-
-%if ABI_IS_32BIT
-    movdqa      xmm4, [rsp + qcoeff]
-    movdqa      xmm5, [rsp + qcoeff + 16]
-%else
-    %define     xmm5 xmm8
-%endif
-
-    ; y ^ sz
-    pxor        xmm4, xmm0
-    pxor        xmm5, xmm1
-    ; x = (y ^ sz) - sz
-    psubw       xmm4, xmm0
-    psubw       xmm5, xmm1
-
-    ; dequant
-    movdqa      xmm0, [rcx]
-    movdqa      xmm1, [rcx + 16]
-
-    mov         rcx, [rsi + vp9_blockd_qcoeff]
-
-    pmullw      xmm0, xmm4
-    pmullw      xmm1, xmm5
-
-    ; store qcoeff
-    movdqa      [rcx], xmm4
-    movdqa      [rcx + 16], xmm5
-
-    ; store dqcoeff
-    movdqa      [rdi], xmm0
-    movdqa      [rdi + 16], xmm1
-
-    ; select the last value (in zig_zag order) for EOB
-    pxor        xmm6, xmm6
-    pcmpeqw     xmm4, xmm6
-    pcmpeqw     xmm5, xmm6
-
-    packsswb    xmm4, xmm5
-    pshufb      xmm4, [GLOBAL(zig_zag1d)]
-    pmovmskb    edx, xmm4
-    xor         rdi, rdi
-    mov         eax, -1
-    xor         dx, ax
-    bsr         eax, edx
-    sub         edi, edx
-    sar         edi, 31
-    add         eax, 1
-    and         eax, edi
-
-    mov         [rsi + vp9_blockd_eob], eax
-
-    ; begin epilog
-%if ABI_IS_32BIT
-    add         rsp, stack_size
-    pop         rsp
-
-    pop         rsi
-    pop         rdi
-    RESTORE_GOT
-    pop         rbp
-%else
-  %undef xmm5
-  %if LIBVPX_YASM_WIN64
-    pop         rsi
-    pop         rdi
-    RESTORE_XMM
-  %endif
-%endif
-
-    ret
-
-SECTION_RODATA
-align 16
-; vp9/common/vp9_entropy.c: vp9_default_zig_zag1d
-zig_zag1d:
-    db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15

diff --git a/vp9/encoder/x86/vp9_quantize_ssse3.asm b/vp9/encoder/x86/vp9_quantize_ssse3.asm
deleted file mode 100644
index 1fa0521..0000000
--- a/vp9/encoder/x86/vp9_quantize_ssse3.asm
+++ /dev/null

@@ -1,137 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-
-; void vp9_fast_quantize_b_ssse3 | arg
-;  (BLOCK  *b,                   |  0
-;   BLOCKD *d)                   |  1
-;
-
-global sym(vp9_fast_quantize_b_ssse3) PRIVATE
-sym(vp9_fast_quantize_b_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    GET_GOT     rbx
-
-%if ABI_IS_32BIT
-    push        rdi
-    push        rsi
-%else
-  %if LIBVPX_YASM_WIN64
-    push        rdi
-    push        rsi
-  %endif
-%endif
-    ; end prolog
-
-%if ABI_IS_32BIT
-    mov         rdi, arg(0)                 ; BLOCK *b
-    mov         rsi, arg(1)                 ; BLOCKD *d
-%else
-  %if LIBVPX_YASM_WIN64
-    mov         rdi, rcx                    ; BLOCK *b
-    mov         rsi, rdx                    ; BLOCKD *d
-  %else
-    ;mov         rdi, rdi                    ; BLOCK *b
-    ;mov         rsi, rsi                    ; BLOCKD *d
-  %endif
-%endif
-
-    mov         rax, [rdi + vp9_block_coeff]
-    mov         rcx, [rdi + vp9_block_round]
-    mov         rdx, [rdi + vp9_block_quant_fast]
-
-    ; coeff
-    movdqa      xmm0, [rax]
-    movdqa      xmm4, [rax + 16]
-
-    ; round
-    movdqa      xmm2, [rcx]
-    movdqa      xmm3, [rcx + 16]
-
-    movdqa      xmm1, xmm0
-    movdqa      xmm5, xmm4
-
-    ; sz = z >> 15
-    psraw       xmm0, 15
-    psraw       xmm4, 15
-
-    pabsw       xmm1, xmm1
-    pabsw       xmm5, xmm5
-
-    paddw       xmm1, xmm2
-    paddw       xmm5, xmm3
-
-    ; quant_fast
-    pmulhw      xmm1, [rdx]
-    pmulhw      xmm5, [rdx + 16]
-
-    mov         rax, [rsi + vp9_blockd_qcoeff]
-    mov         rdi, [rsi + vp9_blockd_dequant]
-    mov         rcx, [rsi + vp9_blockd_dqcoeff]
-
-    pxor        xmm1, xmm0
-    pxor        xmm5, xmm4
-    psubw       xmm1, xmm0
-    psubw       xmm5, xmm4
-
-    movdqa      [rax], xmm1
-    movdqa      [rax + 16], xmm5
-
-    movdqa      xmm2, [rdi]
-    movdqa      xmm3, [rdi + 16]
-
-    pxor        xmm4, xmm4
-    pmullw      xmm2, xmm1
-    pmullw      xmm3, xmm5
-
-    pcmpeqw     xmm1, xmm4                  ;non zero mask
-    pcmpeqw     xmm5, xmm4                  ;non zero mask
-    packsswb    xmm1, xmm5
-    pshufb      xmm1, [GLOBAL(zz_shuf)]
-
-    pmovmskb    edx, xmm1
-
-    xor         rdi, rdi
-    mov         eax, -1
-    xor         dx, ax                      ;flip the bits for bsr
-    bsr         eax, edx
-
-    movdqa      [rcx], xmm2                 ;store dqcoeff
-    movdqa      [rcx + 16], xmm3            ;store dqcoeff
-
-    sub         edi, edx                    ;check for all zeros in bit mask
-    sar         edi, 31                     ;0 or -1
-    add         eax, 1
-    and         eax, edi                    ;if the bit mask was all zero,
-                                            ;then eob = 0
-    mov         [rsi + vp9_blockd_eob], eax
-
-    ; begin epilog
-%if ABI_IS_32BIT
-    pop         rsi
-    pop         rdi
-%else
-  %if LIBVPX_YASM_WIN64
-    pop         rsi
-    pop         rdi
-  %endif
-%endif
-
-    RESTORE_GOT
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-zz_shuf:
-    db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15

diff --git a/vp9/encoder/x86/vp9_quantize_x86.h b/vp9/encoder/x86/vp9_quantize_x86.h
deleted file mode 100644
index d1db173..0000000
--- a/vp9/encoder/x86/vp9_quantize_x86.h
+++ /dev/null

@@ -1,48 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license and patent
- *  grant that can be found in the LICENSE file in the root of the source
- *  tree. All contributing project authors may be found in the AUTHORS
- *  file in the root of the source tree.
- */
-
-#ifndef VP9_ENCODER_X86_VP9_QUANTIZE_X86_H_
-#define VP9_ENCODER_X86_VP9_QUANTIZE_X86_H_
-
-
-/* Note:
- *
- * This platform is commonly built for runtime CPU detection. If you modify
- * any of the function mappings present in this file, be sure to also update
- * them in the function pointer initialization code
- */
-#if HAVE_MMX
-
-#endif /* HAVE_MMX */
-
-
-#if HAVE_SSE2
-extern prototype_quantize_block(vp9_regular_quantize_b_sse2);
-#if !CONFIG_RUNTIME_CPU_DETECT
-
-#undef vp9_quantize_quantb
-#define vp9_quantize_quantb vp9_regular_quantize_b_sse2
-#endif /* !CONFIG_RUNTIME_CPU_DETECT */
-
-#endif /* HAVE_SSE2 */
-
-
-#if HAVE_SSE4_1
-extern prototype_quantize_block(vp9_regular_quantize_b_sse4);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-
-#undef vp9_quantize_quantb
-#define vp9_quantize_quantb vp9_regular_quantize_b_sse4
-
-#endif /* !CONFIG_RUNTIME_CPU_DETECT */
-
-#endif /* HAVE_SSE4_1 */
-
-#endif /* QUANTIZE_X86_H */

diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 63c6ed8..39f836f 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk

@@ -17,15 +17,6 @@
 
 VP9_CX_SRCS-yes += vp9_cx_iface.c
 
-# encoder
-#INCLUDES += algo/vpx_common/vpx_mem/include
-#INCLUDES += common
-#INCLUDES += common
-#INCLUDES += common
-#INCLUDES += algo/vpx_ref/cpu_id/include
-#INCLUDES += common
-#INCLUDES += encoder
-
 VP9_CX_SRCS-yes += encoder/vp9_bitstream.c
 VP9_CX_SRCS-yes += encoder/vp9_boolhuff.c
 VP9_CX_SRCS-yes += encoder/vp9_dct.c
@@ -81,7 +72,6 @@
 
 
 VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_mcomp_x86.h
-VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_quantize_x86.h
 VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_x86_csystemdependent.c
 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_mmx.c
 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_impl_mmx.asm
@@ -94,17 +84,13 @@
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_fwalsh_sse2.asm
-#VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_sad_ssse3.asm
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_variance_ssse3.c
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_variance_impl_ssse3.asm
-#VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3.asm
 VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_sad_sse4.asm
-#VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_quantize_sse4.asm
-VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_quantize_mmx.asm
 VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_encodeopt.asm
 VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt.asm
 

diff --git a/vpxdec.c b/vpxdec.c
index df0b819..41c654f 100644
--- a/vpxdec.c
+++ b/vpxdec.c

@@ -49,8 +49,8 @@
 
 static const char *exec_name;
 
-#define VP8_FOURCC (0x30385056)
-#define VP9_FOURCC (0x30395056)
+#define VP8_FOURCC (0x00385056)  /* 'V','P','8',0 from the low byte up */
+#define VP9_FOURCC (0x00395056)  /* 'V','P','9',0 from the low byte up */
 static const struct {
   char const *name;
   const vpx_codec_iface_t *(*iface)(void);