Merge "changed to use adst for D63_PRED" into experimental
diff --git a/configure b/configure
index 6e60af1..cc8c581 100755
--- a/configure
+++ b/configure
@@ -240,16 +240,14 @@
 EXPERIMENT_LIST="
     csm
     implicit_segmentation
-    newbintramodes
-    comp_interintra_pred
-    enable_6tap
     modelcoefprob
     loop_dering
     scatterscan
     oneshotq
     multiple_arf
-    code_zerogroup
-    sb8x8
+    non420
+    ab4x4
+    comp_inter_joint_search
 "
 CONFIG_LIST="
     external_build
diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h
index 5a37816..6aeb96b 100644
--- a/test/encode_test_driver.h
+++ b/test/encode_test_driver.h
@@ -10,9 +10,10 @@
 #ifndef TEST_ENCODE_TEST_DRIVER_H_
 #define TEST_ENCODE_TEST_DRIVER_H_
 
-#include "./vpx_config.h"
 #include <string>
 #include <vector>
+
+#include "./vpx_config.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "vpx/vpx_encoder.h"
 
@@ -46,7 +47,7 @@
 class CxDataIterator {
  public:
   explicit CxDataIterator(vpx_codec_ctx_t *encoder)
-    : encoder_(encoder), iter_(NULL) {}
+      : encoder_(encoder), iter_(NULL) {}
 
   const vpx_codec_cx_pkt_t *Next() {
     return vpx_codec_get_cx_data(encoder_, &iter_);
@@ -92,7 +93,7 @@
     memset(&encoder_, 0, sizeof(encoder_));
   }
 
-  ~Encoder() {
+  virtual ~Encoder() {
     vpx_codec_destroy(&encoder_);
   }
 
diff --git a/test/error_resilience_test.cc b/test/error_resilience_test.cc
index 1eee0f5..ddfbd0f 100644
--- a/test/error_resilience_test.cc
+++ b/test/error_resilience_test.cc
@@ -206,11 +206,17 @@
   // reset previously set error/droppable frames
   Reset();
 
+#if 0
+  // TODO(jkoleszar): This test is disabled for the time being as too
+  // sensitive. It's not clear how to set a reasonable threshold for
+  // this behavior.
+
   // Now set an arbitrary set of error frames that are non-droppable
   unsigned int num_error_frames = 3;
   unsigned int error_frame_list[] = {3, 10, 20};
   SetErrorFrames(num_error_frames, error_frame_list);
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
   // Test that dropping an arbitrary set of inter frames does not hurt too much
   // Note the Average Mismatch PSNR is the average of the PSNR between
   // decoded frame and encoder's version of the same frame for all frames
@@ -219,6 +225,7 @@
   std::cout << "             Mismatch PSNR: "
             << psnr_resilience_mismatch << "\n";
   EXPECT_GT(psnr_resilience_mismatch, 20.0);
+#endif
 }
 
 VP8_INSTANTIATE_TEST_CASE(ErrorResilienceTest, ONE_PASS_TEST_MODES);
diff --git a/test/i420_video_source.h b/test/i420_video_source.h
index 219bd33..12a6ab1 100644
--- a/test/i420_video_source.h
+++ b/test/i420_video_source.h
@@ -83,7 +83,7 @@
   void SetSize(unsigned int width, unsigned int height) {
     if (width != width_ || height != height_) {
       vpx_img_free(img_);
-      img_ = vpx_img_alloc(NULL, VPX_IMG_FMT_VPXI420, width, height, 1);
+      img_ = vpx_img_alloc(NULL, VPX_IMG_FMT_I420, width, height, 1);
       ASSERT_TRUE(img_ != NULL);
       width_ = width;
       height_ = height;
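
For context, the test video sources allocate their frames through the public vpx_image API; a minimal sketch of that pattern with the corrected VPX_IMG_FMT_I420 constant follows. The dimensions are illustrative only and error handling is reduced to a return code.

#include <stddef.h>
#include "vpx/vpx_image.h"

/* Illustrative only: allocate and release a 352x288 I420 frame the way the
 * test video sources do after this change (1-byte row alignment). */
static int alloc_i420_example(void) {
  vpx_image_t *img = vpx_img_alloc(NULL, VPX_IMG_FMT_I420, 352, 288, 1);
  if (img == NULL)
    return -1;
  /* ... write pixels via img->planes[VPX_PLANE_Y/U/V] and img->stride[] ... */
  vpx_img_free(img);
  return 0;
}
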
diff --git a/test/tile_independence_test.cc b/test/tile_independence_test.cc
index 711d0bd..9633ed7 100644
--- a/test/tile_independence_test.cc
+++ b/test/tile_independence_test.cc
@@ -56,7 +56,13 @@
 
   void UpdateMD5(::libvpx_test::Decoder *dec, const vpx_codec_cx_pkt_t *pkt,
                  ::libvpx_test::MD5 *md5) {
-    dec->DecodeFrame((uint8_t *) pkt->data.frame.buf, pkt->data.frame.sz);
+    const vpx_codec_err_t res =
+        dec->DecodeFrame(reinterpret_cast<uint8_t*>(pkt->data.frame.buf),
+                         pkt->data.frame.sz);
+    if (res != VPX_CODEC_OK) {
+      abort_ = true;
+      ASSERT_EQ(VPX_CODEC_OK, res);
+    }
     const vpx_image_t *img = dec->GetDxData().Next();
     md5->Add(img);
   }
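
The same return-code discipline at the plain C API level, for reference: a sketch assuming an already-initialized decoder context; decode_one_frame itself is a hypothetical helper, not part of the patch.

#include "vpx/vpx_decoder.h"

/* Sketch: decode one compressed frame and propagate any decoder error,
 * mirroring the check added to UpdateMD5() above. */
static vpx_codec_err_t decode_one_frame(vpx_codec_ctx_t *decoder,
                                        const uint8_t *buf, unsigned int sz) {
  const vpx_codec_err_t res = vpx_codec_decode(decoder, buf, sz, NULL, 0);
  return res;  /* caller treats anything but VPX_CODEC_OK as fatal */
}
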
diff --git a/test/video_source.h b/test/video_source.h
index 9772657..26d5328 100644
--- a/test/video_source.h
+++ b/test/video_source.h
@@ -103,7 +103,7 @@
     if (width != width_ || height != height_) {
       vpx_img_free(img_);
       raw_sz_ = ((width + 31)&~31) * height * 3 / 2;
-      img_ = vpx_img_alloc(NULL, VPX_IMG_FMT_VPXI420, width, height, 32);
+      img_ = vpx_img_alloc(NULL, VPX_IMG_FMT_I420, width, height, 32);
       width_ = width;
       height_ = height;
     }
diff --git a/test/vp9_boolcoder_test.cc b/test/vp9_boolcoder_test.cc
index 44f516b..5d87ff6 100644
--- a/test/vp9_boolcoder_test.cc
+++ b/test/vp9_boolcoder_test.cc
@@ -52,7 +52,7 @@
         const int random_seed = 6432;
         const int buffer_size = 10000;
         ACMRandom bit_rnd(random_seed);
-        BOOL_CODER bw;
+        vp9_writer bw;
         uint8_t bw_buffer[buffer_size];
         vp9_start_encode(&bw, bw_buffer);
 
@@ -63,7 +63,7 @@
           } else if (bit_method == 3) {
             bit = bit_rnd(2);
           }
-          encode_bool(&bw, bit, static_cast<int>(probas[i]));
+          vp9_write(&bw, bit, static_cast<int>(probas[i]));
         }
 
         vp9_stop_encode(&bw);
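
For readers tracking the rename, the writer-side API now exercised by this test looks like the following sketch; the header path is an assumption, and the probability value 128 (equal odds) is arbitrary.

#include "vpx/vpx_integer.h"
#include "vp9/encoder/vp9_boolhuff.h"  /* header location assumed */

/* Sketch: emit 32 alternating bits with the renamed vp9_writer API. */
static void write_alternating_bits(uint8_t *buffer) {
  vp9_writer bw;
  int i;
  vp9_start_encode(&bw, buffer);
  for (i = 0; i < 32; ++i)
    vp9_write(&bw, i & 1, 128);  /* bit value, probability of a zero */
  vp9_stop_encode(&bw);
}
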
diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c
index 8179a69..ec81fbd 100644
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -52,10 +52,10 @@
   int i;
 
   for (i = 0; i < NUM_YV12_BUFFERS; i++)
-    vp8_yv12_de_alloc_frame_buffer(&oci->yv12_fb[i]);
+    vp9_free_frame_buffer(&oci->yv12_fb[i]);
 
-  vp8_yv12_de_alloc_frame_buffer(&oci->temp_scale_frame);
-  vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer);
+  vp9_free_frame_buffer(&oci->temp_scale_frame);
+  vp9_free_frame_buffer(&oci->post_proc_buffer);
 
   vpx_free(oci->mip);
   vpx_free(oci->prev_mip);
@@ -70,7 +70,7 @@
 }
 
 int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) {
-  int i, mb_cols;
+  int i, mi_cols;
 
   // Our internal buffers are always multiples of 16
   const int aligned_width = multiple16(width);
@@ -80,8 +80,9 @@
 
   for (i = 0; i < NUM_YV12_BUFFERS; i++) {
     oci->fb_idx_ref_cnt[i] = 0;
-    if (vp8_yv12_alloc_frame_buffer(&oci->yv12_fb[i], width, height,
-                                    VP9BORDERINPIXELS) < 0) {
+    if (vp9_alloc_frame_buffer(&oci->yv12_fb[i], width, height,
+                               oci->subsampling_x, oci->subsampling_y,
+                               VP9BORDERINPIXELS) < 0) {
       vp9_free_frame_buffers(oci);
       return 1;
     }
@@ -98,14 +99,16 @@
     oci->fb_idx_ref_cnt[i] = 1;
   }
 
-  if (vp8_yv12_alloc_frame_buffer(&oci->temp_scale_frame, width, 16,
-                                  VP9BORDERINPIXELS) < 0) {
+  if (vp9_alloc_frame_buffer(&oci->temp_scale_frame, width, 16,
+                             oci->subsampling_x, oci->subsampling_y,
+                             VP9BORDERINPIXELS) < 0) {
     vp9_free_frame_buffers(oci);
     return 1;
   }
 
-  if (vp8_yv12_alloc_frame_buffer(&oci->post_proc_buffer, width, height,
-                                  VP9BORDERINPIXELS) < 0) {
+  if (vp9_alloc_frame_buffer(&oci->post_proc_buffer, width, height,
+                             oci->subsampling_x, oci->subsampling_y,
+                             VP9BORDERINPIXELS) < 0) {
     vp9_free_frame_buffers(oci);
     return 1;
   }
@@ -140,19 +143,19 @@
 
   // FIXME(jkoleszar): allocate subsampled arrays for U/V once subsampling
   // information is exposed at this level
-  mb_cols = mb_cols_aligned_to_sb(oci);
-  oci->above_context[0] = vpx_calloc(sizeof(ENTROPY_CONTEXT) * 12 * mb_cols, 1);
+  mi_cols = mi_cols_aligned_to_sb(oci);
+  oci->above_context[0] = vpx_calloc(sizeof(ENTROPY_CONTEXT) * 6 * mi_cols, 1);
   if (!oci->above_context[0]) {
     vp9_free_frame_buffers(oci);
     return 1;
   }
   oci->above_context[1] =
-    oci->above_context[0] + sizeof(ENTROPY_CONTEXT) * 4 * mb_cols;
+    oci->above_context[0] + sizeof(ENTROPY_CONTEXT) * 2 * mi_cols;
   oci->above_context[2] =
-    oci->above_context[1] + sizeof(ENTROPY_CONTEXT) * 4 * mb_cols;
+    oci->above_context[1] + sizeof(ENTROPY_CONTEXT) * 2 * mi_cols;
 
   oci->above_seg_context =
-    vpx_calloc(sizeof(PARTITION_CONTEXT) * mb_cols_aligned_to_sb(oci), 1);
+    vpx_calloc(sizeof(PARTITION_CONTEXT) * mi_cols, 1);
 
   if (!oci->above_seg_context) {
     vp9_free_frame_buffers(oci);
@@ -177,18 +180,15 @@
   switch (cm->version & 0x3) {
     case 0:
       cm->no_lpf = 0;
-      cm->filter_type = NORMAL_LOOPFILTER;
       cm->use_bilinear_mc_filter = 0;
       break;
     case 1:
       cm->no_lpf = 0;
-      cm->filter_type = SIMPLE_LOOPFILTER;
       cm->use_bilinear_mc_filter = 1;
       break;
     case 2:
     case 3:
       cm->no_lpf = 1;
-      cm->filter_type = NORMAL_LOOPFILTER;
       cm->use_bilinear_mc_filter = 1;
       break;
   }
@@ -203,7 +203,6 @@
   oci->txfm_mode = ONLY_4X4;
   oci->comp_pred_mode = HYBRID_PREDICTION;
   oci->no_lpf = 0;
-  oci->filter_type = NORMAL_LOOPFILTER;
   oci->use_bilinear_mc_filter = 0;
   oci->clr_type = REG_YUV;
   oci->clamp_type = RECON_CLAMP_REQUIRED;
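
A sketch of the new allocation call in isolation, with the prototype inferred from the call sites above; the header location, the forward declaration, and the 4:2:0 subsampling factors are assumptions for illustration.

#include <string.h>
#include "vpx_scale/yv12config.h"   /* YV12_BUFFER_CONFIG; header assumed */

/* Prototype inferred from the call sites in vp9_alloc_frame_buffers(). */
int vp9_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
                           int ss_x, int ss_y, int border);

/* Illustrative only: allocate one 4:2:0 frame buffer the way
 * vp9_alloc_frame_buffers() now does, passing the chroma subsampling
 * factors explicitly instead of assuming them inside the allocator. */
static int alloc_one_420_frame(YV12_BUFFER_CONFIG *buf, int width, int height,
                               int border) {
  memset(buf, 0, sizeof(*buf));
  return vp9_alloc_frame_buffer(buf, width, height,
                                1 /* subsampling_x */, 1 /* subsampling_y */,
                                border);
}
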
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index c5cfb5e..3d1da44 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -62,9 +62,6 @@
 } FRAME_TYPE;
 
 typedef enum {
-#if CONFIG_ENABLE_6TAP
-  SIXTAP,
-#endif
   EIGHTTAP_SMOOTH,
   EIGHTTAP,
   EIGHTTAP_SHARP,
@@ -73,20 +70,17 @@
 } INTERPOLATIONFILTERTYPE;
 
 typedef enum {
-  DC_PRED,            /* average of above and left pixels */
-  V_PRED,             /* vertical prediction */
-  H_PRED,             /* horizontal prediction */
-  D45_PRED,           /* Directional 45 deg prediction  [anti-clockwise from 0 deg hor] */
-  D135_PRED,          /* Directional 135 deg prediction [anti-clockwise from 0 deg hor] */
-  D117_PRED,          /* Directional 112 deg prediction [anti-clockwise from 0 deg hor] */
-  D153_PRED,          /* Directional 157 deg prediction [anti-clockwise from 0 deg hor] */
-  D27_PRED,           /* Directional 22 deg prediction  [anti-clockwise from 0 deg hor] */
-  D63_PRED,           /* Directional 67 deg prediction  [anti-clockwise from 0 deg hor] */
-  TM_PRED,            /* Truemotion prediction */
-#if !CONFIG_SB8X8
-  I8X8_PRED,          /* 8x8 based prediction, each 8x8 has its own mode */
-#endif
-  I4X4_PRED,          /* 4x4 based prediction, each 4x4 has its own mode */
+  DC_PRED,         // Average of above and left pixels
+  V_PRED,          // Vertical
+  H_PRED,          // Horizontal
+  D45_PRED,        // Directional 45  deg = round(arctan(1/1) * 180/pi)
+  D135_PRED,       // Directional 135 deg = 180 - 45
+  D117_PRED,       // Directional 117 deg = 180 - 63
+  D153_PRED,       // Directional 153 deg = 180 - 27
+  D27_PRED,        // Directional 27  deg = round(arctan(1/2) * 180/pi)
+  D63_PRED,        // Directional 63  deg = round(arctan(2/1) * 180/pi)
+  TM_PRED,         // True-motion
+  I4X4_PRED,       // Each 4x4 subblock has its own mode
   NEARESTMV,
   NEARMV,
   ZEROMV,
@@ -128,9 +122,6 @@
 
 #define VP9_YMODES  (I4X4_PRED + 1)
 #define VP9_UV_MODES (TM_PRED + 1)
-#if !CONFIG_SB8X8
-#define VP9_I8X8_MODES (TM_PRED + 1)
-#endif
 #define VP9_I32X32_MODES (TM_PRED + 1)
 
 #define VP9_MVREFS (1 + SPLITMV - NEARESTMV)
@@ -148,9 +139,6 @@
   B_D27_PRED,
   B_D63_PRED,
   B_TM_PRED,
-#if CONFIG_NEWBINTRAMODES
-  B_CONTEXT_PRED,
-#endif
 
   LEFT4X4,
   ABOVE4X4,
@@ -163,25 +151,8 @@
 #define VP9_BINTRAMODES (LEFT4X4)
 #define VP9_SUBMVREFS (1 + NEW4X4 - LEFT4X4)
 
-#if CONFIG_NEWBINTRAMODES
-/* The number of I4X4_PRED intra modes that are replaced by B_CONTEXT_PRED */
-#define CONTEXT_PRED_REPLACEMENTS  0
-#define VP9_KF_BINTRAMODES (VP9_BINTRAMODES - 1)
-#define VP9_NKF_BINTRAMODES  (VP9_BINTRAMODES - CONTEXT_PRED_REPLACEMENTS)
-#else
 #define VP9_KF_BINTRAMODES (VP9_BINTRAMODES)   /* 10 */
 #define VP9_NKF_BINTRAMODES (VP9_BINTRAMODES)  /* 10 */
-#endif
-
-#if !CONFIG_SB8X8
-typedef enum {
-  PARTITIONING_16X8 = 0,
-  PARTITIONING_8X16,
-  PARTITIONING_8X8,
-  PARTITIONING_4X4,
-  NB_PARTITIONINGS,
-} SPLITMV_PARTITIONING_TYPE;
-#endif
 
 /* For keyframes, intra block modes are predicted by the (already decoded)
    modes for the Y blocks to the left and above us; for interframes, there
@@ -190,9 +161,6 @@
 union b_mode_info {
   struct {
     B_PREDICTION_MODE first;
-#if CONFIG_NEWBINTRAMODES
-    B_PREDICTION_MODE context;
-#endif
   } as_mode;
   int_mv as_mv[2];  // first, second inter predictor motion vectors
 };
@@ -208,12 +176,16 @@
 
 static INLINE int b_width_log2(BLOCK_SIZE_TYPE sb_type) {
   switch (sb_type) {
+#if CONFIG_AB4X4
+    case BLOCK_SIZE_SB4X8:
+#endif
     case BLOCK_SIZE_AB4X4: return 0;
-#if CONFIG_SB8X8
+#if CONFIG_AB4X4
+    case BLOCK_SIZE_SB8X4:
+#endif
     case BLOCK_SIZE_SB8X8:
     case BLOCK_SIZE_SB8X16: return 1;
     case BLOCK_SIZE_SB16X8:
-#endif
     case BLOCK_SIZE_MB16X16:
     case BLOCK_SIZE_SB16X32: return 2;
     case BLOCK_SIZE_SB32X16:
@@ -227,12 +199,16 @@
 
 static INLINE int b_height_log2(BLOCK_SIZE_TYPE sb_type) {
   switch (sb_type) {
+#if CONFIG_AB4X4
+    case BLOCK_SIZE_SB8X4:
+#endif
     case BLOCK_SIZE_AB4X4: return 0;
-#if CONFIG_SB8X8
+#if CONFIG_AB4X4
+    case BLOCK_SIZE_SB4X8:
+#endif
     case BLOCK_SIZE_SB8X8:
     case BLOCK_SIZE_SB16X8: return 1;
     case BLOCK_SIZE_SB8X16:
-#endif
     case BLOCK_SIZE_MB16X16:
     case BLOCK_SIZE_SB32X16: return 2;
     case BLOCK_SIZE_SB16X32:
@@ -245,30 +221,19 @@
 }
 
 static INLINE int mi_width_log2(BLOCK_SIZE_TYPE sb_type) {
-#if CONFIG_SB8X8
   int a = b_width_log2(sb_type) - 1;
-#else
-  int a = b_width_log2(sb_type) - 2;
-#endif
   assert(a >= 0);
   return a;
 }
 
 static INLINE int mi_height_log2(BLOCK_SIZE_TYPE sb_type) {
-#if CONFIG_SB8X8
   int a = b_height_log2(sb_type) - 1;
-#else
-  int a = b_height_log2(sb_type) - 2;
-#endif
   assert(a >= 0);
   return a;
 }
 
 typedef struct {
   MB_PREDICTION_MODE mode, uv_mode;
-#if CONFIG_COMP_INTERINTRA_PRED
-  MB_PREDICTION_MODE interintra_mode, interintra_uv_mode;
-#endif
   MV_REFERENCE_FRAME ref_frame, second_ref_frame;
   TX_SIZE txfm_size;
   int_mv mv[2]; // for each reference frame used
@@ -277,9 +242,6 @@
 
   int mb_mode_context[MAX_REF_FRAMES];
 
-#if !CONFIG_SB8X8
-  SPLITMV_PARTITIONING_TYPE partitioning;
-#endif
   unsigned char mb_skip_coeff;                                /* does this mb has coefficients at all, 1=no coefficients, 0=need decode tokens */
   unsigned char need_to_clamp_mvs;
   unsigned char need_to_clamp_secondmv;
@@ -301,7 +263,7 @@
 
 typedef struct {
   MB_MODE_INFO mbmi;
-  union b_mode_info bmi[16 >> (CONFIG_SB8X8 * 2)];
+  union b_mode_info bmi[4];
 } MODE_INFO;
 
 struct scale_factors {
@@ -351,11 +313,6 @@
 
 #define BLOCK_OFFSET(x, i, n) ((x) + (i) * (n))
 
-#define MB_SUBBLOCK_FIELD(x, field, i) (\
-  ((i) < 16) ? BLOCK_OFFSET((x)->plane[0].field, (i), 16) : \
-  ((i) < 20) ? BLOCK_OFFSET((x)->plane[1].field, ((i) - 16), 16) : \
-  BLOCK_OFFSET((x)->plane[2].field, ((i) - 20), 16))
-
 typedef struct macroblockd {
   struct macroblockd_plane plane[MAX_MB_PLANE];
 
@@ -443,17 +400,44 @@
 
   int sb_index;   // index of 32x32 block inside the 64x64 block
   int mb_index;   // index of 16x16 block inside the 32x32 block
-#if CONFIG_SB8X8
   int b_index;    // index of 8x8 block inside the 16x16 block
+#if CONFIG_AB4X4
+  int ab_index;   // index of 4x4 block inside the 8x8 block
 #endif
   int q_index;
 
 } MACROBLOCKD;
 
+static int *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE_TYPE subsize) {
+  switch (subsize) {
+    case BLOCK_SIZE_SB64X32:
+    case BLOCK_SIZE_SB32X64:
+    case BLOCK_SIZE_SB32X32:
+      return &xd->sb_index;
+    case BLOCK_SIZE_SB32X16:
+    case BLOCK_SIZE_SB16X32:
+    case BLOCK_SIZE_MB16X16:
+      return &xd->mb_index;
+    case BLOCK_SIZE_SB16X8:
+    case BLOCK_SIZE_SB8X16:
+    case BLOCK_SIZE_SB8X8:
+      return &xd->b_index;
+#if CONFIG_AB4X4
+    case BLOCK_SIZE_SB8X4:
+    case BLOCK_SIZE_SB4X8:
+    case BLOCK_SIZE_AB4X4:
+      return &xd->ab_index;
+#endif
+    default:
+      assert(0);
+      return NULL;
+  }
+}
+
 static INLINE void update_partition_context(MACROBLOCKD *xd,
                                             BLOCK_SIZE_TYPE sb_type,
                                             BLOCK_SIZE_TYPE sb_size) {
-  int bsl = mi_width_log2(sb_size), bs;
+  int bsl = mi_width_log2(sb_size), bs = 1 << bsl;
   int bwl = mi_width_log2(sb_type);
   int bhl = mi_height_log2(sb_type);
   int boffset = mi_width_log2(BLOCK_SIZE_SB64X64) - bsl;
@@ -462,12 +446,6 @@
   if (bsl == 0)
     return;
 
-#if CONFIG_SB8X8
-  bs = 1 << (bsl - 1);
-#else
-  bs = 1 << bsl;
-#endif
-
   // update the partition context at the end notes. set partition bits
   // of block sizes larger than the current one to be one, and partition
   // bits of smaller block sizes to be zero.
@@ -498,26 +476,14 @@
 
 static INLINE int partition_plane_context(MACROBLOCKD *xd,
                                           BLOCK_SIZE_TYPE sb_type) {
-  int bsl = mi_width_log2(sb_type), bs;
+  int bsl = mi_width_log2(sb_type), bs = 1 << bsl;
   int above = 0, left = 0, i;
   int boffset = mi_width_log2(BLOCK_SIZE_SB64X64) - bsl;
 
-#if CONFIG_SB8X8
-  bs = 1 << (bsl - 1);
-#else
-  bs = 1 << bsl;
-#endif
-
   assert(mi_width_log2(sb_type) == mi_height_log2(sb_type));
   assert(bsl >= 0);
   assert(boffset >= 0);
 
-#if CONFIG_SB8X8
-  bs = 1 << (bsl - 1);
-#else
-  bs = 1 << bsl;
-#endif
-
   for (i = 0; i < bs; i++)
     above |= (xd->above_seg_context[i] & (1 << boffset));
   for (i = 0; i < bs; i++)
@@ -541,10 +507,8 @@
         subsize = BLOCK_SIZE_SB64X32;
       else if (bsize == BLOCK_SIZE_SB32X32)
         subsize = BLOCK_SIZE_SB32X16;
-#if CONFIG_SB8X8
       else if (bsize == BLOCK_SIZE_MB16X16)
         subsize = BLOCK_SIZE_SB16X8;
-#endif
       else
         assert(0);
       break;
@@ -553,10 +517,8 @@
         subsize = BLOCK_SIZE_SB32X64;
       else if (bsize == BLOCK_SIZE_SB32X32)
         subsize = BLOCK_SIZE_SB16X32;
-#if CONFIG_SB8X8
       else if (bsize == BLOCK_SIZE_MB16X16)
         subsize = BLOCK_SIZE_SB8X16;
-#endif
       else
         assert(0);
       break;
@@ -565,10 +527,8 @@
         subsize = BLOCK_SIZE_SB32X32;
       else if (bsize == BLOCK_SIZE_SB32X32)
         subsize = BLOCK_SIZE_MB16X16;
-#if CONFIG_SB8X8
       else if (bsize == BLOCK_SIZE_MB16X16)
         subsize = BLOCK_SIZE_SB8X8;
-#endif
       else
         assert(0);
       break;
@@ -620,12 +580,6 @@
     case B_D27_PRED :
       return DCT_ADST;
 
-#if CONFIG_NEWBINTRAMODES
-    case B_CONTEXT_PRED:
-      assert(0);
-      break;
-#endif
-
     default:
       return DCT_DCT;
   }
@@ -655,47 +609,7 @@
   if (xd->mode_info_context->mbmi.mode == I4X4_PRED &&
       xd->q_index < ACTIVE_HT) {
     tx_type = txfm_map(
-#if CONFIG_NEWBINTRAMODES
-        xd->mode_info_context->bmi[ib].as_mode.first == B_CONTEXT_PRED ?
-          xd->mode_info_context->bmi[ib].as_mode.context :
-#endif
         xd->mode_info_context->bmi[ib].as_mode.first);
-#if !CONFIG_SB8X8
-  } else if (xd->mode_info_context->mbmi.mode == I8X8_PRED &&
-             xd->q_index < ACTIVE_HT) {
-    const int ic = (ib & 10);
-#if USE_ADST_FOR_I8X8_4X4
-#if USE_ADST_PERIPHERY_ONLY
-    // Use ADST for periphery blocks only
-    const int inner = ib & 5;
-    tx_type = txfm_map(pred_mode_conv(
-        (MB_PREDICTION_MODE)xd->mode_info_context->bmi[ic].as_mode.first));
-
-#if USE_ADST_FOR_REMOTE_EDGE
-    if (inner == 5)
-      tx_type = DCT_DCT;
-#else
-    if (inner == 1) {
-      if (tx_type == ADST_ADST) tx_type = ADST_DCT;
-      else if (tx_type == DCT_ADST) tx_type = DCT_DCT;
-    } else if (inner == 4) {
-      if (tx_type == ADST_ADST) tx_type = DCT_ADST;
-      else if (tx_type == ADST_DCT) tx_type = DCT_DCT;
-    } else if (inner == 5) {
-      tx_type = DCT_DCT;
-    }
-#endif
-#else
-    // Use ADST
-    b += ic - ib;
-    tx_type = txfm_map(pred_mode_conv(
-        (MB_PREDICTION_MODE)b->bmi.as_mode.first));
-#endif
-#else
-    // Use 2D DCT
-    tx_type = DCT_DCT;
-#endif
-#endif  // !CONFIG_SB8X8
   } else if (xd->mode_info_context->mbmi.mode <= TM_PRED &&
              xd->q_index < ACTIVE_HT) {
 #if USE_ADST_FOR_I16X16_4X4
@@ -740,15 +654,6 @@
 #endif
   if (ib >= (1 << (wb + hb)))  // no chroma adst
     return tx_type;
-#if !CONFIG_SB8X8
-  if (xd->mode_info_context->mbmi.mode == I8X8_PRED &&
-      xd->q_index < ACTIVE_HT8) {
-    // TODO(rbultje): MB_PREDICTION_MODE / B_PREDICTION_MODE should be merged
-    // or the relationship otherwise modified to address this type conversion.
-    tx_type = txfm_map(pred_mode_conv(
-           (MB_PREDICTION_MODE)xd->mode_info_context->bmi[ib].as_mode.first));
-  } else
-#endif  // CONFIG_SB8X8
   if (xd->mode_info_context->mbmi.mode <= TM_PRED &&
       xd->q_index < ACTIVE_HT8) {
 #if USE_ADST_FOR_I16X16_8X8
@@ -817,14 +722,12 @@
   return tx_type;
 }
 
-void vp9_setup_block_dptrs(MACROBLOCKD *xd);
+void vp9_setup_block_dptrs(MACROBLOCKD *xd,
+                           int subsampling_x, int subsampling_y);
 
 static TX_SIZE get_uv_tx_size(const MACROBLOCKD *xd) {
   MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
   const TX_SIZE size = mbmi->txfm_size;
-#if !CONFIG_SB8X8
-  const MB_PREDICTION_MODE mode = mbmi->mode;
-#endif  // !CONFIG_SB8X8
 
   switch (mbmi->sb_type) {
     case BLOCK_SIZE_SB64X64:
@@ -836,7 +739,6 @@
         return TX_16X16;
       else
         return size;
-#if CONFIG_SB8X8
     case BLOCK_SIZE_SB32X16:
     case BLOCK_SIZE_SB16X32:
     case BLOCK_SIZE_MB16X16:
@@ -846,15 +748,6 @@
         return size;
     default:
       return TX_4X4;
-#else  // CONFIG_SB8X8
-    default:
-      if (size == TX_16X16)
-        return TX_8X8;
-      else if (size == TX_8X8 && (mode == I8X8_PRED || mode == SPLITMV))
-        return TX_4X4;
-      else
-        return size;
-#endif  // CONFIG_SB8X8
   }
 
   return size;
@@ -892,15 +785,14 @@
                                                   void *arg);
 static INLINE void foreach_transformed_block_in_plane(
     const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize, int plane,
-#if !CONFIG_SB8X8
-    int is_split,
-#endif  // !CONFIG_SB8X8
     foreach_transformed_block_visitor visit, void *arg) {
   const int bw = b_width_log2(bsize), bh = b_height_log2(bsize);
 
   // block and transform sizes, in number of 4x4 blocks log 2 ("*_b")
   // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
-  const TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
+  // transform size varies per plane, look it up in a common way.
+  const TX_SIZE tx_size = plane ? get_uv_tx_size(xd)
+                                : xd->mode_info_context->mbmi.txfm_size;
   const int block_size_b = bw + bh;
   const int txfrm_size_b = tx_size * 2;
 
@@ -909,48 +801,24 @@
                      xd->plane[plane].subsampling_y;
   const int ss_block_size = block_size_b - ss_sum;
 
-  // size of the transform to use. scale the transform down if it's larger
-  // than the size of the subsampled data, or forced externally by the mb mode.
-  const int ss_max = MAX(xd->plane[plane].subsampling_x,
-                         xd->plane[plane].subsampling_y);
-  const int ss_txfrm_size = txfrm_size_b > ss_block_size
-#if !CONFIG_SB8X8
-                            || is_split
-#endif  // !CONFIG_SB8X8
-                                ? txfrm_size_b - ss_max * 2
-                                : txfrm_size_b;
-  const int step = 1 << ss_txfrm_size;
+  const int step = 1 << txfrm_size_b;
 
   int i;
 
   assert(txfrm_size_b <= block_size_b);
-  assert(ss_txfrm_size <= ss_block_size);
+  assert(txfrm_size_b <= ss_block_size);
   for (i = 0; i < (1 << ss_block_size); i += step) {
-    visit(plane, i, bsize, ss_txfrm_size, arg);
+    visit(plane, i, bsize, txfrm_size_b, arg);
   }
 }
 
 static INLINE void foreach_transformed_block(
     const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize,
     foreach_transformed_block_visitor visit, void *arg) {
-#if !CONFIG_SB8X8
-  const MB_PREDICTION_MODE mode = xd->mode_info_context->mbmi.mode;
-  const int is_split =
-      xd->mode_info_context->mbmi.txfm_size == TX_8X8 &&
-      (mode == I8X8_PRED || mode == SPLITMV);
-#endif  // !CONFIG_SB8X8
   int plane;
 
   for (plane = 0; plane < MAX_MB_PLANE; plane++) {
-#if !CONFIG_SB8X8
-    const int is_split_chroma = is_split &&
-         xd->plane[plane].plane_type == PLANE_TYPE_UV;
-#endif  // !CONFIG_SB8X8
-
     foreach_transformed_block_in_plane(xd, bsize, plane,
-#if !CONFIG_SB8X8
-                                       is_split_chroma,
-#endif  // !CONFIG_SB8X8
                                        visit, arg);
   }
 }
@@ -958,19 +826,10 @@
 static INLINE void foreach_transformed_block_uv(
     const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize,
     foreach_transformed_block_visitor visit, void *arg) {
-#if !CONFIG_SB8X8
-  const MB_PREDICTION_MODE mode = xd->mode_info_context->mbmi.mode;
-  const int is_split =
-      xd->mode_info_context->mbmi.txfm_size == TX_8X8 &&
-      (mode == I8X8_PRED || mode == SPLITMV);
-#endif  // !CONFIG_SB8X8
   int plane;
 
   for (plane = 1; plane < MAX_MB_PLANE; plane++) {
     foreach_transformed_block_in_plane(xd, bsize, plane,
-#if !CONFIG_SB8X8
-                                       is_split,
-#endif  // !CONFIG_SB8X8
                                        visit, arg);
   }
 }
@@ -998,16 +857,8 @@
   int pred_w, pred_h;
 
   if (mode == SPLITMV) {
-#if CONFIG_SB8X8
     pred_w = 0;
     pred_h = 0;
-#else
-    // 4x4 or 8x8
-    const int is_4x4 =
-        (xd->mode_info_context->mbmi.partitioning == PARTITIONING_4X4);
-    pred_w = is_4x4 ? 0 : 1 >> xd->plane[plane].subsampling_x;
-    pred_h = is_4x4 ? 0 : 1 >> xd->plane[plane].subsampling_y;
-#endif
   } else {
     pred_w = bw;
     pred_h = bh;
@@ -1092,49 +943,4 @@
   *y = raster_mb >> tx_cols_lg2 << (txwl);
 }
 
-static TX_SIZE tx_size_for_plane(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize,
-                                 int plane) {
-  // TODO(jkoleszar): This duplicates a ton of code, but we're going to be
-  // moving this to a per-plane lookup shortly, and this will go away then.
-  if (!plane) {
-    return xd->mode_info_context->mbmi.txfm_size;
-  } else {
-    const int bw = b_width_log2(bsize), bh = b_height_log2(bsize);
-#if !CONFIG_SB8X8
-    const MB_PREDICTION_MODE mode = xd->mode_info_context->mbmi.mode;
-    const int is_split =
-        xd->mode_info_context->mbmi.txfm_size == TX_8X8 &&
-        (mode == I8X8_PRED || mode == SPLITMV);
-#endif
-
-    // block and transform sizes, in number of 4x4 blocks log 2 ("*_b")
-    // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
-    const TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
-    const int block_size_b = bw + bh;
-    const int txfrm_size_b = tx_size * 2;
-
-    // subsampled size of the block
-    const int ss_sum = xd->plane[plane].subsampling_x +
-                       xd->plane[plane].subsampling_y;
-    const int ss_block_size = block_size_b - ss_sum;
-
-    // size of the transform to use. scale the transform down if it's larger
-    // than the size of the subsampled data, or forced externally by the mb mode
-    const int ss_max = MAX(xd->plane[plane].subsampling_x,
-                           xd->plane[plane].subsampling_y);
-    const int ss_txfrm_size = txfrm_size_b > ss_block_size
-#if !CONFIG_SB8X8
-                            || is_split
-#endif  // !CONFIG_SB8X8
-                                  ? txfrm_size_b - ss_max * 2
-                                  : txfrm_size_b;
-    return (TX_SIZE)(ss_txfrm_size / 2);
-  }
-}
-
-#if CONFIG_CODE_ZEROGROUP
-static int get_zpc_used(TX_SIZE tx_size) {
-  return (tx_size >= TX_16X16);
-}
-#endif
 #endif  // VP9_COMMON_VP9_BLOCKD_H_
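
To make the new sizing convention concrete: b_width_log2() counts width in 4x4 units, and with the mode-info unit now fixed at 8x8 the new mi_width_log2() is simply b_width_log2() minus one. A few worked values derived from the lookups above (not new behavior):

/* Worked examples of the new sizing convention:
 *   BLOCK_SIZE_SB8X8   : b_width_log2 = 1  ->  mi_width_log2 = 0  (1 MI wide)
 *   BLOCK_SIZE_MB16X16 : b_width_log2 = 2  ->  mi_width_log2 = 1  (2 MIs wide)
 *   BLOCK_SIZE_SB32X32 : b_width_log2 = 3  ->  mi_width_log2 = 2  (4 MIs wide)
 *   BLOCK_SIZE_SB64X64 : b_width_log2 = 4  ->  mi_width_log2 = 3  (8 MIs wide)
 */
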
diff --git a/vp9/common/vp9_coefupdateprobs.h b/vp9/common/vp9_coefupdateprobs.h
index e49935c..231def1 100644
--- a/vp9/common/vp9_coefupdateprobs.h
+++ b/vp9/common/vp9_coefupdateprobs.h
@@ -18,10 +18,6 @@
   252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252
 };
 
-#if CONFIG_CODE_ZEROGROUP
-#define ZPC_UPDATE_PROB         248
-#endif
-
 #if CONFIG_MODELCOEFPROB
 #define COEF_MODEL_UPDATE_PROB   16
 #endif
diff --git a/vp9/common/vp9_common.h b/vp9/common/vp9_common.h
index dbfb9ed..b6252d9 100644
--- a/vp9/common/vp9_common.h
+++ b/vp9/common/vp9_common.h
@@ -52,6 +52,10 @@
   return value < low ? low : (value > high ? high : value);
 }
 
+static INLINE double fclamp(double value, double low, double high) {
+  return value < low ? low : (value > high ? high : value);
+}
+
 static INLINE int multiple16(int value) {
   return (value + 15) & ~15;
 }
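
The new fclamp() is the floating-point twin of the integer clamp() directly above it; a one-line usage sketch with illustrative names and bounds:

/* Sketch: keep a measured correction factor inside [0.0, 2.0]. */
static double clamp_factor(double raw_factor) {
  return fclamp(raw_factor, 0.0, 2.0);   /* 2.7 -> 2.0, -0.3 -> 0.0 */
}
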
diff --git a/vp9/common/vp9_default_coef_probs.h b/vp9/common/vp9_default_coef_probs.h
index 453b4a2..9e37333 100644
--- a/vp9/common/vp9_default_coef_probs.h
+++ b/vp9/common/vp9_default_coef_probs.h
@@ -699,86 +699,3 @@
     }
   }
 };
-
-#if CONFIG_CODE_ZEROGROUP
-
-// There are two probs: the first is the prob(0) of the isolated zero bit,
-// the second is the prob(0) of the end of orientation symbol [if 0 that
-// indicates a zerotree root].
-static const vp9_zpc_probs default_zpc_probs_4x4 = {
-  { /* Intra */
-    { /* Coeff Band 0 */
-      { 1, }, { 1, }, { 1, },
-    }, { /* Coeff Band 1 */
-      { 1, }, { 1, }, { 1, },
-    }, { /* Coeff Band 2 */
-      { 1, }, { 1, }, { 1, },
-    }
-  }, { /* Inter */
-    { /* Coeff Band 0 */
-      { 1, }, { 1, }, { 1, },
-    }, { /* Coeff Band 1 */
-      { 1, }, { 1, }, { 1, },
-    }, { /* Coeff Band 2 */
-      { 1, }, { 1, }, { 1, },
-    }
-  }
-};
-static const vp9_zpc_probs default_zpc_probs_8x8 = {
-  { /* Intra */
-    { /* ZPC Band 0 */
-      { 4, }, { 2, }, { 1, },
-    }, { /* ZPC Band 1 */
-      { 4, }, { 2, }, { 1, },
-    }, { /* ZPC Band 2 */
-      { 4, }, { 2, }, { 1, },
-    }
-  }, { /* Inter */
-    { /* ZPC Band 0 */
-      { 4, }, { 2, }, { 1, },
-    }, { /* ZPC Band 1 */
-      { 4, }, { 2, }, { 1, },
-    }, { /* ZPC Band 2 */
-      { 4, }, { 2, }, { 1, },
-    }
-  }
-};
-static const vp9_zpc_probs default_zpc_probs_16x16 = {
-  { /* Intra */
-    { /* ZPC Band 0 */
-      {  57,  }, {  30,  }, {   13,  },
-    }, { /* ZPC Band 1 */
-      {  46,  }, {  23,  }, {   4,  },
-    }, { /* ZPC Band 1 */
-      {  36,  }, {  11,  }, {   2,  },
-    },
-  }, { /* Inter */
-    { /* ZPC Band 0 */
-      {  45,  }, {  21  }, {  10,  },
-    }, { /* ZPC Band 1 */
-      {  24,  }, {  14,  }, {   3,  },
-    }, { /* ZPC Band 2 */
-      {  16,  }, {  6,  }, {   1,  },
-    },
-  },
-};
-static const vp9_zpc_probs default_zpc_probs_32x32 = {
-  { /* Intra */
-    { /* ZPC Band 0 */
-      {  132,  }, {  60,  }, {  19,  },
-    }, { /* ZPC Band 1 */
-      {  64,  }, {  32,  }, {   8,  },
-    }, { /* ZPC Band 2 */
-      {  25,  }, {  11,  }, {   1,  },
-    },
-  }, { /* Inter */
-    { /* ZPC Band 0 */
-      {  134,  }, {  39,  }, {  25,  },
-    }, { /* ZPC Band 1 */
-      {  64,  }, {  24,  }, {  12,  },
-    }, { /* ZPC Band 2 */
-      {  21,  }, {  10,  }, {   1,  },
-    },
-  },
-};
-#endif  // CONFIG_CODE_ZEROGROUP
diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c
index faa45bd..532e5d3 100644
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c
@@ -8,9 +8,6 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
-#include <stdio.h>
-
 #include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_onyxc_int.h"
@@ -19,8 +16,6 @@
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_coefupdateprobs.h"
 
-const int vp9_i8x8_block[4] = {0, 2, 8, 10};
-
 DECLARE_ALIGNED(16, const uint8_t, vp9_norm[256]) = {
   0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
@@ -51,6 +46,13 @@
   5, 5, 5, 5, 5, 5, 5, 5,
   5, 5, 5, 5, 5, 5, 5, 5
 };
+
+DECLARE_ALIGNED(16, const uint8_t,
+                vp9_coefband_trans_8x8plus[MAXBAND_INDEX + 1]) = {
+  0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4,
+  4, 4, 4, 4, 4, 5
+};
+
 DECLARE_ALIGNED(16, const int, vp9_coef_bands4x4[16]) = {
   0, 1, 2, 3,
   1, 2, 3, 4,
@@ -58,6 +60,12 @@
   3, 4, 5, 5
 };
 
+DECLARE_ALIGNED(16, const uint8_t,
+                vp9_coefband_trans_4x4[MAXBAND_INDEX + 1]) = {
+  0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4,
+  4, 4, 4, 4, 4, 5
+};
+
 DECLARE_ALIGNED(16, const uint8_t, vp9_pt_energy_class[MAX_ENTROPY_TOKENS]) = {
   0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5
 };
@@ -1338,16 +1346,6 @@
   vpx_memcpy(pc->fc.coef_probs_32x32, default_coef_probs_32x32,
              sizeof(pc->fc.coef_probs_32x32));
 #endif
-#if CONFIG_CODE_ZEROGROUP
-  vpx_memcpy(pc->fc.zpc_probs_4x4, default_zpc_probs_4x4,
-             sizeof(pc->fc.zpc_probs_4x4));
-  vpx_memcpy(pc->fc.zpc_probs_8x8, default_zpc_probs_8x8,
-             sizeof(pc->fc.zpc_probs_8x8));
-  vpx_memcpy(pc->fc.zpc_probs_16x16, default_zpc_probs_16x16,
-             sizeof(pc->fc.zpc_probs_16x16));
-  vpx_memcpy(pc->fc.zpc_probs_32x32, default_zpc_probs_32x32,
-             sizeof(pc->fc.zpc_probs_32x32));
-#endif
 }
 
 // Neighborhood 5-tuples for various scans and blocksizes,
@@ -1588,121 +1586,3 @@
                    count_sat, update_factor);
 #endif
 }
-
-#if CONFIG_CODE_ZEROGROUP
-OrientationType vp9_get_orientation(int rc, TX_SIZE tx_size) {
-  int i = rc >> (tx_size + 2);
-  int j = rc & ((4 << tx_size) - 1);
-  if (i > 2 * j)
-    return VERTICAL;
-  else if (j > 2 * i)
-    return HORIZONTAL;
-  else
-    return DIAGONAL;
-  /*
-  if (i == 0 && j == 0) return DIAGONAL;
-  while (i > 1 || j > 1) {
-    i >>= 1;
-    j >>= 1;
-  }
-  if (i == 0 && j == 1)
-    return HORIZONTAL;  // horizontal
-  else if (i == 1 && j == 1)
-    return DIAGONAL;    // diagonal
-  else if (i == 1 && j == 0)
-    return VERTICAL;    // vertical
-  assert(0);
-  */
-}
-
-int vp9_use_eoo(int c, int seg_eob, const int *scan,
-                TX_SIZE tx_size, int *is_last_zero, int *is_eoo) {
-  // NOTE: returning 0 from this function will turn off eoo symbols
-  // For instance we can experiment with turning eoo off for smaller blocks
-  // and/or lower bands
-  int o = vp9_get_orientation(scan[c], tx_size);
-  int band = get_coef_band(scan, tx_size, c);
-  int use_eoo = (!is_last_zero[o] &&
-                 !is_eoo[o] &&
-                 band <= ZPC_EOO_BAND_UPPER &&
-                 band >= ZPC_EOO_BAND_LOWER &&
-                 get_zpc_used(tx_size) &&
-                 seg_eob - c > (ZPC_USEEOO_THRESH << tx_size) &&
-                 is_eoo[0] + is_eoo[1] + is_eoo[2] < 2);
-  return use_eoo;
-}
-
-int vp9_is_eoo(int c, int eob, const int *scan, TX_SIZE tx_size,
-               const int16_t *qcoeff_ptr, int *last_nz_pos) {
-  int rc = scan[c];
-  int o = vp9_get_orientation(rc, tx_size);
-  int eoo = c > last_nz_pos[o];
-  return eoo;
-}
-
-static void adapt_zpc_probs_common(VP9_COMMON *cm,
-                                   TX_SIZE tx_size,
-                                   int count_sat,
-                                   int update_factor) {
-  int r, b, p, n;
-  int count, factor;
-  vp9_zpc_probs *zpc_probs;
-  vp9_zpc_probs *pre_zpc_probs;
-  vp9_zpc_count *zpc_counts;
-  if (!get_zpc_used(tx_size)) return;
-  if (tx_size == TX_32X32) {
-    zpc_probs = &cm->fc.zpc_probs_32x32;
-    pre_zpc_probs = &cm->fc.pre_zpc_probs_32x32;
-    zpc_counts = &cm->fc.zpc_counts_32x32;
-  } else if (tx_size == TX_16X16) {
-    zpc_probs = &cm->fc.zpc_probs_16x16;
-    pre_zpc_probs = &cm->fc.pre_zpc_probs_16x16;
-    zpc_counts = &cm->fc.zpc_counts_16x16;
-  } else if (tx_size == TX_8X8) {
-    zpc_probs = &cm->fc.zpc_probs_8x8;
-    pre_zpc_probs = &cm->fc.pre_zpc_probs_8x8;
-    zpc_counts = &cm->fc.zpc_counts_8x8;
-  } else {
-    zpc_probs = &cm->fc.zpc_probs_4x4;
-    pre_zpc_probs = &cm->fc.pre_zpc_probs_4x4;
-    zpc_counts = &cm->fc.zpc_counts_4x4;
-  }
-  for (r = 0; r < REF_TYPES; ++r) {
-    for (b = 0; b < ZPC_BANDS; ++b) {
-      for (p = 0; p < ZPC_PTOKS; ++p) {
-        for (n = 0; n < ZPC_NODES; ++n) {
-          vp9_prob prob = get_binary_prob((*zpc_counts)[r][b][p][n][0],
-                                          (*zpc_counts)[r][b][p][n][1]);
-          count = (*zpc_counts)[r][b][p][n][0] + (*zpc_counts)[r][b][p][n][1];
-          count = count > count_sat ? count_sat : count;
-          factor = (update_factor * count / count_sat);
-          (*zpc_probs)[r][b][p][n] = weighted_prob(
-              (*pre_zpc_probs)[r][b][p][n], prob, factor);
-        }
-      }
-    }
-  }
-}
-
-// #define ZPC_COUNT_TESTING
-void vp9_adapt_zpc_probs(VP9_COMMON *cm) {
-  int count_sat;
-  int update_factor; /* denominator 256 */
-
-  if (cm->frame_type == KEY_FRAME) {
-    update_factor = COEF_MAX_UPDATE_FACTOR_KEY;
-    count_sat = COEF_COUNT_SAT_KEY;
-  } else if (cm->last_frame_type == KEY_FRAME) {
-    update_factor = COEF_MAX_UPDATE_FACTOR_AFTER_KEY;  /* adapt quickly */
-    count_sat = COEF_COUNT_SAT_AFTER_KEY;
-  } else {
-    update_factor = COEF_MAX_UPDATE_FACTOR;
-    count_sat = COEF_COUNT_SAT;
-  }
-
-  adapt_zpc_probs_common(cm, TX_4X4, count_sat, update_factor);
-  adapt_zpc_probs_common(cm, TX_8X8, count_sat, update_factor);
-  adapt_zpc_probs_common(cm, TX_16X16, count_sat, update_factor);
-  adapt_zpc_probs_common(cm, TX_32X32, count_sat, update_factor);
-}
-#endif  // CONFIG_CODE_ZEROGROUP
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index f12ee95..9352bf6 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -16,8 +16,6 @@
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_common.h"
 
-extern const int vp9_i8x8_block[4];
-
 /* Coefficient token alphabet */
 
 #define ZERO_TOKEN              0       /* 0         Extra Bits 0+0 */
@@ -135,20 +133,20 @@
 
 extern const int vp9_coef_bands8x8[64];
 extern const int vp9_coef_bands4x4[16];
+extern const uint8_t vp9_coefband_trans_8x8plus[22];
+extern const uint8_t vp9_coefband_trans_4x4[22];
 
-static int get_coef_band(const int *scan, TX_SIZE tx_size, int coef_index) {
-  if (tx_size == TX_4X4) {
-    return vp9_coef_bands4x4[scan[coef_index]];
-  } else {
-    const int pos = scan[coef_index];
-    const int sz = 1 << (2 + tx_size);
-    const int x = pos & (sz - 1), y = pos >> (2 + tx_size);
-    if (x >= 8 || y >= 8)
-      return 5;
-    else
-      return vp9_coef_bands8x8[y * 8 + x];
-  }
+// This is the index in the scan order beyond which all coefficients for
+// 8x8 transform and above are in the top band.
+// For 4x4 blocks the index is less but to keep things common the lookup
+// table for 4x4 is padded out to this index.
+#define MAXBAND_INDEX 21
+
+static int get_coef_band(const uint8_t * band_translate, int coef_index) {
+  return (coef_index > MAXBAND_INDEX)
+    ? (COEF_BANDS-1) : band_translate[coef_index];
 }
+
 extern int vp9_get_coef_context(const int *scan, const int *neighbors,
                                 int nb_pad, uint8_t *token_cache, int c, int l);
 const int *vp9_get_coef_neighbors_handle(const int *scan, int *pad);
@@ -176,62 +174,6 @@
                                 int b, int r);
 #endif  // CONFIG_MODELCOEFPROB
 
-#if CONFIG_CODE_ZEROGROUP
-
-#define ZPC_STATS
-
-typedef enum {
-  HORIZONTAL = 0,
-  DIAGONAL,
-  VERTICAL,
-} OrientationType;
-
-/* Note EOB should become part of this symbol eventually,
- * but holding off on this for now because that is a major
- * change in the rest of the codebase */
-
-#define ZPC_ISOLATED     (MAX_ENTROPY_TOKENS + 0)    /* Isolated zero */
-
-/* ZPC_EOORIENT: All remaining coefficients in the same orientation are 0.
- * In other words all remaining coeffs in the current subband, and all
- * children of the current subband are zero. Subbands are defined by
- * dyadic partitioning in the coeff domain */
-#define ZPC_EOORIENT     (MAX_ENTROPY_TOKENS + 1)    /* End of Orientation */
-
-/* Band limits over which the eoo bit is sent */
-#define ZPC_EOO_BAND_LOWER       0
-#define ZPC_EOO_BAND_UPPER       5
-
-#define USE_ZPC_EOORIENT         1       /* 0: not used */
-                                         /* 1: used */
-#define ZPC_NODES                1
-
-#define UNKNOWN_TOKEN          255       /* Not signalled, encoder only */
-
-#define ZPC_BANDS                3       /* context bands for izr */
-#define ZPC_PTOKS                3       /* context pt for zpcs */
-
-#define coef_to_zpc_band(b)      ((b) >> 1)
-#define coef_to_zpc_ptok(p)      ((p) > 2 ? 2 : (p))
-
-typedef vp9_prob vp9_zpc_probs[REF_TYPES][ZPC_BANDS]
-                              [ZPC_PTOKS][ZPC_NODES];
-typedef unsigned int vp9_zpc_count[REF_TYPES][ZPC_BANDS]
-                                  [ZPC_PTOKS][ZPC_NODES][2];
-
-OrientationType vp9_get_orientation(int rc, TX_SIZE tx_size);
-int vp9_use_eoo(int c, int eob, const int *scan, TX_SIZE tx_size,
-                int *is_last_zero, int *is_eoo);
-int vp9_is_eoo(int c, int eob, const int *scan, TX_SIZE tx_size,
-               const int16_t *qcoeff_ptr, int *last_nz_pos);
-
-#define ZPC_USEEOO_THRESH        4
-#define ZPC_ZEROSSAVED_EOO       7   /* encoder only */
-
-void vp9_adapt_zpc_probs(struct VP9Common *cm);
-
-#endif  // CONFIG_CODE_ZEROGROUP
-
 static INLINE const int* get_scan_4x4(TX_TYPE tx_type) {
   switch (tx_type) {
     case ADST_DCT:
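
A sketch of how the simplified band lookup is meant to be used together with the translation tables added in vp9_entropy.c above: the caller picks the 4x4 or 8x8-and-up table once and then indexes it by scan position. The helper below is hypothetical, not part of the patch.

#include "vp9/common/vp9_entropy.h"

/* Sketch: coefficient band for scan position c, collapsing everything
 * past MAXBAND_INDEX into the top band via get_coef_band(). */
static int band_for_scan_position(int c, TX_SIZE tx_size) {
  const uint8_t *band_translate = (tx_size == TX_4X4)
      ? vp9_coefband_trans_4x4
      : vp9_coefband_trans_8x8plus;
  return get_coef_band(band_translate, c);
}
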
diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c
index f0a5d97..dcee62f 100644
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -16,7 +16,6 @@
 #include "vpx_mem/vpx_mem.h"
 
 static const unsigned int kf_y_mode_cts[8][VP9_YMODES] = {
-#if CONFIG_SB8X8
   /* DC V   H  D45 135 117 153 D27 D63 TM i4X4 */
   {12,  6,  5,  5,  5,  5,  5,  5,  5,  2, 200},
   {25, 13, 13,  7,  7,  7,  7,  7,  7,  6, 160},
@@ -26,27 +25,11 @@
   {68, 33, 35,  8,  8,  8,  8,  8,  8, 17,  68},
   {78, 38, 38,  8,  8,  8,  8,  8,  8, 19,  52},
   {89, 42, 42,  8,  8,  8,  8,  8,  8, 21,  34},
-#else
-  /* DC V   H  D45 135 117 153 D27 D63 TM i8x8 i4X4 */
-  {12,  6,  5,  5,  5,  5,  5,  5,  5,  2, 22, 200},
-  {25, 13, 13,  7,  7,  7,  7,  7,  7,  6, 27, 160},
-  {31, 17, 18,  8,  8,  8,  8,  8,  8,  9, 26, 139},
-  {40, 22, 23,  8,  8,  8,  8,  8,  8, 12, 27, 116},
-  {53, 26, 28,  8,  8,  8,  8,  8,  8, 13, 26,  94},
-  {68, 33, 35,  8,  8,  8,  8,  8,  8, 17, 20,  68},
-  {78, 38, 38,  8,  8,  8,  8,  8,  8, 19, 16,  52},
-  {89, 42, 42,  8,  8,  8,  8,  8,  8, 21, 12,  34},
-#endif
 };
 
 static const unsigned int y_mode_cts  [VP9_YMODES] = {
-#if CONFIG_SB8X8
   /* DC V   H  D45 135 117 153 D27 D63 TM i4X4 */
   98, 19, 15, 14, 14, 14, 14, 12, 12, 13, 70
-#else
-  /* DC V   H  D45 135 117 153 D27 D63 TM i8x8 i4X4 */
-  98, 19, 15, 14, 14, 14, 14, 12, 12, 13, 16, 70
-#endif
 };
 
 static const unsigned int uv_mode_cts [VP9_YMODES] [VP9_UV_MODES] = {
@@ -61,19 +44,9 @@
   { 150, 15, 10, 10, 10, 10, 10, 75, 10,  6}, /* D27 */
   { 150, 15, 10, 10, 10, 10, 10, 10, 75,  6}, /* D63 */
   { 160, 30, 30, 10, 10, 10, 10, 10, 10, 16}, /* TM */
-#if !CONFIG_SB8X8
-  { 132, 46, 40, 10, 10, 10, 10, 10, 10, 18}, /* i8x8 - never used */
-#endif
   { 150, 35, 41, 10, 10, 10, 10, 10, 10, 10}, /* i4X4 */
 };
 
-#if !CONFIG_SB8X8
-static const unsigned int i8x8_mode_cts  [VP9_I8X8_MODES] = {
-  /* DC V  H D45 135 117 153 D27 D63  TM */
-  73, 49, 61, 30, 30, 30, 30, 30, 30, 13
-};
-#endif
-
 static const unsigned int kf_uv_mode_cts [VP9_YMODES] [VP9_UV_MODES] = {
   // DC   V   H  D45 135 117 153 D27 D63 TM
   { 160, 24, 24, 20, 20, 20, 20, 20, 20,  8}, /* DC */
@@ -86,28 +59,12 @@
   { 102, 33, 20, 20, 20, 20, 20, 64, 20, 14}, /* D27 */
   { 102, 33, 20, 20, 20, 20, 20, 20, 64, 14}, /* D63 */
   { 132, 36, 30, 20, 20, 20, 20, 20, 20, 18}, /* TM */
-#if !CONFIG_SB8X8
-  { 122, 41, 35, 20, 20, 20, 20, 20, 20, 18}, /* i8x8 - never used */
-#endif
   { 122, 41, 35, 20, 20, 20, 20, 20, 20, 18}, /* I4X4 */
 };
 
 static const unsigned int bmode_cts[VP9_NKF_BINTRAMODES] = {
-#if CONFIG_NEWBINTRAMODES
-#if CONTEXT_PRED_REPLACEMENTS == 6
-  /* DC    TM     V      H   CONTEXT */
-  43891, 17694, 10036, 3920, 20000
-#elif CONTEXT_PRED_REPLACEMENTS == 4
-  /* DC    TM     V      H   D45   D135   CONTEXT */
-  43891, 17694, 10036, 3920, 3363, 2546, 14000
-#elif CONTEXT_PRED_REPLACEMENTS == 0
-  /* DC    V     H    D45   D135  D117  D153   D27   D63   TM    CONTEXT */
-  43891, 10036, 3920, 3363, 2546, 5119, 2471, 1723, 3221, 17694, 50000
-#endif
-#else
   /* DC    V     H    D45   D135  D117  D153   D27   D63   TM  */
   43891, 10036, 3920, 3363, 2546, 5119, 2471, 1723, 3221, 17694
-#endif
 };
 
 typedef enum {
@@ -146,45 +103,13 @@
   { 208, 1, 1  }
 };
 
-#if !CONFIG_SB8X8
-vp9_mbsplit vp9_mbsplits [VP9_NUMMBSPLITS] = {
-  {
-    0,  0,  0,  0,
-    0,  0,  0,  0,
-    1,  1,  1,  1,
-    1,  1,  1,  1,
-  }, {
-    0,  0,  1,  1,
-    0,  0,  1,  1,
-    0,  0,  1,  1,
-    0,  0,  1,  1,
-  }, {
-    0,  0,  1,  1,
-    0,  0,  1,  1,
-    2,  2,  3,  3,
-    2,  2,  3,  3,
-  }, {
-    0,  1,  2,  3,
-    4,  5,  6,  7,
-    8,  9,  10, 11,
-    12, 13, 14, 15,
-  },
-};
-
-const int vp9_mbsplit_count [VP9_NUMMBSPLITS] = { 2, 2, 4, 16};
-
-const vp9_prob vp9_mbsplit_probs [VP9_NUMMBSPLITS - 1] = { 110, 111, 150};
-#endif
-
 const vp9_prob vp9_partition_probs[NUM_PARTITION_CONTEXTS]
                                   [PARTITION_TYPES - 1] = {
-#if CONFIG_SB8X8
   // FIXME(jingning,rbultje) put real probabilities here
   {202, 162, 107},
   {16,  2,   169},
   {3,   246,  19},
   {104, 90,  134},
-#endif
   {202, 162, 107},
   {16,  2,   169},
   {3,   246,  19},
@@ -210,32 +135,6 @@
 };
 
 const vp9_tree_index vp9_bmode_tree[VP9_NKF_BINTRAMODES * 2 - 2] = {
-#if CONFIG_NEWBINTRAMODES
-#if CONTEXT_PRED_REPLACEMENTS == 6
-  -B_DC_PRED, 2,
-  -B_TM_PRED, 4,
-  6, -(B_CONTEXT_PRED - CONTEXT_PRED_REPLACEMENTS),
-  -B_V_PRED, -B_H_PRED
-#elif CONTEXT_PRED_REPLACEMENTS == 4
-  -B_DC_PRED, 2,
-  -B_TM_PRED, 4,
-  6, 8,
-  -B_V_PRED, -B_H_PRED,
-  10, -(B_CONTEXT_PRED - CONTEXT_PRED_REPLACEMENTS),
-  -B_D135_PRED, -B_D45_PRED,
-#elif CONTEXT_PRED_REPLACEMENTS == 0
-  -B_DC_PRED, 2,                      /* 0 = DC_NODE */
-  -B_TM_PRED, 4,                      /* 1 = TM_NODE */
-  -B_V_PRED, 6,                       /* 2 = V_NODE */
-  8, 12,                              /* 3 = COM_NODE */
-  -B_H_PRED, 10,                      /* 4 = H_NODE */
-  -B_D135_PRED, -B_D117_PRED,         /* 5 = D135_NODE */
-  -B_D45_PRED, 14,                    /* 6 = D45_NODE */
-  -B_D63_PRED, 16,                    /* 7 = D63_NODE */
-  -B_D153_PRED, 18,                   /* 8 = D153_NODE */
-  -B_D27_PRED, -B_CONTEXT_PRED        /* 9 = D27_NODE */
-#endif
-#else
   -B_DC_PRED, 2,                      /* 0 = DC_NODE */
   -B_TM_PRED, 4,                      /* 1 = TM_NODE */
   -B_V_PRED, 6,                       /* 2 = V_NODE */
@@ -245,7 +144,6 @@
   -B_D45_PRED, 14,                    /* 6 = D45_NODE */
   -B_D63_PRED, 16,                    /* 7 = D63_NODE */
   -B_D153_PRED, -B_D27_PRED           /* 8 = D153_NODE */
-#endif
 };
 
 /* Again, these trees use the same probability indices as their
@@ -260,12 +158,7 @@
   -D27_PRED, -D63_PRED,
   16, 18,
   -V_PRED, -H_PRED,
-#if CONFIG_SB8X8
   -TM_PRED, -I4X4_PRED
-#else
-  -TM_PRED, 20,
-  -I4X4_PRED, -I8X8_PRED
-#endif
 };
 
 const vp9_tree_index vp9_kf_ymode_tree[VP9_YMODES * 2 - 2] = {
@@ -278,28 +171,9 @@
   -D27_PRED, -D63_PRED,
   16, 18,
   -V_PRED, -H_PRED,
-#if CONFIG_SB8X8
   -TM_PRED, -I4X4_PRED
-#else
-  -TM_PRED, 20,
-  -I4X4_PRED, -I8X8_PRED
-#endif
 };
 
-#if !CONFIG_SB8X8
-const vp9_tree_index vp9_i8x8_mode_tree[VP9_I8X8_MODES * 2 - 2] = {
-  2, 14,
-  -DC_PRED, 4,
-  6, 8,
-  -D45_PRED, -D135_PRED,
-  10, 12,
-  -D117_PRED, -D153_PRED,
-  -D27_PRED, -D63_PRED,
-  -V_PRED, 16,
-  -H_PRED, -TM_PRED
-};
-#endif
-
 const vp9_tree_index vp9_uv_mode_tree[VP9_UV_MODES * 2 - 2] = {
   2, 14,
   -DC_PRED, 4,
@@ -312,14 +186,6 @@
   -H_PRED, -TM_PRED
 };
 
-#if !CONFIG_SB8X8
-const vp9_tree_index vp9_mbsplit_tree[6] = {
-  -PARTITIONING_4X4,   2,
-  -PARTITIONING_8X8,   4,
-  -PARTITIONING_16X8, -PARTITIONING_8X16,
-};
-#endif
-
 const vp9_tree_index vp9_mv_ref_tree[8] = {
   -ZEROMV, 2,
   -NEARESTMV, 4,
@@ -352,10 +218,6 @@
 struct vp9_token vp9_sb_kf_ymode_encodings[VP9_I32X32_MODES];
 struct vp9_token vp9_kf_ymode_encodings[VP9_YMODES];
 struct vp9_token vp9_uv_mode_encodings[VP9_UV_MODES];
-#if !CONFIG_SB8X8
-struct vp9_token vp9_i8x8_mode_encodings[VP9_I8X8_MODES];
-struct vp9_token vp9_mbsplit_encodings[VP9_NUMMBSPLITS];
-#endif
 
 struct vp9_token vp9_mv_ref_encoding_array[VP9_MVREFS];
 struct vp9_token vp9_sb_mv_ref_encoding_array[VP9_MVREFS];
@@ -386,25 +248,14 @@
                                      bct, uv_mode_cts[i], 0);
   }
 
-#if !CONFIG_SB8X8
-  vp9_tree_probs_from_distribution(vp9_i8x8_mode_tree, x->fc.i8x8_mode_prob,
-                                   bct, i8x8_mode_cts, 0);
-#endif
-
   vpx_memcpy(x->fc.sub_mv_ref_prob, vp9_sub_mv_ref_prob2,
              sizeof(vp9_sub_mv_ref_prob2));
-#if !CONFIG_SB8X8
-  vpx_memcpy(x->fc.mbsplit_prob, vp9_mbsplit_probs, sizeof(vp9_mbsplit_probs));
-#endif
   vpx_memcpy(x->fc.switchable_interp_prob, vp9_switchable_interp_prob,
              sizeof(vp9_switchable_interp_prob));
 
   vpx_memcpy(x->fc.partition_prob, vp9_partition_probs,
              sizeof(vp9_partition_probs));
 
-#if CONFIG_COMP_INTERINTRA_PRED
-  x->fc.interintra_prob = VP9_DEF_INTERINTRA_PROB;
-#endif
   x->ref_pred_probs[0] = DEFAULT_PRED_PROB_0;
   x->ref_pred_probs[1] = DEFAULT_PRED_PROB_1;
   x->ref_pred_probs[2] = DEFAULT_PRED_PROB_2;
@@ -450,15 +301,9 @@
   -1, -2
 };
 struct vp9_token vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS];
-#if CONFIG_ENABLE_6TAP
-const INTERPOLATIONFILTERTYPE vp9_switchable_interp[VP9_SWITCHABLE_FILTERS] = {
-  SIXTAP, EIGHTTAP, EIGHTTAP_SHARP};
-const int vp9_switchable_interp_map[SWITCHABLE+1] = {0, -1, 1, 2, -1, -1};
-#else
 const INTERPOLATIONFILTERTYPE vp9_switchable_interp[VP9_SWITCHABLE_FILTERS] = {
   EIGHTTAP, EIGHTTAP_SMOOTH, EIGHTTAP_SHARP};
 const int vp9_switchable_interp_map[SWITCHABLE+1] = {1, 0, 2, -1, -1};
-#endif
 const vp9_prob vp9_switchable_interp_prob [VP9_SWITCHABLE_FILTERS+1]
                                           [VP9_SWITCHABLE_FILTERS-1] = {
   {248, 192}, { 32, 248}, { 32,  32}, {192, 160}
@@ -476,20 +321,12 @@
 };
 const INTERPOLATIONFILTERTYPE vp9_switchable_interp[VP9_SWITCHABLE_FILTERS] = {
   EIGHTTAP, EIGHTTAP_SHARP};
-#if CONFIG_ENABLE_6TAP
-const int vp9_switchable_interp_map[SWITCHABLE+1] = {-1, -1, 0, 1, -1, -1};
-#else
 const int vp9_switchable_interp_map[SWITCHABLE+1] = {-1, 0, 1, -1, -1};
-#endif
 #endif  // VP9_SWITCHABLE_FILTERS
 
 // Indicates if the filter is interpolating or non-interpolating
 // Note currently only the EIGHTTAP_SMOOTH is non-interpolating
-#if CONFIG_ENABLE_6TAP
-const int vp9_is_interpolating_filter[SWITCHABLE + 1] = {1, 0, 1, 1, 1, -1};
-#else
 const int vp9_is_interpolating_filter[SWITCHABLE + 1] = {0, 1, 1, 1, -1};
-#endif
 
 void vp9_entropy_mode_init() {
   vp9_tokens_from_tree(vp9_kf_bmode_encodings,   vp9_kf_bmode_tree);
@@ -499,10 +336,6 @@
   vp9_tokens_from_tree(vp9_sb_ymode_encodings, vp9_sb_ymode_tree);
   vp9_tokens_from_tree(vp9_sb_kf_ymode_encodings, vp9_sb_kf_ymode_tree);
   vp9_tokens_from_tree(vp9_uv_mode_encodings,  vp9_uv_mode_tree);
-#if !CONFIG_SB8X8
-  vp9_tokens_from_tree(vp9_i8x8_mode_encodings,  vp9_i8x8_mode_tree);
-  vp9_tokens_from_tree(vp9_mbsplit_encodings, vp9_mbsplit_tree);
-#endif
   vp9_tokens_from_tree(vp9_switchable_interp_encodings,
                        vp9_switchable_interp_tree);
   vp9_tokens_from_tree(vp9_partition_encodings, vp9_partition_tree);
@@ -657,13 +490,6 @@
   for (t = 0; t < VP9_NUMMBSPLITS; ++t)
     printf("%d, ", fc->mbsplit_counts[t]);
   printf("};\n");
-#if CONFIG_COMP_INTERINTRA_PRED
-  printf("static const unsigned int\ninterintra_counts"
-         "[2] = {\n");
-  for (t = 0; t < 2; ++t)
-    printf("%d, ", fc->interintra_counts[t]);
-  printf("};\n");
-#endif
 #endif
 
   update_mode_probs(VP9_YMODES, vp9_ymode_tree,
@@ -681,36 +507,12 @@
   update_mode_probs(VP9_NKF_BINTRAMODES, vp9_bmode_tree,
                     fc->bmode_counts, fc->pre_bmode_prob,
                     fc->bmode_prob, 0);
-#if !CONFIG_SB8X8
-  update_mode_probs(VP9_I8X8_MODES,
-                    vp9_i8x8_mode_tree, fc->i8x8_mode_counts,
-                    fc->pre_i8x8_mode_prob, fc->i8x8_mode_prob, 0);
-#endif
 
   for (i = 0; i < SUBMVREF_COUNT; ++i)
     update_mode_probs(VP9_SUBMVREFS,
                       vp9_sub_mv_ref_tree, fc->sub_mv_ref_counts[i],
                       fc->pre_sub_mv_ref_prob[i], fc->sub_mv_ref_prob[i],
                       LEFT4X4);
-
-#if !CONFIG_SB8X8
-  update_mode_probs(VP9_NUMMBSPLITS, vp9_mbsplit_tree,
-                    fc->mbsplit_counts, fc->pre_mbsplit_prob,
-                    fc->mbsplit_prob, 0);
-#endif
-#if CONFIG_COMP_INTERINTRA_PRED
-  if (cm->use_interintra) {
-    int factor, interintra_prob, count;
-
-    interintra_prob = get_binary_prob(fc->interintra_counts[0],
-                                      fc->interintra_counts[1]);
-    count = fc->interintra_counts[0] + fc->interintra_counts[1];
-    count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
-    factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
-    fc->interintra_prob = weighted_prob(fc->pre_interintra_prob,
-                                        interintra_prob, factor);
-  }
-#endif
   for (i = 0; i < NUM_PARTITION_CONTEXTS; i++)
     update_mode_probs(PARTITION_TYPES, vp9_partition_tree,
                       fc->partition_counts[i], fc->pre_partition_prob[i],
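
For orientation while reading the tree tables above: in the treecoder convention, non-negative entries index the next node pair and negative entries are leaves storing the negated token. Below is a sketch of emitting one symbol under that convention, using the vp9_writer/vp9_write names that appear elsewhere in this patch; the helper itself and the field names of struct vp9_token are assumptions.

/* Sketch: walk a vp9_tree_index table from the root, writing one
 * probability-coded bit per internal node until the token's bits run out.
 * struct vp9_token field names (value/len) assumed from the treecoder. */
static void write_tree_token(vp9_writer *bw, const vp9_tree_index *tree,
                             const vp9_prob *probs,
                             const struct vp9_token *tok) {
  vp9_tree_index i = 0;
  int v = tok->value;
  int n = tok->len;           /* number of bits for this token */
  do {
    const int bit = (v >> --n) & 1;
    vp9_write(bw, bit, probs[i >> 1]);
    i = tree[i + bit];
  } while (n);
}
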
diff --git a/vp9/common/vp9_entropymode.h b/vp9/common/vp9_entropymode.h
index 24f988f..4c3fc06 100644
--- a/vp9/common/vp9_entropymode.h
+++ b/vp9/common/vp9_entropymode.h
@@ -15,26 +15,6 @@
 #include "vp9/common/vp9_treecoder.h"
 
 #define SUBMVREF_COUNT 5
-#if !CONFIG_SB8X8
-#define VP9_NUMMBSPLITS 4
-#endif
-
-#if CONFIG_COMP_INTERINTRA_PRED
-#define VP9_DEF_INTERINTRA_PROB 248
-#define VP9_UPD_INTERINTRA_PROB 192
-// whether to use a separate uv mode (1) or use the same as the y mode (0)
-#define SEPARATE_INTERINTRA_UV  0
-#endif
-
-#if !CONFIG_SB8X8
-typedef const int vp9_mbsplit[16];
-
-extern vp9_mbsplit vp9_mbsplits[VP9_NUMMBSPLITS];
-
-extern const int vp9_mbsplit_count[VP9_NUMMBSPLITS];    /* # of subsets */
-
-extern const vp9_prob vp9_mbsplit_probs[VP9_NUMMBSPLITS - 1];
-#endif
 
 extern int vp9_mv_cont(const int_mv *l, const int_mv *a);
 
@@ -52,10 +32,6 @@
 extern const vp9_tree_index  vp9_uv_mode_tree[];
 #define vp9_sb_ymode_tree vp9_uv_mode_tree
 #define vp9_sb_kf_ymode_tree vp9_uv_mode_tree
-#if !CONFIG_SB8X8
-extern const vp9_tree_index  vp9_i8x8_mode_tree[];
-extern const vp9_tree_index  vp9_mbsplit_tree[];
-#endif
 extern const vp9_tree_index  vp9_mv_ref_tree[];
 extern const vp9_tree_index  vp9_sb_mv_ref_tree[];
 extern const vp9_tree_index  vp9_sub_mv_ref_tree[];
@@ -67,10 +43,6 @@
 extern struct vp9_token vp9_sb_kf_ymode_encodings[VP9_I32X32_MODES];
 extern struct vp9_token vp9_kf_ymode_encodings[VP9_YMODES];
 extern struct vp9_token vp9_uv_mode_encodings[VP9_UV_MODES];
-#if !CONFIG_SB8X8
-extern struct vp9_token vp9_i8x8_mode_encodings[VP9_I8X8_MODES];
-extern struct vp9_token vp9_mbsplit_encodings[VP9_NUMMBSPLITS];
-#endif
 
 /* Inter mode values do not start at zero */
 
diff --git a/vp9/common/vp9_enums.h b/vp9/common/vp9_enums.h
index 3f00ba4..2f67074 100644
--- a/vp9/common/vp9_enums.h
+++ b/vp9/common/vp9_enums.h
@@ -13,22 +13,22 @@
 
 #include "./vpx_config.h"
 
-#if CONFIG_SB8X8
 #define LOG2_MI_SIZE 3
-#else
-#define LOG2_MI_SIZE 4
-#endif
 
 #define MI_SIZE (1 << LOG2_MI_SIZE)
 #define MI_UV_SIZE (1 << (LOG2_MI_SIZE - 1))
 
+#define MI_MASK ((64 >> LOG2_MI_SIZE) - 1)
+
 typedef enum BLOCK_SIZE_TYPE {
   BLOCK_SIZE_AB4X4,
-#if CONFIG_SB8X8
+#if CONFIG_AB4X4
+  BLOCK_SIZE_SB4X8,
+  BLOCK_SIZE_SB8X4,
+#endif
   BLOCK_SIZE_SB8X8,
   BLOCK_SIZE_SB8X16,
   BLOCK_SIZE_SB16X8,
-#endif
   BLOCK_SIZE_MB16X16,
   BLOCK_SIZE_SB16X32,
   BLOCK_SIZE_SB32X16,
@@ -36,6 +36,7 @@
   BLOCK_SIZE_SB32X64,
   BLOCK_SIZE_SB64X32,
   BLOCK_SIZE_SB64X64,
+  BLOCK_SIZE_TYPES
 } BLOCK_SIZE_TYPE;
 
 typedef enum PARTITION_TYPE {
@@ -47,6 +48,6 @@
 } PARTITION_TYPE;
 
 #define PARTITION_PLOFFSET   4  // number of probability models per block size
-#define NUM_PARTITION_CONTEXTS ((2 + CONFIG_SB8X8) * PARTITION_PLOFFSET)
+#define NUM_PARTITION_CONTEXTS (3 * PARTITION_PLOFFSET)
 
 #endif  // VP9_COMMON_VP9_ENUMS_H_
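
Spelled out, the constants above now resolve to the following values; MI_MASK is new and masks a mode-info coordinate down to its offset inside a 64x64 superblock.

/* Derived values (arithmetic only, nothing new):
 *   MI_SIZE    = 1 << LOG2_MI_SIZE        = 8   -> mode-info unit is 8x8 pixels
 *   MI_UV_SIZE = 1 << (LOG2_MI_SIZE - 1)  = 4   -> 4x4 in chroma for 4:2:0
 *   MI_MASK    = (64 >> LOG2_MI_SIZE) - 1 = 7   -> 8 MI units span a 64x64 SB
 */
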
diff --git a/vp9/common/vp9_findnearmv.h b/vp9/common/vp9_findnearmv.h
index df1ab73..0a1c413 100644
--- a/vp9/common/vp9_findnearmv.h
+++ b/vp9/common/vp9_findnearmv.h
@@ -74,13 +74,9 @@
                            vp9_prob p[VP9_MVREFS - 1],
                            const int context);
 
-#if !CONFIG_SB8X8
-extern const uint8_t vp9_mbsplit_offset[4][16];
-#endif
-
 static int left_block_mv(const MACROBLOCKD *xd,
                          const MODE_INFO *cur_mb, int b) {
-  if (!(b & (3 >> CONFIG_SB8X8))) {
+  if (!(b & 1)) {
     if (!xd->left_available)
       return 0;
 
@@ -90,7 +86,7 @@
     if (cur_mb->mbmi.mode != SPLITMV)
       return cur_mb->mbmi.mv[0].as_int;
 
-    b += 4 >> CONFIG_SB8X8;
+    b += 2;
   }
 
   return (cur_mb->bmi + b - 1)->as_mv[0].as_int;
@@ -98,7 +94,7 @@
 
 static int left_block_second_mv(const MACROBLOCKD *xd,
                                 const MODE_INFO *cur_mb, int b) {
-  if (!(b & (3 >> CONFIG_SB8X8))) {
+  if (!(b & 1)) {
     if (!xd->left_available)
       return 0;
 
@@ -108,7 +104,7 @@
     if (cur_mb->mbmi.mode != SPLITMV)
       return cur_mb->mbmi.second_ref_frame > 0 ?
           cur_mb->mbmi.mv[1].as_int : cur_mb->mbmi.mv[0].as_int;
-    b += 4 >> CONFIG_SB8X8;
+    b += 2;
   }
 
   return cur_mb->mbmi.second_ref_frame > 0 ?
@@ -117,85 +113,69 @@
 }
 
 static int above_block_mv(const MODE_INFO *cur_mb, int b, int mi_stride) {
-  if (!(b >> (2 >> CONFIG_SB8X8))) {
+  if (!(b >> 1)) {
     /* On top edge, get from MB above us */
     cur_mb -= mi_stride;
 
     if (cur_mb->mbmi.mode != SPLITMV)
       return cur_mb->mbmi.mv[0].as_int;
-    b += 16 >> (2 * CONFIG_SB8X8);
+    b += 4;
   }
 
-  return (cur_mb->bmi + b - (4 >> CONFIG_SB8X8))->as_mv[0].as_int;
+  return (cur_mb->bmi + b - 2)->as_mv[0].as_int;
 }
 
 static int above_block_second_mv(const MODE_INFO *cur_mb, int b, int mi_stride) {
-  if (!(b >> (2 >> CONFIG_SB8X8))) {
+  if (!(b >> 1)) {
     /* On top edge, get from MB above us */
     cur_mb -= mi_stride;
 
     if (cur_mb->mbmi.mode != SPLITMV)
       return cur_mb->mbmi.second_ref_frame > 0 ?
           cur_mb->mbmi.mv[1].as_int : cur_mb->mbmi.mv[0].as_int;
-    b += 16 >> (2 * CONFIG_SB8X8);
+    b += 4;
   }
 
   return cur_mb->mbmi.second_ref_frame > 0 ?
-      (cur_mb->bmi + b - (4 >> CONFIG_SB8X8))->as_mv[1].as_int :
-      (cur_mb->bmi + b - (4 >> CONFIG_SB8X8))->as_mv[0].as_int;
+      (cur_mb->bmi + b - 2)->as_mv[1].as_int :
+      (cur_mb->bmi + b - 2)->as_mv[0].as_int;
 }
 
 static B_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b) {
-#if CONFIG_SB8X8
   // FIXME(rbultje, jingning): temporary hack because jenkins doesn't
   // understand this condition. This will go away soon.
   if (b == 0 || b == 2) {
-#else
-  if (!(b & (3 >> CONFIG_SB8X8))) {
-#endif
     /* On L edge, get from MB to left of us */
     --cur_mb;
 
     if (cur_mb->mbmi.mode <= TM_PRED) {
       return pred_mode_conv(cur_mb->mbmi.mode);
-#if !CONFIG_SB8X8
-    } else if (cur_mb->mbmi.mode == I8X8_PRED) {
-      return pred_mode_conv(
-          (MB_PREDICTION_MODE)(cur_mb->bmi + 3 + b)->as_mode.first);
-#endif  // !CONFIG_SB8X8
     } else if (cur_mb->mbmi.mode == I4X4_PRED) {
-      return ((cur_mb->bmi + (3 >> CONFIG_SB8X8) + b)->as_mode.first);
+      return ((cur_mb->bmi + 1 + b)->as_mode.first);
     } else {
       return B_DC_PRED;
     }
   }
-#if CONFIG_SB8X8
   assert(b == 1 || b == 3);
-#endif
   return (cur_mb->bmi + b - 1)->as_mode.first;
 }
 
 static B_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb,
                                           int b, int mi_stride) {
-  if (!(b >> (2 >> CONFIG_SB8X8))) {
+  if (!(b >> 1)) {
     /* On top edge, get from MB above us */
     cur_mb -= mi_stride;
 
     if (cur_mb->mbmi.mode <= TM_PRED) {
       return pred_mode_conv(cur_mb->mbmi.mode);
-#if !CONFIG_SB8X8
-    } else if (cur_mb->mbmi.mode == I8X8_PRED) {
-      return pred_mode_conv(
-          (MB_PREDICTION_MODE)(cur_mb->bmi + 12 + b)->as_mode.first);
-#endif
     } else if (cur_mb->mbmi.mode == I4X4_PRED) {
-      return ((cur_mb->bmi + (CONFIG_SB8X8 ? 2 : 12) + b)->as_mode.first);
+      return ((cur_mb->bmi + 2 + b)->as_mode.first);
     } else {
       return B_DC_PRED;
     }
   }
 
-  return (cur_mb->bmi + b - (4 >> CONFIG_SB8X8))->as_mode.first;
+  return (cur_mb->bmi + b - 2)->as_mode.first;
 }
 
 #endif  // VP9_COMMON_VP9_FINDNEARMV_H_
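
With SB8X8 behavior now unconditional, each MODE_INFO carries a 2x2 grid of 4x4 sub-blocks, which is why the left/above lookups reduce to b & 1, b >> 1, and offsets of 1 and 2. A small sketch of that index arithmetic, assuming the same 0..3 block numbering as the code above:

#include <stdio.h>

int main(void) {
  int b;
  /* Block b sits at column (b & 1) and row (b >> 1) of the 2x2 grid. The left
   * neighbour is b - 1 within the same MODE_INFO unless b is in column 0; the
   * above neighbour is b - 2 unless b is in row 0. */
  for (b = 0; b < 4; ++b)
    printf("b=%d col=%d row=%d left=%s above=%s\n", b, b & 1, b >> 1,
           (b & 1) ? "same mi, b - 1" : "mi to the left",
           (b >> 1) ? "same mi, b - 2" : "mi above");
  return 0;
}
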
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index edb0c54..b668212 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -27,9 +27,6 @@
   lfi->mode_lf_lut[H_PRED] = 1;
   lfi->mode_lf_lut[TM_PRED] = 1;
   lfi->mode_lf_lut[I4X4_PRED]  = 0;
-#if !CONFIG_SB8X8
-  lfi->mode_lf_lut[I8X8_PRED] = 0;
-#endif
   lfi->mode_lf_lut[ZEROMV]  = 1;
   lfi->mode_lf_lut[NEARESTMV] = 2;
   lfi->mode_lf_lut[NEARMV] = 2;
@@ -169,12 +166,7 @@
 static int mb_lf_skip(const MB_MODE_INFO *const mbmi) {
   const int skip_coef = mbmi->mb_skip_coeff;
   const int tx_size = mbmi->txfm_size;
-#if CONFIG_SB8X8
   return mbmi->sb_type >= BLOCK_SIZE_MB16X16 &&
-#else
-  const MB_PREDICTION_MODE mode = mbmi->mode;
-  return mode != I4X4_PRED && mode != I8X8_PRED && mode != SPLITMV &&
-#endif
          (tx_size >= TX_16X16 || skip_coef);
 }
 
@@ -207,89 +199,75 @@
   if (filter_level) {
     const int skip_lf = mb_lf_skip(&mi->mbmi);
     const int tx_size = mi->mbmi.txfm_size;
-    if (cm->filter_type == NORMAL_LOOPFILTER) {
-      const int hev_index = filter_level >> 4;
-      lfi.mblim = lfi_n->mblim[filter_level];
-      lfi.blim = lfi_n->blim[filter_level];
-      lfi.lim = lfi_n->lim[filter_level];
-      lfi.hev_thr = lfi_n->hev_thr[hev_index];
+    const int hev_index = filter_level >> 4;
+    lfi.mblim = lfi_n->mblim[filter_level];
+    lfi.blim = lfi_n->blim[filter_level];
+    lfi.lim = lfi_n->lim[filter_level];
+    lfi.hev_thr = lfi_n->hev_thr[hev_index];
 
-      if (do_above_mb_h) {
-        if (tx_size >= TX_16X16)
-          vp9_lpf_mbh_w(y_ptr,
-                        do_above_mbuv_h ? u_ptr : NULL,
-                        do_above_mbuv_h ? v_ptr : NULL,
-                        y_stride, uv_stride, &lfi);
+    if (do_above_mb_h) {
+      if (tx_size >= TX_16X16)
+        vp9_lpf_mbh_w(y_ptr,
+                      do_above_mbuv_h ? u_ptr : NULL,
+                      do_above_mbuv_h ? v_ptr : NULL,
+                      y_stride, uv_stride, &lfi);
+      else
+        vp9_loop_filter_mbh(y_ptr, u_ptr, v_ptr, y_stride, uv_stride, &lfi);
+    }
+
+    if (!skip_lf) {
+      if (tx_size >= TX_8X8) {
+        if (tx_size == TX_8X8 &&
+            mi->mbmi.sb_type < BLOCK_SIZE_MB16X16)
+          vp9_loop_filter_bh8x8(y_ptr, u_ptr, v_ptr,
+                                y_stride, uv_stride, &lfi);
         else
-          vp9_loop_filter_mbh(y_ptr, u_ptr, v_ptr, y_stride, uv_stride, &lfi);
+          vp9_loop_filter_bh8x8(y_ptr, NULL, NULL,
+                                y_stride, uv_stride, &lfi);
+      } else {
+        vp9_loop_filter_bh(y_ptr, u_ptr, v_ptr,
+                           y_stride, uv_stride, &lfi);
       }
+    }
 
-      if (!skip_lf) {
-        if (tx_size >= TX_8X8) {
-          if (tx_size == TX_8X8 &&
-#if CONFIG_SB8X8
-              (mi->mbmi.sb_type < BLOCK_SIZE_MB16X16)
-#else
-              (mode == I8X8_PRED || mode == SPLITMV)
-#endif
-              )
-            vp9_loop_filter_bh8x8(y_ptr, u_ptr, v_ptr,
-                                  y_stride, uv_stride, &lfi);
-          else
-            vp9_loop_filter_bh8x8(y_ptr, NULL, NULL,
-                                  y_stride, uv_stride, &lfi);
-        } else {
-          vp9_loop_filter_bh(y_ptr, u_ptr, v_ptr,
-                             y_stride, uv_stride, &lfi);
-        }
-      }
+    if (do_left_mb_v) {
+      if (tx_size >= TX_16X16)
+        vp9_lpf_mbv_w(y_ptr,
+                      do_left_mbuv_v ? u_ptr : NULL,
+                      do_left_mbuv_v ? v_ptr : NULL,
+                      y_stride, uv_stride, &lfi);
+      else
+        vp9_loop_filter_mbv(y_ptr, u_ptr, v_ptr, y_stride, uv_stride, &lfi);
+    }
 
-      if (do_left_mb_v) {
-        if (tx_size >= TX_16X16)
-          vp9_lpf_mbv_w(y_ptr,
-                        do_left_mbuv_v ? u_ptr : NULL,
-                        do_left_mbuv_v ? v_ptr : NULL,
-                        y_stride, uv_stride, &lfi);
+    if (!skip_lf) {
+      if (tx_size >= TX_8X8) {
+        if (tx_size == TX_8X8 &&
+            mi->mbmi.sb_type < BLOCK_SIZE_MB16X16)
+          vp9_loop_filter_bv8x8(y_ptr, u_ptr, v_ptr,
+                                y_stride, uv_stride, &lfi);
         else
-          vp9_loop_filter_mbv(y_ptr, u_ptr, v_ptr, y_stride, uv_stride, &lfi);
+          vp9_loop_filter_bv8x8(y_ptr, NULL, NULL,
+                                y_stride, uv_stride, &lfi);
+      } else {
+        vp9_loop_filter_bv(y_ptr, u_ptr, v_ptr,
+                           y_stride, uv_stride, &lfi);
       }
-
-      if (!skip_lf) {
-        if (tx_size >= TX_8X8) {
-          if (tx_size == TX_8X8 &&
-#if CONFIG_SB8X8
-              (mi->mbmi.sb_type < BLOCK_SIZE_MB16X16)
-#else
-              (mode == I8X8_PRED || mode == SPLITMV)
-#endif
-              )
-            vp9_loop_filter_bv8x8(y_ptr, u_ptr, v_ptr,
-                                  y_stride, uv_stride, &lfi);
-          else
-            vp9_loop_filter_bv8x8(y_ptr, NULL, NULL,
-                                  y_stride, uv_stride, &lfi);
-        } else {
-          vp9_loop_filter_bv(y_ptr, u_ptr, v_ptr,
-                             y_stride, uv_stride, &lfi);
-        }
-      }
-      if (dering) {
+    }
+    if (dering) {
 #if CONFIG_LOOP_DERING
-        vp9_post_proc_down_and_across(y_ptr, y_ptr,
-          y_stride, y_stride,
-          16, 16, dering);
-        if (u_ptr && v_ptr) {
-          vp9_post_proc_down_and_across(u_ptr, u_ptr,
-            uv_stride, uv_stride,
-            8, 8, dering);
-          vp9_post_proc_down_and_across(v_ptr, v_ptr,
-            uv_stride, uv_stride,
-            8, 8, dering);
-        }
-#endif
+      vp9_post_proc_down_and_across(y_ptr, y_ptr,
+        y_stride, y_stride,
+        16, 16, dering);
+      if (u_ptr && v_ptr) {
+        vp9_post_proc_down_and_across(u_ptr, u_ptr,
+          uv_stride, uv_stride,
+          8, 8, dering);
+        vp9_post_proc_down_and_across(v_ptr, v_ptr,
+          uv_stride, uv_stride,
+          8, 8, dering);
       }
-    } else {
-      // TODO(yaowu): simple loop filter
+#endif
     }
   }
 }
@@ -322,7 +300,7 @@
       y_only? 0 : v_ptr,
       y_stride, uv_stride, dering);
   // process 2nd MB top-right
-  mi = mode_info_context + (1 << CONFIG_SB8X8);
+  mi = mode_info_context + 2;
   do_left_v = !(wbl >= 3 /* 32x16 or >=32x32 */ && (tx_size >= TX_32X32 ||
       sb_mb_lf_skip(mode_info_context, mi)));
   do_above_h = (mb_row > 0);
@@ -338,7 +316,7 @@
       y_stride, uv_stride, dering);
 
   // process 3rd MB bottom-left
-  mi = mode_info_context + (mis << CONFIG_SB8X8);
+  mi = mode_info_context + (mis << 1);
   do_left_v = (mb_col > 0);
   do_above_h = !(hbl >= 3 /* 16x32 or >=32x32 */ && (tx_size >= TX_32X32 ||
       sb_mb_lf_skip(mode_info_context, mi)));
@@ -354,15 +332,15 @@
       y_stride, uv_stride, dering);
 
   // process 4th MB bottom right
-  mi = mode_info_context + ((mis + 1) << CONFIG_SB8X8);
+  mi = mode_info_context + ((mis + 1) << 1);
   do_left_v = !(wbl >= 3 /* 32x16 or >=32x32 */ && (tx_size >= TX_32X32 ||
-      sb_mb_lf_skip(mi - (1 << CONFIG_SB8X8), mi)));
+      sb_mb_lf_skip(mi - 2, mi)));
   do_above_h = !(hbl >= 3 /* 16x32 or >=32x32 */ && (tx_size >= TX_32X32 ||
-      sb_mb_lf_skip(mode_info_context + (1 << CONFIG_SB8X8), mi)));
+      sb_mb_lf_skip(mode_info_context + 2, mi)));
   do_left_v_mbuv = (wbl >= 3 /* 32x16 or >=32x32 */ && (tx_size >= TX_16X16 ||
-      sb_mb_lf_skip(mi - (1 << CONFIG_SB8X8), mi)));
+      sb_mb_lf_skip(mi - 2, mi)));
   do_above_h_mbuv = !(hbl >= 3 /* 16x32 or >=32x32 */ && (tx_size >= TX_16X16 ||
-      sb_mb_lf_skip(mode_info_context + (1 << CONFIG_SB8X8), mi)));
+      sb_mb_lf_skip(mode_info_context + 2, mi)));
   lpf_mb(cm, mi, do_left_v, do_above_h,
       do_left_v_mbuv, do_above_h_mbuv,
       y_ptr + 16 * y_stride + 16,
@@ -379,17 +357,16 @@
   lpf_sb32(cm, mode_info_context, mb_row, mb_col,
       y_ptr, u_ptr, v_ptr,
       y_stride, uv_stride, y_only, dering);
-  lpf_sb32(cm, mode_info_context + (2 << CONFIG_SB8X8), mb_row, mb_col + 2,
+  lpf_sb32(cm, mode_info_context + 4, mb_row, mb_col + 2,
       y_ptr + 32, u_ptr + 16, v_ptr + 16,
       y_stride, uv_stride, y_only, dering);
-  lpf_sb32(cm, mode_info_context + cm->mode_info_stride * (2 << CONFIG_SB8X8),
+  lpf_sb32(cm, mode_info_context + cm->mode_info_stride * 4,
       mb_row + 2, mb_col,
       y_ptr + 32 * y_stride,
       u_ptr + 16 * uv_stride,
       v_ptr + 16 * uv_stride,
       y_stride, uv_stride, y_only, dering);
-  lpf_sb32(cm, mode_info_context + cm->mode_info_stride *
-      (2 << CONFIG_SB8X8) + (2 << CONFIG_SB8X8),
+  lpf_sb32(cm, mode_info_context + cm->mode_info_stride * 4 + 4,
       mb_row + 2, mb_col + 2,
       y_ptr + 32 * y_stride + 32,
       u_ptr + 16 * uv_stride + 16,
@@ -459,14 +436,14 @@
       y_ptr += 64;
       u_ptr = y_only? 0 : u_ptr + 32;
       v_ptr = y_only? 0 : v_ptr + 32;
-      mode_info_context += 4 << CONFIG_SB8X8;       // step to next SB64
+      mode_info_context += 8;       // step to next SB64
     }
     if (extra_sb32_col) {
       // process 2 SB32s in the extra SB32 col
       lpf_sb32(cm, mode_info_context, mb_row, mb_col,
                y_ptr, u_ptr, v_ptr,
                y_stride, uv_stride, y_only, dering);
-      lpf_sb32(cm, mode_info_context + mis * (2 << CONFIG_SB8X8),
+      lpf_sb32(cm, mode_info_context + mis * 4,
                mb_row + 2, mb_col,
                y_ptr + 32 * y_stride,
                u_ptr + 16 * uv_stride,
@@ -475,63 +452,30 @@
       y_ptr += 32;
       u_ptr = y_only? 0 : u_ptr + 16;
       v_ptr = y_only? 0 : v_ptr + 16;
-      mode_info_context += 2 << CONFIG_SB8X8;       // step to next SB32
+      mode_info_context += 4;       // step to next SB32
       mb_col += 2;
     }
     if (extra_mb_col) {
       // process 4 MB in the extra MB col
-      // process 1st MB
-      mi = mode_info_context;
-      do_left_v = (mb_col > 0);
-      do_above_h = (mb_row > 0);
-      do_left_v_mbuv =  1;
-      do_above_h_mbuv = 1;
-      lpf_mb(cm, mi, do_left_v, do_above_h,
-             do_left_v_mbuv, do_above_h_mbuv,
-             y_ptr,
-             y_only? 0 : u_ptr,
-             y_only? 0 : v_ptr,
-             y_stride, uv_stride, dering);
-      // process 2nd MB
-      mi = mode_info_context + (mis << CONFIG_SB8X8);
-      do_left_v = (mb_col > 0);
-      do_above_h = 1;
-      do_left_v_mbuv =  1;
-      do_above_h_mbuv = 1;
-      lpf_mb(cm, mi, do_left_v, do_above_h,
-             do_left_v_mbuv, do_above_h_mbuv,
-             y_ptr + 16 * y_stride,
-             y_only ? 0 : (u_ptr + 8 * uv_stride),
-             y_only ? 0 : (v_ptr + 8 * uv_stride),
-             y_stride, uv_stride, dering);
-      // process 3nd MB
-      mi = mode_info_context + (mis << CONFIG_SB8X8) * 2;
-      do_left_v = (mb_col > 0);
-      do_above_h = 1;
-      do_left_v_mbuv =  1;
-      do_above_h_mbuv = 1;
-      lpf_mb(cm, mi, do_left_v, do_above_h,
-             do_left_v_mbuv, do_above_h_mbuv,
-             y_ptr + 32 * y_stride,
-             y_only ? 0 : (u_ptr + 16 * uv_stride),
-             y_only ? 0 : (v_ptr + 16 * uv_stride),
-             y_stride, uv_stride, dering);
-      // process 4th MB
-      mi = mode_info_context + (mis << CONFIG_SB8X8) * 3;
-      do_left_v = (mb_col > 0);
-      do_above_h = 1;
-      do_left_v_mbuv =  1;
-      do_above_h_mbuv = 1;
-      lpf_mb(cm, mi, do_left_v, do_above_h,
-             do_left_v_mbuv, do_above_h_mbuv,
-             y_ptr + 48 * y_stride,
-             y_only ? 0 : (u_ptr + 24 * uv_stride),
-             y_only ? 0 : (v_ptr + 24 * uv_stride),
-             y_stride, uv_stride, dering);
+      int k;
+      for (k = 0; k < 4; ++k) {
+        mi = mode_info_context + (mis << 1) * k;
+        do_left_v = (mb_col > 0);
+        do_above_h = k == 0 ? mb_row > 0 : 1;
+        do_left_v_mbuv =  1;
+        do_above_h_mbuv = 1;
+        lpf_mb(cm, mi, do_left_v, do_above_h,
+               do_left_v_mbuv, do_above_h_mbuv,
+               y_ptr + (k * 16) * y_stride,
+               y_only ? 0 : (u_ptr + (k * 8) * uv_stride),
+               y_only ? 0 : (v_ptr + (k * 8) * uv_stride),
+               y_stride, uv_stride, dering);
+      }
+
       y_ptr += 16;
       u_ptr = y_only? 0 : u_ptr + 8;
       v_ptr = y_only? 0 : v_ptr + 8;
-      mode_info_context += 1 << CONFIG_SB8X8;       // step to next MB
+      mode_info_context += 2;       // step to next MB
     }
     // move pointers to the beginning of next sb64 row
     y_ptr += y_stride  * 64 - post->y_width;
@@ -540,7 +484,7 @@
       v_ptr += uv_stride *  32 - post->uv_width;
     }
     /* skip to next SB64 row */
-    mode_info_context += mis * (4 << CONFIG_SB8X8) - cm->mi_cols;
+    mode_info_context += mis * 8 - cm->mi_cols;
   }
   if (extra_sb32_row) {
     const int sb32_cols = sb64_cols * 2 + extra_sb32_col;
@@ -551,7 +495,7 @@
       y_ptr += 32;
       u_ptr = y_only? 0 : u_ptr + 16;
       v_ptr = y_only? 0 : v_ptr + 16;
-      mode_info_context += 2 << CONFIG_SB8X8;       // step to next SB32
+      mode_info_context += 4;       // step to next SB32
     }
     if (extra_mb_col) {
       // process 1st MB
@@ -567,7 +511,7 @@
              y_only? NULL : v_ptr,
              y_stride, uv_stride, dering);
       // process 2nd MB
-      mi = mode_info_context + (mis << CONFIG_SB8X8);
+      mi = mode_info_context + (mis << 1);
       do_left_v = (mb_col > 0);
       do_above_h = 1;
       do_left_v_mbuv =  1;
@@ -581,14 +525,14 @@
       y_ptr += 16;
       u_ptr = y_only? 0 : u_ptr + 8;
       v_ptr = y_only? 0 : v_ptr + 8;
-      mode_info_context += 1 << CONFIG_SB8X8;       /* step to next MB */
+      mode_info_context += 2;       /* step to next MB */
     }
     // move pointers to the beginning of next sb64 row
     y_ptr += y_stride * 32 - post->y_width;
     u_ptr += y_only? 0 : uv_stride *  16 - post->uv_width;
     v_ptr += y_only? 0 : uv_stride *  16 - post->uv_width;
     // skip to the next MB row if it exists
-    mode_info_context += mis * (2 << CONFIG_SB8X8) - cm->mi_cols;
+    mode_info_context += mis * 4 - cm->mi_cols;
     mb_row += 2;
   }
   if (extra_mb_row) {
@@ -607,7 +551,7 @@
       y_ptr += 16;
       u_ptr = y_only? 0 : u_ptr + 8;
       v_ptr = y_only? 0 : v_ptr + 8;
-      mode_info_context += 1 << CONFIG_SB8X8;     // step to next MB
+      mode_info_context += 2;     // step to next MB
     }
   }
 }
diff --git a/vp9/common/vp9_loopfilter.h b/vp9/common/vp9_loopfilter.h
index 81745e4..589984f 100644
--- a/vp9/common/vp9_loopfilter.h
+++ b/vp9/common/vp9_loopfilter.h
@@ -16,12 +16,6 @@
 #include "vp9/common/vp9_blockd.h"
 
 #define MAX_LOOP_FILTER 63
-
-typedef enum {
-  NORMAL_LOOPFILTER = 0,
-  SIMPLE_LOOPFILTER = 1
-} LOOPFILTER_TYPE;
-
 #define SIMD_WIDTH 16
 
 /* Need to align this structure so when it is declared and
@@ -55,9 +49,6 @@
   void sym(uint8_t *y, uint8_t *u, uint8_t *v, \
            int ystride, int uv_stride, struct loop_filter_info *lfi)
 
-#define prototype_simple_loopfilter(sym) \
-  void sym(uint8_t *y, int ystride, const unsigned char *blimit)
-
 #if ARCH_X86 || ARCH_X86_64
 #include "x86/vp9_loopfilter_x86.h"
 #endif
diff --git a/vp9/common/vp9_loopfilter_filters.c b/vp9/common/vp9_loopfilter_filters.c
index 15785f5..fc7fbc4 100644
--- a/vp9/common/vp9_loopfilter_filters.c
+++ b/vp9/common/vp9_loopfilter_filters.c
@@ -8,15 +8,13 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include <stdlib.h>
 #include "vpx_config.h"
+#include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_loopfilter.h"
 #include "vp9/common/vp9_onyxc_int.h"
 
 static INLINE int8_t signed_char_clamp(int t) {
-  t = (t < -128 ? -128 : t);
-  t = (t > 127 ? 127 : t);
-  return (int8_t) t;
+  return (int8_t)clamp(t, -128, 127);
 }
 
 // should we apply any filter at all: 11111111 yes, 00000000 no
@@ -36,7 +34,7 @@
   return ~mask;
 }
 
-// is there high variance internal edge: 11111111 yes, 00000000 no
+// is there high edge variance internal edge: 11111111 yes, 00000000 no
 static INLINE int8_t hevmask(uint8_t thresh, uint8_t p1, uint8_t p0,
                              uint8_t q0, uint8_t q1) {
   int8_t hev = 0;
@@ -68,12 +66,9 @@
 
   *oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80;
   *op0 = signed_char_clamp(ps0 + filter2) ^ 0x80;
-  filter = filter1;
 
   // outer tap adjustments
-  filter += 1;
-  filter >>= 1;
-  filter &= ~hev;
+  filter = ((filter1 + 1) >> 1) & ~hev;
 
   *oq1 = signed_char_clamp(qs1 - filter) ^ 0x80;
   *op1 = signed_char_clamp(ps1 + filter) ^ 0x80;
@@ -84,23 +79,19 @@
                                        const uint8_t *limit,
                                        const uint8_t *thresh,
                                        int count) {
-  int i = 0;
+  int i;
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
-  do {
-    const int8_t mask = filter_mask(limit[0], blimit[0],
-                                    s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
-                                    s[0 * p],  s[1 * p],  s[2 * p],  s[3 * p]);
-
-    // high edge variance
-    const int8_t hev = hevmask(thresh[0],
-                               s[-2 * p], s[-1 * p], s[0 * p], s[1 * p]);
-
+  for (i = 0; i < 8 * count; ++i) {
+    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
+    const uint8_t q0 = s[0 * p],  q1 = s[1 * p],  q2 = s[2 * p],  q3 = s[3 * p];
+    const int8_t mask = filter_mask(*limit, *blimit,
+                                    p3, p2, p1, p0, q0, q1, q2, q3);
+    const int8_t hev = hevmask(*thresh, p1, p0, q0, q1);
     filter(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p);
-
     ++s;
-  } while (++i < count * 8);
+  }
 }
 
 void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch,
@@ -108,21 +99,21 @@
                                      const uint8_t *limit,
                                      const uint8_t *thresh,
                                      int count) {
-  int i = 0;
+  int i;
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
-  do {
-    const int8_t mask = filter_mask(limit[0], blimit[0],
-                                    s[-4], s[-3], s[-2], s[-1],
-                                    s[0],  s[1],  s[2],  s[3]);
-
-    // high edge variance
-    const int8_t hev = hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
+  for (i = 0; i < 8 * count; ++i) {
+    const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+    const uint8_t q0 = s[0],  q1 = s[1],  q2 = s[2],  q3 = s[3];
+    const int8_t mask = filter_mask(*limit, *blimit,
+                                    p3, p2, p1, p0, q0, q1, q2, q3);
+    const int8_t hev = hevmask(*thresh, p1, p0, q0, q1);
     filter(mask, hev, s - 2, s - 1, s, s + 1);
     s += pitch;
-  } while (++i < count * 8);
+  }
 }
+
 static INLINE int8_t flatmask4(uint8_t thresh,
                                uint8_t p3, uint8_t p2,
                                uint8_t p1, uint8_t p0,
@@ -157,14 +148,8 @@
                             uint8_t *oq2, uint8_t *oq3) {
   // use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line
   if (flat && mask) {
-    const uint8_t p3 = *op3;
-    const uint8_t p2 = *op2;
-    const uint8_t p1 = *op1;
-    const uint8_t p0 = *op0;
-    const uint8_t q0 = *oq0;
-    const uint8_t q1 = *oq1;
-    const uint8_t q2 = *oq2;
-    const uint8_t q3 = *oq3;
+    const uint8_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
+    const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
 
     *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3);
     *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3);
@@ -173,33 +158,7 @@
     *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3);
     *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3);
   } else {
-    int8_t filter1, filter2;
-
-    const int8_t ps1 = (int8_t) *op1 ^ 0x80;
-    const int8_t ps0 = (int8_t) *op0 ^ 0x80;
-    const int8_t qs0 = (int8_t) *oq0 ^ 0x80;
-    const int8_t qs1 = (int8_t) *oq1 ^ 0x80;
-
-    // add outer taps if we have high edge variance
-    int8_t filter = signed_char_clamp(ps1 - qs1) & hev;
-
-    // inner taps
-    filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask;
-
-    filter1 = signed_char_clamp(filter + 4) >> 3;
-    filter2 = signed_char_clamp(filter + 3) >> 3;
-
-    *oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80;
-    *op0 = signed_char_clamp(ps0 + filter2) ^ 0x80;
-    filter = filter1;
-
-    // outer tap adjustments
-    filter += 1;
-    filter >>= 1;
-    filter &= ~hev;
-
-    *oq1 = signed_char_clamp(qs1 - filter) ^ 0x80;
-    *op1 = signed_char_clamp(ps1 + filter) ^ 0x80;
+    filter(mask, hev, op1,  op0, oq0, oq1);
   }
 }
 
@@ -208,28 +167,23 @@
                                          const uint8_t *limit,
                                          const uint8_t *thresh,
                                          int count) {
-  int i = 0;
+  int i;
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
-  do {
-    const int8_t mask = filter_mask(limit[0], blimit[0],
-                                    s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
-                                    s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p]);
+  for (i = 0; i < 8 * count; ++i) {
+    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
+    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
 
-    const int8_t hev = hevmask(thresh[0],
-                               s[-2 * p], s[-1 * p], s[0 * p], s[1 * p]);
-
-    const int8_t flat = flatmask4(1,
-                                  s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
-                                  s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p]);
+    const int8_t mask = filter_mask(*limit, *blimit,
+                                    p3, p2, p1, p0, q0, q1, q2, q3);
+    const int8_t hev = hevmask(*thresh, p1, p0, q0, q1);
+    const int8_t flat = flatmask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
     mbfilter(mask, hev, flat,
              s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
              s,         s + 1 * p, s + 2 * p, s + 3 * p);
-
     ++s;
-  } while (++i < count * 8);
-
+  }
 }
 
 void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch,
@@ -237,72 +191,19 @@
                                        const uint8_t *limit,
                                        const uint8_t *thresh,
                                        int count) {
-  int i = 0;
+  int i;
 
-  do {
-    const int8_t mask = filter_mask(limit[0], blimit[0],
-                                    s[-4], s[-3], s[-2], s[-1],
-                                    s[0],  s[1],  s[2],  s[3]);
-
-    const int8_t hev = hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
-    const int8_t flat = flatmask4(1, s[-4], s[-3], s[-2], s[-1],
-                                     s[ 0], s[ 1], s[ 2], s[ 3]);
+  for (i = 0; i < 8 * count; ++i) {
+    const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+    const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
+    const int8_t mask = filter_mask(*limit, *blimit,
+                                    p3, p2, p1, p0, q0, q1, q2, q3);
+    const int8_t hev = hevmask(thresh[0], p1, p0, q0, q1);
+    const int8_t flat = flatmask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
     mbfilter(mask, hev, flat, s - 4, s - 3, s - 2, s - 1,
                               s,     s + 1, s + 2, s + 3);
     s += pitch;
-  } while (++i < count * 8);
-
-}
-
-// should we apply any filter at all: 11111111 yes, 00000000 no
-static INLINE int8_t simple_filter_mask(uint8_t blimit,
-                                        uint8_t p1, uint8_t p0,
-                                        uint8_t q0, uint8_t q1) {
-  return (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  <= blimit) * -1;
-}
-
-static INLINE void simple_filter(int8_t mask,
-                                 uint8_t *op1, uint8_t *op0,
-                                 uint8_t *oq0, uint8_t *oq1) {
-  int8_t filter1, filter2;
-  const int8_t p1 = (int8_t) *op1 ^ 0x80;
-  const int8_t p0 = (int8_t) *op0 ^ 0x80;
-  const int8_t q0 = (int8_t) *oq0 ^ 0x80;
-  const int8_t q1 = (int8_t) *oq1 ^ 0x80;
-
-  int8_t filter = signed_char_clamp(p1 - q1);
-  filter = signed_char_clamp(filter + 3 * (q0 - p0));
-  filter &= mask;
-
-  // save bottom 3 bits so that we round one side +4 and the other +3
-  filter1 = signed_char_clamp(filter + 4) >> 3;
-  *oq0  = signed_char_clamp(q0 - filter1) ^ 0x80;
-
-  filter2 = signed_char_clamp(filter + 3) >> 3;
-  *op0 = signed_char_clamp(p0 + filter2) ^ 0x80;
-}
-
-void vp9_loop_filter_simple_horizontal_edge_c(uint8_t *s, int p,
-                                              const uint8_t *blimit) {
-  int i = 0;
-
-  do {
-    const int8_t mask = simple_filter_mask(blimit[0], s[-2 * p], s[-1 * p],
-                                                      s[0 * p],  s[1 * p]);
-    simple_filter(mask, s - 2 * p, s - 1 * p, s, s + 1 * p);
-    ++s;
-  } while (++i < 16);
-}
-
-void vp9_loop_filter_simple_vertical_edge_c(uint8_t *s, int p,
-                                            const uint8_t *blimit) {
-  int i = 0;
-
-  do {
-    const int8_t mask = simple_filter_mask(blimit[0], s[-2], s[-1], s[0], s[1]);
-    simple_filter(mask, s - 2, s - 1, s, s + 1);
-    s += p;
-  } while (++i < 16);
+  }
 }
 
 /* Vertical MB Filtering */
@@ -392,11 +293,6 @@
                                       lfi->blim, lfi->lim, lfi->hev_thr, 1);
 }
 
-void vp9_loop_filter_bhs_c(uint8_t *y, int y_stride, const uint8_t *blimit) {
-  vp9_loop_filter_simple_horizontal_edge_c(y + 4 * y_stride, y_stride, blimit);
-  vp9_loop_filter_simple_horizontal_edge_c(y + 8 * y_stride, y_stride, blimit);
-  vp9_loop_filter_simple_horizontal_edge_c(y + 12 * y_stride, y_stride, blimit);
-}
 
 void vp9_loop_filter_bv8x8_c(uint8_t *y, uint8_t *u, uint8_t *v,
                              int y_stride, int uv_stride,
@@ -413,12 +309,6 @@
                                     lfi->blim, lfi->lim, lfi->hev_thr, 1);
 }
 
-void vp9_loop_filter_bvs_c(uint8_t *y, int y_stride, const uint8_t *blimit) {
-  vp9_loop_filter_simple_vertical_edge_c(y + 4, y_stride, blimit);
-  vp9_loop_filter_simple_vertical_edge_c(y + 8, y_stride, blimit);
-  vp9_loop_filter_simple_vertical_edge_c(y + 12, y_stride, blimit);
-}
-
 static INLINE void wide_mbfilter(int8_t mask, uint8_t hev,
                                  uint8_t flat, uint8_t flat2,
                                  uint8_t *op7, uint8_t *op6, uint8_t *op5,
@@ -429,22 +319,11 @@
                                  uint8_t *oq7) {
   // use a 15 tap filter [1,1,1,1,1,1,1,2,1,1,1,1,1,1,1] for flat line
   if (flat2 && flat && mask) {
-    const uint8_t p7 = *op7;
-    const uint8_t p6 = *op6;
-    const uint8_t p5 = *op5;
-    const uint8_t p4 = *op4;
-    const uint8_t p3 = *op3;
-    const uint8_t p2 = *op2;
-    const uint8_t p1 = *op1;
-    const uint8_t p0 = *op0;
-    const uint8_t q0 = *oq0;
-    const uint8_t q1 = *oq1;
-    const uint8_t q2 = *oq2;
-    const uint8_t q3 = *oq3;
-    const uint8_t q4 = *oq4;
-    const uint8_t q5 = *oq5;
-    const uint8_t q6 = *oq6;
-    const uint8_t q7 = *oq7;
+    const uint8_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4,
+                  p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
+
+    const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3,
+                  q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7;
 
     *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 +
                               q0, 4);
@@ -474,49 +353,8 @@
                               q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4);
     *oq6 = ROUND_POWER_OF_TWO(p0 +
                               q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
-  } else if (flat && mask) {
-    const uint8_t p3 = *op3;
-    const uint8_t p2 = *op2;
-    const uint8_t p1 = *op1;
-    const uint8_t p0 = *op0;
-    const uint8_t q0 = *oq0;
-    const uint8_t q1 = *oq1;
-    const uint8_t q2 = *oq2;
-    const uint8_t q3 = *oq3;
-
-    *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3);
-    *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3);
-    *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3);
-    *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3);
-    *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3);
-    *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3);
   } else {
-    int8_t filter1, filter2;
-
-    const int8_t ps1 = (int8_t) * op1 ^ 0x80;
-    const int8_t ps0 = (int8_t) * op0 ^ 0x80;
-    const int8_t qs0 = (int8_t) * oq0 ^ 0x80;
-    const int8_t qs1 = (int8_t) * oq1 ^ 0x80;
-
-    // add outer taps if we have high edge variance
-    int8_t filter = signed_char_clamp(ps1 - qs1) & hev;
-
-    // inner taps
-    filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask;
-    filter1 = signed_char_clamp(filter + 4) >> 3;
-    filter2 = signed_char_clamp(filter + 3) >> 3;
-
-    *oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80;
-    *op0 = signed_char_clamp(ps0 + filter2) ^ 0x80;
-    filter = filter1;
-
-    // outer tap adjustments
-    filter += 1;
-    filter >>= 1;
-    filter &= ~hev;
-
-    *oq1 = signed_char_clamp(qs1 - filter) ^ 0x80;
-    *op1 = signed_char_clamp(ps1 + filter) ^ 0x80;
+    mbfilter(mask, hev, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
   }
 }
 
@@ -525,25 +363,20 @@
                                  const uint8_t *limit,
                                  const uint8_t *thresh,
                                  int count) {
-  int i = 0;
+  int i;
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
-  do {
-    const int8_t mask = filter_mask(limit[0], blimit[0],
-                                    s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
-                                    s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p]);
-
-    const int8_t hev = hevmask(thresh[0],
-                               s[-2 * p], s[-1 * p], s[0 * p], s[1 * p]);
-
-    const int8_t flat = flatmask4(1,
-                                  s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
-                                  s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p]);
-
+  for (i = 0; i < 8 * count; ++i) {
+    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
+    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
+    const int8_t mask = filter_mask(*limit, *blimit,
+                                    p3, p2, p1, p0, q0, q1, q2, q3);
+    const int8_t hev = hevmask(*thresh, p1, p0, q0, q1);
+    const int8_t flat = flatmask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
     const int8_t flat2 = flatmask5(1,
-                         s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], s[-1 * p],
-                         s[ 0 * p], s[ 4 * p], s[ 5 * p], s[ 6 * p], s[ 7 * p]);
+                             s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0,
+                             q0, s[4 * p], s[5 * p], s[6 * p], s[7 * p]);
 
     wide_mbfilter(mask, hev, flat, flat2,
                   s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p,
@@ -552,33 +385,31 @@
                   s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p);
 
     ++s;
-  } while (++i < count * 8);
+  }
 }
+
 void vp9_mb_lpf_vertical_edge_w(uint8_t *s, int p,
                                 const uint8_t *blimit,
                                 const uint8_t *limit,
                                 const uint8_t *thresh,
                                 int count) {
-  int i = 0;
+  int i;
 
-  do {
-    const int8_t mask = filter_mask(limit[0], blimit[0],
-                                    s[-4], s[-3], s[-2], s[-1],
-                                    s[0],  s[1],  s[2],  s[3]);
-
-    const int8_t hev = hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
-    const int8_t flat = flatmask4(1, s[-4], s[-3], s[-2], s[-1],
-                                     s[ 0], s[ 1], s[ 2], s[ 3]);
-    const int8_t flat2 = flatmask5(1, s[-8], s[-7], s[-6], s[-5], s[-1],
-                                      s[ 0], s[ 4], s[ 5], s[ 6], s[ 7]);
+  for (i = 0; i < 8 * count; ++i) {
+    const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+    const uint8_t q0 = s[0], q1 = s[1],  q2 = s[2], q3 = s[3];
+    const int8_t mask = filter_mask(*limit, *blimit,
+                                    p3, p2, p1, p0, q0, q1, q2, q3);
+    const int8_t hev = hevmask(*thresh, p1, p0, q0, q1);
+    const int8_t flat = flatmask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
+    const int8_t flat2 = flatmask5(1, s[-8], s[-7], s[-6], s[-5], p0,
+                                   q0, s[4], s[5], s[6], s[7]);
 
     wide_mbfilter(mask, hev, flat, flat2,
-                  s - 8, s - 7, s - 6, s - 5,
-                  s - 4, s - 3, s - 2, s - 1,
-                  s,     s + 1, s + 2, s + 3,
-                  s + 4, s + 5, s + 6, s + 7);
+                  s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1,
+                  s,     s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7);
     s += p;
-  } while (++i < count * 8);
+  }
 }
 
 void vp9_lpf_mbv_w_c(uint8_t *y, uint8_t *u, uint8_t *v,
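
The rewritten filter loops read the same eight taps (p3..p0 on one side of the edge, q0..q3 on the other) whether the edge is horizontal (step = row pitch) or vertical (step = 1). A toy sketch of that sampling pattern; read_taps and the sample data are made up for illustration:

#include <stdint.h>
#include <stdio.h>

static void read_taps(const uint8_t *s, int step, uint8_t taps[8]) {
  int i;
  /* taps[] ends up holding p3 p2 p1 p0 q0 q1 q2 q3, matching the names used
   * in the refactored filter_mask()/hevmask()/flatmask4() calls. */
  for (i = 0; i < 8; ++i)
    taps[i] = s[(i - 4) * step];
}

int main(void) {
  const uint8_t row[16] = {10, 10, 10, 10, 10, 10, 60, 60,
                           60, 60, 60, 60, 60, 60, 60, 60};
  uint8_t taps[8];
  int i;
  read_taps(row + 6, 1, taps);  /* vertical edge between columns 5 and 6 */
  for (i = 0; i < 8; ++i)
    printf("%d ", taps[i]);     /* 10 10 10 10 60 60 60 60 */
  printf("\n");
  return 0;
}
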
diff --git a/vp9/common/vp9_mbpitch.c b/vp9/common/vp9_mbpitch.c
index 8c05a34..d9f892b 100644
--- a/vp9/common/vp9_mbpitch.c
+++ b/vp9/common/vp9_mbpitch.c
@@ -11,12 +11,13 @@
 
 #include "vp9/common/vp9_blockd.h"
 
-void vp9_setup_block_dptrs(MACROBLOCKD *mb) {
+void vp9_setup_block_dptrs(MACROBLOCKD *mb,
+                           int subsampling_x, int subsampling_y) {
   int i;
 
   for (i = 0; i < MAX_MB_PLANE; i++) {
     mb->plane[i].plane_type = i ? PLANE_TYPE_UV : PLANE_TYPE_Y_WITH_DC;
-    mb->plane[i].subsampling_x = !!i;
-    mb->plane[i].subsampling_y = !!i;
+    mb->plane[i].subsampling_x = i ? subsampling_x : 0;
+    mb->plane[i].subsampling_y = i ? subsampling_y : 0;
   }
 }
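
vp9_setup_block_dptrs() now receives the chroma subsampling explicitly instead of hard-coding 4:2:0, so plane 0 stays unsubsampled and the chroma planes take whatever factors the caller passes (1,1 for 4:2:0; 0,0 for 4:4:4; 1,0 for 4:2:2). A small sketch of how such factors map frame size to plane size; plane_dims is a hypothetical helper, not from the patch:

#include <stdio.h>

static void plane_dims(int w, int h, int ss_x, int ss_y, int *pw, int *ph) {
  /* Round up so odd frame dimensions still cover the whole plane. */
  *pw = (w + ss_x) >> ss_x;
  *ph = (h + ss_y) >> ss_y;
}

int main(void) {
  int pw, ph;
  plane_dims(176, 144, 1, 1, &pw, &ph);     /* chroma plane of a 4:2:0 QCIF frame */
  printf("chroma plane: %dx%d\n", pw, ph);  /* 88x72 */
  return 0;
}
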
diff --git a/vp9/common/vp9_mvref_common.c b/vp9/common/vp9_mvref_common.c
index 7a7ebe6..3f18c69 100644
--- a/vp9/common/vp9_mvref_common.c
+++ b/vp9/common/vp9_mvref_common.c
@@ -12,7 +12,6 @@
 
 #define MVREF_NEIGHBOURS 8
 
-#if CONFIG_SB8X8
 static int b_mv_ref_search[MVREF_NEIGHBOURS][2] = {
   {0, -1}, {-1, 0}, {-1, -1}, {0, -2},
   {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2}
@@ -32,22 +31,6 @@
     {0, -1}, {-1, 0}, {2, -1}, {-1,  2},
     {4, -1}, {-1, 4}, {6, -1}, {-1, -1}
 };
-#else
-static int mb_mv_ref_search[MVREF_NEIGHBOURS][2] = {
-  {0, -1}, {-1, 0}, {-1, -1}, {0, -2},
-  {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2}
-};
-
-static int sb_mv_ref_search[MVREF_NEIGHBOURS][2] = {
-  {0, -1}, {-1, 0}, {1, -1}, {-1, 1},
-  {-1, -1}, {0, -2}, {-2, 0}, {-1, -2}
-};
-
-static int sb64_mv_ref_search[MVREF_NEIGHBOURS][2] = {
-  {0, -1}, {-1, 0}, {1, -1}, {-1,  1},
-  {2, -1}, {-1, 2}, {3, -1}, {-1, -1}
-};
-#endif
 
 // clamp_mv_ref
 #define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units
@@ -190,15 +173,10 @@
     mv_ref_search = sb64_mv_ref_search;
   } else if (mbmi->sb_type >= BLOCK_SIZE_SB32X32) {
     mv_ref_search = sb_mv_ref_search;
-#if CONFIG_SB8X8
   } else if (mbmi->sb_type >= BLOCK_SIZE_MB16X16) {
     mv_ref_search = mb_mv_ref_search;
   } else {
     mv_ref_search = b_mv_ref_search;
-#else
-  } else {
-    mv_ref_search = mb_mv_ref_search;
-#endif
   }
 
   // We first scan for candidate vectors that match the current reference frame
@@ -208,7 +186,7 @@
 
     if ((mi_search_col >= cm->cur_tile_mi_col_start) &&
         (mi_search_col < cm->cur_tile_mi_col_end) &&
-        ((mv_ref_search[i][1] << (7 - CONFIG_SB8X8)) >= xd->mb_to_top_edge)) {
+        ((mv_ref_search[i][1] << 6) >= xd->mb_to_top_edge)) {
 
       candidate_mi = here + mv_ref_search[i][0] +
                      (mv_ref_search[i][1] * xd->mode_info_stride);
@@ -228,7 +206,7 @@
 
     if ((mi_search_col >= cm->cur_tile_mi_col_start) &&
         (mi_search_col < cm->cur_tile_mi_col_end) &&
-        ((mv_ref_search[i][1] << (7 - CONFIG_SB8X8)) >= xd->mb_to_top_edge)) {
+        ((mv_ref_search[i][1] << 6) >= xd->mb_to_top_edge)) {
       candidate_mi = here + mv_ref_search[i][0] +
                      (mv_ref_search[i][1] * xd->mode_info_stride);
 
@@ -258,7 +236,7 @@
 
     if ((mi_search_col >= cm->cur_tile_mi_col_start) &&
         (mi_search_col < cm->cur_tile_mi_col_end) &&
-        ((mv_ref_search[i][1] << (7 - CONFIG_SB8X8)) >= xd->mb_to_top_edge)) {
+        ((mv_ref_search[i][1] << 6) >= xd->mb_to_top_edge)) {
       candidate_mi = here + mv_ref_search[i][0] +
                      (mv_ref_search[i][1] * xd->mode_info_stride);
 
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index 1538a00..2d4cd30 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -55,25 +55,13 @@
   vp9_prob ymode_prob[VP9_YMODES - 1]; /* interframe intra mode probs */
   vp9_prob sb_ymode_prob[VP9_I32X32_MODES - 1];
   vp9_prob uv_mode_prob[VP9_YMODES][VP9_UV_MODES - 1];
-#if !CONFIG_SB8X8
-  vp9_prob i8x8_mode_prob[VP9_I8X8_MODES - 1];
-#endif
   vp9_prob sub_mv_ref_prob[SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
-#if !CONFIG_SB8X8
-  vp9_prob mbsplit_prob[VP9_NUMMBSPLITS - 1];
-#endif
   vp9_prob partition_prob[NUM_PARTITION_CONTEXTS][PARTITION_TYPES - 1];
 
   vp9_coeff_probs coef_probs_4x4[BLOCK_TYPES];
   vp9_coeff_probs coef_probs_8x8[BLOCK_TYPES];
   vp9_coeff_probs coef_probs_16x16[BLOCK_TYPES];
   vp9_coeff_probs coef_probs_32x32[BLOCK_TYPES];
-#if CONFIG_CODE_ZEROGROUP
-  vp9_zpc_probs zpc_probs_4x4;
-  vp9_zpc_probs zpc_probs_8x8;
-  vp9_zpc_probs zpc_probs_16x16;
-  vp9_zpc_probs zpc_probs_32x32;
-#endif
 
   nmv_context nmvc;
   nmv_context pre_nmvc;
@@ -81,37 +69,19 @@
   vp9_prob pre_ymode_prob[VP9_YMODES - 1]; /* interframe intra mode probs */
   vp9_prob pre_sb_ymode_prob[VP9_I32X32_MODES - 1];
   vp9_prob pre_uv_mode_prob[VP9_YMODES][VP9_UV_MODES - 1];
-#if !CONFIG_SB8X8
-  vp9_prob pre_i8x8_mode_prob[VP9_I8X8_MODES - 1];
-#endif
   vp9_prob pre_sub_mv_ref_prob[SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
-#if !CONFIG_SB8X8
-  vp9_prob pre_mbsplit_prob[VP9_NUMMBSPLITS - 1];
-#endif
   vp9_prob pre_partition_prob[NUM_PARTITION_CONTEXTS][PARTITION_TYPES - 1];
   unsigned int bmode_counts[VP9_NKF_BINTRAMODES];
   unsigned int ymode_counts[VP9_YMODES];   /* interframe intra mode probs */
   unsigned int sb_ymode_counts[VP9_I32X32_MODES];
   unsigned int uv_mode_counts[VP9_YMODES][VP9_UV_MODES];
-#if !CONFIG_SB8X8
-  unsigned int i8x8_mode_counts[VP9_I8X8_MODES];   /* interframe intra probs */
-#endif
   unsigned int sub_mv_ref_counts[SUBMVREF_COUNT][VP9_SUBMVREFS];
-#if !CONFIG_SB8X8
-  unsigned int mbsplit_counts[VP9_NUMMBSPLITS];
-#endif
   unsigned int partition_counts[NUM_PARTITION_CONTEXTS][PARTITION_TYPES];
 
   vp9_coeff_probs pre_coef_probs_4x4[BLOCK_TYPES];
   vp9_coeff_probs pre_coef_probs_8x8[BLOCK_TYPES];
   vp9_coeff_probs pre_coef_probs_16x16[BLOCK_TYPES];
   vp9_coeff_probs pre_coef_probs_32x32[BLOCK_TYPES];
-#if CONFIG_CODE_ZEROGROUP
-  vp9_zpc_probs pre_zpc_probs_4x4;
-  vp9_zpc_probs pre_zpc_probs_8x8;
-  vp9_zpc_probs pre_zpc_probs_16x16;
-  vp9_zpc_probs pre_zpc_probs_32x32;
-#endif
 
   vp9_coeff_count coef_counts_4x4[BLOCK_TYPES];
   vp9_coeff_count coef_counts_8x8[BLOCK_TYPES];
@@ -120,21 +90,9 @@
   unsigned int eob_branch_counts[TX_SIZE_MAX_SB][BLOCK_TYPES][REF_TYPES]
                                 [COEF_BANDS][PREV_COEF_CONTEXTS];
 
-#if CONFIG_CODE_ZEROGROUP
-  vp9_zpc_count zpc_counts_4x4;
-  vp9_zpc_count zpc_counts_8x8;
-  vp9_zpc_count zpc_counts_16x16;
-  vp9_zpc_count zpc_counts_32x32;
-#endif
-
   nmv_context_counts NMVcount;
   vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
                                  [VP9_SWITCHABLE_FILTERS - 1];
-#if CONFIG_COMP_INTERINTRA_PRED
-  unsigned int interintra_counts[2];
-  vp9_prob interintra_prob;
-  vp9_prob pre_interintra_prob;
-#endif
 
   int vp9_mode_contexts[INTER_MODE_CONTEXTS][4];
   unsigned int mv_ref_ct[INTER_MODE_CONTEXTS][4][2];
@@ -174,6 +132,12 @@
   int last_width;
   int last_height;
 
+  // TODO(jkoleszar): this implies chroma ss right now, but could vary per
+  // plane. Revisit as part of the future change to YV12_BUFFER_CONFIG to
+  // support additional planes.
+  int subsampling_x;
+  int subsampling_y;
+
   YUV_TYPE clr_type;
   CLAMP_TYPE  clamp_type;
 
@@ -204,8 +168,7 @@
 
   int frame_flags;
   // MBs, mb_rows/cols is in 16-pixel units; mi_rows/cols is in
-  // MODE_INFO units (depending on CONFIG_SB8X8, that is either
-  // 16-pixel or 8-pixel)
+  // MODE_INFO (8-pixel) units.
   int MBs;
   int mb_rows, mi_rows;
   int mb_cols, mi_cols;
@@ -241,7 +204,6 @@
   unsigned char *last_frame_seg_map;
 
   INTERPOLATIONFILTERTYPE mcomp_filter_type;
-  LOOPFILTER_TYPE filter_type;
 
   loop_filter_info_n lf_info;
 
@@ -260,7 +222,7 @@
 
   // partition contexts
   PARTITION_CONTEXT *above_seg_context;
-  PARTITION_CONTEXT left_seg_context[4];
+  PARTITION_CONTEXT left_seg_context[8];
 
   /* keyframe block modes are predicted by their above, left neighbors */
 
@@ -311,10 +273,6 @@
   struct postproc_state  postproc_state;
 #endif
 
-#if CONFIG_COMP_INTERINTRA_PRED
-  int use_interintra;
-#endif
-
   int error_resilient_mode;
   int frame_parallel_decoding_mode;
 
@@ -344,8 +302,15 @@
   buf[new_idx]++;
 }
 
-static int mb_cols_aligned_to_sb(VP9_COMMON *cm) {
-  return (cm->mb_cols + 3) & ~3;
+static int mi_cols_aligned_to_sb(VP9_COMMON *cm) {
+  return 2 * ((cm->mb_cols + 3) & ~3);
+}
+
+static INLINE void set_partition_seg_context(VP9_COMMON *cm,
+                                             MACROBLOCKD *xd,
+                                             int mi_row, int mi_col) {
+  xd->above_seg_context = cm->above_seg_context + mi_col;
+  xd->left_seg_context  = cm->left_seg_context + (mi_row & MI_MASK);
 }
 
 static void set_mi_row_col(VP9_COMMON *cm, MACROBLOCKD *xd,
@@ -371,10 +336,6 @@
 }
 
 static int get_token_alloc(int mb_rows, int mb_cols) {
-#if CONFIG_CODE_ZEROGROUP
-  return mb_rows * mb_cols * (24 * 16 * 2);
-#else
-  return mb_rows * mb_cols * (24 * 16 + 4);
-#endif
+  return mb_rows * mb_cols * (48 * 16 + 4);
 }
 #endif  // VP9_COMMON_VP9_ONYXC_INT_H_
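
The renamed helper returns the column count in MODE_INFO units rather than macroblocks: mb_cols is rounded up to a multiple of 4 (a 64-pixel superblock is four 16-pixel MBs wide) and then doubled into 8-pixel MI columns. A quick standalone sketch of the arithmetic:

#include <stdio.h>

static int mi_cols_aligned_to_sb(int mb_cols) {
  return 2 * ((mb_cols + 3) & ~3);  /* round up to whole SB64s, in MI units */
}

int main(void) {
  int mb_cols;
  for (mb_cols = 9; mb_cols <= 13; ++mb_cols)
    printf("mb_cols %2d -> aligned mi_cols %d\n",
           mb_cols, mi_cols_aligned_to_sb(mb_cols));
  /* 9-12 MB columns all align to 24 MI columns; 13 steps up to 32. */
  return 0;
}
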
diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c
index f93b74c..f81690a 100644
--- a/vp9/common/vp9_postproc.c
+++ b/vp9/common/vp9_postproc.c
@@ -631,13 +631,7 @@
 
   if (!flags) {
     *dest = *oci->frame_to_show;
-
-    /* handle problem with extending borders */
-    dest->y_width = oci->width;
-    dest->y_height = oci->height;
-    dest->uv_height = dest->y_height / 2;
     return 0;
-
   }
 
 #if ARCH_X86||ARCH_X86_64
diff --git a/vp9/common/vp9_quant_common.c b/vp9/common/vp9_quant_common.c
index 5907b4f..295c8e7 100644
--- a/vp9/common/vp9_quant_common.c
+++ b/vp9/common/vp9_quant_common.c
@@ -15,7 +15,7 @@
 static int16_t dc_qlookup[QINDEX_RANGE];
 static int16_t ac_qlookup[QINDEX_RANGE];
 
-#define ACDC_MIN 4
+#define ACDC_MIN 8
 
 // TODO(dkovalev) move to common and reuse
 static double poly3(double a, double b, double c, double d, double x) {
@@ -25,10 +25,19 @@
 void vp9_init_quant_tables() {
   int i, val = 4;
 
-  for (i = 0; i < QINDEX_RANGE; i++) {
+  // A "real" q of 1.0 forces lossless mode.
+  // In practice non-lossless Q's between 1.0 and 2.0 (represented here by
+  // integer values 5-7) give poor rd results (lower psnr and often larger
+  // size than the lossless encode). To block out those "not very useful"
+  // values we increment the ac and dc q lookup values by 4 after position 0.
+  ac_qlookup[0] = val;
+  dc_qlookup[0] = val;
+  val += 4;
+
+  for (i = 1; i < QINDEX_RANGE; i++) {
     const int ac_val = val;
 
-    val = (int)(val * 1.02);
+    val = (int)(val * 1.01975);
     if (val == ac_val)
       ++val;
 
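
The comment above describes why quantizer indices corresponding to real Q values between 1.0 and 2.0 are skipped: index 0 keeps the lossless value 4, the table then jumps to 8 (matching the new ACDC_MIN), and subsequent entries grow by about 1.975% with a +1 floor. A standalone sketch of the resulting ramp; the assignment into ac_qlookup/dc_qlookup is elided:

#include <stdio.h>

int main(void) {
  int i, val = 4;
  printf("q[0] = %d\n", val);  /* lossless entry */
  val += 4;                    /* skip the near-lossless values 5-7 */
  for (i = 1; i < 10; ++i) {
    const int prev = val;
    printf("q[%d] = %d\n", i, val);
    val = (int)(val * 1.01975);
    if (val == prev)           /* keep the table strictly increasing */
      ++val;
  }
  return 0;
}
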
diff --git a/vp9/common/vp9_recon.c b/vp9/common/vp9_recon.c
index 6b102d1..69a4720 100644
--- a/vp9/common/vp9_recon.c
+++ b/vp9/common/vp9_recon.c
@@ -34,26 +34,6 @@
   recon(4, 4, diff_ptr, diff_stride, dst_ptr, stride);
 }
 
-#if !CONFIG_SB8X8
-void vp9_recon_uv_b_c(uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr,
-                      int stride) {
-  assert(pred_ptr == dst_ptr);
-  recon(4, 4, diff_ptr, 8, dst_ptr, stride);
-}
-
-void vp9_recon4b_c(uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr,
-                   int stride) {
-  assert(pred_ptr == dst_ptr);
-  recon(4, 16, diff_ptr, 16, dst_ptr, stride);
-}
-
-void vp9_recon2b_c(uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr,
-                   int stride) {
-  assert(pred_ptr == dst_ptr);
-  recon(4, 8, diff_ptr, 8, dst_ptr, stride);
-}
-#endif
-
 static void recon_plane(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize, int plane) {
   const int bw = 4 << (b_width_log2(bsize) - xd->plane[plane].subsampling_x);
   const int bh = 4 << (b_height_log2(bsize) - xd->plane[plane].subsampling_y);
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index 0420063..3668fcd 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -18,11 +18,8 @@
 #include "vp9/common/vp9_reconintra.h"
 
 void vp9_setup_scale_factors_for_frame(struct scale_factors *scale,
-                                       YV12_BUFFER_CONFIG *other,
+                                       int other_w, int other_h,
                                        int this_w, int this_h) {
-  int other_h = other->y_crop_height;
-  int other_w = other->y_crop_width;
-
   scale->x_num = other_w;
   scale->x_den = this_w;
   scale->x_offset_q4 = 0;  // calculated per-mb
@@ -125,11 +122,6 @@
     case BILINEAR:
       xd->subpix.filter_x = xd->subpix.filter_y = vp9_bilinear_filters;
       break;
-#if CONFIG_ENABLE_6TAP
-    case SIXTAP:
-      xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_6;
-      break;
-#endif
   }
   assert(((intptr_t)xd->subpix.filter_x & 0xff) == 0);
 }
@@ -265,27 +257,19 @@
   return (value < 0 ? value - 2 : value + 2) / 4;
 }
 
-#if CONFIG_SB8X8
-#define IDX1 2
-#define IDX2 3
-#else
-#define IDX1 4
-#define IDX2 5
-#endif
-
-static int mi_mv_pred_row_q4(MACROBLOCKD *mb, int off, int idx) {
-  const int temp = mb->mode_info_context->bmi[off + 0].as_mv[idx].as_mv.row +
-                   mb->mode_info_context->bmi[off + 1].as_mv[idx].as_mv.row +
-                   mb->mode_info_context->bmi[off + IDX1].as_mv[idx].as_mv.row +
-                   mb->mode_info_context->bmi[off + IDX2].as_mv[idx].as_mv.row;
+static int mi_mv_pred_row_q4(MACROBLOCKD *mb, int idx) {
+  const int temp = mb->mode_info_context->bmi[0].as_mv[idx].as_mv.row +
+                   mb->mode_info_context->bmi[1].as_mv[idx].as_mv.row +
+                   mb->mode_info_context->bmi[2].as_mv[idx].as_mv.row +
+                   mb->mode_info_context->bmi[3].as_mv[idx].as_mv.row;
   return round_mv_comp_q4(temp);
 }
 
-static int mi_mv_pred_col_q4(MACROBLOCKD *mb, int off, int idx) {
-  const int temp = mb->mode_info_context->bmi[off + 0].as_mv[idx].as_mv.col +
-                   mb->mode_info_context->bmi[off + 1].as_mv[idx].as_mv.col +
-                   mb->mode_info_context->bmi[off + IDX1].as_mv[idx].as_mv.col +
-                   mb->mode_info_context->bmi[off + IDX2].as_mv[idx].as_mv.col;
+static int mi_mv_pred_col_q4(MACROBLOCKD *mb, int idx) {
+  const int temp = mb->mode_info_context->bmi[0].as_mv[idx].as_mv.col +
+                   mb->mode_info_context->bmi[1].as_mv[idx].as_mv.col +
+                   mb->mode_info_context->bmi[2].as_mv[idx].as_mv.col +
+                   mb->mode_info_context->bmi[3].as_mv[idx].as_mv.col;
   return round_mv_comp_q4(temp);
 }
 
@@ -364,9 +348,12 @@
       if (plane == 0) {
         mv = &xd->mode_info_context->bmi[block].as_mv[which_mv].as_mv;
       } else {
-        const int y_block = (block & 2) * 4 + (block & 1) * 2;
-        split_chroma_mv.row = mi_mv_pred_row_q4(xd, y_block, which_mv);
-        split_chroma_mv.col = mi_mv_pred_col_q4(xd, y_block, which_mv);
+        // TODO(jkoleszar): All chroma MVs in SPLITMV mode are taken as the
+        // same MV (the average of the 4 luma MVs) but we could do something
+        // smarter for non-4:2:0. Just punt for now, pending the changes to get
+        // rid of SPLITMV mode entirely.
+        split_chroma_mv.row = mi_mv_pred_row_q4(xd, which_mv);
+        split_chroma_mv.col = mi_mv_pred_col_q4(xd, which_mv);
         mv = &split_chroma_mv;
       }
     } else {
@@ -426,23 +413,9 @@
 void vp9_build_inter_predictors_sb(MACROBLOCKD *xd,
                                    int mi_row, int mi_col,
                                    BLOCK_SIZE_TYPE bsize) {
-#if CONFIG_COMP_INTERINTRA_PRED
-  uint8_t *const y = xd->plane[0].dst.buf;
-  uint8_t *const u = xd->plane[1].dst.buf;
-  uint8_t *const v = xd->plane[2].dst.buf;
-  const int y_stride = xd->plane[0].dst.stride;
-  const int uv_stride = xd->plane[1].dst.stride;
-#endif
 
   vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
   vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, bsize);
-
-#if CONFIG_COMP_INTERINTRA_PRED
-  if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME)
-    vp9_build_interintra_predictors(xd, y, u, v,
-                                    y_stride, uv_stride,
-                                    bsize);
-#endif
 }
 
 /*encoder only*/
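
As the TODO above notes, the chroma motion vector for a SPLITMV block is now simply the average of the four luma sub-block vectors, rounded with the same divide-by-four-away-from-zero used by round_mv_comp_q4(). A small standalone sketch with made-up components:

#include <stdio.h>

static int round_mv_comp_q4(int value) {
  /* Divide the sum of four components by 4, rounding away from zero. */
  return (value < 0 ? value - 2 : value + 2) / 4;
}

int main(void) {
  const int rows[4] = {3, 5, -2, 6};  /* hypothetical luma MV row components */
  int sum = 0, i;
  for (i = 0; i < 4; ++i)
    sum += rows[i];
  printf("chroma row component: %d\n", round_mv_comp_q4(sum));  /* 12 -> 3 */
  return 0;
}
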
diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h
index d2e7455..faf018c 100644
--- a/vp9/common/vp9_reconinter.h
+++ b/vp9/common/vp9_reconinter.h
@@ -35,7 +35,7 @@
                               VP9_COMMON *cm);
 
 void vp9_setup_scale_factors_for_frame(struct scale_factors *scale,
-                                       YV12_BUFFER_CONFIG *other,
+                                       int other_w, int other_h,
                                        int this_w, int this_h);
 
 void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
@@ -67,14 +67,11 @@
   return val;
 }
 
-static int scaled_buffer_offset(int x_offset,
-                                int y_offset,
-                                int stride,
+static int scaled_buffer_offset(int x_offset, int y_offset, int stride,
                                 const struct scale_factors *scale) {
-  if (scale)
-    return scale->scale_value_y(y_offset, scale) * stride +
-        scale->scale_value_x(x_offset, scale);
-  return y_offset * stride + x_offset;
+  const int x = scale ? scale->scale_value_x(x_offset, scale) : x_offset;
+  const int y = scale ? scale->scale_value_y(y_offset, scale) : y_offset;
+  return y * stride + x;
 }
 
 static void setup_pred_plane(struct buf_2d *dst,
@@ -92,18 +89,15 @@
 static void setup_dst_planes(MACROBLOCKD *xd,
                              const YV12_BUFFER_CONFIG *src,
                              int mi_row, int mi_col) {
-  setup_pred_plane(&xd->plane[0].dst,
-                   src->y_buffer, src->y_stride,
-                   mi_row, mi_col, NULL,
-                   xd->plane[0].subsampling_x, xd->plane[0].subsampling_y);
-  setup_pred_plane(&xd->plane[1].dst,
-                   src->u_buffer, src->uv_stride,
-                   mi_row, mi_col, NULL,
-                   xd->plane[1].subsampling_x, xd->plane[1].subsampling_y);
-  setup_pred_plane(&xd->plane[2].dst,
-                   src->v_buffer, src->uv_stride,
-                   mi_row, mi_col, NULL,
-                   xd->plane[2].subsampling_x, xd->plane[2].subsampling_y);
+  uint8_t *buffers[3] = {src->y_buffer, src->u_buffer, src->v_buffer};
+  int strides[3] = {src->y_stride, src->uv_stride, src->uv_stride};
+  int i;
+
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    struct macroblockd_plane *pd = &xd->plane[i];
+    setup_pred_plane(&pd->dst, buffers[i], strides[i], mi_row, mi_col, NULL,
+                     pd->subsampling_x, pd->subsampling_y);
+  }
 }
 
 static void setup_pre_planes(MACROBLOCKD *xd,
@@ -112,57 +106,27 @@
                              int mi_row, int mi_col,
                              const struct scale_factors *scale,
                              const struct scale_factors *scale_uv) {
-  int i;
+  const YV12_BUFFER_CONFIG *srcs[2] = {src0, src1};
+  int i, j;
 
-  for (i = 0; i < 2; i++) {
-    const YV12_BUFFER_CONFIG *src = i ? src1 : src0;
+  for (i = 0; i < 2; ++i) {
+    const YV12_BUFFER_CONFIG *src = srcs[i];
+    if (src) {
+      uint8_t* buffers[3] = {src->y_buffer, src->u_buffer, src->v_buffer};
+      int strides[3] = {src->y_stride, src->uv_stride, src->uv_stride};
 
-    if (!src)
-      continue;
-
-    setup_pred_plane(&xd->plane[0].pre[i],
-                     src->y_buffer, src->y_stride,
-                     mi_row, mi_col, scale ? scale + i : NULL,
-                     xd->plane[0].subsampling_x, xd->plane[0].subsampling_y);
-    setup_pred_plane(&xd->plane[1].pre[i],
-                     src->u_buffer, src->uv_stride,
-                     mi_row, mi_col, scale_uv ? scale_uv + i : NULL,
-                     xd->plane[1].subsampling_x, xd->plane[1].subsampling_y);
-    setup_pred_plane(&xd->plane[2].pre[i],
-                     src->v_buffer, src->uv_stride,
-                     mi_row, mi_col, scale_uv ? scale_uv + i : NULL,
-                     xd->plane[2].subsampling_x, xd->plane[2].subsampling_y);
+      for (j = 0; j < MAX_MB_PLANE; ++j) {
+        struct macroblockd_plane *pd = &xd->plane[j];
+        const struct scale_factors *sf = j ? scale_uv : scale;
+        setup_pred_plane(&pd->pre[i],
+                         buffers[j], strides[j],
+                         mi_row, mi_col, sf ? &sf[i] : NULL,
+                         pd->subsampling_x, pd->subsampling_y);
+      }
+    }
   }
 }
 
-static void setup_pred_block(YV12_BUFFER_CONFIG *dst,
-                             const YV12_BUFFER_CONFIG *src,
-                             int mi_row, int mi_col,
-                             const struct scale_factors *scale,
-                             const struct scale_factors *scale_uv) {
-  const int recon_y_stride = src->y_stride;
-  const int recon_uv_stride = src->uv_stride;
-  int recon_yoffset;
-  int recon_uvoffset;
-
-  if (scale) {
-    recon_yoffset = scaled_buffer_offset(MI_SIZE * mi_col, MI_SIZE * mi_row,
-                                         recon_y_stride, scale);
-    recon_uvoffset = scaled_buffer_offset(MI_UV_SIZE * mi_col,
-                                          MI_UV_SIZE * mi_row,
-                                          recon_uv_stride, scale_uv);
-  } else {
-    recon_yoffset = MI_SIZE * mi_row * recon_y_stride + MI_SIZE * mi_col;
-    recon_uvoffset = MI_UV_SIZE * mi_row * recon_uv_stride +
-                     MI_UV_SIZE * mi_col;
-  }
-
-  *dst = *src;
-  dst->y_buffer += recon_yoffset;
-  dst->u_buffer += recon_uvoffset;
-  dst->v_buffer += recon_uvoffset;
-}
-
 static void set_scale_factors(MACROBLOCKD *xd,
     int ref0, int ref1,
     struct scale_factors scale_factor[MAX_REF_FRAMES]) {
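
For reference, the reworked scaled_buffer_offset() above folds the scaled and unscaled paths into one expression. A standalone sketch of the same shape, with a simplified hypothetical scale struct standing in for struct scale_factors (the real struct goes through scale_value_x/y function pointers):

#include <stdio.h>

/* Hypothetical stand-in for struct scale_factors: scales a coordinate by
 * num/den. Illustrative only. */
struct simple_scale { int num, den; };

static int scale_coord(int v, const struct simple_scale *s) {
  return v * s->num / s->den;
}

/* Same shape as the new scaled_buffer_offset(): scale each axis only when a
 * scale is present, then form the usual row-major offset. */
static int buffer_offset(int x, int y, int stride,
                         const struct simple_scale *s) {
  const int sx = s ? scale_coord(x, s) : x;
  const int sy = s ? scale_coord(y, s) : y;
  return sy * stride + sx;
}

int main(void) {
  const struct simple_scale half = { 1, 2 };
  printf("%d\n", buffer_offset(16, 8, 64, NULL));   /* 8 * 64 + 16 = 528 */
  printf("%d\n", buffer_offset(16, 8, 64, &half));  /* 4 * 64 +  8 = 264 */
  return 0;
}
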
diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c
index aef34c9..9e580c7 100644
--- a/vp9/common/vp9_reconintra.c
+++ b/vp9/common/vp9_reconintra.c
@@ -363,186 +363,6 @@
   }
 }
 
-#if CONFIG_COMP_INTERINTRA_PRED
-static void combine_interintra(MB_PREDICTION_MODE mode,
-                               uint8_t *interpred,
-                               int interstride,
-                               uint8_t *intrapred,
-                               int intrastride,
-                               int bw, int bh) {
-  // TODO(debargha): Explore different ways of combining predictors
-  //                 or designing the tables below
-  static const int scale_bits = 8;
-  static const int scale_max = 256;     // 1 << scale_bits;
-  static const int scale_round = 127;   // (1 << (scale_bits - 1));
-  // This table is a function A + B*exp(-kx), where x is hor. index
-  static const int weights1d[64] = {
-    128, 125, 122, 119, 116, 114, 111, 109,
-    107, 105, 103, 101,  99,  97,  96,  94,
-     93,  91,  90,  89,  88,  86,  85,  84,
-     83,  82,  81,  81,  80,  79,  78,  78,
-     77,  76,  76,  75,  75,  74,  74,  73,
-     73,  72,  72,  71,  71,  71,  70,  70,
-     70,  70,  69,  69,  69,  69,  68,  68,
-     68,  68,  68,  67,  67,  67,  67,  67,
-  };
-
-  int size = MAX(bw, bh);
-  int size_scale = (size >= 64 ? 1:
-                    size == 32 ? 2 :
-                    size == 16 ? 4 :
-                    size == 8  ? 8 : 16);
-  int i, j;
-  switch (mode) {
-    case V_PRED:
-      for (i = 0; i < bh; ++i) {
-        for (j = 0; j < bw; ++j) {
-          int k = i * interstride + j;
-          int scale = weights1d[i * size_scale];
-          interpred[k] =
-              ((scale_max - scale) * interpred[k] +
-               scale * intrapred[i * intrastride + j] + scale_round)
-              >> scale_bits;
-        }
-      }
-      break;
-
-    case H_PRED:
-      for (i = 0; i < bh; ++i) {
-        for (j = 0; j < bw; ++j) {
-          int k = i * interstride + j;
-          int scale = weights1d[j * size_scale];
-          interpred[k] =
-              ((scale_max - scale) * interpred[k] +
-               scale * intrapred[i * intrastride + j] + scale_round)
-              >> scale_bits;
-        }
-      }
-      break;
-
-    case D63_PRED:
-    case D117_PRED:
-      for (i = 0; i < bh; ++i) {
-        for (j = 0; j < bw; ++j) {
-          int k = i * interstride + j;
-          int scale = (weights1d[i * size_scale] * 3 +
-                       weights1d[j * size_scale]) >> 2;
-          interpred[k] =
-              ((scale_max - scale) * interpred[k] +
-               scale * intrapred[i * intrastride + j] + scale_round)
-              >> scale_bits;
-        }
-      }
-      break;
-
-    case D27_PRED:
-    case D153_PRED:
-      for (i = 0; i < bh; ++i) {
-        for (j = 0; j < bw; ++j) {
-          int k = i * interstride + j;
-          int scale = (weights1d[j * size_scale] * 3 +
-                       weights1d[i * size_scale]) >> 2;
-          interpred[k] =
-              ((scale_max - scale) * interpred[k] +
-               scale * intrapred[i * intrastride + j] + scale_round)
-              >> scale_bits;
-        }
-      }
-      break;
-
-    case D135_PRED:
-      for (i = 0; i < bh; ++i) {
-        for (j = 0; j < bw; ++j) {
-          int k = i * interstride + j;
-          int scale = weights1d[(i < j ? i : j) * size_scale];
-          interpred[k] =
-              ((scale_max - scale) * interpred[k] +
-               scale * intrapred[i * intrastride + j] + scale_round)
-              >> scale_bits;
-        }
-      }
-      break;
-
-    case D45_PRED:
-      for (i = 0; i < bh; ++i) {
-        for (j = 0; j < bw; ++j) {
-          int k = i * interstride + j;
-          int scale = (weights1d[i * size_scale] +
-                       weights1d[j * size_scale]) >> 1;
-          interpred[k] =
-              ((scale_max - scale) * interpred[k] +
-               scale * intrapred[i * intrastride + j] + scale_round)
-              >> scale_bits;
-        }
-      }
-      break;
-
-    case TM_PRED:
-    case DC_PRED:
-    default:
-      // simple average
-      for (i = 0; i < bh; ++i) {
-        for (j = 0; j < bw; ++j) {
-          int k = i * interstride + j;
-          interpred[k] = (interpred[k] + intrapred[i * intrastride + j]) >> 1;
-        }
-      }
-      break;
-  }
-}
-
-void vp9_build_interintra_predictors(MACROBLOCKD *xd,
-                                              uint8_t *ypred,
-                                              uint8_t *upred,
-                                              uint8_t *vpred,
-                                              int ystride, int uvstride,
-                                              BLOCK_SIZE_TYPE bsize) {
-  vp9_build_interintra_predictors_sby(xd, ypred, ystride, bsize);
-  vp9_build_interintra_predictors_sbuv(xd, upred, vpred, uvstride, bsize);
-}
-
-void vp9_build_interintra_predictors_sby(MACROBLOCKD *xd,
-                                               uint8_t *ypred,
-                                               int ystride,
-                                               BLOCK_SIZE_TYPE bsize) {
-  int bwl = mi_width_log2(bsize),  bw = MI_SIZE << bwl;
-  int bhl = mi_height_log2(bsize), bh = MI_SIZE << bhl;
-  uint8_t intrapredictor[4096];
-  vp9_build_intra_predictors(
-      xd->plane[0].dst.buf, xd->plane[0].dst.stride,
-      intrapredictor, bw,
-      xd->mode_info_context->mbmi.interintra_mode, bw, bh,
-      xd->up_available, xd->left_available, xd->right_available);
-  combine_interintra(xd->mode_info_context->mbmi.interintra_mode,
-                     ypred, ystride, intrapredictor, bw, bw, bh);
-}
-
-void vp9_build_interintra_predictors_sbuv(MACROBLOCKD *xd,
-                                                uint8_t *upred,
-                                                uint8_t *vpred,
-                                                int uvstride,
-                                                BLOCK_SIZE_TYPE bsize) {
-  int bwl = mi_width_log2(bsize),  bw = MI_UV_SIZE << bwl;
-  int bhl = mi_height_log2(bsize), bh = MI_UV_SIZE << bhl;
-  uint8_t uintrapredictor[1024];
-  uint8_t vintrapredictor[1024];
-  vp9_build_intra_predictors(
-      xd->plane[1].dst.buf, xd->plane[1].dst.stride,
-      uintrapredictor, bw,
-      xd->mode_info_context->mbmi.interintra_uv_mode, bw, bh,
-      xd->up_available, xd->left_available, xd->right_available);
-  vp9_build_intra_predictors(
-      xd->plane[2].dst.buf, xd->plane[1].dst.stride,
-      vintrapredictor, bw,
-      xd->mode_info_context->mbmi.interintra_uv_mode, bw, bh,
-      xd->up_available, xd->left_available, xd->right_available);
-  combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode,
-                     upred, uvstride, uintrapredictor, bw, bw, bh);
-  combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode,
-                     vpred, uvstride, vintrapredictor, bw, bw, bh);
-}
-#endif  // CONFIG_COMP_INTERINTRA_PRED
-
 void vp9_build_intra_predictors_sby_s(MACROBLOCKD *xd,
                                       BLOCK_SIZE_TYPE bsize) {
   const int bwl = b_width_log2(bsize),  bw = 4 << bwl;
@@ -573,23 +393,6 @@
                              xd->left_available, 0 /*xd->right_available*/);
 }
 
-#if !CONFIG_SB8X8
-void vp9_intra8x8_predict(MACROBLOCKD *xd,
-                          int block4x4_idx,
-                          int mode,
-                          uint8_t *predictor, int pre_stride) {
-  const int block_idx = (block4x4_idx >> 2) | !!(block4x4_idx & 2);
-  const int have_top = (block_idx >> 1) || xd->up_available;
-  const int have_left = (block_idx & 1) || xd->left_available;
-  const int have_right = !(block_idx & 1) || xd->right_available;
-
-  vp9_build_intra_predictors(predictor, pre_stride,
-                             predictor, pre_stride,
-                             mode, 8, 8, have_top, have_left,
-                             have_right);
-}
-#endif
-#if !CONFIG_NEWBINTRAMODES
 void vp9_intra4x4_predict(MACROBLOCKD *xd,
                           int block_idx,
                           BLOCK_SIZE_TYPE bsize,
@@ -608,20 +411,3 @@
                              mode, 4, 4, have_top, have_left,
                              have_right);
 }
-#endif
-#if !CONFIG_SB8X8
-void vp9_intra_uv4x4_predict(MACROBLOCKD *xd,
-                             int block4x4_idx,
-                             int mode,
-                             uint8_t *predictor, int pre_stride) {
-  const int block_idx = block4x4_idx & 3;
-  const int have_top = (block_idx >> 1) || xd->up_available;
-  const int have_left = (block_idx & 1) || xd->left_available;
-  const int have_right = !(block_idx & 1);
-
-  vp9_build_intra_predictors(predictor, pre_stride,
-                             predictor, pre_stride,
-                             mode, 4, 4, have_top, have_left,
-                             have_right);
-}
-#endif
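
The combine_interintra() removed above blends the inter and intra predictors with an 8-bit fixed-point weight drawn from weights1d[]; the prediction mode only changes how that weight is indexed. A minimal sketch of the per-pixel blend, using the constants from the removed code:

#include <stdint.h>

/* Per-pixel weighted average with 8 fractional bits, as in the removed
 * combine_interintra(): w is the intra weight from weights1d[] (67..128 in
 * the removed table); scale_max = 256, scale_round = 127, scale_bits = 8. */
static uint8_t interintra_blend(uint8_t inter, uint8_t intra, int w) {
  return (uint8_t)(((256 - w) * inter + w * intra + 127) >> 8);
}

/* e.g. inter = 100, intra = 200, w = 128:
 * ((128 * 100 + 128 * 200 + 127) >> 8) = 150 */
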
diff --git a/vp9/common/vp9_reconintra.h b/vp9/common/vp9_reconintra.h
index 21cd7ab..1a715c3 100644
--- a/vp9/common/vp9_reconintra.h
+++ b/vp9/common/vp9_reconintra.h
@@ -21,25 +21,4 @@
 B_PREDICTION_MODE vp9_find_bpred_context(MACROBLOCKD *xd, int block,
                                          uint8_t *ptr, int stride);
 
-#if CONFIG_COMP_INTERINTRA_PRED
-void vp9_build_interintra_predictors(MACROBLOCKD *xd,
-                                     uint8_t *ypred,
-                                     uint8_t *upred,
-                                     uint8_t *vpred,
-                                     int ystride,
-                                     int uvstride,
-                                     BLOCK_SIZE_TYPE bsize);
-
-void vp9_build_interintra_predictors_sby(MACROBLOCKD *xd,
-                                         uint8_t *ypred,
-                                         int ystride,
-                                         BLOCK_SIZE_TYPE bsize);
-
-void vp9_build_interintra_predictors_sbuv(MACROBLOCKD *xd,
-                                          uint8_t *upred,
-                                          uint8_t *vpred,
-                                          int uvstride,
-                                          BLOCK_SIZE_TYPE bsize);
-#endif  // CONFIG_COMP_INTERINTRA_PRED
-
 #endif  // VP9_COMMON_VP9_RECONINTRA_H_
diff --git a/vp9/common/vp9_reconintra4x4.c b/vp9/common/vp9_reconintra4x4.c
deleted file mode 100644
index ce33aa5..0000000
--- a/vp9/common/vp9_reconintra4x4.c
+++ /dev/null
@@ -1,454 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "./vpx_config.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp9/common/vp9_reconintra.h"
-#include "vp9_rtcd.h"
-
-#if CONFIG_NEWBINTRAMODES
-static int find_grad_measure(uint8_t *x, int stride, int n, int tx, int ty,
-                             int dx, int dy) {
-  int i, j;
-  int count = 0, gsum = 0, gdiv;
-  /* TODO: Make this code more efficient by breaking up into two loops */
-  for (i = -ty; i < n; ++i)
-    for (j = -tx; j < n; ++j) {
-      int g;
-      if (i >= 0 && j >= 0) continue;
-      if (i + dy >= 0 && j + dx >= 0) continue;
-      if (i + dy < -ty || i + dy >= n || j + dx < -tx || j + dx >= n) continue;
-      g = abs(x[(i + dy) * stride + j + dx] - x[i * stride + j]);
-      gsum += g * g;
-      count++;
-    }
-  gdiv = (dx * dx + dy * dy) * count;
-  return ((gsum << 8) + (gdiv >> 1)) / gdiv;
-}
-
-#if CONTEXT_PRED_REPLACEMENTS == 6
-B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
-                                              int stride, int n,
-                                              int tx, int ty) {
-  int g[8], i, imin, imax;
-  g[1] = find_grad_measure(ptr, stride, n, tx, ty,  2, 1);
-  g[2] = find_grad_measure(ptr, stride, n, tx, ty,  1, 1);
-  g[3] = find_grad_measure(ptr, stride, n, tx, ty,  1, 2);
-  g[5] = find_grad_measure(ptr, stride, n, tx, ty, -1, 2);
-  g[6] = find_grad_measure(ptr, stride, n, tx, ty, -1, 1);
-  g[7] = find_grad_measure(ptr, stride, n, tx, ty, -2, 1);
-  imin = 1;
-  for (i = 2; i < 8; i += 1 + (i == 3))
-    imin = (g[i] < g[imin] ? i : imin);
-  imax = 1;
-  for (i = 2; i < 8; i += 1 + (i == 3))
-    imax = (g[i] > g[imax] ? i : imax);
-  /*
-  printf("%d %d %d %d %d %d = %d %d\n",
-         g[1], g[2], g[3], g[5], g[6], g[7], imin, imax);
-         */
-  switch (imin) {
-    case 1:
-      return B_D153_PRED;
-    case 2:
-      return B_D135_PRED;
-    case 3:
-      return B_D117_PRED;
-    case 5:
-      return B_D63_PRED;
-    case 6:
-      return B_D45_PRED;
-    case 7:
-      return B_D27_PRED;
-    default:
-      assert(0);
-  }
-}
-#elif CONTEXT_PRED_REPLACEMENTS == 4
-B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
-                                              int stride, int n,
-                                              int tx, int ty) {
-  int g[8], i, imin, imax;
-  g[1] = find_grad_measure(ptr, stride, n, tx, ty,  2, 1);
-  g[3] = find_grad_measure(ptr, stride, n, tx, ty,  1, 2);
-  g[5] = find_grad_measure(ptr, stride, n, tx, ty, -1, 2);
-  g[7] = find_grad_measure(ptr, stride, n, tx, ty, -2, 1);
-  imin = 1;
-  for (i = 3; i < 8; i+=2)
-    imin = (g[i] < g[imin] ? i : imin);
-  imax = 1;
-  for (i = 3; i < 8; i+=2)
-    imax = (g[i] > g[imax] ? i : imax);
-  /*
-  printf("%d %d %d %d = %d %d\n",
-         g[1], g[3], g[5], g[7], imin, imax);
-         */
-  switch (imin) {
-    case 1:
-      return B_D153_PRED;
-    case 3:
-      return B_D117_PRED;
-    case 5:
-      return B_D63_PRED;
-    case 7:
-      return B_D27_PRED;
-    default:
-      assert(0);
-  }
-}
-#elif CONTEXT_PRED_REPLACEMENTS == 0
-B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
-                                              int stride, int n,
-                                              int tx, int ty) {
-  int g[8], i, imin, imax;
-  g[0] = find_grad_measure(ptr, stride, n, tx, ty,  1, 0);
-  g[1] = find_grad_measure(ptr, stride, n, tx, ty,  2, 1);
-  g[2] = find_grad_measure(ptr, stride, n, tx, ty,  1, 1);
-  g[3] = find_grad_measure(ptr, stride, n, tx, ty,  1, 2);
-  g[4] = find_grad_measure(ptr, stride, n, tx, ty,  0, 1);
-  g[5] = find_grad_measure(ptr, stride, n, tx, ty, -1, 2);
-  g[6] = find_grad_measure(ptr, stride, n, tx, ty, -1, 1);
-  g[7] = find_grad_measure(ptr, stride, n, tx, ty, -2, 1);
-  imax = 0;
-  for (i = 1; i < 8; i++)
-    imax = (g[i] > g[imax] ? i : imax);
-  imin = 0;
-  for (i = 1; i < 8; i++)
-    imin = (g[i] < g[imin] ? i : imin);
-
-  switch (imin) {
-    case 0:
-      return B_H_PRED;
-    case 1:
-      return B_D153_PRED;
-    case 2:
-      return B_D135_PRED;
-    case 3:
-      return B_D117_PRED;
-    case 4:
-      return B_V_PRED;
-    case 5:
-      return B_D63_PRED;
-    case 6:
-      return B_D45_PRED;
-    case 7:
-      return B_D27_PRED;
-    default:
-      assert(0);
-  }
-}
-#endif
-
-B_PREDICTION_MODE vp9_find_bpred_context(MACROBLOCKD *xd, int block_idx,
-                                         uint8_t *ptr, int stride) {
-  const int have_top = (block_idx >> 2) || xd->up_available;
-  const int have_left = (block_idx & 3)  || xd->left_available;
-  int tx = have_left ? 4 : 0;
-  int ty = have_top ? 4 : 0;
-  if (!have_left && !have_top)
-    return B_DC_PRED;
-  return vp9_find_dominant_direction(ptr, stride, 4, tx, ty);
-}
-
-void vp9_intra4x4_predict(MACROBLOCKD *xd,
-                          int block_idx,
-                          BLOCK_SIZE_TYPE bsize,
-                          int b_mode,
-                          uint8_t *predictor,
-                          int ps) {
-  const int bwl = b_width_log2(bsize);
-  const int wmask = (1 << bwl) - 1;
-  int i, r, c;
-  const int have_top = (block_idx >> bwl) || xd->up_available;
-  const int have_left = (block_idx & wmask)  || xd->left_available;
-  const int have_right = (block_idx & wmask) != wmask || xd->right_available;
-  uint8_t left[4], above[8], top_left;
-  /*
-   * 127 127 127 .. 127 127 127 127 127 127
-   * 129  A   B  ..  Y   Z
-   * 129  C   D  ..  W   X
-   * 129  E   F  ..  U   V
-   * 129  G   H  ..  S   T   T   T   T   T
-   *  ..
-   */
-
-  if (have_left) {
-    uint8_t *left_ptr = predictor - 1;
-    const int stride = ps;
-
-    left[0] = left_ptr[0 * stride];
-    left[1] = left_ptr[1 * stride];
-    left[2] = left_ptr[2 * stride];
-    left[3] = left_ptr[3 * stride];
-  } else {
-    left[0] = left[1] = left[2] = left[3] = 129;
-  }
-
-  if (have_top) {
-    uint8_t *above_ptr = predictor - ps;
-    top_left = have_left ? above_ptr[-1] : 127;
-
-    above[0] = above_ptr[0];
-    above[1] = above_ptr[1];
-    above[2] = above_ptr[2];
-    above[3] = above_ptr[3];
-    if (((block_idx & wmask) != wmask) ||
-        (have_right && block_idx == wmask &&
-         ((xd->mb_index != 3 && xd->sb_index != 3) ||
-          ((xd->mb_index & 1) == 0 && xd->sb_index == 3)))) {
-      above[4] = above_ptr[4];
-      above[5] = above_ptr[5];
-      above[6] = above_ptr[6];
-      above[7] = above_ptr[7];
-    } else if (have_right) {
-      uint8_t *above_right = above_ptr + 4;
-
-      if (xd->sb_index == 3 && (xd->mb_index & 1))
-        above_right -= 32 * ps;
-      if (xd->mb_index == 3)
-        above_right -= 16 * ps;
-      above_right -= 4 * (block_idx >> bwl) * ps;
-
-      /* use a more distant above-right (from closest available top-right
-       * corner), but with a "localized DC" (similar'ish to TM-pred):
-       *
-       *  A   B   C   D   E   F   G   H
-       *  I   J   K   L
-       *  M   N   O   P
-       *  Q   R   S   T
-       *  U   V   W   X   x1  x2  x3  x4
-       *
-       * Where:
-       * x1 = clip_pixel(E + X - D)
-       * x2 = clip_pixel(F + X - D)
-       * x3 = clip_pixel(G + X - D)
-       * x4 = clip_pixel(H + X - D)
-       *
-       * This is applied anytime when we use a "distant" above-right edge
-       * that is not immediately top-right to the block that we're going
-       * to do intra prediction for.
-       */
-      above[4] = clip_pixel(above_right[0] + above_ptr[3] - above_right[-1]);
-      above[5] = clip_pixel(above_right[1] + above_ptr[3] - above_right[-1]);
-      above[6] = clip_pixel(above_right[2] + above_ptr[3] - above_right[-1]);
-      above[7] = clip_pixel(above_right[3] + above_ptr[3] - above_right[-1]);
-    } else {
-      // extend edge
-      above[4] = above[5] = above[6] = above[7] = above[3];
-    }
-  } else {
-    above[0] = above[1] = above[2] = above[3] = 127;
-    above[4] = above[5] = above[6] = above[7] = 127;
-    top_left = 127;
-  }
-
-#if CONFIG_NEWBINTRAMODES
-  if (b_mode == B_CONTEXT_PRED)
-    b_mode = xd->mode_info_context->bmi[block_idx].as_mode.context;
-#endif
-
-  switch (b_mode) {
-    case B_DC_PRED: {
-      int expected_dc = 128;
-      if (have_top || have_left) {
-        int average = 0;
-        int count = 0;
-        if (have_top) {
-          for (i = 0; i < 4; i++)
-            average += above[i];
-          count += 4;
-        }
-        if (have_left) {
-          for (i = 0; i < 4; i++)
-            average += left[i];
-          count += 4;
-        }
-        expected_dc = (average + (count >> 1)) / count;
-      }
-      for (r = 0; r < 4; r++) {
-        for (c = 0; c < 4; c++)
-          predictor[c] = expected_dc;
-        predictor += ps;
-      }
-    }
-    break;
-    case B_TM_PRED: {
-      /* prediction similar to true_motion prediction */
-      for (r = 0; r < 4; r++) {
-        for (c = 0; c < 4; c++)
-          predictor[c] = clip_pixel(above[c] - top_left + left[r]);
-        predictor += ps;
-      }
-    }
-    break;
-    case B_V_PRED:
-      for (r = 0; r < 4; r++) {
-        for (c = 0; c < 4; c++)
-          predictor[c] = above[c];
-        predictor += ps;
-      }
-      break;
-    case B_H_PRED:
-      for (r = 0; r < 4; r++) {
-        for (c = 0; c < 4; c++)
-          predictor[c] = left[r];
-        predictor += ps;
-      }
-      break;
-    case B_D45_PRED: {
-      uint8_t *p = above;
-
-      predictor[0 * ps + 0] = ROUND_POWER_OF_TWO(p[0] + p[1] * 2 + p[2], 2);
-      predictor[0 * ps + 1] =
-        predictor[1 * ps + 0] = ROUND_POWER_OF_TWO(p[1] + p[2] * 2 + p[3], 2);
-      predictor[0 * ps + 2] =
-        predictor[1 * ps + 1] =
-          predictor[2 * ps + 0] = ROUND_POWER_OF_TWO(p[2] + p[3] * 2 + p[4], 2);
-      predictor[0 * ps + 3] =
-        predictor[1 * ps + 2] =
-          predictor[2 * ps + 1] =
-            predictor[3 * ps + 0] =
-              ROUND_POWER_OF_TWO(p[3] + p[4] * 2 + p[5], 2);
-      predictor[1 * ps + 3] =
-        predictor[2 * ps + 2] =
-          predictor[3 * ps + 1] = ROUND_POWER_OF_TWO(p[4] + p[5] * 2 + p[6], 2);
-      predictor[2 * ps + 3] =
-        predictor[3 * ps + 2] = ROUND_POWER_OF_TWO(p[5] + p[6] * 2 + p[7], 2);
-      predictor[3 * ps + 3] = ROUND_POWER_OF_TWO(p[6] + p[7] * 2 + p[7], 2);
-
-    }
-    break;
-    case B_D135_PRED: {
-      uint8_t p[9] = { left[3], left[2], left[1], left[0],
-                       top_left,
-                       above[0], above[1], above[2], above[3] };
-
-      predictor[3 * ps + 0] = ROUND_POWER_OF_TWO(p[0] + p[1] * 2 + p[2], 2);
-      predictor[3 * ps + 1] =
-        predictor[2 * ps + 0] = ROUND_POWER_OF_TWO(p[1] + p[2] * 2 + p[3], 2);
-      predictor[3 * ps + 2] =
-        predictor[2 * ps + 1] =
-          predictor[1 * ps + 0] = ROUND_POWER_OF_TWO(p[2] + p[3] * 2 + p[4], 2);
-      predictor[3 * ps + 3] =
-        predictor[2 * ps + 2] =
-          predictor[1 * ps + 1] =
-            predictor[0 * ps + 0] =
-              ROUND_POWER_OF_TWO(p[3] + p[4] * 2 + p[5], 2);
-      predictor[2 * ps + 3] =
-        predictor[1 * ps + 2] =
-          predictor[0 * ps + 1] = ROUND_POWER_OF_TWO(p[4] + p[5] * 2 + p[6], 2);
-      predictor[1 * ps + 3] =
-        predictor[0 * ps + 2] = ROUND_POWER_OF_TWO(p[5] + p[6] * 2 + p[7], 2);
-      predictor[0 * ps + 3] = ROUND_POWER_OF_TWO(p[6] + p[7] * 2 + p[8], 2);
-
-    }
-    break;
-    case B_D117_PRED: {
-      uint8_t p[9] = { left[3], left[2], left[1], left[0],
-                       top_left,
-                       above[0], above[1], above[2], above[3] };
-
-      predictor[3 * ps + 0] = ROUND_POWER_OF_TWO(p[1] + p[2] * 2 + p[3], 2);
-      predictor[2 * ps + 0] = ROUND_POWER_OF_TWO(p[2] + p[3] * 2 + p[4], 2);
-      predictor[3 * ps + 1] =
-        predictor[1 * ps + 0] = ROUND_POWER_OF_TWO(p[3] + p[4] * 2 + p[5], 2);
-      predictor[2 * ps + 1] =
-        predictor[0 * ps + 0] = ROUND_POWER_OF_TWO(p[4] + p[5], 1);
-      predictor[3 * ps + 2] =
-        predictor[1 * ps + 1] = ROUND_POWER_OF_TWO(p[4] + p[5] * 2 + p[6], 2);
-      predictor[2 * ps + 2] =
-        predictor[0 * ps + 1] = ROUND_POWER_OF_TWO(p[5] + p[6], 1);
-      predictor[3 * ps + 3] =
-        predictor[1 * ps + 2] = ROUND_POWER_OF_TWO(p[5] + p[6] * 2 + p[7], 2);
-      predictor[2 * ps + 3] =
-        predictor[0 * ps + 2] = ROUND_POWER_OF_TWO(p[6] + p[7], 1);
-      predictor[1 * ps + 3] = ROUND_POWER_OF_TWO(p[6] + p[7] * 2 + p[8], 2);
-      predictor[0 * ps + 3] = ROUND_POWER_OF_TWO(p[7] + p[8], 1);
-
-    }
-    break;
-    case B_D63_PRED: {
-      uint8_t *p = above;
-
-      predictor[0 * ps + 0] = ROUND_POWER_OF_TWO(p[0] + p[1], 1);
-      predictor[1 * ps + 0] = ROUND_POWER_OF_TWO(p[0] + p[1] * 2 + p[2], 2);
-      predictor[2 * ps + 0] =
-        predictor[0 * ps + 1] = ROUND_POWER_OF_TWO(p[1] + p[2], 1);
-      predictor[1 * ps + 1] =
-        predictor[3 * ps + 0] = ROUND_POWER_OF_TWO(p[1] + p[2] * 2 + p[3], 2);
-      predictor[2 * ps + 1] =
-        predictor[0 * ps + 2] = ROUND_POWER_OF_TWO(p[2] + p[3], 1);
-      predictor[3 * ps + 1] =
-        predictor[1 * ps + 2] = ROUND_POWER_OF_TWO(p[2] + p[3] * 2 + p[4], 2);
-      predictor[0 * ps + 3] =
-        predictor[2 * ps + 2] = ROUND_POWER_OF_TWO(p[3] + p[4], 1);
-      predictor[1 * ps + 3] =
-        predictor[3 * ps + 2] = ROUND_POWER_OF_TWO(p[3] + p[4] * 2 + p[5], 2);
-      predictor[2 * ps + 3] = ROUND_POWER_OF_TWO(p[4] + p[5] * 2 + p[6], 2);
-      predictor[3 * ps + 3] = ROUND_POWER_OF_TWO(p[5] + p[6] * 2 + p[7], 2);
-    }
-    break;
-    case B_D153_PRED: {
-      uint8_t p[9] = { left[3], left[2], left[1], left[0],
-                       top_left,
-                       above[0], above[1], above[2], above[3] };
-
-      predictor[3 * ps + 0] = ROUND_POWER_OF_TWO(p[0] + p[1], 1);
-      predictor[3 * ps + 1] = ROUND_POWER_OF_TWO(p[0] + p[1] * 2 + p[2], 2);
-      predictor[2 * ps + 0] =
-        predictor[3 * ps + 2] = ROUND_POWER_OF_TWO(p[1] + p[2], 1);
-      predictor[2 * ps + 1] =
-        predictor[3 * ps + 3] = ROUND_POWER_OF_TWO(p[1] + p[2] * 2 + p[3], 2);
-      predictor[2 * ps + 2] =
-        predictor[1 * ps + 0] = ROUND_POWER_OF_TWO(p[2] + p[3], 1);
-      predictor[2 * ps + 3] =
-        predictor[1 * ps + 1] = ROUND_POWER_OF_TWO(p[2] + p[3] * 2 + p[4], 2);
-      predictor[1 * ps + 2] =
-        predictor[0 * ps + 0] = ROUND_POWER_OF_TWO(p[3] + p[4], 1);
-      predictor[1 * ps + 3] =
-        predictor[0 * ps + 1] = ROUND_POWER_OF_TWO(p[3] + p[4] * 2 + p[5], 2);
-      predictor[0 * ps + 2] = ROUND_POWER_OF_TWO(p[4] + p[5] * 2 + p[6], 2);
-      predictor[0 * ps + 3] = ROUND_POWER_OF_TWO(p[5] + p[6] * 2 + p[7], 2);
-    }
-    break;
-    case B_D27_PRED: {
-      uint8_t *p = left;
-      predictor[0 * ps + 0] = ROUND_POWER_OF_TWO(p[0] + p[1], 1);
-      predictor[0 * ps + 1] = ROUND_POWER_OF_TWO(p[0] + p[1] * 2 + p[2], 2);
-      predictor[0 * ps + 2] =
-        predictor[1 * ps + 0] = ROUND_POWER_OF_TWO(p[1] + p[2], 1);
-      predictor[0 * ps + 3] =
-        predictor[1 * ps + 1] = ROUND_POWER_OF_TWO(p[1] + p[2] * 2 + p[3], 2);
-      predictor[1 * ps + 2] =
-        predictor[2 * ps + 0] = ROUND_POWER_OF_TWO(p[2] + p[3], 1);
-      predictor[1 * ps + 3] =
-        predictor[2 * ps + 1] = ROUND_POWER_OF_TWO(p[2] + p[3] * 2 + p[3], 2);
-      predictor[2 * ps + 2] =
-        predictor[2 * ps + 3] =
-          predictor[3 * ps + 0] =
-            predictor[3 * ps + 1] =
-              predictor[3 * ps + 2] =
-                predictor[3 * ps + 3] = p[3];
-    }
-    break;
-
-#if CONFIG_NEWBINTRAMODES
-    case B_CONTEXT_PRED:
-    break;
-    /*
-    case B_CORNER_PRED:
-    corner_predictor(predictor, 16, 4, above, left);
-    break;
-    */
-#endif
-  }
-}
-#endif
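
Most of the deleted vp9_intra4x4_predict() above is per-mode edge filtering; the two simplest modes illustrate the structure. A minimal standalone sketch of the removed B_DC_PRED and B_TM_PRED fills (4x4 only; dc4x4/tm4x4 are illustrative names):

#include <stdint.h>

static uint8_t clip_px(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }

/* B_DC_PRED: average the available above/left samples (default 128 when
 * neither edge is available), rounded, then fill the block. */
static void dc4x4(uint8_t *dst, int stride,
                  const uint8_t above[4], const uint8_t left[4],
                  int have_above, int have_left) {
  int r, c, sum = 0, count = 0, dc = 128;
  if (have_above) { for (c = 0; c < 4; c++) sum += above[c]; count += 4; }
  if (have_left)  { for (r = 0; r < 4; r++) sum += left[r];  count += 4; }
  if (count) dc = (sum + (count >> 1)) / count;
  for (r = 0; r < 4; r++, dst += stride)
    for (c = 0; c < 4; c++) dst[c] = (uint8_t)dc;
}

/* B_TM_PRED: extrapolate each pixel as above[c] + left[r] - top_left,
 * clipped, as in the removed code. */
static void tm4x4(uint8_t *dst, int stride, const uint8_t above[4],
                  const uint8_t left[4], uint8_t top_left) {
  int r, c;
  for (r = 0; r < 4; r++, dst += stride)
    for (c = 0; c < 4; c++)
      dst[c] = clip_px(above[c] + left[r] - top_left);
}
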
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index e473d81..02d3253 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -63,23 +63,6 @@
 prototype void vp9_recon_b "uint8_t *pred_ptr, int16_t *diff_ptr, int diff_stride, uint8_t *dst_ptr, int stride"
 specialize vp9_recon_b
 
-if [ "$CONFIG_SB8X8" != "yes" ]; then
-
-prototype void vp9_recon_uv_b "uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr, int stride"
-specialize vp9_recon_uv_b
-
-# TODO(jingning): The prototype functions in c are modified to enable block-size configurable
-# operations. Need to change the sse2 accordingly.
-prototype void vp9_recon2b "uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr, int stride"
-specialize vp9_recon2b
-# specialize vp9_recon2b sse2
-
-prototype void vp9_recon4b "uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr, int stride"
-specialize vp9_recon4b
-# specialize vp9_recon4b sse2
-
-fi
-
 prototype void vp9_recon_sb "struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize"
 specialize vp9_recon_sb
 
@@ -101,16 +84,6 @@
 prototype void vp9_intra4x4_predict "struct macroblockd *xd, int block, enum BLOCK_SIZE_TYPE bsize, int b_mode, uint8_t *predictor, int pre_stride"
 specialize vp9_intra4x4_predict;
 
-if [ "$CONFIG_SB8X8" != "yes" ]; then
-
-prototype void vp9_intra8x8_predict "struct macroblockd *xd, int block, int b_mode, uint8_t *predictor, int pre_stride"
-specialize vp9_intra8x8_predict;
-
-prototype void vp9_intra_uv4x4_predict "struct macroblockd *xd, int block, int b_mode, uint8_t *predictor, int pre_stride"
-specialize vp9_intra_uv4x4_predict;
-
-fi
-
 if [ "$CONFIG_VP9_DECODER" = "yes" ]; then
 prototype void vp9_add_residual_4x4 "const int16_t *diff, uint8_t *dest, int stride"
 specialize vp9_add_residual_4x4 sse2
@@ -155,30 +128,6 @@
 prototype void vp9_loop_filter_bh8x8 "uint8_t *y, uint8_t *u, uint8_t *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
 specialize vp9_loop_filter_bh8x8 sse2
 
-prototype void vp9_loop_filter_simple_mbv "uint8_t *y, int ystride, const uint8_t *blimit"
-specialize vp9_loop_filter_simple_mbv mmx sse2
-vp9_loop_filter_simple_mbv_c=vp9_loop_filter_simple_vertical_edge_c
-vp9_loop_filter_simple_mbv_mmx=vp9_loop_filter_simple_vertical_edge_mmx
-vp9_loop_filter_simple_mbv_sse2=vp9_loop_filter_simple_vertical_edge_sse2
-
-prototype void vp9_loop_filter_simple_mbh "uint8_t *y, int ystride, const uint8_t *blimit"
-specialize vp9_loop_filter_simple_mbh mmx sse2
-vp9_loop_filter_simple_mbh_c=vp9_loop_filter_simple_horizontal_edge_c
-vp9_loop_filter_simple_mbh_mmx=vp9_loop_filter_simple_horizontal_edge_mmx
-vp9_loop_filter_simple_mbh_sse2=vp9_loop_filter_simple_horizontal_edge_sse2
-
-prototype void vp9_loop_filter_simple_bv "uint8_t *y, int ystride, const uint8_t *blimit"
-specialize vp9_loop_filter_simple_bv mmx sse2
-vp9_loop_filter_simple_bv_c=vp9_loop_filter_bvs_c
-vp9_loop_filter_simple_bv_mmx=vp9_loop_filter_bvs_mmx
-vp9_loop_filter_simple_bv_sse2=vp9_loop_filter_bvs_sse2
-
-prototype void vp9_loop_filter_simple_bh "uint8_t *y, int ystride, const uint8_t *blimit"
-specialize vp9_loop_filter_simple_bh mmx sse2
-vp9_loop_filter_simple_bh_c=vp9_loop_filter_bhs_c
-vp9_loop_filter_simple_bh_mmx=vp9_loop_filter_bhs_mmx
-vp9_loop_filter_simple_bh_sse2=vp9_loop_filter_bhs_sse2
-
 prototype void vp9_lpf_mbh_w "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi"
 specialize vp9_lpf_mbh_w sse2
 
@@ -364,41 +313,74 @@
 prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance64x64 sse2
 
+prototype unsigned int vp9_sub_pixel_avg_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance64x64
+
 prototype unsigned int vp9_sub_pixel_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance32x64
 
+prototype unsigned int vp9_sub_pixel_avg_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance32x64
+
 prototype unsigned int vp9_sub_pixel_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance64x32
 
+prototype unsigned int vp9_sub_pixel_avg_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance64x32
+
 prototype unsigned int vp9_sub_pixel_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance32x16
 
+prototype unsigned int vp9_sub_pixel_avg_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance32x16
+
 prototype unsigned int vp9_sub_pixel_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance16x32
 
+prototype unsigned int vp9_sub_pixel_avg_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance16x32
+
 prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance32x32 sse2
 
+prototype unsigned int vp9_sub_pixel_avg_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance32x32
+
 prototype unsigned int vp9_sub_pixel_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance16x16 sse2 mmx ssse3
 
+prototype unsigned int vp9_sub_pixel_avg_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance16x16
+
 prototype unsigned int vp9_sub_pixel_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance8x16 sse2 mmx
 vp9_sub_pixel_variance8x16_sse2=vp9_sub_pixel_variance8x16_wmt
 
+prototype unsigned int vp9_sub_pixel_avg_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance8x16
+
 prototype unsigned int vp9_sub_pixel_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance16x8 sse2 mmx ssse3
 vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_ssse3;
 vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_wmt
 
+prototype unsigned int vp9_sub_pixel_avg_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance16x8
+
 prototype unsigned int vp9_sub_pixel_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance8x8 sse2 mmx
 vp9_sub_pixel_variance8x8_sse2=vp9_sub_pixel_variance8x8_wmt
 
+prototype unsigned int vp9_sub_pixel_avg_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance8x8
+
 prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance4x4 sse2 mmx
 vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
 
+prototype unsigned int vp9_sub_pixel_avg_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance4x4
+
 prototype unsigned int vp9_sad64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"
 specialize vp9_sad64x64 sse2
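
The new vp9_sub_pixel_avg_variance* prototypes differ from the plain variance ones only by the trailing second_pred argument, used for compound prediction. A hedged sketch of what such a function computes (avg_variance is an illustrative name, not a libvpx symbol), assuming the sub-pel filtered reference is averaged with second_pred before the variance against the source is taken:

#include <stdint.h>

/* pred: the already sub-pel filtered w x h reference block.
 * second_pred: second predictor, assumed packed with stride w. */
static unsigned int avg_variance(const uint8_t *src, int src_stride,
                                 const uint8_t *pred, int pred_stride,
                                 const uint8_t *second_pred,
                                 int w, int h, unsigned int *sse) {
  int r, c;
  int64_t sum = 0;
  uint64_t sse64 = 0;
  for (r = 0; r < h; r++) {
    for (c = 0; c < w; c++) {
      /* rounded average of the two predictors */
      const int avg = (pred[r * pred_stride + c] +
                       second_pred[r * w + c] + 1) >> 1;
      const int diff = src[r * src_stride + c] - avg;
      sum += diff;
      sse64 += (uint64_t)(diff * diff);
    }
  }
  *sse = (unsigned int)sse64;
  /* variance = E[d^2] - E[d]^2, scaled by the pixel count */
  return (unsigned int)(sse64 - (uint64_t)((sum * sum) / (w * h)));
}
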
 
diff --git a/vp9/common/vp9_setupintrarecon.c b/vp9/common/vp9_setupintrarecon.c
deleted file mode 100644
index 6784103..0000000
--- a/vp9/common/vp9_setupintrarecon.c
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp9/common/vp9_setupintrarecon.h"
-#include "vpx_mem/vpx_mem.h"
-
-void vp9_setup_intra_recon(YV12_BUFFER_CONFIG *ybf) {
-  int i;
-
-  // luma
-  vpx_memset(ybf->y_buffer - 1 - ybf->y_stride, 127, ybf->y_width + 5);
-  for (i = 0; i < ybf->y_height; i++)
-    ybf->y_buffer[ybf->y_stride * i - 1] = 129;
-
-  // chroma
-  vpx_memset(ybf->u_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
-  vpx_memset(ybf->v_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
-  for (i = 0; i < ybf->uv_height; i++) {
-    ybf->u_buffer[ybf->uv_stride * i - 1] = 129;
-    ybf->v_buffer[ybf->uv_stride * i - 1] = 129;
-  }
-}
diff --git a/vp9/common/vp9_setupintrarecon.h b/vp9/common/vp9_setupintrarecon.h
deleted file mode 100644
index e389f3c..0000000
--- a/vp9/common/vp9_setupintrarecon.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_COMMON_VP9_SETUPINTRARECON_H_
-#define VP9_COMMON_VP9_SETUPINTRARECON_H_
-
-#include "vpx_scale/yv12config.h"
-
-void vp9_setup_intra_recon(YV12_BUFFER_CONFIG *ybf);
-
-#endif  // VP9_COMMON_VP9_SETUPINTRARECON_H_
diff --git a/vp9/common/vp9_swapyv12buffer.c b/vp9/common/vp9_swapyv12buffer.c
deleted file mode 100644
index 10c6b41..0000000
--- a/vp9/common/vp9_swapyv12buffer.c
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vp9/common/vp9_swapyv12buffer.h"
-
-void vp9_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame,
-                          YV12_BUFFER_CONFIG *last_frame) {
-  uint8_t *temp;
-
-  temp = last_frame->buffer_alloc;
-  last_frame->buffer_alloc = new_frame->buffer_alloc;
-  new_frame->buffer_alloc = temp;
-
-  temp = last_frame->y_buffer;
-  last_frame->y_buffer = new_frame->y_buffer;
-  new_frame->y_buffer = temp;
-
-  temp = last_frame->u_buffer;
-  last_frame->u_buffer = new_frame->u_buffer;
-  new_frame->u_buffer = temp;
-
-  temp = last_frame->v_buffer;
-  last_frame->v_buffer = new_frame->v_buffer;
-  new_frame->v_buffer = temp;
-}
diff --git a/vp9/common/vp9_swapyv12buffer.h b/vp9/common/vp9_swapyv12buffer.h
deleted file mode 100644
index 2e11206..0000000
--- a/vp9/common/vp9_swapyv12buffer.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_COMMON_VP9_SWAPYV12BUFFER_H_
-#define VP9_COMMON_VP9_SWAPYV12BUFFER_H_
-
-#include "vpx_scale/yv12config.h"
-
-void vp9_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame,
-                          YV12_BUFFER_CONFIG *last_frame);
-
-#endif  // VP9_COMMON_VP9_SWAPYV12BUFFER_H_
diff --git a/vp9/common/vp9_tile_common.c b/vp9/common/vp9_tile_common.c
index a9d8cf0..ea26289 100644
--- a/vp9/common/vp9_tile_common.c
+++ b/vp9/common/vp9_tile_common.c
@@ -18,16 +18,12 @@
 static void vp9_get_tile_offsets(VP9_COMMON *cm, int *min_tile_off,
                                  int *max_tile_off, int tile_idx,
                                  int log2_n_tiles, int n_mis) {
-#if CONFIG_SB8X8
   const int n_sbs = (n_mis + 7) >> 3;
-#else
-  const int n_sbs = (n_mis + 3) >> 2;
-#endif
   const int sb_off1 =  (tile_idx      * n_sbs) >> log2_n_tiles;
   const int sb_off2 = ((tile_idx + 1) * n_sbs) >> log2_n_tiles;
 
-  *min_tile_off = MIN(sb_off1 << (2 + CONFIG_SB8X8), n_mis);
-  *max_tile_off = MIN(sb_off2 << (2 + CONFIG_SB8X8), n_mis);
+  *min_tile_off = MIN(sb_off1 << 3, n_mis);
+  *max_tile_off = MIN(sb_off2 << 3, n_mis);
 }
 
 void vp9_get_tile_col_offsets(VP9_COMMON *cm, int tile_col_idx) {
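
With CONFIG_SB8X8 now implied, the tile offsets above are always computed in 8-mi superblock units. A worked example with illustrative numbers: for n_mis = 100 and 4 tile columns (log2_n_tiles = 2), n_sbs = (100 + 7) >> 3 = 13. For tile_idx = 1:

  sb_off1 = (1 * 13) >> 2 = 3   ->  min_tile_off = MIN(3 << 3, 100) = 24
  sb_off2 = (2 * 13) >> 2 = 6   ->  max_tile_off = MIN(6 << 3, 100) = 48

so that tile covers mi columns [24, 48).
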
diff --git a/vp9/common/vp9_treecoder.c b/vp9/common/vp9_treecoder.c
index 3f049b5..531fa75 100644
--- a/vp9/common/vp9_treecoder.c
+++ b/vp9/common/vp9_treecoder.c
@@ -14,7 +14,6 @@
 #if defined(CONFIG_DEBUG) && CONFIG_DEBUG
 #include <assert.h>
 #endif
-#include <stdio.h>
 
 #include "vp9/common/vp9_treecoder.h"
 
@@ -57,12 +56,12 @@
     left = convert_distribution(tree[i], tree, probs, branch_ct,
                                 num_events, tok0_offset);
   }
-  if (tree[i + 1] <= 0) {
+  if (tree[i + 1] <= 0)
     right = num_events[-tree[i + 1] - tok0_offset];
-  } else {
+  else
     right = convert_distribution(tree[i + 1], tree, probs, branch_ct,
-                                num_events, tok0_offset);
-  }
+                                 num_events, tok0_offset);
+
   probs[i>>1] = get_binary_prob(left, right);
   branch_ct[i>>1][0] = left;
   branch_ct[i>>1][1] = right;
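
For context on the cleanup above: convert_distribution() reduces each subtree to a left/right pair of counts and hands them to get_binary_prob(). A hedged sketch of that count-to-probability step, assuming it scales the left count to an 8-bit probability with rounding and clips it away from 0 and 256 (the actual helper lives in vp9_treecoder.h, not in this diff):

#include <stdint.h>

typedef uint8_t vp9_prob;

/* Approximate mapping from branch counts to the probability of taking the
 * left branch; illustrative only. */
static vp9_prob binary_prob(unsigned int left, unsigned int right) {
  const unsigned int den = left + right;
  unsigned int p;
  if (den == 0) return 128;              /* no data: assume 50/50 */
  p = (left * 256 + (den >> 1)) / den;   /* rounded scaling to [0, 256] */
  return (vp9_prob)(p < 1 ? 1 : p > 255 ? 255 : p);
}
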
diff --git a/vp9/common/x86/vp9_loopfilter_intrin_mmx.c b/vp9/common/x86/vp9_loopfilter_intrin_mmx.c
index 2be9e31..7e6c4be 100644
--- a/vp9/common/x86/vp9_loopfilter_intrin_mmx.c
+++ b/vp9/common/x86/vp9_loopfilter_intrin_mmx.c
@@ -35,16 +35,6 @@
 
 }
 
-void vp9_loop_filter_bhs_mmx(unsigned char *y_ptr, int y_stride,
-                             const unsigned char *blimit) {
-  vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * y_stride,
-                                             y_stride, blimit);
-  vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride,
-                                             y_stride, blimit);
-  vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride,
-                                             y_stride, blimit);
-}
-
 /* Vertical B Filtering */
 void vp9_loop_filter_bv_mmx(unsigned char *y_ptr,
                             unsigned char *u_ptr, unsigned char *v_ptr,
@@ -66,9 +56,3 @@
                                       lfi->blim, lfi->lim, lfi->hev_thr, 1);
 }
 
-void vp9_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride,
-                             const unsigned char *blimit) {
-  vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, blimit);
-  vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, blimit);
-  vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, blimit);
-}
diff --git a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
index 08447a6..7982ca6 100644
--- a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
+++ b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
@@ -1115,16 +1115,6 @@
                                             v_ptr + 4 * uv_stride);
 }
 
-void vp9_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride,
-                              const unsigned char *blimit) {
-  vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 4 * y_stride,
-                                              y_stride, blimit);
-  vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 8 * y_stride,
-                                              y_stride, blimit);
-  vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 12 * y_stride,
-                                              y_stride, blimit);
-}
-
 /* Vertical B Filtering */
 void vp9_loop_filter_bv_sse2(unsigned char *y_ptr,
                              unsigned char *u_ptr, unsigned char *v_ptr,
@@ -1143,9 +1133,3 @@
                                           v_ptr + 4);
 }
 
-void vp9_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride,
-                              const unsigned char *blimit) {
-  vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, blimit);
-  vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, blimit);
-  vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, blimit);
-}
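
The "simple" loop filter being dropped here (and in the MMX assembly below) is easier to follow in scalar form. A minimal sketch reconstructed from the comments in that assembly, assuming the usual four-pixel edge layout p1 p0 | q0 q1; only p0 and q0 are modified, and only when the edge activity is within blimit:

#include <stdint.h>
#include <stdlib.h>

static int clamp_s8(int v) { return v < -128 ? -128 : v > 127 ? 127 : v; }

/* Scalar sketch of the simple filter: pixels are biased to signed values
 * (the t80 xor in the assembly), a saturated filter value
 * 3*(q0-p0) + (p1-q1) is formed, and q0/p0 are nudged toward each other by
 * (filter+4)>>3 and (filter+3)>>3 respectively. */
static void simple_filter(uint8_t *p1, uint8_t *p0, uint8_t *q0, uint8_t *q1,
                          int blimit) {
  if (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 <= blimit) {
    const int sp1 = *p1 - 128, sp0 = *p0 - 128;
    const int sq0 = *q0 - 128, sq1 = *q1 - 128;
    const int f = clamp_s8(clamp_s8(sp1 - sq1) + 3 * (sq0 - sp0));
    *q0 = (uint8_t)(clamp_s8(sq0 - (clamp_s8(f + 4) >> 3)) + 128);
    *p0 = (uint8_t)(clamp_s8(sp0 + (clamp_s8(f + 3) >> 3)) + 128);
  }
}
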
diff --git a/vp9/common/x86/vp9_loopfilter_mmx.asm b/vp9/common/x86/vp9_loopfilter_mmx.asm
index ceffdf5..4ebb51b 100644
--- a/vp9/common/x86/vp9_loopfilter_mmx.asm
+++ b/vp9/common/x86/vp9_loopfilter_mmx.asm
@@ -593,349 +593,6 @@
     pop         rbp
     ret
 
-
-;void vp9_loop_filter_simple_horizontal_edge_mmx
-;(
-;    unsigned char *src_ptr,
-;    int  src_pixel_step,
-;    const char *blimit
-;)
-global sym(vp9_loop_filter_simple_horizontal_edge_mmx) PRIVATE
-sym(vp9_loop_filter_simple_horizontal_edge_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 3
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rsi, arg(0) ;src_ptr
-        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
-
-        mov         rcx, 2                ; count
-.nexts8_h:
-        mov         rdx, arg(2) ;blimit           ; get blimit
-        movq        mm3, [rdx]            ;
-
-        mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
-        add         rdi, rax
-        neg         rax
-
-        ; calculate mask
-        movq        mm1, [rsi+2*rax]      ; p1
-        movq        mm0, [rdi]            ; q1
-        movq        mm2, mm1
-        movq        mm7, mm0
-        movq        mm4, mm0
-        psubusb     mm0, mm1              ; q1-=p1
-        psubusb     mm1, mm4              ; p1-=q1
-        por         mm1, mm0              ; abs(p1-q1)
-        pand        mm1, [GLOBAL(tfe)]    ; set lsb of each byte to zero
-        psrlw       mm1, 1                ; abs(p1-q1)/2
-
-        movq        mm5, [rsi+rax]        ; p0
-        movq        mm4, [rsi]            ; q0
-        movq        mm0, mm4              ; q0
-        movq        mm6, mm5              ; p0
-        psubusb     mm5, mm4              ; p0-=q0
-        psubusb     mm4, mm6              ; q0-=p0
-        por         mm5, mm4              ; abs(p0 - q0)
-        paddusb     mm5, mm5              ; abs(p0-q0)*2
-        paddusb     mm5, mm1              ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
-        psubusb     mm5, mm3              ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
-        pxor        mm3, mm3
-        pcmpeqb     mm5, mm3
-
-        ; start work on filters
-        pxor        mm2, [GLOBAL(t80)]    ; p1 offset to convert to signed values
-        pxor        mm7, [GLOBAL(t80)]    ; q1 offset to convert to signed values
-        psubsb      mm2, mm7              ; p1 - q1
-
-        pxor        mm6, [GLOBAL(t80)]    ; offset to convert to signed values
-        pxor        mm0, [GLOBAL(t80)]    ; offset to convert to signed values
-        movq        mm3, mm0              ; q0
-        psubsb      mm0, mm6              ; q0 - p0
-        paddsb      mm2, mm0              ; p1 - q1 + 1 * (q0 - p0)
-        paddsb      mm2, mm0              ; p1 - q1 + 2 * (q0 - p0)
-        paddsb      mm2, mm0              ; p1 - q1 + 3 * (q0 - p0)
-        pand        mm5, mm2              ; mask filter values we don't care about
-
-        ; do + 4 side
-        paddsb      mm5, [GLOBAL(t4)]     ; 3* (q0 - p0) + (p1 - q1) + 4
-
-        movq        mm0, mm5              ; get a copy of filters
-        psllw       mm0, 8                ; shift left 8
-        psraw       mm0, 3                ; arithmetic shift right 11
-        psrlw       mm0, 8
-        movq        mm1, mm5              ; get a copy of filters
-        psraw       mm1, 11               ; arithmetic shift right 11
-        psllw       mm1, 8                ; shift left 8 to put it back
-
-        por         mm0, mm1              ; put the two together to get result
-
-        psubsb      mm3, mm0              ; q0-= q0 add
-        pxor        mm3, [GLOBAL(t80)]    ; unoffset
-        movq        [rsi], mm3            ; write back
-
-
-        ; now do +3 side
-        psubsb      mm5, [GLOBAL(t1s)]     ; +3 instead of +4
-
-        movq        mm0, mm5              ; get a copy of filters
-        psllw       mm0, 8                ; shift left 8
-        psraw       mm0, 3                ; arithmetic shift right 11
-        psrlw       mm0, 8
-        psraw       mm5, 11               ; arithmetic shift right 11
-        psllw       mm5, 8                ; shift left 8 to put it back
-        por         mm0, mm5              ; put the two together to get result
-
-
-        paddsb      mm6, mm0              ; p0+= p0 add
-        pxor        mm6, [GLOBAL(t80)]    ; unoffset
-        movq        [rsi+rax], mm6        ; write back
-
-        add         rsi,8
-        neg         rax
-        dec         rcx
-        jnz         .nexts8_h
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_loop_filter_simple_vertical_edge_mmx
-;(
-;    unsigned char *src_ptr,
-;    int  src_pixel_step,
-;    const char *blimit
-;)
-global sym(vp9_loop_filter_simple_vertical_edge_mmx) PRIVATE
-sym(vp9_loop_filter_simple_vertical_edge_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 3
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub          rsp, 32      ; reserve 32 bytes
-    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
-    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
-
-        mov         rsi, arg(0) ;src_ptr
-        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
-
-        lea         rsi, [rsi + rax*4- 2];  ;
-        mov         rcx, 2                                      ; count
-.nexts8_v:
-
-        lea         rdi,        [rsi + rax];
-        movd        mm0,        [rdi + rax * 2]                 ; xx xx xx xx 73 72 71 70
-
-        movd        mm6,        [rsi + rax * 2]                 ; xx xx xx xx 63 62 61 60
-        punpcklbw   mm6,        mm0                             ; 73 63 72 62 71 61 70 60
-
-        movd        mm0,        [rsi + rax]                     ; xx xx xx xx 53 52 51 50
-        movd        mm4,        [rsi]                           ; xx xx xx xx 43 42 41 40
-
-        punpcklbw   mm4,        mm0                             ; 53 43 52 42 51 41 50 40
-        movq        mm5,        mm4                             ; 53 43 52 42 51 41 50 40
-
-        punpcklwd   mm4,        mm6                             ; 71 61 51 41 70 60 50 40
-        punpckhwd   mm5,        mm6                             ; 73 63 53 43 72 62 52 42
-
-        neg         rax
-
-        movd        mm7,        [rsi + rax]                     ; xx xx xx xx 33 32 31 30
-        movd        mm6,        [rsi + rax * 2]                 ; xx xx xx xx 23 22 21 20
-
-        punpcklbw   mm6,        mm7                             ; 33 23 32 22 31 21 30 20
-        movd        mm1,        [rdi + rax * 4]                 ; xx xx xx xx 13 12 11 10
-
-        movd        mm0,        [rsi + rax * 4]                 ; xx xx xx xx 03 02 01 00
-        punpcklbw   mm0,        mm1                             ; 13 03 12 02 11 01 10 00
-
-        movq        mm2,        mm0                             ; 13 03 12 02 11 01 10 00
-        punpcklwd   mm0,        mm6                             ; 31 21 11 01 30 20 10 00
-
-        punpckhwd   mm2,        mm6                             ; 33 23 13 03 32 22 12 02
-        movq        mm1,        mm0                             ; 13 03 12 02 11 01 10 00
-
-        punpckldq   mm0,        mm4                             ; 70 60 50 40 30 20 10 00       = p1
-        movq        mm3,        mm2                             ; 33 23 13 03 32 22 12 02
-
-        punpckhdq   mm1,        mm4                             ; 71 61 51 41 31 21 11 01       = p0
-        punpckldq   mm2,        mm5                             ; 72 62 52 42 32 22 12 02       = q0
-
-        punpckhdq   mm3,        mm5                             ; 73 63 53 43 33 23 13 03       = q1
-
-
-        ; calculate mask
-        movq        mm6,        mm0                             ; p1
-        movq        mm7,        mm3                             ; q1
-        psubusb     mm7,        mm6                             ; q1-=p1
-        psubusb     mm6,        mm3                             ; p1-=q1
-        por         mm6,        mm7                             ; abs(p1-q1)
-        pand        mm6,        [GLOBAL(tfe)]                   ; set lsb of each byte to zero
-        psrlw       mm6,        1                               ; abs(p1-q1)/2
-
-        movq        mm5,        mm1                             ; p0
-        movq        mm4,        mm2                             ; q0
-
-        psubusb     mm5,        mm2                             ; p0-=q0
-        psubusb     mm4,        mm1                             ; q0-=p0
-
-        por         mm5,        mm4                             ; abs(p0 - q0)
-        paddusb     mm5,        mm5                             ; abs(p0-q0)*2
-        paddusb     mm5,        mm6                             ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
-        mov         rdx,        arg(2) ;blimit                          ; get blimit
-        movq        mm7,        [rdx]
-
-        psubusb     mm5,        mm7                             ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
-        pxor        mm7,        mm7
-        pcmpeqb     mm5,        mm7                             ; mm5 = mask
-
-        ; start work on filters
-        movq        t0,         mm0
-        movq        t1,         mm3
-
-        pxor        mm0,        [GLOBAL(t80)]                   ; p1 offset to convert to signed values
-        pxor        mm3,        [GLOBAL(t80)]                   ; q1 offset to convert to signed values
-
-        psubsb      mm0,        mm3                             ; p1 - q1
-        movq        mm6,        mm1                             ; p0
-
-        movq        mm7,        mm2                             ; q0
-        pxor        mm6,        [GLOBAL(t80)]                   ; offset to convert to signed values
-
-        pxor        mm7,        [GLOBAL(t80)]                   ; offset to convert to signed values
-        movq        mm3,        mm7                             ; offseted ; q0
-
-        psubsb      mm7,        mm6                             ; q0 - p0
-        paddsb      mm0,        mm7                             ; p1 - q1 + 1 * (q0 - p0)
-
-        paddsb      mm0,        mm7                             ; p1 - q1 + 2 * (q0 - p0)
-        paddsb      mm0,        mm7                             ; p1 - q1 + 3 * (q0 - p0)
-
-        pand        mm5,        mm0                             ; mask filter values we don't care about
-
-        paddsb      mm5,        [GLOBAL(t4)]                    ;  3* (q0 - p0) + (p1 - q1) + 4
-
-        movq        mm0,        mm5                             ; get a copy of filters
-        psllw       mm0,        8                               ; shift left 8
-        psraw       mm0,        3                               ; arithmetic shift right 11
-        psrlw       mm0,        8
-
-        movq        mm7,        mm5                             ; get a copy of filters
-        psraw       mm7,        11                              ; arithmetic shift right 11
-        psllw       mm7,        8                               ; shift left 8 to put it back
-
-        por         mm0,        mm7                             ; put the two together to get result
-
-        psubsb      mm3,        mm0                             ; q0-= q0sz add
-        pxor        mm3,        [GLOBAL(t80)]                   ; unoffset
-
-        ; now do +3 side
-        psubsb      mm5, [GLOBAL(t1s)]                          ; +3 instead of +4
-
-        movq        mm0, mm5                                    ; get a copy of filters
-        psllw       mm0, 8                                      ; shift left 8
-        psraw       mm0, 3                                      ; arithmetic shift right 11
-        psrlw       mm0, 8
-
-        psraw       mm5, 11                                     ; arithmetic shift right 11
-        psllw       mm5, 8                                      ; shift left 8 to put it back
-        por         mm0, mm5                                    ; put the two together to get result
-
-        paddsb      mm6, mm0                                    ; p0+= p0 add
-        pxor        mm6, [GLOBAL(t80)]                          ; unoffset
-
-
-        movq        mm0,        t0
-        movq        mm4,        t1
-
-        ; mm0 = 70 60 50 40 30 20 10 00
-        ; mm6 = 71 61 51 41 31 21 11 01
-        ; mm3 = 72 62 52 42 32 22 12 02
-        ; mm4 = 73 63 53 43 33 23 13 03
-        ; transpose back to write out
-
-        movq        mm1,        mm0                         ;
-        punpcklbw   mm0,        mm6                         ; 31 30 21 20 11 10 01 00
-
-        punpckhbw   mm1,        mm6                         ; 71 70 61 60 51 50 41 40
-        movq        mm2,        mm3                         ;
-
-        punpcklbw   mm2,        mm4                         ; 33 32 23 22 13 12 03 02
-        movq        mm5,        mm1                         ; 71 70 61 60 51 50 41 40
-
-        punpckhbw   mm3,        mm4                         ; 73 72 63 62 53 52 43 42
-        movq        mm6,        mm0                         ; 31 30 21 20 11 10 01 00
-
-        punpcklwd   mm0,        mm2                         ; 13 12 11 10 03 02 01 00
-        punpckhwd   mm6,        mm2                         ; 33 32 31 30 23 22 21 20
-
-        movd        [rsi+rax*4], mm0                        ; write 03 02 01 00
-        punpcklwd   mm1,        mm3                         ; 53 52 51 50 43 42 41 40
-
-        psrlq       mm0,        32                          ; xx xx xx xx 13 12 11 10
-        punpckhwd   mm5,        mm3                         ; 73 72 71 70 63 62 61 60
-
-        movd        [rdi+rax*4], mm0                        ; write 13 12 11 10
-        movd        [rsi+rax*2], mm6                        ; write 23 22 21 20
-
-        psrlq       mm6,        32                          ; 33 32 31 30
-        movd        [rsi],      mm1                         ; write 43 42 41 40
-
-        movd        [rsi + rax], mm6                        ; write 33 32 31 30
-        neg         rax
-
-        movd        [rsi + rax*2], mm5                      ; write 63 62 61 60
-        psrlq       mm1,        32                          ; 53 52 51 50
-
-        movd        [rdi],      mm1                         ; write out 53 52 51 50
-        psrlq       mm5,        32                          ; 73 72 71 70
-
-        movd        [rdi + rax*2], mm5                      ; write 73 72 71 70
-
-        lea         rsi,        [rsi+rax*8]                 ; next 8
-
-        dec         rcx
-        jnz         .nexts8_v
-
-    add rsp, 32
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-
-;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr,
-;                  int y_stride,
-;                  loop_filter_info *lfi)
-;{
-;
-;
-;    vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
-;    vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
-;    vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
-;}
-
 SECTION_RODATA
 align 16
 tfe:
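
For readers following the removal: the arithmetic the deleted MMX routine above (and the SSE2 counterparts in the next file) carried out is the "simple" loop filter, and it can be written as scalar C directly from the comments in the deleted code. The sketch below is illustrative only — the function and variable names are not part of the library — and mirrors the commented steps: filter the edge only when abs(p0 - q0) * 2 + abs(p1 - q1) / 2 stays within blimit, build a clamped 3 * (q0 - p0) + (p1 - q1) value, then apply the +4 and +3 adjustments to q0 and p0.

```c
#include <stdlib.h>

/* Saturate to the signed-byte range, as psubsb/paddsb do. */
static int clamp_s8(int v) { return v < -128 ? -128 : (v > 127 ? 127 : v); }

/* One edge of the simple loop filter, scalar form (illustrative names). */
static void simple_filter_scalar(unsigned char *op1, unsigned char *op0,
                                 unsigned char *oq0, unsigned char *oq1,
                                 int blimit) {
  /* pxor with 0x80 in the SIMD code == subtract 128 to work in signed range */
  const int p1 = *op1 - 128, p0 = *op0 - 128;
  const int q0 = *oq0 - 128, q1 = *oq1 - 128;
  /* mask: filter only when abs(p0-q0)*2 + abs(p1-q1)/2 is within blimit */
  const int mask =
      (abs(*op0 - *oq0) * 2 + abs(*op1 - *oq1) / 2 <= blimit) ? -1 : 0;
  const int dq = clamp_s8(q0 - p0);
  int f = clamp_s8(p1 - q1);      /* psubsb           */
  f = clamp_s8(f + dq);           /* + 1 * (q0 - p0)  */
  f = clamp_s8(f + dq);           /* + 2 * (q0 - p0)  */
  f = clamp_s8(f + dq);           /* + 3 * (q0 - p0)  */
  f &= mask;                      /* pand with mask   */
  /* >> 3 assumes arithmetic shift on signed ints, matching psraw */
  *oq0 = (unsigned char)(clamp_s8(q0 - (clamp_s8(f + 4) >> 3)) + 128);
  *op0 = (unsigned char)(clamp_s8(p0 + (clamp_s8(f + 3) >> 3)) + 128);
}
```

The SIMD versions do the same per byte across 8 (MMX) or 16 (SSE2) pixels at once, using the 0x80 offset so the packed values can be treated as signed bytes.
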
diff --git a/vp9/common/x86/vp9_loopfilter_sse2.asm b/vp9/common/x86/vp9_loopfilter_sse2.asm
index ae4c60f..74236cf 100644
--- a/vp9/common/x86/vp9_loopfilter_sse2.asm
+++ b/vp9/common/x86/vp9_loopfilter_sse2.asm
@@ -845,372 +845,6 @@
     pop         rbp
     ret
 
-;void vp9_loop_filter_simple_horizontal_edge_sse2
-;(
-;    unsigned char *src_ptr,
-;    int  src_pixel_step,
-;    const char *blimit,
-;)
-global sym(vp9_loop_filter_simple_horizontal_edge_sse2) PRIVATE
-sym(vp9_loop_filter_simple_horizontal_edge_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 3
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rsi, arg(0)             ;src_ptr
-        movsxd      rax, dword ptr arg(1)   ;src_pixel_step     ; destination pitch?
-        mov         rdx, arg(2)             ;blimit
-        movdqa      xmm3, XMMWORD PTR [rdx]
-
-        mov         rdi, rsi                ; rdi points to row +1 for indirect addressing
-        add         rdi, rax
-        neg         rax
-
-        ; calculate mask
-        movdqa      xmm1, [rsi+2*rax]       ; p1
-        movdqa      xmm0, [rdi]             ; q1
-        movdqa      xmm2, xmm1
-        movdqa      xmm7, xmm0
-        movdqa      xmm4, xmm0
-        psubusb     xmm0, xmm1              ; q1-=p1
-        psubusb     xmm1, xmm4              ; p1-=q1
-        por         xmm1, xmm0              ; abs(p1-q1)
-        pand        xmm1, [GLOBAL(tfe)]     ; set lsb of each byte to zero
-        psrlw       xmm1, 1                 ; abs(p1-q1)/2
-
-        movdqa      xmm5, [rsi+rax]         ; p0
-        movdqa      xmm4, [rsi]             ; q0
-        movdqa      xmm0, xmm4              ; q0
-        movdqa      xmm6, xmm5              ; p0
-        psubusb     xmm5, xmm4              ; p0-=q0
-        psubusb     xmm4, xmm6              ; q0-=p0
-        por         xmm5, xmm4              ; abs(p0 - q0)
-        paddusb     xmm5, xmm5              ; abs(p0-q0)*2
-        paddusb     xmm5, xmm1              ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
-        psubusb     xmm5, xmm3              ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
-        pxor        xmm3, xmm3
-        pcmpeqb     xmm5, xmm3
-
-        ; start work on filters
-        pxor        xmm2, [GLOBAL(t80)]     ; p1 offset to convert to signed values
-        pxor        xmm7, [GLOBAL(t80)]     ; q1 offset to convert to signed values
-        psubsb      xmm2, xmm7              ; p1 - q1
-
-        pxor        xmm6, [GLOBAL(t80)]     ; offset to convert to signed values
-        pxor        xmm0, [GLOBAL(t80)]     ; offset to convert to signed values
-        movdqa      xmm3, xmm0              ; q0
-        psubsb      xmm0, xmm6              ; q0 - p0
-        paddsb      xmm2, xmm0              ; p1 - q1 + 1 * (q0 - p0)
-        paddsb      xmm2, xmm0              ; p1 - q1 + 2 * (q0 - p0)
-        paddsb      xmm2, xmm0              ; p1 - q1 + 3 * (q0 - p0)
-        pand        xmm5, xmm2              ; mask filter values we don't care about
-
-        ; do + 4 side
-        paddsb      xmm5, [GLOBAL(t4)]      ; 3* (q0 - p0) + (p1 - q1) + 4
-
-        movdqa      xmm0, xmm5              ; get a copy of filters
-        psllw       xmm0, 8                 ; shift left 8
-        psraw       xmm0, 3                 ; arithmetic shift right 11
-        psrlw       xmm0, 8
-        movdqa      xmm1, xmm5              ; get a copy of filters
-        psraw       xmm1, 11                ; arithmetic shift right 11
-        psllw       xmm1, 8                 ; shift left 8 to put it back
-
-        por         xmm0, xmm1              ; put the two together to get result
-
-        psubsb      xmm3, xmm0              ; q0-= q0 add
-        pxor        xmm3, [GLOBAL(t80)]     ; unoffset
-        movdqa      [rsi], xmm3             ; write back
-
-        ; now do +3 side
-        psubsb      xmm5, [GLOBAL(t1s)]     ; +3 instead of +4
-
-        movdqa      xmm0, xmm5              ; get a copy of filters
-        psllw       xmm0, 8                 ; shift left 8
-        psraw       xmm0, 3                 ; arithmetic shift right 11
-        psrlw       xmm0, 8
-        psraw       xmm5, 11                ; arithmetic shift right 11
-        psllw       xmm5, 8                 ; shift left 8 to put it back
-        por         xmm0, xmm5              ; put the two together to get result
-
-
-        paddsb      xmm6, xmm0              ; p0+= p0 add
-        pxor        xmm6, [GLOBAL(t80)]     ; unoffset
-        movdqa      [rsi+rax], xmm6         ; write back
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_loop_filter_simple_vertical_edge_sse2
-;(
-;    unsigned char *src_ptr,
-;    int  src_pixel_step,
-;    const char *blimit,
-;)
-global sym(vp9_loop_filter_simple_vertical_edge_sse2) PRIVATE
-sym(vp9_loop_filter_simple_vertical_edge_sse2):
-    push        rbp         ; save old base pointer value.
-    mov         rbp, rsp    ; set new base pointer value.
-    SHADOW_ARGS_TO_STACK 3
-    SAVE_XMM 7
-    GET_GOT     rbx         ; save callee-saved reg
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 32                         ; reserve 32 bytes
-    %define t0  [rsp + 0]    ;__declspec(align(16)) char t0[16];
-    %define t1  [rsp + 16]   ;__declspec(align(16)) char t1[16];
-
-        mov         rsi, arg(0) ;src_ptr
-        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
-
-        lea         rsi,        [rsi - 2 ]
-        lea         rdi,        [rsi + rax]
-        lea         rdx,        [rsi + rax*4]
-        lea         rcx,        [rdx + rax]
-
-        movd        xmm0,       [rsi]                   ; (high 96 bits unused) 03 02 01 00
-        movd        xmm1,       [rdx]                   ; (high 96 bits unused) 43 42 41 40
-        movd        xmm2,       [rdi]                   ; 13 12 11 10
-        movd        xmm3,       [rcx]                   ; 53 52 51 50
-        punpckldq   xmm0,       xmm1                    ; (high 64 bits unused) 43 42 41 40 03 02 01 00
-        punpckldq   xmm2,       xmm3                    ; 53 52 51 50 13 12 11 10
-
-        movd        xmm4,       [rsi + rax*2]           ; 23 22 21 20
-        movd        xmm5,       [rdx + rax*2]           ; 63 62 61 60
-        movd        xmm6,       [rdi + rax*2]           ; 33 32 31 30
-        movd        xmm7,       [rcx + rax*2]           ; 73 72 71 70
-        punpckldq   xmm4,       xmm5                    ; 63 62 61 60 23 22 21 20
-        punpckldq   xmm6,       xmm7                    ; 73 72 71 70 33 32 31 30
-
-        punpcklbw   xmm0,       xmm2                    ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
-        punpcklbw   xmm4,       xmm6                    ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20
-
-        movdqa      xmm1,       xmm0
-        punpcklwd   xmm0,       xmm4                    ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
-        punpckhwd   xmm1,       xmm4                    ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
-
-        movdqa      xmm2,       xmm0
-        punpckldq   xmm0,       xmm1                    ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
-        punpckhdq   xmm2,       xmm1                    ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
-
-        movdqa      t0,         xmm0                    ; save to t0
-        movdqa      t1,         xmm2                    ; save to t1
-
-        lea         rsi,        [rsi + rax*8]
-        lea         rdi,        [rsi + rax]
-        lea         rdx,        [rsi + rax*4]
-        lea         rcx,        [rdx + rax]
-
-        movd        xmm4,       [rsi]                   ; 83 82 81 80
-        movd        xmm1,       [rdx]                   ; c3 c2 c1 c0
-        movd        xmm6,       [rdi]                   ; 93 92 91 90
-        movd        xmm3,       [rcx]                   ; d3 d2 d1 d0
-        punpckldq   xmm4,       xmm1                    ; c3 c2 c1 c0 83 82 81 80
-        punpckldq   xmm6,       xmm3                    ; d3 d2 d1 d0 93 92 91 90
-
-        movd        xmm0,       [rsi + rax*2]           ; a3 a2 a1 a0
-        movd        xmm5,       [rdx + rax*2]           ; e3 e2 e1 e0
-        movd        xmm2,       [rdi + rax*2]           ; b3 b2 b1 b0
-        movd        xmm7,       [rcx + rax*2]           ; f3 f2 f1 f0
-        punpckldq   xmm0,       xmm5                    ; e3 e2 e1 e0 a3 a2 a1 a0
-        punpckldq   xmm2,       xmm7                    ; f3 f2 f1 f0 b3 b2 b1 b0
-
-        punpcklbw   xmm4,       xmm6                    ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80
-        punpcklbw   xmm0,       xmm2                    ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0
-
-        movdqa      xmm1,       xmm4
-        punpcklwd   xmm4,       xmm0                    ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
-        punpckhwd   xmm1,       xmm0                    ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
-
-        movdqa      xmm6,       xmm4
-        punpckldq   xmm4,       xmm1                    ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
-        punpckhdq   xmm6,       xmm1                    ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
-
-        movdqa      xmm0,       t0                      ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
-        movdqa      xmm2,       t1                      ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
-        movdqa      xmm1,       xmm0
-        movdqa      xmm3,       xmm2
-
-        punpcklqdq  xmm0,       xmm4                    ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
-        punpckhqdq  xmm1,       xmm4                    ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
-        punpcklqdq  xmm2,       xmm6                    ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
-        punpckhqdq  xmm3,       xmm6                    ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
-
-        ; calculate mask
-        movdqa      xmm6,       xmm0                            ; p1
-        movdqa      xmm7,       xmm3                            ; q1
-        psubusb     xmm7,       xmm0                            ; q1-=p1
-        psubusb     xmm6,       xmm3                            ; p1-=q1
-        por         xmm6,       xmm7                            ; abs(p1-q1)
-        pand        xmm6,       [GLOBAL(tfe)]                   ; set lsb of each byte to zero
-        psrlw       xmm6,       1                               ; abs(p1-q1)/2
-
-        movdqa      xmm5,       xmm1                            ; p0
-        movdqa      xmm4,       xmm2                            ; q0
-        psubusb     xmm5,       xmm2                            ; p0-=q0
-        psubusb     xmm4,       xmm1                            ; q0-=p0
-        por         xmm5,       xmm4                            ; abs(p0 - q0)
-        paddusb     xmm5,       xmm5                            ; abs(p0-q0)*2
-        paddusb     xmm5,       xmm6                            ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
-        mov         rdx,        arg(2)                          ;blimit
-        movdqa      xmm7, XMMWORD PTR [rdx]
-
-        psubusb     xmm5,        xmm7                           ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
-        pxor        xmm7,        xmm7
-        pcmpeqb     xmm5,        xmm7                           ; mm5 = mask
-
-        ; start work on filters
-        movdqa        t0,        xmm0
-        movdqa        t1,        xmm3
-
-        pxor        xmm0,        [GLOBAL(t80)]                  ; p1 offset to convert to signed values
-        pxor        xmm3,        [GLOBAL(t80)]                  ; q1 offset to convert to signed values
-
-        psubsb      xmm0,        xmm3                           ; p1 - q1
-        movdqa      xmm6,        xmm1                           ; p0
-
-        movdqa      xmm7,        xmm2                           ; q0
-        pxor        xmm6,        [GLOBAL(t80)]                  ; offset to convert to signed values
-
-        pxor        xmm7,        [GLOBAL(t80)]                  ; offset to convert to signed values
-        movdqa      xmm3,        xmm7                           ; offseted ; q0
-
-        psubsb      xmm7,        xmm6                           ; q0 - p0
-        paddsb      xmm0,        xmm7                           ; p1 - q1 + 1 * (q0 - p0)
-
-        paddsb      xmm0,        xmm7                           ; p1 - q1 + 2 * (q0 - p0)
-        paddsb      xmm0,        xmm7                           ; p1 - q1 + 3 * (q0 - p0)
-
-        pand        xmm5,        xmm0                           ; mask filter values we don't care about
-
-
-        paddsb      xmm5,        [GLOBAL(t4)]                   ;  3* (q0 - p0) + (p1 - q1) + 4
-
-        movdqa      xmm0,        xmm5                           ; get a copy of filters
-        psllw       xmm0,        8                              ; shift left 8
-
-        psraw       xmm0,        3                              ; arithmetic shift right 11
-        psrlw       xmm0,        8
-
-        movdqa      xmm7,        xmm5                           ; get a copy of filters
-        psraw       xmm7,        11                             ; arithmetic shift right 11
-
-        psllw       xmm7,        8                              ; shift left 8 to put it back
-        por         xmm0,        xmm7                           ; put the two together to get result
-
-        psubsb      xmm3,        xmm0                           ; q0-= q0sz add
-        pxor        xmm3,        [GLOBAL(t80)]                  ; unoffset   q0
-
-        ; now do +3 side
-        psubsb      xmm5,        [GLOBAL(t1s)]                  ; +3 instead of +4
-        movdqa      xmm0,        xmm5                           ; get a copy of filters
-
-        psllw       xmm0,        8                              ; shift left 8
-        psraw       xmm0,        3                              ; arithmetic shift right 11
-
-        psrlw       xmm0,        8
-        psraw       xmm5,        11                             ; arithmetic shift right 11
-
-        psllw       xmm5,        8                              ; shift left 8 to put it back
-        por         xmm0,        xmm5                           ; put the two together to get result
-
-        paddsb      xmm6,        xmm0                           ; p0+= p0 add
-        pxor        xmm6,        [GLOBAL(t80)]                  ; unoffset   p0
-
-        movdqa      xmm0,        t0                             ; p1
-        movdqa      xmm4,        t1                             ; q1
-
-        ; transpose back to write out
-        ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
-        ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
-        ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
-        ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
-        movdqa      xmm1,       xmm0
-        punpcklbw   xmm0,       xmm6                               ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
-        punpckhbw   xmm1,       xmm6                               ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
-
-        movdqa      xmm5,       xmm3
-        punpcklbw   xmm3,       xmm4                               ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
-        punpckhbw   xmm5,       xmm4                               ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
-
-        movdqa      xmm2,       xmm0
-        punpcklwd   xmm0,       xmm3                               ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
-        punpckhwd   xmm2,       xmm3                               ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
-
-        movdqa      xmm3,       xmm1
-        punpcklwd   xmm1,       xmm5                               ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
-        punpckhwd   xmm3,       xmm5                               ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
-
-        ; write out order: xmm0 xmm2 xmm1 xmm3
-        lea         rdx,        [rsi + rax*4]
-
-        movd        [rsi],      xmm1                               ; write the second 8-line result
-        psrldq      xmm1,       4
-        movd        [rdi],      xmm1
-        psrldq      xmm1,       4
-        movd        [rsi + rax*2], xmm1
-        psrldq      xmm1,       4
-        movd        [rdi + rax*2], xmm1
-
-        movd        [rdx],      xmm3
-        psrldq      xmm3,       4
-        movd        [rcx],      xmm3
-        psrldq      xmm3,       4
-        movd        [rdx + rax*2], xmm3
-        psrldq      xmm3,       4
-        movd        [rcx + rax*2], xmm3
-
-        neg         rax
-        lea         rsi,        [rsi + rax*8]
-        neg         rax
-        lea         rdi,        [rsi + rax]
-        lea         rdx,        [rsi + rax*4]
-        lea         rcx,        [rdx + rax]
-
-        movd        [rsi],      xmm0                                ; write the first 8-line result
-        psrldq      xmm0,       4
-        movd        [rdi],      xmm0
-        psrldq      xmm0,       4
-        movd        [rsi + rax*2], xmm0
-        psrldq      xmm0,       4
-        movd        [rdi + rax*2], xmm0
-
-        movd        [rdx],      xmm2
-        psrldq      xmm2,       4
-        movd        [rcx],      xmm2
-        psrldq      xmm2,       4
-        movd        [rdx + rax*2], xmm2
-        psrldq      xmm2,       4
-        movd        [rcx + rax*2], xmm2
-
-    add rsp, 32
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
 SECTION_RODATA
 align 16
 tfe:
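
One detail of the removed filters worth noting: neither MMX nor SSE2 has an arithmetic right shift for packed bytes, so both routines derive the per-byte `filter >> 3` from word shifts — psllw 8 / psraw 3 / psrlw 8 for the low bytes, psraw 11 / psllw 8 for the high bytes, then por, exactly as the comments above describe. A minimal scalar sketch of the same trick on a single 16-bit word (assumptions noted in the comments; the helper name is illustrative):

```c
#include <stdint.h>

/* Per-byte arithmetic >> 3 built only from 16-bit shifts. Assumes
 * two's-complement and arithmetic right shift on signed types, which is
 * what psraw provides in the SIMD version. */
static uint16_t per_byte_sar3(uint16_t w) {
  /* low byte: move into the high half, shift with sign, move back down */
  const uint16_t lo = (uint16_t)((int16_t)(w << 8) >> 3) >> 8;
  /* high byte: shift with sign by 8 + 3, then move back into the high half */
  const uint16_t hi = (uint16_t)((uint16_t)((int16_t)w >> 11) << 8);
  return (uint16_t)(hi | lo);  /* por */
}
```

In the deleted code this appears twice per edge: once for the filter + 4 value subtracted from q0 and once for the filter + 3 value added to p0.
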
diff --git a/vp9/common/x86/vp9_loopfilter_x86.h b/vp9/common/x86/vp9_loopfilter_x86.h
index 46a6202..fb5af05 100644
--- a/vp9/common/x86/vp9_loopfilter_x86.h
+++ b/vp9/common/x86/vp9_loopfilter_x86.h
@@ -23,10 +23,6 @@
 extern prototype_loopfilter_block(vp9_loop_filter_bv_mmx);
 extern prototype_loopfilter_block(vp9_loop_filter_mbh_mmx);
 extern prototype_loopfilter_block(vp9_loop_filter_bh_mmx);
-extern prototype_simple_loopfilter(vp9_loop_filter_simple_vertical_edge_mmx);
-extern prototype_simple_loopfilter(vp9_loop_filter_bvs_mmx);
-extern prototype_simple_loopfilter(vp9_loop_filter_simple_horizontal_edge_mmx);
-extern prototype_simple_loopfilter(vp9_loop_filter_bhs_mmx);
 #endif
 
 #if HAVE_SSE2
@@ -34,10 +30,6 @@
 extern prototype_loopfilter_block(vp9_loop_filter_bv_sse2);
 extern prototype_loopfilter_block(vp9_loop_filter_mbh_sse2);
 extern prototype_loopfilter_block(vp9_loop_filter_bh_sse2);
-extern prototype_simple_loopfilter(vp9_loop_filter_simple_vertical_edge_sse2);
-extern prototype_simple_loopfilter(vp9_loop_filter_bvs_sse2);
-extern prototype_simple_loopfilter(vp9_loop_filter_simple_horizontal_edge_sse2);
-extern prototype_simple_loopfilter(vp9_loop_filter_bhs_sse2);
 #endif
 
 #endif  // LOOPFILTER_X86_H
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 408573f..a1f780a 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -37,11 +37,6 @@
 
 static B_PREDICTION_MODE read_bmode(vp9_reader *r, const vp9_prob *p) {
   B_PREDICTION_MODE m = treed_read(r, vp9_bmode_tree, p);
-#if CONFIG_NEWBINTRAMODES
-  if (m == B_CONTEXT_PRED - CONTEXT_PRED_REPLACEMENTS)
-    m = B_CONTEXT_PRED;
-  assert(m < B_CONTEXT_PRED - CONTEXT_PRED_REPLACEMENTS || m == B_CONTEXT_PRED);
-#endif
   return m;
 }
 
@@ -65,12 +60,6 @@
   return (MB_PREDICTION_MODE)treed_read(r, vp9_kf_ymode_tree, p);
 }
 
-#if !CONFIG_SB8X8
-static int read_i8x8_mode(vp9_reader *r, const vp9_prob *p) {
-  return treed_read(r, vp9_i8x8_mode_tree, p);
-}
-#endif
-
 static MB_PREDICTION_MODE read_uv_mode(vp9_reader *r, const vp9_prob *p) {
   return (MB_PREDICTION_MODE)treed_read(r, vp9_uv_mode_tree, p);
 }
@@ -108,7 +97,7 @@
   return txfm_size;
 }
 
-extern const int vp9_i8x8_block[4];
+
 static void kfread_modes(VP9D_COMP *pbi, MODE_INFO *m,
                          int mi_row, int mi_col,
                          vp9_reader *r) {
@@ -130,11 +119,7 @@
     m->mbmi.mb_skip_coeff = vp9_read(r, vp9_get_pred_prob(cm, xd, PRED_MBSKIP));
 
   // luma mode
-#if CONFIG_SB8X8
   m->mbmi.mode = m->mbmi.sb_type > BLOCK_SIZE_SB8X8 ?
-#else
-  m->mbmi.mode = m->mbmi.sb_type > BLOCK_SIZE_MB16X16 ?
-#endif
       read_kf_sb_ymode(r, cm->sb_kf_ymode_prob[cm->kf_ymode_probs_index]):
       read_kf_mb_ymode(r, cm->kf_ymode_prob[cm->kf_ymode_probs_index]);
 
@@ -142,57 +127,27 @@
 
   if (m->mbmi.mode == I4X4_PRED) {
     int i;
-    for (i = 0; i < (16 >> (2 * CONFIG_SB8X8)); ++i) {
+    for (i = 0; i < 4; ++i) {
       const B_PREDICTION_MODE a = above_block_mode(m, i, mis);
-      const B_PREDICTION_MODE l = xd->left_available || (i & 3) ?
+      const B_PREDICTION_MODE l = xd->left_available ||
+                                  (i & 1) ?
                                   left_block_mode(m, i) : B_DC_PRED;
       m->bmi[i].as_mode.first = read_kf_bmode(r, cm->kf_bmode_prob[a][l]);
     }
   }
 
-#if !CONFIG_SB8X8
-  if (m->mbmi.mode == I8X8_PRED) {
-    int i;
-    for (i = 0; i < 4; ++i) {
-      const int ib = vp9_i8x8_block[i];
-      const int mode8x8 = read_i8x8_mode(r, cm->fc.i8x8_mode_prob);
-
-      m->bmi[ib + 0].as_mode.first = mode8x8;
-      m->bmi[ib + 1].as_mode.first = mode8x8;
-      m->bmi[ib + 4].as_mode.first = mode8x8;
-      m->bmi[ib + 5].as_mode.first = mode8x8;
-    }
-  }
-
-  // chroma mode
-  if (m->mbmi.mode != I8X8_PRED)
-#endif
-  {
-    m->mbmi.uv_mode = read_uv_mode(r, cm->kf_uv_mode_prob[m->mbmi.mode]);
-  }
+  m->mbmi.uv_mode = read_uv_mode(r, cm->kf_uv_mode_prob[m->mbmi.mode]);
 
   if (cm->txfm_mode == TX_MODE_SELECT &&
-      !m->mbmi.mb_skip_coeff &&
-#if CONFIG_SB8X8
-      m->mbmi.mode != I4X4_PRED
-#else
-      m->mbmi.mode <= I8X8_PRED
-#endif
-      ) {
-#if CONFIG_SB8X8
+      !m->mbmi.mb_skip_coeff && m->mbmi.mode != I4X4_PRED) {
     const int allow_16x16 = m->mbmi.sb_type >= BLOCK_SIZE_MB16X16;
-#else
-    const int allow_16x16 = m->mbmi.mode != I8X8_PRED;
-#endif
     const int allow_32x32 = m->mbmi.sb_type >= BLOCK_SIZE_SB32X32;
     m->mbmi.txfm_size = select_txfm_size(cm, r, allow_16x16, allow_32x32);
   } else if (cm->txfm_mode >= ALLOW_32X32 &&
              m->mbmi.sb_type >= BLOCK_SIZE_SB32X32) {
     m->mbmi.txfm_size = TX_32X32;
   } else if (cm->txfm_mode >= ALLOW_16X16 &&
-#if CONFIG_SB8X8
              m->mbmi.sb_type >= BLOCK_SIZE_MB16X16 &&
-#endif
              m->mbmi.mode <= TM_PRED) {
     m->mbmi.txfm_size = TX_16X16;
   } else if (cm->txfm_mode >= ALLOW_8X8 && m->mbmi.mode != I4X4_PRED) {
@@ -448,14 +403,6 @@
 };
 #endif
 
-static const unsigned char mbsplit_fill_count[4] = { 8, 8, 4, 1 };
-static const unsigned char mbsplit_fill_offset[4][16] = {
-  { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15 },
-  { 0,  1,  4,  5,  8,  9, 12, 13,  2,  3,   6,  7, 10, 11, 14, 15 },
-  { 0,  1,  4,  5,  2,  3,  6,  7,  8,  9,  12, 13, 10, 11, 14, 15 },
-  { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15 }
-};
-
 static void read_switchable_interp_probs(VP9D_COMP* const pbi, vp9_reader *r) {
   VP9_COMMON *const cm = &pbi->common;
   int i, j;
@@ -484,12 +431,7 @@
 
     if (cm->mcomp_filter_type == SWITCHABLE)
       read_switchable_interp_probs(pbi, r);
-#if CONFIG_COMP_INTERINTRA_PRED
-    if (cm->use_interintra) {
-      if (vp9_read(r, VP9_UPD_INTERINTRA_PROB))
-        cm->fc.interintra_prob = vp9_read_prob(r);
-    }
-#endif
+
     // Baseline probabilities for decoding reference frame
     cm->prob_intra_coded = vp9_read_prob(r);
     cm->prob_last_coded  = vp9_read_prob(r);
@@ -676,12 +618,7 @@
       if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_SKIP)) {
         mbmi->mode = ZEROMV;
       } else {
-        mbmi->mode =
-#if CONFIG_SB8X8
-                     mbmi->sb_type > BLOCK_SIZE_SB8X8 ?
-#else
-                     mbmi->sb_type > BLOCK_SIZE_MB16X16 ?
-#endif
+        mbmi->mode = mbmi->sb_type > BLOCK_SIZE_SB8X8 ?
                                      read_sb_mv_ref(r, mv_ref_p)
                                    : read_mv_ref(r, mv_ref_p);
         vp9_accum_mv_refs(cm, mbmi->mode, mbmi->mb_mode_context[ref_frame]);
@@ -743,61 +680,21 @@
         }
       }
 
-    } else {
-#if CONFIG_COMP_INTERINTRA_PRED
-      if (pbi->common.use_interintra &&
-          mbmi->mode >= NEARESTMV && mbmi->mode < SPLITMV &&
-          mbmi->second_ref_frame == NONE) {
-        mbmi->second_ref_frame = (vp9_read(r, pbi->common.fc.interintra_prob) ?
-                                  INTRA_FRAME : NONE);
-        // printf("-- %d (%d)\n", mbmi->second_ref_frame == INTRA_FRAME,
-        //        pbi->common.fc.interintra_prob);
-        pbi->common.fc.interintra_counts[
-            mbmi->second_ref_frame == INTRA_FRAME]++;
-        if (mbmi->second_ref_frame == INTRA_FRAME) {
-          mbmi->interintra_mode = read_ymode(r, pbi->common.fc.ymode_prob);
-          pbi->common.fc.ymode_counts[mbmi->interintra_mode]++;
-#if SEPARATE_INTERINTRA_UV
-          mbmi->interintra_uv_mode = read_uv_mode(r,
-              pbi->common.fc.uv_mode_prob[mbmi->interintra_mode]);
-          pbi->common.fc.uv_mode_counts[mbmi->interintra_mode]
-                                       [mbmi->interintra_uv_mode]++;
-#else
-          mbmi->interintra_uv_mode = mbmi->interintra_mode;
-#endif
-          // printf("** %d %d\n",
-          //        mbmi->interintra_mode, mbmi->interintra_uv_mode);
-        }
-      }
-#endif
     }
 
     mbmi->uv_mode = DC_PRED;
     switch (mbmi->mode) {
       case SPLITMV: {
-#if CONFIG_SB8X8
         const int num_p = 4;
-#else
-        const int s = treed_read(r, vp9_mbsplit_tree, cm->fc.mbsplit_prob);
-        const int num_p = vp9_mbsplit_count[s];
-#endif
         int j = 0;
 
-#if !CONFIG_SB8X8
-        cm->fc.mbsplit_counts[s]++;
-        mbmi->partitioning = s;
-#endif
         mbmi->need_to_clamp_mvs = 0;
         do {  // for each subset j
           int_mv leftmv, abovemv, second_leftmv, second_abovemv;
           int_mv blockmv, secondmv;
           int mv_contz;
           int blockmode;
-#if CONFIG_SB8X8
           int k = j;
-#else
-          int k = vp9_mbsplit_offset[s][j];  // first block in subset j
-#endif
 
           leftmv.as_int = left_block_mv(xd, mi, k);
           abovemv.as_int = above_block_mv(mi, k, mis);
@@ -851,50 +748,14 @@
             default:
               break;
           }
-
-          /*  Commenting this section out, not sure why this was needed, and
-           *  there are mismatches with this section in rare cases since it is
-           *  not done in the encoder at all.
-          mbmi->need_to_clamp_mvs |= check_mv_bounds(&blockmv,
-                                                     mb_to_left_edge,
-                                                     mb_to_right_edge,
-                                                     mb_to_top_edge,
-                                                     mb_to_bottom_edge);
-          if (mbmi->second_ref_frame > 0) {
-            mbmi->need_to_clamp_mvs |= check_mv_bounds(&secondmv,
-                                                       mb_to_left_edge,
-                                                       mb_to_right_edge,
-                                                       mb_to_top_edge,
-                                                       mb_to_bottom_edge);
-          }
-          */
-
-#if CONFIG_SB8X8
           mi->bmi[j].as_mv[0].as_int = blockmv.as_int;
           if (mbmi->second_ref_frame > 0)
             mi->bmi[j].as_mv[1].as_int = secondmv.as_int;
-#else
-          {
-            /* Fill (uniform) modes, mvs of jth subset.
-             Must do it here because ensuing subsets can
-             refer back to us via "left" or "above". */
-            unsigned int fill_count = mbsplit_fill_count[s];
-            const uint8_t *fill_offset =
-                &mbsplit_fill_offset[s][j * fill_count];
-
-            do {
-              mi->bmi[*fill_offset].as_mv[0].as_int = blockmv.as_int;
-              if (mbmi->second_ref_frame > 0)
-                mi->bmi[*fill_offset].as_mv[1].as_int = secondmv.as_int;
-              fill_offset++;
-            } while (--fill_count);
-          }
-#endif
         } while (++j < num_p);
       }
 
-      mv0->as_int = mi->bmi[15 >> (2 * CONFIG_SB8X8)].as_mv[0].as_int;
-      mv1->as_int = mi->bmi[15 >> (2 * CONFIG_SB8X8)].as_mv[1].as_int;
+      mv0->as_int = mi->bmi[3].as_mv[0].as_int;
+      mv1->as_int = mi->bmi[3].as_mv[1].as_int;
 
       break;  /* done with SPLITMV */
 
@@ -959,12 +820,7 @@
     // required for left and above block mv
     mv0->as_int = 0;
 
-#if CONFIG_SB8X8
-    if (mbmi->sb_type > BLOCK_SIZE_SB8X8)
-#else
-    if (mbmi->sb_type > BLOCK_SIZE_MB16X16)
-#endif
-    {
+    if (mbmi->sb_type > BLOCK_SIZE_SB8X8) {
       mbmi->mode = read_sb_ymode(r, cm->fc.sb_ymode_prob);
       cm->fc.sb_ymode_counts[mbmi->mode]++;
     } else {
@@ -978,77 +834,31 @@
       do {
         int m = read_bmode(r, cm->fc.bmode_prob);
         mi->bmi[j].as_mode.first = m;
-#if CONFIG_NEWBINTRAMODES
-        if (m == B_CONTEXT_PRED) m -= CONTEXT_PRED_REPLACEMENTS;
-#endif
         cm->fc.bmode_counts[m]++;
-      } while (++j < (16 >> (2 * CONFIG_SB8X8)));
+      } while (++j < 4);
     }
 
-#if !CONFIG_SB8X8
-    if (mbmi->mode == I8X8_PRED) {
-      int i;
-      for (i = 0; i < 4; i++) {
-        const int ib = vp9_i8x8_block[i];
-        const int mode8x8 = read_i8x8_mode(r, cm->fc.i8x8_mode_prob);
-
-        mi->bmi[ib + 0].as_mode.first = mode8x8;
-        mi->bmi[ib + 1].as_mode.first = mode8x8;
-        mi->bmi[ib + 4].as_mode.first = mode8x8;
-        mi->bmi[ib + 5].as_mode.first = mode8x8;
-        cm->fc.i8x8_mode_counts[mode8x8]++;
-      }
-    } else
-#endif
-    {
-      mbmi->uv_mode = read_uv_mode(r, cm->fc.uv_mode_prob[mbmi->mode]);
-      cm->fc.uv_mode_counts[mbmi->mode][mbmi->uv_mode]++;
-    }
+    mbmi->uv_mode = read_uv_mode(r, cm->fc.uv_mode_prob[mbmi->mode]);
+    cm->fc.uv_mode_counts[mbmi->mode][mbmi->uv_mode]++;
   }
-  /*
-  if (cm->current_video_frame == 1)
-    printf("mode: %d skip: %d\n", mbmi->mode, mbmi->mb_skip_coeff);
-    */
 
   if (cm->txfm_mode == TX_MODE_SELECT && mbmi->mb_skip_coeff == 0 &&
-      ((mbmi->ref_frame == INTRA_FRAME &&
-#if CONFIG_SB8X8
-        mbmi->mode != I4X4_PRED
-#else
-        mbmi->mode <= I8X8_PRED
-#endif
-        ) ||
-       (mbmi->ref_frame != INTRA_FRAME &&
-#if CONFIG_SB8X8
-        mbmi->mode != SPLITMV
-#else
-        !(mbmi->mode == SPLITMV && mbmi->partitioning == PARTITIONING_4X4)
-#endif
-        ))) {
-#if CONFIG_SB8X8
+      ((mbmi->ref_frame == INTRA_FRAME && mbmi->mode != I4X4_PRED) ||
+       (mbmi->ref_frame != INTRA_FRAME && mbmi->mode != SPLITMV))) {
     const int allow_16x16 = mbmi->sb_type >= BLOCK_SIZE_MB16X16;
-#else
-    const int allow_16x16 = mbmi->mode != I8X8_PRED && mbmi->mode != SPLITMV;
-#endif
     const int allow_32x32 = mbmi->sb_type >= BLOCK_SIZE_SB32X32;
     mbmi->txfm_size = select_txfm_size(cm, r, allow_16x16, allow_32x32);
   } else if (mbmi->sb_type >= BLOCK_SIZE_SB32X32 &&
              cm->txfm_mode >= ALLOW_32X32) {
     mbmi->txfm_size = TX_32X32;
   } else if (cm->txfm_mode >= ALLOW_16X16 &&
-#if CONFIG_SB8X8
              mbmi->sb_type >= BLOCK_SIZE_MB16X16 &&
-#endif
       ((mbmi->ref_frame == INTRA_FRAME && mbmi->mode <= TM_PRED) ||
        (mbmi->ref_frame != INTRA_FRAME && mbmi->mode != SPLITMV))) {
     mbmi->txfm_size = TX_16X16;
   } else if (cm->txfm_mode >= ALLOW_8X8 &&
       (!(mbmi->ref_frame == INTRA_FRAME && mbmi->mode == I4X4_PRED) &&
-       !(mbmi->ref_frame != INTRA_FRAME && mbmi->mode == SPLITMV
-#if !CONFIG_SB8X8
-         && mbmi->partitioning == PARTITIONING_4X4
-#endif
-         ))) {
+       !(mbmi->ref_frame != INTRA_FRAME && mbmi->mode == SPLITMV))) {
     mbmi->txfm_size = TX_8X8;
   } else {
     mbmi->txfm_size = TX_4X4;
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index 2f713d3..ea5905b 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -9,7 +9,6 @@
  */
 
 #include <assert.h>
-#include <stdio.h>
 
 #include "vp9/decoder/vp9_onyxd_int.h"
 #include "vp9/common/vp9_common.h"
@@ -24,7 +23,6 @@
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_quant_common.h"
 #include "vpx_scale/vpx_scale.h"
-#include "vp9/common/vp9_setupintrarecon.h"
 
 #include "vp9/decoder/vp9_decodemv.h"
 #include "vp9/common/vp9_extend.h"
@@ -187,58 +185,6 @@
     xd->plane[i].dequant = pc->uv_dequant[xd->q_index];
 }
 
-#if !CONFIG_SB8X8
-static void decode_8x8(MACROBLOCKD *xd) {
-  const MB_PREDICTION_MODE mode = xd->mode_info_context->mbmi.mode;
-  // luma
-  // if the first one is DCT_DCT assume all the rest are as well
-  TX_TYPE tx_type = get_tx_type_8x8(xd, 0);
-  int i;
-  assert(mode == I8X8_PRED);
-  for (i = 0; i < 4; i++) {
-    int ib = vp9_i8x8_block[i];
-    int idx = (ib & 0x02) ? (ib + 2) : ib;
-    int16_t *q  = BLOCK_OFFSET(xd->plane[0].qcoeff, idx, 16);
-    uint8_t* const dst =
-          raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, ib,
-                                    xd->plane[0].dst.buf,
-                                    xd->plane[0].dst.stride);
-    int stride = xd->plane[0].dst.stride;
-    if (mode == I8X8_PRED) {
-      int i8x8mode = xd->mode_info_context->bmi[ib].as_mode.first;
-      vp9_intra8x8_predict(xd, ib, i8x8mode, dst, stride);
-    }
-    tx_type = get_tx_type_8x8(xd, ib);
-    vp9_iht_add_8x8_c(tx_type, q, dst, stride, xd->plane[0].eobs[idx]);
-  }
-
-  // chroma
-  for (i = 0; i < 4; i++) {
-    int ib = vp9_i8x8_block[i];
-    int i8x8mode = xd->mode_info_context->bmi[ib].as_mode.first;
-    uint8_t* dst;
-
-    dst = raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 1, i,
-                                    xd->plane[1].dst.buf,
-                                    xd->plane[1].dst.stride);
-    vp9_intra_uv4x4_predict(xd, 16 + i, i8x8mode,
-                            dst, xd->plane[1].dst.stride);
-    xd->itxm_add(BLOCK_OFFSET(xd->plane[1].qcoeff, i, 16),
-                 dst, xd->plane[1].dst.stride,
-                 xd->plane[1].eobs[i]);
-
-    dst = raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 2, i,
-                                    xd->plane[2].dst.buf,
-                                    xd->plane[1].dst.stride);
-    vp9_intra_uv4x4_predict(xd, 20 + i, i8x8mode,
-                            dst, xd->plane[1].dst.stride);
-    xd->itxm_add(BLOCK_OFFSET(xd->plane[2].qcoeff, i, 16),
-                 dst, xd->plane[1].dst.stride,
-                 xd->plane[2].eobs[i]);
-  }
-}
-#endif
-
 static INLINE void dequant_add_y(MACROBLOCKD *xd, TX_TYPE tx_type, int idx,
                                  BLOCK_SIZE_TYPE bsize) {
   struct macroblockd_plane *const y = &xd->plane[0];
@@ -254,47 +200,6 @@
   }
 }
 
-#if !CONFIG_SB8X8
-static void decode_4x4(VP9D_COMP *pbi, MACROBLOCKD *xd, vp9_reader *r) {
-  TX_TYPE tx_type;
-  int i = 0;
-  const MB_PREDICTION_MODE mode = xd->mode_info_context->mbmi.mode;
-  assert(mode == I8X8_PRED);
-  for (i = 0; i < 4; i++) {
-    int ib = vp9_i8x8_block[i];
-    const int iblock[4] = {0, 1, 4, 5};
-    int j;
-    uint8_t* dst;
-    int i8x8mode = xd->mode_info_context->bmi[ib].as_mode.first;
-
-    dst = raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, ib,
-                                    xd->plane[0].dst.buf,
-                                    xd->plane[0].dst.stride);
-    vp9_intra8x8_predict(xd, ib, i8x8mode, dst, xd->plane[0].dst.stride);
-    for (j = 0; j < 4; j++) {
-      tx_type = get_tx_type_4x4(xd, ib + iblock[j]);
-      dequant_add_y(xd, tx_type, ib + iblock[j], BLOCK_SIZE_MB16X16);
-    }
-    dst = raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 1, i,
-                                    xd->plane[1].dst.buf,
-                                    xd->plane[1].dst.stride);
-    vp9_intra_uv4x4_predict(xd, 16 + i, i8x8mode,
-                            dst, xd->plane[1].dst.stride);
-    xd->itxm_add(BLOCK_OFFSET(xd->plane[1].qcoeff, i, 16),
-                 dst, xd->plane[1].dst.stride,
-                 xd->plane[1].eobs[i]);
-    dst = raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 2, i,
-                                    xd->plane[2].dst.buf,
-                                    xd->plane[2].dst.stride);
-    vp9_intra_uv4x4_predict(xd, 20 + i, i8x8mode,
-                            dst, xd->plane[1].dst.stride);
-    xd->itxm_add(BLOCK_OFFSET(xd->plane[2].qcoeff, i, 16),
-                 dst, xd->plane[1].dst.stride,
-                 xd->plane[2].eobs[i]);
-  }
-}
-#endif
-
 static void decode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
                          int ss_txfrm_size, void *arg) {
   MACROBLOCKD* const xd = arg;
@@ -344,25 +249,17 @@
 
   for (i = 0; i < bc; i++) {
     int b_mode = xd->mode_info_context->bmi[i].as_mode.first;
-    uint8_t* dst;
-    dst = raster_block_offset_uint8(xd, bsize, 0, i,
-                                    xd->plane[0].dst.buf,
-                                    xd->plane[0].dst.stride);
-#if CONFIG_NEWBINTRAMODES
-    xd->mode_info_context->bmi[i].as_mode.context =
-        vp9_find_bpred_context(xd, i, dst, xd->plane[0].dst.stride);
-    if (!xd->mode_info_context->mbmi.mb_skip_coeff)
-      vp9_decode_coefs_4x4(pbi, xd, r, PLANE_TYPE_Y_WITH_DC, i);
-#endif
+
+    uint8_t* dst = raster_block_offset_uint8(xd, bsize, 0, i,
+                                             xd->plane[0].dst.buf,
+                                             xd->plane[0].dst.stride);
+
     vp9_intra4x4_predict(xd, i, bsize, b_mode, dst, xd->plane[0].dst.stride);
     // TODO(jingning): refactor to use foreach_transformed_block_in_plane_
     tx_type = get_tx_type_4x4(xd, i);
     dequant_add_y(xd, tx_type, i, bsize);
   }
-#if CONFIG_NEWBINTRAMODES
-  if (!xd->mode_info_context->mbmi.mb_skip_coeff)
-    vp9_decode_mb_tokens_4x4_uv(pbi, xd, r);
-#endif
+
   foreach_transformed_block_uv(xd, bsize, decode_block, xd);
 }
 
@@ -388,9 +285,6 @@
       mb_init_dequantizer(&pbi->common, xd);
 
     if (!vp9_reader_has_error(r)) {
-#if CONFIG_NEWBINTRAMODES
-    if (mbmi->mode != I4X4_PRED)
-#endif
       vp9_decode_tokens(pbi, xd, r, bsize);
     }
   }
@@ -446,35 +340,6 @@
   }
 }
 
-#if !CONFIG_SB8X8
-// TODO(jingning): This only performs I8X8_PRED decoding process, which will be
-// automatically covered by decode_sb, when SB8X8 is on.
-static void decode_mb(VP9D_COMP *pbi, MACROBLOCKD *xd,
-                     int mi_row, int mi_col,
-                     vp9_reader *r) {
-  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
-  const int tx_size = mbmi->txfm_size;
-
-  assert(mbmi->sb_type == BLOCK_SIZE_MB16X16);
-
-  if (mbmi->mb_skip_coeff) {
-    vp9_reset_sb_tokens_context(xd, BLOCK_SIZE_MB16X16);
-  } else {
-    // re-initialize macroblock dequantizer before detokenization
-    if (xd->segmentation_enabled)
-      mb_init_dequantizer(&pbi->common, xd);
-
-    if (!vp9_reader_has_error(r))
-      vp9_decode_tokens(pbi, xd, r, BLOCK_SIZE_MB16X16);
-  }
-
-  if (tx_size == TX_8X8)
-    decode_8x8(xd);
-  else
-    decode_4x4(pbi, xd, r);
-}
-#endif
-
 static int get_delta_q(vp9_reader *r, int *dq) {
   const int old_value = *dq;
 
@@ -493,14 +358,8 @@
   const int bw = 1 << mi_width_log2(bsize);
   VP9_COMMON *const cm = &pbi->common;
   MACROBLOCKD *const xd = &pbi->mb;
-  int i;
-
   const int mi_idx = mi_row * cm->mode_info_stride + mi_col;
-  const YV12_BUFFER_CONFIG *dst_fb = &cm->yv12_fb[cm->new_fb_idx];
-  const int recon_yoffset =
-      (MI_SIZE * mi_row) * dst_fb->y_stride + (MI_SIZE * mi_col);
-  const int recon_uvoffset =
-      (MI_UV_SIZE * mi_row) * dst_fb->uv_stride + (MI_UV_SIZE * mi_col);
+  int i;
 
   xd->mode_info_context = cm->mi + mi_idx;
   xd->mode_info_context->mbmi.sb_type = bsize;
@@ -508,20 +367,18 @@
 
   for (i = 0; i < MAX_MB_PLANE; i++) {
     xd->plane[i].above_context = cm->above_context[i] +
-        (mi_col * 4 >> (xd->plane[i].subsampling_x + CONFIG_SB8X8));
+        (mi_col * 2 >> xd->plane[i].subsampling_x);
     xd->plane[i].left_context = cm->left_context[i] +
-        (((mi_row * 4 >> CONFIG_SB8X8) & 15) >> xd->plane[i].subsampling_y);
+        (((mi_row * 2) & 15) >> xd->plane[i].subsampling_y);
   }
-  xd->above_seg_context = cm->above_seg_context + (mi_col >> CONFIG_SB8X8);
-  xd->left_seg_context  = cm->left_seg_context + ((mi_row >> CONFIG_SB8X8) & 3);
+  xd->above_seg_context = cm->above_seg_context + mi_col;
+  xd->left_seg_context  = cm->left_seg_context + (mi_row & MI_MASK);
 
   // Distance of Mb to the various image edges. These are specified to 8th pel
   // as they are always compared to values that are in 1/8th pel units
   set_mi_row_col(cm, xd, mi_row, bh, mi_col, bw);
 
-  xd->plane[0].dst.buf = dst_fb->y_buffer + recon_yoffset;
-  xd->plane[1].dst.buf = dst_fb->u_buffer + recon_uvoffset;
-  xd->plane[2].dst.buf = dst_fb->v_buffer + recon_uvoffset;
+  setup_dst_planes(xd, &cm->yv12_fb[cm->new_fb_idx], mi_row, mi_col);
 }
 
 static void set_refs(VP9D_COMP *pbi, int mi_row, int mi_col) {
@@ -560,33 +417,12 @@
   vp9_decode_mb_mode_mv(pbi, xd, mi_row, mi_col, r);
   set_refs(pbi, mi_row, mi_col);
 
-#if CONFIG_SB8X8
   if (bsize == BLOCK_SIZE_SB8X8 &&
       (xd->mode_info_context->mbmi.mode == SPLITMV ||
        xd->mode_info_context->mbmi.mode == I4X4_PRED))
     decode_atom(pbi, xd, mi_row, mi_col, r, bsize);
   else
     decode_sb(pbi, xd, mi_row, mi_col, r, bsize);
-#else
-  // TODO(jingning): merge decode_sb_ and decode_mb_
-  if (bsize > BLOCK_SIZE_MB16X16) {
-    decode_sb(pbi, xd, mi_row, mi_col, r, bsize);
-  } else {
-    // TODO(jingning): In transition of separating functionalities of decode_mb
-    // into decode_sb and decode_atom. Will remove decode_mb and clean this up
-    // when SB8X8 is on.
-    if (xd->mode_info_context->mbmi.mode == I4X4_PRED ||
-        (xd->mode_info_context->mbmi.mode == SPLITMV &&
-         xd->mode_info_context->mbmi.partitioning == PARTITIONING_4X4))
-      decode_atom(pbi, xd, mi_row, mi_col, r, bsize);
-    else if (xd->mode_info_context->mbmi.mode != I8X8_PRED)
-      decode_sb(pbi, xd, mi_row, mi_col, r, bsize);
-    else
-      // TODO(jingning): decode_mb still carries deocding process of I8X8_PRED.
-      // This will be covered by decode_sb when SB8X8 is on.
-      decode_mb(pbi, xd, mi_row, mi_col, r);
-  }
-#endif
 
   xd->corrupted |= vp9_reader_has_error(r);
 }
@@ -603,16 +439,11 @@
   if (mi_row >= pc->mi_rows || mi_col >= pc->mi_cols)
     return;
 
-#if CONFIG_SB8X8
   if (bsize > BLOCK_SIZE_SB8X8) {
-#else
-  if (bsize > BLOCK_SIZE_MB16X16) {
-#endif
     int pl;
     // read the partition information
-    xd->left_seg_context =
-        pc->left_seg_context + ((mi_row >> CONFIG_SB8X8) & 3);
-    xd->above_seg_context = pc->above_seg_context + (mi_col >> CONFIG_SB8X8);
+    xd->left_seg_context = pc->left_seg_context + (mi_row & MI_MASK);
+    xd->above_seg_context = pc->above_seg_context + mi_col;
     pl = partition_plane_context(xd, bsize);
     partition = treed_read(r, vp9_partition_tree,
                            pc->fc.partition_prob[pl]);
@@ -626,28 +457,18 @@
       break;
     case PARTITION_HORZ:
       decode_modes_b(pbi, mi_row, mi_col, r, subsize);
-      if ((mi_row + bs) < pc->mi_rows)
+      if (mi_row + bs < pc->mi_rows)
         decode_modes_b(pbi, mi_row + bs, mi_col, r, subsize);
       break;
     case PARTITION_VERT:
       decode_modes_b(pbi, mi_row, mi_col, r, subsize);
-      if ((mi_col + bs) < pc->mi_cols)
+      if (mi_col + bs < pc->mi_cols)
         decode_modes_b(pbi, mi_row, mi_col + bs, r, subsize);
       break;
     case PARTITION_SPLIT:
       for (n = 0; n < 4; n++) {
         int j = n >> 1, i = n & 0x01;
-        if (subsize == BLOCK_SIZE_SB32X32)
-          xd->sb_index = n;
-#if CONFIG_SB8X8
-        else if (subsize == BLOCK_SIZE_MB16X16)
-          xd->mb_index = n;
-        else
-          xd->b_index = n;
-#else
-        else
-          xd->mb_index = n;
-#endif
+        *(get_sb_index(xd, subsize)) = n;
         decode_modes_sb(pbi, mi_row + j * bs, mi_col + i * bs, r, subsize);
       }
       break;
@@ -655,16 +476,11 @@
       assert(0);
   }
   // update partition context
-#if CONFIG_SB8X8
-  if ((partition == PARTITION_SPLIT) && (bsize > BLOCK_SIZE_MB16X16))
-#else
-  if ((partition == PARTITION_SPLIT) && (bsize > BLOCK_SIZE_SB32X32))
-#endif
-    return;
-
-  xd->left_seg_context = pc->left_seg_context + ((mi_row >> CONFIG_SB8X8) & 3);
-  xd->above_seg_context = pc->above_seg_context + (mi_col >> CONFIG_SB8X8);
-  update_partition_context(xd, subsize, bsize);
+  if (bsize > BLOCK_SIZE_SB8X8 &&
+      (bsize == BLOCK_SIZE_MB16X16 || partition != PARTITION_SPLIT)) {
+    set_partition_seg_context(pc, xd, mi_row, mi_col);
+    update_partition_context(xd, subsize, bsize);
+  }
 }
 
 static void setup_token_decoder(VP9D_COMP *pbi,
@@ -706,54 +522,6 @@
   xd->mode_info_stride = pc->mode_info_stride;
 }
 
-#if CONFIG_CODE_ZEROGROUP
-static void read_zpc_probs_common(VP9_COMMON *cm,
-                                  vp9_reader* bc,
-                                  TX_SIZE tx_size) {
-  int r, b, p, n;
-  vp9_zpc_probs *zpc_probs;
-  vp9_prob upd = ZPC_UPDATE_PROB;
-  if (!get_zpc_used(tx_size)) return;
-  if (!vp9_read_bit(bc)) return;
-
-  if (tx_size == TX_32X32) {
-    zpc_probs = &cm->fc.zpc_probs_32x32;
-  } else if (tx_size == TX_16X16) {
-    zpc_probs = &cm->fc.zpc_probs_16x16;
-  } else if (tx_size == TX_8X8) {
-    zpc_probs = &cm->fc.zpc_probs_8x8;
-  } else {
-    zpc_probs = &cm->fc.zpc_probs_4x4;
-  }
-  for (r = 0; r < REF_TYPES; ++r) {
-    for (b = 0; b < ZPC_BANDS; ++b) {
-      for (p = 0; p < ZPC_PTOKS; ++p) {
-        for (n = 0; n < ZPC_NODES; ++n) {
-          vp9_prob *q = &(*zpc_probs)[r][b][p][n];
-#if USE_ZPC_EXTRA == 0
-          if (n == 1) continue;
-#endif
-          if (vp9_read(bc, upd)) {
-            *q = read_prob_diff_update(bc, *q);
-          }
-        }
-      }
-    }
-  }
-}
-
-static void read_zpc_probs(VP9_COMMON *cm,
-                           vp9_reader* bc) {
-  read_zpc_probs_common(cm, bc, TX_4X4);
-  if (cm->txfm_mode > ONLY_4X4)
-    read_zpc_probs_common(cm, bc, TX_8X8);
-  if (cm->txfm_mode > ALLOW_8X8)
-    read_zpc_probs_common(cm, bc, TX_16X16);
-  if (cm->txfm_mode > ALLOW_16X16)
-    read_zpc_probs_common(cm, bc, TX_32X32);
-}
-#endif  // CONFIG_CODE_ZEROGROUP
-
 static void read_coef_probs_common(vp9_coeff_probs *coef_probs,
                                    TX_SIZE tx_size,
                                    vp9_reader *r) {
@@ -882,7 +650,6 @@
 }
 
 static void setup_loopfilter(VP9_COMMON *pc, MACROBLOCKD *xd, vp9_reader *r) {
-  pc->filter_type = (LOOPFILTER_TYPE) vp9_read_bit(r);
   pc->filter_level = vp9_read_literal(r, 6);
   pc->sharpness_level = vp9_read_literal(r, 3);
 
@@ -1014,13 +781,7 @@
   vp9_copy(fc->pre_sb_ymode_prob, fc->sb_ymode_prob);
   vp9_copy(fc->pre_uv_mode_prob, fc->uv_mode_prob);
   vp9_copy(fc->pre_bmode_prob, fc->bmode_prob);
-#if !CONFIG_SB8X8
-  vp9_copy(fc->pre_i8x8_mode_prob, fc->i8x8_mode_prob);
-#endif
   vp9_copy(fc->pre_sub_mv_ref_prob, fc->sub_mv_ref_prob);
-#if !CONFIG_SB8X8
-  vp9_copy(fc->pre_mbsplit_prob, fc->mbsplit_prob);
-#endif
   vp9_copy(fc->pre_partition_prob, fc->partition_prob);
   fc->pre_nmvc = fc->nmvc;
 
@@ -1033,33 +794,10 @@
   vp9_zero(fc->sb_ymode_counts);
   vp9_zero(fc->uv_mode_counts);
   vp9_zero(fc->bmode_counts);
-#if !CONFIG_SB8X8
-  vp9_zero(fc->i8x8_mode_counts);
-#endif
   vp9_zero(fc->sub_mv_ref_counts);
-#if !CONFIG_SB8X8
-  vp9_zero(fc->mbsplit_counts);
-#endif
   vp9_zero(fc->NMVcount);
   vp9_zero(fc->mv_ref_ct);
   vp9_zero(fc->partition_counts);
-
-#if CONFIG_COMP_INTERINTRA_PRED
-  fc->pre_interintra_prob = fc->interintra_prob;
-  vp9_zero(fc->interintra_counts);
-#endif
-
-#if CONFIG_CODE_ZEROGROUP
-  vp9_copy(fc->pre_zpc_probs_4x4, fc->zpc_probs_4x4);
-  vp9_copy(fc->pre_zpc_probs_8x8, fc->zpc_probs_8x8);
-  vp9_copy(fc->pre_zpc_probs_16x16, fc->zpc_probs_16x16);
-  vp9_copy(fc->pre_zpc_probs_32x32, fc->zpc_probs_32x32);
-
-  vp9_zero(fc->zpc_counts_4x4);
-  vp9_zero(fc->zpc_counts_8x8);
-  vp9_zero(fc->zpc_counts_16x16);
-  vp9_zero(fc->zpc_counts_32x32);
-#endif
 }
 
 static void decode_tile(VP9D_COMP *pbi, vp9_reader *r) {
@@ -1067,14 +805,13 @@
   int mi_row, mi_col;
 
   for (mi_row = pc->cur_tile_mi_row_start;
-       mi_row < pc->cur_tile_mi_row_end; mi_row += (4 << CONFIG_SB8X8)) {
+       mi_row < pc->cur_tile_mi_row_end; mi_row += 64 / MI_SIZE) {
     // For a SB there are 2 left contexts, each pertaining to a MB row within
     vpx_memset(&pc->left_context, 0, sizeof(pc->left_context));
     vpx_memset(pc->left_seg_context, 0, sizeof(pc->left_seg_context));
     for (mi_col = pc->cur_tile_mi_col_start;
-         mi_col < pc->cur_tile_mi_col_end; mi_col += (4 << CONFIG_SB8X8)) {
+         mi_col < pc->cur_tile_mi_col_end; mi_col += 64 / MI_SIZE)
       decode_modes_sb(pbi, mi_row, mi_col, r, BLOCK_SIZE_SB64X64);
-    }
   }
 }
 
@@ -1102,11 +839,11 @@
 
   // Note: this memset assumes above_context[0], [1] and [2]
   // are allocated as part of the same buffer.
-  vpx_memset(pc->above_context[0], 0, sizeof(ENTROPY_CONTEXT) * 4 *
-                                      MAX_MB_PLANE * mb_cols_aligned_to_sb(pc));
+  vpx_memset(pc->above_context[0], 0, sizeof(ENTROPY_CONTEXT) * 2 *
+                                      MAX_MB_PLANE * mi_cols_aligned_to_sb(pc));
 
   vpx_memset(pc->above_seg_context, 0, sizeof(PARTITION_CONTEXT) *
-                                       mb_cols_aligned_to_sb(pc));
+                                       mi_cols_aligned_to_sb(pc));
 
   if (pbi->oxcf.inv_tile_order) {
     const int n_cols = pc->tile_columns;
@@ -1186,6 +923,8 @@
     pc->version = (data[0] >> 1) & 7;
     pc->show_frame = (data[0] >> 4) & 1;
     scaling_active = (data[0] >> 5) & 1;
+    pc->subsampling_x = (data[0] >> 6) & 1;
+    pc->subsampling_y = (data[0] >> 7) & 1;
     first_partition_size = read_le16(data + 1);
 
     if (!read_is_valid(data, first_partition_size, data_end))
@@ -1218,8 +957,9 @@
   init_frame(pbi);
 
   // Reset the frame pointers to the current frame size
-  vp8_yv12_realloc_frame_buffer(new_fb, pc->width, pc->height,
-                                VP9BORDERINPIXELS);
+  vp9_realloc_frame_buffer(new_fb, pc->width, pc->height,
+                           pc->subsampling_x, pc->subsampling_y,
+                           VP9BORDERINPIXELS);
 
   if (vp9_reader_init(&header_bc, data, first_partition_size))
     vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
@@ -1246,8 +986,6 @@
 
   setup_loopfilter(pc, xd, &header_bc);
 
-  vp9_read_literal(&header_bc, 2);  // unused
-
   setup_quantization(pbi, &header_bc);
 
   // Determine if the golden frame or ARF buffer should be updated and how.
@@ -1271,7 +1009,9 @@
       if (mapped_ref >= NUM_YV12_BUFFERS)
         memset(sf, 0, sizeof(*sf));
       else
-        vp9_setup_scale_factors_for_frame(sf, fb, pc->width, pc->height);
+        vp9_setup_scale_factors_for_frame(sf,
+                                          fb->y_crop_width, fb->y_crop_height,
+                                          pc->width, pc->height);
     }
 
     // Read the sign bias for each reference frame buffer.
@@ -1282,10 +1022,6 @@
     xd->allow_high_precision_mv = vp9_read_bit(&header_bc);
     pc->mcomp_filter_type = read_mcomp_filter_type(&header_bc);
 
-#if CONFIG_COMP_INTERINTRA_PRED
-    pc->use_interintra = vp9_read_bit(&header_bc);
-#endif
-
     // To enable choice of different interpolation filters
     vp9_setup_interp_filters(xd, pc->mcomp_filter_type, pc);
   }
@@ -1323,9 +1059,6 @@
   update_frame_context(&pc->fc);
 
   read_coef_probs(pbi, &header_bc);
-#if CONFIG_CODE_ZEROGROUP
-  read_zpc_probs(pc, &header_bc);
-#endif
 
   // Initialize xd pointers. Any reference should do for xd->pre, so use 0.
   setup_pre_planes(xd, &pc->yv12_fb[pc->active_ref_idx[0]], NULL,
@@ -1337,17 +1070,11 @@
     CHECK_MEM_ERROR(pc->last_frame_seg_map,
                     vpx_calloc((pc->mi_rows * pc->mi_cols), 1));
 
-  // set up frame new frame for intra coded blocks
-  vp9_setup_intra_recon(new_fb);
-
-  vp9_setup_block_dptrs(xd);
+  vp9_setup_block_dptrs(xd, pc->subsampling_x, pc->subsampling_y);
 
   // clear out the coeff buffer
-  vpx_memset(xd->plane[0].qcoeff, 0, sizeof(xd->plane[0].qcoeff));
-  vpx_memset(xd->plane[1].qcoeff, 0, sizeof(xd->plane[1].qcoeff));
-  vpx_memset(xd->plane[2].qcoeff, 0, sizeof(xd->plane[2].qcoeff));
-
-  vp9_read_bit(&header_bc);  // unused
+  for (i = 0; i < MAX_MB_PLANE; ++i)
+    vp9_zero(xd->plane[i].qcoeff);
 
   vp9_decode_mode_mvs_init(pbi, &header_bc);
 
@@ -1369,13 +1096,11 @@
   // Adaptation
   if (!pc->error_resilient_mode && !pc->frame_parallel_decoding_mode) {
     vp9_adapt_coef_probs(pc);
-#if CONFIG_CODE_ZEROGROUP
-    vp9_adapt_zpc_probs(pc);
-#endif
+
     if (pc->frame_type != KEY_FRAME) {
       vp9_adapt_mode_probs(pc);
-      vp9_adapt_nmv_probs(pc, xd->allow_high_precision_mv);
       vp9_adapt_mode_context(pc);
+      vp9_adapt_nmv_probs(pc, xd->allow_high_precision_mv);
     }
   }
 
diff --git a/vp9/decoder/vp9_decodframe.h b/vp9/decoder/vp9_decodframe.h
index 3aaae65..00b6d67 100644
--- a/vp9/decoder/vp9_decodframe.h
+++ b/vp9/decoder/vp9_decodframe.h
@@ -13,7 +13,9 @@
 #define VP9_DECODER_VP9_DECODFRAME_H_
 
 struct VP9Common;
+struct VP9Decompressor;
 
 void vp9_init_dequantizer(struct VP9Common *pc);
+int vp9_decode_frame(struct VP9Decompressor *cpi, const uint8_t **p_data_end);
 
 #endif  // VP9_DECODER_VP9_DECODFRAME_H_
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index 0ef25ba..22d3cf8 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -60,28 +60,12 @@
 
 DECLARE_ALIGNED(16, extern const uint8_t, vp9_norm[256]);
 
-#if CONFIG_CODE_ZEROGROUP
-#define ZEROGROUP_ADVANCE()                \
-  do {                                     \
-    token_cache[scan[c]] = ZERO_TOKEN;     \
-    is_last_zero[o] = 1;                   \
-    c++;                                   \
-  } while (0)
-#define INCREMENT_COUNT(token)             \
-  do {                                     \
-    coef_counts[type][ref][get_coef_band(scan, txfm_size, c)] \
-               [pt][token]++;     \
-    token_cache[scan[c]] = token; \
-    is_last_zero[o] = (token == ZERO_TOKEN);    \
-  } while (0)
-#else
 #define INCREMENT_COUNT(token)               \
   do {                                       \
-    coef_counts[type][ref][get_coef_band(scan, txfm_size, c)] \
+    coef_counts[type][ref][band] \
                [pt][token]++;     \
     token_cache[scan[c]] = token; \
   } while (0)
-#endif
 
 #define WRITE_COEF_CONTINUE(val, token)                  \
   {                                                      \
@@ -92,12 +76,6 @@
     continue;                                            \
   }
 
-#define WRITE_COEF_ONE()                                 \
-{                                                        \
-  qcoeff_ptr[scan[c]] = vp9_read_and_apply_sign(br, 1);  \
-  INCREMENT_COUNT(ONE_TOKEN);                            \
-}
-
 #define ADJUST_COEF(prob, bits_count)  \
   do {                                 \
     if (vp9_read(r, prob))             \
@@ -112,25 +90,15 @@
   ENTROPY_CONTEXT above_ec, left_ec;
   FRAME_CONTEXT *const fc = &dx->common.fc;
   int pt, c = 0, pad, default_eob;
+  int band;
   vp9_coeff_probs *coef_probs;
   vp9_prob *prob;
   vp9_coeff_count *coef_counts;
   const int ref = xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME;
   TX_TYPE tx_type = DCT_DCT;
-#if CONFIG_CODE_ZEROGROUP
-  int is_eoo[3] = {0, 0, 0};
-  int is_last_zero[3] = {0, 0, 0};
-  int o, rc;
-  vp9_zpc_probs *zpc_probs;
-  vp9_zpc_count *zpc_count;
-  vp9_prob *zprobs;
-  int eoo = 0, use_eoo;
-#endif
   const int *scan, *nb;
   uint8_t token_cache[1024];
-#if CONFIG_CODE_ZEROGROUP
-  vpx_memset(token_cache, UNKNOWN_TOKEN, sizeof(token_cache));
-#endif
+  const uint8_t *band_translate;
 
   switch (txfm_size) {
     default:
@@ -143,10 +111,7 @@
       coef_probs  = fc->coef_probs_4x4;
       coef_counts = fc->coef_counts_4x4;
       default_eob = 16;
-#if CONFIG_CODE_ZEROGROUP
-      zpc_probs = &(fc->zpc_probs_4x4);
-      zpc_count = &(fc->zpc_counts_4x4);
-#endif
+      band_translate = vp9_coefband_trans_4x4;
       break;
     }
     case TX_8X8: {
@@ -162,10 +127,7 @@
       above_ec = (A[0] + A[1]) != 0;
       left_ec = (L[0] + L[1]) != 0;
       default_eob = 64;
-#if CONFIG_CODE_ZEROGROUP
-      zpc_probs = &(fc->zpc_probs_8x8);
-      zpc_count = &(fc->zpc_counts_8x8);
-#endif
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
     }
     case TX_16X16: {
@@ -181,10 +143,7 @@
       above_ec = (A[0] + A[1] + A[2] + A[3]) != 0;
       left_ec = (L[0] + L[1] + L[2] + L[3]) != 0;
       default_eob = 256;
-#if CONFIG_CODE_ZEROGROUP
-      zpc_probs = &(fc->zpc_probs_16x16);
-      zpc_count = &(fc->zpc_counts_16x16);
-#endif
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
     }
     case TX_32X32:
@@ -194,10 +153,7 @@
       above_ec = (A[0] + A[1] + A[2] + A[3] + A[4] + A[5] + A[6] + A[7]) != 0;
       left_ec = (L[0] + L[1] + L[2] + L[3] + L[4] + L[5] + L[6] + L[7]) != 0;
       default_eob = 1024;
-#if CONFIG_CODE_ZEROGROUP
-      zpc_probs = &fc->zpc_probs_32x32;
-      zpc_count = &fc->zpc_counts_32x32;
-#endif
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
   }
 
@@ -206,27 +162,17 @@
 
   while (1) {
     int val;
-    int band;
     const uint8_t *cat6 = cat6_prob;
     if (c >= seg_eob)
       break;
     if (c)
       pt = vp9_get_coef_context(scan, nb, pad, token_cache,
                                 c, default_eob);
-    band = get_coef_band(scan, txfm_size, c);
+    band = get_coef_band(band_translate, c);
     prob = coef_probs[type][ref][band][pt];
     fc->eob_branch_counts[txfm_size][type][ref][band][pt]++;
     if (!vp9_read(r, prob[EOB_CONTEXT_NODE]))
       break;
-#if CONFIG_CODE_ZEROGROUP
-    rc = scan[c];
-    o = vp9_get_orientation(rc, txfm_size);
-    if (token_cache[rc] == ZERO_TOKEN || is_eoo[o]) {
-      coef_counts[type][ref][band][pt][ZERO_TOKEN]++;
-      ZEROGROUP_ADVANCE();
-      goto SKIP_START;
-    }
-#endif
 
 SKIP_START:
     if (c >= seg_eob)
@@ -234,37 +180,10 @@
     if (c)
       pt = vp9_get_coef_context(scan, nb, pad, token_cache,
                                 c, default_eob);
-    band = get_coef_band(scan, txfm_size, c);
+    band = get_coef_band(band_translate, c);
     prob = coef_probs[type][ref][band][pt];
-#if CONFIG_CODE_ZEROGROUP
-    rc = scan[c];
-    o = vp9_get_orientation(rc, txfm_size);
-    if (token_cache[rc] == ZERO_TOKEN || is_eoo[o]) {
-      ZEROGROUP_ADVANCE();
-      goto SKIP_START;
-    }
-    zprobs = (*zpc_probs)[ref]
-             [coef_to_zpc_band(band)]
-             [coef_to_zpc_ptok(pt)];
-#endif
+
     if (!vp9_read(r, prob[ZERO_CONTEXT_NODE])) {
-#if CONFIG_CODE_ZEROGROUP
-      eoo = 0;
-#if USE_ZPC_EOORIENT == 1
-      use_eoo = vp9_use_eoo(c, seg_eob, scan, txfm_size, is_last_zero, is_eoo);
-#else
-      use_eoo = 0;
-#endif
-      if (use_eoo) {
-        eoo = !vp9_read(r, zprobs[0]);
-        ++(*zpc_count)[ref]
-                      [coef_to_zpc_band(band)]
-                      [coef_to_zpc_ptok(pt)][0][!eoo];
-        if (eoo) {
-          is_eoo[o] = 1;
-        }
-      }
-#endif
       INCREMENT_COUNT(ZERO_TOKEN);
       ++c;
       goto SKIP_START;
@@ -330,8 +249,7 @@
   }
 
   if (c < seg_eob)
-    coef_counts[type][ref][get_coef_band(scan, txfm_size, c)]
-        [pt][DCT_EOB_TOKEN]++;
+    coef_counts[type][ref][band][pt][DCT_EOB_TOKEN]++;
 
   for (pt = 0; pt < (1 << txfm_size); pt++) {
     A[pt] = L[pt] = c > 0;
@@ -388,51 +306,3 @@
   foreach_transformed_block(xd, bsize, decode_block, &args);
   return eobtotal;
 }
-
-#if CONFIG_NEWBINTRAMODES
-static int decode_coefs_4x4(VP9D_COMP *dx, MACROBLOCKD *xd,
-                            vp9_reader *r,
-                            PLANE_TYPE type, int i, int seg_eob) {
-  const struct plane_block_idx pb_idx = plane_block_idx(16, i);
-  const int mod = 2 - xd->plane[pb_idx.plane].subsampling_x;
-  const int aoff = pb_idx.block & ((1 << mod) - 1);
-  const int loff = pb_idx.block >> mod;
-  ENTROPY_CONTEXT *A = xd->plane[pb_idx.plane].above_context;
-  ENTROPY_CONTEXT *L = xd->plane[pb_idx.plane].left_context;
-  const int c = decode_coefs(dx, xd, r, i, type, seg_eob,
-      BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff, pb_idx.block, 16), TX_4X4,
-      xd->plane[pb_idx.plane].dequant, A + aoff, L + loff);
-  xd->plane[pb_idx.plane].eobs[pb_idx.block] = c;
-  return c;
-}
-
-static int decode_mb_tokens_4x4_uv(VP9D_COMP* const dx,
-                                   MACROBLOCKD* const xd,
-                                   vp9_reader *r,
-                                   int seg_eob) {
-  int i, eobtotal = 0;
-
-  // chroma blocks
-  for (i = 16; i < 24; i++)
-    eobtotal += decode_coefs_4x4(dx, xd, r, PLANE_TYPE_UV, i, seg_eob);
-
-  return eobtotal;
-}
-
-int vp9_decode_mb_tokens_4x4_uv(VP9D_COMP* const dx,
-                                MACROBLOCKD* const xd,
-                                vp9_reader *r) {
-  const int segment_id = xd->mode_info_context->mbmi.segment_id;
-  const int seg_eob = get_eob(xd, segment_id, 16);
-
-  return decode_mb_tokens_4x4_uv(dx, xd, r, seg_eob);
-}
-
-int vp9_decode_coefs_4x4(VP9D_COMP *dx, MACROBLOCKD *xd,
-                         vp9_reader *r,
-                         PLANE_TYPE type, int i) {
-  const int segment_id = xd->mode_info_context->mbmi.segment_id;
-  const int seg_eob = get_eob(xd, segment_id, 16);
-  return decode_coefs_4x4(dx, xd, r, type, i, seg_eob);
-}
-#endif
diff --git a/vp9/decoder/vp9_detokenize.h b/vp9/decoder/vp9_detokenize.h
index 0e53c0e..13235bd 100644
--- a/vp9/decoder/vp9_detokenize.h
+++ b/vp9/decoder/vp9_detokenize.h
@@ -19,12 +19,4 @@
                       vp9_reader *r,
                       BLOCK_SIZE_TYPE bsize);
 
-#if CONFIG_NEWBINTRAMODES
-int vp9_decode_mb_tokens_4x4_uv(VP9D_COMP* const dx, MACROBLOCKD* const xd,
-                                vp9_reader *r);
-int vp9_decode_coefs_4x4(VP9D_COMP *dx, MACROBLOCKD *xd,
-                         vp9_reader *r,
-                         PLANE_TYPE type, int i);
-#endif
-
 #endif  // VP9_DECODER_VP9_DETOKENIZE_H_
diff --git a/vp9/decoder/vp9_onyxd_if.c b/vp9/decoder/vp9_onyxd_if.c
index 9582e8f..c0ce311 100644
--- a/vp9/decoder/vp9_onyxd_if.c
+++ b/vp9/decoder/vp9_onyxd_if.c
@@ -21,8 +21,6 @@
 #include "vpx_mem/vpx_mem.h"
 #include "vp9/common/vp9_alloccommon.h"
 #include "vp9/common/vp9_loopfilter.h"
-#include "vp9/common/vp9_swapyv12buffer.h"
-
 #include "vp9/common/vp9_quant_common.h"
 #include "vpx_scale/vpx_scale.h"
 #include "vp9/common/vp9_systemdependent.h"
diff --git a/vp9/decoder/vp9_onyxd_int.h b/vp9/decoder/vp9_onyxd_int.h
index a7d444e..8698570 100644
--- a/vp9/decoder/vp9_onyxd_int.h
+++ b/vp9/decoder/vp9_onyxd_int.h
@@ -41,8 +41,6 @@
   int initial_height;
 } VP9D_COMP;
 
-int vp9_decode_frame(VP9D_COMP *cpi, const uint8_t **p_data_end);
-
 
 #if CONFIG_DEBUG
 #define CHECK_MEM_ERROR(lval,expr) do {\
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 7152ac9..ac29a8e 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -50,18 +50,6 @@
 extern unsigned int active_section;
 #endif
 
-#if CONFIG_CODE_ZEROGROUP
-#ifdef ZPC_STATS
-vp9_zpc_count zpc_stats_4x4;
-vp9_zpc_count zpc_stats_8x8;
-vp9_zpc_count zpc_stats_16x16;
-vp9_zpc_count zpc_stats_32x32;
-void init_zpcstats();
-void update_zpcstats(VP9_COMMON *const cm);
-void print_zpcstats();
-#endif
-#endif
-
 #ifdef MODE_STATS
 int count_mb_seg[4] = { 0, 0, 0, 0 };
 #endif
@@ -281,22 +269,12 @@
   write_token(bc, vp9_uv_mode_tree, p, vp9_sb_kf_ymode_encodings + m);
 }
 
-#if !CONFIG_SB8X8
-static void write_i8x8_mode(vp9_writer *bc, int m, const vp9_prob *p) {
-  write_token(bc, vp9_i8x8_mode_tree, p, vp9_i8x8_mode_encodings + m);
-}
-#endif
-
 static void write_uv_mode(vp9_writer *bc, int m, const vp9_prob *p) {
   write_token(bc, vp9_uv_mode_tree, p, vp9_uv_mode_encodings + m);
 }
 
 
 static void write_bmode(vp9_writer *bc, int m, const vp9_prob *p) {
-#if CONFIG_NEWBINTRAMODES
-  assert(m < B_CONTEXT_PRED - CONTEXT_PRED_REPLACEMENTS || m == B_CONTEXT_PRED);
-  if (m == B_CONTEXT_PRED) m -= CONTEXT_PRED_REPLACEMENTS;
-#endif
   write_token(bc, vp9_bmode_tree, p, vp9_bmode_encodings + m);
 }
 
@@ -304,12 +282,6 @@
   write_token(bc, vp9_kf_bmode_tree, p, vp9_kf_bmode_encodings + m);
 }
 
-#if !CONFIG_SB8X8
-static void write_split(vp9_writer *bc, int x, const vp9_prob *p) {
-  write_token(bc, vp9_mbsplit_tree, p, vp9_mbsplit_encodings + x);
-}
-#endif
-
 static int prob_update_savings(const unsigned int *ct,
                                const vp9_prob oldp, const vp9_prob newp,
                                const vp9_prob upd) {
@@ -416,20 +388,6 @@
       break;
     }
     assert(pp != 0);
-#if CONFIG_CODE_ZEROGROUP
-    if (t == ZPC_ISOLATED || t == ZPC_EOORIENT) {
-      assert((p - 1)->token == ZERO_TOKEN);
-      encode_bool(bc, t == ZPC_ISOLATED, *pp);
-      ++p;
-      continue;
-    } else if (p->skip_coef_val) {
-      assert(p->skip_eob_node == 0);
-      assert(t == DCT_EOB_TOKEN || t == ZERO_TOKEN);
-      encode_bool(bc, t == ZERO_TOKEN, *pp);
-      ++p;
-      continue;
-    }
-#endif
 
     /* skip one or two nodes */
     if (p->skip_eob_node) {
@@ -671,11 +629,7 @@
     active_section = 6;
 #endif
 
-#if CONFIG_SB8X8
     if (m->mbmi.sb_type > BLOCK_SIZE_SB8X8)
-#else
-    if (m->mbmi.sb_type > BLOCK_SIZE_MB16X16)
-#endif
       write_sb_ymode(bc, mode, pc->fc.sb_ymode_prob);
     else
       write_ymode(bc, mode, pc->fc.ymode_prob);
@@ -685,24 +639,10 @@
       do {
         write_bmode(bc, m->bmi[j].as_mode.first,
                     pc->fc.bmode_prob);
-      } while (++j < (16 >> (CONFIG_SB8X8 * 2)));
+      } while (++j < 4);
     }
-#if !CONFIG_SB8X8
-    if (mode == I8X8_PRED) {
-      write_i8x8_mode(bc, m->bmi[0].as_mode.first,
-                      pc->fc.i8x8_mode_prob);
-      write_i8x8_mode(bc, m->bmi[2].as_mode.first,
-                      pc->fc.i8x8_mode_prob);
-      write_i8x8_mode(bc, m->bmi[8].as_mode.first,
-                      pc->fc.i8x8_mode_prob);
-      write_i8x8_mode(bc, m->bmi[10].as_mode.first,
-                      pc->fc.i8x8_mode_prob);
-    } else
-#endif
-    {
-      write_uv_mode(bc, mi->uv_mode,
-                    pc->fc.uv_mode_prob[mode]);
-    }
+    write_uv_mode(bc, mi->uv_mode,
+                  pc->fc.uv_mode_prob[mode]);
   } else {
     vp9_prob mv_ref_p[VP9_MVREFS - 1];
 
@@ -714,11 +654,7 @@
 
     // If segment skip is not enabled code the mode.
     if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {
-#if CONFIG_SB8X8
       if (mi->sb_type > BLOCK_SIZE_SB8X8) {
-#else
-      if (mi->sb_type > BLOCK_SIZE_MB16X16) {
-#endif
         write_sb_mv_ref(bc, mode, mv_ref_p);
       } else {
         write_mv_ref(bc, mode, mv_ref_p);
@@ -744,27 +680,6 @@
       vp9_write(bc, mi->second_ref_frame > INTRA_FRAME,
                 vp9_get_pred_prob(pc, xd, PRED_COMP));
     }
-#if CONFIG_COMP_INTERINTRA_PRED
-    if (cpi->common.use_interintra &&
-        mode >= NEARESTMV && mode < SPLITMV &&
-        mi->second_ref_frame <= INTRA_FRAME) {
-      vp9_write(bc, mi->second_ref_frame == INTRA_FRAME,
-                pc->fc.interintra_prob);
-      // if (!cpi->dummy_packing)
-      //   printf("-- %d (%d)\n", mi->second_ref_frame == INTRA_FRAME,
-      //          pc->fc.interintra_prob);
-      if (mi->second_ref_frame == INTRA_FRAME) {
-        // if (!cpi->dummy_packing)
-        //   printf("** %d %d\n", mi->interintra_mode,
-        // mi->interintra_uv_mode);
-        write_ymode(bc, mi->interintra_mode, pc->fc.ymode_prob);
-#if SEPARATE_INTERINTRA_UV
-        write_uv_mode(bc, mi->interintra_uv_mode,
-                      pc->fc.uv_mode_prob[mi->interintra_mode]);
-#endif
-      }
-    }
-#endif
 
     switch (mode) { /* new, split require MVs */
       case NEWMV:
@@ -788,34 +703,16 @@
         ++count_mb_seg[mi->partitioning];
 #endif
 
-#if !CONFIG_SB8X8
-        write_split(bc, mi->partitioning, cpi->common.fc.mbsplit_prob);
-        cpi->mbsplit_count[mi->partitioning]++;
-#endif
-
         do {
           B_PREDICTION_MODE blockmode;
           int_mv blockmv;
-#if !CONFIG_SB8X8
-          const int *const  L = vp9_mbsplits[mi->partitioning];
-#endif
           int k = -1;  /* first block in subset j */
           int mv_contz;
           int_mv leftmv, abovemv;
 
           blockmode = cpi->mb.partition_info->bmi[j].mode;
           blockmv = cpi->mb.partition_info->bmi[j].mv;
-#if CONFIG_SB8X8
           k = j;
-#else
-#if CONFIG_DEBUG
-          while (j != L[++k])
-            if (k >= 16)
-              assert(0);
-#else
-          while (j != L[++k]);
-#endif
-#endif
           leftmv.as_int = left_block_mv(xd, m, k);
           abovemv.as_int = above_block_mv(m, k, mis);
           mv_contz = vp9_mv_cont(&leftmv, &abovemv);
@@ -847,7 +744,6 @@
     }
   }
 
-#if CONFIG_SB8X8
   if (((rf == INTRA_FRAME && mode != I4X4_PRED) ||
        (rf != INTRA_FRAME && mode != SPLITMV)) &&
       pc->txfm_mode == TX_MODE_SELECT &&
@@ -862,23 +758,6 @@
         vp9_write(bc, sz != TX_16X16, pc->prob_tx[2]);
     }
   }
-#else
-  if (((rf == INTRA_FRAME && mode <= I8X8_PRED) ||
-       (rf != INTRA_FRAME && !(mode == SPLITMV &&
-                               mi->partitioning == PARTITIONING_4X4))) &&
-      pc->txfm_mode == TX_MODE_SELECT &&
-          !(skip_coeff || vp9_segfeature_active(xd, segment_id,
-                                                SEG_LVL_SKIP))) {
-    TX_SIZE sz = mi->txfm_size;
-    // FIXME(rbultje) code ternary symbol once all experiments are merged
-    vp9_write(bc, sz != TX_4X4, pc->prob_tx[0]);
-    if (sz != TX_4X4 && mode != I8X8_PRED && mode != SPLITMV) {
-      vp9_write(bc, sz != TX_8X8, pc->prob_tx[1]);
-      if (mi->sb_type >= BLOCK_SIZE_SB32X32 && sz != TX_8X8)
-        vp9_write(bc, sz != TX_16X16, pc->prob_tx[2]);
-    }
-  }
-#endif
 }
 
 static void write_mb_modes_kf(const VP9_COMP *cpi,
@@ -901,11 +780,7 @@
     vp9_write(bc, skip_coeff, vp9_get_pred_prob(c, xd, PRED_MBSKIP));
   }
 
-#if CONFIG_SB8X8
   if (m->mbmi.sb_type > BLOCK_SIZE_SB8X8)
-#else
-  if (m->mbmi.sb_type > BLOCK_SIZE_MB16X16)
-#endif
     sb_kfwrite_ymode(bc, ym, c->sb_kf_ymode_prob[c->kf_ymode_probs_index]);
   else
     kfwrite_ymode(bc, ym, c->kf_ymode_prob[c->kf_ymode_probs_index]);
@@ -914,7 +789,8 @@
     int i = 0;
     do {
       const B_PREDICTION_MODE a = above_block_mode(m, i, mis);
-      const B_PREDICTION_MODE l = (xd->left_available || (i & 3)) ?
+      const B_PREDICTION_MODE l = (xd->left_available ||
+                                  (i & 1)) ?
                                   left_block_mode(m, i) : B_DC_PRED;
       const int bm = m->bmi[i].as_mode.first;
 
@@ -922,23 +798,11 @@
       ++intra_mode_stats [A] [L] [bm];
 #endif
       write_kf_bmode(bc, bm, c->kf_bmode_prob[a][l]);
-    } while (++i < (16 >> (CONFIG_SB8X8 * 2)));
+    } while (++i < 4);
   }
-#if !CONFIG_SB8X8
-  if (ym == I8X8_PRED) {
-    write_i8x8_mode(bc, m->bmi[0].as_mode.first, c->fc.i8x8_mode_prob);
-    // printf("    mode: %d\n", m->bmi[0].as_mode.first); fflush(stdout);
-    write_i8x8_mode(bc, m->bmi[2].as_mode.first, c->fc.i8x8_mode_prob);
-    // printf("    mode: %d\n", m->bmi[2].as_mode.first); fflush(stdout);
-    write_i8x8_mode(bc, m->bmi[8].as_mode.first, c->fc.i8x8_mode_prob);
-    // printf("    mode: %d\n", m->bmi[8].as_mode.first); fflush(stdout);
-    write_i8x8_mode(bc, m->bmi[10].as_mode.first, c->fc.i8x8_mode_prob);
-    // printf("    mode: %d\n", m->bmi[10].as_mode.first); fflush(stdout);
-  } else
-#endif
-    write_uv_mode(bc, m->mbmi.uv_mode, c->kf_uv_mode_prob[ym]);
 
-#if CONFIG_SB8X8
+  write_uv_mode(bc, m->mbmi.uv_mode, c->kf_uv_mode_prob[ym]);
+
   if (ym != I4X4_PRED && c->txfm_mode == TX_MODE_SELECT &&
       !(skip_coeff || vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP))) {
     TX_SIZE sz = m->mbmi.txfm_size;
@@ -950,166 +814,8 @@
         vp9_write(bc, sz != TX_16X16, c->prob_tx[2]);
     }
   }
-#else
-  if (ym <= I8X8_PRED && c->txfm_mode == TX_MODE_SELECT &&
-      !(skip_coeff || vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP))) {
-    TX_SIZE sz = m->mbmi.txfm_size;
-    // FIXME(rbultje) code ternary symbol once all experiments are merged
-    vp9_write(bc, sz != TX_4X4, c->prob_tx[0]);
-    if (sz != TX_4X4 && ym <= TM_PRED) {
-      vp9_write(bc, sz != TX_8X8, c->prob_tx[1]);
-      if (m->mbmi.sb_type >= BLOCK_SIZE_SB32X32 && sz != TX_8X8)
-        vp9_write(bc, sz != TX_16X16, c->prob_tx[2]);
-    }
-  }
-#endif
 }
 
-
-#if CONFIG_CODE_ZEROGROUP
-#ifdef ZPC_STATS
-void init_zpcstats() {
-  vp9_zero(zpc_stats_4x4);
-  vp9_zero(zpc_stats_8x8);
-  vp9_zero(zpc_stats_16x16);
-  vp9_zero(zpc_stats_32x32);
-}
-
-void update_zpcstats(VP9_COMMON *const cm) {
-  int r, b, p, n;
-  for (r = 0; r < REF_TYPES; ++r) {
-    for (b = 0; b < ZPC_BANDS; ++b) {
-      for (p = 0; p < ZPC_PTOKS; ++p) {
-        for (n = 0; n < ZPC_NODES; ++n) {
-          zpc_stats_4x4[r][b][p][n][0] += cm->fc.zpc_counts_4x4[r][b][p][n][0];
-          zpc_stats_4x4[r][b][p][n][1] += cm->fc.zpc_counts_4x4[r][b][p][n][1];
-          zpc_stats_8x8[r][b][p][n][0] += cm->fc.zpc_counts_8x8[r][b][p][n][0];
-          zpc_stats_8x8[r][b][p][n][1] += cm->fc.zpc_counts_8x8[r][b][p][n][1];
-          zpc_stats_16x16[r][b][p][n][0] +=
-              cm->fc.zpc_counts_16x16[r][b][p][n][0];
-          zpc_stats_16x16[r][b][p][n][1] +=
-              cm->fc.zpc_counts_16x16[r][b][p][n][1];
-          zpc_stats_32x32[r][b][p][n][0] +=
-              cm->fc.zpc_counts_32x32[r][b][p][n][0];
-          zpc_stats_32x32[r][b][p][n][1] +=
-              cm->fc.zpc_counts_32x32[r][b][p][n][1];
-        }
-      }
-    }
-  }
-}
-
-void print_zpcstats() {
-  int r, b, p, n;
-  FILE *f;
-
-  printf(
-      "static const unsigned int default_zpc_probs_4x4[REF_TYPES]\n"
-      "                                               [ZPC_BANDS]\n"
-      "                                               [ZPC_PTOKS]\n"
-      "                                               [ZPC_NODES] = {\n");
-  for (r = 0; r < REF_TYPES; ++r) {
-    printf("  {\n");
-    for (b = 0; b < ZPC_BANDS; ++b) {
-      printf("    {\n");
-      for (p = 0; p < ZPC_PTOKS; ++p) {
-        printf("      {");
-        for (n = 0; n < ZPC_NODES; ++n) {
-          vp9_prob prob = get_binary_prob(zpc_stats_4x4[r][b][p][n][0],
-                                          zpc_stats_4x4[r][b][p][n][1]);
-          printf(" %-3d [%d/%d],", prob, zpc_stats_4x4[r][b][p][n][0],
-                                         zpc_stats_4x4[r][b][p][n][1]);
-        }
-        printf(" },\n");
-      }
-      printf("    },\n");
-    }
-    printf("  },\n");
-  }
-  printf("};\n");
-  printf(
-    "static const unsigned int default_zpc_probs_8x8[REF_TYPES]\n"
-    "                                               [ZPC_BANDS]\n"
-    "                                               [ZPC_PTOKS]\n"
-    "                                               [ZPC_NODES] = {\n");
-  for (r = 0; r < REF_TYPES; ++r) {
-    printf("  {\n");
-    for (b = 0; b < ZPC_BANDS; ++b) {
-      printf("    {\n");
-      for (p = 0; p < ZPC_PTOKS; ++p) {
-        printf("      {");
-        for (n = 0; n < ZPC_NODES; ++n) {
-          vp9_prob prob = get_binary_prob(zpc_stats_8x8[r][b][p][n][0],
-                                          zpc_stats_8x8[r][b][p][n][1]);
-          printf(" %-3d [%d/%d],", prob, zpc_stats_8x8[r][b][p][n][0],
-                                         zpc_stats_8x8[r][b][p][n][1]);
-        }
-        printf(" },\n");
-      }
-      printf("    },\n");
-    }
-    printf("  },\n");
-  }
-  printf("};\n");
-  printf(
-    "static const unsigned int default_zpc_probs_16x16[REF_TYPES]\n"
-    "                                                 [ZPC_BANDS]\n"
-    "                                                 [ZPC_PTOKS]\n"
-    "                                                 [ZPC_NODES] = {\n");
-  for (r = 0; r < REF_TYPES; ++r) {
-    printf("  {\n");
-    for (b = 0; b < ZPC_BANDS; ++b) {
-      printf("    {\n");
-      for (p = 0; p < ZPC_PTOKS; ++p) {
-        printf("      {");
-        for (n = 0; n < ZPC_NODES; ++n) {
-          vp9_prob prob = get_binary_prob(zpc_stats_16x16[r][b][p][n][0],
-                                          zpc_stats_16x16[r][b][p][n][1]);
-          printf(" %-3d [%d/%d],", prob, zpc_stats_16x16[r][b][p][n][0],
-                                         zpc_stats_16x16[r][b][p][n][1]);
-        }
-        printf(" },\n");
-      }
-      printf("    },\n");
-    }
-    printf("  },\n");
-  }
-  printf("};\n");
-  printf(
-    "static const unsigned int default_zpc_probs_32x32[REF_TYPES]\n"
-    "                                                 [ZPC_BANDS]\n"
-    "                                                 [ZPC_PTOKS]\n"
-    "                                                 [ZPC_NODES] = {\n");
-  for (r = 0; r < REF_TYPES; ++r) {
-    printf("  {\n");
-    for (b = 0; b < ZPC_BANDS; ++b) {
-      printf("    {\n");
-      for (p = 0; p < ZPC_PTOKS; ++p) {
-        printf("      {");
-        for (n = 0; n < ZPC_NODES; ++n) {
-          vp9_prob prob = get_binary_prob(zpc_stats_32x32[r][b][p][n][0],
-                                          zpc_stats_32x32[r][b][p][n][1]);
-          printf(" %-3d [%d/%d],", prob, zpc_stats_32x32[r][b][p][n][0],
-                                         zpc_stats_32x32[r][b][p][n][1]);
-        }
-        printf(" },\n");
-      }
-      printf("    },\n");
-    }
-    printf("  },\n");
-  }
-  printf("};\n");
-
-  f = fopen("zpcstats.bin", "wb");
-  fwrite(zpc_stats_4x4, sizeof(zpc_stats_4x4), 1, f);
-  fwrite(zpc_stats_8x8, sizeof(zpc_stats_8x8), 1, f);
-  fwrite(zpc_stats_16x16, sizeof(zpc_stats_16x16), 1, f);
-  fwrite(zpc_stats_32x32, sizeof(zpc_stats_32x32), 1, f);
-  fclose(f);
-}
-#endif
-#endif  // CONFIG_CODE_ZEROGROUP
-
 static void write_modes_b(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc,
                           TOKENEXTRA **tok, TOKENEXTRA *tok_end,
                           int mi_row, int mi_col) {
@@ -1170,15 +876,10 @@
   else
     assert(0);
 
-#if CONFIG_SB8X8
   if (bsize > BLOCK_SIZE_SB8X8) {
-#else
-  if (bsize > BLOCK_SIZE_MB16X16) {
-#endif
     int pl;
-    xd->left_seg_context =
-        cm->left_seg_context + ((mi_row >> CONFIG_SB8X8) & 3);
-    xd->above_seg_context = cm->above_seg_context + (mi_col >> CONFIG_SB8X8);
+    xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK);
+    xd->above_seg_context = cm->above_seg_context + mi_col;
     pl = partition_plane_context(xd, bsize);
     // encode the partition information
     write_token(bc, vp9_partition_tree, cm->fc.partition_prob[pl],
@@ -1204,6 +905,7 @@
     case PARTITION_SPLIT:
       for (n = 0; n < 4; n++) {
         int j = n >> 1, i = n & 0x01;
+        *(get_sb_index(xd, subsize)) = n;
         write_modes_sb(cpi, m + j * bs * mis + i * bs, bc, tok, tok_end,
                        mi_row + j * bs, mi_col + i * bs, subsize);
       }
@@ -1213,16 +915,11 @@
   }
 
   // update partition context
-#if CONFIG_SB8X8
-  if ((partition == PARTITION_SPLIT) && (bsize > BLOCK_SIZE_MB16X16))
-#else
-  if ((partition == PARTITION_SPLIT) && (bsize > BLOCK_SIZE_SB32X32))
-#endif
-    return;
-
-  xd->left_seg_context = cm->left_seg_context + ((mi_row >> CONFIG_SB8X8) & 3);
-  xd->above_seg_context = cm->above_seg_context + (mi_col >> CONFIG_SB8X8);
-  update_partition_context(xd, subsize, bsize);
+  if (bsize > BLOCK_SIZE_SB8X8 &&
+      (bsize == BLOCK_SIZE_MB16X16 || partition != PARTITION_SPLIT)) {
+    set_partition_seg_context(cm, xd, mi_row, mi_col);
+    update_partition_context(xd, subsize, bsize);
+  }
 }
 
 static void write_modes(VP9_COMP *cpi, vp9_writer* const bc,
@@ -1234,16 +931,16 @@
 
   m_ptr += c->cur_tile_mi_col_start + c->cur_tile_mi_row_start * mis;
   vpx_memset(c->above_seg_context, 0, sizeof(PARTITION_CONTEXT) *
-             mb_cols_aligned_to_sb(c));
+             mi_cols_aligned_to_sb(c));
 
   for (mi_row = c->cur_tile_mi_row_start;
        mi_row < c->cur_tile_mi_row_end;
-       mi_row += (4 << CONFIG_SB8X8), m_ptr += (4 << CONFIG_SB8X8) * mis) {
+       mi_row += 8, m_ptr += 8 * mis) {
     m = m_ptr;
     vpx_memset(c->left_seg_context, 0, sizeof(c->left_seg_context));
     for (mi_col = c->cur_tile_mi_col_start;
          mi_col < c->cur_tile_mi_col_end;
-         mi_col += (4 << CONFIG_SB8X8), m += (4 << CONFIG_SB8X8))
+         mi_col += 8, m += 8)
       write_modes_sb(cpi, m, bc, tok, tok_end, mi_row, mi_col,
                      BLOCK_SIZE_SB64X64);
   }
@@ -1354,129 +1051,6 @@
                           cpi->frame_branch_ct_32x32, BLOCK_TYPES);
 }
 
-#if CONFIG_CODE_ZEROGROUP
-static void update_zpc_probs_common(VP9_COMP* cpi,
-                                    vp9_writer* const bc,
-                                    TX_SIZE tx_size) {
-  int r, b, p, n;
-  VP9_COMMON *const cm = &cpi->common;
-  int update[2] = {0, 0};
-  int savings = 0;
-  vp9_zpc_probs newprobs;
-  vp9_zpc_probs *zpc_probs;
-  vp9_zpc_count *zpc_counts;
-  vp9_prob upd = ZPC_UPDATE_PROB;
-
-  if (!get_zpc_used(tx_size)) return;
-  if (tx_size == TX_32X32) {
-    zpc_probs = &cm->fc.zpc_probs_32x32;
-    zpc_counts = &cm->fc.zpc_counts_32x32;
-  } else if (tx_size == TX_16X16) {
-    zpc_probs = &cm->fc.zpc_probs_16x16;
-    zpc_counts = &cm->fc.zpc_counts_16x16;
-  } else if (tx_size == TX_8X8) {
-    zpc_probs = &cm->fc.zpc_probs_8x8;
-    zpc_counts = &cm->fc.zpc_counts_8x8;
-  } else {
-    zpc_probs = &cm->fc.zpc_probs_4x4;
-    zpc_counts = &cm->fc.zpc_counts_4x4;
-  }
-  for (r = 0; r < REF_TYPES; ++r) {
-    for (b = 0; b < ZPC_BANDS; ++b) {
-      for (p = 0; p < ZPC_PTOKS; ++p) {
-        for (n = 0; n < ZPC_NODES; ++n) {
-          newprobs[r][b][p][n] = get_binary_prob((*zpc_counts)[r][b][p][n][0],
-                                                 (*zpc_counts)[r][b][p][n][1]);
-        }
-      }
-    }
-  }
-  for (r = 0; r < REF_TYPES; ++r) {
-    for (b = 0; b < ZPC_BANDS; ++b) {
-      for (p = 0; p < ZPC_PTOKS; ++p) {
-        for (n = 0; n < ZPC_NODES; ++n) {
-          vp9_prob newp = newprobs[r][b][p][n];
-          vp9_prob oldp = (*zpc_probs)[r][b][p][n];
-          int s, u = 0;
-#if USE_ZPC_EXTRA == 0
-          if (n == 1) continue;
-#endif
-#if defined(SEARCH_NEWP)
-          s = prob_diff_update_savings_search((*zpc_counts)[r][b][p][n],
-                                              oldp, &newp, upd);
-          if (s > 0 && newp != oldp)
-            u = 1;
-          if (u)
-            savings += s - (int)(vp9_cost_zero(upd));
-          else
-            savings -= (int)(vp9_cost_zero(upd));
-#else
-          s = prob_update_savings((*zpc_counts)[r][b][p][n],
-                                  oldp, newp, upd);
-          if (s > 0)
-            u = 1;
-          if (u)
-            savings += s;
-#endif
-          update[u]++;
-        }
-      }
-    }
-  }
-  if (update[1] == 0 || savings < 0) {
-    vp9_write_bit(bc, 0);
-    return;
-  }
-  vp9_write_bit(bc, 1);
-  for (r = 0; r < REF_TYPES; ++r) {
-    for (b = 0; b < ZPC_BANDS; ++b) {
-      for (p = 0; p < ZPC_PTOKS; ++p) {
-        for (n = 0; n < ZPC_NODES; ++n) {
-          vp9_prob newp = newprobs[r][b][p][n];
-          vp9_prob *oldp = &(*zpc_probs)[r][b][p][n];
-          int s, u = 0;
-#if USE_ZPC_EXTRA == 0
-          if (n == 1) continue;
-#endif
-#if defined(SEARCH_NEWP)
-          s = prob_diff_update_savings_search((*zpc_counts)[r][b][p][n],
-                                              *oldp, &newp, upd);
-          if (s > 0 && newp != *oldp)
-            u = 1;
-#else
-          s = prob_update_savings((*zpc_counts)[r][b][p][n],
-                                  *oldp, newp, upd);
-          if (s > 0)
-            u = 1;
-#endif
-          vp9_write(bc, u, upd);
-          if (u) {
-            /* send/use new probability */
-            write_prob_diff_update(bc, newp, *oldp);
-            *oldp = newp;
-          }
-        }
-      }
-    }
-  }
-}
-
-static void update_zpc_probs(VP9_COMP* cpi,
-                             vp9_writer* const bc) {
-  update_zpc_probs_common(cpi, bc, TX_4X4);
-  if (cpi->common.txfm_mode != ONLY_4X4)
-    update_zpc_probs_common(cpi, bc, TX_8X8);
-  if (cpi->common.txfm_mode > ALLOW_8X8)
-    update_zpc_probs_common(cpi, bc, TX_16X16);
-  if (cpi->common.txfm_mode > ALLOW_16X16)
-    update_zpc_probs_common(cpi, bc, TX_32X32);
-#ifdef ZPC_STATS
-  if (!cpi->dummy_packing)
-    update_zpcstats(&cpi->common);
-#endif
-}
-#endif  // CONFIG_CODE_ZEROGROUP
-
 static void update_coef_probs_common(vp9_writer* const bc,
                                      VP9_COMP *cpi,
 #ifdef ENTROPY_STATS
@@ -1668,16 +1242,6 @@
 FILE *vpxlogc = 0;
 #endif
 
-static void put_delta_q(vp9_writer *bc, int delta_q) {
-  if (delta_q != 0) {
-    vp9_write_bit(bc, 1);
-    vp9_write_literal(bc, abs(delta_q), 4);
-    vp9_write_bit(bc, delta_q < 0);
-  } else {
-    vp9_write_bit(bc, 0);
-  }
-}
-
 static void decide_kf_ymode_entropy(VP9_COMP *cpi) {
   int mode_cost[MB_MODE_COUNT];
   int bestcost = INT_MAX;
@@ -1724,6 +1288,92 @@
   }
 }
 
+static void encode_loopfilter(VP9_COMMON *pc, MACROBLOCKD *xd, vp9_writer *w) {
+  int i;
+
+  // Encode the loop filter level and type
+  vp9_write_literal(w, pc->filter_level, 6);
+  vp9_write_literal(w, pc->sharpness_level, 3);
+#if CONFIG_LOOP_DERING
+  if (pc->dering_enabled) {
+    vp9_write_bit(w, 1);
+    vp9_write_literal(w, pc->dering_enabled - 1, 4);
+  } else {
+    vp9_write_bit(w, 0);
+  }
+#endif
+
+  // Write out loop filter deltas applied at the MB level based on mode or
+  // ref frame (if they are enabled).
+  vp9_write_bit(w, xd->mode_ref_lf_delta_enabled);
+
+  if (xd->mode_ref_lf_delta_enabled) {
+    // Do the deltas need to be updated?
+    vp9_write_bit(w, xd->mode_ref_lf_delta_update);
+    if (xd->mode_ref_lf_delta_update) {
+      // Send update
+      for (i = 0; i < MAX_REF_LF_DELTAS; i++) {
+        const int delta = xd->ref_lf_deltas[i];
+
+        // Frame level data
+        if (delta != xd->last_ref_lf_deltas[i]) {
+          xd->last_ref_lf_deltas[i] = delta;
+          vp9_write_bit(w, 1);
+
+          if (delta > 0) {
+            vp9_write_literal(w, delta & 0x3F, 6);
+            vp9_write_bit(w, 0);  // sign
+          } else {
+            assert(delta < 0);
+            vp9_write_literal(w, (-delta) & 0x3F, 6);
+            vp9_write_bit(w, 1);  // sign
+          }
+        } else {
+          vp9_write_bit(w, 0);
+        }
+      }
+
+      // Send update
+      for (i = 0; i < MAX_MODE_LF_DELTAS; i++) {
+        const int delta = xd->mode_lf_deltas[i];
+        if (delta != xd->last_mode_lf_deltas[i]) {
+          xd->last_mode_lf_deltas[i] = delta;
+          vp9_write_bit(w, 1);
+
+          if (delta > 0) {
+            vp9_write_literal(w, delta & 0x3F, 6);
+            vp9_write_bit(w, 0);  // sign
+          } else {
+            assert(delta < 0);
+            vp9_write_literal(w, (-delta) & 0x3F, 6);
+            vp9_write_bit(w, 1);  // sign
+          }
+        } else {
+          vp9_write_bit(w, 0);
+        }
+      }
+    }
+  }
+}
+
+static void put_delta_q(vp9_writer *bc, int delta_q) {
+  if (delta_q != 0) {
+    vp9_write_bit(bc, 1);
+    vp9_write_literal(bc, abs(delta_q), 4);
+    vp9_write_bit(bc, delta_q < 0);
+  } else {
+    vp9_write_bit(bc, 0);
+  }
+}
+
+static void encode_quantization(VP9_COMMON *pc, vp9_writer *w) {
+  vp9_write_literal(w, pc->base_qindex, QINDEX_BITS);
+  put_delta_q(w, pc->y_dc_delta_q);
+  put_delta_q(w, pc->uv_dc_delta_q);
+  put_delta_q(w, pc->uv_ac_delta_q);
+}
+
+
 static void encode_segmentation(VP9_COMP *cpi, vp9_writer *w) {
   int i, j;
   VP9_COMMON *const pc = &cpi->common;
@@ -1865,81 +1515,9 @@
   // lossless mode: note this needs to be before loopfilter
   vp9_write_bit(&header_bc, cpi->mb.e_mbd.lossless);
 
-  // Encode the loop filter level and type
-  vp9_write_bit(&header_bc, pc->filter_type);
-  vp9_write_literal(&header_bc, pc->filter_level, 6);
-  vp9_write_literal(&header_bc, pc->sharpness_level, 3);
-#if CONFIG_LOOP_DERING
-  if (pc->dering_enabled) {
-    vp9_write_bit(&header_bc, 1);
-    vp9_write_literal(&header_bc, pc->dering_enabled - 1, 4);
-  } else {
-    vp9_write_bit(&header_bc, 0);
-  }
-#endif
+  encode_loopfilter(pc, xd, &header_bc);
 
-  // Write out loop filter deltas applied at the MB level based on mode or ref frame (if they are enabled).
-  vp9_write_bit(&header_bc, (xd->mode_ref_lf_delta_enabled) ? 1 : 0);
-
-  if (xd->mode_ref_lf_delta_enabled) {
-    // Do the deltas need to be updated
-    vp9_write_bit(&header_bc, xd->mode_ref_lf_delta_update);
-    if (xd->mode_ref_lf_delta_update) {
-      // Send update
-      for (i = 0; i < MAX_REF_LF_DELTAS; i++) {
-        const int delta = xd->ref_lf_deltas[i];
-
-        // Frame level data
-        if (delta != xd->last_ref_lf_deltas[i]) {
-          xd->last_ref_lf_deltas[i] = delta;
-          vp9_write_bit(&header_bc, 1);
-
-          if (delta > 0) {
-            vp9_write_literal(&header_bc, delta & 0x3F, 6);
-            vp9_write_bit(&header_bc, 0);  // sign
-          } else {
-            assert(delta < 0);
-            vp9_write_literal(&header_bc, (-delta) & 0x3F, 6);
-            vp9_write_bit(&header_bc, 1);  // sign
-          }
-        } else {
-          vp9_write_bit(&header_bc, 0);
-        }
-      }
-
-      // Send update
-      for (i = 0; i < MAX_MODE_LF_DELTAS; i++) {
-        const int delta = xd->mode_lf_deltas[i];
-
-        if (delta != xd->last_mode_lf_deltas[i]) {
-          xd->last_mode_lf_deltas[i] = delta;
-          vp9_write_bit(&header_bc, 1);
-
-          if (delta > 0) {
-            vp9_write_literal(&header_bc, delta & 0x3F, 6);
-            vp9_write_bit(&header_bc, 0);  // sign
-          } else {
-            assert(delta < 0);
-            vp9_write_literal(&header_bc, (-delta) & 0x3F, 6);
-            vp9_write_bit(&header_bc, 1);  // sign
-          }
-        } else {
-          vp9_write_bit(&header_bc, 0);
-        }
-      }
-    }
-  }
-
-  // TODO(jkoleszar): remove these unused bits
-  vp9_write_literal(&header_bc, 0, 2);
-
-  // Frame Q baseline quantizer index
-  vp9_write_literal(&header_bc, pc->base_qindex, QINDEX_BITS);
-
-  // Transmit Dc, Second order and Uv quantizer delta information
-  put_delta_q(&header_bc, pc->y_dc_delta_q);
-  put_delta_q(&header_bc, pc->uv_dc_delta_q);
-  put_delta_q(&header_bc, pc->uv_ac_delta_q);
+  encode_quantization(pc, &header_bc);
 
   // When there is a key frame, all reference buffers are updated using the new key frame
   if (pc->frame_type != KEY_FRAME) {
@@ -2015,15 +1593,6 @@
     vp9_write_bit(&header_bc, (pc->mcomp_filter_type == SWITCHABLE));
     if (pc->mcomp_filter_type != SWITCHABLE)
       vp9_write_literal(&header_bc, (pc->mcomp_filter_type), 2);
-#if CONFIG_COMP_INTERINTRA_PRED
-    //  printf("Counts: %d %d\n", cpi->interintra_count[0],
-    //         cpi->interintra_count[1]);
-    if (!cpi->dummy_packing && pc->use_interintra)
-      pc->use_interintra = (cpi->interintra_count[1] > 0);
-    vp9_write_bit(&header_bc, pc->use_interintra);
-    if (!pc->use_interintra)
-      vp9_zero(cpi->interintra_count);
-#endif
   }
 
   if (!pc->error_resilient_mode) {
@@ -2139,48 +1708,23 @@
            cpi->common.fc.coef_probs_16x16);
   vp9_copy(cpi->common.fc.pre_coef_probs_32x32,
            cpi->common.fc.coef_probs_32x32);
-#if CONFIG_CODE_ZEROGROUP
-  vp9_copy(cpi->common.fc.pre_zpc_probs_4x4,
-           cpi->common.fc.zpc_probs_4x4);
-  vp9_copy(cpi->common.fc.pre_zpc_probs_8x8,
-           cpi->common.fc.zpc_probs_8x8);
-  vp9_copy(cpi->common.fc.pre_zpc_probs_16x16,
-           cpi->common.fc.zpc_probs_16x16);
-  vp9_copy(cpi->common.fc.pre_zpc_probs_32x32,
-           cpi->common.fc.zpc_probs_32x32);
-#endif
+
   vp9_copy(cpi->common.fc.pre_sb_ymode_prob, cpi->common.fc.sb_ymode_prob);
   vp9_copy(cpi->common.fc.pre_ymode_prob, cpi->common.fc.ymode_prob);
   vp9_copy(cpi->common.fc.pre_uv_mode_prob, cpi->common.fc.uv_mode_prob);
   vp9_copy(cpi->common.fc.pre_bmode_prob, cpi->common.fc.bmode_prob);
   vp9_copy(cpi->common.fc.pre_sub_mv_ref_prob, cpi->common.fc.sub_mv_ref_prob);
-#if !CONFIG_SB8X8
-  vp9_copy(cpi->common.fc.pre_mbsplit_prob, cpi->common.fc.mbsplit_prob);
-  vp9_copy(cpi->common.fc.pre_i8x8_mode_prob, cpi->common.fc.i8x8_mode_prob);
-#endif
   vp9_copy(cpi->common.fc.pre_partition_prob, cpi->common.fc.partition_prob);
   cpi->common.fc.pre_nmvc = cpi->common.fc.nmvc;
-#if CONFIG_COMP_INTERINTRA_PRED
-  cpi->common.fc.pre_interintra_prob = cpi->common.fc.interintra_prob;
-#endif
   vp9_zero(cpi->sub_mv_ref_count);
-#if !CONFIG_SB8X8
-  vp9_zero(cpi->mbsplit_count);
-#endif
   vp9_zero(cpi->common.fc.mv_ref_ct);
 
   update_coef_probs(cpi, &header_bc);
-#if CONFIG_CODE_ZEROGROUP
-  update_zpc_probs(cpi, &header_bc);
-#endif
 
 #ifdef ENTROPY_STATS
   active_section = 2;
 #endif
 
-  // TODO(jkoleszar): remove this unused bit
-  vp9_write_bit(&header_bc, 1);
-
   vp9_update_skip_probs(cpi);
   for (i = 0; i < MBSKIP_CONTEXTS; ++i) {
     vp9_write_prob(&header_bc, pc->mbskip_pred_probs[i]);
@@ -2201,15 +1745,6 @@
     if (pc->mcomp_filter_type == SWITCHABLE)
       update_switchable_interp_probs(cpi, &header_bc);
 
-#if CONFIG_COMP_INTERINTRA_PRED
-    if (pc->use_interintra) {
-      vp9_cond_prob_update(&header_bc,
-                           &pc->fc.interintra_prob,
-                           VP9_UPD_INTERINTRA_PROB,
-                           cpi->interintra_count);
-    }
-#endif
-
     vp9_write_prob(&header_bc, pc->prob_intra_coded);
     vp9_write_prob(&header_bc, pc->prob_last_coded);
     vp9_write_prob(&header_bc, pc->prob_gf_coded);
@@ -2272,6 +1807,8 @@
     int scaling = (pc->width != pc->display_width ||
                    pc->height != pc->display_height);
     int v = (oh.first_partition_length_in_bytes << 8) |
+            (pc->subsampling_y << 7) |
+            (pc->subsampling_x << 6) |
             (scaling << 5) |
             (oh.show_frame << 4) |
             (oh.version << 1) |
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index e6c24f0..4426148 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -29,7 +29,7 @@
     B_PREDICTION_MODE mode;
     int_mv mv;
     int_mv second_mv;
-  } bmi[16 >> (2 * CONFIG_SB8X8)];
+  } bmi[4];
 } PARTITION_INFO;
 
 // Structure to hold snapshot of coding context during the mode picking process
@@ -117,9 +117,6 @@
   int mbmode_cost[2][MB_MODE_COUNT];
   int intra_uv_mode_cost[2][MB_MODE_COUNT];
   int bmode_costs[VP9_KF_BINTRAMODES][VP9_KF_BINTRAMODES][VP9_KF_BINTRAMODES];
-#if !CONFIG_SB8X8
-  int i8x8_mode_costs[MB_MODE_COUNT];
-#endif
   int inter_bmode_costs[B_MODE_COUNT];
   int switchable_interp_costs[VP9_SWITCHABLE_FILTERS + 1]
                              [VP9_SWITCHABLE_FILTERS];
@@ -141,13 +138,11 @@
 
   int optimize;
 
-  // Structure to hold context for each of the 4 MBs within a SB:
-  // when encoded as 4 independent MBs:
-#if CONFIG_SB8X8
+  // TODO(jingning): Need to refactor the structure arrays that buffer the
+  // coding mode decisions of each partition type.
   PICK_MODE_CONTEXT sb8_context[4][4][4];
   PICK_MODE_CONTEXT sb8x16_context[4][4][2];
   PICK_MODE_CONTEXT sb16x8_context[4][4][2];
-#endif
   PICK_MODE_CONTEXT mb_context[4][4];
   PICK_MODE_CONTEXT sb32x16_context[4][2];
   PICK_MODE_CONTEXT sb16x32_context[4][2];
@@ -158,18 +153,16 @@
   PICK_MODE_CONTEXT sb64_context;
   int partition_cost[NUM_PARTITION_CONTEXTS][PARTITION_TYPES];
 
+  BLOCK_SIZE_TYPE mb_partitioning[4][4];
+  BLOCK_SIZE_TYPE sb_partitioning[4];
+  BLOCK_SIZE_TYPE sb64_partitioning;
+
   void (*fwd_txm4x4)(int16_t *input, int16_t *output, int pitch);
   void (*fwd_txm8x4)(int16_t *input, int16_t *output, int pitch);
   void (*fwd_txm8x8)(int16_t *input, int16_t *output, int pitch);
   void (*fwd_txm16x16)(int16_t *input, int16_t *output, int pitch);
   void (*quantize_b_4x4)(MACROBLOCK *x, int b_idx, TX_TYPE tx_type,
                          int y_blocks);
-#if !CONFIG_SB8X8
-  void (*quantize_b_4x4_pair)(MACROBLOCK *x, int b_idx1, int b_idx2,
-                              int y_blocks);
-  void (*quantize_b_8x8)(MACROBLOCK *x, int b_idx, TX_TYPE tx_type,
-                         int y_blocks);
-#endif
 };
 
 #endif  // VP9_ENCODER_VP9_BLOCK_H_
diff --git a/vp9/encoder/vp9_boolhuff.c b/vp9/encoder/vp9_boolhuff.c
index 2137421..e9436af 100644
--- a/vp9/encoder/vp9_boolhuff.c
+++ b/vp9/encoder/vp9_boolhuff.c
@@ -39,7 +39,7 @@
   22,   21,   19,   18,   16,   15,   13,   12,   10,    9,    7,    6,    4,    3,    1,   1
 };
 
-void vp9_start_encode(BOOL_CODER *br, unsigned char *source) {
+void vp9_start_encode(vp9_writer *br, uint8_t *source) {
   br->lowvalue = 0;
   br->range    = 255;
   br->value    = 0;
@@ -48,11 +48,11 @@
   br->pos      = 0;
 }
 
-void vp9_stop_encode(BOOL_CODER *br) {
+void vp9_stop_encode(vp9_writer *br) {
   int i;
 
   for (i = 0; i < 32; i++)
-    encode_bool(br, 0, 128);
+    vp9_write_bit(br, 0);
 
   // Ensure there's no ambiguous collision with any index marker bytes
   if ((br->buffer[br->pos - 1] & 0xe0) == 0xc0)
@@ -60,17 +60,10 @@
 }
 
 
-void vp9_encode_value(BOOL_CODER *br, int data, int bits) {
-  int bit;
-
-  for (bit = bits - 1; bit >= 0; bit--)
-    encode_bool(br, (1 & (data >> bit)), 0x80);
-}
-
-void vp9_encode_unsigned_max(BOOL_CODER *br, int data, int max) {
+void vp9_encode_unsigned_max(vp9_writer *br, int data, int max) {
   assert(data <= max);
   while (max) {
-    encode_bool(br, data & 1, 128);
+    vp9_write_bit(br, data & 1);
     data >>= 1;
     max >>= 1;
   }
@@ -92,16 +85,16 @@
   return cat;
 }
 
-void vp9_encode_uniform(BOOL_CODER *br, int v, int n) {
+void vp9_encode_uniform(vp9_writer *br, int v, int n) {
   int l = get_unsigned_bits(n);
   int m;
   if (l == 0) return;
   m = (1 << l) - n;
   if (v < m)
-    vp9_encode_value(br, v, l - 1);
+    vp9_write_literal(br, v, l - 1);
   else {
-    vp9_encode_value(br, m + ((v - m) >> 1), l - 1);
-    vp9_encode_value(br, (v - m) & 1, 1);
+    vp9_write_literal(br, m + ((v - m) >> 1), l - 1);
+    vp9_write_literal(br, (v - m) & 1, 1);
   }
 }
 
@@ -116,7 +109,7 @@
     return l;
 }
 
-void vp9_encode_term_subexp(BOOL_CODER *br, int word, int k, int num_syms) {
+void vp9_encode_term_subexp(vp9_writer *br, int word, int k, int num_syms) {
   int i = 0;
   int mk = 0;
   while (1) {
@@ -127,12 +120,12 @@
       break;
     } else {
       int t = (word >= mk + a);
-      vp9_encode_value(br, t, 1);
+      vp9_write_literal(br, t, 1);
       if (t) {
         i = i + 1;
         mk += a;
       } else {
-        vp9_encode_value(br, word - mk, b);
+        vp9_write_literal(br, word - mk, b);
         break;
       }
     }
diff --git a/vp9/encoder/vp9_boolhuff.h b/vp9/encoder/vp9_boolhuff.h
index 0be4b53..58b40fb 100644
--- a/vp9/encoder/vp9_boolhuff.h
+++ b/vp9/encoder/vp9_boolhuff.h
@@ -27,30 +27,30 @@
   unsigned int value;
   int count;
   unsigned int pos;
-  unsigned char *buffer;
+  uint8_t *buffer;
 
   // Variables used to track bit costs without outputting to the bitstream
   unsigned int  measure_cost;
   unsigned long bit_counter;
-} BOOL_CODER;
+} vp9_writer;
 
-extern void vp9_start_encode(BOOL_CODER *bc, unsigned char *buffer);
-
-extern void vp9_encode_value(BOOL_CODER *br, int data, int bits);
-extern void vp9_encode_unsigned_max(BOOL_CODER *br, int data, int max);
-extern void vp9_stop_encode(BOOL_CODER *bc);
 extern const unsigned int vp9_prob_cost[256];
 
-extern void vp9_encode_uniform(BOOL_CODER *bc, int v, int n);
-extern void vp9_encode_term_subexp(BOOL_CODER *bc, int v, int k, int n);
-extern int vp9_count_uniform(int v, int n);
-extern int vp9_count_term_subexp(int v, int k, int n);
-extern int vp9_recenter_nonneg(int v, int m);
+void vp9_start_encode(vp9_writer *bc, uint8_t *buffer);
+void vp9_encode_unsigned_max(vp9_writer *br, int data, int max);
+void vp9_stop_encode(vp9_writer *bc);
+
+
+void vp9_encode_uniform(vp9_writer *bc, int v, int n);
+void vp9_encode_term_subexp(vp9_writer *bc, int v, int k, int n);
+int vp9_count_uniform(int v, int n);
+int vp9_count_term_subexp(int v, int k, int n);
+int vp9_recenter_nonneg(int v, int m);
 
 DECLARE_ALIGNED(16, extern const unsigned char, vp9_norm[256]);
 
 
-static void encode_bool(BOOL_CODER *br, int bit, int probability) {
+static void vp9_write(vp9_writer *br, int bit, int probability) {
   unsigned int split;
   int count = br->count;
   unsigned int range = br->range;
@@ -89,7 +89,7 @@
       int x = br->pos - 1;
 
       while (x >= 0 && br->buffer[x] == 0xff) {
-        br->buffer[x] = (unsigned char)0;
+        br->buffer[x] = 0;
         x--;
       }
 
@@ -109,4 +109,16 @@
   br->range = range;
 }
 
+static void vp9_write_bit(vp9_writer *w, int bit) {
+  vp9_write(w, bit, 128);  // vp9_prob_half
+}
+
+static void vp9_write_literal(vp9_writer *w, int data, int bits) {
+  int bit;
+
+  for (bit = bits - 1; bit >= 0; bit--)
+    vp9_write_bit(w, 1 & (data >> bit));
+}
+
+
 #endif  // VP9_ENCODER_VP9_BOOLHUFF_H_
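
Aside, not part of the patch: the vp9_boolhuff.h hunk above replaces vp9_encode_value() with the inline vp9_write_literal()/vp9_write_bit() helpers built on vp9_write(). One detail worth noting from the bodies shown in this diff is bit order: vp9_write_literal() emits its value most-significant bit first, while vp9_encode_unsigned_max() emits least-significant bit first, writing as many bits as 'max' occupies. The stand-alone sketch below models only that ordering; toy_writer is a hypothetical bit recorder, not the real vp9_writer arithmetic coder, and probabilities are ignored.

/* Illustrative sketch only -- not part of the patch. Models the bit
 * ordering of the two multi-bit helpers shown in vp9_boolhuff.h above. */
#include <assert.h>
#include <stdio.h>

typedef struct { int bits[64]; int n; } toy_writer;

static void toy_write_bit(toy_writer *w, int bit) {
  w->bits[w->n++] = bit;
}

/* mirrors vp9_write_literal(): most-significant bit first */
static void toy_write_literal(toy_writer *w, int data, int bits) {
  int bit;
  for (bit = bits - 1; bit >= 0; bit--)
    toy_write_bit(w, 1 & (data >> bit));
}

/* mirrors vp9_encode_unsigned_max(): least-significant bit first,
 * emitting as many bits as 'max' occupies */
static void toy_encode_unsigned_max(toy_writer *w, int data, int max) {
  assert(data <= max);
  while (max) {
    toy_write_bit(w, data & 1);
    data >>= 1;
    max >>= 1;
  }
}

int main(void) {
  toy_writer w = { { 0 }, 0 };
  int i;
  toy_write_literal(&w, 33, 6);       /* e.g. a 6-bit filter level: 100001 */
  toy_encode_unsigned_max(&w, 5, 7);  /* 3 bits, LSB first: 101 */
  for (i = 0; i < w.n; i++)
    printf("%d", w.bits[i]);
  printf("\n");                       /* prints 100001101 */
  return 0;
}
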
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index f866597..2edeb78 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -20,7 +20,6 @@
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_quant_common.h"
 #include "vp9/encoder/vp9_segmentation.h"
-#include "vp9/common/vp9_setupintrarecon.h"
 #include "vp9/encoder/vp9_encodeintra.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_invtrans.h"
@@ -47,11 +46,6 @@
 
 void vp9_select_interp_filter_type(VP9_COMP *cpi);
 
-#if !CONFIG_SB8X8
-static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t,
-                              int output_enabled, int mi_row, int mi_col);
-#endif
-
 static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
                               int output_enabled, int mi_row, int mi_col,
                               BLOCK_SIZE_TYPE bsize);
@@ -392,9 +386,9 @@
                sizeof(PARTITION_INFO));
 
     mbmi->mv[0].as_int =
-        x->partition_info->bmi[15 >> (CONFIG_SB8X8 * 2)].mv.as_int;
+        x->partition_info->bmi[3].mv.as_int;
     mbmi->mv[1].as_int =
-        x->partition_info->bmi[15 >> (CONFIG_SB8X8 * 2)].second_mv.as_int;
+        x->partition_info->bmi[3].second_mv.as_int;
   }
 
   x->skip = ctx->skip;
@@ -448,9 +442,6 @@
       THR_D27_PRED /*D27_PRED*/,
       THR_D63_PRED /*D63_PRED*/,
       THR_TM /*TM_PRED*/,
-#if !CONFIG_SB8X8
-      THR_I8X8_PRED /*I8X8_PRED*/,
-#endif
       THR_B_PRED /*I4X4_PRED*/,
     };
     cpi->mode_chosen_counts[kf_mode_index[mb_mode]]++;
@@ -489,28 +480,14 @@
       mbmi->best_second_mv.as_int = best_second_mv.as_int;
       vp9_update_nmv_count(cpi, x, &best_mv, &best_second_mv);
     }
-#if CONFIG_SB8X8
+
     if (bsize > BLOCK_SIZE_SB8X8 && mbmi->mode == NEWMV) {
       int i, j;
       for (j = 0; j < bh; ++j)
         for (i = 0; i < bw; ++i)
           xd->mode_info_context[mis * j + i].mbmi = *mbmi;
     }
-#endif
-#if CONFIG_COMP_INTERINTRA_PRED
-    if (mbmi->mode >= NEARESTMV && mbmi->mode < SPLITMV &&
-        mbmi->second_ref_frame <= INTRA_FRAME) {
-      if (mbmi->second_ref_frame == INTRA_FRAME) {
-        ++cpi->interintra_count[1];
-        ++cpi->ymode_count[mbmi->interintra_mode];
-#if SEPARATE_INTERINTRA_UV
-        ++cpi->y_uv_mode_count[mbmi->interintra_mode][mbmi->interintra_uv_mode];
-#endif
-      } else {
-        ++cpi->interintra_count[0];
-      }
-    }
-#endif
+
     if (cpi->common.mcomp_filter_type == SWITCHABLE &&
         is_inter_mode(mbmi->mode)) {
       ++cpi->switchable_interp_count
@@ -562,15 +539,6 @@
                    x->e_mbd.plane[2].subsampling_y);
 }
 
-static INLINE void set_partition_seg_context(VP9_COMP *cpi,
-                                             int mi_row, int mi_col) {
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
-
-  xd->above_seg_context = cm->above_seg_context + (mi_col >> CONFIG_SB8X8);
-  xd->left_seg_context  = cm->left_seg_context + ((mi_row >> CONFIG_SB8X8) & 3);
-}
-
 static void set_offsets(VP9_COMP *cpi,
                         int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize) {
   MACROBLOCK *const x = &cpi->mb;
@@ -580,21 +548,21 @@
   const int dst_fb_idx = cm->new_fb_idx;
   const int idx_str = xd->mode_info_stride * mi_row + mi_col;
   const int bw = 1 << mi_width_log2(bsize), bh = 1 << mi_height_log2(bsize);
-  const int mb_row = mi_row >> CONFIG_SB8X8;
-  const int mb_col = mi_col >> CONFIG_SB8X8;
+  const int mb_row = mi_row >> 1;
+  const int mb_col = mi_col >> 1;
   const int idx_map = mb_row * cm->mb_cols + mb_col;
   int i;
 
   // entropy context structures
   for (i = 0; i < MAX_MB_PLANE; i++) {
     xd->plane[i].above_context = cm->above_context[i] +
-        (mi_col * 4 >> (CONFIG_SB8X8 + xd->plane[i].subsampling_x));
+        (mi_col * 2 >>  xd->plane[i].subsampling_x);
     xd->plane[i].left_context = cm->left_context[i] +
-        (((mi_row * 4 >> CONFIG_SB8X8) & 15) >> xd->plane[i].subsampling_y);
+        (((mi_row * 2) & 15) >> xd->plane[i].subsampling_y);
   }
 
   // partition contexts
-  set_partition_seg_context(cpi, mi_row, mi_col);
+  set_partition_seg_context(cm, xd, mi_row, mi_col);
 
   // Activity map pointer
   x->mb_activity_ptr = &cpi->mb_activity_map[idx_map];
@@ -653,9 +621,9 @@
       const int p16 = ((mb_row & 1) << 1) +  (mb_col & 1);
       const int p32 = ((mb_row & 2) << 2) + ((mb_col & 2) << 1);
       const int tile_progress =
-          cm->cur_tile_mi_col_start * cm->mb_rows >> CONFIG_SB8X8;
+          cm->cur_tile_mi_col_start * cm->mb_rows >> 1;
       const int mb_cols =
-          (cm->cur_tile_mi_col_end - cm->cur_tile_mi_col_start) >> CONFIG_SB8X8;
+          (cm->cur_tile_mi_col_end - cm->cur_tile_mi_col_start) >> 1;
 
       cpi->seg0_progress =
           ((y * mb_cols + x * 4 + p32 + p16 + tile_progress) << 16) / cm->MBs;
@@ -665,49 +633,6 @@
   }
 }
 
-#if !CONFIG_SB8X8
-static int pick_mb_mode(VP9_COMP *cpi,
-                        int mi_row,
-                        int mi_col,
-                        TOKENEXTRA **tp,
-                        int *totalrate,
-                        int *totaldist) {
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int splitmodes_used = 0;
-  MB_MODE_INFO *mbmi;
-
-  set_offsets(cpi, mi_row, mi_col, BLOCK_SIZE_MB16X16);
-
-  if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
-    vp9_activity_masking(cpi, x);
-
-  mbmi = &xd->mode_info_context->mbmi;
-  mbmi->sb_type = BLOCK_SIZE_MB16X16;
-
-  // Find best coding mode & reconstruct the MB so it is available
-  // as a predictor for MBs that follow in the SB
-  if (cm->frame_type == KEY_FRAME) {
-    vp9_rd_pick_intra_mode(cpi, x, totalrate, totaldist);
-
-    // Save the coding context
-    vpx_memcpy(&x->mb_context[xd->sb_index][xd->mb_index].mic,
-               xd->mode_info_context, sizeof(MODE_INFO));
-  } else {
-    vp9_pick_mode_inter_macroblock(cpi, x, mi_row, mi_col,
-                                   totalrate, totaldist);
-    splitmodes_used += (mbmi->mode == SPLITMV);
-
-    if (cpi->mb.e_mbd.segmentation_enabled && mbmi->segment_id == 0) {
-      cpi->seg0_idx++;
-    }
-  }
-
-  return splitmodes_used;
-}
-#endif
-
 static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col,
                           TOKENEXTRA **tp, int *totalrate, int *totaldist,
                           BLOCK_SIZE_TYPE bsize, PICK_MODE_CONTEXT *ctx) {
@@ -790,18 +715,16 @@
                             BLOCK_SIZE_TYPE bsize) {
   if (bsize >= BLOCK_SIZE_SB32X32) {
     xd->sb_index = idx;
-#if CONFIG_SB8X8
   } else if (bsize >= BLOCK_SIZE_MB16X16) {
     xd->mb_index = idx;
   } else {
     xd->b_index = idx;
-#else
-  } else {
-    xd->mb_index = idx;
-#endif
   }
 }
 
+// TODO(jingning): the variables used here are a little complicated. Further
+// refactoring is needed to organize the temporary buffers when recursive
+// partitioning down to 4x4 block size is enabled.
 static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x,
                                             BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -821,20 +744,64 @@
       return &x->sb16x32_context[xd->sb_index][xd->mb_index];
     case BLOCK_SIZE_MB16X16:
       return &x->mb_context[xd->sb_index][xd->mb_index];
-#if CONFIG_SB8X8
     case BLOCK_SIZE_SB16X8:
       return &x->sb16x8_context[xd->sb_index][xd->mb_index][xd->b_index];
     case BLOCK_SIZE_SB8X16:
       return &x->sb8x16_context[xd->sb_index][xd->mb_index][xd->b_index];
     case BLOCK_SIZE_SB8X8:
       return &x->sb8_context[xd->sb_index][xd->mb_index][xd->b_index];
-#endif
     default:
       assert(0);
       return NULL;
   }
 }
 
+static BLOCK_SIZE_TYPE *get_sb_partitioning(MACROBLOCK *x,
+                                            BLOCK_SIZE_TYPE bsize) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  switch (bsize) {
+    case BLOCK_SIZE_SB64X64:
+      return &x->sb64_partitioning;
+    case BLOCK_SIZE_SB32X32:
+      return &x->sb_partitioning[xd->sb_index];
+    case BLOCK_SIZE_MB16X16:
+      return &x->mb_partitioning[xd->sb_index][xd->mb_index];
+    default:
+      assert(0);
+      return NULL;
+  }
+}
+
+static void restore_context(VP9_COMP *cpi, int mi_row, int mi_col,
+                            ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
+                            ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
+                            PARTITION_CONTEXT sa[8],
+                            PARTITION_CONTEXT sl[8],
+                            BLOCK_SIZE_TYPE bsize) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int p;
+  int bwl = b_width_log2(bsize), bw = 1 << bwl;
+  int bhl = b_height_log2(bsize), bh = 1 << bhl;
+  int mwl = mi_width_log2(bsize), mw = 1 << mwl;
+  int mhl = mi_height_log2(bsize), mh = 1 << mhl;
+  for (p = 0; p < MAX_MB_PLANE; p++) {
+    vpx_memcpy(cm->above_context[p] +
+               ((mi_col * 2) >> xd->plane[p].subsampling_x),
+               a + bw * p,
+               sizeof(ENTROPY_CONTEXT) * bw >> xd->plane[p].subsampling_x);
+    vpx_memcpy(cm->left_context[p] +
+               ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
+               l + bh * p,
+               sizeof(ENTROPY_CONTEXT) * bh >> xd->plane[p].subsampling_y);
+  }
+  vpx_memcpy(cm->above_seg_context + mi_col, sa,
+             sizeof(PARTITION_CONTEXT) * mw);
+  vpx_memcpy(cm->left_seg_context + (mi_row & MI_MASK), sl,
+             sizeof(PARTITION_CONTEXT) * mh);
+}
+
 static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp,
                      int mi_row, int mi_col, int output_enabled,
                      BLOCK_SIZE_TYPE bsize, int sub_index) {
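
The restore_context() helper added above undoes the side effects a rate-distortion trial leaves in the shared above/left entropy contexts and partition contexts, so that every candidate partitioning of the same block starts from identical coding state; the recursive search added further below buffers the contexts on entry and calls restore_context() after its split, horizontal and vertical trials. The sketch that follows only illustrates that discipline: save_context() and try_candidate() are hypothetical stand-ins for the inline vpx_memcpy block and the pick_sb_modes()/encode_superblock() calls, not functions in this patch.

/* Sketch only: save_context() and try_candidate() are hypothetical
 * stand-ins, not functions added by this patch. */
static void rd_try_one_candidate(VP9_COMP *cpi, int mi_row, int mi_col,
                                 BLOCK_SIZE_TYPE bsize,
                                 int *rate, int *dist) {
  ENTROPY_CONTEXT a[16 * MAX_MB_PLANE], l[16 * MAX_MB_PLANE];
  PARTITION_CONTEXT sa[8], sl[8];

  save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);     /* buffer state */
  try_candidate(cpi, mi_row, mi_col, bsize, rate, dist);      /* may dirty it */
  restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);  /* undo effects */
}
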
@@ -849,17 +816,7 @@
     set_block_index(xd, sub_index, bsize);
   set_offsets(cpi, mi_row, mi_col, bsize);
   update_state(cpi, get_block_context(x, bsize), bsize, output_enabled);
-#if !CONFIG_SB8X8
-  if (bsize == BLOCK_SIZE_MB16X16) {
-    if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
-      vp9_activity_masking(cpi, x);
-
-    encode_macroblock(cpi, tp, output_enabled, mi_row, mi_col);
-  } else
-#endif
-  {
-    encode_superblock(cpi, tp, output_enabled, mi_row, mi_col, bsize);
-  }
+  encode_superblock(cpi, tp, output_enabled, mi_row, mi_col, bsize);
 
   if (output_enabled) {
     update_stats(cpi, mi_row, mi_col);
@@ -871,39 +828,28 @@
 
 static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp,
                       int mi_row, int mi_col, int output_enabled,
-                      BLOCK_SIZE_TYPE level,
-                      BLOCK_SIZE_TYPE c1, BLOCK_SIZE_TYPE c2[4]
-#if CONFIG_SB8X8
-                      , BLOCK_SIZE_TYPE c3[4][4]
-#endif
-                      ) {
+                      BLOCK_SIZE_TYPE bsize) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  const int bsl = mi_width_log2(level), bs = 1 << (bsl - 1);
-  const int bwl = mi_width_log2(c1), bhl = mi_height_log2(c1);
+  BLOCK_SIZE_TYPE c1 = BLOCK_SIZE_SB8X8;
+  const int bsl = mi_width_log2(bsize), bs = 1 << (bsl - 1);
+  int bwl, bhl;
   int UNINITIALIZED_IS_SAFE(pl);
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
-#if CONFIG_SB8X8
-  if (level > BLOCK_SIZE_SB8X8) {
-#endif
-    set_partition_seg_context(cpi, mi_row, mi_col);
-    pl = partition_plane_context(xd, level);
-#if CONFIG_SB8X8
+  if (bsize > BLOCK_SIZE_SB8X8) {
+    set_partition_seg_context(cm, xd, mi_row, mi_col);
+    pl = partition_plane_context(xd, bsize);
+    c1 = *(get_sb_partitioning(x, bsize));
   }
-#endif
+
+  bwl = mi_width_log2(c1), bhl = mi_height_log2(c1);
 
   if (bsl == bwl && bsl == bhl) {
-    if (output_enabled &&
-#if CONFIG_SB8X8
-        level > BLOCK_SIZE_SB8X8
-#else
-        level > BLOCK_SIZE_MB16X16
-#endif
-        )
+    if (output_enabled && bsize > BLOCK_SIZE_SB8X8)
       cpi->partition_count[pl][PARTITION_NONE]++;
     encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, -1);
   } else if (bsl == bhl && bsl > bwl) {
@@ -921,19 +867,13 @@
     int i;
 
     assert(bwl < bsl && bhl < bsl);
-    if (level == BLOCK_SIZE_SB64X64) {
+    if (bsize == BLOCK_SIZE_SB64X64) {
       subsize = BLOCK_SIZE_SB32X32;
-#if CONFIG_SB8X8
-    } else if (level == BLOCK_SIZE_SB32X32) {
+    } else if (bsize == BLOCK_SIZE_SB32X32) {
       subsize = BLOCK_SIZE_MB16X16;
     } else {
-      assert(level == BLOCK_SIZE_MB16X16);
+      assert(bsize == BLOCK_SIZE_MB16X16);
       subsize = BLOCK_SIZE_SB8X8;
-#else
-    } else {
-      assert(level == BLOCK_SIZE_SB32X32);
-      subsize = BLOCK_SIZE_MB16X16;
-#endif
     }
 
     if (output_enabled)
@@ -944,36 +884,189 @@
 
       set_block_index(xd, i, subsize);
       encode_sb(cpi, tp, mi_row + y_idx * bs, mi_col + x_idx * bs,
-                output_enabled, subsize,
-#if CONFIG_SB8X8
-                c2 ? c2[i] : c1, c3 ? c3[i] : NULL, NULL);
-#else
-                c2 ? c2[i] : c1, NULL);
-#endif
+                output_enabled, subsize);
     }
   }
 
-#if CONFIG_SB8X8
-  if (level > BLOCK_SIZE_SB8X8 &&
-      (level == BLOCK_SIZE_MB16X16 || bsl == bwl || bsl == bhl))
-#else
-  if (level > BLOCK_SIZE_MB16X16 &&
-      (level == BLOCK_SIZE_SB32X32 || bsl == bwl || bsl == bhl))
-#endif
-  {
-    set_partition_seg_context(cpi, mi_row, mi_col);
-    update_partition_context(xd, c1, level);
+  if (bsize > BLOCK_SIZE_SB8X8 &&
+      (bsize == BLOCK_SIZE_MB16X16 || bsl == bwl || bsl == bhl)) {
+    set_partition_seg_context(cm, xd, mi_row, mi_col);
+    update_partition_context(xd, c1, bsize);
   }
 }
 
-static void encode_sb_row(VP9_COMP *cpi,
-                          int mi_row,
-                          TOKENEXTRA **tp,
-                          int *totalrate) {
+
+// TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
+// unlikely to be selected, based on previous rate-distortion optimization
+// results, for encoding speed-up.
+static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp,
+                              int mi_row, int mi_col,
+                              BLOCK_SIZE_TYPE bsize,
+                              int *rate, int *dist) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  int mi_col, pl;
+  int bsl = b_width_log2(bsize), bs = 1 << bsl;
+  int msl = mi_height_log2(bsize), ms = 1 << msl;
+  ENTROPY_CONTEXT   l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
+  PARTITION_CONTEXT sl[8], sa[8];
+  TOKENEXTRA *tp_orig = *tp;
+  int i, p, pl;
+  BLOCK_SIZE_TYPE subsize;
+  int srate = INT_MAX, sdist = INT_MAX;
+
+  assert(mi_height_log2(bsize) == mi_width_log2(bsize));
+
+  // buffer the above/left context information of the block in search.
+  for (p = 0; p < MAX_MB_PLANE; ++p) {
+    vpx_memcpy(a + bs * p, cm->above_context[p] +
+               (mi_col * 2 >> xd->plane[p].subsampling_x),
+               sizeof(ENTROPY_CONTEXT) * bs >> xd->plane[p].subsampling_x);
+    vpx_memcpy(l + bs * p, cm->left_context[p] +
+               ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
+               sizeof(ENTROPY_CONTEXT) * bs >> xd->plane[p].subsampling_y);
+  }
+  vpx_memcpy(sa, cm->above_seg_context + mi_col,
+             sizeof(PARTITION_CONTEXT) * ms);
+  vpx_memcpy(sl, cm->left_seg_context + (mi_row & MI_MASK),
+             sizeof(PARTITION_CONTEXT) * ms);
+
+  // PARTITION_SPLIT
+  if (bsize >= BLOCK_SIZE_MB16X16) {
+    int r4 = 0, d4 = 0;
+    subsize = get_subsize(bsize, PARTITION_SPLIT);
+    *(get_sb_partitioning(x, bsize)) = subsize;
+
+    for (i = 0; i < 4; ++i) {
+      int x_idx = (i & 1) * (ms >> 1);
+      int y_idx = (i >> 1) * (ms >> 1);
+      int r, d;
+
+      if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
+        continue;
+
+      *(get_sb_index(xd, subsize)) = i;
+      rd_pick_partition(cpi, tp, mi_row + y_idx, mi_col + x_idx, subsize,
+                        &r, &d);
+      r4 += r;
+      d4 += d;
+    }
+    set_partition_seg_context(cm, xd, mi_row, mi_col);
+    pl = partition_plane_context(xd, bsize);
+    r4 += x->partition_cost[pl][PARTITION_SPLIT];
+
+    srate = r4;
+    sdist = d4;
+    restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+  }
+
+  // PARTITION_HORZ
+  if ((mi_col + ms <= cm->mi_cols) && (mi_row + (ms >> 1) <= cm->mi_rows) &&
+      (bsize >= BLOCK_SIZE_MB16X16)) {
+    int r2, d2;
+    int mb_skip = 0;
+    subsize = get_subsize(bsize, PARTITION_HORZ);
+    *(get_sb_index(xd, subsize)) = 0;
+    pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize,
+                  get_block_context(x, subsize));
+
+    if (mi_row + ms <= cm->mi_rows) {
+      int r, d;
+      update_state(cpi, get_block_context(x, subsize), subsize, 0);
+      encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
+      *(get_sb_index(xd, subsize)) = 1;
+      pick_sb_modes(cpi, mi_row + (ms >> 1), mi_col, tp, &r, &d, subsize,
+                    get_block_context(x, subsize));
+      r2 += r;
+      d2 += d;
+    } else {
+      if (mi_row + (ms >> 1) != cm->mi_rows)
+        mb_skip = 1;
+    }
+    set_partition_seg_context(cm, xd, mi_row, mi_col);
+    pl = partition_plane_context(xd, bsize);
+    r2 += x->partition_cost[pl][PARTITION_HORZ];
+
+    if ((RDCOST(x->rdmult, x->rddiv, r2, d2) <
+         RDCOST(x->rdmult, x->rddiv, srate, sdist)) && !mb_skip) {
+      srate = r2;
+      sdist = d2;
+      *(get_sb_partitioning(x, bsize)) = subsize;
+    }
+    restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+  }
+
+  // PARTITION_VERT
+  if ((mi_row + ms <= cm->mi_rows) && (mi_col + (ms >> 1) <= cm->mi_cols) &&
+      (bsize >= BLOCK_SIZE_MB16X16)) {
+    int r2, d2;
+    int mb_skip = 0;
+    subsize = get_subsize(bsize, PARTITION_VERT);
+    *(get_sb_index(xd, subsize)) = 0;
+    pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize,
+                  get_block_context(x, subsize));
+    if (mi_col + ms <= cm->mi_cols) {
+      int r, d;
+      update_state(cpi, get_block_context(x, subsize), subsize, 0);
+      encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
+      *(get_sb_index(xd, subsize)) = 1;
+      pick_sb_modes(cpi, mi_row, mi_col + (ms >> 1), tp, &r, &d, subsize,
+                    get_block_context(x, subsize));
+      r2 += r;
+      d2 += d;
+    } else {
+      if (mi_col + (ms >> 1) != cm->mi_cols)
+        mb_skip = 1;
+    }
+    set_partition_seg_context(cm, xd, mi_row, mi_col);
+    pl = partition_plane_context(xd, bsize);
+    r2 += x->partition_cost[pl][PARTITION_VERT];
+
+    if ((RDCOST(x->rdmult, x->rddiv, r2, d2) <
+         RDCOST(x->rdmult, x->rddiv, srate, sdist)) && !mb_skip) {
+      srate = r2;
+      sdist = d2;
+      *(get_sb_partitioning(x, bsize)) = subsize;
+    }
+    restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+  }
+
+  // PARTITION_NONE
+  if (mi_row + ms <= cm->mi_rows && mi_col + ms <= cm->mi_cols) {
+    int r, d;
+    pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, bsize,
+                  get_block_context(x, bsize));
+    if (bsize >= BLOCK_SIZE_MB16X16) {
+      set_partition_seg_context(cm, xd, mi_row, mi_col);
+      pl = partition_plane_context(xd, bsize);
+      r += x->partition_cost[pl][PARTITION_NONE];
+    }
+
+    if (RDCOST(x->rdmult, x->rddiv, r, d) <
+        RDCOST(x->rdmult, x->rddiv, srate, sdist)) {
+      srate = r;
+      sdist = d;
+      if (bsize >= BLOCK_SIZE_MB16X16)
+        *(get_sb_partitioning(x, bsize)) = bsize;
+    }
+  }
+
+  assert(srate < INT_MAX && sdist < INT_MAX);
+  *rate = srate;
+  *dist = sdist;
+
+  encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_SIZE_SB64X64, bsize);
+
+  if (bsize == BLOCK_SIZE_SB64X64)
+    assert(tp_orig < *tp);
+  else
+    assert(tp_orig == *tp);
+}
+
+static void encode_sb_row(VP9_COMP *cpi, int mi_row,
+                          TOKENEXTRA **tp, int *totalrate) {
+  VP9_COMMON *const cm = &cpi->common;
+  int mi_col;
 
   // Initialize the left context for the new SB row
   vpx_memset(&cm->left_context, 0, sizeof(cm->left_context));
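
The rd_pick_partition() routine above replaces the hand-written 16x16/32x32/64x64 loops of the old encode_sb_row() with one recursive search: each square block is scored with PARTITION_SPLIT (four recursive sub-searches), then PARTITION_HORZ, PARTITION_VERT and PARTITION_NONE, the cheapest candidate by rate-distortion cost is recorded through get_sb_partitioning(), and the block is finally re-encoded by encode_sb() with output enabled only at the 64x64 root. The sketch below shows just the selection rule; rd_cost() is a simplified stand-in for the RDCOST(x->rdmult, x->rddiv, r, d) macro and does not reproduce its exact fixed-point scaling, and the rate values are assumed to already include the partition_cost[] signalling bits.

#include <stdint.h>

typedef enum { PART_NONE, PART_HORZ, PART_VERT, PART_SPLIT } PART;

/* Simplified stand-in for RDCOST(): lambda-weighted rate plus distortion.
 * The real macro applies fixed-point scaling not reproduced here. */
static int64_t rd_cost(int64_t rdmult, int rate, int64_t dist) {
  return (rdmult * rate) / 256 + dist;
}

/* Keep the cheapest of the four candidates, mirroring the order used by
 * rd_pick_partition(): SPLIT sets the baseline, then HORZ, VERT and NONE
 * each replace it only when strictly cheaper. */
static PART pick_best_partition(int64_t rdmult,
                                const int rate[4], const int64_t dist[4]) {
  const PART order[3] = { PART_HORZ, PART_VERT, PART_NONE };
  PART best = PART_SPLIT;
  int64_t best_rd = rd_cost(rdmult, rate[PART_SPLIT], dist[PART_SPLIT]);
  int i;

  for (i = 0; i < 3; ++i) {
    const int64_t rd = rd_cost(rdmult, rate[order[i]], dist[order[i]]);
    if (rd < best_rd) {
      best = order[i];
      best_rd = rd;
    }
  }
  return best;
}
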
@@ -981,556 +1074,10 @@
 
   // Code each SB in the row
   for (mi_col = cm->cur_tile_mi_col_start;
-       mi_col < cm->cur_tile_mi_col_end; mi_col += (4 << CONFIG_SB8X8)) {
-    int i, p;
-#if CONFIG_SB8X8
-    BLOCK_SIZE_TYPE mb_partitioning[4][4];
-#endif
-    BLOCK_SIZE_TYPE sb_partitioning[4];
-    BLOCK_SIZE_TYPE sb64_partitioning = BLOCK_SIZE_SB32X32;
-    int sb64_rate = 0, sb64_dist = 0;
-    int sb64_skip = 0;
-    ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
-    PARTITION_CONTEXT seg_l[4], seg_a[4];
-    TOKENEXTRA *tp_orig = *tp;
-
-    for (p = 0; p < MAX_MB_PLANE; p++) {
-      memcpy(a + 16 * p, cm->above_context[p] +
-                 (mi_col * 4 >> (CONFIG_SB8X8 + xd->plane[p].subsampling_x)),
-             sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_x);
-      memcpy(l + 16 * p, cm->left_context[p],
-             sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_y);
-    }
-    memcpy(&seg_a, cm->above_seg_context + (mi_col >> CONFIG_SB8X8),
-           sizeof(seg_a));
-    memcpy(&seg_l, cm->left_seg_context, sizeof(seg_l));
-
-    // FIXME(rbultje): this function should probably be rewritten to be
-    // recursive at some point in the future.
-    for (i = 0; i < 4; i++) {
-      const int x_idx = (i & 1) << (1 + CONFIG_SB8X8);
-      const int y_idx = (i & 2) << CONFIG_SB8X8;
-      int sb32_rate = 0, sb32_dist = 0;
-      int splitmodes_used = 0;
-      int sb32_skip = 0;
-      int j;
-      ENTROPY_CONTEXT l2[8 * MAX_MB_PLANE], a2[8 * MAX_MB_PLANE];
-
-      sb_partitioning[i] = BLOCK_SIZE_MB16X16;
-      if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
-        continue;
-
-      xd->sb_index = i;
-
-      /* Function should not modify L & A contexts; save and restore on exit */
-      for (p = 0; p < MAX_MB_PLANE; p++) {
-        vpx_memcpy(l2 + 8 * p,
-                   cm->left_context[p] +
-                       (y_idx * 4 >> (CONFIG_SB8X8 +
-                                      xd->plane[p].subsampling_y)),
-                   sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_y);
-        vpx_memcpy(a2 + 8 * p,
-                   cm->above_context[p] +
-                       ((mi_col + x_idx) * 4 >> (CONFIG_SB8X8 +
-                                                 xd->plane[p].subsampling_x)),
-                   sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_x);
-      }
-
-      /* Encode MBs in raster order within the SB */
-      for (j = 0; j < 4; j++) {
-        const int x_idx_m = x_idx + ((j & 1) << CONFIG_SB8X8);
-        const int y_idx_m = y_idx + ((j >> 1) << CONFIG_SB8X8);
-        int r, d;
-#if CONFIG_SB8X8
-        int r2, d2, mb16_rate = 0, mb16_dist = 0, k;
-        ENTROPY_CONTEXT l3[4 * MAX_MB_PLANE], a3[4 * MAX_MB_PLANE];
-
-        mb_partitioning[i][j] = BLOCK_SIZE_SB8X8;
-#endif
-
-        if (mi_row + y_idx_m >= cm->mi_rows ||
-            mi_col + x_idx_m >= cm->mi_cols) {
-          // MB lies outside frame, move on
-          continue;
-        }
-
-        // Index of the MB in the SB 0..3
-        xd->mb_index = j;
-
-#if CONFIG_SB8X8
-        for (p = 0; p < MAX_MB_PLANE; p++) {
-          vpx_memcpy(l3 + 4 * p,
-                     cm->left_context[p] +
-                         (y_idx_m * 4 >> (CONFIG_SB8X8 +
-                                          xd->plane[p].subsampling_y)),
-                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_y);
-          vpx_memcpy(a3 + 4 * p,
-                     cm->above_context[p] +
-                         ((mi_col + x_idx_m) * 4 >> (CONFIG_SB8X8 +
-                                                   xd->plane[p].subsampling_x)),
-                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x);
-        }
-
-        for (k = 0; k < 4; k++) {
-          xd->b_index = k;
-
-          // try 8x8 coding
-          pick_sb_modes(cpi, mi_row + y_idx_m + (k >> 1),
-                        mi_col + x_idx_m + (k & 1),
-                        tp, &r, &d, BLOCK_SIZE_SB8X8,
-                        &x->sb8_context[xd->sb_index][xd->mb_index]
-                                       [xd->b_index]);
-          mb16_rate += r;
-          mb16_dist += d;
-          update_state(cpi, &x->sb8_context[xd->sb_index][xd->mb_index]
-                                           [xd->b_index],
-                       BLOCK_SIZE_SB8X8, 0);
-          encode_superblock(cpi, tp,
-                            0, mi_row + y_idx_m + (k >> 1),
-                            mi_col + x_idx_m + (k & 1),
-                            BLOCK_SIZE_SB8X8);
-        }
-        set_partition_seg_context(cpi, mi_row + y_idx_m, mi_col + x_idx_m);
-        pl = partition_plane_context(xd, BLOCK_SIZE_MB16X16);
-        mb16_rate += x->partition_cost[pl][PARTITION_SPLIT];
-        for (p = 0; p < MAX_MB_PLANE; p++) {
-          vpx_memcpy(cm->left_context[p] +
-                         (y_idx_m * 4 >> (CONFIG_SB8X8 +
-                                          xd->plane[p].subsampling_y)),
-                     l3 + 4 * p,
-                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_y);
-          vpx_memcpy(cm->above_context[p] +
-                         ((mi_col + x_idx_m) * 4 >> (CONFIG_SB8X8 +
-                                                   xd->plane[p].subsampling_x)),
-                     a3 + 4 * p,
-                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x);
-        }
-
-        // try 8x16 coding
-        r2 = 0;
-        d2 = 0;
-        xd->b_index = 0;
-        pick_sb_modes(cpi, mi_row + y_idx_m, mi_col + x_idx_m,
-                      tp, &r, &d, BLOCK_SIZE_SB8X16,
-                      &x->sb8x16_context[xd->sb_index][xd->mb_index]
-                                        [xd->b_index]);
-        r2 += r;
-        d2 += d;
-        update_state(cpi, &x->sb8x16_context[xd->sb_index][xd->mb_index]
-                                            [xd->b_index],
-                     BLOCK_SIZE_SB8X16, 0);
-        encode_superblock(cpi, tp,
-                          0, mi_row + y_idx_m, mi_col + x_idx_m,
-                          BLOCK_SIZE_SB8X16);
-        xd->b_index = 1;
-        pick_sb_modes(cpi, mi_row + y_idx_m, mi_col + x_idx_m + 1,
-                      tp, &r, &d, BLOCK_SIZE_SB8X16,
-                      &x->sb8x16_context[xd->sb_index][xd->mb_index]
-                                        [xd->b_index]);
-        r2 += r;
-        d2 += d;
-        set_partition_seg_context(cpi, mi_row + y_idx_m, mi_col + x_idx_m);
-        pl = partition_plane_context(xd, BLOCK_SIZE_MB16X16);
-        r2 += x->partition_cost[pl][PARTITION_VERT];
-        if (RDCOST(x->rdmult, x->rddiv, r2, d2) <
-                RDCOST(x->rdmult, x->rddiv, mb16_rate, mb16_dist)) {
-          mb16_rate = r2;
-          mb16_dist = d2;
-          mb_partitioning[i][j] = BLOCK_SIZE_SB8X16;
-        }
-        for (p = 0; p < MAX_MB_PLANE; p++) {
-          vpx_memcpy(cm->left_context[p] +
-                         (y_idx_m * 4 >> (CONFIG_SB8X8 +
-                                          xd->plane[p].subsampling_y)),
-                     l3 + 4 * p,
-                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_y);
-          vpx_memcpy(cm->above_context[p] +
-                         ((mi_col + x_idx_m) * 4 >> (CONFIG_SB8X8 +
-                                                   xd->plane[p].subsampling_x)),
-                     a3 + 4 * p,
-                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x);
-        }
-
-        // try 16x8 coding
-        r2 = 0;
-        d2 = 0;
-        xd->b_index = 0;
-        pick_sb_modes(cpi, mi_row + y_idx_m, mi_col + x_idx_m,
-                      tp, &r, &d, BLOCK_SIZE_SB16X8,
-                      &x->sb16x8_context[xd->sb_index][xd->mb_index]
-                                        [xd->b_index]);
-        r2 += r;
-        d2 += d;
-        update_state(cpi, &x->sb16x8_context[xd->sb_index][xd->mb_index]
-                                            [xd->b_index],
-                     BLOCK_SIZE_SB16X8, 0);
-        encode_superblock(cpi, tp,
-                          0, mi_row + y_idx_m, mi_col + x_idx_m,
-                          BLOCK_SIZE_SB16X8);
-        xd->b_index = 1;
-        pick_sb_modes(cpi, mi_row + y_idx_m + 1, mi_col + x_idx_m,
-                      tp, &r, &d, BLOCK_SIZE_SB16X8,
-                      &x->sb16x8_context[xd->sb_index][xd->mb_index]
-                                        [xd->b_index]);
-        r2 += r;
-        d2 += d;
-        set_partition_seg_context(cpi, mi_row + y_idx_m, mi_col + x_idx_m);
-        pl = partition_plane_context(xd, BLOCK_SIZE_MB16X16);
-        r2 += x->partition_cost[pl][PARTITION_HORZ];
-        if (RDCOST(x->rdmult, x->rddiv, r2, d2) <
-                RDCOST(x->rdmult, x->rddiv, mb16_rate, mb16_dist)) {
-          mb16_rate = r2;
-          mb16_dist = d2;
-          mb_partitioning[i][j] = BLOCK_SIZE_SB16X8;
-        }
-        for (p = 0; p < MAX_MB_PLANE; p++) {
-          vpx_memcpy(cm->left_context[p] +
-                         (y_idx_m * 4 >> (CONFIG_SB8X8 +
-                                          xd->plane[p].subsampling_y)),
-                     l3 + 4 * p,
-                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_y);
-          vpx_memcpy(cm->above_context[p] +
-                         ((mi_col + x_idx_m) * 4 >> (CONFIG_SB8X8 +
-                                                   xd->plane[p].subsampling_x)),
-                     a3 + 4 * p,
-                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x);
-        }
-
-        // try as 16x16
-        pick_sb_modes(cpi, mi_row + y_idx_m, mi_col + x_idx_m,
-                      tp, &r, &d, BLOCK_SIZE_MB16X16,
-                      &x->mb_context[xd->sb_index][xd->mb_index]);
-        set_partition_seg_context(cpi, mi_row + y_idx_m, mi_col + x_idx_m);
-        pl = partition_plane_context(xd, BLOCK_SIZE_MB16X16);
-        r += x->partition_cost[pl][PARTITION_NONE];
-        if (RDCOST(x->rdmult, x->rddiv, r, d) <
-                RDCOST(x->rdmult, x->rddiv, mb16_rate, mb16_dist)) {
-          mb16_rate = r;
-          mb16_dist = d;
-          mb_partitioning[i][j] = BLOCK_SIZE_MB16X16;
-        }
-        sb32_rate += mb16_rate;
-        sb32_dist += mb16_dist;
-#else
-        splitmodes_used += pick_mb_mode(cpi, mi_row + y_idx_m,
-                                        mi_col + x_idx_m, tp, &r, &d);
-        sb32_rate += r;
-        sb32_dist += d;
-#endif
-
-        // Dummy encode, do not do the tokenization
-#if CONFIG_SB8X8
-        encode_sb(cpi, tp, mi_row + y_idx_m, mi_col + x_idx_m, 0,
-                  BLOCK_SIZE_MB16X16, mb_partitioning[i][j], NULL, NULL);
-#else
-        encode_macroblock(cpi, tp, 0, mi_row + y_idx_m,
-                          mi_col + x_idx_m);
-#endif
-      }
-
-      /* Restore L & A coding context to those in place on entry */
-      for (p = 0; p < MAX_MB_PLANE; p++) {
-        vpx_memcpy(cm->left_context[p] +
-                       (y_idx * 4 >> (CONFIG_SB8X8 +
-                                      xd->plane[p].subsampling_y)),
-                   l2 + 8 * p,
-                   sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_y);
-        vpx_memcpy(cm->above_context[p] +
-                       ((mi_col + x_idx) * 4 >> (CONFIG_SB8X8 +
-                                                 xd->plane[p].subsampling_x)),
-                   a2 + 8 * p,
-                   sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_x);
-      }
-
-      set_partition_seg_context(cpi, mi_row + y_idx, mi_col + x_idx);
-      pl = partition_plane_context(xd, BLOCK_SIZE_SB32X32);
-      sb32_rate += x->partition_cost[pl][PARTITION_SPLIT];
-
-      if (cpi->sf.splitmode_breakout) {
-        sb32_skip = splitmodes_used;
-        sb64_skip += splitmodes_used;
-      }
-
-      // check 32x16
-      if (mi_col + x_idx + (2 << CONFIG_SB8X8) <= cm->mi_cols) {
-        int r, d;
-
-        xd->mb_index = 0;
-        pick_sb_modes(cpi, mi_row + y_idx, mi_col + x_idx,
-                      tp, &r, &d, BLOCK_SIZE_SB32X16,
-                      &x->sb32x16_context[xd->sb_index][xd->mb_index]);
-        if (mi_row + y_idx + (1 << CONFIG_SB8X8) < cm->mi_rows) {
-          int r2, d2;
-
-          update_state(cpi, &x->sb32x16_context[xd->sb_index][xd->mb_index],
-                       BLOCK_SIZE_SB32X16, 0);
-          encode_superblock(cpi, tp,
-                            0, mi_row + y_idx, mi_col + x_idx,
-                            BLOCK_SIZE_SB32X16);
-          xd->mb_index = 1;
-          pick_sb_modes(cpi, mi_row + y_idx + (1 << CONFIG_SB8X8),
-                        mi_col + x_idx, tp, &r2, &d2, BLOCK_SIZE_SB32X16,
-                        &x->sb32x16_context[xd->sb_index][xd->mb_index]);
-          r += r2;
-          d += d2;
-        }
-
-        set_partition_seg_context(cpi, mi_row + y_idx, mi_col + x_idx);
-        pl = partition_plane_context(xd, BLOCK_SIZE_SB32X32);
-        r += x->partition_cost[pl][PARTITION_HORZ];
-
-        /* is this better than MB coding? */
-        if (RDCOST(x->rdmult, x->rddiv, r, d) <
-                RDCOST(x->rdmult, x->rddiv, sb32_rate, sb32_dist)) {
-          sb32_rate = r;
-          sb32_dist = d;
-          sb_partitioning[i] = BLOCK_SIZE_SB32X16;
-        }
-
-        for (p = 0; p < MAX_MB_PLANE; p++) {
-          vpx_memcpy(cm->left_context[p] +
-                         (y_idx * 4 >> (CONFIG_SB8X8 +
-                                        xd->plane[p].subsampling_y)),
-                     l2 + 8 * p,
-                     sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_y);
-          vpx_memcpy(cm->above_context[p] +
-                         ((mi_col + x_idx) * 4 >> (CONFIG_SB8X8 +
-                                                   xd->plane[p].subsampling_x)),
-                     a2 + 8 * p,
-                     sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_x);
-        }
-      }
-
-      // check 16x32
-      if (mi_row + y_idx + (2 << CONFIG_SB8X8) <= cm->mi_rows) {
-        int r, d;
-
-        xd->mb_index = 0;
-        pick_sb_modes(cpi, mi_row + y_idx, mi_col + x_idx,
-                      tp, &r, &d, BLOCK_SIZE_SB16X32,
-                      &x->sb16x32_context[xd->sb_index][xd->mb_index]);
-        if (mi_col + x_idx + (1 << CONFIG_SB8X8) < cm->mi_cols) {
-          int r2, d2;
-
-          update_state(cpi, &x->sb16x32_context[xd->sb_index][xd->mb_index],
-                       BLOCK_SIZE_SB16X32, 0);
-          encode_superblock(cpi, tp,
-                            0, mi_row + y_idx, mi_col + x_idx,
-                            BLOCK_SIZE_SB16X32);
-          xd->mb_index = 1;
-          pick_sb_modes(cpi, mi_row + y_idx,
-                        mi_col + x_idx + (1 << CONFIG_SB8X8),
-                        tp, &r2, &d2, BLOCK_SIZE_SB16X32,
-                        &x->sb16x32_context[xd->sb_index][xd->mb_index]);
-          r += r2;
-          d += d2;
-        }
-
-        set_partition_seg_context(cpi, mi_row + y_idx, mi_col + x_idx);
-        pl = partition_plane_context(xd, BLOCK_SIZE_SB32X32);
-        r += x->partition_cost[pl][PARTITION_VERT];
-
-        /* is this better than MB coding? */
-        if (RDCOST(x->rdmult, x->rddiv, r, d) <
-                RDCOST(x->rdmult, x->rddiv, sb32_rate, sb32_dist)) {
-          sb32_rate = r;
-          sb32_dist = d;
-          sb_partitioning[i] = BLOCK_SIZE_SB16X32;
-        }
-
-        for (p = 0; p < MAX_MB_PLANE; p++) {
-          vpx_memcpy(cm->left_context[p] +
-                         (y_idx * 4 >> (CONFIG_SB8X8 +
-                                        xd->plane[p].subsampling_y)),
-                     l2 + 8 * p,
-                     sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_y);
-          vpx_memcpy(cm->above_context[p] +
-                         ((mi_col + x_idx) * 4 >> (CONFIG_SB8X8 +
-                                                   xd->plane[p].subsampling_x)),
-                     a2 + 8 * p,
-                     sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_x);
-        }
-      }
-
-      if (!sb32_skip &&
-          mi_col + x_idx + (2 << CONFIG_SB8X8) <= cm->mi_cols &&
-          mi_row + y_idx + (2 << CONFIG_SB8X8) <= cm->mi_rows) {
-        int r, d;
-
-        /* Pick a mode assuming that it applies to all 4 of the MBs in the SB */
-        pick_sb_modes(cpi, mi_row + y_idx, mi_col + x_idx,
-                      tp, &r, &d, BLOCK_SIZE_SB32X32,
-                      &x->sb32_context[xd->sb_index]);
-
-        set_partition_seg_context(cpi, mi_row + y_idx, mi_col + x_idx);
-        pl = partition_plane_context(xd, BLOCK_SIZE_SB32X32);
-        r += x->partition_cost[pl][PARTITION_NONE];
-
-        if (RDCOST(x->rdmult, x->rddiv, r, d) <
-                RDCOST(x->rdmult, x->rddiv, sb32_rate, sb32_dist)) {
-          sb32_rate = r;
-          sb32_dist = d;
-          sb_partitioning[i] = BLOCK_SIZE_SB32X32;
-        }
-      }
-
-      // If we used 16x16 instead of 32x32 then skip 64x64 (if enabled).
-      if (cpi->sf.mb16_breakout && sb_partitioning[i] != BLOCK_SIZE_SB32X32) {
-        ++sb64_skip;
-      }
-
-      sb64_rate += sb32_rate;
-      sb64_dist += sb32_dist;
-
-      /* Encode SB using best computed mode(s) */
-      // FIXME(rbultje): there really shouldn't be any need to encode_mb/sb
-      // for each level that we go up, we can just keep tokens and recon
-      // pixels of the lower level; also, inverting SB/MB order (big->small
-      // instead of small->big) means we can use as threshold for small, which
-      // may enable breakouts if RD is not good enough (i.e. faster)
-      encode_sb(cpi, tp, mi_row + y_idx, mi_col + x_idx, 0,
-#if CONFIG_SB8X8
-                BLOCK_SIZE_SB32X32, sb_partitioning[i], mb_partitioning[i],
-                NULL);
-#else
-                BLOCK_SIZE_SB32X32, sb_partitioning[i], NULL);
-#endif
-    }
-
-    for (p = 0; p < MAX_MB_PLANE; p++) {
-      memcpy(cm->above_context[p] +
-                 (mi_col * 4 >> (CONFIG_SB8X8 + xd->plane[p].subsampling_x)),
-             a + 16 * p,
-             sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_x);
-      memcpy(cm->left_context[p], l + 16 * p,
-             sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_y);
-    }
-    memcpy(cm->above_seg_context + (mi_col >> CONFIG_SB8X8), &seg_a,
-           sizeof(seg_a));
-    memcpy(cm->left_seg_context, &seg_l, sizeof(seg_l));
-
-    set_partition_seg_context(cpi, mi_row, mi_col);
-    pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64);
-    sb64_rate += x->partition_cost[pl][PARTITION_SPLIT];
-
-    // check 64x32
-    if (mi_col + (4 << CONFIG_SB8X8) <= cm->mi_cols && !(cm->mb_rows & 1)) {
-      int r, d;
-
-      xd->sb_index = 0;
-      pick_sb_modes(cpi, mi_row, mi_col,
-                    tp, &r, &d, BLOCK_SIZE_SB64X32,
-                    &x->sb64x32_context[xd->sb_index]);
-      if (mi_row + (2 << CONFIG_SB8X8) != cm->mi_rows) {
-        int r2, d2;
-
-        update_state(cpi, &x->sb64x32_context[xd->sb_index],
-                     BLOCK_SIZE_SB64X32, 0);
-        encode_superblock(cpi, tp,
-                          0, mi_row, mi_col, BLOCK_SIZE_SB64X32);
-        xd->sb_index = 1;
-        pick_sb_modes(cpi, mi_row + (2 << CONFIG_SB8X8), mi_col,
-                      tp, &r2, &d2, BLOCK_SIZE_SB64X32,
-                      &x->sb64x32_context[xd->sb_index]);
-        r += r2;
-        d += d2;
-      }
-
-      set_partition_seg_context(cpi, mi_row, mi_col);
-      pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64);
-      r += x->partition_cost[pl][PARTITION_HORZ];
-
-      /* is this better than MB coding? */
-      if (RDCOST(x->rdmult, x->rddiv, r, d) <
-              RDCOST(x->rdmult, x->rddiv, sb64_rate, sb64_dist)) {
-        sb64_rate = r;
-        sb64_dist = d;
-        sb64_partitioning = BLOCK_SIZE_SB64X32;
-      }
-
-      for (p = 0; p < MAX_MB_PLANE; p++) {
-        memcpy(cm->above_context[p] +
-                   (mi_col * 4 >> (CONFIG_SB8X8 + xd->plane[p].subsampling_x)),
-               a + 16 * p,
-               sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_x);
-        memcpy(cm->left_context[p], l + 16 * p,
-               sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_y);
-      }
-    }
-
-    // check 32x64
-    if (mi_row + (4 << CONFIG_SB8X8) <= cm->mi_rows && !(cm->mb_cols & 1)) {
-      int r, d;
-
-      xd->sb_index = 0;
-      pick_sb_modes(cpi, mi_row, mi_col,
-                    tp, &r, &d, BLOCK_SIZE_SB32X64,
-                    &x->sb32x64_context[xd->sb_index]);
-      if (mi_col + (2 << CONFIG_SB8X8) != cm->mi_cols) {
-        int r2, d2;
-
-        update_state(cpi, &x->sb32x64_context[xd->sb_index],
-                     BLOCK_SIZE_SB32X64, 0);
-        encode_superblock(cpi, tp,
-                          0, mi_row, mi_col, BLOCK_SIZE_SB32X64);
-        xd->sb_index = 1;
-        pick_sb_modes(cpi, mi_row, mi_col + (2 << CONFIG_SB8X8),
-                      tp, &r2, &d2, BLOCK_SIZE_SB32X64,
-                      &x->sb32x64_context[xd->sb_index]);
-        r += r2;
-        d += d2;
-      }
-
-      set_partition_seg_context(cpi, mi_row, mi_col);
-      pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64);
-      r += x->partition_cost[pl][PARTITION_VERT];
-
-      /* is this better than MB coding? */
-      if (RDCOST(x->rdmult, x->rddiv, r, d) <
-              RDCOST(x->rdmult, x->rddiv, sb64_rate, sb64_dist)) {
-        sb64_rate = r;
-        sb64_dist = d;
-        sb64_partitioning = BLOCK_SIZE_SB32X64;
-      }
-
-      for (p = 0; p < MAX_MB_PLANE; p++) {
-        memcpy(cm->above_context[p] +
-                   (mi_col * 4 >> (CONFIG_SB8X8 + xd->plane[p].subsampling_x)),
-               a + 16 * p,
-               sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_x);
-        memcpy(cm->left_context[p], l + 16 * p,
-               sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_y);
-      }
-    }
-
-    if (!sb64_skip &&
-        mi_col + (4 << CONFIG_SB8X8) <= cm->mi_cols &&
-        mi_row + (4 << CONFIG_SB8X8) <= cm->mi_rows) {
-      int r, d;
-
-      pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d,
-                    BLOCK_SIZE_SB64X64, &x->sb64_context);
-
-      set_partition_seg_context(cpi, mi_row, mi_col);
-      pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64);
-      r += x->partition_cost[pl][PARTITION_NONE];
-
-      if (RDCOST(x->rdmult, x->rddiv, r, d) <
-              RDCOST(x->rdmult, x->rddiv, sb64_rate, sb64_dist)) {
-        sb64_rate = r;
-        sb64_dist = d;
-        sb64_partitioning = BLOCK_SIZE_SB64X64;
-      }
-    }
-
-    assert(tp_orig == *tp);
-    encode_sb(cpi, tp, mi_row, mi_col, 1, BLOCK_SIZE_SB64X64,
-#if CONFIG_SB8X8
-              sb64_partitioning, sb_partitioning, mb_partitioning);
-#else
-              sb64_partitioning, sb_partitioning);
-#endif
-    assert(tp_orig < *tp);
+       mi_col < cm->cur_tile_mi_col_end; mi_col += 8) {
+    int dummy_rate, dummy_dist;
+    rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
+                      &dummy_rate, &dummy_dist);
   }
 }
 
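
With the new layout a mode-info unit covers an 8x8 luma block (earlier hunks convert mi coordinates to macroblock coordinates with a shift by one), so a 64x64 superblock spans 8 units in each direction. The simplified encode_sb_row() above therefore steps mi_col by 8 and runs a single rd_pick_partition() search per superblock, discarding the per-superblock rate and distortion it returns. The helper below only illustrates that geometry; count_superblocks() and its arguments are not part of the patch.

/* Illustrative only: how many 64x64 superblocks cover a frame whose size is
 * given in 8x8 mode-info units.  Partial superblocks at the right and bottom
 * edges are rounded up, matching the loops that step mi_row/mi_col by 8 and
 * clamp against cm->mi_rows / cm->mi_cols. */
static void count_superblocks(int mi_rows, int mi_cols,
                              int *sb_rows, int *sb_cols) {
  *sb_rows = (mi_rows + 7) >> 3;
  *sb_cols = (mi_cols + 7) >> 3;
}
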
@@ -1561,12 +1108,9 @@
                    0, 0, NULL, NULL);
   setup_dst_planes(xd, &cm->yv12_fb[cm->new_fb_idx], 0, 0);
 
-  // set up frame for intra coded blocks
-  vp9_setup_intra_recon(&cm->yv12_fb[cm->new_fb_idx]);
-
   vp9_build_block_offsets(x);
 
-  vp9_setup_block_dptrs(&x->e_mbd);
+  vp9_setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
 
   xd->mode_info_context->mbmi.mode = DC_PRED;
   xd->mode_info_context->mbmi.uv_mode = DC_PRED;
@@ -1574,29 +1118,18 @@
   vp9_zero(cpi->count_mb_ref_frame_usage)
   vp9_zero(cpi->bmode_count)
   vp9_zero(cpi->ymode_count)
-#if !CONFIG_SB8X8
-  vp9_zero(cpi->i8x8_mode_count)
-#endif
   vp9_zero(cpi->y_uv_mode_count)
   vp9_zero(cpi->sub_mv_ref_count)
-#if !CONFIG_SB8X8
-  vp9_zero(cpi->mbsplit_count)
-#endif
   vp9_zero(cpi->common.fc.mv_ref_ct)
   vp9_zero(cpi->sb_ymode_count)
   vp9_zero(cpi->partition_count);
 
-#if CONFIG_COMP_INTERINTRA_PRED
-  vp9_zero(cpi->interintra_count);
-  vp9_zero(cpi->interintra_select_count);
-#endif
-
   // Note: this memset assumes above_context[0], [1] and [2]
   // are allocated as part of the same buffer.
-  vpx_memset(cm->above_context[0], 0, sizeof(ENTROPY_CONTEXT) * 4 *
-                                      MAX_MB_PLANE * mb_cols_aligned_to_sb(cm));
+  vpx_memset(cm->above_context[0], 0, sizeof(ENTROPY_CONTEXT) * 2 *
+                                      MAX_MB_PLANE * mi_cols_aligned_to_sb(cm));
   vpx_memset(cm->above_seg_context, 0, sizeof(PARTITION_CONTEXT) *
-                                       mb_cols_aligned_to_sb(cm));
+                                       mi_cols_aligned_to_sb(cm));
 }
 
 static void switch_lossless_mode(VP9_COMP *cpi, int lossless) {
@@ -1664,12 +1197,6 @@
   vp9_zero(cpi->coef_counts_16x16);
   vp9_zero(cpi->coef_counts_32x32);
   vp9_zero(cm->fc.eob_branch_counts);
-#if CONFIG_CODE_ZEROGROUP
-  vp9_zero(cm->fc.zpc_counts_4x4);
-  vp9_zero(cm->fc.zpc_counts_8x8);
-  vp9_zero(cm->fc.zpc_counts_16x16);
-  vp9_zero(cm->fc.zpc_counts_32x32);
-#endif
 
   cpi->mb.e_mbd.lossless = (cm->base_qindex == 0 &&
                             cm->y_dc_delta_q == 0 &&
@@ -1719,9 +1246,8 @@
           vp9_get_tile_col_offsets(cm, tile_col);
           for (mi_row = cm->cur_tile_mi_row_start;
                mi_row < cm->cur_tile_mi_row_end;
-               mi_row += (4 << CONFIG_SB8X8)) {
+               mi_row += 8)
             encode_sb_row(cpi, mi_row, &tp, &totalrate);
-          }
           cpi->tok_count[tile_col] = (unsigned int)(tp - tp_old);
           assert(tp - cpi->tok <=
                  get_token_alloc(cm->mb_rows, cm->mb_cols));
@@ -1848,17 +1374,11 @@
     assert(bwl < bsl && bhl < bsl);
     if (bsize == BLOCK_SIZE_SB64X64) {
       subsize = BLOCK_SIZE_SB32X32;
-#if CONFIG_SB8X8
     } else if (bsize == BLOCK_SIZE_SB32X32) {
       subsize = BLOCK_SIZE_MB16X16;
     } else {
       assert(bsize == BLOCK_SIZE_MB16X16);
       subsize = BLOCK_SIZE_SB8X8;
-#else
-    } else {
-      assert(bsize == BLOCK_SIZE_SB32X32);
-      subsize = BLOCK_SIZE_MB16X16;
-#endif
     }
 
     for (n = 0; n < 4; n++) {
@@ -1878,10 +1398,10 @@
   MODE_INFO *mi, *mi_ptr = cm->mi;
 
   for (mi_row = 0; mi_row < cm->mi_rows;
-       mi_row += (4 << CONFIG_SB8X8), mi_ptr += (4 << CONFIG_SB8X8) * mis) {
+       mi_row += 8, mi_ptr += 8 * mis) {
     mi = mi_ptr;
     for (mi_col = 0; mi_col < cm->mi_cols;
-         mi_col += (4 << CONFIG_SB8X8), mi += (4 << CONFIG_SB8X8)) {
+         mi_col += 8, mi += 8) {
       reset_skip_txfm_size_sb(cpi, mi, txfm_max,
                               mi_row, mi_col, BLOCK_SIZE_SB64X64);
     }
@@ -2063,50 +1583,22 @@
 
     do {
       ++ bct[xd->block[b].bmi.as_mode.first];
-    } while (++b < (16 >> (CONFIG_SB8X8 * 2)));
-  }
-
-#if !CONFIG_SB8X8
-  if (m == I8X8_PRED) {
-    i8x8_modes[xd->block[0].bmi.as_mode.first]++;
-    i8x8_modes[xd->block[2].bmi.as_mode.first]++;
-    i8x8_modes[xd->block[8].bmi.as_mode.first]++;
-    i8x8_modes[xd->block[10].bmi.as_mode.first]++;
+    } while (++b < 4);
   }
 #endif
-#endif
 
-#if CONFIG_SB8X8
-  if (xd->mode_info_context->mbmi.sb_type > BLOCK_SIZE_SB8X8)
-#else
-  if (xd->mode_info_context->mbmi.sb_type > BLOCK_SIZE_MB16X16)
-#endif
-  {
+  if (xd->mode_info_context->mbmi.sb_type > BLOCK_SIZE_SB8X8) {
     ++cpi->sb_ymode_count[m];
   } else {
     ++cpi->ymode_count[m];
   }
-#if !CONFIG_SB8X8
-  if (m != I8X8_PRED)
-#endif
     ++cpi->y_uv_mode_count[m][uvm];
-#if !CONFIG_SB8X8
-  else {
-    cpi->i8x8_mode_count[xd->mode_info_context->bmi[0].as_mode.first]++;
-    cpi->i8x8_mode_count[xd->mode_info_context->bmi[2].as_mode.first]++;
-    cpi->i8x8_mode_count[xd->mode_info_context->bmi[8].as_mode.first]++;
-    cpi->i8x8_mode_count[xd->mode_info_context->bmi[10].as_mode.first]++;
-  }
-#endif
   if (m == I4X4_PRED) {
     int b = 0;
     do {
       int m = xd->mode_info_context->bmi[b].as_mode.first;
-#if CONFIG_NEWBINTRAMODES
-      if (m == B_CONTEXT_PRED) m -= CONTEXT_PRED_REPLACEMENTS;
-#endif
       ++cpi->bmode_count[m];
-    } while (++b < (16 >> (CONFIG_SB8X8 * 2)));
+    } while (++b < 4);
   }
 }
 
@@ -2131,257 +1623,6 @@
 #endif
 }
 
-#if !CONFIG_SB8X8
-static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t,
-                              int output_enabled,
-                              int mi_row, int mi_col) {
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MODE_INFO *mi = xd->mode_info_context;
-  MB_MODE_INFO *const mbmi = &mi->mbmi;
-  const int mis = cm->mode_info_stride;
-#if CONFIG_SB8X8
-  int n;
-#endif
-
-  assert(xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_MB16X16);
-
-#ifdef ENC_DEBUG
-  enc_debug = (cpi->common.current_video_frame == 11 && cm->show_frame &&
-               mb_row == 8 && mb_col == 0 && output_enabled);
-  if (enc_debug)
-    printf("Encode MB %d %d output %d\n", mb_row, mb_col, output_enabled);
-#endif
-  if (cm->frame_type == KEY_FRAME) {
-    if (cpi->oxcf.tuning == VP8_TUNE_SSIM && output_enabled) {
-      // Adjust the zbin based on this MB rate.
-      adjust_act_zbin(cpi, x);
-      vp9_update_zbin_extra(cpi, x);
-    }
-  } else {
-    vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
-
-    if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
-      // Adjust the zbin based on this MB rate.
-      adjust_act_zbin(cpi, x);
-    }
-
-    // Experimental code. Special case for gf and arf zeromv modes.
-    // Increase zbin size to suppress noise
-    cpi->zbin_mode_boost = 0;
-    if (cpi->zbin_mode_boost_enabled) {
-      if (mbmi->ref_frame != INTRA_FRAME) {
-        if (mbmi->mode == ZEROMV) {
-          if (mbmi->ref_frame != LAST_FRAME)
-            cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
-          else
-            cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
-        } else if (mbmi->mode == SPLITMV)
-          cpi->zbin_mode_boost = SPLIT_MV_ZBIN_BOOST;
-        else
-          cpi->zbin_mode_boost = MV_ZBIN_BOOST;
-      } else {
-        cpi->zbin_mode_boost = INTRA_ZBIN_BOOST;
-      }
-    }
-
-    vp9_update_zbin_extra(cpi, x);
-  }
-
-  if (mbmi->ref_frame == INTRA_FRAME) {
-#if 0  // def ENC_DEBUG
-    if (enc_debug) {
-      printf("Mode %d skip %d tx_size %d\n", mbmi->mode, x->skip,
-             mbmi->txfm_size);
-    }
-#endif
-    if (mbmi->mode == I4X4_PRED) {
-      vp9_encode_intra16x16mbuv(cm, x);
-      vp9_encode_intra4x4mby(x, BLOCK_SIZE_MB16X16);
-    } else if (mbmi->mode == I8X8_PRED) {
-      vp9_encode_intra8x8mby(x);
-      vp9_encode_intra8x8mbuv(x);
-    } else {
-      vp9_encode_intra16x16mbuv(cm, x);
-      vp9_encode_intra16x16mby(cm, x);
-    }
-
-    if (output_enabled)
-      sum_intra_stats(cpi, x);
-  } else {
-    int ref_fb_idx, second_ref_fb_idx;
-#ifdef ENC_DEBUG
-    if (enc_debug)
-      printf("Mode %d skip %d tx_size %d ref %d ref2 %d mv %d %d interp %d\n",
-             mbmi->mode, x->skip, mbmi->txfm_size,
-             mbmi->ref_frame, mbmi->second_ref_frame,
-             mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col,
-             mbmi->interp_filter);
-#endif
-
-    assert(cm->frame_type != KEY_FRAME);
-
-    if (mbmi->ref_frame == LAST_FRAME)
-      ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx];
-    else if (mbmi->ref_frame == GOLDEN_FRAME)
-      ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx];
-    else
-      ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];
-
-    if (mbmi->second_ref_frame > 0) {
-      if (mbmi->second_ref_frame == LAST_FRAME)
-        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx];
-      else if (mbmi->second_ref_frame == GOLDEN_FRAME)
-        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx];
-      else
-        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];
-    }
-
-    setup_pre_planes(xd,
-        &cpi->common.yv12_fb[ref_fb_idx],
-        mbmi->second_ref_frame > 0 ? &cpi->common.yv12_fb[second_ref_fb_idx]
-                                   : NULL,
-        mi_row, mi_col, xd->scale_factor, xd->scale_factor_uv);
-
-    if (!x->skip) {
-      vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_SIZE_MB16X16);
-      vp9_encode_sb(cm, x, BLOCK_SIZE_MB16X16);
-    } else {
-      vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_SIZE_MB16X16);
-#if CONFIG_COMP_INTERINTRA_PRED
-      if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) {
-        vp9_build_interintra_predictors(xd,
-                                        xd->plane[0].dst.buf,
-                                        xd->plane[1].dst.buf,
-                                        xd->plane[2].dst.buf,
-                                        xd->plane[0].dst.stride,
-                                        xd->plane[1].dst.stride,
-                                        BLOCK_SIZE_MB16X16);
-      }
-#endif
-    }
-  }
-
-  if (!x->skip) {
-#ifdef ENC_DEBUG
-    if (enc_debug) {
-      int i, j;
-      printf("\n");
-      printf("qcoeff\n");
-      for (i = 0; i < 384; i++) {
-        printf("%3d ", xd->qcoeff[i]);
-        if (i % 16 == 15) printf("\n");
-      }
-      printf("\n");
-      printf("predictor\n");
-      for (i = 0; i < 384; i++) {
-        printf("%3d ", xd->predictor[i]);
-        if (i % 16 == 15) printf("\n");
-      }
-      printf("\n");
-      printf("src_diff\n");
-      for (i = 0; i < 384; i++) {
-        printf("%3d ", x->src_diff[i]);
-        if (i % 16 == 15) printf("\n");
-      }
-      printf("\n");
-      printf("diff\n");
-      for (i = 0; i < 384; i++) {
-        printf("%3d ", xd->block[0].diff[i]);
-        if (i % 16 == 15) printf("\n");
-      }
-      printf("\n");
-      printf("final y\n");
-      for (i = 0; i < 16; i++) {
-        for (j = 0; j < 16; j++)
-          printf("%3d ", xd->plane[0].dst.buf[i * xd->plane[0].dst.stride + j]);
-        printf("\n");
-      }
-      printf("\n");
-      printf("final u\n");
-      for (i = 0; i < 8; i++) {
-        for (j = 0; j < 8; j++)
-          printf("%3d ", xd->plane[1].dst.buf[i * xd->plane[1].dst.stride + j]);
-        printf("\n");
-      }
-      printf("\n");
-      printf("final v\n");
-      for (i = 0; i < 8; i++) {
-        for (j = 0; j < 8; j++)
-          printf("%3d ", xd->plane[2].dst.buf[i * xd->plane[1].dst.stride + j]);
-        printf("\n");
-      }
-      fflush(stdout);
-    }
-#endif
-
-    vp9_tokenize_sb(cpi, xd, t, !output_enabled, BLOCK_SIZE_MB16X16);
-  } else {
-    // FIXME(rbultje): not tile-aware (mi - 1)
-    int mb_skip_context =
-      (mi - 1)->mbmi.mb_skip_coeff + (mi - mis)->mbmi.mb_skip_coeff;
-
-    mbmi->mb_skip_coeff = 1;
-    if (output_enabled)
-      cpi->skip_true_count[mb_skip_context]++;
-    vp9_reset_sb_tokens_context(xd, BLOCK_SIZE_MB16X16);
-  }
-
-#if CONFIG_SB8X8
-  // copy skip flag on all mb_mode_info contexts in this SB
-  // if this was a skip at this txfm size
-  for (n = 1; n < 4; n++) {
-    const int x_idx = n & 1, y_idx = n >> 1;
-    if (mi_col + x_idx < cm->mi_cols && mi_row + y_idx < cm->mi_rows)
-      mi[x_idx + y_idx * mis].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff;
-  }
-#endif
-
-  if (output_enabled) {
-    int segment_id = mbmi->segment_id;
-    if (cpi->common.txfm_mode == TX_MODE_SELECT &&
-        !(mbmi->mb_skip_coeff ||
-          vp9_segfeature_active(&x->e_mbd, segment_id, SEG_LVL_SKIP))) {
-      assert(mbmi->txfm_size <= TX_16X16);
-      if (mbmi->mode != I4X4_PRED && mbmi->mode != I8X8_PRED &&
-          mbmi->mode != SPLITMV) {
-        cpi->txfm_count_16x16p[mbmi->txfm_size]++;
-      } else if (mbmi->mode == I8X8_PRED ||
-                 (mbmi->mode == SPLITMV &&
-                  mbmi->partitioning != PARTITIONING_4X4)) {
-        cpi->txfm_count_8x8p[mbmi->txfm_size]++;
-      }
-    } else {
-#if CONFIG_SB8X8
-      int y, x;
-#endif
-      if (mbmi->mode != I4X4_PRED && mbmi->mode != I8X8_PRED &&
-          mbmi->mode != SPLITMV && cpi->common.txfm_mode >= ALLOW_16X16) {
-        mbmi->txfm_size = TX_16X16;
-      } else if (mbmi->mode != I4X4_PRED &&
-                 !(mbmi->mode == SPLITMV &&
-                   mbmi->partitioning == PARTITIONING_4X4) &&
-                 cpi->common.txfm_mode >= ALLOW_8X8) {
-        mbmi->txfm_size = TX_8X8;
-      } else {
-        mbmi->txfm_size = TX_4X4;
-      }
-
-#if CONFIG_SB8X8
-      for (y = 0; y < 2; y++) {
-        for (x = !y; x < 2; x++) {
-          if (mi_col + x < cm->mi_cols && mi_row + y < cm->mi_rows) {
-            mi[mis * y + x].mbmi.txfm_size = mbmi->txfm_size;
-          }
-        }
-      }
-#endif
-    }
-  }
-}
-#endif
-
 static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
                               int output_enabled, int mi_row, int mi_col,
                               BLOCK_SIZE_TYPE bsize) {
@@ -2431,7 +1672,6 @@
     vp9_update_zbin_extra(cpi, x);
   }
 
-#if CONFIG_SB8X8
   if (xd->mode_info_context->mbmi.mode == I4X4_PRED) {
     assert(bsize == BLOCK_SIZE_SB8X8 &&
            xd->mode_info_context->mbmi.txfm_size == TX_4X4);
@@ -2442,9 +1682,7 @@
 
     if (output_enabled)
       sum_intra_stats(cpi, x);
-  } else
-#endif
-  if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
+  } else if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
     vp9_build_intra_predictors_sby_s(&x->e_mbd, bsize);
     vp9_build_intra_predictors_sbuv_s(&x->e_mbd, bsize);
     if (output_enabled)
@@ -2479,13 +1717,10 @@
     vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
   }
 
-#if CONFIG_SB8X8
   if (xd->mode_info_context->mbmi.mode == I4X4_PRED) {
     assert(bsize == BLOCK_SIZE_SB8X8);
     vp9_tokenize_sb(cpi, &x->e_mbd, t, !output_enabled, bsize);
-  } else
-#endif
-  if (!x->skip) {
+  } else if (!x->skip) {
     vp9_encode_sb(cm, x, bsize);
     vp9_tokenize_sb(cpi, &x->e_mbd, t, !output_enabled, bsize);
   } else {
diff --git a/vp9/encoder/vp9_encodeintra.c b/vp9/encoder/vp9_encodeintra.c
index 268058e..fe5bdb3 100644
--- a/vp9/encoder/vp9_encodeintra.c
+++ b/vp9/encoder/vp9_encodeintra.c
@@ -61,11 +61,6 @@
 
   assert(ib < (1 << (bwl + bhl)));
 
-#if CONFIG_NEWBINTRAMODES
-  xd->mode_info_context->bmi[ib].as_mode.context =
-    vp9_find_bpred_context(&x->e_mbd, ib, dst, xd->plane[0].dst.stride);
-#endif
-
   vp9_intra4x4_predict(&x->e_mbd, ib, bsize,
                        xd->mode_info_context->bmi[ib].as_mode.first,
                        dst, xd->plane[0].dst.stride);
@@ -112,151 +107,3 @@
   vp9_build_intra_predictors_sbuv_s(xd, BLOCK_SIZE_MB16X16);
   vp9_encode_sbuv(cm, x, BLOCK_SIZE_MB16X16);
 }
-
-#if !CONFIG_SB8X8
-void vp9_encode_intra8x8(MACROBLOCK *x, int ib) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  uint8_t* const src =
-      raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, ib,
-                                x->plane[0].src.buf, x->plane[0].src.stride);
-  int16_t* const src_diff =
-      raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, ib,
-                                x->plane[0].src_diff);
-  int16_t* const diff =
-      raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, ib,
-                                xd->plane[0].diff);
-  uint8_t* const dst =
-      raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, ib,
-                                xd->plane[0].dst.buf, xd->plane[0].dst.stride);
-  const int iblock[4] = {0, 1, 4, 5};
-  int i;
-  TX_TYPE tx_type;
-
-  vp9_intra8x8_predict(xd, ib, xd->mode_info_context->bmi[ib].as_mode.first,
-                       dst, xd->plane[0].dst.stride);
-  // generate residual blocks
-  vp9_subtract_block(8, 8, src_diff, 16,
-                     src, x->plane[0].src.stride,
-                     dst, xd->plane[0].dst.stride);
-
-  if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
-    int idx = (ib & 0x02) ? (ib + 2) : ib;
-    int16_t* const dqcoeff = BLOCK_OFFSET(xd->plane[0].dqcoeff, idx, 16);
-    int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, idx, 16);
-
-    assert(idx < 16);
-    tx_type = get_tx_type_8x8(xd, ib);
-    if (tx_type != DCT_DCT) {
-      vp9_short_fht8x8(src_diff, coeff, 16, tx_type);
-      x->quantize_b_8x8(x, idx, tx_type, 16);
-      vp9_short_iht8x8(dqcoeff, diff, 16, tx_type);
-    } else {
-      x->fwd_txm8x8(src_diff, coeff, 32);
-      x->quantize_b_8x8(x, idx, DCT_DCT, 16);
-      vp9_short_idct8x8(dqcoeff, diff, 32);
-    }
-  } else {
-    for (i = 0; i < 4; i++) {
-      int idx = ib + iblock[i];
-      int16_t* const dqcoeff = BLOCK_OFFSET(xd->plane[0].dqcoeff, idx, 16);
-      int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, idx, 16);
-      int16_t* const src_diff =
-          raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, idx,
-                                    x->plane[0].src_diff);
-      int16_t* const diff =
-          raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, idx,
-                                    xd->plane[0].diff);
-
-      assert(idx < 16);
-      tx_type = get_tx_type_4x4(xd, ib + iblock[i]);
-      if (tx_type != DCT_DCT) {
-        vp9_short_fht4x4(src_diff, coeff, 16, tx_type);
-        x->quantize_b_4x4(x, ib + iblock[i], tx_type, 16);
-        vp9_short_iht4x4(dqcoeff, diff, 16, tx_type);
-      } else if (!(i & 1) &&
-                 get_tx_type_4x4(xd, ib + iblock[i] + 1) == DCT_DCT) {
-        x->fwd_txm8x4(src_diff, coeff, 32);
-        x->quantize_b_4x4_pair(x, ib + iblock[i], ib + iblock[i] + 1, 16);
-        vp9_inverse_transform_b_4x4(xd, xd->plane[0].eobs[ib + iblock[i]],
-                                    dqcoeff, diff, 32);
-        vp9_inverse_transform_b_4x4(xd, xd->plane[0].eobs[ib + iblock[i] + 1],
-                                    dqcoeff + 16, diff + 4, 32);
-        i++;
-      } else {
-        x->fwd_txm4x4(src_diff, coeff, 32);
-        x->quantize_b_4x4(x, ib + iblock[i], tx_type, 16);
-        vp9_inverse_transform_b_4x4(xd, xd->plane[0].eobs[ib + iblock[i]],
-                                    dqcoeff, diff, 32);
-      }
-    }
-  }
-
-  // reconstruct submacroblock
-  for (i = 0; i < 4; i++) {
-    int16_t* const diff =
-        raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, ib + iblock[i],
-                                  xd->plane[0].diff);
-    uint8_t* const dst =
-        raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, ib + iblock[i],
-                                  xd->plane[0].dst.buf,
-                                  xd->plane[0].dst.stride);
-    vp9_recon_b_c(dst, diff, 16, dst, xd->plane[0].dst.stride);
-  }
-}
-
-void vp9_encode_intra8x8mby(MACROBLOCK *x) {
-  int i;
-
-  for (i = 0; i < 4; i++)
-    vp9_encode_intra8x8(x, vp9_i8x8_block[i]);
-}
-
-static void encode_intra_uv4x4(MACROBLOCK *x, int ib, int mode) {
-  MACROBLOCKD * const xd = &x->e_mbd;
-  int16_t * const dqcoeff = MB_SUBBLOCK_FIELD(xd, dqcoeff, ib);
-  int16_t* const coeff = MB_SUBBLOCK_FIELD(x, coeff, ib);
-  const int plane = ib < 20 ? 1 : 2;
-  const int block = ib < 20 ? ib - 16 : ib - 20;
-  uint8_t* const src =
-      raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, plane, block,
-                                x->plane[plane].src.buf,
-                                x->plane[plane].src.stride);
-  int16_t* const src_diff =
-      raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, plane, block,
-                                x->plane[plane].src_diff);
-  int16_t* const diff =
-      raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, plane, block,
-                                xd->plane[plane].diff);
-  uint8_t* const dst =
-      raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, plane, block,
-                                xd->plane[plane].dst.buf,
-                                xd->plane[plane].dst.stride);
-
-  assert(ib >= 16 && ib < 24);
-  vp9_intra_uv4x4_predict(&x->e_mbd, ib, mode,
-                          dst, xd->plane[plane].dst.stride);
-
-  assert(xd->plane[1].subsampling_x == 1);
-  vp9_subtract_block(4, 4, src_diff, 8,
-                     src, x->plane[plane].src.stride,
-                     dst, xd->plane[plane].dst.stride);
-
-  x->fwd_txm4x4(src_diff, coeff, 16);
-  x->quantize_b_4x4(x, ib, DCT_DCT, 16);
-  vp9_inverse_transform_b_4x4(&x->e_mbd, xd->plane[plane].eobs[block],
-                              dqcoeff, diff, 16);
-
-  vp9_recon_uv_b_c(dst, diff, dst, xd->plane[plane].dst.stride);
-}
-
-void vp9_encode_intra8x8mbuv(MACROBLOCK *x) {
-  int i;
-
-  for (i = 0; i < 4; i++) {
-    int mode = x->e_mbd.mode_info_context->bmi[vp9_i8x8_block[i]].as_mode.first;
-
-    encode_intra_uv4x4(x, i + 16, mode);  // u
-    encode_intra_uv4x4(x, i + 20, mode);  // v
-  }
-}
-#endif
diff --git a/vp9/encoder/vp9_encodeintra.h b/vp9/encoder/vp9_encodeintra.h
index a4f4c18..c262004 100644
--- a/vp9/encoder/vp9_encodeintra.h
+++ b/vp9/encoder/vp9_encodeintra.h
@@ -17,10 +17,4 @@
 void vp9_encode_intra16x16mby(VP9_COMMON *const cm, MACROBLOCK *x);
 void vp9_encode_intra16x16mbuv(VP9_COMMON *const cm, MACROBLOCK *x);
 void vp9_encode_intra4x4mby(MACROBLOCK *mb, BLOCK_SIZE_TYPE bs);
-#if !CONFIG_SB8X8
-void vp9_encode_intra8x8mby(MACROBLOCK *x);
-void vp9_encode_intra8x8mbuv(MACROBLOCK *x);
-void vp9_encode_intra8x8(MACROBLOCK *x, int ib);
-#endif
-
 #endif  // VP9_ENCODER_VP9_ENCODEINTRA_H_
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 6e28f90..e4002d6 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -139,6 +139,7 @@
   const int ib = txfrm_block_to_raster_block(xd, bsize, plane,
                                              block, 2 * tx_size);
   const int16_t *dequant_ptr = xd->plane[plane].dequant;
+  const uint8_t * band_translate;
 
   assert((!type && !plane) || (type && plane));
   dqcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16);
@@ -149,23 +150,27 @@
       const TX_TYPE tx_type = plane == 0 ? get_tx_type_4x4(xd, ib) : DCT_DCT;
       default_eob = 16;
       scan = get_scan_4x4(tx_type);
+      band_translate = vp9_coefband_trans_4x4;
       break;
     }
     case TX_8X8: {
       const TX_TYPE tx_type = plane == 0 ? get_tx_type_8x8(xd, ib) : DCT_DCT;
       scan = get_scan_8x8(tx_type);
       default_eob = 64;
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
     }
     case TX_16X16: {
       const TX_TYPE tx_type = plane == 0 ? get_tx_type_16x16(xd, ib) : DCT_DCT;
       scan = get_scan_16x16(tx_type);
       default_eob = 256;
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
     }
     case TX_32X32:
       scan = vp9_default_zig_zag1d_32x32;
       default_eob = 1024;
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
   }
   assert(eob <= default_eob);
@@ -204,7 +209,7 @@
       t0 = (vp9_dct_value_tokens_ptr + x)->token;
       /* Consider both possible successor states. */
       if (next < default_eob) {
-        band = get_coef_band(scan, tx_size, i + 1);
+        band = get_coef_band(band_translate, i + 1);
         pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache,
                                        pad, default_eob);
         rate0 +=
@@ -254,7 +259,7 @@
         t0 = t1 = (vp9_dct_value_tokens_ptr + x)->token;
       }
       if (next < default_eob) {
-        band = get_coef_band(scan, tx_size, i + 1);
+        band = get_coef_band(band_translate, i + 1);
         if (t0 != DCT_EOB_TOKEN) {
           pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache,
                                          pad, default_eob);
@@ -291,7 +296,7 @@
      *  add a new trellis node, but we do need to update the costs.
      */
     else {
-      band = get_coef_band(scan, tx_size, i + 1);
+      band = get_coef_band(band_translate, i + 1);
       t0 = tokens[next][0].token;
       t1 = tokens[next][1].token;
       /* Update the cost of each path if we're past the EOB token. */
@@ -310,7 +315,7 @@
   }
 
   /* Now pick the best path through the whole trellis. */
-  band = get_coef_band(scan, tx_size, i + 1);
+  band = get_coef_band(band_translate, i + 1);
   pt = combine_entropy_contexts(*a, *l);
   rate0 = tokens[next][0].rate;
   rate1 = tokens[next][1].rate;
@@ -378,7 +383,8 @@
     const struct macroblockd_plane* const plane = &xd->plane[p];
     const int bwl = b_width_log2(bsize) - plane->subsampling_x;
     const int bhl = b_height_log2(bsize) - plane->subsampling_y;
-    const TX_SIZE tx_size = tx_size_for_plane(xd, bsize, p);
+    const TX_SIZE tx_size = p ? get_uv_tx_size(xd)
+                              : xd->mode_info_context->mbmi.txfm_size;
     int i, j;
 
     for (i = 0; i < 1 << bwl; i += 1 << tx_size) {
@@ -404,9 +410,6 @@
   struct optimize_block_args arg = {cm, x, &ctx};
   vp9_optimize_init(&x->e_mbd, bsize, &ctx);
   foreach_transformed_block_in_plane(&x->e_mbd, bsize, 0,
-#if !CONFIG_SB8X8
-  0,
-#endif
                                      optimize_block, &arg);
 }
 
@@ -551,9 +554,6 @@
   struct encode_b_args arg = {cm, x, NULL};
 
   foreach_transformed_block_in_plane(xd, bsize, 0,
-#if !CONFIG_SB8X8
-                                     0,
-#endif
                                      xform_quant, &arg);
 }
 
@@ -576,9 +576,6 @@
     vp9_optimize_init(xd, bsize, &ctx);
 
   foreach_transformed_block_in_plane(xd, bsize, 0,
-#if !CONFIG_SB8X8
-                                     0,
-#endif
                                      encode_block, &arg);
 
   vp9_recon_sby(xd, bsize);
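Note on the vp9_encodemb.c hunks above: the trellis optimizer now chooses a coefficient-band translation table once per transform size (vp9_coefband_trans_4x4 for TX_4X4, vp9_coefband_trans_8x8plus for the larger sizes) and hands it to get_coef_band(), rather than re-deriving the band from the scan order and tx_size at every coefficient. A minimal sketch of what the lookup is assumed to reduce to; the helper name, parameters and bounds handling here are illustrative, not the libvpx definitions:

    /* Sketch only: the band of a coefficient is read from a per-tx-size
     * table indexed by scan position; positions beyond the table length are
     * assumed to fall into the last band. */
    static int get_coef_band_sketch(const uint8_t *band_translate,
                                    int band_table_len, int num_bands,
                                    int coef_index) {
      return coef_index >= band_table_len ? num_bands - 1
                                          : band_translate[coef_index];
    }
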
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 4d28f1b..ddcf849 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -14,7 +14,6 @@
 #include "vp9/encoder/vp9_onyx_int.h"
 #include "vp9/encoder/vp9_variance.h"
 #include "vp9/encoder/vp9_encodeintra.h"
-#include "vp9/common/vp9_setupintrarecon.h"
 #include "vp9/encoder/vp9_mcomp.h"
 #include "vp9/encoder/vp9_firstpass.h"
 #include "vpx_scale/vpx_scale.h"
@@ -23,7 +22,7 @@
 #include "vp9/common/vp9_extend.h"
 #include "vp9/common/vp9_systemdependent.h"
 #include "vpx_mem/vpx_mem.h"
-#include "vp9/common/vp9_swapyv12buffer.h"
+#include "vpx_scale/yv12config.h"
 #include <stdio.h>
 #include "vp9/encoder/vp9_quantize.h"
 #include "vp9/encoder/vp9_rdopt.h"
@@ -48,11 +47,17 @@
 #define KF_MB_INTRA_MIN 150
 #define GF_MB_INTRA_MIN 100
 
-#define DOUBLE_DIVIDE_CHECK(X) ((X)<0?(X)-.000001:(X)+.000001)
+#define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x) - 0.000001 : (x) + 0.000001)
 
 #define POW1 (double)cpi->oxcf.two_pass_vbrbias/100.0
 #define POW2 (double)cpi->oxcf.two_pass_vbrbias/100.0
 
+static void swap_yv12(YV12_BUFFER_CONFIG *a, YV12_BUFFER_CONFIG *b) {
+  YV12_BUFFER_CONFIG temp = *a;
+  *a = *b;
+  *b = temp;
+}
+
 static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame);
 
 static int select_cq_level(int qindex) {
@@ -73,8 +78,8 @@
 
 
 // Resets the first pass file to the given position using a relative seek from the current position
-static void reset_fpf_position(VP9_COMP *cpi, FIRSTPASS_STATS *Position) {
-  cpi->twopass.stats_in = Position;
+static void reset_fpf_position(VP9_COMP *cpi, FIRSTPASS_STATS *position) {
+  cpi->twopass.stats_in = position;
 }
 
 static int lookup_next_frame_stats(VP9_COMP *cpi, FIRSTPASS_STATS *next_frame) {
@@ -247,17 +252,11 @@
 
 // Calculate a modified Error used in distributing bits between easier and harder frames
 static double calculate_modified_err(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
-  double av_err = (cpi->twopass.total_stats->ssim_weighted_pred_err /
-                   cpi->twopass.total_stats->count);
-  double this_err = this_frame->ssim_weighted_pred_err;
-  double modified_err;
-
-  if (this_err > av_err)
-    modified_err = av_err * pow((this_err / DOUBLE_DIVIDE_CHECK(av_err)), POW1);
-  else
-    modified_err = av_err * pow((this_err / DOUBLE_DIVIDE_CHECK(av_err)), POW2);
-
-  return modified_err;
+  const FIRSTPASS_STATS *const stats = &cpi->twopass.total_stats;
+  const double av_err = stats->ssim_weighted_pred_err / stats->count;
+  const double this_err = this_frame->ssim_weighted_pred_err;
+  return av_err * pow(this_err / DOUBLE_DIVIDE_CHECK(av_err),
+                      this_err > av_err ? POW1 : POW2);
 }
 
 static const double weight_table[256] = {
@@ -323,28 +322,22 @@
 static int frame_max_bits(VP9_COMP *cpi) {
   // Max allocation for a single frame based on the max section guidelines
   // passed in and how many bits are left.
-  int max_bits;
-
   // For VBR base this on the bits and frames left plus the
   // two_pass_vbrmax_section rate passed in by the user.
-  max_bits = (int) (((double) cpi->twopass.bits_left
-      / (cpi->twopass.total_stats->count - (double) cpi->common
-             .current_video_frame))
-                    * ((double) cpi->oxcf.two_pass_vbrmax_section / 100.0));
+  const double max_bits = (1.0 * cpi->twopass.bits_left /
+      (cpi->twopass.total_stats.count - cpi->common.current_video_frame)) *
+      (cpi->oxcf.two_pass_vbrmax_section / 100.0);
 
   // Trap case where we are out of bits.
-  if (max_bits < 0)
-    max_bits = 0;
-
-  return max_bits;
+  return MAX((int)max_bits, 0);
 }
 
 void vp9_init_first_pass(VP9_COMP *cpi) {
-  zero_stats(cpi->twopass.total_stats);
+  zero_stats(&cpi->twopass.total_stats);
 }
 
 void vp9_end_first_pass(VP9_COMP *cpi) {
-  output_stats(cpi, cpi->output_pkt_list, cpi->twopass.total_stats);
+  output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.total_stats);
 }
 
 static void zz_motion_search(VP9_COMP *cpi, MACROBLOCK *x, YV12_BUFFER_CONFIG *recon_buffer, int *best_motion_err, int recon_yoffset) {
@@ -445,13 +438,13 @@
   MACROBLOCKD *const xd = &x->e_mbd;
 
   int recon_yoffset, recon_uvoffset;
-  YV12_BUFFER_CONFIG *lst_yv12 =
-      &cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]];
-  YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx];
-  YV12_BUFFER_CONFIG *gld_yv12 =
-      &cm->yv12_fb[cm->ref_frame_map[cpi->gld_fb_idx]];
-  int recon_y_stride = lst_yv12->y_stride;
-  int recon_uv_stride = lst_yv12->uv_stride;
+  const int lst_yv12_idx = cm->ref_frame_map[cpi->lst_fb_idx];
+  const int gld_yv12_idx = cm->ref_frame_map[cpi->gld_fb_idx];
+  YV12_BUFFER_CONFIG *const lst_yv12 = &cm->yv12_fb[lst_yv12_idx];
+  YV12_BUFFER_CONFIG *const new_yv12 = &cm->yv12_fb[cm->new_fb_idx];
+  YV12_BUFFER_CONFIG *const gld_yv12 = &cm->yv12_fb[gld_yv12_idx];
+  const int recon_y_stride = lst_yv12->y_stride;
+  const int recon_uv_stride = lst_yv12->uv_stride;
   int64_t intra_error = 0;
   int64_t coded_error = 0;
   int64_t sr_coded_error = 0;
@@ -484,10 +477,8 @@
 
   vp9_build_block_offsets(x);
 
-  vp9_setup_block_dptrs(&x->e_mbd);
+  vp9_setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
 
-  // set up frame new frame for intra coded blocks
-  vp9_setup_intra_recon(new_yv12);
   vp9_frame_init_quantizer(cpi);
 
   // Initialise the MV cost table to the defaults
@@ -521,9 +512,9 @@
       int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
 
       set_mi_row_col(cm, xd,
-                     mb_row << CONFIG_SB8X8,
+                     mb_row << 1,
                      1 << mi_height_log2(BLOCK_SIZE_MB16X16),
-                     mb_col << CONFIG_SB8X8,
+                     mb_col << 1,
                      1 << mi_height_log2(BLOCK_SIZE_MB16X16));
 
       xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset;
@@ -626,10 +617,10 @@
           this_error = motion_error;
           vp9_set_mbmode_and_mvs(x, NEWMV, &mv);
           xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-          vp9_build_inter_predictors_sby(xd, mb_row << CONFIG_SB8X8,
-                                         mb_col << CONFIG_SB8X8,
+          vp9_build_inter_predictors_sby(xd, mb_row << 1,
+                                         mb_col << 1,
                                          BLOCK_SIZE_MB16X16);
-          vp9_encode_sb(cm, x, BLOCK_SIZE_MB16X16);
+          vp9_encode_sby(cm, x, BLOCK_SIZE_MB16X16);
           sum_mvr += mv.as_mv.row;
           sum_mvr_abs += abs(mv.as_mv.row);
           sum_mvc += mv.as_mv.col;
@@ -754,20 +745,20 @@
                             - cpi->source->ts_start);
 
     // don't want to do output stats with a stack variable!
-    memcpy(cpi->twopass.this_frame_stats,
+    memcpy(&cpi->twopass.this_frame_stats,
            &fps,
            sizeof(FIRSTPASS_STATS));
-    output_stats(cpi, cpi->output_pkt_list, cpi->twopass.this_frame_stats);
-    accumulate_stats(cpi->twopass.total_stats, &fps);
+    output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.this_frame_stats);
+    accumulate_stats(&cpi->twopass.total_stats, &fps);
   }
 
   // Copy the previous Last Frame back into gf and arf buffers if
   // the prediction is good enough... but also don't allow it to lag too far
   if ((cpi->twopass.sr_update_lag > 3) ||
       ((cm->current_video_frame > 0) &&
-       (cpi->twopass.this_frame_stats->pcnt_inter > 0.20) &&
-       ((cpi->twopass.this_frame_stats->intra_error /
-         DOUBLE_DIVIDE_CHECK(cpi->twopass.this_frame_stats->coded_error)) >
+       (cpi->twopass.this_frame_stats.pcnt_inter > 0.20) &&
+       ((cpi->twopass.this_frame_stats.intra_error /
+         DOUBLE_DIVIDE_CHECK(cpi->twopass.this_frame_stats.coded_error)) >
         2.0))) {
     vp8_yv12_copy_frame(lst_yv12, gld_yv12);
     cpi->twopass.sr_update_lag = 1;
@@ -775,14 +766,13 @@
     cpi->twopass.sr_update_lag++;
 
   // swap frame pointers so last frame refers to the frame we just compressed
-  vp9_swap_yv12_buffer(lst_yv12, new_yv12);
+  swap_yv12(lst_yv12, new_yv12);
+
   vp8_yv12_extend_frame_borders(lst_yv12);
 
   // Special case for the first frame. Copy into the GF buffer as a second reference.
-  if (cm->current_video_frame == 0) {
+  if (cm->current_video_frame == 0)
     vp8_yv12_copy_frame(lst_yv12, gld_yv12);
-  }
-
 
   // use this to see what the first pass reconstruction looks like
   if (0) {
@@ -852,26 +842,18 @@
                                      double err_divisor,
                                      double pt_low,
                                      double pt_high,
-                                     int Q) {
-  double power_term;
-  double error_term = err_per_mb / err_divisor;
-  double correction_factor;
+                                     int q) {
+  const double error_term = err_per_mb / err_divisor;
 
   // Adjustment based on actual quantizer to power term.
-  power_term = (vp9_convert_qindex_to_q(Q) * 0.01) + pt_low;
-  power_term = (power_term > pt_high) ? pt_high : power_term;
+  const double power_term = MIN(vp9_convert_qindex_to_q(q) * 0.01 + pt_low,
+                                pt_high);
 
   // Calculate correction factor
   if (power_term < 1.0)
     assert(error_term >= 0.0);
-  correction_factor = pow(error_term, power_term);
 
-  // Clip range
-  correction_factor =
-    (correction_factor < 0.05)
-    ? 0.05 : (correction_factor > 5.0) ? 5.0 : correction_factor;
-
-  return correction_factor;
+  return fclamp(pow(error_term, power_term), 0.05, 5.0);
 }
 
 // Given a current maxQ value sets a range for future values.
@@ -880,10 +862,8 @@
 // (now uses the actual quantizer) but has not been tuned.
 static void adjust_maxq_qrange(VP9_COMP *cpi) {
   int i;
-  double q;
-
   // Set the max corresponding to cpi->avg_q * 2.0
-  q = cpi->avg_q * 2.0;
+  double q = cpi->avg_q * 2.0;
   cpi->twopass.maxq_max_limit = cpi->worst_quality;
   for (i = cpi->best_quality; i <= cpi->worst_quality; i++) {
     cpi->twopass.maxq_max_limit = i;
@@ -904,12 +884,11 @@
 static int estimate_max_q(VP9_COMP *cpi,
                           FIRSTPASS_STATS *fpstats,
                           int section_target_bandwitdh) {
-  int Q;
+  int q;
   int num_mbs = cpi->common.MBs;
   int target_norm_bits_per_mb;
 
-  double section_err = (fpstats->coded_error / fpstats->count);
-  double sr_err_diff;
+  double section_err = fpstats->coded_error / fpstats->count;
   double sr_correction;
   double err_per_mb = section_err / num_mbs;
   double err_correction_factor;
@@ -918,92 +897,74 @@
   if (section_target_bandwitdh <= 0)
     return cpi->twopass.maxq_max_limit;          // Highest value allowed
 
-  target_norm_bits_per_mb =
-    (section_target_bandwitdh < (1 << 20))
-    ? (512 * section_target_bandwitdh) / num_mbs
-    : 512 * (section_target_bandwitdh / num_mbs);
+  target_norm_bits_per_mb = section_target_bandwitdh < (1 << 20)
+                              ? (512 * section_target_bandwitdh) / num_mbs
+                              : 512 * (section_target_bandwitdh / num_mbs);
 
   // Look at the drop in prediction quality between the last frame
   // and the GF buffer (which contained an older frame).
   if (fpstats->sr_coded_error > fpstats->coded_error) {
-    sr_err_diff =
-      (fpstats->sr_coded_error - fpstats->coded_error) /
-      (fpstats->count * cpi->common.MBs);
-    sr_correction = (sr_err_diff / 32.0);
-    sr_correction = pow(sr_correction, 0.25);
-    if (sr_correction < 0.75)
-      sr_correction = 0.75;
-    else if (sr_correction > 1.25)
-      sr_correction = 1.25;
+    double sr_err_diff = (fpstats->sr_coded_error - fpstats->coded_error) /
+                             (fpstats->count * cpi->common.MBs);
+    sr_correction = fclamp(pow(sr_err_diff / 32.0, 0.25), 0.75, 1.25);
   } else {
     sr_correction = 0.75;
   }
 
   // Calculate a corrective factor based on a rolling ratio of bits spent
   // vs target bits
-  if ((cpi->rolling_target_bits > 0) &&
-      (cpi->active_worst_quality < cpi->worst_quality)) {
-    double rolling_ratio;
-
-    rolling_ratio = (double)cpi->rolling_actual_bits /
-                    (double)cpi->rolling_target_bits;
+  if (cpi->rolling_target_bits > 0 &&
+      cpi->active_worst_quality < cpi->worst_quality) {
+    double rolling_ratio = (double)cpi->rolling_actual_bits /
+                               (double)cpi->rolling_target_bits;
 
     if (rolling_ratio < 0.95)
       cpi->twopass.est_max_qcorrection_factor -= 0.005;
     else if (rolling_ratio > 1.05)
       cpi->twopass.est_max_qcorrection_factor += 0.005;
 
-    cpi->twopass.est_max_qcorrection_factor =
-      (cpi->twopass.est_max_qcorrection_factor < 0.1)
-      ? 0.1
-      : (cpi->twopass.est_max_qcorrection_factor > 10.0)
-      ? 10.0 : cpi->twopass.est_max_qcorrection_factor;
+    cpi->twopass.est_max_qcorrection_factor = fclamp(
+        cpi->twopass.est_max_qcorrection_factor, 0.1, 10.0);
   }
 
   // Corrections for higher compression speed settings
   // (reduced compression expected)
-  if (cpi->compressor_speed == 1) {
-    if (cpi->oxcf.cpu_used <= 5)
-      speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04);
-    else
-      speed_correction = 1.25;
-  }
+  if (cpi->compressor_speed == 1)
+    speed_correction = cpi->oxcf.cpu_used <= 5 ?
+                          1.04 + (cpi->oxcf.cpu_used * 0.04) :
+                          1.25;
 
   // Try and pick a max Q that will be high enough to encode the
   // content at the given rate.
-  for (Q = cpi->twopass.maxq_min_limit; Q < cpi->twopass.maxq_max_limit; Q++) {
+  for (q = cpi->twopass.maxq_min_limit; q < cpi->twopass.maxq_max_limit; q++) {
     int bits_per_mb_at_this_q;
 
-    err_correction_factor =
-      calc_correction_factor(err_per_mb, ERR_DIVISOR, 0.4, 0.90, Q) *
-      sr_correction * speed_correction *
-      cpi->twopass.est_max_qcorrection_factor;
+    err_correction_factor = calc_correction_factor(err_per_mb,
+                                                   ERR_DIVISOR, 0.4, 0.90, q) *
+                                sr_correction * speed_correction *
+                                cpi->twopass.est_max_qcorrection_factor;
 
-
-    bits_per_mb_at_this_q =
-      vp9_bits_per_mb(INTER_FRAME, Q, err_correction_factor);
+    bits_per_mb_at_this_q = vp9_bits_per_mb(INTER_FRAME, q,
+                                            err_correction_factor);
 
     if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
       break;
   }
 
   // Restriction on active max q for constrained quality mode.
-  if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
-      (Q < cpi->cq_target_quality)) {
-    Q = cpi->cq_target_quality;
-  }
+  if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY &&
+      q < cpi->cq_target_quality)
+    q = cpi->cq_target_quality;
 
   // Adjust maxq_min_limit and maxq_max_limit limits based on
   // average q observed in clip for non kf/gf/arf frames
   // Give average a chance to settle though.
   // PGW TODO.. This code is broken for the extended Q range
-  if ((cpi->ni_frames >
-       ((int)cpi->twopass.total_stats->count >> 8)) &&
-      (cpi->ni_frames > 25)) {
+  if (cpi->ni_frames > ((int)cpi->twopass.total_stats.count >> 8) &&
+      cpi->ni_frames > 25)
     adjust_maxq_qrange(cpi);
-  }
 
-  return Q;
+  return q;
 }
 
 // For cq mode estimate a cq level that matches the observed
@@ -1011,7 +972,7 @@
 static int estimate_cq(VP9_COMP *cpi,
                        FIRSTPASS_STATS *fpstats,
                        int section_target_bandwitdh) {
-  int Q;
+  int q;
   int num_mbs = cpi->common.MBs;
   int target_norm_bits_per_mb;
 
@@ -1055,36 +1016,36 @@
   }
 
   // II ratio correction factor for clip as a whole
-  clip_iiratio = cpi->twopass.total_stats->intra_error /
-                 DOUBLE_DIVIDE_CHECK(cpi->twopass.total_stats->coded_error);
+  clip_iiratio = cpi->twopass.total_stats.intra_error /
+                 DOUBLE_DIVIDE_CHECK(cpi->twopass.total_stats.coded_error);
   clip_iifactor = 1.0 - ((clip_iiratio - 10.0) * 0.025);
   if (clip_iifactor < 0.80)
     clip_iifactor = 0.80;
 
   // Try and pick a Q that can encode the content at the given rate.
-  for (Q = 0; Q < MAXQ; Q++) {
+  for (q = 0; q < MAXQ; q++) {
     int bits_per_mb_at_this_q;
 
     // Error per MB based correction factor
     err_correction_factor =
-      calc_correction_factor(err_per_mb, 100.0, 0.4, 0.90, Q) *
+      calc_correction_factor(err_per_mb, 100.0, 0.4, 0.90, q) *
       sr_correction * speed_correction * clip_iifactor;
 
     bits_per_mb_at_this_q =
-      vp9_bits_per_mb(INTER_FRAME, Q, err_correction_factor);
+      vp9_bits_per_mb(INTER_FRAME, q, err_correction_factor);
 
     if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
       break;
   }
 
   // Clip value to range "best allowed to (worst allowed - 1)"
-  Q = select_cq_level(Q);
-  if (Q >= cpi->worst_quality)
-    Q = cpi->worst_quality - 1;
-  if (Q < cpi->best_quality)
-    Q = cpi->best_quality;
+  q = select_cq_level(q);
+  if (q >= cpi->worst_quality)
+    q = cpi->worst_quality - 1;
+  if (q < cpi->best_quality)
+    q = cpi->best_quality;
 
-  return Q;
+  return q;
 }
 
 
@@ -1101,28 +1062,27 @@
   if (two_pass_min_rate < lower_bounds_min_rate)
     two_pass_min_rate = lower_bounds_min_rate;
 
-  zero_stats(cpi->twopass.total_stats);
-  zero_stats(cpi->twopass.total_left_stats);
+  zero_stats(&cpi->twopass.total_stats);
+  zero_stats(&cpi->twopass.total_left_stats);
 
   if (!cpi->twopass.stats_in_end)
     return;
 
-  *cpi->twopass.total_stats = *cpi->twopass.stats_in_end;
-  *cpi->twopass.total_left_stats = *cpi->twopass.total_stats;
+  cpi->twopass.total_stats = *cpi->twopass.stats_in_end;
+  cpi->twopass.total_left_stats = cpi->twopass.total_stats;
 
   // each frame can have a different duration, as the frame rate in the source
   // isn't guaranteed to be constant.   The frame rate prior to the first frame
   // encoded in the second pass is a guess.  However the sum duration is not.
   // It's calculated based on the actual durations of all frames from the first
   // pass.
-  vp9_new_frame_rate(cpi,
-                     10000000.0 * cpi->twopass.total_stats->count /
-                     cpi->twopass.total_stats->duration);
+  vp9_new_frame_rate(cpi, 10000000.0 * cpi->twopass.total_stats.count /
+                       cpi->twopass.total_stats.duration);
 
   cpi->output_frame_rate = cpi->oxcf.frame_rate;
-  cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats->duration *
+  cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats.duration *
                                      cpi->oxcf.target_bandwidth / 10000000.0);
-  cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats->duration *
+  cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats.duration *
                                       two_pass_min_rate / 10000000.0);
 
   // Calculate a minimum intra value to be used in determining the IIratio
@@ -1148,7 +1108,8 @@
       sum_iiratio += IIRatio;
     }
 
-    cpi->twopass.avg_iiratio = sum_iiratio / DOUBLE_DIVIDE_CHECK((double)cpi->twopass.total_stats->count);
+    cpi->twopass.avg_iiratio = sum_iiratio /
+        DOUBLE_DIVIDE_CHECK((double)cpi->twopass.total_stats.count);
 
     // Reset file position
     reset_fpf_position(cpi, start_pos);
@@ -1188,9 +1149,8 @@
 
   // Look at the observed drop in prediction quality between the last frame
   // and the GF buffer (which contains an older frame).
-  mb_sr_err_diff =
-    (next_frame->sr_coded_error - next_frame->coded_error) /
-    (cpi->common.MBs);
+  mb_sr_err_diff = (next_frame->sr_coded_error - next_frame->coded_error) /
+                   cpi->common.MBs;
   if (mb_sr_err_diff <= 512.0) {
     second_ref_decay = 1.0 - (mb_sr_err_diff / 512.0);
     second_ref_decay = pow(second_ref_decay, 0.5);
@@ -1222,9 +1182,9 @@
   // Break clause to detect very still sections after motion
   // For example a static image after a fade or other transition
   // instead of a clean scene cut.
-  if ((frame_interval > MIN_GF_INTERVAL) &&
-      (loop_decay_rate >= 0.999) &&
-      (last_decay_rate < 0.9)) {
+  if (frame_interval > MIN_GF_INTERVAL &&
+      loop_decay_rate >= 0.999 &&
+      last_decay_rate < 0.9) {
     int j;
     FIRSTPASS_STATS *position = cpi->twopass.stats_in;
     FIRSTPASS_STATS tmp_next_frame;
@@ -1268,10 +1228,9 @@
     // are reasonably well predicted by an earlier (pre flash) frame.
     // The recovery after a flash is indicated by a high pcnt_second_ref
     // compared to pcnt_inter.
-    if ((next_frame.pcnt_second_ref > next_frame.pcnt_inter) &&
-        (next_frame.pcnt_second_ref >= 0.5)) {
+    if (next_frame.pcnt_second_ref > next_frame.pcnt_inter &&
+        next_frame.pcnt_second_ref >= 0.5)
       flash_detected = 1;
-    }
   }
 
   return flash_detected;
@@ -1353,13 +1312,9 @@
   return frame_boost;
 }
 
-static int calc_arf_boost(
-  VP9_COMP *cpi,
-  int offset,
-  int f_frames,
-  int b_frames,
-  int *f_boost,
-  int *b_boost) {
+static int calc_arf_boost(VP9_COMP *cpi, int offset,
+                          int f_frames, int b_frames,
+                          int *f_boost, int *b_boost) {
   FIRSTPASS_STATS this_frame;
 
   int i;
@@ -1389,8 +1344,7 @@
 
     // Cumulative effect of prediction quality decay
     if (!flash_detected) {
-      decay_accumulator =
-        decay_accumulator * get_prediction_decay_rate(cpi, &this_frame);
+      decay_accumulator *= get_prediction_decay_rate(cpi, &this_frame);
       decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
                           ? MIN_DECAY_FACTOR : decay_accumulator;
     }
@@ -1426,10 +1380,9 @@
 
     // Cumulative effect of prediction quality decay
     if (!flash_detected) {
-      decay_accumulator =
-        decay_accumulator * get_prediction_decay_rate(cpi, &this_frame);
+      decay_accumulator *= get_prediction_decay_rate(cpi, &this_frame);
       decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
-                          ? MIN_DECAY_FACTOR : decay_accumulator;
+                              ? MIN_DECAY_FACTOR : decay_accumulator;
     }
 
     boost_score += (decay_accumulator *
@@ -1831,7 +1784,7 @@
   // where cpi->twopass.kf_group_bits is tied to cpi->twopass.bits_left.
   // This is also important for short clips where there may only be one
   // key frame.
-  if (cpi->twopass.frames_to_key >= (int)(cpi->twopass.total_stats->count -
+  if (cpi->twopass.frames_to_key >= (int)(cpi->twopass.total_stats.count -
                                           cpi->common.current_video_frame)) {
     cpi->twopass.kf_group_bits =
       (cpi->twopass.bits_left > 0) ? cpi->twopass.bits_left : 0;
@@ -1868,26 +1821,20 @@
   for (i = 0;
       i <= (cpi->source_alt_ref_pending && cpi->common.frame_type != KEY_FRAME);
       ++i) {
-    int boost;
     int allocation_chunks;
-    int Q =
-        (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
+    int q = cpi->oxcf.fixed_q < 0 ? cpi->last_q[INTER_FRAME]
+                                  : cpi->oxcf.fixed_q;
     int gf_bits;
 
-    boost = (cpi->gfu_boost * vp9_gfboost_qadjust(Q)) / 100;
+    int boost = (cpi->gfu_boost * vp9_gfboost_qadjust(q)) / 100;
 
     // Set max and minimum boost and hence minimum allocation
-    if (boost > ((cpi->baseline_gf_interval + 1) * 200))
-      boost = ((cpi->baseline_gf_interval + 1) * 200);
-    else if (boost < 125)
-      boost = 125;
+    boost = clamp(boost, 125, (cpi->baseline_gf_interval + 1) * 200);
 
     if (cpi->source_alt_ref_pending && i == 0)
-      allocation_chunks =
-        ((cpi->baseline_gf_interval + 1) * 100) + boost;
+      allocation_chunks = ((cpi->baseline_gf_interval + 1) * 100) + boost;
     else
-      allocation_chunks =
-        (cpi->baseline_gf_interval * 100) + (boost - 100);
+      allocation_chunks = (cpi->baseline_gf_interval * 100) + (boost - 100);
 
     // Prevent overflow
     if (boost > 1023) {
@@ -1898,41 +1845,34 @@
 
     // Calculate the number of bits to be spent on the gf or arf based on
     // the boost number
-    gf_bits = (int)((double)boost *
-                    (cpi->twopass.gf_group_bits /
-                     (double)allocation_chunks));
+    gf_bits = (int)((double)boost * (cpi->twopass.gf_group_bits /
+                                       (double)allocation_chunks));
 
     // If the frame that is to be boosted is simpler than the average for
     // the gf/arf group then use an alternative calculation
     // based on the error score of the frame itself
     if (mod_frame_err < gf_group_err / (double)cpi->baseline_gf_interval) {
-      double  alt_gf_grp_bits;
-      int     alt_gf_bits;
-
-      alt_gf_grp_bits =
+      double alt_gf_grp_bits =
         (double)cpi->twopass.kf_group_bits  *
         (mod_frame_err * (double)cpi->baseline_gf_interval) /
         DOUBLE_DIVIDE_CHECK(cpi->twopass.kf_group_error_left);
 
-      alt_gf_bits = (int)((double)boost * (alt_gf_grp_bits /
+      int alt_gf_bits = (int)((double)boost * (alt_gf_grp_bits /
                                            (double)allocation_chunks));
 
-      if (gf_bits > alt_gf_bits) {
+      if (gf_bits > alt_gf_bits)
         gf_bits = alt_gf_bits;
-      }
     }
     // Else if it is harder than other frames in the group make sure it at
     // least receives an allocation in keeping with its relative error
     // score, otherwise it may be worse off than an "un-boosted" frame
     else {
-      int alt_gf_bits =
-        (int)((double)cpi->twopass.kf_group_bits *
-              mod_frame_err /
-              DOUBLE_DIVIDE_CHECK(cpi->twopass.kf_group_error_left));
+      int alt_gf_bits = (int)((double)cpi->twopass.kf_group_bits *
+                        mod_frame_err /
+                        DOUBLE_DIVIDE_CHECK(cpi->twopass.kf_group_error_left));
 
-      if (alt_gf_bits > gf_bits) {
+      if (alt_gf_bits > gf_bits)
         gf_bits = alt_gf_bits;
-      }
     }
 
     // Don't allow a negative value for gf_bits
@@ -1980,14 +1920,11 @@
     // despite (MIN_GF_INTERVAL) and would cause a divide by 0 in the
     // calculation of alt_extra_bits.
     if (cpi->baseline_gf_interval >= 3) {
-      int boost = (cpi->source_alt_ref_pending)
-                  ? b_boost : cpi->gfu_boost;
+      const int boost = cpi->source_alt_ref_pending ? b_boost : cpi->gfu_boost;
 
       if (boost >= 150) {
-        int pct_extra;
         int alt_extra_bits;
-
-        pct_extra = (boost - 100) / 50;
+        int pct_extra = (boost - 100) / 50;
         pct_extra = (pct_extra > 20) ? 20 : pct_extra;
 
         alt_extra_bits = (int)((cpi->twopass.gf_group_bits * pct_extra) / 100);
@@ -2068,38 +2005,26 @@
 // Make a damped adjustment to the active max q.
 static int adjust_active_maxq(int old_maxqi, int new_maxqi) {
   int i;
-  int ret_val = new_maxqi;
-  double old_q;
-  double new_q;
-  double target_q;
-
-  old_q = vp9_convert_qindex_to_q(old_maxqi);
-  new_q = vp9_convert_qindex_to_q(new_maxqi);
-
-  target_q = ((old_q * 7.0) + new_q) / 8.0;
+  const double old_q = vp9_convert_qindex_to_q(old_maxqi);
+  const double new_q = vp9_convert_qindex_to_q(new_maxqi);
+  const double target_q = ((old_q * 7.0) + new_q) / 8.0;
 
   if (target_q > old_q) {
-    for (i = old_maxqi; i <= new_maxqi; i++) {
-      if (vp9_convert_qindex_to_q(i) >= target_q) {
-        ret_val = i;
-        break;
-      }
-    }
+    for (i = old_maxqi; i <= new_maxqi; i++)
+      if (vp9_convert_qindex_to_q(i) >= target_q)
+        return i;
   } else {
-    for (i = old_maxqi; i >= new_maxqi; i--) {
-      if (vp9_convert_qindex_to_q(i) <= target_q) {
-        ret_val = i;
-        break;
-      }
-    }
+    for (i = old_maxqi; i >= new_maxqi; i--)
+      if (vp9_convert_qindex_to_q(i) <= target_q)
+        return i;
   }
 
-  return ret_val;
+  return new_maxqi;
 }
 
 void vp9_second_pass(VP9_COMP *cpi) {
   int tmp_q;
-  int frames_left = (int)(cpi->twopass.total_stats->count -
+  int frames_left = (int)(cpi->twopass.total_stats.count -
                           cpi->common.current_video_frame);
 
   FIRSTPASS_STATS this_frame;
@@ -2108,9 +2033,8 @@
   double this_frame_intra_error;
   double this_frame_coded_error;
 
-  if (!cpi->twopass.stats_in) {
+  if (!cpi->twopass.stats_in)
     return;
-  }
 
   vp9_clear_system_state();
 
@@ -2120,12 +2044,8 @@
 
     // Set a cq_level in constrained quality mode.
     if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
-      int est_cq;
-
-      est_cq =
-        estimate_cq(cpi,
-                    cpi->twopass.total_left_stats,
-                    (int)(cpi->twopass.bits_left / frames_left));
+      int est_cq = estimate_cq(cpi, &cpi->twopass.total_left_stats,
+                               (int)(cpi->twopass.bits_left / frames_left));
 
       cpi->cq_target_quality = cpi->oxcf.cq_level;
       if (est_cq > cpi->cq_target_quality)
@@ -2136,14 +2056,12 @@
     cpi->twopass.maxq_max_limit = cpi->worst_quality;
     cpi->twopass.maxq_min_limit = cpi->best_quality;
 
-    tmp_q = estimate_max_q(
-              cpi,
-              cpi->twopass.total_left_stats,
-              (int)(cpi->twopass.bits_left / frames_left));
+    tmp_q = estimate_max_q(cpi, &cpi->twopass.total_left_stats,
+                           (int)(cpi->twopass.bits_left / frames_left));
 
-    cpi->active_worst_quality         = tmp_q;
-    cpi->ni_av_qi                     = tmp_q;
-    cpi->avg_q                        = vp9_convert_qindex_to_q(tmp_q);
+    cpi->active_worst_quality = tmp_q;
+    cpi->ni_av_qi = tmp_q;
+    cpi->avg_q = vp9_convert_qindex_to_q(tmp_q);
 
 #ifndef ONE_SHOT_Q_ESTIMATE
     // Limit the maxq value returned subsequently.
@@ -2161,15 +2079,15 @@
   // radical adjustments to the allowed quantizer range just to use up a
   // few surplus bits or get beneath the target rate.
   else if ((cpi->common.current_video_frame <
-            (((unsigned int)cpi->twopass.total_stats->count * 255) >> 8)) &&
+            (((unsigned int)cpi->twopass.total_stats.count * 255) >> 8)) &&
            ((cpi->common.current_video_frame + cpi->baseline_gf_interval) <
-            (unsigned int)cpi->twopass.total_stats->count)) {
+            (unsigned int)cpi->twopass.total_stats.count)) {
     if (frames_left < 1)
       frames_left = 1;
 
     tmp_q = estimate_max_q(
               cpi,
-              cpi->twopass.total_left_stats,
+              &cpi->twopass.total_left_stats,
               (int)(cpi->twopass.bits_left / frames_left));
 
     // Make a damped adjustment to active max Q
@@ -2248,7 +2166,7 @@
   cpi->twopass.frames_to_key--;
 
   // Update the total stats remaining structure
-  subtract_stats(cpi->twopass.total_left_stats, &this_frame);
+  subtract_stats(&cpi->twopass.total_left_stats, &this_frame);
 }
 
 static int test_candidate_kf(VP9_COMP *cpi,
@@ -2401,9 +2319,9 @@
     if (cpi->oxcf.auto_key
         && lookup_next_frame_stats(cpi, &next_frame) != EOF) {
       // Normal scene cut check
-      if (test_candidate_kf(cpi, &last_frame, this_frame, &next_frame)) {
+      if (test_candidate_kf(cpi, &last_frame, this_frame, &next_frame))
         break;
-      }
+
 
       // How fast is prediction quality decaying
       loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
@@ -2413,19 +2331,14 @@
       // quality since the last GF or KF.
       recent_loop_decay[i % 8] = loop_decay_rate;
       decay_accumulator = 1.0;
-      for (j = 0; j < 8; j++) {
-        decay_accumulator = decay_accumulator * recent_loop_decay[j];
-      }
+      for (j = 0; j < 8; j++)
+        decay_accumulator *= recent_loop_decay[j];
 
       // Special check for a transition or high motion followed by a
       // static scene.
-      if (detect_transition_to_still(cpi, i,
-                                     (cpi->key_frame_frequency - i),
-                                     loop_decay_rate,
-                                     decay_accumulator)) {
+      if (detect_transition_to_still(cpi, i, cpi->key_frame_frequency - i,
+                                     loop_decay_rate, decay_accumulator))
         break;
-      }
-
 
       // Step on to the next frame
       cpi->twopass.frames_to_key++;
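Note on the vp9_firstpass.c refactor above: several open-coded range checks are replaced with small clamping helpers (fclamp, clamp, MIN, MAX) that are not defined in this diff. A sketch of the semantics they are assumed to have, so that e.g. the est_max_qcorrection_factor clipping to [0.1, 10.0] and the boost clamp to [125, (interval + 1) * 200] read the same as the code they replace:

    /* Assumed semantics only (not the libvpx definitions): clamp a value
     * into the inclusive range [low, high]. */
    static double fclamp_sketch(double value, double low, double high) {
      return value < low ? low : (value > high ? high : value);
    }

    static int clamp_sketch(int value, int low, int high) {
      return value < low ? low : (value > high ? high : value);
    }
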
diff --git a/vp9/encoder/vp9_lookahead.c b/vp9/encoder/vp9_lookahead.c
index a89d254..708fe45 100644
--- a/vp9/encoder/vp9_lookahead.c
+++ b/vp9/encoder/vp9_lookahead.c
@@ -46,7 +46,7 @@
       unsigned int i;
 
       for (i = 0; i < ctx->max_sz; i++)
-        vp8_yv12_de_alloc_frame_buffer(&ctx->buf[i].img);
+        vp9_free_frame_buffer(&ctx->buf[i].img);
       free(ctx->buf);
     }
     free(ctx);
@@ -56,6 +56,8 @@
 
 struct lookahead_ctx * vp9_lookahead_init(unsigned int width,
                                           unsigned int height,
+                                          unsigned int subsampling_x,
+                                          unsigned int subsampling_y,
                                           unsigned int depth) {
   struct lookahead_ctx *ctx = NULL;
 
@@ -71,8 +73,9 @@
     if (!ctx->buf)
       goto bail;
     for (i = 0; i < depth; i++)
-      if (vp8_yv12_alloc_frame_buffer(&ctx->buf[i].img,
-                                      width, height, VP9BORDERINPIXELS))
+      if (vp9_alloc_frame_buffer(&ctx->buf[i].img,
+                                 width, height, subsampling_x, subsampling_y,
+                                 VP9BORDERINPIXELS))
         goto bail;
   }
   return ctx;
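Note on the vp9_lookahead.c change above: the lookahead ring buffer now allocates its frames with vp9_alloc_frame_buffer(), which takes the chroma subsampling factors explicitly instead of assuming a fixed format. A hedged example of how a caller might initialize the lookahead for ordinary 4:2:0 input; the width, height and depth values are illustrative:

    /* 4:2:0 input halves both chroma dimensions, i.e. subsampling_x = 1 and
     * subsampling_y = 1; the other numbers below are placeholders. */
    struct lookahead_ctx *ctx =
        vp9_lookahead_init(1280, 720,  /* luma width, height */
                           1, 1,       /* subsampling_x, subsampling_y */
                           25);        /* lookahead depth in frames */
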
diff --git a/vp9/encoder/vp9_lookahead.h b/vp9/encoder/vp9_lookahead.h
index 2406618..81baa2c 100644
--- a/vp9/encoder/vp9_lookahead.h
+++ b/vp9/encoder/vp9_lookahead.h
@@ -31,6 +31,8 @@
  */
 struct lookahead_ctx *vp9_lookahead_init(unsigned int width,
                                          unsigned int height,
+                                         unsigned int subsampling_x,
+                                         unsigned int subsampling_y,
                                          unsigned int depth);
 
 
diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c
index af62ec3..7d4906c 100644
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c
@@ -9,13 +9,13 @@
  */
 
 #include <limits.h>
+
+#include <vpx_mem/vpx_mem.h>
 #include <vp9/encoder/vp9_encodeintra.h>
 #include <vp9/encoder/vp9_rdopt.h>
-#include <vp9/common/vp9_setupintrarecon.h>
 #include <vp9/common/vp9_blockd.h>
 #include <vp9/common/vp9_reconinter.h>
 #include <vp9/common/vp9_systemdependent.h>
-#include <vpx_mem/vpx_mem.h>
 #include <vp9/encoder/vp9_segmentation.h>
 
 static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
@@ -28,15 +28,15 @@
   vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
   unsigned int best_err;
 
-  int tmp_col_min = x->mv_col_min;
-  int tmp_col_max = x->mv_col_max;
-  int tmp_row_min = x->mv_row_min;
-  int tmp_row_max = x->mv_row_max;
+  const int tmp_col_min = x->mv_col_min;
+  const int tmp_col_max = x->mv_col_max;
+  const int tmp_row_min = x->mv_row_min;
+  const int tmp_row_max = x->mv_row_max;
   int_mv ref_full;
 
   // Further step/diamond searches as necessary
   int step_param = cpi->sf.first_step +
-      (cpi->Speed < 8 ? (cpi->Speed > 5 ? 1 : 0) : 2);
+      (cpi->speed < 8 ? (cpi->speed > 5 ? 1 : 0) : 2);
 
   vp9_clamp_mv_min_max(x, ref_mv);
 
@@ -44,15 +44,8 @@
   ref_full.as_mv.row = ref_mv->as_mv.row >> 3;
 
   /*cpi->sf.search_method == HEX*/
-  best_err = vp9_hex_search(
-      x,
-      &ref_full, dst_mv,
-      step_param,
-      x->errorperbit,
-      &v_fn_ptr,
-      NULL, NULL,
-      NULL, NULL,
-      ref_mv);
+  best_err = vp9_hex_search(x, &ref_full, dst_mv, step_param, x->errorperbit,
+                            &v_fn_ptr, NULL, NULL, NULL, NULL, ref_mv);
 
   // Try sub-pixel MC
   // if (bestsme > error_thresh && bestsme < INT_MAX)
@@ -82,18 +75,11 @@
   return best_err;
 }
 
-static int do_16x16_motion_search
-(
-  VP9_COMP *cpi,
-  int_mv *ref_mv,
-  int_mv *dst_mv,
-  YV12_BUFFER_CONFIG *buf,
-  int buf_mb_y_offset,
-  YV12_BUFFER_CONFIG *ref,
-  int mb_y_offset,
-  int mb_row,
-  int mb_col) {
-  MACROBLOCK   *const x  = &cpi->mb;
+static int do_16x16_motion_search(VP9_COMP *cpi,
+                                  int_mv *ref_mv, int_mv *dst_mv,
+                                  int buf_mb_y_offset, int mb_y_offset,
+                                  int mb_row, int mb_col) {
+  MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   unsigned int err, tmp_err;
   int_mv tmp_mv;
@@ -109,7 +95,7 @@
   // starting point (best reference) for the search
   tmp_err = do_16x16_motion_iteration(cpi, ref_mv, &tmp_mv, mb_row, mb_col);
   if (tmp_err < err) {
-    err            = tmp_err;
+    err = tmp_err;
     dst_mv->as_int = tmp_mv.as_int;
   }
 
@@ -130,16 +116,10 @@
   return err;
 }
 
-static int do_16x16_zerozero_search
-(
-  VP9_COMP *cpi,
-  int_mv *dst_mv,
-  YV12_BUFFER_CONFIG *buf,
-  int buf_mb_y_offset,
-  YV12_BUFFER_CONFIG *ref,
-  int mb_y_offset
-) {
-  MACROBLOCK   *const x  = &cpi->mb;
+static int do_16x16_zerozero_search(VP9_COMP *cpi,
+                                    int_mv *dst_mv,
+                                    int buf_mb_y_offset, int mb_y_offset) {
+  MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   unsigned int err;
 
@@ -153,13 +133,9 @@
 
   return err;
 }
-static int find_best_16x16_intra
-(
-  VP9_COMP *cpi,
-  YV12_BUFFER_CONFIG *buf,
-  int mb_y_offset,
-  MB_PREDICTION_MODE *pbest_mode
-) {
+static int find_best_16x16_intra(VP9_COMP *cpi,
+                                 int mb_y_offset,
+                                 MB_PREDICTION_MODE *pbest_mode) {
   MACROBLOCK   *const x  = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_PREDICTION_MODE best_mode = -1, mode;
@@ -210,7 +186,7 @@
   int mb_row,
   int mb_col
 ) {
-  MACROBLOCK   *const x  = &cpi->mb;
+  MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   int intra_error;
   VP9_COMMON *cm = &cpi->common;
@@ -223,7 +199,7 @@
   xd->plane[0].dst.stride = cm->yv12_fb[cm->new_fb_idx].y_stride;
 
   // do intra 16x16 prediction
-  intra_error = find_best_16x16_intra(cpi, buf, mb_y_offset,
+  intra_error = find_best_16x16_intra(cpi, mb_y_offset,
                                       &stats->ref[INTRA_FRAME].m.mode);
   if (intra_error <= 0)
     intra_error = 1;
@@ -234,10 +210,10 @@
     int g_motion_error;
     xd->plane[0].pre[0].buf = golden_ref->y_buffer + mb_y_offset;
     xd->plane[0].pre[0].stride = golden_ref->y_stride;
-    g_motion_error = do_16x16_motion_search(cpi, prev_golden_ref_mv,
+    g_motion_error = do_16x16_motion_search(cpi,
+                                            prev_golden_ref_mv,
                                             &stats->ref[GOLDEN_FRAME].m.mv,
-                                            buf, mb_y_offset,
-                                            golden_ref, gld_y_offset,
+                                            mb_y_offset, gld_y_offset,
                                             mb_row, mb_col);
     stats->ref[GOLDEN_FRAME].err = g_motion_error;
   } else {
@@ -252,8 +228,7 @@
     xd->plane[0].pre[0].stride = alt_ref->y_stride;
     a_motion_error = do_16x16_zerozero_search(cpi,
                                               &stats->ref[ALTREF_FRAME].m.mv,
-                                              buf, mb_y_offset,
-                                              alt_ref, arf_y_offset);
+                                              mb_y_offset, arf_y_offset);
 
     stats->ref[ALTREF_FRAME].err = a_motion_error;
   } else {
@@ -262,17 +237,15 @@
   }
 }
 
-static void update_mbgraph_frame_stats
-(
-  VP9_COMP *cpi,
-  MBGRAPH_FRAME_STATS *stats,
-  YV12_BUFFER_CONFIG *buf,
-  YV12_BUFFER_CONFIG *golden_ref,
-  YV12_BUFFER_CONFIG *alt_ref
-) {
-  MACROBLOCK   *const x  = &cpi->mb;
-  VP9_COMMON   *const cm = &cpi->common;
+static void update_mbgraph_frame_stats(VP9_COMP *cpi,
+                                       MBGRAPH_FRAME_STATS *stats,
+                                       YV12_BUFFER_CONFIG *buf,
+                                       YV12_BUFFER_CONFIG *golden_ref,
+                                       YV12_BUFFER_CONFIG *alt_ref) {
+  MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
+  VP9_COMMON *const cm = &cpi->common;
+
   int mb_col, mb_row, offset = 0;
   int mb_y_offset = 0, arf_y_offset = 0, gld_y_offset = 0;
   int_mv arf_top_mv, gld_top_mv;
@@ -361,17 +334,16 @@
     for (offset = 0, mb_row = 0; mb_row < cm->mb_rows;
          offset += cm->mb_cols, mb_row++) {
       for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
-        MBGRAPH_MB_STATS *mb_stats =
-          &frame_stats->mb_stats[offset + mb_col];
+        MBGRAPH_MB_STATS *mb_stats = &frame_stats->mb_stats[offset + mb_col];
 
         int altref_err = mb_stats->ref[ALTREF_FRAME].err;
         int intra_err  = mb_stats->ref[INTRA_FRAME ].err;
         int golden_err = mb_stats->ref[GOLDEN_FRAME].err;
 
         // Test for altref vs intra and gf and that its mv was 0,0.
-        if ((altref_err > 1000) ||
-            (altref_err > intra_err) ||
-            (altref_err > golden_err)) {
+        if (altref_err > 1000 ||
+            altref_err > intra_err ||
+            altref_err > golden_err) {
           arf_not_zz[offset + mb_col]++;
         }
       }
@@ -386,7 +358,6 @@
       // goes in segment 0
       if (arf_not_zz[offset + mb_col]) {
         ncnt[0]++;
-#if CONFIG_SB8X8
         cpi->segmentation_map[offset * 4 + 2 * mb_col] = 0;
         cpi->segmentation_map[offset * 4 + 2 * mb_col + 1] = 0;
         cpi->segmentation_map[offset * 4 + 2 * mb_col + cm->mi_cols] = 0;
@@ -396,11 +367,6 @@
         cpi->segmentation_map[offset * 4 + 2 * mb_col + 1] = 1;
         cpi->segmentation_map[offset * 4 + 2 * mb_col + cm->mi_cols] = 1;
         cpi->segmentation_map[offset * 4 + 2 * mb_col + cm->mi_cols + 1] = 1;
-#else
-        cpi->segmentation_map[offset + mb_col] = 0;
-      } else {
-        cpi->segmentation_map[offset + mb_col] = 1;
-#endif
         ncnt[1]++;
       }
     }
@@ -457,8 +423,7 @@
   // the ARF MC search backwards, to get optimal results for MV caching
   for (i = 0; i < n_frames; i++) {
     MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
-    struct lookahead_entry *q_cur =
-      vp9_lookahead_peek(cpi->lookahead, i);
+    struct lookahead_entry *q_cur = vp9_lookahead_peek(cpi->lookahead, i);
 
     assert(q_cur != NULL);
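Note on the segmentation-map writes in the vp9_mbgraph.c hunk above: with the 8x8 mode-info grid each 16x16 macroblock covers a 2x2 patch of map entries, which is why four indices are written per macroblock. A sketch of the equivalent index arithmetic, assuming cm->mi_cols == 2 * cm->mb_cols (the helper name is hypothetical):

    /* Mark the four 8x8 mode-info units covered by macroblock
     * (mb_row, mb_col) with seg_id.  base matches offset * 4 + 2 * mb_col in
     * the hunk above because offset accumulates mb_cols per macroblock row. */
    static void set_mb_seg_id_sketch(uint8_t *seg_map, int mi_cols,
                                     int mb_row, int mb_col, uint8_t seg_id) {
      const int base = 2 * mb_row * mi_cols + 2 * mb_col;
      seg_map[base] = seg_id;
      seg_map[base + 1] = seg_id;
      seg_map[base + mi_cols] = seg_id;
      seg_map[base + mi_cols + 1] = seg_id;
    }
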
 
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 74caba5..aff5637 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -413,6 +413,201 @@
 
   return besterr;
 }
+
+#if CONFIG_COMP_INTER_JOINT_SEARCH
+#undef DIST
+/* returns subpixel variance error function */
+#define DIST(r, c) \
+    vfp->svaf(PRE(r, c), y_stride, SP(c), SP(r), \
+              z, src_stride, &sse, second_pred)
+
+int vp9_find_best_sub_pixel_comp(MACROBLOCK *x,
+                                 int_mv *bestmv, int_mv *ref_mv,
+                                 int error_per_bit,
+                                 const vp9_variance_fn_ptr_t *vfp,
+                                 int *mvjcost, int *mvcost[2],
+                                 int *distortion,
+                                 unsigned int *sse1,
+                                 const uint8_t *second_pred, int w, int h) {
+  uint8_t *z = x->plane[0].src.buf;
+  int src_stride = x->plane[0].src.stride;
+  MACROBLOCKD *xd = &x->e_mbd;
+
+  int rr, rc, br, bc, hstep;
+  int tr, tc;
+  unsigned int besterr = INT_MAX;
+  unsigned int left, right, up, down, diag;
+  unsigned int sse;
+  unsigned int whichdir;
+  unsigned int halfiters = 4;
+  unsigned int quarteriters = 4;
+  unsigned int eighthiters = 4;
+  int thismse;
+  int maxc, minc, maxr, minr;
+  int y_stride;
+  int offset;
+  int usehp = xd->allow_high_precision_mv;
+
+  uint8_t *comp_pred = vpx_memalign(16, w * h * sizeof(uint8_t));
+  uint8_t *y = xd->plane[0].pre[0].buf +
+               (bestmv->as_mv.row) * xd->plane[0].pre[0].stride +
+               bestmv->as_mv.col;
+
+  y_stride = xd->plane[0].pre[0].stride;
+
+  rr = ref_mv->as_mv.row;
+  rc = ref_mv->as_mv.col;
+  br = bestmv->as_mv.row << 3;
+  bc = bestmv->as_mv.col << 3;
+  hstep = 4;
+  minc = MAX(x->mv_col_min << 3, (ref_mv->as_mv.col) -
+             ((1 << MV_MAX_BITS) - 1));
+  maxc = MIN(x->mv_col_max << 3, (ref_mv->as_mv.col) +
+             ((1 << MV_MAX_BITS) - 1));
+  minr = MAX(x->mv_row_min << 3, (ref_mv->as_mv.row) -
+             ((1 << MV_MAX_BITS) - 1));
+  maxr = MIN(x->mv_row_max << 3, (ref_mv->as_mv.row) +
+             ((1 << MV_MAX_BITS) - 1));
+
+  tr = br;
+  tc = bc;
+
+
+  offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col;
+
+  // central mv
+  bestmv->as_mv.row <<= 3;
+  bestmv->as_mv.col <<= 3;
+
+  // calculate central point error
+  // TODO(yunqingwang): the central point error was already calculated in
+  // full-pixel search, and can be passed to this function.
+  comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride);
+  besterr = vfp->vf(comp_pred, w, z, src_stride, sse1);
+  *distortion = besterr;
+  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost,
+                         error_per_bit, xd->allow_high_precision_mv);
+
+  // Each subsequent iteration checks at least one point in common with the
+  // last iteration (two if the diagonal was selected).
+  while (--halfiters) {
+    // 1/2 pel
+    CHECK_BETTER(left, tr, tc - hstep);
+    CHECK_BETTER(right, tr, tc + hstep);
+    CHECK_BETTER(up, tr - hstep, tc);
+    CHECK_BETTER(down, tr + hstep, tc);
+
+    whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+    switch (whichdir) {
+      case 0:
+        CHECK_BETTER(diag, tr - hstep, tc - hstep);
+        break;
+      case 1:
+        CHECK_BETTER(diag, tr - hstep, tc + hstep);
+        break;
+      case 2:
+        CHECK_BETTER(diag, tr + hstep, tc - hstep);
+        break;
+      case 3:
+        CHECK_BETTER(diag, tr + hstep, tc + hstep);
+        break;
+    }
+
+    // no reason to check the same one again.
+    if (tr == br && tc == bc)
+      break;
+
+    tr = br;
+    tc = bc;
+  }
+
+  // Each subsequent iteration checks at least one point in common with the
+  // last iteration (two if the diagonal was selected).  1/4 pel.
+  hstep >>= 1;
+  while (--quarteriters) {
+    CHECK_BETTER(left, tr, tc - hstep);
+    CHECK_BETTER(right, tr, tc + hstep);
+    CHECK_BETTER(up, tr - hstep, tc);
+    CHECK_BETTER(down, tr + hstep, tc);
+
+    whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+    switch (whichdir) {
+      case 0:
+        CHECK_BETTER(diag, tr - hstep, tc - hstep);
+        break;
+      case 1:
+        CHECK_BETTER(diag, tr - hstep, tc + hstep);
+        break;
+      case 2:
+        CHECK_BETTER(diag, tr + hstep, tc - hstep);
+        break;
+      case 3:
+        CHECK_BETTER(diag, tr + hstep, tc + hstep);
+        break;
+    }
+
+    // no reason to check the same one again.
+    if (tr == br && tc == bc)
+      break;
+
+    tr = br;
+    tc = bc;
+  }
+
+  if (xd->allow_high_precision_mv) {
+    usehp = vp9_use_nmv_hp(&ref_mv->as_mv);
+  } else {
+    usehp = 0;
+  }
+
+  if (usehp) {
+    hstep >>= 1;
+    while (--eighthiters) {
+      CHECK_BETTER(left, tr, tc - hstep);
+      CHECK_BETTER(right, tr, tc + hstep);
+      CHECK_BETTER(up, tr - hstep, tc);
+      CHECK_BETTER(down, tr + hstep, tc);
+
+      whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+      switch (whichdir) {
+        case 0:
+          CHECK_BETTER(diag, tr - hstep, tc - hstep);
+          break;
+        case 1:
+          CHECK_BETTER(diag, tr - hstep, tc + hstep);
+          break;
+        case 2:
+          CHECK_BETTER(diag, tr + hstep, tc - hstep);
+          break;
+        case 3:
+          CHECK_BETTER(diag, tr + hstep, tc + hstep);
+          break;
+      }
+
+      // no reason to check the same one again.
+      if (tr == br && tc == bc)
+        break;
+
+      tr = br;
+      tc = bc;
+    }
+  }
+  bestmv->as_mv.row = br;
+  bestmv->as_mv.col = bc;
+
+  vpx_free(comp_pred);
+
+  if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) ||
+      (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3)))
+    return INT_MAX;
+
+  return besterr;
+}
+#endif  // CONFIG_COMP_INTER_JOINT_SEARCH
+
 #undef MVC
 #undef PRE
 #undef DIST
@@ -2132,7 +2327,109 @@
     return INT_MAX;
 }
 
+#if CONFIG_COMP_INTER_JOINT_SEARCH
+/* This function is called when we do joint motion search in comp_inter_inter
+ * mode.
+ */
+int vp9_refining_search_8p_c(MACROBLOCK *x,
+                             int_mv *ref_mv, int error_per_bit,
+                             int search_range, vp9_variance_fn_ptr_t *fn_ptr,
+                             int *mvjcost, int *mvcost[2], int_mv *center_mv,
+                             const uint8_t *second_pred, int w, int h) {
+  const MACROBLOCKD* const xd = &x->e_mbd;
+  MV neighbors[8] = {{-1, 0}, {0, -1}, {0, 1}, {1, 0},
+      {-1, -1}, {1, -1}, {-1, 1}, {1, 1}};
+  int i, j;
+  int this_row_offset, this_col_offset;
 
+  int what_stride = x->plane[0].src.stride;
+  int in_what_stride = xd->plane[0].pre[0].stride;
+  uint8_t *what = x->plane[0].src.buf;
+  uint8_t *best_address = xd->plane[0].pre[0].buf +
+                          (ref_mv->as_mv.row * xd->plane[0].pre[0].stride) +
+                          ref_mv->as_mv.col;
+  uint8_t *check_here;
+  unsigned int thissad;
+  int_mv this_mv;
+  unsigned int bestsad = INT_MAX;
+  int_mv fcenter_mv;
+
+  int *mvjsadcost = x->nmvjointsadcost;
+  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
+
+  /* Compound pred buffer */
+  uint8_t *comp_pred = vpx_memalign(16, w * h * sizeof(uint8_t));
+
+  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+  /* Get compound pred by averaging two pred blocks. */
+  comp_avg_pred(comp_pred, second_pred, w, h, best_address, in_what_stride);
+
+  bestsad = fn_ptr->sdf(what, what_stride, comp_pred, w, 0x7fffffff) +
+      mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit);
+
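+  /* Greedy local search: each iteration tries the eight neighbors of the
+   * current best position and moves there when the compound-prediction SAD
+   * plus MV cost improves; the search stops early if no neighbor is better. */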
+  for (i = 0; i < search_range; i++) {
+    int best_site = -1;
+
+    for (j = 0; j < 8; j++) {
+      this_row_offset = ref_mv->as_mv.row + neighbors[j].row;
+      this_col_offset = ref_mv->as_mv.col + neighbors[j].col;
+
+      if ((this_col_offset > x->mv_col_min) &&
+          (this_col_offset < x->mv_col_max) &&
+          (this_row_offset > x->mv_row_min) &&
+          (this_row_offset < x->mv_row_max)) {
+        check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col +
+            best_address;
+
+        /* Get compound block and use it to calculate SAD. */
+        comp_avg_pred(comp_pred, second_pred, w, h, check_here,
+                      in_what_stride);
+        thissad = fn_ptr->sdf(what, what_stride, comp_pred, w, bestsad);
+
+        if (thissad < bestsad) {
+          this_mv.as_mv.row = this_row_offset;
+          this_mv.as_mv.col = this_col_offset;
+          thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost,
+                                    mvsadcost, error_per_bit);
+
+          if (thissad < bestsad) {
+            bestsad = thissad;
+            best_site = j;
+          }
+        }
+      }
+    }
+
+    if (best_site == -1) {
+      break;
+    } else {
+      ref_mv->as_mv.row += neighbors[best_site].row;
+      ref_mv->as_mv.col += neighbors[best_site].col;
+      best_address += (neighbors[best_site].row) * in_what_stride +
+          neighbors[best_site].col;
+    }
+  }
+
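+  /* Convert the best full-pel position back to 1/8-pel units for the final
+   * variance and MV cost evaluation. */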
+  this_mv.as_mv.row = ref_mv->as_mv.row << 3;
+  this_mv.as_mv.col = ref_mv->as_mv.col << 3;
+
+  if (bestsad < INT_MAX) {
+    int besterr;
+    comp_avg_pred(comp_pred, second_pred, w, h, best_address, in_what_stride);
+    besterr = fn_ptr->vf(what, what_stride, comp_pred, w,
+        (unsigned int *)(&thissad)) +
+        mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit,
+                    xd->allow_high_precision_mv);
+    vpx_free(comp_pred);
+    return besterr;
+  } else {
+    vpx_free(comp_pred);
+    return INT_MAX;
+  }
+}
+#endif  // CONFIG_COMP_INTER_JOINT_SEARCH
 
 #ifdef ENTROPY_STATS
 void print_mode_context(VP9_COMMON *pc) {
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index e1ba7fd..cdbd29a 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -79,5 +79,21 @@
                                        int *mvjcost, int *mvcost[2],
                                        int_mv *center_mv);
 
+#if CONFIG_COMP_INTER_JOINT_SEARCH
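+// Sub-pel motion search for a compound prediction: each candidate
+// prediction is averaged with second_pred (w x h) before its error is
+// measured against the source block.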
+int vp9_find_best_sub_pixel_comp(MACROBLOCK *x,
+                                 int_mv *bestmv, int_mv *ref_mv,
+                                 int error_per_bit,
+                                 const vp9_variance_fn_ptr_t *vfp,
+                                 int *mvjcost, int *mvcost[2],
+                                 int *distortion, unsigned int *sse1,
+                                 const uint8_t *second_pred,
+                                 int w, int h);
 
+int vp9_refining_search_8p_c(MACROBLOCK *x,
+                             int_mv *ref_mv, int error_per_bit,
+                             int search_range, vp9_variance_fn_ptr_t *fn_ptr,
+                             int *mvjcost, int *mvcost[2],
+                             int_mv *center_mv, const uint8_t *second_pred,
+                             int w, int h);
+#endif  // CONFIG_COMP_INTER_JOINT_SEARCH
 #endif  // VP9_ENCODER_VP9_MCOMP_H_
diff --git a/vp9/encoder/vp9_modecosts.c b/vp9/encoder/vp9_modecosts.c
index 88cd1f4..e26daf0 100644
--- a/vp9/encoder/vp9_modecosts.c
+++ b/vp9/encoder/vp9_modecosts.c
@@ -41,10 +41,6 @@
                   x->fc.uv_mode_prob[VP9_YMODES - 1], vp9_uv_mode_tree);
   vp9_cost_tokens(c->mb.intra_uv_mode_cost[0],
                   x->kf_uv_mode_prob[VP9_YMODES - 1], vp9_uv_mode_tree);
-#if !CONFIG_SB8X8
-  vp9_cost_tokens(c->mb.i8x8_mode_costs,
-                  x->fc.i8x8_mode_prob, vp9_i8x8_mode_tree);
-#endif
 
   for (i = 0; i <= VP9_SWITCHABLE_FILTERS; ++i)
     vp9_cost_tokens((int *)c->mb.switchable_interp_costs[i],
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 300fa32..3d8003c 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -32,7 +32,6 @@
 #include "vp9/common/vp9_postproc.h"
 #endif
 #include "vpx_mem/vpx_mem.h"
-#include "vp9/common/vp9_swapyv12buffer.h"
 #include "vpx_ports/vpx_timer.h"
 
 #include "vp9/common/vp9_seg_common.h"
@@ -114,13 +113,6 @@
 extern void print_nmvstats();
 #endif
 
-#if CONFIG_CODE_ZEROGROUP
-#ifdef ZPC_STATS
-extern void init_zpcstats();
-extern void print_zpcstats();
-#endif
-#endif
-
 #ifdef SPEEDSTATS
 unsigned int frames_at_speed[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
 #endif
@@ -161,6 +153,11 @@
   const double minqtarget = MIN(((x3 * maxq + x2) * maxq + x1) * maxq + c,
                                 maxq);
 
+  // Special case handling to deal with the step from q2.0
+  // down to lossless mode represented by q 1.0.
+  if (minqtarget <= 2.0)
+    return 0;
+
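+  // Otherwise return the smallest q index whose real quantizer value
+  // reaches minqtarget.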
   for (i = 0; i < QINDEX_RANGE; i++) {
     if (minqtarget <= vp9_convert_qindex_to_q(i))
       return i;
@@ -316,9 +313,9 @@
 
   vp9_free_frame_buffers(&cpi->common);
 
-  vp8_yv12_de_alloc_frame_buffer(&cpi->last_frame_uf);
-  vp8_yv12_de_alloc_frame_buffer(&cpi->scaled_source);
-  vp8_yv12_de_alloc_frame_buffer(&cpi->alt_ref_buffer);
+  vp9_free_frame_buffer(&cpi->last_frame_uf);
+  vp9_free_frame_buffer(&cpi->scaled_source);
+  vp9_free_frame_buffer(&cpi->alt_ref_buffer);
   vp9_lookahead_destroy(cpi->lookahead);
 
   vpx_free(cpi->tok);
@@ -332,15 +329,6 @@
 
   vpx_free(cpi->mb.pip);
   cpi->mb.pip = 0;
-
-  vpx_free(cpi->twopass.total_stats);
-  cpi->twopass.total_stats = 0;
-
-  vpx_free(cpi->twopass.total_left_stats);
-  cpi->twopass.total_left_stats = 0;
-
-  vpx_free(cpi->twopass.this_frame_stats);
-  cpi->twopass.this_frame_stats = 0;
 }
 
 // Computes a q delta (in "q index" terms) to get from a starting q value
@@ -626,9 +614,6 @@
   sf->thresh_mult[THR_D63_PRED ] += speed_multiplier * 1500;
 
   sf->thresh_mult[THR_B_PRED   ] += speed_multiplier * 2500;
-#if !CONFIG_SB8X8
-  sf->thresh_mult[THR_I8X8_PRED] += speed_multiplier * 2500;
-#endif
 
   sf->thresh_mult[THR_NEWMV    ] += speed_multiplier * 1000;
   sf->thresh_mult[THR_NEWG     ] += speed_multiplier * 1000;
@@ -658,24 +643,6 @@
   sf->thresh_mult[THR_COMP_SPLITGA  ] += speed_multiplier * 4500;
   sf->thresh_mult[THR_COMP_SPLITLG  ] += speed_multiplier * 4500;
 
-#if CONFIG_COMP_INTERINTRA_PRED
-  sf->thresh_mult[THR_COMP_INTERINTRA_ZEROL   ] += speed_multiplier * 1500;
-  sf->thresh_mult[THR_COMP_INTERINTRA_ZEROG   ] += speed_multiplier * 1500;
-  sf->thresh_mult[THR_COMP_INTERINTRA_ZEROA   ] += speed_multiplier * 1500;
-
-  sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTL] += speed_multiplier * 1500;
-  sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTG] += speed_multiplier * 1500;
-  sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTA] += speed_multiplier * 1500;
-
-  sf->thresh_mult[THR_COMP_INTERINTRA_NEARL   ] += speed_multiplier * 1500;
-  sf->thresh_mult[THR_COMP_INTERINTRA_NEARG   ] += speed_multiplier * 1500;
-  sf->thresh_mult[THR_COMP_INTERINTRA_NEARA   ] += speed_multiplier * 1500;
-
-  sf->thresh_mult[THR_COMP_INTERINTRA_NEWL    ] += speed_multiplier * 2000;
-  sf->thresh_mult[THR_COMP_INTERINTRA_NEWG    ] += speed_multiplier * 2000;
-  sf->thresh_mult[THR_COMP_INTERINTRA_NEWA    ] += speed_multiplier * 2000;
-#endif
-
   /* disable frame modes if flags not set */
   if (!(cpi->ref_frame_flags & VP9_LAST_FLAG)) {
     sf->thresh_mult[THR_NEWMV    ] = INT_MAX;
@@ -683,12 +650,6 @@
     sf->thresh_mult[THR_ZEROMV   ] = INT_MAX;
     sf->thresh_mult[THR_NEARMV   ] = INT_MAX;
     sf->thresh_mult[THR_SPLITMV  ] = INT_MAX;
-#if CONFIG_COMP_INTERINTRA_PRED
-    sf->thresh_mult[THR_COMP_INTERINTRA_ZEROL   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTL] = INT_MAX;
-    sf->thresh_mult[THR_COMP_INTERINTRA_NEARL   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_INTERINTRA_NEWL    ] = INT_MAX;
-#endif
   }
   if (!(cpi->ref_frame_flags & VP9_GOLD_FLAG)) {
     sf->thresh_mult[THR_NEARESTG ] = INT_MAX;
@@ -696,12 +657,6 @@
     sf->thresh_mult[THR_NEARG    ] = INT_MAX;
     sf->thresh_mult[THR_NEWG     ] = INT_MAX;
     sf->thresh_mult[THR_SPLITG   ] = INT_MAX;
-#if CONFIG_COMP_INTERINTRA_PRED
-    sf->thresh_mult[THR_COMP_INTERINTRA_ZEROG   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTG] = INT_MAX;
-    sf->thresh_mult[THR_COMP_INTERINTRA_NEARG   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_INTERINTRA_NEWG    ] = INT_MAX;
-#endif
   }
   if (!(cpi->ref_frame_flags & VP9_ALT_FLAG)) {
     sf->thresh_mult[THR_NEARESTA ] = INT_MAX;
@@ -709,12 +664,6 @@
     sf->thresh_mult[THR_NEARA    ] = INT_MAX;
     sf->thresh_mult[THR_NEWA     ] = INT_MAX;
     sf->thresh_mult[THR_SPLITA   ] = INT_MAX;
-#if CONFIG_COMP_INTERINTRA_PRED
-    sf->thresh_mult[THR_COMP_INTERINTRA_ZEROA   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTA] = INT_MAX;
-    sf->thresh_mult[THR_COMP_INTERINTRA_NEARA   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_INTERINTRA_NEWA    ] = INT_MAX;
-#endif
   }
 
   if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_GOLD_FLAG)) !=
@@ -746,7 +695,7 @@
 void vp9_set_speed_features(VP9_COMP *cpi) {
   SPEED_FEATURES *sf = &cpi->sf;
   int mode = cpi->compressor_speed;
-  int speed = cpi->Speed;
+  int speed = cpi->speed;
   int i;
 
   // Only modes 0 and 1 supported for now in experimental code base
@@ -867,10 +816,6 @@
   }
 
   cpi->mb.quantize_b_4x4      = vp9_regular_quantize_b_4x4;
-#if !CONFIG_SB8X8
-  cpi->mb.quantize_b_4x4_pair = vp9_regular_quantize_b_4x4_pair;
-  cpi->mb.quantize_b_8x8      = vp9_regular_quantize_b_8x8;
-#endif
 
   vp9_init_quantizer(cpi);
 
@@ -885,20 +830,24 @@
   cpi->mb.optimize = cpi->sf.optimize_coefficients == 1 && cpi->pass != 1;
 
 #ifdef SPEEDSTATS
-  frames_at_speed[cpi->Speed]++;
+  frames_at_speed[cpi->speed]++;
 #endif
 }
 
 static void alloc_raw_frame_buffers(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+
   cpi->lookahead = vp9_lookahead_init(cpi->oxcf.width, cpi->oxcf.height,
+                                      cm->subsampling_x, cm->subsampling_y,
                                       cpi->oxcf.lag_in_frames);
   if (!cpi->lookahead)
     vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate lag buffers");
 
-  if (vp8_yv12_alloc_frame_buffer(&cpi->alt_ref_buffer,
-                                  cpi->oxcf.width, cpi->oxcf.height,
-                                  VP9BORDERINPIXELS))
+  if (vp9_realloc_frame_buffer(&cpi->alt_ref_buffer,
+                               cpi->oxcf.width, cpi->oxcf.height,
+                               cm->subsampling_x, cm->subsampling_y,
+                               VP9BORDERINPIXELS))
     vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate altref buffer");
 }
@@ -928,13 +877,17 @@
     vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate partition data");
 
-  if (vp8_yv12_alloc_frame_buffer(&cpi->last_frame_uf,
-                                  cm->width, cm->height, VP9BORDERINPIXELS))
+  if (vp9_alloc_frame_buffer(&cpi->last_frame_uf,
+                             cm->width, cm->height,
+                             cm->subsampling_x, cm->subsampling_y,
+                             VP9BORDERINPIXELS))
     vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate last frame buffer");
 
-  if (vp8_yv12_alloc_frame_buffer(&cpi->scaled_source,
-                                  cm->width, cm->height, VP9BORDERINPIXELS))
+  if (vp9_alloc_frame_buffer(&cpi->scaled_source,
+                             cm->width, cm->height,
+                             cm->subsampling_x, cm->subsampling_y,
+                             VP9BORDERINPIXELS))
     vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate scaled source buffer");
 
@@ -960,23 +913,6 @@
   CHECK_MEM_ERROR(cpi->mb_norm_activity_map,
                   vpx_calloc(sizeof(unsigned int),
                              cm->mb_rows * cm->mb_cols));
-
-  vpx_free(cpi->twopass.total_stats);
-
-  cpi->twopass.total_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
-
-  vpx_free(cpi->twopass.total_left_stats);
-  cpi->twopass.total_left_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
-
-  vpx_free(cpi->twopass.this_frame_stats);
-
-  cpi->twopass.this_frame_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
-
-  if (!cpi->twopass.total_stats ||
-      !cpi->twopass.total_left_stats ||
-      !cpi->twopass.this_frame_stats)
-    vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
-                       "Failed to allocate firstpass stats");
 }
 
 
@@ -986,13 +922,17 @@
   vp9_update_frame_size(cm);
 
   // Update size of buffers local to this frame
-  if (vp8_yv12_realloc_frame_buffer(&cpi->last_frame_uf,
-                                    cm->width, cm->height, VP9BORDERINPIXELS))
+  if (vp9_realloc_frame_buffer(&cpi->last_frame_uf,
+                               cm->width, cm->height,
+                               cm->subsampling_x, cm->subsampling_y,
+                               VP9BORDERINPIXELS))
     vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                        "Failed to reallocate last frame buffer");
 
-  if (vp8_yv12_realloc_frame_buffer(&cpi->scaled_source,
-                                    cm->width, cm->height, VP9BORDERINPIXELS))
+  if (vp9_realloc_frame_buffer(&cpi->scaled_source,
+                               cm->width, cm->height,
+                               cm->subsampling_x, cm->subsampling_y,
+                               VP9BORDERINPIXELS))
     vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                        "Failed to reallocate scaled source buffer");
 
@@ -1104,6 +1044,9 @@
 
   cm->width = oxcf->width;
   cm->height = oxcf->height;
+  cm->subsampling_x = 0;
+  cm->subsampling_y = 0;
+  vp9_alloc_compressor_data(cpi);
 
   // change includes all joint functionality
   vp9_change_config(ptr, oxcf);
@@ -1268,17 +1211,13 @@
 
   cm->sharpness_level = cpi->oxcf.Sharpness;
 
-  // Increasing the size of the frame beyond the first seen frame, or some
-  // otherwise signalled maximum size, is not supported.
-  // TODO(jkoleszar): exit gracefully.
-  if (!cpi->initial_width) {
-    alloc_raw_frame_buffers(cpi);
-    vp9_alloc_compressor_data(cpi);
-    cpi->initial_width = cm->width;
-    cpi->initial_height = cm->height;
+  if (cpi->initial_width) {
+    // Increasing the size of the frame beyond the first seen frame, or some
+    // otherwise signalled maximum size, is not supported.
+    // TODO(jkoleszar): exit gracefully.
+    assert(cm->width <= cpi->initial_width);
+    assert(cm->height <= cpi->initial_height);
   }
-  assert(cm->width <= cpi->initial_width);
-  assert(cm->height <= cpi->initial_height);
   update_frame_size(cpi);
 
   if (cpi->oxcf.fixed_q >= 0) {
@@ -1287,7 +1226,7 @@
     cpi->last_boosted_qindex = cpi->oxcf.fixed_q;
   }
 
-  cpi->Speed = cpi->oxcf.cpu_used;
+  cpi->speed = cpi->oxcf.cpu_used;
 
   if (cpi->oxcf.lag_in_frames == 0) {
     // force allow_lag to 0 if lag_in_frames is 0
@@ -1462,11 +1401,6 @@
 #ifdef NMV_STATS
   init_nmvstats();
 #endif
-#if CONFIG_CODE_ZEROGROUP
-#ifdef ZPC_STATS
-  init_zpcstats();
-#endif
-#endif
 
   /* Initialize the feed-forward activity masking. */
   cpi->activity_avg = 90 << 12;
@@ -1593,10 +1527,11 @@
   for (i = 0; i < MAX_MODES; i++)
     cpi->rd_thresh_mult[i] = 128;
 
-#define BFP(BT, SDF, VF, SVF, SVFHH, SVFHV, SVFHHV, SDX3F, SDX8F, SDX4DF) \
+#define BFP(BT, SDF, VF, SVF, SVAF, SVFHH, SVFHV, SVFHHV, SDX3F, SDX8F, SDX4DF)\
     cpi->fn_ptr[BT].sdf            = SDF; \
     cpi->fn_ptr[BT].vf             = VF; \
     cpi->fn_ptr[BT].svf            = SVF; \
+    cpi->fn_ptr[BT].svaf           = SVAF; \
     cpi->fn_ptr[BT].svf_halfpix_h  = SVFHH; \
     cpi->fn_ptr[BT].svf_halfpix_v  = SVFHV; \
     cpi->fn_ptr[BT].svf_halfpix_hv = SVFHHV; \
@@ -1605,57 +1540,64 @@
     cpi->fn_ptr[BT].sdx4df         = SDX4DF;
 
   BFP(BLOCK_32X16, vp9_sad32x16, vp9_variance32x16, vp9_sub_pixel_variance32x16,
-      NULL, NULL,
+      vp9_sub_pixel_avg_variance32x16, NULL, NULL,
       NULL, NULL, NULL,
       vp9_sad32x16x4d)
 
   BFP(BLOCK_16X32, vp9_sad16x32, vp9_variance16x32, vp9_sub_pixel_variance16x32,
-      NULL, NULL,
+      vp9_sub_pixel_avg_variance16x32, NULL, NULL,
       NULL, NULL, NULL,
       vp9_sad16x32x4d)
 
   BFP(BLOCK_64X32, vp9_sad64x32, vp9_variance64x32, vp9_sub_pixel_variance64x32,
-      NULL, NULL,
+      vp9_sub_pixel_avg_variance64x32, NULL, NULL,
       NULL, NULL, NULL,
       vp9_sad64x32x4d)
 
   BFP(BLOCK_32X64, vp9_sad32x64, vp9_variance32x64, vp9_sub_pixel_variance32x64,
-      NULL, NULL,
+      vp9_sub_pixel_avg_variance32x64, NULL, NULL,
       NULL, NULL, NULL,
       vp9_sad32x64x4d)
 
   BFP(BLOCK_32X32, vp9_sad32x32, vp9_variance32x32, vp9_sub_pixel_variance32x32,
-      vp9_variance_halfpixvar32x32_h, vp9_variance_halfpixvar32x32_v,
+      vp9_sub_pixel_avg_variance32x32, vp9_variance_halfpixvar32x32_h,
+      vp9_variance_halfpixvar32x32_v,
       vp9_variance_halfpixvar32x32_hv, vp9_sad32x32x3, vp9_sad32x32x8,
       vp9_sad32x32x4d)
 
   BFP(BLOCK_64X64, vp9_sad64x64, vp9_variance64x64, vp9_sub_pixel_variance64x64,
-      vp9_variance_halfpixvar64x64_h, vp9_variance_halfpixvar64x64_v,
+      vp9_sub_pixel_avg_variance64x64, vp9_variance_halfpixvar64x64_h,
+      vp9_variance_halfpixvar64x64_v,
       vp9_variance_halfpixvar64x64_hv, vp9_sad64x64x3, vp9_sad64x64x8,
       vp9_sad64x64x4d)
 
   BFP(BLOCK_16X16, vp9_sad16x16, vp9_variance16x16, vp9_sub_pixel_variance16x16,
-       vp9_variance_halfpixvar16x16_h, vp9_variance_halfpixvar16x16_v,
-       vp9_variance_halfpixvar16x16_hv, vp9_sad16x16x3, vp9_sad16x16x8,
-       vp9_sad16x16x4d)
+      vp9_sub_pixel_avg_variance16x16, vp9_variance_halfpixvar16x16_h,
+      vp9_variance_halfpixvar16x16_v,
+      vp9_variance_halfpixvar16x16_hv, vp9_sad16x16x3, vp9_sad16x16x8,
+      vp9_sad16x16x4d)
 
   BFP(BLOCK_16X8, vp9_sad16x8, vp9_variance16x8, vp9_sub_pixel_variance16x8,
-      NULL, NULL, NULL, vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d)
+      vp9_sub_pixel_avg_variance16x8, NULL, NULL, NULL,
+      vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d)
 
   BFP(BLOCK_8X16, vp9_sad8x16, vp9_variance8x16, vp9_sub_pixel_variance8x16,
-      NULL, NULL, NULL, vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d)
+      vp9_sub_pixel_avg_variance8x16, NULL, NULL, NULL,
+      vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d)
 
   BFP(BLOCK_8X8, vp9_sad8x8, vp9_variance8x8, vp9_sub_pixel_variance8x8,
-      NULL, NULL, NULL, vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d)
+      vp9_sub_pixel_avg_variance8x8, NULL, NULL, NULL,
+      vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d)
 
-#if CONFIG_SB8X8
   BFP(BLOCK_4X8, NULL, vp9_variance4x8, NULL,
-      NULL, NULL, NULL, NULL, NULL, NULL)
+      NULL, NULL, NULL, NULL, NULL, NULL, NULL)
+
   BFP(BLOCK_8X4, NULL, vp9_variance8x4, NULL,
-      NULL, NULL, NULL, NULL, NULL, NULL)
-#endif
+      NULL, NULL, NULL, NULL, NULL, NULL, NULL)
+
   BFP(BLOCK_4X4, vp9_sad4x4, vp9_variance4x4, vp9_sub_pixel_variance4x4,
-      NULL, NULL, NULL, vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
+      vp9_sub_pixel_avg_variance4x4, NULL, NULL, NULL,
+      vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
 
   cpi->full_search_sad = vp9_full_search_sad;
   cpi->diamond_search_sad = vp9_diamond_search_sad;
@@ -1676,12 +1618,6 @@
   cpi->common.error.setjmp = 0;
 
   vp9_zero(cpi->y_uv_mode_count)
-#if CONFIG_CODE_ZEROGROUP
-  vp9_zero(cm->fc.zpc_counts_4x4);
-  vp9_zero(cm->fc.zpc_counts_8x8);
-  vp9_zero(cm->fc.zpc_counts_16x16);
-  vp9_zero(cm->fc.zpc_counts_32x32);
-#endif
 
   return (VP9_PTR) cpi;
 }
@@ -1709,12 +1645,6 @@
     if (cpi->pass != 1)
       print_nmvstats();
 #endif
-#if CONFIG_CODE_ZEROGROUP
-#ifdef ZPC_STATS
-    if (cpi->pass != 1)
-      print_zpcstats();
-#endif
-#endif
 
 #if CONFIG_INTERNAL_STATS
 
@@ -2019,8 +1949,8 @@
   pkt.data.psnr.samples[0] = width * height;
   pkt.data.psnr.samples[1] = width * height;
 
-  width = (width + 1) / 2;
-  height = (height + 1) / 2;
+  width = orig->uv_width;
+  height = orig->uv_height;
 
   sse = calc_plane_error(orig->u_buffer, orig->uv_stride,
                          recon->u_buffer, recon->uv_stride,
@@ -2174,7 +2104,7 @@
   } while (--h);
 
   src = s->u_buffer;
-  h = (cm->height + 1) / 2;
+  h = s->uv_height;
 
   do {
     fwrite(src, s->uv_width, 1,  yuv_rec_file);
@@ -2182,7 +2112,7 @@
   } while (--h);
 
   src = s->v_buffer;
-  h = (cm->height + 1) / 2;
+  h = s->uv_height;
 
   do {
     fwrite(src, s->uv_width, 1, yuv_rec_file);
@@ -2198,49 +2128,31 @@
   const int in_h = src_fb->y_crop_height;
   const int out_w = dst_fb->y_crop_width;
   const int out_h = dst_fb->y_crop_height;
-  int x, y;
+  int x, y, i;
+
+  uint8_t *srcs[3] = {src_fb->y_buffer, src_fb->u_buffer, src_fb->v_buffer};
+  int src_strides[3] = {src_fb->y_stride, src_fb->uv_stride, src_fb->uv_stride};
+
+  uint8_t *dsts[3] = {dst_fb->y_buffer, dst_fb->u_buffer, dst_fb->v_buffer};
+  int dst_strides[3] = {dst_fb->y_stride, dst_fb->uv_stride, dst_fb->uv_stride};
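+  // Scale one 16x16 output tile at a time; for the chroma planes the same
+  // tile maps to an 8x8 area, hence the factor of 2 below.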
 
   for (y = 0; y < out_h; y += 16) {
     for (x = 0; x < out_w; x += 16) {
-      int x_q4 = x * 16 * in_w / out_w;
-      int y_q4 = y * 16 * in_h / out_h;
-      uint8_t *src = src_fb->y_buffer + y * in_h / out_h * src_fb->y_stride +
-                     x * in_w / out_w;
-      uint8_t *dst = dst_fb->y_buffer + y * dst_fb->y_stride + x;
-      int src_stride = src_fb->y_stride;
-      int dst_stride = dst_fb->y_stride;
+      for (i = 0; i < MAX_MB_PLANE; ++i) {
+        const int factor = i == 0 ? 1 : 2;
+        const int x_q4 = x * (16 / factor) * in_w / out_w;
+        const int y_q4 = y * (16 / factor) * in_h / out_h;
+        const int src_stride = src_strides[i];
+        const int dst_stride = dst_strides[i];
+        uint8_t *src = srcs[i] + y / factor * in_h / out_h * src_stride +
+                                 x / factor * in_w / out_w;
+        uint8_t *dst = dsts[i] + y * dst_stride + x;
 
-      vp9_convolve8(src, src_stride, dst, dst_stride,
-                    vp9_sub_pel_filters_8[x_q4 & 0xf], 16 * in_w / out_w,
-                    vp9_sub_pel_filters_8[y_q4 & 0xf], 16 * in_h / out_h,
-                    16, 16);
-
-      x_q4 >>= 1;
-      y_q4 >>= 1;
-      src_stride = src_fb->uv_stride;
-      dst_stride = dst_fb->uv_stride;
-
-      src = src_fb->u_buffer +
-          y / 2 * in_h / out_h * src_fb->uv_stride +
-          x / 2 * in_w / out_w;
-      dst = dst_fb->u_buffer +
-          y / 2 * dst_fb->uv_stride +
-          x / 2;
-      vp9_convolve8(src, src_stride, dst, dst_stride,
-                    vp9_sub_pel_filters_8[x_q4 & 0xf], 16 * in_w / out_w,
-                    vp9_sub_pel_filters_8[y_q4 & 0xf], 16 * in_h / out_h,
-                    8, 8);
-
-      src = src_fb->v_buffer +
-          y / 2 * in_h / out_h * src_fb->uv_stride +
-          x / 2 * in_w / out_w;
-      dst = dst_fb->v_buffer +
-          y / 2 * dst_fb->uv_stride +
-          x / 2;
-      vp9_convolve8(src, src_stride, dst, dst_stride,
-                    vp9_sub_pel_filters_8[x_q4 & 0xf], 16 * in_w / out_w,
-                    vp9_sub_pel_filters_8[y_q4 & 0xf], 16 * in_h / out_h,
-                    8, 8);
+        vp9_convolve8(src, src_stride, dst, dst_stride,
+                      vp9_sub_pel_filters_8[x_q4 & 0xf], 16 * in_w / out_w,
+                      vp9_sub_pel_filters_8[y_q4 & 0xf], 16 * in_h / out_h,
+                      16 / factor, 16 / factor);
+      }
     }
   }
 
@@ -2570,20 +2482,6 @@
   }
 }
 
-#if CONFIG_COMP_INTERINTRA_PRED
-static void select_interintra_mode(VP9_COMP *cpi) {
-  static const double threshold = 0.01;
-  VP9_COMMON *cm = &cpi->common;
-  // FIXME(debargha): Make this RD based
-  int sum = cpi->interintra_select_count[1] + cpi->interintra_select_count[0];
-  if (sum) {
-    double fraction = (double) cpi->interintra_select_count[1] / sum;
-    // printf("fraction: %f\n", fraction);
-    cm->use_interintra = (fraction > threshold);
-  }
-}
-#endif
-
 static void scale_references(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
   int i;
@@ -2595,9 +2493,10 @@
         ref->y_crop_height != cm->height) {
       int new_fb = get_free_fb(cm);
 
-      vp8_yv12_realloc_frame_buffer(&cm->yv12_fb[new_fb],
-                                    cm->width, cm->height,
-                                    VP9BORDERINPIXELS);
+      vp9_realloc_frame_buffer(&cm->yv12_fb[new_fb],
+                               cm->width, cm->height,
+                               cm->subsampling_x, cm->subsampling_y,
+                               VP9BORDERINPIXELS);
       scale_and_extend_frame(ref, &cm->yv12_fb[new_fb]);
       cpi->scaled_ref_idx[i] = new_fb;
     } else {
@@ -2652,11 +2551,7 @@
 
   /* list of filters to search over */
   int mcomp_filters_to_search[] = {
-#if CONFIG_ENABLE_6TAP
-      EIGHTTAP, EIGHTTAP_SHARP, SIXTAP, SWITCHABLE
-#else
-      EIGHTTAP, EIGHTTAP_SHARP, EIGHTTAP_SMOOTH, SWITCHABLE
-#endif
+    EIGHTTAP, EIGHTTAP_SHARP, EIGHTTAP_SMOOTH, SWITCHABLE
   };
   int mcomp_filters = sizeof(mcomp_filters_to_search) /
       sizeof(*mcomp_filters_to_search);
@@ -2944,12 +2839,6 @@
     set_mvcost(&cpi->mb);
   }
 
-#if CONFIG_COMP_INTERINTRA_PRED
-  if (cm->current_video_frame == 0) {
-    cm->use_interintra = 1;
-  }
-#endif
-
 #if CONFIG_POSTPROC
 
   if (cpi->oxcf.noise_sensitivity > 0) {
@@ -3323,9 +3212,6 @@
   if (!cpi->common.error_resilient_mode &&
       !cpi->common.frame_parallel_decoding_mode) {
     vp9_adapt_coef_probs(&cpi->common);
-#if CONFIG_CODE_ZEROGROUP
-    vp9_adapt_zpc_probs(&cpi->common);
-#endif
   }
 
   if (cpi->common.frame_type != KEY_FRAME) {
@@ -3333,17 +3219,8 @@
     vp9_copy(cpi->common.fc.ymode_counts, cpi->ymode_count);
     vp9_copy(cpi->common.fc.uv_mode_counts, cpi->y_uv_mode_count);
     vp9_copy(cpi->common.fc.bmode_counts, cpi->bmode_count);
-#if !CONFIG_SB8X8
-    vp9_copy(cpi->common.fc.i8x8_mode_counts, cpi->i8x8_mode_count);
-#endif
     vp9_copy(cpi->common.fc.sub_mv_ref_counts, cpi->sub_mv_ref_count);
-#if !CONFIG_SB8X8
-    vp9_copy(cpi->common.fc.mbsplit_counts, cpi->mbsplit_count);
-#endif
     vp9_copy(cpi->common.fc.partition_counts, cpi->partition_count);
-#if CONFIG_COMP_INTERINTRA_PRED
-    vp9_copy(cpi->common.fc.interintra_counts, cpi->interintra_count);
-#endif
     cpi->common.fc.NMVcount = cpi->NMVcount;
     if (!cpi->common.error_resilient_mode &&
         !cpi->common.frame_parallel_decoding_mode) {
@@ -3352,10 +3229,6 @@
       vp9_adapt_nmv_probs(&cpi->common, cpi->mb.e_mbd.allow_high_precision_mv);
     }
   }
-#if CONFIG_COMP_INTERINTRA_PRED
-  if (cm->frame_type != KEY_FRAME)
-    select_interintra_mode(cpi);
-#endif
 
   /* Move storing frame_type out of the above loop since it is also
    * needed in motion search besides loopfilter */
@@ -3708,6 +3581,15 @@
   struct vpx_usec_timer  timer;
   int                    res = 0;
 
+  if (!cpi->initial_width) {
+    // TODO(jkoleszar): Support 1/4 subsampling?
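+    // Infer chroma subsampling from the first frame's plane dimensions
+    // (nonzero when the chroma planes are smaller than luma).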
+    cm->subsampling_x = sd->uv_width < sd->y_width;
+    cm->subsampling_y = sd->uv_height < sd->y_height;
+    alloc_raw_frame_buffers(cpi);
+
+    cpi->initial_width = cm->width;
+    cpi->initial_height = cm->height;
+  }
   vpx_usec_timer_start(&timer);
   if (vp9_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, frame_flags,
                          cpi->active_map_enabled ? cpi->active_map : NULL))
@@ -3972,20 +3854,21 @@
   cm->frame_flags = *frame_flags;
 
   // Reset the frame pointers to the current frame size
-  vp8_yv12_realloc_frame_buffer(&cm->yv12_fb[cm->new_fb_idx],
-                                cm->width, cm->height,
-                                VP9BORDERINPIXELS);
+  vp9_realloc_frame_buffer(&cm->yv12_fb[cm->new_fb_idx],
+                           cm->width, cm->height,
+                           cm->subsampling_x, cm->subsampling_y,
+                           VP9BORDERINPIXELS);
 
   // Calculate scaling factors for each of the 3 available references
   for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) {
     if (cm->active_ref_idx[i] >= NUM_YV12_BUFFERS) {
       memset(&cm->active_ref_scale[i], 0, sizeof(cm->active_ref_scale[i]));
-      continue;
+    } else {
+      YV12_BUFFER_CONFIG *fb = &cm->yv12_fb[cm->active_ref_idx[i]];
+      vp9_setup_scale_factors_for_frame(&cm->active_ref_scale[i],
+                                        fb->y_crop_width, fb->y_crop_height,
+                                        cm->width, cm->height);
     }
-
-    vp9_setup_scale_factors_for_frame(&cm->active_ref_scale[i],
-                                      &cm->yv12_fb[cm->active_ref_idx[i]],
-                                      cm->width, cm->height);
   }
 
   vp9_setup_interp_filters(&cpi->mb.e_mbd, DEFAULT_INTERP_FILTER, cm);
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 9d1e984..aba4c0e 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -47,11 +47,7 @@
 
 #define KEY_FRAME_CONTEXT 5
 
-#if CONFIG_COMP_INTERINTRA_PRED
-#define MAX_MODES 54 - CONFIG_SB8X8
-#else
-#define MAX_MODES 42 - CONFIG_SB8X8
-#endif
+#define MAX_MODES 41
 
 #define MIN_THRESHMULT  32
 #define MAX_THRESHMULT  512
@@ -72,9 +68,6 @@
   // Stats
   int y_modes[VP9_YMODES];
   int uv_modes[VP9_UV_MODES];
-#if !CONFIG_SB8X8
-  int i8x8_modes[VP9_I8X8_MODES];
-#endif
   int b_modes[B_MODE_COUNT];
   int inter_y_modes[MB_MODE_COUNT];
   int inter_uv_modes[VP9_UV_MODES];
@@ -102,30 +95,15 @@
   vp9_prob ymode_prob[VP9_YMODES - 1]; /* interframe intra mode probs */
   vp9_prob uv_mode_prob[VP9_YMODES][VP9_UV_MODES - 1];
   vp9_prob bmode_prob[VP9_NKF_BINTRAMODES - 1];
-#if !CONFIG_SB8X8
-  vp9_prob i8x8_mode_prob[VP9_I8X8_MODES - 1];
-#endif
   vp9_prob sub_mv_ref_prob[SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
-#if !CONFIG_SB8X8
-  vp9_prob mbsplit_prob[VP9_NUMMBSPLITS - 1];
-#endif
   vp9_prob partition_prob[NUM_PARTITION_CONTEXTS][PARTITION_TYPES - 1];
 
   vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
                                  [VP9_SWITCHABLE_FILTERS - 1];
-#if CONFIG_COMP_INTERINTRA_PRED
-  vp9_prob interintra_prob;
-#endif
 
   int mv_ref_ct[INTER_MODE_CONTEXTS][4][2];
   int vp9_mode_contexts[INTER_MODE_CONTEXTS][4];
 
-#if CONFIG_CODE_ZEROGROUP
-  vp9_zpc_probs zpc_probs_4x4;
-  vp9_zpc_probs zpc_probs_8x8;
-  vp9_zpc_probs zpc_probs_16x16;
-  vp9_zpc_probs zpc_probs_32x32;
-#endif
 } CODING_CONTEXT;
 
 typedef struct {
@@ -213,9 +191,6 @@
   THR_SPLITA,
 
   THR_B_PRED,
-#if !CONFIG_SB8X8
-  THR_I8X8_PRED,
-#endif
 
   THR_COMP_ZEROLG,
   THR_COMP_NEARESTLG,
@@ -236,22 +211,6 @@
   THR_COMP_SPLITLG,
   THR_COMP_SPLITLA,
   THR_COMP_SPLITGA,
-#if CONFIG_COMP_INTERINTRA_PRED
-  THR_COMP_INTERINTRA_ZEROL,
-  THR_COMP_INTERINTRA_NEARESTL,
-  THR_COMP_INTERINTRA_NEARL,
-  THR_COMP_INTERINTRA_NEWL,
-
-  THR_COMP_INTERINTRA_ZEROG,
-  THR_COMP_INTERINTRA_NEARESTG,
-  THR_COMP_INTERINTRA_NEARG,
-  THR_COMP_INTERINTRA_NEWG,
-
-  THR_COMP_INTERINTRA_ZEROA,
-  THR_COMP_INTERINTRA_NEARESTA,
-  THR_COMP_INTERINTRA_NEARA,
-  THR_COMP_INTERINTRA_NEWA,
-#endif
 }
 THR_MODES;
 
@@ -281,22 +240,14 @@
 } SPEED_FEATURES;
 
 enum BlockSize {
-#if CONFIG_SB8X8
   BLOCK_4X4,
   BLOCK_4X8,
   BLOCK_8X4,
   BLOCK_8X8,
   BLOCK_8X16,
   BLOCK_16X8,
-#else
-  BLOCK_16X8 = PARTITIONING_16X8,
-  BLOCK_8X16 = PARTITIONING_8X16,
-  BLOCK_8X8 = PARTITIONING_8X8,
-  BLOCK_4X4 = PARTITIONING_4X4,
-#endif
   BLOCK_16X16,
-  BLOCK_MAX_SEGMENTS,
-  BLOCK_32X32 = BLOCK_MAX_SEGMENTS,
+  BLOCK_32X32,
   BLOCK_32X16,
   BLOCK_16X32,
   BLOCK_64X32,
@@ -307,17 +258,17 @@
 
 typedef struct VP9_COMP {
 
-  DECLARE_ALIGNED(16, short, Y1quant[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, unsigned char, Y1quant_shift[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, short, Y1zbin[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, short, Y1round[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, short, y_quant[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, unsigned char, y_quant_shift[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, short, y_zbin[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, short, y_round[QINDEX_RANGE][16]);
 
-  DECLARE_ALIGNED(16, short, UVquant[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, unsigned char, UVquant_shift[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, short, UVzbin[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, short, UVround[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, short, uv_quant[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, unsigned char, uv_quant_shift[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, short, uv_zbin[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, short, uv_round[QINDEX_RANGE][16]);
 
-  DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, short, zrun_zbin_boost_y[QINDEX_RANGE][16]);
   DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv[QINDEX_RANGE][16]);
 
   MACROBLOCK mb;
@@ -468,19 +419,9 @@
   int sb_ymode_count [VP9_I32X32_MODES];
   int ymode_count[VP9_YMODES];        /* intra MB type cts this frame */
   int bmode_count[VP9_NKF_BINTRAMODES];
-#if !CONFIG_SB8X8
-  int i8x8_mode_count[VP9_I8X8_MODES];
-#endif
   int sub_mv_ref_count[SUBMVREF_COUNT][VP9_SUBMVREFS];
-#if !CONFIG_SB8X8
-  int mbsplit_count[VP9_NUMMBSPLITS];
-#endif
   int y_uv_mode_count[VP9_YMODES][VP9_UV_MODES];
   unsigned int partition_count[NUM_PARTITION_CONTEXTS][PARTITION_TYPES];
-#if CONFIG_COMP_INTERINTRA_PRED
-  unsigned int interintra_count[2];
-  unsigned int interintra_select_count[2];
-#endif
 
   nmv_context_counts NMVcount;
 
@@ -525,7 +466,7 @@
   // for real time encoding
   int avg_encode_time;              // microsecond
   int avg_pick_mode_time;            // microsecond
-  int Speed;
+  int speed;
   unsigned int cpu_freq;           // Mhz
   int compressor_speed;
 
@@ -578,10 +519,10 @@
     unsigned int section_intra_rating;
     unsigned int next_iiratio;
     unsigned int this_iiratio;
-    FIRSTPASS_STATS *total_stats;
-    FIRSTPASS_STATS *this_frame_stats;
+    FIRSTPASS_STATS total_stats;
+    FIRSTPASS_STATS this_frame_stats;
     FIRSTPASS_STATS *stats_in, *stats_in_end, *stats_in_start;
-    FIRSTPASS_STATS *total_left_stats;
+    FIRSTPASS_STATS total_left_stats;
     int first_pass_done;
     int64_t bits_left;
     int64_t clip_bits_total;
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index 6cf99d6..aea350b 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -133,39 +133,6 @@
            pt_scan, 1);
 }
 
-#if !CONFIG_SB8X8
-void vp9_regular_quantize_b_8x8(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
-                                int y_blocks) {
-  MACROBLOCKD *const xd = &mb->e_mbd;
-  const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, b_idx);
-  const int *pt_scan = get_scan_8x8(tx_type);
-
-  quantize(mb->plane[pb_idx.plane].zrun_zbin_boost,
-           BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block, 16),
-           64, mb->skip_block,
-           mb->plane[pb_idx.plane].zbin,
-           mb->plane[pb_idx.plane].round,
-           mb->plane[pb_idx.plane].quant,
-           mb->plane[pb_idx.plane].quant_shift,
-           BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff, pb_idx.block, 16),
-           BLOCK_OFFSET(xd->plane[pb_idx.plane].dqcoeff, pb_idx.block, 16),
-           xd->plane[pb_idx.plane].dequant,
-           mb->plane[pb_idx.plane].zbin_extra,
-           &xd->plane[pb_idx.plane].eobs[pb_idx.block],
-           pt_scan, 1);
-}
-
-/* quantize_b_pair function pointer in MACROBLOCK structure is set to one of
- * these two C functions if corresponding optimized routine is not available.
- * NEON optimized version implements currently the fast quantization for pair
- * of blocks. */
-void vp9_regular_quantize_b_4x4_pair(MACROBLOCK *x, int b_idx1, int b_idx2,
-                                     int y_blocks) {
-  vp9_regular_quantize_b_4x4(x, b_idx1, DCT_DCT, y_blocks);
-  vp9_regular_quantize_b_4x4(x, b_idx2, DCT_DCT, y_blocks);
-}
-#endif
-
 static void invert_quant(int16_t *quant, uint8_t *shift, int d) {
   unsigned t;
   int l;
@@ -195,16 +162,17 @@
     }
     // dc values
     quant_val = vp9_dc_quant(q, cpi->common.y_dc_delta_q);
-    invert_quant(cpi->Y1quant[q] + 0, cpi->Y1quant_shift[q] + 0, quant_val);
-    cpi->Y1zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
-    cpi->Y1round[q][0] = (qrounding_factor * quant_val) >> 7;
+    invert_quant(cpi->y_quant[q] + 0, cpi->y_quant_shift[q] + 0, quant_val);
+    cpi->y_zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
+    cpi->y_round[q][0] = (qrounding_factor * quant_val) >> 7;
     cpi->common.y_dequant[q][0] = quant_val;
-    cpi->zrun_zbin_boost_y1[q][0] = (quant_val * zbin_boost[0]) >> 7;
+    cpi->zrun_zbin_boost_y[q][0] = (quant_val * zbin_boost[0]) >> 7;
+
 
     quant_val = vp9_dc_quant(q, cpi->common.uv_dc_delta_q);
-    invert_quant(cpi->UVquant[q] + 0, cpi->UVquant_shift[q] + 0, quant_val);
-    cpi->UVzbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
-    cpi->UVround[q][0] = (qrounding_factor * quant_val) >> 7;
+    invert_quant(cpi->uv_quant[q] + 0, cpi->uv_quant_shift[q] + 0, quant_val);
+    cpi->uv_zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
+    cpi->uv_round[q][0] = (qrounding_factor * quant_val) >> 7;
     cpi->common.uv_dequant[q][0] = quant_val;
     cpi->zrun_zbin_boost_uv[q][0] = (quant_val * zbin_boost[0]) >> 7;
 
@@ -216,16 +184,16 @@
     for (i = 1; i < 16; i++) {
       int rc = vp9_default_zig_zag1d_4x4[i];
 
-      invert_quant(cpi->Y1quant[q] + rc, cpi->Y1quant_shift[q] + rc, quant_val);
-      cpi->Y1zbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
-      cpi->Y1round[q][rc] = (qrounding_factor * quant_val) >> 7;
-      cpi->zrun_zbin_boost_y1[q][i] =
+      invert_quant(cpi->y_quant[q] + rc, cpi->y_quant_shift[q] + rc, quant_val);
+      cpi->y_zbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
+      cpi->y_round[q][rc] = (qrounding_factor * quant_val) >> 7;
+      cpi->zrun_zbin_boost_y[q][i] =
           ROUND_POWER_OF_TWO(quant_val * zbin_boost[i], 7);
 
-      invert_quant(cpi->UVquant[q] + rc, cpi->UVquant_shift[q] + rc,
+      invert_quant(cpi->uv_quant[q] + rc, cpi->uv_quant_shift[q] + rc,
         quant_uv_val);
-      cpi->UVzbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_uv_val, 7);
-      cpi->UVround[q][rc] = (qrounding_factor * quant_uv_val) >> 7;
+      cpi->uv_zbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_uv_val, 7);
+      cpi->uv_round[q][rc] = (qrounding_factor * quant_uv_val) >> 7;
       cpi->zrun_zbin_boost_uv[q][i] =
           ROUND_POWER_OF_TWO(quant_uv_val * zbin_boost[i], 7);
     }
@@ -243,11 +211,11 @@
   zbin_extra = (cpi->common.y_dequant[qindex][1] *
                  (cpi->zbin_mode_boost + x->act_zbin_adj)) >> 7;
 
-  x->plane[0].quant = cpi->Y1quant[qindex];
-  x->plane[0].quant_shift = cpi->Y1quant_shift[qindex];
-  x->plane[0].zbin = cpi->Y1zbin[qindex];
-  x->plane[0].round = cpi->Y1round[qindex];
-  x->plane[0].zrun_zbin_boost = cpi->zrun_zbin_boost_y1[qindex];
+  x->plane[0].quant = cpi->y_quant[qindex];
+  x->plane[0].quant_shift = cpi->y_quant_shift[qindex];
+  x->plane[0].zbin = cpi->y_zbin[qindex];
+  x->plane[0].round = cpi->y_round[qindex];
+  x->plane[0].zrun_zbin_boost = cpi->zrun_zbin_boost_y[qindex];
   x->plane[0].zbin_extra = (int16_t)zbin_extra;
   x->e_mbd.plane[0].dequant = cpi->common.y_dequant[qindex];
 
@@ -256,10 +224,10 @@
                 (cpi->zbin_mode_boost + x->act_zbin_adj)) >> 7;
 
   for (i = 1; i < 3; i++) {
-    x->plane[i].quant = cpi->UVquant[qindex];
-    x->plane[i].quant_shift = cpi->UVquant_shift[qindex];
-    x->plane[i].zbin = cpi->UVzbin[qindex];
-    x->plane[i].round = cpi->UVround[qindex];
+    x->plane[i].quant = cpi->uv_quant[qindex];
+    x->plane[i].quant_shift = cpi->uv_quant_shift[qindex];
+    x->plane[i].zbin = cpi->uv_zbin[qindex];
+    x->plane[i].round = cpi->uv_round[qindex];
     x->plane[i].zrun_zbin_boost = cpi->zrun_zbin_boost_uv[qindex];
     x->plane[i].zbin_extra = (int16_t)zbin_extra;
     x->e_mbd.plane[i].dequant = cpi->common.uv_dequant[qindex];
@@ -296,10 +264,6 @@
 
   cm->base_qindex = Q;
 
-  // Set lossless mode
-  if (cm->base_qindex <= 4)
-    cm->base_qindex = 0;
-
   // if any of the delta_q values are changing update flag will
   // have to be set.
   cm->y_dc_delta_q = 0;
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 42d339d..10a3b2e 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -138,13 +138,7 @@
   vp9_copy(cc->sb_ymode_prob, cm->fc.sb_ymode_prob);
   vp9_copy(cc->bmode_prob, cm->fc.bmode_prob);
   vp9_copy(cc->uv_mode_prob, cm->fc.uv_mode_prob);
-#if !CONFIG_SB8X8
-  vp9_copy(cc->i8x8_mode_prob, cm->fc.i8x8_mode_prob);
-#endif
   vp9_copy(cc->sub_mv_ref_prob, cm->fc.sub_mv_ref_prob);
-#if !CONFIG_SB8X8
-  vp9_copy(cc->mbsplit_prob, cm->fc.mbsplit_prob);
-#endif
   vp9_copy(cc->partition_prob, cm->fc.partition_prob);
 
   // Stats
@@ -173,15 +167,6 @@
   vp9_copy(cc->coef_probs_16x16, cm->fc.coef_probs_16x16);
   vp9_copy(cc->coef_probs_32x32, cm->fc.coef_probs_32x32);
   vp9_copy(cc->switchable_interp_prob, cm->fc.switchable_interp_prob);
-#if CONFIG_COMP_INTERINTRA_PRED
-  cc->interintra_prob = cm->fc.interintra_prob;
-#endif
-#if CONFIG_CODE_ZEROGROUP
-  vp9_copy(cc->zpc_probs_4x4, cm->fc.zpc_probs_4x4);
-  vp9_copy(cc->zpc_probs_8x8, cm->fc.zpc_probs_8x8);
-  vp9_copy(cc->zpc_probs_16x16, cm->fc.zpc_probs_16x16);
-  vp9_copy(cc->zpc_probs_32x32, cm->fc.zpc_probs_32x32);
-#endif
 }
 
 void vp9_restore_coding_context(VP9_COMP *cpi) {
@@ -202,14 +187,8 @@
   vp9_copy(cm->fc.ymode_prob, cc->ymode_prob);
   vp9_copy(cm->fc.sb_ymode_prob, cc->sb_ymode_prob);
   vp9_copy(cm->fc.bmode_prob, cc->bmode_prob);
-#if !CONFIG_SB8X8
-  vp9_copy(cm->fc.i8x8_mode_prob, cc->i8x8_mode_prob);
-#endif
   vp9_copy(cm->fc.uv_mode_prob, cc->uv_mode_prob);
   vp9_copy(cm->fc.sub_mv_ref_prob, cc->sub_mv_ref_prob);
-#if !CONFIG_SB8X8
-  vp9_copy(cm->fc.mbsplit_prob, cc->mbsplit_prob);
-#endif
   vp9_copy(cm->fc.partition_prob, cc->partition_prob);
 
   // Stats
@@ -239,15 +218,6 @@
   vp9_copy(cm->fc.coef_probs_16x16, cc->coef_probs_16x16);
   vp9_copy(cm->fc.coef_probs_32x32, cc->coef_probs_32x32);
   vp9_copy(cm->fc.switchable_interp_prob, cc->switchable_interp_prob);
-#if CONFIG_COMP_INTERINTRA_PRED
-  cm->fc.interintra_prob = cc->interintra_prob;
-#endif
-#if CONFIG_CODE_ZEROGROUP
-  vp9_copy(cm->fc.zpc_probs_4x4, cc->zpc_probs_4x4);
-  vp9_copy(cm->fc.zpc_probs_8x8, cc->zpc_probs_8x8);
-  vp9_copy(cm->fc.zpc_probs_16x16, cc->zpc_probs_16x16);
-  vp9_copy(cm->fc.zpc_probs_32x32, cc->zpc_probs_32x32);
-#endif
 }
 
 void vp9_setup_key_frame(VP9_COMP *cpi) {
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 439905f..765a071 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -46,26 +46,6 @@
 /* Factor to weigh the rate for switchable interp filters */
 #define SWITCHABLE_INTERP_RATE_FACTOR 1
 
-static const int auto_speed_thresh[17] = {
-  1000,
-  200,
-  150,
-  130,
-  150,
-  125,
-  120,
-  115,
-  115,
-  115,
-  115,
-  115,
-  115,
-  115,
-  115,
-  115,
-  105
-};
-
 const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
   {ZEROMV,    LAST_FRAME,   NONE},
   {DC_PRED,   INTRA_FRAME,  NONE},
@@ -102,9 +82,6 @@
   {SPLITMV,   ALTREF_FRAME, NONE},
 
   {I4X4_PRED,    INTRA_FRAME,  NONE},
-#if !CONFIG_SB8X8
-  {I8X8_PRED, INTRA_FRAME,  NONE},
-#endif
 
   /* compound prediction modes */
   {ZEROMV,    LAST_FRAME,   GOLDEN_FRAME},
@@ -126,24 +103,6 @@
   {SPLITMV,   LAST_FRAME,   GOLDEN_FRAME},
   {SPLITMV,   ALTREF_FRAME, LAST_FRAME  },
   {SPLITMV,   GOLDEN_FRAME, ALTREF_FRAME},
-
-#if CONFIG_COMP_INTERINTRA_PRED
-  /* compound inter-intra prediction */
-  {ZEROMV,    LAST_FRAME,   INTRA_FRAME},
-  {NEARESTMV, LAST_FRAME,   INTRA_FRAME},
-  {NEARMV,    LAST_FRAME,   INTRA_FRAME},
-  {NEWMV,     LAST_FRAME,   INTRA_FRAME},
-
-  {ZEROMV,    GOLDEN_FRAME,   INTRA_FRAME},
-  {NEARESTMV, GOLDEN_FRAME,   INTRA_FRAME},
-  {NEARMV,    GOLDEN_FRAME,   INTRA_FRAME},
-  {NEWMV,     GOLDEN_FRAME,   INTRA_FRAME},
-
-  {ZEROMV,    ALTREF_FRAME,   INTRA_FRAME},
-  {NEARESTMV, ALTREF_FRAME,   INTRA_FRAME},
-  {NEARMV,    ALTREF_FRAME,   INTRA_FRAME},
-  {NEWMV,     ALTREF_FRAME,   INTRA_FRAME},
-#endif
 };
 
 static void fill_token_costs(vp9_coeff_count *c,
@@ -308,24 +267,12 @@
   ENTROPY_CONTEXT above_ec, left_ec;
   TX_TYPE tx_type = DCT_DCT;
 
-#if CONFIG_CODE_ZEROGROUP
-  int last_nz_pos[3] = {-1, -1, -1};  // Encoder only
-  int is_eoo_list[3] = {0, 0, 0};
-  int is_eoo_negative[3] = {0, 0, 0};
-  int is_last_zero[3] = {0, 0, 0};
-  int o, rc, skip_coef_val;
-  vp9_zpc_probs *zpc_probs;
-  uint8_t token_cache_full[1024];
-#endif
   const int segment_id = xd->mode_info_context->mbmi.segment_id;
   vp9_prob (*coef_probs)[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
                         [ENTROPY_NODES];
   int seg_eob, default_eob;
   uint8_t token_cache[1024];
-
-#if CONFIG_CODE_ZEROGROUP
-  vpx_memset(token_cache, UNKNOWN_TOKEN, sizeof(token_cache));
-#endif
+  const uint8_t * band_translate;
 
   // Check for consistency of tx_size with mode info
   assert((!type && !plane) || (type && plane));
@@ -345,9 +292,7 @@
       coef_probs = cm->fc.coef_probs_4x4;
       seg_eob = 16;
       scan = get_scan_4x4(tx_type);
-#if CONFIG_CODE_ZEROGROUP
-      zpc_probs = &cm->fc.zpc_probs_4x4;
-#endif
+      band_translate = vp9_coefband_trans_4x4;
       break;
     }
     case TX_8X8: {
@@ -361,9 +306,7 @@
       scan = get_scan_8x8(tx_type);
       coef_probs = cm->fc.coef_probs_8x8;
       seg_eob = 64;
-#if CONFIG_CODE_ZEROGROUP
-      zpc_probs = &cm->fc.zpc_probs_8x8;
-#endif
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
     }
     case TX_16X16: {
@@ -377,9 +320,7 @@
       seg_eob = 256;
       above_ec = (A[0] + A[1] + A[2] + A[3]) != 0;
       left_ec = (L[0] + L[1] + L[2] + L[3]) != 0;
-#if CONFIG_CODE_ZEROGROUP
-      zpc_probs = &cm->fc.zpc_probs_16x16;
-#endif
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
     }
     case TX_32X32:
@@ -388,10 +329,7 @@
       seg_eob = 1024;
       above_ec = (A[0] + A[1] + A[2] + A[3] + A[4] + A[5] + A[6] + A[7]) != 0;
       left_ec = (L[0] + L[1] + L[2] + L[3] + L[4] + L[5] + L[6] + L[7]) != 0;
-
-#if CONFIG_CODE_ZEROGROUP
-      zpc_probs = &cm->fc.zpc_probs_32x32;
-#endif
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
     default:
       abort();
@@ -410,100 +348,25 @@
   if (eob < seg_eob)
     assert(qcoeff_ptr[scan[eob]] == 0);
 
-#if CONFIG_CODE_ZEROGROUP
-  vpx_memset(token_cache_full, ZERO_TOKEN, sizeof(token_cache_full));
-  for (c = 0; c < eob; ++c) {
-    rc = scan[c];
-    token_cache_full[rc] = vp9_dct_value_tokens_ptr[qcoeff_ptr[rc]].token;
-    o = vp9_get_orientation(rc, tx_size);
-    if (qcoeff_ptr[rc] != 0)
-      last_nz_pos[o] = c;
-  }
-#endif
   {
     for (c = 0; c < eob; c++) {
       int v = qcoeff_ptr[scan[c]];
       int t = vp9_dct_value_tokens_ptr[v].token;
-      int band = get_coef_band(scan, tx_size, c);
+      int band = get_coef_band(band_translate, c);
       if (c)
         pt = vp9_get_coef_context(scan, nb, pad, token_cache, c, default_eob);
-#if CONFIG_CODE_ZEROGROUP
-      rc = scan[c];
-      o = vp9_get_orientation(rc, tx_size);
-      skip_coef_val = (token_cache[rc] == ZERO_TOKEN || is_eoo_list[o]);
-      if (!skip_coef_val) {
-        cost += token_costs[band][pt][t] + vp9_dct_value_cost_ptr[v];
-      } else {
-        assert(v == 0);
-      }
-#else
+
       cost += token_costs[band][pt][t] + vp9_dct_value_cost_ptr[v];
-#endif
+
       if (!c || token_cache[scan[c - 1]])
         cost += vp9_cost_bit(coef_probs[type][ref][band][pt][0], 1);
       token_cache[scan[c]] = t;
-#if CONFIG_CODE_ZEROGROUP
-      if (t == ZERO_TOKEN && !skip_coef_val) {
-        int eoo = 0, use_eoo;
-#if USE_ZPC_EOORIENT == 1
-        use_eoo = vp9_use_eoo(c, seg_eob, scan, tx_size,
-                              is_last_zero, is_eoo_list);
-#else
-        use_eoo = 0;
-#endif
-        if (use_eoo) {
-          eoo = vp9_is_eoo(c, eob, scan, tx_size, qcoeff_ptr, last_nz_pos);
-          if (eoo && is_eoo_negative[o]) eoo = 0;
-          if (eoo) {
-            int c_;
-            int savings = 0;
-            int zsaved = 0;
-            savings = vp9_cost_bit((*zpc_probs)[ref]
-                                   [coef_to_zpc_band(band)]
-                                   [coef_to_zpc_ptok(pt)][0], 1) -
-                      vp9_cost_bit((*zpc_probs)[ref]
-                                   [coef_to_zpc_band(band)]
-                                   [coef_to_zpc_ptok(pt)][0], 0);
-            for (c_ = c + 1; c_ < eob; ++c_) {
-              if (o == vp9_get_orientation(scan[c_], tx_size)) {
-                int pt_ = vp9_get_coef_context(scan, nb, pad,
-                                               token_cache_full, c_,
-                                               default_eob);
-                int band_ = get_coef_band(scan, tx_size, c_);
-                assert(token_cache_full[scan[c_]] == ZERO_TOKEN);
-                if (!c_ || token_cache_full[scan[c_ - 1]])
-                  savings += vp9_cost_bit(
-                      coef_probs[type][ref][band_][pt_][0], 1);
-                savings += vp9_cost_bit(
-                    coef_probs[type][ref][band_][pt_][1], 0);
-                zsaved++;
-              }
-            }
-            if (savings < 0) {
-            // if (zsaved < ZPC_ZEROSSAVED_EOO) {
-              eoo = 0;
-              is_eoo_negative[o] = 1;
-            }
-          }
-        }
-        if (use_eoo) {
-          cost += vp9_cost_bit((*zpc_probs)[ref]
-                                           [coef_to_zpc_band(band)]
-                                           [coef_to_zpc_ptok(pt)][0], !eoo);
-          if (eoo) {
-            assert(is_eoo_list[o] == 0);
-            is_eoo_list[o] = 1;
-          }
-        }
-      }
-      is_last_zero[o] = (t == ZERO_TOKEN);
-#endif
     }
     if (c < seg_eob) {
       if (c)
         pt = vp9_get_coef_context(scan, nb, pad, token_cache, c, default_eob);
       cost += mb->token_costs[tx_size][type][ref]
-          [get_coef_band(scan, tx_size, c)]
+          [get_coef_band(band_translate, c)]
           [pt][DCT_EOB_TOKEN];
     }
   }
@@ -703,10 +566,7 @@
 
   choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skip, txfm_cache,
                            TX_32X32 - (bs < BLOCK_SIZE_SB32X32)
-#if CONFIG_SB8X8
-                           - (bs < BLOCK_SIZE_MB16X16)
-#endif
-                           );
+                           - (bs < BLOCK_SIZE_MB16X16));
 }
 
 static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
@@ -724,39 +584,23 @@
   const int src_stride = x->plane[0].src.stride;
   uint8_t* const src =
       raster_block_offset_uint8(xd,
-#if CONFIG_SB8X8
                                 BLOCK_SIZE_SB8X8,
-#else
-                                BLOCK_SIZE_MB16X16,
-#endif
                                 0, ib,
                                 x->plane[0].src.buf, src_stride);
   int16_t* const src_diff =
       raster_block_offset_int16(xd,
-#if CONFIG_SB8X8
                                 BLOCK_SIZE_SB8X8,
-#else
-                                BLOCK_SIZE_MB16X16,
-#endif
                                 0, ib,
                                 x->plane[0].src_diff);
   int16_t* const diff =
       raster_block_offset_int16(xd,
-#if CONFIG_SB8X8
                                 BLOCK_SIZE_SB8X8,
-#else
-                                BLOCK_SIZE_MB16X16,
-#endif
                                 0, ib,
                                 xd->plane[0].diff);
   int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, ib, 16);
   uint8_t* const dst =
       raster_block_offset_uint8(xd,
-#if CONFIG_SB8X8
                                 BLOCK_SIZE_SB8X8,
-#else
-                                BLOCK_SIZE_MB16X16,
-#endif
                                 0, ib,
                                 xd->plane[0].dst.buf, xd->plane[0].dst.stride);
   ENTROPY_CONTEXT ta = *a, tempa = *a;
@@ -770,52 +614,30 @@
    * */
   DECLARE_ALIGNED_ARRAY(16, int16_t, best_dqcoeff, 16);
 
-  assert(ib < (16 >> (2 * CONFIG_SB8X8)));
-#if CONFIG_NEWBINTRAMODES
-  xd->mode_info_context->bmi[ib].as_mode.context =
-    vp9_find_bpred_context(xd, ib, dst, xd->plane[0].dst.stride);
-#endif
+  assert(ib < 4);
+
   xd->mode_info_context->mbmi.txfm_size = TX_4X4;
   for (mode = B_DC_PRED; mode < LEFT4X4; mode++) {
     int64_t this_rd;
     int ratey;
 
-#if CONFIG_NEWBINTRAMODES
-    if (xd->frame_type == KEY_FRAME) {
-      if (mode == B_CONTEXT_PRED) continue;
-    } else {
-      if (mode >= B_CONTEXT_PRED - CONTEXT_PRED_REPLACEMENTS &&
-          mode < B_CONTEXT_PRED)
-        continue;
-    }
-#endif
-
     xd->mode_info_context->bmi[ib].as_mode.first = mode;
-#if CONFIG_NEWBINTRAMODES
-    rate = bmode_costs[
-        mode == B_CONTEXT_PRED ? mode - CONTEXT_PRED_REPLACEMENTS : mode];
-#else
     rate = bmode_costs[mode];
-#endif
 
     vp9_intra4x4_predict(xd, ib,
-#if CONFIG_SB8X8
                          BLOCK_SIZE_SB8X8,
-#else
-                         BLOCK_SIZE_MB16X16,
-#endif
                          mode, dst, xd->plane[0].dst.stride);
-    vp9_subtract_block(4, 4, src_diff, 16 >> CONFIG_SB8X8,
+    vp9_subtract_block(4, 4, src_diff, 8,
                        src, src_stride,
                        dst, xd->plane[0].dst.stride);
 
     xd->mode_info_context->bmi[ib].as_mode.first = mode;
     tx_type = get_tx_type_4x4(xd, ib);
     if (tx_type != DCT_DCT) {
-      vp9_short_fht4x4(src_diff, coeff, 16 >> CONFIG_SB8X8, tx_type);
+      vp9_short_fht4x4(src_diff, coeff, 8, tx_type);
       x->quantize_b_4x4(x, ib, tx_type, 16);
     } else {
-      x->fwd_txm4x4(src_diff, coeff, 32 >> CONFIG_SB8X8);
+      x->fwd_txm4x4(src_diff, coeff, 16);
       x->quantize_b_4x4(x, ib, tx_type, 16);
     }
 
@@ -848,19 +670,15 @@
 
   // inverse transform
   if (best_tx_type != DCT_DCT)
-    vp9_short_iht4x4(best_dqcoeff, diff, 16 >> CONFIG_SB8X8, best_tx_type);
+    vp9_short_iht4x4(best_dqcoeff, diff, 8, best_tx_type);
   else
-    xd->inv_txm4x4(best_dqcoeff, diff, 32 >> CONFIG_SB8X8);
+    xd->inv_txm4x4(best_dqcoeff, diff, 16);
 
   vp9_intra4x4_predict(xd, ib,
-#if CONFIG_SB8X8
                        BLOCK_SIZE_SB8X8,
-#else
-                       BLOCK_SIZE_MB16X16,
-#endif
                        *best_mode,
                        dst, xd->plane[0].dst.stride);
-  vp9_recon_b(dst, diff, 16 >> CONFIG_SB8X8,
+  vp9_recon_b(dst, diff, 8,
               dst, xd->plane[0].dst.stride);
 
   return best_rd;
@@ -875,7 +693,7 @@
   int distortion = 0;
   int tot_rate_y = 0;
   int64_t total_rd = 0;
-  ENTROPY_CONTEXT t_above[4 >> CONFIG_SB8X8], t_left[4 >> CONFIG_SB8X8];
+  ENTROPY_CONTEXT t_above[2], t_left[2];
   int *bmode_costs;
 
   vpx_memcpy(t_above, xd->plane[0].above_context, sizeof(t_above));
@@ -884,24 +702,12 @@
   xd->mode_info_context->mbmi.mode = I4X4_PRED;
   bmode_costs = mb->inter_bmode_costs;
 
-  for (i = 0; i < (16 >> (2 * CONFIG_SB8X8)); i++) {
-    const int x_idx = i & (3 >> CONFIG_SB8X8), y_idx = i >> (2 >> CONFIG_SB8X8);
+  for (i = 0; i < 4; i++) {
+    const int x_idx = i & 1, y_idx = i >> 1;
     MODE_INFO *const mic = xd->mode_info_context;
     const int mis = xd->mode_info_stride;
     B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
     int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry), UNINITIALIZED_IS_SAFE(d);
-#if CONFIG_NEWBINTRAMODES
-    uint8_t* const dst =
-        raster_block_offset_uint8(xd,
-#if CONFIG_SB8X8
-                                  BLOCK_SIZE_SB8X8,
-#else
-                                  BLOCK_SIZE_MB16X16,
-#endif
-                                  0, i,
-                                  xd->plane[0].dst.buf,
-                                  xd->plane[0].dst.stride);
-#endif
 
     if (xd->frame_type == KEY_FRAME) {
       const B_PREDICTION_MODE A = above_block_mode(mic, i, mis);
@@ -909,10 +715,6 @@
 
       bmode_costs  = mb->bmode_costs[A][L];
     }
-#if CONFIG_NEWBINTRAMODES
-    mic->bmi[i].as_mode.context = vp9_find_bpred_context(xd, i, dst,
-        xd->plane[0].dst.stride);
-#endif
 
     total_rd += rd_pick_intra4x4block(cpi, mb, i, &best_mode, bmode_costs,
                                       t_above + x_idx, t_left + y_idx,
@@ -924,10 +726,6 @@
 
     mic->bmi[i].as_mode.first = best_mode;
 
-#if 0  // CONFIG_NEWBINTRAMODES
-    printf("%d %d\n", mic->bmi[i].as_mode.first, mic->bmi[i].as_mode.context);
-#endif
-
     if (total_rd >= best_rd)
       break;
   }
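
With the partition fixed at four 4x4 blocks, the loop index maps onto a 2x2 grid; x_idx and y_idx select the entry of the two-element t_above/t_left entropy-context arrays copied in above. A small sketch of the mapping:

/* i = 0,1,2,3  ->  (column, row) = (0,0), (1,0), (0,1), (1,1) */
static void block_index_to_xy_sketch(int i, int *x_idx, int *y_idx) {
  *x_idx = i & 1;
  *y_idx = i >> 1;
}
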
@@ -995,246 +793,6 @@
   return best_rd;
 }
 
-#if !CONFIG_SB8X8
-static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
-                                     B_PREDICTION_MODE *best_mode,
-                                     int *mode_costs,
-                                     ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
-                                     int *bestrate, int *bestratey,
-                                     int *bestdistortion) {
-  VP9_COMMON *const cm = &cpi->common;
-  MB_PREDICTION_MODE mode;
-  MACROBLOCKD *xd = &x->e_mbd;
-  int64_t best_rd = INT64_MAX;
-  int distortion = 0, rate = 0;
-  ENTROPY_CONTEXT ta[2], tl[2], ta_temp[2], tl_temp[2];
-  // perform transformation of dimension 8x8
-  // note the input and output index mapping
-  int idx = (ib & 0x02) ? (ib + 2) : ib;
-  const int src_stride = x->plane[0].src.stride;
-  uint8_t* const src =
-      raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, ib,
-                                x->plane[0].src.buf, src_stride);
-  int16_t* const src_diff =
-      raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, ib,
-                                x->plane[0].src_diff);
-  int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, idx, 16);
-  uint8_t* const dst =
-      raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, ib,
-                                xd->plane[0].dst.buf, xd->plane[0].dst.stride);
-
-  assert(ib < 16);
-  vpx_memcpy(ta, a, sizeof(ta));
-  vpx_memcpy(tl, l, sizeof(tl));
-  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
-    int64_t this_rd;
-    int rate_t = 0;
-
-    // FIXME rate for compound mode and second intrapred mode
-    rate = mode_costs[mode];
-    xd->mode_info_context->bmi[ib].as_mode.first = mode;
-
-    vp9_intra8x8_predict(xd, ib, mode, dst, xd->plane[0].dst.stride);
-
-    vp9_subtract_block(8, 8, src_diff, 16,
-                       src, src_stride,
-                       dst, xd->plane[0].dst.stride);
-
-    vpx_memcpy(ta_temp, ta, sizeof(ta));
-    vpx_memcpy(tl_temp, tl, sizeof(tl));
-
-    if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
-      TX_TYPE tx_type = get_tx_type_8x8(xd, ib);
-      if (tx_type != DCT_DCT)
-        vp9_short_fht8x8(src_diff, coeff, 16, tx_type);
-      else
-        x->fwd_txm8x8(src_diff, coeff, 32);
-      x->quantize_b_8x8(x, idx, tx_type, 16);
-
-      // compute quantization mse of 8x8 block
-      distortion = vp9_block_error_c(coeff,
-          BLOCK_OFFSET(xd->plane[0].dqcoeff, idx, 16), 64);
-
-      rate_t = cost_coeffs(cm, x, 0, idx, PLANE_TYPE_Y_WITH_DC,
-                           ta_temp, tl_temp, TX_8X8, 16);
-
-      rate += rate_t;
-    } else {
-      static const int iblock[4] = {0, 1, 4, 5};
-      TX_TYPE tx_type;
-      int i;
-
-      distortion = 0;
-      rate_t = 0;
-      for (i = 0; i < 4; ++i) {
-        int16_t* const src_diff =
-            raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16,
-                                      0, ib + iblock[i],
-                                      x->plane[0].src_diff);
-        int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff,
-                                            ib + iblock[i], 16);
-        int do_two = 0;
-        tx_type = get_tx_type_4x4(xd, ib + iblock[i]);
-        if (tx_type != DCT_DCT) {
-          vp9_short_fht4x4(src_diff, coeff, 16, tx_type);
-          x->quantize_b_4x4(x, ib + iblock[i], tx_type, 16);
-        } else if (!(i & 1) &&
-                   get_tx_type_4x4(xd, ib + iblock[i] + 1) == DCT_DCT) {
-          x->fwd_txm8x4(src_diff, coeff, 32);
-          x->quantize_b_4x4_pair(x, ib + iblock[i], ib + iblock[i] + 1, 16);
-          do_two = 1;
-        } else {
-          x->fwd_txm4x4(src_diff, coeff, 32);
-          x->quantize_b_4x4(x, ib + iblock[i], tx_type, 16);
-        }
-        distortion += vp9_block_error_c(coeff,
-            BLOCK_OFFSET(xd->plane[0].dqcoeff, ib + iblock[i], 16),
-            16 << do_two);
-        rate_t += cost_coeffs(cm, x, 0, ib + iblock[i], PLANE_TYPE_Y_WITH_DC,
-                              &ta_temp[i & 1], &tl_temp[i >> 1],
-                              TX_4X4, 16);
-        if (do_two) {
-          i++;
-          rate_t += cost_coeffs(cm, x, 0, ib + iblock[i], PLANE_TYPE_Y_WITH_DC,
-                                &ta_temp[i & 1], &tl_temp[i >> 1],
-                                TX_4X4, 16);
-        }
-      }
-      rate += rate_t;
-    }
-
-    distortion >>= 2;
-    this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
-    if (this_rd < best_rd) {
-      *bestrate = rate;
-      *bestratey = rate_t;
-      *bestdistortion = distortion;
-      vpx_memcpy(a, ta_temp, sizeof(ta_temp));
-      vpx_memcpy(l, tl_temp, sizeof(tl_temp));
-      best_rd = this_rd;
-      *best_mode = mode;
-    }
-  }
-  xd->mode_info_context->bmi[ib].as_mode.first = (*best_mode);
-  vp9_encode_intra8x8(x, ib);
-
-  return best_rd;
-}
-
-static int64_t rd_pick_intra8x8mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
-                                         int *Rate, int *rate_y,
-                                         int *Distortion, int64_t best_rd) {
-  MACROBLOCKD *const xd = &mb->e_mbd;
-  int i, ib;
-  int cost = mb->mbmode_cost [xd->frame_type] [I8X8_PRED];
-  int distortion = 0;
-  int tot_rate_y = 0;
-  int64_t total_rd = 0;
-  ENTROPY_CONTEXT t_above[4], t_left[4];
-  int *i8x8mode_costs;
-
-  vpx_memcpy(t_above, xd->plane[0].above_context, sizeof(t_above));
-  vpx_memcpy(t_left, xd->plane[0].left_context, sizeof(t_left));
-
-  xd->mode_info_context->mbmi.mode = I8X8_PRED;
-  i8x8mode_costs  = mb->i8x8_mode_costs;
-
-  for (i = 0; i < 4; i++) {
-    const int x_idx = i & 1, y_idx = i >> 1;
-    MODE_INFO *const mic = xd->mode_info_context;
-    B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
-    int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry), UNINITIALIZED_IS_SAFE(d);
-
-    ib = vp9_i8x8_block[i];
-    total_rd += rd_pick_intra8x8block(cpi, mb, ib, &best_mode, i8x8mode_costs,
-                                      t_above + x_idx * 2, t_left + y_idx * 2,
-                                      &r, &ry, &d);
-    cost += r;
-    distortion += d;
-    tot_rate_y += ry;
-    mic->bmi[ib].as_mode.first = best_mode;
-  }
-
-  *Rate = cost;
-  *rate_y = tot_rate_y;
-  *Distortion = distortion;
-  return RDCOST(mb->rdmult, mb->rddiv, cost, distortion);
-}
-
-static int64_t rd_pick_intra8x8mby_modes_and_txsz(VP9_COMP *cpi, MACROBLOCK *x,
-                                                  int *rate, int *rate_y,
-                                                  int *distortion,
-                                                  int *mode8x8,
-                                                  int64_t best_yrd,
-                                                  int64_t *txfm_cache) {
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
-  int cost0 = vp9_cost_bit(cm->prob_tx[0], 0);
-  int cost1 = vp9_cost_bit(cm->prob_tx[0], 1);
-  int64_t tmp_rd_4x4s, tmp_rd_8x8s;
-  int64_t tmp_rd_4x4, tmp_rd_8x8, tmp_rd;
-  int r4x4, tok4x4, d4x4, r8x8, tok8x8, d8x8;
-
-  mbmi->txfm_size = TX_4X4;
-  tmp_rd_4x4 = rd_pick_intra8x8mby_modes(cpi, x, &r4x4, &tok4x4,
-                                         &d4x4, best_yrd);
-  mode8x8[0] = xd->mode_info_context->bmi[0].as_mode.first;
-  mode8x8[1] = xd->mode_info_context->bmi[2].as_mode.first;
-  mode8x8[2] = xd->mode_info_context->bmi[8].as_mode.first;
-  mode8x8[3] = xd->mode_info_context->bmi[10].as_mode.first;
-  mbmi->txfm_size = TX_8X8;
-  tmp_rd_8x8 = rd_pick_intra8x8mby_modes(cpi, x, &r8x8, &tok8x8,
-                                         &d8x8, best_yrd);
-  txfm_cache[ONLY_4X4]  = tmp_rd_4x4;
-  txfm_cache[ALLOW_8X8] = tmp_rd_8x8;
-  txfm_cache[ALLOW_16X16] = tmp_rd_8x8;
-  tmp_rd_4x4s = tmp_rd_4x4 + RDCOST(x->rdmult, x->rddiv, cost0, 0);
-  tmp_rd_8x8s = tmp_rd_8x8 + RDCOST(x->rdmult, x->rddiv, cost1, 0);
-  txfm_cache[TX_MODE_SELECT] = tmp_rd_4x4s < tmp_rd_8x8s ?
-                               tmp_rd_4x4s : tmp_rd_8x8s;
-  if (cm->txfm_mode == TX_MODE_SELECT) {
-    if (tmp_rd_4x4s < tmp_rd_8x8s) {
-      *rate = r4x4 + cost0;
-      *rate_y = tok4x4 + cost0;
-      *distortion = d4x4;
-      mbmi->txfm_size = TX_4X4;
-      tmp_rd = tmp_rd_4x4s;
-    } else {
-      *rate = r8x8 + cost1;
-      *rate_y = tok8x8 + cost1;
-      *distortion = d8x8;
-      mbmi->txfm_size = TX_8X8;
-      tmp_rd = tmp_rd_8x8s;
-
-      mode8x8[0] = xd->mode_info_context->bmi[0].as_mode.first;
-      mode8x8[1] = xd->mode_info_context->bmi[2].as_mode.first;
-      mode8x8[2] = xd->mode_info_context->bmi[8].as_mode.first;
-      mode8x8[3] = xd->mode_info_context->bmi[10].as_mode.first;
-    }
-  } else if (cm->txfm_mode == ONLY_4X4) {
-    *rate = r4x4;
-    *rate_y = tok4x4;
-    *distortion = d4x4;
-    mbmi->txfm_size = TX_4X4;
-    tmp_rd = tmp_rd_4x4;
-  } else {
-    *rate = r8x8;
-    *rate_y = tok8x8;
-    *distortion = d8x8;
-    mbmi->txfm_size = TX_8X8;
-    tmp_rd = tmp_rd_8x8;
-
-    mode8x8[0] = xd->mode_info_context->bmi[0].as_mode.first;
-    mode8x8[1] = xd->mode_info_context->bmi[2].as_mode.first;
-    mode8x8[2] = xd->mode_info_context->bmi[8].as_mode.first;
-    mode8x8[3] = xd->mode_info_context->bmi[10].as_mode.first;
-  }
-
-  return tmp_rd;
-}
-#endif  // !CONFIG_SB8X8
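
Both the removed 8x8 intra path above and the surviving 4x4 path compare candidates with RDCOST(x->rdmult, x->rddiv, rate, distortion). The macro's exact fixed-point form is defined elsewhere and is not part of this diff; conceptually it is the usual Lagrangian combination, sketched here for reference:

/* Conceptual sketch only, not the real macro: the candidate (mode, transform
 * size, partitioning, ...) with the smallest J = lambda * R + D wins. */
static int64_t rd_cost_sketch(int64_t lambda, int rate, int distortion) {
  return lambda * rate + distortion;
}
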
-
 static void super_block_uvrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
                                       int *rate, int *distortion,
                                       int *skippable, BLOCK_SIZE_TYPE bsize,
@@ -1329,7 +887,6 @@
   x->e_mbd.mode_info_context->mbmi.mv[0].as_int = mv->as_int;
 }
 
-#if CONFIG_SB8X8
 static int labels2mode(MACROBLOCK *x,
                        int const *labelings, int which_label,
                        B_PREDICTION_MODE this_mode,
@@ -1415,13 +972,7 @@
              left_second_mv.as_int == this_second_mv->as_int))
           m = LEFT4X4;
       }
-
-#if CONFIG_NEWBINTRAMODES
-      cost = x->inter_bmode_costs[m == B_CONTEXT_PRED ?
-                                  m - CONTEXT_PRED_REPLACEMENTS : m];
-#else
       cost = x->inter_bmode_costs[m];
-#endif
     }
 
     mic->bmi[i].as_mv[0].as_int = this_mv->as_int;
@@ -1523,11 +1074,8 @@
   B_PREDICTION_MODE modes[4];
   int_mv mvs[4], second_mvs[4];
   int eobs[4];
-
   int mvthresh;
-  int *mdcounts;
 } BEST_SEG_INFO;
-#endif  // CONFIG_SB8X8
 
 static INLINE int mv_check_bounds(MACROBLOCK *x, int_mv *mv) {
   int r = 0;
@@ -1538,7 +1086,6 @@
   return r;
 }
 
-#if CONFIG_SB8X8
 static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
                                     BEST_SEG_INFO *bsi,
                                     int_mv seg_mvs[4][MAX_REF_FRAMES - 1]) {
@@ -1778,7 +1325,6 @@
                                        int_mv *best_ref_mv,
                                        int_mv *second_best_ref_mv,
                                        int64_t best_rd,
-                                       int *mdcounts,
                                        int *returntotrate,
                                        int *returnyrate,
                                        int *returndistortion,
@@ -1795,7 +1341,6 @@
   bsi.second_ref_mv = second_best_ref_mv;
   bsi.mvp.as_int = best_ref_mv->as_int;
   bsi.mvthresh = mvthresh;
-  bsi.mdcounts = mdcounts;
 
   for (i = 0; i < 4; i++)
     bsi.modes[i] = ZERO4X4;
@@ -1835,905 +1380,6 @@
   return (int)(bsi.segment_rd);
 }
 
-#else  // !CONFIG_SB8X8
-
-static int labels2mode(
-  MACROBLOCK *x,
-  int const *labelings, int which_label,
-  B_PREDICTION_MODE this_mode,
-  int_mv *this_mv, int_mv *this_second_mv,
-  int_mv seg_mvs[MAX_REF_FRAMES - 1],
-  int_mv *best_ref_mv,
-  int_mv *second_best_ref_mv,
-  int *mvjcost, int *mvcost[2], VP9_COMP *cpi) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MODE_INFO *const mic = xd->mode_info_context;
-  MB_MODE_INFO * mbmi = &mic->mbmi;
-  const int mis = xd->mode_info_stride;
-
-  int i, cost = 0, thismvcost = 0;
-
-  /* We have to be careful retrieving previously-encoded motion vectors.
-     Ones from this macroblock have to be pulled from the BLOCKD array
-     as they have not yet made it to the bmi array in our MB_MODE_INFO. */
-  for (i = 0; i < 16; ++i) {
-    const int row = i >> 2,  col = i & 3;
-
-    B_PREDICTION_MODE m;
-
-    if (labelings[i] != which_label)
-      continue;
-
-    if (col  &&  labelings[i] == labelings[i - 1])
-      m = LEFT4X4;
-    else if (row  &&  labelings[i] == labelings[i - 4])
-      m = ABOVE4X4;
-    else {
-      // the only time we should do costing for new motion vector or mode
-      // is when we are on a new label  (jbb May 08, 2007)
-      switch (m = this_mode) {
-        case NEW4X4 :
-          if (mbmi->second_ref_frame > 0) {
-            this_mv->as_int = seg_mvs[mbmi->ref_frame - 1].as_int;
-            this_second_mv->as_int =
-              seg_mvs[mbmi->second_ref_frame - 1].as_int;
-          }
-
-          thismvcost  = vp9_mv_bit_cost(this_mv, best_ref_mv, mvjcost, mvcost,
-                                        102, xd->allow_high_precision_mv);
-          if (mbmi->second_ref_frame > 0) {
-            thismvcost += vp9_mv_bit_cost(this_second_mv, second_best_ref_mv,
-                                          mvjcost, mvcost, 102,
-                                          xd->allow_high_precision_mv);
-          }
-          break;
-        case LEFT4X4:
-          this_mv->as_int = col ? mic->bmi[i - 1].as_mv[0].as_int :
-                                  left_block_mv(xd, mic, i);
-          if (mbmi->second_ref_frame > 0)
-            this_second_mv->as_int = col ? mic->bmi[i - 1].as_mv[1].as_int :
-                                           left_block_second_mv(xd, mic, i);
-          break;
-        case ABOVE4X4:
-          this_mv->as_int = row ? mic->bmi[i - 4].as_mv[0].as_int :
-                                  above_block_mv(mic, i, mis);
-          if (mbmi->second_ref_frame > 0)
-            this_second_mv->as_int = row ? mic->bmi[i - 4].as_mv[1].as_int :
-                                           above_block_second_mv(mic, i, mis);
-          break;
-        case ZERO4X4:
-          this_mv->as_int = 0;
-          if (mbmi->second_ref_frame > 0)
-            this_second_mv->as_int = 0;
-          break;
-        default:
-          break;
-      }
-
-      if (m == ABOVE4X4) { // replace above with left if same
-        int_mv left_mv, left_second_mv;
-
-        left_second_mv.as_int = 0;
-        left_mv.as_int = col ? mic->bmi[i - 1].as_mv[0].as_int :
-                         left_block_mv(xd, mic, i);
-        if (mbmi->second_ref_frame > 0)
-          left_second_mv.as_int = col ? mic->bmi[i - 1].as_mv[1].as_int :
-                                  left_block_second_mv(xd, mic, i);
-
-        if (left_mv.as_int == this_mv->as_int &&
-            (mbmi->second_ref_frame <= 0 ||
-             left_second_mv.as_int == this_second_mv->as_int))
-          m = LEFT4X4;
-      }
-
-#if CONFIG_NEWBINTRAMODES
-      cost = x->inter_bmode_costs[
-          m == B_CONTEXT_PRED ? m - CONTEXT_PRED_REPLACEMENTS : m];
-#else
-      cost = x->inter_bmode_costs[m];
-#endif
-    }
-
-    mic->bmi[i].as_mv[0].as_int = this_mv->as_int;
-    if (mbmi->second_ref_frame > 0)
-      mic->bmi[i].as_mv[1].as_int = this_second_mv->as_int;
-
-    x->partition_info->bmi[i].mode = m;
-    x->partition_info->bmi[i].mv.as_int = this_mv->as_int;
-    if (mbmi->second_ref_frame > 0)
-      x->partition_info->bmi[i].second_mv.as_int = this_second_mv->as_int;
-  }
-
-  cost += thismvcost;
-  return cost;
-}
-
-static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,
-                                       MACROBLOCK *x,
-                                       int const *labels,
-                                       int which_label,
-                                       int *labelyrate,
-                                       int *distortion,
-                                       ENTROPY_CONTEXT *ta,
-                                       ENTROPY_CONTEXT *tl) {
-  int i;
-  MACROBLOCKD *xd = &x->e_mbd;
-
-  *labelyrate = 0;
-  *distortion = 0;
-  for (i = 0; i < 16; i++) {
-    if (labels[i] == which_label) {
-      const int src_stride = x->plane[0].src.stride;
-      uint8_t* const src =
-          raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, i,
-                                    x->plane[0].src.buf, src_stride);
-      int16_t* const src_diff =
-          raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, i,
-                                    x->plane[0].src_diff);
-      int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, 16, i);
-      uint8_t* const pre =
-          raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, i,
-                                    xd->plane[0].pre[0].buf,
-                                    xd->plane[0].pre[0].stride);
-      uint8_t* const dst =
-          raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, i,
-                                    xd->plane[0].dst.buf,
-                                    xd->plane[0].dst.stride);
-      int thisdistortion;
-
-      vp9_build_inter_predictor(pre,
-                                xd->plane[0].pre[0].stride,
-                                dst,
-                                xd->plane[0].dst.stride,
-                                &xd->mode_info_context->bmi[i].as_mv[0],
-                                &xd->scale_factor[0],
-                                4, 4, 0 /* no avg */, &xd->subpix);
-
-      // TODO(debargha): Make this work properly with the
-      // implicit-compoundinter-weight experiment when implicit
-      // weighting for splitmv modes is turned on.
-      if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
-        uint8_t* const second_pre =
-          raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, i,
-                                    xd->plane[0].pre[1].buf,
-                                    xd->plane[0].pre[1].stride);
-        vp9_build_inter_predictor(
-            second_pre, xd->plane[0].pre[1].stride,
-            dst, xd->plane[0].dst.stride,
-            &xd->mode_info_context->bmi[i].as_mv[1],
-            &xd->scale_factor[1], 4, 4, 1,
-            &xd->subpix);
-      }
-
-      vp9_subtract_block(4, 4, src_diff, 16,
-                         src, src_stride,
-                         dst, xd->plane[0].dst.stride);
-      x->fwd_txm4x4(src_diff, coeff, 32);
-      x->quantize_b_4x4(x, i, DCT_DCT, 16);
-      thisdistortion = vp9_block_error(coeff,
-          BLOCK_OFFSET(xd->plane[0].dqcoeff, i, 16), 16);
-      *distortion += thisdistortion;
-      *labelyrate += cost_coeffs(cm, x, 0, i, PLANE_TYPE_Y_WITH_DC,
-                                 ta + (i & 3),
-                                 tl + (i >> 2), TX_4X4, 16);
-    }
-  }
-  *distortion >>= 2;
-  return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
-}
-
-static int64_t encode_inter_mb_segment_8x8(VP9_COMMON *const cm,
-                                           MACROBLOCK *x,
-                                           int const *labels,
-                                           int which_label,
-                                           int *labelyrate,
-                                           int *distortion,
-                                           int64_t *otherrd,
-                                           ENTROPY_CONTEXT *ta,
-                                           ENTROPY_CONTEXT *tl) {
-  int i, j;
-  MACROBLOCKD *xd = &x->e_mbd;
-  const int iblock[4] = { 0, 1, 4, 5 };
-  int othercost = 0, otherdist = 0;
-  ENTROPY_CONTEXT tac[4], tlc[4];
-
-  if (otherrd) {
-    memcpy(&tac, ta, sizeof(tac));
-    memcpy(&tlc, tl, sizeof(tlc));
-  }
-
-  *distortion = 0;
-  *labelyrate = 0;
-  for (i = 0; i < 4; i++) {
-    int ib = vp9_i8x8_block[i];
-
-    if (labels[ib] == which_label) {
-      const int use_second_ref =
-          xd->mode_info_context->mbmi.second_ref_frame > 0;
-      int which_mv;
-      const int idx = (ib & 8) + ((ib & 2) << 1);
-      const int src_stride = x->plane[0].src.stride;
-      uint8_t* const src =
-          raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, ib,
-                                    x->plane[0].src.buf, src_stride);
-      int16_t* const src_diff =
-          raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, ib,
-                                    x->plane[0].src_diff);
-      int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, idx, 16);
-      int thisdistortion;
-      uint8_t* const dst =
-          raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, ib,
-                                    xd->plane[0].dst.buf,
-                                    xd->plane[0].dst.stride);
-
-      assert(idx < 16);
-      for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
-        uint8_t* const pre =
-            raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, ib,
-                                      xd->plane[0].pre[which_mv].buf,
-                                      xd->plane[0].pre[which_mv].stride);
-
-        // TODO(debargha): Make this work properly with the
-        // implicit-compoundinter-weight experiment when implicit
-        // weighting for splitmv modes is turned on.
-        vp9_build_inter_predictor(
-            pre, xd->plane[0].pre[which_mv].stride,
-            dst, xd->plane[0].dst.stride,
-            &xd->mode_info_context->bmi[ib].as_mv[which_mv],
-            &xd->scale_factor[which_mv], 8, 8,
-            which_mv, &xd->subpix);
-      }
-
-      vp9_subtract_block(8, 8, src_diff, 16,
-                         src, src_stride,
-                         dst, xd->plane[0].dst.stride);
-
-      if (xd->mode_info_context->mbmi.txfm_size == TX_4X4) {
-        if (otherrd) {
-          x->fwd_txm8x8(src_diff, coeff, 32);
-          x->quantize_b_8x8(x, idx, DCT_DCT, 16);
-          thisdistortion = vp9_block_error_c(coeff,
-              BLOCK_OFFSET(xd->plane[0].dqcoeff, idx, 16), 64);
-          otherdist += thisdistortion;
-          xd->mode_info_context->mbmi.txfm_size = TX_8X8;
-          othercost += cost_coeffs(cm, x, 0, idx, PLANE_TYPE_Y_WITH_DC,
-                                   tac + (i & 1) * 2,
-                                   tlc + (i & 2),
-                                   TX_8X8, 16);
-          xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-        }
-        for (j = 0; j < 4; j += 2) {
-          int16_t* const src_diff =
-              raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16,
-                                        0, ib + iblock[j],
-                                        x->plane[0].src_diff);
-          int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff,
-                                              ib + iblock[j], 16);
-          x->fwd_txm8x4(src_diff, coeff, 32);
-          x->quantize_b_4x4_pair(x, ib + iblock[j], ib + iblock[j] + 1, 16);
-          thisdistortion = vp9_block_error_c(coeff,
-              BLOCK_OFFSET(xd->plane[0].dqcoeff, ib + iblock[j], 16), 32);
-          *distortion += thisdistortion;
-          *labelyrate +=
-              cost_coeffs(cm, x, 0, ib + iblock[j], PLANE_TYPE_Y_WITH_DC,
-                          ta + (i & 1) * 2,
-                          tl + (i & 2) + ((j & 2) >> 1),
-                          TX_4X4, 16);
-          *labelyrate +=
-              cost_coeffs(cm, x, 0, ib + iblock[j] + 1,
-                          PLANE_TYPE_Y_WITH_DC,
-                          ta + (i & 1) * 2 + 1,
-                          tl + (i & 2) + ((j & 2) >> 1),
-                          TX_4X4, 16);
-        }
-      } else /* 8x8 */ {
-        if (otherrd) {
-          for (j = 0; j < 4; j += 2) {
-            int16_t* const src_diff =
-                raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16,
-                                          0, ib + iblock[j],
-                                          x->plane[0].src_diff);
-            int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff,
-                                                ib + iblock[j], 16);
-            x->fwd_txm8x4(src_diff, coeff, 32);
-            x->quantize_b_4x4_pair(x, ib + iblock[j], ib + iblock[j] + 1, 16);
-            thisdistortion = vp9_block_error_c(coeff,
-                BLOCK_OFFSET(xd->plane[0].dqcoeff, ib + iblock[j], 16), 32);
-            otherdist += thisdistortion;
-            xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-            othercost +=
-                cost_coeffs(cm, x, 0, ib + iblock[j], PLANE_TYPE_Y_WITH_DC,
-                            tac + (i & 1) * 2,
-                            tlc + (i & 2) + ((j & 2) >> 1),
-                            TX_4X4, 16);
-            othercost +=
-                cost_coeffs(cm, x, 0, ib + iblock[j] + 1,
-                            PLANE_TYPE_Y_WITH_DC,
-                            tac + (i & 1) * 2 + 1,
-                            tlc + (i & 2) + ((j & 2) >> 1),
-                            TX_4X4, 16);
-            xd->mode_info_context->mbmi.txfm_size = TX_8X8;
-          }
-        }
-        x->fwd_txm8x8(src_diff, coeff, 32);
-        x->quantize_b_8x8(x, idx, DCT_DCT, 16);
-        thisdistortion = vp9_block_error_c(coeff,
-            BLOCK_OFFSET(xd->plane[0].dqcoeff, idx, 16), 64);
-        *distortion += thisdistortion;
-        *labelyrate += cost_coeffs(cm, x, 0, idx, PLANE_TYPE_Y_WITH_DC,
-                                   ta + (i & 1) * 2,
-                                   tl + (i & 2),
-                                   TX_8X8, 16);
-      }
-    }
-  }
-  *distortion >>= 2;
-  if (otherrd) {
-    otherdist >>= 2;
-    *otherrd = RDCOST(x->rdmult, x->rddiv, othercost, otherdist);
-  }
-  return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
-}
-
-static const unsigned int segmentation_to_sseshift[4] = {3, 3, 2, 0};
-
-
-typedef struct {
-  int_mv *ref_mv, *second_ref_mv;
-  int_mv mvp;
-
-  int64_t segment_rd;
-  SPLITMV_PARTITIONING_TYPE segment_num;
-  TX_SIZE txfm_size;
-  int r;
-  int d;
-  int segment_yrate;
-  B_PREDICTION_MODE modes[16];
-  int_mv mvs[16], second_mvs[16];
-  int eobs[16];
-
-  int mvthresh;
-  int *mdcounts;
-
-  int_mv sv_mvp[4];     // save 4 mvp from 8x8
-  int sv_istep[2];  // save 2 initial step_param for 16x8/8x16
-
-} BEST_SEG_INFO;
-
-static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
-                                    BEST_SEG_INFO *bsi,
-                                    SPLITMV_PARTITIONING_TYPE segmentation,
-                                    TX_SIZE tx_size, int64_t *otherrds,
-                                    int64_t *rds, int *completed,
-                                    /* 16 = n_blocks */
-                                    int_mv seg_mvs[16 /* n_blocks */]
-                                                  [MAX_REF_FRAMES - 1]) {
-  int i, j;
-  int const *labels;
-  int br = 0, bd = 0;
-  B_PREDICTION_MODE this_mode;
-  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
-
-  int label_count;
-  int64_t this_segment_rd = 0, other_segment_rd;
-  int label_mv_thresh;
-  int rate = 0;
-  int sbr = 0, sbd = 0;
-  int segmentyrate = 0;
-  int best_eobs[16] = { 0 };
-
-  vp9_variance_fn_ptr_t *v_fn_ptr;
-
-  ENTROPY_CONTEXT t_above[4], t_left[4];
-  ENTROPY_CONTEXT t_above_b[4], t_left_b[4];
-
-  vpx_memcpy(t_above, x->e_mbd.plane[0].above_context, sizeof(t_above));
-  vpx_memcpy(t_left, x->e_mbd.plane[0].left_context, sizeof(t_left));
-
-  v_fn_ptr = &cpi->fn_ptr[segmentation];
-  labels = vp9_mbsplits[segmentation];
-  label_count = vp9_mbsplit_count[segmentation];
-
-  // 64 makes this threshold effectively very large, so that we very
-  // rarely check mvs on segments. Setting this to 1 would make the mv
-  // threshold roughly equal to what it is for macroblocks.
-  label_mv_thresh = 1 * bsi->mvthresh / label_count;
-
-  // Segmentation method overheads
-  rate = cost_token(vp9_mbsplit_tree, vp9_mbsplit_probs,
-                    vp9_mbsplit_encodings + segmentation);
-  rate += vp9_cost_mv_ref(cpi, SPLITMV,
-                          mbmi->mb_mode_context[mbmi->ref_frame]);
-  this_segment_rd += RDCOST(x->rdmult, x->rddiv, rate, 0);
-  br += rate;
-  other_segment_rd = this_segment_rd;
-
-  mbmi->txfm_size = tx_size;
-  for (i = 0; i < label_count && this_segment_rd < bsi->segment_rd; i++) {
-    int_mv mode_mv[B_MODE_COUNT], second_mode_mv[B_MODE_COUNT];
-    int64_t best_label_rd = INT64_MAX, best_other_rd = INT64_MAX;
-    B_PREDICTION_MODE mode_selected = ZERO4X4;
-    int bestlabelyrate = 0;
-
-    // search for the best motion vector on this segment
-    for (this_mode = LEFT4X4; this_mode <= NEW4X4; this_mode ++) {
-      int64_t this_rd, other_rd;
-      int distortion;
-      int labelyrate;
-      ENTROPY_CONTEXT t_above_s[4], t_left_s[4];
-
-      vpx_memcpy(t_above_s, t_above, sizeof(t_above_s));
-      vpx_memcpy(t_left_s, t_left, sizeof(t_left_s));
-
-      // motion search for newmv (single predictor case only)
-      if (mbmi->second_ref_frame <= 0 && this_mode == NEW4X4) {
-        int sseshift, n;
-        int step_param = 0;
-        int further_steps;
-        int thissme, bestsme = INT_MAX;
-        const struct buf_2d orig_src = x->plane[0].src;
-        const struct buf_2d orig_pre = x->e_mbd.plane[0].pre[0];
-
-        /* Is the best so far sufficiently good that we can't justify doing
-         * a new motion search? */
-        if (best_label_rd < label_mv_thresh)
-          break;
-
-        if (cpi->compressor_speed) {
-          if (segmentation == PARTITIONING_8X16 ||
-              segmentation == PARTITIONING_16X8) {
-            bsi->mvp.as_int = bsi->sv_mvp[i].as_int;
-            if (i == 1 && segmentation == PARTITIONING_16X8)
-              bsi->mvp.as_int = bsi->sv_mvp[2].as_int;
-
-            step_param = bsi->sv_istep[i];
-          }
-
-          // use previous block's result as next block's MV predictor.
-          if (segmentation == PARTITIONING_4X4 && i > 0) {
-            bsi->mvp.as_int =
-              x->e_mbd.mode_info_context->bmi[i - 1].as_mv[0].as_int;
-            if (i == 4 || i == 8 || i == 12)
-              bsi->mvp.as_int =
-                x->e_mbd.mode_info_context->bmi[i - 4].as_mv[0].as_int;
-            step_param = 2;
-          }
-        }
-
-        further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
-
-        {
-          int sadpb = x->sadperbit4;
-          int_mv mvp_full;
-
-          mvp_full.as_mv.row = bsi->mvp.as_mv.row >> 3;
-          mvp_full.as_mv.col = bsi->mvp.as_mv.col >> 3;
-
-          // find first label
-          n = vp9_mbsplit_offset[segmentation][i];
-
-          // adjust src pointer for this segment
-          x->plane[0].src.buf =
-              raster_block_offset_uint8(&x->e_mbd, BLOCK_SIZE_MB16X16, 0, n,
-                                        x->plane[0].src.buf,
-                                        x->plane[0].src.stride);
-          assert(((intptr_t)x->e_mbd.plane[0].pre[0].buf & 0xf) == 0);
-          x->e_mbd.plane[0].pre[0].buf =
-              raster_block_offset_uint8(&x->e_mbd, BLOCK_SIZE_MB16X16, 0, n,
-                                        x->e_mbd.plane[0].pre[0].buf,
-                                        x->e_mbd.plane[0].pre[0].stride);
-
-          bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
-                                           sadpb, further_steps, 0, v_fn_ptr,
-                                           bsi->ref_mv, &mode_mv[NEW4X4]);
-
-          sseshift = segmentation_to_sseshift[segmentation];
-
-          // Should we do a full search (best quality only)
-          if ((cpi->compressor_speed == 0) && (bestsme >> sseshift) > 4000) {
-            /* Check if mvp_full is within the range. */
-            clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max,
-                     x->mv_row_min, x->mv_row_max);
-
-            thissme = cpi->full_search_sad(x, &mvp_full,
-                                           sadpb, 16, v_fn_ptr,
-                                           x->nmvjointcost, x->mvcost,
-                                           bsi->ref_mv,
-                                           n);
-
-            if (thissme < bestsme) {
-              bestsme = thissme;
-              mode_mv[NEW4X4].as_int =
-                x->e_mbd.mode_info_context->bmi[n].as_mv[0].as_int;
-            } else {
-              /* The full search result is actually worse so re-instate the
-               * previous best vector */
-              x->e_mbd.mode_info_context->bmi[n].as_mv[0].as_int =
-                mode_mv[NEW4X4].as_int;
-            }
-          }
-        }
-
-        if (bestsme < INT_MAX) {
-          int distortion;
-          unsigned int sse;
-          cpi->find_fractional_mv_step(x, &mode_mv[NEW4X4],
-                                       bsi->ref_mv, x->errorperbit, v_fn_ptr,
-                                       x->nmvjointcost, x->mvcost,
-                                       &distortion, &sse);
-
-          // save motion search result for use in compound prediction
-          seg_mvs[i][mbmi->ref_frame - 1].as_int = mode_mv[NEW4X4].as_int;
-        }
-
-        // restore src pointers
-        x->plane[0].src = orig_src;
-        x->e_mbd.plane[0].pre[0] = orig_pre;
-      } else if (mbmi->second_ref_frame > 0 && this_mode == NEW4X4) {
-        /* NEW4X4 */
-        /* motion search not completed? Then skip newmv for this block with
-         * comppred */
-        if (seg_mvs[i][mbmi->second_ref_frame - 1].as_int == INVALID_MV ||
-            seg_mvs[i][mbmi->ref_frame        - 1].as_int == INVALID_MV) {
-          continue;
-        }
-      }
-
-      rate = labels2mode(x, labels, i, this_mode, &mode_mv[this_mode],
-                         &second_mode_mv[this_mode], seg_mvs[i],
-                         bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost,
-                         x->mvcost, cpi);
-
-      // Trap vectors that reach beyond the UMV borders
-      if (((mode_mv[this_mode].as_mv.row >> 3) < x->mv_row_min) ||
-          ((mode_mv[this_mode].as_mv.row >> 3) > x->mv_row_max) ||
-          ((mode_mv[this_mode].as_mv.col >> 3) < x->mv_col_min) ||
-          ((mode_mv[this_mode].as_mv.col >> 3) > x->mv_col_max)) {
-        continue;
-      }
-      if (mbmi->second_ref_frame > 0 &&
-          mv_check_bounds(x, &second_mode_mv[this_mode]))
-        continue;
-
-      if (segmentation == PARTITIONING_4X4) {
-        this_rd = encode_inter_mb_segment(&cpi->common,
-                                          x, labels, i, &labelyrate,
-                                          &distortion, t_above_s, t_left_s);
-        other_rd = this_rd;
-      } else {
-        this_rd = encode_inter_mb_segment_8x8(&cpi->common,
-                                              x, labels, i, &labelyrate,
-                                              &distortion, &other_rd,
-                                              t_above_s, t_left_s);
-      }
-      this_rd += RDCOST(x->rdmult, x->rddiv, rate, 0);
-      rate += labelyrate;
-
-      if (this_rd < best_label_rd) {
-        sbr = rate;
-        sbd = distortion;
-        bestlabelyrate = labelyrate;
-        mode_selected = this_mode;
-        best_label_rd = this_rd;
-        if (x->e_mbd.mode_info_context->mbmi.txfm_size == TX_4X4) {
-          for (j = 0; j < 16; j++)
-            if (labels[j] == i)
-              best_eobs[j] = x->e_mbd.plane[0].eobs[j];
-        } else {
-          for (j = 0; j < 4; j++) {
-            int ib = vp9_i8x8_block[j], idx = j * 4;
-
-            if (labels[ib] == i)
-              best_eobs[idx] = x->e_mbd.plane[0].eobs[idx];
-          }
-        }
-        if (other_rd < best_other_rd)
-          best_other_rd = other_rd;
-
-        vpx_memcpy(t_above_b, t_above_s, sizeof(t_above_s));
-        vpx_memcpy(t_left_b, t_left_s, sizeof(t_left_s));
-
-      }
-    } /*for each 4x4 mode*/
-
-    vpx_memcpy(t_above, t_above_b, sizeof(t_above));
-    vpx_memcpy(t_left, t_left_b, sizeof(t_left));
-
-    labels2mode(x, labels, i, mode_selected, &mode_mv[mode_selected],
-                &second_mode_mv[mode_selected], seg_mvs[i],
-                bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost,
-                x->mvcost, cpi);
-
-    br += sbr;
-    bd += sbd;
-    segmentyrate += bestlabelyrate;
-    this_segment_rd += best_label_rd;
-    other_segment_rd += best_other_rd;
-    if (rds)
-      rds[i] = this_segment_rd;
-    if (otherrds)
-      otherrds[i] = other_segment_rd;
-  } /* for each label */
-
-  if (this_segment_rd < bsi->segment_rd) {
-    bsi->r = br;
-    bsi->d = bd;
-    bsi->segment_yrate = segmentyrate;
-    bsi->segment_rd = this_segment_rd;
-    bsi->segment_num = segmentation;
-    bsi->txfm_size = mbmi->txfm_size;
-
-    // store everything needed to come back to this!!
-    for (i = 0; i < 16; i++) {
-      bsi->mvs[i].as_mv = x->partition_info->bmi[i].mv.as_mv;
-      if (mbmi->second_ref_frame > 0)
-        bsi->second_mvs[i].as_mv = x->partition_info->bmi[i].second_mv.as_mv;
-      bsi->modes[i] = x->partition_info->bmi[i].mode;
-      bsi->eobs[i] = best_eobs[i];
-    }
-  }
-
-  if (completed) {
-    *completed = i;
-  }
-}
-
-static void rd_check_segment(VP9_COMP *cpi, MACROBLOCK *x,
-                             BEST_SEG_INFO *bsi,
-                             unsigned int segmentation,
-                             /* 16 = n_blocks */
-                             int_mv seg_mvs[16][MAX_REF_FRAMES - 1],
-                             int64_t txfm_cache[NB_TXFM_MODES]) {
-  int i, n, c = vp9_mbsplit_count[segmentation];
-
-  if (segmentation == PARTITIONING_4X4) {
-    int64_t rd[16];
-
-    rd_check_segment_txsize(cpi, x, bsi, segmentation, TX_4X4, NULL,
-                            rd, &n, seg_mvs);
-    if (n == c) {
-      for (i = 0; i < NB_TXFM_MODES; i++) {
-        if (rd[c - 1] < txfm_cache[i])
-          txfm_cache[i] = rd[c - 1];
-      }
-    }
-  } else {
-    int64_t diff, base_rd;
-    int cost4x4 = vp9_cost_bit(cpi->common.prob_tx[0], 0);
-    int cost8x8 = vp9_cost_bit(cpi->common.prob_tx[0], 1);
-
-    if (cpi->common.txfm_mode == TX_MODE_SELECT) {
-      int64_t rd4x4[4], rd8x8[4];
-      int n4x4, n8x8, nmin;
-      BEST_SEG_INFO bsi4x4, bsi8x8;
-
-      /* factor in cost of cost4x4/8x8 in decision */
-      vpx_memcpy(&bsi4x4, bsi, sizeof(*bsi));
-      vpx_memcpy(&bsi8x8, bsi, sizeof(*bsi));
-      rd_check_segment_txsize(cpi, x, &bsi4x4, segmentation,
-                              TX_4X4, NULL, rd4x4, &n4x4, seg_mvs);
-      rd_check_segment_txsize(cpi, x, &bsi8x8, segmentation,
-                              TX_8X8, NULL, rd8x8, &n8x8, seg_mvs);
-      if (bsi4x4.segment_num == segmentation) {
-        bsi4x4.segment_rd += RDCOST(x->rdmult, x->rddiv, cost4x4, 0);
-        if (bsi4x4.segment_rd < bsi->segment_rd)
-          vpx_memcpy(bsi, &bsi4x4, sizeof(*bsi));
-      }
-      if (bsi8x8.segment_num == segmentation) {
-        bsi8x8.segment_rd += RDCOST(x->rdmult, x->rddiv, cost8x8, 0);
-        if (bsi8x8.segment_rd < bsi->segment_rd)
-          vpx_memcpy(bsi, &bsi8x8, sizeof(*bsi));
-      }
-      n = n4x4 > n8x8 ? n4x4 : n8x8;
-      if (n == c) {
-        nmin = n4x4 < n8x8 ? n4x4 : n8x8;
-        diff = rd8x8[nmin - 1] - rd4x4[nmin - 1];
-        if (n == n4x4) {
-          base_rd = rd4x4[c - 1];
-        } else {
-          base_rd = rd8x8[c - 1] - diff;
-        }
-      }
-    } else {
-      int64_t rd[4], otherrd[4];
-
-      if (cpi->common.txfm_mode == ONLY_4X4) {
-        rd_check_segment_txsize(cpi, x, bsi, segmentation, TX_4X4, otherrd,
-                                rd, &n, seg_mvs);
-        if (n == c) {
-          base_rd = rd[c - 1];
-          diff = otherrd[c - 1] - rd[c - 1];
-        }
-      } else /* use 8x8 transform */ {
-        rd_check_segment_txsize(cpi, x, bsi, segmentation, TX_8X8, otherrd,
-                                rd, &n, seg_mvs);
-        if (n == c) {
-          diff = rd[c - 1] - otherrd[c - 1];
-          base_rd = otherrd[c - 1];
-        }
-      }
-    }
-
-    if (n == c) {
-      if (base_rd < txfm_cache[ONLY_4X4]) {
-        txfm_cache[ONLY_4X4] = base_rd;
-      }
-      if (base_rd + diff < txfm_cache[ALLOW_8X8]) {
-        txfm_cache[ALLOW_8X8] = txfm_cache[ALLOW_16X16] =
-            txfm_cache[ALLOW_32X32] = base_rd + diff;
-      }
-      if (diff < 0) {
-        base_rd += diff + RDCOST(x->rdmult, x->rddiv, cost8x8, 0);
-      } else {
-        base_rd += RDCOST(x->rdmult, x->rddiv, cost4x4, 0);
-      }
-      if (base_rd < txfm_cache[TX_MODE_SELECT]) {
-        txfm_cache[TX_MODE_SELECT] = base_rd;
-      }
-    }
-  }
-}
-
-static INLINE void cal_step_param(int sr, int *sp) {
-  int step = 0;
-
-  if (sr > MAX_FIRST_STEP) sr = MAX_FIRST_STEP;
-  else if (sr < 1) sr = 1;
-
-  while (sr >>= 1)
-    step++;
-
-  *sp = MAX_MVSEARCH_STEPS - 1 - step;
-}
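-
-/* Worked example of the arithmetic: cal_step_param() turns a motion-vector
- * spread sr (whole pixels, derived further down from the 8x8 results via the
- * MAX(abs(row diff), abs(col diff)) >> 3 expressions) into the starting step
- * parameter for the follow-up 16x8/8x16 diamond searches.
- *
- *   sr = 8:  8 >> 1 = 4, 4 >> 1 = 2, 2 >> 1 = 1, 1 >> 1 = 0
- *            -> three successful shifts, step = 3,
- *            -> *sp = MAX_MVSEARCH_STEPS - 1 - 3.
- *
- * A wider spread between the 8x8 MVs therefore leaves a smaller *sp, which
- * the diamond search presumably interprets as a wider initial search. */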
-
-static int rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
-                                       int_mv *best_ref_mv,
-                                       int_mv *second_best_ref_mv,
-                                       int64_t best_rd,
-                                       int *mdcounts,
-                                       int *returntotrate,
-                                       int *returnyrate,
-                                       int *returndistortion,
-                                       int *skippable, int mvthresh,
-                                       int_mv seg_mvs[NB_PARTITIONINGS]
-                                                     [16 /* n_blocks */]
-                                                     [MAX_REF_FRAMES - 1],
-                                       int64_t txfm_cache[NB_TXFM_MODES]) {
-  int i;
-  BEST_SEG_INFO bsi;
-  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
-
-  vpx_memset(&bsi, 0, sizeof(bsi));
-  for (i = 0; i < NB_TXFM_MODES; i++)
-    txfm_cache[i] = INT64_MAX;
-
-  bsi.segment_rd = best_rd;
-  bsi.ref_mv = best_ref_mv;
-  bsi.second_ref_mv = second_best_ref_mv;
-  bsi.mvp.as_int = best_ref_mv->as_int;
-  bsi.mvthresh = mvthresh;
-  bsi.mdcounts = mdcounts;
-  bsi.txfm_size = TX_4X4;
-
-  for (i = 0; i < 16; i++)
-    bsi.modes[i] = ZERO4X4;
-
-  if (cpi->compressor_speed == 0) {
-    /* for now, we will keep the original segmentation order
-       when in best quality mode */
-    rd_check_segment(cpi, x, &bsi, PARTITIONING_16X8,
-                     seg_mvs[PARTITIONING_16X8], txfm_cache);
-    rd_check_segment(cpi, x, &bsi, PARTITIONING_8X16,
-                     seg_mvs[PARTITIONING_8X16], txfm_cache);
-    rd_check_segment(cpi, x, &bsi, PARTITIONING_8X8,
-                     seg_mvs[PARTITIONING_8X8], txfm_cache);
-    rd_check_segment(cpi, x, &bsi, PARTITIONING_4X4,
-                     seg_mvs[PARTITIONING_4X4], txfm_cache);
-  } else {
-    int sr;
-
-    rd_check_segment(cpi, x, &bsi, PARTITIONING_8X8,
-                     seg_mvs[PARTITIONING_8X8], txfm_cache);
-
-    if (bsi.segment_rd < best_rd) {
-      int tmp_col_min = x->mv_col_min;
-      int tmp_col_max = x->mv_col_max;
-      int tmp_row_min = x->mv_row_min;
-      int tmp_row_max = x->mv_row_max;
-
-      vp9_clamp_mv_min_max(x, best_ref_mv);
-
-      /* Get 8x8 result */
-      bsi.sv_mvp[0].as_int = bsi.mvs[0].as_int;
-      bsi.sv_mvp[1].as_int = bsi.mvs[2].as_int;
-      bsi.sv_mvp[2].as_int = bsi.mvs[8].as_int;
-      bsi.sv_mvp[3].as_int = bsi.mvs[10].as_int;
-
-      /* Use the 8x8 result as the 16x8/8x16 predictor MV. Adjust the search
-       * range according to how close the two MVs are. */
-      /* block 8X16 */
-      sr = MAX(abs(bsi.sv_mvp[0].as_mv.row - bsi.sv_mvp[2].as_mv.row) >> 3,
-               abs(bsi.sv_mvp[0].as_mv.col - bsi.sv_mvp[2].as_mv.col) >> 3);
-      cal_step_param(sr, &bsi.sv_istep[0]);
-
-      sr = MAX(abs(bsi.sv_mvp[1].as_mv.row - bsi.sv_mvp[3].as_mv.row) >> 3,
-               abs(bsi.sv_mvp[1].as_mv.col - bsi.sv_mvp[3].as_mv.col) >> 3);
-      cal_step_param(sr, &bsi.sv_istep[1]);
-
-      rd_check_segment(cpi, x, &bsi, PARTITIONING_8X16,
-                       seg_mvs[PARTITIONING_8X16], txfm_cache);
-
-      /* block 16X8 */
-      sr = MAX(abs(bsi.sv_mvp[0].as_mv.row - bsi.sv_mvp[1].as_mv.row) >> 3,
-               abs(bsi.sv_mvp[0].as_mv.col - bsi.sv_mvp[1].as_mv.col) >> 3);
-      cal_step_param(sr, &bsi.sv_istep[0]);
-
-      sr = MAX(abs(bsi.sv_mvp[2].as_mv.row - bsi.sv_mvp[3].as_mv.row) >> 3,
-               abs(bsi.sv_mvp[2].as_mv.col - bsi.sv_mvp[3].as_mv.col) >> 3);
-      cal_step_param(sr, &bsi.sv_istep[1]);
-
-      rd_check_segment(cpi, x, &bsi, PARTITIONING_16X8,
-                       seg_mvs[PARTITIONING_16X8], txfm_cache);
-
-      /* If 8x8 is better than 16x8/8x16, then do 4x4 search */
-      /* Do not skip 4x4 if speed=0 (good quality) */
-      if (cpi->sf.no_skip_block4x4_search ||
-          bsi.segment_num == PARTITIONING_8X8) {
-        /* || (sv_segment_rd8x8-bsi.segment_rd) < sv_segment_rd8x8>>5) */
-        bsi.mvp.as_int = bsi.sv_mvp[0].as_int;
-        rd_check_segment(cpi, x, &bsi, PARTITIONING_4X4,
-                         seg_mvs[PARTITIONING_4X4], txfm_cache);
-      }
-
-      /* restore UMV window */
-      x->mv_col_min = tmp_col_min;
-      x->mv_col_max = tmp_col_max;
-      x->mv_row_min = tmp_row_min;
-      x->mv_row_max = tmp_row_max;
-    }
-  }
-
-  /* set it to the best */
-  for (i = 0; i < 16; i++) {
-    x->e_mbd.mode_info_context->bmi[i].as_mv[0].as_int = bsi.mvs[i].as_int;
-    if (mbmi->second_ref_frame > 0)
-      x->e_mbd.mode_info_context->bmi[i].as_mv[1].as_int =
-        bsi.second_mvs[i].as_int;
-    x->e_mbd.plane[0].eobs[i] = bsi.eobs[i];
-  }
-
-  /* save partitions */
-  mbmi->txfm_size = bsi.txfm_size;
-  mbmi->partitioning = bsi.segment_num;
-  x->partition_info->count = vp9_mbsplit_count[bsi.segment_num];
-
-  for (i = 0; i < x->partition_info->count; i++) {
-    int j;
-
-    j = vp9_mbsplit_offset[bsi.segment_num][i];
-
-    x->partition_info->bmi[i].mode = bsi.modes[j];
-    x->partition_info->bmi[i].mv.as_mv = bsi.mvs[j].as_mv;
-    if (mbmi->second_ref_frame > 0)
-      x->partition_info->bmi[i].second_mv.as_mv = bsi.second_mvs[j].as_mv;
-  }
-  /*
-   * used to set mbmi->mv.as_int
-   */
-  x->partition_info->bmi[15].mv.as_int = bsi.mvs[15].as_int;
-  if (mbmi->second_ref_frame > 0)
-    x->partition_info->bmi[15].second_mv.as_int = bsi.second_mvs[15].as_int;
-
-  *returntotrate = bsi.r;
-  *returndistortion = bsi.d;
-  *returnyrate = bsi.segment_yrate;
-  *skippable = vp9_sby_is_skippable(&x->e_mbd, BLOCK_SIZE_MB16X16);
-
-  return (int)(bsi.segment_rd);
-}
-#endif  // !CONFIG_SB8X8
-
 static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
                     uint8_t *ref_y_buffer, int ref_y_stride,
                     int ref_frame, enum BlockSize block_size ) {
@@ -2779,24 +1425,10 @@
   x->mv_best_ref_index[ref_frame] = best_index;
 }
 
-#if !CONFIG_SB8X8
-static void set_i8x8_block_modes(MACROBLOCK *x, int modes[4]) {
-  int i;
-  MACROBLOCKD *xd = &x->e_mbd;
-  for (i = 0; i < 4; i++) {
-    int ib = vp9_i8x8_block[i];
-    xd->mode_info_context->bmi[ib + 0].as_mode.first = modes[i];
-    xd->mode_info_context->bmi[ib + 1].as_mode.first = modes[i];
-    xd->mode_info_context->bmi[ib + 4].as_mode.first = modes[i];
-    xd->mode_info_context->bmi[ib + 5].as_mode.first = modes[i];
-    // printf("%d,%d,%d,%d\n",
-    //       modes[0], modes[1], modes[2], modes[3]);
-  }
-}
-#endif
-
 extern void vp9_calc_ref_probs(int *count, vp9_prob *probs);
-static void estimate_curframe_refprobs(VP9_COMP *cpi, vp9_prob mod_refprobs[3], int pred_ref) {
+static void estimate_curframe_refprobs(VP9_COMP *cpi,
+                                       vp9_prob mod_refprobs[3],
+                                       int pred_ref) {
   int norm_cnt[MAX_REF_FRAMES];
   const int *const rfct = cpi->count_mb_ref_frame_usage;
   int intra_count = rfct[INTRA_FRAME];
@@ -2846,7 +1478,8 @@
   return (0x8000 + weight * cost1 + (0x10000 - weight) * cost0) >> 16;
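
The context line above is the tail of the cost-blending helper (weighted_cost(), judging from the call sites below): cost0 and cost1 are mixed with a Q16 weight, where 0x10000 represents 1.0 and the 0x8000 term provides rounding. Two worked cases:

/* weight = 0x8000  (0.5):  (0x8000 + 0x8000*cost1 + 0x8000*cost0) >> 16
 *                        = (1 + cost0 + cost1) >> 1, the rounded average.
 * weight = 0x10000 (1.0):  (0x8000 + 0x10000*cost1) >> 16 = cost1.        */
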
 }
 
-static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id, unsigned int *ref_costs) {
+static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id,
+                                     unsigned int *ref_costs) {
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &cpi->mb.e_mbd;
   vp9_prob *mod_refprobs;
@@ -2895,10 +1528,10 @@
       // Get the prediction for the current mb
       cost = weighted_cost(&pred_prob, &new_pred_prob, 0,
                            pred_flag, cpi->seg0_progress);
-      if (cost > 1024) cost = 768; // i.e. account for 4 bits max.
+      if (cost > 1024) cost = 768;  // i.e. account for 4 bits max.
 
       // for incorrectly predicted cases
-      if (! pred_flag) {
+      if (!pred_flag) {
         vp9_prob curframe_mod_refprobs[3];
 
         if (cpi->seg0_progress) {
@@ -2952,14 +1585,35 @@
   memcpy(ctx->txfm_rd_diff, txfm_size_diff, sizeof(ctx->txfm_rd_diff));
 }
 
+static void setup_pred_block(const MACROBLOCKD *xd,
+                             struct buf_2d dst[MAX_MB_PLANE],
+                             const YV12_BUFFER_CONFIG *src,
+                             int mi_row, int mi_col,
+                             const struct scale_factors *scale,
+                             const struct scale_factors *scale_uv) {
+  int i;
+
+  dst[0].buf = src->y_buffer;
+  dst[0].stride = src->y_stride;
+  dst[1].buf = src->u_buffer;
+  dst[2].buf = src->v_buffer;
+  dst[1].stride = dst[2].stride = src->uv_stride;
+
+  // TODO(jkoleszar): Make scale factors per-plane data
+  for (i = 0; i < MAX_MB_PLANE; i++) {
+    setup_pred_plane(dst + i, dst[i].buf, dst[i].stride, mi_row, mi_col,
+                     i ? scale_uv : scale,
+                     xd->plane[i].subsampling_x, xd->plane[i].subsampling_y);
+  }
+}
+
 static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
                                int idx, MV_REFERENCE_FRAME frame_type,
                                enum BlockSize block_size,
                                int mi_row, int mi_col,
                                int_mv frame_nearest_mv[MAX_REF_FRAMES],
                                int_mv frame_near_mv[MAX_REF_FRAMES],
-                               int frame_mdcounts[4][4],
-                               YV12_BUFFER_CONFIG yv12_mb[4],
+                               struct buf_2d yv12_mb[4][MAX_MB_PLANE],
                                struct scale_factors scale[MAX_REF_FRAMES]) {
   VP9_COMMON *cm = &cpi->common;
   YV12_BUFFER_CONFIG *yv12 = &cm->yv12_fb[cpi->common.ref_frame_map[idx]];
@@ -2978,7 +1632,7 @@
 
   // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this
   // use the UV scaling factors.
-  setup_pred_block(&yv12_mb[frame_type], yv12, mi_row, mi_col,
+  setup_pred_block(xd, yv12_mb[frame_type], yv12, mi_row, mi_col,
                    &scale[frame_type], &scale[frame_type]);
 
   // Gets an initial list of candidate vectors from neighbours and orders them
@@ -3002,10 +1656,55 @@
   // The current implementation doesn't support scaling.
   if (scale[frame_type].x_num == scale[frame_type].x_den &&
       scale[frame_type].y_num == scale[frame_type].y_den)
-    mv_pred(cpi, x, yv12_mb[frame_type].y_buffer, yv12->y_stride,
+    mv_pred(cpi, x, yv12_mb[frame_type][0].buf, yv12->y_stride,
             frame_type, block_size);
 }
 
+
+static enum BlockSize get_block_size(int bw, int bh) {
+  if (bw == 4 && bh == 4)
+    return BLOCK_4X4;
+
+  if (bw == 4 && bh == 8)
+    return BLOCK_4X8;
+
+  if (bw == 8 && bh == 4)
+    return BLOCK_8X4;
+
+  if (bw == 8 && bh == 8)
+    return BLOCK_8X8;
+
+  if (bw == 8 && bh == 16)
+    return BLOCK_8X16;
+
+  if (bw == 16 && bh == 8)
+    return BLOCK_16X8;
+
+  if (bw == 16 && bh == 16)
+    return BLOCK_16X16;
+
+  if (bw == 32 && bh == 32)
+    return BLOCK_32X32;
+
+  if (bw == 32 && bh == 16)
+    return BLOCK_32X16;
+
+  if (bw == 16 && bh == 32)
+    return BLOCK_16X32;
+
+  if (bw == 64 && bh == 32)
+    return BLOCK_64X32;
+
+  if (bw == 32 && bh == 64)
+    return BLOCK_32X64;
+
+  if (bw == 64 && bh == 64)
+    return BLOCK_64X64;
+
+  assert(0);
+  return -1;
+}
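
get_block_size() maps raw pixel dimensions onto the enum that indexes cpi->fn_ptr[] and related tables; any pairing not listed trips the assert. Usage follows directly from the if-chain:

/* Examples (values read straight off the chain above): */
assert(get_block_size(16, 16) == BLOCK_16X16);
assert(get_block_size(8, 4)   == BLOCK_8X4);
/* get_block_size(64, 16) is not a supported pairing and hits assert(0). */
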
+
 static void model_rd_from_var_lapndz(int var, int n, int qstep,
                                      int *rate, int *dist) {
   // This function models the rate and distortion for a Laplacian
@@ -3049,54 +1748,58 @@
   vp9_clear_system_state();
 }
 
-static enum BlockSize y_to_uv_block_size(enum BlockSize bs) {
-  switch (bs) {
-    case BLOCK_64X64: return BLOCK_32X32;
-    case BLOCK_64X32: return BLOCK_32X16;
-    case BLOCK_32X64: return BLOCK_16X32;
-    case BLOCK_32X32: return BLOCK_16X16;
-    case BLOCK_32X16: return BLOCK_16X8;
-    case BLOCK_16X32: return BLOCK_8X16;
-    case BLOCK_16X16: return BLOCK_8X8;
-#if CONFIG_SB8X8
-    case BLOCK_16X8:  return BLOCK_8X4;
-    case BLOCK_8X16:  return BLOCK_4X8;
-    case BLOCK_8X8:   return BLOCK_4X4;
-#endif
-    default:
-      assert(0);
-      return -1;
-  }
+static enum BlockSize get_plane_block_size(BLOCK_SIZE_TYPE bsize,
+                                           struct macroblockd_plane *pd) {
+  const int bwl = b_width_log2(bsize) - pd->subsampling_x;
+  const int bhl = b_height_log2(bsize) - pd->subsampling_y;
+  return get_block_size(4 << bwl, 4 << bhl);
 }
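
get_plane_block_size() shrinks the partition by each plane's chroma subsampling before reusing get_block_size(); the 4 << bwl form works because b_width_log2()/b_height_log2() count in 4-pixel units. A worked example, assuming 4:2:0 chroma (subsampling_x = subsampling_y = 1):

/* 16x16 luma partition: b_width_log2 = b_height_log2 = 2 (4 << 2 = 16),
 * so for a chroma plane bwl = bhl = 2 - 1 = 1 and the helper returns
 * get_block_size(8, 8) == BLOCK_8X8, i.e. the chroma planes are costed
 * with the 8x8 variance functions. */
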
 
-static enum BlockSize y_bsizet_to_block_size(BLOCK_SIZE_TYPE bs) {
-  switch (bs) {
-    case BLOCK_SIZE_SB64X64: return BLOCK_64X64;
-    case BLOCK_SIZE_SB64X32: return BLOCK_64X32;
-    case BLOCK_SIZE_SB32X64: return BLOCK_32X64;
-    case BLOCK_SIZE_SB32X32: return BLOCK_32X32;
-    case BLOCK_SIZE_SB32X16: return BLOCK_32X16;
-    case BLOCK_SIZE_SB16X32: return BLOCK_16X32;
-    case BLOCK_SIZE_MB16X16: return BLOCK_16X16;
-#if CONFIG_SB8X8
-    case BLOCK_SIZE_SB16X8:  return BLOCK_16X8;
-    case BLOCK_SIZE_SB8X16:  return BLOCK_8X16;
-    case BLOCK_SIZE_SB8X8:   return BLOCK_8X8;
-#endif
-    default:
-      assert(0);
-      return -1;
+static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
+                            MACROBLOCK *x, MACROBLOCKD *xd,
+                            int *out_rate_sum, int *out_dist_sum) {
+  // Note that our transform coefficients are 8 times those of an orthogonal
+  // transform, so the quantizer step is also scaled by 8. To get the
+  // effective quantizer we divide by 8 before calling the modeling function.
+  unsigned int sse, var;
+  int i, rate_sum = 0, dist_sum = 0;
+
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    struct macroblock_plane *const p = &x->plane[i];
+    struct macroblockd_plane *const pd = &xd->plane[i];
+
+    // TODO(dkovalev): this duplicates the logic in get_plane_block_size()
+    const int bwl = b_width_log2(bsize) - pd->subsampling_x;
+    const int bhl = b_height_log2(bsize) - pd->subsampling_y;
+    const enum BlockSize bs = get_block_size(4 << bwl, 4 << bhl);
+    int rate, dist;
+    var = cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
+                             pd->dst.buf, pd->dst.stride, &sse);
+    model_rd_from_var_lapndz(var, 16 << (bwl + bhl),
+                             pd->dequant[1] >> 3, &rate, &dist);
+
+    rate_sum += rate;
+    dist_sum += dist;
   }
+
+  *out_rate_sum = rate_sum;
+  *out_dist_sum = dist_sum;
+}
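
model_rd_for_sb() measures each plane's prediction-error variance, feeds it to the Laplacian model with an effective quantizer of pd->dequant[1] >> 3 (undoing the 8x coefficient scaling noted above), and sums the per-plane estimates. A hedged sketch of how a caller might use the totals; best_rd_so_far is a hypothetical name and the pruning itself is not shown in this hunk:

/* Illustrative only: prune a prediction candidate cheaply before running the
 * full transform/quantize/tokenize path. */
int rate_est, dist_est;
model_rd_for_sb(cpi, bsize, x, xd, &rate_est, &dist_est);
if (RDCOST(x->rdmult, x->rddiv, rate_est, dist_est) > best_rd_so_far) {
  /* skip the exact RD evaluation for this candidate */
}
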
+
+static INLINE int get_switchable_rate(VP9_COMMON *cm, MACROBLOCK *x) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
+
+  const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP);
+  const int m = vp9_switchable_interp_map[mbmi->interp_filter];
+  return SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m];
 }
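
get_switchable_rate() prices the bits needed to signal the macroblock's interpolation filter when the frame-level filter is switchable; the cost table is indexed by the spatial prediction context and the currently selected filter, then scaled by SWITCHABLE_INTERP_RATE_FACTOR. Presumably the inter-mode RD loop simply folds it into the candidate's rate, along the lines of:

/* Hedged usage sketch; the actual call site is outside this hunk. */
*rate2 += get_switchable_rate(cm, x);
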
 
 static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                  BLOCK_SIZE_TYPE bsize,
-                                 int mdcounts[4], int64_t txfm_cache[],
+                                 int64_t txfm_cache[],
                                  int *rate2, int *distortion, int *skippable,
                                  int *compmode_cost,
-#if CONFIG_COMP_INTERINTRA_PRED
-                                 int *compmode_interintra_cost,
-#endif
                                  int *rate_y, int *distortion_y,
                                  int *rate_uv, int *distortion_uv,
                                  int *mode_excluded, int *disable_skip,
@@ -3104,18 +1807,18 @@
                                  INTERPOLATIONFILTERTYPE *best_filter,
                                  int_mv frame_mv[MB_MODE_COUNT]
                                                 [MAX_REF_FRAMES],
-                                 YV12_BUFFER_CONFIG *scaled_ref_frame,
-                                 int mi_row, int mi_col) {
+                                 YV12_BUFFER_CONFIG **scaled_ref_frame,
+                                 int mi_row, int mi_col,
+                                 int_mv single_newmv[MAX_REF_FRAMES]) {
   const int bw = 1 << mi_width_log2(bsize), bh = 1 << mi_height_log2(bsize);
-  const enum BlockSize block_size = y_bsizet_to_block_size(bsize);
-  const enum BlockSize uv_block_size = y_to_uv_block_size(block_size);
+
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
+  const enum BlockSize block_size = get_plane_block_size(bsize, &xd->plane[0]);
+  const enum BlockSize uv_block_size = get_plane_block_size(bsize,
+                                                            &xd->plane[1]);
   MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
   const int is_comp_pred = (mbmi->second_ref_frame > 0);
-#if CONFIG_COMP_INTERINTRA_PRED
-  const int is_comp_interintra_pred = (mbmi->second_ref_frame == INTRA_FRAME);
-#endif
   const int num_refs = is_comp_pred ? 2 : 1;
   const int this_mode = mbmi->mode;
   int i;
@@ -3124,9 +1827,7 @@
   int_mv cur_mv[2];
   int_mv ref_mv[2];
   int64_t this_rd = 0;
-  unsigned char tmp_ybuf[64 * 64];
-  unsigned char tmp_ubuf[32 * 32];
-  unsigned char tmp_vbuf[32 * 32];
+  unsigned char tmp_buf[MAX_MB_PLANE][64 * 64];
   int pred_exists = 0;
   int interpolating_intpel_seen = 0;
   int intpel_mv;
@@ -3138,6 +1839,152 @@
       ref_mv[1] = mbmi->ref_mvs[refs[1]][0];
 
       if (is_comp_pred) {
+#if CONFIG_COMP_INTER_JOINT_SEARCH
+        const int b_sz[BLOCK_SIZE_TYPES][2] = {
+            {4, 4},
+            {8, 8},
+            {8, 16},
+            {16, 8},
+            {16, 16},
+            {16, 32},
+            {32, 16},
+            {32, 32},
+            {32, 64},
+            {64, 32},
+            {64, 64}
+        };
+
+        int ite;
+        // Prediction buffer from second frame.
+        uint8_t *second_pred = vpx_memalign(16, b_sz[bsize][0] *
+                                            b_sz[bsize][1] * sizeof(uint8_t));
+
+        // Do joint motion search in compound mode to get more accurate mv.
+        struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
+        struct buf_2d backup_second_yv12[MAX_MB_PLANE] = {{0}};
+        struct buf_2d scaled_first_yv12;
+
+        if (scaled_ref_frame[0]) {
+          int i;
+
+          // Swap out the reference frame for a version that's been scaled to
+          // match the resolution of the current frame, allowing the existing
+          // motion search code to be used without additional modifications.
+          for (i = 0; i < MAX_MB_PLANE; i++)
+            backup_yv12[i] = xd->plane[i].pre[0];
+
+          setup_pre_planes(xd, scaled_ref_frame[0], NULL, mi_row, mi_col,
+                           NULL, NULL);
+        }
+
+        if (scaled_ref_frame[1]) {
+          int i;
+
+          for (i = 0; i < MAX_MB_PLANE; i++)
+            backup_second_yv12[i] = xd->plane[i].pre[1];
+
+          setup_pre_planes(xd, scaled_ref_frame[1], NULL, mi_row, mi_col,
+                           NULL, NULL);
+        }
+        xd->scale_factor[0].set_scaled_offsets(&xd->scale_factor[0],
+                                                mi_row, mi_col);
+        xd->scale_factor[1].set_scaled_offsets(&xd->scale_factor[1],
+                                                mi_row, mi_col);
+
+        scaled_first_yv12 = xd->plane[0].pre[0];
+
+        // Initialize mv using single prediction mode result.
+        frame_mv[NEWMV][refs[0]].as_int = single_newmv[refs[0]].as_int;
+        frame_mv[NEWMV][refs[1]].as_int = single_newmv[refs[1]].as_int;
+
+        // Iteration: the joint search is run once for each ref frame.
+        // Allowing multiple iterations with an early break when no better
+        // mv was found was also tried, but tests showed no noticeable
+        // improvement.
+        for (ite = 0; ite < 2; ite++) {
+          struct buf_2d ref_yv12[2] = {xd->plane[0].pre[0],
+                                       xd->plane[0].pre[1]};
+          int bestsme = INT_MAX;
+          int sadpb = x->sadperbit16;
+          int_mv tmp_mv;
+          int search_range = 3;
+
+          int tmp_col_min = x->mv_col_min;
+          int tmp_col_max = x->mv_col_max;
+          int tmp_row_min = x->mv_row_min;
+          int tmp_row_max = x->mv_row_max;
+          int id = ite % 2;
+
+          // Get pred block from second frame.
+          vp9_build_inter_predictor(ref_yv12[!id].buf,
+                                    ref_yv12[!id].stride,
+                                    second_pred, b_sz[bsize][0],
+                                    &frame_mv[NEWMV][refs[!id]],
+                                    &xd->scale_factor[!id],
+                                    b_sz[bsize][0], b_sz[bsize][1], 0,
+                                    &xd->subpix);
+
+          // Compound motion search on first ref frame.
+          if (id)
+            xd->plane[0].pre[0] = ref_yv12[id];
+          vp9_clamp_mv_min_max(x, &ref_mv[id]);
+
+          // Use mv result from single mode as mvp.
+          tmp_mv.as_int = frame_mv[NEWMV][refs[id]].as_int;
+
+          tmp_mv.as_mv.col >>= 3;
+          tmp_mv.as_mv.row >>= 3;
+
+          // Small-range full-pixel motion search
+          bestsme = vp9_refining_search_8p_c(x, &tmp_mv, sadpb,
+                                             search_range,
+                                             &cpi->fn_ptr[block_size],
+                                             x->nmvjointcost, x->mvcost,
+                                             &ref_mv[id], second_pred,
+                                             b_sz[bsize][0], b_sz[bsize][1]);
+
+          x->mv_col_min = tmp_col_min;
+          x->mv_col_max = tmp_col_max;
+          x->mv_row_min = tmp_row_min;
+          x->mv_row_max = tmp_row_max;
+
+          if (bestsme < INT_MAX) {
+            int dis; /* TODO: use dis in distortion calculation later. */
+            unsigned int sse;
+
+            vp9_find_best_sub_pixel_comp(x, &tmp_mv,
+                                         &ref_mv[id],
+                                         x->errorperbit,
+                                         &cpi->fn_ptr[block_size],
+                                         x->nmvjointcost, x->mvcost,
+                                         &dis, &sse, second_pred,
+                                         b_sz[bsize][0], b_sz[bsize][1]);
+          }
+
+          frame_mv[NEWMV][refs[id]].as_int =
+              xd->mode_info_context->bmi[0].as_mv[1].as_int = tmp_mv.as_int;
+          if (id)
+            xd->plane[0].pre[0] = scaled_first_yv12;
+        }
+
+        // restore the predictor
+        if (scaled_ref_frame[0]) {
+          int i;
+
+          for (i = 0; i < MAX_MB_PLANE; i++)
+            xd->plane[i].pre[0] = backup_yv12[i];
+        }
+
+        if (scaled_ref_frame[1]) {
+          int i;
+
+          for (i = 0; i < MAX_MB_PLANE; i++)
+            xd->plane[i].pre[1] = backup_second_yv12[i];
+        }
+
+        vpx_free(second_pred);
+#endif  // CONFIG_COMP_INTER_JOINT_SEARCH
+
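As a rough illustration of the alternating refinement above (a sketch under assumptions, not part of the patch): the two motion vectors are seeded from the single-prediction NEWMV results, and each one is then refined in turn while the other reference's prediction is held fixed as the second prediction; refine_mv() below is a hypothetical stand-in for the vp9_refining_search_8p_c() plus sub-pel steps:

#include <stdio.h>

typedef struct { int row, col; } mv_t;

/* Hypothetical stand-in: refine the mv of reference `id` while the other
 * reference's prediction is kept fixed. */
static mv_t refine_mv(mv_t start, int id) {
  mv_t out = start;
  out.col += (id == 0) ? -1 : 1;   /* pretend the search moved by one pel */
  return out;
}

int main(void) {
  mv_t mv[2] = { {4, -8}, {6, -7} };   /* seeded from single-prediction NEWMV */
  int ite;

  /* One pass per reference frame; running more passes was tried upstream
   * but showed no measurable gain. */
  for (ite = 0; ite < 2; ++ite) {
    const int id = ite & 1;    /* which reference is refined this pass */
    /* the !id reference's prediction would be rebuilt from mv[!id] here
     * and treated as a fixed second prediction during the search */
    mv[id] = refine_mv(mv[id], id);
  }

  printf("mv0=(%d,%d) mv1=(%d,%d)\n", mv[0].row, mv[0].col, mv[1].row, mv[1].col);
  return 0;
}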
         if (frame_mv[NEWMV][refs[0]].as_int == INVALID_MV ||
             frame_mv[NEWMV][refs[1]].as_int == INVALID_MV)
           return INT64_MAX;
@@ -3162,7 +2009,7 @@
         int tmp_row_min = x->mv_row_min;
         int tmp_row_max = x->mv_row_max;
 
-        if (scaled_ref_frame) {
+        if (scaled_ref_frame[0]) {
           int i;
 
           // Swap out the reference frame for a version that's been scaled to
@@ -3171,7 +2018,7 @@
           for (i = 0; i < MAX_MB_PLANE; i++)
             backup_yv12[i] = xd->plane[i].pre[0];
 
-          setup_pre_planes(xd, scaled_ref_frame, NULL, mi_row, mi_col,
+          setup_pre_planes(xd, scaled_ref_frame[0], NULL, mi_row, mi_col,
                            NULL, NULL);
         }
 
@@ -3214,6 +2061,7 @@
         }
         frame_mv[NEWMV][refs[0]].as_int =
           xd->mode_info_context->bmi[0].as_mv[0].as_int = tmp_mv.as_int;
+        single_newmv[refs[0]].as_int = tmp_mv.as_int;
 
         // Add the new motion vector cost to our rolling cost variable
         *rate2 += vp9_mv_bit_cost(&tmp_mv, &ref_mv[0],
@@ -3221,7 +2069,7 @@
                                   96, xd->allow_high_precision_mv);
 
         // restore the predictor, if required
-        if (scaled_ref_frame) {
+        if (scaled_ref_frame[0]) {
           int i;
 
           for (i = 0; i < MAX_MB_PLANE; i++)
@@ -3257,20 +2105,6 @@
                                 is_comp_pred);
   *rate2 += vp9_cost_mv_ref(cpi, this_mode,
                             mbmi->mb_mode_context[mbmi->ref_frame]);
-#if CONFIG_COMP_INTERINTRA_PRED
-  if (!is_comp_pred) {
-    *compmode_interintra_cost = vp9_cost_bit(cm->fc.interintra_prob,
-                                             is_comp_interintra_pred);
-    if (is_comp_interintra_pred) {
-      *compmode_interintra_cost +=
-          x->mbmode_cost[xd->frame_type][mbmi->interintra_mode];
-#if SEPARATE_INTERINTRA_UV
-      *compmode_interintra_cost +=
-          x->intra_uv_mode_cost[xd->frame_type][mbmi->interintra_uv_mode];
-#endif
-    }
-  }
-#endif
 
   pred_exists = 0;
   interpolating_intpel_seen = 0;
@@ -3283,131 +2117,84 @@
   // Search for best switchable filter by checking the variance of
   // pred error irrespective of whether the filter will be used
   if (1) {
-    int switchable_filter_index, newbest;
-    int tmp_rate_y_i = 0, tmp_rate_u_i = 0, tmp_rate_v_i = 0;
-    int tmp_dist_y_i = 0, tmp_dist_u_i = 0, tmp_dist_v_i = 0;
-    for (switchable_filter_index = 0;
-         switchable_filter_index < VP9_SWITCHABLE_FILTERS;
-         ++switchable_filter_index) {
+    int i, newbest;
+    int tmp_rate_sum = 0, tmp_dist_sum = 0;
+    for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
       int rs = 0;
-      mbmi->interp_filter = vp9_switchable_interp[switchable_filter_index];
-      vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
+      const INTERPOLATIONFILTERTYPE filter = vp9_switchable_interp[i];
+      const int is_intpel_interp = intpel_mv &&
+                                   vp9_is_interpolating_filter[filter];
+      mbmi->interp_filter = filter;
+      vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
 
-      if (cpi->common.mcomp_filter_type == SWITCHABLE) {
-        const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP);
-        const int m = vp9_switchable_interp_map[mbmi->interp_filter];
-        rs = SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m];
-      }
-      if (interpolating_intpel_seen && intpel_mv &&
-          vp9_is_interpolating_filter[mbmi->interp_filter]) {
-        rd = RDCOST(x->rdmult, x->rddiv,
-                    rs + tmp_rate_y_i + tmp_rate_u_i + tmp_rate_v_i,
-                    tmp_dist_y_i + tmp_dist_u_i + tmp_dist_v_i);
+      if (cm->mcomp_filter_type == SWITCHABLE)
+        rs = get_switchable_rate(cm, x);
+
+      if (interpolating_intpel_seen && is_intpel_interp) {
+        rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate_sum, tmp_dist_sum);
       } else {
-        unsigned int sse, var;
-        int tmp_rate_y, tmp_rate_u, tmp_rate_v;
-        int tmp_dist_y, tmp_dist_u, tmp_dist_v;
+        int rate_sum = 0, dist_sum = 0;
         vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
-        var = cpi->fn_ptr[block_size].vf(x->plane[0].src.buf,
-                                         x->plane[0].src.stride,
-                                         xd->plane[0].dst.buf,
-                                         xd->plane[0].dst.stride,
-                                         &sse);
-        // Note our transform coeffs are 8 times an orthogonal transform.
-        // Hence quantizer step is also 8 times. To get effective quantizer
-        // we need to divide by 8 before sending to modeling function.
-        model_rd_from_var_lapndz(var, MI_SIZE * bw * MI_SIZE * bh,
-                                 xd->plane[0].dequant[1] >> 3,
-                                 &tmp_rate_y, &tmp_dist_y);
-        var = cpi->fn_ptr[uv_block_size].vf(x->plane[1].src.buf,
-                                            x->plane[1].src.stride,
-                                            xd->plane[1].dst.buf,
-                                            xd->plane[1].dst.stride,
-                                            &sse);
-        model_rd_from_var_lapndz(var, MI_UV_SIZE * bw * MI_UV_SIZE * bh,
-                                 xd->plane[1].dequant[1] >> 3,
-                                 &tmp_rate_u, &tmp_dist_u);
-        var = cpi->fn_ptr[uv_block_size].vf(x->plane[2].src.buf,
-                                            x->plane[1].src.stride,
-                                            xd->plane[2].dst.buf,
-                                            xd->plane[1].dst.stride,
-                                            &sse);
-        model_rd_from_var_lapndz(var, MI_UV_SIZE * bw * MI_UV_SIZE * bh,
-                                 xd->plane[2].dequant[1] >> 3,
-                                 &tmp_rate_v, &tmp_dist_v);
-        rd = RDCOST(x->rdmult, x->rddiv,
-                    rs + tmp_rate_y + tmp_rate_u + tmp_rate_v,
-                    tmp_dist_y + tmp_dist_u + tmp_dist_v);
-        if (!interpolating_intpel_seen && intpel_mv &&
-            vp9_is_interpolating_filter[mbmi->interp_filter]) {
-          tmp_rate_y_i = tmp_rate_y;
-          tmp_rate_u_i = tmp_rate_u;
-          tmp_rate_v_i = tmp_rate_v;
-          tmp_dist_y_i = tmp_dist_y;
-          tmp_dist_u_i = tmp_dist_u;
-          tmp_dist_v_i = tmp_dist_v;
+        model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum);
+        rd = RDCOST(x->rdmult, x->rddiv, rs + rate_sum, dist_sum);
+        if (!interpolating_intpel_seen && is_intpel_interp) {
+          tmp_rate_sum = rate_sum;
+          tmp_dist_sum = dist_sum;
         }
       }
-      newbest = (switchable_filter_index == 0 || rd < best_rd);
+      newbest = i == 0 || rd < best_rd;
+
       if (newbest) {
         best_rd = rd;
         *best_filter = mbmi->interp_filter;
       }
+
       if ((cm->mcomp_filter_type == SWITCHABLE && newbest) ||
           (cm->mcomp_filter_type != SWITCHABLE &&
            cm->mcomp_filter_type == mbmi->interp_filter)) {
-        int i;
-        for (i = 0; i < MI_SIZE * bh; ++i)
-          vpx_memcpy(tmp_ybuf + i * MI_SIZE * bw,
-                     xd->plane[0].dst.buf + i * xd->plane[0].dst.stride,
-                     sizeof(unsigned char) * MI_SIZE * bw);
-        for (i = 0; i < MI_UV_SIZE * bh; ++i)
-          vpx_memcpy(tmp_ubuf + i * MI_UV_SIZE * bw,
-                     xd->plane[1].dst.buf + i * xd->plane[1].dst.stride,
-                     sizeof(unsigned char) * MI_UV_SIZE * bw);
-        for (i = 0; i < MI_UV_SIZE * bh; ++i)
-          vpx_memcpy(tmp_vbuf + i * MI_UV_SIZE * bw,
-                     xd->plane[2].dst.buf + i * xd->plane[1].dst.stride,
-                     sizeof(unsigned char) * MI_UV_SIZE * bw);
+        int p;
+
+        for (p = 0; p < MAX_MB_PLANE; p++) {
+          const int y = (MI_SIZE * bh) >> xd->plane[p].subsampling_y;
+          const int x = (MI_SIZE * bw) >> xd->plane[p].subsampling_x;
+          int i;
+
+          for (i = 0; i < y; i++)
+            vpx_memcpy(&tmp_buf[p][64 * i],
+                       xd->plane[p].dst.buf + i * xd->plane[p].dst.stride, x);
+        }
         pred_exists = 1;
       }
-      interpolating_intpel_seen |=
-        intpel_mv && vp9_is_interpolating_filter[mbmi->interp_filter];
+      interpolating_intpel_seen |= is_intpel_interp;
     }
   }
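A brief aside on the caching in the loop above (illustrative only): a filter that leaves integer-pel positions untouched (which is what "interpolating" refers to here) produces the same prediction as any other such filter when the motion vector has no sub-pel component, so the first result is remembered in tmp_rate_sum/tmp_dist_sum and reused instead of rebuilding the prediction. A toy standalone version of that control flow, with made-up rates:

#include <stdio.h>

int main(void) {
  const int is_interpolating[3] = {1, 1, 0};  /* hypothetical filter set */
  const int intpel_mv = 1;                    /* mv has no sub-pel part */
  int seen = 0, cached_rate = 0, cached_dist = 0, evaluations = 0, i;

  for (i = 0; i < 3; ++i) {
    int rate, dist;
    if (seen && intpel_mv && is_interpolating[i]) {
      rate = cached_rate;                     /* identical prediction: reuse */
      dist = cached_dist;
    } else {
      rate = 100 + 10 * i;                    /* stand-ins for a real evaluation */
      dist = 5000 - 100 * i;
      ++evaluations;
      if (!seen && intpel_mv && is_interpolating[i]) {
        seen = 1;
        cached_rate = rate;
        cached_dist = dist;
      }
    }
    printf("filter %d: rate %d dist %d\n", i, rate, dist);
  }
  printf("full evaluations: %d of 3\n", evaluations);
  return 0;
}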
 
   // Set the appropriate filter
-  if (cm->mcomp_filter_type != SWITCHABLE)
-    mbmi->interp_filter = cm->mcomp_filter_type;
-  else
-    mbmi->interp_filter = *best_filter;
-  vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
+  mbmi->interp_filter = cm->mcomp_filter_type != SWITCHABLE ?
+                             cm->mcomp_filter_type : *best_filter;
+  vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
+
 
   if (pred_exists) {
-    // FIXME(rbultje): mb code still predicts into xd->predictor
-    for (i = 0; i < bh * MI_SIZE; ++i)
-      vpx_memcpy(xd->plane[0].dst.buf + i * xd->plane[0].dst.stride,
-                 tmp_ybuf + i * bw * MI_SIZE,
-                 sizeof(unsigned char) * bw * MI_SIZE);
-    for (i = 0; i < bh * MI_UV_SIZE; ++i)
-      vpx_memcpy(xd->plane[1].dst.buf + i * xd->plane[1].dst.stride,
-                 tmp_ubuf + i * bw * MI_UV_SIZE,
-                 sizeof(unsigned char) * bw * MI_UV_SIZE);
-    for (i = 0; i < bh * MI_UV_SIZE; ++i)
-      vpx_memcpy(xd->plane[2].dst.buf + i * xd->plane[1].dst.stride,
-                 tmp_vbuf + i * bw * MI_UV_SIZE,
-                 sizeof(unsigned char) * bw * MI_UV_SIZE);
+    int p;
+
+    for (p = 0; p < MAX_MB_PLANE; p++) {
+      const int y = (MI_SIZE * bh) >> xd->plane[p].subsampling_y;
+      const int x = (MI_SIZE * bw) >> xd->plane[p].subsampling_x;
+      int i;
+
+      for (i = 0; i < y; i++)
+        vpx_memcpy(xd->plane[p].dst.buf + i * xd->plane[p].dst.stride,
+                   &tmp_buf[p][64 * i], x);
+    }
   } else {
     // Handles the special case when a filter that is not in the
     // switchable list (ex. bilinear, 6-tap) is indicated at the frame level
     vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
   }
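The copy loops above stash the best filter's prediction per plane into a fixed-stride (64-byte) scratch buffer and later copy it back, so the winning prediction does not have to be rebuilt; each plane's width and height is the luma size shifted down by its subsampling. A standalone sketch of that pattern (buffer sizes and names are illustrative only):

#include <string.h>

#define SCRATCH_STRIDE 64

/* Copy a width x height region row-by-row into a fixed-stride scratch buffer. */
static void stash_plane(unsigned char *scratch, const unsigned char *dst,
                        int dst_stride, int width, int height) {
  int i;
  for (i = 0; i < height; ++i)
    memcpy(&scratch[SCRATCH_STRIDE * i], &dst[dst_stride * i], width);
}

/* Copy the stashed region back into the destination buffer. */
static void restore_plane(unsigned char *dst, int dst_stride,
                          const unsigned char *scratch, int width, int height) {
  int i;
  for (i = 0; i < height; ++i)
    memcpy(&dst[dst_stride * i], &scratch[SCRATCH_STRIDE * i], width);
}

int main(void) {
  unsigned char dst[128 * 64] = {0};          /* pretend reconstruction buffer */
  unsigned char scratch[SCRATCH_STRIDE * 64];
  /* A 32x32 luma block in 4:2:0 would stash 32x32 (Y) and 16x16 (U, V). */
  stash_plane(scratch, dst, 128, 32, 32);
  restore_plane(dst, 128, scratch, 32, 32);
  return 0;
}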
 
-  if (cpi->common.mcomp_filter_type == SWITCHABLE) {
-    const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP);
-    const int m = vp9_switchable_interp_map[mbmi->interp_filter];
-    *rate2 += SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m];
-  }
+  if (cpi->common.mcomp_filter_type == SWITCHABLE)
+    *rate2 += get_switchable_rate(cm, x);
 
   if (cpi->active_map_enabled && x->active_ptr[0] == 0)
     x->skip = 1;
@@ -3419,17 +2206,11 @@
     if (threshold < x->encode_breakout)
       threshold = x->encode_breakout;
 
-    if (bsize != BLOCK_SIZE_MB16X16) {
-      var = cpi->fn_ptr[block_size].vf(x->plane[0].src.buf,
-                                       x->plane[0].src.stride,
-                                       xd->plane[0].dst.buf,
-                                       xd->plane[0].dst.stride,
-                                       &sse);
-    } else {
-      var = vp9_variance16x16(x->plane[0].src.buf, x->plane[0].src.stride,
-                              xd->plane[0].dst.buf, xd->plane[0].dst.stride,
-                              &sse);
-    }
+    var = cpi->fn_ptr[block_size].vf(x->plane[0].src.buf,
+                                     x->plane[0].src.stride,
+                                     xd->plane[0].dst.buf,
+                                     xd->plane[0].dst.stride,
+                                     &sse);
 
     if ((int)sse < threshold) {
       unsigned int q2dc = xd->plane[0].dequant[0];
@@ -3439,29 +2220,16 @@
           (sse / 2 > var && sse - var < 64)) {
         // Check u and v to make sure skip is ok
         int sse2;
-
-        if (bsize != BLOCK_SIZE_MB16X16) {
-          unsigned int sse2u, sse2v;
-          // FIXME(rbultje): mb predictors predict into xd->predictor
-          var = cpi->fn_ptr[uv_block_size].vf(x->plane[1].src.buf,
-                                              x->plane[1].src.stride,
-                                              xd->plane[1].dst.buf,
-                                              xd->plane[1].dst.stride, &sse2u);
-          var = cpi->fn_ptr[uv_block_size].vf(x->plane[2].src.buf,
-                                              x->plane[1].src.stride,
-                                              xd->plane[2].dst.buf,
-                                              xd->plane[1].dst.stride, &sse2v);
-          sse2 = sse2u + sse2v;
-        } else {
-          unsigned int sse2u, sse2v;
-          var = vp9_variance8x8(x->plane[1].src.buf, x->plane[1].src.stride,
-                                xd->plane[1].dst.buf, xd->plane[1].dst.stride,
-                                &sse2u);
-          var = vp9_variance8x8(x->plane[2].src.buf, x->plane[1].src.stride,
-                                xd->plane[2].dst.buf, xd->plane[1].dst.stride,
-                                &sse2v);
-          sse2 = sse2u + sse2v;
-        }
+        unsigned int sse2u, sse2v;
+        var = cpi->fn_ptr[uv_block_size].vf(x->plane[1].src.buf,
+                                            x->plane[1].src.stride,
+                                            xd->plane[1].dst.buf,
+                                            xd->plane[1].dst.stride, &sse2u);
+        var = cpi->fn_ptr[uv_block_size].vf(x->plane[2].src.buf,
+                                            x->plane[1].src.stride,
+                                            xd->plane[2].dst.buf,
+                                            xd->plane[1].dst.stride, &sse2v);
+        sse2 = sse2u + sse2v;
 
         if (sse2 * 2 < threshold) {
           x->skip = 1;
@@ -3502,877 +2270,11 @@
     } else {
       *mode_excluded = (cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY);
     }
-#if CONFIG_COMP_INTERINTRA_PRED
-    if (is_comp_interintra_pred && !cm->use_interintra) *mode_excluded = 1;
-#endif
   }
 
   return this_rd;  // if 0, this will be re-calculated by caller
 }
 
-#if !CONFIG_SB8X8
-static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
-                               int mi_row, int mi_col,
-                               int *returnrate, int *returndistortion,
-                               int64_t *returnintra) {
-  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
-    VP9_ALT_FLAG };
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
-  union b_mode_info best_bmodes[16];
-  MB_MODE_INFO best_mbmode;
-  PARTITION_INFO best_partition;
-  int_mv best_ref_mv, second_best_ref_mv;
-  MB_PREDICTION_MODE this_mode;
-  MB_PREDICTION_MODE best_mode = DC_PRED;
-  MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
-  int i, best_mode_index = 0;
-  int mode8x8[4];
-  unsigned char segment_id = mbmi->segment_id;
-
-  int mode_index;
-  int mdcounts[4];
-  int rate, distortion;
-  int rate2, distortion2;
-  int64_t best_txfm_rd[NB_TXFM_MODES];
-  int64_t best_txfm_diff[NB_TXFM_MODES];
-  int64_t best_pred_diff[NB_PREDICTION_TYPES];
-  int64_t best_pred_rd[NB_PREDICTION_TYPES];
-  int64_t best_rd = INT64_MAX, best_intra_rd = INT64_MAX;
-#if CONFIG_COMP_INTERINTRA_PRED
-  int is_best_interintra = 0;
-  int64_t best_intra16_rd = INT64_MAX;
-  int best_intra16_mode = DC_PRED;
-#if SEPARATE_INTERINTRA_UV
-  int best_intra16_uv_mode = DC_PRED;
-#endif
-#endif
-  int64_t best_overall_rd = INT64_MAX;
-  INTERPOLATIONFILTERTYPE best_filter = SWITCHABLE;
-  INTERPOLATIONFILTERTYPE tmp_best_filter = SWITCHABLE;
-  int uv_intra_rate[2], uv_intra_distortion[2], uv_intra_rate_tokenonly[2];
-  int uv_intra_skippable[2];
-  MB_PREDICTION_MODE uv_intra_mode[2];
-  int rate_y, UNINITIALIZED_IS_SAFE(rate_uv);
-  int distortion_uv = INT_MAX;
-  int64_t best_yrd = INT64_MAX;
-
-  int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
-  int frame_mdcounts[4][4];
-  YV12_BUFFER_CONFIG yv12_mb[4];
-
-  unsigned int ref_costs[MAX_REF_FRAMES];
-  int_mv seg_mvs[NB_PARTITIONINGS][16 /* n_blocks */][MAX_REF_FRAMES - 1];
-
-  int intra_cost_penalty = 20 * vp9_dc_quant(cpi->common.base_qindex,
-                                             cpi->common.y_dc_delta_q);
-  int64_t mode_distortions[MB_MODE_COUNT] = {-1};
-  int64_t frame_distortions[MAX_REF_FRAMES] = {-1};
-  int ref_frame;
-
-  struct scale_factors scale_factor[4];
-
-  vpx_memset(mode8x8, 0, sizeof(mode8x8));
-  vpx_memset(&frame_mv, 0, sizeof(frame_mv));
-  vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
-  vpx_memset(&best_bmodes, 0, sizeof(best_bmodes));
-  vpx_memset(&x->mb_context[xd->sb_index][xd->mb_index], 0,
-             sizeof(PICK_MODE_CONTEXT));
-
-  x->mb_context[xd->sb_index][xd->mb_index].frames_with_high_error = 0;
-  x->mb_context[xd->sb_index][xd->mb_index].modes_with_high_error = 0;
-
-  for (i = 0; i < MAX_REF_FRAMES; i++)
-    frame_mv[NEWMV][i].as_int = INVALID_MV;
-  for (i = 0; i < NB_PREDICTION_TYPES; ++i)
-    best_pred_rd[i] = INT64_MAX;
-  for (i = 0; i < NB_TXFM_MODES; i++)
-    best_txfm_rd[i] = INT64_MAX;
-
-  for (i = 0; i < NB_PARTITIONINGS; i++) {
-    int j, k;
-
-    for (j = 0; j < 16; j++)
-      for (k = 0; k < MAX_REF_FRAMES - 1; k++)
-        seg_mvs[i][j][k].as_int = INVALID_MV;
-  }
-
-  if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
-    setup_buffer_inter(cpi, x, cpi->lst_fb_idx,
-                       LAST_FRAME, BLOCK_16X16, mi_row, mi_col,
-                       frame_mv[NEARESTMV], frame_mv[NEARMV],
-                       frame_mdcounts, yv12_mb, scale_factor);
-  }
-
-  if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
-    setup_buffer_inter(cpi, x, cpi->gld_fb_idx,
-                       GOLDEN_FRAME, BLOCK_16X16, mi_row, mi_col,
-                       frame_mv[NEARESTMV], frame_mv[NEARMV],
-                       frame_mdcounts, yv12_mb, scale_factor);
-  }
-
-  if (cpi->ref_frame_flags & VP9_ALT_FLAG) {
-    setup_buffer_inter(cpi, x, cpi->alt_fb_idx,
-                       ALTREF_FRAME, BLOCK_16X16, mi_row, mi_col,
-                       frame_mv[NEARESTMV], frame_mv[NEARMV],
-                       frame_mdcounts, yv12_mb, scale_factor);
-  }
-
-  *returnintra = INT64_MAX;
-
-  mbmi->ref_frame = INTRA_FRAME;
-
-  /* Initialize zbin mode boost for uv costing */
-  cpi->zbin_mode_boost = 0;
-  vp9_update_zbin_extra(cpi, x);
-
-  xd->mode_info_context->mbmi.mode = DC_PRED;
-
-  for (i = 0; i <= TX_8X8; i++) {
-    mbmi->txfm_size = i;
-    rd_pick_intra_sbuv_mode(cpi, x, &uv_intra_rate[i],
-                            &uv_intra_rate_tokenonly[i],
-                            &uv_intra_distortion[i],
-                            &uv_intra_skippable[i],
-                            BLOCK_SIZE_MB16X16);
-    uv_intra_mode[i] = mbmi->uv_mode;
-  }
-
-  // Get estimates of reference frame costs for each reference frame
-  // that depend on the current prediction etc.
-  estimate_ref_frame_costs(cpi, segment_id, ref_costs);
-
-  for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
-    int64_t this_rd = INT64_MAX;
-    int disable_skip = 0, skippable = 0;
-    int other_cost = 0;
-    int compmode_cost = 0;
-#if CONFIG_COMP_INTERINTRA_PRED
-    int compmode_interintra_cost = 0;
-#endif
-    int mode_excluded = 0;
-    int64_t txfm_cache[NB_TXFM_MODES] = { 0 };
-    YV12_BUFFER_CONFIG *scaled_ref_frame;
-
-    // These variables hold are rolling total cost and distortion for this mode
-    rate2 = 0;
-    distortion2 = 0;
-    rate_y = 0;
-    rate_uv = 0;
-
-    x->skip = 0;
-
-    this_mode = vp9_mode_order[mode_index].mode;
-    mbmi->mode = this_mode;
-    mbmi->uv_mode = DC_PRED;
-    mbmi->ref_frame = vp9_mode_order[mode_index].ref_frame;
-    mbmi->second_ref_frame = vp9_mode_order[mode_index].second_ref_frame;
-
-    mbmi->interp_filter = cm->mcomp_filter_type;
-
-    set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,
-                      scale_factor);
-
-    vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
-
-    // Test best rd so far against threshold for trying this mode.
-    if (best_rd <= cpi->rd_threshes[mode_index])
-      continue;
-
-    // Ensure that the references used by this mode are available.
-    if (mbmi->ref_frame &&
-        !(cpi->ref_frame_flags & flag_list[mbmi->ref_frame]))
-      continue;
-
-    if (mbmi->second_ref_frame > 0 &&
-        !(cpi->ref_frame_flags & flag_list[mbmi->second_ref_frame]))
-      continue;
-
-    // only scale on zeromv.
-    if (mbmi->ref_frame > 0 &&
-          (yv12_mb[mbmi->ref_frame].y_width != cm->mb_cols * 16 ||
-           yv12_mb[mbmi->ref_frame].y_height != cm->mb_rows * 16) &&
-        this_mode != ZEROMV)
-      continue;
-
-    if (mbmi->second_ref_frame > 0 &&
-          (yv12_mb[mbmi->second_ref_frame].y_width != cm->mb_cols * 16 ||
-           yv12_mb[mbmi->second_ref_frame].y_height != cm->mb_rows * 16) &&
-        this_mode != ZEROMV)
-      continue;
-
-    // current coding mode under rate-distortion optimization test loop
-#if CONFIG_COMP_INTERINTRA_PRED
-    mbmi->interintra_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-    mbmi->interintra_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-#endif
-
-    // If the segment reference frame feature is enabled....
-    // then do nothing if the current ref frame is not allowed..
-    if (vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
-        !vp9_check_segref(xd, segment_id, mbmi->ref_frame)) {
-      continue;
-    // If the segment skip feature is enabled....
-    // then do nothing if the current mode is not allowed..
-    } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP) &&
-               (this_mode != ZEROMV)) {
-      continue;
-    // Disable this drop out case if  the ref frame segment
-    // level feature is enabled for this segment. This is to
-    // prevent the possibility that the we end up unable to pick any mode.
-    } else if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME)) {
-      // Only consider ZEROMV/ALTREF_FRAME for alt ref frame overlay,
-      // unless ARNR filtering is enabled in which case we want
-      // an unfiltered alternative
-      if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
-        if (this_mode != ZEROMV ||
-            mbmi->ref_frame != ALTREF_FRAME) {
-          continue;
-        }
-      }
-    }
-
-    /* everything but intra */
-    scaled_ref_frame = NULL;
-    if (mbmi->ref_frame) {
-      int ref = mbmi->ref_frame;
-      int fb;
-
-      best_ref_mv = mbmi->ref_mvs[ref][0];
-      vpx_memcpy(mdcounts, frame_mdcounts[ref], sizeof(mdcounts));
-
-      if (mbmi->ref_frame == LAST_FRAME) {
-        fb = cpi->lst_fb_idx;
-      } else if (mbmi->ref_frame == GOLDEN_FRAME) {
-        fb = cpi->gld_fb_idx;
-      } else {
-        fb = cpi->alt_fb_idx;
-      }
-
-      if (cpi->scaled_ref_idx[fb] != cm->ref_frame_map[fb])
-        scaled_ref_frame = &cm->yv12_fb[cpi->scaled_ref_idx[fb]];
-    }
-
-    if (mbmi->second_ref_frame > 0) {
-      int ref = mbmi->second_ref_frame;
-
-      second_best_ref_mv = mbmi->ref_mvs[ref][0];
-    }
-
-    // TODO(jkoleszar) scaling/translation handled during creation of yv12_mb
-    // currently.
-    setup_pre_planes(xd, &yv12_mb[mbmi->ref_frame],
-        mbmi->second_ref_frame > 0 ? &yv12_mb[mbmi->second_ref_frame] : NULL,
-        0, 0, NULL, NULL);
-
-    // Experimental code. Special case for gf and arf zeromv modes.
-    // Increase zbin size to suppress noise
-    if (cpi->zbin_mode_boost_enabled) {
-      if (vp9_mode_order[mode_index].ref_frame == INTRA_FRAME)
-        cpi->zbin_mode_boost = 0;
-      else {
-        if (vp9_mode_order[mode_index].mode == ZEROMV) {
-          if (vp9_mode_order[mode_index].ref_frame != LAST_FRAME)
-            cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
-          else
-            cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
-        } else if (vp9_mode_order[mode_index].mode == SPLITMV)
-          cpi->zbin_mode_boost = 0;
-        else
-          cpi->zbin_mode_boost = MV_ZBIN_BOOST;
-      }
-
-      vp9_update_zbin_extra(cpi, x);
-    }
-
-    // Intra
-    if (!mbmi->ref_frame) {
-      switch (this_mode) {
-        default:
-        case V_PRED:
-        case H_PRED:
-        case D45_PRED:
-        case D135_PRED:
-        case D117_PRED:
-        case D153_PRED:
-        case D27_PRED:
-        case D63_PRED:
-          rate2 += intra_cost_penalty;
-        case DC_PRED:
-        case TM_PRED:
-          mbmi->ref_frame = INTRA_FRAME;
-          // FIXME compound intra prediction
-          vp9_build_intra_predictors_sby_s(&x->e_mbd, BLOCK_SIZE_MB16X16);
-          // vp9_build_intra_predictors_mby(&x->e_mbd);
-          super_block_yrd(cpi, x, &rate_y, &distortion, &skippable,
-                          BLOCK_SIZE_MB16X16, txfm_cache);
-          rate2 += rate_y;
-          distortion2 += distortion;
-          rate2 += x->mbmode_cost[xd->frame_type][mbmi->mode];
-
-          rate2 += uv_intra_rate[mbmi->txfm_size != TX_4X4];
-          rate_uv = uv_intra_rate_tokenonly[mbmi->txfm_size != TX_4X4];
-          distortion2 += uv_intra_distortion[mbmi->txfm_size != TX_4X4];
-          distortion_uv = uv_intra_distortion[mbmi->txfm_size != TX_4X4];
-          skippable = skippable &&
-                      uv_intra_skippable[mbmi->txfm_size != TX_4X4];
-          break;
-        case I4X4_PRED: {
-          int64_t tmp_rd;
-
-          // Note the rate value returned here includes the cost of coding
-          // the I4X4_PRED mode : x->mbmode_cost[xd->frame_type][I4X4_PRED];
-          mbmi->txfm_size = TX_4X4;
-          tmp_rd = rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y,
-                                             &distortion, best_yrd);
-          rate2 += rate;
-          rate2 += intra_cost_penalty;
-          distortion2 += distortion;
-
-          if (tmp_rd < best_yrd) {
-            rate2 += uv_intra_rate[TX_4X4];
-            rate_uv = uv_intra_rate_tokenonly[TX_4X4];
-            distortion2 += uv_intra_distortion[TX_4X4];
-            distortion_uv = uv_intra_distortion[TX_4X4];
-          } else {
-            this_rd = INT64_MAX;
-            disable_skip = 1;
-          }
-        }
-        break;
-        case I8X8_PRED: {
-          int64_t tmp_rd;
-
-          tmp_rd = rd_pick_intra8x8mby_modes_and_txsz(cpi, x, &rate, &rate_y,
-                                                      &distortion, mode8x8,
-                                                      best_yrd, txfm_cache);
-          rate2 += rate;
-          rate2 += intra_cost_penalty;
-          distortion2 += distortion;
-
-          /* TODO: uv rate maybe over-estimated here since there is UV intra
-                   mode coded in I8X8_PRED prediction */
-          if (tmp_rd < best_yrd) {
-            rate2 += uv_intra_rate[TX_4X4];
-            rate_uv = uv_intra_rate_tokenonly[TX_4X4];
-            distortion2 += uv_intra_distortion[TX_4X4];
-            distortion_uv = uv_intra_distortion[TX_4X4];
-          } else {
-            this_rd = INT64_MAX;
-            disable_skip = 1;
-          }
-        }
-        break;
-      }
-    }
-    // Split MV. The code is very different from the other inter modes so
-    // special case it.
-    else if (this_mode == SPLITMV) {
-      const int is_comp_pred = mbmi->second_ref_frame > 0;
-      int64_t this_rd_thresh;
-      int64_t tmp_rd, tmp_best_rd = INT64_MAX, tmp_best_rdu = INT64_MAX;
-      int tmp_best_rate = INT_MAX, tmp_best_ratey = INT_MAX;
-      int tmp_best_distortion = INT_MAX, tmp_best_skippable = 0;
-      int switchable_filter_index;
-      int_mv *second_ref = is_comp_pred ? &second_best_ref_mv : NULL;
-      union b_mode_info tmp_best_bmodes[16];
-      MB_MODE_INFO tmp_best_mbmode;
-      PARTITION_INFO tmp_best_partition;
-      int pred_exists = 0;
-
-      this_rd_thresh =
-          (mbmi->ref_frame == LAST_FRAME) ?
-          cpi->rd_threshes[THR_NEWMV] : cpi->rd_threshes[THR_NEWA];
-      this_rd_thresh =
-          (mbmi->ref_frame == GOLDEN_FRAME) ?
-          cpi->rd_threshes[THR_NEWG] : this_rd_thresh;
-      xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-
-      for (switchable_filter_index = 0;
-           switchable_filter_index < VP9_SWITCHABLE_FILTERS;
-           ++switchable_filter_index) {
-        int newbest;
-        mbmi->interp_filter =
-            vp9_switchable_interp[switchable_filter_index];
-        vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
-
-        tmp_rd = rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv,
-                                             second_ref, best_yrd, mdcounts,
-                                             &rate, &rate_y, &distortion,
-                                             &skippable,
-                                             (int)this_rd_thresh, seg_mvs,
-                                             txfm_cache);
-        if (cpi->common.mcomp_filter_type == SWITCHABLE) {
-          int rs = SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs
-                   [vp9_get_pred_context(&cpi->common, xd,
-                                         PRED_SWITCHABLE_INTERP)]
-                   [vp9_switchable_interp_map[mbmi->interp_filter]];
-          tmp_rd += RDCOST(x->rdmult, x->rddiv, rs, 0);
-        }
-        newbest = (tmp_rd < tmp_best_rd);
-        if (newbest) {
-          tmp_best_filter = mbmi->interp_filter;
-          tmp_best_rd = tmp_rd;
-        }
-        if ((newbest && cm->mcomp_filter_type == SWITCHABLE) ||
-            (mbmi->interp_filter == cm->mcomp_filter_type &&
-             cm->mcomp_filter_type != SWITCHABLE)) {
-          tmp_best_rdu = tmp_rd;
-          tmp_best_rate = rate;
-          tmp_best_ratey = rate_y;
-          tmp_best_distortion = distortion;
-          tmp_best_skippable = skippable;
-          vpx_memcpy(&tmp_best_mbmode, mbmi, sizeof(MB_MODE_INFO));
-          vpx_memcpy(&tmp_best_partition, x->partition_info,
-                     sizeof(PARTITION_INFO));
-          for (i = 0; i < 16; i++) {
-            tmp_best_bmodes[i] = xd->mode_info_context->bmi[i];
-          }
-          pred_exists = 1;
-        }
-      }  // switchable_filter_index loop
-
-      mbmi->interp_filter = (cm->mcomp_filter_type == SWITCHABLE ?
-                             tmp_best_filter : cm->mcomp_filter_type);
-      vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
-      if (!pred_exists) {
-        // Handles the special case when a filter that is not in the
-        // switchable list (bilinear, 6-tap) is indicated at the frame level
-        tmp_rd = rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv,
-                                             second_ref, best_yrd, mdcounts,
-                                             &rate, &rate_y, &distortion,
-                                             &skippable,
-                                             (int)this_rd_thresh, seg_mvs,
-                                             txfm_cache);
-      } else {
-        if (cpi->common.mcomp_filter_type == SWITCHABLE) {
-          int rs = SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs
-                   [vp9_get_pred_context(&cpi->common, xd,
-                                         PRED_SWITCHABLE_INTERP)]
-                   [vp9_switchable_interp_map[mbmi->interp_filter]];
-          tmp_best_rdu -= RDCOST(x->rdmult, x->rddiv, rs, 0);
-        }
-        tmp_rd = tmp_best_rdu;
-        rate = tmp_best_rate;
-        rate_y = tmp_best_ratey;
-        distortion = tmp_best_distortion;
-        skippable = tmp_best_skippable;
-        vpx_memcpy(mbmi, &tmp_best_mbmode, sizeof(MB_MODE_INFO));
-        vpx_memcpy(x->partition_info, &tmp_best_partition,
-                   sizeof(PARTITION_INFO));
-        for (i = 0; i < 16; i++) {
-          xd->mode_info_context->bmi[i] = tmp_best_bmodes[i];
-        }
-      }
-
-      rate2 += rate;
-      distortion2 += distortion;
-
-      if (cpi->common.mcomp_filter_type == SWITCHABLE)
-        rate2 += SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs
-            [vp9_get_pred_context(&cpi->common, xd, PRED_SWITCHABLE_INTERP)]
-            [vp9_switchable_interp_map[mbmi->interp_filter]];
-
-      // If even the 'Y' rd value of split is higher than best so far
-      // then dont bother looking at UV
-      if (tmp_rd < best_yrd) {
-        int uv_skippable;
-
-        vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col,
-                                        BLOCK_SIZE_MB16X16);
-
-        vp9_subtract_sbuv(x, BLOCK_SIZE_MB16X16);
-
-        super_block_uvrd_for_txfm(cm, x, &rate_uv, &distortion_uv,
-                                  &uv_skippable, BLOCK_SIZE_MB16X16, TX_4X4);
-        rate2 += rate_uv;
-        distortion2 += distortion_uv;
-        skippable = skippable && uv_skippable;
-      } else {
-        this_rd = INT64_MAX;
-        disable_skip = 1;
-      }
-
-      if (!mode_excluded) {
-        if (is_comp_pred)
-          mode_excluded = cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY;
-        else
-          mode_excluded = cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY;
-      }
-
-      compmode_cost =
-        vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_COMP), is_comp_pred);
-      mbmi->mode = this_mode;
-    }
-    else {
-#if CONFIG_COMP_INTERINTRA_PRED
-      if (mbmi->second_ref_frame == INTRA_FRAME) {
-        if (best_intra16_mode == DC_PRED - 1) continue;
-        mbmi->interintra_mode = best_intra16_mode;
-#if SEPARATE_INTERINTRA_UV
-        mbmi->interintra_uv_mode = best_intra16_uv_mode;
-#else
-        mbmi->interintra_uv_mode = best_intra16_mode;
-#endif
-      }
-#endif
-      this_rd = handle_inter_mode(cpi, x, BLOCK_SIZE_MB16X16,
-                                  mdcounts, txfm_cache,
-                                  &rate2, &distortion2, &skippable,
-                                  &compmode_cost,
-#if CONFIG_COMP_INTERINTRA_PRED
-                                  &compmode_interintra_cost,
-#endif
-                                  &rate_y, &distortion,
-                                  &rate_uv, &distortion_uv,
-                                  &mode_excluded, &disable_skip,
-                                  mode_index, &tmp_best_filter, frame_mv,
-                                  scaled_ref_frame, mi_row, mi_col);
-      if (this_rd == INT64_MAX)
-        continue;
-    }
-
-#if CONFIG_COMP_INTERINTRA_PRED
-    if (cpi->common.use_interintra)
-      rate2 += compmode_interintra_cost;
-#endif
-
-    if (cpi->common.comp_pred_mode == HYBRID_PREDICTION)
-      rate2 += compmode_cost;
-
-    // Estimate the reference frame signaling cost and add it
-    // to the rolling cost variable.
-    rate2 += ref_costs[mbmi->ref_frame];
-
-    if (!disable_skip) {
-      // Test for the condition where skip block will be activated
-      // because there are no non zero coefficients and make any
-      // necessary adjustment for rate. Ignore if skip is coded at
-      // segment level as the cost wont have been added in.
-      int mb_skip_allowed;
-
-      // Is Mb level skip allowed (i.e. not coded at segment level).
-      mb_skip_allowed = !vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP);
-
-      if (skippable) {
-        mbmi->mb_skip_coeff = 1;
-
-        // Back out the coefficient coding costs
-        rate2 -= (rate_y + rate_uv);
-        // for best_yrd calculation
-        rate_uv = 0;
-
-        if (mb_skip_allowed) {
-          int prob_skip_cost;
-
-          // Cost the skip mb case
-          vp9_prob skip_prob =
-            vp9_get_pred_prob(cm, &x->e_mbd, PRED_MBSKIP);
-
-          if (skip_prob) {
-            prob_skip_cost = vp9_cost_bit(skip_prob, 1);
-            rate2 += prob_skip_cost;
-            other_cost += prob_skip_cost;
-          }
-        }
-      } else {
-        // Add in the cost of the no skip flag.
-        mbmi->mb_skip_coeff = 0;
-        if (mb_skip_allowed) {
-          int prob_skip_cost = vp9_cost_bit(
-                 vp9_get_pred_prob(cm, &x->e_mbd, PRED_MBSKIP), 0);
-          rate2 += prob_skip_cost;
-          other_cost += prob_skip_cost;
-        }
-      }
-
-      // Calculate the final RD estimate for this mode.
-      this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
-    }
-
-    // Keep record of best intra distortion
-    if ((mbmi->ref_frame == INTRA_FRAME) &&
-        (this_rd < best_intra_rd)) {
-      best_intra_rd = this_rd;
-      *returnintra = distortion2;
-    }
-#if CONFIG_COMP_INTERINTRA_PRED
-    if ((mbmi->ref_frame == INTRA_FRAME) &&
-        (this_mode <= TM_PRED) &&
-        (this_rd < best_intra16_rd)) {
-      best_intra16_rd = this_rd;
-      best_intra16_mode = this_mode;
-#if SEPARATE_INTERINTRA_UV
-      best_intra16_uv_mode = uv_intra_mode[mbmi->txfm_size != TX_4X4];
-#endif
-    }
-#endif
-
-    if (!disable_skip && mbmi->ref_frame == INTRA_FRAME)
-      for (i = 0; i < NB_PREDICTION_TYPES; ++i)
-        best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
-
-    if (this_rd < best_overall_rd) {
-      best_overall_rd = this_rd;
-      best_filter = tmp_best_filter;
-      best_mode = this_mode;
-#if CONFIG_COMP_INTERINTRA_PRED
-      is_best_interintra = (mbmi->second_ref_frame == INTRA_FRAME);
-#endif
-    }
-
-    // Store the respective mode distortions for later use.
-    // Store the respective mode distortions for later use.
-    if (mode_distortions[this_mode] == -1
-        || distortion2 < mode_distortions[this_mode]) {
-      mode_distortions[this_mode] = distortion2;
-    }
-    if (frame_distortions[mbmi->ref_frame] == -1 ||
-        distortion2 < frame_distortions[mbmi->ref_frame]) {
-       frame_distortions[mbmi->ref_frame] = distortion2;
-    }
-
-    // Did this mode help.. i.e. is it the new best mode
-    if (this_rd < best_rd || x->skip) {
-      if (!mode_excluded) {
-        /*
-        if (mbmi->second_ref_frame == INTRA_FRAME) {
-          printf("rd %d best %d bestintra16 %d\n", this_rd, best_rd, best_intra16_rd);
-        }
-        */
-        // Note index of best mode so far
-        best_mode_index = mode_index;
-
-        if (this_mode <= I4X4_PRED) {
-          if (mbmi->txfm_size != TX_4X4
-              && this_mode != I4X4_PRED
-              && this_mode != I8X8_PRED)
-            mbmi->uv_mode = uv_intra_mode[TX_8X8];
-          else
-            mbmi->uv_mode = uv_intra_mode[TX_4X4];
-          /* required for left and above block mv */
-          mbmi->mv[0].as_int = 0;
-        }
-
-        other_cost += ref_costs[mbmi->ref_frame];
-
-        /* Calculate the final y RD estimate for this mode */
-        best_yrd = RDCOST(x->rdmult, x->rddiv, (rate2 - rate_uv - other_cost),
-                          (distortion2 - distortion_uv));
-
-        *returnrate = rate2;
-        *returndistortion = distortion2;
-        best_rd = this_rd;
-        vpx_memcpy(&best_mbmode, mbmi, sizeof(MB_MODE_INFO));
-        vpx_memcpy(&best_partition, x->partition_info, sizeof(PARTITION_INFO));
-
-        if ((this_mode == I4X4_PRED)
-            || (this_mode == I8X8_PRED)
-            || (this_mode == SPLITMV))
-          for (i = 0; i < 16; i++) {
-            best_bmodes[i] = xd->mode_info_context->bmi[i];
-          }
-      }
-
-      // Testing this mode gave rise to an improvement in best error score.
-      // Lower threshold a bit for next time
-      cpi->rd_thresh_mult[mode_index] =
-          (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ?
-          cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
-      cpi->rd_threshes[mode_index] =
-          (cpi->rd_baseline_thresh[mode_index] >> 7) *
-          cpi->rd_thresh_mult[mode_index];
-    } else {
-      // If the mode did not help improve the best error case then raise the
-      // threshold for testing that mode next time around.
-      cpi->rd_thresh_mult[mode_index] += 4;
-
-      if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
-        cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
-
-      cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7)
-          * cpi->rd_thresh_mult[mode_index];
-    }
-
-    /* keep record of best compound/single-only prediction */
-    if (!disable_skip && mbmi->ref_frame != INTRA_FRAME) {
-      int64_t single_rd, hybrid_rd;
-      int single_rate, hybrid_rate;
-
-      if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
-        single_rate = rate2 - compmode_cost;
-        hybrid_rate = rate2;
-      } else {
-        single_rate = rate2;
-        hybrid_rate = rate2 + compmode_cost;
-      }
-
-      single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
-      hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
-
-      if (mbmi->second_ref_frame <= INTRA_FRAME &&
-          single_rd < best_pred_rd[SINGLE_PREDICTION_ONLY]) {
-        best_pred_rd[SINGLE_PREDICTION_ONLY] = single_rd;
-      } else if (mbmi->second_ref_frame > INTRA_FRAME &&
-                 single_rd < best_pred_rd[COMP_PREDICTION_ONLY]) {
-        best_pred_rd[COMP_PREDICTION_ONLY] = single_rd;
-      }
-      if (hybrid_rd < best_pred_rd[HYBRID_PREDICTION])
-        best_pred_rd[HYBRID_PREDICTION] = hybrid_rd;
-    }
-
-    /* keep record of best txfm size */
-    if (!mode_excluded && this_rd != INT64_MAX) {
-      for (i = 0; i < NB_TXFM_MODES; i++) {
-        int64_t adj_rd;
-        if (this_mode != I4X4_PRED) {
-          const int64_t txfm_mode_diff =
-              txfm_cache[i] - txfm_cache[cm->txfm_mode];
-          adj_rd = this_rd + txfm_mode_diff;
-        } else {
-          adj_rd = this_rd;
-        }
-        if (adj_rd < best_txfm_rd[i])
-          best_txfm_rd[i] = adj_rd;
-      }
-    }
-
-    if (x->skip && !mode_excluded)
-      break;
-  }
-
-  assert((cm->mcomp_filter_type == SWITCHABLE) ||
-         (cm->mcomp_filter_type == best_mbmode.interp_filter) ||
-         (best_mbmode.mode <= I4X4_PRED));
-
-#if CONFIG_COMP_INTERINTRA_PRED
-  ++cpi->interintra_select_count[is_best_interintra];
-#endif
-
-  // Accumulate filter usage stats
-  // TODO(agrange): Use RD criteria to select interpolation filter mode.
-  if (is_inter_mode(best_mode))
-    ++cpi->best_switchable_interp_count[vp9_switchable_interp_map[best_filter]];
-
-  // Reduce the activation RD thresholds for the best choice mode
-  if ((cpi->rd_baseline_thresh[best_mode_index] > 0) &&
-      (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2))) {
-    int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 2);
-
-    cpi->rd_thresh_mult[best_mode_index] =
-        (cpi->rd_thresh_mult[best_mode_index] >=
-         (MIN_THRESHMULT + best_adjustment)) ?
-        cpi->rd_thresh_mult[best_mode_index] - best_adjustment : MIN_THRESHMULT;
-    cpi->rd_threshes[best_mode_index] =
-        (cpi->rd_baseline_thresh[best_mode_index] >> 7) *
-        cpi->rd_thresh_mult[best_mode_index];
-  }
-
-  // This code forces Altref,0,0 and skip for the frame that overlays a
-  // an alrtef unless Altref is filtered. However, this is unsafe if
-  // segment level coding of ref frame is enabled for this
-  // segment.
-  if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
-      cpi->is_src_frame_alt_ref &&
-      (cpi->oxcf.arnr_max_frames == 0) &&
-      (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME)) {
-    mbmi->mode = ZEROMV;
-    if (cm->txfm_mode <= ALLOW_8X8)
-      mbmi->txfm_size = cm->txfm_mode;
-    else
-      mbmi->txfm_size = TX_16X16;
-    mbmi->ref_frame = ALTREF_FRAME;
-    mbmi->mv[0].as_int = 0;
-    mbmi->uv_mode = DC_PRED;
-    mbmi->mb_skip_coeff = 1;
-    mbmi->partitioning = 0;
-    set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,
-                      scale_factor);
-
-    vpx_memset(best_pred_diff, 0, sizeof(best_pred_diff));
-    vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff));
-    goto end;
-  }
-
-  // macroblock modes
-  vpx_memcpy(mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
-  if (best_mbmode.mode == I4X4_PRED) {
-    for (i = 0; i < 16; i++) {
-      xd->mode_info_context->bmi[i].as_mode = best_bmodes[i].as_mode;
-    }
-  }
-
-  if (best_mbmode.mode == I8X8_PRED)
-    set_i8x8_block_modes(x, mode8x8);
-
-  if (best_mbmode.mode == SPLITMV) {
-    for (i = 0; i < 16; i++)
-      xd->mode_info_context->bmi[i].as_mv[0].as_int =
-          best_bmodes[i].as_mv[0].as_int;
-    if (mbmi->second_ref_frame > 0)
-      for (i = 0; i < 16; i++)
-        xd->mode_info_context->bmi[i].as_mv[1].as_int =
-            best_bmodes[i].as_mv[1].as_int;
-
-    vpx_memcpy(x->partition_info, &best_partition, sizeof(PARTITION_INFO));
-
-    mbmi->mv[0].as_int = x->partition_info->bmi[15].mv.as_int;
-    mbmi->mv[1].as_int = x->partition_info->bmi[15].second_mv.as_int;
-  }
-
-  for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
-    if (best_pred_rd[i] == INT64_MAX)
-      best_pred_diff[i] = INT_MIN;
-    else
-      best_pred_diff[i] = best_rd - best_pred_rd[i];
-  }
-
-  if (!x->skip) {
-    for (i = 0; i < NB_TXFM_MODES; i++) {
-      if (best_txfm_rd[i] == INT64_MAX)
-        best_txfm_diff[i] = 0;
-      else
-        best_txfm_diff[i] = best_rd - best_txfm_rd[i];
-    }
-  } else {
-    vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff));
-  }
-
-end:
-
-  // Flag all modes that have a distortion thats > 2x the best we found at
-  // this level.
-  for (mode_index = 0; mode_index < MB_MODE_COUNT; ++mode_index) {
-    if (mode_index == NEARESTMV || mode_index == NEARMV || mode_index == NEWMV
-        || mode_index == SPLITMV)
-      continue;
-
-    if (mode_distortions[mode_index] > 2 * *returndistortion) {
-      x->mb_context[xd->sb_index][xd->mb_index].modes_with_high_error |= (1
-          << mode_index);
-    }
-  }
-
-  // Flag all ref frames that have a distortion thats > 2x the best we found at
-  // this level.
-  for (ref_frame = INTRA_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
-    if (frame_distortions[ref_frame] > 2 * *returndistortion) {
-      x->mb_context[xd->sb_index][xd->mb_index].frames_with_high_error |= (1
-          << ref_frame);
-    }
-  }
-
-  set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,
-                    scale_factor);
-  store_coding_context(x, &x->mb_context[xd->sb_index][xd->mb_index],
-                       best_mode_index, &best_partition,
-                       &mbmi->ref_mvs[mbmi->ref_frame][0],
-                       &mbmi->ref_mvs[mbmi->second_ref_frame < 0 ? 0 :
-                                      mbmi->second_ref_frame][0],
-                       best_pred_diff, best_txfm_diff);
-}
-#endif  // !CONFIG_SB8X8
-
 void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                int *returnrate, int *returndist,
                                BLOCK_SIZE_TYPE bsize,
@@ -4384,30 +2286,26 @@
   int dist_y = 0, dist_uv;
   int y_skip = 0, uv_skip;
   int64_t txfm_cache[NB_TXFM_MODES], err;
-#if CONFIG_SB8X8
   MB_PREDICTION_MODE mode;
   TX_SIZE txfm_size;
   int rate4x4_y, rate4x4_y_tokenonly, dist4x4_y;
   int64_t err4x4 = INT64_MAX;
-#endif
   int i;
 
   ctx->skip = 0;
   xd->mode_info_context->mbmi.mode = DC_PRED;
   err = rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
                                &dist_y, &y_skip, bsize, txfm_cache);
-#if CONFIG_SB8X8
   mode = xd->mode_info_context->mbmi.mode;
   txfm_size = xd->mode_info_context->mbmi.txfm_size;
-#endif
   rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
-                          &dist_uv, &uv_skip, bsize);
-#if CONFIG_SB8X8
+                          &dist_uv, &uv_skip,
+                          (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 :
+                                                       bsize);
   if (bsize == BLOCK_SIZE_SB8X8)
     err4x4 = rd_pick_intra4x4mby_modes(cpi, x, &rate4x4_y,
                                        &rate4x4_y_tokenonly,
                                        &dist4x4_y, err);
-#endif
 
   if (y_skip && uv_skip) {
     *returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
@@ -4415,7 +2313,6 @@
     *returndist = dist_y + (dist_uv >> 2);
     memset(ctx->txfm_rd_diff, 0,
            sizeof(x->sb32_context[xd->sb_index].txfm_rd_diff));
-#if CONFIG_SB8X8
     xd->mode_info_context->mbmi.mode = mode;
     xd->mode_info_context->mbmi.txfm_size = txfm_size;
   } else if (bsize == BLOCK_SIZE_SB8X8 && err4x4 < err) {
@@ -4426,181 +2323,44 @@
       ctx->txfm_rd_diff[i] = MIN(err4x4, err - txfm_cache[i]);
     }
     xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-#endif
   } else {
     *returnrate = rate_y + rate_uv +
         vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
     *returndist = dist_y + (dist_uv >> 2);
     for (i = 0; i < NB_TXFM_MODES; i++) {
-#if CONFIG_SB8X8
       ctx->txfm_rd_diff[i] = MIN(err4x4, err - txfm_cache[i]);
-#else
-      ctx->txfm_rd_diff[i] = err - txfm_cache[i];
-#endif
     }
-#if CONFIG_SB8X8
     xd->mode_info_context->mbmi.txfm_size = txfm_size;
     xd->mode_info_context->mbmi.mode = mode;
-#endif
   }
 
   vpx_memcpy(&ctx->mic, xd->mode_info_context, sizeof(MODE_INFO));
 }
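To make the bookkeeping at the end of vp9_rd_pick_intra_mode_sb() concrete (a sketch with made-up numbers, not part of the patch): for 8x8 blocks the 4x4 intra alternative is costed as well, the cheaper of err and err4x4 decides which mode set is kept, and the per-transform-mode differences stored in the context are clamped by the 4x4 result, mirroring ctx->txfm_rd_diff[i] = MIN(err4x4, err - txfm_cache[i]):

#include <stdio.h>

static long long min_ll(long long a, long long b) { return a < b ? a : b; }

int main(void) {
  const long long err = 12000;     /* hypothetical best whole-block intra rd */
  const long long err4x4 = 11000;  /* hypothetical best 4x4 intra rd (8x8 blocks) */
  const long long txfm_cache[3] = {12000, 12300, 12800};
  long long txfm_rd_diff[3];
  int i;

  for (i = 0; i < 3; ++i)
    txfm_rd_diff[i] = min_ll(err4x4, err - txfm_cache[i]);

  printf("use 4x4 path: %s\n", err4x4 < err ? "yes" : "no");
  for (i = 0; i < 3; ++i)
    printf("txfm_rd_diff[%d] = %lld\n", i, txfm_rd_diff[i]);
  return 0;
}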
 
-#if !CONFIG_SB8X8
-void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x,
-                            int *returnrate, int *returndist) {
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
-  int64_t error4x4, error16x16;
-  int rate4x4, rate16x16 = 0, rateuv[2];
-  int dist4x4 = 0, dist16x16 = 0, distuv[2];
-  int rate;
-  int rate4x4_tokenonly = 0;
-  int rate16x16_tokenonly = 0;
-  int rateuv_tokenonly[2];
-  int64_t error8x8;
-  int rate8x8_tokenonly=0;
-  int rate8x8, dist8x8;
-  int mode16x16;
-  int mode8x8[4];
-  int dist;
-  int modeuv[2], uv_intra_skippable[2];
-  int y_intra16x16_skippable = 0;
-  int64_t txfm_cache[2][NB_TXFM_MODES];
-  TX_SIZE txfm_size_16x16, txfm_size_8x8;
-  int i;
-
-  x->mb_context[xd->sb_index][xd->mb_index].skip = 0;
-  mbmi->ref_frame = INTRA_FRAME;
-  mbmi->mode = DC_PRED;
-  for (i = 0; i <= TX_8X8; i++) {
-    mbmi->txfm_size = i;
-    rd_pick_intra_sbuv_mode(cpi, x, &rateuv[i], &rateuv_tokenonly[i],
-                            &distuv[i], &uv_intra_skippable[i],
-                            BLOCK_SIZE_MB16X16);
-    modeuv[i] = mbmi->uv_mode;
-  }
-
-  // current macroblock under rate-distortion optimization test loop
-  error16x16 = rd_pick_intra_sby_mode(cpi, x, &rate16x16,
-                                      &rate16x16_tokenonly, &dist16x16,
-                                      &y_intra16x16_skippable,
-                                      BLOCK_SIZE_MB16X16, txfm_cache[1]);
-  mode16x16 = mbmi->mode;
-  txfm_size_16x16 = mbmi->txfm_size;
-  if (y_intra16x16_skippable &&
-      ((cm->txfm_mode == ONLY_4X4 && uv_intra_skippable[TX_4X4]) ||
-       (cm->txfm_mode != ONLY_4X4 && uv_intra_skippable[TX_8X8]))) {
-    error16x16 -= RDCOST(x->rdmult, x->rddiv, rate16x16_tokenonly, 0);
-    rate16x16 -= rate16x16_tokenonly;
-  }
-  for (i = 0; i < NB_TXFM_MODES; i++) {
-    txfm_cache[0][i] = error16x16 - txfm_cache[1][cm->txfm_mode] +
-                       txfm_cache[1][i];
-  }
-
-  error8x8 = rd_pick_intra8x8mby_modes_and_txsz(cpi, x, &rate8x8,
-                                                &rate8x8_tokenonly,
-                                                &dist8x8, mode8x8,
-                                                error16x16, txfm_cache[1]);
-  txfm_size_8x8 = mbmi->txfm_size;
-  for (i = 0; i < NB_TXFM_MODES; i++) {
-    int64_t tmp_rd = error8x8 - txfm_cache[1][cm->txfm_mode] + txfm_cache[1][i];
-    if (tmp_rd < txfm_cache[0][i])
-      txfm_cache[0][i] = tmp_rd;
-  }
-
-  mbmi->txfm_size = TX_4X4;
-  error4x4 = rd_pick_intra4x4mby_modes(cpi, x,
-                                       &rate4x4, &rate4x4_tokenonly,
-                                       &dist4x4, error16x16);
-  for (i = 0; i < NB_TXFM_MODES; i++) {
-    if (error4x4 < txfm_cache[0][i])
-      txfm_cache[0][i] = error4x4;
-  }
-
-  mbmi->mb_skip_coeff = 0;
-  if (y_intra16x16_skippable &&
-      ((cm->txfm_mode == ONLY_4X4 && uv_intra_skippable[TX_4X4]) ||
-       (cm->txfm_mode != ONLY_4X4 && uv_intra_skippable[TX_8X8]))) {
-    mbmi->mb_skip_coeff = 1;
-    mbmi->mode = mode16x16;
-    mbmi->uv_mode = modeuv[cm->txfm_mode != ONLY_4X4];
-    rate = rate16x16 + vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 1);
-    dist = dist16x16;
-    rate += rateuv[cm->txfm_mode != ONLY_4X4] -
-            rateuv_tokenonly[cm->txfm_mode != ONLY_4X4];
-    dist += (distuv[cm->txfm_mode != ONLY_4X4] >> 2);
-    mbmi->txfm_size = txfm_size_16x16;
-  } else if (error8x8 > error16x16) {
-    if (error4x4 < error16x16) {
-      rate = rateuv[TX_4X4] + rate4x4;
-      mbmi->mode = I4X4_PRED;
-      mbmi->txfm_size = TX_4X4;
-      dist = dist4x4 + (distuv[TX_4X4] >> 2);
-      mbmi->uv_mode = modeuv[TX_4X4];
-    } else {
-      mbmi->txfm_size = txfm_size_16x16;
-      mbmi->mode = mode16x16;
-      rate = rate16x16 + rateuv[mbmi->txfm_size != TX_4X4];
-      dist = dist16x16 + (distuv[mbmi->txfm_size != TX_4X4] >> 2);
-      mbmi->uv_mode = modeuv[mbmi->txfm_size != TX_4X4];
-    }
-    rate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
-  } else {
-    if (error4x4 < error8x8) {
-      rate = rateuv[TX_4X4] + rate4x4;
-      mbmi->mode = I4X4_PRED;
-      mbmi->txfm_size = TX_4X4;
-      dist = dist4x4 + (distuv[TX_4X4] >> 2);
-      mbmi->uv_mode = modeuv[TX_4X4];
-    } else {
-      mbmi->mode = I8X8_PRED;
-      mbmi->txfm_size = txfm_size_8x8;
-      set_i8x8_block_modes(x, mode8x8);
-      rate = rate8x8 + rateuv[TX_4X4];
-      dist = dist8x8 + (distuv[TX_4X4] >> 2);
-    }
-    rate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
-  }
-
-  for (i = 0; i < NB_TXFM_MODES; i++) {
-    x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff[i] =
-        txfm_cache[0][cm->txfm_mode] - txfm_cache[0][i];
-  }
-
-  *returnrate = rate;
-  *returndist = dist;
-}
-#endif
-
 int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                   int mi_row, int mi_col,
                                   int *returnrate,
                                   int *returndistortion,
                                   BLOCK_SIZE_TYPE bsize,
                                   PICK_MODE_CONTEXT *ctx) {
-  const enum BlockSize block_size = y_bsizet_to_block_size(bsize);
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+  const enum BlockSize block_size = get_plane_block_size(bsize, &xd->plane[0]);
   MB_PREDICTION_MODE this_mode;
   MB_PREDICTION_MODE best_mode = DC_PRED;
-  MV_REFERENCE_FRAME ref_frame, second_ref;
+  MV_REFERENCE_FRAME ref_frame, second_ref = INTRA_FRAME;
   unsigned char segment_id = xd->mode_info_context->mbmi.segment_id;
   int comp_pred, i;
   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
-  int frame_mdcounts[4][4];
-  YV12_BUFFER_CONFIG yv12_mb[4];
+  struct buf_2d yv12_mb[4][MAX_MB_PLANE];
+  int_mv single_newmv[MAX_REF_FRAMES];
   static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
                                     VP9_ALT_FLAG };
   int idx_list[4] = {0,
                      cpi->lst_fb_idx,
                      cpi->gld_fb_idx,
                      cpi->alt_fb_idx};
-  int mdcounts[4];
   int64_t best_rd = INT64_MAX;
   int64_t best_txfm_rd[NB_TXFM_MODES];
   int64_t best_txfm_diff[NB_TXFM_MODES];
@@ -4610,14 +2370,6 @@
   int j;
   int mode_index, best_mode_index = 0;
   unsigned int ref_costs[MAX_REF_FRAMES];
-#if CONFIG_COMP_INTERINTRA_PRED
-  int is_best_interintra = 0;
-  int64_t best_intra16_rd = INT64_MAX;
-  int best_intra16_mode = DC_PRED;
-#if SEPARATE_INTERINTRA_UV
-  int best_intra16_uv_mode = DC_PRED;
-#endif
-#endif
   int64_t best_overall_rd = INT64_MAX;
   INTERPOLATIONFILTERTYPE best_filter = SWITCHABLE;
   INTERPOLATIONFILTERTYPE tmp_best_filter = SWITCHABLE;
@@ -4631,20 +2383,16 @@
   int64_t frame_distortions[MAX_REF_FRAMES] = {-1};
   int intra_cost_penalty = 20 * vp9_dc_quant(cpi->common.base_qindex,
                                              cpi->common.y_dc_delta_q);
-#if CONFIG_SB8X8
   int_mv seg_mvs[4][MAX_REF_FRAMES - 1];
   union b_mode_info best_bmodes[4];
   PARTITION_INFO best_partition;
-#endif
 
-#if CONFIG_SB8X8
   for (i = 0; i < 4; i++) {
     int j;
 
     for (j = 0; j < MAX_REF_FRAMES - 1; j++)
       seg_mvs[i][j].as_int = INVALID_MV;
   }
-#endif
   // Everywhere the flag is set the error is much higher than its neighbors.
   ctx->frames_with_high_error = 0;
   ctx->modes_with_high_error = 0;
@@ -4652,6 +2400,7 @@
   xd->mode_info_context->mbmi.segment_id = segment_id;
   estimate_ref_frame_costs(cpi, segment_id, ref_costs);
   vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
+  vpx_memset(&single_newmv, 0, sizeof(single_newmv));
 
   for (i = 0; i < NB_PREDICTION_TYPES; ++i)
     best_pred_rd[i] = INT64_MAX;
@@ -4659,7 +2408,7 @@
     best_txfm_rd[i] = INT64_MAX;
 
   // Create a mask set to 1 for each frame used by a smaller resolution.
-  if (cpi->Speed > 0) {
+  if (cpi->speed > 0) {
     switch (block_size) {
       case BLOCK_64X64:
         for (i = 0; i < 4; i++) {
@@ -4694,13 +2443,13 @@
     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
       setup_buffer_inter(cpi, x, idx_list[ref_frame], ref_frame, block_size,
                          mi_row, mi_col, frame_mv[NEARESTMV], frame_mv[NEARMV],
-                         frame_mdcounts, yv12_mb, scale_factor);
+                         yv12_mb, scale_factor);
     }
     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
     frame_mv[ZEROMV][ref_frame].as_int = 0;
   }
-  if (cpi->Speed == 0
-      || (cpi->Speed > 0 && (ref_frame_mask & (1 << INTRA_FRAME)))) {
+  if (cpi->speed == 0
+      || (cpi->speed > 0 && (ref_frame_mask & (1 << INTRA_FRAME)))) {
     mbmi->mode = DC_PRED;
     for (i = 0; i <= (bsize < BLOCK_SIZE_MB16X16 ? TX_4X4 :
                       (bsize < BLOCK_SIZE_SB32X32 ? TX_8X8 :
@@ -4723,9 +2472,6 @@
     int distortion2 = 0, distortion_y = 0, distortion_uv = 0;
     int skippable;
     int64_t txfm_cache[NB_TXFM_MODES];
-#if CONFIG_COMP_INTERINTRA_PRED
-    int compmode_interintra_cost = 0;
-#endif
 
     // Test best rd so far against threshold for trying this mode.
     if (best_rd <= cpi->rd_threshes[mode_index] ||
@@ -4736,11 +2482,12 @@
     x->skip = 0;
     this_mode = vp9_mode_order[mode_index].mode;
     ref_frame = vp9_mode_order[mode_index].ref_frame;
+
     if (!(ref_frame == INTRA_FRAME
         || (cpi->ref_frame_flags & flag_list[ref_frame]))) {
       continue;
     }
-    if (cpi->Speed > 0) {
+    if (cpi->speed > 0) {
       if (!(ref_frame_mask & (1 << ref_frame))) {
         continue;
       }
@@ -4757,51 +2504,39 @@
     mbmi->ref_frame = ref_frame;
     mbmi->second_ref_frame = vp9_mode_order[mode_index].second_ref_frame;
 
-    // TODO(jingning): scaling not supported in SPLITMV mode.
+    // TODO(jingning, jkoleszar): scaling reference frame not supported for
+    // SPLITMV.
     if (mbmi->ref_frame > 0 &&
-          (yv12_mb[mbmi->ref_frame].y_width != cm->mb_cols * 16 ||
-           yv12_mb[mbmi->ref_frame].y_height != cm->mb_rows * 16) &&
+          (scale_factor[mbmi->ref_frame].x_num !=
+           scale_factor[mbmi->ref_frame].x_den ||
+           scale_factor[mbmi->ref_frame].y_num !=
+           scale_factor[mbmi->ref_frame].y_den) &&
         this_mode == SPLITMV)
       continue;
 
     if (mbmi->second_ref_frame > 0 &&
-          (yv12_mb[mbmi->second_ref_frame].y_width != cm->mb_cols * 16 ||
-           yv12_mb[mbmi->second_ref_frame].y_height != cm->mb_rows * 16) &&
+          (scale_factor[mbmi->second_ref_frame].x_num !=
+           scale_factor[mbmi->second_ref_frame].x_den ||
+           scale_factor[mbmi->second_ref_frame].y_num !=
+           scale_factor[mbmi->second_ref_frame].y_den) &&
         this_mode == SPLITMV)
       continue;
 
-
     set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,
                       scale_factor);
     comp_pred = mbmi->second_ref_frame > INTRA_FRAME;
     mbmi->mode = this_mode;
     mbmi->uv_mode = DC_PRED;
-#if CONFIG_COMP_INTERINTRA_PRED
-    mbmi->interintra_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-    mbmi->interintra_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-#endif
 
     // Evaluate all sub-pel filters irrespective of whether we can use
     // them for this frame.
     mbmi->interp_filter = cm->mcomp_filter_type;
     vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
 
-    // if (!(cpi->ref_frame_flags & flag_list[ref_frame]))
-    //  continue;
-
-    if (
-#if CONFIG_SB8X8
-        bsize != BLOCK_SIZE_SB8X8 &&
-        (this_mode == I4X4_PRED || this_mode == SPLITMV)
-#else
-        this_mode == I4X4_PRED ||
-        this_mode == I8X8_PRED ||
-        this_mode == SPLITMV
-#endif
-        )
+    if (bsize != BLOCK_SIZE_SB8X8 &&
+        (this_mode == I4X4_PRED || this_mode == SPLITMV))
       continue;
-    //  if (vp9_mode_order[mode_index].second_ref_frame == INTRA_FRAME)
-    //  continue;
+
 
     if (comp_pred) {
       if (ref_frame == ALTREF_FRAME) {
@@ -4825,17 +2560,15 @@
           mode_excluded =
               mode_excluded ?
                   mode_excluded : cm->comp_pred_mode == COMP_PREDICTION_ONLY;
-#if CONFIG_COMP_INTERINTRA_PRED
-        else
-          mode_excluded = mode_excluded ? mode_excluded : !cm->use_interintra;
-#endif
       }
     }
 
-    setup_pre_planes(xd, &yv12_mb[ref_frame],
-        comp_pred ? &yv12_mb[second_ref] : NULL, 0, 0, NULL, NULL);
-
-    vpx_memcpy(mdcounts, frame_mdcounts[ref_frame], sizeof(mdcounts));
+    // Select predictors
+    for (i = 0; i < MAX_MB_PLANE; i++) {
+      xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
+      if (comp_pred)
+        xd->plane[i].pre[1] = yv12_mb[second_ref][i];
+    }
 
     // If the segment reference frame feature is enabled....
     // then do nothing if the current ref frame is not allowed..
@@ -4861,7 +2594,6 @@
       }
     }
 
-#if CONFIG_SB8X8
     if (this_mode == I4X4_PRED) {
       int rate;
 
@@ -4880,9 +2612,7 @@
       distortion2 += dist_uv[TX_4X4];
       distortion_uv = dist_uv[TX_4X4];
       mbmi->uv_mode = mode_uv[TX_4X4];
-    } else
-#endif
-    if (ref_frame == INTRA_FRAME) {
+    } else if (ref_frame == INTRA_FRAME) {
       TX_SIZE uv_tx;
       vp9_build_intra_predictors_sby_s(xd, bsize);
       super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable,
@@ -4905,7 +2635,6 @@
       if (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED)
         rate2 += intra_cost_penalty;
       distortion2 = distortion_y + distortion_uv;
-#if CONFIG_SB8X8
     } else if (this_mode == SPLITMV) {
       const int is_comp_pred = mbmi->second_ref_frame > 0;
       int rate, distortion;
@@ -4938,15 +2667,12 @@
 
         tmp_rd = rd_pick_best_mbsegmentation(cpi, x,
                                              &mbmi->ref_mvs[mbmi->ref_frame][0],
-                                             second_ref, INT64_MAX, mdcounts,
+                                             second_ref, INT64_MAX,
                                              &rate, &rate_y, &distortion,
                                              &skippable,
                                              (int)this_rd_thresh, seg_mvs);
         if (cpi->common.mcomp_filter_type == SWITCHABLE) {
-          int rs = SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs
-          [vp9_get_pred_context(&cpi->common, xd,
-                                PRED_SWITCHABLE_INTERP)]
-          [vp9_switchable_interp_map[mbmi->interp_filter]];
+          const int rs = get_switchable_rate(cm, x);
           tmp_rd += RDCOST(x->rdmult, x->rddiv, rs, 0);
         }
         newbest = (tmp_rd < tmp_best_rd);
@@ -4980,16 +2706,13 @@
         // switchable list (bilinear, 6-tap) is indicated at the frame level
         tmp_rd = rd_pick_best_mbsegmentation(cpi, x,
                                              &mbmi->ref_mvs[mbmi->ref_frame][0],
-                                             second_ref, INT64_MAX, mdcounts,
+                                             second_ref, INT64_MAX,
                                              &rate, &rate_y, &distortion,
                                              &skippable,
                                              (int)this_rd_thresh, seg_mvs);
       } else {
         if (cpi->common.mcomp_filter_type == SWITCHABLE) {
-          int rs = SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs
-              [vp9_get_pred_context(&cpi->common, xd,
-                                    PRED_SWITCHABLE_INTERP)]
-              [vp9_switchable_interp_map[mbmi->interp_filter]];
+          int rs = get_switchable_rate(cm, x);
           tmp_best_rdu -= RDCOST(x->rdmult, x->rddiv, rs, 0);
         }
         tmp_rd = tmp_best_rdu;
@@ -5009,17 +2732,15 @@
       distortion2 += distortion;
 
       if (cpi->common.mcomp_filter_type == SWITCHABLE)
-        rate2 += SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs
-            [vp9_get_pred_context(&cpi->common, xd, PRED_SWITCHABLE_INTERP)]
-            [vp9_switchable_interp_map[mbmi->interp_filter]];
+        rate2 += get_switchable_rate(cm, x);
 
       // If even the 'Y' rd value of split is higher than best so far
       // then dont bother looking at UV
       vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col,
-                                      bsize);
-      vp9_subtract_sbuv(x, bsize);
+                                      BLOCK_SIZE_SB8X8);
+      vp9_subtract_sbuv(x, BLOCK_SIZE_SB8X8);
       super_block_uvrd_for_txfm(cm, x, &rate_uv, &distortion_uv,
-                                &uv_skippable, bsize, TX_4X4);
+                                &uv_skippable, BLOCK_SIZE_SB8X8, TX_4X4);
       rate2 += rate_uv;
       distortion2 += distortion_uv;
       skippable = skippable && uv_skippable;
@@ -5034,9 +2755,8 @@
       compmode_cost =
           vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_COMP), is_comp_pred);
       mbmi->mode = this_mode;
-#endif
     } else {
-      YV12_BUFFER_CONFIG *scaled_ref_frame = NULL;
+      YV12_BUFFER_CONFIG *scaled_ref_frame[2] = {NULL, NULL};
       int fb;
 
       if (mbmi->ref_frame == LAST_FRAME) {
@@ -5048,40 +2768,35 @@
       }
 
       if (cpi->scaled_ref_idx[fb] != cm->ref_frame_map[fb])
-        scaled_ref_frame = &cm->yv12_fb[cpi->scaled_ref_idx[fb]];
+        scaled_ref_frame[0] = &cm->yv12_fb[cpi->scaled_ref_idx[fb]];
 
-#if CONFIG_COMP_INTERINTRA_PRED
-      if (mbmi->second_ref_frame == INTRA_FRAME) {
-        if (best_intra16_mode == DC_PRED - 1) continue;
-        mbmi->interintra_mode = best_intra16_mode;
-#if SEPARATE_INTERINTRA_UV
-        mbmi->interintra_uv_mode = best_intra16_uv_mode;
-#else
-        mbmi->interintra_uv_mode = best_intra16_mode;
-#endif
+      if (comp_pred) {
+        if (mbmi->second_ref_frame == LAST_FRAME) {
+          fb = cpi->lst_fb_idx;
+        } else if (mbmi->second_ref_frame == GOLDEN_FRAME) {
+          fb = cpi->gld_fb_idx;
+        } else {
+          fb = cpi->alt_fb_idx;
+        }
+
+        if (cpi->scaled_ref_idx[fb] != cm->ref_frame_map[fb])
+          scaled_ref_frame[1] = &cm->yv12_fb[cpi->scaled_ref_idx[fb]];
       }
-#endif
+
       this_rd = handle_inter_mode(cpi, x, bsize,
-                                  mdcounts, txfm_cache,
+                                  txfm_cache,
                                   &rate2, &distortion2, &skippable,
                                   &compmode_cost,
-#if CONFIG_COMP_INTERINTRA_PRED
-                                  &compmode_interintra_cost,
-#endif
                                   &rate_y, &distortion_y,
                                   &rate_uv, &distortion_uv,
                                   &mode_excluded, &disable_skip,
                                   mode_index, &tmp_best_filter, frame_mv,
-                                  scaled_ref_frame, mi_row, mi_col);
+                                  scaled_ref_frame, mi_row, mi_col,
+                                  single_newmv);
       if (this_rd == INT64_MAX)
         continue;
     }
 
-#if CONFIG_COMP_INTERINTRA_PRED
-    if (cpi->common.use_interintra) {
-      rate2 += compmode_interintra_cost;
-    }
-#endif
     if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
       rate2 += compmode_cost;
     }
@@ -5139,18 +2854,6 @@
       *returnintra = distortion2;
     }
 #endif
-#if CONFIG_COMP_INTERINTRA_PRED
-    if ((mbmi->ref_frame == INTRA_FRAME) &&
-        (this_mode <= TM_PRED) &&
-        (this_rd < best_intra16_rd)) {
-      best_intra16_rd = this_rd;
-      best_intra16_mode = this_mode;
-#if SEPARATE_INTERINTRA_UV
-      best_intra16_uv_mode = (mbmi->txfm_size != TX_4X4 ?
-                              mode_uv_8x8 : mode_uv_4x4);
-#endif
-    }
-#endif
 
     if (!disable_skip && mbmi->ref_frame == INTRA_FRAME)
       for (i = 0; i < NB_PREDICTION_TYPES; ++i)
@@ -5160,9 +2863,6 @@
       best_overall_rd = this_rd;
       best_filter = tmp_best_filter;
       best_mode = this_mode;
-#if CONFIG_COMP_INTERINTRA_PRED
-      is_best_interintra = (mbmi->second_ref_frame == INTRA_FRAME);
-#endif
     }
 
     // Store the respective mode distortions for later use.
@@ -5191,7 +2891,6 @@
         *returndistortion = distortion2;
         best_rd = this_rd;
         vpx_memcpy(&best_mbmode, mbmi, sizeof(MB_MODE_INFO));
-#if CONFIG_SB8X8
         vpx_memcpy(&best_partition, x->partition_info, sizeof(PARTITION_INFO));
 
         if (this_mode == I4X4_PRED || this_mode == SPLITMV) {
@@ -5199,7 +2898,6 @@
             best_bmodes[i] = xd->mode_info_context->bmi[i];
           }
         }
-#endif
       }
 #if 0
       // Testing this mode gave rise to an improvement in best error score.
@@ -5303,11 +3001,6 @@
          (cm->mcomp_filter_type == best_mbmode.interp_filter) ||
          (best_mbmode.mode <= I4X4_PRED));
 
-#if CONFIG_COMP_INTERINTRA_PRED
-  ++cpi->interintra_select_count[is_best_interintra];
-  // if (is_best_interintra)  printf("best_interintra\n");
-#endif
-
   // Accumulate filter usage stats
   // TODO(agrange): Use RD criteria to select interpolation filter mode.
   if (is_inter_mode(best_mode))
@@ -5341,22 +3034,13 @@
     mbmi->mv[0].as_int = 0;
     mbmi->uv_mode = DC_PRED;
     mbmi->mb_skip_coeff = 1;
-#if !CONFIG_SB8X8
-    mbmi->partitioning = 0;
-#endif
     if (cm->txfm_mode == TX_MODE_SELECT) {
       if (bsize >= BLOCK_SIZE_SB32X32)
         mbmi->txfm_size = TX_32X32;
-#if CONFIG_SB8X8
       else if (bsize >= BLOCK_SIZE_MB16X16)
-#else
-      else
-#endif
         mbmi->txfm_size = TX_16X16;
-#if CONFIG_SB8X8
       else
         mbmi->txfm_size = TX_8X8;
-#endif
     }
 
     vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff));
@@ -5366,7 +3050,6 @@
 
   // macroblock modes
   vpx_memcpy(mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
-#if CONFIG_SB8X8
   if (best_mbmode.mode == I4X4_PRED) {
     for (i = 0; i < 4; i++) {
       xd->mode_info_context->bmi[i].as_mode = best_bmodes[i].as_mode;
@@ -5387,7 +3070,7 @@
     mbmi->mv[0].as_int = x->partition_info->bmi[3].mv.as_int;
     mbmi->mv[1].as_int = x->partition_info->bmi[3].second_mv.as_int;
   }
-#endif
+
   for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
     if (best_pred_rd[i] == INT64_MAX)
       best_pred_diff[i] = INT_MIN;
@@ -5410,11 +3093,7 @@
   set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,
                     scale_factor);
   store_coding_context(x, ctx, best_mode_index,
-#if CONFIG_SB8X8
                        &best_partition,
-#else
-                       NULL,
-#endif
                        &mbmi->ref_mvs[mbmi->ref_frame][0],
                        &mbmi->ref_mvs[mbmi->second_ref_frame < 0 ? 0 :
                                       mbmi->second_ref_frame][0],
@@ -5422,42 +3101,3 @@
 
   return best_rd;
 }
-
-#if !CONFIG_SB8X8
-void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
-                                    int mi_row, int mi_col,
-                                    int *totalrate, int *totaldist) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
-  int rate, distortion;
-  int64_t intra_error = 0;
-  unsigned char *segment_id = &mbmi->segment_id;
-
-  x->encode_breakout = xd->segmentation_enabled ?
-                         cpi->segment_encode_breakout[*segment_id] :
-                         cpi->oxcf.encode_breakout;
-
-  // if (cpi->sf.RD)
-  // For now this codebase is limited to a single rd encode path
-  {
-    int zbin_mode_boost_enabled = cpi->zbin_mode_boost_enabled;
-
-    rd_pick_inter_mode(cpi, x, mi_row, mi_col, &rate,
-                       &distortion, &intra_error);
-
-    /* restore cpi->zbin_mode_boost_enabled */
-    cpi->zbin_mode_boost_enabled = zbin_mode_boost_enabled;
-  }
-  // else
-  // The non rd encode path has been deleted from this code base
-  // to simplify development
-  //    vp9_pick_inter_mode
-
-  // Store metrics so they can be added in to totals if this mode is picked
-  x->mb_context[xd->sb_index][xd->mb_index].distortion  = distortion;
-  x->mb_context[xd->sb_index][xd->mb_index].intra_error = intra_error;
-
-  *totalrate = rate;
-  *totaldist = distortion;
-}
-#endif
diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h
index 6533a82..dcf5d00 100644
--- a/vp9/encoder/vp9_rdopt.h
+++ b/vp9/encoder/vp9_rdopt.h
@@ -19,21 +19,10 @@
 
 void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex);
 
-#if !CONFIG_SB8X8
-void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x,
-                            int *r, int *d);
-#endif
-
 void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                int *r, int *d, BLOCK_SIZE_TYPE bsize,
                                PICK_MODE_CONTEXT *ctx);
 
-#if !CONFIG_SB8X8
-void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
-                                    int mi_row, int mi_col,
-                                    int *r, int *d);
-#endif
-
 int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                   int mi_row, int mi_col,
                                   int *r, int *d, BLOCK_SIZE_TYPE bsize,
diff --git a/vp9/encoder/vp9_segmentation.c b/vp9/encoder/vp9_segmentation.c
index c53fff4..fe995ad 100644
--- a/vp9/encoder/vp9_segmentation.c
+++ b/vp9/encoder/vp9_segmentation.c
@@ -193,17 +193,11 @@
     assert(bwl < bsl && bhl < bsl);
     if (bsize == BLOCK_SIZE_SB64X64) {
       subsize = BLOCK_SIZE_SB32X32;
-#if CONFIG_SB8X8
     } else if (bsize == BLOCK_SIZE_SB32X32) {
       subsize = BLOCK_SIZE_MB16X16;
     } else {
       assert(bsize == BLOCK_SIZE_MB16X16);
       subsize = BLOCK_SIZE_SB8X8;
-#else
-    } else {
-      assert(bsize == BLOCK_SIZE_SB32X32);
-      subsize = BLOCK_SIZE_MB16X16;
-#endif
     }
 
     for (n = 0; n < 4; n++) {
@@ -253,11 +247,11 @@
     vp9_get_tile_col_offsets(cm, tile_col);
     mi_ptr = cm->mi + cm->cur_tile_mi_col_start;
     for (mi_row = 0; mi_row < cm->mi_rows;
-         mi_row += (4 << CONFIG_SB8X8), mi_ptr += (4 << CONFIG_SB8X8) * mis) {
+         mi_row += 8, mi_ptr += 8 * mis) {
       mi = mi_ptr;
       for (mi_col = cm->cur_tile_mi_col_start;
            mi_col < cm->cur_tile_mi_col_end;
-           mi_col += (4 << CONFIG_SB8X8), mi += (4 << CONFIG_SB8X8)) {
+           mi_col += 8, mi += 8) {
         count_segs_sb(cpi, mi, no_pred_segcounts, temporal_predictor_count,
                       t_unpred_seg_counts, mi_row, mi_col, BLOCK_SIZE_SB64X64);
       }
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index 1e6b984..30143d7 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -26,7 +26,6 @@
 #include "vp9/common/vp9_quant_common.h"
 #include "vp9/encoder/vp9_segmentation.h"
 #include "vpx_mem/vpx_mem.h"
-#include "vp9/common/vp9_swapyv12buffer.h"
 #include "vpx_ports/vpx_timer.h"
 
 #define ALT_REF_MC_ENABLED 1    // dis/enable MC in AltRef filtering
@@ -119,9 +118,9 @@
 #if ALT_REF_MC_ENABLED
 
 static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
-                                              YV12_BUFFER_CONFIG *arf_frame,
-                                              YV12_BUFFER_CONFIG *frame_ptr,
-                                              int mb_offset,
+                                              uint8_t *arf_frame_buf,
+                                              uint8_t *frame_ptr_buf,
+                                              int stride,
                                               int error_thresh) {
   MACROBLOCK *x = &cpi->mb;
   MACROBLOCKD* const xd = &x->e_mbd;
@@ -142,18 +141,16 @@
   best_ref_mv1_full.as_mv.row = best_ref_mv1.as_mv.row >> 3;
 
   // Setup frame pointers
-  x->plane[0].src.buf = arf_frame->y_buffer + mb_offset;
-  x->plane[0].src.stride = arf_frame->y_stride;
-  xd->plane[0].pre[0].buf = frame_ptr->y_buffer + mb_offset;
-  xd->plane[0].pre[0].stride = arf_frame->y_stride;
+  x->plane[0].src.buf = arf_frame_buf;
+  x->plane[0].src.stride = stride;
+  xd->plane[0].pre[0].buf = frame_ptr_buf;
+  xd->plane[0].pre[0].stride = stride;
 
   // Further step/diamond searches as necessary
-  if (cpi->Speed < 8) {
-    step_param = cpi->sf.first_step +
-                 ((cpi->Speed > 5) ? 1 : 0);
-  } else {
+  if (cpi->speed < 8)
+    step_param = cpi->sf.first_step + ((cpi->speed > 5) ? 1 : 0);
+  else
     step_param = cpi->sf.first_step + 2;
-  }
 
   /*cpi->sf.search_method == HEX*/
   // TODO Check that the 16x16 vf & sdf are selected here
@@ -261,9 +258,9 @@
           // Find best match in this frame by MC
           err = temporal_filter_find_matching_mb_c
                 (cpi,
-                 cpi->frames[alt_ref_index],
-                 cpi->frames[frame],
-                 mb_y_offset,
+                 cpi->frames[alt_ref_index]->y_buffer + mb_y_offset,
+                 cpi->frames[frame]->y_buffer + mb_y_offset,
+                 cpi->frames[frame]->y_stride,
                  THRESH_LOW);
 #endif
           // Assign higher weight to matching MB if it's error
@@ -361,10 +358,10 @@
 }
 
 void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) {
+  VP9_COMMON *const cm = &cpi->common;
+
   int frame = 0;
 
-  int num_frames_backward = 0;
-  int num_frames_forward = 0;
   int frames_to_blur_backward = 0;
   int frames_to_blur_forward = 0;
   int frames_to_blur = 0;
@@ -374,15 +371,13 @@
   int blur_type = cpi->oxcf.arnr_type;
   int max_frames = cpi->active_arnr_frames;
 
-  num_frames_backward = distance;
-  num_frames_forward = vp9_lookahead_depth(cpi->lookahead)
-                       - (num_frames_backward + 1);
+  const int num_frames_backward = distance;
+  const int num_frames_forward = vp9_lookahead_depth(cpi->lookahead)
+                               - (num_frames_backward + 1);
 
   switch (blur_type) {
     case 1:
-      /////////////////////////////////////////
       // Backward Blur
-
       frames_to_blur_backward = num_frames_backward;
 
       if (frames_to_blur_backward >= max_frames)
@@ -392,7 +387,6 @@
       break;
 
     case 2:
-      /////////////////////////////////////////
       // Forward Blur
 
       frames_to_blur_forward = num_frames_forward;
@@ -405,7 +399,6 @@
 
     case 3:
     default:
-      /////////////////////////////////////////
       // Center Blur
       frames_to_blur_forward = num_frames_forward;
       frames_to_blur_backward = num_frames_backward;
@@ -445,25 +438,22 @@
 
   // Setup scaling factors. Scaling on each of the arnr frames is not supported
   vp9_setup_scale_factors_for_frame(&cpi->mb.e_mbd.scale_factor[0],
-      &cpi->common.yv12_fb[cpi->common.new_fb_idx],
-      cpi->common.width,
-      cpi->common.height);
+      cm->yv12_fb[cm->new_fb_idx].y_crop_width,
+      cm->yv12_fb[cm->new_fb_idx].y_crop_height,
+      cm->width, cm->height);
   cpi->mb.e_mbd.scale_factor_uv[0] = cpi->mb.e_mbd.scale_factor[0];
 
   // Setup frame pointers, NULL indicates frame not included in filter
   vpx_memset(cpi->frames, 0, max_frames * sizeof(YV12_BUFFER_CONFIG *));
   for (frame = 0; frame < frames_to_blur; frame++) {
-    int which_buffer =  start_frame - frame;
+    int which_buffer = start_frame - frame;
     struct lookahead_entry *buf = vp9_lookahead_peek(cpi->lookahead,
                                                      which_buffer);
     cpi->frames[frames_to_blur - 1 - frame] = &buf->img;
   }
 
-  temporal_filter_iterate_c(
-    cpi,
-    frames_to_blur,
-    frames_to_blur_backward,
-    strength);
+  temporal_filter_iterate_c(cpi, frames_to_blur, frames_to_blur_backward,
+                            strength);
 }
 
 void configure_arnr_filter(VP9_COMP *cpi, const unsigned int this_frame,
@@ -481,7 +471,7 @@
   // Note: this_frame->frame has been updated in the loop
   // so it now points at the ARF frame.
   half_gf_int = cpi->baseline_gf_interval >> 1;
-  frames_after_arf = (int)(cpi->twopass.total_stats->count - this_frame - 1);
+  frames_after_arf = (int)(cpi->twopass.total_stats.count - this_frame - 1);
 
   switch (cpi->oxcf.arnr_type) {
     case 1:  // Backward filter
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 7c2728b..4420d49 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -136,20 +136,7 @@
   ENTROPY_CONTEXT above_ec, left_ec;
   uint8_t token_cache[1024];
   TX_TYPE tx_type = DCT_DCT;
-#if CONFIG_CODE_ZEROGROUP
-  int last_nz_pos[3] = {-1, -1, -1};  // Encoder only
-  int is_eoo_list[3] = {0, 0, 0};
-  int is_last_zero[3] = {0, 0, 0};
-  int is_eoo_negative[3] = {0, 0, 0};
-  int o;
-  vp9_zpc_probs *zpc_probs;
-  vp9_zpc_count *zpc_count;
-  uint8_t token_cache_full[1024];
-#endif
-#if CONFIG_CODE_ZEROGROUP
-  vpx_memset(token_cache, UNKNOWN_TOKEN, sizeof(token_cache));
-#endif
-
+  const uint8_t * band_translate;
   assert((!type && !plane) || (type && plane));
 
   switch (tx_size) {
@@ -163,10 +150,7 @@
       scan = get_scan_4x4(tx_type);
       counts = cpi->coef_counts_4x4;
       coef_probs = cpi->common.fc.coef_probs_4x4;
-#if CONFIG_CODE_ZEROGROUP
-      zpc_count = &cpi->common.fc.zpc_counts_4x4;
-      zpc_probs = &cpi->common.fc.zpc_probs_4x4;
-#endif
+      band_translate = vp9_coefband_trans_4x4;
       break;
     }
     case TX_8X8: {
@@ -180,10 +164,7 @@
       scan = get_scan_8x8(tx_type);
       counts = cpi->coef_counts_8x8;
       coef_probs = cpi->common.fc.coef_probs_8x8;
-#if CONFIG_CODE_ZEROGROUP
-      zpc_count = &cpi->common.fc.zpc_counts_8x8;
-      zpc_probs = &cpi->common.fc.zpc_probs_8x8;
-#endif
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
     }
     case TX_16X16: {
@@ -197,10 +178,7 @@
       scan = get_scan_16x16(tx_type);
       counts = cpi->coef_counts_16x16;
       coef_probs = cpi->common.fc.coef_probs_16x16;
-#if CONFIG_CODE_ZEROGROUP
-      zpc_count = &cpi->common.fc.zpc_counts_16x16;
-      zpc_probs = &cpi->common.fc.zpc_probs_16x16;
-#endif
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
     }
     case TX_32X32:
@@ -210,10 +188,7 @@
       scan = vp9_default_zig_zag1d_32x32;
       counts = cpi->coef_counts_32x32;
       coef_probs = cpi->common.fc.coef_probs_32x32;
-#if CONFIG_CODE_ZEROGROUP
-      zpc_count = &cpi->common.fc.zpc_counts_32x32;
-      zpc_probs = &cpi->common.fc.zpc_probs_32x32;
-#endif
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
   }
 
@@ -224,20 +199,9 @@
   if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP))
     seg_eob = 0;
 
-#if CONFIG_CODE_ZEROGROUP
-  vpx_memset(token_cache_full, ZERO_TOKEN, sizeof(token_cache_full));
-  for (c = 0; c < eob; ++c) {
-    rc = scan[c];
-    token_cache_full[rc] = vp9_dct_value_tokens_ptr[qcoeff_ptr[rc]].token;
-    o = vp9_get_orientation(rc, tx_size);
-    if (qcoeff_ptr[rc] != 0) {
-      last_nz_pos[o] = c;
-    }
-  }
-#endif
   c = 0;
   do {
-    const int band = get_coef_band(scan, tx_size, c);
+    const int band = get_coef_band(band_translate, c);
     int token;
     int v = 0;
     rc = scan[c];
@@ -257,94 +221,13 @@
     t->context_tree = coef_probs[type][ref][band][pt];
       t->skip_eob_node = (c > 0) && (token_cache[scan[c - 1]] == 0);
     assert(vp9_coef_encodings[t->token].len - t->skip_eob_node > 0);
-#if CONFIG_CODE_ZEROGROUP
-    o = vp9_get_orientation(rc, tx_size);
-    t->skip_coef_val = (token_cache[rc] == ZERO_TOKEN || is_eoo_list[o]);
-    if (t->skip_coef_val) {
-      assert(v == 0);
-    }
-    // No need to transmit any token
-    if (t->skip_eob_node && t->skip_coef_val) {
-      assert(token == ZERO_TOKEN);
-      is_last_zero[o] = 1;
-      token_cache[scan[c]] = ZERO_TOKEN;
-      continue;
-    }
-#endif
+
     if (!dry_run) {
       ++counts[type][ref][band][pt][token];
       if (!t->skip_eob_node)
         ++cpi->common.fc.eob_branch_counts[tx_size][type][ref][band][pt];
     }
     token_cache[scan[c]] = token;
-#if CONFIG_CODE_ZEROGROUP
-    if (token == ZERO_TOKEN && !t->skip_coef_val) {
-      int eoo = 0, use_eoo;
-#if USE_ZPC_EOORIENT == 1
-      use_eoo = vp9_use_eoo(c, seg_eob, scan, tx_size,
-                            is_last_zero, is_eoo_list);
-#else
-      use_eoo = 0;
-#endif
-      if (use_eoo) {
-        eoo = vp9_is_eoo(c, eob, scan, tx_size, qcoeff_ptr, last_nz_pos);
-        if (eoo && is_eoo_negative[o]) eoo = 0;
-        if (eoo) {
-          int c_;
-          int savings = 0;
-          int zsaved = 0;
-          savings =
-              vp9_cost_bit((*zpc_probs)[ref]
-                           [coef_to_zpc_band(band)]
-                           [coef_to_zpc_ptok(pt)][0], 1) -
-              vp9_cost_bit((*zpc_probs)[ref]
-                           [coef_to_zpc_band(band)]
-                           [coef_to_zpc_ptok(pt)][0], 0);
-          for (c_ = c + 1; c_ < eob; ++c_) {
-            if (o == vp9_get_orientation(scan[c_], tx_size)) {
-              int pt_ = vp9_get_coef_context(scan, nb, pad, token_cache_full,
-                                             c_, default_eob);
-              int band_ = get_coef_band(scan, tx_size, c_);
-              assert(token_cache_full[scan[c_]] == ZERO_TOKEN);
-              if (!c_ || token_cache_full[scan[c_ - 1]])
-                savings +=
-                    vp9_cost_bit(coef_probs[type][ref][band_][pt_][0], 1);
-              savings += vp9_cost_bit(coef_probs[type][ref][band_][pt_][1], 0);
-              zsaved++;
-            }
-          }
-          /*
-          if (!dry_run)
-            if (savings > 0)
-              printf("savings %d zsaved %d (%d, %d)\n",
-                     savings, zsaved, tx_size, band);
-                     */
-          if (savings < 0) {
-            eoo = 0;
-            is_eoo_negative[o] = 1;
-          }
-        }
-      }
-      if (use_eoo) {
-        t++;
-        t->skip_eob_node = t->skip_coef_val = 0;
-        // transmit the eoo symbol
-        t->token = !eoo ? ZPC_ISOLATED : ZPC_EOORIENT;
-        t->context_tree = &((*zpc_probs)[ref]
-                            [coef_to_zpc_band(band)]
-                            [coef_to_zpc_ptok(pt)][0]);
-        if (!dry_run)
-          (*zpc_count)[ref]
-              [coef_to_zpc_band(band)]
-              [coef_to_zpc_ptok(pt)][0][!eoo]++;
-        if (eoo) {
-          assert(is_eoo_list[o] == 0);
-          is_eoo_list[o] = 1;
-        }
-      }
-    }
-    is_last_zero[o] = (token == ZERO_TOKEN);
-#endif
     ++t;
   } while (c < eob && ++c < seg_eob);
 
@@ -375,9 +258,6 @@
   int result = 1;
   struct is_skippable_args args = {xd, &result};
   foreach_transformed_block_in_plane(xd, bsize, 0,
-#if !CONFIG_SB8X8
-                                     0,
-#endif
                                      is_skippable, &args);
   return result;
 }
diff --git a/vp9/encoder/vp9_treewriter.c b/vp9/encoder/vp9_treewriter.c
index 52da3c6..e4aed53 100644
--- a/vp9/encoder/vp9_treewriter.c
+++ b/vp9/encoder/vp9_treewriter.c
@@ -8,35 +8,31 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
 #include "vp9/encoder/vp9_treewriter.h"
-#include "vp9/common/vp9_common.h"
 
-static void cost(
-  int *const C,
-  vp9_tree T,
-  const vp9_prob *const P,
-  int i,
-  int c
-) {
-  const vp9_prob p = P [i >> 1];
+static void cost(int *costs, vp9_tree tree, const vp9_prob *probs,
+                 int i, int c) {
+  const vp9_prob prob = probs[i / 2];
+  int b;
 
-  do {
-    const vp9_tree_index j = T[i];
-    const int d = c + vp9_cost_bit(p, i & 1);
+  for (b = 0; b <= 1; ++b) {
+    const int cc = c + vp9_cost_bit(prob, b);
+    const vp9_tree_index ii = tree[i + b];
 
-    if (j <= 0)
-      C[-j] = d;
+    if (ii <= 0)
+      costs[-ii] = cc;
     else
-      cost(C, T, P, j, d);
-  } while (++i & 1);
-}
-void vp9_cost_tokens(int *c, const vp9_prob *p, vp9_tree t) {
-  cost(c, t, p, 0, 0);
+      cost(costs, tree, probs, ii, cc);
+  }
 }
 
-void vp9_cost_tokens_skip(int *c, const vp9_prob *p, vp9_tree t) {
-  assert(t[1] > 0 && t[0] <= 0);
-  c[-t[0]] = vp9_cost_bit(p[0], 0);
-  cost(c, t, p, 2, 0);
+void vp9_cost_tokens(int *costs, const vp9_prob *probs, vp9_tree tree) {
+  cost(costs, tree, probs, 0, 0);
+}
+
+void vp9_cost_tokens_skip(int *costs, const vp9_prob *probs, vp9_tree tree) {
+  assert(tree[0] <= 0 && tree[1] > 0);
+
+  costs[-tree[0]] = vp9_cost_bit(probs[0], 0);
+  cost(costs, tree, probs, 2, 0);
 }
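
A minimal sketch of how the rewritten cost()/vp9_cost_tokens() above fill a
per-symbol cost table, using a hypothetical three-symbol tree (the tree,
probabilities and names below are illustrative assumptions, not part of this
patch). Non-positive tree entries are negated symbol indices; positive entries
point at the next index pair, and probs[i / 2] is the probability at that pair:

    /* hypothetical 3-symbol tree: root pair at indices 0/1, second pair at 2/3 */
    static const vp9_tree_index tree_sketch[4] = { 0, 2, -1, -2 };
    static const vp9_prob probs_sketch[2] = { 128, 192 };
    int costs_sketch[3];
    vp9_cost_tokens(costs_sketch, probs_sketch, tree_sketch);
    /* costs_sketch[0] = vp9_cost_bit(128, 0)
       costs_sketch[1] = vp9_cost_bit(128, 1) + vp9_cost_bit(192, 0)
       costs_sketch[2] = vp9_cost_bit(128, 1) + vp9_cost_bit(192, 1) */
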
diff --git a/vp9/encoder/vp9_treewriter.h b/vp9/encoder/vp9_treewriter.h
index af2c122..eeda5cd 100644
--- a/vp9/encoder/vp9_treewriter.h
+++ b/vp9/encoder/vp9_treewriter.h
@@ -19,11 +19,7 @@
 
 #include "vp9/encoder/vp9_boolhuff.h"       /* for now */
 
-typedef BOOL_CODER vp9_writer;
 
-#define vp9_write encode_bool
-#define vp9_write_literal vp9_encode_value
-#define vp9_write_bit(w, v) vp9_write((w), (v), vp9_prob_half)
 #define vp9_write_prob(w, v) vp9_write_literal((w), (v), 8)
 
 /* Approximate length of an encoded bool in 256ths of a bit at given prob */
@@ -39,66 +35,53 @@
 /* Both of these return bits, not scaled bits. */
 static INLINE unsigned int cost_branch256(const unsigned int ct[2],
                                           vp9_prob p) {
-  /* Imitate existing calculation */
   return ct[0] * vp9_cost_zero(p) + ct[1] * vp9_cost_one(p);
 }
 
 static INLINE unsigned int cost_branch(const unsigned int ct[2],
                                        vp9_prob p) {
-  /* Imitate existing calculation */
   return cost_branch256(ct, p) >> 8;
 }
 
 
-/* Small functions to write explicit values and tokens, as well as
-   estimate their lengths. */
-
-static INLINE void treed_write(vp9_writer *const w,
-                               vp9_tree t,
-                               const vp9_prob *const p,
-                               int v,
-                               /* number of bits in v, assumed nonzero */
-                               int n) {
+static INLINE void treed_write(vp9_writer *w,
+                               vp9_tree tree, const vp9_prob *probs,
+                               int bits, int len) {
   vp9_tree_index i = 0;
 
   do {
-    const int b = (v >> --n) & 1;
-    vp9_write(w, b, p[i >> 1]);
-    i = t[i + b];
-  } while (n);
+    const int bit = (bits >> --len) & 1;
+    vp9_write(w, bit, probs[i >> 1]);
+    i = tree[i + bit];
+  } while (len);
 }
 
-static INLINE void write_token(vp9_writer *w, vp9_tree t, const vp9_prob *p,
-                               const struct vp9_token *x) {
-  treed_write(w, t, p, x->value, x->len);
+static INLINE void write_token(vp9_writer *w, vp9_tree tree,
+                               const vp9_prob *probs,
+                               const struct vp9_token *token) {
+  treed_write(w, tree, probs, token->value, token->len);
 }
 
-static INLINE int treed_cost(vp9_tree t,
-                             const vp9_prob *const p,
-                             int v,
-                             /* number of bits in v, assumed nonzero */
-                             int n) {
-  int c = 0;
+static INLINE int treed_cost(vp9_tree tree, const vp9_prob *probs,
+                             int bits, int len) {
+  int cost = 0;
   vp9_tree_index i = 0;
 
   do {
-    const int b = (v >> --n) & 1;
-    c += vp9_cost_bit(p[i >> 1], b);
-    i = t[i + b];
-  } while (n);
+    const int bit = (bits >> --len) & 1;
+    cost += vp9_cost_bit(probs[i >> 1], bit);
+    i = tree[i + bit];
+  } while (len);
 
-  return c;
+  return cost;
 }
 
-static INLINE int cost_token(vp9_tree t, const vp9_prob *p,
-                             const struct vp9_token *x) {
-  return treed_cost(t, p, x->value, x->len);
+static INLINE int cost_token(vp9_tree tree, const vp9_prob *probs,
+                             const struct vp9_token *token) {
+  return treed_cost(tree, probs, token->value, token->len);
 }
 
-/* Fill array of costs for all possible token values. */
-
-void vp9_cost_tokens(int *Costs, const vp9_prob *, vp9_tree);
-
-void vp9_cost_tokens_skip(int *c, const vp9_prob *p, vp9_tree t);
+void vp9_cost_tokens(int *costs, const vp9_prob *probs, vp9_tree tree);
+void vp9_cost_tokens_skip(int *costs, const vp9_prob *probs, vp9_tree tree);
 
 #endif  // VP9_ENCODER_VP9_TREEWRITER_H_
diff --git a/vp9/encoder/vp9_variance.h b/vp9/encoder/vp9_variance.h
index 13dabbd..306476b 100644
--- a/vp9/encoder/vp9_variance.h
+++ b/vp9/encoder/vp9_variance.h
@@ -12,6 +12,7 @@
 #define VP9_ENCODER_VP9_VARIANCE_H_
 
 #include "vpx/vpx_integer.h"
+// #include "./vpx_config.h"
 
 typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr,
                                     int source_stride,
@@ -50,6 +51,15 @@
                                                 int Refstride,
                                                 unsigned int *sse);
 
+typedef unsigned int (*vp9_subp_avg_variance_fn_t)(const uint8_t *src_ptr,
+                                                   int source_stride,
+                                                   int xoffset,
+                                                   int yoffset,
+                                                   const uint8_t *ref_ptr,
+                                                   int Refstride,
+                                                   unsigned int *sse,
+                                                   const uint8_t *second_pred);
+
 typedef void (*vp9_ssimpf_fn_t)(uint8_t *s, int sp, uint8_t *r,
                                 int rp, unsigned long *sum_s,
                                 unsigned long *sum_r, unsigned long *sum_sq_s,
@@ -64,15 +74,33 @@
                                                    int  ref_stride);
 
 typedef struct vp9_variance_vtable {
-    vp9_sad_fn_t            sdf;
-    vp9_variance_fn_t       vf;
-    vp9_subpixvariance_fn_t svf;
-    vp9_variance_fn_t       svf_halfpix_h;
-    vp9_variance_fn_t       svf_halfpix_v;
-    vp9_variance_fn_t       svf_halfpix_hv;
-    vp9_sad_multi_fn_t      sdx3f;
-    vp9_sad_multi1_fn_t     sdx8f;
-    vp9_sad_multi_d_fn_t    sdx4df;
+    vp9_sad_fn_t               sdf;
+    vp9_variance_fn_t          vf;
+    vp9_subpixvariance_fn_t    svf;
+    vp9_subp_avg_variance_fn_t svaf;
+    vp9_variance_fn_t          svf_halfpix_h;
+    vp9_variance_fn_t          svf_halfpix_v;
+    vp9_variance_fn_t          svf_halfpix_hv;
+    vp9_sad_multi_fn_t         sdx3f;
+    vp9_sad_multi1_fn_t        sdx8f;
+    vp9_sad_multi_d_fn_t       sdx4df;
 } vp9_variance_fn_ptr_t;
 
+// #if CONFIG_COMP_INTER_JOINT_SEARCH
+static void comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int weight,
+                          int height, uint8_t *ref, int ref_stride) {
+  int i, j;
+
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < weight; j++) {
+      int tmp;
+      tmp = pred[j] + ref[j];
+      comp_pred[j] = (tmp + 1) >> 1;
+    }
+    comp_pred += weight;
+    pred += weight;
+    ref += ref_stride;
+  }
+}
+// #endif  // CONFIG_COMP_INTER_JOINT_SEARCH
 #endif  // VP9_ENCODER_VP9_VARIANCE_H_
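
comp_avg_pred() above computes a rounded per-pixel average of an existing
prediction and a second predictor; note that its weight parameter is used as
the block width and as the stride of both comp_pred and pred. The new
vp9_sub_pixel_avg_variance*_c() functions in the next file feed it the
bilinear-filtered block plus the caller's second_pred before measuring
variance. A minimal sketch of the per-pixel arithmetic (the helper name is
illustrative, not part of the patch):

    static uint8_t avg_pred_pixel(uint8_t a, uint8_t b) {
      /* rounded average, matching (tmp + 1) >> 1 in comp_avg_pred() */
      return (uint8_t)((a + b + 1) >> 1);
    }
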
diff --git a/vp9/encoder/vp9_variance_c.c b/vp9/encoder/vp9_variance_c.c
index c2a6004..fa53abd 100644
--- a/vp9/encoder/vp9_variance_c.c
+++ b/vp9/encoder/vp9_variance_c.c
@@ -13,6 +13,7 @@
 #include "vp9/common/vp9_filter.h"
 #include "vp9/common/vp9_subpelvar.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
 
 unsigned int vp9_get_mb_ss_c(const int16_t *src_ptr) {
   unsigned int i, sum = 0;
@@ -58,6 +59,29 @@
   return vp9_variance64x32_c(temp2, 64, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr,
+                                               int  src_pixels_per_line,
+                                               int  xoffset,
+                                               int  yoffset,
+                                               const uint8_t *dst_ptr,
+                                               int dst_pixels_per_line,
+                                               unsigned int *sse,
+                                               const uint8_t *second_pred) {
+  uint16_t fdata3[65 * 64];  // Temp data buffer used in filtering
+  uint8_t temp2[68 * 64];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 64 * 64);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 33, 64, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 32, 64, vfilter);
+  comp_avg_pred(temp3, second_pred, 64, 32, temp2, 64);
+  return vp9_variance64x32_c(temp3, 64, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_variance32x64_c(const uint8_t *src_ptr,
                                  int  source_stride,
                                  const uint8_t *ref_ptr,
@@ -92,6 +116,29 @@
   return vp9_variance32x64_c(temp2, 32, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr,
+                                               int  src_pixels_per_line,
+                                               int  xoffset,
+                                               int  yoffset,
+                                               const uint8_t *dst_ptr,
+                                               int dst_pixels_per_line,
+                                               unsigned int *sse,
+                                               const uint8_t *second_pred) {
+  uint16_t fdata3[65 * 64];  // Temp data buffer used in filtering
+  uint8_t temp2[68 * 64];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 64);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 65, 32, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 64, 32, vfilter);
+  comp_avg_pred(temp3, second_pred, 32, 64, temp2, 32);
+  return vp9_variance32x64_c(temp3, 32, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_variance32x16_c(const uint8_t *src_ptr,
                                  int  source_stride,
                                  const uint8_t *ref_ptr,
@@ -126,6 +173,29 @@
   return vp9_variance32x16_c(temp2, 32, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr,
+                                               int  src_pixels_per_line,
+                                               int  xoffset,
+                                               int  yoffset,
+                                               const uint8_t *dst_ptr,
+                                               int dst_pixels_per_line,
+                                               unsigned int *sse,
+                                               const uint8_t *second_pred) {
+  uint16_t fdata3[33 * 32];  // Temp data buffer used in filtering
+  uint8_t temp2[36 * 32];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 16);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 17, 32, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 16, 32, vfilter);
+  comp_avg_pred(temp3, second_pred, 32, 16, temp2, 32);
+  return vp9_variance32x16_c(temp3, 32, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_variance16x32_c(const uint8_t *src_ptr,
                                  int  source_stride,
                                  const uint8_t *ref_ptr,
@@ -160,6 +230,29 @@
   return vp9_variance16x32_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr,
+                                               int  src_pixels_per_line,
+                                               int  xoffset,
+                                               int  yoffset,
+                                               const uint8_t *dst_ptr,
+                                               int dst_pixels_per_line,
+                                               unsigned int *sse,
+                                               const uint8_t *second_pred) {
+  uint16_t fdata3[33 * 32];  // Temp data buffer used in filtering
+  uint8_t temp2[36 * 32];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 32);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 33, 16, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 32, 16, vfilter);
+  comp_avg_pred(temp3, second_pred, 16, 32, temp2, 16);
+  return vp9_variance16x32_c(temp3, 16, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_variance64x64_c(const uint8_t *src_ptr,
                                  int  source_stride,
                                  const uint8_t *ref_ptr,
@@ -317,6 +410,31 @@
   return vp9_variance4x4_c(temp2, 4, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr,
+                                             int  src_pixels_per_line,
+                                             int  xoffset,
+                                             int  yoffset,
+                                             const uint8_t *dst_ptr,
+                                             int dst_pixels_per_line,
+                                             unsigned int *sse,
+                                             const uint8_t *second_pred) {
+  uint8_t temp2[20 * 16];
+  const int16_t *hfilter, *vfilter;
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 4 * 4);  // compound pred buffer
+  uint16_t fdata3[5 * 4];  // Temp data buffer used in filtering
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  // First filter 1-D horizontally
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 5, 4, hfilter);
+
+  // Now filter vertically
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 4,  4,  4,  4, vfilter);
+  comp_avg_pred(temp3, second_pred, 4, 4, temp2, 4);
+  return vp9_variance4x4_c(temp3, 4, dst_ptr, dst_pixels_per_line, sse);
+}
 
 unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr,
                                          int  src_pixels_per_line,
@@ -339,6 +457,29 @@
   return vp9_variance8x8_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr,
+                                             int  src_pixels_per_line,
+                                             int  xoffset,
+                                             int  yoffset,
+                                             const uint8_t *dst_ptr,
+                                             int dst_pixels_per_line,
+                                             unsigned int *sse,
+                                             const uint8_t *second_pred) {
+  uint16_t fdata3[9 * 8];  // Temp data buffer used in filtering
+  uint8_t temp2[20 * 16];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 8);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 9, 8, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 8, 8, vfilter);
+  comp_avg_pred(temp3, second_pred, 8, 8, temp2, 8);
+  return vp9_variance8x8_c(temp3, 8, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr,
                                            int  src_pixels_per_line,
                                            int  xoffset,
@@ -360,6 +501,30 @@
   return vp9_variance16x16_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr,
+                                               int  src_pixels_per_line,
+                                               int  xoffset,
+                                               int  yoffset,
+                                               const uint8_t *dst_ptr,
+                                               int dst_pixels_per_line,
+                                               unsigned int *sse,
+                                               const uint8_t *second_pred) {
+  uint16_t fdata3[17 * 16];
+  uint8_t temp2[20 * 16];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 16);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 17, 16, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 16, 16, vfilter);
+
+  comp_avg_pred(temp3, second_pred, 16, 16, temp2, 16);
+  return vp9_variance16x16_c(temp3, 16, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr,
                                            int  src_pixels_per_line,
                                            int  xoffset,
@@ -381,6 +546,29 @@
   return vp9_variance64x64_c(temp2, 64, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr,
+                                               int  src_pixels_per_line,
+                                               int  xoffset,
+                                               int  yoffset,
+                                               const uint8_t *dst_ptr,
+                                               int dst_pixels_per_line,
+                                               unsigned int *sse,
+                                               const uint8_t *second_pred) {
+  uint16_t fdata3[65 * 64];  // Temp data buffer used in filtering
+  uint8_t temp2[68 * 64];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 64 * 64);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 65, 64, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 64, 64, vfilter);
+  comp_avg_pred(temp3, second_pred, 64, 64, temp2, 64);
+  return vp9_variance64x64_c(temp3, 64, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr,
                                            int  src_pixels_per_line,
                                            int  xoffset,
@@ -402,6 +590,29 @@
   return vp9_variance32x32_c(temp2, 32, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr,
+                                               int  src_pixels_per_line,
+                                               int  xoffset,
+                                               int  yoffset,
+                                               const uint8_t *dst_ptr,
+                                               int dst_pixels_per_line,
+                                               unsigned int *sse,
+                                               const uint8_t *second_pred) {
+  uint16_t fdata3[33 * 32];  // Temp data buffer used in filtering
+  uint8_t temp2[36 * 32];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 32);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 33, 32, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 32, 32, vfilter);
+  comp_avg_pred(temp3, second_pred, 32, 32, temp2, 32);
+  return vp9_variance32x32_c(temp3, 32, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_variance_halfpixvar16x16_h_c(const uint8_t *src_ptr,
                                               int  source_stride,
                                               const uint8_t *ref_ptr,
@@ -543,6 +754,29 @@
   return vp9_variance16x8_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr,
+                                              int  src_pixels_per_line,
+                                              int  xoffset,
+                                              int  yoffset,
+                                              const uint8_t *dst_ptr,
+                                              int dst_pixels_per_line,
+                                              unsigned int *sse,
+                                              const uint8_t *second_pred) {
+  uint16_t fdata3[16 * 9];  // Temp data buffer used in filtering
+  uint8_t temp2[20 * 16];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 8);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 9, 16, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 8, 16, vfilter);
+  comp_avg_pred(temp3, second_pred, 16, 8, temp2, 16);
+  return vp9_variance16x8_c(temp3, 16, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr,
                                           int  src_pixels_per_line,
                                           int  xoffset,
@@ -564,3 +798,25 @@
   return vp9_variance8x16_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr,
+                                              int  src_pixels_per_line,
+                                              int  xoffset,
+                                              int  yoffset,
+                                              const uint8_t *dst_ptr,
+                                              int dst_pixels_per_line,
+                                              unsigned int *sse,
+                                              const uint8_t *second_pred) {
+  uint16_t fdata3[9 * 16];  // Temp data buffer used in filtering
+  uint8_t temp2[20 * 16];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 16);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 17, 8, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 16, 8, vfilter);
+  comp_avg_pred(temp3, second_pred, 8, 16, temp2, 8);
+  return vp9_variance8x16_c(temp3, 8, dst_ptr, dst_pixels_per_line, sse);
+}
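
The vp9_sub_pixel_avg_variance*_c functions added above all follow the same three-step pattern: a 2-tap bilinear interpolation of the source block (horizontal pass into fdata3, vertical pass into temp2), a rounded average of that prediction against the compound predictor second_pred into temp3, and a plain variance of temp3 against the destination block. Below is a minimal stand-alone sketch of the averaging and variance steps for a generic W x H block; avg_pred_sketch and variance_sketch are illustrative re-implementations under those assumptions, not the library routines.

#include <stdint.h>

/* Sketch: rounded average of a filtered prediction with a second
 * (compound) predictor, as the second_pred stage does. */
static void avg_pred_sketch(uint8_t *comp, const uint8_t *pred, int w, int h,
                            const uint8_t *ref, int ref_stride) {
  int i, j;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j)
      comp[j] = (uint8_t)((pred[j] + ref[j] + 1) >> 1);  /* rounded average */
    comp += w;
    pred += w;
    ref += ref_stride;
  }
}

/* Sketch: variance of the averaged block against the destination,
 * returning SSE through *sse as the C variance kernels do. */
static unsigned int variance_sketch(const uint8_t *src, int src_stride,
                                    const uint8_t *dst, int dst_stride,
                                    int w, int h, unsigned int *sse) {
  int i, j;
  int64_t sum = 0;
  uint64_t sq = 0;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = src[j] - dst[j];
      sum += diff;
      sq += (uint64_t)(diff * diff);
    }
    src += src_stride;
    dst += dst_stride;
  }
  *sse = (unsigned int)sq;
  return (unsigned int)(sq - (uint64_t)((sum * sum) / (w * h)));
}
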
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index cbe3aa3..ace7e6f 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -56,8 +56,6 @@
 VP9_COMMON_SRCS-yes += common/vp9_subpelvar.h
 VP9_COMMON_SRCS-yes += common/vp9_seg_common.h
 VP9_COMMON_SRCS-yes += common/vp9_seg_common.c
-VP9_COMMON_SRCS-yes += common/vp9_setupintrarecon.h
-VP9_COMMON_SRCS-yes += common/vp9_swapyv12buffer.h
 VP9_COMMON_SRCS-yes += common/vp9_systemdependent.h
 VP9_COMMON_SRCS-yes += common/vp9_textblit.h
 VP9_COMMON_SRCS-yes += common/vp9_tile_common.h
@@ -75,9 +73,6 @@
 VP9_COMMON_SRCS-yes += common/vp9_recon.c
 VP9_COMMON_SRCS-yes += common/vp9_reconinter.c
 VP9_COMMON_SRCS-yes += common/vp9_reconintra.c
-VP9_COMMON_SRCS-yes += common/vp9_reconintra4x4.c
-VP9_COMMON_SRCS-yes += common/vp9_setupintrarecon.c
-VP9_COMMON_SRCS-yes += common/vp9_swapyv12buffer.c
 VP9_COMMON_SRCS-$(CONFIG_POSTPROC_VISUALIZER) += common/vp9_textblit.c
 VP9_COMMON_SRCS-yes += common/vp9_treecoder.c
 VP9_COMMON_SRCS-$(CONFIG_IMPLICIT_SEGMENTATION) += common/vp9_implicit_segmentation.c
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 1d95eed..45609da 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -211,11 +211,12 @@
   switch (img->fmt) {
     case VPX_IMG_FMT_YV12:
     case VPX_IMG_FMT_I420:
-    case VPX_IMG_FMT_VPXI420:
-    case VPX_IMG_FMT_VPXYV12:
+    case VPX_IMG_FMT_I422:
+    case VPX_IMG_FMT_I444:
       break;
     default:
-      ERROR("Invalid image format. Only YV12 and I420 images are supported");
+      ERROR("Invalid image format. Only YV12, I420, I422, I444 images are "
+            "supported.");
   }
 
   if ((img->d_w != ctx->cfg.g_w) || (img->d_h != ctx->cfg.g_h))
@@ -553,14 +554,17 @@
   yv12->y_crop_height = img->d_h;
   yv12->y_width  = img->d_w;
   yv12->y_height = img->d_h;
-  yv12->uv_width = (1 + yv12->y_width) / 2;
-  yv12->uv_height = (1 + yv12->y_height) / 2;
+
+  yv12->uv_width = img->x_chroma_shift == 1 ? (1 + yv12->y_width) / 2
+                                            : yv12->y_width;
+  yv12->uv_height = img->y_chroma_shift == 1 ? (1 + yv12->y_height) / 2
+                                             : yv12->y_height;
 
   yv12->y_stride = img->stride[VPX_PLANE_Y];
   yv12->uv_stride = img->stride[VPX_PLANE_U];
 
   yv12->border  = (img->stride[VPX_PLANE_Y] - img->w) / 2;
-  yv12->clrtype = (img->fmt == VPX_IMG_FMT_VPXI420 || img->fmt == VPX_IMG_FMT_VPXYV12); // REG_YUV = 0
+  yv12->clrtype = REG_YUV;
   return res;
 }
 
@@ -940,39 +944,7 @@
   }
 
   if (0 == vp9_get_preview_raw_frame(ctx->cpi, &sd, &flags)) {
-
-    /*
-    vpx_img_wrap(&ctx->preview_img, VPX_IMG_FMT_YV12,
-        sd.y_width + 2*VP9BORDERINPIXELS,
-        sd.y_height + 2*VP9BORDERINPIXELS,
-        1,
-        sd.buffer_alloc);
-    vpx_img_set_rect(&ctx->preview_img,
-        VP9BORDERINPIXELS, VP9BORDERINPIXELS,
-        sd.y_width, sd.y_height);
-        */
-
-    ctx->preview_img.bps = 12;
-    ctx->preview_img.planes[VPX_PLANE_Y] = sd.y_buffer;
-    ctx->preview_img.planes[VPX_PLANE_U] = sd.u_buffer;
-    ctx->preview_img.planes[VPX_PLANE_V] = sd.v_buffer;
-
-    if (sd.clrtype == REG_YUV)
-      ctx->preview_img.fmt = VPX_IMG_FMT_I420;
-    else
-      ctx->preview_img.fmt = VPX_IMG_FMT_VPXI420;
-
-    ctx->preview_img.x_chroma_shift = 1;
-    ctx->preview_img.y_chroma_shift = 1;
-
-    ctx->preview_img.d_w = sd.y_width;
-    ctx->preview_img.d_h = sd.y_height;
-    ctx->preview_img.stride[VPX_PLANE_Y] = sd.y_stride;
-    ctx->preview_img.stride[VPX_PLANE_U] = sd.uv_stride;
-    ctx->preview_img.stride[VPX_PLANE_V] = sd.uv_stride;
-    ctx->preview_img.w   = sd.y_width;
-    ctx->preview_img.h   = sd.y_height;
-
+    yuvconfig2image(&ctx->preview_img, &sd, NULL);
     return &ctx->preview_img;
   } else
     return NULL;
diff --git a/vp9/vp9_iface_common.h b/vp9/vp9_iface_common.h
index 87d7ca6..96de5f5 100644
--- a/vp9/vp9_iface_common.h
+++ b/vp9/vp9_iface_common.h
@@ -16,13 +16,24 @@
     * the Y, U, and V planes, nor other alignment adjustments that
     * might be representable by a YV12_BUFFER_CONFIG, so we just
     * initialize all the fields.*/
-  img->fmt = yv12->clrtype == REG_YUV ? VPX_IMG_FMT_I420 : VPX_IMG_FMT_VPXI420;
+  int bps = 12;
+  if (yv12->uv_height == yv12->y_height) {
+    if (yv12->uv_width == yv12->y_width) {
+      img->fmt = VPX_IMG_FMT_I444;
+      bps = 24;
+    } else {
+      img->fmt = VPX_IMG_FMT_I422;
+      bps = 16;
+    }
+  } else {
+    img->fmt = VPX_IMG_FMT_I420;
+  }
   img->w = yv12->y_stride;
   img->h = multiple16(yv12->y_height + 2 * VP9BORDERINPIXELS);
-  img->d_w = yv12->y_width;
-  img->d_h = yv12->y_height;
-  img->x_chroma_shift = 1;
-  img->y_chroma_shift = 1;
+  img->d_w = yv12->y_crop_width;
+  img->d_h = yv12->y_crop_height;
+  img->x_chroma_shift = yv12->uv_width < yv12->y_width;
+  img->y_chroma_shift = yv12->uv_height < yv12->y_height;
   img->planes[VPX_PLANE_Y] = yv12->y_buffer;
   img->planes[VPX_PLANE_U] = yv12->u_buffer;
   img->planes[VPX_PLANE_V] = yv12->v_buffer;
@@ -31,7 +42,7 @@
   img->stride[VPX_PLANE_U] = yv12->uv_stride;
   img->stride[VPX_PLANE_V] = yv12->uv_stride;
   img->stride[VPX_PLANE_ALPHA] = yv12->y_stride;
-  img->bps = 12;
+  img->bps = bps;
   img->user_priv = user_priv;
   img->img_data = yv12->buffer_alloc;
   img->img_data_owner = 0;
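
The rewritten yuvconfig2image() infers the vpx_image format from the relative chroma plane sizes instead of from clrtype: chroma planes matching the luma plane in both dimensions mean 4:4:4 (24 bits per pixel across the three planes), matching height with a narrower width means 4:2:2 (16 bpp), and anything else is treated as 4:2:0 (12 bpp). A small sketch of that decision, assuming only these three planar layouts ever reach the function:

#include "vpx/vpx_image.h"

/* Sketch: map plane geometry to a vpx_img_fmt and bits per pixel,
 * mirroring the branch added to yuvconfig2image() above. */
static vpx_img_fmt_t infer_fmt(int y_w, int y_h, int uv_w, int uv_h, int *bps) {
  if (uv_h == y_h) {
    if (uv_w == y_w) { *bps = 24; return VPX_IMG_FMT_I444; }  /* 4:4:4 */
    *bps = 16;
    return VPX_IMG_FMT_I422;                                  /* 4:2:2 */
  }
  *bps = 12;
  return VPX_IMG_FMT_I420;                                    /* 4:2:0 */
}
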
diff --git a/vpx/vpx_image.h b/vpx/vpx_image.h
index 809fa38..c304bac 100644
--- a/vpx/vpx_image.h
+++ b/vpx/vpx_image.h
@@ -55,9 +55,11 @@
     VPX_IMG_FMT_YV12    = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_UV_FLIP | 1, /**< planar YVU */
     VPX_IMG_FMT_I420    = VPX_IMG_FMT_PLANAR | 2,
     VPX_IMG_FMT_VPXYV12 = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_UV_FLIP | 3, /** < planar 4:2:0 format with vpx color space */
-    VPX_IMG_FMT_VPXI420 = VPX_IMG_FMT_PLANAR | 4   /** < planar 4:2:0 format with vpx color space */
-  }
-                        vpx_img_fmt_t; /**< alias for enum vpx_img_fmt */
+    VPX_IMG_FMT_VPXI420 = VPX_IMG_FMT_PLANAR | 4,
+    VPX_IMG_FMT_I422    = VPX_IMG_FMT_PLANAR | 5,
+    VPX_IMG_FMT_I444    = VPX_IMG_FMT_PLANAR | 6,
+    VPX_IMG_FMT_444A    = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_HAS_ALPHA | 7
+  } vpx_img_fmt_t; /**< alias for enum vpx_img_fmt */
 
 #if !defined(VPX_CODEC_DISABLE_COMPAT) || !VPX_CODEC_DISABLE_COMPAT
 #define IMG_FMT_PLANAR         VPX_IMG_FMT_PLANAR     /**< \deprecated Use #VPX_IMG_FMT_PLANAR */
diff --git a/vpx_scale/generic/yv12config.c b/vpx_scale/generic/yv12config.c
index fc7f828..cd66f00 100644
--- a/vpx_scale/generic/yv12config.c
+++ b/vpx_scale/generic/yv12config.c
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
+#include "./vpx_config.h"
 #include "vpx_scale/yv12config.h"
 #include "vpx_mem/vpx_mem.h"
 
@@ -97,3 +97,89 @@
   }
   return -2;
 }
+
+#if CONFIG_VP9
+// TODO(jkoleszar): Maybe replace this with struct vpx_image
+
+int vp9_free_frame_buffer(YV12_BUFFER_CONFIG *ybf) {
+  if (ybf) {
+    vpx_free(ybf->buffer_alloc);
+
+    /* buffer_alloc isn't accessed by most functions.  Rather, y_buffer,
+      u_buffer and v_buffer point into buffer_alloc and are used.  Clear out
+      all of this so that a freed pointer isn't inadvertently used. */
+    vpx_memset(ybf, 0, sizeof(YV12_BUFFER_CONFIG));
+  } else {
+    return -1;
+  }
+
+  return 0;
+}
+
+int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
+                             int width, int height,
+                             int ss_x, int ss_y, int border) {
+  if (ybf) {
+    const int aligned_width = (width + 15) & ~15;
+    const int aligned_height = (height + 15) & ~15;
+    const int y_stride = ((aligned_width + 2 * border) + 31) & ~31;
+    const int yplane_size = (aligned_height + 2 * border) * y_stride;
+    const int uv_width = aligned_width >> ss_x;
+    const int uv_height = aligned_height >> ss_y;
+    const int uv_stride = y_stride >> ss_x;
+    const int uv_border_w = border >> ss_x;
+    const int uv_border_h = border >> ss_y;
+    const int uvplane_size = (uv_height + 2 * uv_border_h) * uv_stride;
+    const int frame_size = yplane_size + 2 * uvplane_size;
+
+    if (!ybf->buffer_alloc) {
+      ybf->buffer_alloc = vpx_memalign(32, frame_size);
+      ybf->buffer_alloc_sz = frame_size;
+    }
+
+    if (!ybf->buffer_alloc || ybf->buffer_alloc_sz < frame_size)
+      return -1;
+
+    /* Only support allocating buffers that have a border that's a multiple
+     * of 32. The border restriction is required to get 16-byte alignment of
+     * the start of the chroma rows without introducing an arbitrary gap
+     * between planes, which would break the semantics of things like
+     * vpx_img_set_rect(). */
+    if (border & 0x1f)
+      return -3;
+
+    ybf->y_crop_width = width;
+    ybf->y_crop_height = height;
+    ybf->y_width  = aligned_width;
+    ybf->y_height = aligned_height;
+    ybf->y_stride = y_stride;
+
+    ybf->uv_width = uv_width;
+    ybf->uv_height = uv_height;
+    ybf->uv_stride = uv_stride;
+
+    ybf->border = border;
+    ybf->frame_size = frame_size;
+
+    ybf->y_buffer = ybf->buffer_alloc + (border * y_stride) + border;
+    ybf->u_buffer = ybf->buffer_alloc + yplane_size +
+                    (uv_border_h * uv_stride) + uv_border_w;
+    ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size +
+                    (uv_border_h * uv_stride) + uv_border_w;
+
+    ybf->corrupted = 0; /* assume not corrupted by errors */
+    return 0;
+  }
+  return -2;
+}
+
+int vp9_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
+                           int width, int height,
+                           int ss_x, int ss_y, int border) {
+  if (ybf) {
+    vp9_free_frame_buffer(ybf);
+    return vp9_realloc_frame_buffer(ybf, width, height, ss_x, ss_y, border);
+  }
+  return -2;
+}
+#endif
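
vp9_realloc_frame_buffer() derives everything from the 16-pixel-aligned frame dimensions, a 32-byte-aligned luma stride that includes the borders, and chroma planes scaled down by the subsampling shifts ss_x/ss_y. The stand-alone sketch below simply replays that arithmetic for a 1920x1080 4:2:0 frame with a 32-pixel border; the numbers are illustrative, and the allocation itself is done by the library code above.

#include <stdio.h>

int main(void) {
  const int width = 1920, height = 1080, border = 32, ss_x = 1, ss_y = 1;
  const int aligned_width  = (width + 15) & ~15;                   /* 1920 */
  const int aligned_height = (height + 15) & ~15;                  /* 1088 */
  const int y_stride = ((aligned_width + 2 * border) + 31) & ~31;  /* 1984 */
  const int yplane_size = (aligned_height + 2 * border) * y_stride;
  const int uv_stride = y_stride >> ss_x;
  const int uvplane_size =
      ((aligned_height >> ss_y) + 2 * (border >> ss_y)) * uv_stride;
  printf("y: %d bytes, uv: 2 x %d bytes, total: %d bytes\n",
         yplane_size, uvplane_size, yplane_size + 2 * uvplane_size);
  return 0;
}
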
diff --git a/vpx_scale/yv12config.h b/vpx_scale/yv12config.h
index 22df399..85396c0 100644
--- a/vpx_scale/yv12config.h
+++ b/vpx_scale/yv12config.h
@@ -72,6 +72,14 @@
                                     int width, int height, int border);
   int vp8_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf);
 
+  int vp9_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
+                             int width, int height, int ss_x, int ss_y,
+                             int border);
+  int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
+                               int width, int height, int ss_x, int ss_y,
+                               int border);
+  int vp9_free_frame_buffer(YV12_BUFFER_CONFIG *ybf);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/vpxdec.c b/vpxdec.c
index 41c654f..811d41b 100644
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -12,6 +12,7 @@
 /* This is a simple program that reads ivf files and decodes them
  * using the new interface. Decoded frames are output as YV12 raw.
  */
+#include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdarg.h>
@@ -890,6 +891,7 @@
 
   if (use_y4m && !noblit) {
     char buffer[128];
+
     if (!single_file) {
       fprintf(stderr, "YUV4MPEG2 not supported with output patterns,"
               " try --i420 or --yv12.\n");
@@ -907,8 +909,8 @@
     /*Note: We can't output an aspect ratio here because IVF doesn't
        store one, and neither does VP8.
       That will have to wait until these tools support WebM natively.*/
-    sprintf(buffer, "YUV4MPEG2 C%s W%u H%u F%u:%u I%c\n",
-            "420jpeg", width, height, fps_num, fps_den, 'p');
+    snprintf(buffer, sizeof(buffer), "YUV4MPEG2 W%u H%u F%u:%u I%c ",
+             width, height, fps_num, fps_den, 'p');
     out_put(out, (unsigned char *)buffer,
             (unsigned int)strlen(buffer), do_md5);
   }
@@ -1023,6 +1025,17 @@
       show_progress(frame_in, frame_out, dx_time);
 
     if (!noblit) {
+      if (frame_out == 1 && use_y4m) {
+        /* Write out the color format to terminate the header line */
+        const char *color =
+            img->fmt == VPX_IMG_FMT_444A ? "C444alpha\n" :
+            img->fmt == VPX_IMG_FMT_I444 ? "C444\n" :
+            img->fmt == VPX_IMG_FMT_I422 ? "C422\n" :
+            "C420jpeg\n";
+
+        out_put(out, (const unsigned char*)color, strlen(color), do_md5);
+      }
+
       if (do_scale) {
         if (img && frame_out == 1) {
           stream_w = img->d_w;
@@ -1031,6 +1044,7 @@
                                      stream_w, stream_h, 16);
         }
         if (img && (img->d_w != stream_w || img->d_h != stream_h)) {
+          assert(img->fmt == VPX_IMG_FMT_I420);
           I420Scale(img->planes[VPX_PLANE_Y], img->stride[VPX_PLANE_Y],
                     img->planes[VPX_PLANE_U], img->stride[VPX_PLANE_U],
                     img->planes[VPX_PLANE_V], img->stride[VPX_PLANE_V],
@@ -1051,6 +1065,12 @@
         unsigned int y;
         char out_fn[PATH_MAX];
         uint8_t *buf;
+        unsigned int c_w =
+            img->x_chroma_shift ? (1 + img->d_w) >> img->x_chroma_shift
+                                : img->d_w;
+        unsigned int c_h =
+            img->y_chroma_shift ? (1 + img->d_h) >> img->y_chroma_shift
+                                : img->d_h;
 
         if (!single_file) {
           size_t len = sizeof(out_fn) - 1;
@@ -1071,15 +1091,15 @@
 
         buf = img->planes[flipuv ? VPX_PLANE_V : VPX_PLANE_U];
 
-        for (y = 0; y < (1 + img->d_h) / 2; y++) {
-          out_put(out, buf, (1 + img->d_w) / 2, do_md5);
+        for (y = 0; y < c_h; y++) {
+          out_put(out, buf, c_w, do_md5);
           buf += img->stride[VPX_PLANE_U];
         }
 
         buf = img->planes[flipuv ? VPX_PLANE_U : VPX_PLANE_V];
 
-        for (y = 0; y < (1 + img->d_h) / 2; y++) {
-          out_put(out, buf, (1 + img->d_w) / 2, do_md5);
+        for (y = 0; y < c_h; y++) {
+          out_put(out, buf, c_w, do_md5);
           buf += img->stride[VPX_PLANE_V];
         }
 
diff --git a/vpxenc.c b/vpxenc.c
index 95c6cf2..8d0bf29 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -326,6 +326,7 @@
   unsigned int          h;
   struct vpx_rational   framerate;
   int                   use_i420;
+  int                   only_i420;
 };
 
 
@@ -1483,7 +1484,10 @@
 static void find_mismatch(vpx_image_t *img1, vpx_image_t *img2,
                           int yloc[4], int uloc[4], int vloc[4]) {
   const unsigned int bsize = 64;
-  const unsigned int bsize2 = bsize >> 1;
+  const unsigned int bsizey = bsize >> img1->y_chroma_shift;
+  const unsigned int bsizex = bsize >> img1->x_chroma_shift;
+  const int c_w = (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift;
+  const int c_h = (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift;
   unsigned int match = 1;
   unsigned int i, j;
   yloc[0] = yloc[1] = yloc[2] = yloc[3] = -1;
@@ -1512,11 +1516,11 @@
   }
 
   uloc[0] = uloc[1] = uloc[2] = uloc[3] = -1;
-  for (i = 0, match = 1; match && i < (img1->d_h + 1) / 2; i += bsize2) {
-    for (j = 0; j < match && (img1->d_w + 1) / 2; j += bsize2) {
+  for (i = 0, match = 1; match && i < c_h; i += bsizey) {
+    for (j = 0; j < match && c_w; j += bsizex) {
       int k, l;
-      int si = mmin(i + bsize2, (img1->d_h + 1) / 2) - i;
-      int sj = mmin(j + bsize2, (img1->d_w + 1) / 2) - j;
+      int si = mmin(i + bsizey, c_h - i);
+      int sj = mmin(j + bsizex, c_w - j);
       for (k = 0; match && k < si; k++)
         for (l = 0; match && l < sj; l++) {
           if (*(img1->planes[VPX_PLANE_U] +
@@ -1536,11 +1540,11 @@
     }
   }
   vloc[0] = vloc[1] = vloc[2] = vloc[3] = -1;
-  for (i = 0, match = 1; match && i < (img1->d_h + 1) / 2; i += bsize2) {
-    for (j = 0; j < match && (img1->d_w + 1) / 2; j += bsize2) {
+  for (i = 0, match = 1; match && i < c_h; i += bsizey) {
+    for (j = 0; j < match && c_w; j += bsizex) {
       int k, l;
-      int si = mmin(i + bsize2, (img1->d_h + 1) / 2) - i;
-      int sj = mmin(j + bsize2, (img1->d_w + 1) / 2) - j;
+      int si = mmin(i + bsizey, c_h - i);
+      int sj = mmin(j + bsizex, c_w - j);
       for (k = 0; match && k < si; k++)
         for (l = 0; match && l < sj; l++) {
           if (*(img1->planes[VPX_PLANE_V] +
@@ -1563,6 +1567,8 @@
 
 static int compare_img(vpx_image_t *img1, vpx_image_t *img2)
 {
+  const int c_w = (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift;
+  const int c_h = (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift;
   int match = 1;
   unsigned int i;
 
@@ -1575,15 +1581,15 @@
                      img2->planes[VPX_PLANE_Y]+i*img2->stride[VPX_PLANE_Y],
                      img1->d_w) == 0);
 
-  for (i = 0; i < img1->d_h/2; i++)
+  for (i = 0; i < c_h; i++)
     match &= (memcmp(img1->planes[VPX_PLANE_U]+i*img1->stride[VPX_PLANE_U],
                      img2->planes[VPX_PLANE_U]+i*img2->stride[VPX_PLANE_U],
-                     (img1->d_w + 1) / 2) == 0);
+                     c_w) == 0);
 
-  for (i = 0; i < img1->d_h/2; i++)
+  for (i = 0; i < c_h; i++)
     match &= (memcmp(img1->planes[VPX_PLANE_V]+i*img1->stride[VPX_PLANE_U],
                      img2->planes[VPX_PLANE_V]+i*img2->stride[VPX_PLANE_U],
-                     (img1->d_w + 1) / 2) == 0);
+                     c_w) == 0);
 
   return match;
 }
@@ -1793,7 +1799,8 @@
 
   if (input->detect.buf_read == 4
       && file_is_y4m(input->file, &input->y4m, input->detect.buf)) {
-    if (y4m_input_open(&input->y4m, input->file, input->detect.buf, 4) >= 0) {
+    if (y4m_input_open(&input->y4m, input->file, input->detect.buf, 4,
+                       input->only_i420) >= 0) {
       input->file_type = FILE_TYPE_Y4M;
       input->w = input->y4m.pic_w;
       input->h = input->y4m.pic_h;
@@ -2517,6 +2524,7 @@
   input.framerate.num = 30;
   input.framerate.den = 1;
   input.use_i420 = 1;
+  input.only_i420 = 1;
 
   /* First parse the global configuration values, because we want to apply
    * other parameters on top of the default configuration provided by the
@@ -2551,6 +2559,12 @@
   if (!input.fn)
     usage_exit();
 
+#if CONFIG_NON420
+  /* Decide if other chroma subsamplings than 4:2:0 are supported */
+  if (global.codec->fourcc == VP9_FOURCC)
+    input.only_i420 = 0;
+#endif
+
   for (pass = global.pass ? global.pass - 1 : 0; pass < global.passes; pass++) {
     int frames_in = 0, seen_frames = 0;
     int64_t estimated_time_left = -1;
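
Both vpxdec.c and vpxenc.c now compute the chroma plane extents from the image's chroma shifts rather than hard-coding the 4:2:0 halving, so 4:2:2 and 4:4:4 frames are written and compared over their full chroma width and height. A hedged sketch of the rounding-up expression used for c_w/c_h in vpxdec.c above (vpxenc.c uses the equivalent (d + shift) >> shift form for shifts of 0 and 1):

#include <stdio.h>

/* Sketch: chroma plane dimension for a given luma dimension and chroma
 * shift, matching the (1 + dim) >> shift expressions added to vpxdec.c. */
static unsigned int chroma_dim(unsigned int luma_dim, unsigned int shift) {
  return shift ? (luma_dim + 1) >> shift : luma_dim;
}

int main(void) {
  printf("luma 1921 -> 4:2:0 chroma width %u\n", chroma_dim(1921, 1));  /* 961 */
  printf("luma 1921 -> 4:4:4 chroma width %u\n", chroma_dim(1921, 0));  /* 1921 */
  return 0;
}
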
diff --git a/y4minput.c b/y4minput.c
index 24f0c15..47f005a 100644
--- a/y4minput.c
+++ b/y4minput.c
@@ -659,7 +659,8 @@
                              unsigned char *_aux) {
 }
 
-int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip) {
+int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip,
+                   int only_420) {
   char buffer[80];
   int  ret;
   int  i;
@@ -701,6 +702,8 @@
             "Only progressive scan handled.\n");
     return -1;
   }
+  _y4m->vpx_fmt = VPX_IMG_FMT_I420;
+  _y4m->vpx_bps = 12;
   if (strcmp(_y4m->chroma_type, "420") == 0 ||
       strcmp(_y4m->chroma_type, "420jpeg") == 0) {
     _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v = _y4m->dst_c_dec_v = 2;
@@ -734,16 +737,30 @@
     _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
     _y4m->convert = y4m_convert_422jpeg_420jpeg;
   } else if (strcmp(_y4m->chroma_type, "422") == 0) {
-    _y4m->src_c_dec_h = _y4m->dst_c_dec_h = 2;
+    _y4m->src_c_dec_h = 2;
     _y4m->src_c_dec_v = 1;
-    _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
-    /*Chroma filter required: read into the aux buf first.
-      We need to make two filter passes, so we need some extra space in the
-       aux buffer.*/
-    _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
-    _y4m->convert = y4m_convert_422_420jpeg;
+    if (only_420) {
+      _y4m->dst_c_dec_h = 2;
+      _y4m->dst_c_dec_v = 2;
+      _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+      /*Chroma filter required: read into the aux buf first.
+        We need to make two filter passes, so we need some extra space in the
+         aux buffer.*/
+      _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
+      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz +
+          ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
+      _y4m->convert = y4m_convert_422_420jpeg;
+    } else {
+      _y4m->vpx_fmt = VPX_IMG_FMT_I422;
+      _y4m->vpx_bps = 16;
+      _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
+      _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
+      _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h
+                              + 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
+      /*Natively supported: no conversion required.*/
+      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
+      _y4m->convert = y4m_convert_null;
+    }
   } else if (strcmp(_y4m->chroma_type, "411") == 0) {
     _y4m->src_c_dec_h = 4;
     _y4m->dst_c_dec_h = 2;
@@ -758,29 +775,52 @@
     _y4m->convert = y4m_convert_411_420jpeg;
   } else if (strcmp(_y4m->chroma_type, "444") == 0) {
     _y4m->src_c_dec_h = 1;
-    _y4m->dst_c_dec_h = 2;
     _y4m->src_c_dec_v = 1;
-    _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
-    /*Chroma filter required: read into the aux buf first.
-      We need to make two filter passes, so we need some extra space in the
-       aux buffer.*/
-    _y4m->aux_buf_read_sz = 2 * _y4m->pic_w * _y4m->pic_h;
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
-    _y4m->convert = y4m_convert_444_420jpeg;
+    if (only_420) {
+      _y4m->dst_c_dec_h = 2;
+      _y4m->dst_c_dec_v = 2;
+      _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+      /*Chroma filter required: read into the aux buf first.
+        We need to make two filter passes, so we need some extra space in the
+         aux buffer.*/
+      _y4m->aux_buf_read_sz = 2 * _y4m->pic_w * _y4m->pic_h;
+      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz +
+          ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
+      _y4m->convert = y4m_convert_444_420jpeg;
+    } else {
+      _y4m->vpx_fmt = VPX_IMG_FMT_I444;
+      _y4m->vpx_bps = 24;
+      _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
+      _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
+      _y4m->dst_buf_read_sz = 3 * _y4m->pic_w * _y4m->pic_h;
+      /*Natively supported: no conversion required.*/
+      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
+      _y4m->convert = y4m_convert_null;
+    }
   } else if (strcmp(_y4m->chroma_type, "444alpha") == 0) {
     _y4m->src_c_dec_h = 1;
-    _y4m->dst_c_dec_h = 2;
     _y4m->src_c_dec_v = 1;
-    _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
-    /*Chroma filter required: read into the aux buf first.
-      We need to make two filter passes, so we need some extra space in the
-       aux buffer.
-      The extra plane also gets read into the aux buf.
-      It will be discarded.*/
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 3 * _y4m->pic_w * _y4m->pic_h;
-    _y4m->convert = y4m_convert_444_420jpeg;
+    if (only_420) {
+      _y4m->dst_c_dec_h = 2;
+      _y4m->dst_c_dec_v = 2;
+      _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+      /*Chroma filter required: read into the aux buf first.
+        We need to make two filter passes, so we need some extra space in the
+         aux buffer.
+        The extra plane also gets read into the aux buf.
+        It will be discarded.*/
+      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 3 * _y4m->pic_w * _y4m->pic_h;
+      _y4m->convert = y4m_convert_444_420jpeg;
+    } else {
+      _y4m->vpx_fmt = VPX_IMG_FMT_444A;
+      _y4m->vpx_bps = 32;
+      _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
+      _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
+      _y4m->dst_buf_read_sz = 4 * _y4m->pic_w * _y4m->pic_h;
+      /*Natively supported: no conversion required.*/
+      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
+      _y4m->convert = y4m_convert_null;
+    }
   } else if (strcmp(_y4m->chroma_type, "mono") == 0) {
     _y4m->src_c_dec_h = _y4m->src_c_dec_v = 0;
     _y4m->dst_c_dec_h = _y4m->dst_c_dec_v = 2;
@@ -847,22 +887,23 @@
      sizes, which would require a separate fread call for every row.*/
   memset(_img, 0, sizeof(*_img));
   /*Y4M has the planes in Y'CbCr order, which libvpx calls Y, U, and V.*/
-  _img->fmt = IMG_FMT_I420;
+  _img->fmt = _y4m->vpx_fmt;
   _img->w = _img->d_w = _y4m->pic_w;
   _img->h = _img->d_h = _y4m->pic_h;
-  /*This is hard-coded to 4:2:0 for now, as that's all VP8 supports.*/
-  _img->x_chroma_shift = 1;
-  _img->y_chroma_shift = 1;
-  _img->bps = 12;
+  _img->x_chroma_shift = _y4m->dst_c_dec_h >> 1;
+  _img->y_chroma_shift = _y4m->dst_c_dec_v >> 1;
+  _img->bps = _y4m->vpx_bps;
+
   /*Set up the buffer pointers.*/
   pic_sz = _y4m->pic_w * _y4m->pic_h;
   c_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h;
   c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v;
   c_sz = c_w * c_h;
-  _img->stride[PLANE_Y] = _y4m->pic_w;
+  _img->stride[PLANE_Y] = _img->stride[PLANE_ALPHA] = _y4m->pic_w;
   _img->stride[PLANE_U] = _img->stride[PLANE_V] = c_w;
   _img->planes[PLANE_Y] = _y4m->dst_buf;
   _img->planes[PLANE_U] = _y4m->dst_buf + pic_sz;
   _img->planes[PLANE_V] = _y4m->dst_buf + pic_sz + c_sz;
+  _img->planes[PLANE_ALPHA] = _y4m->dst_buf + pic_sz + 2 * c_sz;
   return 1;
 }
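
y4m_input_fetch_frame() now derives the chroma shifts from the destination chroma decimation factors (decimation 2 becomes shift 1, decimation 1 becomes shift 0) and lays the planes out back to back in dst_buf, with an alpha plane after V that is only meaningful for 444alpha input. A small illustrative sketch of that packed-plane layout arithmetic; plane_offsets is a made-up helper, not part of the tree.

#include <stddef.h>

/* Sketch: plane offsets inside a packed planar buffer for a picture with
 * chroma decimation dec_h x dec_v (1 or 2 after any conversion). */
static void plane_offsets(int pic_w, int pic_h, int dec_h, int dec_v,
                          size_t off[4]) {
  const size_t pic_sz = (size_t)pic_w * pic_h;
  const size_t c_w = (pic_w + dec_h - 1) / dec_h;  /* ceil(pic_w / dec_h) */
  const size_t c_h = (pic_h + dec_v - 1) / dec_v;
  const size_t c_sz = c_w * c_h;
  off[0] = 0;                  /* Y */
  off[1] = pic_sz;             /* U (Cb) */
  off[2] = pic_sz + c_sz;      /* V (Cr) */
  off[3] = pic_sz + 2 * c_sz;  /* alpha, only read for 444alpha */
}
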
diff --git a/y4minput.h b/y4minput.h
index 2fa3767..b2a390c 100644
--- a/y4minput.h
+++ b/y4minput.h
@@ -51,9 +51,12 @@
   y4m_convert_func  convert;
   unsigned char    *dst_buf;
   unsigned char    *aux_buf;
+  enum vpx_img_fmt  vpx_fmt;
+  int               vpx_bps;
 };
 
-int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip);
+int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip,
+                   int only_420);
 void y4m_input_close(y4m_input *_y4m);
 int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, vpx_image_t *img);
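
With the extra only_420 argument, callers that can consume 4:2:2 and 4:4:4 directly (VP9 when CONFIG_NON420 is enabled) pass 0 and read the native format back from the y4m_input's new vpx_fmt/vpx_bps fields, while every other caller passes 1 and keeps the old convert-everything-to-4:2:0 behaviour. A hedged usage sketch, assuming the stream header has not been partially pre-read (so there are no detection bytes to skip):

#include <stdio.h>
#include "y4minput.h"

/* Sketch: open a Y4M stream, preferring the native chroma format when the
 * encoder supports it (only_420 == 0). Error handling kept minimal. */
static int open_y4m(FILE *fin, int only_420, y4m_input *y4m) {
  /* No detection bytes were pre-read here, so there is nothing to skip. */
  if (y4m_input_open(y4m, fin, NULL, 0, only_420) < 0)
    return -1;
  printf("%dx%d, vpx_fmt=%d, bps=%d\n",
         (int)y4m->pic_w, (int)y4m->pic_h, (int)y4m->vpx_fmt, y4m->vpx_bps);
  return 0;
}
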