64x64 blocksize support.

3.2% gains on std/hd, 1.0% gains on hd.

Change-Id: I481d5df23d8a4fc650a5bcba956554490b2bd200
diff --git a/configure b/configure
index 1126ea8..5ed688e 100755
--- a/configure
+++ b/configure
@@ -240,6 +240,7 @@
     csm
     comp_intra_pred
     superblocks
+    superblocks64
     pred_filter
     lossless
     subpelrefmv
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index c430ea2..9ca2b22 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -226,6 +226,16 @@
   MAX_REF_FRAMES = 4
 } MV_REFERENCE_FRAME;
 
+#if CONFIG_SUPERBLOCKS
+typedef enum {
+  BLOCK_SIZE_MB16X16 = 0,
+  BLOCK_SIZE_SB32X32 = 1,
+#if CONFIG_SUPERBLOCKS64
+  BLOCK_SIZE_SB64X64 = 2,
+#endif
+} BLOCK_SIZE_TYPE;
+#endif
+
 typedef struct {
   MB_PREDICTION_MODE mode, uv_mode;
 #if CONFIG_COMP_INTRA_PRED
@@ -268,8 +278,8 @@
 
 #if CONFIG_SUPERBLOCKS
   // FIXME need a SB array of 4 MB_MODE_INFOs that
-  // only needs one encoded_as_sb.
-  unsigned char encoded_as_sb;
+  // only needs one sb_type.
+  BLOCK_SIZE_TYPE sb_type;
 #endif
 } MB_MODE_INFO;
 
@@ -415,6 +425,7 @@
   DECLARE_ALIGNED(32, uint8_t, y_buf[22 * 32]);
 #endif
 
+  int sb_index;
   int mb_index;   // Index of the MB in the SB (0..3)
   int q_index;
 
@@ -519,7 +530,7 @@
     return tx_type;
 #if CONFIG_SUPERBLOCKS
   // TODO(rbultje, debargha): Explore ADST usage for superblocks
-  if (xd->mode_info_context->mbmi.encoded_as_sb)
+  if (xd->mode_info_context->mbmi.sb_type)
     return tx_type;
 #endif
   if (xd->mode_info_context->mbmi.mode == B_PRED &&
@@ -576,7 +587,7 @@
     return tx_type;
 #if CONFIG_SUPERBLOCKS
   // TODO(rbultje, debargha): Explore ADST usage for superblocks
-  if (xd->mode_info_context->mbmi.encoded_as_sb)
+  if (xd->mode_info_context->mbmi.sb_type)
     return tx_type;
 #endif
   if (xd->mode_info_context->mbmi.mode == I8X8_PRED &&
@@ -611,7 +622,7 @@
     return tx_type;
 #if CONFIG_SUPERBLOCKS
   // TODO(rbultje, debargha): Explore ADST usage for superblocks
-  if (xd->mode_info_context->mbmi.encoded_as_sb)
+  if (xd->mode_info_context->mbmi.sb_type)
     return tx_type;
 #endif
   if (xd->mode_info_context->mbmi.mode < I8X8_PRED &&
diff --git a/vp9/common/vp9_common.h b/vp9/common/vp9_common.h
index ee02758..2e1ee4b 100644
--- a/vp9/common/vp9_common.h
+++ b/vp9/common/vp9_common.h
@@ -21,6 +21,9 @@
 #define TRUE    1
 #define FALSE   0
 
+#define MIN(x, y) (((x) < (y)) ? (x) : (y))
+#define MAX(x, y) (((x) > (y)) ? (x) : (y))
+
 /* Only need this for fixed-size arrays, for structs just assign. */
 
 #define vp9_copy(Dest, Src) { \
diff --git a/vp9/common/vp9_findnearmv.c b/vp9/common/vp9_findnearmv.c
index b5d6bda..85982fc 100644
--- a/vp9/common/vp9_findnearmv.c
+++ b/vp9/common/vp9_findnearmv.c
@@ -191,7 +191,7 @@
                                    above_src, xd->dst.y_stride, &sse);
       score += sse;
 #if CONFIG_SUPERBLOCKS
-      if (xd->mode_info_context->mbmi.encoded_as_sb) {
+      if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB32X32) {
         vp9_sub_pixel_variance16x2_c(above_ref + offset + 16,
                                      ref_y_stride,
                                      SP(this_mv.as_mv.col),
@@ -199,6 +199,22 @@
                                      above_src + 16, xd->dst.y_stride, &sse);
         score += sse;
       }
+#if CONFIG_SUPERBLOCKS64
+      if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB64X64) {
+        vp9_sub_pixel_variance16x2_c(above_ref + offset + 32,
+                                     ref_y_stride,
+                                     SP(this_mv.as_mv.col),
+                                     SP(this_mv.as_mv.row),
+                                     above_src + 32, xd->dst.y_stride, &sse);
+        score += sse;
+        vp9_sub_pixel_variance16x2_c(above_ref + offset + 48,
+                                     ref_y_stride,
+                                     SP(this_mv.as_mv.col),
+                                     SP(this_mv.as_mv.row),
+                                     above_src + 48, xd->dst.y_stride, &sse);
+        score += sse;
+      }
+#endif
 #endif
     }
     if (xd->left_available) {
@@ -208,7 +224,7 @@
                                    left_src, xd->dst.y_stride, &sse);
       score += sse;
 #if CONFIG_SUPERBLOCKS
-      if (xd->mode_info_context->mbmi.encoded_as_sb) {
+      if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB32X32) {
         vp9_sub_pixel_variance2x16_c(left_ref + offset + ref_y_stride * 16,
                                      ref_y_stride,
                                      SP(this_mv.as_mv.col),
@@ -217,6 +233,24 @@
                                      xd->dst.y_stride, &sse);
         score += sse;
       }
+#if CONFIG_SUPERBLOCKS64
+      if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB64X64) {
+        vp9_sub_pixel_variance2x16_c(left_ref + offset + ref_y_stride * 32,
+                                     ref_y_stride,
+                                     SP(this_mv.as_mv.col),
+                                     SP(this_mv.as_mv.row),
+                                     left_src + xd->dst.y_stride * 32,
+                                     xd->dst.y_stride, &sse);
+        score += sse;
+        vp9_sub_pixel_variance2x16_c(left_ref + offset + ref_y_stride * 48,
+                                     ref_y_stride,
+                                     SP(this_mv.as_mv.col),
+                                     SP(this_mv.as_mv.row),
+                                     left_src + xd->dst.y_stride * 48,
+                                     xd->dst.y_stride, &sse);
+        score += sse;
+      }
+#endif
 #endif
     }
 #else
@@ -230,22 +264,42 @@
       score += vp9_sad16x3(above_src, xd->dst.y_stride,
                            above_ref + offset, ref_y_stride);
 #if CONFIG_SUPERBLOCKS
-      if (xd->mode_info_context->mbmi.encoded_as_sb) {
+      if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB32X32) {
         score += vp9_sad16x3(above_src + 16, xd->dst.y_stride,
                              above_ref + offset + 16, ref_y_stride);
       }
+#if CONFIG_SUPERBLOCKS64
+      if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB64X64) {
+        score += vp9_sad16x3(above_src + 32, xd->dst.y_stride,
+                             above_ref + offset + 32, ref_y_stride);
+        score += vp9_sad16x3(above_src + 48, xd->dst.y_stride,
+                             above_ref + offset + 48, ref_y_stride);
+      }
+#endif
 #endif
     }
     if (xd->left_available) {
       score += vp9_sad3x16(left_src, xd->dst.y_stride,
                            left_ref + offset, ref_y_stride);
 #if CONFIG_SUPERBLOCKS
-      if (xd->mode_info_context->mbmi.encoded_as_sb) {
+      if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB32X32) {
         score += vp9_sad3x16(left_src + xd->dst.y_stride * 16,
                              xd->dst.y_stride,
                              left_ref + offset + ref_y_stride * 16,
                              ref_y_stride);
       }
+#if CONFIG_SUPERBLOCKS64
+      if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB64X64) {
+        score += vp9_sad3x16(left_src + xd->dst.y_stride * 32,
+                             xd->dst.y_stride,
+                             left_ref + offset + ref_y_stride * 32,
+                             ref_y_stride);
+        score += vp9_sad3x16(left_src + xd->dst.y_stride * 48,
+                             xd->dst.y_stride,
+                             left_ref + offset + ref_y_stride * 48,
+                             ref_y_stride);
+      }
+#endif
 #endif
     }
 #endif
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index a928a92..5188aa4 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -228,7 +228,7 @@
 
           if (mb_col > 0
 #if CONFIG_SUPERBLOCKS
-              && !((mb_col & 1) && mode_info_context->mbmi.encoded_as_sb &&
+              && !((mb_col & 1) && mode_info_context->mbmi.sb_type &&
                    ((mode_info_context[0].mbmi.mb_skip_coeff &&
                      mode_info_context[-1].mbmi.mb_skip_coeff)
 #if CONFIG_TX32X32
@@ -253,7 +253,7 @@
           /* don't apply across umv border */
           if (mb_row > 0
 #if CONFIG_SUPERBLOCKS
-              && !((mb_row & 1) && mode_info_context->mbmi.encoded_as_sb &&
+              && !((mb_row & 1) && mode_info_context->mbmi.sb_type &&
                    ((mode_info_context[0].mbmi.mb_skip_coeff &&
                      mode_info_context[-mis].mbmi.mb_skip_coeff)
 #if CONFIG_TX32X32
@@ -277,7 +277,7 @@
           // FIXME: Not 8x8 aware
           if (mb_col > 0
 #if CONFIG_SUPERBLOCKS
-              && !((mb_col & 1) && mode_info_context->mbmi.encoded_as_sb &&
+              && !((mb_col & 1) && mode_info_context->mbmi.sb_type &&
                    mode_info_context[0].mbmi.mb_skip_coeff &&
                    mode_info_context[-1].mbmi.mb_skip_coeff)
 #endif
@@ -292,7 +292,7 @@
           /* don't apply across umv border */
           if (mb_row > 0
 #if CONFIG_SUPERBLOCKS
-              && !((mb_row & 1) && mode_info_context->mbmi.encoded_as_sb &&
+              && !((mb_row & 1) && mode_info_context->mbmi.sb_type &&
                    mode_info_context[0].mbmi.mb_skip_coeff &&
                    mode_info_context[-cm->mode_info_stride].mbmi.mb_skip_coeff)
 #endif
diff --git a/vp9/common/vp9_mvref_common.c b/vp9/common/vp9_mvref_common.c
index 4b576e8..bfdc1af 100644
--- a/vp9/common/vp9_mvref_common.c
+++ b/vp9/common/vp9_mvref_common.c
@@ -237,7 +237,7 @@
   vpx_memset(candidate_scores, 0, sizeof(candidate_scores));
 
 #if CONFIG_SUPERBLOCKS
-  if (mbmi->encoded_as_sb) {
+  if (mbmi->sb_type) {
     mv_ref_search = sb_mv_ref_search;
     ref_distance_weight = sb_ref_distance_weight;
   } else {
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index d96e76c..3b62dac 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -229,7 +229,7 @@
 
   /* Y,U,V,Y2 */
   ENTROPY_CONTEXT_PLANES *above_context;   /* row of context for each plane */
-  ENTROPY_CONTEXT_PLANES left_context[2];  /* (up to) 4 contexts "" */
+  ENTROPY_CONTEXT_PLANES left_context[4];  /* (up to) 4 contexts "" */
 
   /* keyframe block modes are predicted by their above, left neighbors */
 
@@ -248,7 +248,10 @@
   vp9_prob prob_last_coded;
   vp9_prob prob_gf_coded;
 #if CONFIG_SUPERBLOCKS
-  vp9_prob sb_coded;
+  vp9_prob sb32_coded;
+#if CONFIG_SUPERBLOCKS64
+  vp9_prob sb64_coded;
+#endif  // CONFIG_SUPERBLOCKS64
 #endif
 
   // Context probabilities when using predictive coding of segment id
diff --git a/vp9/common/vp9_pred_common.c b/vp9/common/vp9_pred_common.c
index e8a3c4f..f2f35a3 100644
--- a/vp9/common/vp9_pred_common.c
+++ b/vp9/common/vp9_pred_common.c
@@ -9,6 +9,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_pred_common.h"
 #include "vp9/common/vp9_seg_common.h"
 #include "vp9/common/vp9_treecoder.h"
@@ -230,13 +231,18 @@
     case PRED_SEG_ID:
       xd->mode_info_context->mbmi.seg_id_predicted = pred_flag;
 #if CONFIG_SUPERBLOCKS
-      if (xd->mode_info_context->mbmi.encoded_as_sb) {
-        if (xd->mb_to_right_edge >= 0)
-          xd->mode_info_context[1].mbmi.seg_id_predicted = pred_flag;
-        if (xd->mb_to_bottom_edge >= 0) {
-          xd->mode_info_context[mis].mbmi.seg_id_predicted = pred_flag;
-          if (xd->mb_to_right_edge >= 0)
-            xd->mode_info_context[mis + 1].mbmi.seg_id_predicted = pred_flag;
+      if (xd->mode_info_context->mbmi.sb_type) {
+#define sub(a, b) ((b) < 0 ? (a) + (b) : (a))
+        const int n_mbs = 1 << xd->mode_info_context->mbmi.sb_type;
+        const int x_mbs = sub(n_mbs, xd->mb_to_right_edge >> 7);
+        const int y_mbs = sub(n_mbs, xd->mb_to_bottom_edge >> 7);
+        int x, y;
+
+        for (y = 0; y < y_mbs; y++) {
+          for (x = !y; x < x_mbs; x++) {
+            xd->mode_info_context[y * mis + x].mbmi.seg_id_predicted =
+                pred_flag;
+          }
         }
       }
 #endif
@@ -245,13 +251,16 @@
     case PRED_REF:
       xd->mode_info_context->mbmi.ref_predicted = pred_flag;
 #if CONFIG_SUPERBLOCKS
-      if (xd->mode_info_context->mbmi.encoded_as_sb) {
-        if (xd->mb_to_right_edge >= 0)
-          xd->mode_info_context[1].mbmi.ref_predicted = pred_flag;
-        if (xd->mb_to_bottom_edge >= 0) {
-          xd->mode_info_context[mis].mbmi.ref_predicted = pred_flag;
-          if (xd->mb_to_right_edge >= 0)
-            xd->mode_info_context[mis + 1].mbmi.ref_predicted = pred_flag;
+      if (xd->mode_info_context->mbmi.sb_type) {
+        const int n_mbs = 1 << xd->mode_info_context->mbmi.sb_type;
+        const int x_mbs = sub(n_mbs, xd->mb_to_right_edge >> 7);
+        const int y_mbs = sub(n_mbs, xd->mb_to_bottom_edge >> 7);
+        int x, y;
+
+        for (y = 0; y < y_mbs; y++) {
+          for (x = !y; x < x_mbs; x++) {
+            xd->mode_info_context[y * mis + x].mbmi.ref_predicted = pred_flag;
+          }
         }
       }
 #endif
@@ -260,13 +269,16 @@
     case PRED_MBSKIP:
       xd->mode_info_context->mbmi.mb_skip_coeff = pred_flag;
 #if CONFIG_SUPERBLOCKS
-      if (xd->mode_info_context->mbmi.encoded_as_sb) {
-        if (xd->mb_to_right_edge >= 0)
-          xd->mode_info_context[1].mbmi.mb_skip_coeff = pred_flag;
-        if (xd->mb_to_bottom_edge >= 0) {
-          xd->mode_info_context[mis].mbmi.mb_skip_coeff = pred_flag;
-          if (xd->mb_to_right_edge >= 0)
-            xd->mode_info_context[mis + 1].mbmi.mb_skip_coeff = pred_flag;
+      if (xd->mode_info_context->mbmi.sb_type) {
+        const int n_mbs = 1 << xd->mode_info_context->mbmi.sb_type;
+        const int x_mbs = sub(n_mbs, xd->mb_to_right_edge >> 7);
+        const int y_mbs = sub(n_mbs, xd->mb_to_bottom_edge >> 7);
+        int x, y;
+
+        for (y = 0; y < y_mbs; y++) {
+          for (x = !y; x < x_mbs; x++) {
+            xd->mode_info_context[y * mis + x].mbmi.mb_skip_coeff = pred_flag;
+          }
         }
       }
 #endif
@@ -288,21 +300,25 @@
   // Currently the prediction for the macroblock segment ID is
   // the value stored for this macroblock in the previous frame.
 #if CONFIG_SUPERBLOCKS
-  if (!xd->mode_info_context->mbmi.encoded_as_sb) {
+  if (!xd->mode_info_context->mbmi.sb_type) {
 #endif
     return cm->last_frame_seg_map[MbIndex];
 #if CONFIG_SUPERBLOCKS
   } else {
-    int seg_id = cm->last_frame_seg_map[MbIndex];
-    int mb_col = MbIndex % cm->mb_cols;
-    int mb_row = MbIndex / cm->mb_cols;
-    if (mb_col + 1 < cm->mb_cols)
-      seg_id = seg_id && cm->last_frame_seg_map[MbIndex + 1];
-    if (mb_row + 1 < cm->mb_rows) {
-      seg_id = seg_id && cm->last_frame_seg_map[MbIndex + cm->mb_cols];
-      if (mb_col + 1 < cm->mb_cols)
-        seg_id = seg_id && cm->last_frame_seg_map[MbIndex + cm->mb_cols + 1];
+    const int n_mbs = 1 << xd->mode_info_context->mbmi.sb_type;
+    const int mb_col = MbIndex % cm->mb_cols;
+    const int mb_row = MbIndex / cm->mb_cols;
+    const int x_mbs = MIN(n_mbs, cm->mb_cols - mb_col);
+    const int y_mbs = MIN(n_mbs, cm->mb_rows - mb_row);
+    int x, y;
+    unsigned seg_id = -1;
+
+    for (y = mb_row; y < mb_row + y_mbs; y++) {
+      for (x = mb_col; x < mb_col + x_mbs; x++) {
+        seg_id = MIN(seg_id, cm->last_frame_seg_map[cm->mb_cols * y + x]);
+      }
     }
+
     return seg_id;
   }
 #endif
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index 01d332f..c1d4a29 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -780,6 +780,70 @@
   }
 #endif
 }
+
+void vp9_build_inter64x64_predictors_sb(MACROBLOCKD *x,
+                                        uint8_t *dst_y,
+                                        uint8_t *dst_u,
+                                        uint8_t *dst_v,
+                                        int dst_ystride,
+                                        int dst_uvstride) {
+  uint8_t *y1 = x->pre.y_buffer, *u1 = x->pre.u_buffer, *v1 = x->pre.v_buffer;
+  uint8_t *y2 = x->second_pre.y_buffer, *u2 = x->second_pre.u_buffer,
+          *v2 = x->second_pre.v_buffer;
+  int edge[4], n;
+
+  edge[0] = x->mb_to_top_edge;
+  edge[1] = x->mb_to_bottom_edge;
+  edge[2] = x->mb_to_left_edge;
+  edge[3] = x->mb_to_right_edge;
+
+  for (n = 0; n < 4; n++) {
+    const int x_idx = n & 1, y_idx = n >> 1;
+
+    x->mb_to_top_edge    = edge[0] -      ((y_idx  * 32) << 3);
+    x->mb_to_bottom_edge = edge[1] + (((1 - y_idx) * 32) << 3);
+    x->mb_to_left_edge   = edge[2] -      ((x_idx  * 32) << 3);
+    x->mb_to_right_edge  = edge[3] + (((1 - x_idx) * 32) << 3);
+
+    x->pre.y_buffer = y1 + y_idx * 32 * x->pre.y_stride  + x_idx * 32;
+    x->pre.u_buffer = u1 + y_idx * 16 * x->pre.uv_stride + x_idx * 16;
+    x->pre.v_buffer = v1 + y_idx * 16 * x->pre.uv_stride + x_idx * 16;
+
+    if (x->mode_info_context->mbmi.second_ref_frame > 0) {
+      x->second_pre.y_buffer = y2 + y_idx * 32 * x->pre.y_stride  + x_idx * 32;
+      x->second_pre.u_buffer = u2 + y_idx * 16 * x->pre.uv_stride + x_idx * 16;
+      x->second_pre.v_buffer = v2 + y_idx * 16 * x->pre.uv_stride + x_idx * 16;
+    }
+
+    vp9_build_inter32x32_predictors_sb(x,
+        dst_y + y_idx * 32 * dst_ystride  + x_idx * 32,
+        dst_u + y_idx * 16 * dst_uvstride + x_idx * 16,
+        dst_v + y_idx * 16 * dst_uvstride + x_idx * 16,
+        dst_ystride, dst_uvstride);
+  }
+
+  x->mb_to_top_edge    = edge[0];
+  x->mb_to_bottom_edge = edge[1];
+  x->mb_to_left_edge   = edge[2];
+  x->mb_to_right_edge  = edge[3];
+
+  x->pre.y_buffer = y1;
+  x->pre.u_buffer = u1;
+  x->pre.v_buffer = v1;
+
+  if (x->mode_info_context->mbmi.second_ref_frame > 0) {
+    x->second_pre.y_buffer = y2;
+    x->second_pre.u_buffer = u2;
+    x->second_pre.v_buffer = v2;
+  }
+
+#if CONFIG_COMP_INTERINTRA_PRED
+  if (x->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) {
+    vp9_build_interintra_64x64_predictors_sb(x, dst_y, dst_u, dst_v,
+                                             dst_ystride, dst_uvstride);
+  }
+#endif
+}
 #endif
 
 /*
diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h
index b104f83..5e45b68 100644
--- a/vp9/common/vp9_reconinter.h
+++ b/vp9/common/vp9_reconinter.h
@@ -54,6 +54,13 @@
                                                uint8_t *dst_v,
                                                int dst_ystride,
                                                int dst_uvstride);
+
+extern void vp9_build_inter64x64_predictors_sb(MACROBLOCKD *x,
+                                               uint8_t *dst_y,
+                                               uint8_t *dst_u,
+                                               uint8_t *dst_v,
+                                               int dst_ystride,
+                                               int dst_uvstride);
 #endif
 
 extern void vp9_build_inter_predictors_mb(MACROBLOCKD *xd);
diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c
index b893df1..3fec98a 100644
--- a/vp9/common/vp9_reconintra.c
+++ b/vp9/common/vp9_reconintra.c
@@ -254,7 +254,7 @@
                                          int up_available, int left_available) {
 
   uint8_t *yabove_row = src - src_stride;
-  uint8_t yleft_col[32];
+  uint8_t yleft_col[64];
   uint8_t ytop_left = yabove_row[-1];
   int r, c, i;
 
@@ -271,15 +271,19 @@
       int average = 0;
       int log2_bsize_minus_1;
 
-      assert(bsize == 4 || bsize == 8 || bsize == 16 || bsize == 32);
+      assert(bsize == 4 || bsize == 8 || bsize == 16 || bsize == 32 ||
+             bsize == 64);
       if (bsize == 4) {
         log2_bsize_minus_1 = 1;
       } else if (bsize == 8) {
         log2_bsize_minus_1 = 2;
       } else if (bsize == 16) {
         log2_bsize_minus_1 = 3;
-      } else /* bsize == 32 */ {
+      } else if (bsize == 32) {
         log2_bsize_minus_1 = 4;
+      } else {
+        assert(bsize == 64);
+        log2_bsize_minus_1 = 5;
       }
 
       if (up_available || left_available) {
@@ -517,16 +521,17 @@
     71,  70,  70,  70,  69,  69,  69,  68,
     68,  68,  68,  68,  67,  67,  67,  67,
   };
-  int size_scale = (size == 32 ? 1 :
+  int size_scale = (size >= 32 ? 1 :
                     size == 16 ? 2 :
                     size == 8  ? 4 : 8);
+  int size_shift = size == 64 ? 1 : 0;
   int i, j;
   switch (mode) {
     case V_PRED:
       for (i = 0; i < size; ++i) {
         for (j = 0; j < size; ++j) {
           int k = i * interstride + j;
-          int scale = weights1d[i * size_scale];
+          int scale = weights1d[i * size_scale >> size_shift];
           interpred[k] =
               ((scale_max - scale) * interpred[k] +
                scale * intrapred[i * intrastride + j] + scale_round)
@@ -539,7 +544,7 @@
       for (i = 0; i < size; ++i) {
         for (j = 0; j < size; ++j) {
           int k = i * interstride + j;
-          int scale = weights1d[j * size_scale];
+          int scale = weights1d[j * size_scale >> size_shift];
           interpred[k] =
               ((scale_max - scale) * interpred[k] +
                scale * intrapred[i * intrastride + j] + scale_round)
@@ -553,8 +558,9 @@
       for (i = 0; i < size; ++i) {
         for (j = 0; j < size; ++j) {
           int k = i * interstride + j;
-          int scale = (weights2d[i * size_scale * 32 + j * size_scale] +
-                       weights1d[i * size_scale]) >> 1;
+          int scale = (weights2d[(i * size_scale >> size_shift) * 32 +
+                                 (j * size_scale >> size_shift)] +
+                       weights1d[i * size_scale >> size_shift]) >> 1;
           interpred[k] =
               ((scale_max - scale) * interpred[k] +
                scale * intrapred[i * intrastride + j] + scale_round)
@@ -568,8 +574,9 @@
       for (i = 0; i < size; ++i) {
         for (j = 0; j < size; ++j) {
           int k = i * interstride + j;
-          int scale = (weights2d[i * size_scale * 32 + j * size_scale] +
-                       weights1d[j * size_scale]) >> 1;
+          int scale = (weights2d[(i * size_scale >> size_shift) * 32 +
+                                 (j * size_scale >> size_shift)] +
+                       weights1d[j * size_scale >> size_shift]) >> 1;
           interpred[k] =
               ((scale_max - scale) * interpred[k] +
                scale * intrapred[i * intrastride + j] + scale_round)
@@ -582,7 +589,8 @@
       for (i = 0; i < size; ++i) {
         for (j = 0; j < size; ++j) {
           int k = i * interstride + j;
-          int scale = weights2d[i * size_scale * 32 + j * size_scale];
+          int scale = weights2d[(i * size_scale >> size_shift) * 32 +
+                                (j * size_scale >> size_shift)];
           interpred[k] =
               ((scale_max - scale) * interpred[k] +
                scale * intrapred[i * intrastride + j] + scale_round)
@@ -695,6 +703,47 @@
   vp9_build_interintra_32x32_predictors_sby(xd, ypred, ystride);
   vp9_build_interintra_32x32_predictors_sbuv(xd, upred, vpred, uvstride);
 }
+
+void vp9_build_interintra_64x64_predictors_sby(MACROBLOCKD *xd,
+                                               uint8_t *ypred,
+                                               int ystride) {
+  uint8_t intrapredictor[4096];
+  const int mode = xd->mode_info_context->mbmi.interintra_mode;
+  vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
+                                      intrapredictor, 64, mode, 64,
+                                      xd->up_available, xd->left_available);
+  combine_interintra(xd->mode_info_context->mbmi.interintra_mode,
+                     ypred, ystride, intrapredictor, 64, 64);
+}
+
+void vp9_build_interintra_64x64_predictors_sbuv(MACROBLOCKD *xd,
+                                                uint8_t *upred,
+                                                uint8_t *vpred,
+                                                int uvstride) {
+  uint8_t uintrapredictor[1024];
+  uint8_t vintrapredictor[1024];
+  const int mode = xd->mode_info_context->mbmi.interintra_uv_mode;
+  vp9_build_intra_predictors_internal(xd->dst.u_buffer, xd->dst.uv_stride,
+                                      uintrapredictor, 32, mode, 32,
+                                      xd->up_available, xd->left_available);
+  vp9_build_intra_predictors_internal(xd->dst.v_buffer, xd->dst.uv_stride,
+                                      vintrapredictor, 32, mode, 32,
+                                      xd->up_available, xd->left_available);
+  combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode,
+                     upred, uvstride, uintrapredictor, 32, 32);
+  combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode,
+                     vpred, uvstride, vintrapredictor, 32, 32);
+}
+
+void vp9_build_interintra_64x64_predictors_sb(MACROBLOCKD *xd,
+                                              uint8_t *ypred,
+                                              uint8_t *upred,
+                                              uint8_t *vpred,
+                                              int ystride,
+                                              int uvstride) {
+  vp9_build_interintra_64x64_predictors_sby(xd, ypred, ystride);
+  vp9_build_interintra_64x64_predictors_sbuv(xd, upred, vpred, uvstride);
+}
 #endif
 #endif
 
@@ -719,6 +768,13 @@
                                       xd->mode_info_context->mbmi.mode, 32,
                                       xd->up_available, xd->left_available);
 }
+
+void vp9_build_intra_predictors_sb64y_s(MACROBLOCKD *xd) {
+  vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
+                                      xd->dst.y_buffer, xd->dst.y_stride,
+                                      xd->mode_info_context->mbmi.mode, 64,
+                                      xd->up_available, xd->left_available);
+}
 #endif
 
 #if CONFIG_COMP_INTRA_PRED
@@ -778,6 +834,13 @@
                                            xd->mode_info_context->mbmi.uv_mode,
                                            16);
 }
+
+void vp9_build_intra_predictors_sb64uv_s(MACROBLOCKD *xd) {
+  vp9_build_intra_predictors_mbuv_internal(xd, xd->dst.u_buffer,
+                                           xd->dst.v_buffer, xd->dst.uv_stride,
+                                           xd->mode_info_context->mbmi.uv_mode,
+                                           32);
+}
 #endif
 
 #if CONFIG_COMP_INTRA_PRED
diff --git a/vp9/common/vp9_reconintra.h b/vp9/common/vp9_reconintra.h
index f3016dd..7bdcb4e 100644
--- a/vp9/common/vp9_reconintra.h
+++ b/vp9/common/vp9_reconintra.h
@@ -41,6 +41,12 @@
                                                      uint8_t *vpred,
                                                      int ystride,
                                                      int uvstride);
+extern void vp9_build_interintra_64x64_predictors_sb(MACROBLOCKD *xd,
+                                                     uint8_t *ypred,
+                                                     uint8_t *upred,
+                                                     uint8_t *vpred,
+                                                     int ystride,
+                                                     int uvstride);
 #endif  // CONFIG_SUPERBLOCKS
 
 #endif  // VP9_COMMON_VP9_RECONINTRA_H_
diff --git a/vp9/common/vp9_reconintra4x4.c b/vp9/common/vp9_reconintra4x4.c
index d61a515..c41b55b 100644
--- a/vp9/common/vp9_reconintra4x4.c
+++ b/vp9/common/vp9_reconintra4x4.c
@@ -434,12 +434,9 @@
  * to the right prediction have filled in pixels to use.
  */
 void vp9_intra_prediction_down_copy(MACROBLOCKD *xd) {
-  int extend_edge = (xd->mb_to_right_edge == 0 && xd->mb_index < 2);
+  int extend_edge = xd->mb_to_right_edge == 0 && xd->mb_index < 2;
   uint8_t *above_right = *(xd->block[0].base_dst) + xd->block[0].dst -
                                xd->block[0].dst_stride + 16;
-  uint32_t *src_ptr = (uint32_t *)
-      (above_right - (xd->mb_index == 3 ? 16 * xd->block[0].dst_stride : 0));
-
   uint32_t *dst_ptr0 = (uint32_t *)above_right;
   uint32_t *dst_ptr1 =
     (uint32_t *)(above_right + 4 * xd->block[0].dst_stride);
@@ -448,6 +445,17 @@
   uint32_t *dst_ptr3 =
     (uint32_t *)(above_right + 12 * xd->block[0].dst_stride);
 
+  uint32_t *src_ptr = (uint32_t *) above_right;
+
+  if ((xd->sb_index >= 2 && xd->mb_to_right_edge == 0) ||
+      (xd->sb_index == 3 && xd->mb_index & 1))
+    src_ptr = (uint32_t *) (((uint8_t *) src_ptr) - 32 *
+                                                    xd->block[0].dst_stride);
+  if (xd->mb_index == 3 ||
+      (xd->mb_to_right_edge == 0 && xd->mb_index == 2))
+    src_ptr = (uint32_t *) (((uint8_t *) src_ptr) - 16 *
+                                                    xd->block[0].dst_stride);
+
   if (extend_edge) {
     *src_ptr = ((uint8_t *) src_ptr)[-1] * 0x01010101U;
   }
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 9cf7121..f02ee02 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -418,6 +418,9 @@
 prototype unsigned int vp9_variance32x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_variance32x32
 
+prototype unsigned int vp9_variance64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance64x64
+
 prototype unsigned int vp9_variance16x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_variance16x16 mmx sse2
 vp9_variance16x16_sse2=vp9_variance16x16_wmt
@@ -443,6 +446,9 @@
 vp9_variance4x4_sse2=vp9_variance4x4_wmt
 vp9_variance4x4_mmx=vp9_variance4x4_mmx
 
+prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp9_sub_pixel_variance64x64
+
 prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"
 specialize vp9_sub_pixel_variance32x32
 
@@ -467,6 +473,9 @@
 specialize vp9_sub_pixel_variance4x4 sse2 mmx
 vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
 
+prototype unsigned int vp9_sad64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"
+specialize vp9_sad64x64
+
 prototype unsigned int vp9_sad32x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"
 specialize vp9_sad32x32
 
@@ -502,6 +511,15 @@
 specialize vp9_variance_halfpixvar16x16_hv mmx sse2
 vp9_variance_halfpixvar16x16_hv_sse2=vp9_variance_halfpixvar16x16_hv_wmt
 
+prototype unsigned int vp9_variance_halfpixvar64x64_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance_halfpixvar64x64_h
+
+prototype unsigned int vp9_variance_halfpixvar64x64_v "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance_halfpixvar64x64_v
+
+prototype unsigned int vp9_variance_halfpixvar64x64_hv "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance_halfpixvar64x64_hv
+
 prototype unsigned int vp9_variance_halfpixvar32x32_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_variance_halfpixvar32x32_h
 
@@ -511,6 +529,9 @@
 prototype unsigned int vp9_variance_halfpixvar32x32_hv "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_variance_halfpixvar32x32_hv
 
+prototype void vp9_sad64x64x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array"
+specialize vp9_sad64x64x3
+
 prototype void vp9_sad32x32x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array"
 specialize vp9_sad32x32x3
 
@@ -529,6 +550,9 @@
 prototype void vp9_sad4x4x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array"
 specialize vp9_sad4x4x3 sse3
 
+prototype void vp9_sad64x64x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint16_t *sad_array"
+specialize vp9_sad64x64x8
+
 prototype void vp9_sad32x32x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint16_t *sad_array"
 specialize vp9_sad32x32x8
 
@@ -547,6 +571,9 @@
 prototype void vp9_sad4x4x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint16_t *sad_array"
 specialize vp9_sad4x4x8 sse4
 
+prototype void vp9_sad64x64x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t **ref_ptr, int  ref_stride, unsigned int *sad_array"
+specialize vp9_sad64x64x4d
+
 prototype void vp9_sad32x32x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t **ref_ptr, int  ref_stride, unsigned int *sad_array"
 specialize vp9_sad32x32x4d
 
@@ -583,6 +610,9 @@
 specialize vp9_mse16x16 mmx sse2
 vp9_mse16x16_sse2=vp9_mse16x16_wmt
 
+prototype unsigned int vp9_sub_pixel_mse64x64 "const uint8_t *src_ptr, int  source_stride, int  xoffset, int  yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp9_sub_pixel_mse64x64
+
 prototype unsigned int vp9_sub_pixel_mse32x32 "const uint8_t *src_ptr, int  source_stride, int  xoffset, int  yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"
 specialize vp9_sub_pixel_mse32x32
 
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index cbd3fb9..bbe2e95 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -14,7 +14,7 @@
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/decoder/vp9_onyxd_int.h"
 #include "vp9/common/vp9_findnearmv.h"
-
+#include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_seg_common.h"
 #include "vp9/common/vp9_pred_common.h"
 #include "vp9/common/vp9_entropy.h"
@@ -122,7 +122,24 @@
   m->mbmi.segment_id = 0;
   if (pbi->mb.update_mb_segmentation_map) {
     read_mb_segid(bc, &m->mbmi, &pbi->mb);
-    pbi->common.last_frame_seg_map[map_index] = m->mbmi.segment_id;
+#if CONFIG_SUPERBLOCKS
+    if (m->mbmi.sb_type) {
+      const int nmbs = 1 << m->mbmi.sb_type;
+      const int ymbs = MIN(cm->mb_rows - mb_row, nmbs);
+      const int xmbs = MIN(cm->mb_cols - mb_col, nmbs);
+      int x, y;
+
+      for (y = 0; y < ymbs; y++) {
+        for (x = 0; x < xmbs; x++) {
+          cm->last_frame_seg_map[map_index + x + y * cm->mb_cols] =
+              m->mbmi.segment_id;
+        }
+      }
+    } else
+#endif
+    {
+      cm->last_frame_seg_map[map_index] = m->mbmi.segment_id;
+    }
   }
 
   m->mbmi.mb_skip_coeff = 0;
@@ -145,7 +162,7 @@
   }
 
 #if CONFIG_SUPERBLOCKS
-  if (m->mbmi.encoded_as_sb) {
+  if (m->mbmi.sb_type) {
     y_mode = (MB_PREDICTION_MODE) read_kf_sb_ymode(bc,
       pbi->common.sb_kf_ymode_prob[pbi->common.kf_ymode_probs_index]);
   } else
@@ -212,12 +229,12 @@
     if (m->mbmi.txfm_size != TX_4X4 && m->mbmi.mode != I8X8_PRED) {
       m->mbmi.txfm_size += vp9_read(bc, cm->prob_tx[1]);
 #if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
-      if (m->mbmi.txfm_size != TX_8X8 && m->mbmi.encoded_as_sb)
+      if (m->mbmi.txfm_size != TX_8X8 && m->mbmi.sb_type)
         m->mbmi.txfm_size += vp9_read(bc, cm->prob_tx[2]);
 #endif
     }
 #if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
-  } else if (cm->txfm_mode >= ALLOW_32X32 && m->mbmi.encoded_as_sb) {
+  } else if (cm->txfm_mode >= ALLOW_32X32 && m->mbmi.sb_type) {
     m->mbmi.txfm_size = TX_32X32;
 #endif
   } else if (cm->txfm_mode >= ALLOW_16X16 && m->mbmi.mode <= TM_PRED) {
@@ -638,14 +655,17 @@
         read_mb_segid(bc, mbmi, xd);
       }
 #if CONFIG_SUPERBLOCKS
-      if (mbmi->encoded_as_sb) {
-        cm->last_frame_seg_map[index] = mbmi->segment_id;
-        if (mb_col + 1 < cm->mb_cols)
-          cm->last_frame_seg_map[index + 1] = mbmi->segment_id;
-        if (mb_row + 1 < cm->mb_rows) {
-          cm->last_frame_seg_map[index + cm->mb_cols] = mbmi->segment_id;
-          if (mb_col + 1 < cm->mb_cols)
-            cm->last_frame_seg_map[index + cm->mb_cols + 1] = mbmi->segment_id;
+      if (mbmi->sb_type) {
+        const int nmbs = 1 << mbmi->sb_type;
+        const int ymbs = MIN(cm->mb_rows - mb_row, nmbs);
+        const int xmbs = MIN(cm->mb_cols - mb_col, nmbs);
+        int x, y;
+
+        for (y = 0; y < ymbs; y++) {
+          for (x = 0; x < xmbs; x++) {
+            cm->last_frame_seg_map[index + x + y * cm->mb_cols] =
+                mbmi->segment_id;
+          }
         }
       } else
 #endif
@@ -654,18 +674,21 @@
       }
     } else {
 #if CONFIG_SUPERBLOCKS
-      if (mbmi->encoded_as_sb) {
-        mbmi->segment_id = cm->last_frame_seg_map[index];
-        if (mb_col < cm->mb_cols - 1)
-          mbmi->segment_id = mbmi->segment_id &&
-                             cm->last_frame_seg_map[index + 1];
-        if (mb_row < cm->mb_rows - 1) {
-          mbmi->segment_id = mbmi->segment_id &&
-                             cm->last_frame_seg_map[index + cm->mb_cols];
-          if (mb_col < cm->mb_cols - 1)
-            mbmi->segment_id = mbmi->segment_id &&
-                               cm->last_frame_seg_map[index + cm->mb_cols + 1];
+      if (mbmi->sb_type) {
+        const int nmbs = 1 << mbmi->sb_type;
+        const int ymbs = MIN(cm->mb_rows - mb_row, nmbs);
+        const int xmbs = MIN(cm->mb_cols - mb_col, nmbs);
+        unsigned segment_id = -1;
+        int x, y;
+
+        for (y = 0; y < ymbs; y++) {
+          for (x = 0; x < xmbs; x++) {
+            segment_id = MIN(segment_id,
+                             cm->last_frame_seg_map[index + x +
+                                                    y * cm->mb_cols]);
+          }
         }
+        mbmi->segment_id = segment_id;
       } else
 #endif
       {
@@ -693,6 +716,11 @@
   int mb_to_right_edge;
   int mb_to_top_edge;
   int mb_to_bottom_edge;
+#if CONFIG_SUPERBLOCKS
+  const int mb_size = 1 << mi->mbmi.sb_type;
+#else
+  const int mb_size = 1;
+#endif
 
   mb_to_top_edge = xd->mb_to_top_edge;
   mb_to_bottom_edge = xd->mb_to_bottom_edge;
@@ -707,18 +735,8 @@
   xd->mb_to_left_edge =
     mb_to_left_edge = -((mb_col * 16) << 3);
   mb_to_left_edge -= LEFT_TOP_MARGIN;
-
-#if CONFIG_SUPERBLOCKS
-  if (mi->mbmi.encoded_as_sb) {
-    xd->mb_to_right_edge =
-      mb_to_right_edge = ((pbi->common.mb_cols - 2 - mb_col) * 16) << 3;
-  } else {
-#endif
-    xd->mb_to_right_edge =
-      mb_to_right_edge = ((pbi->common.mb_cols - 1 - mb_col) * 16) << 3;
-#if CONFIG_SUPERBLOCKS
-  }
-#endif
+  xd->mb_to_right_edge =
+      mb_to_right_edge = ((pbi->common.mb_cols - mb_size - mb_col) * 16) << 3;
   mb_to_right_edge += RIGHT_BOTTOM_MARGIN;
 
   // Make sure the MACROBLOCKD mode info pointer is pointed at the
@@ -801,7 +819,7 @@
           vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_MODE);
       } else {
 #if CONFIG_SUPERBLOCKS
-        if (mbmi->encoded_as_sb)
+        if (mbmi->sb_type)
           mbmi->mode = read_sb_mv_ref(bc, mv_ref_p);
         else
 #endif
@@ -1155,7 +1173,7 @@
       mbmi->mode = (MB_PREDICTION_MODE)
                    vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_MODE);
 #if CONFIG_SUPERBLOCKS
-    } else if (mbmi->encoded_as_sb) {
+    } else if (mbmi->sb_type) {
       mbmi->mode = (MB_PREDICTION_MODE)
                    read_sb_ymode(bc, pbi->common.fc.sb_ymode_prob);
       pbi->common.fc.sb_ymode_counts[mbmi->mode]++;
@@ -1232,12 +1250,12 @@
         mbmi->mode != SPLITMV) {
       mbmi->txfm_size += vp9_read(bc, cm->prob_tx[1]);
 #if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
-      if (mbmi->encoded_as_sb && mbmi->txfm_size != TX_8X8)
+      if (mbmi->sb_type && mbmi->txfm_size != TX_8X8)
         mbmi->txfm_size += vp9_read(bc, cm->prob_tx[2]);
 #endif
     }
 #if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
-  } else if (mbmi->encoded_as_sb && cm->txfm_mode >= ALLOW_32X32) {
+  } else if (mbmi->sb_type && cm->txfm_mode >= ALLOW_32X32) {
     mbmi->txfm_size = TX_32X32;
 #endif
   } else if (cm->txfm_mode >= ALLOW_16X16 &&
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index af34582..d524ade 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -10,6 +10,7 @@
 
 
 #include "vp9/decoder/vp9_onyxd_int.h"
+#include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_header.h"
 #include "vp9/common/vp9_reconintra.h"
 #include "vp9/common/vp9_reconintra4x4.h"
@@ -172,55 +173,69 @@
 static void skip_recon_mb(VP9D_COMP *pbi, MACROBLOCKD *xd) {
   if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
 #if CONFIG_SUPERBLOCKS
-    if (xd->mode_info_context->mbmi.encoded_as_sb) {
+#if CONFIG_SUPERBLOCKS64
+    if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64) {
+      vp9_build_intra_predictors_sb64uv_s(xd);
+      vp9_build_intra_predictors_sb64y_s(xd);
+    } else
+#endif  // CONFIG_SUPERBLOCKS64
+    if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB32X32) {
       vp9_build_intra_predictors_sbuv_s(xd);
       vp9_build_intra_predictors_sby_s(xd);
-    } else {
-#endif
-    vp9_build_intra_predictors_mbuv_s(xd);
-    vp9_build_intra_predictors_mby_s(xd);
-#if CONFIG_SUPERBLOCKS
+    } else
+#endif  // CONFIG_SUPERBLOCKS
+    {
+      vp9_build_intra_predictors_mbuv_s(xd);
+      vp9_build_intra_predictors_mby_s(xd);
     }
-#endif
   } else {
 #if CONFIG_SUPERBLOCKS
-    if (xd->mode_info_context->mbmi.encoded_as_sb) {
+#if CONFIG_SUPERBLOCKS64
+    if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64) {
+      vp9_build_inter64x64_predictors_sb(xd,
+                                         xd->dst.y_buffer,
+                                         xd->dst.u_buffer,
+                                         xd->dst.v_buffer,
+                                         xd->dst.y_stride,
+                                         xd->dst.uv_stride);
+    } else
+#endif  // CONFIG_SUPERBLOCKS64
+    if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB32X32) {
       vp9_build_inter32x32_predictors_sb(xd,
                                          xd->dst.y_buffer,
                                          xd->dst.u_buffer,
                                          xd->dst.v_buffer,
                                          xd->dst.y_stride,
                                          xd->dst.uv_stride);
-    } else {
-#endif
-    vp9_build_1st_inter16x16_predictors_mb(xd,
-                                           xd->dst.y_buffer,
-                                           xd->dst.u_buffer,
-                                           xd->dst.v_buffer,
-                                           xd->dst.y_stride,
-                                           xd->dst.uv_stride);
-
-    if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
-      vp9_build_2nd_inter16x16_predictors_mb(xd,
+    } else
+#endif  // CONFIG_SUPERBLOCKS
+    {
+      vp9_build_1st_inter16x16_predictors_mb(xd,
                                              xd->dst.y_buffer,
                                              xd->dst.u_buffer,
                                              xd->dst.v_buffer,
                                              xd->dst.y_stride,
                                              xd->dst.uv_stride);
-    }
-#if CONFIG_COMP_INTERINTRA_PRED
-    else if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) {
-      vp9_build_interintra_16x16_predictors_mb(xd,
+
+      if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
+        vp9_build_2nd_inter16x16_predictors_mb(xd,
                                                xd->dst.y_buffer,
                                                xd->dst.u_buffer,
                                                xd->dst.v_buffer,
                                                xd->dst.y_stride,
                                                xd->dst.uv_stride);
-    }
+      }
+#if CONFIG_COMP_INTERINTRA_PRED
+      else if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) {
+        vp9_build_interintra_16x16_predictors_mb(xd,
+                                                 xd->dst.y_buffer,
+                                                 xd->dst.u_buffer,
+                                                 xd->dst.v_buffer,
+                                                 xd->dst.y_stride,
+                                                 xd->dst.uv_stride);
+      }
 #endif
-#if CONFIG_SUPERBLOCKS
     }
-#endif
   }
 }
 
@@ -546,8 +561,9 @@
 
 #if CONFIG_SUPERBLOCKS
 static void decode_16x16_sb(VP9D_COMP *pbi, MACROBLOCKD *xd,
-                            BOOL_DECODER* const bc, int n) {
-  int x_idx = n & 1, y_idx = n >> 1;
+                            BOOL_DECODER* const bc, int n,
+                            int maska, int shiftb) {
+  int x_idx = n & maska, y_idx = n >> shiftb;
   TX_TYPE tx_type = get_tx_type_16x16(xd, &xd->block[0]);
   if (tx_type != DCT_DCT) {
     vp9_ht_dequant_idct_add_16x16_c(
@@ -571,9 +587,10 @@
 };
 
 static void decode_8x8_sb(VP9D_COMP *pbi, MACROBLOCKD *xd,
-                          BOOL_DECODER* const bc, int n) {
+                          BOOL_DECODER* const bc, int n,
+                          int maska, int shiftb) {
+  int x_idx = n & maska, y_idx = n >> shiftb;
   BLOCKD *b = &xd->block[24];
-  int x_idx = n & 1, y_idx = n >> 1;
   TX_TYPE tx_type = get_tx_type_8x8(xd, &xd->block[0]);
   if (tx_type != DCT_DCT) {
     int i;
@@ -632,9 +649,10 @@
 };
 
 static void decode_4x4_sb(VP9D_COMP *pbi, MACROBLOCKD *xd,
-                          BOOL_DECODER* const bc, int n) {
+                          BOOL_DECODER* const bc, int n,
+                          int maska, int shiftb) {
+  int x_idx = n & maska, y_idx = n >> shiftb;
   BLOCKD *b = &xd->block[24];
-  int x_idx = n & 1, y_idx = n >> 1;
   TX_TYPE tx_type = get_tx_type_4x4(xd, &xd->block[0]);
   if (tx_type != DCT_DCT) {
     int i;
@@ -687,16 +705,148 @@
       xd->dst.uv_stride, xd->eobs + 16, xd);
 };
 
-static void decode_superblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
-                              int mb_row, unsigned int mb_col,
-                              BOOL_DECODER* const bc) {
+#if CONFIG_SUPERBLOCKS64
+static void decode_superblock64(VP9D_COMP *pbi, MACROBLOCKD *xd,
+                                int mb_row, unsigned int mb_col,
+                                BOOL_DECODER* const bc) {
   int i, n, eobtotal;
   TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
   VP9_COMMON *const pc = &pbi->common;
   MODE_INFO *orig_mi = xd->mode_info_context;
   const int mis = pc->mode_info_stride;
 
-  assert(xd->mode_info_context->mbmi.encoded_as_sb);
+  assert(xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64);
+
+  if (pbi->common.frame_type != KEY_FRAME)
+    vp9_setup_interp_filters(xd, xd->mode_info_context->mbmi.interp_filter, pc);
+
+  // re-initialize macroblock dequantizer before detokenization
+  if (xd->segmentation_enabled)
+    mb_init_dequantizer(pbi, xd);
+
+  if (xd->mode_info_context->mbmi.mb_skip_coeff) {
+    int n;
+
+    vp9_reset_mb_tokens_context(xd);
+    for (n = 1; n <= 3; n++) {
+      if (mb_col < pc->mb_cols - n)
+        xd->above_context += n;
+      if (mb_row < pc->mb_rows - n)
+        xd->left_context += n;
+      vp9_reset_mb_tokens_context(xd);
+      if (mb_col < pc->mb_cols - n)
+        xd->above_context -= n;
+      if (mb_row < pc->mb_rows - n)
+        xd->left_context -= n;
+    }
+
+    /* Special case:  Force the loopfilter to skip when eobtotal and
+     * mb_skip_coeff are zero.
+     */
+    skip_recon_mb(pbi, xd);
+    return;
+  }
+
+  /* do prediction */
+  if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
+    vp9_build_intra_predictors_sb64y_s(xd);
+    vp9_build_intra_predictors_sb64uv_s(xd);
+  } else {
+    vp9_build_inter64x64_predictors_sb(xd, xd->dst.y_buffer,
+                                       xd->dst.u_buffer, xd->dst.v_buffer,
+                                       xd->dst.y_stride, xd->dst.uv_stride);
+  }
+
+  /* dequantization and idct */
+#if CONFIG_TX32X32
+  if (xd->mode_info_context->mbmi.txfm_size == TX_32X32) {
+    for (n = 0; n < 4; n++) {
+      const int x_idx = n & 1, y_idx = n >> 1;
+
+      if (mb_col + x_idx * 2 >= pc->mb_cols ||
+          mb_row + y_idx * 2 >= pc->mb_rows)
+        continue;
+
+      xd->left_context = pc->left_context + (y_idx << 1);
+      xd->above_context = pc->above_context + mb_col + (x_idx << 1);
+      xd->mode_info_context = orig_mi + x_idx * 2 + y_idx * 2 * mis;
+      eobtotal = vp9_decode_sb_tokens(pbi, xd, bc);
+      if (eobtotal == 0) {  // skip loopfilter
+        xd->mode_info_context->mbmi.mb_skip_coeff = 1;
+        if (mb_col + 1 < pc->mb_cols)
+          xd->mode_info_context[1].mbmi.mb_skip_coeff = 1;
+        if (mb_row + 1 < pc->mb_rows) {
+          xd->mode_info_context[mis].mbmi.mb_skip_coeff = 1;
+          if (mb_col + 1 < pc->mb_cols)
+            xd->mode_info_context[mis + 1].mbmi.mb_skip_coeff = 1;
+        }
+      } else {
+        vp9_dequant_idct_add_32x32(xd->sb_coeff_data.qcoeff, xd->block[0].dequant,
+                                   xd->dst.y_buffer + x_idx * 32 +
+                                       xd->dst.y_stride * y_idx * 32,
+                                   xd->dst.y_buffer + x_idx * 32 +
+                                       xd->dst.y_stride * y_idx * 32,
+                                   xd->dst.y_stride, xd->dst.y_stride,
+                                   xd->eobs[0]);
+        vp9_dequant_idct_add_uv_block_16x16_c(xd->sb_coeff_data.qcoeff + 1024,
+                                              xd->block[16].dequant,
+                                              xd->dst.u_buffer + x_idx * 16 +
+                                                xd->dst.uv_stride * y_idx * 16,
+                                              xd->dst.v_buffer + x_idx * 16 +
+                                                xd->dst.uv_stride * y_idx * 16,
+                                              xd->dst.uv_stride, xd->eobs + 16);
+      }
+    }
+  } else {
+#endif
+    for (n = 0; n < 16; n++) {
+      int x_idx = n & 3, y_idx = n >> 2;
+
+      if (mb_col + x_idx >= pc->mb_cols || mb_row + y_idx >= pc->mb_rows)
+        continue;
+
+      xd->above_context = pc->above_context + mb_col + x_idx;
+      xd->left_context = pc->left_context + y_idx;
+      xd->mode_info_context = orig_mi + x_idx + y_idx * mis;
+      for (i = 0; i < 25; i++) {
+        xd->block[i].eob = 0;
+        xd->eobs[i] = 0;
+      }
+
+      eobtotal = vp9_decode_mb_tokens(pbi, xd, bc);
+      if (eobtotal == 0) {  // skip loopfilter
+        xd->mode_info_context->mbmi.mb_skip_coeff = 1;
+        continue;
+      }
+
+      if (tx_size == TX_16X16) {
+        decode_16x16_sb(pbi, xd, bc, n, 3, 2);
+      } else if (tx_size == TX_8X8) {
+        decode_8x8_sb(pbi, xd, bc, n, 3, 2);
+      } else {
+        decode_4x4_sb(pbi, xd, bc, n, 3, 2);
+      }
+    }
+#if CONFIG_TX32X32
+  }
+#endif
+
+  xd->above_context = pc->above_context + mb_col;
+  xd->left_context = pc->left_context;
+  xd->mode_info_context = orig_mi;
+}
+#endif  // CONFIG_SUPERBLOCKS64
+
+static void decode_superblock32(VP9D_COMP *pbi, MACROBLOCKD *xd,
+                                int mb_row, unsigned int mb_col,
+                                BOOL_DECODER* const bc) {
+  int i, n, eobtotal;
+  TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
+  VP9_COMMON *const pc = &pbi->common;
+  MODE_INFO *orig_mi = xd->mode_info_context;
+  const int mis = pc->mode_info_stride;
+
+  assert(xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB32X32);
 
   if (pbi->common.frame_type != KEY_FRAME)
     vp9_setup_interp_filters(xd, xd->mode_info_context->mbmi.interp_filter, pc);
@@ -767,7 +917,7 @@
 
 
     xd->above_context = pc->above_context + mb_col + x_idx;
-    xd->left_context = pc->left_context + y_idx;
+    xd->left_context = pc->left_context + y_idx + (mb_row & 2);
     xd->mode_info_context = orig_mi + x_idx + y_idx * mis;
     for (i = 0; i < 25; i++) {
       xd->block[i].eob = 0;
@@ -781,16 +931,16 @@
     }
 
     if (tx_size == TX_16X16) {
-      decode_16x16_sb(pbi, xd, bc, n);
+      decode_16x16_sb(pbi, xd, bc, n, 1, 1);
     } else if (tx_size == TX_8X8) {
-      decode_8x8_sb(pbi, xd, bc, n);
+      decode_8x8_sb(pbi, xd, bc, n, 1, 1);
     } else {
-      decode_4x4_sb(pbi, xd, bc, n);
+      decode_4x4_sb(pbi, xd, bc, n, 1, 1);
     }
   }
 
   xd->above_context = pc->above_context + mb_col;
-  xd->left_context = pc->left_context;
+  xd->left_context = pc->left_context + (mb_row & 2);
   xd->mode_info_context = orig_mi;
 #if CONFIG_TX32X32
   }
@@ -807,7 +957,7 @@
   int tx_size;
 
 #if CONFIG_SUPERBLOCKS
-  assert(!xd->mode_info_context->mbmi.encoded_as_sb);
+  assert(!xd->mode_info_context->mbmi.sb_type);
 #endif
 
   // re-initialize macroblock dequantizer before detokenization
@@ -930,190 +1080,186 @@
 FILE *vpxlog = 0;
 #endif
 
+static void set_offsets(VP9D_COMP *pbi, int block_size,
+                        int mb_row, int mb_col) {
+  VP9_COMMON *const cm = &pbi->common;
+  MACROBLOCKD *const xd = &pbi->mb;
+  const int mis = cm->mode_info_stride;
+  const int idx = mis * mb_row + mb_col;
+  const int dst_fb_idx = cm->new_fb_idx;
+  const int recon_y_stride = cm->yv12_fb[dst_fb_idx].y_stride;
+  const int recon_uv_stride = cm->yv12_fb[dst_fb_idx].uv_stride;
+  const int recon_yoffset = mb_row * 16 * recon_y_stride + 16 * mb_col;
+  const int recon_uvoffset = mb_row * 8 * recon_uv_stride + 8 * mb_col;
+
+  xd->mode_info_context = cm->mi + idx;
+#if CONFIG_SUPERBLOCKS
+  xd->mode_info_context->mbmi.sb_type = block_size >> 5;
+#endif
+  xd->prev_mode_info_context = cm->prev_mi + idx;
+  xd->above_context = cm->above_context + mb_col;
+  xd->left_context = cm->left_context + (mb_row & 3);
+
+  /* Distance of Mb to the various image edges.
+   * These are specified to 8th pel as they are always compared to
+   * values that are in 1/8th pel units
+   */
+  block_size >>= 4;  // in mb units
+  xd->mb_to_top_edge = -((mb_row * 16)) << 3;
+  xd->mb_to_left_edge = -((mb_col * 16) << 3);
+  xd->mb_to_bottom_edge = ((cm->mb_rows - block_size - mb_row) * 16) << 3;
+  xd->mb_to_right_edge = ((cm->mb_cols - block_size - mb_col) * 16) << 3;
+
+  xd->up_available = (mb_row != 0);
+  xd->left_available = (mb_col != 0);
+
+  xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
+  xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
+  xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
+}
+
+static void set_refs(VP9D_COMP *pbi, int block_size,
+                     int mb_row, int mb_col) {
+  VP9_COMMON *const cm = &pbi->common;
+  MACROBLOCKD *const xd = &pbi->mb;
+  MODE_INFO *mi = xd->mode_info_context;
+  MB_MODE_INFO *const mbmi = &mi->mbmi;
+
+  if (mbmi->ref_frame > INTRA_FRAME) {
+    int ref_fb_idx, ref_yoffset, ref_uvoffset, ref_y_stride, ref_uv_stride;
+
+    /* Select the appropriate reference frame for this MB */
+    if (mbmi->ref_frame == LAST_FRAME)
+      ref_fb_idx = cm->lst_fb_idx;
+    else if (mbmi->ref_frame == GOLDEN_FRAME)
+      ref_fb_idx = cm->gld_fb_idx;
+    else
+      ref_fb_idx = cm->alt_fb_idx;
+
+    ref_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
+    ref_yoffset = mb_row * 16 * ref_y_stride + 16 * mb_col;
+    xd->pre.y_buffer = cm->yv12_fb[ref_fb_idx].y_buffer + ref_yoffset;
+    ref_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
+    ref_uvoffset = mb_row * 8 * ref_uv_stride + 8 * mb_col;
+    xd->pre.u_buffer = cm->yv12_fb[ref_fb_idx].u_buffer + ref_uvoffset;
+    xd->pre.v_buffer = cm->yv12_fb[ref_fb_idx].v_buffer + ref_uvoffset;
+
+    /* propagate errors from reference frames */
+    xd->corrupted |= cm->yv12_fb[ref_fb_idx].corrupted;
+
+    if (mbmi->second_ref_frame > INTRA_FRAME) {
+      int second_ref_fb_idx;
+
+      /* Select the appropriate reference frame for this MB */
+      if (mbmi->second_ref_frame == LAST_FRAME)
+        second_ref_fb_idx = cm->lst_fb_idx;
+      else if (mbmi->second_ref_frame == GOLDEN_FRAME)
+        second_ref_fb_idx = cm->gld_fb_idx;
+      else
+        second_ref_fb_idx = cm->alt_fb_idx;
+
+      xd->second_pre.y_buffer =
+          cm->yv12_fb[second_ref_fb_idx].y_buffer + ref_yoffset;
+      xd->second_pre.u_buffer =
+          cm->yv12_fb[second_ref_fb_idx].u_buffer + ref_uvoffset;
+      xd->second_pre.v_buffer =
+          cm->yv12_fb[second_ref_fb_idx].v_buffer + ref_uvoffset;
+
+      /* propagate errors from reference frames */
+      xd->corrupted |= cm->yv12_fb[second_ref_fb_idx].corrupted;
+    }
+  }
+
+#if CONFIG_SUPERBLOCKS
+  if (mbmi->sb_type) {
+    const int n_mbs = 1 << mbmi->sb_type;
+    const int y_mbs = MIN(n_mbs, cm->mb_rows - mb_row);
+    const int x_mbs = MIN(n_mbs, cm->mb_cols - mb_col);
+    const int mis = cm->mode_info_stride;
+    int x, y;
+
+    for (y = 0; y < y_mbs; y++) {
+      for (x = !y; x < x_mbs; x++) {
+        mi[y * mis + x] = *mi;
+      }
+    }
+  }
+#endif
+}
+
 /* Decode a row of Superblocks (2x2 region of MBs) */
-static void
-decode_sb_row(VP9D_COMP *pbi, VP9_COMMON *pc, int mbrow, MACROBLOCKD *xd,
-              BOOL_DECODER* const bc) {
-  int i;
-  int sb_col;
-  int mb_row, mb_col;
-  int recon_yoffset, recon_uvoffset;
-  int ref_fb_idx = pc->lst_fb_idx;
-  int dst_fb_idx = pc->new_fb_idx;
-  int recon_y_stride = pc->yv12_fb[ref_fb_idx].y_stride;
-  int recon_uv_stride = pc->yv12_fb[ref_fb_idx].uv_stride;
-  int row_delta[4] = { 0, +1,  0, -1};
-  int col_delta[4] = { +1, -1, +1, +1};
-  int sb_cols = (pc->mb_cols + 1) >> 1;
+static void decode_sb_row(VP9D_COMP *pbi, VP9_COMMON *pc,
+                          int mb_row, MACROBLOCKD *xd,
+                          BOOL_DECODER* const bc) {
+  int mb_col;
 
   // For a SB there are 2 left contexts, each pertaining to a MB row within
   vpx_memset(pc->left_context, 0, sizeof(pc->left_context));
 
-  mb_row = mbrow;
-  mb_col = 0;
-
-  for (sb_col = 0; sb_col < sb_cols; sb_col++) {
-    MODE_INFO *mi = xd->mode_info_context;
-
-#if CONFIG_SUPERBLOCKS
-    mi->mbmi.encoded_as_sb = vp9_read(bc, pc->sb_coded);
-#endif
-
-    // Process the 4 MBs within the SB in the order:
-    // top-left, top-right, bottom-left, bottom-right
-    for (i = 0; i < 4; i++) {
-      int dy = row_delta[i];
-      int dx = col_delta[i];
-      int offset_extended = dy * xd->mode_info_stride + dx;
-
-      xd->mb_index = i;
-
-      mi = xd->mode_info_context;
-      if ((mb_row >= pc->mb_rows) || (mb_col >= pc->mb_cols)) {
-        // MB lies outside frame, skip on to next
-        mb_row += dy;
-        mb_col += dx;
-        xd->mode_info_context += offset_extended;
-        xd->prev_mode_info_context += offset_extended;
-        continue;
-      }
-#if CONFIG_SUPERBLOCKS
-      if (i)
-        mi->mbmi.encoded_as_sb = 0;
-#endif
-
-      // Set above context pointer
-      xd->above_context = pc->above_context + mb_col;
-      xd->left_context = pc->left_context + (i >> 1);
-
-      /* Distance of Mb to the various image edges.
-       * These are specified to 8th pel as they are always compared to
-       * values that are in 1/8th pel units
-       */
-      xd->mb_to_top_edge = -((mb_row * 16)) << 3;
-      xd->mb_to_left_edge = -((mb_col * 16) << 3);
-#if CONFIG_SUPERBLOCKS
-      if (mi->mbmi.encoded_as_sb) {
-        xd->mb_to_bottom_edge = ((pc->mb_rows - 2 - mb_row) * 16) << 3;
-        xd->mb_to_right_edge = ((pc->mb_cols - 2 - mb_col) * 16) << 3;
-      } else {
-#endif
-        xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
-        xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
-#if CONFIG_SUPERBLOCKS
-      }
-#endif
-#ifdef DEC_DEBUG
-      dec_debug = (pbi->common.current_video_frame == 46 &&
-                   mb_row == 5 && mb_col == 2);
-      if (dec_debug)
-#if CONFIG_SUPERBLOCKS
-        printf("Enter Debug %d %d sb %d\n", mb_row, mb_col,
-               mi->mbmi.encoded_as_sb);
-#else
-        printf("Enter Debug %d %d\n", mb_row, mb_col);
-#endif
-#endif
-      xd->up_available = (mb_row != 0);
-      xd->left_available = (mb_col != 0);
-
-
-      recon_yoffset = (mb_row * recon_y_stride * 16) + (mb_col * 16);
-      recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col * 8);
-
-      xd->dst.y_buffer = pc->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
-      xd->dst.u_buffer = pc->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
-      xd->dst.v_buffer = pc->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
-
+  for (mb_col = 0; mb_col < pc->mb_cols; mb_col += 4) {
+#if CONFIG_SUPERBLOCKS64 && CONFIG_SUPERBLOCKS
+    if (vp9_read(bc, pc->sb64_coded)) {
+      set_offsets(pbi, 64, mb_row, mb_col);
       vp9_decode_mb_mode_mv(pbi, xd, mb_row, mb_col, bc);
+      set_refs(pbi, 64, mb_row, mb_col);
+      decode_superblock64(pbi, xd, mb_row, mb_col, bc);
+      xd->corrupted |= bool_error(bc);
+    } else
+#endif  // CONFIG_SUPERBLOCKS64
+    {
+      int j;
 
-      update_blockd_bmi(xd);
-#ifdef DEC_DEBUG
-      if (dec_debug)
-        printf("Hello\n");
-#endif
+      for (j = 0; j < 4; j++) {
+        const int x_idx_sb = (j & 1) << 1, y_idx_sb = j & 2;
 
-      /* Select the appropriate reference frame for this MB */
-      if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
-        ref_fb_idx = pc->lst_fb_idx;
-      else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
-        ref_fb_idx = pc->gld_fb_idx;
-      else
-        ref_fb_idx = pc->alt_fb_idx;
+        if (mb_row + y_idx_sb >= pc->mb_rows ||
+            mb_col + x_idx_sb >= pc->mb_cols) {
+          // MB lies outside frame, skip on to next
+          continue;
+        }
 
-      xd->pre.y_buffer = pc->yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
-      xd->pre.u_buffer = pc->yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
-      xd->pre.v_buffer = pc->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
-
-      if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
-        int second_ref_fb_idx;
-
-        /* Select the appropriate reference frame for this MB */
-        if (xd->mode_info_context->mbmi.second_ref_frame == LAST_FRAME)
-          second_ref_fb_idx = pc->lst_fb_idx;
-        else if (xd->mode_info_context->mbmi.second_ref_frame ==
-                 GOLDEN_FRAME)
-          second_ref_fb_idx = pc->gld_fb_idx;
-        else
-          second_ref_fb_idx = pc->alt_fb_idx;
-
-        xd->second_pre.y_buffer =
-          pc->yv12_fb[second_ref_fb_idx].y_buffer + recon_yoffset;
-        xd->second_pre.u_buffer =
-          pc->yv12_fb[second_ref_fb_idx].u_buffer + recon_uvoffset;
-        xd->second_pre.v_buffer =
-          pc->yv12_fb[second_ref_fb_idx].v_buffer + recon_uvoffset;
-      }
-
-      if (xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME) {
-        /* propagate errors from reference frames */
-        xd->corrupted |= pc->yv12_fb[ref_fb_idx].corrupted;
-      }
+        xd->sb_index = j;
 
 #if CONFIG_SUPERBLOCKS
-      if (xd->mode_info_context->mbmi.encoded_as_sb) {
-        if (mb_col < pc->mb_cols - 1)
-          mi[1] = mi[0];
-        if (mb_row < pc->mb_rows - 1) {
-          mi[pc->mode_info_stride] = mi[0];
-          if (mb_col < pc->mb_cols - 1)
-            mi[pc->mode_info_stride + 1] = mi[0];
+        if (vp9_read(bc, pc->sb32_coded)) {
+          set_offsets(pbi, 32, mb_row + y_idx_sb, mb_col + x_idx_sb);
+          vp9_decode_mb_mode_mv(pbi,
+                                xd, mb_row + y_idx_sb, mb_col + x_idx_sb, bc);
+          set_refs(pbi, 32, mb_row + y_idx_sb, mb_col + x_idx_sb);
+          decode_superblock32(pbi,
+                              xd, mb_row + y_idx_sb, mb_col + x_idx_sb, bc);
+          xd->corrupted |= bool_error(bc);
+        } else
+#endif  // CONFIG_SUPERBLOCKS
+        {
+          int i;
+
+          // Process the 4 MBs within the SB in the order:
+          // top-left, top-right, bottom-left, bottom-right
+          for (i = 0; i < 4; i++) {
+            const int x_idx = x_idx_sb + (i & 1), y_idx = y_idx_sb + (i >> 1);
+
+            if (mb_row + y_idx >= pc->mb_rows ||
+                mb_col + x_idx >= pc->mb_cols) {
+              // MB lies outside frame, skip on to next
+              continue;
+            }
+
+            set_offsets(pbi, 16, mb_row + y_idx, mb_col + x_idx);
+            xd->mb_index = i;
+            vp9_decode_mb_mode_mv(pbi, xd, mb_row + y_idx, mb_col + x_idx, bc);
+            update_blockd_bmi(xd);
+            set_refs(pbi, 16, mb_row + y_idx, mb_col + x_idx);
+            vp9_intra_prediction_down_copy(xd);
+            decode_macroblock(pbi, xd, mb_row + y_idx, mb_col + x_idx, bc);
+
+            /* check if the boolean decoder has suffered an error */
+            xd->corrupted |= bool_error(bc);
+          }
         }
       }
-      if (xd->mode_info_context->mbmi.encoded_as_sb) {
-        decode_superblock(pbi, xd, mb_row, mb_col, bc);
-      } else {
-#endif
-        vp9_intra_prediction_down_copy(xd);
-        decode_macroblock(pbi, xd, mb_row, mb_col, bc);
-#if CONFIG_SUPERBLOCKS
-      }
-#endif
-
-      /* check if the boolean decoder has suffered an error */
-      xd->corrupted |= bool_error(bc);
-
-#if CONFIG_SUPERBLOCKS
-      if (mi->mbmi.encoded_as_sb) {
-        assert(!i);
-        mb_col += 2;
-        xd->mode_info_context += 2;
-        xd->prev_mode_info_context += 2;
-        break;
-      }
-#endif
-
-      // skip to next MB
-      xd->mode_info_context += offset_extended;
-      xd->prev_mode_info_context += offset_extended;
-      mb_row += dy;
-      mb_col += dx;
     }
   }
-
-  /* skip prediction column */
-  xd->mode_info_context += 1 - (pc->mb_cols & 0x1) + xd->mode_info_stride;
-  xd->prev_mode_info_context += 1 - (pc->mb_cols & 0x1) + xd->mode_info_stride;
 }
 
 static unsigned int read_partition_size(const unsigned char *cx_size) {
@@ -1462,7 +1608,10 @@
   }
 
 #if CONFIG_SUPERBLOCKS
-  pc->sb_coded = vp9_read_literal(&header_bc, 8);
+#if CONFIG_SUPERBLOCKS64
+  pc->sb64_coded = vp9_read_literal(&header_bc, 8);
+#endif
+  pc->sb32_coded = vp9_read_literal(&header_bc, 8);
 #endif
 
   /* Read the loop filter level and type */
@@ -1727,12 +1876,8 @@
 
   vpx_memset(pc->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * pc->mb_cols);
 
-  // Resset the macroblock mode info context to the start of the list
-  xd->mode_info_context = pc->mi;
-  xd->prev_mode_info_context = pc->prev_mi;
-
   /* Decode a row of superblocks */
-  for (mb_row = 0; mb_row < pc->mb_rows; mb_row += 2) {
+  for (mb_row = 0; mb_row < pc->mb_rows; mb_row += 4) {
     decode_sb_row(pbi, pc, mb_row, xd, &residual_bc);
   }
   corrupt_tokens |= xd->corrupted;
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 956c16c..a8fdc66 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -562,19 +562,7 @@
                            const MB_MODE_INFO *mi, const MACROBLOCKD *xd) {
   // Encode the MB segment id.
   int seg_id = mi->segment_id;
-#if CONFIG_SUPERBLOCKS
-  if (mi->encoded_as_sb) {
-    if (xd->mb_to_right_edge >= 0)
-      seg_id = seg_id && xd->mode_info_context[1].mbmi.segment_id;
-    if (xd->mb_to_bottom_edge >= 0) {
-      seg_id = seg_id &&
-               xd->mode_info_context[xd->mode_info_stride].mbmi.segment_id;
-      if (xd->mb_to_right_edge >= 0)
-        seg_id = seg_id &&
-                xd->mode_info_context[xd->mode_info_stride + 1].mbmi.segment_id;
-    }
-  }
-#endif
+
   if (xd->segmentation_enabled && xd->update_mb_segmentation_map) {
     switch (seg_id) {
       case 0:
@@ -703,443 +691,364 @@
   vp9_compute_mod_refprobs(cm);
 }
 
-static void pack_inter_mode_mvs(VP9_COMP *const cpi, vp9_writer *const bc) {
+static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
+                                vp9_writer *bc,
+                                int mb_rows_left, int mb_cols_left) {
   VP9_COMMON *const pc = &cpi->common;
   const nmv_context *nmvc = &pc->fc.nmvc;
-  MACROBLOCKD *xd = &cpi->mb.e_mbd;
-  MODE_INFO *m;
-  MODE_INFO *prev_m;
-  TOKENEXTRA *tok = cpi->tok;
-  TOKENEXTRA *tok_end = tok + cpi->tok_count;
-
+  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
   const int mis = pc->mode_info_stride;
-  int mb_row, mb_col;
-  int row, col;
-
-  // Values used in prediction model coding
-  vp9_prob pred_prob;
-  unsigned char prediction_flag;
-
-  int row_delta[4] = { 0, +1,  0, -1};
-  int col_delta[4] = { +1, -1, +1, +1};
-
-  cpi->mb.partition_info = cpi->mb.pi;
-
-  mb_row = 0;
-  for (row = 0; row < pc->mb_rows; row += 2) {
-    m = pc->mi + row * mis;
-    prev_m = pc->prev_mi + row * mis;
-
-    mb_col = 0;
-    for (col = 0; col < pc->mb_cols; col += 2) {
-      int i;
-
-      // Process the 4 MBs in the order:
-      // top-left, top-right, bottom-left, bottom-right
+  MB_MODE_INFO *const mi = &m->mbmi;
+  const MV_REFERENCE_FRAME rf = mi->ref_frame;
+  const MB_PREDICTION_MODE mode = mi->mode;
+  const int segment_id = mi->segment_id;
 #if CONFIG_SUPERBLOCKS
-      vp9_write(bc, m->mbmi.encoded_as_sb, pc->sb_coded);
-#endif
-      for (i = 0; i < 4; i++) {
-        MB_MODE_INFO *mi;
-        MV_REFERENCE_FRAME rf;
-        MV_REFERENCE_FRAME sec_ref_frame;
-        MB_PREDICTION_MODE mode;
-        int segment_id, skip_coeff;
-
-        int dy = row_delta[i];
-        int dx = col_delta[i];
-        int offset_extended = dy * mis + dx;
-
-        if ((mb_row >= pc->mb_rows) || (mb_col >= pc->mb_cols)) {
-          // MB lies outside frame, move on
-          mb_row += dy;
-          mb_col += dx;
-          m += offset_extended;
-          prev_m += offset_extended;
-          cpi->mb.partition_info += offset_extended;
-          continue;
-        }
-
-        mi = &m->mbmi;
-        rf = mi->ref_frame;
-        sec_ref_frame = mi->second_ref_frame;
-        mode = mi->mode;
-        segment_id = mi->segment_id;
-
-        // Distance of Mb to the various image edges.
-        // These specified to 8th pel as they are always compared to MV
-        // values that are in 1/8th pel units
-        xd->mb_to_left_edge = -((mb_col * 16) << 3);
-        xd->mb_to_top_edge = -((mb_row * 16)) << 3;
-
-#if CONFIG_SUPERBLOCKS
-        if (mi->encoded_as_sb) {
-          xd->mb_to_right_edge = ((pc->mb_cols - 2 - mb_col) * 16) << 3;
-          xd->mb_to_bottom_edge = ((pc->mb_rows - 2 - mb_row) * 16) << 3;
-        } else {
-#endif
-          xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
-          xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
-#if CONFIG_SUPERBLOCKS
-        }
-#endif
-
-        // Make sure the MacroBlockD mode info pointer is set correctly
-        xd->mode_info_context = m;
-        xd->prev_mode_info_context = prev_m;
-
-#ifdef ENTROPY_STATS
-        active_section = 9;
-#endif
-        if (cpi->mb.e_mbd.update_mb_segmentation_map) {
-          // Is temporal coding of the segment map enabled
-          if (pc->temporal_update) {
-            prediction_flag = vp9_get_pred_flag(xd, PRED_SEG_ID);
-            pred_prob = vp9_get_pred_prob(pc, xd, PRED_SEG_ID);
-
-            // Code the segment id prediction flag for this mb
-            vp9_write(bc, prediction_flag, pred_prob);
-
-            // If the mb segment id wasn't predicted code explicitly
-            if (!prediction_flag)
-              write_mb_segid(bc, mi, &cpi->mb.e_mbd);
-          } else {
-            // Normal unpredicted coding
-            write_mb_segid(bc, mi, &cpi->mb.e_mbd);
-          }
-        }
-
-        skip_coeff = 1;
-        if (pc->mb_no_coeff_skip &&
-            (!vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
-             (vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) != 0))) {
-          skip_coeff = mi->mb_skip_coeff;
-#if CONFIG_SUPERBLOCKS
-          if (mi->encoded_as_sb) {
-            skip_coeff &= m[1].mbmi.mb_skip_coeff;
-            skip_coeff &= m[mis].mbmi.mb_skip_coeff;
-            skip_coeff &= m[mis + 1].mbmi.mb_skip_coeff;
-          }
-#endif
-          vp9_write(bc, skip_coeff,
-                    vp9_get_pred_prob(pc, xd, PRED_MBSKIP));
-        }
-
-        // Encode the reference frame.
-        if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)
-            || vp9_get_segdata(xd, segment_id, SEG_LVL_MODE) >= NEARESTMV) {
-          encode_ref_frame(bc, pc, xd, segment_id, rf);
-        } else {
-          assert(rf == INTRA_FRAME);
-        }
-
-        if (rf == INTRA_FRAME) {
-#ifdef ENTROPY_STATS
-          active_section = 6;
-#endif
-
-          if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
-#if CONFIG_SUPERBLOCKS
-            if (m->mbmi.encoded_as_sb)
-              write_sb_ymode(bc, mode, pc->fc.sb_ymode_prob);
-            else
-#endif
-            write_ymode(bc, mode, pc->fc.ymode_prob);
-          }
-          if (mode == B_PRED) {
-            int j = 0;
-#if CONFIG_COMP_INTRA_PRED
-            int uses_second =
-              m->bmi[0].as_mode.second !=
-              (B_PREDICTION_MODE)(B_DC_PRED - 1);
-            vp9_write(bc, uses_second, DEFAULT_COMP_INTRA_PROB);
-#endif
-            do {
-#if CONFIG_COMP_INTRA_PRED
-              B_PREDICTION_MODE mode2 = m->bmi[j].as_mode.second;
-#endif
-              write_bmode(bc, m->bmi[j].as_mode.first,
-                          pc->fc.bmode_prob);
-#if CONFIG_COMP_INTRA_PRED
-              if (uses_second) {
-                write_bmode(bc, mode2, pc->fc.bmode_prob);
-              }
-#endif
-            } while (++j < 16);
-          }
-          if (mode == I8X8_PRED) {
-            write_i8x8_mode(bc, m->bmi[0].as_mode.first,
-                            pc->fc.i8x8_mode_prob);
-            write_i8x8_mode(bc, m->bmi[2].as_mode.first,
-                            pc->fc.i8x8_mode_prob);
-            write_i8x8_mode(bc, m->bmi[8].as_mode.first,
-                            pc->fc.i8x8_mode_prob);
-            write_i8x8_mode(bc, m->bmi[10].as_mode.first,
-                            pc->fc.i8x8_mode_prob);
-          } else {
-            write_uv_mode(bc, mi->uv_mode,
-                          pc->fc.uv_mode_prob[mode]);
-          }
-        } else {
-          vp9_prob mv_ref_p [VP9_MVREFS - 1];
-
-          vp9_mv_ref_probs(&cpi->common, mv_ref_p, mi->mb_mode_context[rf]);
-
-
-// #ifdef ENTROPY_STATS
-#ifdef ENTROPY_STATS
-          accum_mv_refs(mode, ct);
-          active_section = 3;
-#endif
-
-          // Is the segment coding of mode enabled
-          if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
-#if CONFIG_SUPERBLOCKS
-            if (mi->encoded_as_sb) {
-              write_sb_mv_ref(bc, mode, mv_ref_p);
-            } else
-#endif
-            {
-              write_mv_ref(bc, mode, mv_ref_p);
-            }
-            vp9_accum_mv_refs(&cpi->common, mode, mi->mb_mode_context[rf]);
-          }
-
-#if CONFIG_PRED_FILTER
-          // Is the prediction filter enabled
-          if (mode >= NEARESTMV && mode < SPLITMV) {
-            if (cpi->common.pred_filter_mode == 2)
-              vp9_write(bc, mi->pred_filter_enabled,
-                        pc->prob_pred_filter_off);
-            else
-              assert(mi->pred_filter_enabled ==
-                     cpi->common.pred_filter_mode);
-          }
-#endif
-          if (mode >= NEARESTMV && mode <= SPLITMV)
-          {
-            if (cpi->common.mcomp_filter_type == SWITCHABLE) {
-              write_token(bc, vp9_switchable_interp_tree,
-                          vp9_get_pred_probs(&cpi->common, xd,
-                                             PRED_SWITCHABLE_INTERP),
-                          vp9_switchable_interp_encodings +
-                              vp9_switchable_interp_map[mi->interp_filter]);
-            } else {
-              assert (mi->interp_filter ==
-                      cpi->common.mcomp_filter_type);
-            }
-          }
-
-          // does the feature use compound prediction or not
-          // (if not specified at the frame/segment level)
-          if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
-            vp9_write(bc, mi->second_ref_frame > INTRA_FRAME,
-                      vp9_get_pred_prob(pc, xd, PRED_COMP));
-          }
-#if CONFIG_COMP_INTERINTRA_PRED
-          if (cpi->common.use_interintra &&
-              mode >= NEARESTMV && mode < SPLITMV &&
-              mi->second_ref_frame <= INTRA_FRAME) {
-            vp9_write(bc, mi->second_ref_frame == INTRA_FRAME,
-                      pc->fc.interintra_prob);
-            // if (!cpi->dummy_packing)
-            //   printf("-- %d (%d)\n", mi->second_ref_frame == INTRA_FRAME,
-            //          pc->fc.interintra_prob);
-            if (mi->second_ref_frame == INTRA_FRAME) {
-              // if (!cpi->dummy_packing)
-              //   printf("** %d %d\n", mi->interintra_mode,
-                       // mi->interintra_uv_mode);
-              write_ymode(bc, mi->interintra_mode, pc->fc.ymode_prob);
-#if SEPARATE_INTERINTRA_UV
-              write_uv_mode(bc, mi->interintra_uv_mode,
-                            pc->fc.uv_mode_prob[mi->interintra_mode]);
-#endif
-            }
-          }
-#endif
-
-#if CONFIG_NEW_MVREF
-          // if ((mode == NEWMV) || (mode == SPLITMV)) {
-          if (mode == NEWMV) {
-            // Encode the index of the choice.
-            vp9_write_mv_ref_id(bc,
-                                xd->mb_mv_ref_probs[rf], mi->best_index);
-
-            if (mi->second_ref_frame > 0) {
-              // Encode the index of the choice.
-              vp9_write_mv_ref_id(
-                bc, xd->mb_mv_ref_probs[mi->second_ref_frame],
-                mi->best_second_index);
-            }
-          }
-#endif
-          {
-            switch (mode) { /* new, split require MVs */
-              case NEWMV:
-#ifdef ENTROPY_STATS
-                active_section = 5;
-#endif
-                write_nmv(bc, &mi->mv[0].as_mv, &mi->best_mv,
-                          (const nmv_context*) nmvc,
-                          xd->allow_high_precision_mv);
-
-                if (mi->second_ref_frame > 0) {
-                  write_nmv(bc, &mi->mv[1].as_mv, &mi->best_second_mv,
-                            (const nmv_context*) nmvc,
-                            xd->allow_high_precision_mv);
-                }
-                break;
-              case SPLITMV: {
-                int j = 0;
-
-#ifdef MODE_STATS
-                ++count_mb_seg [mi->partitioning];
-#endif
-
-                write_split(bc, mi->partitioning, cpi->common.fc.mbsplit_prob);
-                cpi->mbsplit_count[mi->partitioning]++;
-
-                do {
-                  B_PREDICTION_MODE blockmode;
-                  int_mv blockmv;
-                  const int *const  L =
-                    vp9_mbsplits [mi->partitioning];
-                  int k = -1;  /* first block in subset j */
-                  int mv_contz;
-                  int_mv leftmv, abovemv;
-
-                  blockmode = cpi->mb.partition_info->bmi[j].mode;
-                  blockmv = cpi->mb.partition_info->bmi[j].mv;
-#if CONFIG_DEBUG
-                  while (j != L[++k])
-                    if (k >= 16)
-                      assert(0);
+  const int mb_size = 1 << mi->sb_type;
 #else
-                  while (j != L[++k]);
+  const int mb_size = 1;
 #endif
-                  leftmv.as_int = left_block_mv(m, k);
-                  abovemv.as_int = above_block_mv(m, k, mis);
-                  mv_contz = vp9_mv_cont(&leftmv, &abovemv);
+  int skip_coeff;
 
-                  write_sub_mv_ref(bc, blockmode,
-                                   cpi->common.fc.sub_mv_ref_prob [mv_contz]);
-                  cpi->sub_mv_ref_count[mv_contz][blockmode - LEFT4X4]++;
-                  if (blockmode == NEW4X4) {
-#ifdef ENTROPY_STATS
-                    active_section = 11;
-#endif
-                    write_nmv(bc, &blockmv.as_mv, &mi->best_mv,
-                              (const nmv_context*) nmvc,
-                              xd->allow_high_precision_mv);
+  int mb_row = pc->mb_rows - mb_rows_left;
+  int mb_col = pc->mb_cols - mb_cols_left;
+  xd->prev_mode_info_context = pc->prev_mi + (m - pc->mi);
+  x->partition_info = x->pi + (m - pc->mi);
 
-                    if (mi->second_ref_frame > 0) {
-                      write_nmv(bc,
-                                &cpi->mb.partition_info->bmi[j].second_mv.as_mv,
-                                &mi->best_second_mv,
-                                (const nmv_context*) nmvc,
-                                xd->allow_high_precision_mv);
-                    }
-                  }
-                } while (++j < cpi->mb.partition_info->count);
-              }
-              break;
-              default:
-                break;
-            }
-          }
-        }
-
-        if (((rf == INTRA_FRAME && mode <= I8X8_PRED) ||
-             (rf != INTRA_FRAME && !(mode == SPLITMV &&
-                                     mi->partitioning == PARTITIONING_4X4))) &&
-            pc->txfm_mode == TX_MODE_SELECT &&
-            !((pc->mb_no_coeff_skip && skip_coeff) ||
-              (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
-               vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) {
-          TX_SIZE sz = mi->txfm_size;
-          // FIXME(rbultje) code ternary symbol once all experiments are merged
-          vp9_write(bc, sz != TX_4X4, pc->prob_tx[0]);
-          if (sz != TX_4X4 && mode != I8X8_PRED && mode != SPLITMV) {
-            vp9_write(bc, sz != TX_8X8, pc->prob_tx[1]);
-#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
-            if (mi->encoded_as_sb && sz != TX_8X8)
-              vp9_write(bc, sz != TX_16X16, pc->prob_tx[2]);
-#endif
-          }
-        }
+  // Distance of the MB to the various image edges.
+  // These are specified in 1/8th pel units, as they are always compared
+  // to MV values that are in 1/8th pel units.
+  xd->mb_to_left_edge = -((mb_col * 16) << 3);
+  xd->mb_to_top_edge = -((mb_row * 16)) << 3;
+  xd->mb_to_right_edge = ((pc->mb_cols - mb_size - mb_col) * 16) << 3;
+  xd->mb_to_bottom_edge = ((pc->mb_rows - mb_size - mb_row) * 16) << 3;
 
 #ifdef ENTROPY_STATS
-        active_section = 1;
-#endif
-        assert(tok < tok_end);
-        pack_mb_tokens(bc, &tok, tok_end);
-
-#if CONFIG_SUPERBLOCKS
-        if (m->mbmi.encoded_as_sb) {
-          assert(!i);
-          mb_col += 2;
-          m += 2;
-          cpi->mb.partition_info += 2;
-          prev_m += 2;
-          break;
-        }
+  active_section = 9;
 #endif
 
-        // Next MB
-        mb_row += dy;
-        mb_col += dx;
-        m += offset_extended;
-        prev_m += offset_extended;
-        cpi->mb.partition_info += offset_extended;
-#if CONFIG_DEBUG
-        assert((prev_m - cpi->common.prev_mip) == (m - cpi->common.mip));
-        assert((prev_m - cpi->common.prev_mi) == (m - cpi->common.mi));
-#endif
+  if (cpi->mb.e_mbd.update_mb_segmentation_map) {
+    // Is temporal coding of the segment map enabled
+    if (pc->temporal_update) {
+      unsigned char prediction_flag = vp9_get_pred_flag(xd, PRED_SEG_ID);
+      vp9_prob pred_prob = vp9_get_pred_prob(pc, xd, PRED_SEG_ID);
+
+      // Code the segment id prediction flag for this mb
+      vp9_write(bc, prediction_flag, pred_prob);
+
+      // If the mb segment id wasn't predicted code explicitly
+      if (!prediction_flag)
+        write_mb_segid(bc, mi, &cpi->mb.e_mbd);
+    } else {
+      // Normal unpredicted coding
+      write_mb_segid(bc, mi, &cpi->mb.e_mbd);
+    }
+  }
+
+  if (!pc->mb_no_coeff_skip) {
+    skip_coeff = 0;
+  } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
+             vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0) {
+    skip_coeff = 1;
+  } else {
+    const int nmbs = mb_size;
+    const int xmbs = MIN(nmbs, mb_cols_left);
+    const int ymbs = MIN(nmbs, mb_rows_left);
+    int x, y;
+
+    skip_coeff = 1;
+    for (y = 0; y < ymbs; y++) {
+      for (x = 0; x < xmbs; x++) {
+        skip_coeff = skip_coeff && m[y * mis + x].mbmi.mb_skip_coeff;
       }
     }
 
-    // Next SB
-    mb_row += 2;
-    m += mis + (1 - (pc->mb_cols & 0x1));
-    prev_m += mis + (1 - (pc->mb_cols & 0x1));
-    cpi->mb.partition_info += mis + (1 - (pc->mb_cols & 0x1));
+    vp9_write(bc, skip_coeff,
+              vp9_get_pred_prob(pc, xd, PRED_MBSKIP));
+  }
+
+  // Encode the reference frame.
+  if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)
+      || vp9_get_segdata(xd, segment_id, SEG_LVL_MODE) >= NEARESTMV) {
+    encode_ref_frame(bc, pc, xd, segment_id, rf);
+  } else {
+    assert(rf == INTRA_FRAME);
+  }
+
+  if (rf == INTRA_FRAME) {
+#ifdef ENTROPY_STATS
+    active_section = 6;
+#endif
+
+    if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
+#if CONFIG_SUPERBLOCKS
+      if (m->mbmi.sb_type)
+        write_sb_ymode(bc, mode, pc->fc.sb_ymode_prob);
+      else
+#endif
+        write_ymode(bc, mode, pc->fc.ymode_prob);
+    }
+    if (mode == B_PRED) {
+      int j = 0;
+#if CONFIG_COMP_INTRA_PRED
+      int uses_second =
+      m->bmi[0].as_mode.second !=
+      (B_PREDICTION_MODE)(B_DC_PRED - 1);
+      vp9_write(bc, uses_second, DEFAULT_COMP_INTRA_PROB);
+#endif
+      do {
+#if CONFIG_COMP_INTRA_PRED
+        B_PREDICTION_MODE mode2 = m->bmi[j].as_mode.second;
+#endif
+        write_bmode(bc, m->bmi[j].as_mode.first,
+                    pc->fc.bmode_prob);
+#if CONFIG_COMP_INTRA_PRED
+        if (uses_second) {
+          write_bmode(bc, mode2, pc->fc.bmode_prob);
+        }
+#endif
+      } while (++j < 16);
+    }
+    if (mode == I8X8_PRED) {
+      write_i8x8_mode(bc, m->bmi[0].as_mode.first,
+                      pc->fc.i8x8_mode_prob);
+      write_i8x8_mode(bc, m->bmi[2].as_mode.first,
+                      pc->fc.i8x8_mode_prob);
+      write_i8x8_mode(bc, m->bmi[8].as_mode.first,
+                      pc->fc.i8x8_mode_prob);
+      write_i8x8_mode(bc, m->bmi[10].as_mode.first,
+                      pc->fc.i8x8_mode_prob);
+    } else {
+      write_uv_mode(bc, mi->uv_mode,
+                    pc->fc.uv_mode_prob[mode]);
+    }
+  } else {
+    vp9_prob mv_ref_p[VP9_MVREFS - 1];
+
+    vp9_mv_ref_probs(&cpi->common, mv_ref_p, mi->mb_mode_context[rf]);
+
+    // #ifdef ENTROPY_STATS
+#ifdef ENTROPY_STATS
+    accum_mv_refs(mode, ct);
+    active_section = 3;
+#endif
+
+    // Is the segment coding of mode enabled
+    if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
+#if CONFIG_SUPERBLOCKS
+      if (mi->sb_type) {
+        write_sb_mv_ref(bc, mode, mv_ref_p);
+      } else
+#endif
+      {
+        write_mv_ref(bc, mode, mv_ref_p);
+      }
+      vp9_accum_mv_refs(&cpi->common, mode, mi->mb_mode_context[rf]);
+    }
+
+#if CONFIG_PRED_FILTER
+    // Is the prediction filter enabled
+    if (mode >= NEARESTMV && mode < SPLITMV) {
+      if (cpi->common.pred_filter_mode == 2)
+        vp9_write(bc, mi->pred_filter_enabled,
+                  pc->prob_pred_filter_off);
+      else
+        assert(mi->pred_filter_enabled ==
+               cpi->common.pred_filter_mode);
+    }
+#endif
+    if (mode >= NEARESTMV && mode <= SPLITMV) {
+      if (cpi->common.mcomp_filter_type == SWITCHABLE) {
+        write_token(bc, vp9_switchable_interp_tree,
+                    vp9_get_pred_probs(&cpi->common, xd,
+                                       PRED_SWITCHABLE_INTERP),
+                    vp9_switchable_interp_encodings +
+                    vp9_switchable_interp_map[mi->interp_filter]);
+      } else {
+        assert(mi->interp_filter == cpi->common.mcomp_filter_type);
+      }
+    }
+
+    // does the feature use compound prediction or not
+    // (if not specified at the frame/segment level)
+    if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
+      vp9_write(bc, mi->second_ref_frame > INTRA_FRAME,
+                vp9_get_pred_prob(pc, xd, PRED_COMP));
+    }
+#if CONFIG_COMP_INTERINTRA_PRED
+    if (cpi->common.use_interintra &&
+        mode >= NEARESTMV && mode < SPLITMV &&
+        mi->second_ref_frame <= INTRA_FRAME) {
+      vp9_write(bc, mi->second_ref_frame == INTRA_FRAME,
+                pc->fc.interintra_prob);
+      // if (!cpi->dummy_packing)
+      //   printf("-- %d (%d)\n", mi->second_ref_frame == INTRA_FRAME,
+      //          pc->fc.interintra_prob);
+      if (mi->second_ref_frame == INTRA_FRAME) {
+        // if (!cpi->dummy_packing)
+        //   printf("** %d %d\n", mi->interintra_mode,
+        // mi->interintra_uv_mode);
+        write_ymode(bc, mi->interintra_mode, pc->fc.ymode_prob);
+#if SEPARATE_INTERINTRA_UV
+        write_uv_mode(bc, mi->interintra_uv_mode,
+                      pc->fc.uv_mode_prob[mi->interintra_mode]);
+#endif
+      }
+    }
+#endif
+
+#if CONFIG_NEW_MVREF
+    // if ((mode == NEWMV) || (mode == SPLITMV)) {
+    if (mode == NEWMV) {
+      // Encode the index of the choice.
+      vp9_write_mv_ref_id(bc,
+                          xd->mb_mv_ref_probs[rf], mi->best_index);
+
+      if (mi->second_ref_frame > 0) {
+        // Encode the index of the choice.
+        vp9_write_mv_ref_id(
+                            bc, xd->mb_mv_ref_probs[mi->second_ref_frame],
+                            mi->best_second_index);
+      }
+    }
+#endif
+
+    switch (mode) { /* new, split require MVs */
+      case NEWMV:
+#ifdef ENTROPY_STATS
+        active_section = 5;
+#endif
+        write_nmv(bc, &mi->mv[0].as_mv, &mi->best_mv,
+                  (const nmv_context*) nmvc,
+                  xd->allow_high_precision_mv);
+
+        if (mi->second_ref_frame > 0) {
+          write_nmv(bc, &mi->mv[1].as_mv, &mi->best_second_mv,
+                    (const nmv_context*) nmvc,
+                    xd->allow_high_precision_mv);
+        }
+        break;
+      case SPLITMV: {
+        int j = 0;
+
+#ifdef MODE_STATS
+        ++count_mb_seg[mi->partitioning];
+#endif
+
+        write_split(bc, mi->partitioning, cpi->common.fc.mbsplit_prob);
+        cpi->mbsplit_count[mi->partitioning]++;
+
+        do {
+          B_PREDICTION_MODE blockmode;
+          int_mv blockmv;
+          const int *const  L = vp9_mbsplits[mi->partitioning];
+          int k = -1;  /* first block in subset j */
+          int mv_contz;
+          int_mv leftmv, abovemv;
+
+          blockmode = cpi->mb.partition_info->bmi[j].mode;
+          blockmv = cpi->mb.partition_info->bmi[j].mv;
+#if CONFIG_DEBUG
+          while (j != L[++k])
+            if (k >= 16)
+              assert(0);
+#else
+          while (j != L[++k]);
+#endif
+          leftmv.as_int = left_block_mv(m, k);
+          abovemv.as_int = above_block_mv(m, k, mis);
+          mv_contz = vp9_mv_cont(&leftmv, &abovemv);
+
+          write_sub_mv_ref(bc, blockmode,
+                           cpi->common.fc.sub_mv_ref_prob[mv_contz]);
+          cpi->sub_mv_ref_count[mv_contz][blockmode - LEFT4X4]++;
+          if (blockmode == NEW4X4) {
+#ifdef ENTROPY_STATS
+            active_section = 11;
+#endif
+            write_nmv(bc, &blockmv.as_mv, &mi->best_mv,
+                      (const nmv_context*) nmvc,
+                      xd->allow_high_precision_mv);
+
+            if (mi->second_ref_frame > 0) {
+              write_nmv(bc,
+                        &cpi->mb.partition_info->bmi[j].second_mv.as_mv,
+                        &mi->best_second_mv,
+                        (const nmv_context*) nmvc,
+                        xd->allow_high_precision_mv);
+            }
+          }
+        } while (++j < cpi->mb.partition_info->count);
+        break;
+      }
+      default:
+        break;
+    }
+  }
+
+  if (((rf == INTRA_FRAME && mode <= I8X8_PRED) ||
+       (rf != INTRA_FRAME && !(mode == SPLITMV &&
+                               mi->partitioning == PARTITIONING_4X4))) &&
+      pc->txfm_mode == TX_MODE_SELECT &&
+      !((pc->mb_no_coeff_skip && skip_coeff) ||
+        (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
+         vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) {
+    TX_SIZE sz = mi->txfm_size;
+    // FIXME(rbultje) code ternary symbol once all experiments are merged
+    vp9_write(bc, sz != TX_4X4, pc->prob_tx[0]);
+    if (sz != TX_4X4 && mode != I8X8_PRED && mode != SPLITMV) {
+      vp9_write(bc, sz != TX_8X8, pc->prob_tx[1]);
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+      if (mi->sb_type && sz != TX_8X8)
+        vp9_write(bc, sz != TX_16X16, pc->prob_tx[2]);
+#endif
+    }
   }
 }
 
-
-static void write_mb_modes_kf(const VP9_COMMON  *c,
-                              const MACROBLOCKD *xd,
-                              const MODE_INFO   *m,
-                              int                mode_info_stride,
-                              vp9_writer *const  bc) {
-  int ym;
-  int segment_id;
-
-  ym = m->mbmi.mode;
-  segment_id = m->mbmi.segment_id;
+static void write_mb_modes_kf(const VP9_COMP *cpi,
+                              const MODE_INFO *m,
+                              vp9_writer *bc,
+                              int mb_rows_left, int mb_cols_left) {
+  const VP9_COMMON *const c = &cpi->common;
+  const MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+  const int mis = c->mode_info_stride;
+  const int ym = m->mbmi.mode;
+  const int segment_id = m->mbmi.segment_id;
+  int skip_coeff;
 
   if (xd->update_mb_segmentation_map) {
     write_mb_segid(bc, &m->mbmi, xd);
   }
 
-  if (c->mb_no_coeff_skip &&
-      (!vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
-       (vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) != 0))) {
-        int skip_coeff = m->mbmi.mb_skip_coeff;
+  if (!c->mb_no_coeff_skip) {
+    skip_coeff = 0;
+  } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
+             vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0) {
+    skip_coeff = 1;
+  } else {
 #if CONFIG_SUPERBLOCKS
-        const int mis = mode_info_stride;
-        if (m->mbmi.encoded_as_sb) {
-          skip_coeff &= m[1].mbmi.mb_skip_coeff;
-          skip_coeff &= m[mis].mbmi.mb_skip_coeff;
-          skip_coeff &= m[mis + 1].mbmi.mb_skip_coeff;
-        }
+    const int nmbs = 1 << m->mbmi.sb_type;
+#else
+    const int nmbs = 1;
 #endif
-        vp9_write(bc, skip_coeff,
-                  vp9_get_pred_prob(c, xd, PRED_MBSKIP));
+    const int xmbs = MIN(nmbs, mb_cols_left);
+    const int ymbs = MIN(nmbs, mb_rows_left);
+    int x, y;
+
+    skip_coeff = 1;
+    for (y = 0; y < ymbs; y++) {
+      for (x = 0; x < xmbs; x++) {
+        skip_coeff = skip_coeff && m[y * mis + x].mbmi.mb_skip_coeff;
+      }
+    }
+
+    vp9_write(bc, skip_coeff,
+              vp9_get_pred_prob(c, xd, PRED_MBSKIP));
   }
 
 #if CONFIG_SUPERBLOCKS
-  if (m->mbmi.encoded_as_sb) {
+  if (m->mbmi.sb_type) {
     sb_kfwrite_ymode(bc, ym,
                      c->sb_kf_ymode_prob[c->kf_ymode_probs_index]);
   } else
@@ -1150,7 +1059,6 @@
   }
 
   if (ym == B_PRED) {
-    const int mis = c->mode_info_stride;
     int i = 0;
 #if CONFIG_COMP_INTRA_PRED
     int uses_second =
@@ -1195,7 +1103,7 @@
     write_uv_mode(bc, m->mbmi.uv_mode, c->kf_uv_mode_prob[ym]);
 
   if (ym <= I8X8_PRED && c->txfm_mode == TX_MODE_SELECT &&
-      !((c->mb_no_coeff_skip && m->mbmi.mb_skip_coeff) ||
+      !((c->mb_no_coeff_skip && skip_coeff) ||
         (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
          vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) {
     TX_SIZE sz = m->mbmi.txfm_size;
@@ -1204,75 +1112,99 @@
     if (sz != TX_4X4 && ym <= TM_PRED) {
       vp9_write(bc, sz != TX_8X8, c->prob_tx[1]);
 #if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
-      if (m->mbmi.encoded_as_sb && sz != TX_8X8)
+      if (m->mbmi.sb_type && sz != TX_8X8)
         vp9_write(bc, sz != TX_16X16, c->prob_tx[2]);
 #endif
     }
   }
 }
 
-static void write_kfmodes(VP9_COMP* const cpi, vp9_writer* const bc) {
+static void write_modes_b(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc,
+                          TOKENEXTRA **tok, TOKENEXTRA *tok_end,
+                          int mb_row, int mb_col) {
+  VP9_COMMON *const c = &cpi->common;
+  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+
+  xd->mode_info_context = m;
+  if (c->frame_type == KEY_FRAME) {
+    write_mb_modes_kf(cpi, m, bc,
+                      c->mb_rows - mb_row, c->mb_cols - mb_col);
+#ifdef ENTROPY_STATS
+    active_section = 8;
+#endif
+  } else {
+    pack_inter_mode_mvs(cpi, m, bc,
+                        c->mb_rows - mb_row, c->mb_cols - mb_col);
+#ifdef ENTROPY_STATS
+    active_section = 1;
+#endif
+  }
+
+  assert(*tok < tok_end);
+  pack_mb_tokens(bc, tok, tok_end);
+}
+
+static void write_modes(VP9_COMP *cpi, vp9_writer* const bc) {
   VP9_COMMON *const c = &cpi->common;
   const int mis = c->mode_info_stride;
-  MACROBLOCKD *xd = &cpi->mb.e_mbd;
-  MODE_INFO *m;
-  int i;
-  int row, col;
-  int mb_row, mb_col;
-  int row_delta[4] = { 0, +1,  0, -1};
-  int col_delta[4] = { +1, -1, +1, +1};
+  MODE_INFO *m, *m_ptr = c->mi;
+  int i, mb_row, mb_col;
   TOKENEXTRA *tok = cpi->tok;
   TOKENEXTRA *tok_end = tok + cpi->tok_count;
 
-  mb_row = 0;
-  for (row = 0; row < c->mb_rows; row += 2) {
-    m = c->mi + row * mis;
+  for (mb_row = 0; mb_row < c->mb_rows; mb_row += 4, m_ptr += 4 * mis) {
+    m = m_ptr;
+    for (mb_col = 0; mb_col < c->mb_cols; mb_col += 4, m += 4) {
+#if CONFIG_SUPERBLOCKS && CONFIG_SUPERBLOCKS64
+      vp9_write(bc, m->mbmi.sb_type == BLOCK_SIZE_SB64X64, c->sb64_coded);
+      if (m->mbmi.sb_type == BLOCK_SIZE_SB64X64) {
+        write_modes_b(cpi, m, bc, &tok, tok_end, mb_row, mb_col);
+      } else
+#endif
+      {
+        int j;
 
-    mb_col = 0;
-    for (col = 0; col < c->mb_cols; col += 2) {
+        for (j = 0; j < 4; j++) {
+          const int x_idx_sb = (j & 1) << 1, y_idx_sb = j & 2;
 #if CONFIG_SUPERBLOCKS
-      vp9_write(bc, m->mbmi.encoded_as_sb, c->sb_coded);
+          MODE_INFO *sb_m = m + y_idx_sb * mis + x_idx_sb;
 #endif
-      // Process the 4 MBs in the order:
-      // top-left, top-right, bottom-left, bottom-right
-      for (i = 0; i < 4; i++) {
-        int dy = row_delta[i];
-        int dx = col_delta[i];
-        int offset_extended = dy * mis + dx;
 
-        if ((mb_row >= c->mb_rows) || (mb_col >= c->mb_cols)) {
-          // MB lies outside frame, move on
-          mb_row += dy;
-          mb_col += dx;
-          m += offset_extended;
-          continue;
-        }
-
-        // Make sure the MacroBlockD mode info pointer is set correctly
-        xd->mode_info_context = m;
-
-        write_mb_modes_kf(c, xd, m, mis, bc);
-#ifdef ENTROPY_STATS
-        active_section = 8;
-#endif
-        assert(tok < tok_end);
-        pack_mb_tokens(bc, &tok, tok_end);
+          if (mb_col + x_idx_sb >= c->mb_cols ||
+              mb_row + y_idx_sb >= c->mb_rows)
+            continue;
 
 #if CONFIG_SUPERBLOCKS
-        if (m->mbmi.encoded_as_sb) {
-          assert(!i);
-          mb_col += 2;
-          m += 2;
-          break;
-        }
+          vp9_write(bc, sb_m->mbmi.sb_type, c->sb32_coded);
+          if (sb_m->mbmi.sb_type) {
+            assert(sb_m->mbmi.sb_type == BLOCK_SIZE_SB32X32);
+            write_modes_b(cpi, sb_m, bc, &tok, tok_end,
+                          mb_row + y_idx_sb, mb_col + x_idx_sb);
+          } else
 #endif
-        // Next MB
-        mb_row += dy;
-        mb_col += dx;
-        m += offset_extended;
+          {
+            // Process the 4 MBs in the order:
+            // top-left, top-right, bottom-left, bottom-right
+            for (i = 0; i < 4; i++) {
+              const int x_idx = x_idx_sb + (i & 1), y_idx = y_idx_sb + (i >> 1);
+              MODE_INFO *mb_m = m + x_idx + y_idx * mis;
+
+              if (mb_row + y_idx >= c->mb_rows ||
+                  mb_col + x_idx >= c->mb_cols) {
+                // MB lies outside frame, move on
+                continue;
+              }
+
+#if CONFIG_SUPERBLOCKS
+              assert(mb_m->mbmi.sb_type == BLOCK_SIZE_MB16X16);
+#endif
+              write_modes_b(cpi, mb_m, bc, &tok, tok_end,
+                            mb_row + y_idx, mb_col + x_idx);
+            }
+          }
+        }
       }
     }
-    mb_row += 2;
   }
 }
 
@@ -1800,13 +1732,12 @@
   }
 
 #if CONFIG_SUPERBLOCKS
-  {
-    /* sb mode probability */
-    const int sb_max = (((pc->mb_rows + 1) >> 1) * ((pc->mb_cols + 1) >> 1));
-
-    pc->sb_coded = get_prob(sb_max - cpi->sb_count, sb_max);
-    vp9_write_literal(&header_bc, pc->sb_coded, 8);
-  }
+#if CONFIG_SUPERBLOCKS64
+  pc->sb64_coded = get_binary_prob(cpi->sb64_count[0], cpi->sb64_count[1]);
+  vp9_write_literal(&header_bc, pc->sb64_coded, 8);
+#endif
+  pc->sb32_coded = get_binary_prob(cpi->sb32_count[0], cpi->sb32_count[1]);
+  vp9_write_literal(&header_bc, pc->sb32_coded, 8);
 #endif
 
   {
@@ -2195,12 +2126,12 @@
 
   if (pc->frame_type == KEY_FRAME) {
     decide_kf_ymode_entropy(cpi);
-    write_kfmodes(cpi, &residual_bc);
+    write_modes(cpi, &residual_bc);
   } else {
     /* This is not required if the counts in cpi are consistent with the
      * final packing pass */
     // if (!cpi->dummy_packing) vp9_zero(cpi->NMVcount);
-    pack_inter_mode_mvs(cpi, &residual_bc);
+    write_modes(cpi, &residual_bc);
 
     vp9_update_mode_context(&cpi->common);
   }
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index f5cfbd1..e8f6f46 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -181,10 +181,13 @@
 
   // Structure to hold context for each of the 4 MBs within a SB:
   // when encoded as 4 independent MBs:
-  PICK_MODE_CONTEXT mb_context[4];
+  PICK_MODE_CONTEXT mb_context[4][4];
 #if CONFIG_SUPERBLOCKS
   // when 4 MBs share coding parameters:
-  PICK_MODE_CONTEXT sb_context[4];
+  PICK_MODE_CONTEXT sb32_context[4];
+#if CONFIG_SUPERBLOCKS64
+  PICK_MODE_CONTEXT sb64_context;
+#endif  // CONFIG_SUPERBLOCKS64
 #endif
 
   void (*vp9_short_fdct4x4)(int16_t *input, int16_t *output, int pitch);
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 3219e12..2192950 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -47,14 +47,17 @@
 
 extern void select_interp_filter_type(VP9_COMP *cpi);
 
-static void encode_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
-                              TOKENEXTRA **t, int recon_yoffset,
-                              int recon_uvoffset, int output_enabled,
-                              int mb_col, int mb_row);
+static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t,
+                              int recon_yoffset, int recon_uvoffset,
+                              int output_enabled, int mb_row, int mb_col);
 
-static void encode_superblock(VP9_COMP *cpi, MACROBLOCK *x,
-                              TOKENEXTRA **t, int recon_yoffset,
-                              int recon_uvoffset, int mb_col, int mb_row);
+static void encode_superblock32(VP9_COMP *cpi, TOKENEXTRA **t,
+                                int recon_yoffset, int recon_uvoffset,
+                                int output_enabled, int mb_row, int mb_col);
+
+static void encode_superblock64(VP9_COMP *cpi, TOKENEXTRA **t,
+                                int recon_yoffset, int recon_uvoffset,
+                                int output_enabled, int mb_row, int mb_col);
 
 static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x);
 
@@ -431,37 +434,45 @@
 #endif
 
 static void update_state(VP9_COMP *cpi, MACROBLOCK *x,
-                         PICK_MODE_CONTEXT *ctx) {
-  int i;
+                         PICK_MODE_CONTEXT *ctx, int block_size,
+                         int output_enabled) {
+  int i, x_idx, y;
   MACROBLOCKD *xd = &x->e_mbd;
   MODE_INFO *mi = &ctx->mic;
-  MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
+  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
   int mb_mode = mi->mbmi.mode;
   int mb_mode_index = ctx->best_mode_index;
+  const int mis = cpi->common.mode_info_stride;
+#if CONFIG_SUPERBLOCKS
+  int mb_block_size = 1 << mi->mbmi.sb_type;
+#else
+  int mb_block_size = 1;
+#endif
 
 #if CONFIG_DEBUG
   assert(mb_mode < MB_MODE_COUNT);
   assert(mb_mode_index < MAX_MODES);
   assert(mi->mbmi.ref_frame < MAX_REF_FRAMES);
 #endif
+#if CONFIG_SUPERBLOCKS
+  assert(mi->mbmi.sb_type == (block_size >> 5));
+#endif
 
   // Restore the coding context of the MB to that that was in place
   // when the mode was picked for it
-  vpx_memcpy(xd->mode_info_context, mi, sizeof(MODE_INFO));
-#if CONFIG_SUPERBLOCKS
-  if (mi->mbmi.encoded_as_sb) {
-    const int mis = cpi->common.mode_info_stride;
-    if (xd->mb_to_right_edge >= 0)
-      vpx_memcpy(xd->mode_info_context + 1, mi, sizeof(MODE_INFO));
-    if (xd->mb_to_bottom_edge >= 0) {
-      vpx_memcpy(xd->mode_info_context + mis, mi, sizeof(MODE_INFO));
-      if (xd->mb_to_right_edge >= 0)
-        vpx_memcpy(xd->mode_info_context + mis + 1, mi, sizeof(MODE_INFO));
+  for (y = 0; y < mb_block_size; y++) {
+    for (x_idx = 0; x_idx < mb_block_size; x_idx++) {
+      if ((xd->mb_to_right_edge >> 7) + mb_block_size > x_idx &&
+          (xd->mb_to_bottom_edge >> 7) + mb_block_size > y) {
+        MODE_INFO *mi_addr = xd->mode_info_context + x_idx + y * mis;
+
+        vpx_memcpy(mi_addr, mi, sizeof(MODE_INFO));
+      }
     }
+  }
 #if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
-  } else {
+  if (block_size == 16) {
     ctx->txfm_rd_diff[ALLOW_32X32] = ctx->txfm_rd_diff[ALLOW_16X16];
-#endif
   }
 #endif
 
@@ -482,6 +493,9 @@
     mbmi->mv[1].as_int = x->partition_info->bmi[15].second_mv.as_int;
   }
 
+  if (!output_enabled)
+    return;
+
   {
     int segment_id = mbmi->segment_id;
     if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
@@ -603,6 +617,135 @@
   }
 }
 
+static unsigned find_seg_id(uint8_t *buf, int block_size,
+                            int start_y, int height, int start_x, int width) {
+  const int end_x = MIN(start_x + block_size, width);
+  const int end_y = MIN(start_y + block_size, height);
+  int x, y;
+  unsigned seg_id = -1;
+
+  buf += width * start_y;
+  for (y = start_y; y < end_y; y++, buf += width) {
+    for (x = start_x; x < end_x; x++) {
+      seg_id = MIN(seg_id, buf[x]);
+    }
+  }
+
+  return seg_id;
+}
+
+static void set_offsets(VP9_COMP *cpi,
+                        int mb_row, int mb_col, int block_size,
+                        int *ref_yoffset, int *ref_uvoffset) {
+  MACROBLOCK *const x = &cpi->mb;
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi;
+  const int dst_fb_idx = cm->new_fb_idx;
+  const int recon_y_stride = cm->yv12_fb[dst_fb_idx].y_stride;
+  const int recon_uv_stride = cm->yv12_fb[dst_fb_idx].uv_stride;
+  const int recon_yoffset = 16 * mb_row * recon_y_stride + 16 * mb_col;
+  const int recon_uvoffset = 8 * mb_row * recon_uv_stride + 8 * mb_col;
+  const int src_y_stride = x->src.y_stride;
+  const int src_uv_stride = x->src.uv_stride;
+  const int src_yoffset = 16 * mb_row * src_y_stride + 16 * mb_col;
+  const int src_uvoffset = 8 * mb_row * src_uv_stride + 8 * mb_col;
+  const int ref_fb_idx = cm->lst_fb_idx;
+  const int ref_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
+  const int ref_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
+  const int idx_map = mb_row * cm->mb_cols + mb_col;
+  const int idx_str = xd->mode_info_stride * mb_row + mb_col;
+
+  // entropy context structures
+  xd->above_context = cm->above_context + mb_col;
+  xd->left_context  = cm->left_context + (mb_row & 3);
+
+  // GF active flags data structure
+  x->gf_active_ptr = (signed char *)&cpi->gf_active_flags[idx_map];
+
+  // Activity map pointer
+  x->mb_activity_ptr = &cpi->mb_activity_map[idx_map];
+  x->active_ptr = cpi->active_map + idx_map;
+
+  /* pointers to mode info contexts */
+  x->partition_info          = x->pi + idx_str;
+  xd->mode_info_context      = cm->mi + idx_str;
+  mbmi = &xd->mode_info_context->mbmi;
+  xd->prev_mode_info_context = cm->prev_mi + idx_str;
+
+  // Set up destination pointers
+  xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
+  xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
+  xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
+
+  /* Set up limit values for MV components to prevent them from
+   * extending beyond the UMV borders assuming 16x16 block size */
+  x->mv_row_min = -((mb_row * 16) + VP9BORDERINPIXELS - VP9_INTERP_EXTEND);
+  x->mv_col_min = -((mb_col * 16) + VP9BORDERINPIXELS - VP9_INTERP_EXTEND);
+  x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
+                   (VP9BORDERINPIXELS - block_size - VP9_INTERP_EXTEND));
+  x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
+                   (VP9BORDERINPIXELS - block_size - VP9_INTERP_EXTEND));
+
+  // Set up distance of MB to edge of frame in 1/8th pel units
+  block_size >>= 4;  // in macroblock units
+  assert(!(mb_col & (block_size - 1)) && !(mb_row & (block_size - 1)));
+  xd->mb_to_top_edge    = -((mb_row * 16) << 3);
+  xd->mb_to_left_edge   = -((mb_col * 16) << 3);
+  xd->mb_to_bottom_edge = ((cm->mb_rows - block_size - mb_row) * 16) << 3;
+  xd->mb_to_right_edge  = ((cm->mb_cols - block_size - mb_col) * 16) << 3;
+
+  // Are edges available for intra prediction?
+  xd->up_available   = (mb_row != 0);
+  xd->left_available = (mb_col != 0);
+
+  /* Reference buffer offsets */
+  *ref_yoffset  = (mb_row * ref_y_stride * 16) + (mb_col * 16);
+  *ref_uvoffset = (mb_row * ref_uv_stride * 8) + (mb_col *  8);
+
+  /* set up source buffers */
+  x->src.y_buffer = cpi->Source->y_buffer + src_yoffset;
+  x->src.u_buffer = cpi->Source->u_buffer + src_uvoffset;
+  x->src.v_buffer = cpi->Source->v_buffer + src_uvoffset;
+
+  /* R/D setup */
+  x->rddiv = cpi->RDDIV;
+  x->rdmult = cpi->RDMULT;
+
+  /* segment ID */
+  if (xd->segmentation_enabled) {
+    if (xd->update_mb_segmentation_map) {
+      mbmi->segment_id = find_seg_id(cpi->segmentation_map, block_size,
+                                     mb_row, cm->mb_rows, mb_col, cm->mb_cols);
+    } else {
+      mbmi->segment_id = find_seg_id(cm->last_frame_seg_map, block_size,
+                                     mb_row, cm->mb_rows, mb_col, cm->mb_cols);
+    }
+    assert(mbmi->segment_id <= 3);
+    vp9_mb_init_quantizer(cpi, x);
+
+    if (xd->segmentation_enabled && cpi->seg0_cnt > 0 &&
+        !vp9_segfeature_active(xd, 0, SEG_LVL_REF_FRAME) &&
+        vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME) &&
+        vp9_check_segref(xd, 1, INTRA_FRAME)  +
+        vp9_check_segref(xd, 1, LAST_FRAME)   +
+        vp9_check_segref(xd, 1, GOLDEN_FRAME) +
+        vp9_check_segref(xd, 1, ALTREF_FRAME) == 1) {
+      cpi->seg0_progress = (cpi->seg0_idx << 16) / cpi->seg0_cnt;
+    } else {
+      const int y = mb_row & ~3;
+      const int x = mb_col & ~3;
+      const int p16 = ((mb_row & 1) << 1) +  (mb_col & 1);
+      const int p32 = ((mb_row & 2) << 2) + ((mb_col & 2) << 1);
+
+      cpi->seg0_progress =
+          ((y * cm->mb_cols + x * 4 + p32 + p16) << 16) / cm->MBs;
+    }
+  } else {
+    mbmi->segment_id = 0;
+  }
+}
+
 static void pick_mb_modes(VP9_COMP *cpi,
                           VP9_COMMON *cm,
                           int mb_row,
@@ -613,24 +756,15 @@
                           int *totalrate,
                           int *totaldist) {
   int i;
-  int map_index;
   int recon_yoffset, recon_uvoffset;
-  int ref_fb_idx = cm->lst_fb_idx;
-  int dst_fb_idx = cm->new_fb_idx;
-  int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
-  int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
   ENTROPY_CONTEXT_PLANES left_context[2];
   ENTROPY_CONTEXT_PLANES above_context[2];
   ENTROPY_CONTEXT_PLANES *initial_above_context_ptr = cm->above_context
                                                       + mb_col;
 
-  // Offsets to move pointers from MB to MB within a SB in raster order
-  int row_delta[4] = { 0, +1,  0, -1};
-  int col_delta[4] = { +1, -1, +1, +1};
-
   /* Function should not modify L & A contexts; save and restore on exit */
   vpx_memcpy(left_context,
-             cm->left_context,
+             cm->left_context + (mb_row & 2),
              sizeof(left_context));
   vpx_memcpy(above_context,
              initial_above_context_ptr,
@@ -638,113 +772,36 @@
 
   /* Encode MBs in raster order within the SB */
   for (i = 0; i < 4; i++) {
-    int dy = row_delta[i];
-    int dx = col_delta[i];
-    int offset_unextended = dy * cm->mb_cols + dx;
-    int offset_extended   = dy * xd->mode_info_stride + dx;
-    MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
+    const int x_idx = i & 1, y_idx = i >> 1;
+    MB_MODE_INFO *mbmi;
 
-    // TODO Many of the index items here can be computed more efficiently!
-
-    if ((mb_row >= cm->mb_rows) || (mb_col >= cm->mb_cols)) {
+    if ((mb_row + y_idx >= cm->mb_rows) || (mb_col + x_idx >= cm->mb_cols)) {
       // MB lies outside frame, move on
-      mb_row += dy;
-      mb_col += dx;
-
-      // Update pointers
-      x->src.y_buffer += 16 * (dx + dy * x->src.y_stride);
-      x->src.u_buffer += 8  * (dx + dy * x->src.uv_stride);
-      x->src.v_buffer += 8  * (dx + dy * x->src.uv_stride);
-
-      x->gf_active_ptr += offset_unextended;
-      x->partition_info += offset_extended;
-      xd->mode_info_context += offset_extended;
-      xd->prev_mode_info_context += offset_extended;
-#if CONFIG_DEBUG
-      assert((xd->prev_mode_info_context - cpi->common.prev_mip) ==
-             (xd->mode_info_context - cpi->common.mip));
-#endif
       continue;
     }
 
     // Index of the MB in the SB 0..3
     xd->mb_index = i;
-
-    map_index = (mb_row * cpi->common.mb_cols) + mb_col;
-    x->mb_activity_ptr = &cpi->mb_activity_map[map_index];
-
-    // set above context pointer
-    xd->above_context = cm->above_context + mb_col;
-
-    // Restore the appropriate left context depending on which
-    // row in the SB the MB is situated
-    xd->left_context = cm->left_context + (i >> 1);
-
-    // Set up distance of MB to edge of frame in 1/8th pel units
-    xd->mb_to_top_edge    = -((mb_row * 16) << 3);
-    xd->mb_to_left_edge   = -((mb_col * 16) << 3);
-    xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
-    xd->mb_to_right_edge  = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
-
-    // Set up limit values for MV components to prevent them from
-    // extending beyond the UMV borders assuming 16x16 block size
-    x->mv_row_min = -((mb_row * 16) + VP9BORDERINPIXELS - VP9_INTERP_EXTEND);
-    x->mv_col_min = -((mb_col * 16) + VP9BORDERINPIXELS - VP9_INTERP_EXTEND);
-    x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
-                     (VP9BORDERINPIXELS - 16 - VP9_INTERP_EXTEND));
-    x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
-                     (VP9BORDERINPIXELS - 16 - VP9_INTERP_EXTEND));
-
-    xd->up_available   = (mb_row != 0);
-    xd->left_available = (mb_col != 0);
-
-    recon_yoffset  = (mb_row * recon_y_stride * 16) + (mb_col * 16);
-    recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col *  8);
-
-    xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
-    xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
-    xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
+    set_offsets(cpi, mb_row + y_idx, mb_col + x_idx, 16,
+                &recon_yoffset, &recon_uvoffset);
 
 #if !CONFIG_SUPERBLOCKS
     // Copy current MB to a work buffer
     vp9_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
 #endif
 
-    x->rddiv = cpi->RDDIV;
-    x->rdmult = cpi->RDMULT;
-
     if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
       vp9_activity_masking(cpi, x);
 
-    // Is segmentation enabled
-    if (xd->segmentation_enabled) {
-      // Code to set segment id in xd->mbmi.segment_id
-      if (xd->update_mb_segmentation_map)
-        mbmi->segment_id = cpi->segmentation_map[map_index];
-      else
-        mbmi->segment_id = cm->last_frame_seg_map[map_index];
-      if (mbmi->segment_id > 3)
-        mbmi->segment_id = 0;
-
-      vp9_mb_init_quantizer(cpi, x);
-    } else
-      // Set to Segment 0 by default
-      mbmi->segment_id = 0;
-
-    x->active_ptr = cpi->active_map + map_index;
-
+    mbmi = &xd->mode_info_context->mbmi;
 #if CONFIG_SUPERBLOCKS
-    xd->mode_info_context->mbmi.encoded_as_sb = 0;
+    mbmi->sb_type = BLOCK_SIZE_MB16X16;
 #endif
 
     cpi->update_context = 0;    // TODO Do we need this now??
 
     vp9_intra_prediction_down_copy(xd);
 
-#ifdef ENC_DEBUG
-      enc_debug = (cpi->common.current_video_frame == 46 &&
-                   mb_row == 5 && mb_col == 2);
-#endif
     // Find best coding mode & reconstruct the MB so it is available
     // as a predictor for MBs that follow in the SB
     if (cm->frame_type == KEY_FRAME) {
@@ -758,28 +815,16 @@
       *totaldist += d;
 
       // Dummy encode, do not do the tokenization
-      encode_macroblock(cpi, x, tp,
-                        recon_yoffset, recon_uvoffset, 0, mb_col, mb_row);
+      encode_macroblock(cpi, tp, recon_yoffset, recon_uvoffset, 0,
+                        mb_row + y_idx, mb_col + x_idx);
       // Note the encoder may have changed the segment_id
 
       // Save the coding context
-      vpx_memcpy(&x->mb_context[i].mic, xd->mode_info_context,
+      vpx_memcpy(&x->mb_context[xd->sb_index][i].mic, xd->mode_info_context,
                  sizeof(MODE_INFO));
     } else {
       int seg_id, r, d;
 
-      if (xd->segmentation_enabled && cpi->seg0_cnt > 0 &&
-          !vp9_segfeature_active(xd, 0, SEG_LVL_REF_FRAME) &&
-          vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME) &&
-          vp9_check_segref(xd, 1, INTRA_FRAME)  +
-          vp9_check_segref(xd, 1, LAST_FRAME)   +
-          vp9_check_segref(xd, 1, GOLDEN_FRAME) +
-          vp9_check_segref(xd, 1, ALTREF_FRAME) == 1) {
-        cpi->seg0_progress = (cpi->seg0_idx << 16) / cpi->seg0_cnt;
-      } else {
-        cpi->seg0_progress = (((mb_col & ~1) * 2 + (mb_row & ~1) * cm->mb_cols + i) << 16) / cm->MBs;
-      }
-
 #ifdef ENC_DEBUG
       if (enc_debug)
         printf("inter pick_mb_modes %d %d\n", mb_row, mb_col);
@@ -790,8 +835,8 @@
       *totaldist += d;
 
       // Dummy encode, do not do the tokenization
-      encode_macroblock(cpi, x, tp,
-                        recon_yoffset, recon_uvoffset, 0, mb_col, mb_row);
+      encode_macroblock(cpi, tp, recon_yoffset, recon_uvoffset, 0,
+                        mb_row + y_idx, mb_col + x_idx);
 
       seg_id = mbmi->segment_id;
       if (cpi->mb.e_mbd.segmentation_enabled && seg_id == 0) {
@@ -811,28 +856,10 @@
         cpi->ref_pred_count[pred_context][pred_flag]++;
       }
     }
-
-    // Next MB
-    mb_row += dy;
-    mb_col += dx;
-
-    x->src.y_buffer += 16 * (dx + dy * x->src.y_stride);
-    x->src.u_buffer += 8  * (dx + dy * x->src.uv_stride);
-    x->src.v_buffer += 8  * (dx + dy * x->src.uv_stride);
-
-    x->gf_active_ptr += offset_unextended;
-    x->partition_info += offset_extended;
-    xd->mode_info_context += offset_extended;
-    xd->prev_mode_info_context += offset_extended;
-
-#if CONFIG_DEBUG
-    assert((xd->prev_mode_info_context - cpi->common.prev_mip) ==
-           (xd->mode_info_context - cpi->common.mip));
-#endif
   }
 
   /* Restore L & A coding context to those in place on entry */
-  vpx_memcpy(cm->left_context,
+  vpx_memcpy(cm->left_context + (mb_row & 2),
              left_context,
              sizeof(left_context));
   vpx_memcpy(initial_above_context_ptr,
@@ -841,392 +868,204 @@
 }
 
 #if CONFIG_SUPERBLOCKS
-static void pick_sb_modes (VP9_COMP *cpi,
-                           VP9_COMMON *cm,
-                           int mb_row,
-                           int mb_col,
-                           MACROBLOCK  *x,
-                           MACROBLOCKD *xd,
-                           TOKENEXTRA **tp,
-                           int *totalrate,
-                           int *totaldist)
-{
-  int map_index;
+static void pick_sb_modes(VP9_COMP *cpi,
+                          VP9_COMMON *cm,
+                          int mb_row,
+                          int mb_col,
+                          MACROBLOCK  *x,
+                          MACROBLOCKD *xd,
+                          TOKENEXTRA **tp,
+                          int *totalrate,
+                          int *totaldist) {
   int recon_yoffset, recon_uvoffset;
-  int ref_fb_idx = cm->lst_fb_idx;
-  int dst_fb_idx = cm->new_fb_idx;
-  int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
-  int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
-  ENTROPY_CONTEXT_PLANES left_context[2];
-  ENTROPY_CONTEXT_PLANES above_context[2];
-  ENTROPY_CONTEXT_PLANES *initial_above_context_ptr = cm->above_context
-    + mb_col;
 
-  /* Function should not modify L & A contexts; save and restore on exit */
-  vpx_memcpy (left_context,
-              cm->left_context,
-              sizeof(left_context));
-  vpx_memcpy (above_context,
-              initial_above_context_ptr,
-              sizeof(above_context));
-
-  map_index = (mb_row * cpi->common.mb_cols) + mb_col;
-  x->mb_activity_ptr = &cpi->mb_activity_map[map_index];
-
-  /* set above context pointer */
-  xd->above_context = cm->above_context + mb_col;
-
-  /* Restore the appropriate left context depending on which
-   * row in the SB the MB is situated */
-  xd->left_context = cm->left_context;
-
-  // Set up distance of MB to edge of frame in 1/8th pel units
-  xd->mb_to_top_edge    = -((mb_row * 16) << 3);
-  xd->mb_to_left_edge   = -((mb_col * 16) << 3);
-  xd->mb_to_bottom_edge = ((cm->mb_rows - 2 - mb_row) * 16) << 3;
-  xd->mb_to_right_edge  = ((cm->mb_cols - 2 - mb_col) * 16) << 3;
-
-  /* Set up limit values for MV components to prevent them from
-   * extending beyond the UMV borders assuming 16x16 block size */
-  x->mv_row_min = -((mb_row * 16) + VP9BORDERINPIXELS - VP9_INTERP_EXTEND);
-  x->mv_col_min = -((mb_col * 16) + VP9BORDERINPIXELS - VP9_INTERP_EXTEND);
-  x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
-                   (VP9BORDERINPIXELS - 32 - VP9_INTERP_EXTEND));
-  x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
-                   (VP9BORDERINPIXELS - 32 - VP9_INTERP_EXTEND));
-
-  xd->up_available   = (mb_row != 0);
-  xd->left_available = (mb_col != 0);
-
-  recon_yoffset  = (mb_row * recon_y_stride * 16) + (mb_col * 16);
-  recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col *  8);
-
-  xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
-  xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
-  xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
-#if 0 // FIXME
-  /* Copy current MB to a work buffer */
-  vp9_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
-#endif
-  x->rddiv = cpi->RDDIV;
-  x->rdmult = cpi->RDMULT;
-  if(cpi->oxcf.tuning == VP8_TUNE_SSIM)
+  set_offsets(cpi, mb_row, mb_col, 32, &recon_yoffset, &recon_uvoffset);
+  xd->mode_info_context->mbmi.sb_type = BLOCK_SIZE_SB32X32;
+  if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
     vp9_activity_masking(cpi, x);
-  /* Is segmentation enabled */
-  if (xd->segmentation_enabled)
-  {
-    /* Code to set segment id in xd->mbmi.segment_id */
-    if (xd->update_mb_segmentation_map)
-      xd->mode_info_context->mbmi.segment_id =
-            cpi->segmentation_map[map_index] &&
-            cpi->segmentation_map[map_index + 1] &&
-            cpi->segmentation_map[map_index + cm->mb_cols] &&
-            cpi->segmentation_map[map_index + cm->mb_cols + 1];
-    else
-      xd->mode_info_context->mbmi.segment_id =
-            cm->last_frame_seg_map[map_index] &&
-            cm->last_frame_seg_map[map_index + 1] &&
-            cm->last_frame_seg_map[map_index + cm->mb_cols] &&
-            cm->last_frame_seg_map[map_index + cm->mb_cols + 1];
-    if (xd->mode_info_context->mbmi.segment_id > 3)
-      xd->mode_info_context->mbmi.segment_id = 0;
-
-    vp9_mb_init_quantizer(cpi, x);
-  }
-  else
-    /* Set to Segment 0 by default */
-    xd->mode_info_context->mbmi.segment_id = 0;
-
-  x->active_ptr = cpi->active_map + map_index;
-
   cpi->update_context = 0;    // TODO Do we need this now??
 
   /* Find best coding mode & reconstruct the MB so it is available
    * as a predictor for MBs that follow in the SB */
-  if (cm->frame_type == KEY_FRAME)
-  {
-    vp9_rd_pick_intra_mode_sb(cpi, x,
-                              totalrate,
-                              totaldist);
+  if (cm->frame_type == KEY_FRAME) {
+    vp9_rd_pick_intra_mode_sb32(cpi, x,
+                                totalrate,
+                                totaldist);
 
     /* Save the coding context */
-    vpx_memcpy(&x->sb_context[0].mic, xd->mode_info_context,
+    vpx_memcpy(&x->sb32_context[xd->sb_index].mic, xd->mode_info_context,
                sizeof(MODE_INFO));
   } else {
-    if (xd->segmentation_enabled && cpi->seg0_cnt > 0 &&
-        !vp9_segfeature_active(xd, 0, SEG_LVL_REF_FRAME) &&
-        vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME) &&
-        vp9_check_segref(xd, 1, INTRA_FRAME)  +
-        vp9_check_segref(xd, 1, LAST_FRAME)   +
-        vp9_check_segref(xd, 1, GOLDEN_FRAME) +
-        vp9_check_segref(xd, 1, ALTREF_FRAME) == 1) {
-      cpi->seg0_progress = (cpi->seg0_idx << 16) / cpi->seg0_cnt;
-    } else {
-      cpi->seg0_progress =
-        (((mb_col & ~1) * 2 + (mb_row & ~1) * cm->mb_cols) << 16) / cm->MBs;
+    vp9_rd_pick_inter_mode_sb32(cpi, x,
+                                recon_yoffset,
+                                recon_uvoffset,
+                                totalrate,
+                                totaldist);
+  }
+}
+
+#if CONFIG_SUPERBLOCKS64
+static void pick_sb64_modes(VP9_COMP *cpi,
+                            VP9_COMMON *cm,
+                            int mb_row,
+                            int mb_col,
+                            MACROBLOCK  *x,
+                            MACROBLOCKD *xd,
+                            TOKENEXTRA **tp,
+                            int *totalrate,
+                            int *totaldist) {
+  int recon_yoffset, recon_uvoffset;
+
+  set_offsets(cpi, mb_row, mb_col, 64, &recon_yoffset, &recon_uvoffset);
+  xd->mode_info_context->mbmi.sb_type = BLOCK_SIZE_SB64X64;
+  if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
+    vp9_activity_masking(cpi, x);
+  cpi->update_context = 0;    // TODO(rbultje) Do we need this now??
+
+  /* Find best coding mode & reconstruct the MB so it is available
+   * as a predictor for MBs that follow in the SB */
+  if (cm->frame_type == KEY_FRAME) {
+    vp9_rd_pick_intra_mode_sb64(cpi, x,
+                                totalrate,
+                                totaldist);
+
+    /* Save the coding context */
+    vpx_memcpy(&x->sb64_context.mic, xd->mode_info_context,
+               sizeof(MODE_INFO));
+  } else {
+    vp9_rd_pick_inter_mode_sb64(cpi, x,
+                                recon_yoffset,
+                                recon_uvoffset,
+                                totalrate,
+                                totaldist);
+  }
+}
+#endif  // CONFIG_SUPERBLOCKS64
+#endif  // CONFIG_SUPERBLOCKS
+
+static void update_stats(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *mi = xd->mode_info_context;
+  MB_MODE_INFO *const mbmi = &mi->mbmi;
+
+  if (cm->frame_type == KEY_FRAME) {
+#ifdef MODE_STATS
+    y_modes[mbmi->mode]++;
+#endif
+  } else {
+    int segment_id, seg_ref_active;
+
+    if (mbmi->ref_frame) {
+      int pred_context = vp9_get_pred_context(cm, xd, PRED_COMP);
+
+      if (mbmi->second_ref_frame <= INTRA_FRAME)
+        cpi->single_pred_count[pred_context]++;
+      else
+        cpi->comp_pred_count[pred_context]++;
     }
 
-    vp9_rd_pick_inter_mode_sb(cpi, x,
-                              recon_yoffset,
-                              recon_uvoffset,
-                              totalrate,
-                              totaldist);
-  }
+#ifdef MODE_STATS
+    inter_y_modes[mbmi->mode]++;
 
-  /* Restore L & A coding context to those in place on entry */
-  vpx_memcpy (cm->left_context,
-              left_context,
-              sizeof(left_context));
-  vpx_memcpy (initial_above_context_ptr,
-              above_context,
-              sizeof(above_context));
-}
+    if (mbmi->mode == SPLITMV) {
+      int b;
+
+      for (b = 0; b < x->partition_info->count; b++) {
+        inter_b_modes[x->partition_info->bmi[b].mode]++;
+      }
+    }
 #endif
 
+    // If we have just a single reference frame coded for a segment then
+    // exclude from the reference frame counts used to work out
+    // probabilities. NOTE: At the moment we dont support custom trees
+    // for the reference frame coding for each segment but this is a
+    // possible future action.
+    segment_id = mbmi->segment_id;
+    seg_ref_active = vp9_segfeature_active(xd, segment_id,
+                                           SEG_LVL_REF_FRAME);
+    if (!seg_ref_active ||
+        ((vp9_check_segref(xd, segment_id, INTRA_FRAME) +
+          vp9_check_segref(xd, segment_id, LAST_FRAME) +
+          vp9_check_segref(xd, segment_id, GOLDEN_FRAME) +
+          vp9_check_segref(xd, segment_id, ALTREF_FRAME)) > 1)) {
+      cpi->count_mb_ref_frame_usage[mbmi->ref_frame]++;
+    }
+    // Count of last ref frame 0,0 usage
+    if ((mbmi->mode == ZEROMV) && (mbmi->ref_frame == LAST_FRAME))
+      cpi->inter_zz_count++;
+  }
+}
+
 static void encode_sb(VP9_COMP *cpi,
                       VP9_COMMON *cm,
-                      int mbrow,
-                      int mbcol,
+                      int mb_row,
+                      int mb_col,
+                      int output_enabled,
                       MACROBLOCK  *x,
                       MACROBLOCKD *xd,
-                      TOKENEXTRA **tp) {
-  int i;
-  int map_index;
-  int mb_row, mb_col;
+                      TOKENEXTRA **tp, int is_sb) {
   int recon_yoffset, recon_uvoffset;
-  int ref_fb_idx = cm->lst_fb_idx;
-  int dst_fb_idx = cm->new_fb_idx;
-  int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
-  int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
-  int row_delta[4] = { 0, +1,  0, -1};
-  int col_delta[4] = { +1, -1, +1, +1};
-
-  mb_row = mbrow;
-  mb_col = mbcol;
-
-  /* Encode MBs in raster order within the SB */
-  for (i = 0; i < 4; i++) {
-    int dy = row_delta[i];
-    int dx = col_delta[i];
-    int offset_extended   = dy * xd->mode_info_stride + dx;
-    int offset_unextended = dy * cm->mb_cols + dx;
-    MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
-
-    if ((mb_row >= cm->mb_rows) || (mb_col >= cm->mb_cols)) {
-      // MB lies outside frame, move on
-      mb_row += dy;
-      mb_col += dx;
-
-      x->src.y_buffer += 16 * (dx + dy * x->src.y_stride);
-      x->src.u_buffer += 8  * (dx + dy * x->src.uv_stride);
-      x->src.v_buffer += 8  * (dx + dy * x->src.uv_stride);
-
-      x->gf_active_ptr      += offset_unextended;
-      x->partition_info     += offset_extended;
-      xd->mode_info_context += offset_extended;
-      xd->prev_mode_info_context += offset_extended;
-
-#if CONFIG_DEBUG
-      assert((xd->prev_mode_info_context - cpi->common.prev_mip) ==
-             (xd->mode_info_context - cpi->common.mip));
-#endif
-      continue;
-    }
-
-    xd->mb_index = i;
-
-    // Restore MB state to that when it was picked
-#if CONFIG_SUPERBLOCKS
-    if (xd->mode_info_context->mbmi.encoded_as_sb) {
-      update_state(cpi, x, &x->sb_context[i]);
-      cpi->sb_count++;
-    } else
-#endif
-      update_state(cpi, x, &x->mb_context[i]);
-
-    map_index = (mb_row * cpi->common.mb_cols) + mb_col;
-    x->mb_activity_ptr = &cpi->mb_activity_map[map_index];
-
-    // reset above block coeffs
-    xd->above_context = cm->above_context + mb_col;
-    xd->left_context  = cm->left_context + (i >> 1);
-
-    // Set up distance of MB to edge of the frame in 1/8th pel units
-    // Set up limit values for MV components to prevent them from
-    // extending beyond the UMV borders assuming 32x32 block size
-    x->mv_row_min = -((mb_row * 16) + VP9BORDERINPIXELS - VP9_INTERP_EXTEND);
-    x->mv_col_min = -((mb_col * 16) + VP9BORDERINPIXELS - VP9_INTERP_EXTEND);
-
-    xd->mb_to_top_edge    = -((mb_row * 16) << 3);
-    xd->mb_to_left_edge   = -((mb_col * 16) << 3);
 
 #if CONFIG_SUPERBLOCKS
-    if (xd->mode_info_context->mbmi.encoded_as_sb) {
-      x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
-                       (VP9BORDERINPIXELS - 32 - VP9_INTERP_EXTEND));
-      x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
-                       (VP9BORDERINPIXELS - 32 - VP9_INTERP_EXTEND));
+  cpi->sb32_count[is_sb]++;
+  if (is_sb) {
+    set_offsets(cpi, mb_row, mb_col, 32, &recon_yoffset, &recon_uvoffset);
+    update_state(cpi, x, &x->sb32_context[xd->sb_index], 32, output_enabled);
 
-      xd->mb_to_bottom_edge = ((cm->mb_rows - 2 - mb_row) * 16) << 3;
-      xd->mb_to_right_edge  = ((cm->mb_cols - 2 - mb_col) * 16) << 3;
-    } else {
-#endif
-      x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
-                       (VP9BORDERINPIXELS - 16 - VP9_INTERP_EXTEND));
-      x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
-                       (VP9BORDERINPIXELS - 16 - VP9_INTERP_EXTEND));
+    encode_superblock32(cpi, tp, recon_yoffset, recon_uvoffset,
+                        output_enabled, mb_row, mb_col);
+    if (output_enabled)
+      update_stats(cpi);
 
-      xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
-      xd->mb_to_right_edge  = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
-#if CONFIG_SUPERBLOCKS
-    }
-#endif
-
-    xd->up_available = (mb_row != 0);
-    xd->left_available = (mb_col != 0);
-
-    recon_yoffset = (mb_row * recon_y_stride * 16) + (mb_col * 16);
-    recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col * 8);
-
-    xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
-    xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
-    xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
-
-#if !CONFIG_SUPERBLOCKS
-    // Copy current MB to a work buffer
-    vp9_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
-#endif
-
-    if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
-      vp9_activity_masking(cpi, x);
-
-    // Is segmentation enabled
-    if (xd->segmentation_enabled) {
-      vp9_mb_init_quantizer(cpi, x);
-    }
-
-    x->active_ptr = cpi->active_map + map_index;
-
-    cpi->update_context = 0;
-
-#if CONFIG_SUPERBLOCKS
-    if (!xd->mode_info_context->mbmi.encoded_as_sb)
-#endif
-      vp9_intra_prediction_down_copy(xd);
-
-    if (cm->frame_type == KEY_FRAME) {
-#if CONFIG_SUPERBLOCKS
-      if (xd->mode_info_context->mbmi.encoded_as_sb)
-        encode_superblock(cpi, x, tp, recon_yoffset, recon_uvoffset,
-                          mb_col, mb_row);
-      else
-#endif
-        encode_macroblock(cpi, x, tp, recon_yoffset, recon_uvoffset, 1,
-                          mb_col, mb_row);
-        // Note the encoder may have changed the segment_id
-
-#ifdef MODE_STATS
-      y_modes[mbmi->mode]++;
-#endif
-    } else {
-      unsigned char *segment_id;
-      int seg_ref_active;
-
-      if (xd->mode_info_context->mbmi.ref_frame) {
-        unsigned char pred_context;
-
-        pred_context = vp9_get_pred_context(cm, xd, PRED_COMP);
-
-        if (xd->mode_info_context->mbmi.second_ref_frame <= INTRA_FRAME)
-          cpi->single_pred_count[pred_context]++;
-        else
-          cpi->comp_pred_count[pred_context]++;
-      }
-
-#if CONFIG_SUPERBLOCKS
-      if (xd->mode_info_context->mbmi.encoded_as_sb)
-        encode_superblock(cpi, x, tp, recon_yoffset, recon_uvoffset,
-                          mb_col, mb_row);
-      else
-#endif
-        encode_macroblock(cpi, x, tp, recon_yoffset, recon_uvoffset, 1,
-                          mb_col, mb_row);
-        // Note the encoder may have changed the segment_id
-
-#ifdef MODE_STATS
-      inter_y_modes[mbmi->mode]++;
-
-      if (mbmi->mode == SPLITMV) {
-        int b;
-
-        for (b = 0; b < x->partition_info->count; b++) {
-          inter_b_modes[x->partition_info->bmi[b].mode]++;
-        }
-      }
-
-#endif
-
-      // If we have just a single reference frame coded for a segment then
-      // exclude from the reference frame counts used to work out
-      // probabilities. NOTE: At the moment we dont support custom trees
-      // for the reference frame coding for each segment but this is a
-      // possible future action.
-      segment_id = &mbmi->segment_id;
-      seg_ref_active = vp9_segfeature_active(xd, *segment_id,
-                                             SEG_LVL_REF_FRAME);
-      if (!seg_ref_active ||
-          ((vp9_check_segref(xd, *segment_id, INTRA_FRAME) +
-            vp9_check_segref(xd, *segment_id, LAST_FRAME) +
-            vp9_check_segref(xd, *segment_id, GOLDEN_FRAME) +
-            vp9_check_segref(xd, *segment_id, ALTREF_FRAME)) > 1)) {
-        {
-          cpi->count_mb_ref_frame_usage[mbmi->ref_frame]++;
-        }
-      }
-
-      // Count of last ref frame 0,0 usage
-      if ((mbmi->mode == ZEROMV) && (mbmi->ref_frame == LAST_FRAME))
-        cpi->inter_zz_count++;
-    }
-
-#if CONFIG_SUPERBLOCKS
-    if (xd->mode_info_context->mbmi.encoded_as_sb) {
-      x->src.y_buffer += 32;
-      x->src.u_buffer += 16;
-      x->src.v_buffer += 16;
-
-      x->gf_active_ptr      += 2;
-      x->partition_info     += 2;
-      xd->mode_info_context += 2;
-      xd->prev_mode_info_context += 2;
-
+    if (output_enabled) {
       (*tp)->Token = EOSB_TOKEN;
       (*tp)++;
-      if (mb_row < cm->mb_rows) cpi->tplist[mb_row].stop = *tp;
-      break;
+      if (mb_row < cm->mb_rows)
+        cpi->tplist[mb_row].stop = *tp;
     }
+  } else
+#endif
+  {
+    int i;
+
+    for (i = 0; i < 4; i++) {
+      const int x_idx = i & 1, y_idx = i >> 1;
+
+      if ((mb_row + y_idx >= cm->mb_rows) || (mb_col + x_idx >= cm->mb_cols)) {
+        // MB lies outside frame, move on
+        continue;
+      }
+
+      set_offsets(cpi, mb_row + y_idx, mb_col + x_idx, 16,
+                  &recon_yoffset, &recon_uvoffset);
+      xd->mb_index = i;
+      update_state(cpi, x, &x->mb_context[xd->sb_index][i], 16, output_enabled);
+
+#if !CONFIG_SUPERBLOCKS
+      // Copy current MB to a work buffer
+      vp9_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
 #endif
 
-    // Next MB
-    mb_row += dy;
-    mb_col += dx;
+      if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
+        vp9_activity_masking(cpi, x);
 
-    x->src.y_buffer += 16 * (dx + dy * x->src.y_stride);
-    x->src.u_buffer += 8  * (dx + dy * x->src.uv_stride);
-    x->src.v_buffer += 8  * (dx + dy * x->src.uv_stride);
+      vp9_intra_prediction_down_copy(xd);
 
-    x->gf_active_ptr      += offset_unextended;
-    x->partition_info     += offset_extended;
-    xd->mode_info_context += offset_extended;
-    xd->prev_mode_info_context += offset_extended;
+      encode_macroblock(cpi, tp, recon_yoffset, recon_uvoffset,
+                        output_enabled, mb_row + y_idx, mb_col + x_idx);
+      if (output_enabled)
+        update_stats(cpi);
 
-#if CONFIG_DEBUG
-    assert((xd->prev_mode_info_context - cpi->common.prev_mip) ==
-           (xd->mode_info_context - cpi->common.mip));
-#endif
-    (*tp)->Token = EOSB_TOKEN;
-    (*tp)++;
-    if (mb_row < cm->mb_rows) cpi->tplist[mb_row].stop = *tp;
+      if (output_enabled) {
+        (*tp)->Token = EOSB_TOKEN;
+        (*tp)++;
+        if (mb_row + y_idx < cm->mb_rows)
+          cpi->tplist[mb_row + y_idx].stop = *tp;
+      }
+    }
   }
 
   // debug output
@@ -1240,14 +1079,54 @@
 #endif
 }
 
-static
-void encode_sb_row(VP9_COMP *cpi,
-                   VP9_COMMON *cm,
-                   int mb_row,
-                   MACROBLOCK  *x,
-                   MACROBLOCKD *xd,
-                   TOKENEXTRA **tp,
-                   int *totalrate) {
+#if CONFIG_SUPERBLOCKS && CONFIG_SUPERBLOCKS64
+static void encode_sb64(VP9_COMP *cpi,
+                        VP9_COMMON *cm,
+                        int mb_row,
+                        int mb_col,
+                        MACROBLOCK  *x,
+                        MACROBLOCKD *xd,
+                        TOKENEXTRA **tp, int is_sb[4]) {
+  cpi->sb64_count[is_sb[0] == 2]++;
+  if (is_sb[0] == 2) {
+    int recon_yoffset, recon_uvoffset;
+
+    set_offsets(cpi, mb_row, mb_col, 64, &recon_yoffset, &recon_uvoffset);
+    update_state(cpi, x, &x->sb64_context, 64, 1);
+    encode_superblock64(cpi, tp, recon_yoffset, recon_uvoffset,
+                        1, mb_row, mb_col);
+    update_stats(cpi);
+
+    (*tp)->Token = EOSB_TOKEN;
+    (*tp)++;
+    if (mb_row < cm->mb_rows)
+      cpi->tplist[mb_row].stop = *tp;
+  } else {
+    int i;
+
+    for (i = 0; i < 4; i++) {
+      const int x_idx = i & 1, y_idx = i >> 1;
+
+      if (mb_row + y_idx * 2 >= cm->mb_rows ||
+          mb_col + x_idx * 2 >= cm->mb_cols) {
+        // 32x32 SB lies outside frame, move on
+        continue;
+      }
+      xd->sb_index = i;
+      encode_sb(cpi, cm, mb_row + 2 * y_idx, mb_col + 2 * x_idx, 1, x, xd, tp,
+                is_sb[i]);
+    }
+  }
+}
+#endif // CONFIG_SUPERBLOCKS && CONFIG_SUPERBLOCKS64
+
+static void encode_sb_row(VP9_COMP *cpi,
+                          VP9_COMMON *cm,
+                          int mb_row,
+                          MACROBLOCK  *x,
+                          MACROBLOCKD *xd,
+                          TOKENEXTRA **tp,
+                          int *totalrate) {
   int mb_col;
   int mb_cols = cm->mb_cols;
 
@@ -1255,105 +1134,103 @@
   vpx_memset(cm->left_context, 0, sizeof(cm->left_context));
 
   // Code each SB in the row
-  for (mb_col = 0; mb_col < mb_cols; mb_col += 2) {
-    int mb_rate = 0, mb_dist = 0;
+  for (mb_col = 0; mb_col < mb_cols; mb_col += 4) {
+    int i;
+    int sb32_rate = 0, sb32_dist = 0;
+    int is_sb[4];
+#if CONFIG_SUPERBLOCKS && CONFIG_SUPERBLOCKS64
+    int sb64_rate = INT_MAX, sb64_dist;
+    ENTROPY_CONTEXT_PLANES l[4], a[4];
+    TOKENEXTRA *tp_orig = *tp;
+
+    memcpy(&a, cm->above_context + mb_col, sizeof(a));
+    memcpy(&l, cm->left_context, sizeof(l));
+#endif  // CONFIG_SUPERBLOCKS && CONFIG_SUPERBLOCKS64
+    for (i = 0; i < 4; i++) {
+      const int x_idx = (i & 1) << 1, y_idx = i & 2;
+      int mb_rate = 0, mb_dist = 0;
 #if CONFIG_SUPERBLOCKS
-    int sb_rate = INT_MAX, sb_dist;
+      int sb_rate = INT_MAX, sb_dist;
 #endif
 
-#if CONFIG_DEBUG
-    MODE_INFO *mic = xd->mode_info_context;
-    PARTITION_INFO *pi = x->partition_info;
-    signed char  *gfa = x->gf_active_ptr;
-    uint8_t *yb = x->src.y_buffer;
-    uint8_t *ub = x->src.u_buffer;
-    uint8_t *vb = x->src.v_buffer;
+      if (mb_row + y_idx >= cm->mb_rows || mb_col + x_idx >= cm->mb_cols)
+        continue;
+
+      xd->sb_index = i;
+
+      pick_mb_modes(cpi, cm, mb_row + y_idx, mb_col + x_idx,
+                    x, xd, tp, &mb_rate, &mb_dist);
+#if CONFIG_SUPERBLOCKS
+      mb_rate += vp9_cost_bit(cm->sb32_coded, 0);
 #endif
 
 #if CONFIG_SUPERBLOCKS
-    // Pick modes assuming the SB is coded as 4 independent MBs
-    xd->mode_info_context->mbmi.encoded_as_sb = 0;
+      if (!(((    mb_cols & 1) && mb_col + x_idx ==     mb_cols - 1) ||
+            ((cm->mb_rows & 1) && mb_row + y_idx == cm->mb_rows - 1))) {
+        /* Pick a mode assuming that it applies to all 4 of the MBs in the SB */
+        pick_sb_modes(cpi, cm, mb_row + y_idx, mb_col + x_idx,
+                      x, xd, tp, &sb_rate, &sb_dist);
+        sb_rate += vp9_cost_bit(cm->sb32_coded, 1);
+      }
+
+      /* Decide whether to encode as a SB or 4xMBs */
+      if (sb_rate < INT_MAX &&
+          RDCOST(x->rdmult, x->rddiv, sb_rate, sb_dist) <
+              RDCOST(x->rdmult, x->rddiv, mb_rate, mb_dist)) {
+        is_sb[i] = 1;
+        sb32_rate += sb_rate;
+        sb32_dist += sb_dist;
+      } else
 #endif
-    pick_mb_modes(cpi, cm, mb_row, mb_col, x, xd, tp, &mb_rate, &mb_dist);
+      {
 #if CONFIG_SUPERBLOCKS
-    mb_rate += vp9_cost_bit(cm->sb_coded, 0);
+        is_sb[i] = 0;
 #endif
+        sb32_rate += mb_rate;
+        sb32_dist += mb_dist;
+      }
 
-    x->src.y_buffer -= 32;
-    x->src.u_buffer -= 16;
-    x->src.v_buffer -= 16;
+      /* Encode SB using best computed mode(s) */
+      // FIXME(rbultje): there really shouldn't be any need to encode_mb/sb
+      // for each level that we go up, we can just keep tokens and recon
+      // pixels of the lower level; also, inverting SB/MB order (big->small
+      // instead of small->big) means we can use the big block's RD cost as a
+      // threshold for the small ones, which may enable breakouts (i.e. faster)
+      encode_sb(cpi, cm, mb_row + y_idx, mb_col + x_idx,
+                !(CONFIG_SUPERBLOCKS && CONFIG_SUPERBLOCKS64),
+                x, xd, tp, is_sb[i]);
+    }
 
-    x->gf_active_ptr -= 2;
-    x->partition_info -= 2;
-    xd->mode_info_context -= 2;
-    xd->prev_mode_info_context -= 2;
+#if CONFIG_SUPERBLOCKS && CONFIG_SUPERBLOCKS64
+    memcpy(cm->above_context + mb_col, &a, sizeof(a));
+    memcpy(cm->left_context, &l, sizeof(l));
+    sb32_rate += vp9_cost_bit(cm->sb64_coded, 0);
 
-#if CONFIG_DEBUG
-    assert(x->gf_active_ptr == gfa);
-    assert(x->partition_info == pi);
-    assert(xd->mode_info_context == mic);
-    assert(x->src.y_buffer == yb);
-    assert(x->src.u_buffer == ub);
-    assert(x->src.v_buffer == vb);
-#endif
-
-#if CONFIG_SUPERBLOCKS
-    if (!(((    mb_cols & 1) && mb_col ==     mb_cols - 1) ||
-          ((cm->mb_rows & 1) && mb_row == cm->mb_rows - 1))) {
-      /* Pick a mode assuming that it applies to all 4 of the MBs in the SB */
-      xd->mode_info_context->mbmi.encoded_as_sb = 1;
-      pick_sb_modes(cpi, cm, mb_row, mb_col, x, xd, tp, &sb_rate, &sb_dist);
-      sb_rate += vp9_cost_bit(cm->sb_coded, 1);
+    if (!(((    mb_cols & 3) && mb_col + 3 >=     mb_cols) ||
+          ((cm->mb_rows & 3) && mb_row + 3 >= cm->mb_rows))) {
+      pick_sb64_modes(cpi, cm, mb_row, mb_col,
+                      x, xd, tp, &sb64_rate, &sb64_dist);
+      sb64_rate += vp9_cost_bit(cm->sb64_coded, 1);
     }
 
     /* Decide whether to encode as a SB or 4xMBs */
-    if (sb_rate < INT_MAX &&
-        RDCOST(x->rdmult, x->rddiv, sb_rate, sb_dist) <
-          RDCOST(x->rdmult, x->rddiv, mb_rate, mb_dist)) {
-      xd->mode_info_context->mbmi.encoded_as_sb = 1;
-      xd->mode_info_context[1].mbmi.encoded_as_sb = 1;
-      xd->mode_info_context[cm->mode_info_stride].mbmi.encoded_as_sb = 1;
-      xd->mode_info_context[1 + cm->mode_info_stride].mbmi.encoded_as_sb = 1;
-      *totalrate += sb_rate;
+    if (sb64_rate < INT_MAX &&
+        RDCOST(x->rdmult, x->rddiv, sb64_rate, sb64_dist) <
+            RDCOST(x->rdmult, x->rddiv, sb32_rate, sb32_dist)) {
+      is_sb[0] = 2;
+      *totalrate += sb64_rate;
     } else
 #endif
     {
-#if CONFIG_SUPERBLOCKS
-      xd->mode_info_context->mbmi.encoded_as_sb = 0;
-      if (cm->mb_cols - 1 > mb_col)
-        xd->mode_info_context[1].mbmi.encoded_as_sb = 0;
-      if (cm->mb_rows - 1 > mb_row) {
-        xd->mode_info_context[cm->mode_info_stride].mbmi.encoded_as_sb = 0;
-        if (cm->mb_cols - 1 > mb_col)
-          xd->mode_info_context[1 + cm->mode_info_stride].mbmi.encoded_as_sb = 0;
-      }
-#endif
-      *totalrate += mb_rate;
+      *totalrate += sb32_rate;
     }
 
-    /* Encode SB using best computed mode(s) */
-    encode_sb(cpi, cm, mb_row, mb_col, x, xd, tp);
-
-#if CONFIG_DEBUG
-    assert(x->gf_active_ptr == gfa + 2);
-    assert(x->partition_info == pi + 2);
-    assert(xd->mode_info_context == mic + 2);
-    assert(x->src.y_buffer == yb + 32);
-    assert(x->src.u_buffer == ub + 16);
-    assert(x->src.v_buffer == vb + 16);
-#endif
+#if CONFIG_SUPERBLOCKS && CONFIG_SUPERBLOCKS64
+    assert(tp_orig == *tp);
+    encode_sb64(cpi, cm, mb_row, mb_col, x, xd, tp, is_sb);
+    assert(tp_orig < *tp);
+#endif  // CONFIG_SUPERBLOCKS && CONFIG_SUPERBLOCKS64
   }
-
-  // this is to account for the border
-  x->gf_active_ptr += mb_cols - (mb_cols & 0x1);
-  x->partition_info += xd->mode_info_stride + 1 - (mb_cols & 0x1);
-  xd->mode_info_context += xd->mode_info_stride + 1 - (mb_cols & 0x1);
-  xd->prev_mode_info_context += xd->mode_info_stride + 1 - (mb_cols & 0x1);
-
-#if CONFIG_DEBUG
-  assert((xd->prev_mode_info_context - cpi->common.prev_mip) ==
-         (xd->mode_info_context - cpi->common.mip));
-#endif
 }
 
 static void init_encode_frame_mb_context(VP9_COMP *cpi) {
@@ -1361,22 +1238,11 @@
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
 
-  // GF active flags data structure
-  x->gf_active_ptr = (signed char *)cpi->gf_active_flags;
-
-  // Activity map pointer
-  x->mb_activity_ptr = cpi->mb_activity_map;
-
   x->act_zbin_adj = 0;
   cpi->seg0_idx = 0;
   vpx_memset(cpi->ref_pred_count, 0, sizeof(cpi->ref_pred_count));
 
-  x->partition_info = x->pi;
-
-  xd->mode_info_context = cm->mi;
   xd->mode_info_stride = cm->mode_info_stride;
-  xd->prev_mode_info_context = cm->prev_mi;
-
   xd->frame_type = cm->frame_type;
 
   xd->frames_since_golden = cm->frames_since_golden;
@@ -1387,7 +1253,7 @@
     vp9_init_mbmode_probs(cm);
 
   // Copy data over into macro block data structures.
-  x->src = * cpi->Source;
+  x->src = *cpi->Source;
   xd->pre = cm->yv12_fb[cm->lst_fb_idx];
   xd->dst = cm->yv12_fb[cm->new_fb_idx];
 
@@ -1413,8 +1279,11 @@
   vp9_zero(cpi->common.fc.mv_ref_ct)
 #if CONFIG_SUPERBLOCKS
   vp9_zero(cpi->sb_ymode_count)
-  cpi->sb_count = 0;
-#endif
+  vp9_zero(cpi->sb32_count);
+#if CONFIG_SUPERBLOCKS64
+  vp9_zero(cpi->sb64_count);
+#endif  // CONFIG_SUPERBLOCKS64
+#endif  // CONFIG_SUPERBLOCKS
 #if CONFIG_COMP_INTERINTRA_PRED
   vp9_zero(cpi->interintra_count);
   vp9_zero(cpi->interintra_select_count);
@@ -1527,15 +1396,8 @@
 
     {
       // For each row of SBs in the frame
-      for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 2) {
-        int offset = (cm->mb_cols + 1) & ~0x1;
-
+      for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 4) {
         encode_sb_row(cpi, cm, mb_row, x, xd, &tp, &totalrate);
-
-        // adjust to the next row of SBs
-        x->src.y_buffer += 32 * x->src.y_stride - 16 * offset;
-        x->src.u_buffer += 16 * x->src.uv_stride - 8 * offset;
-        x->src.v_buffer += 16 * x->src.uv_stride - 8 * offset;
       }
 
       cpi->tok_count = (unsigned int)(tp - cpi->tok);
@@ -1580,78 +1442,150 @@
   }
 }
 
-static void reset_skip_txfm_size(VP9_COMP *cpi, TX_SIZE txfm_max) {
-  VP9_COMMON *cm = &cpi->common;
-  int mb_row, mb_col, mis = cm->mode_info_stride, segment_id;
-  MODE_INFO *mi, *mi_ptr = cm->mi;
-#if CONFIG_SUPERBLOCKS
-  int skip;
-  MODE_INFO *sb_mi_ptr = cm->mi, *sb_mi;
-  MB_MODE_INFO *sb_mbmi;
-#endif
-  MB_MODE_INFO *mbmi;
-  MACROBLOCK *x = &cpi->mb;
-  MACROBLOCKD *xd = &x->e_mbd;
+static void reset_skip_txfm_size_mb(VP9_COMP *cpi,
+                                    MODE_INFO *mi, TX_SIZE txfm_max) {
+  MB_MODE_INFO *const mbmi = &mi->mbmi;
 
-  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++, mi_ptr += mis) {
-    mi = mi_ptr;
+  if (mbmi->txfm_size > txfm_max) {
+    VP9_COMMON *const cm = &cpi->common;
+    MACROBLOCK *const x = &cpi->mb;
+    MACROBLOCKD *const xd = &x->e_mbd;
+    const int segment_id = mbmi->segment_id;
+
+    xd->mode_info_context = mi;
+    assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
+            vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0) ||
+           (cm->mb_no_coeff_skip && mbmi->mb_skip_coeff));
+    mbmi->txfm_size = txfm_max;
+  }
+}
+
 #if CONFIG_SUPERBLOCKS
-    sb_mi = sb_mi_ptr;
-#endif
-    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++, mi++) {
-      mbmi = &mi->mbmi;
-#if CONFIG_SUPERBLOCKS
-      sb_mbmi = &sb_mi->mbmi;
-#endif
-      if (mbmi->txfm_size > txfm_max) {
-#if CONFIG_SUPERBLOCKS
-        if (sb_mbmi->encoded_as_sb) {
-          if (!((mb_col & 1) || (mb_row & 1))) {
-            segment_id = mbmi->segment_id;
-            skip = mbmi->mb_skip_coeff;
-            if (mb_col < cm->mb_cols - 1) {
-              segment_id = segment_id && mi[1].mbmi.segment_id;
-              skip = skip && mi[1].mbmi.mb_skip_coeff;
-            }
-            if (mb_row < cm->mb_rows - 1) {
-              segment_id = segment_id &&
-                           mi[cm->mode_info_stride].mbmi.segment_id;
-              skip = skip && mi[cm->mode_info_stride].mbmi.mb_skip_coeff;
-              if (mb_col < cm->mb_cols - 1) {
-                segment_id = segment_id &&
-                             mi[cm->mode_info_stride + 1].mbmi.segment_id;
-                skip = skip && mi[cm->mode_info_stride + 1].mbmi.mb_skip_coeff;
-              }
-            }
-            xd->mode_info_context = mi;
-            assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
-                    vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0) ||
-                   (cm->mb_no_coeff_skip && skip));
-            mbmi->txfm_size = txfm_max;
-          } else {
-            mbmi->txfm_size = sb_mbmi->txfm_size;
-          }
-        } else {
-#endif
-          segment_id = mbmi->segment_id;
-          xd->mode_info_context = mi;
-          assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
-                  vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0) ||
-                 (cm->mb_no_coeff_skip && mbmi->mb_skip_coeff));
-          mbmi->txfm_size = txfm_max;
-#if CONFIG_SUPERBLOCKS
-        }
-#endif
-      }
-#if CONFIG_SUPERBLOCKS
-      if (mb_col & 1)
-        sb_mi += 2;
-#endif
+static int get_skip_flag(MODE_INFO *mi, int mis, int ymbs, int xmbs) {
+  int x, y;
+
+  for (y = 0; y < ymbs; y++) {
+    for (x = 0; x < xmbs; x++) {
+      if (!mi[y * mis + x].mbmi.mb_skip_coeff)
+        return 0;
     }
-#if CONFIG_SUPERBLOCKS
-    if (mb_row & 1)
-      sb_mi_ptr += 2 * mis;
+  }
+
+  return 1;
+}
+
+static void set_txfm_flag(MODE_INFO *mi, int mis, int ymbs, int xmbs,
+                          TX_SIZE txfm_size) {
+  int x, y;
+
+  for (y = 0; y < ymbs; y++) {
+    for (x = 0; x < xmbs; x++) {
+      mi[y * mis + x].mbmi.txfm_size = txfm_size;
+    }
+  }
+}
+
+static void reset_skip_txfm_size_sb32(VP9_COMP *cpi, MODE_INFO *mi,
+                                      int mis, TX_SIZE txfm_max,
+                                      int mb_rows_left, int mb_cols_left) {
+  MB_MODE_INFO *const mbmi = &mi->mbmi;
+
+  if (mbmi->txfm_size > txfm_max) {
+    VP9_COMMON *const cm = &cpi->common;
+    MACROBLOCK *const x = &cpi->mb;
+    MACROBLOCKD *const xd = &x->e_mbd;
+    const int segment_id = mbmi->segment_id;
+    const int ymbs = MIN(2, mb_rows_left);
+    const int xmbs = MIN(2, mb_cols_left);
+
+    xd->mode_info_context = mi;
+    assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
+            vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0) ||
+           (cm->mb_no_coeff_skip && get_skip_flag(mi, mis, ymbs, xmbs)));
+    set_txfm_flag(mi, mis, ymbs, xmbs, txfm_max);
+  }
+}
+
+#if CONFIG_SUPERBLOCKS64
+static void reset_skip_txfm_size_sb64(VP9_COMP *cpi, MODE_INFO *mi,
+                                      int mis, TX_SIZE txfm_max,
+                                      int mb_rows_left, int mb_cols_left) {
+  MB_MODE_INFO *const mbmi = &mi->mbmi;
+
+  if (mbmi->txfm_size > txfm_max) {
+    VP9_COMMON *const cm = &cpi->common;
+    MACROBLOCK *const x = &cpi->mb;
+    MACROBLOCKD *const xd = &x->e_mbd;
+    const int segment_id = mbmi->segment_id;
+    const int ymbs = MIN(4, mb_rows_left);
+    const int xmbs = MIN(4, mb_cols_left);
+
+    xd->mode_info_context = mi;
+    assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
+            vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0) ||
+           (cm->mb_no_coeff_skip && get_skip_flag(mi, mis, ymbs, xmbs)));
+    set_txfm_flag(mi, mis, ymbs, xmbs, txfm_max);
+  }
+}
 #endif
+#endif
+
+static void reset_skip_txfm_size(VP9_COMP *cpi, TX_SIZE txfm_max) {
+  VP9_COMMON *const cm = &cpi->common;
+  int mb_row, mb_col;
+  const int mis = cm->mode_info_stride;
+  MODE_INFO *mi, *mi_ptr = cm->mi;
+
+  for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 4, mi_ptr += 4 * mis) {
+    mi = mi_ptr;
+    for (mb_col = 0; mb_col < cm->mb_cols; mb_col += 4, mi += 4) {
+#if CONFIG_SUPERBLOCKS && CONFIG_SUPERBLOCKS64
+      if (mi->mbmi.sb_type == BLOCK_SIZE_SB64X64) {
+        reset_skip_txfm_size_sb64(cpi, mi, mis, txfm_max,
+                                  cm->mb_rows - mb_row, cm->mb_cols - mb_col);
+      } else
+#endif  // CONFIG_SUPERBLOCKS && CONFIG_SUPERBLOCKS64
+      {
+        int i;
+
+        for (i = 0; i < 4; i++) {
+          const int x_idx_sb = (i & 1) << 1, y_idx_sb = i & 2;
+#if CONFIG_SUPERBLOCKS
+          MODE_INFO *sb_mi = mi + y_idx_sb * mis + x_idx_sb;
+#endif
+
+          if (mb_row + y_idx_sb >= cm->mb_rows ||
+              mb_col + x_idx_sb >= cm->mb_cols)
+            continue;
+
+#if CONFIG_SUPERBLOCKS
+          if (sb_mi->mbmi.sb_type) {
+            reset_skip_txfm_size_sb32(cpi, sb_mi, mis, txfm_max,
+                                      cm->mb_rows - mb_row - y_idx_sb,
+                                      cm->mb_cols - mb_col - x_idx_sb);
+          } else
+#endif
+          {
+            int m;
+
+            for (m = 0; m < 4; m++) {
+              const int x_idx = x_idx_sb + (m & 1), y_idx = y_idx_sb + (m >> 1);
+              MODE_INFO *mb_mi;
+
+              if (mb_col + x_idx >= cm->mb_cols ||
+                  mb_row + y_idx >= cm->mb_rows)
+                continue;
+
+              mb_mi = mi + y_idx * mis + x_idx;
+#if CONFIG_SUPERBLOCKS
+              assert(mb_mi->mbmi.sb_type == BLOCK_SIZE_MB16X16);
+#endif
+              reset_skip_txfm_size_mb(cpi, mb_mi, txfm_max);
+            }
+          }
+        }
+      }
+    }
   }
 }
 
@@ -1961,7 +1895,7 @@
 #endif
 
 #if CONFIG_SUPERBLOCKS
-  if (xd->mode_info_context->mbmi.encoded_as_sb) {
+  if (xd->mode_info_context->mbmi.sb_type) {
     ++cpi->sb_ymode_count[m];
   } else
 #endif
@@ -2014,9 +1948,9 @@
                                        ENTROPY_CONTEXT_PLANES tl[4],
                                        TOKENEXTRA *t[4],
                                        TOKENEXTRA **tp,
-                                       int skip[4])
+                                       int skip[4], int output_enabled)
 {
-  TOKENEXTRA tokens[4][16 * 24];
+  TOKENEXTRA tokens[4][16 * 25];
   int n_tokens[4], n;
 
   // if there were no skips, we don't need to do anything
@@ -2056,7 +1990,7 @@
     if (skip[n]) {
       x->e_mbd.above_context = &ta[n];
       x->e_mbd.left_context  = &tl[n];
-      vp9_stuff_mb(cpi, &x->e_mbd, tp, 0);
+      vp9_stuff_mb(cpi, &x->e_mbd, tp, !output_enabled);
     } else {
       if (n_tokens[n]) {
         memcpy(*tp, tokens[n], sizeof(*t[0]) * n_tokens[n]);
@@ -2065,22 +1999,135 @@
     }
   }
 }
+
+#if CONFIG_SUPERBLOCKS64
+static void update_sb64_skip_coeff_state(VP9_COMP *cpi,
+                                         MACROBLOCK *x,
+                                         ENTROPY_CONTEXT_PLANES ta[16],
+                                         ENTROPY_CONTEXT_PLANES tl[16],
+                                         TOKENEXTRA *t[16],
+                                         TOKENEXTRA **tp,
+                                         int skip[16], int output_enabled) {
+  if (x->e_mbd.mode_info_context->mbmi.txfm_size == TX_32X32) {
+    TOKENEXTRA tokens[4][1024+512];
+    int n_tokens[4], n;
+
+    // if there were no skips, we don't need to do anything
+    if (!skip[0] && !skip[1] && !skip[2] && !skip[3])
+      return;
+
+    // if we don't do coeff skipping for this frame, we don't
+    // need to do anything here
+    if (!cpi->common.mb_no_coeff_skip)
+      return;
+
+    // if all 4 32x32 blocks skipped coeff coding, nothing to be done
+    if (skip[0] && skip[1] && skip[2] && skip[3])
+      return;
+
+    // so the situation now is that we want to skip coeffs
+    // for some MBs, but not all, and we didn't code EOB
+    // coefficients for them. However, the skip flag for this
+    // SB will be 0 overall, so we need to insert EOBs in the
+    // middle of the token tree. Do so here.
+    for (n = 0; n < 4; n++) {
+      if (n < 3) {
+        n_tokens[n] = t[n + 1] - t[n];
+      } else {
+        n_tokens[n] = *tp - t[3];
+      }
+      if (n_tokens[n]) {
+        memcpy(tokens[n], t[n], n_tokens[n] * sizeof(*t[0]));
+      }
+    }
+
+    // reset pointer, stuff EOBs where necessary
+    *tp = t[0];
+    for (n = 0; n < 4; n++) {
+      if (skip[n]) {
+        x->e_mbd.above_context = &ta[n * 2];
+        x->e_mbd.left_context  = &tl[n * 2];
+        vp9_stuff_sb(cpi, &x->e_mbd, tp, !output_enabled);
+      } else {
+        if (n_tokens[n]) {
+          memcpy(*tp, tokens[n], sizeof(*t[0]) * n_tokens[n]);
+        }
+        (*tp) += n_tokens[n];
+      }
+    }
+  } else {
+    TOKENEXTRA tokens[16][16 * 25];
+    int n_tokens[16], n;
+
+    // if there were no skips, we don't need to do anything
+    if (!skip[ 0] && !skip[ 1] && !skip[ 2] && !skip[ 3] &&
+        !skip[ 4] && !skip[ 5] && !skip[ 6] && !skip[ 7] &&
+        !skip[ 8] && !skip[ 9] && !skip[10] && !skip[11] &&
+        !skip[12] && !skip[13] && !skip[14] && !skip[15])
+      return;
+
+    // if we don't do coeff skipping for this frame, we don't
+    // need to do anything here
+    if (!cpi->common.mb_no_coeff_skip)
+      return;
+
+    // if all 16 MBs skipped coeff coding, nothing to be done
+    if (skip[ 0] && skip[ 1] && skip[ 2] && skip[ 3] &&
+        skip[ 4] && skip[ 5] && skip[ 6] && skip[ 7] &&
+        skip[ 8] && skip[ 9] && skip[10] && skip[11] &&
+        skip[12] && skip[13] && skip[14] && skip[15])
+      return;
+
+    // so the situation now is that we want to skip coeffs
+    // for some MBs, but not all, and we didn't code EOB
+    // coefficients for them. However, the skip flag for this
+    // SB will be 0 overall, so we need to insert EOBs in the
+    // middle of the token tree. Do so here.
+    for (n = 0; n < 16; n++) {
+      if (n < 15) {
+        n_tokens[n] = t[n + 1] - t[n];
+      } else {
+        n_tokens[n] = *tp - t[15];
+      }
+      if (n_tokens[n]) {
+        memcpy(tokens[n], t[n], n_tokens[n] * sizeof(*t[0]));
+      }
+    }
+
+    // reset pointer, stuff EOBs where necessary
+    *tp = t[0];
+    for (n = 0; n < 16; n++) {
+      if (skip[n]) {
+        x->e_mbd.above_context = &ta[n];
+        x->e_mbd.left_context  = &tl[n];
+        vp9_stuff_mb(cpi, &x->e_mbd, tp, !output_enabled);
+      } else {
+        if (n_tokens[n]) {
+          memcpy(*tp, tokens[n], sizeof(*t[0]) * n_tokens[n]);
+        }
+        (*tp) += n_tokens[n];
+      }
+    }
+  }
+}
+#endif  // CONFIG_SUPERBLOCKS64
 #endif /* CONFIG_SUPERBLOCKS */
 
-static void encode_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
-                              TOKENEXTRA **t, int recon_yoffset,
-                              int recon_uvoffset, int output_enabled,
-                              int mb_col, int mb_row) {
-  VP9_COMMON *cm = &cpi->common;
+static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t,
+                              int recon_yoffset, int recon_uvoffset,
+                              int output_enabled,
+                              int mb_row, int mb_col) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
+  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
   unsigned char *segment_id = &mbmi->segment_id;
   int seg_ref_active;
   unsigned char ref_pred_flag;
 
   x->skip = 0;
 #if CONFIG_SUPERBLOCKS
-  assert(!xd->mode_info_context->mbmi.encoded_as_sb);
+  assert(!xd->mode_info_context->mbmi.sb_type);
 #endif
 
 #ifdef ENC_DEBUG
@@ -2332,10 +2379,11 @@
 }
 
 #if CONFIG_SUPERBLOCKS
-static void encode_superblock(VP9_COMP *cpi, MACROBLOCK *x,
-                              TOKENEXTRA **t, int recon_yoffset,
-                              int recon_uvoffset, int mb_col, int mb_row) {
+static void encode_superblock32(VP9_COMP *cpi, TOKENEXTRA **t,
+                                int recon_yoffset, int recon_uvoffset,
+                                int output_enabled, int mb_row, int mb_col) {
   VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   const uint8_t *src = x->src.y_buffer;
   uint8_t *dst = xd->dst.y_buffer;
@@ -2403,7 +2451,8 @@
   if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
     vp9_build_intra_predictors_sby_s(&x->e_mbd);
     vp9_build_intra_predictors_sbuv_s(&x->e_mbd);
-    sum_intra_stats(cpi, x);
+    if (output_enabled)
+      sum_intra_stats(cpi, x);
   } else {
     int ref_fb_idx;
 
@@ -2461,7 +2510,7 @@
     vp9_recon_sbuv_s_c(&x->e_mbd, udst, vdst);
 
     if (!x->skip) {
-      vp9_tokenize_sb(cpi, &x->e_mbd, t, 0);
+      vp9_tokenize_sb(cpi, &x->e_mbd, t, !output_enabled);
     } else {
       int mb_skip_context =
           cpi->common.mb_no_coeff_skip ?
@@ -2470,11 +2519,13 @@
           0;
       mi->mbmi.mb_skip_coeff = 1;
       if (cm->mb_no_coeff_skip) {
-        cpi->skip_true_count[mb_skip_context]++;
+        if (output_enabled)
+          cpi->skip_true_count[mb_skip_context]++;
         vp9_fix_contexts_sb(xd);
       } else {
-        vp9_stuff_sb(cpi, xd, t, 0);
-        cpi->skip_false_count[mb_skip_context]++;
+        vp9_stuff_sb(cpi, xd, t, !output_enabled);
+        if (output_enabled)
+          cpi->skip_false_count[mb_skip_context]++;
       }
     }
 
@@ -2493,7 +2544,7 @@
   for (n = 0; n < 4; n++) {
     int x_idx = n & 1, y_idx = n >> 1;
 
-    xd->left_context = cm->left_context + y_idx;
+    xd->left_context = cm->left_context + y_idx + (mb_row & 2);
     xd->above_context = cm->above_context + mb_col + x_idx;
     memcpy(&ta[n], xd->above_context, sizeof(ta[n]));
     memcpy(&tl[n], xd->left_context, sizeof(tl[n]));
@@ -2520,7 +2571,7 @@
                        vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride);
 
     if (!x->skip) {
-      vp9_tokenize_mb(cpi, &x->e_mbd, t, 0);
+      vp9_tokenize_mb(cpi, &x->e_mbd, t, !output_enabled);
       skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff;
     } else {
       int mb_skip_context =
@@ -2531,42 +2582,327 @@
       xd->mode_info_context->mbmi.mb_skip_coeff = skip[n] = 1;
       if (cpi->common.mb_no_coeff_skip) {
         // TODO(rbultje) this should be done per-sb instead of per-mb?
-        cpi->skip_true_count[mb_skip_context]++;
+        if (output_enabled)
+          cpi->skip_true_count[mb_skip_context]++;
         vp9_reset_mb_tokens_context(xd);
       } else {
-        vp9_stuff_mb(cpi, xd, t, 0);
+        vp9_stuff_mb(cpi, xd, t, !output_enabled);
         // TODO(rbultje) this should be done per-sb instead of per-mb?
-        cpi->skip_false_count[mb_skip_context]++;
+        if (output_enabled)
+          cpi->skip_false_count[mb_skip_context]++;
       }
     }
   }
 
   xd->mode_info_context = mi;
-  update_sb_skip_coeff_state(cpi, x, ta, tl, tp, t, skip);
+  update_sb_skip_coeff_state(cpi, x, ta, tl, tp, t, skip, output_enabled);
 #if CONFIG_TX32X32
   }
 #endif
-  if (cm->txfm_mode == TX_MODE_SELECT &&
-      !((cm->mb_no_coeff_skip && skip[0] && skip[1] && skip[2] && skip[3]) ||
-        (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
-         vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) {
-    cpi->txfm_count_32x32p[mi->mbmi.txfm_size]++;
+  if (output_enabled) {
+    if (cm->txfm_mode == TX_MODE_SELECT &&
+        !((cm->mb_no_coeff_skip && skip[0] && skip[1] && skip[2] && skip[3]) ||
+          (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
+           vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) {
+      cpi->txfm_count_32x32p[mi->mbmi.txfm_size]++;
+    } else {
+      TX_SIZE sz = (cm->txfm_mode == TX_MODE_SELECT) ?
+#if CONFIG_TX32X32
+                      TX_32X32 :
+#else
+                      TX_16X16 :
+#endif
+                      cm->txfm_mode;
+      mi->mbmi.txfm_size = sz;
+      if (mb_col < cm->mb_cols - 1)
+        mi[1].mbmi.txfm_size = sz;
+      if (mb_row < cm->mb_rows - 1) {
+        mi[mis].mbmi.txfm_size = sz;
+        if (mb_col < cm->mb_cols - 1)
+          mi[mis + 1].mbmi.txfm_size = sz;
+      }
+    }
+  }
+}
+
+#if CONFIG_SUPERBLOCKS64
+static void encode_superblock64(VP9_COMP *cpi, TOKENEXTRA **t,
+                                int recon_yoffset, int recon_uvoffset,
+                                int output_enabled, int mb_row, int mb_col) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const uint8_t *src = x->src.y_buffer;
+  uint8_t *dst = xd->dst.y_buffer;
+  const uint8_t *usrc = x->src.u_buffer;
+  uint8_t *udst = xd->dst.u_buffer;
+  const uint8_t *vsrc = x->src.v_buffer;
+  uint8_t *vdst = xd->dst.v_buffer;
+  int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
+  int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
+  int seg_ref_active;
+  unsigned char ref_pred_flag;
+  int n;
+  TOKENEXTRA *tp[16];
+  int skip[16];
+  MODE_INFO *mi = x->e_mbd.mode_info_context;
+  unsigned int segment_id = mi->mbmi.segment_id;
+  ENTROPY_CONTEXT_PLANES ta[16], tl[16];
+  const int mis = cm->mode_info_stride;
+
+  x->skip = 0;
+
+  if (cm->frame_type == KEY_FRAME) {
+    if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
+      adjust_act_zbin(cpi, x);
+      vp9_update_zbin_extra(cpi, x);
+    }
   } else {
-    TX_SIZE sz = (cm->txfm_mode == TX_MODE_SELECT) ?
+    vp9_setup_interp_filters(xd, xd->mode_info_context->mbmi.interp_filter, cm);
+
+    if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
+      // Adjust the zbin based on this MB rate.
+      adjust_act_zbin(cpi, x);
+    }
+
+    // Experimental code. Special case for gf and arf zeromv modes.
+    // Increase zbin size to suppress noise
+    cpi->zbin_mode_boost = 0;
+    if (cpi->zbin_mode_boost_enabled) {
+      if (xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME) {
+        if (xd->mode_info_context->mbmi.mode == ZEROMV) {
+          if (xd->mode_info_context->mbmi.ref_frame != LAST_FRAME)
+            cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
+          else
+            cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
+        } else if (xd->mode_info_context->mbmi.mode == SPLITMV) {
+          cpi->zbin_mode_boost = 0;
+        } else {
+          cpi->zbin_mode_boost = MV_ZBIN_BOOST;
+        }
+      }
+    }
+
+    vp9_update_zbin_extra(cpi, x);
+
+    seg_ref_active = vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME);
+
+    // SET VARIOUS PREDICTION FLAGS
+
+    // Did the chosen reference frame match its predicted value.
+    ref_pred_flag = ((xd->mode_info_context->mbmi.ref_frame ==
+                      vp9_get_pred_ref(cm, xd)));
+    vp9_set_pred_flag(xd, PRED_REF, ref_pred_flag);
+  }
+
+  if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
+    vp9_build_intra_predictors_sb64y_s(&x->e_mbd);
+    vp9_build_intra_predictors_sb64uv_s(&x->e_mbd);
+    if (output_enabled)
+      sum_intra_stats(cpi, x);
+  } else {
+    int ref_fb_idx;
+
+    assert(cm->frame_type != KEY_FRAME);
+
+    if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
+      ref_fb_idx = cpi->common.lst_fb_idx;
+    else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
+      ref_fb_idx = cpi->common.gld_fb_idx;
+    else
+      ref_fb_idx = cpi->common.alt_fb_idx;
+
+    xd->pre.y_buffer =
+        cpi->common.yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
+    xd->pre.u_buffer =
+        cpi->common.yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
+    xd->pre.v_buffer =
+        cpi->common.yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
+
+    if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
+      int second_ref_fb_idx;
+
+      if (xd->mode_info_context->mbmi.second_ref_frame == LAST_FRAME)
+        second_ref_fb_idx = cpi->common.lst_fb_idx;
+      else if (xd->mode_info_context->mbmi.second_ref_frame == GOLDEN_FRAME)
+        second_ref_fb_idx = cpi->common.gld_fb_idx;
+      else
+        second_ref_fb_idx = cpi->common.alt_fb_idx;
+
+      xd->second_pre.y_buffer =
+          cpi->common.yv12_fb[second_ref_fb_idx].y_buffer + recon_yoffset;
+      xd->second_pre.u_buffer =
+          cpi->common.yv12_fb[second_ref_fb_idx].u_buffer + recon_uvoffset;
+      xd->second_pre.v_buffer =
+          cpi->common.yv12_fb[second_ref_fb_idx].v_buffer + recon_uvoffset;
+    }
+
+    vp9_build_inter64x64_predictors_sb(xd, xd->dst.y_buffer,
+                                       xd->dst.u_buffer, xd->dst.v_buffer,
+                                       xd->dst.y_stride, xd->dst.uv_stride);
+  }
+
+#if CONFIG_TX32X32
+  if (xd->mode_info_context->mbmi.txfm_size == TX_32X32) {
+    int n;
+
+    for (n = 0; n < 4; n++) {
+      int x_idx = n & 1, y_idx = n >> 1;
+
+      xd->mode_info_context = mi + x_idx * 2 + mis * y_idx * 2;
+      xd->left_context = cm->left_context + (y_idx << 1);
+      xd->above_context = cm->above_context + mb_col + (x_idx << 1);
+      memcpy(&ta[n * 2], xd->above_context, sizeof(*ta) * 2);
+      memcpy(&tl[n * 2], xd->left_context, sizeof(*tl) * 2);
+      tp[n] = *t;
+      // mode_info_context already points at this 32x32 quadrant (set above)
+      vp9_subtract_sby_s_c(x->sb_coeff_data.src_diff,
+                           src + x_idx * 32 + y_idx * 32 * src_y_stride,
+                           src_y_stride,
+                           dst + x_idx * 32 + y_idx * 32 * dst_y_stride,
+                           dst_y_stride);
+      vp9_subtract_sbuv_s_c(x->sb_coeff_data.src_diff,
+                            usrc + x_idx * 16 + y_idx * 16 * src_uv_stride,
+                            vsrc + x_idx * 16 + y_idx * 16 * src_uv_stride,
+                            src_uv_stride,
+                            udst + x_idx * 16 + y_idx * 16 * dst_uv_stride,
+                            vdst + x_idx * 16 + y_idx * 16 * dst_uv_stride,
+                            dst_uv_stride);
+      vp9_transform_sby_32x32(x);
+      vp9_transform_sbuv_16x16(x);
+      vp9_quantize_sby_32x32(x);
+      vp9_quantize_sbuv_16x16(x);
+      // TODO(rbultje): trellis optimize
+      vp9_inverse_transform_sbuv_16x16(&x->e_mbd.sb_coeff_data);
+      vp9_inverse_transform_sby_32x32(&x->e_mbd.sb_coeff_data);
+      vp9_recon_sby_s_c(&x->e_mbd,
+                        dst + 32 * x_idx + 32 * y_idx * dst_y_stride,
+                        dst_y_stride);
+      vp9_recon_sbuv_s_c(&x->e_mbd,
+                         udst + x_idx * 16 + y_idx * 16 * dst_uv_stride,
+                         vdst + x_idx * 16 + y_idx * 16 * dst_uv_stride);
+
+      if (!x->skip) {
+        vp9_tokenize_sb(cpi, &x->e_mbd, t, !output_enabled);
+      } else {
+        int mb_skip_context = cpi->common.mb_no_coeff_skip ?
+                              (mi - 1)->mbmi.mb_skip_coeff +
+                                  (mi - mis)->mbmi.mb_skip_coeff : 0;
+        xd->mode_info_context->mbmi.mb_skip_coeff = 1;
+        if (cm->mb_no_coeff_skip) {
+          if (output_enabled)
+            cpi->skip_true_count[mb_skip_context]++;
+          vp9_fix_contexts_sb(xd);
+        } else {
+          vp9_stuff_sb(cpi, xd, t, !output_enabled);
+          if (output_enabled)
+            cpi->skip_false_count[mb_skip_context]++;
+        }
+      }
+
+      // copy skip flag on all mb_mode_info contexts in this SB
+      // if this was a skip at this txfm size
+      if (mb_col + x_idx * 2 < cm->mb_cols - 1)
+        mi[mis * y_idx * 2 + x_idx * 2 + 1].mbmi.mb_skip_coeff =
+            mi[mis * y_idx * 2 + x_idx * 2].mbmi.mb_skip_coeff;
+      if (mb_row + y_idx * 2 < cm->mb_rows - 1) {
+        mi[mis * y_idx * 2 + x_idx * 2 + mis].mbmi.mb_skip_coeff =
+            mi[mis * y_idx * 2 + x_idx * 2].mbmi.mb_skip_coeff;
+        if (mb_col + x_idx * 2 < cm->mb_cols - 1)
+          mi[mis * y_idx * 2 + x_idx * 2 + mis + 1].mbmi.mb_skip_coeff =
+              mi[mis * y_idx * 2 + x_idx * 2].mbmi.mb_skip_coeff;
+      }
+      skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff;
+    }
+  } else
+#endif
+  {
+    for (n = 0; n < 16; n++) {
+      const int x_idx = n & 3, y_idx = n >> 2;
+
+      xd->left_context = cm->left_context + y_idx;
+      xd->above_context = cm->above_context + mb_col + x_idx;
+      memcpy(&ta[n], xd->above_context, sizeof(ta[n]));
+      memcpy(&tl[n], xd->left_context, sizeof(tl[n]));
+      tp[n] = *t;
+      xd->mode_info_context = mi + x_idx + y_idx * mis;
+
+      vp9_subtract_mby_s_c(x->src_diff,
+                           src + x_idx * 16 + y_idx * 16 * src_y_stride,
+                           src_y_stride,
+                           dst + x_idx * 16 + y_idx * 16 * dst_y_stride,
+                           dst_y_stride);
+      vp9_subtract_mbuv_s_c(x->src_diff,
+                            usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+                            vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+                            src_uv_stride,
+                            udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                            vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                            dst_uv_stride);
+      vp9_fidct_mb(x);
+      vp9_recon_mby_s_c(&x->e_mbd,
+                        dst + x_idx * 16 + y_idx * 16 * dst_y_stride);
+      vp9_recon_mbuv_s_c(&x->e_mbd,
+                         udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                         vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride);
+
+      if (!x->skip) {
+        vp9_tokenize_mb(cpi, &x->e_mbd, t, !output_enabled);
+        skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff;
+      } else {
+        int mb_skip_context = cpi->common.mb_no_coeff_skip ?
+          (x->e_mbd.mode_info_context - 1)->mbmi.mb_skip_coeff +
+          (x->e_mbd.mode_info_context - mis)->mbmi.mb_skip_coeff : 0;
+        xd->mode_info_context->mbmi.mb_skip_coeff = skip[n] = 1;
+        if (cpi->common.mb_no_coeff_skip) {
+          // TODO(rbultje) this should be done per-sb instead of per-mb?
+          if (output_enabled)
+            cpi->skip_true_count[mb_skip_context]++;
+          vp9_reset_mb_tokens_context(xd);
+        } else {
+          vp9_stuff_mb(cpi, xd, t, !output_enabled);
+          // TODO(rbultje) this should be done per-sb instead of per-mb?
+          if (output_enabled)
+            cpi->skip_false_count[mb_skip_context]++;
+        }
+      }
+    }
+  }
+
+  xd->mode_info_context = mi;
+  update_sb64_skip_coeff_state(cpi, x, ta, tl, tp, t, skip, output_enabled);
+
+  if (output_enabled) {
+    if (cm->txfm_mode == TX_MODE_SELECT &&
+        !((cm->mb_no_coeff_skip &&
+           ((mi->mbmi.txfm_size == TX_32X32 &&
+             skip[0] && skip[1] && skip[2] && skip[3]) ||
+            (mi->mbmi.txfm_size != TX_32X32 &&
+             skip[0] && skip[1] && skip[2] && skip[3] &&
+             skip[4] && skip[5] && skip[6] && skip[7] &&
+             skip[8] && skip[9] && skip[10] && skip[11] &&
+             skip[12] && skip[13] && skip[14] && skip[15]))) ||
+          (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
+           vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) {
+      cpi->txfm_count_32x32p[mi->mbmi.txfm_size]++;
+    } else {
+      int x_idx, y_idx;
+      TX_SIZE sz = (cm->txfm_mode == TX_MODE_SELECT) ?
 #if CONFIG_TX32X32
                     TX_32X32 :
 #else
                     TX_16X16 :
 #endif
                     cm->txfm_mode;
-    mi->mbmi.txfm_size = sz;
-    if (mb_col < cm->mb_cols - 1)
-      mi[1].mbmi.txfm_size = sz;
-    if (mb_row < cm->mb_rows - 1) {
-      mi[mis].mbmi.txfm_size = sz;
-      if (mb_col < cm->mb_cols - 1)
-        mi[mis + 1].mbmi.txfm_size = sz;
+      for (y_idx = 0; y_idx < 4; y_idx++) {
+        for (x_idx = 0; x_idx < 4; x_idx++) {
+          if (mb_col + x_idx < cm->mb_cols && mb_row + y_idx < cm->mb_rows) {
+            mi[mis * y_idx + x_idx].mbmi.txfm_size = sz;
+          }
+        }
+      }
     }
   }
 }
+#endif  // CONFIG_SUPERBLOCKS64
 #endif
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 6bce1ad..38a2eab 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -17,6 +17,7 @@
 #include <limits.h>
 #include <math.h>
 #include "vp9/common/vp9_findnearmv.h"
+#include "vp9/common/vp9_common.h"
 
 #ifdef ENTROPY_STATS
 static int mv_ref_ct [31] [4] [2];
@@ -241,9 +242,6 @@
     },                                                                   \
     v = INT_MAX;)
 
-#define MIN(x,y) (((x)<(y))?(x):(y))
-#define MAX(x,y) (((x)>(y))?(x):(y))
-
 int vp9_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
                                              int_mv *bestmv, int_mv *ref_mv,
                                              int error_per_bit,
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 9b186c2..44f20ad 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -556,43 +556,19 @@
 }
 
 static void update_reference_segmentation_map(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-  int row, col, sb_rows = (cm->mb_rows + 1) >> 1, sb_cols = (cm->mb_cols + 1) >> 1;
-  MODE_INFO *mi = cm->mi;
-  uint8_t *segmap = cpi->segmentation_map;
-  uint8_t *segcache = cm->last_frame_seg_map;
+  VP9_COMMON *const cm = &cpi->common;
+  int row, col;
+  MODE_INFO *mi, *mi_ptr = cm->mi;
+  uint8_t *cache_ptr = cm->last_frame_seg_map, *cache;
 
-  for (row = 0; row < sb_rows; row++) {
-    for (col = 0; col < sb_cols; col++) {
-      MODE_INFO *miptr = mi + col * 2;
-      uint8_t *cache = segcache + col * 2;
-#if CONFIG_SUPERBLOCKS
-      if (miptr->mbmi.encoded_as_sb) {
-        cache[0] = miptr->mbmi.segment_id;
-        if (!(cm->mb_cols & 1) || col < sb_cols - 1)
-          cache[1] = miptr->mbmi.segment_id;
-        if (!(cm->mb_rows & 1) || row < sb_rows - 1) {
-          cache[cm->mb_cols] = miptr->mbmi.segment_id;
-          if (!(cm->mb_cols & 1) || col < sb_cols - 1)
-            cache[cm->mb_cols + 1] = miptr->mbmi.segment_id;
-        }
-      } else
-#endif
-      {
-        cache[0] = miptr[0].mbmi.segment_id;
-        if (!(cm->mb_cols & 1) || col < sb_cols - 1)
-          cache[1] = miptr[1].mbmi.segment_id;
-        if (!(cm->mb_rows & 1) || row < sb_rows - 1) {
-          cache[cm->mb_cols] = miptr[cm->mode_info_stride].mbmi.segment_id;
-          if (!(cm->mb_cols & 1) || col < sb_cols - 1)
-            cache[1] = miptr[1].mbmi.segment_id;
-          cache[cm->mb_cols + 1] = miptr[cm->mode_info_stride + 1].mbmi.segment_id;
-        }
-      }
+  for (row = 0; row < cm->mb_rows; row++) {
+    mi = mi_ptr;
+    cache = cache_ptr;
+    for (col = 0; col < cm->mb_cols; col++, mi++, cache++) {
+      cache[0] = mi->mbmi.segment_id;
     }
-    segmap += 2 * cm->mb_cols;
-    segcache += 2 * cm->mb_cols;
-    mi += 2 * cm->mode_info_stride;
+    mi_ptr += cm->mode_info_stride;
+    cache_ptr += cm->mb_cols;
   }
 }
 
@@ -1788,7 +1764,10 @@
   cm->prob_gf_coded                 = 128;
   cm->prob_intra_coded              = 63;
 #if CONFIG_SUPERBLOCKS
-  cm->sb_coded                      = 200;
+  cm->sb32_coded                    = 200;
+#if CONFIG_SUPERBLOCKS64
+  cm->sb64_coded                    = 200;
+#endif
 #endif
   for (i = 0; i < COMP_PRED_CONTEXTS; i++)
     cm->prob_comppred[i]         = 128;
@@ -1994,6 +1973,13 @@
       vp9_variance_halfpixvar32x32_h, vp9_variance_halfpixvar32x32_v,
       vp9_variance_halfpixvar32x32_hv, vp9_sad32x32x3, vp9_sad32x32x8,
       vp9_sad32x32x4d)
+
+#if CONFIG_SUPERBLOCKS64
+  BFP(BLOCK_64X64, vp9_sad64x64, vp9_variance64x64, vp9_sub_pixel_variance64x64,
+      vp9_variance_halfpixvar64x64_h, vp9_variance_halfpixvar64x64_v,
+      vp9_variance_halfpixvar64x64_hv, vp9_sad64x64x3, vp9_sad64x64x8,
+      vp9_sad64x64x4d)
+#endif
 #endif
 
   BFP(BLOCK_16X16, vp9_sad16x16, vp9_variance16x16, vp9_sub_pixel_variance16x16,
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 7c9181b..1142835 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -390,8 +390,15 @@
   BLOCK_4X4 = PARTITIONING_4X4,
   BLOCK_16X16,
   BLOCK_MAX_SEGMENTS,
+#if CONFIG_SUPERBLOCKS
   BLOCK_32X32 = BLOCK_MAX_SEGMENTS,
+#if CONFIG_SUPERBLOCKS64
+  BLOCK_64X64,
+#endif  // CONFIG_SUPERBLOCKS64
   BLOCK_MAX_SB_SEGMENTS,
+#else  // CONFIG_SUPERBLOCKS
+  BLOCK_MAX_SB_SEGMENTS = BLOCK_MAX_SEGMENTS,
+#endif  // CONFIG_SUPERBLOCKS
 };
 
 typedef struct VP9_COMP {
@@ -571,7 +578,10 @@
   int cq_target_quality;
 
 #if CONFIG_SUPERBLOCKS
-  int sb_count;
+  int sb32_count[2];
+#if CONFIG_SUPERBLOCKS64
+  int sb64_count[2];
+#endif
   int sb_ymode_count [VP9_I32X32_MODES];
 #endif
   int ymode_count[VP9_YMODES];        /* intra MB type cts this frame */
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 774b577..a79cb5a 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -41,6 +41,7 @@
 #include "vp9/common/vp9_entropy.h"
 #include "vp9_rtcd.h"
 #include "vp9/common/vp9_mvref_common.h"
+#include "vp9/common/vp9_common.h"
 
 #define MAXF(a,b)            (((a) > (b)) ? (a) : (b))
 
@@ -926,14 +927,21 @@
 
 #if CONFIG_SUPERBLOCKS
 #if CONFIG_TX32X32
-static int rdcost_sby_32x32(MACROBLOCK *x) {
+static int rdcost_sby_32x32(MACROBLOCK *x, int backup) {
   MACROBLOCKD * const xd = &x->e_mbd;
   ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above,
-                  *tl = (ENTROPY_CONTEXT *) &t_left;
+  ENTROPY_CONTEXT *ta, *tl;
 
-  vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memcpy(&t_left,  xd->left_context,  sizeof(ENTROPY_CONTEXT_PLANES));
+  if (backup) {
+    ta = (ENTROPY_CONTEXT *) &t_above;
+    tl = (ENTROPY_CONTEXT *) &t_left;
+
+    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+    vpx_memcpy(&t_left,  xd->left_context,  sizeof(ENTROPY_CONTEXT_PLANES));
+  } else {
+    ta = (ENTROPY_CONTEXT *) xd->above_context;
+    tl = (ENTROPY_CONTEXT *) xd->left_context;
+  }
 
   return cost_coeffs(x, xd->block, PLANE_TYPE_Y_WITH_DC, ta, tl, TX_32X32);
 }
@@ -953,7 +961,8 @@
 
 #define DEBUG_ERROR 0
 static void super_block_yrd_32x32(MACROBLOCK *x,
-                                  int *rate, int *distortion, int *skippable) {
+                                  int *rate, int *distortion, int *skippable,
+                                  int backup) {
   SUPERBLOCK  * const x_sb = &x->sb_coeff_data;
   MACROBLOCKD * const xd = &x->e_mbd;
   SUPERBLOCKD * const xd_sb = &xd->sb_coeff_data;
@@ -976,7 +985,7 @@
   printf("IDCT/FDCT error 32x32: %d (d: %d)\n",
          vp9_block_error_c(x_sb->src_diff, out, 1024), *distortion);
 #endif
-  *rate       = rdcost_sby_32x32(x);
+  *rate       = rdcost_sby_32x32(x, backup);
   *skippable  = vp9_sby_is_skippable_32x32(&x->e_mbd);
 }
 #endif
@@ -1005,7 +1014,7 @@
 #if CONFIG_TX32X32
   vp9_subtract_sby_s_c(x->sb_coeff_data.src_diff, src, src_y_stride,
                        dst, dst_y_stride);
-  super_block_yrd_32x32(x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]);
+  super_block_yrd_32x32(x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32], 1);
 #endif
 
 #if DEBUG_ERROR
@@ -1065,6 +1074,104 @@
   xd->above_context = orig_above;
   xd->left_context = orig_left;
 }
+
+static void super_block_64_yrd(VP9_COMP *cpi,
+                               MACROBLOCK *x, int *rate, int *distortion,
+                               int *skip,
+                               int64_t txfm_cache[NB_TXFM_MODES]) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int r[TX_SIZE_MAX_SB][2], d[TX_SIZE_MAX_SB], s[TX_SIZE_MAX_SB], n;
+  const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer;
+  int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
+  ENTROPY_CONTEXT_PLANES t_above[TX_SIZE_MAX_SB][4],
+                        *orig_above = xd->above_context;
+  ENTROPY_CONTEXT_PLANES t_left[TX_SIZE_MAX_SB][4],
+                        *orig_left = xd->left_context;
+
+  for (n = TX_4X4; n < TX_SIZE_MAX_SB; n++) {
+    vpx_memcpy(t_above[n], xd->above_context, sizeof(t_above[n]));
+    vpx_memcpy(t_left[n], xd->left_context, sizeof(t_left[n]));
+    r[n][0] = 0;
+    d[n] = 0;
+    s[n] = 1;
+  }
+
+#if CONFIG_TX32X32
+  for (n = 0; n < 4; n++) {
+    int x_idx = n & 1, y_idx = n >> 1;
+    int r_tmp, d_tmp, s_tmp;
+
+    xd->above_context = &t_above[TX_32X32][x_idx << 1];
+    xd->left_context = &t_left[TX_32X32][y_idx << 1];
+    vp9_subtract_sby_s_c(x->sb_coeff_data.src_diff,
+                         src + 32 * x_idx + 32 * y_idx * src_y_stride,
+                         src_y_stride,
+                         dst + 32 * x_idx + 32 * y_idx * dst_y_stride,
+                         dst_y_stride);
+    super_block_yrd_32x32(x, &r_tmp, &d_tmp, &s_tmp, 0);
+    r[TX_32X32][0] += r_tmp;
+    d[TX_32X32] += d_tmp;
+    s[TX_32X32] = s[TX_32X32] && s_tmp;
+  }
+#endif
+
+#if DEBUG_ERROR
+  int err[3] = { 0, 0, 0 };
+#endif
+  for (n = 0; n < 16; n++) {
+    int x_idx = n & 3, y_idx = n >> 2;
+    int r_tmp, d_tmp, s_tmp;
+
+    vp9_subtract_mby_s_c(x->src_diff,
+                         src + x_idx * 16 + y_idx * 16 * src_y_stride,
+                         src_y_stride,
+                         dst + x_idx * 16 + y_idx * 16 * dst_y_stride,
+                         dst_y_stride);
+
+    xd->above_context = &t_above[TX_16X16][x_idx];
+    xd->left_context = &t_left[TX_16X16][y_idx];
+    macro_block_yrd_16x16(x, &r_tmp, &d_tmp, &s_tmp, 0);
+    d[TX_16X16] += d_tmp;
+    r[TX_16X16][0] += r_tmp;
+    s[TX_16X16] = s[TX_16X16] && s_tmp;
+#if DEBUG_ERROR
+    vp9_inverse_transform_mby_16x16(xd);
+    err[2] += vp9_block_error_c(xd->diff, x->src_diff, 256);
+#endif
+
+    xd->above_context = &t_above[TX_4X4][x_idx];
+    xd->left_context = &t_left[TX_4X4][y_idx];
+    macro_block_yrd_4x4(x, &r_tmp, &d_tmp, &s_tmp, 0);
+    d[TX_4X4] += d_tmp;
+    r[TX_4X4][0] += r_tmp;
+    s[TX_4X4] = s[TX_4X4] && s_tmp;
+#if DEBUG_ERROR
+    vp9_inverse_transform_mby_4x4(xd);
+    err[0] += vp9_block_error_c(xd->diff, x->src_diff, 256);
+#endif
+
+    xd->above_context = &t_above[TX_8X8][x_idx];
+    xd->left_context = &t_left[TX_8X8][y_idx];
+    macro_block_yrd_8x8(x, &r_tmp, &d_tmp, &s_tmp, 0);
+    d[TX_8X8] += d_tmp;
+    r[TX_8X8][0] += r_tmp;
+    s[TX_8X8] = s[TX_8X8] && s_tmp;
+#if DEBUG_ERROR
+    vp9_inverse_transform_mby_8x8(xd);
+    err[1] += vp9_block_error_c(xd->diff, x->src_diff, 256);
+#endif
+  }
+#if DEBUG_ERROR
+  printf("IDCT/FDCT error 16x16: %d (d: %d)\n", err[2], d[2]);
+  printf("IDCT/FDCT error 8x8: %d (d: %d)\n", err[1], d[1]);
+  printf("IDCT/FDCT error 4x4: %d (d: %d)\n", err[0], d[0]);
+#endif
+  choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skip, txfm_cache,
+                           TX_SIZE_MAX_SB - 1);
+
+  xd->above_context = orig_above;
+  xd->left_context = orig_left;
+}
 #endif
 
 static void copy_predictor_8x8(uint8_t *dst, const uint8_t *predictor) {
@@ -1359,6 +1466,48 @@
 
   return best_rd;
 }
+
+#if CONFIG_SUPERBLOCKS64
+static int64_t rd_pick_intra_sb64y_mode(VP9_COMP *cpi,
+                                        MACROBLOCK *x,
+                                        int *rate,
+                                        int *rate_tokenonly,
+                                        int *distortion,
+                                        int *skippable,
+                                        int64_t txfm_cache[NB_TXFM_MODES]) {
+  MB_PREDICTION_MODE mode;
+  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
+  int this_rate, this_rate_tokenonly;
+  int this_distortion, s;
+  int64_t best_rd = INT64_MAX, this_rd;
+
+  /* Y Search for 64x64 intra prediction mode */
+  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+    x->e_mbd.mode_info_context->mbmi.mode = mode;
+    vp9_build_intra_predictors_sb64y_s(&x->e_mbd);
+
+    super_block_64_yrd(cpi, x, &this_rate_tokenonly,
+                       &this_distortion, &s, txfm_cache);
+    this_rate = this_rate_tokenonly +
+                x->mbmode_cost[x->e_mbd.frame_type]
+                              [x->e_mbd.mode_info_context->mbmi.mode];
+    this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+
+    if (this_rd < best_rd) {
+      mode_selected   = mode;
+      best_rd         = this_rd;
+      *rate           = this_rate;
+      *rate_tokenonly = this_rate_tokenonly;
+      *distortion     = this_distortion;
+      *skippable      = s;
+    }
+  }
+
+  x->e_mbd.mode_info_context->mbmi.mode = mode_selected;
+
+  return best_rd;
+}
+#endif  // CONFIG_SUPERBLOCKS64
 #endif
 
 static int64_t rd_pick_intra16x16mby_mode(VP9_COMP *cpi,
@@ -1735,18 +1884,23 @@
 
 #if CONFIG_SUPERBLOCKS
 #if CONFIG_TX32X32
-static int rd_cost_sbuv_16x16(MACROBLOCK *x) {
+static int rd_cost_sbuv_16x16(MACROBLOCK *x, int backup) {
   int b;
   int cost = 0;
   MACROBLOCKD *const xd = &x->e_mbd;
   ENTROPY_CONTEXT_PLANES t_above, t_left;
   ENTROPY_CONTEXT *ta, *tl;
 
-  vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  if (backup) {
+    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
 
-  ta = (ENTROPY_CONTEXT *) &t_above;
-  tl = (ENTROPY_CONTEXT *) &t_left;
+    ta = (ENTROPY_CONTEXT *) &t_above;
+    tl = (ENTROPY_CONTEXT *) &t_left;
+  } else {
+    ta = (ENTROPY_CONTEXT *)xd->above_context;
+    tl = (ENTROPY_CONTEXT *)xd->left_context;
+  }
 
   for (b = 16; b < 24; b += 4)
     cost += cost_coeffs(x, xd->block + b, PLANE_TYPE_UV,
@@ -1757,13 +1911,14 @@
 }
 
 static void rd_inter32x32_uv_16x16(MACROBLOCK *x, int *rate,
-                                   int *distortion, int *skip) {
+                                   int *distortion, int *skip,
+                                   int backup) {
   MACROBLOCKD *const xd = &x->e_mbd;
 
   vp9_transform_sbuv_16x16(x);
   vp9_quantize_sbuv_16x16(x);
 
-  *rate       = rd_cost_sbuv_16x16(x);
+  *rate       = rd_cost_sbuv_16x16(x, backup);
   *distortion = vp9_block_error_c(x->sb_coeff_data.coeff + 1024,
                                    xd->sb_coeff_data.dqcoeff + 1024, 512) >> 2;
   *skip       = vp9_sbuv_is_skippable_16x16(xd);
@@ -1783,7 +1938,7 @@
     vp9_subtract_sbuv_s_c(x->sb_coeff_data.src_diff,
                           usrc, vsrc, src_uv_stride,
                           udst, vdst, dst_uv_stride);
-    rd_inter32x32_uv_16x16(x, rate, distortion, skip);
+    rd_inter32x32_uv_16x16(x, rate, distortion, skip, 1);
   } else {
 #endif
   int n, r = 0, d = 0;
@@ -1833,6 +1988,14 @@
 
   return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
 }
+
+static void super_block_64_uvrd(MACROBLOCK *x, int *rate,
+                                int *distortion, int *skip);
+static int64_t rd_inter64x64_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
+                                int *distortion, int fullpixel, int *skip) {
+  super_block_64_uvrd(x, rate, distortion, skip);
+  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
+}
 #endif
 
 static int64_t rd_inter4x4_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
@@ -1984,13 +2147,13 @@
     vp9_subtract_sbuv_s_c(x->sb_coeff_data.src_diff,
                           usrc, vsrc, src_uv_stride,
                           udst, vdst, dst_uv_stride);
-    rd_inter32x32_uv_16x16(x, rate, distortion, skippable);
+    rd_inter32x32_uv_16x16(x, rate, distortion, skippable, 1);
   } else {
 #endif
   int d = 0, r = 0, n, s = 1;
   ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
-  ENTROPY_CONTEXT_PLANES *ta = xd->above_context;
-  ENTROPY_CONTEXT_PLANES *tl = xd->left_context;
+  ENTROPY_CONTEXT_PLANES *ta_orig = xd->above_context;
+  ENTROPY_CONTEXT_PLANES *tl_orig = xd->left_context;
 
   memcpy(t_above, xd->above_context, sizeof(t_above));
   memcpy(t_left,  xd->left_context,  sizeof(t_left));
@@ -2016,24 +2179,107 @@
     }
 
     d += vp9_mbuverror(x) >> 2;
-    xd->above_context = ta + x_idx;
-    xd->left_context = tl + y_idx;
-    r += rd_cost_mbuv_8x8(x, 0);
+    xd->above_context = t_above + x_idx;
+    xd->left_context = t_left + y_idx;
+    if (mbmi->txfm_size == TX_4X4) {
+      r += rd_cost_mbuv_4x4(x, 0);
+    } else {
+      r += rd_cost_mbuv_8x8(x, 0);
+    }
   }
 
-  xd->above_context = ta;
-  xd->left_context = tl;
+  xd->above_context = ta_orig;
+  xd->left_context = tl_orig;
+
+  *distortion = d;
+  *rate       = r;
+  *skippable  = s;
+#if CONFIG_TX32X32
+  }
+#endif
+}
+
+static void super_block_64_uvrd(MACROBLOCK *x,
+                                int *rate,
+                                int *distortion,
+                                int *skippable) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
+  const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
+  const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
+  int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
+  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
+  ENTROPY_CONTEXT_PLANES *ta_orig = xd->above_context;
+  ENTROPY_CONTEXT_PLANES *tl_orig = xd->left_context;
+  int d = 0, r = 0, n, s = 1;
+
+  memcpy(t_above, xd->above_context, sizeof(t_above));
+  memcpy(t_left,  xd->left_context,  sizeof(t_left));
+
+#if CONFIG_TX32X32
+  if (mbmi->txfm_size == TX_32X32) {
+    int n;
+
+    *rate = 0;
+    for (n = 0; n < 4; n++) {
+      int x_idx = n & 1, y_idx = n >> 1;
+      int r_tmp, d_tmp, s_tmp;
+
+      vp9_subtract_sbuv_s_c(x->sb_coeff_data.src_diff,
+                            usrc + x_idx * 16 + y_idx * 16 * src_uv_stride,
+                            vsrc + x_idx * 16 + y_idx * 16 * src_uv_stride,
+                            src_uv_stride,
+                            udst + x_idx * 16 + y_idx * 16 * dst_uv_stride,
+                            vdst + x_idx * 16 + y_idx * 16 * dst_uv_stride,
+                            dst_uv_stride);
+      xd->above_context = t_above + x_idx * 2;
+      xd->left_context = t_left + y_idx * 2;
+      rd_inter32x32_uv_16x16(x, &r_tmp, &d_tmp, &s_tmp, 0);
+      r += r_tmp;
+      d += d_tmp;
+      s = s && s_tmp;
+    }
+  } else {
+#endif
+    for (n = 0; n < 16; n++) {
+      int x_idx = n & 3, y_idx = n >> 2;
+
+      vp9_subtract_mbuv_s_c(x->src_diff,
+                            usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+                            vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+                            src_uv_stride,
+                            udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                            vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                            dst_uv_stride);
+      if (mbmi->txfm_size == TX_4X4) {
+        vp9_transform_mbuv_4x4(x);
+        vp9_quantize_mbuv_4x4(x);
+        s &= vp9_mbuv_is_skippable_4x4(xd);
+      } else {
+        vp9_transform_mbuv_8x8(x);
+        vp9_quantize_mbuv_8x8(x);
+        s &= vp9_mbuv_is_skippable_8x8(xd);
+      }
+
+      xd->above_context = t_above + x_idx;
+      xd->left_context = t_left + y_idx;
+      d += vp9_mbuverror(x) >> 2;
+      if (mbmi->txfm_size == TX_4X4) {
+        r += rd_cost_mbuv_4x4(x, 0);
+      } else {
+        r += rd_cost_mbuv_8x8(x, 0);
+      }
+    }
+#if CONFIG_TX32X32
+  }
+#endif
+
   *distortion = d;
   *rate       = r;
   *skippable  = s;
 
-  xd->left_context = tl;
-  xd->above_context = ta;
-  memcpy(xd->above_context, t_above, sizeof(t_above));
-  memcpy(xd->left_context,  t_left,  sizeof(t_left));
-#if CONFIG_TX32X32
-  }
-#endif
+  xd->left_context = tl_orig;
+  xd->above_context = ta_orig;
 }
 
 static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi,
@@ -2072,6 +2318,45 @@
 
   return best_rd;
 }
+
+#if CONFIG_SUPERBLOCKS64
+static int64_t rd_pick_intra_sb64uv_mode(VP9_COMP *cpi,
+                                         MACROBLOCK *x,
+                                         int *rate,
+                                         int *rate_tokenonly,
+                                         int *distortion,
+                                         int *skippable) {
+  MB_PREDICTION_MODE mode;
+  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
+  int64_t best_rd = INT64_MAX, this_rd;
+  int this_rate_tokenonly, this_rate;
+  int this_distortion, s;
+
+  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+    x->e_mbd.mode_info_context->mbmi.uv_mode = mode;
+    vp9_build_intra_predictors_sb64uv_s(&x->e_mbd);
+
+    super_block_64_uvrd(x, &this_rate_tokenonly,
+                        &this_distortion, &s);
+    this_rate = this_rate_tokenonly +
+    x->intra_uv_mode_cost[x->e_mbd.frame_type][mode];
+    this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+
+    if (this_rd < best_rd) {
+      mode_selected   = mode;
+      best_rd         = this_rd;
+      *rate           = this_rate;
+      *rate_tokenonly = this_rate_tokenonly;
+      *distortion     = this_distortion;
+      *skippable      = s;
+    }
+  }
+
+  x->e_mbd.mode_info_context->mbmi.uv_mode = mode_selected;
+
+  return best_rd;
+}
+#endif  // CONFIG_SUPERBLOCKS64
 #endif
 
 int vp9_cost_mv_ref(VP9_COMP *cpi,
@@ -3161,8 +3446,6 @@
   *skippable = y_skippable && uv_skippable;
 }
 
-#define MIN(x,y) (((x)<(y))?(x):(y))
-#define MAX(x,y) (((x)>(y))?(x):(y))
 static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
                                int idx, MV_REFERENCE_FRAME frame_type,
                                int block_size,
@@ -3367,7 +3650,28 @@
   }
 #endif
 
-  if (block_size == BLOCK_16X16) {
+#if CONFIG_SUPERBLOCKS
+#if CONFIG_SUPERBLOCKS64
+  if (block_size == BLOCK_64X64) {
+    vp9_build_inter64x64_predictors_sb(xd,
+                                       xd->dst.y_buffer,
+                                       xd->dst.u_buffer,
+                                       xd->dst.v_buffer,
+                                       xd->dst.y_stride,
+                                       xd->dst.uv_stride);
+  } else
+#endif  // CONFIG_SUPERBLOCKS64
+  if (block_size == BLOCK_32X32) {
+    vp9_build_inter32x32_predictors_sb(xd,
+                                       xd->dst.y_buffer,
+                                       xd->dst.u_buffer,
+                                       xd->dst.v_buffer,
+                                       xd->dst.y_stride,
+                                       xd->dst.uv_stride);
+  } else
+#endif  // CONFIG_SUPERBLOCKS
+  {
+    assert(block_size == BLOCK_16X16);
     vp9_build_1st_inter16x16_predictors_mby(xd, xd->predictor, 16, 0);
     if (is_comp_pred)
       vp9_build_2nd_inter16x16_predictors_mby(xd, xd->predictor, 16);
@@ -3376,15 +3680,6 @@
       vp9_build_interintra_16x16_predictors_mby(xd, xd->predictor, 16);
     }
 #endif
-  } else {
-#if CONFIG_SUPERBLOCKS
-    vp9_build_inter32x32_predictors_sb(xd,
-                                       xd->dst.y_buffer,
-                                       xd->dst.u_buffer,
-                                       xd->dst.v_buffer,
-                                       xd->dst.y_stride,
-                                       xd->dst.uv_stride);
-#endif
   }
 
   if (cpi->active_map_enabled && x->active_ptr[0] == 0)
@@ -3397,14 +3692,22 @@
     if (threshold < x->encode_breakout)
       threshold = x->encode_breakout;
 
-    if (block_size == BLOCK_16X16) {
-      var = vp9_variance16x16(*(b->base_src), b->src_stride,
-                              xd->predictor, 16, &sse);
-    } else {
 #if CONFIG_SUPERBLOCKS
+#if CONFIG_SUPERBLOCKS64
+    if (block_size == BLOCK_64X64) {
+      var = vp9_variance64x64(*(b->base_src), b->src_stride,
+                              xd->dst.y_buffer, xd->dst.y_stride, &sse);
+    } else
+#endif  // CONFIG_SUPERBLOCKS64
+    if (block_size == BLOCK_32X32) {
       var = vp9_variance32x32(*(b->base_src), b->src_stride,
                               xd->dst.y_buffer, xd->dst.y_stride, &sse);
-#endif
+    } else
+#endif  // CONFIG_SUPERBLOCKS
+    {
+      assert(block_size == BLOCK_16X16);
+      var = vp9_variance16x16(*(b->base_src), b->src_stride,
+                              xd->predictor, 16, &sse);
     }
 
     if ((int)sse < threshold) {
@@ -3416,15 +3719,29 @@
         // Check u and v to make sure skip is ok
         int sse2;
 
-        if (block_size == BLOCK_16X16) {
-          sse2 = vp9_uvsse(x);
-        } else {
+#if CONFIG_SUPERBLOCKS
+#if CONFIG_SUPERBLOCKS64
+        if (block_size == BLOCK_64X64) {
+          unsigned int sse2u, sse2v;
+          var = vp9_variance32x32(x->src.u_buffer, x->src.uv_stride,
+                                  xd->dst.u_buffer, xd->dst.uv_stride, &sse2u);
+          var = vp9_variance32x32(x->src.v_buffer, x->src.uv_stride,
+                                  xd->dst.v_buffer, xd->dst.uv_stride, &sse2v);
+          sse2 = sse2u + sse2v;
+        } else
+#endif  // CONFIG_SUPERBLOCKS64
+        if (block_size == BLOCK_32X32) {
           unsigned int sse2u, sse2v;
           var = vp9_variance16x16(x->src.u_buffer, x->src.uv_stride,
                                   xd->dst.u_buffer, xd->dst.uv_stride, &sse2u);
           var = vp9_variance16x16(x->src.v_buffer, x->src.uv_stride,
                                   xd->dst.v_buffer, xd->dst.uv_stride, &sse2v);
           sse2 = sse2u + sse2v;
+        } else
+#endif  // CONFIG_SUPERBLOCKS
+        {
+          assert(block_size == BLOCK_16X16);
+          sse2 = vp9_uvsse(x);
         }
 
         if (sse2 * 2 < threshold) {
@@ -3455,23 +3772,26 @@
   }
 
   if (!x->skip) {
-    if (block_size == BLOCK_16X16) {
-      vp9_build_1st_inter16x16_predictors_mbuv(xd, &xd->predictor[256],
-                                               &xd->predictor[320], 8);
-      if (is_comp_pred)
-        vp9_build_2nd_inter16x16_predictors_mbuv(xd, &xd->predictor[256],
-                                                 &xd->predictor[320], 8);
-#if CONFIG_COMP_INTERINTRA_PRED
-      if (is_comp_interintra_pred) {
-        vp9_build_interintra_16x16_predictors_mbuv(xd, &xd->predictor[256],
-                                                   &xd->predictor[320], 8);
-      }
-#endif
-      inter_mode_cost(cpi, x, rate2, distortion,
-                      rate_y, distortion_y, rate_uv, distortion_uv,
-                      skippable, txfm_cache);
-    } else {
 #if CONFIG_SUPERBLOCKS
+#if CONFIG_SUPERBLOCKS64
+    if (block_size == BLOCK_64X64) {
+      int skippable_y, skippable_uv;
+
+      // Y cost and distortion
+      super_block_64_yrd(cpi, x, rate_y, distortion_y,
+                         &skippable_y, txfm_cache);
+      *rate2 += *rate_y;
+      *distortion += *distortion_y;
+
+      rd_inter64x64_uv(cpi, x, rate_uv, distortion_uv,
+                       cm->full_pixel, &skippable_uv);
+
+      *rate2 += *rate_uv;
+      *distortion += *distortion_uv;
+      *skippable = skippable_y && skippable_uv;
+    } else
+#endif  // CONFIG_SUPERBLOCKS64
+    if (block_size == BLOCK_32X32) {
       int skippable_y, skippable_uv;
 
       // Y cost and distortion
@@ -3486,7 +3806,25 @@
       *rate2 += *rate_uv;
       *distortion += *distortion_uv;
       *skippable = skippable_y && skippable_uv;
+    } else
+#endif  // CONFIG_SUPERBLOCKS
+    {
+      assert(block_size == BLOCK_16X16);
+
+      vp9_build_1st_inter16x16_predictors_mbuv(xd, &xd->predictor[256],
+                                               &xd->predictor[320], 8);
+      if (is_comp_pred)
+        vp9_build_2nd_inter16x16_predictors_mbuv(xd, &xd->predictor[256],
+                                                 &xd->predictor[320], 8);
+#if CONFIG_COMP_INTERINTRA_PRED
+      if (is_comp_interintra_pred) {
+        vp9_build_interintra_16x16_predictors_mbuv(xd, &xd->predictor[256],
+                                                   &xd->predictor[320], 8);
+      }
 #endif
+      inter_mode_cost(cpi, x, rate2, distortion,
+                      rate_y, distortion_y, rate_uv, distortion_uv,
+                      skippable, txfm_cache);
     }
   }
   return this_rd;  // if 0, this will be re-calculated by caller
@@ -3554,7 +3892,8 @@
   vpx_memset(&frame_mv, 0, sizeof(frame_mv));
   vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
   vpx_memset(&best_bmodes, 0, sizeof(best_bmodes));
-  vpx_memset(&x->mb_context[xd->mb_index], 0, sizeof(PICK_MODE_CONTEXT));
+  vpx_memset(&x->mb_context[xd->sb_index][xd->mb_index], 0,
+             sizeof(PICK_MODE_CONTEXT));
 
   for (i = 0; i < MAX_REF_FRAMES; i++)
     frame_mv[NEWMV][i].as_int = INVALID_MV;
@@ -3787,7 +4126,7 @@
 #if CONFIG_COMP_INTRA_PRED
                                              0,
 #endif
-                                             0);
+                                             cpi->update_context);
           rate2 += rate;
           distortion2 += distortion;
 
@@ -4298,18 +4637,18 @@
   }
 
 end:
-  store_coding_context(
-      x, &x->mb_context[xd->mb_index], best_mode_index, &best_partition,
-      &mbmi->ref_mvs[mbmi->ref_frame][0],
-      &mbmi->ref_mvs[mbmi->second_ref_frame < 0
-                     ? 0 : mbmi->second_ref_frame][0],
-      best_pred_diff, best_txfm_diff);
+  store_coding_context(x, &x->mb_context[xd->sb_index][xd->mb_index],
+                       best_mode_index, &best_partition,
+                       &mbmi->ref_mvs[mbmi->ref_frame][0],
+                       &mbmi->ref_mvs[mbmi->second_ref_frame < 0 ? 0 :
+                                      mbmi->second_ref_frame][0],
+                       best_pred_diff, best_txfm_diff);
 }
 
 #if CONFIG_SUPERBLOCKS
-void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
-                               int *returnrate,
-                               int *returndist) {
+void vp9_rd_pick_intra_mode_sb32(VP9_COMP *cpi, MACROBLOCK *x,
+                                 int *returnrate,
+                                 int *returndist) {
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
   int rate_y, rate_uv;
@@ -4335,6 +4674,37 @@
     *returndist = dist_y + (dist_uv >> 2);
   }
 }
+
+#if CONFIG_SUPERBLOCKS64
+void vp9_rd_pick_intra_mode_sb64(VP9_COMP *cpi, MACROBLOCK *x,
+                                 int *returnrate,
+                                 int *returndist) {
+  VP9_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  int rate_y, rate_uv;
+  int rate_y_tokenonly, rate_uv_tokenonly;
+  int error_y, error_uv;
+  int dist_y, dist_uv;
+  int y_skip, uv_skip;
+  int64_t txfm_cache[NB_TXFM_MODES];
+
+  error_y = rd_pick_intra_sb64y_mode(cpi, x, &rate_y, &rate_y_tokenonly,
+                                     &dist_y, &y_skip, txfm_cache);
+  error_uv = rd_pick_intra_sb64uv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
+                                       &dist_uv, &uv_skip);
+
+  if (cpi->common.mb_no_coeff_skip && y_skip && uv_skip) {
+    *returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
+    vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 1);
+    *returndist = dist_y + (dist_uv >> 2);
+  } else {
+    *returnrate = rate_y + rate_uv;
+    if (cm->mb_no_coeff_skip)
+      *returnrate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
+    *returndist = dist_y + (dist_uv >> 2);
+  }
+}
+#endif
 #endif
 
 void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x,
@@ -4409,11 +4779,12 @@
 #if CONFIG_COMP_INTRA_PRED
                                        0,
 #endif
-                                       0);
+                                       cpi->update_context);
 #if CONFIG_COMP_INTRA_PRED
   error4x4d = rd_pick_intra4x4mby_modes(cpi, x,
                                         &rate4x4d, &rate4x4_tokenonly,
-                                        &dist4x4d, error16x16, 1, 0);
+                                        &dist4x4d, error16x16, 1,
+                                        cpi->update_context);
 #endif
 
   mbmi->mb_skip_coeff = 0;
@@ -4426,8 +4797,8 @@
            vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 1);
     dist = dist16x16 + (distuv8x8 >> 2);
     mbmi->txfm_size = txfm_size_16x16;
-    memset(x->mb_context[xd->mb_index].txfm_rd_diff, 0,
-           sizeof(x->mb_context[xd->mb_index].txfm_rd_diff));
+    memset(x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff, 0,
+           sizeof(x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff));
   } else if (error8x8 > error16x16) {
     if (error4x4 < error16x16) {
       rate = rateuv;
@@ -4444,15 +4815,16 @@
       mbmi->mode = B_PRED;
       mbmi->txfm_size = TX_4X4;
       dist = dist4x4 + (distuv >> 2);
-      memset(x->mb_context[xd->mb_index].txfm_rd_diff, 0,
-             sizeof(x->mb_context[xd->mb_index].txfm_rd_diff));
+      memset(x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff, 0,
+             sizeof(x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff));
     } else {
       mbmi->txfm_size = txfm_size_16x16;
       mbmi->mode = mode16x16;
       rate = rate16x16 + rateuv8x8;
       dist = dist16x16 + (distuv8x8 >> 2);
       for (i = 0; i < NB_TXFM_MODES; i++) {
-        x->mb_context[xd->mb_index].txfm_rd_diff[i] = error16x16 - txfm_cache[i];
+        x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff[i] =
+            error16x16 - txfm_cache[i];
       }
     }
     if (cpi->common.mb_no_coeff_skip)
@@ -4473,8 +4845,8 @@
       mbmi->mode = B_PRED;
       mbmi->txfm_size = TX_4X4;
       dist = dist4x4 + (distuv >> 2);
-      memset(x->mb_context[xd->mb_index].txfm_rd_diff, 0,
-             sizeof(x->mb_context[xd->mb_index].txfm_rd_diff));
+      memset(x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff, 0,
+             sizeof(x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff));
     } else {
       // FIXME(rbultje) support transform-size selection
       mbmi->mode = I8X8_PRED;
@@ -4482,8 +4854,8 @@
       set_i8x8_block_modes(x, mode8x8);
       rate = rate8x8 + rateuv;
       dist = dist8x8 + (distuv >> 2);
-      memset(x->mb_context[xd->mb_index].txfm_rd_diff, 0,
-             sizeof(x->mb_context[xd->mb_index].txfm_rd_diff));
+      memset(x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff, 0,
+             sizeof(x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff));
     }
     if (cpi->common.mb_no_coeff_skip)
       rate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
@@ -4494,9 +4866,11 @@
 }
 
 #if CONFIG_SUPERBLOCKS
-int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
-                                  int recon_yoffset, int recon_uvoffset,
-                                  int *returnrate, int *returndistortion) {
+static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
+                                         int recon_yoffset, int recon_uvoffset,
+                                         int *returnrate,
+                                         int *returndistortion,
+                                         int block_size) {
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
@@ -4556,7 +4930,7 @@
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
-      setup_buffer_inter(cpi, x, idx_list[ref_frame], ref_frame, BLOCK_32X32,
+      setup_buffer_inter(cpi, x, idx_list[ref_frame], ref_frame, block_size,
                          recon_yoffset, recon_uvoffset, frame_mv[NEARESTMV],
                          frame_mv[NEARMV], frame_mdcounts,
                          y_buffer, u_buffer, v_buffer);
@@ -4565,27 +4939,56 @@
     frame_mv[ZEROMV][ref_frame].as_int = 0;
   }
 
-  mbmi->mode = DC_PRED;
-  if (cm->txfm_mode == ONLY_4X4 || cm->txfm_mode == TX_MODE_SELECT) {
-    mbmi->txfm_size = TX_4X4;
-    rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_4x4, &rate_uv_tokenonly_4x4,
-                            &dist_uv_4x4, &uv_skip_4x4);
-    mode_uv_4x4 = mbmi->uv_mode;
-  }
-  if (cm->txfm_mode != ONLY_4X4) {
-    mbmi->txfm_size = TX_8X8;
-    rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_8x8, &rate_uv_tokenonly_8x8,
-                            &dist_uv_8x8, &uv_skip_8x8);
-    mode_uv_8x8 = mbmi->uv_mode;
-  }
+#if CONFIG_SUPERBLOCKS64
+  if (block_size == BLOCK_64X64) {
+    mbmi->mode = DC_PRED;
+    if (cm->txfm_mode == ONLY_4X4 || cm->txfm_mode == TX_MODE_SELECT) {
+      mbmi->txfm_size = TX_4X4;
+      rd_pick_intra_sb64uv_mode(cpi, x, &rate_uv_4x4, &rate_uv_tokenonly_4x4,
+                                &dist_uv_4x4, &uv_skip_4x4);
+      mode_uv_4x4 = mbmi->uv_mode;
+    }
+    if (cm->txfm_mode != ONLY_4X4) {
+      mbmi->txfm_size = TX_8X8;
+      rd_pick_intra_sb64uv_mode(cpi, x, &rate_uv_8x8, &rate_uv_tokenonly_8x8,
+                                &dist_uv_8x8, &uv_skip_8x8);
+      mode_uv_8x8 = mbmi->uv_mode;
+    }
 #if CONFIG_TX32X32
-  if (cm->txfm_mode >= ALLOW_32X32) {
-    mbmi->txfm_size = TX_32X32;
-    rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_16x16, &rate_uv_tokenonly_16x16,
-                            &dist_uv_16x16, &uv_skip_16x16);
-    mode_uv_16x16 = mbmi->uv_mode;
+    if (cm->txfm_mode >= ALLOW_32X32) {
+      mbmi->txfm_size = TX_32X32;
+      rd_pick_intra_sb64uv_mode(cpi, x, &rate_uv_16x16,
+                                &rate_uv_tokenonly_16x16,
+                                &dist_uv_16x16, &uv_skip_16x16);
+      mode_uv_16x16 = mbmi->uv_mode;
+    }
+#endif  // CONFIG_TX32X32
+  } else
+#endif  // CONFIG_SUPERBLOCKS64
+  {
+    assert(block_size == BLOCK_32X32);
+    mbmi->mode = DC_PRED;
+    if (cm->txfm_mode == ONLY_4X4 || cm->txfm_mode == TX_MODE_SELECT) {
+      mbmi->txfm_size = TX_4X4;
+      rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_4x4, &rate_uv_tokenonly_4x4,
+                              &dist_uv_4x4, &uv_skip_4x4);
+      mode_uv_4x4 = mbmi->uv_mode;
+    }
+    if (cm->txfm_mode != ONLY_4X4) {
+      mbmi->txfm_size = TX_8X8;
+      rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_8x8, &rate_uv_tokenonly_8x8,
+                              &dist_uv_8x8, &uv_skip_8x8);
+      mode_uv_8x8 = mbmi->uv_mode;
+    }
+#if CONFIG_TX32X32
+    if (cm->txfm_mode >= ALLOW_32X32) {
+      mbmi->txfm_size = TX_32X32;
+      rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_16x16, &rate_uv_tokenonly_16x16,
+                              &dist_uv_16x16, &uv_skip_16x16);
+      mode_uv_16x16 = mbmi->uv_mode;
+    }
+#endif  // CONFIG_TX32X32
   }
-#endif
 
   for (mode_index = 0; mode_index < MAX_MODES;
        mode_index += (!switchable_filter_index)) {
@@ -4713,9 +5116,19 @@
     }
 
     if (ref_frame == INTRA_FRAME) {
-      vp9_build_intra_predictors_sby_s(xd);
-      super_block_yrd(cpi, x, &rate_y, &distortion_y,
-                      &skippable, txfm_cache);
+#if CONFIG_SUPERBLOCKS64
+      if (block_size == BLOCK_64X64) {
+        vp9_build_intra_predictors_sb64y_s(xd);
+        super_block_64_yrd(cpi, x, &rate_y, &distortion_y,
+                           &skippable, txfm_cache);
+      } else
+#endif  // CONFIG_SUPERBLOCKS64
+      {
+        assert(block_size == BLOCK_32X32);
+        vp9_build_intra_predictors_sby_s(xd);
+        super_block_yrd(cpi, x, &rate_y, &distortion_y,
+                        &skippable, txfm_cache);
+      }
       if (mbmi->txfm_size == TX_4X4) {
         rate_uv = rate_uv_4x4;
         distortion_uv = dist_uv_4x4;
@@ -4727,7 +5140,7 @@
         distortion_uv = dist_uv_16x16;
         skippable = skippable && uv_skip_16x16;
         mbmi->uv_mode = mode_uv_16x16;
-#endif
+#endif  // CONFIG_TX32X32
       } else {
         rate_uv = rate_uv_8x8;
         distortion_uv = dist_uv_8x8;
@@ -4749,7 +5162,7 @@
 #endif
       }
 #endif
-      this_rd = handle_inter_mode(cpi, x, BLOCK_32X32,
+      this_rd = handle_inter_mode(cpi, x, block_size,
                                   &saddone, near_sadidx, mdcounts, txfm_cache,
                                   &rate2, &distortion2, &skippable,
                                   &compmode_cost,
@@ -5021,14 +5434,41 @@
   }
 
  end:
-  store_coding_context(x, &x->sb_context[0], best_mode_index, NULL,
-                       &mbmi->ref_mvs[mbmi->ref_frame][0],
-                       &mbmi->ref_mvs[mbmi->second_ref_frame < 0
-                                      ? 0 : mbmi->second_ref_frame][0],
-                       best_pred_diff, best_txfm_diff);
+  {
+#if CONFIG_SUPERBLOCKS64
+    PICK_MODE_CONTEXT *p = (block_size == BLOCK_32X32) ?
+                            &x->sb32_context[xd->sb_index] :
+                            &x->sb64_context;
+#else
+    PICK_MODE_CONTEXT *p = &x->sb32_context[xd->sb_index];
+#endif
+    store_coding_context(x, p, best_mode_index, NULL,
+                         &mbmi->ref_mvs[mbmi->ref_frame][0],
+                         &mbmi->ref_mvs[mbmi->second_ref_frame < 0 ? 0 :
+                             mbmi->second_ref_frame][0],
+                         best_pred_diff, best_txfm_diff);
+  }
 
   return best_rd;
 }
+
+int64_t vp9_rd_pick_inter_mode_sb32(VP9_COMP *cpi, MACROBLOCK *x,
+                                    int recon_yoffset, int recon_uvoffset,
+                                    int *returnrate,
+                                    int *returndistortion) {
+  return vp9_rd_pick_inter_mode_sb(cpi, x, recon_yoffset, recon_uvoffset,
+                                   returnrate, returndistortion, BLOCK_32X32);
+}
+
+#if CONFIG_SUPERBLOCKS64
+int64_t vp9_rd_pick_inter_mode_sb64(VP9_COMP *cpi, MACROBLOCK *x,
+                                    int recon_yoffset, int recon_uvoffset,
+                                    int *returnrate,
+                                    int *returndistortion) {
+  return vp9_rd_pick_inter_mode_sb(cpi, x, recon_yoffset, recon_uvoffset,
+                                   returnrate, returndistortion, BLOCK_64X64);
+}
+#endif  // CONFIG_SUPERBLOCKS64
 #endif
 
 void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
@@ -5063,8 +5503,8 @@
   //    vp9_pick_inter_mode
 
   // Store metrics so they can be added in to totals if this mode is picked
-  x->mb_context[xd->mb_index].distortion  = distortion;
-  x->mb_context[xd->mb_index].intra_error = intra_error;
+  x->mb_context[xd->sb_index][xd->mb_index].distortion  = distortion;
+  x->mb_context[xd->sb_index][xd->mb_index].intra_error = intra_error;
 
   *totalrate = rate;
   *totaldist = distortion;
diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h
index 4c2c33a..8ee2c0b 100644
--- a/vp9/encoder/vp9_rdopt.h
+++ b/vp9/encoder/vp9_rdopt.h
@@ -22,16 +22,23 @@
 extern void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                    int *r, int *d);
 
-extern void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
-                                      int *r, int *d);
+extern void vp9_rd_pick_intra_mode_sb32(VP9_COMP *cpi, MACROBLOCK *x,
+                                        int *r, int *d);
+
+extern void vp9_rd_pick_intra_mode_sb64(VP9_COMP *cpi, MACROBLOCK *x,
+                                        int *r, int *d);
 
 extern void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
-                                           int recon_yoffset,
-                                           int recon_uvoffset, int *r, int *d);
+                                           int ref_yoffset, int ref_uvoffset,
+                                           int *r, int *d);
 
-extern int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
-                                         int recon_yoffset, int recon_uvoffset,
-                                         int *returnrate, int *returndist);
+extern int64_t vp9_rd_pick_inter_mode_sb32(VP9_COMP *cpi, MACROBLOCK *x,
+                                           int ref_yoffset, int ref_uvoffset,
+                                           int *r, int *d);
+
+extern int64_t vp9_rd_pick_inter_mode_sb64(VP9_COMP *cpi, MACROBLOCK *x,
+                                           int ref_yoffset, int ref_uvoffset,
+                                           int *r, int *d);
 
 extern void vp9_init_me_luts();
 
diff --git a/vp9/encoder/vp9_sad_c.c b/vp9/encoder/vp9_sad_c.c
index e5249e5..9ce27fb 100644
--- a/vp9/encoder/vp9_sad_c.c
+++ b/vp9/encoder/vp9_sad_c.c
@@ -14,6 +14,14 @@
 #include "vpx_ports/config.h"
 #include "vpx/vpx_integer.h"
 
+unsigned int vp9_sad64x64_c(const uint8_t *src_ptr,
+                            int  src_stride,
+                            const uint8_t *ref_ptr,
+                            int  ref_stride,
+                            int max_sad) {
+  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 64, 64);
+}
+
 unsigned int vp9_sad32x32_c(const uint8_t *src_ptr,
                             int  src_stride,
                             const uint8_t *ref_ptr,
@@ -64,6 +72,19 @@
   return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 4, 4);
 }
 
+void vp9_sad64x64x3_c(const uint8_t *src_ptr,
+                      int  src_stride,
+                      const uint8_t *ref_ptr,
+                      int  ref_stride,
+                      unsigned int *sad_array) {
+  sad_array[0] = vp9_sad64x64_c(src_ptr, src_stride,
+                                ref_ptr, ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad64x64_c(src_ptr, src_stride,
+                                ref_ptr + 1, ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad64x64_c(src_ptr, src_stride,
+                                ref_ptr + 2, ref_stride, 0x7fffffff);
+}
+
 void vp9_sad32x32x3_c(const uint8_t *src_ptr,
                       int  src_stride,
                       const uint8_t *ref_ptr,
@@ -77,6 +98,37 @@
                                 ref_ptr + 2, ref_stride, 0x7fffffff);
 }
 
+void vp9_sad64x64x8_c(const uint8_t *src_ptr,
+                      int  src_stride,
+                      const uint8_t *ref_ptr,
+                      int  ref_stride,
+                      uint16_t *sad_array) {
+  sad_array[0] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride,
+                                          ref_ptr, ref_stride,
+                                          0x7fffffff);
+  sad_array[1] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride,
+                                          ref_ptr + 1, ref_stride,
+                                          0x7fffffff);
+  sad_array[2] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride,
+                                          ref_ptr + 2, ref_stride,
+                                          0x7fffffff);
+  sad_array[3] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride,
+                                          ref_ptr + 3, ref_stride,
+                                          0x7fffffff);
+  sad_array[4] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride,
+                                          ref_ptr + 4, ref_stride,
+                                          0x7fffffff);
+  sad_array[5] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride,
+                                          ref_ptr + 5, ref_stride,
+                                          0x7fffffff);
+  sad_array[6] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride,
+                                          ref_ptr + 6, ref_stride,
+                                          0x7fffffff);
+  sad_array[7] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride,
+                                          ref_ptr + 7, ref_stride,
+                                          0x7fffffff);
+}
+
 void vp9_sad32x32x8_c(const uint8_t *src_ptr,
                       int  src_stride,
                       const uint8_t *ref_ptr,
@@ -328,6 +380,21 @@
                                         0x7fffffff);
 }
 
+void vp9_sad64x64x4d_c(const uint8_t *src_ptr,
+                       int  src_stride,
+                       uint8_t *ref_ptr[],
+                       int  ref_stride,
+                       unsigned int *sad_array) {
+  sad_array[0] = vp9_sad64x64_c(src_ptr, src_stride,
+                                ref_ptr[0], ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad64x64_c(src_ptr, src_stride,
+                                ref_ptr[1], ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad64x64_c(src_ptr, src_stride,
+                                ref_ptr[2], ref_stride, 0x7fffffff);
+  sad_array[3] = vp9_sad64x64_c(src_ptr, src_stride,
+                                ref_ptr[3], ref_stride, 0x7fffffff);
+}
+
 void vp9_sad32x32x4d_c(const uint8_t *src_ptr,
                        int  src_stride,
                        uint8_t *ref_ptr[],
diff --git a/vp9/encoder/vp9_segmentation.c b/vp9/encoder/vp9_segmentation.c
index ee90f4f..19529fc 100644
--- a/vp9/encoder/vp9_segmentation.c
+++ b/vp9/encoder/vp9_segmentation.c
@@ -141,21 +141,57 @@
             segcounts[3] * vp9_cost_one(probs[2]);
 
   return cost;
+}
 
+static void count_segs(VP9_COMP *cpi,
+                       MODE_INFO *mi,
+                       int *no_pred_segcounts,
+                       int (*temporal_predictor_count)[2],
+                       int *t_unpred_seg_counts,
+                       int mb_size, int mb_row, int mb_col) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+  const int segmap_index = mb_row * cm->mb_cols + mb_col;
+  const int segment_id = mi->mbmi.segment_id;
+
+  xd->mode_info_context = mi;
+  xd->mb_to_top_edge = -((mb_row * 16) << 3);
+  xd->mb_to_left_edge = -((mb_col * 16) << 3);
+  xd->mb_to_bottom_edge = ((cm->mb_rows - mb_size - mb_row) * 16) << 3;
+  xd->mb_to_right_edge  = ((cm->mb_cols - mb_size - mb_col) * 16) << 3;
+
+  // Count the number of hits on each segment with no prediction
+  no_pred_segcounts[segment_id]++;
+
+  // Temporal prediction not allowed on key frames
+  if (cm->frame_type != KEY_FRAME) {
+    // Test to see if the segment id matches the predicted value.
+    const int seg_predicted =
+        (segment_id == vp9_get_pred_mb_segid(cm, xd, segmap_index));
+
+    // Get the segment id prediction context
+    const int pred_context = vp9_get_pred_context(cm, xd, PRED_SEG_ID);
+
+    // Store the prediction status for this mb and update counts
+    // as appropriate
+    vp9_set_pred_flag(xd, PRED_SEG_ID, seg_predicted);
+    temporal_predictor_count[pred_context][seg_predicted]++;
+
+    if (!seg_predicted)
+      // Update the "unpredicted" segment count
+      t_unpred_seg_counts[segment_id]++;
+  }
 }
 
 void vp9_choose_segmap_coding_method(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
 
-  int i;
   int no_pred_cost;
   int t_pred_cost = INT_MAX;
-  int pred_context;
 
+  int i;
   int mb_row, mb_col;
-  int segmap_index = 0;
-  unsigned char segment_id;
 
   int temporal_predictor_count[PREDICTION_PROBS][2];
   int no_pred_segcounts[MAX_MB_SEGMENTS];
@@ -165,9 +201,8 @@
   vp9_prob t_pred_tree[MB_FEATURE_TREE_PROBS];
   vp9_prob t_nopred_prob[PREDICTION_PROBS];
 
-#if CONFIG_SUPERBLOCKS
   const int mis = cm->mode_info_stride;
-#endif
+  MODE_INFO *mi_ptr = cm->mi, *mi;
 
   // Set default state for the segment tree probabilities and the
   // temporal coding probabilities
@@ -183,87 +218,57 @@
   // First of all generate stats regarding how well the last segment map
   // predicts this one
 
-  // Initialize macroblock decoder mode info context for the first mb
-  // in the frame
-  xd->mode_info_context = cm->mi;
-
-  for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 2) {
-    for (mb_col = 0; mb_col < cm->mb_cols; mb_col += 2) {
-      for (i = 0; i < 4; i++) {
-        static const int dx[4] = { +1, -1, +1, +1 };
-        static const int dy[4] = {  0, +1,  0, -1 };
-        int x_idx = i & 1, y_idx = i >> 1;
-
-        if (mb_col + x_idx >= cm->mb_cols ||
-            mb_row + y_idx >= cm->mb_rows) {
-          goto end;
-        }
-
-        xd->mb_to_top_edge = -((mb_row * 16) << 3);
-        xd->mb_to_left_edge = -((mb_col * 16) << 3);
-
-        segmap_index = (mb_row + y_idx) * cm->mb_cols + mb_col + x_idx;
-        segment_id = xd->mode_info_context->mbmi.segment_id;
+  for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 4, mi_ptr += 4 * mis) {
+    mi = mi_ptr;
+    for (mb_col = 0; mb_col < cm->mb_cols; mb_col += 4, mi += 4) {
+#if CONFIG_SUPERBLOCKS && CONFIG_SUPERBLOCKS64
+      if (mi->mbmi.sb_type == BLOCK_SIZE_SB64X64) {
+        count_segs(cpi, mi, no_pred_segcounts, temporal_predictor_count,
+                   t_unpred_seg_counts, 4, mb_row, mb_col);
+      } else
+#endif
+      {
+        for (i = 0; i < 4; i++) {
+          int x_idx = (i & 1) << 1, y_idx = i & 2;
 #if CONFIG_SUPERBLOCKS
-        if (xd->mode_info_context->mbmi.encoded_as_sb) {
-          if (mb_col + 1 < cm->mb_cols)
-            segment_id = segment_id &&
-                         xd->mode_info_context[1].mbmi.segment_id;
-          if (mb_row + 1 < cm->mb_rows) {
-            segment_id = segment_id &&
-                         xd->mode_info_context[mis].mbmi.segment_id;
-            if (mb_col + 1 < cm->mb_cols)
-              segment_id = segment_id &&
-                           xd->mode_info_context[mis + 1].mbmi.segment_id;
+          MODE_INFO *sb_mi = mi + y_idx * mis + x_idx;
+#endif
+
+          if (mb_col + x_idx >= cm->mb_cols ||
+              mb_row + y_idx >= cm->mb_rows) {
+            continue;
           }
-          xd->mb_to_bottom_edge = ((cm->mb_rows - 2 - mb_row) * 16) << 3;
-          xd->mb_to_right_edge  = ((cm->mb_cols - 2 - mb_col) * 16) << 3;
-        } else {
-#endif
-          xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
-          xd->mb_to_right_edge  = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
-#if CONFIG_SUPERBLOCKS
-        }
-#endif
-
-        // Count the number of hits on each segment with no prediction
-        no_pred_segcounts[segment_id]++;
-
-        // Temporal prediction not allowed on key frames
-        if (cm->frame_type != KEY_FRAME) {
-          // Test to see if the segment id matches the predicted value.
-          int seg_predicted =
-            (segment_id == vp9_get_pred_mb_segid(cm, xd, segmap_index));
-
-          // Get the segment id prediction context
-          pred_context =
-            vp9_get_pred_context(cm, xd, PRED_SEG_ID);
-
-          // Store the prediction status for this mb and update counts
-          // as appropriate
-          vp9_set_pred_flag(xd, PRED_SEG_ID, seg_predicted);
-          temporal_predictor_count[pred_context][seg_predicted]++;
-
-          if (!seg_predicted)
-            // Update the "unpredicted" segment count
-            t_unpred_seg_counts[segment_id]++;
-        }
 
 #if CONFIG_SUPERBLOCKS
-        if (xd->mode_info_context->mbmi.encoded_as_sb) {
-          assert(!i);
-          xd->mode_info_context += 2;
-          break;
-        }
+          if (sb_mi->mbmi.sb_type) {
+            assert(sb_mi->mbmi.sb_type == BLOCK_SIZE_SB32X32);
+            count_segs(cpi, sb_mi, no_pred_segcounts, temporal_predictor_count,
+                       t_unpred_seg_counts, 2, mb_row + y_idx, mb_col + x_idx);
+          } else
 #endif
-      end:
-        xd->mode_info_context += dx[i] + dy[i] * cm->mode_info_stride;
+          {
+            int j;
+
+            for (j = 0; j < 4; j++) {
+              const int x_idx_mb = x_idx + (j & 1), y_idx_mb = y_idx + (j >> 1);
+              MODE_INFO *mb_mi = mi + x_idx_mb + y_idx_mb * mis;
+
+              if (mb_col + x_idx_mb >= cm->mb_cols ||
+                  mb_row + y_idx_mb >= cm->mb_rows) {
+                continue;
+              }
+
+#if CONFIG_SUPERBLOCKS
+              assert(mb_mi->mbmi.sb_type == BLOCK_SIZE_MB16X16);
+#endif
+              count_segs(cpi, mb_mi, no_pred_segcounts,
+                         temporal_predictor_count, t_unpred_seg_counts,
+                         1, mb_row + y_idx_mb, mb_col + x_idx_mb);
+            }
+          }
+        }
       }
     }
-
-    // this is to account for the border in mode_info_context
-    xd->mode_info_context -= mb_col;
-    xd->mode_info_context += cm->mode_info_stride * 2;
   }
 
   // Work out probability tree for coding segments without prediction
diff --git a/vp9/encoder/vp9_variance_c.c b/vp9/encoder/vp9_variance_c.c
index ecb9257..9060d4c 100644
--- a/vp9/encoder/vp9_variance_c.c
+++ b/vp9/encoder/vp9_variance_c.c
@@ -25,6 +25,19 @@
 }
 
 #if CONFIG_SUPERBLOCKS
+unsigned int vp9_variance64x64_c(const uint8_t *src_ptr,
+                                 int  source_stride,
+                                 const uint8_t *ref_ptr,
+                                 int  recon_stride,
+                                 unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64, &var, &avg);
+  *sse = var;
+  return (var - (((int64_t)avg * avg) >> 12));
+}
+
 unsigned int vp9_variance32x32_c(const uint8_t *src_ptr,
                                  int  source_stride,
                                  const uint8_t *ref_ptr,
@@ -185,6 +198,27 @@
 }
 
 #if CONFIG_SUPERBLOCKS
+unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr,
+                                           int  src_pixels_per_line,
+                                           int  xoffset,
+                                           int  yoffset,
+                                           const uint8_t *dst_ptr,
+                                           int dst_pixels_per_line,
+                                           unsigned int *sse) {
+  uint16_t FData3[65 * 64];  // Temp data buffer used in filtering
+  uint8_t temp2[68 * 64];
+  const int16_t *HFilter, *VFilter;
+
+  HFilter = vp9_bilinear_filters[xoffset];
+  VFilter = vp9_bilinear_filters[yoffset];
+
+  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line,
+                                    1, 65, 64, HFilter);
+  var_filter_block2d_bil_second_pass(FData3, temp2, 64, 64, 64, 64, VFilter);
+
+  return vp9_variance64x64_c(temp2, 64, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr,
                                            int  src_pixels_per_line,
                                            int  xoffset,
@@ -224,6 +258,15 @@
   return vp9_sub_pixel_variance32x32_c(src_ptr, source_stride, 8, 0,
                                        ref_ptr, recon_stride, sse);
 }
+
+unsigned int vp9_variance_halfpixvar64x64_h_c(const uint8_t *src_ptr,
+                                              int  source_stride,
+                                              const uint8_t *ref_ptr,
+                                              int  recon_stride,
+                                              unsigned int *sse) {
+  return vp9_sub_pixel_variance64x64_c(src_ptr, source_stride, 8, 0,
+                                       ref_ptr, recon_stride, sse);
+}
 #endif
 
 
@@ -245,6 +288,15 @@
   return vp9_sub_pixel_variance32x32_c(src_ptr, source_stride, 0, 8,
                                        ref_ptr, recon_stride, sse);
 }
+
+unsigned int vp9_variance_halfpixvar64x64_v_c(const uint8_t *src_ptr,
+                                              int  source_stride,
+                                              const uint8_t *ref_ptr,
+                                              int  recon_stride,
+                                              unsigned int *sse) {
+  return vp9_sub_pixel_variance64x64_c(src_ptr, source_stride, 0, 8,
+                                       ref_ptr, recon_stride, sse);
+}
 #endif
 
 unsigned int vp9_variance_halfpixvar16x16_hv_c(const uint8_t *src_ptr,
@@ -265,6 +317,15 @@
   return vp9_sub_pixel_variance32x32_c(src_ptr, source_stride, 8, 8,
                                        ref_ptr, recon_stride, sse);
 }
+
+unsigned int vp9_variance_halfpixvar64x64_hv_c(const uint8_t *src_ptr,
+                                               int  source_stride,
+                                               const uint8_t *ref_ptr,
+                                               int  recon_stride,
+                                               unsigned int *sse) {
+  return vp9_sub_pixel_variance64x64_c(src_ptr, source_stride, 8, 8,
+                                       ref_ptr, recon_stride, sse);
+}
 #endif
 
 unsigned int vp9_sub_pixel_mse16x16_c(const uint8_t *src_ptr,
@@ -293,6 +354,19 @@
                                 dst_pixels_per_line, sse);
   return *sse;
 }
+
+unsigned int vp9_sub_pixel_mse64x64_c(const uint8_t *src_ptr,
+                                      int  src_pixels_per_line,
+                                      int  xoffset,
+                                      int  yoffset,
+                                      const uint8_t *dst_ptr,
+                                      int dst_pixels_per_line,
+                                      unsigned int *sse) {
+  vp9_sub_pixel_variance64x64_c(src_ptr, src_pixels_per_line,
+                                xoffset, yoffset, dst_ptr,
+                                dst_pixels_per_line, sse);
+  return *sse;
+}
 #endif
 
 unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr,