Merge "Moved dequantization into the token decoder" into experimental
diff --git a/configure b/configure
index 5d2910b..5c015b3 100755
--- a/configure
+++ b/configure
@@ -243,14 +243,14 @@
     newbintramodes
     comp_interintra_pred
     enable_6tap
-    code_nonzerocount
     modelcoefprob
     loop_dering
-    implicit_compoundinter_weight
     scatterscan
     oneshotq
     sbsegment
     multiple_arf
+    code_zerogroup
+    sb8x8
 "
 CONFIG_LIST="
     external_build
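The entries added to EXPERIMENT_LIST above (code_zerogroup, sb8x8) take the place of the removed code_nonzerocount and implicit_compoundinter_weight experiments. Assuming the usual libvpx configure behaviour, each EXPERIMENT_LIST entry becomes a CONFIG_* define that is only selectable when the build is configured with --enable-experimental together with the matching --enable-<experiment> flag (for example --enable-code-zerogroup).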
diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c
index e142362..56fac12 100644
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -57,11 +57,12 @@
   vpx_free(oci->above_context);
   vpx_free(oci->mip);
   vpx_free(oci->prev_mip);
+  vpx_free(oci->above_seg_context);
 
   oci->above_context = 0;
   oci->mip = 0;
   oci->prev_mip = 0;
-
+  oci->above_seg_context = 0;
 }
 
 int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) {
@@ -130,13 +131,21 @@
   oci->prev_mi = oci->prev_mip + oci->mode_info_stride + 1;
 
   oci->above_context =
-    vpx_calloc(sizeof(ENTROPY_CONTEXT_PLANES) * (3 + oci->mb_cols), 1);
+    vpx_calloc(sizeof(ENTROPY_CONTEXT_PLANES) * mb_cols_aligned_to_sb(oci), 1);
 
   if (!oci->above_context) {
     vp9_free_frame_buffers(oci);
     return 1;
   }
 
+  oci->above_seg_context =
+    vpx_calloc(sizeof(PARTITION_CONTEXT) * mb_cols_aligned_to_sb(oci), 1);
+
+  if (!oci->above_seg_context) {
+    vp9_free_frame_buffers(oci);
+    return 1;
+  }
+
   vp9_update_mode_info_border(oci, oci->mip);
   vp9_update_mode_info_in_image(oci, oci->mi);
 
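The above_seg_context row added here holds one PARTITION_CONTEXT per macroblock column, sized with mb_cols_aligned_to_sb() so that a 64x64 superblock at the right edge never indexes past the end of the row. A minimal standalone sketch of that sizing, assuming mb_cols_aligned_to_sb() rounds the column count up to a multiple of 4 (the width of a 64x64 superblock in macroblocks); the example_* names are illustrative, not functions from this tree:

#include <stdlib.h>

typedef char PARTITION_CONTEXT;

/* assumed behaviour: round mb_cols up to a whole number of 64x64 superblocks */
static int example_mb_cols_aligned_to_sb(int mb_cols) {
  return (mb_cols + 3) & ~3;
}

static PARTITION_CONTEXT *example_alloc_above_seg_context(int mb_cols) {
  /* calloc mirrors the vpx_calloc above: the row starts with no bits set */
  return (PARTITION_CONTEXT *)calloc(example_mb_cols_aligned_to_sb(mb_cols),
                                     sizeof(PARTITION_CONTEXT));
}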
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index f5ef3c5..89fb9d0 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -23,7 +23,8 @@
 
 // #define MODE_STATS
 
-#define MB_FEATURE_TREE_PROBS   3
+#define MAX_MB_SEGMENTS     8
+#define MB_SEG_TREE_PROBS   (MAX_MB_SEGMENTS-1)
 #define PREDICTION_PROBS 3
 
 #define DEFAULT_PRED_PROB_0 120
@@ -32,8 +33,6 @@
 
 #define MBSKIP_CONTEXTS 3
 
-#define MAX_MB_SEGMENTS         4
-
 #define MAX_REF_LF_DELTAS       4
 #define MAX_MODE_LF_DELTAS      4
 
@@ -55,6 +54,8 @@
   ENTROPY_CONTEXT v[2];
 } ENTROPY_CONTEXT_PLANES;
 
+typedef char PARTITION_CONTEXT;
+
 static INLINE int combine_entropy_contexts(ENTROPY_CONTEXT a,
                                            ENTROPY_CONTEXT b) {
   return (a != 0) + (b != 0);
@@ -246,11 +247,6 @@
   return mb_height_log2(sb_type) + 2;
 }
 
-static INLINE int partition_plane(BLOCK_SIZE_TYPE sb_type) {
-  assert(mb_width_log2(sb_type) == mb_height_log2(sb_type));
-  return (mb_width_log2(sb_type) - 1);
-}
-
 typedef struct {
   MB_PREDICTION_MODE mode, uv_mode;
 #if CONFIG_COMP_INTERINTRA_PRED
@@ -268,7 +264,7 @@
   unsigned char mb_skip_coeff;                                /* does this mb has coefficients at all, 1=no coefficients, 0=need decode tokens */
   unsigned char need_to_clamp_mvs;
   unsigned char need_to_clamp_secondmv;
-  unsigned char segment_id;                  /* Which set of segmentation parameters should be used for this MB */
+  unsigned char segment_id;           // Segment id for current frame
 
   // Flags used for prediction status of various bistream signals
   unsigned char seg_id_predicted;
@@ -282,9 +278,6 @@
   INTERPOLATIONFILTERTYPE interp_filter;
 
   BLOCK_SIZE_TYPE sb_type;
-#if CONFIG_CODE_NONZEROCOUNT
-  uint16_t nzcs[256+64*2];
-#endif
 } MB_MODE_INFO;
 
 typedef struct {
@@ -329,11 +322,7 @@
                                               int den,
                                               int offset_q4);
 
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
-  convolve_fn_t predict[2][2][8];  // horiz, vert, weight (0 - 7)
-#else
   convolve_fn_t predict[2][2][2];  // horiz, vert, avg
-#endif
 };
 
 enum { MAX_MB_PLANE = 3 };
@@ -343,7 +332,7 @@
   int stride;
 };
 
-struct mb_plane {
+struct macroblockd_plane {
   DECLARE_ALIGNED(16, int16_t,  qcoeff[64 * 64]);
   DECLARE_ALIGNED(16, int16_t,  dqcoeff[64 * 64]);
   DECLARE_ALIGNED(16, uint16_t, eobs[256]);
@@ -363,16 +352,11 @@
   BLOCK_OFFSET((x)->plane[2].field, ((i) - 20), 16))
 
 typedef struct macroblockd {
-#if CONFIG_CODE_NONZEROCOUNT
-  DECLARE_ALIGNED(16, uint16_t, nzcs[256+64*2]);
-#endif
-  struct mb_plane plane[MAX_MB_PLANE];
+  struct macroblockd_plane plane[MAX_MB_PLANE];
 
   /* 16 Y blocks, 4 U, 4 V, each with 16 entries. */
   BLOCKD block[24];
 
-  YV12_BUFFER_CONFIG pre; /* Filtered copy of previous frame reconstruction */
-  YV12_BUFFER_CONFIG second_pre;
   struct scale_factors scale_factor[2];
   struct scale_factors scale_factor_uv[2];
 
@@ -390,12 +374,20 @@
   ENTROPY_CONTEXT_PLANES *above_context;
   ENTROPY_CONTEXT_PLANES *left_context;
 
+  // partition contexts
+  PARTITION_CONTEXT *above_seg_context;
+  PARTITION_CONTEXT *left_seg_context;
+
   /* 0 indicates segmentation at MB level is not enabled. Otherwise the individual bits indicate which features are active. */
   unsigned char segmentation_enabled;
 
   /* 0 (do not update) 1 (update) the macroblock segmentation map. */
   unsigned char update_mb_segmentation_map;
 
+#if CONFIG_IMPLICIT_SEGMENTATION
+  unsigned char allow_implicit_segment_update;
+#endif
+
   /* 0 (do not update) 1 (update) the macroblock segmentation feature data. */
   unsigned char update_mb_segmentation_data;
 
@@ -406,8 +398,7 @@
   /* are enabled and when enabled the proabilities used to decode the per MB flags in MB_MODE_INFO */
 
   // Probability Tree used to code Segment number
-  vp9_prob mb_segment_tree_probs[MB_FEATURE_TREE_PROBS];
-  vp9_prob mb_segment_mispred_tree_probs[MAX_MB_SEGMENTS];
+  vp9_prob mb_segment_tree_probs[MB_SEG_TREE_PROBS];
 
   // Segment features
   signed char segment_feature_data[MAX_MB_SEGMENTS][SEG_LVL_MAX];
@@ -458,6 +449,69 @@
 
 } MACROBLOCKD;
 
+static INLINE void update_partition_context(MACROBLOCKD *xd,
+                                            BLOCK_SIZE_TYPE sb_type,
+                                            BLOCK_SIZE_TYPE sb_size) {
+  int bsl = mb_width_log2(sb_size), bs = 1 << bsl;
+  int bwl = mb_width_log2(sb_type);
+  int bhl = mb_height_log2(sb_type);
+  int boffset = mb_width_log2(BLOCK_SIZE_SB64X64) - bsl;
+  int i;
+  // skip macroblock partition
+  if (bsl == 0)
+    return;
+
+  // update the partition context at the end node. Set partition bits
+  // of block sizes larger than the current one to 1, and partition bits
+  // of smaller block sizes to 0.
+  if ((bwl == bsl) && (bhl == bsl)) {
+    for (i = 0; i < bs; i++)
+      xd->left_seg_context[i] = ~(0xf << boffset);
+    for (i = 0; i < bs; i++)
+      xd->above_seg_context[i] = ~(0xf << boffset);
+#if CONFIG_SBSEGMENT
+  } else if ((bwl == bsl) && (bhl < bsl)) {
+    for (i = 0; i < bs; i++)
+      xd->left_seg_context[i] = ~(0xe << boffset);
+    for (i = 0; i < bs; i++)
+      xd->above_seg_context[i] = ~(0xf << boffset);
+  } else if ((bwl < bsl) && (bhl == bsl)) {
+    for (i = 0; i < bs; i++)
+      xd->left_seg_context[i] = ~(0xf << boffset);
+    for (i = 0; i < bs; i++)
+      xd->above_seg_context[i] = ~(0xe << boffset);
+#endif
+  } else if ((bwl < bsl) && (bhl < bsl)) {
+    for (i = 0; i < bs; i++)
+      xd->left_seg_context[i] = ~(0xe << boffset);
+    for (i = 0; i < bs; i++)
+      xd->above_seg_context[i] = ~(0xe << boffset);
+  } else {
+    assert(0);
+  }
+}
+
+static INLINE int partition_plane_context(MACROBLOCKD *xd,
+                                          BLOCK_SIZE_TYPE sb_type) {
+  int bsl = mb_width_log2(sb_type), bs = 1 << bsl;
+  int above = 0, left = 0, i;
+  int boffset = mb_width_log2(BLOCK_SIZE_SB64X64) - bsl;
+
+  assert(mb_width_log2(sb_type) == mb_height_log2(sb_type));
+  assert(bsl >= 0);
+  assert(boffset >= 0);
+
+  for (i = 0; i < bs; i++)
+    above |= (xd->above_seg_context[i] & (1 << boffset));
+  for (i = 0; i < bs; i++)
+    left |= (xd->left_seg_context[i] & (1 << boffset));
+
+  above = (above > 0);
+  left  = (left > 0);
+
+  return (left * 2 + above) + (bsl - 1) * PARTITION_PLOFFSET;
+}
+
 #define ACTIVE_HT   110                // quantization stepsize threshold
 
 #define ACTIVE_HT8  300
@@ -752,12 +806,6 @@
   return size;
 }
 
-#if CONFIG_CODE_NONZEROCOUNT
-static int get_nzc_used(TX_SIZE tx_size) {
-  return (tx_size >= TX_16X16);
-}
-#endif
-
 struct plane_block_idx {
   int plane;
   int block;
@@ -766,7 +814,7 @@
 // TODO(jkoleszar): returning a struct so it can be used in a const context,
 // expect to refactor this further later.
 static INLINE struct plane_block_idx plane_block_idx(int y_blocks,
-                                                      int b_idx) {
+                                                     int b_idx) {
   const int v_offset = y_blocks * 5 / 4;
   struct plane_block_idx res;
 
@@ -937,7 +985,22 @@
     foreach_predicted_block_in_plane(xd, bsize, plane, visit, arg);
   }
 }
+static int raster_block_offset(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize,
+                               int plane, int block) {
+  const int bw = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
+  const int stride = 4 << bw;
+  const int y = 4 * (block >> bw), x = 4 * (block & ((1 << bw) - 1));
+  return y * stride + x;
+}
+static int16_t* raster_block_offset_int16(MACROBLOCKD *xd,
+                                         BLOCK_SIZE_TYPE bsize,
+                                         int plane, int block, int16_t *base) {
+  return base + raster_block_offset(xd, bsize, plane, block);
+}
 
-
-
+#if CONFIG_CODE_ZEROGROUP
+static int get_zpc_used(TX_SIZE tx_size) {
+  return (tx_size >= TX_16X16);
+}
+#endif
 #endif  // VP9_COMMON_VP9_BLOCKD_H_
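The two new inline helpers above are intended to be used as a pair when a superblock partition is coded: partition_plane_context() folds the left/above rows into a context index before the partition symbol is coded, and update_partition_context() writes the chosen block size back into both rows afterwards. A minimal usage sketch for superblock sizes, assuming this header is included, that PARTITION_PLOFFSET is defined elsewhere in the tree, and that the returned index selects the probabilities for the partition symbol (the actual encode/decode call sites are not part of this header):

static void example_code_partition(MACROBLOCKD *xd,
                                   BLOCK_SIZE_TYPE chosen_size,
                                   BLOCK_SIZE_TYPE sb_size) {
  /* 1. context from the left/above partition rows for this superblock size */
  const int pl = partition_plane_context(xd, sb_size);

  /* 2. pl would index the partition probabilities used to code the symbol */
  (void)pl;

  /* 3. record the chosen size so neighbouring blocks see it as context */
  update_partition_context(xd, chosen_size, sb_size);
}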
diff --git a/vp9/common/vp9_coefupdateprobs.h b/vp9/common/vp9_coefupdateprobs.h
index b4d892d..e49935c 100644
--- a/vp9/common/vp9_coefupdateprobs.h
+++ b/vp9/common/vp9_coefupdateprobs.h
@@ -18,12 +18,8 @@
   252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252
 };
 
-#if CONFIG_CODE_NONZEROCOUNT
-#define NZC_UPDATE_PROB_4X4     252
-#define NZC_UPDATE_PROB_8X8     252
-#define NZC_UPDATE_PROB_16X16   252
-#define NZC_UPDATE_PROB_32X32   252
-#define NZC_UPDATE_PROB_PCAT    252
+#if CONFIG_CODE_ZEROGROUP
+#define ZPC_UPDATE_PROB         248
 #endif
 
 #if CONFIG_MODELCOEFPROB
diff --git a/vp9/common/vp9_convolve.c b/vp9/common/vp9_convolve.c
index a27ca6f..46ae503 100644
--- a/vp9/common/vp9_convolve.c
+++ b/vp9/common/vp9_convolve.c
@@ -122,78 +122,6 @@
   }
 }
 
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
-
-static inline uint8_t combine_qtr(uint8_t a, uint8_t b) {
-  return (((a) + (b) * 3 + 2) >> 2);
-}
-
-static inline uint8_t combine_3qtr(uint8_t a, uint8_t b) {
-  return (((a) * 3 + (b) + 2) >> 2);
-}
-
-static inline uint8_t combine_1by8(uint8_t a, uint8_t b) {
-  return (((a) * 1 + (b) * 7 + 4) >> 3);
-}
-
-static inline uint8_t combine_3by8(uint8_t a, uint8_t b) {
-  return (((a) * 3 + (b) * 5 + 4) >> 3);
-}
-
-static inline uint8_t combine_5by8(uint8_t a, uint8_t b) {
-  return (((a) * 5 + (b) * 3 + 4) >> 3);
-}
-
-static inline uint8_t combine_7by8(uint8_t a, uint8_t b) {
-  return (((a) * 7 + (b) * 1 + 4) >> 3);
-}
-
-// TODO(debargha): Implment with a separate weight parameter
-static void convolve_wtd_horiz_c(const uint8_t *src, int src_stride,
-                                 uint8_t *dst, int dst_stride,
-                                 const int16_t *filter_x0, int x_step_q4,
-                                 const int16_t *filter_y, int y_step_q4,
-                                 int w, int h, int taps,
-                                 uint8_t (*combine)(uint8_t a, uint8_t b)) {
-  int x, y, k, sum;
-  const int16_t *filter_x_base = filter_x0;
-
-#if ALIGN_FILTERS_256
-  filter_x_base = (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff);
-#endif
-
-  /* Adjust base pointer address for this source line */
-  src -= taps / 2 - 1;
-
-  for (y = 0; y < h; ++y) {
-    /* Pointer to filter to use */
-    const int16_t *filter_x = filter_x0;
-
-    /* Initial phase offset */
-    int x0_q4 = (filter_x - filter_x_base) / taps;
-    int x_q4 = x0_q4;
-
-    for (x = 0; x < w; ++x) {
-      /* Per-pixel src offset */
-      int src_x = (x_q4 - x0_q4) >> 4;
-
-      for (sum = 0, k = 0; k < taps; ++k) {
-        sum += src[src_x + k] * filter_x[k];
-      }
-      sum += (VP9_FILTER_WEIGHT >> 1);
-      dst[x] = combine(dst[x], clip_pixel(sum >> VP9_FILTER_SHIFT));
-
-      /* Adjust source and filter to use for the next pixel */
-      x_q4 += x_step_q4;
-      filter_x = filter_x_base + (x_q4 & 0xf) * taps;
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-#endif
-
 static void convolve_vert_c(const uint8_t *src, int src_stride,
                             uint8_t *dst, int dst_stride,
                             const int16_t *filter_x, int x_step_q4,
@@ -279,52 +207,6 @@
   }
 }
 
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
-static void convolve_wtd_vert_c(const uint8_t *src, int src_stride,
-                                uint8_t *dst, int dst_stride,
-                                const int16_t *filter_x, int x_step_q4,
-                                const int16_t *filter_y0, int y_step_q4,
-                                int w, int h, int taps,
-                                uint8_t (*combine)(uint8_t a, uint8_t b)) {
-  int x, y, k, sum;
-
-  const int16_t *filter_y_base = filter_y0;
-
-#if ALIGN_FILTERS_256
-  filter_y_base = (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff);
-#endif
-
-  /* Adjust base pointer address for this source column */
-  src -= src_stride * (taps / 2 - 1);
-  for (x = 0; x < w; ++x) {
-    /* Pointer to filter to use */
-    const int16_t *filter_y = filter_y0;
-
-    /* Initial phase offset */
-    int y0_q4 = (filter_y - filter_y_base) / taps;
-    int y_q4 = y0_q4;
-
-    for (y = 0; y < h; ++y) {
-      /* Per-pixel src offset */
-      int src_y = (y_q4 - y0_q4) >> 4;
-
-      for (sum = 0, k = 0; k < taps; ++k) {
-        sum += src[(src_y + k) * src_stride] * filter_y[k];
-      }
-      sum += (VP9_FILTER_WEIGHT >> 1);
-      dst[y * dst_stride] = combine(dst[y * dst_stride],
-                                    clip_pixel(sum >> VP9_FILTER_SHIFT));
-
-      /* Adjust source and filter to use for the next pixel */
-      y_q4 += y_step_q4;
-      filter_y = filter_y_base + (y_q4 & 0xf) * taps;
-    }
-    ++src;
-    ++dst;
-  }
-}
-#endif
-
 static void convolve_c(const uint8_t *src, int src_stride,
                        uint8_t *dst, int dst_stride,
                        const int16_t *filter_x, int x_step_q4,
@@ -403,68 +285,6 @@
                        w, h, 8);
 }
 
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
-void vp9_convolve8_1by8_horiz_c(const uint8_t *src, int src_stride,
-                                uint8_t *dst, int dst_stride,
-                                const int16_t *filter_x, int x_step_q4,
-                                const int16_t *filter_y, int y_step_q4,
-                                int w, int h) {
-  convolve_wtd_horiz_c(src, src_stride, dst, dst_stride,
-                       filter_x, x_step_q4, filter_y, y_step_q4,
-                       w, h, 8, combine_1by8);
-}
-
-void vp9_convolve8_qtr_horiz_c(const uint8_t *src, int src_stride,
-                               uint8_t *dst, int dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4,
-                               int w, int h) {
-  convolve_wtd_horiz_c(src, src_stride, dst, dst_stride,
-                       filter_x, x_step_q4, filter_y, y_step_q4,
-                       w, h, 8, combine_qtr);
-}
-
-void vp9_convolve8_3by8_horiz_c(const uint8_t *src, int src_stride,
-                                uint8_t *dst, int dst_stride,
-                                const int16_t *filter_x, int x_step_q4,
-                                const int16_t *filter_y, int y_step_q4,
-                                int w, int h) {
-  convolve_wtd_horiz_c(src, src_stride, dst, dst_stride,
-                       filter_x, x_step_q4, filter_y, y_step_q4,
-                       w, h, 8, combine_3by8);
-}
-
-void vp9_convolve8_5by8_horiz_c(const uint8_t *src, int src_stride,
-                                uint8_t *dst, int dst_stride,
-                                const int16_t *filter_x, int x_step_q4,
-                                const int16_t *filter_y, int y_step_q4,
-                                int w, int h) {
-  convolve_wtd_horiz_c(src, src_stride, dst, dst_stride,
-                       filter_x, x_step_q4, filter_y, y_step_q4,
-                       w, h, 8, combine_5by8);
-}
-
-void vp9_convolve8_3qtr_horiz_c(const uint8_t *src, int src_stride,
-                                uint8_t *dst, int dst_stride,
-                                const int16_t *filter_x, int x_step_q4,
-                                const int16_t *filter_y, int y_step_q4,
-                                int w, int h) {
-  convolve_wtd_horiz_c(src, src_stride, dst, dst_stride,
-                       filter_x, x_step_q4, filter_y, y_step_q4,
-                       w, h, 8, combine_3qtr);
-}
-
-void vp9_convolve8_7by8_horiz_c(const uint8_t *src, int src_stride,
-                                uint8_t *dst, int dst_stride,
-                                const int16_t *filter_x, int x_step_q4,
-                                const int16_t *filter_y, int y_step_q4,
-                                int w, int h) {
-  convolve_wtd_horiz_c(src, src_stride, dst, dst_stride,
-                       filter_x, x_step_q4, filter_y, y_step_q4,
-                       w, h, 8, combine_7by8);
-}
-#endif
-
 void vp9_convolve8_vert_c(const uint8_t *src, int src_stride,
                           uint8_t *dst, int dst_stride,
                           const int16_t *filter_x, int x_step_q4,
@@ -485,68 +305,6 @@
                       w, h, 8);
 }
 
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
-void vp9_convolve8_1by8_vert_c(const uint8_t *src, int src_stride,
-                               uint8_t *dst, int dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4,
-                               int w, int h) {
-  convolve_wtd_vert_c(src, src_stride, dst, dst_stride,
-                      filter_x, x_step_q4, filter_y, y_step_q4,
-                      w, h, 8, combine_1by8);
-}
-
-void vp9_convolve8_qtr_vert_c(const uint8_t *src, int src_stride,
-                              uint8_t *dst, int dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y, int y_step_q4,
-                              int w, int h) {
-  convolve_wtd_vert_c(src, src_stride, dst, dst_stride,
-                      filter_x, x_step_q4, filter_y, y_step_q4,
-                      w, h, 8, combine_qtr);
-}
-
-void vp9_convolve8_3by8_vert_c(const uint8_t *src, int src_stride,
-                               uint8_t *dst, int dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4,
-                               int w, int h) {
-  convolve_wtd_vert_c(src, src_stride, dst, dst_stride,
-                      filter_x, x_step_q4, filter_y, y_step_q4,
-                      w, h, 8, combine_3by8);
-}
-
-void vp9_convolve8_5by8_vert_c(const uint8_t *src, int src_stride,
-                               uint8_t *dst, int dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4,
-                               int w, int h) {
-  convolve_wtd_vert_c(src, src_stride, dst, dst_stride,
-                      filter_x, x_step_q4, filter_y, y_step_q4,
-                      w, h, 8, combine_5by8);
-}
-
-void vp9_convolve8_3qtr_vert_c(const uint8_t *src, int src_stride,
-                               uint8_t *dst, int dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4,
-                               int w, int h) {
-  convolve_wtd_vert_c(src, src_stride, dst, dst_stride,
-                      filter_x, x_step_q4, filter_y, y_step_q4,
-                      w, h, 8, combine_3qtr);
-}
-
-void vp9_convolve8_7by8_vert_c(const uint8_t *src, int src_stride,
-                               uint8_t *dst, int dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4,
-                               int w, int h) {
-  convolve_wtd_vert_c(src, src_stride, dst, dst_stride,
-                      filter_x, x_step_q4, filter_y, y_step_q4,
-                      w, h, 8, combine_7by8);
-}
-#endif
-
 void vp9_convolve8_c(const uint8_t *src, int src_stride,
                      uint8_t *dst, int dst_stride,
                      const int16_t *filter_x, int x_step_q4,
@@ -579,140 +337,6 @@
                    w, h);
 }
 
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
-void vp9_convolve8_1by8_c(const uint8_t *src, int src_stride,
-                         uint8_t *dst, int dst_stride,
-                         const int16_t *filter_x, int x_step_q4,
-                         const int16_t *filter_y, int y_step_q4,
-                         int w, int h) {
-  /* Fixed size intermediate buffer places limits on parameters. */
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp, 16 * 16);
-  assert(w <= 16);
-  assert(h <= 16);
-
-  vp9_convolve8(src, src_stride,
-                temp, 16,
-                filter_x, x_step_q4,
-                filter_y, y_step_q4,
-                w, h);
-  vp9_convolve_1by8(temp, 16,
-                    dst, dst_stride,
-                    NULL, 0, /* These unused parameter should be removed! */
-                    NULL, 0, /* These unused parameter should be removed! */
-                    w, h);
-}
-
-void vp9_convolve8_qtr_c(const uint8_t *src, int src_stride,
-                         uint8_t *dst, int dst_stride,
-                         const int16_t *filter_x, int x_step_q4,
-                         const int16_t *filter_y, int y_step_q4,
-                         int w, int h) {
-  /* Fixed size intermediate buffer places limits on parameters. */
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp, 16 * 16);
-  assert(w <= 16);
-  assert(h <= 16);
-
-  vp9_convolve8(src, src_stride,
-                temp, 16,
-                filter_x, x_step_q4,
-                filter_y, y_step_q4,
-                w, h);
-  vp9_convolve_qtr(temp, 16,
-                   dst, dst_stride,
-                   NULL, 0, /* These unused parameter should be removed! */
-                   NULL, 0, /* These unused parameter should be removed! */
-                   w, h);
-}
-
-void vp9_convolve8_3by8_c(const uint8_t *src, int src_stride,
-                         uint8_t *dst, int dst_stride,
-                         const int16_t *filter_x, int x_step_q4,
-                         const int16_t *filter_y, int y_step_q4,
-                         int w, int h) {
-  /* Fixed size intermediate buffer places limits on parameters. */
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp, 16 * 16);
-  assert(w <= 16);
-  assert(h <= 16);
-
-  vp9_convolve8(src, src_stride,
-                temp, 16,
-                filter_x, x_step_q4,
-                filter_y, y_step_q4,
-                w, h);
-  vp9_convolve_3by8(temp, 16,
-                    dst, dst_stride,
-                    NULL, 0, /* These unused parameter should be removed! */
-                    NULL, 0, /* These unused parameter should be removed! */
-                    w, h);
-}
-
-void vp9_convolve8_5by8_c(const uint8_t *src, int src_stride,
-                         uint8_t *dst, int dst_stride,
-                         const int16_t *filter_x, int x_step_q4,
-                         const int16_t *filter_y, int y_step_q4,
-                         int w, int h) {
-  /* Fixed size intermediate buffer places limits on parameters. */
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp, 16 * 16);
-  assert(w <= 16);
-  assert(h <= 16);
-
-  vp9_convolve8(src, src_stride,
-                temp, 16,
-                filter_x, x_step_q4,
-                filter_y, y_step_q4,
-                w, h);
-  vp9_convolve_5by8(temp, 16,
-                    dst, dst_stride,
-                    NULL, 0, /* These unused parameter should be removed! */
-                    NULL, 0, /* These unused parameter should be removed! */
-                    w, h);
-}
-
-void vp9_convolve8_3qtr_c(const uint8_t *src, int src_stride,
-                          uint8_t *dst, int dst_stride,
-                          const int16_t *filter_x, int x_step_q4,
-                          const int16_t *filter_y, int y_step_q4,
-                          int w, int h) {
-  /* Fixed size intermediate buffer places limits on parameters. */
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp, 16 * 16);
-  assert(w <= 16);
-  assert(h <= 16);
-
-  vp9_convolve8(src, src_stride,
-                temp, 16,
-                filter_x, x_step_q4,
-                filter_y, y_step_q4,
-                w, h);
-  vp9_convolve_3qtr(temp, 16,
-                    dst, dst_stride,
-                    NULL, 0, /* These unused parameter should be removed! */
-                    NULL, 0, /* These unused parameter should be removed! */
-                    w, h);
-}
-
-void vp9_convolve8_7by8_c(const uint8_t *src, int src_stride,
-                         uint8_t *dst, int dst_stride,
-                         const int16_t *filter_x, int x_step_q4,
-                         const int16_t *filter_y, int y_step_q4,
-                         int w, int h) {
-  /* Fixed size intermediate buffer places limits on parameters. */
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp, 16 * 16);
-  assert(w <= 16);
-  assert(h <= 16);
-
-  vp9_convolve8(src, src_stride,
-                temp, 16,
-                filter_x, x_step_q4,
-                filter_y, y_step_q4,
-                w, h);
-  vp9_convolve_7by8(temp, 16,
-                    dst, dst_stride,
-                    NULL, 0, /* These unused parameter should be removed! */
-                    NULL, 0, /* These unused parameter should be removed! */
-                    w, h);
-}
-#endif
-
 void vp9_convolve_copy(const uint8_t *src, int src_stride,
                        uint8_t *dst, int dst_stride,
                        const int16_t *filter_x, int filter_x_stride,
@@ -750,101 +374,3 @@
     dst += dst_stride;
   }
 }
-
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
-void vp9_convolve_1by8(const uint8_t *src, int src_stride,
-                       uint8_t *dst, int dst_stride,
-                       const int16_t *filter_x, int filter_x_stride,
-                       const int16_t *filter_y, int filter_y_stride,
-                       int w, int h) {
-  int x, y;
-
-  for (y = 0; y < h; ++y) {
-    for (x = 0; x < w; ++x) {
-      dst[x] = combine_1by8(dst[x], src[x]);
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-void vp9_convolve_qtr(const uint8_t *src, int src_stride,
-                      uint8_t *dst, int dst_stride,
-                      const int16_t *filter_x, int filter_x_stride,
-                      const int16_t *filter_y, int filter_y_stride,
-                      int w, int h) {
-  int x, y;
-
-  for (y = 0; y < h; ++y) {
-    for (x = 0; x < w; ++x) {
-      dst[x] = combine_qtr(dst[x], src[x]);
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-void vp9_convolve_3by8(const uint8_t *src, int src_stride,
-                       uint8_t *dst, int dst_stride,
-                       const int16_t *filter_x, int filter_x_stride,
-                       const int16_t *filter_y, int filter_y_stride,
-                       int w, int h) {
-  int x, y;
-
-  for (y = 0; y < h; ++y) {
-    for (x = 0; x < w; ++x) {
-      dst[x] = combine_3by8(dst[x], src[x]);
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-void vp9_convolve_5by8(const uint8_t *src, int src_stride,
-                       uint8_t *dst, int dst_stride,
-                       const int16_t *filter_x, int filter_x_stride,
-                       const int16_t *filter_y, int filter_y_stride,
-                       int w, int h) {
-  int x, y;
-
-  for (y = 0; y < h; ++y) {
-    for (x = 0; x < w; ++x) {
-      dst[x] = combine_5by8(dst[x], src[x]);
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-void vp9_convolve_3qtr(const uint8_t *src, int src_stride,
-                       uint8_t *dst, int dst_stride,
-                       const int16_t *filter_x, int filter_x_stride,
-                       const int16_t *filter_y, int filter_y_stride,
-                       int w, int h) {
-  int x, y;
-
-  for (y = 0; y < h; ++y) {
-    for (x = 0; x < w; ++x) {
-      dst[x] = combine_3qtr(dst[x], src[x]);
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-void vp9_convolve_7by8(const uint8_t *src, int src_stride,
-                       uint8_t *dst, int dst_stride,
-                       const int16_t *filter_x, int filter_x_stride,
-                       const int16_t *filter_y, int filter_y_stride,
-                       int w, int h) {
-  int x, y;
-
-  for (y = 0; y < h; ++y) {
-    for (x = 0; x < w; ++x) {
-      dst[x] = combine_7by8(dst[x], src[x]);
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-#endif
diff --git a/vp9/common/vp9_convolve.h b/vp9/common/vp9_convolve.h
index bef2d85..0596080 100644
--- a/vp9/common/vp9_convolve.h
+++ b/vp9/common/vp9_convolve.h
@@ -33,50 +33,6 @@
                       const int16_t *filter_y, int y_step_q4,
                       int w, int h);
 
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
-// Not a convolution, a block wtd (1/8, 7/8) average for (dst, src)
-void vp9_convolve_1by8(const uint8_t *src, int src_stride,
-                       uint8_t *dst, int dst_stride,
-                       const int16_t *filter_x, int x_step_q4,
-                       const int16_t *filter_y, int y_step_q4,
-                       int w, int h);
-
-// Not a convolution, a block wtd (1/4, 3/4) average for (dst, src)
-void vp9_convolve_qtr(const uint8_t *src, int src_stride,
-                      uint8_t *dst, int dst_stride,
-                      const int16_t *filter_x, int x_step_q4,
-                      const int16_t *filter_y, int y_step_q4,
-                      int w, int h);
-
-// Not a convolution, a block wtd (3/8, 5/8) average for (dst, src)
-void vp9_convolve_3by8(const uint8_t *src, int src_stride,
-                       uint8_t *dst, int dst_stride,
-                       const int16_t *filter_x, int x_step_q4,
-                       const int16_t *filter_y, int y_step_q4,
-                       int w, int h);
-
-// Not a convolution, a block wtd (5/8, 3/8) average for (dst, src)
-void vp9_convolve_5by8(const uint8_t *src, int src_stride,
-                       uint8_t *dst, int dst_stride,
-                       const int16_t *filter_x, int x_step_q4,
-                       const int16_t *filter_y, int y_step_q4,
-                       int w, int h);
-
-// Not a convolution, a block wtd (3/4, 1/4) average for (dst, src)
-void vp9_convolve_3qtr(const uint8_t *src, int src_stride,
-                       uint8_t *dst, int dst_stride,
-                       const int16_t *filter_x, int x_step_q4,
-                       const int16_t *filter_y, int y_step_q4,
-                       int w, int h);
-
-// Not a convolution, a block wtd (7/8, 1/8) average for (dst, src)
-void vp9_convolve_7by8(const uint8_t *src, int src_stride,
-                       uint8_t *dst, int dst_stride,
-                       const int16_t *filter_x, int x_step_q4,
-                       const int16_t *filter_y, int y_step_q4,
-                       int w, int h);
-#endif
-
 struct subpix_fn_table {
   const int16_t (*filter_x)[8];
   const int16_t (*filter_y)[8];
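With the weighted (1/8 .. 7/8) combines removed, the compound-prediction path is left with the plain average entry in the [2][2][2] predict table. For reference, a sketch of that per-pixel averaging combine, written from the rounding convention vp9_convolve_avg is expected to use, (dst + src + 1) >> 1; the example_ name is illustrative:

#include <stdint.h>

static void example_avg_combine(const uint8_t *src, int src_stride,
                                uint8_t *dst, int dst_stride,
                                int w, int h) {
  int x, y;
  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x)
      dst[x] = (uint8_t)((dst[x] + src[x] + 1) >> 1);  /* rounded average */
    src += src_stride;
    dst += dst_stride;
  }
}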
diff --git a/vp9/common/vp9_default_coef_probs.h b/vp9/common/vp9_default_coef_probs.h
index 5a781fb..453b4a2 100644
--- a/vp9/common/vp9_default_coef_probs.h
+++ b/vp9/common/vp9_default_coef_probs.h
@@ -700,298 +700,85 @@
   }
 };
 
-#if CONFIG_CODE_NONZEROCOUNT
+#if CONFIG_CODE_ZEROGROUP
 
-// TODO(debargha): Remove the macro and count tables after experimentation
-#define NZC_DEFAULT_COUNTS  /* Uncomment to use counts as defaults */
-
-#ifdef NZC_DEFAULT_COUNTS
-static const unsigned int default_nzc_counts_4x4[MAX_NZC_CONTEXTS]
-                                                [REF_TYPES]
-                                                [BLOCK_TYPES]
-                                                [NZC4X4_TOKENS] = {
-  {
-    {
-      { 967652, 29023, 15039, 6952, 1568, 116 },
-      { 289116, 22938, 4522, 1935, 520, 47 }
-    }, {
-      { 967652, 29023, 15039, 6952, 1568, 116 },
-      { 689116, 22938, 4522, 1935, 520, 47 }
-    },
-  }, {
-    {
-      { 124684, 37167, 15270, 8483, 1777, 102 },
-      { 10405, 12395, 3401, 3574, 2461, 771 }
-    }, {
-      { 124684, 37167, 15270, 8483, 1777, 102 },
-      { 20405, 12395, 3401, 3574, 2461, 771 }
+// There are two probs: the first is the prob(0) of the isolated zero bit,
+// the second is the prob(0) of the end-of-orientation symbol [a 0 here
+// indicates a zerotree root].
+static const vp9_zpc_probs default_zpc_probs_4x4 = {
+  { /* Intra */
+    { /* ZPC Band 0 */
+      { 1, }, { 1, }, { 1, },
+    }, { /* ZPC Band 1 */
+      { 1, }, { 1, }, { 1, },
+    }, { /* ZPC Band 2 */
+      { 1, }, { 1, }, { 1, },
     }
-  }, {
-    {
-      { 4100, 22976, 15627, 16137, 7982, 1793 },
-      { 4249, 3084, 2131, 4081, 6439, 1653 }
-    }, {
-      { 21100, 22976, 15627, 16137, 7982, 1793 },
-      { 4249, 3084, 2131, 4081, 2439, 1653 }
+  }, { /* Inter */
+    { /* ZPC Band 0 */
+      { 1, }, { 1, }, { 1, },
+    }, { /* ZPC Band 1 */
+      { 1, }, { 1, }, { 1, },
+    }, { /* ZPC Band 2 */
+      { 1, }, { 1, }, { 1, },
     }
   }
 };
-
-static const unsigned int default_nzc_counts_8x8[MAX_NZC_CONTEXTS]
-                                                [REF_TYPES]
-                                                [BLOCK_TYPES]
-                                                [NZC8X8_TOKENS] = {
-  {
-    {
-      { 372988, 62777, 19440, 11812, 5145, 1917, 439, 10 },
-      { 72052, 30468, 6973, 3250, 1500, 750, 375, 5 },
-    }, {
-      { 372988, 62777, 19440, 11812, 5145, 1917, 439, 10 },
-      { 192052, 30468, 6973, 3250, 1500, 750, 375, 5 },
+static const vp9_zpc_probs default_zpc_probs_8x8 = {
+  { /* Intra */
+    { /* ZPC Band 0 */
+      { 4, }, { 2, }, { 1, },
+    }, { /* ZPC Band 1 */
+      { 4, }, { 2, }, { 1, },
+    }, { /* ZPC Band 2 */
+      { 4, }, { 2, }, { 1, },
     }
-  }, {
-    {
-      { 121533, 33527, 15655, 11920, 5723, 2009, 315, 7 },
-      { 23772, 23120, 13127, 8115, 4000, 2000, 200, 6 },
-    }, {
-      { 121533, 33527, 15655, 11920, 5723, 2009, 315, 7 },
-      { 23772, 23120, 13127, 8115, 4000, 2000, 200, 6 },
-    }
-  }, {
-    {
-      { 29408, 11758, 8023, 10123, 6705, 2468, 369, 17 },
-      { 11612, 13874, 13329, 13022, 6500, 3250, 300, 12 },
-    }, {
-      { 29408, 11758, 8023, 10123, 6705, 2468, 369, 17 },
-      { 11612, 13874, 13329, 13022, 6500, 3250, 300, 12 },
+  }, { /* Inter */
+    { /* ZPC Band 0 */
+      { 4, }, { 2, }, { 1, },
+    }, { /* ZPC Band 1 */
+      { 4, }, { 2, }, { 1, },
+    }, { /* ZPC Band 2 */
+      { 4, }, { 2, }, { 1, },
     }
   }
 };
-
-static const unsigned int default_nzc_counts_16x16[MAX_NZC_CONTEXTS]
-                                                  [REF_TYPES]
-                                                  [BLOCK_TYPES]
-                                                  [NZC16X16_TOKENS] = {
-  {
-    {
-      { 372988, 62777, 19440, 11812, 5145, 1917, 439, 10, 5, 2 },
-      { 72052, 30468, 6973, 3250, 1500, 750, 375, 50, 8, 1 },
-    }, {
-      { 32988, 62777, 19440, 11812, 5145, 1917, 439, 10, 5, 2 },
-      { 92052, 30468, 6973, 3250, 1500, 750, 375, 50, 8, 1 },
-    }
-  }, {
-    {
-      { 21533, 33527, 15655, 11920, 5723, 2009, 315, 7, 4, 2 },
-      { 47772, 23120, 13127, 8115, 4000, 2000, 200, 6, 4, 2 },
-    }, {
-      { 21533, 33527, 15655, 11920, 5723, 2009, 315, 7, 4, 2 },
-      { 27772, 23120, 13127, 8115, 4000, 2000, 200, 6, 4, 2 },
-    }
-  }, {
-    {
-      { 19408, 31758, 16023, 10123, 6705, 2468, 369, 17, 10, 5 },
-      { 9612, 13874, 13329, 13022, 6500, 3250, 300, 12, 6, 3 },
-    }, {
-      { 22408, 11758, 8023, 10123, 6705, 2468, 369, 17, 10, 5 },
-      { 9612, 13874, 13329, 13022, 6500, 3250, 300, 12, 6, 3 },
-    }
-  }
-};
-
-static const unsigned int default_nzc_counts_32x32[MAX_NZC_CONTEXTS]
-                                                  [REF_TYPES]
-                                                  [BLOCK_TYPES]
-                                                  [NZC32X32_TOKENS] = {
-  {
-    {
-      { 72988, 62777, 19440, 11812, 5145, 1917, 439, 10, 5, 2, 1, 0 },
-      { 52052, 30468, 6973, 3250, 1500, 750, 375, 50, 8, 1, 0, 0 },
-    }, {
-      { 72988, 62777, 19440, 11812, 5145, 1917, 439, 10, 5, 2, 1, 0 },
-      { 72052, 30468, 6973, 3250, 1500, 750, 375, 50, 8, 1, 0, 0 },
-    }
-  }, {
-    {
-      { 21533, 33527, 15655, 11920, 5723, 2009, 315, 7, 4, 2, 1, 0 },
-      { 27772, 23120, 13127, 8115, 4000, 2000, 200, 6, 4, 2, 1, 0 },
-    }, {
-      { 21533, 33527, 15655, 11920, 5723, 2009, 315, 7, 4, 2, 1, 0 },
-      { 27772, 23120, 13127, 8115, 4000, 2000, 200, 6, 4, 2, 1, 0 },
-    }
-  }, {
-    {
-      { 19408, 11758, 8023, 10123, 6705, 2468, 369, 17, 10, 5, 2, 1 },
-      { 9612, 13874, 13329, 13022, 6500, 3250, 300, 12, 6, 3, 2, 1 },
-    }, {
-      { 29408, 11758, 8023, 10123, 6705, 2468, 369, 17, 10, 5, 2, 1 },
-      { 9612, 13874, 13329, 13022, 6500, 3250, 300, 12, 6, 3, 2, 1 },
-    }
-  }
-};
-
-#else
-
-static const vp9_prob default_nzc_probs_4x4[MAX_NZC_CONTEXTS]
-                                           [REF_TYPES]
-                                           [BLOCK_TYPES]
-                                           [NZC4X4_TOKENS] = {
-  {
-    {
-      { 219, 162, 179, 142, 242, },
-      { 214, 253, 228, 246, 255, },
-    }, {
-      { 225, 236, 190, 229, 253, },
-      { 251, 253, 240, 248, 255, },
+static const vp9_zpc_probs default_zpc_probs_16x16 = {
+  { /* Intra */
+    { /* ZPC Band 0 */
+      {  57,  }, {  30,  }, {   13,  },
+    }, { /* ZPC Band 1 */
+      {  46,  }, {  23,  }, {   4,  },
+    }, { /* ZPC Band 2 */
+      {  36,  }, {  11,  }, {   2,  },
     },
-  }, {
-    {
-      { 106, 126, 158, 126, 244, },
-      { 118, 241, 201, 240, 255, },
-    }, {
-      { 165, 179, 143, 189, 242, },
-      { 173, 239, 192, 255, 128, },
-    },
-  }, {
-    {
-      { 42 , 78 , 153, 92 , 223, },
-      { 128, 128, 128, 128, 128, },
-    }, {
-      { 76 , 68 , 126, 110, 216, },
-      { 128, 128, 128, 128, 128, },
+  }, { /* Inter */
+    { /* ZPC Band 0 */
+      {  45,  }, {  21,  }, {  10,  },
+    }, { /* ZPC Band 1 */
+      {  24,  }, {  14,  }, {   3,  },
+    }, { /* ZPC Band 2 */
+      {  16,  }, {  6,  }, {   1,  },
     },
   },
 };
-
-static const vp9_prob default_nzc_probs_8x8[MAX_NZC_CONTEXTS]
-                                           [REF_TYPES]
-                                           [BLOCK_TYPES]
-                                           [NZC8X8_TOKENS] = {
-  {
-    {
-      { 134, 139, 170, 178, 142, 197, 255, },
-      { 167, 224, 199, 252, 205, 255, 128, },
-    }, {
-      { 181, 210, 180, 241, 190, 235, 255, },
-      { 234, 251, 235, 252, 219, 255, 128, },
+static const vp9_zpc_probs default_zpc_probs_32x32 = {
+  { /* Intra */
+    { /* ZPC Band 0 */
+      {  132,  }, {  60,  }, {  19,  },
+    }, { /* ZPC Band 1 */
+      {  64,  }, {  32,  }, {   8,  },
+    }, { /* ZPC Band 2 */
+      {  25,  }, {  11,  }, {   1,  },
     },
-  }, {
-    {
-      { 33 , 64 , 155, 143, 86 , 216, 255, },
-      { 73 , 160, 167, 251, 153, 255, 128, },
-    }, {
-      { 79 , 104, 153, 195, 119, 246, 255, },
-      { 149, 183, 186, 249, 203, 255, 128, },
-    },
-  }, {
-    {
-      { 10 , 25 , 156, 61 , 69 , 156, 254, },
-      { 32 , 1  , 128, 146, 64 , 255, 128, },
-    }, {
-      { 37 , 48 , 143, 113, 81 , 202, 255, },
-      { 1  , 255, 128, 128, 128, 128, 128, },
+  }, { /* Inter */
+    { /* ZPC Band 0 */
+      {  134,  }, {  39,  }, {  25,  },
+    }, { /* ZPC Band 1 */
+      {  64,  }, {  24,  }, {  12,  },
+    }, { /* ZPC Band 2 */
+      {  21,  }, {  10,  }, {   1,  },
     },
   },
 };
-
-static const vp9_prob default_nzc_probs_16x16[MAX_NZC_CONTEXTS]
-                                             [REF_TYPES]
-                                             [BLOCK_TYPES]
-                                             [NZC16X16_TOKENS] = {
-  {
-    {
-      { 11 , 188, 210, 167, 141, 143, 152, 255, 128, },
-      { 171, 201, 203, 244, 207, 255, 255, 128, 128, },
-    }, {
-      { 23 , 217, 207, 251, 198, 255, 219, 128, 128, },
-      { 235, 249, 229, 255, 199, 128, 128, 128, 128, },
-    },
-  }, {
-    {
-      { 9  , 45 , 168, 85 , 66 , 221, 139, 246, 255, },
-      { 51 , 110, 163, 238, 94 , 255, 255, 128, 128, },
-    }, {
-      { 4  , 149, 175, 240, 149, 255, 205, 128, 128, },
-      { 141, 217, 186, 255, 128, 128, 128, 128, 128, },
-    },
-  }, {
-    {
-      { 1  , 12 , 173, 6  , 68 , 145, 41 , 204, 255, },
-      { 39 , 47 , 128, 199, 110, 255, 128, 128, 128, },
-    }, {
-      { 1  , 121, 171, 149, 115, 242, 159, 255, 128, },
-      { 1  , 255, 255, 128, 128, 128, 128, 128, 128, },
-    },
-  },
-};
-
-static const vp9_prob default_nzc_probs_32x32[MAX_NZC_CONTEXTS]
-                                             [REF_TYPES]
-                                             [BLOCK_TYPES]
-                                             [NZC32X32_TOKENS] = {
-  {
-    {
-      { 11 , 216, 195, 201, 160, 247, 217, 255, 255, 128, 128, },
-      { 177, 240, 239, 255, 192, 128, 128, 128, 128, 128, 128, },
-    }, {
-      { 48 , 235, 213, 235, 199, 255, 255, 128, 128, 128, 128, },
-      { 205, 255, 248, 128, 128, 128, 128, 128, 128, 128, 128, },
-    },
-  }, {
-    {
-      { 6  , 96 , 138, 99 , 125, 248, 188, 255, 128, 128, 128, },
-      { 17 , 53 , 43 , 189, 1  , 255, 171, 128, 128, 128, 128, },
-    }, {
-      { 5  , 187, 235, 232, 117, 255, 219, 128, 128, 128, 128, },
-      { 146, 255, 255, 128, 128, 128, 128, 128, 128, 128, 128, },
-    },
-  }, {
-    {
-      { 1  , 7  , 93 , 14 , 100, 30 , 85 , 65 , 81 , 210, 255, },
-      { 1  , 1  , 128, 26 , 1  , 218, 78 , 255, 255, 128, 128, },
-    }, {
-      { 4  , 148, 206, 137, 160, 255, 255, 128, 128, 128, 128, },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, },
-    },
-  },
-};
-#endif
-
-static const vp9_prob default_nzc_pcat_probs[MAX_NZC_CONTEXTS]
-                                            [NZC_TOKENS_EXTRA]
-                                            [NZC_BITS_EXTRA] = {
-  // Bit probabilities are in least to most significance order
-  {
-    {176, 128, 128, 128, 128, 128, 128, 128, 128},   // 3 - 4
-    {164, 192, 128, 128, 128, 128, 128, 128, 128},   // 5 - 8
-    {154, 184, 208, 128, 128, 128, 128, 128, 128},   // 9 - 16
-    {144, 176, 200, 216, 128, 128, 128, 128, 128},   // 17 - 32
-    {140, 172, 192, 208, 224, 128, 128, 128, 128},   // 33 - 64
-    {136, 168, 188, 200, 220, 232, 128, 128, 128},   // 65 - 128
-    {132, 164, 184, 196, 216, 228, 240, 128, 128},   // 129 - 256
-    {130, 162, 178, 194, 212, 226, 240, 248, 128},   // 257 - 512
-    {128, 160, 176, 192, 208, 224, 240, 248, 254},   // 513 - 1024
-  }, {
-    {168, 128, 128, 128, 128, 128, 128, 128, 128},   // 3 - 4
-    {152, 184, 128, 128, 128, 128, 128, 128, 128},   // 5 - 8
-    {152, 184, 208, 128, 128, 128, 128, 128, 128},   // 9 - 16
-    {144, 176, 200, 216, 128, 128, 128, 128, 128},   // 17 - 32
-    {140, 172, 192, 208, 224, 128, 128, 128, 128},   // 33 - 64
-    {136, 168, 188, 200, 220, 232, 128, 128, 128},   // 65 - 128
-    {132, 164, 184, 196, 216, 228, 240, 128, 128},   // 129 - 256
-    {130, 162, 178, 194, 212, 226, 240, 248, 128},   // 257 - 512
-    {128, 160, 176, 192, 208, 224, 240, 248, 254},   // 513 - 1024
-  }, {
-    {160, 128, 128, 128, 128, 128, 128, 128, 128},   // 3 - 4
-    {152, 176, 128, 128, 128, 128, 128, 128, 128},   // 5 - 8
-    {150, 184, 208, 128, 128, 128, 128, 128, 128},   // 9 - 16
-    {144, 176, 200, 216, 128, 128, 128, 128, 128},   // 17 - 32
-    {140, 172, 192, 208, 224, 128, 128, 128, 128},   // 33 - 64
-    {136, 168, 188, 200, 220, 232, 128, 128, 128},   // 65 - 128
-    {132, 164, 184, 196, 216, 228, 240, 128, 128},   // 129 - 256
-    {130, 162, 178, 194, 212, 226, 240, 248, 128},   // 257 - 512
-    {128, 160, 176, 192, 208, 224, 240, 248, 254},   // 513 - 1024
-  },
-};
-
-#endif  // CONFIG_CODE_NONZEROCOUNT
+#endif  // CONFIG_CODE_ZEROGROUP
diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c
index 5b3ddfb..16ef14f 100644
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c
@@ -12,7 +12,6 @@
 #include <stdio.h>
 
 #include "vp9/common/vp9_entropy.h"
-#include "string.h"
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_entropymode.h"
@@ -400,65 +399,6 @@
   254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129
 };
 
-#if CONFIG_CODE_NONZEROCOUNT
-const vp9_tree_index vp9_nzc4x4_tree[2 * NZC4X4_NODES] = {
-  -NZC_0, 2,
-  4, 6,
-  -NZC_1, -NZC_2,
-  -NZC_3TO4, 8,
-  -NZC_5TO8, -NZC_9TO16,
-};
-struct vp9_token vp9_nzc4x4_encodings[NZC4X4_TOKENS];
-
-const vp9_tree_index vp9_nzc8x8_tree[2 * NZC8X8_NODES] = {
-  -NZC_0, 2,
-  4, 6,
-  -NZC_1, -NZC_2,
-  8, 10,
-  -NZC_3TO4, -NZC_5TO8,
-  -NZC_9TO16, 12,
-  -NZC_17TO32, -NZC_33TO64,
-};
-struct vp9_token vp9_nzc8x8_encodings[NZC8X8_TOKENS];
-
-const vp9_tree_index vp9_nzc16x16_tree[2 * NZC16X16_NODES] = {
-  -NZC_0, 2,
-  4, 6,
-  -NZC_1, -NZC_2,
-  8, 10,
-  -NZC_3TO4, -NZC_5TO8,
-  12, 14,
-  -NZC_9TO16, -NZC_17TO32,
-  -NZC_33TO64, 16,
-  -NZC_65TO128, -NZC_129TO256,
-};
-struct vp9_token vp9_nzc16x16_encodings[NZC16X16_TOKENS];
-
-const vp9_tree_index vp9_nzc32x32_tree[2 * NZC32X32_NODES] = {
-  -NZC_0, 2,
-  4, 6,
-  -NZC_1, -NZC_2,
-  8, 10,
-  -NZC_3TO4, -NZC_5TO8,
-  12, 14,
-  -NZC_9TO16, -NZC_17TO32,
-  16, 18,
-  -NZC_33TO64, -NZC_65TO128,
-  -NZC_129TO256, 20,
-  -NZC_257TO512, -NZC_513TO1024,
-};
-struct vp9_token vp9_nzc32x32_encodings[NZC32X32_TOKENS];
-
-const int vp9_extranzcbits[NZC32X32_TOKENS] = {
-  0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
-};
-
-const int vp9_basenzcvalue[NZC32X32_TOKENS] = {
-  0, 1, 2, 3, 5, 9, 17, 33, 65, 129, 257, 513
-};
-
-#endif  // CONFIG_CODE_NONZEROCOUNT
-
 #if CONFIG_MODELCOEFPROB
 
 #if UNCONSTRAINED_NODES == 2
@@ -1344,10 +1284,10 @@
     int ctx;
     assert(neighbors[MAX_NEIGHBORS * c + 0] >= 0);
     if (neighbors[MAX_NEIGHBORS * c + 1] >= 0) {
-      ctx = (1 + token_cache[neighbors[MAX_NEIGHBORS * c + 0]] +
-             token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1;
+      ctx = (1 + token_cache[scan[neighbors[MAX_NEIGHBORS * c + 0]]] +
+             token_cache[scan[neighbors[MAX_NEIGHBORS * c + 1]]]) >> 1;
     } else {
-      ctx = token_cache[neighbors[MAX_NEIGHBORS * c + 0]];
+      ctx = token_cache[scan[neighbors[MAX_NEIGHBORS * c + 0]]];
     }
     return vp9_pt_energy_class[ctx];
   }
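The change above routes the neighbour positions through scan[] before indexing token_cache; the previous code indexed the cache with the stored positions directly. A standalone sketch of the corrected derivation, assuming (as this hunk suggests) that token_cache is laid out in coefficient order while neighbors[] stores scan-order positions, and using 2 for MAX_NEIGHBORS; the example_ name is illustrative:

#include <stdint.h>

enum { EX_MAX_NEIGHBORS = 2 };

/* returns the raw context value; the real code further maps it through
 * vp9_pt_energy_class[] */
static int example_coef_context(const int *neighbors, const int *scan,
                                const uint8_t *token_cache, int c) {
  const int n0 = neighbors[EX_MAX_NEIGHBORS * c + 0];
  const int n1 = neighbors[EX_MAX_NEIGHBORS * c + 1];
  if (n1 >= 0)  /* two coded neighbours: rounded average of cached tokens */
    return (1 + token_cache[scan[n0]] + token_cache[scan[n1]]) >> 1;
  return token_cache[scan[n0]];  /* single neighbour */
}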
@@ -1357,55 +1297,6 @@
 #if CONFIG_MODELCOEFPROB
   int b, r, c, p;
 #endif
-#if CONFIG_CODE_NONZEROCOUNT
-#ifdef NZC_DEFAULT_COUNTS
-  int h, g;
-  for (h = 0; h < MAX_NZC_CONTEXTS; ++h) {
-    for (g = 0; g < REF_TYPES; ++g) {
-      int i;
-      unsigned int branch_ct4x4[NZC4X4_NODES][2];
-      unsigned int branch_ct8x8[NZC8X8_NODES][2];
-      unsigned int branch_ct16x16[NZC16X16_NODES][2];
-      unsigned int branch_ct32x32[NZC32X32_NODES][2];
-      for (i = 0; i < BLOCK_TYPES; ++i) {
-        vp9_tree_probs_from_distribution(
-          vp9_nzc4x4_tree,
-          pc->fc.nzc_probs_4x4[h][g][i], branch_ct4x4,
-          default_nzc_counts_4x4[h][g][i], 0);
-      }
-      for (i = 0; i < BLOCK_TYPES; ++i) {
-        vp9_tree_probs_from_distribution(
-          vp9_nzc8x8_tree,
-          pc->fc.nzc_probs_8x8[h][g][i], branch_ct8x8,
-          default_nzc_counts_8x8[h][g][i], 0);
-      }
-      for (i = 0; i < BLOCK_TYPES; ++i) {
-        vp9_tree_probs_from_distribution(
-          vp9_nzc16x16_tree,
-          pc->fc.nzc_probs_16x16[h][g][i], branch_ct16x16,
-          default_nzc_counts_16x16[h][g][i], 0);
-      }
-      for (i = 0; i < BLOCK_TYPES; ++i) {
-        vp9_tree_probs_from_distribution(
-          vp9_nzc32x32_tree,
-          pc->fc.nzc_probs_32x32[h][g][i], branch_ct32x32,
-          default_nzc_counts_32x32[h][g][i], 0);
-      }
-    }
-  }
-#else
-  vpx_memcpy(pc->fc.nzc_probs_4x4, default_nzc_probs_4x4,
-             sizeof(pc->fc.nzc_probs_4x4));
-  vpx_memcpy(pc->fc.nzc_probs_8x8, default_nzc_probs_8x8,
-             sizeof(pc->fc.nzc_probs_8x8));
-  vpx_memcpy(pc->fc.nzc_probs_16x16, default_nzc_probs_16x16,
-             sizeof(pc->fc.nzc_probs_16x16));
-  vpx_memcpy(pc->fc.nzc_probs_32x32, default_nzc_probs_32x32,
-             sizeof(pc->fc.nzc_probs_32x32));
-#endif
-  vpx_memcpy(pc->fc.nzc_pcat_probs, default_nzc_pcat_probs,
-             sizeof(pc->fc.nzc_pcat_probs));
-#endif  // CONFIG_CODE_NONZEROCOUNT
 #if CONFIG_MODELCOEFPROB
   for (b = 0; b < BLOCK_TYPES; ++b)
     for (r = 0; r < REF_TYPES; ++r)
@@ -1447,6 +1338,16 @@
   vpx_memcpy(pc->fc.coef_probs_32x32, default_coef_probs_32x32,
              sizeof(pc->fc.coef_probs_32x32));
 #endif
+#if CONFIG_CODE_ZEROGROUP
+  vpx_memcpy(pc->fc.zpc_probs_4x4, default_zpc_probs_4x4,
+             sizeof(pc->fc.zpc_probs_4x4));
+  vpx_memcpy(pc->fc.zpc_probs_8x8, default_zpc_probs_8x8,
+             sizeof(pc->fc.zpc_probs_8x8));
+  vpx_memcpy(pc->fc.zpc_probs_16x16, default_zpc_probs_16x16,
+             sizeof(pc->fc.zpc_probs_16x16));
+  vpx_memcpy(pc->fc.zpc_probs_32x32, default_zpc_probs_32x32,
+             sizeof(pc->fc.zpc_probs_32x32));
+#endif
 }
 
 // Neighborhood 5-tuples for various scans and blocksizes,
@@ -1597,1099 +1498,8 @@
   vp9_init_neighbors();
   init_bit_trees();
   vp9_tokens_from_tree(vp9_coef_encodings, vp9_coef_tree);
-#if CONFIG_CODE_NONZEROCOUNT
-  vp9_tokens_from_tree(vp9_nzc4x4_encodings, vp9_nzc4x4_tree);
-  vp9_tokens_from_tree(vp9_nzc8x8_encodings, vp9_nzc8x8_tree);
-  vp9_tokens_from_tree(vp9_nzc16x16_encodings, vp9_nzc16x16_tree);
-  vp9_tokens_from_tree(vp9_nzc32x32_encodings, vp9_nzc32x32_tree);
-#endif
 }
 
-#if CONFIG_CODE_NONZEROCOUNT
-
-#define mb_in_cur_tile(cm, mb_row, mb_col)      \
-    ((mb_col) >= (cm)->cur_tile_mb_col_start && \
-     (mb_col) <= (cm)->cur_tile_mb_col_end   && \
-     (mb_row) >= 0)
-
-#define choose_nzc_context(nzc_exp, t2, t1)     \
-    ((nzc_exp) >= (t2) ? 2 : (nzc_exp) >= (t1) ? 1 : 0)
-
-#define NZC_T2_32X32    (16 << 6)
-#define NZC_T1_32X32     (4 << 6)
-
-#define NZC_T2_16X16    (12 << 6)
-#define NZC_T1_16X16     (3 << 6)
-
-#define NZC_T2_8X8       (8 << 6)
-#define NZC_T1_8X8       (2 << 6)
-
-#define NZC_T2_4X4       (4 << 6)
-#define NZC_T1_4X4       (1 << 6)
-
-// Transforms a mb16 block index to a sb64 block index
-static inline int mb16_to_sb64_index(int mb_row, int mb_col, int block) {
-  int r = (mb_row & 3);
-  int c = (mb_col & 3);
-  int b;
-  if (block < 16) {  // Y
-    int ib = block >> 2;
-    int jb = block & 3;
-    ib += r * 4;
-    jb += c * 4;
-    b = ib * 16 + jb;
-    assert(b < 256);
-    return b;
-  } else {  // UV
-    int base = block - (block & 3);
-    int ib = (block - base) >> 1;
-    int jb = (block - base) & 1;
-    ib += r * 2;
-    jb += c * 2;
-    b = base * 16 + ib * 8 + jb;
-    assert(b >= 256 && b < 384);
-    return b;
-  }
-}
-
-// Transforms a mb16 block index to a sb32 block index
-static inline int mb16_to_sb32_index(int mb_row, int mb_col, int block) {
-  int r = (mb_row & 1);
-  int c = (mb_col & 1);
-  int b;
-  if (block < 16) {  // Y
-    int ib = block >> 2;
-    int jb = block & 3;
-    ib += r * 4;
-    jb += c * 4;
-    b = ib * 8 + jb;
-    assert(b < 64);
-    return b;
-  } else {  // UV
-    int base = block - (block & 3);
-    int ib = (block - base) >> 1;
-    int jb = (block - base) & 1;
-    ib += r * 2;
-    jb += c * 2;
-    b = base * 4 + ib * 4 + jb;
-    assert(b >= 64 && b < 96);
-    return b;
-  }
-}
-
-static inline int block_to_txfm_index(int block, TX_SIZE tx_size, int s) {
-  // s is the log of the number of 4x4 blocks in each row/col of larger block
-  int b, ib, jb, nb;
-  ib = block >> s;
-  jb = block - (ib << s);
-  ib >>= tx_size;
-  jb >>= tx_size;
-  nb = 1 << (s - tx_size);
-  b = (ib * nb + jb) << (2 * tx_size);
-  return b;
-}
-
-/* BEGIN - Helper functions to get the y nzcs */
-static unsigned int get_nzc_4x4_y_sb64(MB_MODE_INFO *mi, int block) {
-  int b;
-  assert(block < 256);
-  b = block_to_txfm_index(block, mi->txfm_size, 4);
-  assert(b < 256);
-  return mi->nzcs[b] << (6 - 2 * mi->txfm_size);
-}
-
-static unsigned int get_nzc_4x4_y_sb32(MB_MODE_INFO *mi, int block) {
-  int b;
-  assert(block < 64);
-  b = block_to_txfm_index(block, mi->txfm_size, 3);
-  assert(b < 64);
-  return mi->nzcs[b] << (6 - 2 * mi->txfm_size);
-}
-
-static unsigned int get_nzc_4x4_y_mb16(MB_MODE_INFO *mi, int block) {
-  int b;
-  assert(block < 16);
-  b = block_to_txfm_index(block, mi->txfm_size, 2);
-  assert(b < 16);
-  return mi->nzcs[b] << (6 - 2 * mi->txfm_size);
-}
-/* END - Helper functions to get the y nzcs */
-
-/* Function to get y nzc where block index is in mb16 terms */
-static unsigned int get_nzc_4x4_y(VP9_COMMON *cm, MODE_INFO *m,
-                                  int mb_row, int mb_col, int block) {
-  // NOTE: All values returned are at 64 times the true value at 4x4 scale
-  MB_MODE_INFO *const mi = &m->mbmi;
-  const int mis = cm->mode_info_stride;
-  if (mi->mb_skip_coeff || !mb_in_cur_tile(cm, mb_row, mb_col))
-    return 0;
-  if (mi->sb_type == BLOCK_SIZE_SB64X64) {
-    int r = mb_row & 3;
-    int c = mb_col & 3;
-    m -= c + r * mis;
-    if (m->mbmi.mb_skip_coeff || !mb_in_cur_tile(cm, mb_row - r, mb_col - c))
-      return 0;
-    else
-      return get_nzc_4x4_y_sb64(
-          &m->mbmi, mb16_to_sb64_index(mb_row, mb_col, block));
-  } else if (mi->sb_type == BLOCK_SIZE_SB32X32) {
-    int r = mb_row & 1;
-    int c = mb_col & 1;
-    m -= c + r * mis;
-    if (m->mbmi.mb_skip_coeff || !mb_in_cur_tile(cm, mb_row - r, mb_col - c))
-      return 0;
-    else
-      return get_nzc_4x4_y_sb32(
-          &m->mbmi, mb16_to_sb32_index(mb_row, mb_col, block));
-  } else {
-    if (m->mbmi.mb_skip_coeff || !mb_in_cur_tile(cm, mb_row, mb_col))
-      return 0;
-    return get_nzc_4x4_y_mb16(mi, block);
-  }
-}
-
-/* BEGIN - Helper functions to get the uv nzcs */
-static unsigned int get_nzc_4x4_uv_sb64(MB_MODE_INFO *mi, int block) {
-  int b;
-  int base, uvtxfm_size;
-  assert(block >= 256 && block < 384);
-  uvtxfm_size = mi->txfm_size;
-  base = 256 + (block & 64);
-  block -= base;
-  b = base + block_to_txfm_index(block, uvtxfm_size, 3);
-  assert(b >= 256 && b < 384);
-  return mi->nzcs[b] << (6 - 2 * uvtxfm_size);
-}
-
-static unsigned int get_nzc_4x4_uv_sb32(MB_MODE_INFO *mi, int block) {
-  int b;
-  int base, uvtxfm_size;
-  assert(block >= 64 && block < 96);
-  if (mi->txfm_size == TX_32X32)
-    uvtxfm_size = TX_16X16;
-  else
-    uvtxfm_size = mi->txfm_size;
-  base = 64 + (block & 16);
-  block -= base;
-  b = base + block_to_txfm_index(block, uvtxfm_size, 2);
-  assert(b >= 64 && b < 96);
-  return mi->nzcs[b] << (6 - 2 * uvtxfm_size);
-}
-
-static unsigned int get_nzc_4x4_uv_mb16(MB_MODE_INFO *mi, int block) {
-  int b;
-  int base, uvtxfm_size;
-  assert(block >= 16 && block < 24);
-  if (mi->txfm_size == TX_8X8 &&
-      (mi->mode == SPLITMV || mi->mode == I8X8_PRED))
-    uvtxfm_size = TX_4X4;
-  else if (mi->txfm_size == TX_16X16)
-    uvtxfm_size = TX_8X8;
-  else
-    uvtxfm_size = mi->txfm_size;
-  base = 16 + (block & 4);
-  block -= base;
-  b = base + block_to_txfm_index(block, uvtxfm_size, 1);
-  assert(b >= 16 && b < 24);
-  return mi->nzcs[b] << (6 - 2 * uvtxfm_size);
-}
-/* END - Helper functions to get the uv nzcs */
-
-/* Function to get uv nzc where block index is in mb16 terms */
-static unsigned int get_nzc_4x4_uv(VP9_COMMON *cm, MODE_INFO *m,
-                                   int mb_row, int mb_col, int block) {
-  // NOTE: All values returned are at 64 times the true value at 4x4 scale
-  MB_MODE_INFO *const mi = &m->mbmi;
-  const int mis = cm->mode_info_stride;
-  if (mi->mb_skip_coeff || !mb_in_cur_tile(cm, mb_row, mb_col))
-    return 0;
-  if (mi->sb_type == BLOCK_SIZE_SB64X64) {
-    int r = mb_row & 3;
-    int c = mb_col & 3;
-    m -= c + r * mis;
-    if (m->mbmi.mb_skip_coeff || !mb_in_cur_tile(cm, mb_row - r, mb_col - c))
-      return 0;
-    else
-      return get_nzc_4x4_uv_sb64(
-          &m->mbmi, mb16_to_sb64_index(mb_row, mb_col, block));
-  } else if (mi->sb_type == BLOCK_SIZE_SB32X32) {
-    int r = mb_row & 1;
-    int c = mb_col & 1;
-    m -= c + r * mis;
-    if (m->mbmi.mb_skip_coeff || !mb_in_cur_tile(cm, mb_row - r, mb_col - c))
-      return 0;
-    else
-      return get_nzc_4x4_uv_sb32(
-          &m->mbmi, mb16_to_sb32_index(mb_row, mb_col, block));
-  } else {
-    return get_nzc_4x4_uv_mb16(mi, block);
-  }
-}
-
-int vp9_get_nzc_context_y_sb64(VP9_COMMON *cm, MODE_INFO *cur,
-                               int mb_row, int mb_col, int block) {
-  // returns an index in [0, MAX_NZC_CONTEXTS - 1] to reflect how busy
-  // neighboring blocks are
-  int mis = cm->mode_info_stride;
-  int nzc_exp = 0;
-  TX_SIZE txfm_size = cur->mbmi.txfm_size;
-  assert(block < 256);
-  switch (txfm_size) {
-    case TX_32X32:
-      assert((block & 63) == 0);
-      if (block < 128) {
-        int o = (block >> 6) * 2;
-        nzc_exp =
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 12) +
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 13) +
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 14) +
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 15) +
-            get_nzc_4x4_y(cm, cur - mis + o + 1,
-                          mb_row - 1, mb_col + o + 1, 12) +
-            get_nzc_4x4_y(cm, cur - mis + o + 1,
-                          mb_row - 1, mb_col + o + 1, 13) +
-            get_nzc_4x4_y(cm, cur - mis + o + 1,
-                          mb_row - 1, mb_col + o + 1, 14) +
-            get_nzc_4x4_y(cm, cur - mis + o + 1,
-                          mb_row - 1, mb_col + o + 1, 15);
-      } else {
-        nzc_exp = cur->mbmi.nzcs[block - 128] << 3;
-      }
-      if ((block & 127) == 0) {
-        int o = (block >> 7) * 2;
-        nzc_exp +=
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 3) +
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 7) +
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 11) +
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 15) +
-            get_nzc_4x4_y(cm, cur - 1 + o * mis + mis,
-                          mb_row + o + 1, mb_col - 1, 3) +
-            get_nzc_4x4_y(cm, cur - 1 + o * mis + mis,
-                          mb_row + o + 1, mb_col - 1, 7) +
-            get_nzc_4x4_y(cm, cur - 1 + o * mis + mis,
-                          mb_row + o + 1, mb_col - 1, 11) +
-            get_nzc_4x4_y(cm, cur - 1 + o * mis + mis,
-                          mb_row + o + 1, mb_col - 1, 15);
-      } else {
-        nzc_exp += cur->mbmi.nzcs[block - 64] << 3;
-      }
-      nzc_exp <<= 2;
-      // Note nzc_exp is 64 times the average value expected at 32x32 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_32X32, NZC_T1_32X32);
-      break;
-
-    case TX_16X16:
-      assert((block & 15) == 0);
-      if (block < 64) {
-        int o = block >> 4;
-        nzc_exp =
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 12) +
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 13) +
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 14) +
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 15);
-      } else {
-        nzc_exp = cur->mbmi.nzcs[block - 64] << 4;
-      }
-      if ((block & 63) == 0) {
-        int o = block >> 6;
-        nzc_exp +=
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 3) +
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 7) +
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 11) +
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 15);
-      } else {
-        nzc_exp += cur->mbmi.nzcs[block - 16] << 4;
-      }
-      nzc_exp <<= 1;
-      // Note nzc_exp is 64 times the average value expected at 16x16 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_16X16, NZC_T1_16X16);
-      break;
-
-    case TX_8X8:
-      assert((block & 3) == 0);
-      if (block < 32) {
-        int o = block >> 3;
-        int p = ((block >> 2) & 1) ? 14 : 12;
-        nzc_exp =
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, p) +
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, p + 1);
-      } else {
-        nzc_exp = cur->mbmi.nzcs[block - 32] << 5;
-      }
-      if ((block & 31) == 0) {
-        int o = block >> 6;
-        int p = ((block >> 5) & 1) ? 11 : 3;
-        nzc_exp +=
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, p) +
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, p + 4);
-      } else {
-        nzc_exp += cur->mbmi.nzcs[block - 4] << 5;
-      }
-      // Note nzc_exp is 64 times the average value expected at 8x8 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_8X8, NZC_T1_8X8);
-      break;
-
-    case TX_4X4:
-      if (block < 16) {
-        int o = block >> 2;
-        int p = block & 3;
-        nzc_exp = get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o,
-                                12 + p);
-      } else {
-        nzc_exp = (cur->mbmi.nzcs[block - 16] << 6);
-      }
-      if ((block & 15) == 0) {
-        int o = block >> 6;
-        int p = (block >> 4) & 3;
-        nzc_exp += get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,
-                                 3 + 4 * p);
-      } else {
-        nzc_exp += (cur->mbmi.nzcs[block - 1] << 6);
-      }
-      nzc_exp >>= 1;
-      // Note nzc_exp is 64 times the average value expected at 4x4 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_4X4, NZC_T1_4X4);
-      break;
-
-    default:
-      return 0;
-  }
-}
-
-int vp9_get_nzc_context_y_sb32(VP9_COMMON *cm, MODE_INFO *cur,
-                               int mb_row, int mb_col, int block) {
-  // returns an index in [0, MAX_NZC_CONTEXTS - 1] to reflect how busy
-  // neighboring blocks are
-  int mis = cm->mode_info_stride;
-  int nzc_exp = 0;
-  TX_SIZE txfm_size = cur->mbmi.txfm_size;
-  assert(block < 64);
-  switch (txfm_size) {
-    case TX_32X32:
-      assert(block == 0);
-      nzc_exp =
-          (get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 12) +
-           get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 13) +
-           get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 14) +
-           get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 15) +
-           get_nzc_4x4_y(cm, cur - mis + 1, mb_row - 1, mb_col + 1, 12) +
-           get_nzc_4x4_y(cm, cur - mis + 1, mb_row - 1, mb_col + 1, 13) +
-           get_nzc_4x4_y(cm, cur - mis + 1, mb_row - 1, mb_col + 1, 14) +
-           get_nzc_4x4_y(cm, cur - mis + 1, mb_row - 1, mb_col + 1, 15) +
-           get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 3) +
-           get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 7) +
-           get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 11) +
-           get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 15) +
-           get_nzc_4x4_y(cm, cur - 1 + mis, mb_row + 1, mb_col - 1, 3) +
-           get_nzc_4x4_y(cm, cur - 1 + mis, mb_row + 1, mb_col - 1, 7) +
-           get_nzc_4x4_y(cm, cur - 1 + mis, mb_row + 1, mb_col - 1, 11) +
-           get_nzc_4x4_y(cm, cur - 1 + mis, mb_row + 1, mb_col - 1, 15)) << 2;
-      // Note nzc_exp is 64 times the average value expected at 32x32 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_32X32, NZC_T1_32X32);
-      break;
-
-    case TX_16X16:
-      assert((block & 15) == 0);
-      if (block < 32) {
-        int o = (block >> 4) & 1;
-        nzc_exp =
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 12) +
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 13) +
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 14) +
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 15);
-      } else {
-        nzc_exp = cur->mbmi.nzcs[block - 32] << 4;
-      }
-      if ((block & 31) == 0) {
-        int o = block >> 5;
-        nzc_exp +=
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 3) +
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 7) +
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 11) +
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 15);
-      } else {
-        nzc_exp += cur->mbmi.nzcs[block - 16] << 4;
-      }
-      nzc_exp <<= 1;
-      // Note nzc_exp is 64 times the average value expected at 16x16 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_16X16, NZC_T1_16X16);
-      break;
-
-    case TX_8X8:
-      assert((block & 3) == 0);
-      if (block < 16) {
-        int o = block >> 3;
-        int p = ((block >> 2) & 1) ? 14 : 12;
-        nzc_exp =
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, p) +
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, p + 1);
-      } else {
-        nzc_exp = cur->mbmi.nzcs[block - 16] << 5;
-      }
-      if ((block & 15) == 0) {
-        int o = block >> 5;
-        int p = ((block >> 4) & 1) ? 11 : 3;
-        nzc_exp +=
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, p) +
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, p + 4);
-      } else {
-        nzc_exp += cur->mbmi.nzcs[block - 4] << 5;
-      }
-      // Note nzc_exp is 64 times the average value expected at 8x8 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_8X8, NZC_T1_8X8);
-      break;
-
-    case TX_4X4:
-      if (block < 8) {
-        int o = block >> 2;
-        int p = block & 3;
-        nzc_exp = get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o,
-                                12 + p);
-      } else {
-        nzc_exp = (cur->mbmi.nzcs[block - 8] << 6);
-      }
-      if ((block & 7) == 0) {
-        int o = block >> 5;
-        int p = (block >> 3) & 3;
-        nzc_exp += get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,
-                                 3 + 4 * p);
-      } else {
-        nzc_exp += (cur->mbmi.nzcs[block - 1] << 6);
-      }
-      nzc_exp >>= 1;
-      // Note nzc_exp is 64 times the average value expected at 4x4 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_4X4, NZC_T1_4X4);
-      break;
-
-    default:
-      return 0;
-      break;
-  }
-}
-
-int vp9_get_nzc_context_y_mb16(VP9_COMMON *cm, MODE_INFO *cur,
-                               int mb_row, int mb_col, int block) {
-  // returns an index in [0, MAX_NZC_CONTEXTS - 1] to reflect how busy
-  // neighboring blocks are
-  int mis = cm->mode_info_stride;
-  int nzc_exp = 0;
-  TX_SIZE txfm_size = cur->mbmi.txfm_size;
-  assert(block < 16);
-  switch (txfm_size) {
-    case TX_16X16:
-      assert(block == 0);
-      nzc_exp =
-          get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 12) +
-          get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 13) +
-          get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 14) +
-          get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 15) +
-          get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 3) +
-          get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 7) +
-          get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 11) +
-          get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 15);
-      nzc_exp <<= 1;
-      // Note nzc_exp is 64 times the average value expected at 16x16 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_16X16, NZC_T1_16X16);
-
-    case TX_8X8:
-      assert((block & 3) == 0);
-      if (block < 8) {
-        int p = ((block >> 2) & 1) ? 14 : 12;
-        nzc_exp =
-            get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, p) +
-            get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, p + 1);
-      } else {
-        nzc_exp = cur->mbmi.nzcs[block - 8] << 5;
-      }
-      if ((block & 7) == 0) {
-        int p = ((block >> 3) & 1) ? 11 : 3;
-        nzc_exp +=
-            get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, p) +
-            get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, p + 4);
-      } else {
-        nzc_exp += cur->mbmi.nzcs[block - 4] << 5;
-      }
-      // Note nzc_exp is 64 times the average value expected at 8x8 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_8X8, NZC_T1_8X8);
-
-    case TX_4X4:
-      if (block < 4) {
-        int p = block & 3;
-        nzc_exp = get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col,
-                                12 + p);
-      } else {
-        nzc_exp = (cur->mbmi.nzcs[block - 4] << 6);
-      }
-      if ((block & 3) == 0) {
-        int p = (block >> 2) & 3;
-        nzc_exp += get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1,
-                                 3 + 4 * p);
-      } else {
-        nzc_exp += (cur->mbmi.nzcs[block - 1] << 6);
-      }
-      nzc_exp >>= 1;
-      // Note nzc_exp is 64 times the average value expected at 4x4 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_4X4, NZC_T1_4X4);
-
-    default:
-      return 0;
-      break;
-  }
-}
-
-int vp9_get_nzc_context_uv_sb64(VP9_COMMON *cm, MODE_INFO *cur,
-                                int mb_row, int mb_col, int block) {
-  // returns an index in [0, MAX_NZC_CONTEXTS - 1] to reflect how busy
-  // neighboring blocks are
-  int mis = cm->mode_info_stride;
-  int nzc_exp = 0;
-  const int base = block - (block & 63);
-  const int boff = (block & 63);
-  const int base_mb16 = base >> 4;
-  TX_SIZE txfm_size = cur->mbmi.txfm_size;
-  TX_SIZE txfm_size_uv;
-
-  assert(block >= 256 && block < 384);
-  txfm_size_uv = txfm_size;
-
-  switch (txfm_size_uv) {
-    case TX_32X32:
-      assert(block == 256 || block == 320);
-      nzc_exp =
-          get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col,
-                         base_mb16 + 2) +
-          get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col,
-                         base_mb16 + 3) +
-          get_nzc_4x4_uv(cm, cur - mis + 1, mb_row - 1, mb_col + 1,
-                         base_mb16 + 2) +
-          get_nzc_4x4_uv(cm, cur - mis + 1, mb_row - 1, mb_col + 1,
-                         base_mb16 + 3) +
-          get_nzc_4x4_uv(cm, cur - mis + 2, mb_row - 1, mb_col + 2,
-                         base_mb16 + 2) +
-          get_nzc_4x4_uv(cm, cur - mis + 2, mb_row - 1, mb_col + 2,
-                         base_mb16 + 3) +
-          get_nzc_4x4_uv(cm, cur - mis + 3, mb_row - 1, mb_col + 3,
-                         base_mb16 + 2) +
-          get_nzc_4x4_uv(cm, cur - mis + 3, mb_row - 1, mb_col + 3,
-                         base_mb16 + 3) +
-          get_nzc_4x4_uv(cm, cur - 1, mb_row, mb_col - 1,
-                         base_mb16 + 1) +
-          get_nzc_4x4_uv(cm, cur - 1, mb_row, mb_col - 1,
-                         base_mb16 + 3) +
-          get_nzc_4x4_uv(cm, cur - 1 + mis, mb_row + 1, mb_col - 1,
-                         base_mb16 + 1) +
-          get_nzc_4x4_uv(cm, cur - 1 + mis, mb_row + 1, mb_col - 1,
-                         base_mb16 + 3) +
-          get_nzc_4x4_uv(cm, cur - 1 + 2 * mis, mb_row + 2, mb_col - 1,
-                         base_mb16 + 1) +
-          get_nzc_4x4_uv(cm, cur - 1 + 2 * mis, mb_row + 2, mb_col - 1,
-                         base_mb16 + 3) +
-          get_nzc_4x4_uv(cm, cur - 1 + 3 * mis, mb_row + 3, mb_col - 1,
-                         base_mb16 + 1) +
-          get_nzc_4x4_uv(cm, cur - 1 + 3 * mis, mb_row + 3, mb_col - 1,
-                         base_mb16 + 3);
-      nzc_exp <<= 2;
-      // Note nzc_exp is 64 times the average value expected at 32x32 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_32X32, NZC_T1_32X32);
-
-    case TX_16X16:
-      // uv txfm_size 16x16
-      assert((block & 15) == 0);
-      if (boff < 32) {
-        int o = (boff >> 4) & 1;
-        nzc_exp =
-            get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,
-                           base_mb16 + 2) +
-            get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,
-                           base_mb16 + 3) +
-            get_nzc_4x4_uv(cm, cur - mis + o + 1, mb_row - 1, mb_col + o + 1,
-                           base_mb16 + 2) +
-            get_nzc_4x4_uv(cm, cur - mis + o + 1, mb_row - 1, mb_col + o + 1,
-                           base_mb16 + 3);
-      } else {
-        nzc_exp = cur->mbmi.nzcs[block - 32] << 4;
-      }
-      if ((boff & 31) == 0) {
-        int o = boff >> 5;
-        nzc_exp +=
-            get_nzc_4x4_uv(cm, cur - 1 + o * mis,
-                           mb_row + o, mb_col - 1, base_mb16 + 1) +
-            get_nzc_4x4_uv(cm, cur - 1 + o * mis,
-                           mb_row + o, mb_col - 1, base_mb16 + 3) +
-            get_nzc_4x4_uv(cm, cur - 1 + o * mis + mis,
-                           mb_row + o + 1, mb_col - 1, base_mb16 + 1) +
-            get_nzc_4x4_uv(cm, cur - 1 + o * mis + mis,
-                           mb_row + o + 1, mb_col - 1, base_mb16 + 3);
-      } else {
-        nzc_exp += cur->mbmi.nzcs[block - 16] << 4;
-      }
-      nzc_exp <<= 1;
-      // Note nzc_exp is 64 times the average value expected at 16x16 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_16X16, NZC_T1_16X16);
-
-    case TX_8X8:
-      assert((block & 3) == 0);
-      if (boff < 16) {
-        int o = boff >> 2;
-        nzc_exp =
-            get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,
-                           base_mb16 + 2) +
-            get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,
-                           base_mb16 + 3);
-      } else {
-        nzc_exp = cur->mbmi.nzcs[block - 16] << 5;
-      }
-      if ((boff & 15) == 0) {
-        int o = boff >> 4;
-        nzc_exp +=
-            get_nzc_4x4_uv(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,
-                           base_mb16 + 1) +
-            get_nzc_4x4_uv(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,
-                           base_mb16 + 3);
-      } else {
-        nzc_exp += cur->mbmi.nzcs[block - 4] << 5;
-      }
-      // Note nzc_exp is 64 times the average value expected at 8x8 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_8X8, NZC_T1_8X8);
-
-    case TX_4X4:
-      if (boff < 8) {
-        int o = boff >> 1;
-        int p = boff & 1;
-        nzc_exp = get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,
-                                 base_mb16 + 2 + p);
-      } else {
-        nzc_exp = (cur->mbmi.nzcs[block - 8] << 6);
-      }
-      if ((boff & 7) == 0) {
-        int o = boff >> 4;
-        int p = (boff >> 3) & 1;
-        nzc_exp += get_nzc_4x4_uv(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,
-                                  base_mb16 + 1 + 2 * p);
-      } else {
-        nzc_exp += (cur->mbmi.nzcs[block - 1] << 6);
-      }
-      nzc_exp >>= 1;
-      // Note nzc_exp is 64 times the average value expected at 4x4 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_4X4, NZC_T1_4X4);
-
-    default:
-      return 0;
-  }
-}
-
-int vp9_get_nzc_context_uv_sb32(VP9_COMMON *cm, MODE_INFO *cur,
-                                int mb_row, int mb_col, int block) {
-  // returns an index in [0, MAX_NZC_CONTEXTS - 1] to reflect how busy
-  // neighboring blocks are
-  int mis = cm->mode_info_stride;
-  int nzc_exp = 0;
-  const int base = block - (block & 15);
-  const int boff = (block & 15);
-  const int base_mb16 = base >> 2;
-  TX_SIZE txfm_size = cur->mbmi.txfm_size;
-  TX_SIZE txfm_size_uv;
-
-  assert(block >= 64 && block < 96);
-  if (txfm_size == TX_32X32)
-    txfm_size_uv = TX_16X16;
-  else
-    txfm_size_uv = txfm_size;
-
-  switch (txfm_size_uv) {
-    case TX_16X16:
-      // uv txfm_size 16x16
-      assert(block == 64 || block == 80);
-      nzc_exp =
-          get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col,
-                         base_mb16 + 2) +
-          get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col,
-                         base_mb16 + 3) +
-          get_nzc_4x4_uv(cm, cur - mis + 1, mb_row - 1, mb_col + 1,
-                         base_mb16 + 2) +
-          get_nzc_4x4_uv(cm, cur - mis + 1, mb_row - 1, mb_col + 1,
-                         base_mb16 + 3) +
-          get_nzc_4x4_uv(cm, cur - 1 + mis, mb_row, mb_col - 1,
-                         base_mb16 + 1) +
-          get_nzc_4x4_uv(cm, cur - 1 + mis, mb_row, mb_col - 1,
-                         base_mb16 + 3) +
-          get_nzc_4x4_uv(cm, cur - 1 + mis, mb_row + 1, mb_col - 1,
-                         base_mb16 + 1) +
-          get_nzc_4x4_uv(cm, cur - 1 + mis, mb_row + 1, mb_col - 1,
-                         base_mb16 + 3);
-      nzc_exp <<= 1;
-      // Note nzc_exp is 64 times the average value expected at 16x16 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_16X16, NZC_T1_16X16);
-      break;
-
-    case TX_8X8:
-      assert((block & 3) == 0);
-      if (boff < 8) {
-        int o = boff >> 2;
-        nzc_exp =
-            get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,
-                           base_mb16 + 2) +
-            get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,
-                           base_mb16 + 3);
-      } else {
-        nzc_exp = cur->mbmi.nzcs[block - 8] << 5;
-      }
-      if ((boff & 7) == 0) {
-        int o = boff >> 3;
-        nzc_exp +=
-            get_nzc_4x4_uv(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,
-                           base_mb16 + 1) +
-            get_nzc_4x4_uv(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,
-                           base_mb16 + 3);
-      } else {
-        nzc_exp += cur->mbmi.nzcs[block - 4] << 5;
-      }
-      // Note nzc_exp is 64 times the average value expected at 8x8 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_8X8, NZC_T1_8X8);
-
-    case TX_4X4:
-      if (boff < 4) {
-        int o = boff >> 1;
-        int p = boff & 1;
-        nzc_exp = get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,
-                                 base_mb16 + 2 + p);
-      } else {
-        nzc_exp = (cur->mbmi.nzcs[block - 4] << 6);
-      }
-      if ((boff & 3) == 0) {
-        int o = boff >> 3;
-        int p = (boff >> 2) & 1;
-        nzc_exp += get_nzc_4x4_uv(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,
-                                  base_mb16 + 1 + 2 * p);
-      } else {
-        nzc_exp += (cur->mbmi.nzcs[block - 1] << 6);
-      }
-      nzc_exp >>= 1;
-      // Note nzc_exp is 64 times the average value expected at 4x4 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_4X4, NZC_T1_4X4);
-
-    default:
-      return 0;
-  }
-}
-
-int vp9_get_nzc_context_uv_mb16(VP9_COMMON *cm, MODE_INFO *cur,
-                                int mb_row, int mb_col, int block) {
-  // returns an index in [0, MAX_NZC_CONTEXTS - 1] to reflect how busy
-  // neighboring blocks are
-  int mis = cm->mode_info_stride;
-  int nzc_exp = 0;
-  const int base = block - (block & 3);
-  const int boff = (block & 3);
-  const int base_mb16 = base;
-  TX_SIZE txfm_size = cur->mbmi.txfm_size;
-  TX_SIZE txfm_size_uv;
-
-  assert(block >= 16 && block < 24);
-  if (txfm_size == TX_16X16)
-    txfm_size_uv = TX_8X8;
-  else if (txfm_size == TX_8X8 &&
-           (cur->mbmi.mode == I8X8_PRED || cur->mbmi.mode == SPLITMV))
-    txfm_size_uv = TX_4X4;
-  else
-    txfm_size_uv = txfm_size;
-
-  switch (txfm_size_uv) {
-    case TX_8X8:
-      assert((block & 3) == 0);
-      nzc_exp =
-          get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col, base_mb16 + 2) +
-          get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col, base_mb16 + 3) +
-          get_nzc_4x4_uv(cm, cur - 1, mb_row, mb_col - 1, base_mb16 + 1) +
-          get_nzc_4x4_uv(cm, cur - 1, mb_row, mb_col - 1, base_mb16 + 3);
-      // Note nzc_exp is 64 times the average value expected at 8x8 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_8X8, NZC_T1_8X8);
-
-    case TX_4X4:
-      if (boff < 2) {
-        int p = boff & 1;
-        nzc_exp = get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col,
-                                 base_mb16 + 2 + p);
-      } else {
-        nzc_exp = (cur->mbmi.nzcs[block - 2] << 6);
-      }
-      if ((boff & 1) == 0) {
-        int p = (boff >> 1) & 1;
-        nzc_exp += get_nzc_4x4_uv(cm, cur - 1, mb_row, mb_col - 1,
-                                  base_mb16 + 1 + 2 * p);
-      } else {
-        nzc_exp += (cur->mbmi.nzcs[block - 1] << 6);
-      }
-      nzc_exp >>= 1;
-      // Note nzc_exp is 64 times the average value expected at 4x4 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_4X4, NZC_T1_4X4);
-
-    default:
-      return 0;
-  }
-}
-
-int vp9_get_nzc_context(VP9_COMMON *cm, MACROBLOCKD *xd, int block) {
-  if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64) {
-    assert(block < 384);
-    if (block < 256)
-      return vp9_get_nzc_context_y_sb64(cm, xd->mode_info_context,
-                                        get_mb_row(xd), get_mb_col(xd), block);
-    else
-      return vp9_get_nzc_context_uv_sb64(cm, xd->mode_info_context,
-                                         get_mb_row(xd), get_mb_col(xd), block);
-  } else if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB32X32) {
-    assert(block < 96);
-    if (block < 64)
-      return vp9_get_nzc_context_y_sb32(cm, xd->mode_info_context,
-                                        get_mb_row(xd), get_mb_col(xd), block);
-    else
-      return vp9_get_nzc_context_uv_sb32(cm, xd->mode_info_context,
-                                         get_mb_row(xd), get_mb_col(xd), block);
-  } else {
-    assert(block < 64);
-    if (block < 16)
-      return vp9_get_nzc_context_y_mb16(cm, xd->mode_info_context,
-                                        get_mb_row(xd), get_mb_col(xd), block);
-    else
-      return vp9_get_nzc_context_uv_mb16(cm, xd->mode_info_context,
-                                         get_mb_row(xd), get_mb_col(xd), block);
-  }
-}
-
-static void update_nzc(VP9_COMMON *cm,
-                       uint16_t nzc,
-                       int nzc_context,
-                       TX_SIZE tx_size,
-                       int ref,
-                       int type) {
-  int e, c;
-  if (!get_nzc_used(tx_size)) return;
-  c = codenzc(nzc);
-  if (tx_size == TX_32X32)
-    cm->fc.nzc_counts_32x32[nzc_context][ref][type][c]++;
-  else if (tx_size == TX_16X16)
-    cm->fc.nzc_counts_16x16[nzc_context][ref][type][c]++;
-  else if (tx_size == TX_8X8)
-    cm->fc.nzc_counts_8x8[nzc_context][ref][type][c]++;
-  else if (tx_size == TX_4X4)
-    cm->fc.nzc_counts_4x4[nzc_context][ref][type][c]++;
-  else
-    assert(0);
-
-  if ((e = vp9_extranzcbits[c])) {
-    int x = nzc - vp9_basenzcvalue[c];
-    while (e--) {
-      int b = (x >> e) & 1;
-      cm->fc.nzc_pcat_counts[nzc_context][c - NZC_TOKENS_NOEXTRA][e][b]++;
-    }
-  }
-}
-
-static void update_nzcs_sb64(VP9_COMMON *cm,
-                             MACROBLOCKD *xd,
-                             int mb_row,
-                             int mb_col) {
-  MODE_INFO *m = xd->mode_info_context;
-  MB_MODE_INFO *const mi = &m->mbmi;
-  int j, nzc_context;
-  const int ref = m->mbmi.ref_frame != INTRA_FRAME;
-
-  assert(mb_col == get_mb_col(xd));
-  assert(mb_row == get_mb_row(xd));
-
-  if (mi->mb_skip_coeff)
-    return;
-
-  switch (mi->txfm_size) {
-    case TX_32X32:
-      for (j = 0; j < 256; j += 64) {
-        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_32X32, ref, 0);
-      }
-      for (j = 256; j < 384; j += 64) {
-        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_32X32, ref, 1);
-      }
-      break;
-
-    case TX_16X16:
-      for (j = 0; j < 256; j += 16) {
-        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 0);
-      }
-      for (j = 256; j < 384; j += 16) {
-        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 1);
-      }
-      break;
-
-    case TX_8X8:
-      for (j = 0; j < 256; j += 4) {
-        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 0);
-      }
-      for (j = 256; j < 384; j += 4) {
-        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1);
-      }
-      break;
-
-    case TX_4X4:
-      for (j = 0; j < 256; ++j) {
-        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 0);
-      }
-      for (j = 256; j < 384; ++j) {
-        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1);
-      }
-      break;
-
-    default:
-      break;
-  }
-}
-
-static void update_nzcs_sb32(VP9_COMMON *cm,
-                            MACROBLOCKD *xd,
-                            int mb_row,
-                            int mb_col) {
-  MODE_INFO *m = xd->mode_info_context;
-  MB_MODE_INFO *const mi = &m->mbmi;
-  int j, nzc_context;
-  const int ref = m->mbmi.ref_frame != INTRA_FRAME;
-
-  assert(mb_col == get_mb_col(xd));
-  assert(mb_row == get_mb_row(xd));
-
-  if (mi->mb_skip_coeff)
-    return;
-
-  switch (mi->txfm_size) {
-    case TX_32X32:
-      for (j = 0; j < 64; j += 64) {
-        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_32X32, ref, 0);
-      }
-      for (j = 64; j < 96; j += 16) {
-        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 1);
-      }
-      break;
-
-    case TX_16X16:
-      for (j = 0; j < 64; j += 16) {
-        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 0);
-      }
-      for (j = 64; j < 96; j += 16) {
-        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 1);
-      }
-      break;
-
-    case TX_8X8:
-      for (j = 0; j < 64; j += 4) {
-        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 0);
-      }
-      for (j = 64; j < 96; j += 4) {
-        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1);
-      }
-      break;
-
-    case TX_4X4:
-      for (j = 0; j < 64; ++j) {
-        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 0);
-      }
-      for (j = 64; j < 96; ++j) {
-        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1);
-      }
-      break;
-
-    default:
-      break;
-  }
-}
-
-static void update_nzcs_mb16(VP9_COMMON *cm,
-                             MACROBLOCKD *xd,
-                             int mb_row,
-                             int mb_col) {
-  MODE_INFO *m = xd->mode_info_context;
-  MB_MODE_INFO *const mi = &m->mbmi;
-  int j, nzc_context;
-  const int ref = m->mbmi.ref_frame != INTRA_FRAME;
-
-  assert(mb_col == get_mb_col(xd));
-  assert(mb_row == get_mb_row(xd));
-
-  if (mi->mb_skip_coeff)
-    return;
-
-  switch (mi->txfm_size) {
-    case TX_16X16:
-      for (j = 0; j < 16; j += 16) {
-        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 0);
-      }
-      for (j = 16; j < 24; j += 4) {
-        nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1);
-      }
-      break;
-
-    case TX_8X8:
-      for (j = 0; j < 16; j += 4) {
-        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 0);
-      }
-      if (mi->mode == I8X8_PRED || mi->mode == SPLITMV) {
-        for (j = 16; j < 24; ++j) {
-          nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
-          update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1);
-        }
-      } else {
-        for (j = 16; j < 24; j += 4) {
-          nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
-          update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1);
-        }
-      }
-      break;
-
-    case TX_4X4:
-      for (j = 0; j < 16; ++j) {
-        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 0);
-      }
-      for (j = 16; j < 24; ++j) {
-        nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1);
-      }
-      break;
-
-    default:
-      break;
-  }
-}
-
-void vp9_update_nzc_counts(VP9_COMMON *cm,
-                           MACROBLOCKD *xd,
-                           int mb_row,
-                           int mb_col) {
-  if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64)
-    update_nzcs_sb64(cm, xd, mb_row, mb_col);
-  else if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB32X32)
-    update_nzcs_sb32(cm, xd, mb_row, mb_col);
-  else
-    update_nzcs_mb16(cm, xd, mb_row, mb_col);
-}
-#endif  // CONFIG_CODE_NONZEROCOUNT
-
 // #define COEF_COUNT_TESTING
 
 #define COEF_COUNT_SAT 24
@@ -2777,111 +1587,105 @@
                    count_sat, update_factor);
 }
 
-#if CONFIG_CODE_NONZEROCOUNT
-static void adapt_nzc_probs_common(VP9_COMMON *cm,
+#if CONFIG_CODE_ZEROGROUP
+OrientationType vp9_get_orientation(int rc, TX_SIZE tx_size) {
+  int i = rc >> (tx_size + 2);
+  int j = rc & ((4 << tx_size) - 1);
+  if (i > 2 * j)
+    return VERTICAL;
+  else if (j > 2 * i)
+    return HORIZONTAL;
+  else
+    return DIAGONAL;
+  /*
+  if (i == 0 && j == 0) return DIAGONAL;
+  while (i > 1 || j > 1) {
+    i >>= 1;
+    j >>= 1;
+  }
+  if (i == 0 && j == 1)
+    return HORIZONTAL;  // horizontal
+  else if (i == 1 && j == 1)
+    return DIAGONAL;    // diagonal
+  else if (i == 1 && j == 0)
+    return VERTICAL;    // vertical
+  assert(0);
+  */
+}
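For reference, a minimal sketch (not part of the patch) of how the i > 2*j / j > 2*i rule above partitions a 4x4 transform: for TX_4X4 the row is rc >> 2 and the column is rc & 3, so enumerating all 16 positions reproduces the orientation map computed by vp9_get_orientation().

#include <stdio.h>
/* Illustration only: print the orientation of every raster position in a
 * 4x4 block, using the same classification rule as vp9_get_orientation(). */
static void print_orientations_4x4(void) {
  int rc;
  for (rc = 0; rc < 16; ++rc) {
    const int i = rc >> 2;   /* row for TX_4X4 */
    const int j = rc & 3;    /* column for TX_4X4 */
    const char *o = (i > 2 * j) ? "V" : (j > 2 * i) ? "H" : "D";
    printf("%s%s", o, (j == 3) ? "\n" : " ");
  }
}
/* Expected output (V = vertical, H = horizontal, D = diagonal):
 * D H H H
 * V D D H
 * V D D D
 * V V D D
 */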
+
+int vp9_use_eoo(int c, int seg_eob, const int *scan,
+                TX_SIZE tx_size, int *is_last_zero, int *is_eoo) {
+  // NOTE: returning 0 from this function will turn off eoo symbols.
+  // For instance, we can experiment with turning eoo off for smaller
+  // blocks and/or lower bands.
+  int o = vp9_get_orientation(scan[c], tx_size);
+  int band = get_coef_band(scan, tx_size, c);
+  int use_eoo = (!is_last_zero[o] &&
+                 !is_eoo[o] &&
+                 band <= ZPC_EOO_BAND_UPPER &&
+                 band >= ZPC_EOO_BAND_LOWER &&
+                 get_zpc_used(tx_size) &&
+                 seg_eob - c > (ZPC_USEEOO_THRESH << tx_size) &&
+                 is_eoo[0] + is_eoo[1] + is_eoo[2] < 2);
+  return use_eoo;
+}
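As a usage sketch only: one way a detokenizer loop could maintain the per-orientation state that vp9_use_eoo() expects. The read_eoo_bit() and coeff_is_zero() calls below are hypothetical placeholders for the real bitstream read and coefficient decode; only vp9_get_orientation() and vp9_use_eoo() come from this patch.

/* Sketch under assumptions: is_last_zero[] and is_eoo[] are indexed by
 * OrientationType and start out all-zero for each block. */
static void walk_block_with_eoo(const int *scan, int seg_eob,
                                TX_SIZE tx_size) {
  int is_last_zero[3] = {0, 0, 0};
  int is_eoo[3] = {0, 0, 0};
  int c;
  for (c = 0; c < seg_eob; ++c) {
    const int o = vp9_get_orientation(scan[c], tx_size);
    if (is_eoo[o])
      continue;  /* this orientation is already known to be all zero */
    if (vp9_use_eoo(c, seg_eob, scan, tx_size, is_last_zero, is_eoo) &&
        read_eoo_bit()) {           /* hypothetical bitstream read */
      is_eoo[o] = 1;                /* remaining coeffs in o are zero */
      continue;
    }
    is_last_zero[o] = coeff_is_zero(scan[c]);  /* placeholder decode */
  }
}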
+
+int vp9_is_eoo(int c, int eob, const int *scan, TX_SIZE tx_size,
+               const int16_t *qcoeff_ptr, int *last_nz_pos) {
+  int rc = scan[c];
+  int o = vp9_get_orientation(rc, tx_size);
+  int eoo = c > last_nz_pos[o];
+  return eoo;
+}
+
+static void adapt_zpc_probs_common(VP9_COMMON *cm,
                                    TX_SIZE tx_size,
                                    int count_sat,
                                    int update_factor) {
-  int c, r, b, n;
+  int r, b, p, n;
   int count, factor;
-  unsigned int nzc_branch_ct[NZC32X32_NODES][2];
-  vp9_prob nzc_probs[NZC32X32_NODES];
-  int tokens, nodes;
-  const vp9_tree_index *nzc_tree;
-  vp9_prob *dst_nzc_probs;
-  vp9_prob *pre_nzc_probs;
-  unsigned int *nzc_counts;
-
-  if (!get_nzc_used(tx_size)) return;
+  vp9_zpc_probs *zpc_probs;
+  vp9_zpc_probs *pre_zpc_probs;
+  vp9_zpc_count *zpc_counts;
+  if (!get_zpc_used(tx_size)) return;
   if (tx_size == TX_32X32) {
-    tokens = NZC32X32_TOKENS;
-    nzc_tree = vp9_nzc32x32_tree;
-    dst_nzc_probs = cm->fc.nzc_probs_32x32[0][0][0];
-    pre_nzc_probs = cm->fc.pre_nzc_probs_32x32[0][0][0];
-    nzc_counts = cm->fc.nzc_counts_32x32[0][0][0];
+    zpc_probs = &cm->fc.zpc_probs_32x32;
+    pre_zpc_probs = &cm->fc.pre_zpc_probs_32x32;
+    zpc_counts = &cm->fc.zpc_counts_32x32;
   } else if (tx_size == TX_16X16) {
-    tokens = NZC16X16_TOKENS;
-    nzc_tree = vp9_nzc16x16_tree;
-    dst_nzc_probs = cm->fc.nzc_probs_16x16[0][0][0];
-    pre_nzc_probs = cm->fc.pre_nzc_probs_16x16[0][0][0];
-    nzc_counts = cm->fc.nzc_counts_16x16[0][0][0];
+    zpc_probs = &cm->fc.zpc_probs_16x16;
+    pre_zpc_probs = &cm->fc.pre_zpc_probs_16x16;
+    zpc_counts = &cm->fc.zpc_counts_16x16;
   } else if (tx_size == TX_8X8) {
-    tokens = NZC8X8_TOKENS;
-    nzc_tree = vp9_nzc8x8_tree;
-    dst_nzc_probs = cm->fc.nzc_probs_8x8[0][0][0];
-    pre_nzc_probs = cm->fc.pre_nzc_probs_8x8[0][0][0];
-    nzc_counts = cm->fc.nzc_counts_8x8[0][0][0];
+    zpc_probs = &cm->fc.zpc_probs_8x8;
+    pre_zpc_probs = &cm->fc.pre_zpc_probs_8x8;
+    zpc_counts = &cm->fc.zpc_counts_8x8;
   } else {
-    nzc_tree = vp9_nzc4x4_tree;
-    tokens = NZC4X4_TOKENS;
-    dst_nzc_probs = cm->fc.nzc_probs_4x4[0][0][0];
-    pre_nzc_probs = cm->fc.pre_nzc_probs_4x4[0][0][0];
-    nzc_counts = cm->fc.nzc_counts_4x4[0][0][0];
+    zpc_probs = &cm->fc.zpc_probs_4x4;
+    pre_zpc_probs = &cm->fc.pre_zpc_probs_4x4;
+    zpc_counts = &cm->fc.zpc_counts_4x4;
   }
-  nodes = tokens - 1;
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c)
-    for (r = 0; r < REF_TYPES; ++r)
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        int offset = c * REF_TYPES * BLOCK_TYPES + r * BLOCK_TYPES + b;
-        int offset_nodes = offset * nodes;
-        int offset_tokens = offset * tokens;
-        vp9_tree_probs_from_distribution(nzc_tree,
-                                         nzc_probs, nzc_branch_ct,
-                                         nzc_counts + offset_tokens, 0);
-        for (n = 0; n < nodes; ++n) {
-          count = nzc_branch_ct[n][0] + nzc_branch_ct[n][1];
+  for (r = 0; r < REF_TYPES; ++r) {
+    for (b = 0; b < ZPC_BANDS; ++b) {
+      for (p = 0; p < ZPC_PTOKS; ++p) {
+        for (n = 0; n < ZPC_NODES; ++n) {
+          vp9_prob prob = get_binary_prob((*zpc_counts)[r][b][p][n][0],
+                                          (*zpc_counts)[r][b][p][n][1]);
+          count = (*zpc_counts)[r][b][p][n][0] + (*zpc_counts)[r][b][p][n][1];
           count = count > count_sat ? count_sat : count;
           factor = (update_factor * count / count_sat);
-          dst_nzc_probs[offset_nodes + n] =
-              weighted_prob(pre_nzc_probs[offset_nodes + n],
-                            nzc_probs[n], factor);
+          (*zpc_probs)[r][b][p][n] = weighted_prob(
+              (*pre_zpc_probs)[r][b][p][n], prob, factor);
         }
       }
-}
-
-static void adapt_nzc_pcat(VP9_COMMON *cm, int count_sat, int update_factor) {
-  int c, t;
-  int count, factor;
-  if (!(get_nzc_used(TX_4X4) || get_nzc_used(TX_8X8) ||
-        get_nzc_used(TX_16X16) || get_nzc_used(TX_32X32)))
-    return;
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    for (t = 0; t < NZC_TOKENS_EXTRA; ++t) {
-      int bits = vp9_extranzcbits[t + NZC_TOKENS_NOEXTRA];
-      int b;
-      for (b = 0; b < bits; ++b) {
-        vp9_prob prob = get_binary_prob(cm->fc.nzc_pcat_counts[c][t][b][0],
-                                        cm->fc.nzc_pcat_counts[c][t][b][1]);
-        count = cm->fc.nzc_pcat_counts[c][t][b][0] +
-                cm->fc.nzc_pcat_counts[c][t][b][1];
-        count = count > count_sat ? count_sat : count;
-        factor = (update_factor * count / count_sat);
-        cm->fc.nzc_pcat_probs[c][t][b] = weighted_prob(
-            cm->fc.pre_nzc_pcat_probs[c][t][b], prob, factor);
-      }
     }
   }
 }
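For concreteness, the per-node blend above works out as follows, assuming weighted_prob() is the usual rounded fixed-point mix over 256 and get_binary_prob() returns roughly 255 * zeros / total (both helpers live elsewhere in the codebase); the counts, previous probability and update_factor of 112 are purely illustrative, while COEF_COUNT_SAT = 24 comes from this file.

/* Example node: counts {18, 6}, previous prob 160. */
int count   = 18 + 6;                               /* 24, at saturation  */
int prob    = 255 * 18 / 24;                        /* ~191, new estimate */
int factor  = 112 * (count > 24 ? 24 : count) / 24; /* 112                */
int adapted = (160 * (256 - factor) + prob * factor + 128) >> 8; /* ~174  */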
 
-// #define NZC_COUNT_TESTING
-void vp9_adapt_nzc_probs(VP9_COMMON *cm) {
+// #define ZPC_COUNT_TESTING
+void vp9_adapt_zpc_probs(VP9_COMMON *cm) {
   int count_sat;
   int update_factor; /* denominator 256 */
-#ifdef NZC_COUNT_TESTING
-  int c, r, b, t;
-  printf("\n");
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c)
-    for (r = 0; r < REF_TYPES; ++r) {
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        printf("    {");
-        for (t = 0; t < NZC4X4_TOKENS; ++t) {
-          printf(" %d,", cm->fc.nzc_counts_4x4[c][r][b][t]);
-        }
-        printf("}\n");
-      }
-      printf("\n");
-    }
-#endif
 
   if (cm->frame_type == KEY_FRAME) {
     update_factor = COEF_MAX_UPDATE_FACTOR_KEY;
@@ -2894,10 +1698,9 @@
     count_sat = COEF_COUNT_SAT;
   }
 
-  adapt_nzc_probs_common(cm, TX_4X4, count_sat, update_factor);
-  adapt_nzc_probs_common(cm, TX_8X8, count_sat, update_factor);
-  adapt_nzc_probs_common(cm, TX_16X16, count_sat, update_factor);
-  adapt_nzc_probs_common(cm, TX_32X32, count_sat, update_factor);
-  adapt_nzc_pcat(cm, count_sat, update_factor);
+  adapt_zpc_probs_common(cm, TX_4X4, count_sat, update_factor);
+  adapt_zpc_probs_common(cm, TX_8X8, count_sat, update_factor);
+  adapt_zpc_probs_common(cm, TX_16X16, count_sat, update_factor);
+  adapt_zpc_probs_common(cm, TX_32X32, count_sat, update_factor);
 }
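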
-#endif  // CONFIG_CODE_NONZEROCOUNT
+#endif  // CONFIG_CODE_ZEROGROUP
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index 3cae946..07b07a7 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -167,88 +167,94 @@
                                 int b, int r);
 #endif  // CONFIG_MODELCOEFPROB
 
-#if CONFIG_CODE_NONZEROCOUNT
-/* Alphabet for number of non-zero symbols in block */
-#define NZC_0                   0       /* Used for all blocks */
-#define NZC_1                   1       /* Used for all blocks */
-#define NZC_2                   2       /* Used for all blocks */
-#define NZC_3TO4                3       /* Used for all blocks */
-#define NZC_5TO8                4       /* Used for all blocks */
-#define NZC_9TO16               5       /* Used for all blocks */
-#define NZC_17TO32              6       /* Used for 8x8 and larger blocks */
-#define NZC_33TO64              7       /* Used for 8x8 and larger blocks */
-#define NZC_65TO128             8       /* Used for 16x16 and larger blocks */
-#define NZC_129TO256            9       /* Used for 16x16 and larger blocks */
-#define NZC_257TO512           10       /* Used for 32x32 and larger blocks */
-#define NZC_513TO1024          11       /* Used for 32x32 and larger blocks */
+#if CONFIG_CODE_ZEROGROUP
 
-/* Number of tokens for each block size */
-#define NZC4X4_TOKENS           6
-#define NZC8X8_TOKENS           8
-#define NZC16X16_TOKENS        10
-#define NZC32X32_TOKENS        12
+#define ZPC_STATS
 
-/* Number of nodes for each block size */
-#define NZC4X4_NODES            5
-#define NZC8X8_NODES            7
-#define NZC16X16_NODES          9
-#define NZC32X32_NODES         11
+typedef enum {
+  HORIZONTAL = 0,
+  DIAGONAL,
+  VERTICAL,
+} OrientationType;
 
-/* Max number of tokens with extra bits */
-#define NZC_TOKENS_EXTRA        9
+/* Note: EOB should eventually become part of this symbol, but we are
+ * holding off on that for now because it is a major change in the rest
+ * of the codebase. */
 
-/* Max number of extra bits */
-#define NZC_BITS_EXTRA          9
+#define ZPC_ISOLATED     (MAX_ENTROPY_TOKENS + 0)    /* Isolated zero */
 
-/* Tokens without extra bits */
-#define NZC_TOKENS_NOEXTRA      (NZC32X32_TOKENS - NZC_TOKENS_EXTRA)
+/* ZPC_EOORIENT: All remaining coefficients in the same orientation are 0.
+ * In other words, all remaining coeffs in the current subband and all
+ * children of the current subband are zero. Subbands are defined by
+ * dyadic partitioning in the coefficient domain. */
+#define ZPC_EOORIENT     (MAX_ENTROPY_TOKENS + 1)    /* End of Orientation */
 
-#define MAX_NZC_CONTEXTS        3
+/* Band limits over which the eoo bit is sent */
+#define ZPC_EOO_BAND_LOWER       0
+#define ZPC_EOO_BAND_UPPER       5
 
-/* whether to update extra bit probabilities */
-#define NZC_PCAT_UPDATE
+#define USE_ZPC_EOORIENT         1       /* 0: not used */
+                                         /* 1: used */
+#define ZPC_NODES                1
 
-/* nzc trees */
-extern const vp9_tree_index    vp9_nzc4x4_tree[];
-extern const vp9_tree_index    vp9_nzc8x8_tree[];
-extern const vp9_tree_index    vp9_nzc16x16_tree[];
-extern const vp9_tree_index    vp9_nzc32x32_tree[];
+#define UNKNOWN_TOKEN          255       /* Not signalled, encoder only */
 
-/* nzc encodings */
-extern struct vp9_token vp9_nzc4x4_encodings[NZC4X4_TOKENS];
-extern struct vp9_token vp9_nzc8x8_encodings[NZC8X8_TOKENS];
-extern struct vp9_token vp9_nzc16x16_encodings[NZC16X16_TOKENS];
-extern struct vp9_token vp9_nzc32x32_encodings[NZC32X32_TOKENS];
+#define ZPC_BANDS                3       /* context bands for izr */
+#define ZPC_PTOKS                3       /* context pt for zpcs */
 
-#define codenzc(x) (\
-  (x) <= 3 ? (x) : (x) <= 4 ? 3 : (x) <= 8 ? 4 : \
-  (x) <= 16 ? 5 : (x) <= 32 ? 6 : (x) <= 64 ? 7 :\
-  (x) <= 128 ? 8 : (x) <= 256 ? 9 : (x) <= 512 ? 10 : 11)
+#define coef_to_zpc_band(b)      ((b) >> 1)
+#define coef_to_zpc_ptok(p)      ((p) > 2 ? 2 : (p))
 
-int vp9_get_nzc_context_y_sb64(struct VP9Common *cm, MODE_INFO *cur,
-                               int mb_row, int mb_col, int block);
-int vp9_get_nzc_context_y_sb32(struct VP9Common *cm, MODE_INFO *cur,
-                               int mb_row, int mb_col, int block);
-int vp9_get_nzc_context_y_mb16(struct VP9Common *cm, MODE_INFO *cur,
-                               int mb_row, int mb_col, int block);
-int vp9_get_nzc_context_uv_sb64(struct VP9Common *cm, MODE_INFO *cur,
-                                int mb_row, int mb_col, int block);
-int vp9_get_nzc_context_uv_sb32(struct VP9Common *cm, MODE_INFO *cur,
-                                int mb_row, int mb_col, int block);
-int vp9_get_nzc_context_uv_mb16(struct VP9Common *cm, MODE_INFO *cur,
-                                int mb_row, int mb_col, int block);
-int vp9_get_nzc_context(struct VP9Common *cm, MACROBLOCKD *xd, int block);
-void vp9_update_nzc_counts(struct VP9Common *cm, MACROBLOCKD *xd,
-                           int mb_row, int mb_col);
-void vp9_adapt_nzc_probs(struct VP9Common *cm);
+typedef vp9_prob vp9_zpc_probs[REF_TYPES][ZPC_BANDS]
+                              [ZPC_PTOKS][ZPC_NODES];
+typedef unsigned int vp9_zpc_count[REF_TYPES][ZPC_BANDS]
+                                  [ZPC_PTOKS][ZPC_NODES][2];
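For scale, each transform size now carries only REF_TYPES * ZPC_BANDS * ZPC_PTOKS * ZPC_NODES binary probabilities; assuming REF_TYPES is 2 as elsewhere in the codec, that is 2 * 3 * 3 * 1 = 18 per transform size.

/* Sketch: storage implied by the typedefs above (REF_TYPES assumed = 2). */
vp9_zpc_probs zpc_probs_4x4;   /* 2 * 3 * 3 * 1 = 18 vp9_prob entries   */
vp9_zpc_count zpc_counts_4x4;  /* one {0,1} counter pair per entry      */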
 
-/* Extra bits array */
-extern const int vp9_extranzcbits[NZC32X32_TOKENS];
+OrientationType vp9_get_orientation(int rc, TX_SIZE tx_size);
+int vp9_use_eoo(int c, int eob, const int *scan, TX_SIZE tx_size,
+                int *is_last_zero, int *is_eoo);
+int vp9_is_eoo(int c, int eob, const int *scan, TX_SIZE tx_size,
+               const int16_t *qcoeff_ptr, int *last_nz_pos);
 
-/* Base nzc values */
-extern const int vp9_basenzcvalue[NZC32X32_TOKENS];
+#define ZPC_USEEOO_THRESH        4
+#define ZPC_ZEROSSAVED_EOO       7   /* encoder only */
 
-#endif  // CONFIG_CODE_NONZEROCOUNT
+void vp9_adapt_zpc_probs(struct VP9Common *cm);
+
+#endif  // CONFIG_CODE_ZEROGROUP
+
+static INLINE const int* get_scan_4x4(TX_TYPE tx_type) {
+  switch (tx_type) {
+    case ADST_DCT:
+      return vp9_row_scan_4x4;
+    case DCT_ADST:
+      return vp9_col_scan_4x4;
+    default:
+      return vp9_default_zig_zag1d_4x4;
+  }
+}
+
+static INLINE const int* get_scan_8x8(TX_TYPE tx_type) {
+  switch (tx_type) {
+    case ADST_DCT:
+      return vp9_row_scan_8x8;
+    case DCT_ADST:
+      return vp9_col_scan_8x8;
+    default:
+      return vp9_default_zig_zag1d_8x8;
+  }
+}
+
+static INLINE const int* get_scan_16x16(TX_TYPE tx_type) {
+  switch (tx_type) {
+    case ADST_DCT:
+      return vp9_row_scan_16x16;
+    case DCT_ADST:
+      return vp9_col_scan_16x16;
+    default:
+      return vp9_default_zig_zag1d_16x16;
+  }
+}
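A caller that previously switched on the transform type inline can now fetch the scan with one call. The sketch below only illustrates the lookup; how tx_type is obtained is up to the caller, and the loop body is a placeholder.

/* Sketch: walk a 4x4 block in the scan order implied by its tx_type. */
static void visit_coeffs_4x4(TX_TYPE tx_type) {
  const int *scan = get_scan_4x4(tx_type);  /* row, column or zig-zag scan */
  int c;
  for (c = 0; c < 16; ++c) {
    const int rc = scan[c];  /* raster position of the c-th coefficient */
    /* ... encode or decode the coefficient at position rc ... */
    (void) rc;
  }
}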
 
 #include "vp9/common/vp9_coefupdateprobs.h"
 
diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c
index f19dc12..fc93c99 100644
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -96,9 +96,9 @@
 } sumvfref_t;
 
 int vp9_mv_cont(const int_mv *l, const int_mv *a) {
-  int lez = (l->as_int == 0);
-  int aez = (a->as_int == 0);
-  int lea = (l->as_int == a->as_int);
+  const int lez = (l->as_int == 0);
+  const int aez = (a->as_int == 0);
+  const int lea = (l->as_int == a->as_int);
 
   if (lea && lez)
     return SUBMVREF_LEFT_ABOVE_ZED;
@@ -152,13 +152,22 @@
 const vp9_prob vp9_mbsplit_probs [VP9_NUMMBSPLITS - 1] = { 110, 111, 150};
 
 #if CONFIG_SBSEGMENT
-const vp9_prob vp9_partition_probs[PARTITION_PLANES][PARTITION_TYPES - 1] = {
-  {110, 111, 150},
-  {110, 111, 150},
+const vp9_prob vp9_partition_probs[NUM_PARTITION_CONTEXTS]
+                                  [PARTITION_TYPES - 1] = {
+  {202, 162, 107},
+  {16,  2,   169},
+  {3,   246,  19},
+  {104, 90,  134},
+  {183, 70,  109},
+  {30,  14,  162},
+  {67,  208,  22},
+  {4,   17,   5},
 };
 #else
-const vp9_prob vp9_partition_probs[PARTITION_PLANES][PARTITION_TYPES - 1] = {
-  {200}, {200},
+const vp9_prob vp9_partition_probs[NUM_PARTITION_CONTEXTS]
+                                  [PARTITION_TYPES - 1] = {
+  {200}, {200}, {200}, {200},
+  {200}, {200}, {200}, {200},
 };
 #endif
 
@@ -323,30 +332,26 @@
 struct vp9_token vp9_partition_encodings[PARTITION_TYPES];
 
 void vp9_init_mbmode_probs(VP9_COMMON *x) {
-  unsigned int bct [VP9_YMODES] [2];      /* num Ymodes > num UV modes */
+  unsigned int bct[VP9_YMODES][2];  // num Ymodes > num UV modes
+  int i;
 
   vp9_tree_probs_from_distribution(vp9_ymode_tree, x->fc.ymode_prob,
                                    bct, y_mode_cts, 0);
   vp9_tree_probs_from_distribution(vp9_sb_ymode_tree, x->fc.sb_ymode_prob,
                                    bct, y_mode_cts, 0);
-  {
-    int i;
-    for (i = 0; i < 8; i++) {
-      vp9_tree_probs_from_distribution(vp9_kf_ymode_tree, x->kf_ymode_prob[i],
-                                       bct, kf_y_mode_cts[i], 0);
-      vp9_tree_probs_from_distribution(vp9_sb_kf_ymode_tree,
-                                       x->sb_kf_ymode_prob[i], bct,
-                                       kf_y_mode_cts[i], 0);
-    }
+  for (i = 0; i < 8; i++) {
+    vp9_tree_probs_from_distribution(vp9_kf_ymode_tree, x->kf_ymode_prob[i],
+                                     bct, kf_y_mode_cts[i], 0);
+    vp9_tree_probs_from_distribution(vp9_sb_kf_ymode_tree,
+                                     x->sb_kf_ymode_prob[i], bct,
+                                     kf_y_mode_cts[i], 0);
   }
-  {
-    int i;
-    for (i = 0; i < VP9_YMODES; i++) {
-      vp9_tree_probs_from_distribution(vp9_uv_mode_tree, x->kf_uv_mode_prob[i],
-                                       bct, kf_uv_mode_cts[i], 0);
-      vp9_tree_probs_from_distribution(vp9_uv_mode_tree, x->fc.uv_mode_prob[i],
-                                       bct, uv_mode_cts[i], 0);
-    }
+
+  for (i = 0; i < VP9_YMODES; i++) {
+    vp9_tree_probs_from_distribution(vp9_uv_mode_tree, x->kf_uv_mode_prob[i],
+                                     bct, kf_uv_mode_cts[i], 0);
+    vp9_tree_probs_from_distribution(vp9_uv_mode_tree, x->fc.uv_mode_prob[i],
+                                     bct, uv_mode_cts[i], 0);
   }
 
   vp9_tree_probs_from_distribution(vp9_i8x8_mode_tree, x->fc.i8x8_mode_prob,
@@ -482,9 +487,7 @@
 void vp9_accum_mv_refs(VP9_COMMON *pc,
                        MB_PREDICTION_MODE m,
                        const int context) {
-  unsigned int (*mv_ref_ct)[4][2];
-
-  mv_ref_ct = pc->fc.mv_ref_ct;
+  unsigned int (*mv_ref_ct)[4][2] = pc->fc.mv_ref_ct;
 
   if (m == ZEROMV) {
     ++mv_ref_ct[context][0][0];
@@ -512,12 +515,8 @@
 #define MVREF_MAX_UPDATE_FACTOR 128
 void vp9_adapt_mode_context(VP9_COMMON *pc) {
   int i, j;
-  unsigned int (*mv_ref_ct)[4][2];
-  int (*mode_context)[4];
-
-  mode_context = pc->fc.vp9_mode_contexts;
-
-  mv_ref_ct = pc->fc.mv_ref_ct;
+  unsigned int (*mv_ref_ct)[4][2] = pc->fc.mv_ref_ct;
+  int (*mode_context)[4] = pc->fc.vp9_mode_contexts;
 
   for (j = 0; j < INTER_MODE_CONTEXTS; j++) {
     for (i = 0; i < 4; i++) {
@@ -578,93 +577,102 @@
 // #define MODE_COUNT_TESTING
 void vp9_adapt_mode_probs(VP9_COMMON *cm) {
   int i;
+  FRAME_CONTEXT *fc = &cm->fc;
 #ifdef MODE_COUNT_TESTING
   int t;
 
   printf("static const unsigned int\nymode_counts"
          "[VP9_YMODES] = {\n");
-  for (t = 0; t < VP9_YMODES; ++t) printf("%d, ", cm->fc.ymode_counts[t]);
+  for (t = 0; t < VP9_YMODES; ++t)
+    printf("%d, ", fc->ymode_counts[t]);
   printf("};\n");
   printf("static const unsigned int\nuv_mode_counts"
          "[VP9_YMODES] [VP9_UV_MODES] = {\n");
   for (i = 0; i < VP9_YMODES; ++i) {
     printf("  {");
-    for (t = 0; t < VP9_UV_MODES; ++t) printf("%d, ", cm->fc.uv_mode_counts[i][t]);
+    for (t = 0; t < VP9_UV_MODES; ++t)
+      printf("%d, ", fc->uv_mode_counts[i][t]);
     printf("},\n");
   }
   printf("};\n");
   printf("static const unsigned int\nbmode_counts"
          "[VP9_NKF_BINTRAMODES] = {\n");
   for (t = 0; t < VP9_NKF_BINTRAMODES; ++t)
-    printf("%d, ", cm->fc.bmode_counts[t]);
+    printf("%d, ", fc->bmode_counts[t]);
   printf("};\n");
   printf("static const unsigned int\ni8x8_mode_counts"
          "[VP9_I8X8_MODES] = {\n");
-  for (t = 0; t < VP9_I8X8_MODES; ++t) printf("%d, ", cm->fc.i8x8_mode_counts[t]);
+  for (t = 0; t < VP9_I8X8_MODES; ++t)
+    printf("%d, ", fc->i8x8_mode_counts[t]);
   printf("};\n");
   printf("static const unsigned int\nsub_mv_ref_counts"
          "[SUBMVREF_COUNT] [VP9_SUBMVREFS] = {\n");
   for (i = 0; i < SUBMVREF_COUNT; ++i) {
     printf("  {");
-    for (t = 0; t < VP9_SUBMVREFS; ++t) printf("%d, ", cm->fc.sub_mv_ref_counts[i][t]);
+    for (t = 0; t < VP9_SUBMVREFS; ++t)
+      printf("%d, ", fc->sub_mv_ref_counts[i][t]);
     printf("},\n");
   }
   printf("};\n");
   printf("static const unsigned int\nmbsplit_counts"
          "[VP9_NUMMBSPLITS] = {\n");
-  for (t = 0; t < VP9_NUMMBSPLITS; ++t) printf("%d, ", cm->fc.mbsplit_counts[t]);
+  for (t = 0; t < VP9_NUMMBSPLITS; ++t)
+    printf("%d, ", fc->mbsplit_counts[t]);
   printf("};\n");
 #if CONFIG_COMP_INTERINTRA_PRED
   printf("static const unsigned int\ninterintra_counts"
          "[2] = {\n");
-  for (t = 0; t < 2; ++t) printf("%d, ", cm->fc.interintra_counts[t]);
+  for (t = 0; t < 2; ++t)
+    printf("%d, ", fc->interintra_counts[t]);
   printf("};\n");
 #endif
 #endif
 
   update_mode_probs(VP9_YMODES, vp9_ymode_tree,
-                    cm->fc.ymode_counts, cm->fc.pre_ymode_prob,
-                    cm->fc.ymode_prob, 0);
+                    fc->ymode_counts, fc->pre_ymode_prob,
+                    fc->ymode_prob, 0);
   update_mode_probs(VP9_I32X32_MODES, vp9_sb_ymode_tree,
-                    cm->fc.sb_ymode_counts, cm->fc.pre_sb_ymode_prob,
-                    cm->fc.sb_ymode_prob, 0);
-  for (i = 0; i < VP9_YMODES; ++i) {
+                    fc->sb_ymode_counts, fc->pre_sb_ymode_prob,
+                    fc->sb_ymode_prob, 0);
+
+  for (i = 0; i < VP9_YMODES; ++i)
     update_mode_probs(VP9_UV_MODES, vp9_uv_mode_tree,
-                      cm->fc.uv_mode_counts[i], cm->fc.pre_uv_mode_prob[i],
-                      cm->fc.uv_mode_prob[i], 0);
-  }
+                      fc->uv_mode_counts[i], fc->pre_uv_mode_prob[i],
+                      fc->uv_mode_prob[i], 0);
+
   update_mode_probs(VP9_NKF_BINTRAMODES, vp9_bmode_tree,
-                    cm->fc.bmode_counts, cm->fc.pre_bmode_prob,
-                    cm->fc.bmode_prob, 0);
+                    fc->bmode_counts, fc->pre_bmode_prob,
+                    fc->bmode_prob, 0);
   update_mode_probs(VP9_I8X8_MODES,
-                    vp9_i8x8_mode_tree, cm->fc.i8x8_mode_counts,
-                    cm->fc.pre_i8x8_mode_prob, cm->fc.i8x8_mode_prob, 0);
-  for (i = 0; i < SUBMVREF_COUNT; ++i) {
+                    vp9_i8x8_mode_tree, fc->i8x8_mode_counts,
+                    fc->pre_i8x8_mode_prob, fc->i8x8_mode_prob, 0);
+
+  for (i = 0; i < SUBMVREF_COUNT; ++i)
     update_mode_probs(VP9_SUBMVREFS,
-                      vp9_sub_mv_ref_tree, cm->fc.sub_mv_ref_counts[i],
-                      cm->fc.pre_sub_mv_ref_prob[i], cm->fc.sub_mv_ref_prob[i],
+                      vp9_sub_mv_ref_tree, fc->sub_mv_ref_counts[i],
+                      fc->pre_sub_mv_ref_prob[i], fc->sub_mv_ref_prob[i],
                       LEFT4X4);
-  }
+
   update_mode_probs(VP9_NUMMBSPLITS, vp9_mbsplit_tree,
-                    cm->fc.mbsplit_counts, cm->fc.pre_mbsplit_prob,
-                    cm->fc.mbsplit_prob, 0);
+                    fc->mbsplit_counts, fc->pre_mbsplit_prob,
+                    fc->mbsplit_prob, 0);
 #if CONFIG_COMP_INTERINTRA_PRED
   if (cm->use_interintra) {
     int factor, interintra_prob, count;
 
-    interintra_prob = get_binary_prob(cm->fc.interintra_counts[0],
-                                      cm->fc.interintra_counts[1]);
-    count = cm->fc.interintra_counts[0] + cm->fc.interintra_counts[1];
+    interintra_prob = get_binary_prob(fc->interintra_counts[0],
+                                      fc->interintra_counts[1]);
+    count = fc->interintra_counts[0] + fc->interintra_counts[1];
     count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
     factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
-    cm->fc.interintra_prob = weighted_prob(cm->fc.pre_interintra_prob,
-                                           interintra_prob, factor);
+    fc->interintra_prob = weighted_prob(fc->pre_interintra_prob,
+                                        interintra_prob, factor);
   }
 #endif
-  for (i = 0; i < PARTITION_PLANES; i++)
+  for (i = 0; i < NUM_PARTITION_CONTEXTS; i++)
     update_mode_probs(PARTITION_TYPES, vp9_partition_tree,
-                      cm->fc.partition_counts[i], cm->fc.pre_partition_prob[i],
-                      cm->fc.partition_prob[i], 0);
+                      fc->partition_counts[i], fc->pre_partition_prob[i],
+                      fc->partition_prob[i], 0);
 }
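
For reference, the adaptation above saturates the observed count before blending the previous probability toward the new estimate. A minimal stand-alone sketch of that blend; the MODE_COUNT_SAT and MODE_MAX_UPDATE_FACTOR values and the weighted_prob() rounding used here are assumptions for illustration, not taken from this patch:

#include <stdio.h>

#define MODE_COUNT_SAT 20           /* assumed value */
#define MODE_MAX_UPDATE_FACTOR 128  /* assumed value */

static unsigned char weighted_prob(int pre, int cur, int factor) {
  /* blend pre and cur with weight factor/256, rounding to nearest */
  return (unsigned char)((pre * (256 - factor) + cur * factor + 128) >> 8);
}

int main(void) {
  int counts[2] = { 30, 10 };   /* hypothetical interintra counts */
  int total = counts[0] + counts[1];
  int cur = 256 * counts[0] / total;   /* rough stand-in for get_binary_prob() */
  int count = total > MODE_COUNT_SAT ? MODE_COUNT_SAT : total;
  int factor = MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT;
  /* with pre = 128 and cur = 192 this prints 160: halfway, since factor is 128/256 */
  printf("adapted prob = %d\n", weighted_prob(128, cur, factor));
  return 0;
}
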
 
 static void set_default_lf_deltas(MACROBLOCKD *xd) {
@@ -691,7 +699,7 @@
   if (cm->last_frame_seg_map)
     vpx_memset(cm->last_frame_seg_map, 0, (cm->mb_rows * cm->mb_cols));
 
-  /* reset the mode ref deltas for loop filter */
+  // Reset the mode ref deltas for loop filter
   vpx_memset(xd->last_ref_lf_deltas, 0, sizeof(xd->last_ref_lf_deltas));
   vpx_memset(xd->last_mode_lf_deltas, 0, sizeof(xd->last_mode_lf_deltas));
   set_default_lf_deltas(xd);
@@ -701,14 +709,14 @@
   vp9_default_bmode_probs(cm->fc.bmode_prob);
   vp9_kf_default_bmode_probs(cm->kf_bmode_prob);
   vp9_init_mv_probs(cm);
+
   // To force update of the sharpness
   cm->last_sharpness_level = -1;
 
   vp9_init_mode_contexts(cm);
 
-  for (i = 0; i < NUM_FRAME_CONTEXTS; i++) {
+  for (i = 0; i < NUM_FRAME_CONTEXTS; i++)
     vpx_memcpy(&cm->frame_contexts[i], &cm->fc, sizeof(cm->fc));
-  }
 
   vpx_memset(cm->prev_mip, 0,
              (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));
@@ -718,6 +726,9 @@
   vp9_update_mode_info_border(cm, cm->mip);
   vp9_update_mode_info_in_image(cm, cm->mi);
 
+  vp9_update_mode_info_border(cm, cm->prev_mip);
+  vp9_update_mode_info_in_image(cm, cm->prev_mi);
+
   cm->ref_frame_sign_bias[GOLDEN_FRAME] = 0;
   cm->ref_frame_sign_bias[ALTREF_FRAME] = 0;
 
diff --git a/vp9/common/vp9_enums.h b/vp9/common/vp9_enums.h
index 930a597..d9a6721 100644
--- a/vp9/common/vp9_enums.h
+++ b/vp9/common/vp9_enums.h
@@ -14,6 +14,13 @@
 #include "./vpx_config.h"
 
 typedef enum BLOCK_SIZE_TYPE {
+#if CONFIG_SB8X8
+  BLOCK_SIZE_SB8X8,
+#if CONFIG_SBSEGMENT
+  BLOCK_SIZE_SB8X16,
+  BLOCK_SIZE_SB16X8,
+#endif
+#endif
   BLOCK_SIZE_MB16X16,
 #if CONFIG_SBSEGMENT
   BLOCK_SIZE_SB16X32,
@@ -37,6 +44,7 @@
   PARTITION_TYPES
 } PARTITION_TYPE;
 
-#define PARTITION_PLANES 2  // number of probability models
+#define PARTITION_PLOFFSET   4  // number of probability models per block size
+#define NUM_PARTITION_CONTEXTS (2 * PARTITION_PLOFFSET)
 
 #endif  // VP9_COMMON_VP9_ENUMS_H_
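
With two superblock sizes and PARTITION_PLOFFSET neighbour states per size, this yields 2 * 4 = 8 partition contexts. A hypothetical sketch of how such an index could be packed from the left/above partition state; the helper name and exact packing are assumptions, not taken from this patch:

/* Hypothetical illustration of an 8-way partition context index; it only */
/* shows why NUM_PARTITION_CONTEXTS = 2 * PARTITION_PLOFFSET entries fit. */
static int example_partition_context(int left_split, int above_split,
                                     int is_sb64) {
  /* four left/above combinations per block size, offset by block size */
  return (left_split * 2 + above_split) + (is_sb64 ? PARTITION_PLOFFSET : 0);
}
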
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 4943e42..0bb7968 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -281,66 +281,71 @@
                      uint8_t *y_ptr, uint8_t *u_ptr, uint8_t *v_ptr,
                      int y_stride, int uv_stride,
                      int y_only, int dering) {
-    BLOCK_SIZE_TYPE sb_type = mode_info_context->mbmi.sb_type;
-    TX_SIZE tx_size = mode_info_context->mbmi.txfm_size;
-    int do_left_v, do_above_h;
-    int do_left_v_mbuv, do_above_h_mbuv;
-    int mis = cm->mode_info_stride;
-    const MODE_INFO *mi;
+  BLOCK_SIZE_TYPE sb_type = mode_info_context->mbmi.sb_type;
+  const int wbl = b_width_log2(sb_type), hbl = b_height_log2(sb_type);
+  TX_SIZE tx_size = mode_info_context->mbmi.txfm_size;
+  int do_left_v, do_above_h;
+  int do_left_v_mbuv, do_above_h_mbuv;
+  int mis = cm->mode_info_stride;
+  const MODE_INFO *mi;
 
-    // process 1st MB top-left
-    mi = mode_info_context;
-    do_left_v = (mb_col > 0);
-    do_above_h = (mb_row > 0);
-    do_left_v_mbuv = !(sb_type >= BLOCK_SIZE_SB64X64 &&
+  // process 1st MB top-left
+  mi = mode_info_context;
+  do_left_v = (mb_col > 0);
+  do_above_h = (mb_row > 0);
+  do_left_v_mbuv = !(sb_type >= BLOCK_SIZE_SB64X64 &&
       tx_size >= TX_32X32 && (mb_col & 2));
-    do_above_h_mbuv = !(sb_type >= BLOCK_SIZE_SB64X64 &&
+  do_above_h_mbuv = !(sb_type >= BLOCK_SIZE_SB64X64 &&
       tx_size >= TX_32X32 && (mb_row & 2));
-    lpf_mb(cm, mi, do_left_v, do_above_h,
+  lpf_mb(cm, mi, do_left_v, do_above_h,
       do_left_v_mbuv, do_above_h_mbuv,
       y_ptr,
       y_only? 0 : u_ptr,
       y_only? 0 : v_ptr,
       y_stride, uv_stride, dering);
-    // process 2nd MB top-right
-    mi = mode_info_context + 1;
-    do_left_v = !(sb_type && (tx_size >= TX_32X32 ||
+  // process 2nd MB top-right
+  mi = mode_info_context + 1;
+  do_left_v = !(wbl >= 3 /* 32x16 or >=32x32 */ && (tx_size >= TX_32X32 ||
       sb_mb_lf_skip(mode_info_context, mi)));
-    do_above_h = (mb_row > 0);
-    do_left_v_mbuv = do_left_v;
-    do_above_h_mbuv = !(sb_type >= BLOCK_SIZE_SB64X64 &&
+  do_above_h = (mb_row > 0);
+  do_left_v_mbuv = !(wbl >= 3 /* 32x16 or >=32x32 */ && (tx_size >= TX_16X16 ||
+      sb_mb_lf_skip(mode_info_context, mi)));
+  do_above_h_mbuv = !(sb_type >= BLOCK_SIZE_SB64X64 &&
       tx_size >= TX_32X32 && (mb_row & 2));
-    lpf_mb(cm, mi, do_left_v, do_above_h,
+  lpf_mb(cm, mi, do_left_v, do_above_h,
       do_left_v_mbuv, do_above_h_mbuv,
       y_ptr + 16,
       y_only ? 0 : (u_ptr + 8),
       y_only ? 0 : (v_ptr + 8),
       y_stride, uv_stride, dering);
 
-    // process 3rd MB bottom-left
-    mi = mode_info_context + mis;
-    do_left_v = (mb_col > 0);
-    do_above_h =!(sb_type && (tx_size >= TX_32X32 ||
+  // process 3rd MB bottom-left
+  mi = mode_info_context + mis;
+  do_left_v = (mb_col > 0);
+  do_above_h = !(hbl >= 3 /* 16x32 or >=32x32 */ && (tx_size >= TX_32X32 ||
       sb_mb_lf_skip(mode_info_context, mi)));
-    do_left_v_mbuv =  !(sb_type >= BLOCK_SIZE_SB64X64 &&
+  do_left_v_mbuv = !(sb_type >= BLOCK_SIZE_SB64X64 &&
       tx_size >= TX_32X32 && (mb_col & 2));
-    do_above_h_mbuv = do_above_h;
-    lpf_mb(cm, mi, do_left_v, do_above_h,
+  do_above_h_mbuv = !(hbl >= 3 /* 16x32 or >=32x32 */ && (tx_size >= TX_16X16 ||
+      sb_mb_lf_skip(mode_info_context, mi)));
+  lpf_mb(cm, mi, do_left_v, do_above_h,
       do_left_v_mbuv, do_above_h_mbuv,
       y_ptr + 16 * y_stride,
       y_only ? 0 : (u_ptr + 8 * uv_stride),
       y_only ? 0 : (v_ptr + 8 * uv_stride),
       y_stride, uv_stride, dering);
 
-    // process 4th MB bottom right
-    mi = mode_info_context + mis + 1;
-    do_left_v = !(sb_type && (tx_size >= TX_32X32 ||
+  // process 4th MB bottom right
+  mi = mode_info_context + mis + 1;
+  do_left_v = !(wbl >= 3 /* 32x16 or >=32x32 */ && (tx_size >= TX_32X32 ||
       sb_mb_lf_skip(mi - 1, mi)));
-    do_above_h =!(sb_type && (tx_size >= TX_32X32 ||
+  do_above_h = !(hbl >= 3 /* 16x32 or >=32x32 */ && (tx_size >= TX_32X32 ||
       sb_mb_lf_skip(mode_info_context + 1, mi)));
-    do_left_v_mbuv = do_left_v;
-    do_above_h_mbuv = do_above_h;
-    lpf_mb(cm, mi, do_left_v, do_above_h,
+  do_left_v_mbuv = !(wbl >= 3 /* 32x16 or >=32x32 */ && (tx_size >= TX_16X16 ||
+      sb_mb_lf_skip(mi - 1, mi)));
+  do_above_h_mbuv = !(hbl >= 3 /* 16x32 or >=32x32 */ && (tx_size >= TX_16X16 ||
+      sb_mb_lf_skip(mode_info_context + 1, mi)));
+  lpf_mb(cm, mi, do_left_v, do_above_h,
       do_left_v_mbuv, do_above_h_mbuv,
       y_ptr + 16 * y_stride + 16,
       y_only ? 0 : (u_ptr + 8 * uv_stride + 8),
@@ -353,25 +358,26 @@
                      uint8_t *y_ptr, uint8_t *u_ptr, uint8_t *v_ptr,
                      int y_stride, int uv_stride,
                      int y_only, int dering) {
-    lpf_sb32(cm, mode_info_context, mb_row, mb_col,
+  lpf_sb32(cm, mode_info_context, mb_row, mb_col,
       y_ptr, u_ptr, v_ptr,
       y_stride, uv_stride, y_only, dering);
-    lpf_sb32(cm, mode_info_context + 2, mb_row, mb_col + 2,
+  lpf_sb32(cm, mode_info_context + 2, mb_row, mb_col + 2,
       y_ptr + 32, u_ptr + 16, v_ptr + 16,
       y_stride, uv_stride, y_only, dering);
-    lpf_sb32(cm, mode_info_context + cm->mode_info_stride * 2,
+  lpf_sb32(cm, mode_info_context + cm->mode_info_stride * 2,
       mb_row + 2, mb_col,
       y_ptr + 32 * y_stride,
       u_ptr + 16 * uv_stride,
       v_ptr + 16 * uv_stride,
       y_stride, uv_stride, y_only, dering);
-    lpf_sb32(cm, mode_info_context + cm->mode_info_stride * 2 + 2,
+  lpf_sb32(cm, mode_info_context + cm->mode_info_stride * 2 + 2,
       mb_row + 2, mb_col + 2,
       y_ptr + 32 * y_stride + 32,
       u_ptr + 16 * uv_stride + 16,
       v_ptr + 16 * uv_stride + 16,
       y_stride, uv_stride, y_only, dering);
 }
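
In the loop-filter hunks above, the guards switch from the bare sb_type test to b_width_log2()/b_height_log2(). Assuming those helpers count 4-pixel units, the thresholds map to pixel widths as follows (inferred from the inline /* 32x16 or >=32x32 */ comments, not stated in the patch):

/* Assumed mapping: luma block width = 4 << b_width_log2(sb_type).         */
/*   b_width_log2 == 2  ->  16 px  (16x16 macroblock)                      */
/*   b_width_log2 == 3  ->  32 px  (32x16 / 32x32 superblock)              */
/*   b_width_log2 == 4  ->  64 px  (64x32 / 64x64 superblock)              */
/* So "wbl >= 3" selects blocks at least 32 pixels wide, and the MB-UV     */
/* variants relax the transform-size threshold from TX_32X32 to TX_16X16.  */
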
+
 void vp9_loop_filter_frame(VP9_COMMON *cm,
                            MACROBLOCKD *xd,
                            int frame_filter_level,
diff --git a/vp9/common/vp9_loopfilter.h b/vp9/common/vp9_loopfilter.h
index 3b81146..81745e4 100644
--- a/vp9/common/vp9_loopfilter.h
+++ b/vp9/common/vp9_loopfilter.h
@@ -36,7 +36,7 @@
                   lim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
   DECLARE_ALIGNED(SIMD_WIDTH, unsigned char,
                   hev_thr[4][SIMD_WIDTH]);
-  unsigned char lvl[4][4][4];
+  unsigned char lvl[MAX_MB_SEGMENTS][4][4];
   unsigned char mode_lf_lut[MB_MODE_COUNT];
 } loop_filter_info_n;
 
diff --git a/vp9/common/vp9_mbpitch.c b/vp9/common/vp9_mbpitch.c
index 957068f..d5f104d 100644
--- a/vp9/common/vp9_mbpitch.c
+++ b/vp9/common/vp9_mbpitch.c
@@ -44,13 +44,13 @@
     u2 = NULL;
     v2 = NULL;
   } else {
-    y = &mb->pre.y_buffer;
-    u = &mb->pre.u_buffer;
-    v = &mb->pre.v_buffer;
+    y = &mb->plane[0].pre[0].buf;
+    u = &mb->plane[1].pre[0].buf;
+    v = &mb->plane[2].pre[0].buf;
 
-    y2 = &mb->second_pre.y_buffer;
-    u2 = &mb->second_pre.u_buffer;
-    v2 = &mb->second_pre.v_buffer;
+    y2 = &mb->plane[0].pre[1].buf;
+    u2 = &mb->plane[1].pre[1].buf;
+    v2 = &mb->plane[2].pre[1].buf;
   }
 
   // luma
diff --git a/vp9/common/vp9_onyx.h b/vp9/common/vp9_onyx.h
index 422f388..b85b889 100644
--- a/vp9/common/vp9_onyx.h
+++ b/vp9/common/vp9_onyx.h
@@ -21,6 +21,9 @@
 #include "vpx/vp8cx.h"
 #include "vpx_scale/yv12config.h"
 #include "vp9/common/vp9_ppflags.h"
+
+#define MAX_MB_SEGMENTS 8
+
   typedef int *VP9_PTR;
 
   /* Create/destroy static data structures. */
@@ -225,8 +228,9 @@
 
   int vp9_set_roimap(VP9_PTR comp, unsigned char *map,
                      unsigned int rows, unsigned int cols,
-                     int delta_q[4], int delta_lf[4],
-                     unsigned int threshold[4]);
+                     int delta_q[MAX_MB_SEGMENTS],
+                     int delta_lf[MAX_MB_SEGMENTS],
+                     unsigned int threshold[MAX_MB_SEGMENTS]);
 
   int vp9_set_active_map(VP9_PTR comp, unsigned char *map,
                          unsigned int rows, unsigned int cols);
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index 13ec865..eea0894 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -68,23 +68,17 @@
   vp9_prob i8x8_mode_prob[VP9_I8X8_MODES - 1];
   vp9_prob sub_mv_ref_prob[SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
   vp9_prob mbsplit_prob[VP9_NUMMBSPLITS - 1];
-  vp9_prob partition_prob[PARTITION_PLANES][PARTITION_TYPES - 1];
+  vp9_prob partition_prob[NUM_PARTITION_CONTEXTS][PARTITION_TYPES - 1];
 
   vp9_coeff_probs coef_probs_4x4[BLOCK_TYPES];
   vp9_coeff_probs coef_probs_8x8[BLOCK_TYPES];
   vp9_coeff_probs coef_probs_16x16[BLOCK_TYPES];
   vp9_coeff_probs coef_probs_32x32[BLOCK_TYPES];
-#if CONFIG_CODE_NONZEROCOUNT
-  vp9_prob nzc_probs_4x4[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                        [NZC4X4_NODES];
-  vp9_prob nzc_probs_8x8[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                        [NZC8X8_NODES];
-  vp9_prob nzc_probs_16x16[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                          [NZC16X16_NODES];
-  vp9_prob nzc_probs_32x32[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                          [NZC32X32_NODES];
-  vp9_prob nzc_pcat_probs[MAX_NZC_CONTEXTS]
-                         [NZC_TOKENS_EXTRA][NZC_BITS_EXTRA];
+#if CONFIG_CODE_ZEROGROUP
+  vp9_zpc_probs zpc_probs_4x4;
+  vp9_zpc_probs zpc_probs_8x8;
+  vp9_zpc_probs zpc_probs_16x16;
+  vp9_zpc_probs zpc_probs_32x32;
 #endif
 
   nmv_context nmvc;
@@ -96,7 +90,7 @@
   vp9_prob pre_i8x8_mode_prob[VP9_I8X8_MODES - 1];
   vp9_prob pre_sub_mv_ref_prob[SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
   vp9_prob pre_mbsplit_prob[VP9_NUMMBSPLITS - 1];
-  vp9_prob pre_partition_prob[PARTITION_PLANES][PARTITION_TYPES - 1];
+  vp9_prob pre_partition_prob[NUM_PARTITION_CONTEXTS][PARTITION_TYPES - 1];
   unsigned int bmode_counts[VP9_NKF_BINTRAMODES];
   unsigned int ymode_counts[VP9_YMODES];   /* interframe intra mode probs */
   unsigned int sb_ymode_counts[VP9_I32X32_MODES];
@@ -104,23 +98,17 @@
   unsigned int i8x8_mode_counts[VP9_I8X8_MODES];   /* interframe intra probs */
   unsigned int sub_mv_ref_counts[SUBMVREF_COUNT][VP9_SUBMVREFS];
   unsigned int mbsplit_counts[VP9_NUMMBSPLITS];
-  unsigned int partition_counts[PARTITION_PLANES][PARTITION_TYPES];
+  unsigned int partition_counts[NUM_PARTITION_CONTEXTS][PARTITION_TYPES];
 
   vp9_coeff_probs pre_coef_probs_4x4[BLOCK_TYPES];
   vp9_coeff_probs pre_coef_probs_8x8[BLOCK_TYPES];
   vp9_coeff_probs pre_coef_probs_16x16[BLOCK_TYPES];
   vp9_coeff_probs pre_coef_probs_32x32[BLOCK_TYPES];
-#if CONFIG_CODE_NONZEROCOUNT
-  vp9_prob pre_nzc_probs_4x4[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                            [NZC4X4_NODES];
-  vp9_prob pre_nzc_probs_8x8[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                            [NZC8X8_NODES];
-  vp9_prob pre_nzc_probs_16x16[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                              [NZC16X16_NODES];
-  vp9_prob pre_nzc_probs_32x32[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                              [NZC32X32_NODES];
-  vp9_prob pre_nzc_pcat_probs[MAX_NZC_CONTEXTS]
-                             [NZC_TOKENS_EXTRA][NZC_BITS_EXTRA];
+#if CONFIG_CODE_ZEROGROUP
+  vp9_zpc_probs pre_zpc_probs_4x4;
+  vp9_zpc_probs pre_zpc_probs_8x8;
+  vp9_zpc_probs pre_zpc_probs_16x16;
+  vp9_zpc_probs pre_zpc_probs_32x32;
 #endif
 
   vp9_coeff_count coef_counts_4x4[BLOCK_TYPES];
@@ -130,17 +118,11 @@
   unsigned int eob_branch_counts[TX_SIZE_MAX_SB][BLOCK_TYPES][REF_TYPES]
                                 [COEF_BANDS][PREV_COEF_CONTEXTS];
 
-#if CONFIG_CODE_NONZEROCOUNT
-  unsigned int nzc_counts_4x4[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                             [NZC4X4_TOKENS];
-  unsigned int nzc_counts_8x8[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                             [NZC8X8_TOKENS];
-  unsigned int nzc_counts_16x16[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                               [NZC16X16_TOKENS];
-  unsigned int nzc_counts_32x32[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                               [NZC32X32_TOKENS];
-  unsigned int nzc_pcat_counts[MAX_NZC_CONTEXTS]
-                              [NZC_TOKENS_EXTRA][NZC_BITS_EXTRA][2];
+#if CONFIG_CODE_ZEROGROUP
+  vp9_zpc_count zpc_counts_4x4;
+  vp9_zpc_count zpc_counts_8x8;
+  vp9_zpc_count zpc_counts_16x16;
+  vp9_zpc_count zpc_counts_32x32;
 #endif
 
   nmv_context_counts NMVcount;
@@ -214,6 +196,7 @@
   FRAME_TYPE frame_type;
 
   int show_frame;
+  int last_show_frame;
 
   int frame_flags;
   int MBs;
@@ -268,6 +251,10 @@
   ENTROPY_CONTEXT_PLANES *above_context;   /* row of context for each plane */
   ENTROPY_CONTEXT_PLANES left_context[4];  /* (up to) 4 contexts "" */
 
+  // partition contexts
+  PARTITION_CONTEXT *above_seg_context;
+  PARTITION_CONTEXT left_seg_context[4];
+
   /* keyframe block modes are predicted by their above, left neighbors */
 
   vp9_prob kf_bmode_prob[VP9_KF_BINTRAMODES]
@@ -350,24 +337,22 @@
   buf[new_idx]++;
 }
 
-// TODO(debargha): merge the two functions
-static void set_mb_row(VP9_COMMON *cm, MACROBLOCKD *xd,
-                       int mb_row, int block_size) {
+static int mb_cols_aligned_to_sb(VP9_COMMON *cm) {
+  return (cm->mb_cols + 3) & ~3;
+}
+
+static void set_mb_row_col(VP9_COMMON *cm, MACROBLOCKD *xd,
+                       int mb_row, int bh,
+                       int mb_col, int bw) {
   xd->mb_to_top_edge    = -((mb_row * 16) << 3);
-  xd->mb_to_bottom_edge = ((cm->mb_rows - block_size - mb_row) * 16) << 3;
+  xd->mb_to_bottom_edge = ((cm->mb_rows - bh - mb_row) * 16) << 3;
+  xd->mb_to_left_edge   = -((mb_col * 16) << 3);
+  xd->mb_to_right_edge  = ((cm->mb_cols - bw - mb_col) * 16) << 3;
 
   // Are edges available for intra prediction?
   xd->up_available    = (mb_row != 0);
-}
-
-static void set_mb_col(VP9_COMMON *cm, MACROBLOCKD *xd,
-                       int mb_col, int block_size) {
-  xd->mb_to_left_edge   = -((mb_col * 16) << 3);
-  xd->mb_to_right_edge  = ((cm->mb_cols - block_size - mb_col) * 16) << 3;
-
-  // Are edges available for intra prediction?
   xd->left_available  = (mb_col > cm->cur_tile_mb_col_start);
-  xd->right_available = (mb_col + block_size < cm->cur_tile_mb_col_end);
+  xd->right_available = (mb_col + bw < cm->cur_tile_mb_col_end);
 }
 
 static int get_mb_row(const MACROBLOCKD *xd) {
@@ -377,4 +362,12 @@
 static int get_mb_col(const MACROBLOCKD *xd) {
   return ((-xd->mb_to_left_edge) >> 7);
 }
+
+static int get_token_alloc(int mb_rows, int mb_cols) {
+#if CONFIG_CODE_ZEROGROUP
+  return mb_rows * mb_cols * (24 * 16 * 2);
+#else
+  return mb_rows * mb_cols * (24 * 16 + 4);
+#endif
+}
 #endif  // VP9_COMMON_VP9_ONYXC_INT_H_
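
The two helpers added above reduce to simple arithmetic; a small stand-alone check under an assumed frame width (aligned_to_sb() is a local stand-in for mb_cols_aligned_to_sb()):

#include <stdio.h>

/* Round the macroblock-column count up to the next multiple of 4, i.e. */
/* one 64x64 superblock worth of 16x16 macroblocks.                     */
static int aligned_to_sb(int mb_cols) { return (mb_cols + 3) & ~3; }

int main(void) {
  int mb_cols = 45;   /* e.g. a 720-pixel-wide frame: 720 / 16 = 45 */
  printf("%d -> %d\n", mb_cols, aligned_to_sb(mb_cols));   /* 45 -> 48 */
  /* get_token_alloc(): per-MB token budget, 24 4x4 blocks of 16 coeffs  */
  /* plus a small fixed margin, or doubled with zero-group coding.       */
  printf("tokens per MB: %d or %d\n", 24 * 16 + 4, 24 * 16 * 2);
  return 0;
}
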
diff --git a/vp9/common/vp9_pred_common.c b/vp9/common/vp9_pred_common.c
index e110cff..c12920c 100644
--- a/vp9/common/vp9_pred_common.c
+++ b/vp9/common/vp9_pred_common.c
@@ -23,83 +23,72 @@
                                    const MACROBLOCKD *const xd,
                                    PRED_ID pred_id) {
   int pred_context;
-  MODE_INFO *m = xd->mode_info_context;
-
+  const MODE_INFO *const mi = xd->mode_info_context;
+  const MODE_INFO *const above_mi = mi - cm->mode_info_stride;
+  const MODE_INFO *const left_mi = mi - 1;
   // Note:
   // The mode info data structure has a one element border above and to the
   // left of the entries corresponding to real macroblocks.

   // The prediction flags in these dummy entries are initialised to 0.
   switch (pred_id) {
     case PRED_SEG_ID:
-      pred_context = (m - cm->mode_info_stride)->mbmi.seg_id_predicted;
+      pred_context = above_mi->mbmi.seg_id_predicted;
       if (xd->left_available)
-        pred_context += (m - 1)->mbmi.seg_id_predicted;
+        pred_context += left_mi->mbmi.seg_id_predicted;
       break;
 
     case PRED_REF:
-      pred_context = (m - cm->mode_info_stride)->mbmi.ref_predicted;
+      pred_context = above_mi->mbmi.ref_predicted;
       if (xd->left_available)
-        pred_context += (m - 1)->mbmi.ref_predicted;
+        pred_context += left_mi->mbmi.ref_predicted;
       break;
 
     case PRED_COMP:
-      // Context based on use of comp pred flag by neighbours
-      // pred_context =
-      //   ((m - 1)->mbmi.second_ref_frame > INTRA_FRAME) +
-      //    ((m - cm->mode_info_stride)->mbmi.second_ref_frame > INTRA_FRAME);
-
-      // Context based on mode and reference frame
-      // if ( m->mbmi.ref_frame == LAST_FRAME )
-      //    pred_context = 0 + (m->mbmi.mode != ZEROMV);
-      // else if ( m->mbmi.ref_frame == GOLDEN_FRAME )
-      //    pred_context = 2 + (m->mbmi.mode != ZEROMV);
-      // else
-      //    pred_context = 4 + (m->mbmi.mode != ZEROMV);
-
-      if (m->mbmi.ref_frame == LAST_FRAME)
+      if (mi->mbmi.ref_frame == LAST_FRAME)
         pred_context = 0;
       else
         pred_context = 1;
-
       break;
 
     case PRED_MBSKIP:
-      pred_context = (m - cm->mode_info_stride)->mbmi.mb_skip_coeff;
+      pred_context = above_mi->mbmi.mb_skip_coeff;
       if (xd->left_available)
-        pred_context += (m - 1)->mbmi.mb_skip_coeff;
+        pred_context += left_mi->mbmi.mb_skip_coeff;
       break;
 
-    case PRED_SWITCHABLE_INTERP:
-      {
-        int left_in_image = xd->left_available && (m - 1)->mbmi.mb_in_image;
-        int above_in_image = (m - cm->mode_info_stride)->mbmi.mb_in_image;
-        int left_mode = (m - 1)->mbmi.mode;
-        int above_mode = (m - cm->mode_info_stride)->mbmi.mode;
-        int left_interp, above_interp;
-        if (left_in_image && left_mode >= NEARESTMV && left_mode <= SPLITMV)
-          left_interp = vp9_switchable_interp_map[(m - 1)->mbmi.interp_filter];
-        else
-          left_interp = VP9_SWITCHABLE_FILTERS;
-        assert(left_interp != -1);
-        if (above_in_image && above_mode >= NEARESTMV && above_mode <= SPLITMV)
-          above_interp = vp9_switchable_interp_map[
-              (m - cm->mode_info_stride)->mbmi.interp_filter];
-        else
-          above_interp = VP9_SWITCHABLE_FILTERS;
-        assert(above_interp != -1);
+    case PRED_SWITCHABLE_INTERP: {
+      // left
+      const int left_in_image = xd->left_available && left_mi->mbmi.mb_in_image;
+      const int left_mv_pred = left_mi->mbmi.mode >= NEARESTMV &&
+                               left_mi->mbmi.mode <= SPLITMV;
+      const int left_interp = left_in_image && left_mv_pred ?
+                    vp9_switchable_interp_map[left_mi->mbmi.interp_filter] :
+                    VP9_SWITCHABLE_FILTERS;
 
-        if (left_interp == above_interp)
-          pred_context = left_interp;
-        else if (left_interp == VP9_SWITCHABLE_FILTERS &&
-                 above_interp != VP9_SWITCHABLE_FILTERS)
-          pred_context = above_interp;
-        else if (left_interp != VP9_SWITCHABLE_FILTERS &&
-                 above_interp == VP9_SWITCHABLE_FILTERS)
-          pred_context = left_interp;
-        else
-          pred_context = VP9_SWITCHABLE_FILTERS;
-      }
+      // above
+      const int above_in_image = xd->up_available && above_mi->mbmi.mb_in_image;
+      const int above_mv_pred = above_mi->mbmi.mode >= NEARESTMV &&
+                                above_mi->mbmi.mode <= SPLITMV;
+      const int above_interp = above_in_image && above_mv_pred ?
+                    vp9_switchable_interp_map[above_mi->mbmi.interp_filter] :
+                    VP9_SWITCHABLE_FILTERS;
+
+      assert(left_interp != -1);
+      assert(above_interp != -1);
+
+      if (left_interp == above_interp)
+        pred_context = left_interp;
+      else if (left_interp == VP9_SWITCHABLE_FILTERS &&
+               above_interp != VP9_SWITCHABLE_FILTERS)
+        pred_context = above_interp;
+      else if (left_interp != VP9_SWITCHABLE_FILTERS &&
+               above_interp == VP9_SWITCHABLE_FILTERS)
+        pred_context = left_interp;
+      else
+        pred_context = VP9_SWITCHABLE_FILTERS;
+
       break;
+    }
 
     default:
       pred_context = 0;  // *** add error trap code.
@@ -230,7 +219,7 @@
 int vp9_get_pred_mb_segid(VP9_COMMON *cm, BLOCK_SIZE_TYPE sb_type,
                           int mb_row, int mb_col) {
   const int mb_index = mb_row * cm->mb_cols + mb_col;
-  if (sb_type) {
+  if (sb_type > BLOCK_SIZE_MB16X16) {
     const int bw = 1 << mb_width_log2(sb_type);
     const int bh = 1 << mb_height_log2(sb_type);
     const int ymbs = MIN(cm->mb_rows - mb_row, bh);
diff --git a/vp9/common/vp9_recon.c b/vp9/common/vp9_recon.c
index cc44afe..4119450 100644
--- a/vp9/common/vp9_recon.c
+++ b/vp9/common/vp9_recon.c
@@ -14,79 +14,61 @@
 #include "vp9/common/vp9_blockd.h"
 
 static INLINE void recon(int rows, int cols,
-                         const uint8_t *pred_ptr, int pred_stride,
                          const int16_t *diff_ptr, int diff_stride,
                          uint8_t *dst_ptr, int dst_stride) {
   int r, c;
 
   for (r = 0; r < rows; r++) {
     for (c = 0; c < cols; c++)
-      dst_ptr[c] = clip_pixel(diff_ptr[c] + pred_ptr[c]);
+      dst_ptr[c] = clip_pixel(diff_ptr[c] + dst_ptr[c]);
 
     dst_ptr += dst_stride;
     diff_ptr += diff_stride;
-    pred_ptr += pred_stride;
   }
 }
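
Since the callers now assert pred_ptr == dst_ptr, reconstruction adds the residual directly into the buffer that already holds the prediction. A minimal stand-alone model of that per-pixel operation, with clip_pixel_local() as a stand-in for the library's clip_pixel():

#include <stdint.h>
#include <stdio.h>

static uint8_t clip_pixel_local(int v) {   /* stand-in for clip_pixel() */
  return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
}

/* In-place 4x4 reconstruction: dst already holds the prediction. */
static void recon_4x4_inplace(const int16_t *diff, int diff_stride,
                              uint8_t *dst, int dst_stride) {
  int r, c;
  for (r = 0; r < 4; r++) {
    for (c = 0; c < 4; c++)
      dst[c] = clip_pixel_local(dst[c] + diff[c]);
    dst += dst_stride;
    diff += diff_stride;
  }
}

int main(void) {
  uint8_t dst[16] = { 128 };    /* prediction: first pixel 128           */
  int16_t diff[16] = { 200 };   /* residual: 128 + 200 exceeds 8 bits    */
  recon_4x4_inplace(diff, 4, dst, 4);
  printf("%d\n", dst[0]);       /* prints 255 after clipping             */
  return 0;
}
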
 
 
 void vp9_recon_b_c(uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr,
                    int stride) {
-  recon(4, 4, pred_ptr, stride, diff_ptr, 16, dst_ptr, stride);
+  assert(pred_ptr == dst_ptr);
+  recon(4, 4, diff_ptr, 16, dst_ptr, stride);
 }
 
 void vp9_recon_uv_b_c(uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr,
                       int stride) {
-  recon(4, 4, pred_ptr, stride, diff_ptr, 8, dst_ptr, stride);
+  assert(pred_ptr == dst_ptr);
+  recon(4, 4, diff_ptr, 8, dst_ptr, stride);
 }
 
 void vp9_recon4b_c(uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr,
                    int stride) {
-  recon(4, 16, pred_ptr, stride, diff_ptr, 16, dst_ptr, stride);
+  assert(pred_ptr == dst_ptr);
+  recon(4, 16, diff_ptr, 16, dst_ptr, stride);
 }
 
 void vp9_recon2b_c(uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr,
                    int stride) {
-  recon(4, 8, pred_ptr, stride, diff_ptr, 8, dst_ptr, stride);
+  assert(pred_ptr == dst_ptr);
+  recon(4, 8, diff_ptr, 8, dst_ptr, stride);
+}
+
+static void recon_plane(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize, int plane) {
+  const int bw = 4 << (b_width_log2(bsize) - xd->plane[plane].subsampling_x);
+  const int bh = 4 << (b_height_log2(bsize) - xd->plane[plane].subsampling_y);
+  recon(bh, bw,
+        xd->plane[plane].diff, bw,
+        xd->plane[plane].dst.buf, xd->plane[plane].dst.stride);
 }
 
 void vp9_recon_sby_c(MACROBLOCKD *mb, BLOCK_SIZE_TYPE bsize) {
-  const int bw = 16 << mb_width_log2(bsize), bh = 16 << mb_height_log2(bsize);
-  int x, y;
-  const int stride = mb->plane[0].dst.stride;
-  uint8_t *dst = mb->plane[0].dst.buf;
-  const int16_t *diff = mb->plane[0].diff;
-
-  for (y = 0; y < bh; y++) {
-    for (x = 0; x < bw; x++)
-      dst[x] = clip_pixel(dst[x] + diff[x]);
-
-    dst += stride;
-    diff += bw;
-  }
+  recon_plane(mb, bsize, 0);
 }
 
 void vp9_recon_sbuv_c(MACROBLOCKD *mb, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = mb_width_log2(bsize), bhl = mb_height_log2(bsize);
-  const int bw = 8 << bwl, bh = 8 << bhl;
-  int x, y;
-  const int stride =  mb->plane[1].dst.stride;
-  uint8_t *u_dst = mb->plane[1].dst.buf;
-  uint8_t *v_dst = mb->plane[2].dst.buf;
-  const int16_t *u_diff = mb->plane[1].diff;
-  const int16_t *v_diff = mb->plane[2].diff;
+  int i;
 
-  for (y = 0; y < bh; y++) {
-    for (x = 0; x < bw; x++) {
-      u_dst[x] = clip_pixel(u_dst[x] + u_diff[x]);
-      v_dst[x] = clip_pixel(v_dst[x] + v_diff[x]);
-    }
-
-    u_dst += stride;
-    v_dst += stride;
-    u_diff += bw;
-    v_diff += bw;
-  }
+  for (i = 1; i < MAX_MB_PLANE; i++)
+    recon_plane(mb, bsize, i);
 }
 
 void vp9_recon_sb_c(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index c5b677f0..c2e0f2f 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -57,127 +57,6 @@
   // applied in one direction only, and not at all for 0,0, seems to give the
   // best quality, but it may be worth trying an additional mode that does
   // do the filtering on full-pel.
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
-  if (scale->x_step_q4 == 16) {
-    if (scale->y_step_q4 == 16) {
-      // No scaling in either direction.
-      scale->predict[0][0][0] = vp9_convolve_copy;
-      scale->predict[0][0][1] = vp9_convolve_1by8;
-      scale->predict[0][0][2] = vp9_convolve_qtr;
-      scale->predict[0][0][3] = vp9_convolve_3by8;
-      scale->predict[0][0][4] = vp9_convolve_avg;
-      scale->predict[0][0][5] = vp9_convolve_5by8;
-      scale->predict[0][0][6] = vp9_convolve_3qtr;
-      scale->predict[0][0][7] = vp9_convolve_7by8;
-      scale->predict[0][1][0] = vp9_convolve8_vert;
-      scale->predict[0][1][1] = vp9_convolve8_1by8_vert;
-      scale->predict[0][1][2] = vp9_convolve8_qtr_vert;
-      scale->predict[0][1][3] = vp9_convolve8_3by8_vert;
-      scale->predict[0][1][4] = vp9_convolve8_avg_vert;
-      scale->predict[0][1][5] = vp9_convolve8_5by8_vert;
-      scale->predict[0][1][6] = vp9_convolve8_3qtr_vert;
-      scale->predict[0][1][7] = vp9_convolve8_7by8_vert;
-      scale->predict[1][0][0] = vp9_convolve8_horiz;
-      scale->predict[1][0][1] = vp9_convolve8_1by8_horiz;
-      scale->predict[1][0][2] = vp9_convolve8_qtr_horiz;
-      scale->predict[1][0][3] = vp9_convolve8_3by8_horiz;
-      scale->predict[1][0][4] = vp9_convolve8_avg_horiz;
-      scale->predict[1][0][5] = vp9_convolve8_5by8_horiz;
-      scale->predict[1][0][6] = vp9_convolve8_3qtr_horiz;
-      scale->predict[1][0][7] = vp9_convolve8_7by8_horiz;
-    } else {
-      // No scaling in x direction. Must always scale in the y direction.
-      scale->predict[0][0][0] = vp9_convolve8_vert;
-      scale->predict[0][0][1] = vp9_convolve8_1by8_vert;
-      scale->predict[0][0][2] = vp9_convolve8_qtr_vert;
-      scale->predict[0][0][3] = vp9_convolve8_3by8_vert;
-      scale->predict[0][0][4] = vp9_convolve8_avg_vert;
-      scale->predict[0][0][5] = vp9_convolve8_5by8_vert;
-      scale->predict[0][0][6] = vp9_convolve8_3qtr_vert;
-      scale->predict[0][0][7] = vp9_convolve8_7by8_vert;
-      scale->predict[0][1][0] = vp9_convolve8_vert;
-      scale->predict[0][1][1] = vp9_convolve8_1by8_vert;
-      scale->predict[0][1][2] = vp9_convolve8_qtr_vert;
-      scale->predict[0][1][3] = vp9_convolve8_3by8_vert;
-      scale->predict[0][1][4] = vp9_convolve8_avg_vert;
-      scale->predict[0][1][5] = vp9_convolve8_5by8_vert;
-      scale->predict[0][1][6] = vp9_convolve8_3qtr_vert;
-      scale->predict[0][1][7] = vp9_convolve8_7by8_vert;
-      scale->predict[1][0][0] = vp9_convolve8;
-      scale->predict[1][0][1] = vp9_convolve8_1by8;
-      scale->predict[1][0][2] = vp9_convolve8_qtr;
-      scale->predict[1][0][3] = vp9_convolve8_3by8;
-      scale->predict[1][0][4] = vp9_convolve8_avg;
-      scale->predict[1][0][5] = vp9_convolve8_5by8;
-      scale->predict[1][0][6] = vp9_convolve8_3qtr;
-      scale->predict[1][0][7] = vp9_convolve8_7by8;
-    }
-  } else {
-    if (scale->y_step_q4 == 16) {
-      // No scaling in the y direction. Must always scale in the x direction.
-      scale->predict[0][0][0] = vp9_convolve8_horiz;
-      scale->predict[0][0][1] = vp9_convolve8_1by8_horiz;
-      scale->predict[0][0][2] = vp9_convolve8_qtr_horiz;
-      scale->predict[0][0][3] = vp9_convolve8_3by8_horiz;
-      scale->predict[0][0][4] = vp9_convolve8_avg_horiz;
-      scale->predict[0][0][5] = vp9_convolve8_5by8_horiz;
-      scale->predict[0][0][6] = vp9_convolve8_3qtr_horiz;
-      scale->predict[0][0][7] = vp9_convolve8_7by8_horiz;
-      scale->predict[0][1][0] = vp9_convolve8;
-      scale->predict[0][1][1] = vp9_convolve8_1by8;
-      scale->predict[0][1][2] = vp9_convolve8_qtr;
-      scale->predict[0][1][3] = vp9_convolve8_3by8;
-      scale->predict[0][1][4] = vp9_convolve8_avg;
-      scale->predict[0][1][5] = vp9_convolve8_5by8;
-      scale->predict[0][1][6] = vp9_convolve8_3qtr;
-      scale->predict[0][1][7] = vp9_convolve8_7by8;
-      scale->predict[1][0][0] = vp9_convolve8_horiz;
-      scale->predict[1][0][1] = vp9_convolve8_1by8_horiz;
-      scale->predict[1][0][2] = vp9_convolve8_qtr_horiz;
-      scale->predict[1][0][3] = vp9_convolve8_3by8_horiz;
-      scale->predict[1][0][4] = vp9_convolve8_avg_horiz;
-      scale->predict[1][0][5] = vp9_convolve8_5by8_horiz;
-      scale->predict[1][0][6] = vp9_convolve8_3qtr_horiz;
-      scale->predict[1][0][7] = vp9_convolve8_7by8_horiz;
-    } else {
-      // Must always scale in both directions.
-      scale->predict[0][0][0] = vp9_convolve8;
-      scale->predict[0][0][1] = vp9_convolve8_1by8;
-      scale->predict[0][0][2] = vp9_convolve8_qtr;
-      scale->predict[0][0][3] = vp9_convolve8_3by8;
-      scale->predict[0][0][4] = vp9_convolve8_avg;
-      scale->predict[0][0][5] = vp9_convolve8_5by8;
-      scale->predict[0][0][6] = vp9_convolve8_3qtr;
-      scale->predict[0][0][7] = vp9_convolve8_7by8;
-      scale->predict[0][1][0] = vp9_convolve8;
-      scale->predict[0][1][1] = vp9_convolve8_1by8;
-      scale->predict[0][1][2] = vp9_convolve8_qtr;
-      scale->predict[0][1][3] = vp9_convolve8_3by8;
-      scale->predict[0][1][4] = vp9_convolve8_avg;
-      scale->predict[0][1][5] = vp9_convolve8_5by8;
-      scale->predict[0][1][6] = vp9_convolve8_3qtr;
-      scale->predict[0][1][7] = vp9_convolve8_7by8;
-      scale->predict[1][0][0] = vp9_convolve8;
-      scale->predict[1][0][1] = vp9_convolve8_1by8;
-      scale->predict[1][0][2] = vp9_convolve8_qtr;
-      scale->predict[1][0][3] = vp9_convolve8_3by8;
-      scale->predict[1][0][4] = vp9_convolve8_avg;
-      scale->predict[1][0][5] = vp9_convolve8_5by8;
-      scale->predict[1][0][6] = vp9_convolve8_3qtr;
-      scale->predict[1][0][7] = vp9_convolve8_7by8;
-    }
-  }
-  // 2D subpel motion always gets filtered in both directions
-  scale->predict[1][1][0] = vp9_convolve8;
-  scale->predict[1][1][1] = vp9_convolve8_1by8;
-  scale->predict[1][1][2] = vp9_convolve8_qtr;
-  scale->predict[1][1][3] = vp9_convolve8_3by8;
-  scale->predict[1][1][4] = vp9_convolve8_avg;
-  scale->predict[1][1][5] = vp9_convolve8_5by8;
-  scale->predict[1][1][6] = vp9_convolve8_3qtr;
-  scale->predict[1][1][7] = vp9_convolve8_7by8;
-}
-#else
   if (scale->x_step_q4 == 16) {
     if (scale->y_step_q4 == 16) {
       // No scaling in either direction.
@@ -219,7 +98,6 @@
   scale->predict[1][1][0] = vp9_convolve8;
   scale->predict[1][1][1] = vp9_convolve8_avg;
 }
-#endif
 
 void vp9_setup_interp_filters(MACROBLOCKD *xd,
                               INTERPOLATIONFILTERTYPE mcomp_filter_type,
@@ -383,60 +261,6 @@
       w, h);
 }
 
-static void build_2x1_inter_predictor_wh(const BLOCKD *d0, const BLOCKD *d1,
-                                         struct scale_factors *s,
-                                         uint8_t *predictor,
-                                         int block_size, int stride,
-                                         int which_mv, int weight,
-                                         int width, int height,
-                                         const struct subpix_fn_table *subpix,
-                                         int row, int col) {
-  struct scale_factors * scale = &s[which_mv];
-
-  assert(d1->dst - d0->dst == block_size);
-  assert(d1->pre == d0->pre + block_size);
-
-  scale->set_scaled_offsets(scale, row, col);
-
-  if (d0->bmi.as_mv[which_mv].as_int == d1->bmi.as_mv[which_mv].as_int) {
-    uint8_t **base_pre = which_mv ? d0->base_second_pre : d0->base_pre;
-
-    vp9_build_inter_predictor(*base_pre + d0->pre,
-                              d0->pre_stride,
-                              predictor, stride,
-                              &d0->bmi.as_mv[which_mv],
-                              scale,
-                              width, height,
-                              weight, subpix);
-
-  } else {
-    uint8_t **base_pre0 = which_mv ? d0->base_second_pre : d0->base_pre;
-    uint8_t **base_pre1 = which_mv ? d1->base_second_pre : d1->base_pre;
-
-    vp9_build_inter_predictor(*base_pre0 + d0->pre,
-                              d0->pre_stride,
-                              predictor, stride,
-                              &d0->bmi.as_mv[which_mv],
-                              scale,
-                              width > block_size ? block_size : width, height,
-                              weight, subpix);
-
-    if (width <= block_size) return;
-
-    scale->set_scaled_offsets(scale, row, col + block_size);
-
-    vp9_build_inter_predictor(*base_pre1 + d1->pre,
-                              d1->pre_stride,
-                              predictor + block_size, stride,
-                              &d1->bmi.as_mv[which_mv],
-                              scale,
-                              width - block_size, height,
-                              weight, subpix);
-  }
-}
-
-#if !CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
-
 static INLINE int round_mv_comp_q4(int value) {
   return (value < 0 ? value - 2 : value + 2) / 4;
 }
@@ -575,16 +399,15 @@
   }
 }
 void vp9_build_inter_predictors_sby(MACROBLOCKD *xd,
-                                    uint8_t *dst_y,
-                                    int dst_ystride,
                                     int mb_row,
                                     int mb_col,
                                     BLOCK_SIZE_TYPE bsize) {
   struct build_inter_predictors_args args = {
     xd, mb_col * 16, mb_row * 16,
-    {dst_y, NULL, NULL}, {dst_ystride, 0, 0},
-    {{xd->pre.y_buffer, NULL, NULL}, {xd->second_pre.y_buffer, NULL, NULL}},
-    {{xd->pre.y_stride, 0, 0}, {xd->second_pre.y_stride, 0, 0}},
+    {xd->plane[0].dst.buf, NULL, NULL}, {xd->plane[0].dst.stride, 0, 0},
+    {{xd->plane[0].pre[0].buf, NULL, NULL},
+     {xd->plane[0].pre[1].buf, NULL, NULL}},
+    {{xd->plane[0].pre[0].stride, 0, 0}, {xd->plane[0].pre[1].stride, 0, 0}},
   };
 
   // TODO(jkoleszar): This is a hack no matter where you put it, but does it
@@ -595,613 +418,41 @@
   foreach_predicted_block_in_plane(xd, bsize, 0, build_inter_predictors, &args);
 }
 void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd,
-                                     uint8_t *dst_u,
-                                     uint8_t *dst_v,
-                                     int dst_uvstride,
                                      int mb_row,
                                      int mb_col,
                                      BLOCK_SIZE_TYPE bsize) {
   struct build_inter_predictors_args args = {
     xd, mb_col * 16, mb_row * 16,
-    {NULL, dst_u, dst_v}, {0, dst_uvstride, dst_uvstride},
-    {{NULL, xd->pre.u_buffer, xd->pre.v_buffer},
-     {NULL, xd->second_pre.u_buffer, xd->second_pre.v_buffer}},
-    {{0, xd->pre.uv_stride, xd->pre.uv_stride},
-     {0, xd->second_pre.uv_stride, xd->second_pre.uv_stride}},
+    {NULL, xd->plane[1].dst.buf, xd->plane[2].dst.buf},
+    {0, xd->plane[1].dst.stride, xd->plane[1].dst.stride},
+    {{NULL, xd->plane[1].pre[0].buf, xd->plane[2].pre[0].buf},
+     {NULL, xd->plane[1].pre[1].buf, xd->plane[2].pre[1].buf}},
+    {{0, xd->plane[1].pre[0].stride, xd->plane[1].pre[0].stride},
+     {0, xd->plane[1].pre[1].stride, xd->plane[1].pre[1].stride}},
   };
   foreach_predicted_block_uv(xd, bsize, build_inter_predictors, &args);
 }
 void vp9_build_inter_predictors_sb(MACROBLOCKD *xd,
                                    int mb_row, int mb_col,
                                    BLOCK_SIZE_TYPE bsize) {
+#if CONFIG_COMP_INTERINTRA_PRED
   uint8_t *const y = xd->plane[0].dst.buf;
   uint8_t *const u = xd->plane[1].dst.buf;
   uint8_t *const v = xd->plane[2].dst.buf;
   const int y_stride = xd->plane[0].dst.stride;
   const int uv_stride = xd->plane[1].dst.stride;
+#endif
 
-  vp9_build_inter_predictors_sby(xd, y, y_stride, mb_row, mb_col, bsize);
-  vp9_build_inter_predictors_sbuv(xd, u, v, uv_stride, mb_row, mb_col, bsize);
+  vp9_build_inter_predictors_sby(xd, mb_row, mb_col, bsize);
+  vp9_build_inter_predictors_sbuv(xd, mb_row, mb_col, bsize);
+
 #if CONFIG_COMP_INTERINTRA_PRED
-  if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) {
-    if (bsize == BLOCK_SIZE_SB32X32)
-      vp9_build_interintra_32x32_predictors_sb(xd, y, u, v,
-                                               y_stride, uv_stride);
-    else
-      vp9_build_interintra_64x64_predictors_sb(xd, y, u, v,
-                                               y_stride, uv_stride);
-  }
+  if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME)
+    vp9_build_interintra_predictors(xd, y, u, v,
+                                    y_stride, uv_stride,
+                                    bsize);
 #endif
 }
-#endif  // !CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
-
-#define AVERAGE_WEIGHT  (1 << (2 * CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT))
-
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
-
-static void clamp_mv_to_umv_border(MV *mv, const MACROBLOCKD *xd) {
-  /* If the MV points so far into the UMV border that no visible pixels
-   * are used for reconstruction, the subpel part of the MV can be
-   * discarded and the MV limited to 16 pixels with equivalent results.
-   *
-   * This limit kicks in at 19 pixels for the top and left edges, for
-   * the 16 pixels plus 3 taps right of the central pixel when subpel
-   * filtering. The bottom and right edges use 16 pixels plus 2 pixels
-   * left of the central pixel when filtering.
-   */
-  if (mv->col < (xd->mb_to_left_edge - ((16 + VP9_INTERP_EXTEND) << 3)))
-    mv->col = xd->mb_to_left_edge - (16 << 3);
-  else if (mv->col > xd->mb_to_right_edge + ((15 + VP9_INTERP_EXTEND) << 3))
-    mv->col = xd->mb_to_right_edge + (16 << 3);
-
-  if (mv->row < (xd->mb_to_top_edge - ((16 + VP9_INTERP_EXTEND) << 3)))
-    mv->row = xd->mb_to_top_edge - (16 << 3);
-  else if (mv->row > xd->mb_to_bottom_edge + ((15 + VP9_INTERP_EXTEND) << 3))
-    mv->row = xd->mb_to_bottom_edge + (16 << 3);
-}
-
-// Whether to use implicit weighting for UV
-#define USE_IMPLICIT_WEIGHT_UV
-
-// Whether to use implicit weighting for SplitMV
-// #define USE_IMPLICIT_WEIGHT_SPLITMV
-
-// #define SEARCH_MIN3
-static int64_t get_consistency_metric(MACROBLOCKD *xd,
-                                      uint8_t *tmp_y, int tmp_ystride) {
-  int block_size = 16 <<  xd->mode_info_context->mbmi.sb_type;
-  uint8_t *rec_y = xd->plane[0].dst.buf;
-  int rec_ystride = xd->plane[0].dst.stride;
-  int64_t metric = 0;
-  int i;
-  if (xd->up_available) {
-    for (i = 0; i < block_size; ++i) {
-      int diff = abs(*(rec_y - rec_ystride + i) -
-                     *(tmp_y + i));
-#ifdef SEARCH_MIN3
-      // Searches for the min abs diff among 3 pixel neighbors in the border
-      int diff1 = xd->left_available ?
-          abs(*(rec_y - rec_ystride + i - 1) - *(tmp_y + i)) : diff;
-      int diff2 = i < block_size - 1 ?
-          abs(*(rec_y - rec_ystride + i + 1) - *(tmp_y + i)) : diff;
-      diff = diff <= diff1 ? diff : diff1;
-      diff = diff <= diff2 ? diff : diff2;
-#endif
-      metric += diff;
-    }
-  }
-  if (xd->left_available) {
-    for (i = 0; i < block_size; ++i) {
-      int diff = abs(*(rec_y - 1 + i * rec_ystride) -
-                     *(tmp_y + i * tmp_ystride));
-#ifdef SEARCH_MIN3
-      // Searches for the min abs diff among 3 pixel neighbors in the border
-      int diff1 = xd->up_available ?
-          abs(*(rec_y - 1 + (i - 1) * rec_ystride) -
-                      *(tmp_y + i * tmp_ystride)) : diff;
-      int diff2 = i < block_size - 1 ?
-          abs(*(rec_y - 1 + (i + 1) * rec_ystride) -
-              *(tmp_y + i * tmp_ystride)) : diff;
-      diff = diff <= diff1 ? diff : diff1;
-      diff = diff <= diff2 ? diff : diff2;
-#endif
-      metric += diff;
-    }
-  }
-  return metric;
-}
-
-static int get_weight(MACROBLOCKD *xd, int64_t metric_1, int64_t metric_2) {
-  int weight = AVERAGE_WEIGHT;
-  if (2 * metric_1 < metric_2)
-    weight = 6;
-  else if (4 * metric_1 < 3 * metric_2)
-    weight = 5;
-  else if (2 * metric_2 < metric_1)
-    weight = 2;
-  else if (4 * metric_2 < 3 * metric_1)
-    weight = 3;
-  return weight;
-}
-
-#ifdef USE_IMPLICIT_WEIGHT_SPLITMV
-static int get_implicit_compoundinter_weight_splitmv(
-    MACROBLOCKD *xd, int mb_row, int mb_col) {
-  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
-  BLOCKD *blockd = xd->block;
-  const int use_second_ref = mbmi->second_ref_frame > 0;
-  int64_t metric_2 = 0, metric_1 = 0;
-  int i, which_mv, weight;
-  uint8_t tmp_y[256];
-  const int tmp_ystride = 16;
-
-  if (!use_second_ref) return 0;
-  if (!(xd->up_available || xd->left_available))
-    return AVERAGE_WEIGHT;
-
-  assert(xd->mode_info_context->mbmi.mode == SPLITMV);
-
-  which_mv = 1;  // second predictor
-  if (xd->mode_info_context->mbmi.partitioning != PARTITIONING_4X4) {
-    for (i = 0; i < 16; i += 8) {
-      BLOCKD *d0 = &blockd[i];
-      BLOCKD *d1 = &blockd[i + 2];
-      const int y = i & 8;
-
-      blockd[i + 0].bmi = xd->mode_info_context->bmi[i + 0];
-      blockd[i + 2].bmi = xd->mode_info_context->bmi[i + 2];
-
-      if (mbmi->need_to_clamp_mvs) {
-        clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv[which_mv].as_mv, xd);
-        clamp_mv_to_umv_border(&blockd[i + 2].bmi.as_mv[which_mv].as_mv, xd);
-      }
-      if (i == 0) {
-        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y, 8, 16,
-                                     which_mv, 0, 16, 1,
-                                     &xd->subpix, mb_row * 16 + y, mb_col * 16);
-        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y, 8, 16,
-                                     which_mv, 0, 1, 8,
-                                     &xd->subpix, mb_row * 16 + y, mb_col * 16);
-      } else {
-        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y + 8 * 16,
-                                     8, 16, which_mv, 0, 1, 8,
-                                     &xd->subpix, mb_row * 16 + y, mb_col * 16);
-      }
-    }
-  } else {
-    for (i = 0; i < 16; i += 2) {
-      BLOCKD *d0 = &blockd[i];
-      BLOCKD *d1 = &blockd[i + 1];
-      const int x = (i & 3) * 4;
-      const int y = (i >> 2) * 4;
-
-      blockd[i + 0].bmi = xd->mode_info_context->bmi[i + 0];
-      blockd[i + 1].bmi = xd->mode_info_context->bmi[i + 1];
-
-      if (i >= 4 && (i & 3) != 0) continue;
-
-      if (i == 0) {
-        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y, 4, 16,
-                                     which_mv, 0, 8, 1, &xd->subpix,
-                                     mb_row * 16 + y, mb_col * 16 + x);
-        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y, 4, 16,
-                                     which_mv, 0, 1, 4, &xd->subpix,
-                                     mb_row * 16 + y, mb_col * 16 + x);
-      } else if (i < 4) {
-        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y + x, 4, 16,
-                                     which_mv, 0, 8, 1, &xd->subpix,
-                                     mb_row * 16 + y, mb_col * 16 + x);
-      } else {
-        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y + y * 16,
-                                     4, 16, which_mv, 0, 1, 4, &xd->subpix,
-                                     mb_row * 16 + y, mb_col * 16 + x);
-      }
-    }
-  }
-  metric_2 = get_consistency_metric(xd, tmp_y, tmp_ystride);
-
-  which_mv = 0;  // first predictor
-  if (xd->mode_info_context->mbmi.partitioning != PARTITIONING_4X4) {
-    for (i = 0; i < 16; i += 8) {
-      BLOCKD *d0 = &blockd[i];
-      BLOCKD *d1 = &blockd[i + 2];
-      const int y = i & 8;
-
-      blockd[i + 0].bmi = xd->mode_info_context->bmi[i + 0];
-      blockd[i + 2].bmi = xd->mode_info_context->bmi[i + 2];
-
-      if (mbmi->need_to_clamp_mvs) {
-        clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv[which_mv].as_mv, xd);
-        clamp_mv_to_umv_border(&blockd[i + 2].bmi.as_mv[which_mv].as_mv, xd);
-      }
-      if (i == 0) {
-        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y, 8, 16,
-                                     which_mv, 0, 16, 1,
-                                     &xd->subpix, mb_row * 16 + y, mb_col * 16);
-        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y, 8, 16,
-                                     which_mv, 0, 1, 8,
-                                     &xd->subpix, mb_row * 16 + y, mb_col * 16);
-      } else {
-        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y + 8 * 16,
-                                     8, 16, which_mv, 0, 1, 8,
-                                     &xd->subpix, mb_row * 16 + y, mb_col * 16);
-      }
-    }
-  } else {
-    for (i = 0; i < 16; i += 2) {
-      BLOCKD *d0 = &blockd[i];
-      BLOCKD *d1 = &blockd[i + 1];
-      const int x = (i & 3) * 4;
-      const int y = (i >> 2) * 4;
-
-      blockd[i + 0].bmi = xd->mode_info_context->bmi[i + 0];
-      blockd[i + 1].bmi = xd->mode_info_context->bmi[i + 1];
-
-      if (i >= 4 && (i & 3) != 0) continue;
-
-      if (i == 0) {
-        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y, 4, 16,
-                                     which_mv, 0, 8, 1, &xd->subpix,
-                                     mb_row * 16 + y, mb_col * 16 + x);
-        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y, 4, 16,
-                                     which_mv, 0, 1, 4, &xd->subpix,
-                                     mb_row * 16 + y, mb_col * 16 + x);
-      } else if (i < 4) {
-        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y + x, 4, 16,
-                                     which_mv, 0, 8, 1, &xd->subpix,
-                                     mb_row * 16 + y, mb_col * 16 + x);
-      } else {
-        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y + y * 16,
-                                     4, 16, which_mv, 0, 1, 4, &xd->subpix,
-                                     mb_row * 16 + y, mb_col * 16 + x);
-      }
-    }
-  }
-  metric_1 = get_consistency_metric(xd, tmp_y, tmp_ystride);
-
-  // Choose final weight for averaging
-  weight = get_weight(xd, metric_1, metric_2);
-  return weight;
-}
-#endif
-
-static int get_implicit_compoundinter_weight(MACROBLOCKD *xd,
-                                             int mb_row,
-                                             int mb_col) {
-  const int use_second_ref = xd->mode_info_context->mbmi.second_ref_frame > 0;
-  int64_t metric_2 = 0, metric_1 = 0;
-  int n, clamp_mvs, pre_stride;
-  uint8_t *base_pre;
-  int_mv ymv;
-  uint8_t tmp_y[4096];
-  const int tmp_ystride = 64;
-  int weight;
-  int edge[4];
-  int block_size = 16 <<  xd->mode_info_context->mbmi.sb_type;
-  struct scale_factors *scale;
-
-  if (!use_second_ref) return 0;
-  if (!(xd->up_available || xd->left_available))
-    return AVERAGE_WEIGHT;
-
-  edge[0] = xd->mb_to_top_edge;
-  edge[1] = xd->mb_to_bottom_edge;
-  edge[2] = xd->mb_to_left_edge;
-  edge[3] = xd->mb_to_right_edge;
-
-  clamp_mvs = xd->mode_info_context->mbmi.need_to_clamp_secondmv;
-  base_pre = xd->second_pre.y_buffer;
-  pre_stride = xd->second_pre.y_stride;
-  ymv.as_int = xd->mode_info_context->mbmi.mv[1].as_int;
-  // First generate the second predictor
-  scale = &xd->scale_factor[1];
-  for (n = 0; n < block_size; n += 16) {
-    xd->mb_to_left_edge   = edge[2] - (n << 3);
-    xd->mb_to_right_edge  = edge[3] + ((16 - n) << 3);
-    if (clamp_mvs)
-      clamp_mv_to_umv_border(&ymv.as_mv, xd);
-    scale->set_scaled_offsets(scale, mb_row * 16, mb_col * 16 + n);
-    // predict a single row of pixels
-    vp9_build_inter_predictor(base_pre +
-        scaled_buffer_offset(n, 0, pre_stride, scale),
-        pre_stride, tmp_y + n, tmp_ystride, &ymv, scale, 16, 1, 0, &xd->subpix);
-  }
-  xd->mb_to_left_edge = edge[2];
-  xd->mb_to_right_edge = edge[3];
-  for (n = 0; n < block_size; n += 16) {
-    xd->mb_to_top_edge    = edge[0] - (n << 3);
-    xd->mb_to_bottom_edge = edge[1] + ((16 - n) << 3);
-    if (clamp_mvs)
-      clamp_mv_to_umv_border(&ymv.as_mv, xd);
-    scale->set_scaled_offsets(scale, mb_row * 16 + n, mb_col * 16);
-    // predict a single col of pixels
-    vp9_build_inter_predictor(base_pre +
-        scaled_buffer_offset(0, n, pre_stride, scale),
-        pre_stride, tmp_y + n * tmp_ystride, tmp_ystride, &ymv,
-        scale, 1, 16, 0, &xd->subpix);
-  }
-  xd->mb_to_top_edge = edge[0];
-  xd->mb_to_bottom_edge = edge[1];
-  // Compute consistency metric
-  metric_2 = get_consistency_metric(xd, tmp_y, tmp_ystride);
-
-  clamp_mvs = xd->mode_info_context->mbmi.need_to_clamp_mvs;
-  base_pre = xd->pre.y_buffer;
-  pre_stride = xd->pre.y_stride;
-  ymv.as_int = xd->mode_info_context->mbmi.mv[0].as_int;
-  // Now generate the first predictor
-  scale = &xd->scale_factor[0];
-  for (n = 0; n < block_size; n += 16) {
-    xd->mb_to_left_edge   = edge[2] - (n << 3);
-    xd->mb_to_right_edge  = edge[3] + ((16 - n) << 3);
-    if (clamp_mvs)
-      clamp_mv_to_umv_border(&ymv.as_mv, xd);
-    scale->set_scaled_offsets(scale, mb_row * 16, mb_col * 16 + n);
-    // predict a single row of pixels
-    vp9_build_inter_predictor(base_pre +
-        scaled_buffer_offset(n, 0, pre_stride, scale),
-        pre_stride, tmp_y + n, tmp_ystride, &ymv, scale, 16, 1, 0, &xd->subpix);
-  }
-  xd->mb_to_left_edge = edge[2];
-  xd->mb_to_right_edge = edge[3];
-  for (n = 0; n < block_size; n += 16) {
-    xd->mb_to_top_edge    = edge[0] - (n << 3);
-    xd->mb_to_bottom_edge = edge[1] + ((16 - n) << 3);
-    if (clamp_mvs)
-      clamp_mv_to_umv_border(&ymv.as_mv, xd);
-    scale->set_scaled_offsets(scale, mb_row * 16 + n, mb_col * 16);
-    // predict a single col of pixels
-    vp9_build_inter_predictor(base_pre +
-        scaled_buffer_offset(0, n, pre_stride, scale),
-        pre_stride, tmp_y + n * tmp_ystride, tmp_ystride, &ymv,
-        scale, 1, 16, 0, &xd->subpix);
-  }
-  xd->mb_to_top_edge = edge[0];
-  xd->mb_to_bottom_edge = edge[1];
-  metric_1 = get_consistency_metric(xd, tmp_y, tmp_ystride);
-
-  // Choose final weight for averaging
-  weight = get_weight(xd, metric_1, metric_2);
-  return weight;
-}
-
-static void build_inter16x16_predictors_mby_w(MACROBLOCKD *xd,
-                                              uint8_t *dst_y,
-                                              int dst_ystride,
-                                              int weight,
-                                              int mb_row,
-                                              int mb_col) {
-  const int use_second_ref = xd->mode_info_context->mbmi.second_ref_frame > 0;
-  int which_mv;
-
-  for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
-    const int clamp_mvs = which_mv ?
-        xd->mode_info_context->mbmi.need_to_clamp_secondmv :
-         xd->mode_info_context->mbmi.need_to_clamp_mvs;
-
-    uint8_t *base_pre = which_mv ? xd->second_pre.y_buffer : xd->pre.y_buffer;
-    int pre_stride = which_mv ? xd->second_pre.y_stride : xd->pre.y_stride;
-    int_mv ymv;
-    struct scale_factors *scale = &xd->scale_factor[which_mv];
-
-    ymv.as_int = xd->mode_info_context->mbmi.mv[which_mv].as_int;
-
-    if (clamp_mvs)
-      clamp_mv_to_umv_border(&ymv.as_mv, xd);
-
-    scale->set_scaled_offsets(scale, mb_row * 16, mb_col * 16);
-
-    vp9_build_inter_predictor(base_pre, pre_stride, dst_y, dst_ystride,
-                              &ymv, scale, 16, 16,
-                              which_mv ? weight : 0, &xd->subpix);
-  }
-}
-
-static void build_inter16x16_predictors_mbuv_w(MACROBLOCKD *xd,
-                                               uint8_t *dst_u,
-                                               uint8_t *dst_v,
-                                               int dst_uvstride,
-                                               int weight,
-                                               int mb_row,
-                                               int mb_col) {
-  const int use_second_ref = xd->mode_info_context->mbmi.second_ref_frame > 0;
-  int which_mv;
-
-  for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
-    const int clamp_mvs =
-        which_mv ? xd->mode_info_context->mbmi.need_to_clamp_secondmv
-                 : xd->mode_info_context->mbmi.need_to_clamp_mvs;
-    uint8_t *uptr, *vptr;
-    int pre_stride = which_mv ? xd->second_pre.uv_stride
-                              : xd->pre.uv_stride;
-    int_mv mv;
-
-    struct scale_factors *scale = &xd->scale_factor_uv[which_mv];
-    mv.as_int = xd->mode_info_context->mbmi.mv[which_mv].as_int;
-
-
-    if (clamp_mvs)
-      clamp_mv_to_umv_border(&mv.as_mv, xd);
-
-    uptr = (which_mv ? xd->second_pre.u_buffer : xd->pre.u_buffer);
-    vptr = (which_mv ? xd->second_pre.v_buffer : xd->pre.v_buffer);
-
-    scale->set_scaled_offsets(scale, mb_row * 16, mb_col * 16);
-
-    vp9_build_inter_predictor_q4(
-        uptr, pre_stride, dst_u, dst_uvstride, &mv,
-        scale, 8, 8, which_mv ? weight : 0, &xd->subpix);
-
-    vp9_build_inter_predictor_q4(
-        vptr, pre_stride, dst_v, dst_uvstride, &mv,
-        scale, 8, 8, which_mv ? weight : 0, &xd->subpix);
-  }
-}
-static void build_inter_predictors_sby_w(MACROBLOCKD *x,
-                                         uint8_t *dst_y,
-                                         int dst_ystride,
-                                         int weight,
-                                         int mb_row,
-                                         int mb_col,
-                                         BLOCK_SIZE_TYPE bsize) {
-  const int bwl = mb_width_log2(bsize),  bw = 1 << bwl;
-  const int bhl = mb_height_log2(bsize), bh = 1 << bhl;
-  uint8_t *y1 = x->pre.y_buffer;
-  uint8_t *y2 = x->second_pre.y_buffer;
-  int edge[4], n;
-
-  edge[0] = x->mb_to_top_edge;
-  edge[1] = x->mb_to_bottom_edge;
-  edge[2] = x->mb_to_left_edge;
-  edge[3] = x->mb_to_right_edge;
-
-  for (n = 0; n < bw * bh; n++) {
-    const int x_idx = n & (bw - 1), y_idx = n >> bwl;
-
-    x->mb_to_top_edge    = edge[0] -           ((y_idx  * 16) << 3);
-    x->mb_to_bottom_edge = edge[1] + (((bh - 1 - y_idx) * 16) << 3);
-    x->mb_to_left_edge   = edge[2] -           ((x_idx  * 16) << 3);
-    x->mb_to_right_edge  = edge[3] + (((bw - 1 - x_idx) * 16) << 3);
-
-    x->pre.y_buffer = y1 + scaled_buffer_offset(x_idx * 16,
-                                                y_idx * 16,
-                                                x->pre.y_stride,
-                                                &x->scale_factor[0]);
-    if (x->mode_info_context->mbmi.second_ref_frame > 0) {
-      x->second_pre.y_buffer = y2 +
-          scaled_buffer_offset(x_idx * 16,
-                               y_idx * 16,
-                               x->second_pre.y_stride,
-                               &x->scale_factor[1]);
-    }
-    build_inter16x16_predictors_mby_w(x,
-        dst_y + y_idx * 16 * dst_ystride  + x_idx * 16,
-        dst_ystride, weight, mb_row + y_idx, mb_col + x_idx);
-  }
-  x->mb_to_top_edge    = edge[0];
-  x->mb_to_bottom_edge = edge[1];
-  x->mb_to_left_edge   = edge[2];
-  x->mb_to_right_edge  = edge[3];
-
-  x->pre.y_buffer = y1;
-  if (x->mode_info_context->mbmi.second_ref_frame > 0) {
-    x->second_pre.y_buffer = y2;
-  }
-}
-
-void vp9_build_inter_predictors_sby(MACROBLOCKD *x,
-                                         uint8_t *dst_y,
-                                         int dst_ystride,
-                                         int mb_row,
-                                         int mb_col,
-                                         BLOCK_SIZE_TYPE bsize) {
-  int weight = get_implicit_compoundinter_weight(x, mb_row, mb_col);
-  build_inter_predictors_sby_w(x, dst_y, dst_ystride, weight,
-                                    mb_row, mb_col, bsize);
-}
-
-static void build_inter_predictors_sbuv_w(MACROBLOCKD *x,
-                                          uint8_t *dst_u,
-                                          uint8_t *dst_v,
-                                          int dst_uvstride,
-                                          int weight,
-                                          int mb_row,
-                                          int mb_col,
-                                          BLOCK_SIZE_TYPE bsize) {
-  const int bwl = mb_width_log2(bsize),  bw = 1 << bwl;
-  const int bhl = mb_height_log2(bsize), bh = 1 << bhl;
-  uint8_t *u1 = x->pre.u_buffer, *v1 = x->pre.v_buffer;
-  uint8_t *u2 = x->second_pre.u_buffer, *v2 = x->second_pre.v_buffer;
-  int edge[4], n;
-
-  edge[0] = x->mb_to_top_edge;
-  edge[1] = x->mb_to_bottom_edge;
-  edge[2] = x->mb_to_left_edge;
-  edge[3] = x->mb_to_right_edge;
-
-  for (n = 0; n < bw * bh; n++) {
-    int scaled_uv_offset;
-    const int x_idx = n & (bw - 1), y_idx = n >> bwl;
-
-    x->mb_to_top_edge    = edge[0] -           ((y_idx  * 16) << 3);
-    x->mb_to_bottom_edge = edge[1] + (((bh - 1 - y_idx) * 16) << 3);
-    x->mb_to_left_edge   = edge[2] -           ((x_idx  * 16) << 3);
-    x->mb_to_right_edge  = edge[3] + (((bw - 1 - x_idx) * 16) << 3);
-
-    scaled_uv_offset = scaled_buffer_offset(x_idx * 8,
-                                            y_idx * 8,
-                                            x->pre.uv_stride,
-                                            &x->scale_factor_uv[0]);
-    x->pre.u_buffer = u1 + scaled_uv_offset;
-    x->pre.v_buffer = v1 + scaled_uv_offset;
-
-    if (x->mode_info_context->mbmi.second_ref_frame > 0) {
-      scaled_uv_offset = scaled_buffer_offset(x_idx * 8,
-                                              y_idx * 8,
-                                              x->second_pre.uv_stride,
-                                              &x->scale_factor_uv[1]);
-      x->second_pre.u_buffer = u2 + scaled_uv_offset;
-      x->second_pre.v_buffer = v2 + scaled_uv_offset;
-    }
-
-    build_inter16x16_predictors_mbuv_w(x,
-        dst_u + y_idx *  8 * dst_uvstride + x_idx *  8,
-        dst_v + y_idx *  8 * dst_uvstride + x_idx *  8,
-        dst_uvstride, weight, mb_row + y_idx, mb_col + x_idx);
-  }
-  x->mb_to_top_edge    = edge[0];
-  x->mb_to_bottom_edge = edge[1];
-  x->mb_to_left_edge   = edge[2];
-  x->mb_to_right_edge  = edge[3];
-
-  x->pre.u_buffer = u1;
-  x->pre.v_buffer = v1;
-
-  if (x->mode_info_context->mbmi.second_ref_frame > 0) {
-    x->second_pre.u_buffer = u2;
-    x->second_pre.v_buffer = v2;
-  }
-}
-
-void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd,
-                                     uint8_t *dst_u,
-                                     uint8_t *dst_v,
-                                     int dst_uvstride,
-                                     int mb_row,
-                                     int mb_col,
-                                     BLOCK_SIZE_TYPE bsize) {
-#ifdef USE_IMPLICIT_WEIGHT_UV
-  int weight = get_implicit_compoundinter_weight(xd, mb_row, mb_col);
-#else
-  int weight = AVERAGE_WEIGHT;
-#endif
-  build_inter_predictors_sbuv_w(xd, dst_u, dst_v, dst_uvstride,
-                                weight, mb_row, mb_col, bsize);
-}
-
-void vp9_build_inter_predictors_sb(MACROBLOCKD *mb,
-                                   int mb_row, int mb_col,
-                                   BLOCK_SIZE_TYPE bsize) {
-  uint8_t *const y = mb->plane[0].dst.buf;
-  uint8_t *const u = mb->plane[1].dst.buf;
-  uint8_t *const v = mb->plane[2].dst.buf;
-  const int y_stride = mb->plane[0].dst.stride;
-  const int uv_stride = mb->plane[1].dst.stride;
-
-  vp9_build_inter_predictors_sby(mb, y, y_stride, mb_row, mb_col, bsize);
-  vp9_build_inter_predictors_sbuv(mb, u, v, uv_stride, mb_row, mb_col, bsize);
-#if CONFIG_COMP_INTERINTRA_PRED
-  if (mb->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) {
-    if (bsize == BLOCK_SIZE_SB32X32)
-      vp9_build_interintra_32x32_predictors_sb(mb, y, u, v,
-                                               y_stride, uv_stride);
-    else
-      vp9_build_interintra_64x64_predictors_sb(mb, y, u, v,
-                                               y_stride, uv_stride);
-  }
-#endif
-}
-#endif  // CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
 
 static INLINE int round_mv_comp(int value) {
   return (value < 0 ? value - 2 : value + 2) / 4;
@@ -1223,20 +474,9 @@
   return round_mv_comp(temp);
 }
 
-void vp9_build_inter_predictors_mb(MACROBLOCKD *xd,
-                                   int mb_row,
-                                   int mb_col) {
-  vp9_build_inter_predictors_sb(xd, mb_row, mb_col, BLOCK_SIZE_MB16X16);
-}
-
-
 /*encoder only*/
 void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd,
                                         int mb_row, int mb_col) {
-  uint8_t *const u = xd->plane[1].dst.buf;
-  uint8_t *const v = xd->plane[2].dst.buf;
-  const int uv_stride = xd->plane[1].dst.stride;
-
-  vp9_build_inter_predictors_sbuv(xd, u, v, uv_stride, mb_row, mb_col,
+  vp9_build_inter_predictors_sbuv(xd, mb_row, mb_col,
                                   BLOCK_SIZE_MB16X16);
 }
diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h
index 8ffdfd1..51b705f 100644
--- a/vp9/common/vp9_reconinter.h
+++ b/vp9/common/vp9_reconinter.h
@@ -17,31 +17,19 @@
 struct subpix_fn_table;
 
 void vp9_build_inter_predictors_sby(MACROBLOCKD *xd,
-                                    uint8_t *dst_y,
-                                    int dst_ystride,
                                     int mb_row,
                                     int mb_col,
                                     BLOCK_SIZE_TYPE bsize);
 
 void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd,
-                                     uint8_t *dst_u,
-                                     uint8_t *dst_v,
-                                     int dst_uvstride,
                                      int mb_row,
                                      int mb_col,
                                      BLOCK_SIZE_TYPE bsize);
+
 void vp9_build_inter_predictors_sb(MACROBLOCKD *mb,
                                    int mb_row, int mb_col,
                                    BLOCK_SIZE_TYPE bsize);
 
-void vp9_build_inter_predictors_mb(MACROBLOCKD *xd,
-                                   int mb_row,
-                                   int mb_col);
-
-void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd,
-                                        int mb_row,
-                                        int mb_col);
-
 void vp9_setup_interp_filters(MACROBLOCKD *xd,
                               INTERPOLATIONFILTERTYPE filter,
                               VP9_COMMON *cm);
@@ -118,6 +106,35 @@
                    xd->plane[2].subsampling_x, xd->plane[2].subsampling_y);
 }
 
+static void setup_pre_planes(MACROBLOCKD *xd,
+                             const YV12_BUFFER_CONFIG *src0,
+                             const YV12_BUFFER_CONFIG *src1,
+                             int mb_row, int mb_col,
+                             const struct scale_factors *scale,
+                             const struct scale_factors *scale_uv) {
+  int i;
+
+  for (i = 0; i < 2; i++) {
+    const YV12_BUFFER_CONFIG *src = i ? src1 : src0;
+
+    if (!src)
+      continue;
+
+    setup_pred_plane(&xd->plane[0].pre[i],
+                     src->y_buffer, src->y_stride,
+                     mb_row, mb_col, scale ? scale + i : NULL,
+                     xd->plane[0].subsampling_x, xd->plane[0].subsampling_y);
+    setup_pred_plane(&xd->plane[1].pre[i],
+                     src->u_buffer, src->uv_stride,
+                     mb_row, mb_col, scale_uv ? scale_uv + i : NULL,
+                     xd->plane[1].subsampling_x, xd->plane[1].subsampling_y);
+    setup_pred_plane(&xd->plane[2].pre[i],
+                     src->v_buffer, src->uv_stride,
+                     mb_row, mb_col, scale_uv ? scale_uv + i : NULL,
+                     xd->plane[2].subsampling_x, xd->plane[2].subsampling_y);
+  }
+}
+
 static void setup_pred_block(YV12_BUFFER_CONFIG *dst,
                              const YV12_BUFFER_CONFIG *src,
                              int mb_row, int mb_col,
diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c
index 1031be7..4e786b0 100644
--- a/vp9/common/vp9_reconintra.c
+++ b/vp9/common/vp9_reconintra.c
@@ -227,70 +227,6 @@
   }
 }
 
-static void corner_predictor(uint8_t *ypred_ptr, int y_stride, int n,
-                             uint8_t *yabove_row,
-                             uint8_t *yleft_col) {
-  int mh, mv, maxgradh, maxgradv, x, y, nx, ny;
-  int i, j;
-  int top_left = yabove_row[-1];
-  mh = mv = 0;
-  maxgradh = yabove_row[1] - top_left;
-  maxgradv = yleft_col[1] - top_left;
-  for (i = 2; i < n; ++i) {
-    int gh = yabove_row[i] - yabove_row[i - 2];
-    int gv = yleft_col[i] - yleft_col[i - 2];
-    if (gh > maxgradh) {
-      maxgradh = gh;
-      mh = i - 1;
-    }
-    if (gv > maxgradv) {
-      maxgradv = gv;
-      mv = i - 1;
-    }
-  }
-  nx = mh + mv + 3;
-  ny = 2 * n + 1 - nx;
-
-  x = top_left;
-  for (i = 0; i <= mh; ++i) x += yabove_row[i];
-  for (i = 0; i <= mv; ++i) x += yleft_col[i];
-  x += (nx >> 1);
-  x /= nx;
-  y = 0;
-  for (i = mh + 1; i < n; ++i) y += yabove_row[i];
-  for (i = mv + 1; i < n; ++i) y += yleft_col[i];
-  y += (ny >> 1);
-  y /= ny;
-
-  for (i = 0; i < n; ++i) {
-    for (j = 0; j < n; ++j)
-      ypred_ptr[j] = (i <= mh && j <= mv ? x : y);
-    ypred_ptr += y_stride;
-  }
-}
-
-void vp9_recon_intra_mbuv(MACROBLOCKD *xd) {
-  int i;
-  for (i = 16; i < 24; i += 2) {
-    BLOCKD *b = &xd->block[i];
-    vp9_recon2b(*(b->base_dst) + b->dst, b->diff,
-                *(b->base_dst) + b->dst, b->dst_stride);
-  }
-}
-
-static INLINE int log2_minus_1(int n) {
-  switch (n) {
-    case 4: return 1;
-    case 8: return 2;
-    case 16: return 3;
-    case 32: return 4;
-    case 64: return 5;
-    default:
-      assert(0);
-      return 0;
-  }
-}
-
 void vp9_build_intra_predictors(uint8_t *src, int src_stride,
                                 uint8_t *ypred_ptr,
                                 int y_stride, int mode,
@@ -474,7 +410,7 @@
                                int interstride,
                                uint8_t *intrapred,
                                int intrastride,
-                               int size) {
+                               int bw, int bh) {
   // TODO(debargha): Explore different ways of combining predictors
   //                 or designing the tables below
   static const int scale_bits = 8;
@@ -492,6 +428,7 @@
      68,  68,  68,  67,  67,  67,  67,  67,
   };
 
+  int size = MAX(bw, bh);
   int size_scale = (size >= 64 ? 1:
                     size == 32 ? 2 :
                     size == 16 ? 4 :
@@ -499,8 +436,8 @@
   int i, j;
   switch (mode) {
     case V_PRED:
-      for (i = 0; i < size; ++i) {
-        for (j = 0; j < size; ++j) {
+      for (i = 0; i < bh; ++i) {
+        for (j = 0; j < bw; ++j) {
           int k = i * interstride + j;
           int scale = weights1d[i * size_scale];
           interpred[k] =
@@ -512,8 +449,8 @@
       break;
 
     case H_PRED:
-      for (i = 0; i < size; ++i) {
-        for (j = 0; j < size; ++j) {
+      for (i = 0; i < bh; ++i) {
+        for (j = 0; j < bw; ++j) {
           int k = i * interstride + j;
           int scale = weights1d[j * size_scale];
           interpred[k] =
@@ -526,8 +463,8 @@
 
     case D63_PRED:
     case D117_PRED:
-      for (i = 0; i < size; ++i) {
-        for (j = 0; j < size; ++j) {
+      for (i = 0; i < bh; ++i) {
+        for (j = 0; j < bw; ++j) {
           int k = i * interstride + j;
           int scale = (weights1d[i * size_scale] * 3 +
                        weights1d[j * size_scale]) >> 2;
@@ -541,8 +478,8 @@
 
     case D27_PRED:
     case D153_PRED:
-      for (i = 0; i < size; ++i) {
-        for (j = 0; j < size; ++j) {
+      for (i = 0; i < bh; ++i) {
+        for (j = 0; j < bw; ++j) {
           int k = i * interstride + j;
           int scale = (weights1d[j * size_scale] * 3 +
                        weights1d[i * size_scale]) >> 2;
@@ -555,8 +492,8 @@
       break;
 
     case D135_PRED:
-      for (i = 0; i < size; ++i) {
-        for (j = 0; j < size; ++j) {
+      for (i = 0; i < bh; ++i) {
+        for (j = 0; j < bw; ++j) {
           int k = i * interstride + j;
           int scale = weights1d[(i < j ? i : j) * size_scale];
           interpred[k] =
@@ -568,8 +505,8 @@
       break;
 
     case D45_PRED:
-      for (i = 0; i < size; ++i) {
-        for (j = 0; j < size; ++j) {
+      for (i = 0; i < bh; ++i) {
+        for (j = 0; j < bw; ++j) {
           int k = i * interstride + j;
           int scale = (weights1d[i * size_scale] +
                        weights1d[j * size_scale]) >> 1;
@@ -585,8 +522,8 @@
     case DC_PRED:
     default:
       // simple average
-      for (i = 0; i < size; ++i) {
-        for (j = 0; j < size; ++j) {
+      for (i = 0; i < bh; ++i) {
+        for (j = 0; j < bw; ++j) {
           int k = i * interstride + j;
           interpred[k] = (interpred[k] + intrapred[i * intrastride + j]) >> 1;
         }
@@ -595,137 +532,55 @@
   }
 }
 
-void vp9_build_interintra_16x16_predictors_mb(MACROBLOCKD *xd,
+void vp9_build_interintra_predictors(MACROBLOCKD *xd,
                                               uint8_t *ypred,
                                               uint8_t *upred,
                                               uint8_t *vpred,
-                                              int ystride, int uvstride) {
-  vp9_build_interintra_16x16_predictors_mby(xd, ypred, ystride);
-  vp9_build_interintra_16x16_predictors_mbuv(xd, upred, vpred, uvstride);
+                                              int ystride, int uvstride,
+                                              BLOCK_SIZE_TYPE bsize) {
+  vp9_build_interintra_predictors_sby(xd, ypred, ystride, bsize);
+  vp9_build_interintra_predictors_sbuv(xd, upred, vpred, uvstride, bsize);
 }
 
-void vp9_build_interintra_16x16_predictors_mby(MACROBLOCKD *xd,
+void vp9_build_interintra_predictors_sby(MACROBLOCKD *xd,
                                                uint8_t *ypred,
-                                               int ystride) {
-  uint8_t intrapredictor[256];
-  vp9_build_intra_predictors(
-      xd->plane[0].dst.buf, xd->plane[0].dst.stride,
-      intrapredictor, 16,
-      xd->mode_info_context->mbmi.interintra_mode, 16, 16,
-      xd->up_available, xd->left_available, xd->right_available);
-  combine_interintra(xd->mode_info_context->mbmi.interintra_mode,
-                     ypred, ystride, intrapredictor, 16, 16);
-}
-
-void vp9_build_interintra_16x16_predictors_mbuv(MACROBLOCKD *xd,
-                                                uint8_t *upred,
-                                                uint8_t *vpred,
-                                                int uvstride) {
-  uint8_t uintrapredictor[64];
-  uint8_t vintrapredictor[64];
-  vp9_build_intra_predictors(
-      xd->plane[1].dst.buf, xd->plane[1].dst.stride,
-      uintrapredictor, 8,
-      xd->mode_info_context->mbmi.interintra_uv_mode, 8, 8,
-      xd->up_available, xd->left_available, xd->right_available);
-  vp9_build_intra_predictors(
-      xd->plane[2].dst.buf, xd->plane[1].dst.stride,
-      vintrapredictor, 8,
-      xd->mode_info_context->mbmi.interintra_uv_mode, 8, 8,
-      xd->up_available, xd->left_available, xd->right_available);
-  combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode,
-                     upred, uvstride, uintrapredictor, 8, 8);
-  combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode,
-                     vpred, uvstride, vintrapredictor, 8, 8);
-}
-
-void vp9_build_interintra_32x32_predictors_sby(MACROBLOCKD *xd,
-                                               uint8_t *ypred,
-                                               int ystride) {
-  uint8_t intrapredictor[1024];
-  vp9_build_intra_predictors(
-      xd->plane[0].dst.buf, xd->plane[0].dst.stride,
-      intrapredictor, 32,
-      xd->mode_info_context->mbmi.interintra_mode, 32, 32,
-      xd->up_available, xd->left_available, xd->right_available);
-  combine_interintra(xd->mode_info_context->mbmi.interintra_mode,
-                     ypred, ystride, intrapredictor, 32, 32);
-}
-
-void vp9_build_interintra_32x32_predictors_sbuv(MACROBLOCKD *xd,
-                                                uint8_t *upred,
-                                                uint8_t *vpred,
-                                                int uvstride) {
-  uint8_t uintrapredictor[256];
-  uint8_t vintrapredictor[256];
-  vp9_build_intra_predictors(
-      xd->plane[1].dst.buf, xd->plane[1].dst.stride,
-      uintrapredictor, 16,
-      xd->mode_info_context->mbmi.interintra_uv_mode, 16, 16,
-      xd->up_available, xd->left_available, xd->right_available);
-  vp9_build_intra_predictors(
-      xd->plane[2].dst.buf, xd->plane[1].dst.stride,
-      vintrapredictor, 16,
-      xd->mode_info_context->mbmi.interintra_uv_mode, 16, 16,
-      xd->up_available, xd->left_available, xd->right_available);
-  combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode,
-                     upred, uvstride, uintrapredictor, 16, 16);
-  combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode,
-                     vpred, uvstride, vintrapredictor, 16, 16);
-}
-
-void vp9_build_interintra_32x32_predictors_sb(MACROBLOCKD *xd,
-                                              uint8_t *ypred,
-                                              uint8_t *upred,
-                                              uint8_t *vpred,
-                                              int ystride,
-                                              int uvstride) {
-  vp9_build_interintra_32x32_predictors_sby(xd, ypred, ystride);
-  vp9_build_interintra_32x32_predictors_sbuv(xd, upred, vpred, uvstride);
-}
-
-void vp9_build_interintra_64x64_predictors_sby(MACROBLOCKD *xd,
-                                               uint8_t *ypred,
-                                               int ystride) {
+                                               int ystride,
+                                               BLOCK_SIZE_TYPE bsize) {
+  int bwl = mb_width_log2(bsize),  bw = 16 << bwl;
+  int bhl = mb_height_log2(bsize), bh = 16 << bhl;
   uint8_t intrapredictor[4096];
-  const int mode = xd->mode_info_context->mbmi.interintra_mode;
-  vp9_build_intra_predictors(xd->plane[0].dst.buf, xd->plane[0].dst.stride,
-                             intrapredictor, 64, mode, 64, 64,
-                             xd->up_available, xd->left_available,
-                             xd->right_available);
+  vp9_build_intra_predictors(
+      xd->plane[0].dst.buf, xd->plane[0].dst.stride,
+      intrapredictor, bw,
+      xd->mode_info_context->mbmi.interintra_mode, bw, bh,
+      xd->up_available, xd->left_available, xd->right_available);
   combine_interintra(xd->mode_info_context->mbmi.interintra_mode,
-                     ypred, ystride, intrapredictor, 64, 64);
+                     ypred, ystride, intrapredictor, bw, bw, bh);
 }
 
-void vp9_build_interintra_64x64_predictors_sbuv(MACROBLOCKD *xd,
+void vp9_build_interintra_predictors_sbuv(MACROBLOCKD *xd,
                                                 uint8_t *upred,
                                                 uint8_t *vpred,
-                                                int uvstride) {
+                                                int uvstride,
+                                                BLOCK_SIZE_TYPE bsize) {
+  int bwl = mb_width_log2(bsize),  bw = 8 << bwl;
+  int bhl = mb_height_log2(bsize), bh = 8 << bhl;
   uint8_t uintrapredictor[1024];
   uint8_t vintrapredictor[1024];
-  const int mode = xd->mode_info_context->mbmi.interintra_uv_mode;
-  vp9_build_intra_predictors(xd->plane[1].dst.buf, xd->plane[1].dst.stride,
-                             uintrapredictor, 32, mode, 32, 32,
-                             xd->up_available, xd->left_available,
-                             xd->right_available);
-  vp9_build_intra_predictors(xd->plane[2].dst.buf, xd->plane[1].dst.stride,
-                             vintrapredictor, 32, mode, 32, 32,
-                             xd->up_available, xd->left_available,
-                             xd->right_available);
+  vp9_build_intra_predictors(
+      xd->plane[1].dst.buf, xd->plane[1].dst.stride,
+      uintrapredictor, bw,
+      xd->mode_info_context->mbmi.interintra_uv_mode, bw, bh,
+      xd->up_available, xd->left_available, xd->right_available);
+  vp9_build_intra_predictors(
+      xd->plane[2].dst.buf, xd->plane[1].dst.stride,
+      vintrapredictor, bw,
+      xd->mode_info_context->mbmi.interintra_uv_mode, bw, bh,
+      xd->up_available, xd->left_available, xd->right_available);
   combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode,
-                     upred, uvstride, uintrapredictor, 32, 32);
+                     upred, uvstride, uintrapredictor, bw, bw, bh);
   combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode,
-                     vpred, uvstride, vintrapredictor, 32, 32);
-}
-
-void vp9_build_interintra_64x64_predictors_sb(MACROBLOCKD *xd,
-                                              uint8_t *ypred,
-                                              uint8_t *upred,
-                                              uint8_t *vpred,
-                                              int ystride,
-                                              int uvstride) {
-  vp9_build_interintra_64x64_predictors_sby(xd, ypred, ystride);
-  vp9_build_interintra_64x64_predictors_sbuv(xd, upred, vpred, uvstride);
+                     vpred, uvstride, vintrapredictor, bw, bw, bh);
 }
 #endif  // CONFIG_COMP_INTERINTRA_PRED
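The refactor above folds the fixed 16x16/32x32/64x64 interintra paths into a single bsize-parameterised one. As a rough illustration (not part of the patch), the stride used to walk the 64-entry weights1d[] table in combine_interintra() follows from bw/bh as sketched below; the trailing 8 for sub-16 sizes is an inference from the table length, since that branch lies outside the hunks shown here.

static int interintra_size_scale(int bw, int bh) {
  const int size = bw > bh ? bw : bh;   /* combine_interintra() uses MAX(bw, bh) */
  return size >= 64 ? 1 :
         size == 32 ? 2 :
         size == 16 ? 4 : 8;            /* assumed value for the remaining sizes */
}

/* Example: BLOCK_SIZE_SB32X32 luma gives bw = bh = 16 << 1 = 32, so weights1d[]
 * is walked with stride 2 -- the same weights the removed 32x32-specific
 * functions produced. */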
 
diff --git a/vp9/common/vp9_reconintra.h b/vp9/common/vp9_reconintra.h
index b97b608..e943596 100644
--- a/vp9/common/vp9_reconintra.h
+++ b/vp9/common/vp9_reconintra.h
@@ -14,8 +14,6 @@
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_blockd.h"
 
-void vp9_recon_intra_mbuv(MACROBLOCKD *xd);
-
 B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
                                               int stride, int n,
                                               int tx, int ty);
@@ -23,35 +21,24 @@
 B_PREDICTION_MODE vp9_find_bpred_context(MACROBLOCKD *xd, BLOCKD *x);
 
 #if CONFIG_COMP_INTERINTRA_PRED
-void vp9_build_interintra_16x16_predictors_mb(MACROBLOCKD *xd,
-                                              uint8_t *ypred,
-                                              uint8_t *upred,
-                                              uint8_t *vpred,
-                                              int ystride,
-                                              int uvstride);
+void vp9_build_interintra_predictors(MACROBLOCKD *xd,
+                                     uint8_t *ypred,
+                                     uint8_t *upred,
+                                     uint8_t *vpred,
+                                     int ystride,
+                                     int uvstride,
+                                     BLOCK_SIZE_TYPE bsize);
 
-void vp9_build_interintra_16x16_predictors_mby(MACROBLOCKD *xd,
-                                               uint8_t *ypred,
-                                               int ystride);
+void vp9_build_interintra_predictors_sby(MACROBLOCKD *xd,
+                                         uint8_t *ypred,
+                                         int ystride,
+                                         BLOCK_SIZE_TYPE bsize);
 
-void vp9_build_interintra_16x16_predictors_mbuv(MACROBLOCKD *xd,
-                                                uint8_t *upred,
-                                                uint8_t *vpred,
-                                                int uvstride);
+void vp9_build_interintra_predictors_sbuv(MACROBLOCKD *xd,
+                                          uint8_t *upred,
+                                          uint8_t *vpred,
+                                          int uvstride,
+                                          BLOCK_SIZE_TYPE bsize);
 #endif  // CONFIG_COMP_INTERINTRA_PRED
 
-void vp9_build_interintra_32x32_predictors_sb(MACROBLOCKD *xd,
-                                              uint8_t *ypred,
-                                              uint8_t *upred,
-                                              uint8_t *vpred,
-                                              int ystride,
-                                              int uvstride);
-
-void vp9_build_interintra_64x64_predictors_sb(MACROBLOCKD *xd,
-                                              uint8_t *ypred,
-                                              uint8_t *upred,
-                                              uint8_t *vpred,
-                                              int ystride,
-                                              int uvstride);
-
 #endif  // VP9_COMMON_VP9_RECONINTRA_H_
diff --git a/vp9/common/vp9_reconintra4x4.c b/vp9/common/vp9_reconintra4x4.c
index a070001..7f81b05 100644
--- a/vp9/common/vp9_reconintra4x4.c
+++ b/vp9/common/vp9_reconintra4x4.c
@@ -258,15 +258,22 @@
 
   switch (b_mode) {
     case B_DC_PRED: {
-      int expected_dc = 0;
-
-      for (i = 0; i < 4; i++) {
-        expected_dc += above[i];
-        expected_dc += left[i];
+      int expected_dc = 128;
+      if (have_top || have_left) {
+        int average = 0;
+        int count = 0;
+        if (have_top) {
+          for (i = 0; i < 4; i++)
+            average += above[i];
+          count += 4;
+        }
+        if (have_left) {
+          for (i = 0; i < 4; i++)
+            average += left[i];
+          count += 4;
+        }
+        expected_dc = (average + (count >> 1)) / count;
       }
-
-      expected_dc = ROUND_POWER_OF_TWO(expected_dc, 3);
-
       for (r = 0; r < 4; r++) {
         for (c = 0; c < 4; c++)
           predictor[c] = expected_dc;
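The new B_DC_PRED branch averages only the edges that are actually available and falls back to the neutral value 128 when neither exists. A minimal standalone sketch of the same rule (hypothetical helper, not part of the patch), with a worked value:

/* With only the top row available and above[] = {10, 20, 30, 40}, count is 4
 * and expected_dc = (100 + 2) / 4 = 25; with neither edge available the
 * predictor stays at 128. */
static int b_dc_value(const uint8_t *above, const uint8_t *left,
                      int have_top, int have_left) {
  int expected_dc = 128;
  if (have_top || have_left) {
    int average = 0, count = 0, i;
    if (have_top) {
      for (i = 0; i < 4; i++) average += above[i];
      count += 4;
    }
    if (have_left) {
      for (i = 0; i < 4; i++) average += left[i];
      count += 4;
    }
    expected_dc = (average + (count >> 1)) / count;  /* round to nearest */
  }
  return expected_dc;
}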
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index d3f719e..7d1d585 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -230,62 +230,6 @@
 prototype void vp9_convolve8_avg_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
 specialize vp9_convolve8_avg_vert ssse3
 
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
-prototype void vp9_convolve8_1by8 "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_1by8
-
-prototype void vp9_convolve8_qtr "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_qtr
-
-prototype void vp9_convolve8_3by8 "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_3by8
-
-prototype void vp9_convolve8_5by8 "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_5by8
-
-prototype void vp9_convolve8_3qtr "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_3qtr
-
-prototype void vp9_convolve8_7by8 "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_7by8
-
-prototype void vp9_convolve8_1by8_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_1by8_horiz
-
-prototype void vp9_convolve8_qtr_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_qtr_horiz
-
-prototype void vp9_convolve8_3by8_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_3by8_horiz
-
-prototype void vp9_convolve8_5by8_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_5by8_horiz
-
-prototype void vp9_convolve8_3qtr_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_3qtr_horiz
-
-prototype void vp9_convolve8_7by8_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_7by8_horiz
-
-prototype void vp9_convolve8_1by8_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_1by8_vert
-
-prototype void vp9_convolve8_qtr_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_qtr_vert
-
-prototype void vp9_convolve8_3by8_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_3by8_vert
-
-prototype void vp9_convolve8_5by8_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_5by8_vert
-
-prototype void vp9_convolve8_3qtr_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_3qtr_vert
-
-prototype void vp9_convolve8_7by8_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_7by8_vert
-#endif
-
 #
 # dct
 #
@@ -617,12 +561,6 @@
 specialize vp9_block_error mmx sse2
 vp9_block_error_sse2=vp9_block_error_xmm
 
-prototype void vp9_subtract_b "struct block *be, struct blockd *bd, int pitch"
-# TODO(jingning): The prototype function in c has been changed to remove
-# the use of predictor buffer in MACROBLOCKD. Need to modify the mmx and sse2
-# versions accordingly.
-specialize vp9_subtract_b
-
 #
 # Structured Similarity (SSIM)
 #
diff --git a/vp9/common/vp9_seg_common.c b/vp9/common/vp9_seg_common.c
index df67cff..4c913e2 100644
--- a/vp9/common/vp9_seg_common.c
+++ b/vp9/common/vp9_seg_common.c
@@ -85,4 +85,26 @@
          (1 << ref_frame)) ? 1 : 0;
 }
 
+
+#if CONFIG_IMPLICIT_SEGMENTATION
+// This function defines an implicit segmentation for the next frame based
+// on prediction and transform decisions in the current frame.
+// For test purposes, only the TX size is used at the moment.
+void vp9_implicit_segment_map_update(VP9_COMMON *cm) {
+  int row, col;
+  MODE_INFO *mi, *mi_ptr = cm->mi;
+  unsigned char *map_ptr = cm->last_frame_seg_map;
+
+  for (row = 0; row < cm->mb_rows; row++) {
+    mi = mi_ptr;
+    // Experimental use of tx size to define implicit segmentation
+    for (col = 0; col < cm->mb_cols; ++col, ++mi) {
+      map_ptr[col] = mi->mbmi.txfm_size;
+    }
+    mi_ptr += cm->mode_info_stride;
+    map_ptr += cm->mb_cols;
+  }
+}
+#endif
+
 // TBD? Functions to read and write segment data with range / validity checking
diff --git a/vp9/common/vp9_seg_common.h b/vp9/common/vp9_seg_common.h
index 243ff88..4550dd1 100644
--- a/vp9/common/vp9_seg_common.h
+++ b/vp9/common/vp9_seg_common.h
@@ -55,5 +55,9 @@
                      int segment_id,
                      MV_REFERENCE_FRAME ref_frame);
 
+#if CONFIG_IMPLICIT_SEGMENTATION
+void vp9_implicit_segment_map_update(VP9_COMMON *cm);
+#endif
+
 #endif  // VP9_COMMON_VP9_SEG_COMMON_H_
 
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 7135ea1..65e81b9 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -75,30 +75,29 @@
 
 static int read_mb_segid(vp9_reader *r, MACROBLOCKD *xd) {
   const vp9_prob *const p = xd->mb_segment_tree_probs;
-  return vp9_read(r, p[0]) ? 2 + vp9_read(r, p[2])
-                           :     vp9_read(r, p[1]);
-}
+  int ret_val;
 
-// This function reads the current macro block's segnent id from the bitstream
-// It should only be called if a segment map update is indicated.
-static int read_mb_segid_except(vp9_reader *r,
-                                VP9_COMMON *cm, MACROBLOCKD *xd,
-                                int mb_row, int mb_col) {
-  const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
-  const int pred_seg_id = vp9_get_pred_mb_segid(cm, sb_type, mb_row, mb_col);
-  const vp9_prob *const p = xd->mb_segment_tree_probs;
-  const vp9_prob prob = xd->mb_segment_mispred_tree_probs[pred_seg_id];
-
-  return vp9_read(r, prob)
-             ? 2 + (pred_seg_id  < 2 ? vp9_read(r, p[2]) : (pred_seg_id == 2))
-             :     (pred_seg_id >= 2 ? vp9_read(r, p[1]) : (pred_seg_id == 0));
+  if (vp9_read(r, p[0])) {
+    if (vp9_read(r, p[4])) {
+      ret_val = 6 + vp9_read(r, p[6]);
+    } else {
+      ret_val = 4 + vp9_read(r, p[5]);
+    }
+  } else {
+    if (vp9_read(r, p[1])) {
+      ret_val = 2 + vp9_read(r, p[3]);
+    } else {
+      ret_val = vp9_read(r, p[2]);
+    }
+  }
+  return ret_val;
 }
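The rewritten read_mb_segid() above walks a three-level binary tree over the eight possible segment ids, spending one of the seven node probabilities p[0..6] per bit. For orientation only, a sketch of the mirror-image write side, assuming the encoder's vp9_write(w, bit, prob) boolean writer (both the writer type and the helper name are assumptions here):

static void write_mb_segid_sketch(vp9_writer *w, const vp9_prob *p,
                                  int segment_id) {
  const int b0 = (segment_id >> 2) & 1;   /* upper vs. lower half, prob p[0] */
  const int b1 = (segment_id >> 1) & 1;   /* quarter within the half */
  const int b2 = segment_id & 1;          /* leaf bit */
  vp9_write(w, b0, p[0]);
  vp9_write(w, b1, b0 ? p[4] : p[1]);
  vp9_write(w, b2, b0 ? (b1 ? p[6] : p[5])
                      : (b1 ? p[3] : p[2]));
}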
 
 static void set_segment_id(VP9_COMMON *cm, MB_MODE_INFO *mbmi,
                            int mb_row, int mb_col, int segment_id) {
   const int mb_index = mb_row * cm->mb_cols + mb_col;
   const BLOCK_SIZE_TYPE sb_type = mbmi->sb_type;
-  if (sb_type) {
+  if (sb_type > BLOCK_SIZE_MB16X16) {
     const int bw = 1 << mb_width_log2(sb_type);
     const int bh = 1 << mb_height_log2(sb_type);
     const int ymbs = MIN(cm->mb_rows - mb_row, bh);
@@ -116,6 +115,17 @@
   }
 }
 
+static TX_SIZE select_txfm_size(VP9_COMMON *cm, vp9_reader *r,
+                                int allow_16x16, int allow_32x32) {
+  TX_SIZE txfm_size = vp9_read(r, cm->prob_tx[0]);  // TX_4X4 or >TX_4X4
+  if (txfm_size != TX_4X4 && allow_16x16) {
+    txfm_size += vp9_read(r, cm->prob_tx[1]);       // TX_8X8 or >TX_8X8
+    if (txfm_size != TX_8X8 && allow_32x32)
+      txfm_size += vp9_read(r, cm->prob_tx[2]);     // TX_16X16 or >TX_16X16
+  }
+  return txfm_size;
+}
+
 extern const int vp9_i8x8_block[4];
 static void kfread_modes(VP9D_COMP *pbi, MODE_INFO *m,
                          int mb_row, int mb_col,
@@ -138,7 +148,7 @@
     m->mbmi.mb_skip_coeff = vp9_read(r, vp9_get_pred_prob(cm, xd, PRED_MBSKIP));
 
   // luma mode
-  m->mbmi.mode = m->mbmi.sb_type ?
+  m->mbmi.mode = m->mbmi.sb_type > BLOCK_SIZE_MB16X16 ?
       read_kf_sb_ymode(r, cm->sb_kf_ymode_prob[cm->kf_ymode_probs_index]):
       read_kf_mb_ymode(r, cm->kf_ymode_prob[cm->kf_ymode_probs_index]);
 
@@ -174,15 +184,11 @@
   }
 
   if (cm->txfm_mode == TX_MODE_SELECT &&
-      m->mbmi.mb_skip_coeff == 0 &&
+      !m->mbmi.mb_skip_coeff &&
       m->mbmi.mode <= I8X8_PRED) {
-    // FIXME(rbultje) code ternary symbol once all experiments are merged
-    m->mbmi.txfm_size = vp9_read(r, cm->prob_tx[0]);
-    if (m->mbmi.txfm_size != TX_4X4 && m->mbmi.mode != I8X8_PRED) {
-      m->mbmi.txfm_size += vp9_read(r, cm->prob_tx[1]);
-      if (m->mbmi.txfm_size != TX_8X8 && m->mbmi.sb_type >= BLOCK_SIZE_SB32X32)
-        m->mbmi.txfm_size += vp9_read(r, cm->prob_tx[2]);
-    }
+    const int allow_16x16 = m->mbmi.mode != I8X8_PRED;
+    const int allow_32x32 = m->mbmi.sb_type >= BLOCK_SIZE_SB32X32;
+    m->mbmi.txfm_size = select_txfm_size(cm, r, allow_16x16, allow_32x32);
   } else if (cm->txfm_mode >= ALLOW_32X32 &&
              m->mbmi.sb_type >= BLOCK_SIZE_SB32X32) {
     m->mbmi.txfm_size = TX_32X32;
@@ -195,6 +201,7 @@
   }
 }
 
+
 static int read_nmv_component(vp9_reader *r,
                               int rv,
                               const nmv_component *mvcomp) {
@@ -472,7 +479,7 @@
   } else {
     nmv_context *const nmvc = &pbi->common.fc.nmvc;
     MACROBLOCKD *const xd = &pbi->mb;
-    int i, j;
+    int i;
 
     if (cm->mcomp_filter_type == SWITCHABLE)
       read_switchable_interp_probs(pbi, r);
@@ -506,11 +513,6 @@
       for (i = 0; i < VP9_I32X32_MODES - 1; ++i)
         cm->fc.sb_ymode_prob[i] = vp9_read_prob(r);
 
-    for (j = 0; j < PARTITION_PLANES; j++)
-      if (vp9_read_bit(r))
-        for (i = 0; i < PARTITION_TYPES - 1; i++)
-          cm->fc.partition_prob[j][i] = vp9_read_prob(r);
-
     read_nmvprobs(r, nmvc, xd->allow_high_precision_mv);
   }
 }
@@ -543,7 +545,7 @@
       // then use the predicted value, otherwise decode it explicitly
       segment_id = pred_flag ? vp9_get_pred_mb_segid(cm, mbmi->sb_type,
                                                      mb_row, mb_col)
-                             : read_mb_segid_except(r, cm, xd, mb_row, mb_col);
+                             : read_mb_segid(r, xd);
     } else {
       segment_id = read_mb_segid(r, xd);  // Normal unpredicted coding mode
     }
@@ -601,7 +603,8 @@
 
   const int use_prev_in_find_mv_refs = cm->width == cm->last_width &&
                                        cm->height == cm->last_height &&
-                                       !cm->error_resilient_mode;
+                                       !cm->error_resilient_mode &&
+                                       cm->last_show_frame;
 
   int mb_to_left_edge, mb_to_right_edge, mb_to_top_edge, mb_to_bottom_edge;
 
@@ -617,8 +620,7 @@
   // Distance of Mb to the various image edges.
   // These specified to 8th pel as they are always compared to MV values
   // that are in 1/8th pel units
-  set_mb_row(cm, xd, mb_row, bh);
-  set_mb_col(cm, xd, mb_col, bw);
+  set_mb_row_col(cm, xd, mb_row, bh, mb_col, bw);
 
   mb_to_top_edge = xd->mb_to_top_edge - LEFT_TOP_MARGIN;
   mb_to_bottom_edge = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN;
@@ -644,15 +646,14 @@
 
     const MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame;
     struct scale_factors *sf0 = &xd->scale_factor[0];
-    struct scale_factors *sf_uv0 = &xd->scale_factor_uv[0];
     *sf0 = cm->active_ref_scale[mbmi->ref_frame - 1];
 
     {
       // Select the appropriate reference frame for this MB
       const int ref_fb_idx = cm->active_ref_idx[ref_frame - 1];
 
-      setup_pred_block(&xd->pre, &cm->yv12_fb[ref_fb_idx],
-                       mb_row, mb_col, sf0, sf_uv0);
+      setup_pre_planes(xd, &cm->yv12_fb[ref_fb_idx], NULL,
+                       mb_row, mb_col, xd->scale_factor, xd->scale_factor_uv);
 
 #ifdef DEC_DEBUG
       if (dec_debug)
@@ -669,7 +670,8 @@
       if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_SKIP)) {
         mbmi->mode = ZEROMV;
       } else {
-        mbmi->mode = mbmi->sb_type ? read_sb_mv_ref(r, mv_ref_p)
+        mbmi->mode = mbmi->sb_type > BLOCK_SIZE_MB16X16 ?
+                                     read_sb_mv_ref(r, mv_ref_p)
                                    : read_mv_ref(r, mv_ref_p);
         vp9_accum_mv_refs(cm, mbmi->mode, mbmi->mb_mode_context[ref_frame]);
       }
@@ -712,12 +714,11 @@
       if (mbmi->second_ref_frame > 0) {
         const MV_REFERENCE_FRAME second_ref_frame = mbmi->second_ref_frame;
         struct scale_factors *sf1 = &xd->scale_factor[1];
-        struct scale_factors *sf_uv1 = &xd->scale_factor_uv[1];
         const int second_ref_fb_idx = cm->active_ref_idx[second_ref_frame - 1];
         *sf1 = cm->active_ref_scale[second_ref_frame - 1];
 
-        setup_pred_block(&xd->second_pre, &cm->yv12_fb[second_ref_fb_idx],
-                         mb_row, mb_col, sf1, sf_uv1);
+        setup_pre_planes(xd, NULL, &cm->yv12_fb[second_ref_fb_idx],
+                         mb_row, mb_col, xd->scale_factor, xd->scale_factor_uv);
 
         vp9_find_mv_refs(cm, xd, mi,
                          use_prev_in_find_mv_refs ? prev_mi : NULL,
@@ -934,7 +935,7 @@
     // required for left and above block mv
     mv0->as_int = 0;
 
-    if (mbmi->sb_type) {
+    if (mbmi->sb_type > BLOCK_SIZE_MB16X16) {
       mbmi->mode = read_sb_ymode(r, cm->fc.sb_ymode_prob);
       cm->fc.sb_ymode_counts[mbmi->mode]++;
     } else {
@@ -981,14 +982,9 @@
       ((mbmi->ref_frame == INTRA_FRAME && mbmi->mode <= I8X8_PRED) ||
        (mbmi->ref_frame != INTRA_FRAME && !(mbmi->mode == SPLITMV &&
                            mbmi->partitioning == PARTITIONING_4X4)))) {
-    // FIXME(rbultje) code ternary symbol once all experiments are merged
-    mbmi->txfm_size = vp9_read(r, cm->prob_tx[0]);
-    if (mbmi->txfm_size != TX_4X4 && mbmi->mode != I8X8_PRED &&
-        mbmi->mode != SPLITMV) {
-      mbmi->txfm_size += vp9_read(r, cm->prob_tx[1]);
-      if (mbmi->sb_type >= BLOCK_SIZE_SB32X32 && mbmi->txfm_size != TX_8X8)
-        mbmi->txfm_size += vp9_read(r, cm->prob_tx[2]);
-    }
+    const int allow_16x16 = mbmi->mode != I8X8_PRED && mbmi->mode != SPLITMV;
+    const int allow_32x32 = mbmi->sb_type >= BLOCK_SIZE_SB32X32;
+    mbmi->txfm_size = select_txfm_size(cm, r, allow_16x16, allow_32x32);
   } else if (mbmi->sb_type >= BLOCK_SIZE_SB32X32 &&
              cm->txfm_mode >= ALLOW_32X32) {
     mbmi->txfm_size = TX_32X32;
@@ -1018,258 +1014,6 @@
   mb_mode_mv_init(pbi, r);
 }
 
-#if CONFIG_CODE_NONZEROCOUNT
-static uint16_t read_nzc(VP9_COMMON *const cm,
-                         int nzc_context,
-                         TX_SIZE tx_size,
-                         int ref,
-                         int type,
-                         vp9_reader *r) {
-  int c, e;
-  uint16_t nzc;
-  if (!get_nzc_used(tx_size)) return 0;
-  if (tx_size == TX_32X32) {
-    c = treed_read(r, vp9_nzc32x32_tree,
-                   cm->fc.nzc_probs_32x32[nzc_context][ref][type]);
-    cm->fc.nzc_counts_32x32[nzc_context][ref][type][c]++;
-  } else if (tx_size == TX_16X16) {
-    c = treed_read(r, vp9_nzc16x16_tree,
-                   cm->fc.nzc_probs_16x16[nzc_context][ref][type]);
-    cm->fc.nzc_counts_16x16[nzc_context][ref][type][c]++;
-  } else if (tx_size == TX_8X8) {
-    c = treed_read(r, vp9_nzc8x8_tree,
-                   cm->fc.nzc_probs_8x8[nzc_context][ref][type]);
-    cm->fc.nzc_counts_8x8[nzc_context][ref][type][c]++;
-  } else if (tx_size == TX_4X4) {
-    c = treed_read(r, vp9_nzc4x4_tree,
-                   cm->fc.nzc_probs_4x4[nzc_context][ref][type]);
-    cm->fc.nzc_counts_4x4[nzc_context][ref][type][c]++;
-  } else {
-    assert(0);
-  }
-  nzc = vp9_basenzcvalue[c];
-  if ((e = vp9_extranzcbits[c])) {
-    int x = 0;
-    while (e--) {
-      int b = vp9_read(
-          r, cm->fc.nzc_pcat_probs[nzc_context][c - NZC_TOKENS_NOEXTRA][e]);
-      x |= (b << e);
-      cm->fc.nzc_pcat_counts[nzc_context][c - NZC_TOKENS_NOEXTRA][e][b]++;
-    }
-    nzc += x;
-  }
-  if (tx_size == TX_32X32)
-    assert(nzc <= 1024);
-  else if (tx_size == TX_16X16)
-    assert(nzc <= 256);
-  else if (tx_size == TX_8X8)
-    assert(nzc <= 64);
-  else if (tx_size == TX_4X4)
-    assert(nzc <= 16);
-  return nzc;
-}
-
-static void read_nzcs_sb64(VP9_COMMON *const cm,
-                           MACROBLOCKD* xd,
-                           int mb_row,
-                           int mb_col,
-                           vp9_reader *r) {
-  MODE_INFO *m = xd->mode_info_context;
-  MB_MODE_INFO *const mi = &m->mbmi;
-  int j, nzc_context;
-  const int ref = m->mbmi.ref_frame != INTRA_FRAME;
-
-  assert(mb_col == get_mb_col(xd));
-  assert(mb_row == get_mb_row(xd));
-
-  vpx_memset(m->mbmi.nzcs, 0, 384 * sizeof(m->mbmi.nzcs[0]));
-
-  if (mi->mb_skip_coeff)
-    return;
-
-  switch (mi->txfm_size) {
-    case TX_32X32:
-      for (j = 0; j < 256; j += 64) {
-        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_32X32, ref, 0, r);
-      }
-      for (j = 256; j < 384; j += 64) {
-        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_32X32, ref, 1, r);
-      }
-      break;
-
-    case TX_16X16:
-      for (j = 0; j < 256; j += 16) {
-        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_16X16, ref, 0, r);
-      }
-      for (j = 256; j < 384; j += 16) {
-        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_16X16, ref, 1, r);
-      }
-      break;
-
-    case TX_8X8:
-      for (j = 0; j < 256; j += 4) {
-        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 0, r);
-      }
-      for (j = 256; j < 384; j += 4) {
-        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 1, r);
-      }
-      break;
-
-    case TX_4X4:
-      for (j = 0; j < 256; ++j) {
-        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 0, r);
-      }
-      for (j = 256; j < 384; ++j) {
-        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 1, r);
-      }
-      break;
-
-    default:
-      break;
-  }
-}
-
-static void read_nzcs_sb32(VP9_COMMON *const cm,
-                           MACROBLOCKD* xd,
-                           int mb_row,
-                           int mb_col,
-                           vp9_reader *r) {
-  MODE_INFO *m = xd->mode_info_context;
-  MB_MODE_INFO *const mi = &m->mbmi;
-  int j, nzc_context;
-  const int ref = m->mbmi.ref_frame != INTRA_FRAME;
-
-  assert(mb_col == get_mb_col(xd));
-  assert(mb_row == get_mb_row(xd));
-
-  vpx_memset(m->mbmi.nzcs, 0, 384 * sizeof(m->mbmi.nzcs[0]));
-
-  if (mi->mb_skip_coeff)
-    return;
-
-  switch (mi->txfm_size) {
-    case TX_32X32:
-      for (j = 0; j < 64; j += 64) {
-        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_32X32, ref, 0, r);
-      }
-      for (j = 64; j < 96; j += 16) {
-        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_16X16, ref, 1, r);
-      }
-      break;
-
-    case TX_16X16:
-      for (j = 0; j < 64; j += 16) {
-        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_16X16, ref, 0, r);
-      }
-      for (j = 64; j < 96; j += 16) {
-        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_16X16, ref, 1, r);
-      }
-      break;
-
-    case TX_8X8:
-      for (j = 0; j < 64; j += 4) {
-        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 0, r);
-      }
-      for (j = 64; j < 96; j += 4) {
-        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 1, r);
-      }
-      break;
-
-    case TX_4X4:
-      for (j = 0; j < 64; ++j) {
-        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 0, r);
-      }
-      for (j = 64; j < 96; ++j) {
-        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 1, r);
-      }
-      break;
-
-    default:
-      break;
-  }
-}
-
-static void read_nzcs_mb16(VP9_COMMON *const cm,
-                           MACROBLOCKD* xd,
-                           int mb_row,
-                           int mb_col,
-                           vp9_reader *r) {
-  MODE_INFO *m = xd->mode_info_context;
-  MB_MODE_INFO *const mi = &m->mbmi;
-  int j, nzc_context;
-  const int ref = m->mbmi.ref_frame != INTRA_FRAME;
-
-  assert(mb_col == get_mb_col(xd));
-  assert(mb_row == get_mb_row(xd));
-
-  vpx_memset(m->mbmi.nzcs, 0, 384 * sizeof(m->mbmi.nzcs[0]));
-
-  if (mi->mb_skip_coeff)
-    return;
-
-  switch (mi->txfm_size) {
-    case TX_16X16:
-      for (j = 0; j < 16; j += 16) {
-        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_16X16, ref, 0, r);
-      }
-      for (j = 16; j < 24; j += 4) {
-        nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 1, r);
-      }
-      break;
-
-    case TX_8X8:
-      for (j = 0; j < 16; j += 4) {
-        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 0, r);
-      }
-      if (mi->mode == I8X8_PRED || mi->mode == SPLITMV) {
-        for (j = 16; j < 24; ++j) {
-          nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
-          m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 1, r);
-        }
-      } else {
-        for (j = 16; j < 24; j += 4) {
-          nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
-          m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 1, r);
-        }
-      }
-      break;
-
-    case TX_4X4:
-      for (j = 0; j < 16; ++j) {
-        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 0, r);
-      }
-      for (j = 16; j < 24; ++j) {
-        nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 1, r);
-      }
-      break;
-
-    default:
-      break;
-  }
-}
-#endif  // CONFIG_CODE_NONZEROCOUNT
-
 void vp9_decode_mb_mode_mv(VP9D_COMP* const pbi,
                            MACROBLOCKD* const xd,
                            int mb_row,
@@ -1288,16 +1032,8 @@
                       mi->mbmi.ref_frame - 1, mi->mbmi.second_ref_frame - 1,
                       cm->active_ref_scale);
   }
-#if CONFIG_CODE_NONZEROCOUNT
-  if (mbmi->sb_type == BLOCK_SIZE_SB64X64)
-    read_nzcs_sb64(cm, xd, mb_row, mb_col, r);
-  else if (mbmi->sb_type == BLOCK_SIZE_SB32X32)
-    read_nzcs_sb32(cm, xd, mb_row, mb_col, r);
-  else
-    read_nzcs_mb16(cm, xd, mb_row, mb_col, r);
-#endif  // CONFIG_CODE_NONZEROCOUNT
 
-  if (mbmi->sb_type) {
+  if (mbmi->sb_type > BLOCK_SIZE_MB16X16) {
     const int bw = 1 << mb_width_log2(mbmi->sb_type);
     const int bh = 1 << mb_height_log2(mbmi->sb_type);
     const int y_mbs = MIN(bh, cm->mb_rows - mb_row);
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index c962729..ac1c146 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -37,8 +37,6 @@
 #include <assert.h>
 #include <stdio.h>
 
-#define COEFCOUNT_TESTING
-
 // #define DEC_DEBUG
 #ifdef DEC_DEBUG
 int dec_debug = 0;
@@ -58,11 +56,20 @@
   return start + len > start && start + len <= end;
 }
 
-static TXFM_MODE read_txfm_mode(vp9_reader *r) {
-  TXFM_MODE mode = vp9_read_literal(r, 2);
-  if (mode == ALLOW_32X32)
-    mode += vp9_read_bit(r);
-  return mode;
+static void setup_txfm_mode(VP9_COMMON *pc, int lossless, vp9_reader *r) {
+  if (lossless) {
+    pc->txfm_mode = ONLY_4X4;
+  } else {
+    pc->txfm_mode = vp9_read_literal(r, 2);
+    if (pc->txfm_mode == ALLOW_32X32)
+      pc->txfm_mode += vp9_read_bit(r);
+
+    if (pc->txfm_mode == TX_MODE_SELECT) {
+      pc->prob_tx[0] = vp9_read_prob(r);
+      pc->prob_tx[1] = vp9_read_prob(r);
+      pc->prob_tx[2] = vp9_read_prob(r);
+    }
+  }
 }
 
 static int get_unsigned_bits(unsigned int num_values) {
@@ -165,10 +172,11 @@
   VP9_COMMON *const pc = &pbi->common;
 
   for (q = 0; q < QINDEX_RANGE; q++) {
+    // DC value
     pc->y_dequant[q][0] = (int16_t)vp9_dc_quant(q, pc->y_dc_delta_q);
     pc->uv_dequant[q][0] = (int16_t)vp9_dc_uv_quant(q, pc->uv_dc_delta_q);
 
-    /* all the ac values =; */
+    // AC values
     for (i = 1; i < 16; i++) {
       const int rc = vp9_default_zig_zag1d_4x4[i];
 
@@ -220,32 +228,6 @@
   }
 }
 
-#if CONFIG_CODE_NONZEROCOUNT
-static void propagate_nzcs(VP9_COMMON *cm, MACROBLOCKD *xd) {
-  MODE_INFO *m = xd->mode_info_context;
-  BLOCK_SIZE_TYPE sb_type = m->mbmi.sb_type;
-  const int mis = cm->mode_info_stride;
-  int n;
-  if (sb_type == BLOCK_SIZE_SB64X64) {
-    for (n = 0; n < 16; ++n) {
-      int i = n >> 2;
-      int j = n & 3;
-      if (i == 0 && j == 0) continue;
-      vpx_memcpy((m + j + mis * i)->mbmi.nzcs, m->mbmi.nzcs,
-                 384 * sizeof(m->mbmi.nzcs[0]));
-    }
-  } else if (sb_type == BLOCK_SIZE_SB32X32) {
-    for (n = 0; n < 4; ++n) {
-      int i = n >> 1;
-      int j = n & 1;
-      if (i == 0 && j == 0) continue;
-      vpx_memcpy((m + j + mis * i)->mbmi.nzcs, m->mbmi.nzcs,
-                 384 * sizeof(m->mbmi.nzcs[0]));
-    }
-  }
-}
-#endif
-
 static void decode_16x16(MACROBLOCKD *xd) {
   const TX_TYPE tx_type = get_tx_type_16x16(xd, 0);
 
@@ -323,7 +305,7 @@
 
 static INLINE void dequant_add_y(MACROBLOCKD *xd, TX_TYPE tx_type, int idx) {
   BLOCKD *const b = &xd->block[idx];
-  struct mb_plane *const y = &xd->plane[0];
+  struct macroblockd_plane *const y = &xd->plane[0];
   if (tx_type != DCT_DCT) {
     vp9_iht_add_c(tx_type, BLOCK_OFFSET(y->qcoeff, idx, 16),
                   *(b->base_dst) + b->dst, b->dst_stride, y->eobs[idx]);
@@ -608,9 +590,6 @@
 
   if (mi->mbmi.mb_skip_coeff) {
     vp9_reset_sb_tokens_context(xd, bsize);
-#if CONFIG_CODE_NONZEROCOUNT
-    vpx_memset(mi->mbmi.nzcs, 0, 384 * sizeof(mi->mbmi.nzcs[0]));
-#endif
   } else {
     // re-initialize macroblock dequantizer before detokenization
     if (xd->segmentation_enabled)
@@ -645,10 +624,6 @@
       }
     }
   }
-
-#if CONFIG_CODE_NONZEROCOUNT
-  propagate_nzcs(&pbi->common, xd);
-#endif
 }
 
 // TODO(jingning): Need to merge SB and MB decoding. The MB decoding currently
@@ -660,7 +635,7 @@
   const MB_PREDICTION_MODE mode = xd->mode_info_context->mbmi.mode;
   const int tx_size = xd->mode_info_context->mbmi.txfm_size;
 
-  assert(!xd->mode_info_context->mbmi.sb_type);
+  assert(xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_MB16X16);
 
   //mode = xd->mode_info_context->mbmi.mode;
   if (pbi->common.frame_type != KEY_FRAME)
@@ -681,7 +656,7 @@
            xd->mode_info_context->mbmi.mode, tx_size,
            xd->mode_info_context->mbmi.interp_filter);
 #endif
-    vp9_build_inter_predictors_mb(xd, mb_row, mb_col);
+    vp9_build_inter_predictors_sb(xd, mb_row, mb_col, BLOCK_SIZE_MB16X16);
   }
 
   if (xd->mode_info_context->mbmi.mb_skip_coeff) {
@@ -769,11 +744,6 @@
   return old_value != *dq;
 }
 
-#ifdef PACKET_TESTING
-#include <stdio.h>
-FILE *vpxlog = 0;
-#endif
-
 static void set_offsets(VP9D_COMP *pbi, BLOCK_SIZE_TYPE bsize,
                         int mb_row, int mb_col) {
   const int bh = 1 << mb_height_log2(bsize);
@@ -791,11 +761,12 @@
   xd->prev_mode_info_context = cm->prev_mi + mb_idx;
   xd->above_context = cm->above_context + mb_col;
   xd->left_context = cm->left_context + mb_row % 4;
+  xd->above_seg_context = cm->above_seg_context + mb_col;
+  xd->left_seg_context  = cm->left_seg_context + (mb_row & 3);
 
   // Distance of MB from the various image edges. These are specified to
   // 1/8th pel as they are always compared to values that are in 1/8th pel units
-  set_mb_row(cm, xd, mb_row, bh);
-  set_mb_col(cm, xd, mb_col, bw);
+  set_mb_row_col(cm, xd, mb_row, bh, mb_col, bw);
 
   xd->plane[0].dst.buf = dst_fb->y_buffer + recon_yoffset;
   xd->plane[1].dst.buf = dst_fb->u_buffer + recon_uvoffset;
@@ -813,8 +784,8 @@
     const YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[fb_idx];
     xd->scale_factor[0]    = cm->active_ref_scale[mbmi->ref_frame - 1];
     xd->scale_factor_uv[0] = cm->active_ref_scale[mbmi->ref_frame - 1];
-    setup_pred_block(&xd->pre, cfg, mb_row, mb_col,
-                     &xd->scale_factor[0], &xd->scale_factor_uv[0]);
+    setup_pre_planes(xd, cfg, NULL, mb_row, mb_col,
+                     xd->scale_factor, xd->scale_factor_uv);
     xd->corrupted |= cfg->corrupted;
 
     if (mbmi->second_ref_frame > INTRA_FRAME) {
@@ -823,8 +794,8 @@
       const YV12_BUFFER_CONFIG *second_cfg = &cm->yv12_fb[second_fb_idx];
       xd->scale_factor[1]    = cm->active_ref_scale[mbmi->second_ref_frame - 1];
       xd->scale_factor_uv[1] = cm->active_ref_scale[mbmi->second_ref_frame - 1];
-      setup_pred_block(&xd->second_pre, second_cfg, mb_row, mb_col,
-                       &xd->scale_factor[1], &xd->scale_factor_uv[1]);
+      setup_pre_planes(xd, NULL, second_cfg, mb_row, mb_col,
+                       xd->scale_factor, xd->scale_factor_uv);
       xd->corrupted |= second_cfg->corrupted;
     }
   }
@@ -860,10 +831,14 @@
     return;
 
   if (bsize > BLOCK_SIZE_MB16X16) {
+    int pl;
     // read the partition information
+    xd->left_seg_context = pc->left_seg_context + (mb_row & 3);
+    xd->above_seg_context = pc->above_seg_context + mb_col;
+    pl = partition_plane_context(xd, bsize);
     partition = treed_read(r, vp9_partition_tree,
-                           pc->fc.partition_prob[bsl - 1]);
-    pc->fc.partition_counts[bsl - 1][partition]++;
+                           pc->fc.partition_prob[pl]);
+    pc->fc.partition_counts[pl][partition]++;
   }
 
   switch (partition) {
@@ -902,6 +877,13 @@
     default:
       assert(0);
   }
+  // update partition context
+  if ((partition == PARTITION_SPLIT) && (bsize > BLOCK_SIZE_SB32X32))
+    return;
+
+  xd->left_seg_context = pc->left_seg_context + (mb_row & 3);
+  xd->above_seg_context = pc->above_seg_context + mb_col;
+  update_partition_context(xd, subsize, bsize);
 }
 
 /* Decode a row of Superblocks (4x4 region of MBs) */
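decode_modes_sb() now re-points the left/above partition contexts both before the partition read and before the post-recursion update. A rough standalone skeleton of that control flow, with a toy split rule in place of the coded partition symbol and simplified context bookkeeping (nothing below is the library's API):

/* Sketch: recursive superblock partitioning with per-edge context arrays,
   loosely mirroring the decode_modes_sb() flow (toy split rule, no bitstream). */
#include <stdio.h>
#include <string.h>

static char above_ctx[8];   /* stand-in for cm->above_seg_context, indexed by MB col */
static char left_ctx[4];    /* stand-in for cm->left_seg_context, indexed by row & 3 */

enum { PARTITION_NONE, PARTITION_SPLIT };

/* toy "read": keep splitting blocks whose top-left MB touches the frame origin */
static int toy_read_partition(int mb_row, int mb_col, int size_mbs) {
  return (size_mbs > 1 && mb_row == 0 && mb_col == 0) ? PARTITION_SPLIT
                                                      : PARTITION_NONE;
}

static void decode_sb(int mb_row, int mb_col, int size_mbs) {
  /* in the real decoder the left/above context pointers are re-pointed here,
     and the partition symbol is read with a context-derived probability set */
  const int partition = toy_read_partition(mb_row, mb_col, size_mbs);

  if (partition == PARTITION_SPLIT) {
    const int half = size_mbs / 2;
    decode_sb(mb_row,        mb_col,        half);
    decode_sb(mb_row,        mb_col + half, half);
    decode_sb(mb_row + half, mb_col,        half);
    decode_sb(mb_row + half, mb_col + half, half);
    return;   /* children recorded their own edges */
  }

  printf("leaf %dx%d MBs at (%d,%d)\n", size_mbs, size_mbs, mb_row, mb_col);

  /* record the reached size along this block's edges so that later neighbours
     can condition their own partition reads on it */
  memset(above_ctx + mb_col, size_mbs, size_mbs);
  memset(left_ctx + (mb_row & 3), size_mbs, size_mbs);
}

int main(void) {
  memset(above_ctx, 0, sizeof(above_ctx));
  memset(left_ctx, 0, sizeof(left_ctx));
  decode_sb(0, 0, 4);   /* one 64x64 superblock = a 4x4 group of MBs */
  return 0;
}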
@@ -913,6 +895,7 @@
        mb_row < pc->cur_tile_mb_row_end; mb_row += 4) {
     // For a SB there are 2 left contexts, each pertaining to a MB row within it
     vpx_memset(pc->left_context, 0, sizeof(pc->left_context));
+    vpx_memset(pc->left_seg_context, 0, sizeof(pc->left_seg_context));
     for (mb_col = pc->cur_tile_mb_col_start;
          mb_col < pc->cur_tile_mb_col_end; mb_col += 4) {
       decode_modes_sb(pbi, mb_row, mb_col, r, BLOCK_SIZE_SB64X64);
@@ -957,48 +940,37 @@
   xd->frame_type = pc->frame_type;
   xd->mode_info_context->mbmi.mode = DC_PRED;
   xd->mode_info_stride = pc->mode_info_stride;
-  xd->corrupted = 0;
 }
 
-#if CONFIG_CODE_NONZEROCOUNT
-static void read_nzc_probs_common(VP9_COMMON *cm,
-                                  vp9_reader *rd,
+#if CONFIG_CODE_ZEROGROUP
+static void read_zpc_probs_common(VP9_COMMON *cm,
+                                  vp9_reader* bc,
                                   TX_SIZE tx_size) {
-  int c, r, b, t;
-  int tokens, nodes;
-  vp9_prob *nzc_probs;
-  vp9_prob upd;
-
-  if (!get_nzc_used(tx_size)) return;
-  if (!vp9_read_bit(rd)) return;
+  int r, b, p, n;
+  vp9_zpc_probs *zpc_probs;
+  vp9_prob upd = ZPC_UPDATE_PROB;
+  if (!get_zpc_used(tx_size)) return;
+  if (!vp9_read_bit(bc)) return;
 
   if (tx_size == TX_32X32) {
-    tokens = NZC32X32_TOKENS;
-    nzc_probs = cm->fc.nzc_probs_32x32[0][0][0];
-    upd = NZC_UPDATE_PROB_32X32;
+    zpc_probs = &cm->fc.zpc_probs_32x32;
   } else if (tx_size == TX_16X16) {
-    tokens = NZC16X16_TOKENS;
-    nzc_probs = cm->fc.nzc_probs_16x16[0][0][0];
-    upd = NZC_UPDATE_PROB_16X16;
+    zpc_probs = &cm->fc.zpc_probs_16x16;
   } else if (tx_size == TX_8X8) {
-    tokens = NZC8X8_TOKENS;
-    nzc_probs = cm->fc.nzc_probs_8x8[0][0][0];
-    upd = NZC_UPDATE_PROB_8X8;
+    zpc_probs = &cm->fc.zpc_probs_8x8;
   } else {
-    tokens = NZC4X4_TOKENS;
-    nzc_probs = cm->fc.nzc_probs_4x4[0][0][0];
-    upd = NZC_UPDATE_PROB_4X4;
+    zpc_probs = &cm->fc.zpc_probs_4x4;
   }
-  nodes = tokens - 1;
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    for (r = 0; r < REF_TYPES; ++r) {
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        int offset = c * REF_TYPES * BLOCK_TYPES + r * BLOCK_TYPES + b;
-        int offset_nodes = offset * nodes;
-        for (t = 0; t < nodes; ++t) {
-          vp9_prob *p = &nzc_probs[offset_nodes + t];
-          if (vp9_read(rd, upd)) {
-            *p = read_prob_diff_update(rd, *p);
+  for (r = 0; r < REF_TYPES; ++r) {
+    for (b = 0; b < ZPC_BANDS; ++b) {
+      for (p = 0; p < ZPC_PTOKS; ++p) {
+        for (n = 0; n < ZPC_NODES; ++n) {
+          vp9_prob *q = &(*zpc_probs)[r][b][p][n];
+#if USE_ZPC_EXTRA == 0
+          if (n == 1) continue;
+#endif
+          if (vp9_read(bc, upd)) {
+            *q = read_prob_diff_update(bc, *q);
           }
         }
       }
@@ -1006,41 +978,17 @@
   }
 }
 
-static void read_nzc_pcat_probs(VP9_COMMON *cm, vp9_reader *r) {
-  int c, t, b;
-  vp9_prob upd = NZC_UPDATE_PROB_PCAT;
-  if (!(get_nzc_used(TX_4X4) || get_nzc_used(TX_8X8) ||
-        get_nzc_used(TX_16X16) || get_nzc_used(TX_32X32)))
-    return;
-  if (!vp9_read_bit(r)) {
-    return;
-  }
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    for (t = 0; t < NZC_TOKENS_EXTRA; ++t) {
-      int bits = vp9_extranzcbits[t + NZC_TOKENS_NOEXTRA];
-      for (b = 0; b < bits; ++b) {
-        vp9_prob *p = &cm->fc.nzc_pcat_probs[c][t][b];
-        if (vp9_read(r, upd)) {
-          *p = read_prob_diff_update(r, *p);
-        }
-      }
-    }
-  }
-}
-
-static void read_nzc_probs(VP9_COMMON *cm, vp9_reader *r) {
-  read_nzc_probs_common(cm, r, TX_4X4);
-  if (cm->txfm_mode != ONLY_4X4)
-    read_nzc_probs_common(cm, r, TX_8X8);
+static void read_zpc_probs(VP9_COMMON *cm,
+                           vp9_reader* bc) {
+  read_zpc_probs_common(cm, bc, TX_4X4);
+  if (cm->txfm_mode > ONLY_4X4)
+    read_zpc_probs_common(cm, bc, TX_8X8);
   if (cm->txfm_mode > ALLOW_8X8)
-    read_nzc_probs_common(cm, r, TX_16X16);
+    read_zpc_probs_common(cm, bc, TX_16X16);
   if (cm->txfm_mode > ALLOW_16X16)
-    read_nzc_probs_common(cm, r, TX_32X32);
-#ifdef NZC_PCAT_UPDATE
-  read_nzc_pcat_probs(cm, r);
-#endif
+    read_zpc_probs_common(cm, bc, TX_32X32);
 }
-#endif  // CONFIG_CODE_NONZEROCOUNT
+#endif  // CONFIG_CODE_ZEROGROUP
 
 static void read_coef_probs_common(VP9D_COMP *pbi,
                                    vp9_reader *r,
@@ -1059,11 +1007,7 @@
       for (j = 0; j < REF_TYPES; j++) {
         for (k = 0; k < COEF_BANDS; k++) {
           for (l = 0; l < PREV_COEF_CONTEXTS; l++) {
-#if CONFIG_CODE_NONZEROCOUNT
-            const int mstart = get_nzc_used(tx_size);
-#else
             const int mstart = 0;
-#endif
             if (l >= 3 && k == 0)
               continue;
 
@@ -1091,7 +1035,7 @@
 
   read_coef_probs_common(pbi, r, fc->coef_probs_4x4, TX_4X4);
 
-  if (mode != ONLY_4X4)
+  if (mode > ONLY_4X4)
     read_coef_probs_common(pbi, r, fc->coef_probs_8x8, TX_8X8);
 
   if (mode > ALLOW_8X8)
@@ -1114,10 +1058,12 @@
   memset(cm->mip, 0,
         (cm->mb_cols + 1) * (cm->mb_rows + 1) * sizeof(MODE_INFO));
   vp9_update_mode_info_border(cm, cm->mip);
+  vp9_update_mode_info_border(cm, cm->prev_mip);
 
   cm->mi = cm->mip + cm->mode_info_stride + 1;
   cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1;
   vp9_update_mode_info_in_image(cm, cm->mi);
+  vp9_update_mode_info_in_image(cm, cm->prev_mi);
 }
 
 static void setup_segmentation(VP9_COMMON *pc, MACROBLOCKD *xd, vp9_reader *r) {
@@ -1125,59 +1071,53 @@
 
   xd->update_mb_segmentation_map = 0;
   xd->update_mb_segmentation_data = 0;
+#if CONFIG_IMPLICIT_SEGMENTATION
+  xd->allow_implicit_segment_update = 0;
+#endif
 
   xd->segmentation_enabled = vp9_read_bit(r);
-  if (xd->segmentation_enabled) {
-    // Segmentation map update
-    xd->update_mb_segmentation_map = vp9_read_bit(r);
-    if (xd->update_mb_segmentation_map) {
-      for (i = 0; i < MB_FEATURE_TREE_PROBS; i++)
-        xd->mb_segment_tree_probs[i] = vp9_read_bit(r) ? vp9_read_prob(r)
-                                                       : MAX_PROB;
+  if (!xd->segmentation_enabled)
+    return;
 
-      pc->temporal_update = vp9_read_bit(r);
-      if (pc->temporal_update) {
-        const vp9_prob *p = xd->mb_segment_tree_probs;
-        vp9_prob *mispred_p = xd->mb_segment_mispred_tree_probs;
+  // Segmentation map update
+  xd->update_mb_segmentation_map = vp9_read_bit(r);
+#if CONFIG_IMPLICIT_SEGMENTATION
+  xd->allow_implicit_segment_update = vp9_read_bit(r);
+#endif
+  if (xd->update_mb_segmentation_map) {
+    for (i = 0; i < MB_SEG_TREE_PROBS; i++)
+      xd->mb_segment_tree_probs[i] = vp9_read_bit(r) ? vp9_read_prob(r)
+                                                     : MAX_PROB;
 
-        const int c0 =        p[0]  *        p[1];
-        const int c1 =        p[0]  * (256 - p[1]);
-        const int c2 = (256 - p[0]) *        p[2];
-        const int c3 = (256 - p[0]) * (256 - p[2]);
-
-        mispred_p[0] = get_binary_prob(c1, c2 + c3);
-        mispred_p[1] = get_binary_prob(c0, c2 + c3);
-        mispred_p[2] = get_binary_prob(c0 + c1, c3);
-        mispred_p[3] = get_binary_prob(c0 + c1, c2);
-
-        for (i = 0; i < PREDICTION_PROBS; i++)
-          pc->segment_pred_probs[i] = vp9_read_bit(r) ? vp9_read_prob(r)
-                                                      : MAX_PROB;
-      } else {
-        for (i = 0; i < PREDICTION_PROBS; i++)
-          pc->segment_pred_probs[i] = MAX_PROB;
-      }
+    pc->temporal_update = vp9_read_bit(r);
+    if (pc->temporal_update) {
+      for (i = 0; i < PREDICTION_PROBS; i++)
+        pc->segment_pred_probs[i] = vp9_read_bit(r) ? vp9_read_prob(r)
+                                                    : MAX_PROB;
+    } else {
+      for (i = 0; i < PREDICTION_PROBS; i++)
+        pc->segment_pred_probs[i] = MAX_PROB;
     }
+  }
 
-    // Segmentation data update
-    xd->update_mb_segmentation_data = vp9_read_bit(r);
-    if (xd->update_mb_segmentation_data) {
-      xd->mb_segment_abs_delta = vp9_read_bit(r);
+  // Segmentation data update
+  xd->update_mb_segmentation_data = vp9_read_bit(r);
+  if (xd->update_mb_segmentation_data) {
+    xd->mb_segment_abs_delta = vp9_read_bit(r);
 
-      vp9_clearall_segfeatures(xd);
+    vp9_clearall_segfeatures(xd);
 
-      for (i = 0; i < MAX_MB_SEGMENTS; i++) {
-        for (j = 0; j < SEG_LVL_MAX; j++) {
-          int data = 0;
-          const int feature_enabled = vp9_read_bit(r);
-          if (feature_enabled) {
-            vp9_enable_segfeature(xd, i, j);
-            data = decode_unsigned_max(r, vp9_seg_feature_data_max(j));
-            if (vp9_is_segfeature_signed(j))
-              data = vp9_read_and_apply_sign(r, data);
-          }
-          vp9_set_segdata(xd, i, j, data);
+    for (i = 0; i < MAX_MB_SEGMENTS; i++) {
+      for (j = 0; j < SEG_LVL_MAX; j++) {
+        int data = 0;
+        const int feature_enabled = vp9_read_bit(r);
+        if (feature_enabled) {
+          vp9_enable_segfeature(xd, i, j);
+          data = decode_unsigned_max(r, vp9_seg_feature_data_max(j));
+          if (vp9_is_segfeature_signed(j))
+            data = vp9_read_and_apply_sign(r, data);
         }
+        vp9_set_segdata(xd, i, j, data);
       }
     }
   }
@@ -1214,8 +1154,8 @@
   // Read in loop filter deltas applied at the MB level based on mode or ref
   // frame.
   xd->mode_ref_lf_delta_update = 0;
-  xd->mode_ref_lf_delta_enabled = vp9_read_bit(r);
 
+  xd->mode_ref_lf_delta_enabled = vp9_read_bit(r);
   if (xd->mode_ref_lf_delta_enabled) {
     xd->mode_ref_lf_delta_update = vp9_read_bit(r);
     if (xd->mode_ref_lf_delta_update) {
@@ -1251,6 +1191,11 @@
   mb_init_dequantizer(pbi, &pbi->mb);  // MB level dequantizer setup
 }
 
+static INTERPOLATIONFILTERTYPE read_mcomp_filter_type(vp9_reader *r) {
+  return vp9_read_bit(r) ? SWITCHABLE
+                         : vp9_read_literal(r, 2);
+}
+
 static const uint8_t *read_frame_size(VP9_COMMON *const pc, const uint8_t *data,
                                       const uint8_t *data_end,
                                       int *width, int *height) {
@@ -1356,18 +1301,16 @@
   vp9_zero(fc->interintra_counts);
 #endif
 
-#if CONFIG_CODE_NONZEROCOUNT
-  vp9_copy(fc->pre_nzc_probs_4x4, fc->nzc_probs_4x4);
-  vp9_copy(fc->pre_nzc_probs_8x8, fc->nzc_probs_8x8);
-  vp9_copy(fc->pre_nzc_probs_16x16, fc->nzc_probs_16x16);
-  vp9_copy(fc->pre_nzc_probs_32x32, fc->nzc_probs_32x32);
-  vp9_copy(fc->pre_nzc_pcat_probs, fc->nzc_pcat_probs);
+#if CONFIG_CODE_ZEROGROUP
+  vp9_copy(fc->pre_zpc_probs_4x4, fc->zpc_probs_4x4);
+  vp9_copy(fc->pre_zpc_probs_8x8, fc->zpc_probs_8x8);
+  vp9_copy(fc->pre_zpc_probs_16x16, fc->zpc_probs_16x16);
+  vp9_copy(fc->pre_zpc_probs_32x32, fc->zpc_probs_32x32);
 
-  vp9_zero(fc->nzc_counts_4x4);
-  vp9_zero(fc->nzc_counts_8x8);
-  vp9_zero(fc->nzc_counts_16x16);
-  vp9_zero(fc->nzc_counts_32x32);
-  vp9_zero(fc->nzc_pcat_counts);
+  vp9_zero(fc->zpc_counts_4x4);
+  vp9_zero(fc->zpc_counts_8x8);
+  vp9_zero(fc->zpc_counts_16x16);
+  vp9_zero(fc->zpc_counts_32x32);
 #endif
 }
 
@@ -1394,7 +1337,10 @@
   pc->tile_rows    = 1 << pc->log2_tile_rows;
 
   vpx_memset(pc->above_context, 0,
-             sizeof(ENTROPY_CONTEXT_PLANES) * pc->mb_cols);
+             sizeof(ENTROPY_CONTEXT_PLANES) * mb_cols_aligned_to_sb(pc));
+
+  vpx_memset(pc->above_seg_context, 0, sizeof(PARTITION_CONTEXT) *
+                                       mb_cols_aligned_to_sb(pc));
 
   if (pbi->oxcf.inv_tile_order) {
     const int n_cols = pc->tile_columns;
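Both above_context and above_seg_context are now cleared over mb_cols_aligned_to_sb(pc) entries rather than the raw column count. Presumably that helper rounds the macroblock-column count up to a whole 64x64 superblock; a sketch under that assumption:

/* Sketch: round a macroblock-column count up to a whole number of 64x64
   superblocks (4 MBs) -- the behaviour assumed for mb_cols_aligned_to_sb(). */
#include <stdio.h>

static int aligned_to_sb(int mb_cols) {
  return (mb_cols + 3) & ~3;   /* next multiple of 4 */
}

int main(void) {
  const int widths[] = { 11, 12, 45, 120 };   /* MBs per row */
  int i;
  for (i = 0; i < 4; ++i)
    printf("%3d MB cols -> %3d context entries\n", widths[i],
           aligned_to_sb(widths[i]));
  return 0;
}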
@@ -1459,12 +1405,11 @@
   const uint8_t *data = pbi->source;
   const uint8_t *data_end = data + pbi->source_sz;
   size_t first_partition_size = 0;
-  int i, corrupt_tokens = 0;
-
-  // printf("Decoding frame %d\n", pc->current_video_frame);
+  YV12_BUFFER_CONFIG *new_fb = &pc->yv12_fb[pc->new_fb_idx];
+  int i;
 
   xd->corrupted = 0;  // start with no corruption of current frame
-  pc->yv12_fb[pc->new_fb_idx].corrupted = 0;
+  new_fb->corrupted = 0;
 
   if (data_end - data < 3) {
     vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, "Truncated packet");
@@ -1507,8 +1452,7 @@
   init_frame(pbi);
 
   // Reset the frame pointers to the current frame size
-  vp8_yv12_realloc_frame_buffer(&pc->yv12_fb[pc->new_fb_idx],
-                                pc->width, pc->height,
+  vp8_yv12_realloc_frame_buffer(new_fb, pc->width, pc->height,
                                 VP9BORDERINPIXELS);
 
   if (vp9_reader_init(&header_bc, data, first_partition_size))
@@ -1519,10 +1463,11 @@
   pc->clamp_type = (CLAMP_TYPE)vp9_read_bit(&header_bc);
   pc->error_resilient_mode = vp9_read_bit(&header_bc);
 
+  xd->lossless = vp9_read_bit(&header_bc);
+
   setup_loopfilter(pc, xd, &header_bc);
 
-  // Dummy read for now
-  vp9_read_literal(&header_bc, 2);
+  vp9_read_literal(&header_bc, 2);  // unused
 
   setup_quantization(pbi, &header_bc);
 
@@ -1544,14 +1489,8 @@
 
     pc->ref_frame_sign_bias[GOLDEN_FRAME] = vp9_read_bit(&header_bc);
     pc->ref_frame_sign_bias[ALTREF_FRAME] = vp9_read_bit(&header_bc);
-
-    // Is high precision mv allowed
     xd->allow_high_precision_mv = vp9_read_bit(&header_bc);
-
-    // Read the type of subpel filter to use
-    pc->mcomp_filter_type = vp9_read_bit(&header_bc)
-                                ? SWITCHABLE
-                                : vp9_read_literal(&header_bc, 2);
+    pc->mcomp_filter_type = read_mcomp_filter_type(&header_bc);
 
 #if CONFIG_COMP_INTERINTRA_PRED
     pc->use_interintra = vp9_read_bit(&header_bc);
@@ -1579,6 +1518,7 @@
     pc->refresh_entropy_probs = 0;
     pc->frame_parallel_decoding_mode = 1;
   }
+
   pc->frame_context_idx = vp9_read_literal(&header_bc, NUM_FRAME_CONTEXTS_LG2);
   vpx_memcpy(&pc->fc, &pc->frame_contexts[pc->frame_context_idx],
              sizeof(pc->fc));
@@ -1587,13 +1527,7 @@
 
   setup_pred_probs(pc, &header_bc);
 
-  xd->lossless = vp9_read_bit(&header_bc);
-  pc->txfm_mode = xd->lossless ? ONLY_4X4 : read_txfm_mode(&header_bc);
-  if (pc->txfm_mode == TX_MODE_SELECT) {
-    pc->prob_tx[0] = vp9_read_prob(&header_bc);
-    pc->prob_tx[1] = vp9_read_prob(&header_bc);
-    pc->prob_tx[2] = vp9_read_prob(&header_bc);
-  }
+  setup_txfm_mode(pc, xd->lossless, &header_bc);
 
   // Read inter mode probability context updates
   if (pc->frame_type != KEY_FRAME) {
@@ -1604,32 +1538,21 @@
           pc->fc.vp9_mode_contexts[i][j] = vp9_read_prob(&header_bc);
   }
 #if CONFIG_MODELCOEFPROB
-  if (pc->frame_type == KEY_FRAME) {
+  if (pc->frame_type == KEY_FRAME)
     vp9_default_coef_probs(pc);
-  }
 #endif
 
-  if (0) {
-    FILE *z = fopen("decodestats.stt", "a");
-    fprintf(z, "%6d F:%d,R:%d,Q:%d\n",
-            pc->current_video_frame,
-            pc->frame_type,
-            pbi->refresh_frame_flags,
-            pc->base_qindex);
-    fclose(z);
-  }
-
   update_frame_context(pbi);
 
   read_coef_probs(pbi, &header_bc);
-#if CONFIG_CODE_NONZEROCOUNT
-  read_nzc_probs(&pbi->common, &header_bc);
+#if CONFIG_CODE_ZEROGROUP
+  read_zpc_probs(pc, &header_bc);
 #endif
 
   // Initialize xd pointers. Any reference should do for xd->pre, so use 0.
-  vpx_memcpy(&xd->pre, &pc->yv12_fb[pc->active_ref_idx[0]],
-             sizeof(YV12_BUFFER_CONFIG));
-  setup_dst_planes(xd, &pc->yv12_fb[pc->new_fb_idx], 0, 0);
+  setup_pre_planes(xd, &pc->yv12_fb[pc->active_ref_idx[0]], NULL,
+                   0, 0, NULL, NULL);
+  setup_dst_planes(xd, new_fb, 0, 0);
 
   // Create the segmentation map structure and set to 0
   if (!pc->last_frame_seg_map)
@@ -1637,7 +1560,7 @@
                     vpx_calloc((pc->mb_rows * pc->mb_cols), 1));
 
   // set up the new frame for intra coded blocks
-  vp9_setup_intra_recon(&pc->yv12_fb[pc->new_fb_idx]);
+  vp9_setup_intra_recon(new_fb);
 
   vp9_setup_block_dptrs(xd);
   vp9_build_block_doffsets(xd);
@@ -1652,56 +1575,45 @@
   vp9_decode_mode_mvs_init(pbi, &header_bc);
 
   decode_tiles(pbi, data, first_partition_size, &header_bc, &residual_bc);
-  corrupt_tokens |= xd->corrupted;
 
-  // keep track of the last coded dimensions
   pc->last_width = pc->width;
   pc->last_height = pc->height;
 
-  // Collect information about decoder corruption.
-  // 1. Check first boolean decoder for errors.
-  // 2. Check the macroblock information
-  pc->yv12_fb[pc->new_fb_idx].corrupted = vp9_reader_has_error(&header_bc) |
-                                          corrupt_tokens;
+  new_fb->corrupted = vp9_reader_has_error(&header_bc) | xd->corrupted;
 
   if (!pbi->decoded_key_frame) {
-    if (pc->frame_type == KEY_FRAME && !pc->yv12_fb[pc->new_fb_idx].corrupted)
+    if (pc->frame_type == KEY_FRAME && !new_fb->corrupted)
       pbi->decoded_key_frame = 1;
     else
-      vpx_internal_error(&pbi->common.error, VPX_CODEC_CORRUPT_FRAME,
+      vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
                          "A stream must start with a complete key frame");
   }
 
+  // Adaptation
   if (!pc->error_resilient_mode && !pc->frame_parallel_decoding_mode) {
     vp9_adapt_coef_probs(pc);
-#if CONFIG_CODE_NONZEROCOUNT
-    vp9_adapt_nzc_probs(pc);
+#if CONFIG_CODE_ZEROGROUP
+    vp9_adapt_zpc_probs(pc);
 #endif
-  }
-
-  if (pc->frame_type != KEY_FRAME) {
-    if (!pc->error_resilient_mode && !pc->frame_parallel_decoding_mode) {
+    if (pc->frame_type != KEY_FRAME) {
       vp9_adapt_mode_probs(pc);
       vp9_adapt_nmv_probs(pc, xd->allow_high_precision_mv);
-      vp9_adapt_mode_context(&pbi->common);
+      vp9_adapt_mode_context(pc);
     }
   }
 
+#if CONFIG_IMPLICIT_SEGMENTATION
+  // If signalled at the frame level, apply implicit updates to the segment map.
+  if (!pc->error_resilient_mode && xd->allow_implicit_segment_update) {
+    vp9_implicit_segment_map_update(pc);
+  }
+#endif
+
   if (pc->refresh_entropy_probs) {
     vpx_memcpy(&pc->frame_contexts[pc->frame_context_idx], &pc->fc,
                sizeof(pc->fc));
   }
 
-#ifdef PACKET_TESTING
-  {
-    FILE *f = fopen("decompressor.VP8", "ab");
-    unsigned int size = residual_bc.pos + header_bc.pos + 8;
-    fwrite((void *) &size, 4, 1, f);
-    fwrite((void *) pbi->Source, size, 1, f);
-    fclose(f);
-  }
-#endif
-
   *p_data_end = vp9_reader_find_end(&residual_bc);
   return 0;
 }
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index 7519108..3099b93 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -60,25 +60,29 @@
 
 DECLARE_ALIGNED(16, extern const uint8_t, vp9_norm[256]);
 
+#if CONFIG_CODE_ZEROGROUP
+#define ZEROGROUP_ADVANCE()                \
+  do {                                     \
+    token_cache[scan[c]] = ZERO_TOKEN;     \
+    is_last_zero[o] = 1;                   \
+    c++;                                   \
+  } while (0)
+#define INCREMENT_COUNT(token)             \
+  do {                                     \
+    coef_counts[type][ref][get_coef_band(scan, txfm_size, c)] \
+               [pt][token]++;     \
+    token_cache[scan[c]] = token; \
+    is_last_zero[o] = (token == ZERO_TOKEN);    \
+  } while (0)
+#else
 #define INCREMENT_COUNT(token)               \
   do {                                       \
     coef_counts[type][ref][get_coef_band(scan, txfm_size, c)] \
                [pt][token]++;     \
-    token_cache[c] = token; \
-    pt = vp9_get_coef_context(scan, nb, pad, token_cache,     \
-                              c + 1, default_eob); \
+    token_cache[scan[c]] = token; \
   } while (0)
+#endif
 
-#if CONFIG_CODE_NONZEROCOUNT
-#define WRITE_COEF_CONTINUE(val, token)                       \
-  {                                                           \
-    qcoeff_ptr[scan[c]] = vp9_read_and_apply_sign(r, val);    \
-    INCREMENT_COUNT(token);                                   \
-    c++;                                                      \
-    nzc++;                                                    \
-    continue;                                                 \
-  }
-#else
 #define WRITE_COEF_CONTINUE(val, token)                  \
   {                                                      \
     qcoeff_ptr[scan[c]] = vp9_read_and_apply_sign(r, val) * dq[c > 0]; \
@@ -86,7 +90,12 @@
     c++;                                                 \
     continue;                                            \
   }
-#endif  // CONFIG_CODE_NONZEROCOUNT
+
+#define WRITE_COEF_ONE()                                 \
+{                                                        \
+  qcoeff_ptr[scan[c]] = vp9_read_and_apply_sign(br, 1);  \
+  INCREMENT_COUNT(ONE_TOKEN);                            \
+}
 
 #define ADJUST_COEF(prob, bits_count)  \
   do {                                 \
@@ -108,14 +117,21 @@
   vp9_prob *prob;
   vp9_coeff_count *coef_counts;
   const int ref = xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME;
-#if CONFIG_CODE_NONZEROCOUNT
-  const int nzc_used = get_nzc_used(txfm_size);
-  uint16_t nzc = 0;
-  uint16_t nzc_expected =
-      nzc_used ? xd->mode_info_context->mbmi.nzcs[block_idx] : 0;
+  TX_TYPE tx_type = DCT_DCT;
+#if CONFIG_CODE_ZEROGROUP
+  int is_eoo[3] = {0, 0, 0};
+  int is_last_zero[3] = {0, 0, 0};
+  int o, rc;
+  vp9_zpc_probs *zpc_probs;
+  vp9_zpc_count *zpc_count;
+  vp9_prob *zprobs;
+  int eoo = 0, use_eoo;
 #endif
   const int *scan, *nb;
   uint8_t token_cache[1024];
+#if CONFIG_CODE_ZEROGROUP
+  vpx_memset(token_cache, UNKNOWN_TOKEN, sizeof(token_cache));
+#endif
 
   if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64) {
     aidx = vp9_block2above_sb64[txfm_size][block_idx];
@@ -147,24 +163,18 @@
   switch (txfm_size) {
     default:
     case TX_4X4: {
-      const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
-                              get_tx_type_4x4(xd, block_idx) : DCT_DCT;
-      switch (tx_type) {
-        default:
-          scan = vp9_default_zig_zag1d_4x4;
-          break;
-        case ADST_DCT:
-          scan = vp9_row_scan_4x4;
-          break;
-        case DCT_ADST:
-          scan = vp9_col_scan_4x4;
-          break;
-      }
+      tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
+          get_tx_type_4x4(xd, block_idx) : DCT_DCT;
+      scan = get_scan_4x4(tx_type);
       above_ec = A0[aidx] != 0;
       left_ec = L0[lidx] != 0;
       coef_probs  = fc->coef_probs_4x4;
       coef_counts = fc->coef_counts_4x4;
       default_eob = 16;
+#if CONFIG_CODE_ZEROGROUP
+      zpc_probs = &(fc->zpc_probs_4x4);
+      zpc_count = &(fc->zpc_counts_4x4);
+#endif
       break;
     }
     case TX_8X8: {
@@ -172,24 +182,18 @@
       const int sz = 3 + mb_width_log2(sb_type);
       const int x = block_idx & ((1 << sz) - 1);
       const int y = block_idx - x;
-      const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
-                              get_tx_type_8x8(xd, y + (x >> 1)) : DCT_DCT;
-      switch (tx_type) {
-        default:
-          scan = vp9_default_zig_zag1d_8x8;
-          break;
-        case ADST_DCT:
-          scan = vp9_row_scan_8x8;
-          break;
-        case DCT_ADST:
-          scan = vp9_col_scan_8x8;
-          break;
-      }
+      tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
+          get_tx_type_8x8(xd, y + (x >> 1)) : DCT_DCT;
+      scan = get_scan_8x8(tx_type);
       coef_probs  = fc->coef_probs_8x8;
       coef_counts = fc->coef_counts_8x8;
       above_ec = (A0[aidx] + A0[aidx + 1]) != 0;
       left_ec  = (L0[lidx] + L0[lidx + 1]) != 0;
       default_eob = 64;
+#if CONFIG_CODE_ZEROGROUP
+      zpc_probs = &(fc->zpc_probs_8x8);
+      zpc_count = &(fc->zpc_counts_8x8);
+#endif
       break;
     }
     case TX_16X16: {
@@ -197,19 +201,9 @@
       const int sz = 4 + mb_width_log2(sb_type);
       const int x = block_idx & ((1 << sz) - 1);
       const int y = block_idx - x;
-      const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
-                              get_tx_type_16x16(xd, y + (x >> 2)) : DCT_DCT;
-      switch (tx_type) {
-        default:
-          scan = vp9_default_zig_zag1d_16x16;
-          break;
-        case ADST_DCT:
-          scan = vp9_row_scan_16x16;
-          break;
-        case DCT_ADST:
-          scan = vp9_col_scan_16x16;
-          break;
-      }
+      tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
+          get_tx_type_16x16(xd, y + (x >> 2)) : DCT_DCT;
+      scan = get_scan_16x16(tx_type);
       coef_probs  = fc->coef_probs_16x16;
       coef_counts = fc->coef_counts_16x16;
       if (type == PLANE_TYPE_UV) {
@@ -222,6 +216,10 @@
         left_ec  = (L0[lidx] + L0[lidx + 1] + L0[lidx + 2] + L0[lidx + 3]) != 0;
       }
       default_eob = 256;
+#if CONFIG_CODE_ZEROGROUP
+      zpc_probs = &(fc->zpc_probs_16x16);
+      zpc_count = &(fc->zpc_counts_16x16);
+#endif
       break;
     }
     case TX_32X32:
@@ -248,6 +246,10 @@
                     L1[lidx] + L1[lidx + 1] + L1[lidx + 2] + L1[lidx + 3]) != 0;
       }
       default_eob = 1024;
+#if CONFIG_CODE_ZEROGROUP
+      zpc_probs = &fc->zpc_probs_32x32;
+      zpc_count = &fc->zpc_counts_32x32;
+#endif
       break;
   }
 
@@ -256,35 +258,67 @@
 
   while (1) {
     int val;
+    int band;
     const uint8_t *cat6 = cat6_prob;
-
     if (c >= seg_eob)
       break;
-#if CONFIG_CODE_NONZEROCOUNT
-    if (nzc_used && nzc == nzc_expected)
+    if (c)
+      pt = vp9_get_coef_context(scan, nb, pad, token_cache,
+                                c, default_eob);
+    band = get_coef_band(scan, txfm_size, c);
+    prob = coef_probs[type][ref][band][pt];
+    fc->eob_branch_counts[txfm_size][type][ref][band][pt]++;
+    if (!vp9_read(r, prob[EOB_CONTEXT_NODE]))
       break;
+#if CONFIG_CODE_ZEROGROUP
+    rc = scan[c];
+    o = vp9_get_orientation(rc, txfm_size);
+    if (token_cache[rc] == ZERO_TOKEN || is_eoo[o]) {
+      coef_counts[type][ref][band][pt][ZERO_TOKEN]++;
+      ZEROGROUP_ADVANCE();
+      goto SKIP_START;
+    }
 #endif
-    prob = coef_probs[type][ref][get_coef_band(scan, txfm_size, c)][pt];
-    fc->eob_branch_counts[txfm_size][type][ref]
-                         [get_coef_band(scan, txfm_size, c)][pt]++;
-#if CONFIG_CODE_NONZEROCOUNT
-    if (!nzc_used)
-#endif
-      if (!vp9_read(r, prob[EOB_CONTEXT_NODE]))
-        break;
+
 SKIP_START:
     if (c >= seg_eob)
       break;
-#if CONFIG_CODE_NONZEROCOUNT
-    if (nzc_used && nzc == nzc_expected)
-      break;
-    // decode zero node only if there are zeros left
-    if (!nzc_used || seg_eob - nzc_expected - c + nzc > 0)
+    if (c)
+      pt = vp9_get_coef_context(scan, nb, pad, token_cache,
+                                c, default_eob);
+    band = get_coef_band(scan, txfm_size, c);
+    prob = coef_probs[type][ref][band][pt];
+#if CONFIG_CODE_ZEROGROUP
+    rc = scan[c];
+    o = vp9_get_orientation(rc, txfm_size);
+    if (token_cache[rc] == ZERO_TOKEN || is_eoo[o]) {
+      ZEROGROUP_ADVANCE();
+      goto SKIP_START;
+    }
+    zprobs = (*zpc_probs)[ref]
+             [coef_to_zpc_band(band)]
+             [coef_to_zpc_ptok(pt)];
 #endif
     if (!vp9_read(r, prob[ZERO_CONTEXT_NODE])) {
+#if CONFIG_CODE_ZEROGROUP
+      eoo = 0;
+#if USE_ZPC_EOORIENT == 1
+      use_eoo = vp9_use_eoo(c, seg_eob, scan, txfm_size, is_last_zero, is_eoo);
+#else
+      use_eoo = 0;
+#endif
+      if (use_eoo) {
+        eoo = !vp9_read(r, zprobs[0]);
+        ++(*zpc_count)[ref]
+                      [coef_to_zpc_band(band)]
+                      [coef_to_zpc_ptok(pt)][0][!eoo];
+        if (eoo) {
+          is_eoo[o] = 1;
+        }
+      }
+#endif
       INCREMENT_COUNT(ZERO_TOKEN);
       ++c;
-      prob = coef_probs[type][ref][get_coef_band(scan, txfm_size, c)][pt];
       goto SKIP_START;
     }
     // ONE_CONTEXT_NODE_0_
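The detokenizer now keys token_cache by the raster position scan[c] and recomputes the context pt at the top of each loop iteration rather than after every write. A rough sketch of the idea, with a made-up neighbour rule and energy mapping standing in for vp9_get_coef_context() and the nb table:

/* Sketch: the context for the coefficient at scan position c is derived from
   tokens already stored for its raster-space neighbours. The neighbour rule
   and the 0/1/2 "energy" mapping below are illustrative, not the library's. */
#include <stdio.h>
#include <string.h>

enum { ZERO_TOKEN = 0, ONE_TOKEN = 1, TWO_TOKEN = 2 };

static int energy(int token) {             /* collapse tokens to 0, 1 or 2 */
  return token > TWO_TOKEN ? 2 : token;
}

int main(void) {
  /* 4x4 zigzag scan: maps coded order -> raster position */
  static const int scan[16] =
      { 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 };
  unsigned char token_cache[16];
  int c;

  memset(token_cache, 0, sizeof(token_cache));

  for (c = 0; c < 16; ++c) {
    const int rc = scan[c];                /* raster index of this coefficient */
    const int above = rc >= 4 ? token_cache[rc - 4] : 0;
    const int left  = (rc & 3) ? token_cache[rc - 1] : 0;
    const int pt = (energy(above) + energy(left) + 1) >> 1;   /* 0..2 context */

    /* pretend we decoded a ONE_TOKEN for the first three coefficients */
    const int token = c < 3 ? ONE_TOKEN : ZERO_TOKEN;
    token_cache[rc] = (unsigned char)token;   /* keyed by raster position */
    printf("c=%2d rc=%2d ctx=%d token=%d\n", c, rc, pt, token);
  }
  return 0;
}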
@@ -347,18 +381,9 @@
     WRITE_COEF_CONTINUE(val, DCT_VAL_CATEGORY6);
   }
 
-#if CONFIG_CODE_NONZEROCOUNT
-  if (!nzc_used)
-#endif
-    if (c < seg_eob)
-      coef_counts[type][ref][get_coef_band(scan, txfm_size, c)]
-                 [pt][DCT_EOB_TOKEN]++;
-#if CONFIG_CODE_NONZEROCOUNT
-  if (!nzc_used)
-    xd->mode_info_context->mbmi.nzcs[block_idx] = nzc;
-  else
-    assert(nzc == nzc_expected);
-#endif
+  if (c < seg_eob)
+    coef_counts[type][ref][get_coef_band(scan, txfm_size, c)]
+        [pt][DCT_EOB_TOKEN]++;
 
   A0[aidx] = L0[lidx] = c > 0;
   if (txfm_size >= TX_8X8) {
diff --git a/vp9/decoder/vp9_onyxd_if.c b/vp9/decoder/vp9_onyxd_if.c
index b64b7e4..a07a8fd 100644
--- a/vp9/decoder/vp9_onyxd_if.c
+++ b/vp9/decoder/vp9_onyxd_if.c
@@ -365,20 +365,23 @@
 
   vp9_clear_system_state();
 
+  cm->last_show_frame = cm->show_frame;
   if (cm->show_frame) {
-    vpx_memcpy(cm->prev_mip, cm->mip,
-               (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));
-  } else {
-    vpx_memset(cm->prev_mip, 0,
-               (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));
+    // current mip will be the prev_mip for the next frame
+    MODE_INFO *temp = cm->prev_mip;
+    cm->prev_mip = cm->mip;
+    cm->mip = temp;
+
+    // update the upper left visible macroblock ptrs
+    cm->mi = cm->mip + cm->mode_info_stride + 1;
+    cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1;
+
+    cm->current_video_frame++;
   }
 
   /*vp9_print_modes_and_motion_vectors(cm->mi, cm->mb_rows,cm->mb_cols,
                                        cm->current_video_frame);*/
 
-  if (cm->show_frame)
-    cm->current_video_frame++;
-
   pbi->ready_for_new_data = 0;
   pbi->last_time_stamp = time_stamp;
   pbi->source_sz = 0;
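Rather than copying (or zeroing) the whole MODE_INFO array into prev_mip after a shown frame, the decoder now swaps the two base pointers and re-derives the pointers into the visible area. A minimal standalone sketch of that pattern (struct and sizes invented for illustration):

/* Sketch: ping-pong two mode-info buffers by swapping base pointers,
   then re-derive the "visible" pointers that skip the 1-entry border. */
#include <stdio.h>

typedef struct { int dummy; } MODE_INFO_SKETCH;

int main(void) {
  MODE_INFO_SKETCH buf_a[6 * 6], buf_b[6 * 6];   /* (cols+1)*(rows+1) incl. border */
  MODE_INFO_SKETCH *mip = buf_a, *prev_mip = buf_b;
  MODE_INFO_SKETCH *mi, *prev_mi, *tmp;
  const int stride = 6;

  /* ... decode a frame into mip ... */

  /* frame was shown: current becomes previous, no memcpy needed */
  tmp = prev_mip;
  prev_mip = mip;
  mip = tmp;

  /* upper-left visible entry sits one row plus one column past the border */
  mi = mip + stride + 1;
  prev_mi = prev_mip + stride + 1;

  printf("swapped: mi=%p prev_mi=%p\n", (void *)mi, (void *)prev_mi);
  return 0;
}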
diff --git a/vp9/encoder/vp9_asm_enc_offsets.c b/vp9/encoder/vp9_asm_enc_offsets.c
index 1a770dc..d8e844e 100644
--- a/vp9/encoder/vp9_asm_enc_offsets.c
+++ b/vp9/encoder/vp9_asm_enc_offsets.c
@@ -20,15 +20,6 @@
 BEGIN
 
 /* regular quantize */
-DEFINE(vp9_block_coeff,                         offsetof(BLOCK, coeff));
-DEFINE(vp9_block_zbin,                          offsetof(BLOCK, zbin));
-DEFINE(vp9_block_round,                         offsetof(BLOCK, round));
-DEFINE(vp9_block_quant,                         offsetof(BLOCK, quant));
-DEFINE(vp9_block_quant_fast,                    offsetof(BLOCK, quant_fast));
-DEFINE(vp9_block_zbin_extra,                    offsetof(BLOCK, zbin_extra));
-DEFINE(vp9_block_zrun_zbin_boost,               offsetof(BLOCK, zrun_zbin_boost));
-DEFINE(vp9_block_quant_shift,                   offsetof(BLOCK, quant_shift));
-
 DEFINE(vp9_blockd_dequant,                      offsetof(BLOCKD, dequant));
 
 END
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 95e05fd..df58a91 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -50,21 +50,15 @@
 extern unsigned int active_section;
 #endif
 
-#if CONFIG_CODE_NONZEROCOUNT
-#ifdef NZC_STATS
-unsigned int nzc_stats_4x4[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                          [NZC4X4_TOKENS];
-unsigned int nzc_stats_8x8[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                          [NZC8X8_TOKENS];
-unsigned int nzc_stats_16x16[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                          [NZC16X16_TOKENS];
-unsigned int nzc_stats_32x32[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                          [NZC32X32_TOKENS];
-unsigned int nzc_pcat_stats[MAX_NZC_CONTEXTS][NZC_TOKENS_EXTRA]
-                          [NZC_BITS_EXTRA][2];
-void init_nzcstats();
-void update_nzcstats(VP9_COMMON *const cm);
-void print_nzcstats();
+#if CONFIG_CODE_ZEROGROUP
+#ifdef ZPC_STATS
+vp9_zpc_count zpc_stats_4x4;
+vp9_zpc_count zpc_stats_8x8;
+vp9_zpc_count zpc_stats_16x16;
+vp9_zpc_count zpc_stats_32x32;
+void init_zpcstats();
+void update_zpcstats(VP9_COMMON *const cm);
+void print_zpcstats();
 #endif
 #endif
 
@@ -427,24 +421,42 @@
     const unsigned char *pp = p->context_tree;
     int v = a->value;
     int n = a->len;
+    int ncount = n;
 
     if (t == EOSB_TOKEN)
     {
       ++p;
       break;
     }
+    assert(pp != 0);
+#if CONFIG_CODE_ZEROGROUP
+    if (t == ZPC_ISOLATED || t == ZPC_EOORIENT) {
+      assert((p - 1)->token == ZERO_TOKEN);
+      encode_bool(bc, t == ZPC_ISOLATED, *pp);
+      ++p;
+      continue;
+    } else if (p->skip_coef_val) {
+      assert(p->skip_eob_node == 0);
+      assert(t == DCT_EOB_TOKEN || t == ZERO_TOKEN);
+      encode_bool(bc, t == ZERO_TOKEN, *pp);
+      ++p;
+      continue;
+    }
+#endif
 
     /* skip one or two nodes */
     if (p->skip_eob_node) {
       n -= p->skip_eob_node;
       i = 2 * p->skip_eob_node;
+      ncount -= p->skip_eob_node;
     }
 
     do {
       const int bb = (v >> --n) & 1;
       vp9_write(bc, bb, pp[i >> 1]);
       i = vp9_coef_tree[i + bb];
-    } while (n);
+      ncount--;
+    } while (n && ncount);
 
 
     if (b->base_val) {
@@ -525,51 +537,54 @@
       case 0:
         vp9_write(bc, 0, xd->mb_segment_tree_probs[0]);
         vp9_write(bc, 0, xd->mb_segment_tree_probs[1]);
+        vp9_write(bc, 0, xd->mb_segment_tree_probs[2]);
         break;
       case 1:
         vp9_write(bc, 0, xd->mb_segment_tree_probs[0]);
-        vp9_write(bc, 1, xd->mb_segment_tree_probs[1]);
+        vp9_write(bc, 0, xd->mb_segment_tree_probs[1]);
+        vp9_write(bc, 1, xd->mb_segment_tree_probs[2]);
         break;
       case 2:
-        vp9_write(bc, 1, xd->mb_segment_tree_probs[0]);
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[2]);
+        vp9_write(bc, 0, xd->mb_segment_tree_probs[0]);
+        vp9_write(bc, 1, xd->mb_segment_tree_probs[1]);
+        vp9_write(bc, 0, xd->mb_segment_tree_probs[3]);
         break;
       case 3:
+        vp9_write(bc, 0, xd->mb_segment_tree_probs[0]);
+        vp9_write(bc, 1, xd->mb_segment_tree_probs[1]);
+        vp9_write(bc, 1, xd->mb_segment_tree_probs[3]);
+        break;
+      case 4:
         vp9_write(bc, 1, xd->mb_segment_tree_probs[0]);
-        vp9_write(bc, 1, xd->mb_segment_tree_probs[2]);
+        vp9_write(bc, 0, xd->mb_segment_tree_probs[4]);
+        vp9_write(bc, 0, xd->mb_segment_tree_probs[5]);
+        break;
+      case 5:
+        vp9_write(bc, 1, xd->mb_segment_tree_probs[0]);
+        vp9_write(bc, 0, xd->mb_segment_tree_probs[4]);
+        vp9_write(bc, 1, xd->mb_segment_tree_probs[5]);
+        break;
+      case 6:
+        vp9_write(bc, 1, xd->mb_segment_tree_probs[0]);
+        vp9_write(bc, 1, xd->mb_segment_tree_probs[4]);
+        vp9_write(bc, 0, xd->mb_segment_tree_probs[6]);
+        break;
+      case 7:
+        vp9_write(bc, 1, xd->mb_segment_tree_probs[0]);
+        vp9_write(bc, 1, xd->mb_segment_tree_probs[4]);
+        vp9_write(bc, 1, xd->mb_segment_tree_probs[6]);
         break;
 
         // TRAP.. This should not happen
       default:
         vp9_write(bc, 0, xd->mb_segment_tree_probs[0]);
         vp9_write(bc, 0, xd->mb_segment_tree_probs[1]);
+        vp9_write(bc, 0, xd->mb_segment_tree_probs[2]);
         break;
     }
   }
 }
 
-static void write_mb_segid_except(VP9_COMMON *cm,
-                                  vp9_writer *bc,
-                                  const MB_MODE_INFO *mi,
-                                  const MACROBLOCKD *xd,
-                                  int mb_row, int mb_col) {
-  // Encode the MB segment id.
-  const int seg_id = mi->segment_id;
-  const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
-  const int pred_seg_id = vp9_get_pred_mb_segid(cm, sb_type, mb_row, mb_col);
-  const vp9_prob *p = xd->mb_segment_tree_probs;
-  const vp9_prob p1 = xd->mb_segment_mispred_tree_probs[pred_seg_id];
-
-  if (xd->segmentation_enabled && xd->update_mb_segmentation_map) {
-    vp9_write(bc, seg_id >= 2, p1);
-    if (pred_seg_id >= 2 && seg_id < 2) {
-      vp9_write(bc, seg_id == 1, p[1]);
-    } else if (pred_seg_id < 2 && seg_id >= 2) {
-      vp9_write(bc, seg_id == 3, p[2]);
-    }
-  }
-}
-
 // This function encodes the reference frame
 static void encode_ref_frame(vp9_writer *const bc,
                              VP9_COMMON *const cm,
@@ -695,8 +710,7 @@
   // These are specified to 1/8th pel as they are always compared to MV
   // values that are in 1/8th pel units
 
-  set_mb_row(pc, xd, mb_row, bh);
-  set_mb_col(pc, xd, mb_col, bw);
+  set_mb_row_col(pc, xd, mb_row, bh, mb_col, bw);
 
 #ifdef ENTROPY_STATS
   active_section = 9;
@@ -713,7 +727,7 @@
 
       // If the mb segment id wasn't predicted code explicitly
       if (!prediction_flag)
-        write_mb_segid_except(pc, bc, mi, &cpi->mb.e_mbd, mb_row, mb_col);
+        write_mb_segid(bc, mi, &cpi->mb.e_mbd);
     } else {
       // Normal unpredicted coding
       write_mb_segid(bc, mi, &cpi->mb.e_mbd);
@@ -736,7 +750,7 @@
     active_section = 6;
 #endif
 
-    if (m->mbmi.sb_type)
+    if (m->mbmi.sb_type > BLOCK_SIZE_MB16X16)
       write_sb_ymode(bc, mode, pc->fc.sb_ymode_prob);
     else
       write_ymode(bc, mode, pc->fc.ymode_prob);
@@ -772,7 +786,7 @@
 
     // If segment skip is not enabled code the mode.
     if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {
-      if (mi->sb_type) {
+      if (mi->sb_type > BLOCK_SIZE_MB16X16) {
         write_sb_mv_ref(bc, mode, mv_ref_p);
       } else {
         write_mv_ref(bc, mode, mv_ref_p);
@@ -933,7 +947,7 @@
               vp9_get_pred_prob(c, xd, PRED_MBSKIP));
   }
 
-  if (m->mbmi.sb_type) {
+  if (m->mbmi.sb_type > BLOCK_SIZE_MB16X16) {
     sb_kfwrite_ymode(bc, ym,
                      c->sb_kf_ymode_prob[c->kf_ymode_probs_index]);
   } else {
@@ -985,328 +999,131 @@
   }
 }
 
-#if CONFIG_CODE_NONZEROCOUNT
-static void write_nzc(VP9_COMP *const cpi,
-                      uint16_t nzc,
-                      int nzc_context,
-                      TX_SIZE tx_size,
-                      int ref,
-                      int type,
-                      vp9_writer* const bc) {
-  VP9_COMMON *const cm = &cpi->common;
-  int c, e;
-  // if (!cpi->dummy_packing && cm->current_video_frame == 27)
-  //   printf("nzc: %d, tx_size: %d\n", nzc, tx_size);
-  if (!get_nzc_used(tx_size)) return;
-  c = codenzc(nzc);
-  if (tx_size == TX_32X32) {
-    write_token(bc, vp9_nzc32x32_tree,
-                cm->fc.nzc_probs_32x32[nzc_context][ref][type],
-                vp9_nzc32x32_encodings + c);
-    // cm->fc.nzc_counts_32x32[nzc_context][ref][type][c]++;
-  } else if (tx_size == TX_16X16) {
-    write_token(bc, vp9_nzc16x16_tree,
-                cm->fc.nzc_probs_16x16[nzc_context][ref][type],
-                vp9_nzc16x16_encodings + c);
-    // cm->fc.nzc_counts_16x16[nzc_context][ref][type][c]++;
-  } else if (tx_size == TX_8X8) {
-    write_token(bc, vp9_nzc8x8_tree,
-                cm->fc.nzc_probs_8x8[nzc_context][ref][type],
-                vp9_nzc8x8_encodings + c);
-    // cm->fc.nzc_counts_8x8[nzc_context][ref][type][c]++;
-  } else if (tx_size == TX_4X4) {
-    write_token(bc, vp9_nzc4x4_tree,
-                cm->fc.nzc_probs_4x4[nzc_context][ref][type],
-                vp9_nzc4x4_encodings + c);
-    // cm->fc.nzc_counts_4x4[nzc_context][ref][type][c]++;
-  } else {
-    assert(0);
-  }
-
-  if ((e = vp9_extranzcbits[c])) {
-    int x = nzc - vp9_basenzcvalue[c];
-    while (e--) {
-      int b = (x >> e) & 1;
-      vp9_write(bc, b,
-                cm->fc.nzc_pcat_probs[nzc_context][c - NZC_TOKENS_NOEXTRA][e]);
-      // cm->fc.nzc_pcat_counts[nzc_context][c - NZC_TOKENS_NOEXTRA][e][b]++;
-    }
-  }
+#if CONFIG_CODE_ZEROGROUP
+#ifdef ZPC_STATS
+void init_zpcstats() {
+  vp9_zero(zpc_stats_4x4);
+  vp9_zero(zpc_stats_8x8);
+  vp9_zero(zpc_stats_16x16);
+  vp9_zero(zpc_stats_32x32);
 }
 
-static void write_nzcs_sb64(VP9_COMP *cpi,
-                            MACROBLOCKD *xd,
-                            int mb_row,
-                            int mb_col,
-                            vp9_writer* const bc) {
-  VP9_COMMON *const cm = &cpi->common;
-  MODE_INFO *m = xd->mode_info_context;
-  MB_MODE_INFO *const mi = &m->mbmi;
-  int j, nzc_context;
-  const int ref = m->mbmi.ref_frame != INTRA_FRAME;
-
-  assert(mb_col == get_mb_col(xd));
-  assert(mb_row == get_mb_row(xd));
-
-  if (mi->mb_skip_coeff)
-    return;
-
-  switch (mi->txfm_size) {
-    case TX_32X32:
-      for (j = 0; j < 256; j += 64) {
-        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
-        write_nzc(cpi, m->mbmi.nzcs[j], nzc_context, TX_32X32, ref, 0, bc);
-      }
-      for (j = 256; j < 384; j += 64) {
-        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
-        write_nzc(cpi, m->mbmi.nzcs[j], nzc_context, TX_32X32, ref, 1, bc);
-      }
-      break;
-
-    case TX_16X16:
-      for (j = 0; j < 256; j += 16) {
-        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
-        write_nzc(cpi, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 0, bc);
-      }
-      for (j = 256; j < 384; j += 16) {
-        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
-        write_nzc(cpi, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 1, bc);
-      }
-      break;
-
-    case TX_8X8:
-      for (j = 0; j < 256; j += 4) {
-        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
-        write_nzc(cpi, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 0, bc);
-      }
-      for (j = 256; j < 384; j += 4) {
-        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
-        write_nzc(cpi, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1, bc);
-      }
-      break;
-
-    case TX_4X4:
-      for (j = 0; j < 256; ++j) {
-        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
-        write_nzc(cpi, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 0, bc);
-      }
-      for (j = 256; j < 384; ++j) {
-        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
-        write_nzc(cpi, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1, bc);
-      }
-      break;
-
-    default:
-      break;
-  }
-}
-
-static void write_nzcs_sb32(VP9_COMP *cpi,
-                            MACROBLOCKD *xd,
-                            int mb_row,
-                            int mb_col,
-                            vp9_writer* const bc) {
-  VP9_COMMON *const cm = &cpi->common;
-  MODE_INFO *m = xd->mode_info_context;
-  MB_MODE_INFO *const mi = &m->mbmi;
-  int j, nzc_context;
-  const int ref = m->mbmi.ref_frame != INTRA_FRAME;
-
-  assert(mb_col == get_mb_col(xd));
-  assert(mb_row == get_mb_row(xd));
-
-  if (mi->mb_skip_coeff)
-    return;
-
-  switch (mi->txfm_size) {
-    case TX_32X32:
-      for (j = 0; j < 64; j += 64) {
-        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
-        write_nzc(cpi, m->mbmi.nzcs[j], nzc_context, TX_32X32, ref, 0, bc);
-      }
-      for (j = 64; j < 96; j += 16) {
-        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
-        write_nzc(cpi, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 1, bc);
-      }
-      break;
-
-    case TX_16X16:
-      for (j = 0; j < 64; j += 16) {
-        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
-        write_nzc(cpi, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 0, bc);
-      }
-      for (j = 64; j < 96; j += 16) {
-        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
-        write_nzc(cpi, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 1, bc);
-      }
-      break;
-
-    case TX_8X8:
-      for (j = 0; j < 64; j += 4) {
-        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
-        write_nzc(cpi, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 0, bc);
-      }
-      for (j = 64; j < 96; j += 4) {
-        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
-        write_nzc(cpi, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1, bc);
-      }
-      break;
-
-    case TX_4X4:
-      for (j = 0; j < 64; ++j) {
-        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
-        write_nzc(cpi, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 0, bc);
-      }
-      for (j = 64; j < 96; ++j) {
-        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
-        write_nzc(cpi, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1, bc);
-      }
-      break;
-
-    default:
-      break;
-  }
-}
-
-static void write_nzcs_mb16(VP9_COMP *cpi,
-                            MACROBLOCKD *xd,
-                            int mb_row,
-                            int mb_col,
-                            vp9_writer* const bc) {
-  VP9_COMMON *const cm = &cpi->common;
-  MODE_INFO *m = xd->mode_info_context;
-  MB_MODE_INFO *const mi = &m->mbmi;
-  int j, nzc_context;
-  const int ref = m->mbmi.ref_frame != INTRA_FRAME;
-
-  assert(mb_col == get_mb_col(xd));
-  assert(mb_row == get_mb_row(xd));
-
-  if (mi->mb_skip_coeff)
-    return;
-
-  switch (mi->txfm_size) {
-    case TX_16X16:
-      for (j = 0; j < 16; j += 16) {
-        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);
-        write_nzc(cpi, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 0, bc);
-      }
-      for (j = 16; j < 24; j += 4) {
-        nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
-        write_nzc(cpi, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1, bc);
-      }
-      break;
-
-    case TX_8X8:
-      for (j = 0; j < 16; j += 4) {
-        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);
-        write_nzc(cpi, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 0, bc);
-      }
-      if (mi->mode == I8X8_PRED || mi->mode == SPLITMV) {
-        for (j = 16; j < 24; ++j) {
-          nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
-          write_nzc(cpi, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1, bc);
+void update_zpcstats(VP9_COMMON *const cm) {
+  int r, b, p, n;
+  for (r = 0; r < REF_TYPES; ++r) {
+    for (b = 0; b < ZPC_BANDS; ++b) {
+      for (p = 0; p < ZPC_PTOKS; ++p) {
+        for (n = 0; n < ZPC_NODES; ++n) {
+          zpc_stats_4x4[r][b][p][n][0] += cm->fc.zpc_counts_4x4[r][b][p][n][0];
+          zpc_stats_4x4[r][b][p][n][1] += cm->fc.zpc_counts_4x4[r][b][p][n][1];
+          zpc_stats_8x8[r][b][p][n][0] += cm->fc.zpc_counts_8x8[r][b][p][n][0];
+          zpc_stats_8x8[r][b][p][n][1] += cm->fc.zpc_counts_8x8[r][b][p][n][1];
+          zpc_stats_16x16[r][b][p][n][0] +=
+              cm->fc.zpc_counts_16x16[r][b][p][n][0];
+          zpc_stats_16x16[r][b][p][n][1] +=
+              cm->fc.zpc_counts_16x16[r][b][p][n][1];
+          zpc_stats_32x32[r][b][p][n][0] +=
+              cm->fc.zpc_counts_32x32[r][b][p][n][0];
+          zpc_stats_32x32[r][b][p][n][1] +=
+              cm->fc.zpc_counts_32x32[r][b][p][n][1];
         }
-      } else {
-        for (j = 16; j < 24; j += 4) {
-          nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
-          write_nzc(cpi, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1, bc);
-        }
-      }
-      break;
-
-    case TX_4X4:
-      for (j = 0; j < 16; ++j) {
-        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);
-        write_nzc(cpi, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 0, bc);
-      }
-      for (j = 16; j < 24; ++j) {
-        nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
-        write_nzc(cpi, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1, bc);
-      }
-      break;
-
-    default:
-      break;
-  }
-}
-
-#ifdef NZC_STATS
-void init_nzcstats() {
-  vp9_zero(nzc_stats_4x4);
-  vp9_zero(nzc_stats_8x8);
-  vp9_zero(nzc_stats_16x16);
-  vp9_zero(nzc_stats_32x32);
-  vp9_zero(nzc_pcat_stats);
-}
-
-void update_nzcstats(VP9_COMMON *const cm) {
-  int c, r, b, t;
-
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    for (r = 0; r < REF_TYPES; ++r) {
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        for (t = 0; t < NZC4X4_TOKENS; ++t) {
-          nzc_stats_4x4[c][r][b][t] += cm->fc.nzc_counts_4x4[c][r][b][t];
-        }
-      }
-    }
-  }
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    for (r = 0; r < REF_TYPES; ++r) {
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        for (t = 0; t < NZC8X8_TOKENS; ++t) {
-          nzc_stats_8x8[c][r][b][t] += cm->fc.nzc_counts_8x8[c][r][b][t];
-        }
-      }
-    }
-  }
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    for (r = 0; r < REF_TYPES; ++r) {
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        for (t = 0; t < NZC16X16_TOKENS; ++t) {
-          nzc_stats_16x16[c][r][b][t] += cm->fc.nzc_counts_16x16[c][r][b][t];
-        }
-      }
-    }
-  }
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    for (r = 0; r < REF_TYPES; ++r) {
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        for (t = 0; t < NZC32X32_TOKENS; ++t) {
-          nzc_stats_32x32[c][r][b][t] += cm->fc.nzc_counts_32x32[c][r][b][t];
-        }
-      }
-    }
-  }
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    for (t = 0; t < NZC_TOKENS_EXTRA; ++t) {
-      int bits = vp9_extranzcbits[t + NZC_TOKENS_NOEXTRA];
-      for (b = 0; b < bits; ++b) {
-        nzc_pcat_stats[c][t][b][0] += cm->fc.nzc_pcat_counts[c][t][b][0];
-        nzc_pcat_stats[c][t][b][1] += cm->fc.nzc_pcat_counts[c][t][b][1];
       }
     }
   }
 }
 
-void print_nzcstats() {
-  int c, r, b, t;
+void print_zpcstats() {
+  int r, b, p, n;
   FILE *f;
 
   printf(
-    "static const unsigned int default_nzc_counts_4x4[MAX_NZC_CONTEXTS]\n"
-    "                                                [REF_TYPES]\n"
-    "                                                [BLOCK_TYPES]\n"
-    "                                                [NZC4X4_TOKENS] = {\n");
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
+      "static const unsigned int default_zpc_probs_4x4[REF_TYPES]\n"
+      "                                               [ZPC_BANDS]\n"
+      "                                               [ZPC_PTOKS]\n"
+      "                                               [ZPC_NODES] = {\n");
+  for (r = 0; r < REF_TYPES; ++r) {
     printf("  {\n");
-    for (r = 0; r < REF_TYPES; ++r) {
+    for (b = 0; b < ZPC_BANDS; ++b) {
       printf("    {\n");
-      for (b = 0; b < BLOCK_TYPES; ++b) {
+      for (p = 0; p < ZPC_PTOKS; ++p) {
         printf("      {");
-        for (t = 0; t < NZC4X4_TOKENS; ++t) {
-          printf(" %-3d,", nzc_stats_4x4[c][r][b][t]);
+        for (n = 0; n < ZPC_NODES; ++n) {
+          vp9_prob prob = get_binary_prob(zpc_stats_4x4[r][b][p][n][0],
+                                          zpc_stats_4x4[r][b][p][n][1]);
+          printf(" %-3d [%d/%d],", prob, zpc_stats_4x4[r][b][p][n][0],
+                                         zpc_stats_4x4[r][b][p][n][1]);
+        }
+        printf(" },\n");
+      }
+      printf("    },\n");
+    }
+    printf("  },\n");
+  }
+  printf("};\n");
+  printf(
+    "static const unsigned int default_zpc_probs_8x8[REF_TYPES]\n"
+    "                                               [ZPC_BANDS]\n"
+    "                                               [ZPC_PTOKS]\n"
+    "                                               [ZPC_NODES] = {\n");
+  for (r = 0; r < REF_TYPES; ++r) {
+    printf("  {\n");
+    for (b = 0; b < ZPC_BANDS; ++b) {
+      printf("    {\n");
+      for (p = 0; p < ZPC_PTOKS; ++p) {
+        printf("      {");
+        for (n = 0; n < ZPC_NODES; ++n) {
+          vp9_prob prob = get_binary_prob(zpc_stats_8x8[r][b][p][n][0],
+                                          zpc_stats_8x8[r][b][p][n][1]);
+          printf(" %-3d [%d/%d],", prob, zpc_stats_8x8[r][b][p][n][0],
+                                         zpc_stats_8x8[r][b][p][n][1]);
+        }
+        printf(" },\n");
+      }
+      printf("    },\n");
+    }
+    printf("  },\n");
+  }
+  printf("};\n");
+  printf(
+    "static const unsigned int default_zpc_probs_16x16[REF_TYPES]\n"
+    "                                                 [ZPC_BANDS]\n"
+    "                                                 [ZPC_PTOKS]\n"
+    "                                                 [ZPC_NODES] = {\n");
+  for (r = 0; r < REF_TYPES; ++r) {
+    printf("  {\n");
+    for (b = 0; b < ZPC_BANDS; ++b) {
+      printf("    {\n");
+      for (p = 0; p < ZPC_PTOKS; ++p) {
+        printf("      {");
+        for (n = 0; n < ZPC_NODES; ++n) {
+          vp9_prob prob = get_binary_prob(zpc_stats_16x16[r][b][p][n][0],
+                                          zpc_stats_16x16[r][b][p][n][1]);
+          printf(" %-3d [%d/%d],", prob, zpc_stats_16x16[r][b][p][n][0],
+                                         zpc_stats_16x16[r][b][p][n][1]);
+        }
+        printf(" },\n");
+      }
+      printf("    },\n");
+    }
+    printf("  },\n");
+  }
+  printf("};\n");
+  printf(
+    "static const unsigned int default_zpc_probs_32x32[REF_TYPES]\n"
+    "                                                 [ZPC_BANDS]\n"
+    "                                                 [ZPC_PTOKS]\n"
+    "                                                 [ZPC_NODES] = {\n");
+  for (r = 0; r < REF_TYPES; ++r) {
+    printf("  {\n");
+    for (b = 0; b < ZPC_BANDS; ++b) {
+      printf("    {\n");
+      for (p = 0; p < ZPC_PTOKS; ++p) {
+        printf("      {");
+        for (n = 0; n < ZPC_NODES; ++n) {
+          vp9_prob prob = get_binary_prob(zpc_stats_32x32[r][b][p][n][0],
+                                          zpc_stats_32x32[r][b][p][n][1]);
+          printf(" %-3d [%d/%d],", prob, zpc_stats_32x32[r][b][p][n][0],
+                                         zpc_stats_32x32[r][b][p][n][1]);
         }
         printf(" },\n");
       }
@@ -1316,230 +1133,15 @@
   }
   printf("};\n");
 
-  printf(
-    "static const unsigned int default_nzc_counts_8x8[MAX_NZC_CONTEXTS]\n"
-    "                                                [REF_TYPES]\n"
-    "                                                [BLOCK_TYPES]\n"
-    "                                                [NZC8X8_TOKENS] = {\n");
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    printf("  {\n");
-    for (r = 0; r < REF_TYPES; ++r) {
-      printf("    {\n");
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        printf("      {");
-        for (t = 0; t < NZC8X8_TOKENS; ++t) {
-          printf(" %-3d,", nzc_stats_8x8[c][r][b][t]);
-        }
-        printf(" },\n");
-      }
-      printf("    },\n");
-    }
-    printf("  },\n");
-  }
-  printf("};\n");
-
-  printf(
-    "static const unsigned int default_nzc_counts_16x16[MAX_NZC_CONTEXTS]\n"
-    "                                                  [REF_TYPES]\n"
-    "                                                  [BLOCK_TYPES]\n"
-    "                                                  [NZC16X16_TOKENS] = {"
-    "\n");
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    printf("  {\n");
-    for (r = 0; r < REF_TYPES; ++r) {
-      printf("    {\n");
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        printf("      {");
-        for (t = 0; t < NZC16X16_TOKENS; ++t) {
-          printf(" %-3d,", nzc_stats_16x16[c][r][b][t]);
-        }
-        printf(" },\n");
-      }
-      printf("    },\n");
-    }
-    printf("  },\n");
-  }
-  printf("};\n");
-
-  printf(
-    "static const unsigned int default_nzc_counts_32x32[MAX_NZC_CONTEXTS]\n"
-    "                                                  [REF_TYPES]\n"
-    "                                                  [BLOCK_TYPES]\n"
-    "                                                  [NZC32X32_TOKENS] = {"
-    "\n");
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    printf("  {\n");
-    for (r = 0; r < REF_TYPES; ++r) {
-      printf("    {\n");
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        printf("      {");
-        for (t = 0; t < NZC32X32_TOKENS; ++t) {
-          printf(" %-3d,", nzc_stats_32x32[c][r][b][t]);
-        }
-        printf(" },\n");
-      }
-      printf("    },\n");
-    }
-    printf("  },\n");
-  }
-  printf("};\n");
-
-  printf(
-    "static const vp9_prob default_nzc_pcat_counts[MAX_NZC_CONTEXTS]\n"
-    "                                             [NZC_TOKENS_EXTRA]\n"
-    "                                             [NZC_BITS_EXTRA] = {\n");
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    printf("  {\n");
-    for (t = 0; t < NZC_TOKENS_EXTRA; ++t) {
-      printf("    {");
-      for (b = 0; b < NZC_BITS_EXTRA; ++b) {
-        printf(" %d/%d,",
-               nzc_pcat_stats[c][t][b][0], nzc_pcat_stats[c][t][b][1]);
-      }
-      printf(" },\n");
-    }
-    printf("  },\n");
-  }
-  printf("};\n");
-
-  printf(
-    "static const vp9_prob default_nzc_probs_4x4[MAX_NZC_CONTEXTS]\n"
-    "                                           [REF_TYPES]\n"
-    "                                           [BLOCK_TYPES]\n"
-    "                                           [NZC4X4_TOKENS] = {\n");
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    printf("  {\n");
-    for (r = 0; r < REF_TYPES; ++r) {
-      printf("    {\n");
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        vp9_prob probs[NZC4X4_NODES];
-        unsigned int branch_ct[NZC4X4_NODES][2];
-        vp9_tree_probs_from_distribution(vp9_nzc4x4_tree,
-                                         probs, branch_ct,
-                                         nzc_stats_4x4[c][r][b], 0);
-        printf("      {");
-        for (t = 0; t < NZC4X4_NODES; ++t) {
-          printf(" %-3d,", probs[t]);
-        }
-        printf(" },\n");
-      }
-      printf("    },\n");
-    }
-    printf("  },\n");
-  }
-  printf("};\n");
-
-  printf(
-    "static const vp9_prob default_nzc_probs_8x8[MAX_NZC_CONTEXTS]\n"
-    "                                           [REF_TYPES]\n"
-    "                                           [BLOCK_TYPES]\n"
-    "                                           [NZC8X8_TOKENS] = {\n");
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    printf("  {\n");
-    for (r = 0; r < REF_TYPES; ++r) {
-      printf("    {\n");
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        vp9_prob probs[NZC8X8_NODES];
-        unsigned int branch_ct[NZC8X8_NODES][2];
-        vp9_tree_probs_from_distribution(vp9_nzc8x8_tree,
-                                         probs, branch_ct,
-                                         nzc_stats_8x8[c][r][b], 0);
-        printf("      {");
-        for (t = 0; t < NZC8X8_NODES; ++t) {
-          printf(" %-3d,", probs[t]);
-        }
-        printf(" },\n");
-      }
-      printf("    },\n");
-    }
-    printf("  },\n");
-  }
-  printf("};\n");
-
-  printf(
-    "static const vp9_prob default_nzc_probs_16x16[MAX_NZC_CONTEXTS]\n"
-    "                                             [REF_TYPES]\n"
-    "                                             [BLOCK_TYPES]\n"
-    "                                             [NZC16X16_TOKENS] = {\n");
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    printf("  {\n");
-    for (r = 0; r < REF_TYPES; ++r) {
-      printf("    {\n");
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        vp9_prob probs[NZC16X16_NODES];
-        unsigned int branch_ct[NZC16X16_NODES][2];
-        vp9_tree_probs_from_distribution(vp9_nzc16x16_tree,
-                                         probs, branch_ct,
-                                         nzc_stats_16x16[c][r][b], 0);
-        printf("      {");
-        for (t = 0; t < NZC16X16_NODES; ++t) {
-          printf(" %-3d,", probs[t]);
-        }
-        printf(" },\n");
-      }
-      printf("    },\n");
-    }
-    printf("  },\n");
-  }
-  printf("};\n");
-
-  printf(
-    "static const vp9_prob default_nzc_probs_32x32[MAX_NZC_CONTEXTS]\n"
-    "                                             [REF_TYPES]\n"
-    "                                             [BLOCK_TYPES]\n"
-    "                                             [NZC32X32_TOKENS] = {\n");
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    printf("  {\n");
-    for (r = 0; r < REF_TYPES; ++r) {
-      printf("    {\n");
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        vp9_prob probs[NZC32X32_NODES];
-        unsigned int branch_ct[NZC32X32_NODES][2];
-        vp9_tree_probs_from_distribution(vp9_nzc32x32_tree,
-                                         probs, branch_ct,
-                                         nzc_stats_32x32[c][r][b], 0);
-        printf("      {");
-        for (t = 0; t < NZC32X32_NODES; ++t) {
-          printf(" %-3d,", probs[t]);
-        }
-        printf(" },\n");
-      }
-      printf("    },\n");
-    }
-    printf("  },\n");
-  }
-  printf("};\n");
-
-  printf(
-    "static const vp9_prob default_nzc_pcat_probs[MAX_NZC_CONTEXTS]\n"
-    "                                            [NZC_TOKENS_EXTRA]\n"
-    "                                            [NZC_BITS_EXTRA] = {\n");
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    printf("  {\n");
-    for (t = 0; t < NZC_TOKENS_EXTRA; ++t) {
-      printf("    {");
-      for (b = 0; b < NZC_BITS_EXTRA; ++b) {
-        vp9_prob prob = get_binary_prob(nzc_pcat_stats[c][t][b][0],
-                                        nzc_pcat_stats[c][t][b][1]);
-        printf(" %-3d,", prob);
-      }
-      printf(" },\n");
-    }
-    printf("  },\n");
-  }
-  printf("};\n");
-
-  f = fopen("nzcstats.bin", "wb");
-  fwrite(nzc_stats_4x4, sizeof(nzc_stats_4x4), 1, f);
-  fwrite(nzc_stats_8x8, sizeof(nzc_stats_8x8), 1, f);
-  fwrite(nzc_stats_16x16, sizeof(nzc_stats_16x16), 1, f);
-  fwrite(nzc_stats_32x32, sizeof(nzc_stats_32x32), 1, f);
-  fwrite(nzc_pcat_stats, sizeof(nzc_pcat_stats), 1, f);
+  f = fopen("zpcstats.bin", "wb");
+  fwrite(zpc_stats_4x4, sizeof(zpc_stats_4x4), 1, f);
+  fwrite(zpc_stats_8x8, sizeof(zpc_stats_8x8), 1, f);
+  fwrite(zpc_stats_16x16, sizeof(zpc_stats_16x16), 1, f);
+  fwrite(zpc_stats_32x32, sizeof(zpc_stats_32x32), 1, f);
   fclose(f);
 }
 #endif
-
-#endif  // CONFIG_CODE_NONZEROCOUNT
+#endif  // CONFIG_CODE_ZEROGROUP
 
 static void write_modes_b(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc,
                           TOKENEXTRA **tok, TOKENEXTRA *tok_end,
@@ -1548,8 +1150,9 @@
   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
 
   xd->mode_info_context = m;
-  set_mb_row(&cpi->common, xd, mb_row, 1 << mb_height_log2(m->mbmi.sb_type));
-  set_mb_col(&cpi->common, xd, mb_col, 1 << mb_width_log2(m->mbmi.sb_type));
+  set_mb_row_col(&cpi->common, xd, mb_row,
+                 1 << mb_height_log2(m->mbmi.sb_type),
+                 mb_col, 1 << mb_width_log2(m->mbmi.sb_type));
   if (cm->frame_type == KEY_FRAME) {
     write_mb_modes_kf(cpi, m, bc,
                       cm->mb_rows - mb_row, cm->mb_cols - mb_col);
@@ -1563,14 +1166,6 @@
     active_section = 1;
 #endif
   }
-#if CONFIG_CODE_NONZEROCOUNT
-  if (m->mbmi.sb_type == BLOCK_SIZE_SB64X64)
-    write_nzcs_sb64(cpi, xd, mb_row, mb_col, bc);
-  else if (m->mbmi.sb_type == BLOCK_SIZE_SB32X32)
-    write_nzcs_sb32(cpi, xd, mb_row, mb_col, bc);
-  else
-    write_nzcs_mb16(cpi, xd, mb_row, mb_col, bc);
-#endif
 
   assert(*tok < tok_end);
   pack_mb_tokens(bc, tok, tok_end);
@@ -1581,6 +1176,7 @@
                            int mb_row, int mb_col,
                            BLOCK_SIZE_TYPE bsize) {
   VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *xd = &cpi->mb.e_mbd;
   const int mis = cm->mode_info_stride;
   int bwl, bhl;
 #if CONFIG_SBSEGMENT
@@ -1615,22 +1211,32 @@
   else
     assert(0);
 
-  if (bsize > BLOCK_SIZE_MB16X16)
+  if (bsize > BLOCK_SIZE_MB16X16) {
+    int pl;
+    xd->left_seg_context = cm->left_seg_context + (mb_row & 3);
+    xd->above_seg_context = cm->above_seg_context + mb_col;
+    pl = partition_plane_context(xd, bsize);
     // encode the partition information
-    write_token(bc, vp9_partition_tree, cm->fc.partition_prob[bsl - 1],
+    write_token(bc, vp9_partition_tree, cm->fc.partition_prob[pl],
                 vp9_partition_encodings + partition);
+  }
 
   switch (partition) {
     case PARTITION_NONE:
+      subsize = bsize;
       write_modes_b(cpi, m, bc, tok, tok_end, mb_row, mb_col);
       break;
 #if CONFIG_SBSEGMENT
     case PARTITION_HORZ:
+      subsize = (bsize == BLOCK_SIZE_SB64X64) ? BLOCK_SIZE_SB64X32 :
+                                                BLOCK_SIZE_SB32X16;
       write_modes_b(cpi, m, bc, tok, tok_end, mb_row, mb_col);
       if ((mb_row + bh) < cm->mb_rows)
         write_modes_b(cpi, m + bh * mis, bc, tok, tok_end, mb_row + bh, mb_col);
       break;
     case PARTITION_VERT:
+      subsize = (bsize == BLOCK_SIZE_SB64X64) ? BLOCK_SIZE_SB32X64 :
+                                                BLOCK_SIZE_SB16X32;
       write_modes_b(cpi, m, bc, tok, tok_end, mb_row, mb_col);
       if ((mb_col + bw) < cm->mb_cols)
         write_modes_b(cpi, m + bw, bc, tok, tok_end, mb_row, mb_col + bw);
@@ -1654,6 +1260,14 @@
     default:
       assert(0);
   }
+
+  // update partition context
+  if ((partition == PARTITION_SPLIT) && (bsize > BLOCK_SIZE_SB32X32))
+    return;
+
+  xd->left_seg_context = cm->left_seg_context + (mb_row & 3);
+  xd->above_seg_context = cm->above_seg_context + mb_col;
+  update_partition_context(xd, subsize, bsize);
 }
 
 static void write_modes(VP9_COMP *cpi, vp9_writer* const bc,
@@ -1664,9 +1278,13 @@
   int mb_row, mb_col;
 
   m_ptr += c->cur_tile_mb_col_start + c->cur_tile_mb_row_start * mis;
+  vpx_memset(c->above_seg_context, 0, sizeof(PARTITION_CONTEXT) *
+                                       mb_cols_aligned_to_sb(c));
+
   for (mb_row = c->cur_tile_mb_row_start;
        mb_row < c->cur_tile_mb_row_end; mb_row += 4, m_ptr += 4 * mis) {
     m = m_ptr;
+    vpx_memset(c->left_seg_context, 0, sizeof(c->left_seg_context));
     for (mb_col = c->cur_tile_mb_col_start;
          mb_col < c->cur_tile_mb_col_end; mb_col += 4, m += 4)
       write_modes_sb(cpi, m, bc, tok, tok_end, mb_row, mb_col,
@@ -1779,92 +1397,64 @@
                           cpi->frame_branch_ct_32x32, BLOCK_TYPES);
 }
 
-#if CONFIG_CODE_NONZEROCOUNT
-static void update_nzc_probs_common(VP9_COMP* cpi,
+#if CONFIG_CODE_ZEROGROUP
+static void update_zpc_probs_common(VP9_COMP* cpi,
                                     vp9_writer* const bc,
                                     TX_SIZE tx_size) {
-  VP9_COMMON *cm = &cpi->common;
-  int c, r, b, t;
+  int r, b, p, n;
+  VP9_COMMON *const cm = &cpi->common;
   int update[2] = {0, 0};
   int savings = 0;
-  int tokens, nodes;
-  const vp9_tree_index *nzc_tree;
-  vp9_prob *new_nzc_probs;
-  vp9_prob *old_nzc_probs;
-  unsigned int *nzc_counts;
-  unsigned int (*nzc_branch_ct)[2];
-  vp9_prob upd;
+  vp9_zpc_probs newprobs;
+  vp9_zpc_probs *zpc_probs;
+  vp9_zpc_count *zpc_counts;
+  vp9_prob upd = ZPC_UPDATE_PROB;
 
-  if (!get_nzc_used(tx_size)) return;
+  if (!get_zpc_used(tx_size)) return;
   if (tx_size == TX_32X32) {
-    tokens = NZC32X32_TOKENS;
-    nzc_tree = vp9_nzc32x32_tree;
-    old_nzc_probs = cm->fc.nzc_probs_32x32[0][0][0];
-    new_nzc_probs = cpi->frame_nzc_probs_32x32[0][0][0];
-    nzc_counts = cm->fc.nzc_counts_32x32[0][0][0];
-    nzc_branch_ct = cpi->frame_nzc_branch_ct_32x32[0][0][0];
-    upd = NZC_UPDATE_PROB_32X32;
+    zpc_probs = &cm->fc.zpc_probs_32x32;
+    zpc_counts = &cm->fc.zpc_counts_32x32;
   } else if (tx_size == TX_16X16) {
-    tokens = NZC16X16_TOKENS;
-    nzc_tree = vp9_nzc16x16_tree;
-    old_nzc_probs = cm->fc.nzc_probs_16x16[0][0][0];
-    new_nzc_probs = cpi->frame_nzc_probs_16x16[0][0][0];
-    nzc_counts = cm->fc.nzc_counts_16x16[0][0][0];
-    nzc_branch_ct = cpi->frame_nzc_branch_ct_16x16[0][0][0];
-    upd = NZC_UPDATE_PROB_16X16;
+    zpc_probs = &cm->fc.zpc_probs_16x16;
+    zpc_counts = &cm->fc.zpc_counts_16x16;
   } else if (tx_size == TX_8X8) {
-    tokens = NZC8X8_TOKENS;
-    nzc_tree = vp9_nzc8x8_tree;
-    old_nzc_probs = cm->fc.nzc_probs_8x8[0][0][0];
-    new_nzc_probs = cpi->frame_nzc_probs_8x8[0][0][0];
-    nzc_counts = cm->fc.nzc_counts_8x8[0][0][0];
-    nzc_branch_ct = cpi->frame_nzc_branch_ct_8x8[0][0][0];
-    upd = NZC_UPDATE_PROB_8X8;
+    zpc_probs = &cm->fc.zpc_probs_8x8;
+    zpc_counts = &cm->fc.zpc_counts_8x8;
   } else {
-    nzc_tree = vp9_nzc4x4_tree;
-    tokens = NZC4X4_TOKENS;
-    old_nzc_probs = cm->fc.nzc_probs_4x4[0][0][0];
-    new_nzc_probs = cpi->frame_nzc_probs_4x4[0][0][0];
-    nzc_counts = cm->fc.nzc_counts_4x4[0][0][0];
-    nzc_branch_ct = cpi->frame_nzc_branch_ct_4x4[0][0][0];
-    upd = NZC_UPDATE_PROB_4X4;
+    zpc_probs = &cm->fc.zpc_probs_4x4;
+    zpc_counts = &cm->fc.zpc_counts_4x4;
   }
-  nodes = tokens - 1;
-  // Get the new probabilities and the branch counts
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    for (r = 0; r < REF_TYPES; ++r) {
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        int offset = c * REF_TYPES * BLOCK_TYPES + r * BLOCK_TYPES + b;
-        int offset_nodes = offset * nodes;
-        int offset_tokens = offset * tokens;
-        vp9_tree_probs_from_distribution(nzc_tree,
-                                         new_nzc_probs + offset_nodes,
-                                         nzc_branch_ct + offset_nodes,
-                                         nzc_counts + offset_tokens, 0);
+  for (r = 0; r < REF_TYPES; ++r) {
+    for (b = 0; b < ZPC_BANDS; ++b) {
+      for (p = 0; p < ZPC_PTOKS; ++p) {
+        for (n = 0; n < ZPC_NODES; ++n) {
+          newprobs[r][b][p][n] = get_binary_prob((*zpc_counts)[r][b][p][n][0],
+                                                 (*zpc_counts)[r][b][p][n][1]);
+        }
       }
     }
   }
-
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    for (r = 0; r < REF_TYPES; ++r) {
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        int offset = c * REF_TYPES * BLOCK_TYPES + r * BLOCK_TYPES + b;
-        int offset_nodes = offset * nodes;
-        for (t = 0; t < nodes; ++t) {
-          vp9_prob newp = new_nzc_probs[offset_nodes + t];
-          vp9_prob oldp = old_nzc_probs[offset_nodes + t];
+  for (r = 0; r < REF_TYPES; ++r) {
+    for (b = 0; b < ZPC_BANDS; ++b) {
+      for (p = 0; p < ZPC_PTOKS; ++p) {
+        for (n = 0; n < ZPC_NODES; ++n) {
+          vp9_prob newp = newprobs[r][b][p][n];
+          vp9_prob oldp = (*zpc_probs)[r][b][p][n];
           int s, u = 0;
+#if USE_ZPC_EXTRA == 0
+          if (n == 1) continue;
+#endif
 #if defined(SEARCH_NEWP)
-            s = prob_diff_update_savings_search(nzc_branch_ct[offset_nodes],
-                                                oldp, &newp, upd);
-            if (s > 0 && newp != oldp)
-              u = 1;
-            if (u)
-              savings += s - (int)(vp9_cost_zero(upd));
-            else
-              savings -= (int)(vp9_cost_zero(upd));
+          s = prob_diff_update_savings_search((*zpc_counts)[r][b][p][n],
+                                              oldp, &newp, upd);
+          if (s > 0 && newp != oldp)
+            u = 1;
+          if (u)
+            savings += s - (int)(vp9_cost_zero(upd));
+          else
+            savings -= (int)(vp9_cost_zero(upd));
 #else
-          s = prob_update_savings(nzc_branch_ct[offset_nodes],
+          s = prob_update_savings((*zpc_counts)[r][b][p][n],
                                   oldp, newp, upd);
           if (s > 0)
             u = 1;
@@ -1878,98 +1468,26 @@
   }
   if (update[1] == 0 || savings < 0) {
     vp9_write_bit(bc, 0);
-  } else {
-    vp9_write_bit(bc, 1);
-    for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-      for (r = 0; r < REF_TYPES; ++r) {
-        for (b = 0; b < BLOCK_TYPES; ++b) {
-          int offset = c * REF_TYPES * BLOCK_TYPES + r * BLOCK_TYPES + b;
-          int offset_nodes = offset * nodes;
-          for (t = 0; t < nodes; ++t) {
-            vp9_prob newp = new_nzc_probs[offset_nodes + t];
-            vp9_prob *oldp = &old_nzc_probs[offset_nodes + t];
-            int s, u = 0;
-#if defined(SEARCH_NEWP)
-            s = prob_diff_update_savings_search(nzc_branch_ct[offset_nodes],
-                                                *oldp, &newp, upd);
-            if (s > 0 && newp != *oldp)
-              u = 1;
-#else
-            s = prob_update_savings(nzc_branch_ct[offset_nodes],
-                                    *oldp, newp, upd);
-            if (s > 0)
-              u = 1;
-#endif
-            vp9_write(bc, u, upd);
-            if (u) {
-              /* send/use new probability */
-              write_prob_diff_update(bc, newp, *oldp);
-              *oldp = newp;
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-static void update_nzc_pcat_probs(VP9_COMP *cpi, vp9_writer* const bc) {
-  VP9_COMMON *cm = &cpi->common;
-  int c, t, b;
-  int update[2] = {0, 0};
-  int savings = 0;
-  vp9_prob upd = NZC_UPDATE_PROB_PCAT;
-  if (!(get_nzc_used(TX_4X4) || get_nzc_used(TX_8X8) ||
-        get_nzc_used(TX_16X16) || get_nzc_used(TX_32X32)))
     return;
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    for (t = 0; t < NZC_TOKENS_EXTRA; ++t) {
-      int bits = vp9_extranzcbits[t + NZC_TOKENS_NOEXTRA];
-      for (b = 0; b < bits; ++b) {
-        vp9_prob newp = get_binary_prob(cm->fc.nzc_pcat_counts[c][t][b][0],
-                                        cm->fc.nzc_pcat_counts[c][t][b][1]);
-        vp9_prob oldp = cm->fc.nzc_pcat_probs[c][t][b];
-        int s, u = 0;
-#if defined(SEARCH_NEWP)
-        s = prob_diff_update_savings_search(cm->fc.nzc_pcat_counts[c][t][b],
-                                            oldp, &newp, upd);
-        if (s > 0 && newp != oldp)
-          u = 1;
-        if (u)
-          savings += s - (int)(vp9_cost_zero(upd));
-        else
-          savings -= (int)(vp9_cost_zero(upd));
-#else
-        s = prob_update_savings(cm->fc.nzc_pcat_counts[c][t][b],
-                                oldp, newp, upd);
-        if (s > 0)
-          u = 1;
-        if (u)
-          savings += s;
-#endif
-        update[u]++;
-      }
-    }
   }
-  if (update[1] == 0 || savings < 0) {
-    vp9_write_bit(bc, 0);
-  } else {
-    vp9_write_bit(bc, 1);
-    for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-      for (t = 0; t < NZC_TOKENS_EXTRA; ++t) {
-        int bits = vp9_extranzcbits[t + NZC_TOKENS_NOEXTRA];
-        for (b = 0; b < bits; ++b) {
-          vp9_prob newp = get_binary_prob(cm->fc.nzc_pcat_counts[c][t][b][0],
-                                          cm->fc.nzc_pcat_counts[c][t][b][1]);
-          vp9_prob *oldp = &cm->fc.nzc_pcat_probs[c][t][b];
+  vp9_write_bit(bc, 1);
+  for (r = 0; r < REF_TYPES; ++r) {
+    for (b = 0; b < ZPC_BANDS; ++b) {
+      for (p = 0; p < ZPC_PTOKS; ++p) {
+        for (n = 0; n < ZPC_NODES; ++n) {
+          vp9_prob newp = newprobs[r][b][p][n];
+          vp9_prob *oldp = &(*zpc_probs)[r][b][p][n];
           int s, u = 0;
+#if USE_ZPC_EXTRA == 0
+          if (n == 1) continue;
+#endif
 #if defined(SEARCH_NEWP)
-          s = prob_diff_update_savings_search(cm->fc.nzc_pcat_counts[c][t][b],
+          s = prob_diff_update_savings_search((*zpc_counts)[r][b][p][n],
                                               *oldp, &newp, upd);
           if (s > 0 && newp != *oldp)
             u = 1;
 #else
-          s = prob_update_savings(cm->fc.nzc_pcat_counts[c][t][b],
+          s = prob_update_savings((*zpc_counts)[r][b][p][n],
                                   *oldp, newp, upd);
           if (s > 0)
             u = 1;
@@ -1986,24 +1504,21 @@
   }
 }
 
-static void update_nzc_probs(VP9_COMP* cpi,
+static void update_zpc_probs(VP9_COMP* cpi,
                              vp9_writer* const bc) {
-  update_nzc_probs_common(cpi, bc, TX_4X4);
+  update_zpc_probs_common(cpi, bc, TX_4X4);
   if (cpi->common.txfm_mode != ONLY_4X4)
-    update_nzc_probs_common(cpi, bc, TX_8X8);
+    update_zpc_probs_common(cpi, bc, TX_8X8);
   if (cpi->common.txfm_mode > ALLOW_8X8)
-    update_nzc_probs_common(cpi, bc, TX_16X16);
+    update_zpc_probs_common(cpi, bc, TX_16X16);
   if (cpi->common.txfm_mode > ALLOW_16X16)
-    update_nzc_probs_common(cpi, bc, TX_32X32);
-#ifdef NZC_PCAT_UPDATE
-  update_nzc_pcat_probs(cpi, bc);
-#endif
-#ifdef NZC_STATS
+    update_zpc_probs_common(cpi, bc, TX_32X32);
+#ifdef ZPC_STATS
   if (!cpi->dummy_packing)
-    update_nzcstats(&cpi->common);
+    update_zpcstats(&cpi->common);
 #endif
 }
-#endif  // CONFIG_CODE_NONZEROCOUNT
+#endif  // CONFIG_CODE_ZEROGROUP
 
 static void update_coef_probs_common(vp9_writer* const bc,
                                      VP9_COMP *cpi,
@@ -2024,11 +1539,7 @@
 #endif
   // vp9_prob bestupd = find_coef_update_prob(cpi);
 
-#if CONFIG_CODE_NONZEROCOUNT
-  const int tstart = get_nzc_used(tx_size);
-#else
   const int tstart = 0;
-#endif
   /* dry run to see if there is any update at all needed */
   savings = 0;
   for (i = 0; i < BLOCK_TYPES; ++i) {
@@ -2318,6 +1829,9 @@
   // error resilient mode
   vp9_write_bit(&header_bc, pc->error_resilient_mode);
 
+  // lossless mode: note this must be signalled before the loop filter
+  vp9_write_bit(&header_bc, cpi->mb.e_mbd.lossless);
+
   // Encode the loop filter level and type
   vp9_write_bit(&header_bc, pc->filter_type);
   vp9_write_literal(&header_bc, pc->filter_level, 6);
@@ -2501,6 +2015,9 @@
   if (xd->segmentation_enabled) {
     // Indicate whether or not the segmentation map is being updated.
     vp9_write_bit(&header_bc, (xd->update_mb_segmentation_map) ? 1 : 0);
+#if CONFIG_IMPLICIT_SEGMENTATION
+    vp9_write_bit(&header_bc, (xd->allow_implicit_segment_update) ? 1 : 0);
+#endif
 
     // If it is, then indicate the method that will be used.
     if (xd->update_mb_segmentation_map) {
@@ -2508,7 +2025,7 @@
       vp9_choose_segmap_coding_method(cpi);
       // Send the tree probabilities used to decode unpredicted
       // macro-block segments
-      for (i = 0; i < MB_FEATURE_TREE_PROBS; i++) {
+      for (i = 0; i < MB_SEG_TREE_PROBS; i++) {
         const int prob = xd->mb_segment_tree_probs[i];
         if (prob != 255) {
           vp9_write_bit(&header_bc, 1);
@@ -2587,7 +2104,6 @@
     }
   }
 
-  vp9_write_bit(&header_bc, cpi->mb.e_mbd.lossless);
   if (cpi->mb.e_mbd.lossless) {
     pc->txfm_mode = ONLY_4X4;
   } else {
@@ -2670,26 +2186,15 @@
            cpi->common.fc.coef_probs_16x16);
   vp9_copy(cpi->common.fc.pre_coef_probs_32x32,
            cpi->common.fc.coef_probs_32x32);
-#if CONFIG_CODE_NONZEROCOUNT
-  vp9_copy(cpi->common.fc.pre_nzc_probs_4x4,
-           cpi->common.fc.nzc_probs_4x4);
-  vp9_copy(cpi->common.fc.pre_nzc_probs_8x8,
-           cpi->common.fc.nzc_probs_8x8);
-  vp9_copy(cpi->common.fc.pre_nzc_probs_16x16,
-           cpi->common.fc.nzc_probs_16x16);
-  vp9_copy(cpi->common.fc.pre_nzc_probs_32x32,
-           cpi->common.fc.nzc_probs_32x32);
-  vp9_copy(cpi->common.fc.pre_nzc_pcat_probs,
-           cpi->common.fc.nzc_pcat_probs);
-  // NOTE that if the counts are reset, we also need to uncomment
-  // the count updates in the write_nzc function
-  /*
-  vp9_zero(cpi->common.fc.nzc_counts_4x4);
-  vp9_zero(cpi->common.fc.nzc_counts_8x8);
-  vp9_zero(cpi->common.fc.nzc_counts_16x16);
-  vp9_zero(cpi->common.fc.nzc_counts_32x32);
-  vp9_zero(cpi->common.fc.nzc_pcat_counts);
-  */
+#if CONFIG_CODE_ZEROGROUP
+  vp9_copy(cpi->common.fc.pre_zpc_probs_4x4,
+           cpi->common.fc.zpc_probs_4x4);
+  vp9_copy(cpi->common.fc.pre_zpc_probs_8x8,
+           cpi->common.fc.zpc_probs_8x8);
+  vp9_copy(cpi->common.fc.pre_zpc_probs_16x16,
+           cpi->common.fc.zpc_probs_16x16);
+  vp9_copy(cpi->common.fc.pre_zpc_probs_32x32,
+           cpi->common.fc.zpc_probs_32x32);
 #endif
   vp9_copy(cpi->common.fc.pre_sb_ymode_prob, cpi->common.fc.sb_ymode_prob);
   vp9_copy(cpi->common.fc.pre_ymode_prob, cpi->common.fc.ymode_prob);
@@ -2708,8 +2213,8 @@
   vp9_zero(cpi->common.fc.mv_ref_ct);
 
   update_coef_probs(cpi, &header_bc);
-#if CONFIG_CODE_NONZEROCOUNT
-  update_nzc_probs(cpi, &header_bc);
+#if CONFIG_CODE_ZEROGROUP
+  update_zpc_probs(cpi, &header_bc);
 #endif
 
 #ifdef ENTROPY_STATS
@@ -2771,14 +2276,6 @@
     }
     update_mbintra_mode_probs(cpi, &header_bc);
 
-    for (i = 0; i < PARTITION_PLANES; i++) {
-      vp9_prob Pnew[PARTITION_TYPES - 1];
-      unsigned int bct[PARTITION_TYPES - 1][2];
-      update_mode(&header_bc, PARTITION_TYPES, vp9_partition_encodings,
-                  vp9_partition_tree, Pnew, pc->fc.partition_prob[i], bct,
-                  (unsigned int *)cpi->partition_count[i]);
-    }
-
     vp9_write_nmv_probs(cpi, xd->allow_high_precision_mv, &header_bc);
   }
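
For orientation: the hunks above stop updating partition probabilities per fixed
partition_plane() index and instead index cm->fc.partition_prob[] by a spatial
context. write_modes_sb() points xd->above_seg_context / xd->left_seg_context at
the current column/row, asks partition_plane_context() for the context, and calls
update_partition_context() once the partition choice is coded. Those helpers are
not part of this excerpt; the sketch below only illustrates the assumed
bookkeeping (each neighbour entry recording the block size already coded there)
and may differ from the real implementation.

/*
 * Illustration only -- not the patch's actual helpers.  Assumes each
 * PARTITION_CONTEXT entry stores the size (in log2 MBs) of the block
 * already coded at that position.
 */
typedef char PARTITION_CONTEXT;   /* matches the typedef used elsewhere in this patch */

static int sketch_partition_plane_context(const PARTITION_CONTEXT *above,
                                          const PARTITION_CONTEXT *left,
                                          int bsl /* 1 for 32x32, 2 for 64x64 */) {
  /* A neighbour counts as "split" if it was coded with a smaller block size. */
  const int above_split = above[0] < bsl;
  const int left_split = left[0] < bsl;
  /* Four contexts per block size: neither, above only, left only, both. */
  return (bsl - 1) * 4 + left_split * 2 + above_split;
}

static void sketch_update_partition_context(PARTITION_CONTEXT *above,
                                            PARTITION_CONTEXT *left,
                                            int coded_bsl, int num_mbs) {
  int i;
  /* Record the size actually coded so later blocks see it as their context. */
  for (i = 0; i < num_mbs; ++i) {
    above[i] = (PARTITION_CONTEXT)coded_bsl;
    left[i] = (PARTITION_CONTEXT)coded_bsl;
  }
}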
 
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index f4e3c2e..eede4cb 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -24,33 +24,10 @@
 } search_site;
 
 typedef struct block {
-  // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries
-  int16_t *src_diff;
-  int16_t *coeff;
-
-  // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries
-  int16_t *quant;
-  int16_t *quant_fast;      // fast quant deprecated for now
-  uint8_t *quant_shift;
-  int16_t *zbin;
-  int16_t *zbin_8x8;
-  int16_t *zbin_16x16;
-  int16_t *zbin_32x32;
-  int16_t *zrun_zbin_boost;
-  int16_t *zrun_zbin_boost_8x8;
-  int16_t *zrun_zbin_boost_16x16;
-  int16_t *zrun_zbin_boost_32x32;
-  int16_t *round;
-
-  // Zbin Over Quant value
-  short zbin_extra;
-
   uint8_t **base_src;
   uint8_t **base_second_src;
   int src;
   int src_stride;
-
-  int skip_block;
 } BLOCK;
 
 typedef struct {
@@ -81,12 +58,33 @@
   int comp_pred_diff;
   int single_pred_diff;
   int64_t txfm_rd_diff[NB_TXFM_MODES];
+
+  // Bit flag per mode indicating whether its error is high compared to the others.
+  unsigned int modes_with_high_error;
+
+  // Bit flag per ref frame indicating whether its error is high compared to the others.
+  unsigned int frames_with_high_error;
 } PICK_MODE_CONTEXT;
 
+struct macroblock_plane {
+  DECLARE_ALIGNED(16, int16_t, src_diff[64*64]);
+  DECLARE_ALIGNED(16, int16_t, coeff[64*64]);
+
+  // Quantizer settings
+  int16_t *quant;
+  uint8_t *quant_shift;
+  int16_t *zbin;
+  int16_t *zrun_zbin_boost;
+  int16_t *round;
+
+  // Zbin Over Quant value
+  int16_t zbin_extra;
+};
+
 typedef struct macroblock MACROBLOCK;
 struct macroblock {
-  DECLARE_ALIGNED(16, int16_t, src_diff[64*64+32*32*2]);
-  DECLARE_ALIGNED(16, int16_t, coeff[64*64+32*32*2]);
+  struct macroblock_plane plane[MAX_MB_PLANE];
+  int skip_block;
   // 16 Y blocks, 4 U blocks, 4 V blocks,
   BLOCK block[24];
 
@@ -145,18 +143,9 @@
 
   int encode_breakout;
 
-  // char * gf_active_ptr;
-  signed char *gf_active_ptr;
-
   unsigned char *active_ptr;
 
   vp9_coeff_count token_costs[TX_SIZE_MAX_SB][BLOCK_TYPES];
-#if CONFIG_CODE_NONZEROCOUNT
-  unsigned int nzc_costs_4x4[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][17];
-  unsigned int nzc_costs_8x8[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][65];
-  unsigned int nzc_costs_16x16[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][257];
-  unsigned int nzc_costs_32x32[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][1025];
-#endif
 
   int optimize;
 
@@ -174,7 +163,7 @@
   PICK_MODE_CONTEXT sb64x32_context[2];
 #endif
   PICK_MODE_CONTEXT sb64_context;
-  int partition_cost[PARTITION_PLANES][PARTITION_TYPES];
+  int partition_cost[NUM_PARTITION_CONTEXTS][PARTITION_TYPES];
 
   void (*fwd_txm4x4)(int16_t *input, int16_t *output, int pitch);
   void (*fwd_txm8x4)(int16_t *input, int16_t *output, int pitch);
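
The struct macroblock_plane added above moves src_diff/coeff into per-plane
buffers; the encode paths further down index them with BLOCK_OFFSET() and
raster_block_offset_int16(). As a rough illustration only -- the real helpers
take different arguments and cover more block sizes -- the arithmetic for the
4x4 blocks of a 16x16 luma plane is assumed to look like this:

#include <stdint.h>

/* Sketch of BLOCK_OFFSET(coeff, ib, 16): 16 coefficients per 4x4 block. */
static int16_t *sketch_block_offset(int16_t *coeff, int ib) {
  return coeff + ib * 16;
}

/* Sketch of raster_block_offset_int16() for a 16x16 luma plane. */
static int16_t *sketch_raster_block_offset(int16_t *src_diff, int ib) {
  const int stride = 16;                  /* luma row stride of a 16x16 MB */
  const int row = (ib >> 2) * 4;          /* four 4x4 blocks per row */
  const int col = (ib & 3) * 4;
  return src_diff + row * stride + col;
}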
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index e9612b9..7dc77d9 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -551,6 +551,10 @@
   xd->above_context = cm->above_context + mb_col;
   xd->left_context  = cm->left_context + (mb_row & 3);
 
+  // partition contexts
+  xd->above_seg_context = cm->above_seg_context + mb_col;
+  xd->left_seg_context  = cm->left_seg_context + (mb_row & 3);
+
   // Activity map pointer
   x->mb_activity_ptr = &cpi->mb_activity_map[idx_map];
   x->active_ptr = cpi->active_map + idx_map;
@@ -575,8 +579,7 @@
 
   // Set up distance of MB to edge of frame in 1/8th pel units
   assert(!(mb_col & (bw - 1)) && !(mb_row & (bh - 1)));
-  set_mb_row(cm, xd, mb_row, bh);
-  set_mb_col(cm, xd, mb_col, bw);
+  set_mb_row_col(cm, xd, mb_row, bh, mb_col, bw);
 
   /* set up source buffers */
   setup_pred_block(&x->src, cpi->Source, mb_row, mb_col, NULL, NULL);
@@ -594,7 +597,7 @@
       mbmi->segment_id = find_seg_id(cm->last_frame_seg_map, bsize,
                                      mb_row, cm->mb_rows, mb_col, cm->mb_cols);
     }
-    assert(mbmi->segment_id <= 3);
+    assert(mbmi->segment_id <= (MAX_MB_SEGMENTS-1));
     vp9_mb_init_quantizer(cpi, x);
 
     if (xd->segmentation_enabled && cpi->seg0_cnt > 0 &&
@@ -738,9 +741,6 @@
     if ((mbmi->mode == ZEROMV) && (mbmi->ref_frame == LAST_FRAME))
       cpi->inter_zz_count++;
   }
-#if CONFIG_CODE_NONZEROCOUNT
-  vp9_update_nzc_counts(&cpi->common, xd, mb_row, mb_col);
-#endif
 }
 
 static void encode_sb(VP9_COMP *cpi,
@@ -752,6 +752,11 @@
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   BLOCK_SIZE_TYPE bsize = BLOCK_SIZE_SB32X32;
+  int pl;
+
+  xd->left_seg_context  = cm->left_seg_context + (mb_row & 0x03);
+  xd->above_seg_context = cm->above_seg_context + mb_col;
+  pl = partition_plane_context(xd, bsize);
 
   if (is_sb == BLOCK_SIZE_SB32X32) {
     set_offsets(cpi, mb_row, mb_col, bsize);
@@ -762,7 +767,7 @@
                       output_enabled, mb_row, mb_col, bsize);
     if (output_enabled) {
       update_stats(cpi, mb_row, mb_col);
-      cpi->partition_count[partition_plane(bsize)][PARTITION_NONE]++;
+      cpi->partition_count[pl][PARTITION_NONE]++;
 
       (*tp)->token = EOSB_TOKEN;
       (*tp)++;
@@ -772,7 +777,7 @@
     int i;
 
     if (output_enabled)
-      cpi->partition_count[partition_plane(bsize)][PARTITION_VERT]++;
+      cpi->partition_count[pl][PARTITION_VERT]++;
     for (i = 0; i < 2 && mb_col + i != cm->mb_cols; i++) {
       set_offsets(cpi, mb_row, mb_col + i, BLOCK_SIZE_SB16X32);
       update_state(cpi, &x->sb16x32_context[xd->sb_index][i],
@@ -790,7 +795,7 @@
     int i;
 
     if (output_enabled)
-      cpi->partition_count[partition_plane(bsize)][PARTITION_HORZ]++;
+      cpi->partition_count[pl][PARTITION_HORZ]++;
     for (i = 0; i < 2 && mb_row + i != cm->mb_rows; i++) {
       set_offsets(cpi, mb_row + i, mb_col, BLOCK_SIZE_SB32X16);
       update_state(cpi, &x->sb32x16_context[xd->sb_index][i],
@@ -808,7 +813,7 @@
   } else {
     int i;
     if (output_enabled)
-      cpi->partition_count[partition_plane(bsize)][PARTITION_SPLIT]++;
+      cpi->partition_count[pl][PARTITION_SPLIT]++;
 
     for (i = 0; i < 4; i++) {
       const int x_idx = i & 1, y_idx = i >> 1;
@@ -837,6 +842,10 @@
     }
   }
 
+  xd->above_seg_context = cm->above_seg_context + mb_col;
+  xd->left_seg_context  = cm->left_seg_context + (mb_row & 3);
+  update_partition_context(xd, is_sb, BLOCK_SIZE_SB32X32);
+
   // debug output
 #if DBG_PRNT_SEGMAP
   {
@@ -856,6 +865,11 @@
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   BLOCK_SIZE_TYPE bsize = BLOCK_SIZE_SB64X64;
+  int pl;
+
+  xd->left_seg_context  = cm->left_seg_context + (mb_row & 3);
+  xd->above_seg_context = cm->above_seg_context + mb_col;
+  pl = partition_plane_context(xd, bsize);
 
   if (is_sb[0] == BLOCK_SIZE_SB64X64) {
     set_offsets(cpi, mb_row, mb_col, bsize);
@@ -866,12 +880,13 @@
 
     (*tp)->token = EOSB_TOKEN;
     (*tp)++;
-    cpi->partition_count[partition_plane(bsize)][PARTITION_NONE]++;
+
+    cpi->partition_count[pl][PARTITION_NONE]++;
 #if CONFIG_SBSEGMENT
   } else if (is_sb[0] == BLOCK_SIZE_SB32X64) {
     int i;
 
-    cpi->partition_count[partition_plane(bsize)][PARTITION_VERT]++;
+    cpi->partition_count[pl][PARTITION_VERT]++;
     for (i = 0; i < 2 && mb_col + i * 2 != cm->mb_cols; i++) {
       set_offsets(cpi, mb_row, mb_col + i * 2, BLOCK_SIZE_SB32X64);
       update_state(cpi, &x->sb32x64_context[i], BLOCK_SIZE_SB32X64, 1);
@@ -885,7 +900,7 @@
   } else if (is_sb[0] == BLOCK_SIZE_SB64X32) {
     int i;
 
-    cpi->partition_count[partition_plane(bsize)][PARTITION_HORZ]++;
+    cpi->partition_count[pl][PARTITION_HORZ]++;
     for (i = 0; i < 2 && mb_row + i * 2 != cm->mb_rows; i++) {
       set_offsets(cpi, mb_row + i * 2, mb_col, BLOCK_SIZE_SB64X32);
       update_state(cpi, &x->sb64x32_context[i], BLOCK_SIZE_SB64X32, 1);
@@ -899,7 +914,7 @@
 #endif
   } else {
     int i;
-    cpi->partition_count[partition_plane(bsize)][PARTITION_SPLIT]++;
+    cpi->partition_count[pl][PARTITION_SPLIT]++;
     for (i = 0; i < 4; i++) {
       const int x_idx = i & 1, y_idx = i >> 1;
 
@@ -913,6 +928,12 @@
                 is_sb[i]);
     }
   }
+
+  if (is_sb[0] > BLOCK_SIZE_SB32X32) {
+    xd->above_seg_context = cm->above_seg_context + mb_col;
+    xd->left_seg_context  = cm->left_seg_context + (mb_row & 3);
+    update_partition_context(xd, is_sb[0], BLOCK_SIZE_SB64X64);
+  }
 }
 
 static void encode_sb_row(VP9_COMP *cpi,
@@ -922,10 +943,11 @@
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  int mb_col;
+  int mb_col, pl;
 
   // Initialize the left context for the new SB row
   vpx_memset(cm->left_context, 0, sizeof(cm->left_context));
+  vpx_memset(cm->left_seg_context, 0, sizeof(cm->left_seg_context));
 
   // Code each SB in the row
   for (mb_col = cm->cur_tile_mb_col_start;
@@ -935,10 +957,13 @@
     int sb64_rate = 0, sb64_dist = 0;
     int sb64_skip = 0;
     ENTROPY_CONTEXT_PLANES l[4], a[4];
+    PARTITION_CONTEXT seg_l[4], seg_a[4];
     TOKENEXTRA *tp_orig = *tp;
 
     memcpy(&a, cm->above_context + mb_col, sizeof(a));
     memcpy(&l, cm->left_context, sizeof(l));
+    memcpy(&seg_a, cm->above_seg_context + mb_col, sizeof(seg_a));
+    memcpy(&seg_l, cm->left_seg_context, sizeof(seg_l));
     for (i = 0; i < 4; i++) {
       const int x_idx = (i & 1) << 1, y_idx = i & 2;
       int sb32_rate = 0, sb32_dist = 0;
@@ -985,8 +1010,10 @@
       vpx_memcpy(cm->left_context + y_idx, l2, sizeof(l2));
       vpx_memcpy(cm->above_context + mb_col + x_idx, a2, sizeof(a2));
 
-      sb32_rate += x->partition_cost[partition_plane(BLOCK_SIZE_SB32X32)]
-                                    [PARTITION_SPLIT];
+      xd->left_seg_context  = cm->left_seg_context + (y_idx & 3);
+      xd->above_seg_context = cm->above_seg_context + mb_col + x_idx;
+      pl = partition_plane_context(xd, BLOCK_SIZE_SB32X32);
+      sb32_rate += x->partition_cost[pl][PARTITION_SPLIT];
 
       if (cpi->sf.splitmode_breakout) {
         sb32_skip = splitmodes_used;
@@ -1018,8 +1045,10 @@
           d += d2;
         }
 
-        r += x->partition_cost[partition_plane(BLOCK_SIZE_SB32X32)]
-                              [PARTITION_HORZ];
+        xd->left_seg_context  = cm->left_seg_context + (y_idx & 3);
+        xd->above_seg_context = cm->above_seg_context + mb_col + x_idx;
+        pl = partition_plane_context(xd, BLOCK_SIZE_SB32X32);
+        r += x->partition_cost[pl][PARTITION_HORZ];
 
         /* is this better than MB coding? */
         if (RDCOST(x->rdmult, x->rddiv, r, d) <
@@ -1057,8 +1086,10 @@
           d += d2;
         }
 
-        r += x->partition_cost[partition_plane(BLOCK_SIZE_SB32X32)]
-                              [PARTITION_VERT];
+        xd->left_seg_context  = cm->left_seg_context + (y_idx & 3);
+        xd->above_seg_context = cm->above_seg_context + mb_col + x_idx;
+        pl = partition_plane_context(xd, BLOCK_SIZE_SB32X32);
+        r += x->partition_cost[pl][PARTITION_VERT];
 
         /* is this better than MB coding? */
         if (RDCOST(x->rdmult, x->rddiv, r, d) <
@@ -1081,8 +1112,11 @@
         pick_sb_modes(cpi, mb_row + y_idx, mb_col + x_idx,
                       tp, &r, &d, BLOCK_SIZE_SB32X32,
                       &x->sb32_context[xd->sb_index]);
-        r += x->partition_cost[partition_plane(BLOCK_SIZE_SB32X32)]
-                              [PARTITION_NONE];
+
+        xd->left_seg_context  = cm->left_seg_context + (y_idx & 3);
+        xd->above_seg_context = cm->above_seg_context + mb_col + x_idx;
+        pl = partition_plane_context(xd, BLOCK_SIZE_SB32X32);
+        r += x->partition_cost[pl][PARTITION_NONE];
 
         if (RDCOST(x->rdmult, x->rddiv, r, d) <
                 RDCOST(x->rdmult, x->rddiv, sb32_rate, sb32_dist)) {
@@ -1112,9 +1146,13 @@
 
     memcpy(cm->above_context + mb_col, &a, sizeof(a));
     memcpy(cm->left_context, &l, sizeof(l));
+    memcpy(cm->above_seg_context + mb_col, &seg_a, sizeof(seg_a));
+    memcpy(cm->left_seg_context, &seg_l, sizeof(seg_l));
 
-    sb64_rate += x->partition_cost[partition_plane(BLOCK_SIZE_SB64X64)]
-                                  [PARTITION_SPLIT];
+    xd->left_seg_context  = cm->left_seg_context;
+    xd->above_seg_context = cm->above_seg_context + mb_col;
+    pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64);
+    sb64_rate += x->partition_cost[pl][PARTITION_SPLIT];
 
 #if CONFIG_SBSEGMENT
     // check 64x32
@@ -1140,8 +1178,10 @@
         d += d2;
       }
 
-      r += x->partition_cost[partition_plane(BLOCK_SIZE_SB64X64)]
-                            [PARTITION_HORZ];
+      xd->left_seg_context  = cm->left_seg_context;
+      xd->above_seg_context = cm->above_seg_context + mb_col;
+      pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64);
+      r += x->partition_cost[pl][PARTITION_HORZ];
 
       /* is this better than MB coding? */
       if (RDCOST(x->rdmult, x->rddiv, r, d) <
@@ -1178,8 +1218,10 @@
         d += d2;
       }
 
-      r += x->partition_cost[partition_plane(BLOCK_SIZE_SB64X64)]
-                            [PARTITION_VERT];
+      xd->left_seg_context  = cm->left_seg_context;
+      xd->above_seg_context = cm->above_seg_context + mb_col;
+      pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64);
+      r += x->partition_cost[pl][PARTITION_VERT];
 
       /* is this better than MB coding? */
       if (RDCOST(x->rdmult, x->rddiv, r, d) <
@@ -1200,8 +1242,11 @@
 
       pick_sb_modes(cpi, mb_row, mb_col, tp, &r, &d,
                     BLOCK_SIZE_SB64X64, &x->sb64_context);
-      r += x->partition_cost[partition_plane(BLOCK_SIZE_SB64X64)]
-                            [PARTITION_NONE];
+
+      xd->left_seg_context  = cm->left_seg_context;
+      xd->above_seg_context = cm->above_seg_context + mb_col;
+      pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64);
+      r += x->partition_cost[pl][PARTITION_NONE];
 
       if (RDCOST(x->rdmult, x->rddiv, r, d) <
               RDCOST(x->rdmult, x->rddiv, sb64_rate, sb64_dist)) {
@@ -1238,7 +1283,10 @@
 
   // Copy data over into macro block data structures.
   x->src = *cpi->Source;
-  xd->pre = cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]];
+
+  // TODO(jkoleszar): are these initializations required?
+  setup_pre_planes(xd, &cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]], NULL,
+                   0, 0, NULL, NULL);
   setup_dst_planes(xd, &cm->yv12_fb[cm->new_fb_idx], 0, 0);
 
   // set up frame for intra coded blocks
@@ -1248,8 +1296,6 @@
 
   vp9_setup_block_dptrs(&x->e_mbd);
 
-  vp9_setup_block_ptrs(x);
-
   xd->mode_info_context->mbmi.mode = DC_PRED;
   xd->mode_info_context->mbmi.uv_mode = DC_PRED;
 
@@ -1270,7 +1316,9 @@
 #endif
 
   vpx_memset(cm->above_context, 0,
-             sizeof(ENTROPY_CONTEXT_PLANES) * cm->mb_cols);
+             sizeof(ENTROPY_CONTEXT_PLANES) * mb_cols_aligned_to_sb(cm));
+  vpx_memset(cm->above_seg_context, 0, sizeof(PARTITION_CONTEXT) *
+                                       mb_cols_aligned_to_sb(cm));
 }
 
 static void switch_lossless_mode(VP9_COMP *cpi, int lossless) {
@@ -1338,12 +1386,11 @@
   vp9_zero(cpi->coef_counts_16x16);
   vp9_zero(cpi->coef_counts_32x32);
   vp9_zero(cm->fc.eob_branch_counts);
-#if CONFIG_CODE_NONZEROCOUNT
-  vp9_zero(cm->fc.nzc_counts_4x4);
-  vp9_zero(cm->fc.nzc_counts_8x8);
-  vp9_zero(cm->fc.nzc_counts_16x16);
-  vp9_zero(cm->fc.nzc_counts_32x32);
-  vp9_zero(cm->fc.nzc_pcat_counts);
+#if CONFIG_CODE_ZEROGROUP
+  vp9_zero(cm->fc.zpc_counts_4x4);
+  vp9_zero(cm->fc.zpc_counts_8x8);
+  vp9_zero(cm->fc.zpc_counts_16x16);
+  vp9_zero(cm->fc.zpc_counts_32x32);
 #endif
 
   cpi->mb.e_mbd.lossless = (cm->base_qindex == 0 &&
@@ -1397,6 +1444,8 @@
             encode_sb_row(cpi, mb_row, &tp, &totalrate);
           }
           cpi->tok_count[tile_col] = (unsigned int)(tp - tp_old);
+          assert(tp - cpi->tok <=
+                 get_token_alloc(cm->mb_rows, cm->mb_cols));
         }
       }
     }
@@ -1732,30 +1781,6 @@
 
 }
 
-void vp9_setup_block_ptrs(MACROBLOCK *x) {
-  int r, c;
-  int i;
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++)
-      x->block[r * 4 + c].src_diff = x->src_diff + r * 4 * 16 + c * 4;
-  }
-
-  for (r = 0; r < 2; r++) {
-    for (c = 0; c < 2; c++)
-      x->block[16 + r * 2 + c].src_diff = x->src_diff + 256 + r * 4 * 8 + c * 4;
-  }
-
-
-  for (r = 0; r < 2; r++) {
-    for (c = 0; c < 2; c++)
-      x->block[20 + r * 2 + c].src_diff = x->src_diff + 320 + r * 4 * 8 + c * 4;
-  }
-
-  for (i = 0; i < 24; i++)
-    x->block[i].coeff = x->coeff + i * 16;
-}
-
 void vp9_build_block_offsets(MACROBLOCK *x) {
   int block = 0;
   int br, bc;
@@ -1827,7 +1852,7 @@
   }
 #endif
 
-  if (xd->mode_info_context->mbmi.sb_type) {
+  if (xd->mode_info_context->mbmi.sb_type > BLOCK_SIZE_MB16X16) {
     ++cpi->sb_ymode_count[m];
   } else {
     ++cpi->ymode_count[m];
@@ -1873,137 +1898,6 @@
 #endif
 }
 
-#if CONFIG_CODE_NONZEROCOUNT
-static void gather_nzcs_mb16(VP9_COMMON *const cm,
-                             MACROBLOCKD *xd) {
-  int i;
-  vpx_memset(xd->mode_info_context->mbmi.nzcs, 0,
-             384 * sizeof(xd->mode_info_context->mbmi.nzcs[0]));
-  switch (xd->mode_info_context->mbmi.txfm_size) {
-    case TX_4X4:
-      for (i = 0; i < 24; ++i) {
-        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
-      }
-      break;
-
-    case TX_8X8:
-      for (i = 0; i < 16; i += 4) {
-        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
-      }
-      if (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
-          xd->mode_info_context->mbmi.mode == SPLITMV) {
-        for (i = 16; i < 24; ++i) {
-          xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
-        }
-      } else {
-        for (i = 16; i < 24; i += 4) {
-          xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
-        }
-      }
-      break;
-
-    case TX_16X16:
-      xd->mode_info_context->mbmi.nzcs[0] = xd->nzcs[0];
-      for (i = 16; i < 24; i += 4) {
-        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
-      }
-      break;
-
-    default:
-      break;
-  }
-}
-
-static void gather_nzcs_sb32(VP9_COMMON *const cm,
-                             MACROBLOCKD *xd) {
-  MODE_INFO *m = xd->mode_info_context;
-  int mis = cm->mode_info_stride;
-  int i, j;
-
-  vpx_memset(m->mbmi.nzcs, 0,
-             384 * sizeof(xd->mode_info_context->mbmi.nzcs[0]));
-  switch (xd->mode_info_context->mbmi.txfm_size) {
-    case TX_4X4:
-      for (i = 0; i < 96; ++i) {
-        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
-      }
-      break;
-
-    case TX_8X8:
-      for (i = 0; i < 96; i += 4) {
-        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
-      }
-      break;
-
-    case TX_16X16:
-      for (i = 0; i < 96; i += 16) {
-        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
-      }
-      break;
-
-    case TX_32X32:
-      xd->mode_info_context->mbmi.nzcs[0] = xd->nzcs[0];
-      for (i = 64; i < 96; i += 16) {
-        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
-      }
-      break;
-
-    default:
-      break;
-  }
-  for (i = 0; i < 2; ++i)
-    for (j = 0; j < 2; ++j) {
-      if (i == 0 && j == 0) continue;
-      vpx_memcpy((m + j + mis * i)->mbmi.nzcs, m->mbmi.nzcs,
-                 384 * sizeof(m->mbmi.nzcs[0]));
-    }
-}
-
-static void gather_nzcs_sb64(VP9_COMMON *const cm,
-                             MACROBLOCKD *xd) {
-  MODE_INFO *m = xd->mode_info_context;
-  int mis = cm->mode_info_stride;
-  int i, j;
-
-  vpx_memset(xd->mode_info_context->mbmi.nzcs, 0,
-             384 * sizeof(xd->mode_info_context->mbmi.nzcs[0]));
-  switch (xd->mode_info_context->mbmi.txfm_size) {
-    case TX_4X4:
-      for (i = 0; i < 384; ++i) {
-        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
-      }
-      break;
-
-    case TX_8X8:
-      for (i = 0; i < 384; i += 4) {
-        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
-      }
-      break;
-
-    case TX_16X16:
-      for (i = 0; i < 384; i += 16) {
-        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
-      }
-      break;
-
-    case TX_32X32:
-      for (i = 0; i < 384; i += 64) {
-        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
-      }
-      break;
-
-    default:
-      break;
-  }
-  for (i = 0; i < 4; ++i)
-    for (j = 0; j < 4; ++j) {
-      if (i == 0 && j == 0) continue;
-      vpx_memcpy((m + j + mis * i)->mbmi.nzcs, m->mbmi.nzcs,
-                 384 * sizeof(m->mbmi.nzcs[0]));
-    }
-}
-#endif
-
 static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t,
                               int output_enabled,
                               int mb_row, int mb_col) {
@@ -2014,7 +1908,7 @@
   MB_MODE_INFO *const mbmi = &mi->mbmi;
   const int mis = cm->mode_info_stride;
 
-  assert(!xd->mode_info_context->mbmi.sb_type);
+  assert(xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_MB16X16);
 
 #ifdef ENC_DEBUG
   enc_debug = (cpi->common.current_video_frame == 11 && cm->show_frame &&
@@ -2079,7 +1973,7 @@
     if (output_enabled)
       sum_intra_stats(cpi, x);
   } else {
-    int ref_fb_idx;
+    int ref_fb_idx, second_ref_fb_idx;
 #ifdef ENC_DEBUG
     if (enc_debug)
       printf("Mode %d skip %d tx_size %d ref %d ref2 %d mv %d %d interp %d\n",
@@ -2098,39 +1992,34 @@
     else
       ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];
 
-    setup_pred_block(&xd->pre,
-                     &cpi->common.yv12_fb[ref_fb_idx],
-                     mb_row, mb_col,
-                     &xd->scale_factor[0], &xd->scale_factor_uv[0]);
-
     if (mbmi->second_ref_frame > 0) {
-      int second_ref_fb_idx;
-
       if (mbmi->second_ref_frame == LAST_FRAME)
         second_ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx];
       else if (mbmi->second_ref_frame == GOLDEN_FRAME)
         second_ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx];
       else
         second_ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];
-
-      setup_pred_block(&xd->second_pre,
-                       &cpi->common.yv12_fb[second_ref_fb_idx],
-                       mb_row, mb_col,
-                       &xd->scale_factor[1], &xd->scale_factor_uv[1]);
     }
 
+    setup_pre_planes(xd,
+        &cpi->common.yv12_fb[ref_fb_idx],
+        mbmi->second_ref_frame > 0 ? &cpi->common.yv12_fb[second_ref_fb_idx]
+                                   : NULL,
+        mb_row, mb_col, xd->scale_factor, xd->scale_factor_uv);
+
     if (!x->skip) {
       vp9_encode_inter16x16(cm, x, mb_row, mb_col);
     } else {
       vp9_build_inter_predictors_sb(xd, mb_row, mb_col, BLOCK_SIZE_MB16X16);
 #if CONFIG_COMP_INTERINTRA_PRED
       if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) {
-        vp9_build_interintra_16x16_predictors_mb(xd,
-                                                 xd->plane[0].dst.buf,
-                                                 xd->plane[1].dst.buf,
-                                                 xd->plane[2].dst.buf,
-                                                 xd->plane[0].dst.stride,
-                                                 xd->plane[1].dst.stride);
+        vp9_build_interintra_predictors(xd,
+                                        xd->plane[0].dst.buf,
+                                        xd->plane[1].dst.buf,
+                                        xd->plane[2].dst.buf,
+                                        xd->plane[0].dst.stride,
+                                        xd->plane[1].dst.stride,
+                                        BLOCK_SIZE_MB16X16);
       }
 #endif
     }
@@ -2189,9 +2078,6 @@
     }
 #endif
 
-#if CONFIG_CODE_NONZEROCOUNT
-    gather_nzcs_mb16(cm, xd);
-#endif
     vp9_tokenize_mb(cpi, xd, t, !output_enabled);
 
   } else {
@@ -2239,14 +2125,6 @@
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  const uint8_t *src = x->src.y_buffer;
-  uint8_t *dst = xd->plane[0].dst.buf;
-  const uint8_t *usrc = x->src.u_buffer;
-  uint8_t *udst = xd->plane[1].dst.buf;
-  const uint8_t *vsrc = x->src.v_buffer;
-  uint8_t *vdst = xd->plane[2].dst.buf;
-  int src_y_stride = x->src.y_stride, dst_y_stride = xd->plane[0].dst.stride;
-  int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->plane[1].dst.stride;
   int n;
   MODE_INFO *mi = x->e_mbd.mode_info_context;
   unsigned int segment_id = mi->mbmi.segment_id;
@@ -2296,7 +2174,7 @@
     if (output_enabled)
       sum_intra_stats(cpi, x);
   } else {
-    int ref_fb_idx;
+    int ref_fb_idx, second_ref_fb_idx;
 
     assert(cm->frame_type != KEY_FRAME);
 
@@ -2307,35 +2185,26 @@
     else
       ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];
 
-    setup_pred_block(&xd->pre,
-                     &cpi->common.yv12_fb[ref_fb_idx],
-                     mb_row, mb_col,
-                     &xd->scale_factor[0], &xd->scale_factor_uv[0]);
-
     if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
-      int second_ref_fb_idx;
-
       if (xd->mode_info_context->mbmi.second_ref_frame == LAST_FRAME)
         second_ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx];
       else if (xd->mode_info_context->mbmi.second_ref_frame == GOLDEN_FRAME)
         second_ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx];
       else
         second_ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];
-
-      setup_pred_block(&xd->second_pre,
-                       &cpi->common.yv12_fb[second_ref_fb_idx],
-                       mb_row, mb_col,
-                       &xd->scale_factor[1], &xd->scale_factor_uv[1]);
     }
 
+    setup_pre_planes(xd,
+        &cpi->common.yv12_fb[ref_fb_idx],
+        xd->mode_info_context->mbmi.second_ref_frame > 0
+            ? &cpi->common.yv12_fb[second_ref_fb_idx] : NULL,
+        mb_row, mb_col, xd->scale_factor, xd->scale_factor_uv);
+
     vp9_build_inter_predictors_sb(xd, mb_row, mb_col, bsize);
   }
 
   if (!x->skip) {
-    vp9_subtract_sby_s_c(x->src_diff, src, src_y_stride, dst, dst_y_stride,
-                         bsize);
-    vp9_subtract_sbuv_s_c(x->src_diff, usrc, vsrc, src_uv_stride,
-                          udst, vdst, dst_uv_stride, bsize);
+    vp9_subtract_sb(x, bsize);
 
     switch (xd->mode_info_context->mbmi.txfm_size) {
       case TX_32X32:
@@ -2411,13 +2280,6 @@
       default: assert(0);
     }
     vp9_recon_sb_c(xd, bsize);
-#if CONFIG_CODE_NONZEROCOUNT
-    if (bsize == BLOCK_SIZE_SB32X32) {
-      gather_nzcs_sb32(cm, &x->e_mbd);
-    } else {
-      gather_nzcs_sb64(cm, &x->e_mbd);
-    }
-#endif
     vp9_tokenize_sb(cpi, &x->e_mbd, t, !output_enabled, bsize);
   } else {
     // FIXME(rbultje): not tile-aware (mi - 1)
diff --git a/vp9/encoder/vp9_encodeframe.h b/vp9/encoder/vp9_encodeframe.h
index 9f13edc..4ace468 100644
--- a/vp9/encoder/vp9_encodeframe.h
+++ b/vp9/encoder/vp9_encodeframe.h
@@ -16,6 +16,4 @@
 
 void vp9_build_block_offsets(struct macroblock *x);
 
-void vp9_setup_block_ptrs(struct macroblock *x);
-
 #endif  // VP9_ENCODER_VP9_ENCODEFRAME_H_
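
The vp9_encodeintra.c hunks that follow replace vp9_subtract_b() and
vp9_subtract_4b_c() with a generic vp9_subtract_block(rows, cols, diff,
diff_stride, src, src_stride, pred, pred_stride). A minimal sketch of such a
helper, matching the call sites below but not the in-tree (possibly
SIMD-accelerated) implementation, would be:

#include <stdint.h>

static void sketch_subtract_block(int rows, int cols,
                                  int16_t *diff, int diff_stride,
                                  const uint8_t *src, int src_stride,
                                  const uint8_t *pred, int pred_stride) {
  int r, c;
  for (r = 0; r < rows; ++r) {
    for (c = 0; c < cols; ++c)
      diff[c] = src[c] - pred[c];        /* residual = source - prediction */
    diff += diff_stride;
    src += src_stride;
    pred += pred_stride;
  }
}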
diff --git a/vp9/encoder/vp9_encodeintra.c b/vp9/encoder/vp9_encodeintra.c
index 95befb7..66d62d9 100644
--- a/vp9/encoder/vp9_encodeintra.c
+++ b/vp9/encoder/vp9_encodeintra.c
@@ -37,7 +37,7 @@
     }
   }
 
-  return vp9_get_mb_ss(x->src_diff);
+  return vp9_get_mb_ss(x->plane[0].src_diff);
 }
 
 static void encode_intra4x4block(MACROBLOCK *x, int ib) {
@@ -45,6 +45,10 @@
   BLOCK *be = &x->block[ib];
   MACROBLOCKD * const xd = &x->e_mbd;
   TX_TYPE tx_type;
+  int16_t* const src_diff =
+      raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, ib,
+                                x->plane[0].src_diff);
+  int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, ib, 16);
 
   assert(ib < 16);
 
@@ -54,16 +58,18 @@
 
   vp9_intra4x4_predict(&x->e_mbd, b, b->bmi.as_mode.first,
                        *(b->base_dst) + b->dst, b->dst_stride);
-  vp9_subtract_b(be, b, 16);
+  vp9_subtract_block(4, 4, src_diff, 16,
+                     *(be->base_src) + be->src, be->src_stride,
+                     *(b->base_dst) + b->dst, b->dst_stride);
 
   tx_type = get_tx_type_4x4(&x->e_mbd, ib);
   if (tx_type != DCT_DCT) {
-    vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type);
+    vp9_short_fht4x4(src_diff, coeff, 16, tx_type);
     vp9_ht_quantize_b_4x4(x, ib, tx_type);
     vp9_short_iht4x4(BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16),
                      b->diff, 16, tx_type);
   } else {
-    x->fwd_txm4x4(be->src_diff, be->coeff, 32);
+    x->fwd_txm4x4(src_diff, coeff, 32);
     x->quantize_b_4x4(x, ib, 16);
     vp9_inverse_transform_b_4x4(&x->e_mbd, xd->plane[0].eobs[ib],
                                 BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16),
@@ -86,10 +92,7 @@
   TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
 
   vp9_build_intra_predictors_sby_s(xd, BLOCK_SIZE_MB16X16);
-  vp9_subtract_sby_s_c(x->src_diff,
-                       x->src.y_buffer, x->src.y_stride,
-                       xd->plane[0].dst.buf, xd->plane[0].dst.stride,
-                       BLOCK_SIZE_MB16X16);
+  vp9_subtract_sby(x, BLOCK_SIZE_MB16X16);
 
   switch (tx_size) {
     case TX_16X16:
@@ -123,11 +126,7 @@
   TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
 
   vp9_build_intra_predictors_sbuv_s(xd, BLOCK_SIZE_MB16X16);
-  vp9_subtract_sbuv_s_c(x->src_diff,
-                        x->src.u_buffer, x->src.v_buffer, x->src.uv_stride,
-                        xd->plane[1].dst.buf, xd->plane[2].dst.buf,
-                        xd->plane[1].dst.stride,
-                        BLOCK_SIZE_MB16X16);
+  vp9_subtract_sbuv(x, BLOCK_SIZE_MB16X16);
 
   switch (tx_size) {
     case TX_4X4:
@@ -146,13 +145,16 @@
       break;
     }
 
-  vp9_recon_intra_mbuv(xd);
+  vp9_recon_sbuv(xd, BLOCK_SIZE_MB16X16);
 }
 
 void vp9_encode_intra8x8(MACROBLOCK *x, int ib) {
   MACROBLOCKD *xd = &x->e_mbd;
   BLOCKD *b = &xd->block[ib];
   BLOCK *be = &x->block[ib];
+  int16_t* const src_diff =
+      raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, ib,
+                                x->plane[0].src_diff);
   const int iblock[4] = {0, 1, 4, 5};
   int i;
   TX_TYPE tx_type;
@@ -160,40 +162,47 @@
   vp9_intra8x8_predict(xd, b, b->bmi.as_mode.first,
                        *(b->base_dst) + b->dst, b->dst_stride);
   // generate residual blocks
-  vp9_subtract_4b_c(be, b, 16);
+  vp9_subtract_block(8, 8, src_diff, 16,
+                     *(be->base_src) + be->src, be->src_stride,
+                     *(b->base_dst) + b->dst, b->dst_stride);
 
   if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
     int idx = (ib & 0x02) ? (ib + 2) : ib;
-    int16_t * const dqcoeff = BLOCK_OFFSET(xd->plane[0].dqcoeff, idx, 16);
+    int16_t* const dqcoeff = BLOCK_OFFSET(xd->plane[0].dqcoeff, idx, 16);
+    int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, idx, 16);
 
     assert(idx < 16);
     tx_type = get_tx_type_8x8(xd, ib);
     if (tx_type != DCT_DCT) {
-      vp9_short_fht8x8(be->src_diff, (x->block + idx)->coeff, 16, tx_type);
+      vp9_short_fht8x8(src_diff, coeff, 16, tx_type);
       x->quantize_b_8x8(x, idx, tx_type, 16);
       vp9_short_iht8x8(dqcoeff, xd->block[ib].diff,
                             16, tx_type);
     } else {
-      x->fwd_txm8x8(be->src_diff, (x->block + idx)->coeff, 32);
+      x->fwd_txm8x8(src_diff, coeff, 32);
       x->quantize_b_8x8(x, idx, DCT_DCT, 16);
       vp9_short_idct8x8(dqcoeff, xd->block[ib].diff, 32);
     }
   } else {
     for (i = 0; i < 4; i++) {
       int idx = ib + iblock[i];
-      int16_t * const dqcoeff = BLOCK_OFFSET(xd->plane[0].dqcoeff, idx, 16);
+      int16_t* const dqcoeff = BLOCK_OFFSET(xd->plane[0].dqcoeff, idx, 16);
+      int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, idx, 16);
+      int16_t* const src_diff =
+          raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, idx,
+                                    x->plane[0].src_diff);
 
       assert(idx < 16);
       b = &xd->block[ib + iblock[i]];
       be = &x->block[ib + iblock[i]];
       tx_type = get_tx_type_4x4(xd, ib + iblock[i]);
       if (tx_type != DCT_DCT) {
-        vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type);
+        vp9_short_fht4x4(src_diff, coeff, 16, tx_type);
         vp9_ht_quantize_b_4x4(x, ib + iblock[i], tx_type);
         vp9_short_iht4x4(dqcoeff, b->diff, 16, tx_type);
       } else if (!(i & 1) &&
                  get_tx_type_4x4(xd, ib + iblock[i] + 1) == DCT_DCT) {
-        x->fwd_txm8x4(be->src_diff, be->coeff, 32);
+        x->fwd_txm8x4(src_diff, coeff, 32);
         x->quantize_b_4x4_pair(x, ib + iblock[i], ib + iblock[i] + 1, 16);
         vp9_inverse_transform_b_4x4(xd, xd->plane[0].eobs[ib + iblock[i]],
                                     dqcoeff, b->diff, 32);
@@ -201,7 +210,7 @@
                                     dqcoeff + 16, (b + 1)->diff, 32);
         i++;
       } else {
-        x->fwd_txm4x4(be->src_diff, be->coeff, 32);
+        x->fwd_txm4x4(src_diff, coeff, 32);
         x->quantize_b_4x4(x, ib + iblock[i], 16);
         vp9_inverse_transform_b_4x4(xd, xd->plane[0].eobs[ib + iblock[i]],
                                     dqcoeff, b->diff, 32);
@@ -229,16 +238,23 @@
   BLOCKD *b = &x->e_mbd.block[ib];
   BLOCK *be = &x->block[ib];
   int16_t * const dqcoeff = MB_SUBBLOCK_FIELD(xd, dqcoeff, ib);
+  int16_t* const coeff = MB_SUBBLOCK_FIELD(x, coeff, ib);
   const int plane = ib < 20 ? 1 : 2;
   const int block = ib < 20 ? ib - 16 : ib - 20;
+  int16_t* const src_diff =
+      raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, plane, block,
+                                x->plane[plane].src_diff);
 
   assert(ib >= 16 && ib < 24);
   vp9_intra_uv4x4_predict(&x->e_mbd, b, mode,
                           *(b->base_dst) + b->dst, b->dst_stride);
 
-  vp9_subtract_b(be, b, 8);
+  assert(xd->plane[1].subsampling_x == 1);
+  vp9_subtract_block(4, 4, src_diff, 8,
+                     *(be->base_src) + be->src, be->src_stride,
+                     *(b->base_dst) + b->dst, b->dst_stride);
 
-  x->fwd_txm4x4(be->src_diff, be->coeff, 16);
+  x->fwd_txm4x4(src_diff, coeff, 16);
   x->quantize_b_4x4(x, ib, 16);
   vp9_inverse_transform_b_4x4(&x->e_mbd, xd->plane[plane].eobs[block],
                               dqcoeff, b->diff, 16);
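Note on the encodeintra changes above: residuals now live in the per-plane x->plane[p].src_diff buffers, and raster_block_offset_int16() locates a 4x4 sub-block inside them. A hedged sketch of the offset arithmetic this helper is assumed to perform for the BLOCK_SIZE_MB16X16 / plane 0 case (four 4x4 blocks per row, diff stride 16):

// Hedged sketch, not the actual helper: offset of 4x4 block ib inside a
// 16-wide luma diff buffer laid out in raster order.
static int16_t *raster_block_offset_sketch(int16_t *src_diff, int ib) {
  const int stride = 16;            // luma diff stride for a 16x16 macroblock
  const int row = (ib >> 2) * 4;    // 4 blocks per row of the macroblock
  const int col = (ib & 3) * 4;
  return src_diff + row * stride + col;
}

For the chroma planes (ib 16..23 mapped to block indices 0..3) the same idea applies with a stride of 8, which is why the 4x4 UV path above subtracts into an 8-wide diff buffer.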
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index ea19fbf..d8893b6 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -20,102 +20,54 @@
 #include "vp9/common/vp9_systemdependent.h"
 #include "vp9_rtcd.h"
 
-void vp9_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch) {
-  uint8_t *src_ptr = (*(be->base_src) + be->src);
-  int16_t *diff_ptr = be->src_diff;
-  uint8_t *pred_ptr = *(bd->base_dst) + bd->dst;
-  int src_stride = be->src_stride;
-  int dst_stride = bd->dst_stride;
-
+void vp9_subtract_block(int rows, int cols,
+                        int16_t *diff_ptr, int diff_stride,
+                        const uint8_t *src_ptr, int src_stride,
+                        const uint8_t *pred_ptr, int pred_stride) {
   int r, c;
 
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++)
+  for (r = 0; r < rows; r++) {
+    for (c = 0; c < cols; c++)
       diff_ptr[c] = src_ptr[c] - pred_ptr[c];
 
-    diff_ptr += pitch;
-    pred_ptr += dst_stride;
+    diff_ptr += diff_stride;
+    pred_ptr += pred_stride;
     src_ptr  += src_stride;
   }
 }
 
-void vp9_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch) {
-  uint8_t *src_ptr = (*(be->base_src) + be->src);
-  int16_t *diff_ptr = be->src_diff;
-  uint8_t *pred_ptr = *(bd->base_dst) + bd->dst;
-  int src_stride = be->src_stride;
-  int dst_stride = bd->dst_stride;
-  int r, c;
 
-  for (r = 0; r < 8; r++) {
-    for (c = 0; c < 8; c++)
-      diff_ptr[c] = src_ptr[c] - pred_ptr[c];
+static void subtract_plane(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int plane) {
+  const MACROBLOCKD * const xd = &x->e_mbd;
+  const int bw = 4 << (b_width_log2(bsize) - xd->plane[plane].subsampling_x);
+  const int bh = 4 << (b_height_log2(bsize) - xd->plane[plane].subsampling_y);
+  const uint8_t *src = plane == 0 ? x->src.y_buffer :
+                       plane == 1 ? x->src.u_buffer : x->src.v_buffer;
+  const int src_stride = plane == 0 ? x->src.y_stride : x->src.uv_stride;
 
-    diff_ptr += pitch;
-    pred_ptr += dst_stride;
-    src_ptr  += src_stride;
-  }
+  assert(plane < 3);
+  vp9_subtract_block(bh, bw,
+                     x->plane[plane].src_diff, bw, src, src_stride,
+                     xd->plane[plane].dst.buf, xd->plane[plane].dst.stride);
 }
 
-void vp9_subtract_sby_s_c(int16_t *diff, const uint8_t *src, int src_stride,
-                          const uint8_t *pred, int dst_stride,
-                          BLOCK_SIZE_TYPE bsize) {
-  const int bh = 16 << mb_height_log2(bsize), bw = 16 << mb_width_log2(bsize);
-  int r, c;
-
-  for (r = 0; r < bh; r++) {
-    for (c = 0; c < bw; c++)
-      diff[c] = src[c] - pred[c];
-
-    diff += bw;
-    pred += dst_stride;
-    src  += src_stride;
-  }
+void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
+  subtract_plane(x, bsize, 0);
 }
 
-void vp9_subtract_sbuv_s_c(int16_t *diff, const uint8_t *usrc,
-                           const uint8_t *vsrc, int src_stride,
-                           const uint8_t *upred,
-                           const uint8_t *vpred, int dst_stride,
-                           BLOCK_SIZE_TYPE bsize) {
-  const int bhl = mb_height_log2(bsize), bwl = mb_width_log2(bsize);
-  const int uoff = (16 * 16) << (bhl + bwl), voff = (uoff * 5) >> 2;
-  const int bw = 8 << bwl, bh = 8 << bhl;
-  int16_t *udiff = diff + uoff;
-  int16_t *vdiff = diff + voff;
-  int r, c;
+void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
+  int i;
 
-  for (r = 0; r < bh; r++) {
-    for (c = 0; c < bw; c++)
-      udiff[c] = usrc[c] - upred[c];
-
-    udiff += bw;
-    upred += dst_stride;
-    usrc  += src_stride;
-  }
-
-  for (r = 0; r < bh; r++) {
-    for (c = 0; c < bw; c++)
-      vdiff[c] = vsrc[c] - vpred[c];
-
-    vdiff += bw;
-    vpred += dst_stride;
-    vsrc  += src_stride;
-  }
+  for (i = 1; i < MAX_MB_PLANE; i++)
+    subtract_plane(x, bsize, i);
 }
 
-static void subtract_mb(MACROBLOCK *x) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  vp9_subtract_sby_s_c(x->src_diff, x->src.y_buffer, x->src.y_stride,
-                       xd->plane[0].dst.buf, xd->plane[0].dst.stride,
-                       BLOCK_SIZE_MB16X16);
-  vp9_subtract_sbuv_s_c(x->src_diff, x->src.u_buffer, x->src.v_buffer,
-                        x->src.uv_stride,
-                        xd->plane[1].dst.buf, xd->plane[2].dst.buf,
-                        xd->plane[1].dst.stride,
-                        BLOCK_SIZE_MB16X16);
+void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
+  vp9_subtract_sby(x, bsize);
+  vp9_subtract_sbuv(x, bsize);
 }
 
+
 void vp9_transform_sby_32x32(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
   const int bwl = mb_width_log2(bsize) - 1, bw = 1 << bwl;
   const int bh = 1 << (mb_height_log2(bsize) - 1);
@@ -125,8 +77,8 @@
   for (n = 0; n < bw * bh; n++) {
     const int x_idx = n & (bw - 1), y_idx = n >> bwl;
 
-    vp9_short_fdct32x32(x->src_diff + y_idx * stride * 32 + x_idx * 32,
-                        x->coeff + n * 1024, stride * 2);
+    vp9_short_fdct32x32(x->plane[0].src_diff + y_idx * stride * 32 + x_idx * 32,
+                        x->plane[0].coeff + n * 1024, stride * 2);
   }
 }
 
@@ -143,11 +95,12 @@
                                               (y_idx * bstride + x_idx) * 4);
 
     if (tx_type != DCT_DCT) {
-      vp9_short_fht16x16(x->src_diff + y_idx * stride * 16 + x_idx * 16,
-                         x->coeff + n * 256, stride, tx_type);
+      vp9_short_fht16x16(x->plane[0].src_diff +
+                             y_idx * stride * 16 + x_idx * 16,
+                         x->plane[0].coeff + n * 256, stride, tx_type);
     } else {
-      x->fwd_txm16x16(x->src_diff + y_idx * stride * 16 + x_idx * 16,
-                      x->coeff + n * 256, stride * 2);
+      x->fwd_txm16x16(x->plane[0].src_diff + y_idx * stride * 16 + x_idx * 16,
+                      x->plane[0].coeff + n * 256, stride * 2);
     }
   }
 }
@@ -164,11 +117,11 @@
     const TX_TYPE tx_type = get_tx_type_8x8(xd, (y_idx * bstride + x_idx) * 2);
 
     if (tx_type != DCT_DCT) {
-      vp9_short_fht8x8(x->src_diff + y_idx * stride * 8 + x_idx * 8,
-                       x->coeff + n * 64, stride, tx_type);
+      vp9_short_fht8x8(x->plane[0].src_diff + y_idx * stride * 8 + x_idx * 8,
+                       x->plane[0].coeff + n * 64, stride, tx_type);
     } else {
-      x->fwd_txm8x8(x->src_diff + y_idx * stride * 8 + x_idx * 8,
-                    x->coeff + n * 64, stride * 2);
+      x->fwd_txm8x8(x->plane[0].src_diff + y_idx * stride * 8 + x_idx * 8,
+                    x->plane[0].coeff + n * 64, stride * 2);
     }
   }
 }
@@ -185,11 +138,11 @@
     const TX_TYPE tx_type = get_tx_type_4x4(xd, n);
 
     if (tx_type != DCT_DCT) {
-      vp9_short_fht4x4(x->src_diff + y_idx * stride * 4 + x_idx * 4,
-                       x->coeff + n * 16, stride, tx_type);
+      vp9_short_fht4x4(x->plane[0].src_diff + y_idx * stride * 4 + x_idx * 4,
+                       x->plane[0].coeff + n * 16, stride, tx_type);
     } else {
-      x->fwd_txm4x4(x->src_diff + y_idx * stride * 4 + x_idx * 4,
-                    x->coeff + n * 16, stride * 2);
+      x->fwd_txm4x4(x->plane[0].src_diff + y_idx * stride * 4 + x_idx * 4,
+                    x->plane[0].coeff + n * 16, stride * 2);
     }
   }
 }
@@ -197,15 +150,12 @@
 void vp9_transform_sbuv_32x32(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
   assert(bsize == BLOCK_SIZE_SB64X64);
   vp9_clear_system_state();
-  vp9_short_fdct32x32(x->src_diff + 4096,
-                      x->coeff + 4096, 64);
-  vp9_short_fdct32x32(x->src_diff + 4096 + 1024,
-                      x->coeff + 4096 + 1024, 64);
+  vp9_short_fdct32x32(x->plane[1].src_diff, x->plane[1].coeff, 64);
+  vp9_short_fdct32x32(x->plane[2].src_diff, x->plane[2].coeff, 64);
 }
 
 void vp9_transform_sbuv_16x16(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
   const int bwl = mb_width_log2(bsize), bhl = mb_height_log2(bsize);
-  const int uoff = (16 * 16) << (bwl + bhl), voff = (uoff * 5) >> 2;
   const int bw = 1 << (bwl - 1), bh = 1 << (bhl - 1);
   const int stride = 16 << (bwl - 1);
   int n;
@@ -214,16 +164,15 @@
   for (n = 0; n < bw * bh; n++) {
     const int x_idx = n & (bw - 1), y_idx = n >> (bwl - 1);
 
-    x->fwd_txm16x16(x->src_diff + uoff + y_idx * stride * 16 + x_idx * 16,
-                    x->coeff + uoff + n * 256, stride * 2);
-    x->fwd_txm16x16(x->src_diff + voff + y_idx * stride * 16 + x_idx * 16,
-                    x->coeff + voff + n * 256, stride * 2);
+    x->fwd_txm16x16(x->plane[1].src_diff + y_idx * stride * 16 + x_idx * 16,
+                    x->plane[1].coeff + n * 256, stride * 2);
+    x->fwd_txm16x16(x->plane[2].src_diff + y_idx * stride * 16 + x_idx * 16,
+                    x->plane[2].coeff + n * 256, stride * 2);
   }
 }
 
 void vp9_transform_sbuv_8x8(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
   const int bwl = mb_width_log2(bsize) + 1, bhl = mb_height_log2(bsize) + 1;
-  const int uoff = (8 * 8) << (bwl + bhl), voff = (uoff * 5) >> 2;
   const int bw = 1 << (bwl - 1), bh = 1 << (bhl - 1);
   const int stride = 8 << (bwl - 1);
   int n;
@@ -232,16 +181,15 @@
   for (n = 0; n < bw * bh; n++) {
     const int x_idx = n & (bw - 1), y_idx = n >> (bwl - 1);
 
-    x->fwd_txm8x8(x->src_diff + uoff + y_idx * stride * 8 + x_idx * 8,
-                  x->coeff + uoff + n * 64, stride * 2);
-    x->fwd_txm8x8(x->src_diff + voff + y_idx * stride * 8 + x_idx * 8,
-                  x->coeff + voff + n * 64, stride * 2);
+    x->fwd_txm8x8(x->plane[1].src_diff + y_idx * stride * 8 + x_idx * 8,
+                  x->plane[1].coeff + n * 64, stride * 2);
+    x->fwd_txm8x8(x->plane[2].src_diff + y_idx * stride * 8 + x_idx * 8,
+                  x->plane[2].coeff + n * 64, stride * 2);
   }
 }
 
 void vp9_transform_sbuv_4x4(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
   const int bwl = mb_width_log2(bsize) + 2, bhl = mb_height_log2(bsize) + 2;
-  const int uoff = (4 * 4) << (bwl + bhl), voff = (uoff * 5) >> 2;
   const int bw = 1 << (bwl - 1), bh = 1 << (bhl - 1);
   const int stride = 4 << (bwl - 1);
   int n;
@@ -250,10 +198,10 @@
   for (n = 0; n < bw * bh; n++) {
     const int x_idx = n & (bw - 1), y_idx = n >> (bwl - 1);
 
-    x->fwd_txm4x4(x->src_diff + uoff + y_idx * stride * 4 + x_idx * 4,
-                  x->coeff + uoff + n * 16, stride * 2);
-    x->fwd_txm4x4(x->src_diff + voff + y_idx * stride * 4 + x_idx * 4,
-                  x->coeff + voff + n * 16, stride * 2);
+    x->fwd_txm4x4(x->plane[1].src_diff + y_idx * stride * 4 + x_idx * 4,
+                  x->plane[1].coeff + n * 16, stride * 2);
+    x->fwd_txm4x4(x->plane[2].src_diff + y_idx * stride * 4 + x_idx * 4,
+                  x->plane[2].coeff + n * 16, stride * 2);
   }
 }
 
@@ -295,10 +243,10 @@
                                      int idx, int token,
                                      uint8_t *token_cache,
                                      int pad, int l) {
-  int bak = token_cache[idx], pt;
-  token_cache[idx] = token;
+  int bak = token_cache[scan[idx]], pt;
+  token_cache[scan[idx]] = token;
   pt = vp9_get_coef_context(scan, nb, pad, token_cache, idx + 1, l);
-  token_cache[idx] = bak;
+  token_cache[scan[idx]] = bak;
   return pt;
 }
 
@@ -312,7 +260,8 @@
   vp9_token_state tokens[1025][2];
   unsigned best_index[1025][2];
   const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, ib);
-  const int16_t *coeff_ptr = mb->coeff + ib * 16;
+  const int16_t *coeff_ptr = BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff,
+                                          pb_idx.block, 16);
   int16_t *qcoeff_ptr;
   int16_t *dqcoeff_ptr;
   int eob = xd->plane[pb_idx.plane].eobs[pb_idx.block], final_eob, sz = 0;
@@ -326,24 +275,6 @@
   int const *scan, *nb;
   const int mul = 1 + (tx_size == TX_32X32);
   uint8_t token_cache[1024];
-#if CONFIG_CODE_NONZEROCOUNT
-  // TODO(debargha): the dynamic programming approach used in this function
-  // is not compatible with the true rate cost when nzcs are used. Note
-  // the total rate is the sum of the nzc rate and the indicvidual token
-  // rates. The latter part can be optimized in this function, but because
-  // the nzc rate is a function of all the other tokens without a Markov
-  // relationship this rate cannot be considered correctly.
-  // The current implementation uses a suboptimal approach to account for
-  // the nzc rates somewhat, but in reality the optimization approach needs
-  // to change substantially.
-  const int nzc_used = get_nzc_used(tx_size);
-  uint16_t nzc = xd->nzcs[ib];
-  uint16_t nzc0, nzc1;
-  uint16_t final_nzc = 0, final_nzc_exp;
-  int nzc_context = vp9_get_nzc_context(cm, xd, ib);
-  unsigned int *nzc_cost;
-  nzc0 = nzc1 = nzc;
-#endif
 
   assert((!type && !pb_idx.plane) || (type && pb_idx.plane));
   dqcoeff_ptr = BLOCK_OFFSET(xd->plane[pb_idx.plane].dqcoeff, pb_idx.block, 16);
@@ -353,16 +284,7 @@
     case TX_4X4: {
       const TX_TYPE tx_type = get_tx_type_4x4(xd, ib);
       default_eob = 16;
-#if CONFIG_CODE_NONZEROCOUNT
-      nzc_cost = mb->nzc_costs_4x4[nzc_context][ref][type];
-#endif
-      if (tx_type == DCT_ADST) {
-        scan = vp9_col_scan_4x4;
-      } else if (tx_type == ADST_DCT) {
-        scan = vp9_row_scan_4x4;
-      } else {
-        scan = vp9_default_zig_zag1d_4x4;
-      }
+      scan = get_scan_4x4(tx_type);
       break;
     }
     case TX_8X8: {
@@ -370,17 +292,8 @@
       const int sz = 3 + mb_width_log2(sb_type);
       const int x = ib & ((1 << sz) - 1), y = ib - x;
       const TX_TYPE tx_type = get_tx_type_8x8(xd, y + (x >> 1));
-      if (tx_type == DCT_ADST) {
-        scan = vp9_col_scan_8x8;
-      } else if (tx_type == ADST_DCT) {
-        scan = vp9_row_scan_8x8;
-      } else {
-        scan = vp9_default_zig_zag1d_8x8;
-      }
+      scan = get_scan_8x8(tx_type);
       default_eob = 64;
-#if CONFIG_CODE_NONZEROCOUNT
-      nzc_cost = mb->nzc_costs_8x8[nzc_context][ref][type];
-#endif
       break;
     }
     case TX_16X16: {
@@ -388,25 +301,13 @@
       const int sz = 4 + mb_width_log2(sb_type);
       const int x = ib & ((1 << sz) - 1), y = ib - x;
       const TX_TYPE tx_type = get_tx_type_16x16(xd, y + (x >> 2));
-      if (tx_type == DCT_ADST) {
-        scan = vp9_col_scan_16x16;
-      } else if (tx_type == ADST_DCT) {
-        scan = vp9_row_scan_16x16;
-      } else {
-        scan = vp9_default_zig_zag1d_16x16;
-      }
+      scan = get_scan_16x16(tx_type);
       default_eob = 256;
-#if CONFIG_CODE_NONZEROCOUNT
-      nzc_cost = mb->nzc_costs_16x16[nzc_context][ref][type];
-#endif
       break;
     }
     case TX_32X32:
       scan = vp9_default_zig_zag1d_32x32;
       default_eob = 1024;
-#if CONFIG_CODE_NONZEROCOUNT
-      nzc_cost = mb->nzc_costs_32x32[nzc_context][ref][type];
-#endif
       break;
   }
   assert(eob <= default_eob);
@@ -418,11 +319,7 @@
   rddiv = mb->rddiv;
   memset(best_index, 0, sizeof(best_index));
   /* Initialize the sentinel node of the trellis. */
-#if CONFIG_CODE_NONZEROCOUNT
-  tokens[eob][0].rate = nzc_used ? nzc_cost[nzc] : 0;
-#else
   tokens[eob][0].rate = 0;
-#endif
   tokens[eob][0].error = 0;
   tokens[eob][0].next = default_eob;
   tokens[eob][0].token = DCT_EOB_TOKEN;
@@ -430,14 +327,11 @@
   *(tokens[eob] + 1) = *(tokens[eob] + 0);
   next = eob;
   for (i = 0; i < eob; i++)
-    token_cache[i] = vp9_dct_value_tokens_ptr[qcoeff_ptr[scan[i]]].token;
+    token_cache[scan[i]] = vp9_dct_value_tokens_ptr[qcoeff_ptr[scan[i]]].token;
   nb = vp9_get_coef_neighbors_handle(scan, &pad);
 
   for (i = eob; i-- > i0;) {
     int base_bits, d2, dx;
-#if CONFIG_CODE_NONZEROCOUNT
-    int new_nzc0, new_nzc1;
-#endif
 
     rc = scan[i];
     x = qcoeff_ptr[rc];
@@ -472,9 +366,6 @@
       tokens[i][0].token = t0;
       tokens[i][0].qc = x;
       best_index[i][0] = best;
-#if CONFIG_CODE_NONZEROCOUNT
-      new_nzc0 = (best ? nzc1 : nzc0);
-#endif
 
       /* Evaluate the second possibility for this state. */
       rate0 = tokens[next][0].rate;
@@ -501,14 +392,6 @@
              DCT_EOB_TOKEN : ZERO_TOKEN;
         t1 = tokens[next][1].token == DCT_EOB_TOKEN ?
              DCT_EOB_TOKEN : ZERO_TOKEN;
-#if CONFIG_CODE_NONZEROCOUNT
-        // Account for rate drop because of the nzc change.
-        // TODO(debargha): Find a better solution
-        if (nzc_used) {
-          rate0 -= nzc_cost[nzc0] - nzc_cost[nzc0 - 1];
-          rate1 -= nzc_cost[nzc1] - nzc_cost[nzc1 - 1];
-        }
-#endif
       } else {
         t0 = t1 = (vp9_dct_value_tokens_ptr + x)->token;
       }
@@ -543,11 +426,6 @@
       tokens[i][1].token = best ? t1 : t0;
       tokens[i][1].qc = x;
       best_index[i][1] = best;
-#if CONFIG_CODE_NONZEROCOUNT
-      new_nzc1 = (best ? nzc1 : nzc0) - (!x);
-      nzc0 = new_nzc0;
-      nzc1 = new_nzc1;
-#endif
       /* Finally, make this the new head of the trellis. */
       next = i;
     }
@@ -586,17 +464,13 @@
   rate1 += mb->token_costs[tx_size][type][ref][band][pt][t1];
   UPDATE_RD_COST();
   best = rd_cost1 < rd_cost0;
-#if CONFIG_CODE_NONZEROCOUNT
-  final_nzc_exp = (best ? nzc1 : nzc0);
-#endif
   final_eob = i0 - 1;
+  vpx_memset(qcoeff_ptr, 0, sizeof(*qcoeff_ptr) * (16 << (tx_size * 2)));
+  vpx_memset(dqcoeff_ptr, 0, sizeof(*dqcoeff_ptr) * (16 << (tx_size * 2)));
   for (i = next; i < eob; i = next) {
     x = tokens[i][best].qc;
     if (x) {
       final_eob = i;
-#if CONFIG_CODE_NONZEROCOUNT
-      ++final_nzc;
-#endif
     }
     rc = scan[i];
     qcoeff_ptr[rc] = x;
@@ -609,10 +483,6 @@
 
   xd->plane[pb_idx.plane].eobs[pb_idx.block] = final_eob;
   *a = *l = (final_eob > 0);
-#if CONFIG_CODE_NONZEROCOUNT
-  assert(final_nzc == final_nzc_exp);
-  xd->nzcs[ib] = final_nzc;
-#endif
 }
 
 void vp9_optimize_sby_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
@@ -904,8 +774,8 @@
                            int mb_row, int mb_col) {
   MACROBLOCKD *const xd = &x->e_mbd;
 
-  vp9_build_inter_predictors_mb(xd, mb_row, mb_col);
-  subtract_mb(x);
+  vp9_build_inter_predictors_sb(xd, mb_row, mb_col, BLOCK_SIZE_MB16X16);
+  vp9_subtract_sb(x, BLOCK_SIZE_MB16X16);
   vp9_fidct_mb(cm, x);
   vp9_recon_sb(xd, BLOCK_SIZE_MB16X16);
 }
@@ -914,12 +784,8 @@
 void vp9_encode_inter16x16y(MACROBLOCK *x, int mb_row, int mb_col) {
   MACROBLOCKD *xd = &x->e_mbd;
 
-  vp9_build_inter_predictors_sby(xd, xd->plane[0].dst.buf,
-                                 xd->plane[0].dst.stride,
-                                 mb_row, mb_col, BLOCK_SIZE_MB16X16);
-  vp9_subtract_sby_s_c(x->src_diff, x->src.y_buffer, x->src.y_stride,
-                       xd->plane[0].dst.buf, xd->plane[0].dst.stride,
-                       BLOCK_SIZE_MB16X16);
+  vp9_build_inter_predictors_sby(xd, mb_row, mb_col, BLOCK_SIZE_MB16X16);
+  vp9_subtract_sby(x, BLOCK_SIZE_MB16X16);
 
   vp9_transform_sby_4x4(x, BLOCK_SIZE_MB16X16);
   vp9_quantize_sby_4x4(x, BLOCK_SIZE_MB16X16);
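Note on the encodemb.c changes above: the assorted vp9_subtract_b/_4b/_sby_s/_sbuv_s helpers collapse into one generic vp9_subtract_block() plus a subtract_plane() wrapper that derives each plane's width and height from the block size and its subsampling, and the trellis optimizer now indexes token_cache by scan position (token_cache[scan[i]]) rather than by coefficient rank, presumably so that neighbour lookups address the cache by coefficient position. A short usage sketch of the new entry points, assuming b_width_log2() and b_height_log2() return 4 for BLOCK_SIZE_SB64X64:

// Usage sketch: residuals for a whole 64x64 superblock, plane by plane.
// With 4:2:0 subsampling (subsampling_x == subsampling_y == 1) the luma
// diff is 64x64 and each chroma diff 32x32.
vp9_subtract_sb(x, BLOCK_SIZE_SB64X64);
// which is equivalent to:
//   subtract_plane(x, BLOCK_SIZE_SB64X64, 0);  // Y: 64x64 -> x->plane[0].src_diff
//   subtract_plane(x, BLOCK_SIZE_SB64X64, 1);  // U: 32x32 -> x->plane[1].src_diff
//   subtract_plane(x, BLOCK_SIZE_SB64X64, 2);  // V: 32x32 -> x->plane[2].src_diff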
diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h
index 3c0d760..da134a8 100644
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h
@@ -56,15 +56,12 @@
 
 void vp9_fidct_mb(VP9_COMMON *const cm, MACROBLOCK *x);
 
-void vp9_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch);
-
-void vp9_subtract_sby_s_c(int16_t *diff, const uint8_t *src, int src_stride,
-                          const uint8_t *pred, int dst_stride,
-                          BLOCK_SIZE_TYPE bsize);
-void vp9_subtract_sbuv_s_c(int16_t *diff, const uint8_t *usrc,
-                           const uint8_t *vsrc, int src_stride,
-                           const uint8_t *upred,
-                           const uint8_t *vpred, int dst_stride,
-                           BLOCK_SIZE_TYPE bsize);
+void vp9_subtract_block(int rows, int cols,
+                        int16_t *diff_ptr, int diff_stride,
+                        const uint8_t *src_ptr, int src_stride,
+                        const uint8_t *pred_ptr, int pred_stride);
+void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
+void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
+void vp9_subtract_sb(MACROBLOCK *xd, BLOCK_SIZE_TYPE bsize);
 
 #endif  // VP9_ENCODER_VP9_ENCODEMB_H_
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 91d7f05..6e61250 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -358,7 +358,7 @@
   int ref_stride = d->pre_stride;
 
   // Set up pointers for this macro block recon buffer
-  xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
+  xd->plane[0].pre[0].buf = recon_buffer->y_buffer + recon_yoffset;
 
   ref_ptr = (uint8_t *)(*(d->base_pre) + d->pre);
 
@@ -402,7 +402,7 @@
   v_fn_ptr.vf = vp9_mse16x16;
 
   // Set up pointers for this macro block recon buffer
-  xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
+  xd->plane[0].pre[0].buf = recon_buffer->y_buffer + recon_yoffset;
 
   // Initial step/diamond search centred on best mv
   tmp_mv.as_int = 0;
@@ -485,7 +485,7 @@
   vp9_clear_system_state();  // __asm emms;
 
   x->src = * cpi->Source;
-  xd->pre = *lst_yv12;
+  setup_pre_planes(xd, lst_yv12, NULL, 0, 0, NULL, NULL);
   setup_dst_planes(xd, new_yv12, 0, 0);
 
   x->partition_info = x->pi;
@@ -496,8 +496,6 @@
 
   vp9_setup_block_dptrs(&x->e_mbd);
 
-  vp9_setup_block_ptrs(x);
-
   // set up frame new frame for intra coded blocks
   vp9_setup_intra_recon(new_yv12);
   vp9_frame_init_quantizer(cpi);
@@ -526,15 +524,15 @@
     x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16)
                     + (VP9BORDERINPIXELS - 16);
 
-    set_mb_row(cm, xd, mb_row, 1 << mb_height_log2(BLOCK_SIZE_MB16X16));
-
     // for each macroblock col in image
     for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
       int this_error;
       int gf_motion_error = INT_MAX;
       int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
 
-      set_mb_col(cm, xd, mb_col, 1 << mb_height_log2(BLOCK_SIZE_MB16X16));
+      set_mb_row_col(cm, xd,
+                     mb_row, 1 << mb_height_log2(BLOCK_SIZE_MB16X16),
+                     mb_col, 1 << mb_height_log2(BLOCK_SIZE_MB16X16));
       xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset;
       xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset;
       xd->plane[2].dst.buf = new_yv12->v_buffer + recon_uvoffset;
@@ -601,9 +599,9 @@
           }
 
           // Reset to last frame as reference buffer
-          xd->pre.y_buffer = lst_yv12->y_buffer + recon_yoffset;
-          xd->pre.u_buffer = lst_yv12->u_buffer + recon_uvoffset;
-          xd->pre.v_buffer = lst_yv12->v_buffer + recon_uvoffset;
+          xd->plane[0].pre[0].buf = lst_yv12->y_buffer + recon_yoffset;
+          xd->plane[1].pre[0].buf = lst_yv12->u_buffer + recon_uvoffset;
+          xd->plane[2].pre[0].buf = lst_yv12->v_buffer + recon_uvoffset;
 
           // In accumulating a score for the older reference frame
           // take the best of the motion predicted score and
diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c
index 9784d2d..924d9fd 100644
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c
@@ -71,9 +71,7 @@
   }
 
   vp9_set_mbmode_and_mvs(x, NEWMV, dst_mv);
-  vp9_build_inter_predictors_sby(xd, xd->plane[0].dst.buf,
-                                 xd->plane[0].dst.stride,
-                                 mb_row, mb_col, BLOCK_SIZE_MB16X16);
+  vp9_build_inter_predictors_sby(xd, mb_row, mb_col, BLOCK_SIZE_MB16X16);
   best_err = vp9_sad16x16(x->src.y_buffer, x->src.y_stride,
                           xd->plane[0].dst.buf, xd->plane[0].dst.stride,
                           INT_MAX);
@@ -112,15 +110,16 @@
     b->src_stride = x->src.y_stride;
     b->src        = x->src.y_stride * (n & 12) + (n & 3) * 4;
 
-    d->base_pre   = &xd->pre.y_buffer;
-    d->pre_stride = xd->pre.y_stride;
-    d->pre        = xd->pre.y_stride * (n & 12) + (n & 3) * 4;
+    d->base_pre   = &xd->plane[0].pre[0].buf;
+    d->pre_stride = xd->plane[0].pre[0].stride;
+    d->pre        = xd->plane[0].pre[0].stride * (n & 12) + (n & 3) * 4;
   }
 
   // Try zero MV first
   // FIXME should really use something like near/nearest MV and/or MV prediction
   err = vp9_sad16x16(x->src.y_buffer, x->src.y_stride,
-                     xd->pre.y_buffer, xd->pre.y_stride, INT_MAX);
+                     xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride,
+                     INT_MAX);
   dst_mv->as_int = 0;
 
   // Test last reference frame using the previous best mv as the
@@ -164,7 +163,8 @@
   // Try zero MV first
   // FIXME should really use something like near/nearest MV and/or MV prediction
   err = vp9_sad16x16(x->src.y_buffer, x->src.y_stride,
-                     xd->pre.y_buffer, xd->pre.y_stride, INT_MAX);
+                     xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride,
+                     INT_MAX);
 
   dst_mv->as_int = 0;
 
@@ -249,8 +249,8 @@
   // Golden frame MV search, if it exists and is different than last frame
   if (golden_ref) {
     int g_motion_error;
-    xd->pre.y_buffer = golden_ref->y_buffer + mb_y_offset;
-    xd->pre.y_stride = golden_ref->y_stride;
+    xd->plane[0].pre[0].buf = golden_ref->y_buffer + mb_y_offset;
+    xd->plane[0].pre[0].stride = golden_ref->y_stride;
     g_motion_error = do_16x16_motion_search(cpi, prev_golden_ref_mv,
                                             &stats->ref[GOLDEN_FRAME].m.mv,
                                             buf, mb_y_offset,
@@ -265,8 +265,8 @@
   // Alt-ref frame MV search, if it exists and is different than last/golden frame
   if (alt_ref) {
     int a_motion_error;
-    xd->pre.y_buffer = alt_ref->y_buffer + mb_y_offset;
-    xd->pre.y_stride = alt_ref->y_stride;
+    xd->plane[0].pre[0].buf = alt_ref->y_buffer + mb_y_offset;
+    xd->plane[0].pre[0].stride = alt_ref->y_stride;
     a_motion_error = do_16x16_zerozero_search(cpi,
                                               &stats->ref[ALTREF_FRAME].m.mv,
                                               buf, mb_y_offset,
@@ -306,7 +306,7 @@
                       - 16 - VP9_INTERP_EXTEND;
   xd->up_available  = 0;
   xd->plane[0].dst.stride  = buf->y_stride;
-  xd->pre.y_stride  = buf->y_stride;
+  xd->plane[0].pre[0].stride  = buf->y_stride;
   xd->plane[1].dst.stride = buf->uv_stride;
   xd->mode_info_context = &mi_local;
 
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 66ed1da..c2c587e 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -114,10 +114,10 @@
 extern void print_nmvstats();
 #endif
 
-#if CONFIG_CODE_NONZEROCOUNT
-#ifdef NZC_STATS
-extern void init_nzcstats();
-extern void print_nzcstats();
+#if CONFIG_CODE_ZEROGROUP
+#ifdef ZPC_STATS
+extern void init_zpcstats();
+extern void print_zpcstats();
 #endif
 #endif
 
@@ -285,6 +285,9 @@
 
   xd->update_mb_segmentation_map = 0;
   xd->update_mb_segmentation_data = 0;
+#if CONFIG_IMPLICIT_SEGMENTATION
+  xd->allow_implicit_segment_update = 0;
+#endif
   vpx_memset(xd->mb_segment_tree_probs, 255, sizeof(xd->mb_segment_tree_probs));
 
   vp9_clearall_segfeatures(xd);
@@ -389,6 +392,9 @@
     vpx_memset(cpi->segmentation_map, 0, (cm->mb_rows * cm->mb_cols));
     xd->update_mb_segmentation_map = 0;
     xd->update_mb_segmentation_data = 0;
+#if CONFIG_IMPLICIT_SEGMENTATION
+  xd->allow_implicit_segment_update = 0;
+#endif
     cpi->static_mb_pct = 0;
 
     // Disable segmentation
@@ -402,6 +408,9 @@
     vpx_memset(cpi->segmentation_map, 0, (cm->mb_rows * cm->mb_cols));
     xd->update_mb_segmentation_map = 0;
     xd->update_mb_segmentation_data = 0;
+#if CONFIG_IMPLICIT_SEGMENTATION
+  xd->allow_implicit_segment_update = 0;
+#endif
     cpi->static_mb_pct = 0;
 
     // Disable segmentation and individual segment features by default
@@ -502,6 +511,45 @@
   }
 }
 
+#if CONFIG_IMPLICIT_SEGMENTATION
+static void configure_implicit_segmentation(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &cpi->mb.e_mbd;
+  int i;
+  int qi_delta;
+  double q_target = cpi->active_worst_quality * 1.10;
+
+  // Set the flags to allow implicit segment update but disallow explicit update
+  xd->segmentation_enabled = 1;
+  xd->allow_implicit_segment_update = 1;
+  xd->update_mb_segmentation_map = 0;
+
+  // For key frames clear down the segment map to a default state.
+  if (cm->frame_type == KEY_FRAME) {
+    // Clear down the global segmentation map
+    vpx_memset(cpi->segmentation_map, 0, (cm->mb_rows * cm->mb_cols));
+
+    // Clear down the segment features.
+    vp9_clearall_segfeatures(xd);
+
+    xd->update_mb_segmentation_data = 1;
+
+    // Enable use of q deltas on segments
+    for (i = 0; i < MAX_MB_SEGMENTS; ++i) {
+      qi_delta = compute_qdelta(cpi, cpi->active_worst_quality, q_target);
+      vp9_set_segdata(xd, i, SEG_LVL_ALT_Q, qi_delta);
+      q_target *= 0.95;
+      vp9_enable_segfeature(xd, i, SEG_LVL_ALT_Q);
+    }
+
+    // Where relevant assume segment data is delta data
+    xd->mb_segment_abs_delta = SEGMENT_DELTADATA;
+  } else {
+    xd->update_mb_segmentation_data = 0;
+  }
+}
+#endif
+
 // DEBUG: Print out the segment id of each MB in the current frame.
 static void print_seg_map(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
@@ -739,8 +787,12 @@
   // Switch segmentation off.
   sf->static_segmentation = 0;
 #else
+#if CONFIG_IMPLICIT_SEGMENTATION
+  sf->static_segmentation = 0;
+#else
   sf->static_segmentation = 1;
 #endif
+#endif
   sf->splitmode_breakout = 0;
   sf->mb16_breakout = 0;
 
@@ -754,7 +806,11 @@
       // Switch segmentation off.
       sf->static_segmentation = 0;
 #else
-      sf->static_segmentation = 1;
+#if CONFIG_IMPLICIT_SEGMENTATION
+  sf->static_segmentation = 0;
+#else
+  sf->static_segmentation = 1;
+#endif
 #endif
       sf->splitmode_breakout = 1;
       sf->mb16_breakout = 0;
@@ -895,7 +951,7 @@
   vpx_free(cpi->tok);
 
   {
-    unsigned int tokens = cm->mb_rows * cm->mb_cols * (24 * 16 + 1);
+    unsigned int tokens = get_token_alloc(cm->mb_rows, cm->mb_cols);
 
     CHECK_MEM_ERROR(cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok)));
   }
@@ -1433,9 +1489,9 @@
 #ifdef NMV_STATS
   init_nmvstats();
 #endif
-#if CONFIG_CODE_NONZEROCOUNT
-#ifdef NZC_STATS
-  init_nzcstats();
+#if CONFIG_CODE_ZEROGROUP
+#ifdef ZPC_STATS
+  init_zpcstats();
 #endif
 #endif
 
@@ -1643,12 +1699,11 @@
   cpi->common.error.setjmp = 0;
 
   vp9_zero(cpi->y_uv_mode_count)
-#if CONFIG_CODE_NONZEROCOUNT
-  vp9_zero(cm->fc.nzc_counts_4x4);
-  vp9_zero(cm->fc.nzc_counts_8x8);
-  vp9_zero(cm->fc.nzc_counts_16x16);
-  vp9_zero(cm->fc.nzc_counts_32x32);
-  vp9_zero(cm->fc.nzc_pcat_counts);
+#if CONFIG_CODE_ZEROGROUP
+  vp9_zero(cm->fc.zpc_counts_4x4);
+  vp9_zero(cm->fc.zpc_counts_8x8);
+  vp9_zero(cm->fc.zpc_counts_16x16);
+  vp9_zero(cm->fc.zpc_counts_32x32);
 #endif
 
   return (VP9_PTR) cpi;
@@ -1677,10 +1732,10 @@
     if (cpi->pass != 1)
       print_nmvstats();
 #endif
-#if CONFIG_CODE_NONZEROCOUNT
-#ifdef NZC_STATS
+#if CONFIG_CODE_ZEROGROUP
+#ifdef ZPC_STATS
     if (cpi->pass != 1)
-      print_nzcstats();
+      print_zpcstats();
 #endif
 #endif
 
@@ -2717,7 +2772,8 @@
     }
   }
 
-  // Configure use of segmentation for enhanced coding of static regions.
+  // Configure experimental use of segmentation for enhanced coding of
+  // static regions if indicated.
   // Only allowed for now in second pass of two pass (as requires lagged coding)
   // and if the relevant speed feature flag is set.
   if ((cpi->pass == 2) && (cpi->sf.static_segmentation)) {
@@ -3025,6 +3081,12 @@
         cpi->common.frame_context_idx = cpi->refresh_alt_ref_frame;
         vp9_setup_inter_frame(cpi);
       }
+
+#if CONFIG_IMPLICIT_SEGMENTATION
+      if (!cm->error_resilient_mode && !cpi->sf.static_segmentation) {
+        configure_implicit_segmentation(cpi);
+      }
+#endif
     }
 
     // transform / motion compensation build reconstruction frame
@@ -3289,8 +3351,17 @@
   cpi->dummy_packing = 0;
   vp9_pack_bitstream(cpi, dest, size);
 
-  if (cpi->mb.e_mbd.update_mb_segmentation_map)
+#if CONFIG_IMPLICIT_SEGMENTATION
+  // Should we allow an implicit update of the segment map?
+  if (xd->allow_implicit_segment_update && !cm->error_resilient_mode) {
+    vp9_implicit_segment_map_update(cm);
+  // or has there been an explicit update?
+  } else if (xd->update_mb_segmentation_map) {
+#else
+  if (xd->update_mb_segmentation_map) {
+#endif
     update_reference_segmentation_map(cpi);
+  }
 
   release_scaled_references(cpi);
   update_reference_frames(cpi);
@@ -3301,8 +3372,8 @@
   if (!cpi->common.error_resilient_mode &&
       !cpi->common.frame_parallel_decoding_mode) {
     vp9_adapt_coef_probs(&cpi->common);
-#if CONFIG_CODE_NONZEROCOUNT
-    vp9_adapt_nzc_probs(&cpi->common);
+#if CONFIG_CODE_ZEROGROUP
+    vp9_adapt_zpc_probs(&cpi->common);
 #endif
   }
   if (cpi->common.frame_type != KEY_FRAME) {
@@ -4133,8 +4204,9 @@
 }
 
 int vp9_set_roimap(VP9_PTR comp, unsigned char *map, unsigned int rows,
-                   unsigned int cols, int delta_q[4], int delta_lf[4],
-                   unsigned int threshold[4]) {
+                   unsigned int cols, int delta_q[MAX_MB_SEGMENTS],
+                   int delta_lf[MAX_MB_SEGMENTS],
+                   unsigned int threshold[MAX_MB_SEGMENTS]) {
   VP9_COMP *cpi = (VP9_COMP *) comp;
   signed char feature_data[SEG_LVL_MAX][MAX_MB_SEGMENTS];
   MACROBLOCKD *xd = &cpi->mb.e_mbd;
@@ -4154,25 +4226,15 @@
   // Activate segmentation.
   vp9_enable_segmentation((VP9_PTR)cpi);
 
-  // Set up the quant segment data
-  feature_data[SEG_LVL_ALT_Q][0] = delta_q[0];
-  feature_data[SEG_LVL_ALT_Q][1] = delta_q[1];
-  feature_data[SEG_LVL_ALT_Q][2] = delta_q[2];
-  feature_data[SEG_LVL_ALT_Q][3] = delta_q[3];
-
-  // Set up the loop segment data s
-  feature_data[SEG_LVL_ALT_LF][0] = delta_lf[0];
-  feature_data[SEG_LVL_ALT_LF][1] = delta_lf[1];
-  feature_data[SEG_LVL_ALT_LF][2] = delta_lf[2];
-  feature_data[SEG_LVL_ALT_LF][3] = delta_lf[3];
-
-  cpi->segment_encode_breakout[0] = threshold[0];
-  cpi->segment_encode_breakout[1] = threshold[1];
-  cpi->segment_encode_breakout[2] = threshold[2];
-  cpi->segment_encode_breakout[3] = threshold[3];
+  // Set up the quant, LF and breakout threshold segment data
+  for (i = 0; i < MAX_MB_SEGMENTS; i++) {
+    feature_data[SEG_LVL_ALT_Q][i] = delta_q[i];
+    feature_data[SEG_LVL_ALT_LF][i] = delta_lf[i];
+    cpi->segment_encode_breakout[i] = threshold[i];
+  }
 
   // Enable the loop and quant changes in the feature mask
-  for (i = 0; i < 4; i++) {
+  for (i = 0; i < MAX_MB_SEGMENTS; i++) {
     if (delta_q[i])
       vp9_enable_segfeature(xd, i, SEG_LVL_ALT_Q);
     else
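Note on the implicit-segmentation hook added above: with MAX_MB_SEGMENTS raised to 8, configure_implicit_segmentation() builds a descending q-delta ladder on key frames. Segment 0 targets 1.10 x active_worst_quality and each subsequent segment targets 0.95 x the previous one, so segment 7 ends up at roughly 1.10 x 0.95^7, or about 0.77 x active_worst_quality; each target is turned into a per-segment qindex delta via compute_qdelta() and signalled through SEG_LVL_ALT_Q.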
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 6d309c8..4fff233 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -103,7 +103,7 @@
   vp9_prob i8x8_mode_prob[VP9_I8X8_MODES - 1];
   vp9_prob sub_mv_ref_prob[SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
   vp9_prob mbsplit_prob[VP9_NUMMBSPLITS - 1];
-  vp9_prob partition_prob[PARTITION_PLANES][PARTITION_TYPES - 1];
+  vp9_prob partition_prob[NUM_PARTITION_CONTEXTS][PARTITION_TYPES - 1];
 
   vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
                                  [VP9_SWITCHABLE_FILTERS - 1];
@@ -114,17 +114,11 @@
   int mv_ref_ct[INTER_MODE_CONTEXTS][4][2];
   int vp9_mode_contexts[INTER_MODE_CONTEXTS][4];
 
-#if CONFIG_CODE_NONZEROCOUNT
-  vp9_prob nzc_probs_4x4
-           [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC4X4_NODES];
-  vp9_prob nzc_probs_8x8
-           [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC8X8_NODES];
-  vp9_prob nzc_probs_16x16
-           [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC16X16_NODES];
-  vp9_prob nzc_probs_32x32
-           [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC32X32_NODES];
-  vp9_prob nzc_pcat_probs[MAX_NZC_CONTEXTS]
-                         [NZC_TOKENS_EXTRA][NZC_BITS_EXTRA];
+#if CONFIG_CODE_ZEROGROUP
+  vp9_zpc_probs zpc_probs_4x4;
+  vp9_zpc_probs zpc_probs_8x8;
+  vp9_zpc_probs zpc_probs_16x16;
+  vp9_zpc_probs zpc_probs_32x32;
 #endif
 } CODING_CONTEXT;
 
@@ -463,7 +457,7 @@
   int sub_mv_ref_count[SUBMVREF_COUNT][VP9_SUBMVREFS];
   int mbsplit_count[VP9_NUMMBSPLITS];
   int y_uv_mode_count[VP9_YMODES][VP9_UV_MODES];
-  unsigned int partition_count[PARTITION_PLANES][PARTITION_TYPES];
+  unsigned int partition_count[NUM_PARTITION_CONTEXTS][PARTITION_TYPES];
 #if CONFIG_COMP_INTERINTRA_PRED
   unsigned int interintra_count[2];
   unsigned int interintra_select_count[2];
@@ -487,25 +481,6 @@
   vp9_coeff_probs frame_coef_probs_32x32[BLOCK_TYPES];
   vp9_coeff_stats frame_branch_ct_32x32[BLOCK_TYPES];
 
-#if CONFIG_CODE_NONZEROCOUNT
-  vp9_prob frame_nzc_probs_4x4
-      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC4X4_NODES];
-  unsigned int frame_nzc_branch_ct_4x4
-      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC4X4_NODES][2];
-  vp9_prob frame_nzc_probs_8x8
-      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC8X8_NODES];
-  unsigned int frame_nzc_branch_ct_8x8
-      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC8X8_NODES][2];
-  vp9_prob frame_nzc_probs_16x16
-      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC16X16_NODES];
-  unsigned int frame_nzc_branch_ct_16x16
-      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC16X16_NODES][2];
-  vp9_prob frame_nzc_probs_32x32
-      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC32X32_NODES];
-  unsigned int frame_nzc_branch_ct_32x32
-      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC32X32_NODES][2];
-#endif
-
   int gfu_boost;
   int last_boost;
   int kf_boost;
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index 80d9849..78ea78c 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -28,45 +28,29 @@
 
 void vp9_ht_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type) {
   MACROBLOCKD *const xd = &mb->e_mbd;
-  BLOCK *const b = &mb->block[0];
   BLOCKD *const d = &xd->block[0];
   int i, rc, eob;
   int zbin;
   int x, y, z, sz;
-  int16_t *coeff_ptr       = mb->coeff + b_idx * 16;
+  int16_t *coeff_ptr       = BLOCK_OFFSET(mb->plane[0].coeff, b_idx, 16);
   // ht is luma-only
   int16_t *qcoeff_ptr      = BLOCK_OFFSET(xd->plane[0].qcoeff, b_idx, 16);
   int16_t *dqcoeff_ptr     = BLOCK_OFFSET(xd->plane[0].dqcoeff, b_idx, 16);
-  int16_t *zbin_boost_ptr  = b->zrun_zbin_boost;
-  int16_t *zbin_ptr        = b->zbin;
-  int16_t *round_ptr       = b->round;
-  int16_t *quant_ptr       = b->quant;
-  uint8_t *quant_shift_ptr = b->quant_shift;
+  int16_t *zbin_boost_ptr  = mb->plane[0].zrun_zbin_boost;
+  int16_t *zbin_ptr        = mb->plane[0].zbin;
+  int16_t *round_ptr       = mb->plane[0].round;
+  int16_t *quant_ptr       = mb->plane[0].quant;
+  uint8_t *quant_shift_ptr = mb->plane[0].quant_shift;
   int16_t *dequant_ptr     = d->dequant;
-  int zbin_oq_value        = b->zbin_extra;
-  const int *pt_scan;
-#if CONFIG_CODE_NONZEROCOUNT
-  int nzc = 0;
-#endif
-
-  switch (tx_type) {
-    case ADST_DCT:
-      pt_scan = vp9_row_scan_4x4;
-      break;
-    case DCT_ADST:
-      pt_scan = vp9_col_scan_4x4;
-      break;
-    default:
-      pt_scan = vp9_default_zig_zag1d_4x4;
-      break;
-  }
+  int zbin_oq_value        = mb->plane[0].zbin_extra;
+  const int *pt_scan = get_scan_4x4(tx_type);
 
   vpx_memset(qcoeff_ptr, 0, 32);
   vpx_memset(dqcoeff_ptr, 0, 32);
 
   eob = -1;
 
-  if (!b->skip_block) {
+  if (!mb->skip_block) {
     for (i = 0; i < 16; i++) {
       rc   = pt_scan[i];
       z    = coeff_ptr[rc];
@@ -87,45 +71,36 @@
 
         if (y) {
           eob = i;                                // last nonzero coeffs
-#if CONFIG_CODE_NONZEROCOUNT
-          ++nzc;                                  // number of nonzero coeffs
-#endif
-          zbin_boost_ptr = b->zrun_zbin_boost;    // reset zero runlength
+          zbin_boost_ptr = mb->plane[0].zrun_zbin_boost;  // reset zero run len
         }
       }
     }
   }
 
   xd->plane[0].eobs[b_idx] = eob + 1;
-#if CONFIG_CODE_NONZEROCOUNT
-  xd->nzcs[b_idx] = nzc;
-#endif
 }
 
 void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, int y_blocks) {
   MACROBLOCKD *const xd = &mb->e_mbd;
   const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, b_idx);
   const int c_idx = plane_idx(pb_idx.plane);
-  BLOCK *const b = &mb->block[c_idx];
   BLOCKD *const d = &xd->block[c_idx];
   int i, rc, eob;
   int zbin;
   int x, y, z, sz;
-  int16_t *coeff_ptr       = mb->coeff + b_idx * 16;
+  int16_t *coeff_ptr       = BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff,
+                                          pb_idx.block, 16);
   int16_t *qcoeff_ptr      = BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff,
                                           pb_idx.block, 16);
   int16_t *dqcoeff_ptr     = BLOCK_OFFSET(xd->plane[pb_idx.plane].dqcoeff,
                                           pb_idx.block, 16);
-  int16_t *zbin_boost_ptr  = b->zrun_zbin_boost;
-  int16_t *zbin_ptr        = b->zbin;
-  int16_t *round_ptr       = b->round;
-  int16_t *quant_ptr       = b->quant;
-  uint8_t *quant_shift_ptr = b->quant_shift;
+  int16_t *zbin_boost_ptr  = mb->plane[pb_idx.plane].zrun_zbin_boost;
+  int16_t *zbin_ptr        = mb->plane[pb_idx.plane].zbin;
+  int16_t *round_ptr       = mb->plane[pb_idx.plane].round;
+  int16_t *quant_ptr       = mb->plane[pb_idx.plane].quant;
+  uint8_t *quant_shift_ptr = mb->plane[pb_idx.plane].quant_shift;
   int16_t *dequant_ptr     = d->dequant;
-  int zbin_oq_value        = b->zbin_extra;
-#if CONFIG_CODE_NONZEROCOUNT
-  int nzc = 0;
-#endif
+  int zbin_oq_value        = mb->plane[pb_idx.plane].zbin_extra;
 
   if (c_idx == 0) assert(pb_idx.plane == 0);
   if (c_idx == 16) assert(pb_idx.plane == 1);
@@ -135,7 +110,7 @@
 
   eob = -1;
 
-  if (!b->skip_block) {
+  if (!mb->skip_block) {
     for (i = 0; i < 16; i++) {
       rc   = vp9_default_zig_zag1d_4x4[i];
       z    = coeff_ptr[rc];
@@ -157,19 +132,13 @@
 
         if (y) {
           eob = i;                                // last nonzero coeffs
-#if CONFIG_CODE_NONZEROCOUNT
-          ++nzc;                                  // number of nonzero coeffs
-#endif
-          zbin_boost_ptr = b->zrun_zbin_boost;    // reset zero runlength
+          zbin_boost_ptr = mb->plane[pb_idx.plane].zrun_zbin_boost;
         }
       }
     }
   }
 
   xd->plane[pb_idx.plane].eobs[pb_idx.block] = eob + 1;
-#if CONFIG_CODE_NONZEROCOUNT
-  xd->nzcs[b_idx] = nzc;
-#endif
 }
 
 void vp9_regular_quantize_b_8x8(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
@@ -181,21 +150,10 @@
                                      pb_idx.block, 16);
   int16_t *dqcoeff_ptr = BLOCK_OFFSET(xd->plane[pb_idx.plane].dqcoeff,
                                       pb_idx.block, 16);
-  BLOCK *const b = &mb->block[c_idx];
+  int16_t *coeff_ptr = BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff,
+                                    pb_idx.block, 16);
   BLOCKD *const d = &xd->block[c_idx];
-  const int *pt_scan;
-
-  switch (tx_type) {
-    case ADST_DCT:
-      pt_scan = vp9_row_scan_8x8;
-      break;
-    case DCT_ADST:
-      pt_scan = vp9_col_scan_8x8;
-      break;
-    default:
-      pt_scan = vp9_default_zig_zag1d_8x8;
-      break;
-  }
+  const int *pt_scan = get_scan_8x8(tx_type);
 
   if (c_idx == 0) assert(pb_idx.plane == 0);
   if (c_idx == 16) assert(pb_idx.plane == 1);
@@ -203,22 +161,18 @@
   vpx_memset(qcoeff_ptr, 0, 64 * sizeof(int16_t));
   vpx_memset(dqcoeff_ptr, 0, 64 * sizeof(int16_t));
 
-  if (!b->skip_block) {
+  if (!mb->skip_block) {
     int i, rc, eob;
     int zbin;
     int x, y, z, sz;
     int zero_run;
-    int16_t *zbin_boost_ptr = b->zrun_zbin_boost;
-    int16_t *coeff_ptr  = mb->coeff + 16 * b_idx;
-    int16_t *zbin_ptr   = b->zbin;
-    int16_t *round_ptr  = b->round;
-    int16_t *quant_ptr  = b->quant;
-    uint8_t *quant_shift_ptr = b->quant_shift;
+    int16_t *zbin_boost_ptr = mb->plane[pb_idx.plane].zrun_zbin_boost;
+    int16_t *zbin_ptr   = mb->plane[pb_idx.plane].zbin;
+    int16_t *round_ptr  = mb->plane[pb_idx.plane].round;
+    int16_t *quant_ptr  = mb->plane[pb_idx.plane].quant;
+    uint8_t *quant_shift_ptr = mb->plane[pb_idx.plane].quant_shift;
     int16_t *dequant_ptr = d->dequant;
-    int zbin_oq_value = b->zbin_extra;
-#if CONFIG_CODE_NONZEROCOUNT
-    int nzc = 0;
-#endif
+    int zbin_oq_value = mb->plane[pb_idx.plane].zbin_extra;
 
     eob = -1;
 
@@ -242,9 +196,6 @@
 
         if (y) {
           eob = 0;                                   // last nonzero coeffs
-#if CONFIG_CODE_NONZEROCOUNT
-          ++nzc;                                  // number of nonzero coeffs
-#endif
           zero_run = 0;
         }
       }
@@ -271,22 +222,13 @@
 
         if (y) {
           eob = i;                                   // last nonzero coeffs
-#if CONFIG_CODE_NONZEROCOUNT
-          ++nzc;                                     // number of nonzero coeffs
-#endif
           zero_run = 0;
         }
       }
     }
     xd->plane[pb_idx.plane].eobs[pb_idx.block] = eob + 1;
-#if CONFIG_CODE_NONZEROCOUNT
-    xd->nzcs[b_idx] = nzc;
-#endif
   } else {
     xd->plane[pb_idx.plane].eobs[pb_idx.block] = 0;
-#if CONFIG_CODE_NONZEROCOUNT
-    xd->nzcs[b_idx] = 0;
-#endif
   }
 }
 
@@ -297,18 +239,12 @@
                      int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
                      int16_t *dequant_ptr, int zbin_oq_value,
                      uint16_t *eob_ptr,
-#if CONFIG_CODE_NONZEROCOUNT
-                     uint16_t *nzc_ptr,
-#endif
                      const int *scan, int mul) {
   int i, rc, eob;
   int zbin;
   int x, y, z, sz;
   int zero_run = 0;
   int16_t *zbin_boost_ptr = zbin_boost_orig_ptr;
-#if CONFIG_CODE_NONZEROCOUNT
-  int nzc = 0;
-#endif
 
   vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
   vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
@@ -337,18 +273,12 @@
         if (y) {
           eob = i;                                  // last nonzero coeffs
           zero_run = 0;
-#if CONFIG_CODE_NONZEROCOUNT
-          ++nzc;                                    // number of nonzero coeffs
-#endif
         }
       }
     }
   }
 
   *eob_ptr = eob + 1;
-#if CONFIG_CODE_NONZEROCOUNT
-  *nzc_ptr = nzc;
-#endif
 }
 
 void vp9_regular_quantize_b_16x16(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
@@ -356,37 +286,24 @@
   MACROBLOCKD *const xd = &mb->e_mbd;
   const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, b_idx);
   const int c_idx = plane_idx(pb_idx.plane);
-  BLOCK *const b = &mb->block[c_idx];
   BLOCKD *const d = &xd->block[c_idx];
-  const int *pt_scan;
-
-  switch (tx_type) {
-    case ADST_DCT:
-      pt_scan = vp9_row_scan_16x16;
-      break;
-    case DCT_ADST:
-      pt_scan = vp9_col_scan_16x16;
-      break;
-    default:
-      pt_scan = vp9_default_zig_zag1d_16x16;
-      break;
-  }
+  const int *pt_scan = get_scan_16x16(tx_type);
 
   if (c_idx == 0) assert(pb_idx.plane == 0);
   if (c_idx == 16) assert(pb_idx.plane == 1);
   if (c_idx == 20) assert(pb_idx.plane == 2);
-  quantize(b->zrun_zbin_boost,
-           mb->coeff + 16 * b_idx,
-           256, b->skip_block,
-           b->zbin, b->round, b->quant, b->quant_shift,
+  quantize(mb->plane[pb_idx.plane].zrun_zbin_boost,
+           BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block, 16),
+           256, mb->skip_block,
+           mb->plane[pb_idx.plane].zbin,
+           mb->plane[pb_idx.plane].round,
+           mb->plane[pb_idx.plane].quant,
+           mb->plane[pb_idx.plane].quant_shift,
            BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff, pb_idx.block, 16),
            BLOCK_OFFSET(xd->plane[pb_idx.plane].dqcoeff, pb_idx.block, 16),
            d->dequant,
-           b->zbin_extra,
+           mb->plane[pb_idx.plane].zbin_extra,
            &xd->plane[pb_idx.plane].eobs[pb_idx.block],
-#if CONFIG_CODE_NONZEROCOUNT
-           &xd->nzcs[b_idx],
-#endif
            pt_scan, 1);
 }
 
@@ -394,25 +311,23 @@
   MACROBLOCKD *const xd = &mb->e_mbd;
   const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, b_idx);
   const int c_idx = plane_idx(pb_idx.plane);
-  BLOCK *const b = &mb->block[c_idx];
   BLOCKD *const d = &xd->block[c_idx];
 
   if (c_idx == 0) assert(pb_idx.plane == 0);
   if (c_idx == 16) assert(pb_idx.plane == 1);
   if (c_idx == 20) assert(pb_idx.plane == 2);
-  quantize(b->zrun_zbin_boost,
-           mb->coeff + b_idx * 16,
-           1024, b->skip_block,
-           b->zbin,
-           b->round, b->quant, b->quant_shift,
+  quantize(mb->plane[pb_idx.plane].zrun_zbin_boost,
+           BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block, 16),
+           1024, mb->skip_block,
+           mb->plane[pb_idx.plane].zbin,
+           mb->plane[pb_idx.plane].round,
+           mb->plane[pb_idx.plane].quant,
+           mb->plane[pb_idx.plane].quant_shift,
            BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff, pb_idx.block, 16),
            BLOCK_OFFSET(xd->plane[pb_idx.plane].dqcoeff, pb_idx.block, 16),
            d->dequant,
-           b->zbin_extra,
+           mb->plane[pb_idx.plane].zbin_extra,
            &xd->plane[pb_idx.plane].eobs[pb_idx.block],
-#if CONFIG_CODE_NONZEROCOUNT
-           &xd->nzcs[b_idx],
-#endif
            vp9_default_zig_zag1d_32x32, 2);
 }
 
@@ -607,55 +522,46 @@
   zbin_extra = (cpi->common.y_dequant[qindex][1] *
                  (cpi->zbin_mode_boost + x->act_zbin_adj)) >> 7;
 
-  for (i = 0; i < 16; i++) {
-    x->block[i].quant = cpi->Y1quant[qindex];
-    x->block[i].quant_shift = cpi->Y1quant_shift[qindex];
-    x->block[i].zbin = cpi->Y1zbin[qindex];
-    x->block[i].round = cpi->Y1round[qindex];
+  x->plane[0].quant = cpi->Y1quant[qindex];
+  x->plane[0].quant_shift = cpi->Y1quant_shift[qindex];
+  x->plane[0].zbin = cpi->Y1zbin[qindex];
+  x->plane[0].round = cpi->Y1round[qindex];
+  x->plane[0].zrun_zbin_boost = cpi->zrun_zbin_boost_y1[qindex];
+  x->plane[0].zbin_extra = (int16_t)zbin_extra;
+  for (i = 0; i < 16; i++)
     x->e_mbd.block[i].dequant = cpi->common.y_dequant[qindex];
-    x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_y1[qindex];
-    x->block[i].zbin_extra = (int16_t)zbin_extra;
-
-    // Segment skip feature.
-    x->block[i].skip_block =
-      vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP);
-  }
 
   // UV
   zbin_extra = (cpi->common.uv_dequant[qindex][1] *
                 (cpi->zbin_mode_boost + x->act_zbin_adj)) >> 7;
 
-  for (i = 16; i < 24; i++) {
-    x->block[i].quant = cpi->UVquant[qindex];
-    x->block[i].quant_shift = cpi->UVquant_shift[qindex];
-    x->block[i].zbin = cpi->UVzbin[qindex];
-    x->block[i].round = cpi->UVround[qindex];
-    x->e_mbd.block[i].dequant = cpi->common.uv_dequant[qindex];
-    x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_uv[qindex];
-    x->block[i].zbin_extra = (int16_t)zbin_extra;
-
-    // Segment skip feature.
-    x->block[i].skip_block =
-        vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP);
+  for (i = 1; i < 3; i++) {
+    x->plane[i].quant = cpi->UVquant[qindex];
+    x->plane[i].quant_shift = cpi->UVquant_shift[qindex];
+    x->plane[i].zbin = cpi->UVzbin[qindex];
+    x->plane[i].round = cpi->UVround[qindex];
+    x->plane[i].zrun_zbin_boost = cpi->zrun_zbin_boost_uv[qindex];
+    x->plane[i].zbin_extra = (int16_t)zbin_extra;
   }
+  for (i = 16; i < 24; i++)
+    x->e_mbd.block[i].dequant = cpi->common.uv_dequant[qindex];
+
+  x->skip_block = vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP);
 
   /* save this macroblock QIndex for vp9_update_zbin_extra() */
   x->e_mbd.q_index = qindex;
 }
 
 void vp9_update_zbin_extra(VP9_COMP *cpi, MACROBLOCK *x) {
-  int i;
   const int qindex = x->e_mbd.q_index;
   const int y_zbin_extra = (cpi->common.y_dequant[qindex][1] *
                 (cpi->zbin_mode_boost + x->act_zbin_adj)) >> 7;
   const int uv_zbin_extra = (cpi->common.uv_dequant[qindex][1] *
                   (cpi->zbin_mode_boost + x->act_zbin_adj)) >> 7;
 
-  for (i = 0; i < 16; i++)
-    x->block[i].zbin_extra = (int16_t)y_zbin_extra;
-
-  for (i = 16; i < 24; i++)
-    x->block[i].zbin_extra = (int16_t)uv_zbin_extra;
+  x->plane[0].zbin_extra = (int16_t)y_zbin_extra;
+  x->plane[1].zbin_extra = (int16_t)uv_zbin_extra;
+  x->plane[2].zbin_extra = (int16_t)uv_zbin_extra;
 }
 
 void vp9_frame_init_quantizer(VP9_COMP *cpi) {
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 96d857f..7a419fb 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -172,12 +172,11 @@
 #if CONFIG_COMP_INTERINTRA_PRED
   cc->interintra_prob = cm->fc.interintra_prob;
 #endif
-#if CONFIG_CODE_NONZEROCOUNT
-  vp9_copy(cc->nzc_probs_4x4, cm->fc.nzc_probs_4x4);
-  vp9_copy(cc->nzc_probs_8x8, cm->fc.nzc_probs_8x8);
-  vp9_copy(cc->nzc_probs_16x16, cm->fc.nzc_probs_16x16);
-  vp9_copy(cc->nzc_probs_32x32, cm->fc.nzc_probs_32x32);
-  vp9_copy(cc->nzc_pcat_probs, cm->fc.nzc_pcat_probs);
+#if CONFIG_CODE_ZEROGROUP
+  vp9_copy(cc->zpc_probs_4x4, cm->fc.zpc_probs_4x4);
+  vp9_copy(cc->zpc_probs_8x8, cm->fc.zpc_probs_8x8);
+  vp9_copy(cc->zpc_probs_16x16, cm->fc.zpc_probs_16x16);
+  vp9_copy(cc->zpc_probs_32x32, cm->fc.zpc_probs_32x32);
 #endif
 }
 
@@ -235,12 +234,11 @@
 #if CONFIG_COMP_INTERINTRA_PRED
   cm->fc.interintra_prob = cc->interintra_prob;
 #endif
-#if CONFIG_CODE_NONZEROCOUNT
-  vp9_copy(cm->fc.nzc_probs_4x4, cc->nzc_probs_4x4);
-  vp9_copy(cm->fc.nzc_probs_8x8, cc->nzc_probs_8x8);
-  vp9_copy(cm->fc.nzc_probs_16x16, cc->nzc_probs_16x16);
-  vp9_copy(cm->fc.nzc_probs_32x32, cc->nzc_probs_32x32);
-  vp9_copy(cm->fc.nzc_pcat_probs, cc->nzc_pcat_probs);
+#if CONFIG_CODE_ZEROGROUP
+  vp9_copy(cm->fc.zpc_probs_4x4, cc->zpc_probs_4x4);
+  vp9_copy(cm->fc.zpc_probs_8x8, cc->zpc_probs_8x8);
+  vp9_copy(cm->fc.zpc_probs_16x16, cc->zpc_probs_16x16);
+  vp9_copy(cm->fc.zpc_probs_32x32, cc->zpc_probs_32x32);
 #endif
 }
 
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 0f6d713..f846cf3 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -162,60 +162,6 @@
         }
 }
 
-#if CONFIG_CODE_NONZEROCOUNT
-static void fill_nzc_costs(VP9_COMP *cpi, TX_SIZE tx_size) {
-  int nzc_context, r, b, nzc, values;
-  int cost[16];
-  values = (16 << (2 * tx_size)) + 1;
-
-  for (nzc_context = 0; nzc_context < MAX_NZC_CONTEXTS; ++nzc_context) {
-    for (r = 0; r < REF_TYPES; ++r) {
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        unsigned int *nzc_costs;
-        if (tx_size == TX_4X4) {
-          vp9_cost_tokens(cost,
-                          cpi->common.fc.nzc_probs_4x4[nzc_context][r][b],
-                          vp9_nzc4x4_tree);
-          nzc_costs = cpi->mb.nzc_costs_4x4[nzc_context][r][b];
-        } else if (tx_size == TX_8X8) {
-          vp9_cost_tokens(cost,
-                          cpi->common.fc.nzc_probs_8x8[nzc_context][r][b],
-                          vp9_nzc8x8_tree);
-          nzc_costs = cpi->mb.nzc_costs_8x8[nzc_context][r][b];
-        } else if (tx_size == TX_16X16) {
-          vp9_cost_tokens(cost,
-                          cpi->common.fc.nzc_probs_16x16[nzc_context][r][b],
-                          vp9_nzc16x16_tree);
-          nzc_costs = cpi->mb.nzc_costs_16x16[nzc_context][r][b];
-        } else {
-          vp9_cost_tokens(cost,
-                          cpi->common.fc.nzc_probs_32x32[nzc_context][r][b],
-                          vp9_nzc32x32_tree);
-          nzc_costs = cpi->mb.nzc_costs_32x32[nzc_context][r][b];
-        }
-
-        for (nzc = 0; nzc < values; ++nzc) {
-          int e, c, totalcost = 0;
-          c = codenzc(nzc);
-          totalcost = cost[c];
-          if ((e = vp9_extranzcbits[c])) {
-            int x = nzc - vp9_basenzcvalue[c];
-            while (e--) {
-              totalcost += vp9_cost_bit(
-                  cpi->common.fc.nzc_pcat_probs[nzc_context]
-                                               [c - NZC_TOKENS_NOEXTRA][e],
-                  ((x >> e) & 1));
-            }
-          }
-          nzc_costs[nzc] = totalcost;
-        }
-      }
-    }
-  }
-}
-#endif
-
-
 static int rd_iifactor[32] =  { 4, 4, 3, 2, 1, 0, 0, 0,
                                 0, 0, 0, 0, 0, 0, 0, 0,
                                 0, 0, 0, 0, 0, 0, 0, 0,
@@ -313,14 +259,8 @@
                    cpi->common.fc.coef_probs_16x16, TX_16X16);
   fill_token_costs(cpi->mb.token_costs[TX_32X32],
                    cpi->common.fc.coef_probs_32x32, TX_32X32);
-#if CONFIG_CODE_NONZEROCOUNT
-  fill_nzc_costs(cpi, TX_4X4);
-  fill_nzc_costs(cpi, TX_8X8);
-  fill_nzc_costs(cpi, TX_16X16);
-  fill_nzc_costs(cpi, TX_32X32);
-#endif
 
-  for (i = 0; i < 2; i++)
+  for (i = 0; i < NUM_PARTITION_CONTEXTS; i++)
     vp9_cost_tokens(cpi->mb.partition_cost[i],
                     cpi->common.fc.partition_prob[i],
                     vp9_partition_tree);
@@ -374,11 +314,16 @@
       sizeof(ENTROPY_CONTEXT_PLANES)/sizeof(ENTROPY_CONTEXT);
   ENTROPY_CONTEXT *const l1 = l +
       sizeof(ENTROPY_CONTEXT_PLANES)/sizeof(ENTROPY_CONTEXT);
+  TX_TYPE tx_type = DCT_DCT;
 
-#if CONFIG_CODE_NONZEROCOUNT
-  const int nzc_used = get_nzc_used(tx_size);
-  int nzc_context = vp9_get_nzc_context(cm, xd, ib);
-  unsigned int *nzc_cost;
+#if CONFIG_CODE_ZEROGROUP
+  int last_nz_pos[3] = {-1, -1, -1};  // Encoder only
+  int is_eoo_list[3] = {0, 0, 0};
+  int is_eoo_negative[3] = {0, 0, 0};
+  int is_last_zero[3] = {0, 0, 0};
+  int o, rc, skip_coef_val;
+  vp9_zpc_probs *zpc_probs;
+  uint8_t token_cache_full[1024];
 #endif
   const int segment_id = xd->mode_info_context->mbmi.segment_id;
   vp9_prob (*coef_probs)[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
@@ -386,6 +331,10 @@
   int seg_eob, default_eob;
   uint8_t token_cache[1024];
 
+#if CONFIG_CODE_ZEROGROUP
+  vpx_memset(token_cache, UNKNOWN_TOKEN, sizeof(token_cache));
+#endif
+
   // Check for consistency of tx_size with mode info
   assert((!type && !pb_idx.plane) || (type && pb_idx.plane));
   if (type == PLANE_TYPE_Y_WITH_DC) {
@@ -397,62 +346,41 @@
 
   switch (tx_size) {
     case TX_4X4: {
-      const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
-                              get_tx_type_4x4(xd, ib) : DCT_DCT;
+      tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
+          get_tx_type_4x4(xd, ib) : DCT_DCT;
       a_ec = *a;
       l_ec = *l;
-#if CONFIG_CODE_NONZEROCOUNT
-      nzc_cost = mb->nzc_costs_4x4[nzc_context][ref][type];
-#endif
       coef_probs = cm->fc.coef_probs_4x4;
       seg_eob = 16;
-      if (tx_type == ADST_DCT) {
-        scan = vp9_row_scan_4x4;
-      } else if (tx_type == DCT_ADST) {
-        scan = vp9_col_scan_4x4;
-      } else {
-        scan = vp9_default_zig_zag1d_4x4;
-      }
+      scan = get_scan_4x4(tx_type);
+#if CONFIG_CODE_ZEROGROUP
+      zpc_probs = &cm->fc.zpc_probs_4x4;
+#endif
       break;
     }
     case TX_8X8: {
       const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
       const int sz = 3 + mb_width_log2(sb_type);
       const int x = ib & ((1 << sz) - 1), y = ib - x;
-      const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
-                              get_tx_type_8x8(xd, y + (x >> 1)) : DCT_DCT;
+      TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
+          get_tx_type_8x8(xd, y + (x >> 1)) : DCT_DCT;
       a_ec = (a[0] + a[1]) != 0;
       l_ec = (l[0] + l[1]) != 0;
-      if (tx_type == ADST_DCT) {
-        scan = vp9_row_scan_8x8;
-      } else if (tx_type == DCT_ADST) {
-        scan = vp9_col_scan_8x8;
-      } else {
-        scan = vp9_default_zig_zag1d_8x8;
-      }
-#if CONFIG_CODE_NONZEROCOUNT
-      nzc_cost = mb->nzc_costs_8x8[nzc_context][ref][type];
-#endif
+      scan = get_scan_8x8(tx_type);
       coef_probs = cm->fc.coef_probs_8x8;
       seg_eob = 64;
+#if CONFIG_CODE_ZEROGROUP
+      zpc_probs = &cm->fc.zpc_probs_8x8;
+#endif
       break;
     }
     case TX_16X16: {
       const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
       const int sz = 4 + mb_width_log2(sb_type);
       const int x = ib & ((1 << sz) - 1), y = ib - x;
-      const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
-                              get_tx_type_16x16(xd, y + (x >> 2)) : DCT_DCT;
-      if (tx_type == ADST_DCT) {
-        scan = vp9_row_scan_16x16;
-      } else if (tx_type == DCT_ADST) {
-        scan = vp9_col_scan_16x16;
-      } else {
-        scan = vp9_default_zig_zag1d_16x16;
-      }
-#if CONFIG_CODE_NONZEROCOUNT
-      nzc_cost = mb->nzc_costs_16x16[nzc_context][ref][type];
-#endif
+      TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
+          get_tx_type_16x16(xd, y + (x >> 2)) : DCT_DCT;
+      scan = get_scan_16x16(tx_type);
       coef_probs = cm->fc.coef_probs_16x16;
       seg_eob = 256;
       if (type == PLANE_TYPE_UV) {
@@ -462,13 +390,13 @@
         a_ec = (a[0] + a[1] + a[2] + a[3]) != 0;
         l_ec = (l[0] + l[1] + l[2] + l[3]) != 0;
       }
+#if CONFIG_CODE_ZEROGROUP
+      zpc_probs = &cm->fc.zpc_probs_16x16;
+#endif
       break;
     }
     case TX_32X32:
       scan = vp9_default_zig_zag1d_32x32;
-#if CONFIG_CODE_NONZEROCOUNT
-      nzc_cost = mb->nzc_costs_32x32[nzc_context][ref][type];
-#endif
       coef_probs = cm->fc.coef_probs_32x32;
       seg_eob = 1024;
       if (type == PLANE_TYPE_UV) {
@@ -487,6 +415,9 @@
         l_ec = (l[0] + l[1] + l[2] + l[3] +
                 l1[0] + l1[1] + l1[2] + l1[3]) != 0;
       }
+#if CONFIG_CODE_ZEROGROUP
+      zpc_probs = &cm->fc.zpc_probs_32x32;
+#endif
       break;
     default:
       abort();
@@ -498,50 +429,113 @@
   nb = vp9_get_coef_neighbors_handle(scan, &pad);
   default_eob = seg_eob;
 
-#if CONFIG_CODE_NONZEROCOUNT
-  if (!nzc_used)
-#endif
-    if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP))
-      seg_eob = 0;
+  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP))
+    seg_eob = 0;
 
   /* sanity check to ensure that we do not have spurious non-zero q values */
   if (eob < seg_eob)
     assert(qcoeff_ptr[scan[eob]] == 0);
 
-  {
-#if CONFIG_CODE_NONZEROCOUNT
-    int nzc = 0;
+#if CONFIG_CODE_ZEROGROUP
+  vpx_memset(token_cache_full, ZERO_TOKEN, sizeof(token_cache_full));
+  for (c = 0; c < eob; ++c) {
+    rc = scan[c];
+    token_cache_full[rc] = vp9_dct_value_tokens_ptr[qcoeff_ptr[rc]].token;
+    o = vp9_get_orientation(rc, tx_size);
+    if (qcoeff_ptr[rc] != 0)
+      last_nz_pos[o] = c;
+  }
 #endif
-    for (; c < eob; c++) {
+  {
+    for (c = 0; c < eob; c++) {
       int v = qcoeff_ptr[scan[c]];
       int t = vp9_dct_value_tokens_ptr[v].token;
-#if CONFIG_CODE_NONZEROCOUNT
-      nzc += (v != 0);
+      int band = get_coef_band(scan, tx_size, c);
+      if (c)
+        pt = vp9_get_coef_context(scan, nb, pad, token_cache, c, default_eob);
+#if CONFIG_CODE_ZEROGROUP
+      rc = scan[c];
+      o = vp9_get_orientation(rc, tx_size);
+      skip_coef_val = (token_cache[rc] == ZERO_TOKEN || is_eoo_list[o]);
+      if (!skip_coef_val) {
+        cost += token_costs[band][pt][t] + vp9_dct_value_cost_ptr[v];
+      } else {
+        assert(v == 0);
+      }
+#else
+      cost += token_costs[band][pt][t] + vp9_dct_value_cost_ptr[v];
 #endif
-      token_cache[c] = t;
-      cost += token_costs[get_coef_band(scan, tx_size, c)][pt][t];
-      cost += vp9_dct_value_cost_ptr[v];
-#if !CONFIG_CODE_NONZEROCOUNT
-      if (!c || token_cache[c - 1])
-        cost += vp9_cost_bit(coef_probs[type][ref]
-                                       [get_coef_band(scan, tx_size, c)]
-                                       [pt][0], 1);
+      if (!c || token_cache[scan[c - 1]])
+        cost += vp9_cost_bit(coef_probs[type][ref][band][pt][0], 1);
+      token_cache[scan[c]] = t;
+#if CONFIG_CODE_ZEROGROUP
+      if (t == ZERO_TOKEN && !skip_coef_val) {
+        int eoo = 0, use_eoo;
+#if USE_ZPC_EOORIENT == 1
+        use_eoo = vp9_use_eoo(c, seg_eob, scan, tx_size,
+                              is_last_zero, is_eoo_list);
+#else
+        use_eoo = 0;
 #endif
-      pt = vp9_get_coef_context(scan, nb, pad, token_cache, c + 1, default_eob);
+        if (use_eoo) {
+          eoo = vp9_is_eoo(c, eob, scan, tx_size, qcoeff_ptr, last_nz_pos);
+          if (eoo && is_eoo_negative[o]) eoo = 0;
+          if (eoo) {
+            int c_;
+            int savings = 0;
+            int zsaved = 0;
+            savings = vp9_cost_bit((*zpc_probs)[ref]
+                                   [coef_to_zpc_band(band)]
+                                   [coef_to_zpc_ptok(pt)][0], 1) -
+                      vp9_cost_bit((*zpc_probs)[ref]
+                                   [coef_to_zpc_band(band)]
+                                   [coef_to_zpc_ptok(pt)][0], 0);
+            for (c_ = c + 1; c_ < eob; ++c_) {
+              if (o == vp9_get_orientation(scan[c_], tx_size)) {
+                int pt_ = vp9_get_coef_context(scan, nb, pad,
+                                               token_cache_full, c_,
+                                               default_eob);
+                int band_ = get_coef_band(scan, tx_size, c_);
+                assert(token_cache_full[scan[c_]] == ZERO_TOKEN);
+                if (!c_ || token_cache_full[scan[c_ - 1]])
+                  savings += vp9_cost_bit(
+                      coef_probs[type][ref][band_][pt_][0], 1);
+                savings += vp9_cost_bit(
+                    coef_probs[type][ref][band_][pt_][1], 0);
+                zsaved++;
+              }
+            }
+            if (savings < 0) {
+            // if (zsaved < ZPC_ZEROSSAVED_EOO) {
+              eoo = 0;
+              is_eoo_negative[o] = 1;
+            }
+          }
+        }
+        if (use_eoo) {
+          cost += vp9_cost_bit((*zpc_probs)[ref]
+                                           [coef_to_zpc_band(band)]
+                                           [coef_to_zpc_ptok(pt)][0], !eoo);
+          if (eoo) {
+            assert(is_eoo_list[o] == 0);
+            is_eoo_list[o] = 1;
+          }
+        }
+      }
+      is_last_zero[o] = (t == ZERO_TOKEN);
+#endif
     }
-#if CONFIG_CODE_NONZEROCOUNT
-    if (nzc_used)
-      cost += nzc_cost[nzc];
-    else
-#endif
-      if (c < seg_eob)
-        cost += mb->token_costs[tx_size][type][ref]
-                               [get_coef_band(scan, tx_size, c)]
-                               [pt][DCT_EOB_TOKEN];
+    if (c < seg_eob) {
+      if (c)
+        pt = vp9_get_coef_context(scan, nb, pad, token_cache, c, default_eob);
+      cost += mb->token_costs[tx_size][type][ref]
+          [get_coef_band(scan, tx_size, c)]
+          [pt][DCT_EOB_TOKEN];
+    }
   }
 
-  // is eob first coefficient;
-  pt = (c > 0);
+  // is eob first coefficient;
+  pt = (c > 0);
   *a = *l = pt;
   if (tx_size >= TX_8X8) {
     a[1] = l[1] = pt;
@@ -636,13 +630,13 @@
                                  rd[TX_4X4][1] : rd[TX_8X8][1];
 }
 
-static int vp9_sb_block_error_c(int16_t *coeff, int16_t *dqcoeff,
-                                int block_size, int shift) {
+static int block_error(int16_t *coeff, int16_t *dqcoeff,
+                       int block_size, int shift) {
   int i;
   int64_t error = 0;
 
   for (i = 0; i < block_size; i++) {
-    unsigned int this_diff = coeff[i] - dqcoeff[i];
+    int this_diff = coeff[i] - dqcoeff[i];
     error += this_diff * this_diff;
   }
   error >>= shift;
@@ -650,24 +644,24 @@
   return error > INT_MAX ? INT_MAX : (int)error;
 }
 
-static int vp9_sb_uv_block_error_c(int16_t *coeff,
-                                   int16_t *dqcoeff0, int16_t *dqcoeff1,
-                                   int block_size, int shift) {
-  int i;
-  int64_t error = 0;
+static int block_error_sby(MACROBLOCK *x, int block_size, int shift) {
+  return block_error(x->plane[0].coeff, x->e_mbd.plane[0].dqcoeff,
+                     block_size, shift);
+}
 
-  for (i = 0; i < block_size / 2; i++) {
-    unsigned int this_diff = coeff[i] - dqcoeff0[i];
-    error += this_diff * this_diff;
-  }
-  coeff += block_size / 2;
-  for (i = 0; i < block_size / 2; i++) {
-    unsigned int this_diff = coeff[i] - dqcoeff1[i];
-    error += this_diff * this_diff;
-  }
-  error >>= shift;
+static int block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int shift) {
+  const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
+  int64_t sum = 0;
+  int plane;
 
-  return error > INT_MAX ? INT_MAX : (int)error;
+  for (plane = 1; plane < MAX_MB_PLANE; plane++) {
+    const int subsampling = x->e_mbd.plane[plane].subsampling_x +
+                            x->e_mbd.plane[plane].subsampling_y;
+    sum += block_error(x->plane[plane].coeff, x->e_mbd.plane[plane].dqcoeff,
+                       16 << (bwl + bhl - subsampling), 0);
+  }
+  sum >>= shift;
+  return sum > INT_MAX ? INT_MAX : (int)sum;
 }
 
 static int rdcost_sby_4x4(VP9_COMMON *const cm, MACROBLOCK *x,
@@ -704,8 +698,7 @@
   vp9_transform_sby_4x4(x, bsize);
   vp9_quantize_sby_4x4(x, bsize);
 
-  *distortion = vp9_sb_block_error_c(x->coeff, xd->plane[0].dqcoeff,
-                                     16 << (bwl + bhl), 2);
+  *distortion = block_error_sby(x, 16 << (bwl + bhl), 2);
   *rate       = rdcost_sby_4x4(cm, x, bsize);
   *skippable  = vp9_sby_is_skippable(xd, bsize);
 }
@@ -737,15 +730,14 @@
 static void super_block_yrd_8x8(VP9_COMMON *const cm, MACROBLOCK *x,
                                 int *rate, int *distortion, int *skippable,
                                 BLOCK_SIZE_TYPE bsize) {
-  const int bwl = mb_width_log2(bsize) + 1, bhl = mb_height_log2(bsize) + 1;
+  const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
   MACROBLOCKD *const xd = &x->e_mbd;
 
   xd->mode_info_context->mbmi.txfm_size = TX_8X8;
   vp9_transform_sby_8x8(x, bsize);
   vp9_quantize_sby_8x8(x, bsize);
 
-  *distortion = vp9_sb_block_error_c(x->coeff, xd->plane[0].dqcoeff,
-                                     64 << (bhl + bwl), 2);
+  *distortion = block_error_sby(x, 16 << (bhl + bwl), 2);
   *rate       = rdcost_sby_8x8(cm, x, bsize);
   *skippable  = vp9_sby_is_skippable(xd, bsize);
 }
@@ -775,15 +767,14 @@
 static void super_block_yrd_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
                                   int *rate, int *distortion, int *skippable,
                                   BLOCK_SIZE_TYPE bsize) {
-  const int bwl = mb_width_log2(bsize), bhl = mb_height_log2(bsize);
+  const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
   MACROBLOCKD *const xd = &x->e_mbd;
 
   xd->mode_info_context->mbmi.txfm_size = TX_16X16;
   vp9_transform_sby_16x16(x, bsize);
   vp9_quantize_sby_16x16(x, bsize);
 
-  *distortion = vp9_sb_block_error_c(x->coeff, xd->plane[0].dqcoeff,
-                                     256 << (bwl + bhl), 2);
+  *distortion = block_error_sby(x, 16 << (bwl + bhl), 2);
   *rate       = rdcost_sby_16x16(cm, x, bsize);
   *skippable  = vp9_sby_is_skippable(xd, bsize);
 }
@@ -815,15 +806,14 @@
 static void super_block_yrd_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
                                   int *rate, int *distortion, int *skippable,
                                   BLOCK_SIZE_TYPE bsize) {
-  const int bwl = mb_width_log2(bsize) - 1, bhl = mb_height_log2(bsize) - 1;
+  const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
   MACROBLOCKD *const xd = &x->e_mbd;
 
   xd->mode_info_context->mbmi.txfm_size = TX_32X32;
   vp9_transform_sby_32x32(x, bsize);
   vp9_quantize_sby_32x32(x, bsize);
 
-  *distortion = vp9_sb_block_error_c(x->coeff, xd->plane[0].dqcoeff,
-                                     1024 << (bwl + bhl), 0);
+  *distortion = block_error_sby(x, 16 << (bwl + bhl), 0);
   *rate       = rdcost_sby_32x32(cm, x, bsize);
   *skippable  = vp9_sby_is_skippable(xd, bsize);
 }
@@ -833,12 +823,9 @@
                             int *skip, BLOCK_SIZE_TYPE bs,
                             int64_t txfm_cache[NB_TXFM_MODES]) {
   VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &x->e_mbd;
   int r[TX_SIZE_MAX_SB][2], d[TX_SIZE_MAX_SB], s[TX_SIZE_MAX_SB];
-  uint8_t *src = x->src.y_buffer, *dst = xd->plane[0].dst.buf;
-  int src_y_stride = x->src.y_stride, dst_y_stride = xd->plane[0].dst.stride;
 
-  vp9_subtract_sby_s_c(x->src_diff, src, src_y_stride, dst, dst_y_stride, bs);
+  vp9_subtract_sby(x, bs);
 
   if (bs >= BLOCK_SIZE_SB32X32)
     super_block_yrd_32x32(cm, x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32],
@@ -865,7 +852,10 @@
   VP9_COMMON *const cm = &cpi->common;
   BLOCK *be = x->block + ib;
   BLOCKD *b = xd->block + ib;
-
+  int16_t* const src_diff =
+      raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, ib,
+                                x->plane[0].src_diff);
+  int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, ib, 16);
   ENTROPY_CONTEXT ta = *a, tempa = *a;
   ENTROPY_CONTEXT tl = *l, templ = *l;
   TX_TYPE tx_type = DCT_DCT;
@@ -905,15 +895,17 @@
 #endif
 
     vp9_intra4x4_predict(xd, b, mode, *(b->base_dst) + b->dst, b->dst_stride);
-    vp9_subtract_b(be, b, 16);
+    vp9_subtract_block(4, 4, src_diff, 16,
+                       *(be->base_src) + be->src, be->src_stride,
+                       *(b->base_dst) + b->dst, b->dst_stride);
 
     b->bmi.as_mode.first = mode;
     tx_type = get_tx_type_4x4(xd, be - x->block);
     if (tx_type != DCT_DCT) {
-      vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type);
+      vp9_short_fht4x4(src_diff, coeff, 16, tx_type);
       vp9_ht_quantize_b_4x4(x, be - x->block, tx_type);
     } else {
-      x->fwd_txm4x4(be->src_diff, be->coeff, 32);
+      x->fwd_txm4x4(src_diff, coeff, 32);
       x->quantize_b_4x4(x, be - x->block, 16);
     }
 
@@ -923,7 +915,7 @@
     ratey = cost_coeffs(cm, x, b - xd->block,
                         PLANE_TYPE_Y_WITH_DC, &tempa, &templ, TX_4X4, 16);
     rate += ratey;
-    distortion = vp9_block_error(be->coeff,
+    distortion = vp9_block_error(coeff,
                                  BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16),
                                  16) >> 2;
 
@@ -1095,10 +1087,13 @@
   ENTROPY_CONTEXT_PLANES ta, tl;
   ENTROPY_CONTEXT *ta0, *ta1, besta0 = 0, besta1 = 0;
   ENTROPY_CONTEXT *tl0, *tl1, bestl0 = 0, bestl1 = 0;
-
   // perform transformation of dimension 8x8
   // note the input and output index mapping
   int idx = (ib & 0x02) ? (ib + 2) : ib;
+  int16_t* const src_diff =
+      raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, ib,
+                                x->plane[0].src_diff);
+  int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, idx, 16);
 
   assert(ib < 16);
   for (mode = DC_PRED; mode <= TM_PRED; mode++) {
@@ -1111,18 +1106,20 @@
 
     vp9_intra8x8_predict(xd, b, mode, *(b->base_dst) + b->dst, b->dst_stride);
 
-    vp9_subtract_4b_c(be, b, 16);
+    vp9_subtract_block(8, 8, src_diff, 16,
+                       *(be->base_src) + be->src, be->src_stride,
+                       *(b->base_dst) + b->dst, b->dst_stride);
 
     if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
       TX_TYPE tx_type = get_tx_type_8x8(xd, ib);
       if (tx_type != DCT_DCT)
-        vp9_short_fht8x8(be->src_diff, (x->block + idx)->coeff, 16, tx_type);
+        vp9_short_fht8x8(src_diff, coeff, 16, tx_type);
       else
-        x->fwd_txm8x8(be->src_diff, (x->block + idx)->coeff, 32);
+        x->fwd_txm8x8(src_diff, coeff, 32);
       x->quantize_b_8x8(x, idx, tx_type, 16);
 
       // compute quantization mse of 8x8 block
-      distortion = vp9_block_error_c((x->block + idx)->coeff,
+      distortion = vp9_block_error_c(coeff,
           BLOCK_OFFSET(xd->plane[0].dqcoeff, idx, 16), 64);
 
       vpx_memcpy(&ta, a, sizeof(ENTROPY_CONTEXT_PLANES));
@@ -1150,23 +1147,29 @@
       distortion = 0;
       rate_t = 0;
       for (i = 0; i < 4; ++i) {
+        int16_t* const src_diff =
+            raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16,
+                                      0, ib + iblock[i],
+                                      x->plane[0].src_diff);
+        int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff,
+                                            ib + iblock[i], 16);
         int do_two = 0;
         b = &xd->block[ib + iblock[i]];
         be = &x->block[ib + iblock[i]];
         tx_type = get_tx_type_4x4(xd, ib + iblock[i]);
         if (tx_type != DCT_DCT) {
-          vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type);
+          vp9_short_fht4x4(src_diff, coeff, 16, tx_type);
           vp9_ht_quantize_b_4x4(x, ib + iblock[i], tx_type);
         } else if (!(i & 1) &&
                    get_tx_type_4x4(xd, ib + iblock[i] + 1) == DCT_DCT) {
-          x->fwd_txm8x4(be->src_diff, be->coeff, 32);
+          x->fwd_txm8x4(src_diff, coeff, 32);
           x->quantize_b_4x4_pair(x, ib + iblock[i], ib + iblock[i] + 1, 16);
           do_two = 1;
         } else {
-          x->fwd_txm4x4(be->src_diff, be->coeff, 32);
+          x->fwd_txm4x4(src_diff, coeff, 32);
           x->quantize_b_4x4(x, ib + iblock[i], 16);
         }
-        distortion += vp9_block_error_c(be->coeff,
+        distortion += vp9_block_error_c(coeff,
             BLOCK_OFFSET(xd->plane[0].dqcoeff, ib + iblock[i], 16),
             16 << do_two);
         rate_t += cost_coeffs(cm, x, ib + iblock[i], PLANE_TYPE_Y_WITH_DC,
@@ -1364,17 +1367,13 @@
 static void super_block_uvrd_4x4(VP9_COMMON *const cm, MACROBLOCK *x,
                                  int *rate, int *distortion, int *skip,
                                  BLOCK_SIZE_TYPE bsize) {
-  const int bwl = mb_width_log2(bsize) + 2, bhl = mb_height_log2(bsize) + 2;
   MACROBLOCKD *const xd = &x->e_mbd;
 
   vp9_transform_sbuv_4x4(x, bsize);
   vp9_quantize_sbuv_4x4(x, bsize);
 
   *rate       = rd_cost_sbuv_4x4(cm, x, bsize);
-  *distortion = vp9_sb_uv_block_error_c(x->coeff + (16 << (bwl + bhl)),
-                                        xd->plane[1].dqcoeff,
-                                        xd->plane[2].dqcoeff,
-                                        32 << (bwl + bhl - 2), 2);
+  *distortion = block_error_sbuv(x, bsize, 2);
   *skip       = vp9_sbuv_is_skippable(xd, bsize);
 }
 
@@ -1409,17 +1408,13 @@
 static void super_block_uvrd_8x8(VP9_COMMON *const cm, MACROBLOCK *x,
                                  int *rate, int *distortion, int *skip,
                                  BLOCK_SIZE_TYPE bsize) {
-  const int bwl = mb_width_log2(bsize) + 1, bhl = mb_height_log2(bsize) + 1;
   MACROBLOCKD *const xd = &x->e_mbd;
 
   vp9_transform_sbuv_8x8(x, bsize);
   vp9_quantize_sbuv_8x8(x, bsize);
 
   *rate       = rd_cost_sbuv_8x8(cm, x, bsize);
-  *distortion = vp9_sb_uv_block_error_c(x->coeff + (64 << (bwl + bhl)),
-                                        xd->plane[1].dqcoeff,
-                                        xd->plane[2].dqcoeff,
-                                        128 << (bwl + bhl - 2), 2);
+  *distortion = block_error_sbuv(x, bsize, 2);
   *skip       = vp9_sbuv_is_skippable(xd, bsize);
 }
 
@@ -1454,17 +1449,13 @@
 static void super_block_uvrd_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
                                    int *rate, int *distortion, int *skip,
                                    BLOCK_SIZE_TYPE bsize) {
-  const int bwl = mb_width_log2(bsize), bhl = mb_height_log2(bsize);
   MACROBLOCKD *const xd = &x->e_mbd;
 
   vp9_transform_sbuv_16x16(x, bsize);
   vp9_quantize_sbuv_16x16(x, bsize);
 
   *rate       = rd_cost_sbuv_16x16(cm, x, bsize);
-  *distortion = vp9_sb_uv_block_error_c(x->coeff + (256 << (bwl + bhl)),
-                                        xd->plane[1].dqcoeff,
-                                        xd->plane[2].dqcoeff,
-                                        512 << (bwl + bhl - 2), 2);
+  *distortion = block_error_sbuv(x, bsize, 2);
   *skip       = vp9_sbuv_is_skippable(xd, bsize);
 }
 
@@ -1500,17 +1491,13 @@
 static void super_block_uvrd_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
                                    int *rate, int *distortion, int *skip,
                                    BLOCK_SIZE_TYPE bsize) {
-  const int bwl = mb_width_log2(bsize) - 1, bhl = mb_height_log2(bsize) - 1;
   MACROBLOCKD *const xd = &x->e_mbd;
 
   vp9_transform_sbuv_32x32(x, bsize);
   vp9_quantize_sbuv_32x32(x, bsize);
 
   *rate       = rd_cost_sbuv_32x32(cm, x, bsize);
-  *distortion = vp9_sb_uv_block_error_c(x->coeff + (1024 << (bwl + bhl)),
-                                        xd->plane[1].dqcoeff,
-                                        xd->plane[2].dqcoeff,
-                                        2048 << (bwl + bhl - 2), 0);
+  *distortion = block_error_sbuv(x, bsize, 0);
   *skip       = vp9_sbuv_is_skippable(xd, bsize);
 }
 
@@ -1519,12 +1506,8 @@
                              BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
-  uint8_t *usrc = x->src.u_buffer, *udst = xd->plane[1].dst.buf;
-  uint8_t *vsrc = x->src.v_buffer, *vdst = xd->plane[2].dst.buf;
-  int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->plane[1].dst.stride;
 
-  vp9_subtract_sbuv_s_c(x->src_diff, usrc, vsrc, src_uv_stride,
-                        udst, vdst, dst_uv_stride, bsize);
+  vp9_subtract_sbuv(x, bsize);
 
   if (mbmi->txfm_size >= TX_32X32 && bsize >= BLOCK_SIZE_SB64X64) {
     super_block_uvrd_32x32(cm, x, rate, distortion, skippable, bsize);
@@ -1726,6 +1709,10 @@
     if (labels[i] == which_label) {
       BLOCKD *bd = &x->e_mbd.block[i];
       BLOCK *be = &x->block[i];
+      int16_t* const src_diff =
+          raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, i,
+                                    x->plane[0].src_diff);
+      int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, i, 16);
       int thisdistortion;
 
       vp9_build_inter_predictor(*(bd->base_pre) + bd->pre,
@@ -1743,15 +1730,16 @@
         vp9_build_inter_predictor(
             *(bd->base_second_pre) + bd->pre, bd->pre_stride,
             *(bd->base_dst) + bd->dst, bd->dst_stride,
-            &bd->bmi.as_mv[1], &xd->scale_factor[1], 4, 4,
-            1 << (2 * CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT) /* avg */,
+            &bd->bmi.as_mv[1], &xd->scale_factor[1], 4, 4, 1,
             &xd->subpix);
       }
 
-      vp9_subtract_b(be, bd, 16);
-      x->fwd_txm4x4(be->src_diff, be->coeff, 32);
+      vp9_subtract_block(4, 4, src_diff, 16,
+                         *(be->base_src) + be->src, be->src_stride,
+                         *(bd->base_dst) + bd->dst, bd->dst_stride);
+      x->fwd_txm4x4(src_diff, coeff, 32);
       x->quantize_b_4x4(x, i, 16);
-      thisdistortion = vp9_block_error(be->coeff,
+      thisdistortion = vp9_block_error(coeff,
           BLOCK_OFFSET(xd->plane[0].dqcoeff, i, 16), 16);
       *distortion += thisdistortion;
       *labelyrate += cost_coeffs(cm, x, i, PLANE_TYPE_Y_WITH_DC,
@@ -1796,7 +1784,11 @@
       int which_mv;
       const int idx = (ib & 8) + ((ib & 2) << 1);
       BLOCKD *bd = &xd->block[ib];
-      BLOCK *be = &x->block[ib], *be2 = &x->block[idx];
+      BLOCK *be = &x->block[ib];
+      int16_t* const src_diff =
+          raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, ib,
+                                    x->plane[0].src_diff);
+      int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, idx, 16);
       int thisdistortion;
 
       assert(idx < 16);
@@ -1810,17 +1802,18 @@
             *base_pre + bd->pre, bd->pre_stride,
             *(bd->base_dst) + bd->dst, bd->dst_stride,
             &bd->bmi.as_mv[which_mv], &xd->scale_factor[which_mv], 8, 8,
-            which_mv << (2 * CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT),
-            &xd->subpix);
+            which_mv, &xd->subpix);
       }
 
-      vp9_subtract_4b_c(be, bd, 16);
+      vp9_subtract_block(8, 8, src_diff, 16,
+                         *(be->base_src) + be->src, be->src_stride,
+                         *(bd->base_dst) + bd->dst, bd->dst_stride);
 
       if (xd->mode_info_context->mbmi.txfm_size == TX_4X4) {
         if (otherrd) {
-          x->fwd_txm8x8(be->src_diff, be2->coeff, 32);
+          x->fwd_txm8x8(src_diff, coeff, 32);
           x->quantize_b_8x8(x, idx, DCT_DCT, 16);
-          thisdistortion = vp9_block_error_c(be2->coeff,
+          thisdistortion = vp9_block_error_c(coeff,
               BLOCK_OFFSET(xd->plane[0].dqcoeff, idx, 16), 64);
           otherdist += thisdistortion;
           xd->mode_info_context->mbmi.txfm_size = TX_8X8;
@@ -1831,11 +1824,17 @@
           xd->mode_info_context->mbmi.txfm_size = TX_4X4;
         }
         for (j = 0; j < 4; j += 2) {
+          int16_t* const src_diff =
+              raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16,
+                                        0, ib + iblock[j],
+                                        x->plane[0].src_diff);
+          int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff,
+                                              ib + iblock[j], 16);
           bd = &xd->block[ib + iblock[j]];
           be = &x->block[ib + iblock[j]];
-          x->fwd_txm8x4(be->src_diff, be->coeff, 32);
+          x->fwd_txm8x4(src_diff, coeff, 32);
           x->quantize_b_4x4_pair(x, ib + iblock[j], ib + iblock[j] + 1, 16);
-          thisdistortion = vp9_block_error_c(be->coeff,
+          thisdistortion = vp9_block_error_c(coeff,
               BLOCK_OFFSET(xd->plane[0].dqcoeff, ib + iblock[j], 16), 32);
           *distortion += thisdistortion;
           *labelyrate +=
@@ -1853,10 +1852,15 @@
       } else /* 8x8 */ {
         if (otherrd) {
           for (j = 0; j < 4; j += 2) {
-            BLOCK *be = &x->block[ib + iblock[j]];
-            x->fwd_txm8x4(be->src_diff, be->coeff, 32);
+            int16_t* const src_diff =
+                raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16,
+                                          0, ib + iblock[j],
+                                          x->plane[0].src_diff);
+            int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff,
+                                                ib + iblock[j], 16);
+            x->fwd_txm8x4(src_diff, coeff, 32);
             x->quantize_b_4x4_pair(x, ib + iblock[j], ib + iblock[j] + 1, 16);
-            thisdistortion = vp9_block_error_c(be->coeff,
+            thisdistortion = vp9_block_error_c(coeff,
                 BLOCK_OFFSET(xd->plane[0].dqcoeff, ib + iblock[j], 16), 32);
             otherdist += thisdistortion;
             xd->mode_info_context->mbmi.txfm_size = TX_4X4;
@@ -1874,9 +1878,9 @@
             xd->mode_info_context->mbmi.txfm_size = TX_8X8;
           }
         }
-        x->fwd_txm8x8(be->src_diff, be2->coeff, 32);
+        x->fwd_txm8x8(src_diff, coeff, 32);
         x->quantize_b_8x8(x, idx, DCT_DCT, 16);
-        thisdistortion = vp9_block_error_c(be2->coeff,
+        thisdistortion = vp9_block_error_c(coeff,
             BLOCK_OFFSET(xd->plane[0].dqcoeff, idx, 16), 64);
         *distortion += thisdistortion;
         *labelyrate += cost_coeffs(cm, x, idx, PLANE_TYPE_Y_WITH_DC,
@@ -2873,7 +2877,7 @@
                                   x->nmvjointcost, x->mvcost, 96,
                                   x->e_mbd.allow_high_precision_mv);
       } else {
-        YV12_BUFFER_CONFIG backup_yv12 = xd->pre;
+        struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
         int bestsme = INT_MAX;
         int further_steps, step_param = cpi->sf.first_step;
         int sadpb = x->sadperbit16;
@@ -2886,13 +2890,16 @@
         int tmp_row_max = x->mv_row_max;
 
         if (scaled_ref_frame) {
+          int i;
+
           // Swap out the reference frame for a version that's been scaled to
           // match the resolution of the current frame, allowing the existing
           // motion search code to be used without additional modifications.
-          xd->pre = *scaled_ref_frame;
-          xd->pre.y_buffer += mb_row * 16 * xd->pre.y_stride + mb_col * 16;
-          xd->pre.u_buffer += mb_row * 8 * xd->pre.uv_stride + mb_col * 8;
-          xd->pre.v_buffer += mb_row * 8 * xd->pre.uv_stride + mb_col * 8;
+          for (i = 0; i < MAX_MB_PLANE; i++)
+            backup_yv12[i] = xd->plane[i].pre[0];
+
+          setup_pre_planes(xd, scaled_ref_frame, NULL, mb_row, mb_col,
+                           NULL, NULL);
         }
 
         vp9_clamp_mv_min_max(x, &ref_mv[0]);
@@ -2942,7 +2949,10 @@
 
         // restore the predictor, if required
         if (scaled_ref_frame) {
-          xd->pre = backup_yv12;
+          int i;
+
+          for (i = 0; i < MAX_MB_PLANE; i++)
+            xd->plane[i].pre[0] = backup_yv12[i];
         }
       }
       break;
@@ -3355,6 +3365,9 @@
 
   int intra_cost_penalty = 20 * vp9_dc_quant(cpi->common.base_qindex,
                                              cpi->common.y_dc_delta_q);
+  int64_t mode_distortions[MB_MODE_COUNT] = {-1};
+  int64_t frame_distortions[MAX_REF_FRAMES] = {-1};
+  int ref_frame;
 
   struct scale_factors scale_factor[4];
 
@@ -3365,6 +3378,9 @@
   vpx_memset(&x->mb_context[xd->sb_index][xd->mb_index], 0,
              sizeof(PICK_MODE_CONTEXT));
 
+  x->mb_context[xd->sb_index][xd->mb_index].frames_with_high_error = 0;
+  x->mb_context[xd->sb_index][xd->mb_index].modes_with_high_error = 0;
+
   for (i = 0; i < MAX_REF_FRAMES; i++)
     frame_mv[NEWMV][i].as_int = INVALID_MV;
   for (i = 0; i < NB_PREDICTION_TYPES; ++i)
@@ -3521,7 +3537,6 @@
       int ref = mbmi->ref_frame;
       int fb;
 
-      xd->pre = yv12_mb[ref];
       best_ref_mv = mbmi->ref_mvs[ref][0];
       vpx_memcpy(mdcounts, frame_mdcounts[ref], sizeof(mdcounts));
 
@@ -3540,10 +3555,15 @@
     if (mbmi->second_ref_frame > 0) {
       int ref = mbmi->second_ref_frame;
 
-      xd->second_pre = yv12_mb[ref];
       second_best_ref_mv = mbmi->ref_mvs[ref][0];
     }
 
+    // TODO(jkoleszar) scaling/translation handled during creation of yv12_mb
+    // currently.
+    setup_pre_planes(xd, &yv12_mb[mbmi->ref_frame],
+        mbmi->second_ref_frame > 0 ? &yv12_mb[mbmi->second_ref_frame] : NULL,
+        0, 0, NULL, NULL);
+
     // Experimental code. Special case for gf and arf zeromv modes.
     // Increase zbin size to suppress noise
     if (cpi->zbin_mode_boost_enabled) {
@@ -3757,14 +3777,10 @@
       if (tmp_rd < best_yrd) {
         int uv_skippable;
 
-        vp9_build_inter4x4_predictors_mbuv(&x->e_mbd, mb_row, mb_col);
+        vp9_build_inter_predictors_sbuv(&x->e_mbd, mb_row, mb_col,
+                                        BLOCK_SIZE_MB16X16);
 
-        vp9_subtract_sbuv_s_c(x->src_diff,
-                              x->src.u_buffer,
-                              x->src.v_buffer, x->src.uv_stride,
-                              xd->plane[1].dst.buf,
-                              xd->plane[2].dst.buf, xd->plane[1].dst.stride,
-                              BLOCK_SIZE_MB16X16);
+        vp9_subtract_sbuv(x, BLOCK_SIZE_MB16X16);
 
         super_block_uvrd_4x4(cm, x, &rate_uv, &distortion_uv,
                              &uv_skippable, BLOCK_SIZE_MB16X16);
@@ -3904,6 +3920,17 @@
 #endif
     }
 
+    // Store the respective mode distortions for later use.
+    if (mode_distortions[this_mode] == -1
+        || distortion2 < mode_distortions[this_mode]) {
+      mode_distortions[this_mode] = distortion2;
+    }
+    if (frame_distortions[mbmi->ref_frame] == -1 ||
+        distortion2 < frame_distortions[mbmi->ref_frame]) {
+      frame_distortions[mbmi->ref_frame] = distortion2;
+    }
+
     // Did this mode help.. i.e. is it the new best mode
     if (this_rd < best_rd || x->skip) {
       if (!mode_excluded) {
@@ -4112,6 +4139,29 @@
   }
 
 end:
+
+  // Flag all modes that have a distortion that's > 2x the best we found at
+  // this level.
+  for (mode_index = 0; mode_index < MB_MODE_COUNT; ++mode_index) {
+    if (mode_index == NEARESTMV || mode_index == NEARMV || mode_index == NEWMV
+        || mode_index == SPLITMV)
+      continue;
+
+    if (mode_distortions[mode_index] > 2 * *returndistortion) {
+      x->mb_context[xd->sb_index][xd->mb_index].modes_with_high_error |= (1
+          << mode_index);
+    }
+  }
+
+  // Flag all ref frames that have a distortion that's > 2x the best we found
+  // this level.
+  for (ref_frame = INTRA_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+    if (frame_distortions[ref_frame] > 2 * *returndistortion) {
+      x->mb_context[xd->sb_index][xd->mb_index].frames_with_high_error |= (1
+          << ref_frame);
+    }
+  }
+
   set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,
                     scale_factor);
   store_coding_context(x, &x->mb_context[xd->sb_index][xd->mb_index],
@@ -4299,7 +4349,7 @@
   MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
   MB_PREDICTION_MODE this_mode;
   MB_PREDICTION_MODE best_mode = DC_PRED;
-  MV_REFERENCE_FRAME ref_frame;
+  MV_REFERENCE_FRAME ref_frame, second_ref;
   unsigned char segment_id = xd->mode_info_context->mbmi.segment_id;
   int comp_pred, i;
   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
@@ -4338,6 +4388,12 @@
   struct scale_factors scale_factor[4];
   unsigned int ref_frame_mask = 0;
   unsigned int mode_mask = 0;
+  int64_t mode_distortions[MB_MODE_COUNT] = {-1};
+  int64_t frame_distortions[MAX_REF_FRAMES] = {-1};
+
+  // Everywhere the flag is set, the error is much higher than its neighbors.
+  ctx->frames_with_high_error = 0;
+  ctx->modes_with_high_error = 0;
 
   xd->mode_info_context->mbmi.segment_id = segment_id;
   estimate_ref_frame_costs(cpi, segment_id, ref_costs);
@@ -4348,34 +4404,36 @@
   for (i = 0; i < NB_TXFM_MODES; i++)
     best_txfm_rd[i] = INT64_MAX;
 
-  // Create a mask set to 1 for each frame used by a smaller resolution.p
+  // Create a mask set to 1 for each frame used by a smaller resolution.
   if (cpi->Speed > 0) {
     switch (block_size) {
       case BLOCK_64X64:
         for (i = 0; i < 4; i++) {
           for (j = 0; j < 4; j++) {
-            ref_frame_mask |= (1 << x->mb_context[i][j].mic.mbmi.ref_frame);
-            mode_mask |= (1 << x->mb_context[i][j].mic.mbmi.mode);
+            ref_frame_mask |= x->mb_context[i][j].frames_with_high_error;
+            mode_mask |= x->mb_context[i][j].modes_with_high_error;
           }
         }
         for (i = 0; i < 4; i++) {
-          ref_frame_mask |= (1 << x->sb32_context[i].mic.mbmi.ref_frame);
-          mode_mask |= (1 << x->sb32_context[i].mic.mbmi.mode);
+          ref_frame_mask |= x->sb32_context[i].frames_with_high_error;
+          mode_mask |= x->sb32_context[i].modes_with_high_error;
         }
         break;
       case BLOCK_32X32:
         for (i = 0; i < 4; i++) {
-          ref_frame_mask |= (1
-              << x->mb_context[xd->sb_index][i].mic.mbmi.ref_frame);
-          mode_mask |= (1 << x->mb_context[xd->sb_index][i].mic.mbmi.mode);
+          ref_frame_mask |=
+              x->mb_context[xd->sb_index][i].frames_with_high_error;
+          mode_mask |= x->mb_context[xd->sb_index][i].modes_with_high_error;
         }
         break;
       default:
         // Until we handle all block sizes set it to present;
-        ref_frame_mask = 0xff;
-        mode_mask = 0xff;
+        ref_frame_mask = 0;
+        mode_mask = 0;
         break;
     }
+    ref_frame_mask = ~ref_frame_mask;
+    mode_mask = ~mode_mask;
   }
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
@@ -4430,6 +4488,9 @@
       if (!(ref_frame_mask & (1 << ref_frame))) {
         continue;
       }
+      if (!(mode_mask & (1 << this_mode))) {
+        continue;
+      }
       if (vp9_mode_order[mode_index].second_ref_frame != NONE
           && !(ref_frame_mask
               & (1 << vp9_mode_order[mode_index].second_ref_frame))) {
@@ -4465,8 +4526,6 @@
     //  continue;
 
     if (comp_pred) {
-      int second_ref;
-
       if (ref_frame == ALTREF_FRAME) {
         second_ref = LAST_FRAME;
       } else {
@@ -4478,7 +4537,6 @@
       set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,
                         scale_factor);
 
-      xd->second_pre = yv12_mb[second_ref];
       mode_excluded =
           mode_excluded ?
               mode_excluded : cm->comp_pred_mode == SINGLE_PREDICTION_ONLY;
@@ -4496,7 +4554,9 @@
       }
     }
 
-    xd->pre = yv12_mb[ref_frame];
+    setup_pre_planes(xd, &yv12_mb[ref_frame],
+        comp_pred ? &yv12_mb[second_ref] : NULL, 0, 0, NULL, NULL);
+
     vpx_memcpy(mdcounts, frame_mdcounts[ref_frame], sizeof(mdcounts));
 
     // If the segment reference frame feature is enabled....
@@ -4672,6 +4732,16 @@
 #endif
     }
 
+    // Store the respective mode distortions for later use.
+    if (mode_distortions[this_mode] == -1
+        || distortion2 < mode_distortions[this_mode]) {
+      mode_distortions[this_mode] = distortion2;
+    }
+    if (frame_distortions[mbmi->ref_frame] == -1
+        || distortion2 < frame_distortions[mbmi->ref_frame]) {
+      frame_distortions[mbmi->ref_frame] = distortion2;
+    }
+
     // Did this mode help.. i.e. is it the new best mode
     if (this_rd < best_rd || x->skip) {
       if (!mode_excluded) {
@@ -4757,6 +4827,27 @@
     if (x->skip && !mode_excluded)
       break;
   }
+  // Flag all modes that have a distortion that's > 2x the best we found at
+  // this level.
+  for (mode_index = 0; mode_index < MB_MODE_COUNT; ++mode_index) {
+    if (mode_index == NEARESTMV || mode_index == NEARMV || mode_index == NEWMV
+        || mode_index == SPLITMV)
+      continue;
+
+    if (mode_distortions[mode_index] > 2 * *returndistortion) {
+      ctx->modes_with_high_error |= (1 << mode_index);
+    }
+  }
+
+  // Flag all ref frames that have a distortion that's > 2x the best we found
+  // this level.
+  for (ref_frame = INTRA_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+    if (frame_distortions[ref_frame] > 2 * *returndistortion) {
+      ctx->frames_with_high_error |= (1 << ref_frame);
+    }
+  }
 
   assert((cm->mcomp_filter_type == SWITCHABLE) ||
          (cm->mcomp_filter_type == best_mbmode.interp_filter) ||
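
For context, the rdopt changes above replace the old "which mode/ref frame did the smaller blocks pick" masks with "which candidates were much worse than the best" masks, which are inverted before being used to prune the search at larger block sizes. Below is a standalone sketch of that pruning idea; it is not from the patch, and the names, counts, and values are illustrative only.

#include <stdint.h>
#include <stdio.h>

#define NUM_MODES 8
#define DIST_UNSET (-1)

static unsigned build_high_error_mask(const int64_t best_dist[NUM_MODES],
                                      int64_t overall_best) {
  unsigned mask = 0;
  int m;
  for (m = 0; m < NUM_MODES; m++) {
    /* Flag a candidate whose best distortion is more than 2x the winner. */
    if (best_dist[m] != DIST_UNSET && best_dist[m] > 2 * overall_best)
      mask |= 1u << m;
  }
  return mask;
}

int main(void) {
  int64_t dist[NUM_MODES] = {100, 90, 400, DIST_UNSET, 150, 500, 95, 120};
  unsigned high_error = build_high_error_mask(dist, 90);
  unsigned allowed = ~high_error;          /* invert: 1 = still worth trying */
  int m;
  for (m = 0; m < NUM_MODES; m++)
    if (allowed & (1u << m))
      printf("mode %d kept\n", m);
  return 0;
}
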
diff --git a/vp9/encoder/vp9_segmentation.c b/vp9/encoder/vp9_segmentation.c
index 6336969..40b1879 100644
--- a/vp9/encoder/vp9_segmentation.c
+++ b/vp9/encoder/vp9_segmentation.c
@@ -63,16 +63,18 @@
 static void calc_segtree_probs(MACROBLOCKD *xd,
                                int *segcounts,
                                vp9_prob *segment_tree_probs) {
-  int count1, count2;
-
-  // Total count for all segments
-  count1 = segcounts[0] + segcounts[1];
-  count2 = segcounts[2] + segcounts[3];
-
   // Work out probabilities of each segment
-  segment_tree_probs[0] = get_binary_prob(count1, count2);
-  segment_tree_probs[1] = get_prob(segcounts[0], count1);
-  segment_tree_probs[2] = get_prob(segcounts[2], count2);
+  segment_tree_probs[0] =
+    get_binary_prob(segcounts[0] + segcounts[1] + segcounts[2] + segcounts[3],
+                    segcounts[4] + segcounts[5] + segcounts[6] + segcounts[7]);
+  segment_tree_probs[1] =
+    get_binary_prob(segcounts[0] + segcounts[1], segcounts[2] + segcounts[3]);
+  segment_tree_probs[2] = get_binary_prob(segcounts[0], segcounts[1]);
+  segment_tree_probs[3] = get_binary_prob(segcounts[2], segcounts[3]);
+  segment_tree_probs[4] =
+    get_binary_prob(segcounts[4] + segcounts[5], segcounts[6] + segcounts[7]);
+  segment_tree_probs[5] = get_binary_prob(segcounts[4], segcounts[5]);
+  segment_tree_probs[6] = get_binary_prob(segcounts[6], segcounts[7]);
 }
 
 // Based on set of segment counts and probabilities calculate a cost estimate
@@ -83,81 +85,38 @@
   int count1, count2;
 
   // Cost the top node of the tree
-  count1 = segcounts[0] + segcounts[1];
-  count2 = segcounts[2] + segcounts[3];
+  count1 = segcounts[0] + segcounts[1] + segcounts[2] + segcounts[3];
+  count2 = segcounts[4] + segcounts[5] + segcounts[6] + segcounts[7];
   cost = count1 * vp9_cost_zero(probs[0]) +
          count2 * vp9_cost_one(probs[0]);
 
-  // Now add the cost of each individual segment branch
-  if (count1 > 0)
-    cost += segcounts[0] * vp9_cost_zero(probs[1]) +
-            segcounts[1] * vp9_cost_one(probs[1]);
+  // Cost subsequent levels
+  if (count1 > 0) {
+    count1 = segcounts[0] + segcounts[1];
+    count2 = segcounts[2] + segcounts[3];
+    cost += count1 * vp9_cost_zero(probs[1]) +
+            count2 * vp9_cost_one(probs[1]);
 
-  if (count2 > 0)
-    cost += segcounts[2] * vp9_cost_zero(probs[2]) +
-            segcounts[3] * vp9_cost_one(probs[2]);
+    if (count1 > 0)
+      cost += segcounts[0] * vp9_cost_zero(probs[2]) +
+              segcounts[1] * vp9_cost_one(probs[2]);
+    if (count2 > 0)
+      cost += segcounts[2] * vp9_cost_zero(probs[3]) +
+              segcounts[3] * vp9_cost_one(probs[3]);
+  }
 
-  return cost;
-}
+  if (count2 > 0) {
+    count1 = segcounts[4] + segcounts[5];
+    count2 = segcounts[6] + segcounts[7];
+    cost += count1 * vp9_cost_zero(probs[4]) +
+            count2 * vp9_cost_one(probs[4]);
 
-// Based on set of segment counts calculate a probability tree
-static void calc_segtree_probs_pred(MACROBLOCKD *xd,
-                                    int (*segcounts)[MAX_MB_SEGMENTS],
-                                    vp9_prob *segment_tree_probs,
-                                    vp9_prob *mod_probs) {
-  int count[4];
-
-  assert(!segcounts[0][0] && !segcounts[1][1] &&
-         !segcounts[2][2] && !segcounts[3][3]);
-
-  // Total count for all segments
-  count[0] = segcounts[3][0] + segcounts[1][0] + segcounts[2][0];
-  count[1] = segcounts[2][1] + segcounts[0][1] + segcounts[3][1];
-  count[2] = segcounts[0][2] + segcounts[3][2] + segcounts[1][2];
-  count[3] = segcounts[1][3] + segcounts[2][3] + segcounts[0][3];
-
-  // Work out probabilities of each segment
-  segment_tree_probs[0] = get_binary_prob(count[0] + count[1],
-                                          count[2] + count[3]);
-  segment_tree_probs[1] = get_binary_prob(count[0], count[1]);
-  segment_tree_probs[2] = get_binary_prob(count[2], count[3]);
-
-  // now work out modified counts that the decoder would have
-  count[0] =        segment_tree_probs[0]  *        segment_tree_probs[1];
-  count[1] =        segment_tree_probs[0]  * (256 - segment_tree_probs[1]);
-  count[2] = (256 - segment_tree_probs[0]) *        segment_tree_probs[2];
-  count[3] = (256 - segment_tree_probs[0]) * (256 - segment_tree_probs[2]);
-
-  // Work out modified probabilties depending on what segment was predicted
-  mod_probs[0] = get_binary_prob(count[1], count[2] + count[3]);
-  mod_probs[1] = get_binary_prob(count[0], count[2] + count[3]);
-  mod_probs[2] = get_binary_prob(count[0] + count[1], count[3]);
-  mod_probs[3] = get_binary_prob(count[0] + count[1], count[2]);
-}
-
-// Based on set of segment counts and probabilities calculate a cost estimate
-static int cost_segmap_pred(MACROBLOCKD *xd,
-                            int (*segcounts)[MAX_MB_SEGMENTS],
-                            vp9_prob *probs, vp9_prob *mod_probs) {
-  int pred_seg, cost = 0;
-
-  for (pred_seg = 0; pred_seg < MAX_MB_SEGMENTS; pred_seg++) {
-    int count1, count2;
-
-    // Cost the top node of the tree
-    count1 = segcounts[pred_seg][0] + segcounts[pred_seg][1];
-    count2 = segcounts[pred_seg][2] + segcounts[pred_seg][3];
-    cost += count1 * vp9_cost_zero(mod_probs[pred_seg]) +
-            count2 * vp9_cost_one(mod_probs[pred_seg]);
-
-    // Now add the cost of each individual segment branch
-    if (pred_seg >= 2 && count1) {
-      cost += segcounts[pred_seg][0] * vp9_cost_zero(probs[1]) +
-              segcounts[pred_seg][1] * vp9_cost_one(probs[1]);
-    } else if (pred_seg < 2 && count2 > 0) {
-      cost += segcounts[pred_seg][2] * vp9_cost_zero(probs[2]) +
-              segcounts[pred_seg][3] * vp9_cost_one(probs[2]);
-    }
+    if (count1 > 0)
+      cost += segcounts[4] * vp9_cost_zero(probs[5]) +
+              segcounts[5] * vp9_cost_one(probs[5]);
+    if (count2 > 0)
+      cost += segcounts[6] * vp9_cost_zero(probs[6]) +
+              segcounts[7] * vp9_cost_one(probs[6]);
   }
 
   return cost;
@@ -167,15 +126,14 @@
                        MODE_INFO *mi,
                        int *no_pred_segcounts,
                        int (*temporal_predictor_count)[2],
-                       int (*t_unpred_seg_counts)[MAX_MB_SEGMENTS],
+                       int *t_unpred_seg_counts,
                        int bw, int bh, int mb_row, int mb_col) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
   const int segment_id = mi->mbmi.segment_id;
 
   xd->mode_info_context = mi;
-  set_mb_row(cm, xd, mb_row, bh);
-  set_mb_col(cm, xd, mb_col, bw);
+  set_mb_row_col(cm, xd, mb_row, bh, mb_col, bw);
 
   // Count the number of hits on each segment with no prediction
   no_pred_segcounts[segment_id]++;
@@ -197,7 +155,7 @@
 
     if (!seg_predicted)
       // Update the "unpredicted" segment count
-      t_unpred_seg_counts[pred_seg_id][segment_id]++;
+      t_unpred_seg_counts[segment_id]++;
   }
 }
 
@@ -213,11 +171,10 @@
 
   int temporal_predictor_count[PREDICTION_PROBS][2];
   int no_pred_segcounts[MAX_MB_SEGMENTS];
-  int t_unpred_seg_counts[MAX_MB_SEGMENTS][MAX_MB_SEGMENTS];
+  int t_unpred_seg_counts[MAX_MB_SEGMENTS];
 
-  vp9_prob no_pred_tree[MB_FEATURE_TREE_PROBS];
-  vp9_prob t_pred_tree[MB_FEATURE_TREE_PROBS];
-  vp9_prob t_pred_tree_mod[MAX_MB_SEGMENTS];
+  vp9_prob no_pred_tree[MB_SEG_TREE_PROBS];
+  vp9_prob t_pred_tree[MB_SEG_TREE_PROBS];
   vp9_prob t_nopred_prob[PREDICTION_PROBS];
 
   const int mis = cm->mode_info_stride;
@@ -332,10 +289,8 @@
   if (cm->frame_type != KEY_FRAME) {
     // Work out probability tree for coding those segments not
     // predicted using the temporal method and the cost.
-    calc_segtree_probs_pred(xd, t_unpred_seg_counts, t_pred_tree,
-                            t_pred_tree_mod);
-    t_pred_cost = cost_segmap_pred(xd, t_unpred_seg_counts, t_pred_tree,
-                                   t_pred_tree_mod);
+    calc_segtree_probs(xd, t_unpred_seg_counts, t_pred_tree);
+    t_pred_cost = cost_segmap(xd, t_unpred_seg_counts, t_pred_tree);
 
     // Add in the cost of the signalling for each prediction context
     for (i = 0; i < PREDICTION_PROBS; i++) {
@@ -355,8 +310,6 @@
     cm->temporal_update = 1;
     vpx_memcpy(xd->mb_segment_tree_probs,
                t_pred_tree, sizeof(t_pred_tree));
-    vpx_memcpy(xd->mb_segment_mispred_tree_probs,
-               t_pred_tree_mod, sizeof(t_pred_tree_mod));
     vpx_memcpy(&cm->segment_pred_probs,
                t_nopred_prob, sizeof(t_nopred_prob));
   } else {
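
For context, with MAX_MB_SEGMENTS raised to 8 the segment id is coded with a full three-level binary tree, which is why calc_segtree_probs above now fills seven node probabilities (MB_SEG_TREE_PROBS). The standalone sketch below is not from the patch; it only shows the bit path each segment takes through that tree, matching the node order used by the functions above.

#include <stdio.h>

#define MAX_MB_SEGMENTS 8   /* eight leaves -> seven internal nodes/probs */

/* Fills the three tree bits (MSB first) used to code segment_id.
 * Bit 0 is coded with probs[0]; bit 1 with probs[1] (left half) or
 * probs[4] (right half); bit 2 with probs[2]/probs[3]/probs[5]/probs[6]. */
static void seg_tree_bits(int segment_id, int bits[3]) {
  bits[0] = (segment_id >> 2) & 1;  /* segments 0-3 vs 4-7 */
  bits[1] = (segment_id >> 1) & 1;  /* which pair within that half */
  bits[2] = segment_id & 1;         /* which leaf within the pair */
}

int main(void) {
  int s, bits[3];
  for (s = 0; s < MAX_MB_SEGMENTS; s++) {
    seg_tree_bits(s, bits);
    printf("segment %d -> bits %d%d%d\n", s, bits[0], bits[1], bits[2]);
  }
  return 0;
}
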
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index 6149518..a6c5f71 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -51,8 +51,7 @@
                             &mv,
                             &xd->scale_factor[which_mv],
                             16, 16,
-                            which_mv <<
-                            (2 * CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT),
+                            which_mv,
                             &xd->subpix);
 
   stride = (stride + 1) >> 1;
@@ -62,8 +61,7 @@
                                &mv,
                                &xd->scale_factor_uv[which_mv],
                                8, 8,
-                               which_mv <<
-                               (2 * CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT),
+                               which_mv,
                                &xd->subpix);
 
   vp9_build_inter_predictor_q4(v_mb_ptr, stride,
@@ -71,8 +69,7 @@
                                &mv,
                                &xd->scale_factor_uv[which_mv],
                                8, 8,
-                               which_mv <<
-                               (2 * CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT),
+                               which_mv,
                                &xd->subpix);
 }
 
@@ -221,9 +218,9 @@
   DECLARE_ALIGNED_ARRAY(16, uint8_t,  predictor, 16 * 16 + 8 * 8 + 8 * 8);
 
   // Save input state
-  uint8_t *y_buffer = mbd->pre.y_buffer;
-  uint8_t *u_buffer = mbd->pre.u_buffer;
-  uint8_t *v_buffer = mbd->pre.v_buffer;
+  uint8_t *y_buffer = mbd->plane[0].pre[0].buf;
+  uint8_t *u_buffer = mbd->plane[1].pre[0].buf;
+  uint8_t *v_buffer = mbd->plane[2].pre[0].buf;
 
   for (mb_row = 0; mb_row < mb_rows; mb_row++) {
 #if ALT_REF_MC_ENABLED
@@ -368,9 +365,9 @@
   }
 
   // Restore input state
-  mbd->pre.y_buffer = y_buffer;
-  mbd->pre.u_buffer = u_buffer;
-  mbd->pre.v_buffer = v_buffer;
+  mbd->plane[0].pre[0].buf = y_buffer;
+  mbd->plane[1].pre[0].buf = u_buffer;
+  mbd->plane[2].pre[0].buf = v_buffer;
 }
 
 void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) {
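
With the named y/u/v fields gone, the buffer bookkeeping above is the same pointer save/restore applied per plane. A compact sketch of that pattern, assuming the three-plane layout shown in the hunks and that vp9_blockd.h (MACROBLOCKD with plane[].pre[] buffers) is available; the helper name is made up for illustration:

#include "vp9/common/vp9_blockd.h"  /* MACROBLOCKD, per-plane pre[] buffers */

/* Save the first prediction-buffer pointer of each plane (Y, U, V), run the
 * per-macroblock filtering, then restore them, mirroring the explicit
 * save/restore replaced in the diff above. */
static void with_saved_pre_bufs(MACROBLOCKD *mbd,
                                void (*filter_body)(MACROBLOCKD *)) {
  uint8_t *saved[3];
  int p;

  for (p = 0; p < 3; ++p)
    saved[p] = mbd->plane[p].pre[0].buf;    /* save input state */

  filter_body(mbd);                         /* retargets pre[0].buf per MB */

  for (p = 0; p < 3; ++p)
    mbd->plane[p].pre[0].buf = saved[p];    /* restore input state */
}
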
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 398b4bb..6f2cbbf 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -36,21 +36,6 @@
 extern vp9_coeff_stats tree_update_hist_32x32[BLOCK_TYPES];
 #endif  /* ENTROPY_STATS */
 
-#if CONFIG_CODE_NONZEROCOUNT
-#ifdef NZC_STATS
-unsigned int nzc_counts_4x4[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                           [NZC4X4_TOKENS];
-unsigned int nzc_counts_8x8[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                           [NZC8X8_TOKENS];
-unsigned int nzc_counts_16x16[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                             [NZC16X16_TOKENS];
-unsigned int nzc_counts_32x32[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                             [NZC32X32_TOKENS];
-unsigned int nzc_pcat_counts[MAX_NZC_CONTEXTS][NZC_TOKENS_EXTRA]
-                            [NZC_BITS_EXTRA][2];
-#endif
-#endif
-
 static TOKENVALUE dct_value_tokens[DCT_MAX_VALUE * 2];
 const TOKENVALUE *vp9_dct_value_tokens_ptr;
 static int dct_value_cost[DCT_MAX_VALUE * 2];
@@ -121,7 +106,7 @@
                        int dry_run) {
   MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
   int pt; /* near block/prev token context index */
-  int c = 0;
+  int c = 0, rc = 0;
   TOKENEXTRA *t = *tp;        /* store tokens starting here */
   const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, ib);
   const int eob = xd->plane[pb_idx.plane].eobs[pb_idx.block];
@@ -132,15 +117,23 @@
   const BLOCK_SIZE_TYPE sb_type = mbmi->sb_type;
   const int *scan, *nb;
   vp9_coeff_count *counts;
-  vp9_coeff_probs *probs;
+  vp9_coeff_probs *coef_probs;
   const int ref = mbmi->ref_frame != INTRA_FRAME;
   ENTROPY_CONTEXT *a, *l, *a1, *l1, *a2, *l2, *a3, *l3, a_ec, l_ec;
   uint8_t token_cache[1024];
-#if CONFIG_CODE_NONZEROCOUNT
-  const int nzc_used = get_nzc_used(tx_size);
-  int zerosleft = 0, nzc = 0;
-  if (eob == 0)
-    assert(xd->nzcs[ib] == 0);
+  TX_TYPE tx_type = DCT_DCT;
+#if CONFIG_CODE_ZEROGROUP
+  int last_nz_pos[3] = {-1, -1, -1};  // Encoder only
+  int is_eoo_list[3] = {0, 0, 0};
+  int is_last_zero[3] = {0, 0, 0};
+  int is_eoo_negative[3] = {0, 0, 0};
+  int o;
+  vp9_zpc_probs *zpc_probs;
+  vp9_zpc_count *zpc_count;
+  uint8_t token_cache_full[1024];
+#endif
+#if CONFIG_CODE_ZEROGROUP
+  vpx_memset(token_cache, UNKNOWN_TOKEN, sizeof(token_cache));
 #endif
 
   assert((!type && !pb_idx.plane) || (type && pb_idx.plane));
@@ -206,48 +199,42 @@
   switch (tx_size) {
     default:
     case TX_4X4: {
-      const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
-                              get_tx_type_4x4(xd, ib) : DCT_DCT;
+      tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
+          get_tx_type_4x4(xd, ib) : DCT_DCT;
       a_ec = *a;
       l_ec = *l;
       seg_eob = 16;
-      scan = vp9_default_zig_zag1d_4x4;
-      if (tx_type != DCT_DCT) {
-        if (tx_type == ADST_DCT) {
-          scan = vp9_row_scan_4x4;
-        } else if (tx_type == DCT_ADST) {
-          scan = vp9_col_scan_4x4;
-        }
-      }
+      scan = get_scan_4x4(tx_type);
       counts = cpi->coef_counts_4x4;
-      probs = cpi->common.fc.coef_probs_4x4;
+      coef_probs = cpi->common.fc.coef_probs_4x4;
+#if CONFIG_CODE_ZEROGROUP
+      zpc_count = &cpi->common.fc.zpc_counts_4x4;
+      zpc_probs = &cpi->common.fc.zpc_probs_4x4;
+#endif
       break;
     }
     case TX_8X8: {
       const int sz = 3 + mb_width_log2(sb_type);
       const int x = ib & ((1 << sz) - 1), y = ib - x;
-      const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
-                              get_tx_type_8x8(xd, y + (x >> 1)) : DCT_DCT;
+      tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
+          get_tx_type_8x8(xd, y + (x >> 1)) : DCT_DCT;
       a_ec = (a[0] + a[1]) != 0;
       l_ec = (l[0] + l[1]) != 0;
       seg_eob = 64;
-      scan = vp9_default_zig_zag1d_8x8;
-      if (tx_type != DCT_DCT) {
-        if (tx_type == ADST_DCT) {
-          scan = vp9_row_scan_8x8;
-        } else if (tx_type == DCT_ADST) {
-          scan = vp9_col_scan_8x8;
-        }
-      }
+      scan = get_scan_8x8(tx_type);
       counts = cpi->coef_counts_8x8;
-      probs = cpi->common.fc.coef_probs_8x8;
+      coef_probs = cpi->common.fc.coef_probs_8x8;
+#if CONFIG_CODE_ZEROGROUP
+      zpc_count = &cpi->common.fc.zpc_counts_8x8;
+      zpc_probs = &cpi->common.fc.zpc_probs_8x8;
+#endif
       break;
     }
     case TX_16X16: {
       const int sz = 4 + mb_width_log2(sb_type);
       const int x = ib & ((1 << sz) - 1), y = ib - x;
-      const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
-                              get_tx_type_16x16(xd, y + (x >> 2)) : DCT_DCT;
+      tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
+          get_tx_type_16x16(xd, y + (x >> 2)) : DCT_DCT;
       if (type != PLANE_TYPE_UV) {
         a_ec = (a[0] + a[1] + a[2] + a[3]) != 0;
         l_ec = (l[0] + l[1] + l[2] + l[3]) != 0;
@@ -256,16 +243,13 @@
         l_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0;
       }
       seg_eob = 256;
-      scan = vp9_default_zig_zag1d_16x16;
-      if (tx_type != DCT_DCT) {
-        if (tx_type == ADST_DCT) {
-          scan = vp9_row_scan_16x16;
-        } else if (tx_type == DCT_ADST) {
-          scan = vp9_col_scan_16x16;
-        }
-      }
+      scan = get_scan_16x16(tx_type);
       counts = cpi->coef_counts_16x16;
-      probs = cpi->common.fc.coef_probs_16x16;
+      coef_probs = cpi->common.fc.coef_probs_16x16;
+#if CONFIG_CODE_ZEROGROUP
+      zpc_count = &cpi->common.fc.zpc_counts_16x16;
+      zpc_probs = &cpi->common.fc.zpc_probs_16x16;
+#endif
       break;
     }
     case TX_32X32:
@@ -283,7 +267,11 @@
       seg_eob = 1024;
       scan = vp9_default_zig_zag1d_32x32;
       counts = cpi->coef_counts_32x32;
-      probs = cpi->common.fc.coef_probs_32x32;
+      coef_probs = cpi->common.fc.coef_probs_32x32;
+#if CONFIG_CODE_ZEROGROUP
+      zpc_count = &cpi->common.fc.zpc_counts_32x32;
+      zpc_probs = &cpi->common.fc.zpc_probs_32x32;
+#endif
       break;
   }
 
@@ -294,56 +282,129 @@
   if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP))
     seg_eob = 0;
 
+#if CONFIG_CODE_ZEROGROUP
+  vpx_memset(token_cache_full, ZERO_TOKEN, sizeof(token_cache_full));
+  for (c = 0; c < eob; ++c) {
+    rc = scan[c];
+    token_cache_full[rc] = vp9_dct_value_tokens_ptr[qcoeff_ptr[rc]].token;
+    o = vp9_get_orientation(rc, tx_size);
+    if (qcoeff_ptr[rc] != 0) {
+      last_nz_pos[o] = c;
+    }
+  }
+#endif
+  c = 0;
   do {
     const int band = get_coef_band(scan, tx_size, c);
     int token;
     int v = 0;
-#if CONFIG_CODE_NONZEROCOUNT
-    if (nzc_used)
-      zerosleft = seg_eob - xd->nzcs[ib] - c + nzc;
-#endif
+    rc = scan[c];
+    if (c)
+      pt = vp9_get_coef_context(scan, nb, pad, token_cache, c, default_eob);
     if (c < eob) {
-      const int rc = scan[c];
       v = qcoeff_ptr[rc];
       assert(-DCT_MAX_VALUE <= v  &&  v < DCT_MAX_VALUE);
 
       t->extra = vp9_dct_value_tokens_ptr[v].extra;
       token    = vp9_dct_value_tokens_ptr[v].token;
     } else {
-#if CONFIG_CODE_NONZEROCOUNT
-      if (nzc_used)
-        break;
-      else
-#endif
-        token = DCT_EOB_TOKEN;
+      token = DCT_EOB_TOKEN;
     }
 
     t->token = token;
-    t->context_tree = probs[type][ref][band][pt];
-#if CONFIG_CODE_NONZEROCOUNT
-    // Skip zero node if there are no zeros left
-    if (nzc_used)
-      t->skip_eob_node = 1 + (zerosleft == 0);
-    else
-#endif
-      t->skip_eob_node = (c > 0) && (token_cache[c - 1] == 0);
+    t->context_tree = coef_probs[type][ref][band][pt];
+    t->skip_eob_node = (c > 0) && (token_cache[scan[c - 1]] == 0);
     assert(vp9_coef_encodings[t->token].len - t->skip_eob_node > 0);
+#if CONFIG_CODE_ZEROGROUP
+    o = vp9_get_orientation(rc, tx_size);
+    t->skip_coef_val = (token_cache[rc] == ZERO_TOKEN || is_eoo_list[o]);
+    if (t->skip_coef_val) {
+      assert(v == 0);
+    }
+    // No need to transmit any token
+    if (t->skip_eob_node && t->skip_coef_val) {
+      assert(token == ZERO_TOKEN);
+      is_last_zero[o] = 1;
+      token_cache[scan[c]] = ZERO_TOKEN;
+      continue;
+    }
+#endif
     if (!dry_run) {
       ++counts[type][ref][band][pt][token];
       if (!t->skip_eob_node)
         ++cpi->common.fc.eob_branch_counts[tx_size][type][ref][band][pt];
     }
-#if CONFIG_CODE_NONZEROCOUNT
-    nzc += (v != 0);
+    token_cache[scan[c]] = token;
+#if CONFIG_CODE_ZEROGROUP
+    if (token == ZERO_TOKEN && !t->skip_coef_val) {
+      int eoo = 0, use_eoo;
+#if USE_ZPC_EOORIENT == 1
+      use_eoo = vp9_use_eoo(c, seg_eob, scan, tx_size,
+                            is_last_zero, is_eoo_list);
+#else
+      use_eoo = 0;
 #endif
-    token_cache[c] = token;
-
-    pt = vp9_get_coef_context(scan, nb, pad, token_cache, c + 1, default_eob);
+      if (use_eoo) {
+        eoo = vp9_is_eoo(c, eob, scan, tx_size, qcoeff_ptr, last_nz_pos);
+        if (eoo && is_eoo_negative[o]) eoo = 0;
+        if (eoo) {
+          int c_;
+          int savings = 0;
+          int zsaved = 0;
+          savings =
+              vp9_cost_bit((*zpc_probs)[ref]
+                           [coef_to_zpc_band(band)]
+                           [coef_to_zpc_ptok(pt)][0], 1) -
+              vp9_cost_bit((*zpc_probs)[ref]
+                           [coef_to_zpc_band(band)]
+                           [coef_to_zpc_ptok(pt)][0], 0);
+          for (c_ = c + 1; c_ < eob; ++c_) {
+            if (o == vp9_get_orientation(scan[c_], tx_size)) {
+              int pt_ = vp9_get_coef_context(scan, nb, pad, token_cache_full,
+                                             c_, default_eob);
+              int band_ = get_coef_band(scan, tx_size, c_);
+              assert(token_cache_full[scan[c_]] == ZERO_TOKEN);
+              if (!c_ || token_cache_full[scan[c_ - 1]])
+                savings +=
+                    vp9_cost_bit(coef_probs[type][ref][band_][pt_][0], 1);
+              savings += vp9_cost_bit(coef_probs[type][ref][band_][pt_][1], 0);
+              zsaved++;
+            }
+          }
+          /*
+          if (!dry_run)
+            if (savings > 0)
+              printf("savings %d zsaved %d (%d, %d)\n",
+                     savings, zsaved, tx_size, band);
+                     */
+          if (savings < 0) {
+            eoo = 0;
+            is_eoo_negative[o] = 1;
+          }
+        }
+      }
+      if (use_eoo) {
+        t++;
+        t->skip_eob_node = t->skip_coef_val = 0;
+        // Transmit the zero-group symbol: end-of-orientation (EOO) or isolated zero
+        t->token = !eoo ? ZPC_ISOLATED : ZPC_EOORIENT;
+        t->context_tree = &((*zpc_probs)[ref]
+                            [coef_to_zpc_band(band)]
+                            [coef_to_zpc_ptok(pt)][0]);
+        if (!dry_run)
+          (*zpc_count)[ref]
+              [coef_to_zpc_band(band)]
+              [coef_to_zpc_ptok(pt)][0][!eoo]++;
+        if (eoo) {
+          assert(is_eoo_list[o] == 0);
+          is_eoo_list[o] = 1;
+        }
+      }
+    }
+    is_last_zero[o] = (token == ZERO_TOKEN);
+#endif
     ++t;
   } while (c < eob && ++c < seg_eob);
-#if CONFIG_CODE_NONZEROCOUNT
-  assert(nzc == xd->nzcs[ib]);
-#endif
 
   *tp = t;
   a_ec = l_ec = (c > 0); /* 0 <-> all coeff data is zero */
@@ -720,248 +781,3 @@
 void vp9_tokenize_initialize() {
   fill_value_tokens();
 }
-
-static void stuff_b(VP9_COMP *cpi,
-                    MACROBLOCKD *xd,
-                    const int ib,
-                    TOKENEXTRA **tp,
-                    PLANE_TYPE type,
-                    TX_SIZE tx_size,
-                    int dry_run) {
-  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
-  const BLOCK_SIZE_TYPE sb_type = mbmi->sb_type;
-  vp9_coeff_count *counts;
-  vp9_coeff_probs *probs;
-  int pt, band;
-  TOKENEXTRA *t = *tp;
-  const int ref = mbmi->ref_frame != INTRA_FRAME;
-  ENTROPY_CONTEXT *a, *l, *a1, *l1, *a2, *l2, *a3, *l3, a_ec, l_ec;
-#if CONFIG_CODE_NONZEROCOUNT
-  const int nzc_used = get_nzc_used(tx_size);
-#endif
-
-  if (sb_type == BLOCK_SIZE_SB64X64) {
-    a = (ENTROPY_CONTEXT *)xd->above_context +
-                                             vp9_block2above_sb64[tx_size][ib];
-    l = (ENTROPY_CONTEXT *)xd->left_context + vp9_block2left_sb64[tx_size][ib];
-    a1 = a + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    l1 = l + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    a2 = a1 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    l2 = l1 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    a3 = a2 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    l3 = l2 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-  } else if (sb_type == BLOCK_SIZE_SB32X32) {
-    a = (ENTROPY_CONTEXT *)xd->above_context + vp9_block2above_sb[tx_size][ib];
-    l = (ENTROPY_CONTEXT *)xd->left_context + vp9_block2left_sb[tx_size][ib];
-    a1 = a + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    l1 = l + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    a2 = l2 = a3 = l3 = NULL;
-  } else {
-    a = (ENTROPY_CONTEXT *)xd->above_context + vp9_block2above[tx_size][ib];
-    l = (ENTROPY_CONTEXT *)xd->left_context + vp9_block2left[tx_size][ib];
-    a1 = l1 = a2 = l2 = a3 = l3 = NULL;
-  }
-
-  switch (tx_size) {
-    default:
-    case TX_4X4:
-      a_ec = a[0];
-      l_ec = l[0];
-      counts = cpi->coef_counts_4x4;
-      probs = cpi->common.fc.coef_probs_4x4;
-      break;
-    case TX_8X8:
-      a_ec = (a[0] + a[1]) != 0;
-      l_ec = (l[0] + l[1]) != 0;
-      counts = cpi->coef_counts_8x8;
-      probs = cpi->common.fc.coef_probs_8x8;
-      break;
-    case TX_16X16:
-      if (type != PLANE_TYPE_UV) {
-        a_ec = (a[0] + a[1] + a[2] + a[3]) != 0;
-        l_ec = (l[0] + l[1] + l[2] + l[3]) != 0;
-      } else {
-        a_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0;
-        l_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0;
-      }
-      counts = cpi->coef_counts_16x16;
-      probs = cpi->common.fc.coef_probs_16x16;
-      break;
-    case TX_32X32:
-      if (type != PLANE_TYPE_UV) {
-        a_ec = (a[0] + a[1] + a[2] + a[3] +
-                a1[0] + a1[1] + a1[2] + a1[3]) != 0;
-        l_ec = (l[0] + l[1] + l[2] + l[3] +
-                l1[0] + l1[1] + l1[2] + l1[3]) != 0;
-      } else {
-        a_ec = (a[0] + a[1] + a1[0] + a1[1] +
-                a2[0] + a2[1] + a3[0] + a3[1]) != 0;
-        l_ec = (l[0] + l[1] + l1[0] + l1[1] +
-                l2[0] + l2[1] + l3[0] + l3[1]) != 0;
-      }
-      counts = cpi->coef_counts_32x32;
-      probs = cpi->common.fc.coef_probs_32x32;
-      break;
-  }
-
-#if CONFIG_CODE_NONZEROCOUNT
-  if (!nzc_used) {
-#endif
-    pt = combine_entropy_contexts(a_ec, l_ec);
-    band = 0;
-    t->token = DCT_EOB_TOKEN;
-    t->context_tree = probs[type][ref][band][pt];
-    t->skip_eob_node = 0;
-    ++t;
-    *tp = t;
-    if (!dry_run) {
-      ++counts[type][ref][band][pt][DCT_EOB_TOKEN];
-    }
-#if CONFIG_CODE_NONZEROCOUNT
-  }
-#endif
-    *a = *l = 0;
-  if (tx_size == TX_8X8) {
-    a[1] = 0;
-    l[1] = 0;
-  } else if (tx_size == TX_16X16) {
-    if (type != PLANE_TYPE_UV) {
-      a[1] = a[2] = a[3] = 0;
-      l[1] = l[2] = l[3] = 0;
-    } else {
-      a1[0] = a1[1] = a[1] = a_ec;
-      l1[0] = l1[1] = l[1] = l_ec;
-    }
-  } else if (tx_size == TX_32X32) {
-    if (type != PLANE_TYPE_Y_WITH_DC) {
-      a[1] = a[2] = a[3] = a_ec;
-      l[1] = l[2] = l[3] = l_ec;
-      a1[0] = a1[1] = a1[2] = a1[3] = a_ec;
-      l1[0] = l1[1] = l1[2] = l1[3] = l_ec;
-    } else {
-      a[1] = a1[0] = a1[1] = a_ec;
-      l[1] = l1[0] = l1[1] = l_ec;
-      a2[0] = a2[1] = a3[0] = a3[1] = a_ec;
-      l2[0] = l2[1] = l3[0] = l3[1] = l_ec;
-    }
-  }
-}
-
-static void stuff_mb_8x8(VP9_COMP *cpi, MACROBLOCKD *xd,
-                         TOKENEXTRA **t, int dry_run) {
-  int b;
-
-  for (b = 0; b < 16; b += 4)
-    stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_8X8, dry_run);
-  for (b = 16; b < 24; b += 4)
-    stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_8X8, dry_run);
-}
-
-static void stuff_mb_16x16(VP9_COMP *cpi, MACROBLOCKD *xd,
-                           TOKENEXTRA **t, int dry_run) {
-  int b;
-  stuff_b(cpi, xd, 0, t, PLANE_TYPE_Y_WITH_DC, TX_16X16, dry_run);
-
-  for (b = 16; b < 24; b += 4) {
-    stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_8X8, dry_run);
-  }
-}
-
-static void stuff_mb_4x4(VP9_COMP *cpi, MACROBLOCKD *xd,
-                         TOKENEXTRA **t, int dry_run) {
-  int b;
-
-  for (b = 0; b < 16; b++)
-    stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_4X4, dry_run);
-  for (b = 16; b < 24; b++)
-    stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_4X4, dry_run);
-}
-
-static void stuff_mb_8x8_4x4uv(VP9_COMP *cpi, MACROBLOCKD *xd,
-                               TOKENEXTRA **t, int dry_run) {
-  int b;
-
-  for (b = 0; b < 16; b += 4)
-    stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_8X8, dry_run);
-  for (b = 16; b < 24; b++)
-    stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_4X4, dry_run);
-}
-
-void vp9_stuff_mb(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run) {
-  TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
-  TOKENEXTRA * const t_backup = *t;
-
-  if (tx_size == TX_16X16) {
-    stuff_mb_16x16(cpi, xd, t, dry_run);
-  } else if (tx_size == TX_8X8) {
-    if (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
-        xd->mode_info_context->mbmi.mode == SPLITMV) {
-      stuff_mb_8x8_4x4uv(cpi, xd, t, dry_run);
-    } else {
-      stuff_mb_8x8(cpi, xd, t, dry_run);
-    }
-  } else {
-    stuff_mb_4x4(cpi, xd, t, dry_run);
-  }
-
-  if (dry_run) {
-    *t = t_backup;
-  }
-}
-
-void vp9_stuff_sb(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run,
-                  BLOCK_SIZE_TYPE bsize) {
-  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
-  const int bwl = mb_width_log2(bsize) + 2, bhl = mb_height_log2(bsize) + 2;
-  const TX_SIZE txfm_size = mbmi->txfm_size;
-  const TX_SIZE uv_txfm_size = (bsize < BLOCK_SIZE_SB32X32 &&
-                                txfm_size == TX_16X16) ? TX_8X8 :
-                               (bsize < BLOCK_SIZE_SB64X64 &&
-                                txfm_size == TX_32X32) ? TX_16X16 : txfm_size;
-  int b;
-  const int n_y = (1 << (bwl + bhl)), n_uv = (n_y * 3) >> 1;
-  TOKENEXTRA * const t_backup = *t;
-
-  switch (txfm_size) {
-    case TX_32X32:
-      for (b = 0; b < n_y; b += 64)
-        stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_32X32, dry_run);
-      if (uv_txfm_size == TX_32X32) {
-        assert(bsize == BLOCK_SIZE_SB64X64);
-        stuff_b(cpi, xd, 256, t, PLANE_TYPE_UV, TX_32X32, dry_run);
-        stuff_b(cpi, xd, 320, t, PLANE_TYPE_UV, TX_32X32, dry_run);
-      } else {
-        for (; b < n_uv; b += 16)
-          stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_16X16, dry_run);
-      }
-      break;
-    case TX_16X16:
-      for (b = 0; b < n_y; b += 16)
-        stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_16X16, dry_run);
-      if (uv_txfm_size == TX_16X16) {
-        for (; b < n_uv; b += 16)
-          stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_16X16, dry_run);
-      } else {
-        for (; b < n_uv; b += 4)
-          stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_8X8, dry_run);
-      }
-      break;
-    case TX_8X8:
-      for (b = 0; b < n_y; b += 4)
-        stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_8X8, dry_run);
-      for (; b < n_uv; b += 4)
-        stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_8X8, dry_run);
-      break;
-    case TX_4X4:
-      for (b = 0; b < n_y; b++)
-        stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_4X4, dry_run);
-      for (; b < n_uv; b++)
-        stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_4X4, dry_run);
-      break;
-    default: assert(0);
-  }
-
-  if (dry_run) {
-    *t = t_backup;
-  }
-}
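
All three deleted if/else chains earlier in this file (4x4, 8x8, 16x16) mapped the transform type to a scan order in the same way, which is what the new get_scan_*() calls fold away. A sketch of what such a helper amounts to for the 4x4 case, using the tables named in the removed lines and assuming the headers that declare TX_TYPE and the scan tables are included; the real helper lives in the common scan code, so treat this as an illustration of the mapping, not its definition:

/* ADST_DCT selects the row scan, DCT_ADST the column scan, and every other
 * transform type (including plain DCT_DCT) the default zig-zag scan,
 * exactly the mapping spelled out by the deleted branches. */
static const int *get_scan_4x4_sketch(TX_TYPE tx_type) {
  switch (tx_type) {
    case ADST_DCT:
      return vp9_row_scan_4x4;
    case DCT_ADST:
      return vp9_col_scan_4x4;
    default:
      return vp9_default_zig_zag1d_4x4;
  }
}
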
diff --git a/vp9/encoder/vp9_tokenize.h b/vp9/encoder/vp9_tokenize.h
index 82d798e..8165348 100644
--- a/vp9/encoder/vp9_tokenize.h
+++ b/vp9/encoder/vp9_tokenize.h
@@ -26,6 +26,9 @@
   int16_t         extra;
   uint8_t         token;
   uint8_t         skip_eob_node;
+#if CONFIG_CODE_ZEROGROUP
+  uint8_t         skip_coef_val;
+#endif
 } TOKENEXTRA;
 
 typedef int64_t vp9_coeff_accum[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
@@ -41,11 +44,6 @@
 void vp9_tokenize_sb(struct VP9_COMP *cpi, MACROBLOCKD *xd,
                      TOKENEXTRA **t, int dry_run, BLOCK_SIZE_TYPE bsize);
 
-void vp9_stuff_mb(struct VP9_COMP *cpi, MACROBLOCKD *xd,
-                  TOKENEXTRA **t, int dry_run);
-void vp9_stuff_sb(struct VP9_COMP *cpi, MACROBLOCKD *xd,
-                  TOKENEXTRA **t, int dry_run, BLOCK_SIZE_TYPE bsize);
-
 #ifdef ENTROPY_STATS
 void init_context_counters();
 void print_context_counters();
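
skip_eob_node is set when the previous coefficient in scan order was zero; the new skip_coef_val flag (CONFIG_CODE_ZEROGROUP only) marks positions whose value is already known to be zero from the token cache or an end-of-orientation group. When both are set, the tokenizer above drops the token entirely, which a bit-writer can mirror with a check like this sketch (hypothetical helper, shown only to make the two flags' interaction explicit):

#if CONFIG_CODE_ZEROGROUP
/* A token still needs to be written unless both parts of its coding are
 * implied by context: the EOB branch (skip_eob_node) and the zero value
 * (skip_coef_val). Mirrors the "No need to transmit any token" path in
 * vp9_tokenize.c. */
static int token_carries_information(const TOKENEXTRA *t) {
  return !(t->skip_eob_node && t->skip_coef_val);
}
#endif
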
diff --git a/vp9/encoder/x86/vp9_x86_csystemdependent.c b/vp9/encoder/x86/vp9_x86_csystemdependent.c
index 04383fc..6016e14 100644
--- a/vp9/encoder/x86/vp9_x86_csystemdependent.c
+++ b/vp9/encoder/x86/vp9_x86_csystemdependent.c
@@ -17,7 +17,7 @@
 
 // TODO(jimbankoski) Consider rewriting the c to take the same values rather
 // than going through these pointer conversions
-#if HAVE_MMX
+#if 0 && HAVE_MMX
 void vp9_short_fdct8x4_mmx(short *input, short *output, int pitch) {
   vp9_short_fdct4x4_mmx(input,   output,    pitch);
   vp9_short_fdct4x4_mmx(input + 4, output + 16, pitch);
@@ -38,7 +38,7 @@
 
 #endif
 
-#if HAVE_SSE2
+#if 0 && HAVE_SSE2
 void vp9_subtract_b_sse2_impl(unsigned char *z,  int src_stride,
                               short *diff, unsigned char *predictor,
                               int pitch);
diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h
index 7f19dd0..f8e2ef9 100644
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h
@@ -215,9 +215,13 @@
   unsigned char *roi_map;      /**< specify an id between 0 and 3 for each 16x16 region within a frame */
   unsigned int   rows;         /**< number of rows */
   unsigned int   cols;         /**< number of cols */
-  int     delta_q[4];          /**< quantizer delta [-63, 63] off baseline for regions with id between 0 and 3*/
-  int     delta_lf[4];         /**< loop filter strength delta [-63, 63] for regions with id between 0 and 3 */
-  unsigned int   static_threshold[4];/**< threshold for region to be treated as static */
+  // TODO(paulwilkins): broken for VP9, which has 8 segments
+  // Quantizer and loop filter deltas for each segment
+  // (see MAX_MB_SEGMENTS)
+  int     delta_q[4];
+  int     delta_lf[4];
+  // Static breakout threshold for each segment
+  unsigned int   static_threshold[4];
 } vpx_roi_map_t;
 
 /*!\brief  vpx active region map