Merge "Simplify vp9_adapt_nmv_probs" into experimental
diff --git a/configure b/configure
index f55f798..5c8dc8e 100755
--- a/configure
+++ b/configure
@@ -245,6 +245,7 @@
     comp_interintra_pred
     enable_6tap
     abovesprefmv
+    code_nonzerocount
 "
 CONFIG_LIST="
     external_build
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index b46dd05..5adfa69 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -250,6 +250,9 @@
   INTERPOLATIONFILTERTYPE interp_filter;
 
   BLOCK_SIZE_TYPE sb_type;
+#if CONFIG_CODE_NONZEROCOUNT
+  uint16_t nzcs[256+64*2];
+#endif
 } MB_MODE_INFO;
 
 typedef struct {
@@ -295,6 +298,9 @@
   DECLARE_ALIGNED(16, int16_t,  qcoeff[64*64+32*32*2]);
   DECLARE_ALIGNED(16, int16_t,  dqcoeff[64*64+32*32*2]);
   DECLARE_ALIGNED(16, uint16_t, eobs[256+64*2]);
+#if CONFIG_CODE_NONZEROCOUNT
+  DECLARE_ALIGNED(16, uint16_t, nzcs[256+64*2]);
+#endif
 
   /* 16 Y blocks, 4 U, 4 V, each with 16 entries. */
   BLOCKD block[24];
@@ -449,25 +455,29 @@
 extern const uint8_t vp9_block2left_sb64[TX_SIZE_MAX_SB][384];
 extern const uint8_t vp9_block2above_sb64[TX_SIZE_MAX_SB][384];
 
-#define USE_ADST_FOR_I16X16_8X8   0
-#define USE_ADST_FOR_I16X16_4X4   0
+#define USE_ADST_FOR_I16X16_8X8   1
+#define USE_ADST_FOR_I16X16_4X4   1
 #define USE_ADST_FOR_I8X8_4X4     1
 #define USE_ADST_PERIPHERY_ONLY   1
+#define USE_ADST_FOR_SB           1
+#define USE_ADST_FOR_REMOTE_EDGE  0
 
-static TX_TYPE get_tx_type_4x4(const MACROBLOCKD *xd, const BLOCKD *b) {
+static TX_TYPE get_tx_type_4x4(const MACROBLOCKD *xd, int ib) {
   // TODO(debargha): explore different patterns for ADST usage when blocksize
   // is smaller than the prediction size
   TX_TYPE tx_type = DCT_DCT;
-  int ib = (int)(b - xd->block);
-  if (ib >= 16)
+  const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
+#if !USE_ADST_FOR_SB
+  if (sb_type)
+    return tx_type;
+#endif
+  if (ib >= (16 << (2 * sb_type)))  // no chroma adst
     return tx_type;
   if (xd->lossless)
     return DCT_DCT;
-  // TODO(rbultje, debargha): Explore ADST usage for superblocks
-  if (xd->mode_info_context->mbmi.sb_type)
-    return tx_type;
   if (xd->mode_info_context->mbmi.mode == B_PRED &&
       xd->q_index < ACTIVE_HT) {
+    const BLOCKD *b = &xd->block[ib];
     tx_type = txfm_map(
 #if CONFIG_NEWBINTRAMODES
         b->bmi.as_mode.first == B_CONTEXT_PRED ? b->bmi.as_mode.context :
@@ -475,16 +485,32 @@
         b->bmi.as_mode.first);
   } else if (xd->mode_info_context->mbmi.mode == I8X8_PRED &&
              xd->q_index < ACTIVE_HT) {
+    const BLOCKD *b = &xd->block[ib];
+    const int ic = (ib & 10);
 #if USE_ADST_FOR_I8X8_4X4
 #if USE_ADST_PERIPHERY_ONLY
     // Use ADST for periphery blocks only
-    int ic = (ib & 10);
+    const int inner = ib & 5;
     b += ic - ib;
-    tx_type = (ic != 10) ?
-         txfm_map(pred_mode_conv((MB_PREDICTION_MODE)b->bmi.as_mode.first)) :
-         DCT_DCT;
+    tx_type = txfm_map(pred_mode_conv(
+        (MB_PREDICTION_MODE)b->bmi.as_mode.first));
+#if USE_ADST_FOR_REMOTE_EDGE
+    if (inner == 5)
+      tx_type = DCT_DCT;
+#else
+    if (inner == 1) {
+      if (tx_type == ADST_ADST) tx_type = ADST_DCT;
+      else if (tx_type == DCT_ADST) tx_type = DCT_DCT;
+    } else if (inner == 4) {
+      if (tx_type == ADST_ADST) tx_type = DCT_ADST;
+      else if (tx_type == ADST_DCT) tx_type = DCT_DCT;
+    } else if (inner == 5) {
+      tx_type = DCT_DCT;
+    }
+#endif
 #else
     // Use ADST
+    b += ic - ib;
     tx_type = txfm_map(pred_mode_conv(
         (MB_PREDICTION_MODE)b->bmi.as_mode.first));
 #endif
@@ -496,9 +522,22 @@
              xd->q_index < ACTIVE_HT) {
 #if USE_ADST_FOR_I16X16_4X4
 #if USE_ADST_PERIPHERY_ONLY
-    // Use ADST for periphery blocks only
-    tx_type = (ib < 4 || ((ib & 3) == 0)) ?
-        txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode)) : DCT_DCT;
+    const int hmax = 4 << sb_type;
+    tx_type = txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode));
+#if USE_ADST_FOR_REMOTE_EDGE
+    if ((ib & (hmax - 1)) != 0 && ib >= hmax)
+      tx_type = DCT_DCT;
+#else
+    if (ib >= 1 && ib < hmax) {
+      if (tx_type == ADST_ADST) tx_type = ADST_DCT;
+      else if (tx_type == DCT_ADST) tx_type = DCT_DCT;
+    } else if (ib >= 1 && (ib & (hmax - 1)) == 0) {
+      if (tx_type == ADST_ADST) tx_type = DCT_ADST;
+      else if (tx_type == ADST_DCT) tx_type = DCT_DCT;
+    } else if (ib != 0) {
+      tx_type = DCT_DCT;
+    }
+#endif
 #else
     // Use ADST
     tx_type = txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode));
@@ -511,29 +550,44 @@
   return tx_type;
 }
 
-static TX_TYPE get_tx_type_8x8(const MACROBLOCKD *xd, const BLOCKD *b) {
+static TX_TYPE get_tx_type_8x8(const MACROBLOCKD *xd, int ib) {
   // TODO(debargha): explore different patterns for ADST usage when blocksize
   // is smaller than the prediction size
   TX_TYPE tx_type = DCT_DCT;
-  int ib = (int)(b - xd->block);
-  if (ib >= 16)
+  const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
+#if !USE_ADST_FOR_SB
+  if (sb_type)
     return tx_type;
-  // TODO(rbultje, debargha): Explore ADST usage for superblocks
-  if (xd->mode_info_context->mbmi.sb_type)
+#endif
+  if (ib >= (16 << (2 * sb_type)))  // no chroma adst
     return tx_type;
   if (xd->mode_info_context->mbmi.mode == I8X8_PRED &&
       xd->q_index < ACTIVE_HT8) {
+    const BLOCKD *b = &xd->block[ib];
     // TODO(rbultje): MB_PREDICTION_MODE / B_PREDICTION_MODE should be merged
     // or the relationship otherwise modified to address this type conversion.
     tx_type = txfm_map(pred_mode_conv(
            (MB_PREDICTION_MODE)b->bmi.as_mode.first));
   } else if (xd->mode_info_context->mbmi.mode < I8X8_PRED &&
              xd->q_index < ACTIVE_HT8) {
-#if USE_ADST_FOR_I8X8_4X4
+#if USE_ADST_FOR_I16X16_8X8
 #if USE_ADST_PERIPHERY_ONLY
-    // Use ADST for periphery blocks only
-    tx_type = (ib != 10) ?
-        txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode)) : DCT_DCT;
+    const int hmax = 4 << sb_type;
+    tx_type = txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode));
+#if USE_ADST_FOR_REMOTE_EDGE
+    if ((ib & (hmax - 1)) != 0 && ib >= hmax)
+      tx_type = DCT_DCT;
+#else
+    if (ib >= 1 && ib < hmax) {
+      if (tx_type == ADST_ADST) tx_type = ADST_DCT;
+      else if (tx_type == DCT_ADST) tx_type = DCT_DCT;
+    } else if (ib >= 1 && (ib & (hmax - 1)) == 0) {
+      if (tx_type == ADST_ADST) tx_type = DCT_ADST;
+      else if (tx_type == ADST_DCT) tx_type = DCT_DCT;
+    } else if (ib != 0) {
+      tx_type = DCT_DCT;
+    }
+#endif
 #else
     // Use ADST
     tx_type = txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode));
@@ -546,35 +600,37 @@
   return tx_type;
 }
 
-static TX_TYPE get_tx_type_16x16(const MACROBLOCKD *xd, const BLOCKD *b) {
+static TX_TYPE get_tx_type_16x16(const MACROBLOCKD *xd, int ib) {
   TX_TYPE tx_type = DCT_DCT;
-  int ib = (int)(b - xd->block);
-  if (ib >= 16)
+  const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
+#if !USE_ADST_FOR_SB
+  if (sb_type)
     return tx_type;
-  // TODO(rbultje, debargha): Explore ADST usage for superblocks
-  if (xd->mode_info_context->mbmi.sb_type)
+#endif
+  if (ib >= (16 << (2 * sb_type)))
     return tx_type;
   if (xd->mode_info_context->mbmi.mode < I8X8_PRED &&
       xd->q_index < ACTIVE_HT16) {
     tx_type = txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode));
-  }
-  return tx_type;
-}
-
-static TX_TYPE get_tx_type(const MACROBLOCKD *xd, const BLOCKD *b) {
-  TX_TYPE tx_type = DCT_DCT;
-  int ib = (int)(b - xd->block);
-  if (ib >= 16)
-    return tx_type;
-  if (xd->mode_info_context->mbmi.txfm_size == TX_16X16) {
-    tx_type = get_tx_type_16x16(xd, b);
-  }
-  if (xd->mode_info_context->mbmi.txfm_size  == TX_8X8) {
-    ib = (ib & 8) + ((ib & 4) >> 1);
-    tx_type = get_tx_type_8x8(xd, &xd->block[ib]);
-  }
-  if (xd->mode_info_context->mbmi.txfm_size  == TX_4X4) {
-    tx_type = get_tx_type_4x4(xd, b);
+#if USE_ADST_PERIPHERY_ONLY
+    if (sb_type) {
+      const int hmax = 4 << sb_type;
+#if USE_ADST_FOR_REMOTE_EDGE
+      if ((ib & (hmax - 1)) != 0 && ib >= hmax)
+        tx_type = DCT_DCT;
+#else
+      if (ib >= 1 && ib < hmax) {
+        if (tx_type == ADST_ADST) tx_type = ADST_DCT;
+        else if (tx_type == DCT_ADST) tx_type = DCT_DCT;
+      } else if (ib >= 1 && (ib & (hmax - 1)) == 0) {
+        if (tx_type == ADST_ADST) tx_type = DCT_ADST;
+        else if (tx_type == ADST_DCT) tx_type = DCT_DCT;
+      } else if (ib != 0) {
+        tx_type = DCT_DCT;
+      }
+#endif
+    }
+#endif
   }
   return tx_type;
 }
@@ -592,4 +648,25 @@
   }
 }
 
+static TX_SIZE get_uv_tx_size(const MACROBLOCKD *xd) {
+  TX_SIZE tx_size_uv;
+  if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64) {
+    tx_size_uv = xd->mode_info_context->mbmi.txfm_size;
+  } else if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB32X32) {
+    if (xd->mode_info_context->mbmi.txfm_size == TX_32X32)
+      tx_size_uv = TX_16X16;
+    else
+      tx_size_uv = xd->mode_info_context->mbmi.txfm_size;
+  } else {
+    if (xd->mode_info_context->mbmi.txfm_size == TX_16X16)
+      tx_size_uv = TX_8X8;
+    else if (xd->mode_info_context->mbmi.txfm_size == TX_8X8 &&
+             (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
+              xd->mode_info_context->mbmi.mode == SPLITMV))
+      tx_size_uv = TX_4X4;
+    else
+      tx_size_uv = xd->mode_info_context->mbmi.txfm_size;
+  }
+  return tx_size_uv;
+}
 #endif  // VP9_COMMON_VP9_BLOCKD_H_
diff --git a/vp9/common/vp9_coefupdateprobs.h b/vp9/common/vp9_coefupdateprobs.h
index ee250e0..6d8ed67 100644
--- a/vp9/common/vp9_coefupdateprobs.h
+++ b/vp9/common/vp9_coefupdateprobs.h
@@ -9,7 +9,7 @@
  */
 
 #ifndef VP9_COMMON_VP9_COEFUPDATEPROBS_H_
-#define VP9_COMMON_VP9_COEFUPDATEPROBS_H__
+#define VP9_COMMON_VP9_COEFUPDATEPROBS_H_
 
 /* Update probabilities for the nodes in the token entropy tree.
    Generated file included by vp9_entropy.c */
@@ -17,4 +17,12 @@
 #define COEF_UPDATE_PROB_8X8 252
 #define COEF_UPDATE_PROB_16X16 252
 
+#if CONFIG_CODE_NONZEROCOUNT
+#define NZC_UPDATE_PROB_4X4     252
+#define NZC_UPDATE_PROB_8X8     252
+#define NZC_UPDATE_PROB_16X16   252
+#define NZC_UPDATE_PROB_32X32   252
+#define NZC_UPDATE_PROB_PCAT    252
+#endif
+
 #endif  // VP9_COMMON_VP9_COEFUPDATEPROBS_H__
diff --git a/vp9/common/vp9_common.h b/vp9/common/vp9_common.h
index 4295eba..dcc5073 100644
--- a/vp9/common/vp9_common.h
+++ b/vp9/common/vp9_common.h
@@ -11,10 +11,11 @@
 #ifndef VP9_COMMON_VP9_COMMON_H_
 #define VP9_COMMON_VP9_COMMON_H_
 
-#include <assert.h>
-#include "vpx_config.h"
 /* Interface header for common constant data structures and lookup tables */
 
+#include <assert.h>
+
+#include "./vpx_config.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx/vpx_integer.h"
 
@@ -24,23 +25,27 @@
 #define MIN(x, y) (((x) < (y)) ? (x) : (y))
 #define MAX(x, y) (((x) > (y)) ? (x) : (y))
 
-/* Only need this for fixed-size arrays, for structs just assign. */
+#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))
 
-#define vp9_copy(Dest, Src) { \
-    assert(sizeof(Dest) == sizeof(Src)); \
-    vpx_memcpy(Dest, Src, sizeof(Src)); \
+/* If we don't want to use ROUND_POWER_OF_TWO macro
+static INLINE int16_t round_power_of_two(int16_t value, int n) {
+  return (value + (1 << (n - 1))) >> n;
+}*/
+
+// Only need this for fixed-size arrays, for structs just assign.
+#define vp9_copy(dest, src) {            \
+    assert(sizeof(dest) == sizeof(src)); \
+    vpx_memcpy(dest, src, sizeof(src));  \
   }
 
-/* Use this for variably-sized arrays. */
-
-#define vp9_copy_array(Dest, Src, N) { \
-    assert(sizeof(*Dest) == sizeof(*Src)); \
-    vpx_memcpy(Dest, Src, N * sizeof(*Src)); \
+// Use this for variably-sized arrays.
+#define vp9_copy_array(dest, src, n) {       \
+    assert(sizeof(*dest) == sizeof(*src));   \
+    vpx_memcpy(dest, src, n * sizeof(*src)); \
   }
 
-#define vp9_zero(Dest) vpx_memset(&Dest, 0, sizeof(Dest));
-
-#define vp9_zero_array(Dest, N) vpx_memset(Dest, 0, N * sizeof(*Dest));
+#define vp9_zero(dest) vpx_memset(&dest, 0, sizeof(dest));
+#define vp9_zero_array(dest, n) vpx_memset(dest, 0, n * sizeof(*dest));
 
 static INLINE uint8_t clip_pixel(int val) {
   return (val > 255) ? 255u : (val < 0) ? 0u : val;
diff --git a/vp9/common/vp9_debugmodes.c b/vp9/common/vp9_debugmodes.c
index 1953d60..c3fffc6 100644
--- a/vp9/common/vp9_debugmodes.c
+++ b/vp9/common/vp9_debugmodes.c
@@ -9,6 +9,7 @@
  */
 
 #include <stdio.h>
+
 #include "vp9/common/vp9_blockd.h"
 
 void vp9_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols,
@@ -18,8 +19,7 @@
   int mb_index = 0;
   FILE *mvs = fopen("mvs.stt", "a");
 
-  /* print out the macroblock Y modes */
-  mb_index = 0;
+  // Print out the macroblock Y modes
   fprintf(mvs, "Mb Modes for Frame %d\n", frame);
 
   for (mb_row = 0; mb_row < rows; mb_row++) {
diff --git a/vp9/common/vp9_default_coef_probs.h b/vp9/common/vp9_default_coef_probs.h
index 204e65a..6b1eff0 100644
--- a/vp9/common/vp9_default_coef_probs.h
+++ b/vp9/common/vp9_default_coef_probs.h
@@ -695,3 +695,299 @@
     }
   }
 };
+
+#if CONFIG_CODE_NONZEROCOUNT
+
+// TODO(debargha): Remove the macro and count tables after experimentation
+#define NZC_DEFAULT_COUNTS  /* Uncomment to use counts as defaults */
+
+#ifdef NZC_DEFAULT_COUNTS
+static const unsigned int default_nzc_counts_4x4[MAX_NZC_CONTEXTS]
+                                                [REF_TYPES]
+                                                [BLOCK_TYPES]
+                                                [NZC4X4_TOKENS] = {
+  {
+    {
+      { 967652, 29023, 15039, 6952, 1568, 116 },
+      { 289116, 22938, 4522, 1935, 520, 47 }
+    }, {
+      { 967652, 29023, 15039, 6952, 1568, 116 },
+      { 689116, 22938, 4522, 1935, 520, 47 }
+    },
+  }, {
+    {
+      { 124684, 37167, 15270, 8483, 1777, 102 },
+      { 10405, 12395, 3401, 3574, 2461, 771 }
+    }, {
+      { 124684, 37167, 15270, 8483, 1777, 102 },
+      { 20405, 12395, 3401, 3574, 2461, 771 }
+    }
+  }, {
+    {
+      { 4100, 22976, 15627, 16137, 7982, 1793 },
+      { 4249, 3084, 2131, 4081, 6439, 1653 }
+    }, {
+      { 21100, 22976, 15627, 16137, 7982, 1793 },
+      { 4249, 3084, 2131, 4081, 2439, 1653 }
+    }
+  }
+};
+
+static const unsigned int default_nzc_counts_8x8[MAX_NZC_CONTEXTS]
+                                                [REF_TYPES]
+                                                [BLOCK_TYPES]
+                                                [NZC8X8_TOKENS] = {
+  {
+    {
+      { 372988, 62777, 19440, 11812, 5145, 1917, 439, 10 },
+      { 72052, 30468, 6973, 3250, 1500, 750, 375, 5 },
+    }, {
+      { 372988, 62777, 19440, 11812, 5145, 1917, 439, 10 },
+      { 192052, 30468, 6973, 3250, 1500, 750, 375, 5 },
+    }
+  }, {
+    {
+      { 121533, 33527, 15655, 11920, 5723, 2009, 315, 7 },
+      { 23772, 23120, 13127, 8115, 4000, 2000, 200, 6 },
+    }, {
+      { 121533, 33527, 15655, 11920, 5723, 2009, 315, 7 },
+      { 23772, 23120, 13127, 8115, 4000, 2000, 200, 6 },
+    }
+  }, {
+    {
+      { 29408, 11758, 8023, 10123, 6705, 2468, 369, 17 },
+      { 11612, 13874, 13329, 13022, 6500, 3250, 300, 12 },
+    }, {
+      { 29408, 11758, 8023, 10123, 6705, 2468, 369, 17 },
+      { 11612, 13874, 13329, 13022, 6500, 3250, 300, 12 },
+    }
+  }
+};
+
+static const unsigned int default_nzc_counts_16x16[MAX_NZC_CONTEXTS]
+                                                  [REF_TYPES]
+                                                  [BLOCK_TYPES]
+                                                  [NZC16X16_TOKENS] = {
+  {
+    {
+      { 372988, 62777, 19440, 11812, 5145, 1917, 439, 10, 5, 2 },
+      { 72052, 30468, 6973, 3250, 1500, 750, 375, 50, 8, 1 },
+    }, {
+      { 32988, 62777, 19440, 11812, 5145, 1917, 439, 10, 5, 2 },
+      { 92052, 30468, 6973, 3250, 1500, 750, 375, 50, 8, 1 },
+    }
+  }, {
+    {
+      { 21533, 33527, 15655, 11920, 5723, 2009, 315, 7, 4, 2 },
+      { 47772, 23120, 13127, 8115, 4000, 2000, 200, 6, 4, 2 },
+    }, {
+      { 21533, 33527, 15655, 11920, 5723, 2009, 315, 7, 4, 2 },
+      { 27772, 23120, 13127, 8115, 4000, 2000, 200, 6, 4, 2 },
+    }
+  }, {
+    {
+      { 29408, 11758, 8023, 10123, 6705, 2468, 369, 17, 10, 5 },
+      { 9612, 13874, 13329, 13022, 6500, 3250, 300, 12, 6, 3 },
+    }, {
+      { 29408, 11758, 8023, 10123, 6705, 2468, 369, 17, 10, 5 },
+      { 9612, 13874, 13329, 13022, 6500, 3250, 300, 12, 6, 3 },
+    }
+  }
+};
+
+static const unsigned int default_nzc_counts_32x32[MAX_NZC_CONTEXTS]
+                                                  [REF_TYPES]
+                                                  [BLOCK_TYPES]
+                                                  [NZC32X32_TOKENS] = {
+  {
+    {
+      { 72988, 62777, 19440, 11812, 5145, 1917, 439, 10, 5, 2, 1, 0 },
+      { 52052, 30468, 6973, 3250, 1500, 750, 375, 50, 8, 1, 0, 0 },
+    }, {
+      { 72988, 62777, 19440, 11812, 5145, 1917, 439, 10, 5, 2, 1, 0 },
+      { 72052, 30468, 6973, 3250, 1500, 750, 375, 50, 8, 1, 0, 0 },
+    }
+  }, {
+    {
+      { 21533, 33527, 15655, 11920, 5723, 2009, 315, 7, 4, 2, 1, 0 },
+      { 27772, 23120, 13127, 8115, 4000, 2000, 200, 6, 4, 2, 1, 0 },
+    }, {
+      { 21533, 33527, 15655, 11920, 5723, 2009, 315, 7, 4, 2, 1, 0 },
+      { 27772, 23120, 13127, 8115, 4000, 2000, 200, 6, 4, 2, 1, 0 },
+    }
+  }, {
+    {
+      { 29408, 11758, 8023, 10123, 6705, 2468, 369, 17, 10, 5, 2, 1 },
+      { 9612, 13874, 13329, 13022, 6500, 3250, 300, 12, 6, 3, 2, 1 },
+    }, {
+      { 29408, 11758, 8023, 10123, 6705, 2468, 369, 17, 10, 5, 2, 1 },
+      { 9612, 13874, 13329, 13022, 6500, 3250, 300, 12, 6, 3, 2, 1 },
+    }
+  }
+};
+
+#else
+
+static const vp9_prob default_nzc_probs_4x4[MAX_NZC_CONTEXTS]
+                                           [REF_TYPES]
+                                           [BLOCK_TYPES]
+                                           [NZC4X4_TOKENS] = {
+  {
+    {
+      { 219, 162, 179, 142, 242, },
+      { 214, 253, 228, 246, 255, },
+    }, {
+      { 225, 236, 190, 229, 253, },
+      { 251, 253, 240, 248, 255, },
+    },
+  }, {
+    {
+      { 106, 126, 158, 126, 244, },
+      { 118, 241, 201, 240, 255, },
+    }, {
+      { 165, 179, 143, 189, 242, },
+      { 173, 239, 192, 255, 128, },
+    },
+  }, {
+    {
+      { 42 , 78 , 153, 92 , 223, },
+      { 128, 128, 128, 128, 128, },
+    }, {
+      { 76 , 68 , 126, 110, 216, },
+      { 128, 128, 128, 128, 128, },
+    },
+  },
+};
+
+static const vp9_prob default_nzc_probs_8x8[MAX_NZC_CONTEXTS]
+                                           [REF_TYPES]
+                                           [BLOCK_TYPES]
+                                           [NZC8X8_TOKENS] = {
+  {
+    {
+      { 134, 139, 170, 178, 142, 197, 255, },
+      { 167, 224, 199, 252, 205, 255, 128, },
+    }, {
+      { 181, 210, 180, 241, 190, 235, 255, },
+      { 234, 251, 235, 252, 219, 255, 128, },
+    },
+  }, {
+    {
+      { 33 , 64 , 155, 143, 86 , 216, 255, },
+      { 73 , 160, 167, 251, 153, 255, 128, },
+    }, {
+      { 79 , 104, 153, 195, 119, 246, 255, },
+      { 149, 183, 186, 249, 203, 255, 128, },
+    },
+  }, {
+    {
+      { 10 , 25 , 156, 61 , 69 , 156, 254, },
+      { 32 , 1  , 128, 146, 64 , 255, 128, },
+    }, {
+      { 37 , 48 , 143, 113, 81 , 202, 255, },
+      { 1  , 255, 128, 128, 128, 128, 128, },
+    },
+  },
+};
+
+static const vp9_prob default_nzc_probs_16x16[MAX_NZC_CONTEXTS]
+                                             [REF_TYPES]
+                                             [BLOCK_TYPES]
+                                             [NZC16X16_TOKENS] = {
+  {
+    {
+      { 11 , 188, 210, 167, 141, 143, 152, 255, 128, },
+      { 171, 201, 203, 244, 207, 255, 255, 128, 128, },
+    }, {
+      { 23 , 217, 207, 251, 198, 255, 219, 128, 128, },
+      { 235, 249, 229, 255, 199, 128, 128, 128, 128, },
+    },
+  }, {
+    {
+      { 9  , 45 , 168, 85 , 66 , 221, 139, 246, 255, },
+      { 51 , 110, 163, 238, 94 , 255, 255, 128, 128, },
+    }, {
+      { 4  , 149, 175, 240, 149, 255, 205, 128, 128, },
+      { 141, 217, 186, 255, 128, 128, 128, 128, 128, },
+    },
+  }, {
+    {
+      { 1  , 12 , 173, 6  , 68 , 145, 41 , 204, 255, },
+      { 39 , 47 , 128, 199, 110, 255, 128, 128, 128, },
+    }, {
+      { 1  , 121, 171, 149, 115, 242, 159, 255, 128, },
+      { 1  , 255, 255, 128, 128, 128, 128, 128, 128, },
+    },
+  },
+};
+
+static const vp9_prob default_nzc_probs_32x32[MAX_NZC_CONTEXTS]
+                                             [REF_TYPES]
+                                             [BLOCK_TYPES]
+                                             [NZC32X32_TOKENS] = {
+  {
+    {
+      { 11 , 216, 195, 201, 160, 247, 217, 255, 255, 128, 128, },
+      { 177, 240, 239, 255, 192, 128, 128, 128, 128, 128, 128, },
+    }, {
+      { 48 , 235, 213, 235, 199, 255, 255, 128, 128, 128, 128, },
+      { 205, 255, 248, 128, 128, 128, 128, 128, 128, 128, 128, },
+    },
+  }, {
+    {
+      { 6  , 96 , 138, 99 , 125, 248, 188, 255, 128, 128, 128, },
+      { 17 , 53 , 43 , 189, 1  , 255, 171, 128, 128, 128, 128, },
+    }, {
+      { 5  , 187, 235, 232, 117, 255, 219, 128, 128, 128, 128, },
+      { 146, 255, 255, 128, 128, 128, 128, 128, 128, 128, 128, },
+    },
+  }, {
+    {
+      { 1  , 7  , 93 , 14 , 100, 30 , 85 , 65 , 81 , 210, 255, },
+      { 1  , 1  , 128, 26 , 1  , 218, 78 , 255, 255, 128, 128, },
+    }, {
+      { 4  , 148, 206, 137, 160, 255, 255, 128, 128, 128, 128, },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, },
+    },
+  },
+};
+#endif
+
+static const vp9_prob default_nzc_pcat_probs[MAX_NZC_CONTEXTS]
+                                            [NZC_TOKENS_EXTRA]
+                                            [NZC_BITS_EXTRA] = {
+  // Bit probabilities are in least to most significance order
+  {
+    {176, 128, 128, 128, 128, 128, 128, 128, 128},   // 3 - 4
+    {164, 192, 128, 128, 128, 128, 128, 128, 128},   // 5 - 8
+    {154, 184, 208, 128, 128, 128, 128, 128, 128},   // 9 - 16
+    {144, 176, 200, 216, 128, 128, 128, 128, 128},   // 17 - 32
+    {140, 172, 192, 208, 224, 128, 128, 128, 128},   // 33 - 64
+    {136, 168, 188, 200, 220, 232, 128, 128, 128},   // 65 - 128
+    {132, 164, 184, 196, 216, 228, 240, 128, 128},   // 129 - 256
+    {130, 162, 178, 194, 212, 226, 240, 248, 128},   // 257 - 512
+    {128, 160, 176, 192, 208, 224, 240, 248, 254},   // 513 - 1024
+  }, {
+    {168, 128, 128, 128, 128, 128, 128, 128, 128},   // 3 - 4
+    {152, 184, 128, 128, 128, 128, 128, 128, 128},   // 5 - 8
+    {152, 184, 208, 128, 128, 128, 128, 128, 128},   // 9 - 16
+    {144, 176, 200, 216, 128, 128, 128, 128, 128},   // 17 - 32
+    {140, 172, 192, 208, 224, 128, 128, 128, 128},   // 33 - 64
+    {136, 168, 188, 200, 220, 232, 128, 128, 128},   // 65 - 128
+    {132, 164, 184, 196, 216, 228, 240, 128, 128},   // 129 - 256
+    {130, 162, 178, 194, 212, 226, 240, 248, 128},   // 257 - 512
+    {128, 160, 176, 192, 208, 224, 240, 248, 254},   // 513 - 1024
+  }, {
+    {160, 128, 128, 128, 128, 128, 128, 128, 128},   // 3 - 4
+    {152, 176, 128, 128, 128, 128, 128, 128, 128},   // 5 - 8
+    {150, 184, 208, 128, 128, 128, 128, 128, 128},   // 9 - 16
+    {144, 176, 200, 216, 128, 128, 128, 128, 128},   // 17 - 32
+    {140, 172, 192, 208, 224, 128, 128, 128, 128},   // 33 - 64
+    {136, 168, 188, 200, 220, 232, 128, 128, 128},   // 65 - 128
+    {132, 164, 184, 196, 216, 228, 240, 128, 128},   // 129 - 256
+    {130, 162, 178, 194, 212, 226, 240, 248, 128},   // 257 - 512
+    {128, 160, 176, 192, 208, 224, 240, 248, 254},   // 513 - 1024
+  },
+};
+
+#endif  // CONFIG_CODE_NONZEROCOUNT
diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c
index bc69353..b5ae70a 100644
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c
@@ -1,4 +1,4 @@
-/*
+/*
  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  *
  *  Use of this source code is governed by a BSD-style license
@@ -186,6 +186,65 @@
   254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129
 };
 
+#if CONFIG_CODE_NONZEROCOUNT
+const vp9_tree_index vp9_nzc4x4_tree[2 * NZC4X4_NODES] = {
+  -NZC_0, 2,
+  4, 6,
+  -NZC_1, -NZC_2,
+  -NZC_3TO4, 8,
+  -NZC_5TO8, -NZC_9TO16,
+};
+struct vp9_token_struct vp9_nzc4x4_encodings[NZC4X4_TOKENS];
+
+const vp9_tree_index vp9_nzc8x8_tree[2 * NZC8X8_NODES] = {
+  -NZC_0, 2,
+  4, 6,
+  -NZC_1, -NZC_2,
+  8, 10,
+  -NZC_3TO4, -NZC_5TO8,
+  -NZC_9TO16, 12,
+  -NZC_17TO32, -NZC_33TO64,
+};
+struct vp9_token_struct vp9_nzc8x8_encodings[NZC8X8_TOKENS];
+
+const vp9_tree_index vp9_nzc16x16_tree[2 * NZC16X16_NODES] = {
+  -NZC_0, 2,
+  4, 6,
+  -NZC_1, -NZC_2,
+  8, 10,
+  -NZC_3TO4, -NZC_5TO8,
+  12, 14,
+  -NZC_9TO16, -NZC_17TO32,
+  -NZC_33TO64, 16,
+  -NZC_65TO128, -NZC_129TO256,
+};
+struct vp9_token_struct vp9_nzc16x16_encodings[NZC16X16_TOKENS];
+
+const vp9_tree_index vp9_nzc32x32_tree[2 * NZC32X32_NODES] = {
+  -NZC_0, 2,
+  4, 6,
+  -NZC_1, -NZC_2,
+  8, 10,
+  -NZC_3TO4, -NZC_5TO8,
+  12, 14,
+  -NZC_9TO16, -NZC_17TO32,
+  16, 18,
+  -NZC_33TO64, -NZC_65TO128,
+  -NZC_129TO256, 20,
+  -NZC_257TO512, -NZC_513TO1024,
+};
+struct vp9_token_struct vp9_nzc32x32_encodings[NZC32X32_TOKENS];
+
+const int vp9_extranzcbits[NZC32X32_TOKENS] = {
+  0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
+};
+
+const int vp9_basenzcvalue[NZC32X32_TOKENS] = {
+  0, 1, 2, 3, 5, 9, 17, 33, 65, 129, 257, 513
+};
+
+#endif  // CONFIG_CODE_NONZEROCOUNT
+
 static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[28];
 
 static void init_bit_tree(vp9_tree_index *p, int n) {
@@ -253,6 +312,55 @@
 };
 
 void vp9_default_coef_probs(VP9_COMMON *pc) {
+#if CONFIG_CODE_NONZEROCOUNT
+#ifdef NZC_DEFAULT_COUNTS
+  int h, g;
+  for (h = 0; h < MAX_NZC_CONTEXTS; ++h) {
+    for (g = 0; g < REF_TYPES; ++g) {
+      int i;
+      unsigned int branch_ct4x4[NZC4X4_NODES][2];
+      unsigned int branch_ct8x8[NZC8X8_NODES][2];
+      unsigned int branch_ct16x16[NZC16X16_NODES][2];
+      unsigned int branch_ct32x32[NZC32X32_NODES][2];
+      for (i = 0; i < BLOCK_TYPES; ++i) {
+        vp9_tree_probs_from_distribution(
+          NZC4X4_TOKENS, vp9_nzc4x4_encodings, vp9_nzc4x4_tree,
+          pc->fc.nzc_probs_4x4[h][g][i], branch_ct4x4,
+          default_nzc_counts_4x4[h][g][i]);
+      }
+      for (i = 0; i < BLOCK_TYPES; ++i) {
+        vp9_tree_probs_from_distribution(
+          NZC8X8_TOKENS, vp9_nzc8x8_encodings, vp9_nzc8x8_tree,
+          pc->fc.nzc_probs_8x8[h][g][i], branch_ct8x8,
+          default_nzc_counts_8x8[h][g][i]);
+      }
+      for (i = 0; i < BLOCK_TYPES; ++i) {
+        vp9_tree_probs_from_distribution(
+          NZC16X16_TOKENS, vp9_nzc16x16_encodings, vp9_nzc16x16_tree,
+          pc->fc.nzc_probs_16x16[h][g][i], branch_ct16x16,
+          default_nzc_counts_16x16[h][g][i]);
+      }
+      for (i = 0; i < BLOCK_TYPES; ++i) {
+        vp9_tree_probs_from_distribution(
+          NZC32X32_TOKENS, vp9_nzc32x32_encodings, vp9_nzc32x32_tree,
+          pc->fc.nzc_probs_32x32[h][g][i], branch_ct32x32,
+          default_nzc_counts_32x32[h][g][i]);
+      }
+    }
+  }
+#else
+  vpx_memcpy(pc->fc.nzc_probs_4x4, default_nzc_probs_4x4,
+             sizeof(pc->fc.nzc_probs_4x4));
+  vpx_memcpy(pc->fc.nzc_probs_8x8, default_nzc_probs_8x8,
+             sizeof(pc->fc.nzc_probs_8x8));
+  vpx_memcpy(pc->fc.nzc_probs_16x16, default_nzc_probs_16x16,
+             sizeof(pc->fc.nzc_probs_16x16));
+  vpx_memcpy(pc->fc.nzc_probs_32x32, default_nzc_probs_32x32,
+             sizeof(pc->fc.nzc_probs_32x32));
+#endif
+  vpx_memcpy(pc->fc.nzc_pcat_probs, default_nzc_pcat_probs,
+             sizeof(pc->fc.nzc_pcat_probs));
+#endif  // CONFIG_CODE_NONZEROCOUNT
   vpx_memcpy(pc->fc.coef_probs_4x4, default_coef_probs_4x4,
              sizeof(pc->fc.coef_probs_4x4));
   vpx_memcpy(pc->fc.coef_probs_8x8, default_coef_probs_8x8,
@@ -266,8 +374,1098 @@
 void vp9_coef_tree_initialize() {
   init_bit_trees();
   vp9_tokens_from_tree(vp9_coef_encodings, vp9_coef_tree);
+#if CONFIG_CODE_NONZEROCOUNT
+  vp9_tokens_from_tree(vp9_nzc4x4_encodings, vp9_nzc4x4_tree);
+  vp9_tokens_from_tree(vp9_nzc8x8_encodings, vp9_nzc8x8_tree);
+  vp9_tokens_from_tree(vp9_nzc16x16_encodings, vp9_nzc16x16_tree);
+  vp9_tokens_from_tree(vp9_nzc32x32_encodings, vp9_nzc32x32_tree);
+#endif
 }
 
+#if CONFIG_CODE_NONZEROCOUNT
+
+#define mb_in_cur_tile(cm, mb_row, mb_col)      \
+    ((mb_col) >= (cm)->cur_tile_mb_col_start && \
+     (mb_col) <= (cm)->cur_tile_mb_col_end   && \
+     (mb_row) >= 0)
+
+#define choose_nzc_context(nzc_exp, t2, t1)     \
+    ((nzc_exp) >= (t2) ? 2 : (nzc_exp) >= (t1) ? 1 : 0)
+
+#define NZC_T2_32X32    (16 << 6)
+#define NZC_T1_32X32     (4 << 6)
+
+#define NZC_T2_16X16    (12 << 6)
+#define NZC_T1_16X16     (3 << 6)
+
+#define NZC_T2_8X8       (8 << 6)
+#define NZC_T1_8X8       (2 << 6)
+
+#define NZC_T2_4X4       (4 << 6)
+#define NZC_T1_4X4       (1 << 6)
+
+// Transforms a mb16 block index to a sb64 block index
+static inline int mb16_to_sb64_index(int mb_row, int mb_col, int block) {
+  int r = (mb_row & 3);
+  int c = (mb_col & 3);
+  int b;
+  if (block < 16) {  // Y
+    int ib = block >> 2;
+    int jb = block & 3;
+    ib += r * 4;
+    jb += c * 4;
+    b = ib * 16 + jb;
+    assert(b < 256);
+    return b;
+  } else {  // UV
+    int base = block - (block & 3);
+    int ib = (block - base) >> 1;
+    int jb = (block - base) & 1;
+    ib += r * 2;
+    jb += c * 2;
+    b = base * 16 + ib * 8 + jb;
+    assert(b >= 256 && b < 384);
+    return b;
+  }
+}
+
+// Transforms a mb16 block index to a sb32 block index
+static inline int mb16_to_sb32_index(int mb_row, int mb_col, int block) {
+  int r = (mb_row & 1);
+  int c = (mb_col & 1);
+  int b;
+  if (block < 16) {  // Y
+    int ib = block >> 2;
+    int jb = block & 3;
+    ib += r * 4;
+    jb += c * 4;
+    b = ib * 8 + jb;
+    assert(b < 64);
+    return b;
+  } else {  // UV
+    int base = block - (block & 3);
+    int ib = (block - base) >> 1;
+    int jb = (block - base) & 1;
+    ib += r * 2;
+    jb += c * 2;
+    b = base * 4 + ib * 4 + jb;
+    assert(b >= 64 && b < 96);
+    return b;
+  }
+}
+
+static inline int block_to_txfm_index(int block, TX_SIZE tx_size, int s) {
+  // s is the log of the number of 4x4 blocks in each row/col of larger block
+  int b, ib, jb, nb;
+  ib = block >> s;
+  jb = block - (ib << s);
+  ib >>= tx_size;
+  jb >>= tx_size;
+  nb = 1 << (s - tx_size);
+  b = (ib * nb + jb) << (2 * tx_size);
+  return b;
+}
+
+/* BEGIN - Helper functions to get the y nzcs */
+static unsigned int get_nzc_4x4_y_sb64(MB_MODE_INFO *mi, int block) {
+  int b;
+  assert(block < 256);
+  b = block_to_txfm_index(block, mi->txfm_size, 4);
+  assert(b < 256);
+  return mi->nzcs[b] << (6 - 2 * mi->txfm_size);
+}
+
+static unsigned int get_nzc_4x4_y_sb32(MB_MODE_INFO *mi, int block) {
+  int b;
+  assert(block < 64);
+  b = block_to_txfm_index(block, mi->txfm_size, 3);
+  assert(b < 64);
+  return mi->nzcs[b] << (6 - 2 * mi->txfm_size);
+}
+
+static unsigned int get_nzc_4x4_y_mb16(MB_MODE_INFO *mi, int block) {
+  int b;
+  assert(block < 16);
+  b = block_to_txfm_index(block, mi->txfm_size, 2);
+  assert(b < 16);
+  return mi->nzcs[b] << (6 - 2 * mi->txfm_size);
+}
+/* END - Helper functions to get the y nzcs */
+
+/* Function to get y nzc where block index is in mb16 terms */
+static unsigned int get_nzc_4x4_y(VP9_COMMON *cm, MODE_INFO *m,
+                                  int mb_row, int mb_col, int block) {
+  // NOTE: All values returned are at 64 times the true value at 4x4 scale
+  MB_MODE_INFO *const mi = &m->mbmi;
+  const int mis = cm->mode_info_stride;
+  if (mi->mb_skip_coeff || !mb_in_cur_tile(cm, mb_row, mb_col))
+    return 0;
+  if (mi->sb_type == BLOCK_SIZE_SB64X64) {
+    int r = mb_row & 3;
+    int c = mb_col & 3;
+    m -= c + r * mis;
+    if (m->mbmi.mb_skip_coeff || !mb_in_cur_tile(cm, mb_row - r, mb_col - c))
+      return 0;
+    else
+      return get_nzc_4x4_y_sb64(
+          &m->mbmi, mb16_to_sb64_index(mb_row, mb_col, block));
+  } else if (mi->sb_type == BLOCK_SIZE_SB32X32) {
+    int r = mb_row & 1;
+    int c = mb_col & 1;
+    m -= c + r * mis;
+    if (m->mbmi.mb_skip_coeff || !mb_in_cur_tile(cm, mb_row - r, mb_col - c))
+      return 0;
+    else
+      return get_nzc_4x4_y_sb32(
+          &m->mbmi, mb16_to_sb32_index(mb_row, mb_col, block));
+  } else {
+    if (m->mbmi.mb_skip_coeff || !mb_in_cur_tile(cm, mb_row, mb_col))
+      return 0;
+    return get_nzc_4x4_y_mb16(mi, block);
+  }
+}
+
+/* BEGIN - Helper functions to get the uv nzcs */
+static unsigned int get_nzc_4x4_uv_sb64(MB_MODE_INFO *mi, int block) {
+  int b;
+  int base, uvtxfm_size;
+  assert(block >= 256 && block < 384);
+  uvtxfm_size = mi->txfm_size;
+  base = 256 + (block & 64);
+  block -= base;
+  b = base + block_to_txfm_index(block, uvtxfm_size, 3);
+  assert(b >= 256 && b < 384);
+  return mi->nzcs[b] << (6 - 2 * uvtxfm_size);
+}
+
+static unsigned int get_nzc_4x4_uv_sb32(MB_MODE_INFO *mi, int block) {
+  int b;
+  int base, uvtxfm_size;
+  assert(block >= 64 && block < 96);
+  if (mi->txfm_size == TX_32X32)
+    uvtxfm_size = TX_16X16;
+  else
+    uvtxfm_size = mi->txfm_size;
+  base = 64 + (block & 16);
+  block -= base;
+  b = base + block_to_txfm_index(block, uvtxfm_size, 2);
+  assert(b >= 64 && b < 96);
+  return mi->nzcs[b] << (6 - 2 * uvtxfm_size);
+}
+
+static unsigned int get_nzc_4x4_uv_mb16(MB_MODE_INFO *mi, int block) {
+  int b;
+  int base, uvtxfm_size;
+  assert(block >= 16 && block < 24);
+  if (mi->txfm_size == TX_8X8 &&
+      (mi->mode == SPLITMV || mi->mode == I8X8_PRED))
+    uvtxfm_size = TX_4X4;
+  else if (mi->txfm_size == TX_16X16)
+    uvtxfm_size = TX_8X8;
+  else
+    uvtxfm_size = mi->txfm_size;
+  base = 16 + (block & 4);
+  block -= base;
+  b = base + block_to_txfm_index(block, uvtxfm_size, 1);
+  assert(b >= 16 && b < 24);
+  return mi->nzcs[b] << (6 - 2 * uvtxfm_size);
+}
+/* END - Helper functions to get the uv nzcs */
+
+/* Function to get uv nzc where block index is in mb16 terms */
+static unsigned int get_nzc_4x4_uv(VP9_COMMON *cm, MODE_INFO *m,
+                                   int mb_row, int mb_col, int block) {
+  // NOTE: All values returned are at 64 times the true value at 4x4 scale
+  MB_MODE_INFO *const mi = &m->mbmi;
+  const int mis = cm->mode_info_stride;
+  if (mi->mb_skip_coeff || !mb_in_cur_tile(cm, mb_row, mb_col))
+    return 0;
+  if (mi->sb_type == BLOCK_SIZE_SB64X64) {
+    int r = mb_row & 3;
+    int c = mb_col & 3;
+    m -= c + r * mis;
+    if (m->mbmi.mb_skip_coeff || !mb_in_cur_tile(cm, mb_row - r, mb_col - c))
+      return 0;
+    else
+      return get_nzc_4x4_uv_sb64(
+          &m->mbmi, mb16_to_sb64_index(mb_row, mb_col, block));
+  } else if (mi->sb_type == BLOCK_SIZE_SB32X32) {
+    int r = mb_row & 1;
+    int c = mb_col & 1;
+    m -= c + r * mis;
+    if (m->mbmi.mb_skip_coeff || !mb_in_cur_tile(cm, mb_row - r, mb_col - c))
+      return 0;
+    else
+      return get_nzc_4x4_uv_sb32(
+          &m->mbmi, mb16_to_sb32_index(mb_row, mb_col, block));
+  } else {
+    return get_nzc_4x4_uv_mb16(mi, block);
+  }
+}
+
+int vp9_get_nzc_context_y_sb64(VP9_COMMON *cm, MODE_INFO *cur,
+                               int mb_row, int mb_col, int block) {
+  // returns an index in [0, MAX_NZC_CONTEXTS - 1] to reflect how busy
+  // neighboring blocks are
+  int mis = cm->mode_info_stride;
+  int nzc_exp = 0;
+  TX_SIZE txfm_size = cur->mbmi.txfm_size;
+  assert(block < 256);
+  switch (txfm_size) {
+    case TX_32X32:
+      assert((block & 63) == 0);
+      if (block < 128) {
+        int o = (block >> 6) * 2;
+        nzc_exp =
+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 12) +
+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 13) +
+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 14) +
+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 15) +
+            get_nzc_4x4_y(cm, cur - mis + o + 1,
+                          mb_row - 1, mb_col + o + 1, 12) +
+            get_nzc_4x4_y(cm, cur - mis + o + 1,
+                          mb_row - 1, mb_col + o + 1, 13) +
+            get_nzc_4x4_y(cm, cur - mis + o + 1,
+                          mb_row - 1, mb_col + o + 1, 14) +
+            get_nzc_4x4_y(cm, cur - mis + o + 1,
+                          mb_row - 1, mb_col + o + 1, 15);
+      } else {
+        nzc_exp = cur->mbmi.nzcs[block - 128] << 3;
+      }
+      if ((block & 127) == 0) {
+        int o = (block >> 7) * 2;
+        nzc_exp +=
+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 3) +
+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 7) +
+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 11) +
+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 15) +
+            get_nzc_4x4_y(cm, cur - 1 + o * mis + mis,
+                          mb_row + o + 1, mb_col - 1, 3) +
+            get_nzc_4x4_y(cm, cur - 1 + o * mis + mis,
+                          mb_row + o + 1, mb_col - 1, 7) +
+            get_nzc_4x4_y(cm, cur - 1 + o * mis + mis,
+                          mb_row + o + 1, mb_col - 1, 11) +
+            get_nzc_4x4_y(cm, cur - 1 + o * mis + mis,
+                          mb_row + o + 1, mb_col - 1, 15);
+      } else {
+        nzc_exp += cur->mbmi.nzcs[block - 64] << 3;
+      }
+      nzc_exp <<= 2;
+      // Note nzc_exp is 64 times the average value expected at 32x32 scale
+      return choose_nzc_context(nzc_exp, NZC_T2_32X32, NZC_T1_32X32);
+      break;
+
+    case TX_16X16:
+      assert((block & 15) == 0);
+      if (block < 64) {
+        int o = block >> 4;
+        nzc_exp =
+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 12) +
+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 13) +
+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 14) +
+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 15);
+      } else {
+        nzc_exp = cur->mbmi.nzcs[block - 64] << 4;
+      }
+      if ((block & 63) == 0) {
+        int o = block >> 6;
+        nzc_exp +=
+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 3) +
+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 7) +
+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 11) +
+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 15);
+      } else {
+        nzc_exp += cur->mbmi.nzcs[block - 16] << 4;
+      }
+      nzc_exp <<= 1;
+      // Note nzc_exp is 64 times the average value expected at 16x16 scale
+      return choose_nzc_context(nzc_exp, NZC_T2_16X16, NZC_T1_16X16);
+      break;
+
+    case TX_8X8:
+      assert((block & 3) == 0);
+      if (block < 32) {
+        int o = block >> 3;
+        int p = ((block >> 2) & 1) ? 14 : 12;
+        nzc_exp =
+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, p) +
+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, p + 1);
+      } else {
+        nzc_exp = cur->mbmi.nzcs[block - 32] << 5;
+      }
+      if ((block & 31) == 0) {
+        int o = block >> 6;
+        int p = ((block >> 5) & 1) ? 11 : 3;
+        nzc_exp +=
+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, p) +
+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, p + 4);
+      } else {
+        nzc_exp += cur->mbmi.nzcs[block - 4] << 5;
+      }
+      // Note nzc_exp is 64 times the average value expected at 8x8 scale
+      return choose_nzc_context(nzc_exp, NZC_T2_8X8, NZC_T1_8X8);
+      break;
+
+    case TX_4X4:
+      if (block < 16) {
+        int o = block >> 2;
+        int p = block & 3;
+        nzc_exp = get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o,
+                                12 + p);
+      } else {
+        nzc_exp = (cur->mbmi.nzcs[block - 16] << 6);
+      }
+      if ((block & 15) == 0) {
+        int o = block >> 6;
+        int p = (block >> 4) & 3;
+        nzc_exp += get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,
+                                 3 + 4 * p);
+      } else {
+        nzc_exp += (cur->mbmi.nzcs[block - 1] << 6);
+      }
+      nzc_exp >>= 1;
+      // Note nzc_exp is 64 times the average value expected at 4x4 scale
+      return choose_nzc_context(nzc_exp, NZC_T2_4X4, NZC_T1_4X4);
+      break;
+
+    default:
+      return 0;
+  }
+}
+
+int vp9_get_nzc_context_y_sb32(VP9_COMMON *cm, MODE_INFO *cur,
+                               int mb_row, int mb_col, int block) {
+  // returns an index in [0, MAX_NZC_CONTEXTS - 1] to reflect how busy
+  // neighboring blocks are
+  int mis = cm->mode_info_stride;
+  int nzc_exp = 0;
+  TX_SIZE txfm_size = cur->mbmi.txfm_size;
+  assert(block < 64);
+  switch (txfm_size) {
+    case TX_32X32:
+      assert(block == 0);
+      nzc_exp =
+          (get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 12) +
+           get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 13) +
+           get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 14) +
+           get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 15) +
+           get_nzc_4x4_y(cm, cur - mis + 1, mb_row - 1, mb_col + 1, 12) +
+           get_nzc_4x4_y(cm, cur - mis + 1, mb_row - 1, mb_col + 1, 13) +
+           get_nzc_4x4_y(cm, cur - mis + 1, mb_row - 1, mb_col + 1, 14) +
+           get_nzc_4x4_y(cm, cur - mis + 1, mb_row - 1, mb_col + 1, 15) +
+           get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 3) +
+           get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 7) +
+           get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 11) +
+           get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 15) +
+           get_nzc_4x4_y(cm, cur - 1 + mis, mb_row + 1, mb_col - 1, 3) +
+           get_nzc_4x4_y(cm, cur - 1 + mis, mb_row + 1, mb_col - 1, 7) +
+           get_nzc_4x4_y(cm, cur - 1 + mis, mb_row + 1, mb_col - 1, 11) +
+           get_nzc_4x4_y(cm, cur - 1 + mis, mb_row + 1, mb_col - 1, 15)) << 2;
+      // Note nzc_exp is 64 times the average value expected at 32x32 scale
+      return choose_nzc_context(nzc_exp, NZC_T2_32X32, NZC_T1_32X32);
+      break;
+
+    case TX_16X16:
+      assert((block & 15) == 0);
+      if (block < 32) {
+        int o = (block >> 4) & 1;
+        nzc_exp =
+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 12) +
+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 13) +
+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 14) +
+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 15);
+      } else {
+        nzc_exp = cur->mbmi.nzcs[block - 32] << 4;
+      }
+      if ((block & 31) == 0) {
+        int o = block >> 5;
+        nzc_exp +=
+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 3) +
+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 7) +
+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 11) +
+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 15);
+      } else {
+        nzc_exp += cur->mbmi.nzcs[block - 16] << 4;
+      }
+      nzc_exp <<= 1;
+      // Note nzc_exp is 64 times the average value expected at 16x16 scale
+      return choose_nzc_context(nzc_exp, NZC_T2_16X16, NZC_T1_16X16);
+      break;
+
+    case TX_8X8:
+      assert((block & 3) == 0);
+      if (block < 16) {
+        int o = block >> 3;
+        int p = ((block >> 2) & 1) ? 14 : 12;
+        nzc_exp =
+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, p) +
+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, p + 1);
+      } else {
+        nzc_exp = cur->mbmi.nzcs[block - 16] << 5;
+      }
+      if ((block & 15) == 0) {
+        int o = block >> 5;
+        int p = ((block >> 4) & 1) ? 11 : 3;
+        nzc_exp +=
+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, p) +
+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, p + 4);
+      } else {
+        nzc_exp += cur->mbmi.nzcs[block - 4] << 5;
+      }
+      // Note nzc_exp is 64 times the average value expected at 8x8 scale
+      return choose_nzc_context(nzc_exp, NZC_T2_8X8, NZC_T1_8X8);
+      break;
+
+    case TX_4X4:
+      if (block < 8) {
+        int o = block >> 2;
+        int p = block & 3;
+        nzc_exp = get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o,
+                                12 + p);
+      } else {
+        nzc_exp = (cur->mbmi.nzcs[block - 8] << 6);
+      }
+      if ((block & 7) == 0) {
+        int o = block >> 5;
+        int p = (block >> 3) & 3;
+        nzc_exp += get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,
+                                 3 + 4 * p);
+      } else {
+        nzc_exp += (cur->mbmi.nzcs[block - 1] << 6);
+      }
+      nzc_exp >>= 1;
+      // Note nzc_exp is 64 times the average value expected at 4x4 scale
+      return choose_nzc_context(nzc_exp, NZC_T2_4X4, NZC_T1_4X4);
+      break;
+
+    default:
+      return 0;
+      break;
+  }
+}
+
+int vp9_get_nzc_context_y_mb16(VP9_COMMON *cm, MODE_INFO *cur,
+                               int mb_row, int mb_col, int block) {
+  // returns an index in [0, MAX_NZC_CONTEXTS - 1] to reflect how busy
+  // neighboring blocks are
+  int mis = cm->mode_info_stride;
+  int nzc_exp = 0;
+  TX_SIZE txfm_size = cur->mbmi.txfm_size;
+  assert(block < 16);
+  switch (txfm_size) {
+    case TX_16X16:
+      assert(block == 0);
+      nzc_exp =
+          get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 12) +
+          get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 13) +
+          get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 14) +
+          get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 15) +
+          get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 3) +
+          get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 7) +
+          get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 11) +
+          get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 15);
+      nzc_exp <<= 1;
+      // Note nzc_exp is 64 times the average value expected at 16x16 scale
+      return choose_nzc_context(nzc_exp, NZC_T2_16X16, NZC_T1_16X16);
+
+    case TX_8X8:
+      assert((block & 3) == 0);
+      if (block < 8) {
+        int p = ((block >> 2) & 1) ? 14 : 12;
+        nzc_exp =
+            get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, p) +
+            get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, p + 1);
+      } else {
+        nzc_exp = cur->mbmi.nzcs[block - 8] << 5;
+      }
+      if ((block & 7) == 0) {
+        int p = ((block >> 3) & 1) ? 11 : 3;
+        nzc_exp +=
+            get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, p) +
+            get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, p + 4);
+      } else {
+        nzc_exp += cur->mbmi.nzcs[block - 4] << 5;
+      }
+      // Note nzc_exp is 64 times the average value expected at 8x8 scale
+      return choose_nzc_context(nzc_exp, NZC_T2_8X8, NZC_T1_8X8);
+
+    case TX_4X4:
+      if (block < 4) {
+        int p = block & 3;
+        nzc_exp = get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col,
+                                12 + p);
+      } else {
+        nzc_exp = (cur->mbmi.nzcs[block - 4] << 6);
+      }
+      if ((block & 3) == 0) {
+        int p = (block >> 2) & 3;
+        nzc_exp += get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1,
+                                 3 + 4 * p);
+      } else {
+        nzc_exp += (cur->mbmi.nzcs[block - 1] << 6);
+      }
+      nzc_exp >>= 1;
+      // Note nzc_exp is 64 times the average value expected at 4x4 scale
+      return choose_nzc_context(nzc_exp, NZC_T2_4X4, NZC_T1_4X4);
+
+    default:
+      return 0;
+      break;
+  }
+}
+
+int vp9_get_nzc_context_uv_sb64(VP9_COMMON *cm, MODE_INFO *cur,
+                                int mb_row, int mb_col, int block) {
+  // returns an index in [0, MAX_NZC_CONTEXTS - 1] to reflect how busy
+  // neighboring blocks are
+  int mis = cm->mode_info_stride;
+  int nzc_exp = 0;
+  const int base = block - (block & 63);
+  const int boff = (block & 63);
+  const int base_mb16 = base >> 4;
+  TX_SIZE txfm_size = cur->mbmi.txfm_size;
+  TX_SIZE txfm_size_uv;
+
+  assert(block >= 256 && block < 384);
+  txfm_size_uv = txfm_size;
+
+  switch (txfm_size_uv) {
+    case TX_32X32:
+      assert(block == 256 || block == 320);
+      nzc_exp =
+          get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col,
+                         base_mb16 + 2) +
+          get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col,
+                         base_mb16 + 3) +
+          get_nzc_4x4_uv(cm, cur - mis + 1, mb_row - 1, mb_col + 1,
+                         base_mb16 + 2) +
+          get_nzc_4x4_uv(cm, cur - mis + 1, mb_row - 1, mb_col + 1,
+                         base_mb16 + 3) +
+          get_nzc_4x4_uv(cm, cur - mis + 2, mb_row - 1, mb_col + 2,
+                         base_mb16 + 2) +
+          get_nzc_4x4_uv(cm, cur - mis + 2, mb_row - 1, mb_col + 2,
+                         base_mb16 + 3) +
+          get_nzc_4x4_uv(cm, cur - mis + 3, mb_row - 1, mb_col + 3,
+                         base_mb16 + 2) +
+          get_nzc_4x4_uv(cm, cur - mis + 3, mb_row - 1, mb_col + 3,
+                         base_mb16 + 3) +
+          get_nzc_4x4_uv(cm, cur - 1, mb_row, mb_col - 1,
+                         base_mb16 + 1) +
+          get_nzc_4x4_uv(cm, cur - 1, mb_row, mb_col - 1,
+                         base_mb16 + 3) +
+          get_nzc_4x4_uv(cm, cur - 1 + mis, mb_row + 1, mb_col - 1,
+                         base_mb16 + 1) +
+          get_nzc_4x4_uv(cm, cur - 1 + mis, mb_row + 1, mb_col - 1,
+                         base_mb16 + 3) +
+          get_nzc_4x4_uv(cm, cur - 1 + 2 * mis, mb_row + 2, mb_col - 1,
+                         base_mb16 + 1) +
+          get_nzc_4x4_uv(cm, cur - 1 + 2 * mis, mb_row + 2, mb_col - 1,
+                         base_mb16 + 3) +
+          get_nzc_4x4_uv(cm, cur - 1 + 3 * mis, mb_row + 3, mb_col - 1,
+                         base_mb16 + 1) +
+          get_nzc_4x4_uv(cm, cur - 1 + 3 * mis, mb_row + 3, mb_col - 1,
+                         base_mb16 + 3);
+      nzc_exp <<= 2;
+      // Note nzc_exp is 64 times the average value expected at 32x32 scale
+      return choose_nzc_context(nzc_exp, NZC_T2_32X32, NZC_T1_32X32);
+
+    case TX_16X16:
+      // uv txfm_size 16x16
+      assert((block & 15) == 0);
+      if (boff < 32) {
+        int o = (boff >> 4) & 1;
+        nzc_exp =
+            get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,
+                           base_mb16 + 2) +
+            get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,
+                           base_mb16 + 3) +
+            get_nzc_4x4_uv(cm, cur - mis + o + 1, mb_row - 1, mb_col + o + 1,
+                           base_mb16 + 2) +
+            get_nzc_4x4_uv(cm, cur - mis + o + 1, mb_row - 1, mb_col + o + 1,
+                           base_mb16 + 3);
+      } else {
+        nzc_exp = cur->mbmi.nzcs[block - 32] << 4;
+      }
+      if ((boff & 31) == 0) {
+        int o = boff >> 5;
+        nzc_exp +=
+            get_nzc_4x4_uv(cm, cur - 1 + o * mis,
+                           mb_row + o, mb_col - 1, base_mb16 + 1) +
+            get_nzc_4x4_uv(cm, cur - 1 + o * mis,
+                           mb_row + o, mb_col - 1, base_mb16 + 3) +
+            get_nzc_4x4_uv(cm, cur - 1 + o * mis + mis,
+                           mb_row + o + 1, mb_col - 1, base_mb16 + 1) +
+            get_nzc_4x4_uv(cm, cur - 1 + o * mis + mis,
+                           mb_row + o + 1, mb_col - 1, base_mb16 + 3);
+      } else {
+        nzc_exp += cur->mbmi.nzcs[block - 16] << 4;
+      }
+      nzc_exp <<= 1;
+      // Note nzc_exp is 64 times the average value expected at 16x16 scale
+      return choose_nzc_context(nzc_exp, NZC_T2_16X16, NZC_T1_16X16);
+
+    case TX_8X8:
+      assert((block & 3) == 0);
+      if (boff < 16) {
+        int o = boff >> 2;
+        nzc_exp =
+            get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,
+                           base_mb16 + 2) +
+            get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,
+                           base_mb16 + 3);
+      } else {
+        nzc_exp = cur->mbmi.nzcs[block - 16] << 5;
+      }
+      if ((boff & 15) == 0) {
+        int o = boff >> 4;
+        nzc_exp +=
+            get_nzc_4x4_uv(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,
+                           base_mb16 + 1) +
+            get_nzc_4x4_uv(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,
+                           base_mb16 + 3);
+      } else {
+        nzc_exp += cur->mbmi.nzcs[block - 4] << 5;
+      }
+      // Note nzc_exp is 64 times the average value expected at 8x8 scale
+      return choose_nzc_context(nzc_exp, NZC_T2_8X8, NZC_T1_8X8);
+
+    case TX_4X4:
+      if (boff < 8) {
+        int o = boff >> 1;
+        int p = boff & 1;
+        nzc_exp = get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,
+                                 base_mb16 + 2 + p);
+      } else {
+        nzc_exp = (cur->mbmi.nzcs[block - 8] << 6);
+      }
+      if ((boff & 7) == 0) {
+        int o = boff >> 4;
+        int p = (boff >> 3) & 1;
+        nzc_exp += get_nzc_4x4_uv(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,
+                                  base_mb16 + 1 + 2 * p);
+      } else {
+        nzc_exp += (cur->mbmi.nzcs[block - 1] << 6);
+      }
+      nzc_exp >>= 1;
+      // Note nzc_exp is 64 times the average value expected at 4x4 scale
+      return choose_nzc_context(nzc_exp, NZC_T2_4X4, NZC_T1_4X4);
+
+    default:
+      return 0;
+  }
+}
+
+int vp9_get_nzc_context_uv_sb32(VP9_COMMON *cm, MODE_INFO *cur,
+                                int mb_row, int mb_col, int block) {
+  // returns an index in [0, MAX_NZC_CONTEXTS - 1] to reflect how busy
+  // neighboring blocks are
+  int mis = cm->mode_info_stride;
+  int nzc_exp = 0;
+  const int base = block - (block & 15);
+  const int boff = (block & 15);
+  const int base_mb16 = base >> 2;
+  TX_SIZE txfm_size = cur->mbmi.txfm_size;
+  TX_SIZE txfm_size_uv;
+
+  assert(block >= 64 && block < 96);
+  if (txfm_size == TX_32X32)
+    txfm_size_uv = TX_16X16;
+  else
+    txfm_size_uv = txfm_size;
+
+  switch (txfm_size_uv) {
+    case TX_16X16:
+      // uv txfm_size 16x16
+      assert(block == 64 || block == 80);
+      nzc_exp =
+          get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col,
+                         base_mb16 + 2) +
+          get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col,
+                         base_mb16 + 3) +
+          get_nzc_4x4_uv(cm, cur - mis + 1, mb_row - 1, mb_col + 1,
+                         base_mb16 + 2) +
+          get_nzc_4x4_uv(cm, cur - mis + 1, mb_row - 1, mb_col + 1,
+                         base_mb16 + 3) +
+          get_nzc_4x4_uv(cm, cur - 1, mb_row, mb_col - 1,
+                         base_mb16 + 1) +
+          get_nzc_4x4_uv(cm, cur - 1, mb_row, mb_col - 1,
+                         base_mb16 + 3) +
+          get_nzc_4x4_uv(cm, cur - 1 + mis, mb_row + 1, mb_col - 1,
+                         base_mb16 + 1) +
+          get_nzc_4x4_uv(cm, cur - 1 + mis, mb_row + 1, mb_col - 1,
+                         base_mb16 + 3);
+      nzc_exp <<= 1;
+      // Note nzc_exp is 64 times the average value expected at 16x16 scale
+      return choose_nzc_context(nzc_exp, NZC_T2_16X16, NZC_T1_16X16);
+      break;
+
+    case TX_8X8:
+      assert((block & 3) == 0);
+      if (boff < 8) {
+        int o = boff >> 2;
+        nzc_exp =
+            get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,
+                           base_mb16 + 2) +
+            get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,
+                           base_mb16 + 3);
+      } else {
+        nzc_exp = cur->mbmi.nzcs[block - 8] << 5;
+      }
+      if ((boff & 7) == 0) {
+        int o = boff >> 3;
+        nzc_exp +=
+            get_nzc_4x4_uv(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,
+                           base_mb16 + 1) +
+            get_nzc_4x4_uv(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,
+                           base_mb16 + 3);
+      } else {
+        nzc_exp += cur->mbmi.nzcs[block - 4] << 5;
+      }
+      // Note nzc_exp is 64 times the average value expected at 8x8 scale
+      return choose_nzc_context(nzc_exp, NZC_T2_8X8, NZC_T1_8X8);
+
+    case TX_4X4:
+      if (boff < 4) {
+        int o = boff >> 1;
+        int p = boff & 1;
+        nzc_exp = get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,
+                                 base_mb16 + 2 + p);
+      } else {
+        nzc_exp = (cur->mbmi.nzcs[block - 4] << 6);
+      }
+      if ((boff & 3) == 0) {
+        int o = boff >> 3;
+        int p = (boff >> 2) & 1;
+        nzc_exp += get_nzc_4x4_uv(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,
+                                  base_mb16 + 1 + 2 * p);
+      } else {
+        nzc_exp += (cur->mbmi.nzcs[block - 1] << 6);
+      }
+      nzc_exp >>= 1;
+      // Note nzc_exp is 64 times the average value expected at 4x4 scale
+      return choose_nzc_context(nzc_exp, NZC_T2_4X4, NZC_T1_4X4);
+
+    default:
+      return 0;
+  }
+}
+
+int vp9_get_nzc_context_uv_mb16(VP9_COMMON *cm, MODE_INFO *cur,
+                                int mb_row, int mb_col, int block) {
+  // returns an index in [0, MAX_NZC_CONTEXTS - 1] to reflect how busy
+  // neighboring blocks are
+  int mis = cm->mode_info_stride;
+  int nzc_exp = 0;
+  const int base = block - (block & 3);
+  const int boff = (block & 3);
+  const int base_mb16 = base;
+  TX_SIZE txfm_size = cur->mbmi.txfm_size;
+  TX_SIZE txfm_size_uv;
+
+  assert(block >= 16 && block < 24);
+  if (txfm_size == TX_16X16)
+    txfm_size_uv = TX_8X8;
+  else if (txfm_size == TX_8X8 &&
+           (cur->mbmi.mode == I8X8_PRED || cur->mbmi.mode == SPLITMV))
+    txfm_size_uv = TX_4X4;
+  else
+    txfm_size_uv = txfm_size;
+
+  switch (txfm_size_uv) {
+    case TX_8X8:
+      assert((block & 3) == 0);
+      nzc_exp =
+          get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col, base_mb16 + 2) +
+          get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col, base_mb16 + 3) +
+          get_nzc_4x4_uv(cm, cur - 1, mb_row, mb_col - 1, base_mb16 + 1) +
+          get_nzc_4x4_uv(cm, cur - 1, mb_row, mb_col - 1, base_mb16 + 3);
+      // Note nzc_exp is 64 times the average value expected at 8x8 scale
+      return choose_nzc_context(nzc_exp, NZC_T2_8X8, NZC_T1_8X8);
+
+    case TX_4X4:
+      if (boff < 2) {
+        int p = boff & 1;
+        nzc_exp = get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col,
+                                 base_mb16 + 2 + p);
+      } else {
+        nzc_exp = (cur->mbmi.nzcs[block - 2] << 6);
+      }
+      if ((boff & 1) == 0) {
+        int p = (boff >> 1) & 1;
+        nzc_exp += get_nzc_4x4_uv(cm, cur - 1, mb_row, mb_col - 1,
+                                  base_mb16 + 1 + 2 * p);
+      } else {
+        nzc_exp += (cur->mbmi.nzcs[block - 1] << 6);
+      }
+      nzc_exp >>= 1;
+      // Note nzc_exp is 64 times the average value expected at 4x4 scale
+      return choose_nzc_context(nzc_exp, NZC_T2_4X4, NZC_T1_4X4);
+
+    default:
+      return 0;
+  }
+}
+
+int vp9_get_nzc_context(VP9_COMMON *cm, MACROBLOCKD *xd, int block) {
+  if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64) {
+    assert(block < 384);
+    if (block < 256)
+      return vp9_get_nzc_context_y_sb64(cm, xd->mode_info_context,
+                                        get_mb_row(xd), get_mb_col(xd), block);
+    else
+      return vp9_get_nzc_context_uv_sb64(cm, xd->mode_info_context,
+                                         get_mb_row(xd), get_mb_col(xd), block);
+  } else if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB32X32) {
+    assert(block < 96);
+    if (block < 64)
+      return vp9_get_nzc_context_y_sb32(cm, xd->mode_info_context,
+                                        get_mb_row(xd), get_mb_col(xd), block);
+    else
+      return vp9_get_nzc_context_uv_sb32(cm, xd->mode_info_context,
+                                         get_mb_row(xd), get_mb_col(xd), block);
+  } else {
+    assert(block < 64);
+    if (block < 16)
+      return vp9_get_nzc_context_y_mb16(cm, xd->mode_info_context,
+                                        get_mb_row(xd), get_mb_col(xd), block);
+    else
+      return vp9_get_nzc_context_uv_mb16(cm, xd->mode_info_context,
+                                         get_mb_row(xd), get_mb_col(xd), block);
+  }
+}
+
+static void update_nzc(VP9_COMMON *cm,
+                       uint16_t nzc,
+                       int nzc_context,
+                       TX_SIZE tx_size,
+                       int ref,
+                       int type) {
+  int e, c;
+  c = codenzc(nzc);
+  if (tx_size == TX_32X32)
+    cm->fc.nzc_counts_32x32[nzc_context][ref][type][c]++;
+  else if (tx_size == TX_16X16)
+    cm->fc.nzc_counts_16x16[nzc_context][ref][type][c]++;
+  else if (tx_size == TX_8X8)
+    cm->fc.nzc_counts_8x8[nzc_context][ref][type][c]++;
+  else if (tx_size == TX_4X4)
+    cm->fc.nzc_counts_4x4[nzc_context][ref][type][c]++;
+  else
+    assert(0);
+
+  if ((e = vp9_extranzcbits[c])) {
+    int x = nzc - vp9_basenzcvalue[c];
+    while (e--) {
+      int b = (x >> e) & 1;
+      cm->fc.nzc_pcat_counts[nzc_context][c - NZC_TOKENS_NOEXTRA][e][b]++;
+    }
+  }
+}
+
+static void update_nzcs_sb64(VP9_COMMON *cm,
+                             MACROBLOCKD *xd,
+                             int mb_row,
+                             int mb_col) {
+  MODE_INFO *m = xd->mode_info_context;
+  MB_MODE_INFO *const mi = &m->mbmi;
+  int j, nzc_context;
+  const int ref = m->mbmi.ref_frame != INTRA_FRAME;
+
+  assert(mb_col == get_mb_col(xd));
+  assert(mb_row == get_mb_row(xd));
+
+  if (mi->mb_skip_coeff)
+    return;
+
+  switch (mi->txfm_size) {
+    case TX_32X32:
+      for (j = 0; j < 256; j += 64) {
+        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_32X32, ref, 0);
+      }
+      for (j = 256; j < 384; j += 64) {
+        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_32X32, ref, 1);
+      }
+      break;
+
+    case TX_16X16:
+      for (j = 0; j < 256; j += 16) {
+        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 0);
+      }
+      for (j = 256; j < 384; j += 16) {
+        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 1);
+      }
+      break;
+
+    case TX_8X8:
+      for (j = 0; j < 256; j += 4) {
+        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 0);
+      }
+      for (j = 256; j < 384; j += 4) {
+        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1);
+      }
+      break;
+
+    case TX_4X4:
+      for (j = 0; j < 256; ++j) {
+        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 0);
+      }
+      for (j = 256; j < 384; ++j) {
+        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1);
+      }
+      break;
+
+    default:
+      break;
+  }
+}
+
+static void update_nzcs_sb32(VP9_COMMON *cm,
+                            MACROBLOCKD *xd,
+                            int mb_row,
+                            int mb_col) {
+  MODE_INFO *m = xd->mode_info_context;
+  MB_MODE_INFO *const mi = &m->mbmi;
+  int j, nzc_context;
+  const int ref = m->mbmi.ref_frame != INTRA_FRAME;
+
+  assert(mb_col == get_mb_col(xd));
+  assert(mb_row == get_mb_row(xd));
+
+  if (mi->mb_skip_coeff)
+    return;
+
+  switch (mi->txfm_size) {
+    case TX_32X32:
+      for (j = 0; j < 64; j += 64) {
+        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_32X32, ref, 0);
+      }
+      for (j = 64; j < 96; j += 16) {
+        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 1);
+      }
+      break;
+
+    case TX_16X16:
+      for (j = 0; j < 64; j += 16) {
+        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 0);
+      }
+      for (j = 64; j < 96; j += 16) {
+        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 1);
+      }
+      break;
+
+    case TX_8X8:
+      for (j = 0; j < 64; j += 4) {
+        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 0);
+      }
+      for (j = 64; j < 96; j += 4) {
+        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1);
+      }
+      break;
+
+    case TX_4X4:
+      for (j = 0; j < 64; ++j) {
+        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 0);
+      }
+      for (j = 64; j < 96; ++j) {
+        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1);
+      }
+      break;
+
+    default:
+      break;
+  }
+}
+
+static void update_nzcs_mb16(VP9_COMMON *cm,
+                             MACROBLOCKD *xd,
+                             int mb_row,
+                             int mb_col) {
+  MODE_INFO *m = xd->mode_info_context;
+  MB_MODE_INFO *const mi = &m->mbmi;
+  int j, nzc_context;
+  const int ref = m->mbmi.ref_frame != INTRA_FRAME;
+
+  assert(mb_col == get_mb_col(xd));
+  assert(mb_row == get_mb_row(xd));
+
+  if (mi->mb_skip_coeff)
+    return;
+
+  switch (mi->txfm_size) {
+    case TX_16X16:
+      for (j = 0; j < 16; j += 16) {
+        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);
+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 0);
+      }
+      for (j = 16; j < 24; j += 4) {
+        nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1);
+      }
+      break;
+
+    case TX_8X8:
+      for (j = 0; j < 16; j += 4) {
+        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);
+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 0);
+      }
+      if (mi->mode == I8X8_PRED || mi->mode == SPLITMV) {
+        for (j = 16; j < 24; ++j) {
+          nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
+          update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1);
+        }
+      } else {
+        for (j = 16; j < 24; j += 4) {
+          nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
+          update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1);
+        }
+      }
+      break;
+
+    case TX_4X4:
+      for (j = 0; j < 16; ++j) {
+        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);
+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 0);
+      }
+      for (j = 16; j < 24; ++j) {
+        nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1);
+      }
+      break;
+
+    default:
+      break;
+  }
+}
+
+void vp9_update_nzc_counts(VP9_COMMON *cm,
+                           MACROBLOCKD *xd,
+                           int mb_row,
+                           int mb_col) {
+  if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64)
+    update_nzcs_sb64(cm, xd, mb_row, mb_col);
+  else if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB32X32)
+    update_nzcs_sb32(cm, xd, mb_row, mb_col);
+  else
+    update_nzcs_mb16(cm, xd, mb_row, mb_col);
+}
+#endif  // CONFIG_CODE_NONZEROCOUNT
+
 // #define COEF_COUNT_TESTING
 
 #define COEF_COUNT_SAT 24
@@ -277,10 +1475,10 @@
 #define COEF_COUNT_SAT_AFTER_KEY 24
 #define COEF_MAX_UPDATE_FACTOR_AFTER_KEY 128
 
-static void update_coef_probs(vp9_coeff_probs *dst_coef_probs,
-                              vp9_coeff_probs *pre_coef_probs,
-                              int block_types, vp9_coeff_count *coef_counts,
-                              int count_sat, int update_factor) {
+static void adapt_coef_probs(vp9_coeff_probs *dst_coef_probs,
+                             vp9_coeff_probs *pre_coef_probs,
+                             int block_types, vp9_coeff_count *coef_counts,
+                             int count_sat, int update_factor) {
   int t, i, j, k, l, count;
   unsigned int branch_ct[ENTROPY_NODES][2];
   vp9_prob coef_probs[ENTROPY_NODES];
@@ -307,9 +1505,6 @@
 }
 
 void vp9_adapt_coef_probs(VP9_COMMON *cm) {
-#ifdef COEF_COUNT_TESTING
-  int t, i, j, k;
-#endif
   int count_sat;
   int update_factor; /* denominator 256 */
 
@@ -325,16 +1520,143 @@
     count_sat = COEF_COUNT_SAT;
   }
 
-  update_coef_probs(cm->fc.coef_probs_4x4, cm->fc.pre_coef_probs_4x4,
-                    BLOCK_TYPES, cm->fc.coef_counts_4x4,
-                    count_sat, update_factor);
-  update_coef_probs(cm->fc.coef_probs_8x8, cm->fc.pre_coef_probs_8x8,
-                    BLOCK_TYPES, cm->fc.coef_counts_8x8,
-                    count_sat, update_factor);
-  update_coef_probs(cm->fc.coef_probs_16x16, cm->fc.pre_coef_probs_16x16,
-                    BLOCK_TYPES, cm->fc.coef_counts_16x16,
-                    count_sat, update_factor);
-  update_coef_probs(cm->fc.coef_probs_32x32, cm->fc.pre_coef_probs_32x32,
-                    BLOCK_TYPES, cm->fc.coef_counts_32x32,
-                    count_sat, update_factor);
+  adapt_coef_probs(cm->fc.coef_probs_4x4, cm->fc.pre_coef_probs_4x4,
+                   BLOCK_TYPES, cm->fc.coef_counts_4x4,
+                   count_sat, update_factor);
+  adapt_coef_probs(cm->fc.coef_probs_8x8, cm->fc.pre_coef_probs_8x8,
+                   BLOCK_TYPES, cm->fc.coef_counts_8x8,
+                   count_sat, update_factor);
+  adapt_coef_probs(cm->fc.coef_probs_16x16, cm->fc.pre_coef_probs_16x16,
+                   BLOCK_TYPES, cm->fc.coef_counts_16x16,
+                   count_sat, update_factor);
+  adapt_coef_probs(cm->fc.coef_probs_32x32, cm->fc.pre_coef_probs_32x32,
+                   BLOCK_TYPES, cm->fc.coef_counts_32x32,
+                   count_sat, update_factor);
 }
+
+#if CONFIG_CODE_NONZEROCOUNT
+static void adapt_nzc_probs(VP9_COMMON *cm,
+                            int block_size,
+                            int count_sat,
+                            int update_factor) {
+  int c, r, b, n;
+  int count, factor;
+  unsigned int nzc_branch_ct[NZC32X32_NODES][2];
+  vp9_prob nzc_probs[NZC32X32_NODES];
+  int tokens, nodes;
+  const vp9_tree_index *nzc_tree;
+  const struct vp9_token_struct *nzc_encodings;
+  vp9_prob *dst_nzc_probs;
+  vp9_prob *pre_nzc_probs;
+  unsigned int *nzc_counts;
+
+  if (block_size == 32) {
+    tokens = NZC32X32_TOKENS;
+    nzc_tree = vp9_nzc32x32_tree;
+    nzc_encodings = vp9_nzc32x32_encodings;
+    dst_nzc_probs = cm->fc.nzc_probs_32x32[0][0][0];
+    pre_nzc_probs = cm->fc.pre_nzc_probs_32x32[0][0][0];
+    nzc_counts = cm->fc.nzc_counts_32x32[0][0][0];
+  } else if (block_size == 16) {
+    tokens = NZC16X16_TOKENS;
+    nzc_tree = vp9_nzc16x16_tree;
+    nzc_encodings = vp9_nzc16x16_encodings;
+    dst_nzc_probs = cm->fc.nzc_probs_16x16[0][0][0];
+    pre_nzc_probs = cm->fc.pre_nzc_probs_16x16[0][0][0];
+    nzc_counts = cm->fc.nzc_counts_16x16[0][0][0];
+  } else if (block_size == 8) {
+    tokens = NZC8X8_TOKENS;
+    nzc_tree = vp9_nzc8x8_tree;
+    nzc_encodings = vp9_nzc8x8_encodings;
+    dst_nzc_probs = cm->fc.nzc_probs_8x8[0][0][0];
+    pre_nzc_probs = cm->fc.pre_nzc_probs_8x8[0][0][0];
+    nzc_counts = cm->fc.nzc_counts_8x8[0][0][0];
+  } else {
+    nzc_tree = vp9_nzc4x4_tree;
+    nzc_encodings = vp9_nzc4x4_encodings;
+    tokens = NZC4X4_TOKENS;
+    dst_nzc_probs = cm->fc.nzc_probs_4x4[0][0][0];
+    pre_nzc_probs = cm->fc.pre_nzc_probs_4x4[0][0][0];
+    nzc_counts = cm->fc.nzc_counts_4x4[0][0][0];
+  }
+  nodes = tokens - 1;
+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c)
+    for (r = 0; r < REF_TYPES; ++r)
+      for (b = 0; b < BLOCK_TYPES; ++b) {
+        int offset = c * REF_TYPES * BLOCK_TYPES + r * BLOCK_TYPES + b;
+        int offset_nodes = offset * nodes;
+        int offset_tokens = offset * tokens;
+        vp9_tree_probs_from_distribution(tokens,
+                                         nzc_encodings, nzc_tree,
+                                         nzc_probs, nzc_branch_ct,
+                                         nzc_counts + offset_tokens);
+        for (n = 0; n < nodes; ++n) {
+          count = nzc_branch_ct[n][0] + nzc_branch_ct[n][1];
+          count = count > count_sat ? count_sat : count;
+          factor = (update_factor * count / count_sat);
+          dst_nzc_probs[offset_nodes + n] =
+              weighted_prob(pre_nzc_probs[offset_nodes + n],
+                            nzc_probs[n], factor);
+        }
+      }
+}
+
+static void adapt_nzc_pcat(VP9_COMMON *cm, int count_sat, int update_factor) {
+  int c, t;
+  int count, factor;
+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
+    for (t = 0; t < NZC_TOKENS_EXTRA; ++t) {
+      int bits = vp9_extranzcbits[t + NZC_TOKENS_NOEXTRA];
+      int b;
+      for (b = 0; b < bits; ++b) {
+        vp9_prob prob = get_binary_prob(cm->fc.nzc_pcat_counts[c][t][b][0],
+                                        cm->fc.nzc_pcat_counts[c][t][b][1]);
+        count = cm->fc.nzc_pcat_counts[c][t][b][0] +
+                cm->fc.nzc_pcat_counts[c][t][b][1];
+        count = count > count_sat ? count_sat : count;
+        factor = (update_factor * count / count_sat);
+        cm->fc.nzc_pcat_probs[c][t][b] = weighted_prob(
+            cm->fc.pre_nzc_pcat_probs[c][t][b], prob, factor);
+      }
+    }
+  }
+}
+
+// #define NZC_COUNT_TESTING
+void vp9_adapt_nzc_probs(VP9_COMMON *cm) {
+  int count_sat;
+  int update_factor; /* denominator 256 */
+#ifdef NZC_COUNT_TESTING
+  int c, r, b, t;
+  printf("\n");
+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c)
+    for (r = 0; r < REF_TYPES; ++r) {
+      for (b = 0; b < BLOCK_TYPES; ++b) {
+        printf("    {");
+        for (t = 0; t < NZC4X4_TOKENS; ++t) {
+          printf(" %d,", cm->fc.nzc_counts_4x4[c][r][b][t]);
+        }
+        printf("}\n");
+      }
+      printf("\n");
+    }
+#endif
+
+  if (cm->frame_type == KEY_FRAME) {
+    update_factor = COEF_MAX_UPDATE_FACTOR_KEY;
+    count_sat = COEF_COUNT_SAT_KEY;
+  } else if (cm->last_frame_type == KEY_FRAME) {
+    update_factor = COEF_MAX_UPDATE_FACTOR_AFTER_KEY;  /* adapt quickly */
+    count_sat = COEF_COUNT_SAT_AFTER_KEY;
+  } else {
+    update_factor = COEF_MAX_UPDATE_FACTOR;
+    count_sat = COEF_COUNT_SAT;
+  }
+
+  adapt_nzc_probs(cm, 4, count_sat, update_factor);
+  adapt_nzc_probs(cm, 8, count_sat, update_factor);
+  adapt_nzc_probs(cm, 16, count_sat, update_factor);
+  adapt_nzc_probs(cm, 32, count_sat, update_factor);
+  adapt_nzc_pcat(cm, count_sat, update_factor);
+}
+#endif  // CONFIG_CODE_NONZEROCOUNT
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index 8d28b00..ceef1a7 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -142,4 +142,86 @@
 }
 extern int vp9_get_coef_context(int * recent_energy, int token);
 
+#if CONFIG_CODE_NONZEROCOUNT
+/* Alphabet for number of non-zero symbols in block */
+#define NZC_0                   0       /* Used for all blocks */
+#define NZC_1                   1       /* Used for all blocks */
+#define NZC_2                   2       /* Used for all blocks */
+#define NZC_3TO4                3       /* Used for all blocks */
+#define NZC_5TO8                4       /* Used for all blocks */
+#define NZC_9TO16               5       /* Used for all blocks */
+#define NZC_17TO32              6       /* Used for 8x8 and larger blocks */
+#define NZC_33TO64              7       /* Used for 8x8 and larger blocks */
+#define NZC_65TO128             8       /* Used for 16x16 and larger blocks */
+#define NZC_129TO256            9       /* Used for 16x16 and larger blocks */
+#define NZC_257TO512           10       /* Used for 32x32 and larger blocks */
+#define NZC_513TO1024          11       /* Used for 32x32 and larger blocks */
+
+/* Number of tokens for each block size */
+#define NZC4X4_TOKENS           6
+#define NZC8X8_TOKENS           8
+#define NZC16X16_TOKENS        10
+#define NZC32X32_TOKENS        12
+
+/* Number of nodes for each block size */
+#define NZC4X4_NODES            5
+#define NZC8X8_NODES            7
+#define NZC16X16_NODES          9
+#define NZC32X32_NODES         11
+
+/* Max number of tokens with extra bits */
+#define NZC_TOKENS_EXTRA        9
+
+/* Max number of extra bits */
+#define NZC_BITS_EXTRA          9
+
+/* Tokens without extra bits */
+#define NZC_TOKENS_NOEXTRA      (NZC32X32_TOKENS - NZC_TOKENS_EXTRA)
+
+#define MAX_NZC_CONTEXTS        3
+
+/* whether to update extra bit probabilities */
+#define NZC_PCAT_UPDATE
+
+/* nzc trees */
+extern const vp9_tree_index    vp9_nzc4x4_tree[];
+extern const vp9_tree_index    vp9_nzc8x8_tree[];
+extern const vp9_tree_index    vp9_nzc16x16_tree[];
+extern const vp9_tree_index    vp9_nzc32x32_tree[];
+
+/* nzc encodings */
+extern struct vp9_token_struct  vp9_nzc4x4_encodings[NZC4X4_TOKENS];
+extern struct vp9_token_struct  vp9_nzc8x8_encodings[NZC8X8_TOKENS];
+extern struct vp9_token_struct  vp9_nzc16x16_encodings[NZC16X16_TOKENS];
+extern struct vp9_token_struct  vp9_nzc32x32_encodings[NZC32X32_TOKENS];
+
+#define codenzc(x) (\
+  (x) <= 3 ? (x) : (x) <= 4 ? 3 : (x) <= 8 ? 4 : \
+  (x) <= 16 ? 5 : (x) <= 32 ? 6 : (x) <= 64 ? 7 :\
+  (x) <= 128 ? 8 : (x) <= 256 ? 9 : (x) <= 512 ? 10 : 11)
+
+int vp9_get_nzc_context_y_sb64(struct VP9Common *cm, MODE_INFO *cur,
+                               int mb_row, int mb_col, int block);
+int vp9_get_nzc_context_y_sb32(struct VP9Common *cm, MODE_INFO *cur,
+                               int mb_row, int mb_col, int block);
+int vp9_get_nzc_context_y_mb16(struct VP9Common *cm, MODE_INFO *cur,
+                               int mb_row, int mb_col, int block);
+int vp9_get_nzc_context_uv_sb64(struct VP9Common *cm, MODE_INFO *cur,
+                                int mb_row, int mb_col, int block);
+int vp9_get_nzc_context_uv_sb32(struct VP9Common *cm, MODE_INFO *cur,
+                                int mb_row, int mb_col, int block);
+int vp9_get_nzc_context_uv_mb16(struct VP9Common *cm, MODE_INFO *cur,
+                                int mb_row, int mb_col, int block);
+int vp9_get_nzc_context(struct VP9Common *cm, MACROBLOCKD *xd, int block);
+void vp9_update_nzc_counts(struct VP9Common *cm, MACROBLOCKD *xd,
+                           int mb_row, int mb_col);
+void vp9_adapt_nzc_probs(struct VP9Common *cm);
+
+/* Extra bits array */
+extern const int vp9_extranzcbits[NZC32X32_TOKENS];
+
+/* Base nzc values */
+extern const int vp9_basenzcvalue[NZC32X32_TOKENS];
+
+#endif  // CONFIG_CODE_NONZEROCOUNT
 #endif  // VP9_COMMON_VP9_ENTROPY_H_
diff --git a/vp9/common/vp9_entropymv.c b/vp9/common/vp9_entropymv.c
index 56cebff..9a7be45 100644
--- a/vp9/common/vp9_entropymv.c
+++ b/vp9/common/vp9_entropymv.c
@@ -42,7 +42,9 @@
   -MV_CLASS_2, -MV_CLASS_3,
   10, 12,
   -MV_CLASS_4, -MV_CLASS_5,
+  14, 16,
   -MV_CLASS_6, -MV_CLASS_7,
+  -MV_CLASS_8, -MV_CLASS_9,
 };
 struct vp9_token_struct vp9_mv_class_encodings[MV_CLASSES];
 
@@ -63,9 +65,9 @@
   {
     { /* vert component */
       128,                                             /* sign */
-      {224, 144, 192, 168, 192, 176, 192},             /* class */
+      {224, 144, 192, 168, 192, 176, 192, 198, 198},   /* class */
       {216},                                           /* class0 */
-      {136, 140, 148, 160, 176, 192, 224},             /* bits */
+      {136, 140, 148, 160, 176, 192, 224, 234, 234},   /* bits */
       {{128, 128, 64}, {96, 112, 64}},                 /* class0_fp */
       {64, 96, 64},                                    /* fp */
       160,                                             /* class0_hp bit */
@@ -73,9 +75,9 @@
     },
     { /* hor component */
       128,                                             /* sign */
-      {216, 128, 176, 160, 176, 176, 192},             /* class */
+      {216, 128, 176, 160, 176, 176, 192, 198, 198},   /* class */
       {208},                                           /* class0 */
-      {136, 140, 148, 160, 176, 192, 224},             /* bits */
+      {136, 140, 148, 160, 176, 192, 224, 234, 234},   /* bits */
       {{128, 128, 64}, {96, 112, 64}},                 /* class0_fp */
       {64, 96, 64},                                    /* fp */
       160,                                             /* class0_hp bit */
@@ -103,6 +105,8 @@
   else if (z < CLASS0_SIZE * 256)  c = MV_CLASS_5;
   else if (z < CLASS0_SIZE * 512)  c = MV_CLASS_6;
   else if (z < CLASS0_SIZE * 1024) c = MV_CLASS_7;
+  else if (z < CLASS0_SIZE * 2048) c = MV_CLASS_8;
+  else if (z < CLASS0_SIZE * 4096) c = MV_CLASS_9;
   else assert(0);
   if (offset)
     *offset = z - mv_class_base(c);
@@ -134,6 +138,7 @@
                                     int incr,
                                     int usehp) {
   int s, z, c, o, d, e, f;
+  if (!incr) return;
   assert (v != 0);            /* should not be zero */
   s = v < 0;
   mvcomp->sign[s] += incr;
diff --git a/vp9/common/vp9_entropymv.h b/vp9/common/vp9_entropymv.h
index f5cfee9..3350006 100644
--- a/vp9/common/vp9_entropymv.h
+++ b/vp9/common/vp9_entropymv.h
@@ -49,7 +49,7 @@
 extern struct vp9_token_struct vp9_mv_joint_encodings [MV_JOINTS];
 
 /* Symbols for coding magnitude class of nonzero components */
-#define MV_CLASSES     8
+#define MV_CLASSES     10
 typedef enum {
   MV_CLASS_0 = 0,      /* (0, 2]     integer pel */
   MV_CLASS_1 = 1,      /* (2, 4]     integer pel */
@@ -59,6 +59,8 @@
   MV_CLASS_5 = 5,      /* (32, 64]   integer pel */
   MV_CLASS_6 = 6,      /* (64, 128]  integer pel */
   MV_CLASS_7 = 7,      /* (128, 256] integer pel */
+  MV_CLASS_8 = 8,      /* (256, 512] integer pel */
+  MV_CLASS_9 = 9,      /* (512, 1024] integer pel */
 } MV_CLASS_TYPE;
 
 extern const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2];
diff --git a/vp9/common/vp9_findnearmv.c b/vp9/common/vp9_findnearmv.c
index 2f709bf..8409885 100644
--- a/vp9/common/vp9_findnearmv.c
+++ b/vp9/common/vp9_findnearmv.c
@@ -9,10 +9,11 @@
  */
 
 
+#include <limits.h>
+
 #include "vp9/common/vp9_findnearmv.h"
 #include "vp9/common/vp9_sadmxn.h"
 #include "vp9/common/vp9_subpelvar.h"
-#include <limits.h>
 
 const uint8_t vp9_mbsplit_offset[4][16] = {
   { 0,  8,  0,  0,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
@@ -32,8 +33,7 @@
 }
 
 vp9_prob *vp9_mv_ref_probs(VP9_COMMON *pc,
-                           vp9_prob p[4], const int context
-                          ) {
+                           vp9_prob p[4], const int context) {
   p[0] = pc->fc.vp9_mode_contexts[context][0];
   p[1] = pc->fc.vp9_mode_contexts[context][1];
   p[2] = pc->fc.vp9_mode_contexts[context][2];
diff --git a/vp9/common/vp9_findnearmv.h b/vp9/common/vp9_findnearmv.h
index c42aab1..6887b04 100644
--- a/vp9/common/vp9_findnearmv.h
+++ b/vp9/common/vp9_findnearmv.h
@@ -17,6 +17,9 @@
 #include "vp9/common/vp9_treecoder.h"
 #include "vp9/common/vp9_onyxc_int.h"
 
+#define LEFT_TOP_MARGIN (16 << 3)
+#define RIGHT_BOTTOM_MARGIN (16 << 3)
+
 /* check a list of motion vectors by sad score using a number rows of pixels
  * above and a number cols of pixels in the left to select the one with best
  * score to use as ref motion vector
@@ -30,8 +33,7 @@
 
 static void mv_bias(int refmb_ref_frame_sign_bias, int refframe,
                     int_mv *mvp, const int *ref_frame_sign_bias) {
-  MV xmv;
-  xmv = mvp->as_mv;
+  MV xmv = mvp->as_mv;
 
   if (refmb_ref_frame_sign_bias != ref_frame_sign_bias[refframe]) {
     xmv.row *= -1;
@@ -41,8 +43,6 @@
   mvp->as_mv = xmv;
 }
 
-#define LEFT_TOP_MARGIN (16 << 3)
-#define RIGHT_BOTTOM_MARGIN (16 << 3)
 
 static void clamp_mv(int_mv *mv,
                      int mb_to_left_edge,
@@ -72,10 +72,10 @@
                                     int mb_to_right_edge,
                                     int mb_to_top_edge,
                                     int mb_to_bottom_edge) {
-  return (mv->as_mv.col < mb_to_left_edge) ||
-         (mv->as_mv.col > mb_to_right_edge) ||
-         (mv->as_mv.row < mb_to_top_edge) ||
-         (mv->as_mv.row > mb_to_bottom_edge);
+  return mv->as_mv.col < mb_to_left_edge ||
+         mv->as_mv.col > mb_to_right_edge ||
+         mv->as_mv.row < mb_to_top_edge ||
+         mv->as_mv.row > mb_to_bottom_edge;
 }
 
 vp9_prob *vp9_mv_ref_probs(VP9_COMMON *pc,
@@ -90,11 +90,12 @@
     if (!xd->left_available)
       return 0;
 
-    /* On L edge, get from MB to left of us */
+    // On L edge, get from MB to left of us
     --cur_mb;
 
     if (cur_mb->mbmi.mode != SPLITMV)
       return cur_mb->mbmi.mv[0].as_int;
+
     b += 4;
   }
 
diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h
index bc79b5c..9e55adf 100644
--- a/vp9/common/vp9_idct.h
+++ b/vp9/common/vp9_idct.h
@@ -15,17 +15,12 @@
 
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
-
-#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))
-
-/* If we don't want to use ROUND_POWER_OF_TWO macro
-static INLINE int16_t round_power_of_two(int16_t value, int n) {
-  return (value + (1 << (n - 1))) >> n;
-}*/
+#include "vp9/common/vp9_common.h"
 
 // Constants and Macros used by all idct/dct functions
 #define DCT_CONST_BITS 14
 #define DCT_CONST_ROUNDING  (1 << (DCT_CONST_BITS - 1))
+
 // Constants are round(16384 * cos(k*Pi/64)) where k = 1 to 31.
 // Note: sin(k*Pi/64) = cos((32-k)*Pi/64)
 static const int cospi_1_64  = 16364;
@@ -67,13 +62,13 @@
 static const int sinpi_4_9 = 15212;
 
 static INLINE int dct_const_round_shift(int input) {
-  int rv = (input + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+  int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
   assert(INT16_MIN <= rv && rv <= INT16_MAX);
   return rv;
 }
 
 static INLINE int dct_32_round(int input) {
-  int rv = (input + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+  int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
   assert(-131072 <= rv && rv <= 131071);
   return rv;
 }
diff --git a/vp9/common/vp9_idctllm.c b/vp9/common/vp9_idctllm.c
index 54b79ee..e210625 100644
--- a/vp9/common/vp9_idctllm.c
+++ b/vp9/common/vp9_idctllm.c
@@ -26,6 +26,7 @@
 #include <math.h>
 
 #include "./vpx_config.h"
+#include "./vp9_rtcd.h"
 #include "vp9/common/vp9_systemdependent.h"
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_common.h"
@@ -109,7 +110,7 @@
   }
 }
 
-static void idct4_1d(int16_t *input, int16_t *output) {
+void vp9_idct4_1d_c(int16_t *input, int16_t *output) {
   int16_t step[4];
   int temp1, temp2;
   // stage 1
@@ -140,7 +141,7 @@
   for (i = 0; i < 4; ++i) {
     for (j = 0; j < 4; ++j)
       temp_in[j] = input[j];
-    idct4_1d(temp_in, outptr);
+    vp9_idct4_1d(temp_in, outptr);
     input += 4;
     outptr += 4;
   }
@@ -149,7 +150,7 @@
   for (i = 0; i < 4; ++i) {
     for (j = 0; j < 4; ++j)
       temp_in[j] = out[j * 4 + i];
-    idct4_1d(temp_in, temp_out);
+    vp9_idct4_1d(temp_in, temp_out);
     for (j = 0; j < 4; ++j)
       output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 4);
   }
@@ -205,7 +206,7 @@
   step1[6] = dct_const_round_shift(temp2);
 
   // stage 2 & stage 3 - even half
-  idct4_1d(step1, step1);
+  vp9_idct4_1d(step1, step1);
 
   // stage 2 - odd half
   step2[4] = step1[4] + step1[5];
@@ -298,24 +299,23 @@
   output[3] = dct_const_round_shift(s3);
 }
 
-static const transform_2d IHT_4[] = {
-  { idct4_1d,  idct4_1d  },  // DCT_DCT  = 0
-  { iadst4_1d, idct4_1d  },  // ADST_DCT = 1
-  { idct4_1d,  iadst4_1d },  // DCT_ADST = 2
-  { iadst4_1d, iadst4_1d }   // ADST_ADST = 3
-};
-
 void vp9_short_iht4x4_c(int16_t *input, int16_t *output,
-                        int pitch, TX_TYPE tx_type) {
+                        int pitch, int tx_type) {
+  const transform_2d IHT_4[] = {
+    { vp9_idct4_1d,  vp9_idct4_1d  },  // DCT_DCT  = 0
+    { iadst4_1d, vp9_idct4_1d  },      // ADST_DCT = 1
+    { vp9_idct4_1d,  iadst4_1d },      // DCT_ADST = 2
+    { iadst4_1d, iadst4_1d }           // ADST_ADST = 3
+  };
+
   int i, j;
   int16_t out[4 * 4];
   int16_t *outptr = out;
   int16_t temp_in[4], temp_out[4];
-  const transform_2d ht = IHT_4[tx_type];
 
   // inverse transform row vectors
   for (i = 0; i < 4; ++i) {
-    ht.rows(input, outptr);
+    IHT_4[tx_type].rows(input, outptr);
     input  += 4;
     outptr += 4;
   }
@@ -324,7 +324,7 @@
   for (i = 0; i < 4; ++i) {
     for (j = 0; j < 4; ++j)
       temp_in[j] = out[j * 4 + i];
-    ht.cols(temp_in, temp_out);
+    IHT_4[tx_type].cols(temp_in, temp_out);
     for (j = 0; j < 4; ++j)
       output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 4);
   }
@@ -415,7 +415,7 @@
 };
 
 void vp9_short_iht8x8_c(int16_t *input, int16_t *output,
-                        int pitch, TX_TYPE tx_type) {
+                        int pitch, int tx_type) {
   int i, j;
   int16_t out[8 * 8];
   int16_t *outptr = out;
@@ -838,7 +838,7 @@
 };
 
 void vp9_short_iht16x16_c(int16_t *input, int16_t *output,
-                          int pitch, TX_TYPE tx_type) {
+                          int pitch, int tx_type) {
   int i, j;
   int16_t out[16 * 16];
   int16_t *outptr = out;
diff --git a/vp9/common/vp9_invtrans.c b/vp9/common/vp9_invtrans.c
index a26415f..a03a66e 100644
--- a/vp9/common/vp9_invtrans.c
+++ b/vp9/common/vp9_invtrans.c
@@ -24,7 +24,7 @@
   int i;
 
   for (i = 0; i < 16; i++) {
-    TX_TYPE tx_type = get_tx_type_4x4(xd, &xd->block[i]);
+    TX_TYPE tx_type = get_tx_type_4x4(xd, i);
     if (tx_type != DCT_DCT) {
       vp9_short_iht4x4(xd->block[i].dqcoeff, xd->block[i].diff, 16, tx_type);
     } else {
@@ -58,7 +58,7 @@
   BLOCKD *blockd = xd->block;
 
   for (i = 0; i < 9; i += 8) {
-    TX_TYPE tx_type = get_tx_type_8x8(xd, &xd->block[i]);
+    TX_TYPE tx_type = get_tx_type_8x8(xd, i);
     if (tx_type != DCT_DCT) {
       vp9_short_iht8x8(xd->block[i].dqcoeff, xd->block[i].diff, 16, tx_type);
     } else {
@@ -67,7 +67,7 @@
     }
   }
   for (i = 2; i < 11; i += 8) {
-    TX_TYPE tx_type = get_tx_type_8x8(xd, &xd->block[i]);
+    TX_TYPE tx_type = get_tx_type_8x8(xd, i);
     if (tx_type != DCT_DCT) {
       vp9_short_iht8x8(xd->block[i + 2].dqcoeff, xd->block[i].diff,
                            16, tx_type);
@@ -100,7 +100,7 @@
 
 void vp9_inverse_transform_mby_16x16(MACROBLOCKD *xd) {
   BLOCKD *bd = &xd->block[0];
-  TX_TYPE tx_type = get_tx_type_16x16(xd, bd);
+  TX_TYPE tx_type = get_tx_type_16x16(xd, 0);
   if (tx_type != DCT_DCT) {
     vp9_short_iht16x16(bd->dqcoeff, bd->diff, 16, tx_type);
   } else {
@@ -123,9 +123,16 @@
 
   for (n = 0; n < 4; n++) {
     const int x_idx = n & 1, y_idx = n >> 1;
+    const TX_TYPE tx_type = get_tx_type_16x16(xd, (y_idx * 8 + x_idx) * 4);
 
-    vp9_inverse_transform_b_16x16(xd->dqcoeff + n * 256,
-                                  xd->diff + x_idx * 16 + y_idx * 32 * 16, 64);
+    if (tx_type == DCT_DCT) {
+      vp9_inverse_transform_b_16x16(xd->dqcoeff + n * 256,
+                                    xd->diff + x_idx * 16 + y_idx * 32 * 16,
+                                    64);
+    } else {
+      vp9_short_iht16x16(xd->dqcoeff + n * 256,
+                         xd->diff + x_idx * 16 + y_idx * 32 * 16, 32, tx_type);
+    }
   }
 }
 
@@ -134,9 +141,15 @@
 
   for (n = 0; n < 16; n++) {
     const int x_idx = n & 3, y_idx = n >> 2;
+    const TX_TYPE tx_type = get_tx_type_8x8(xd, (y_idx * 8 + x_idx) * 2);
 
-    vp9_inverse_transform_b_8x8(xd->dqcoeff + n * 64,
-                                xd->diff + x_idx * 8 + y_idx * 32 * 8, 64);
+    if (tx_type == DCT_DCT) {
+      vp9_inverse_transform_b_8x8(xd->dqcoeff + n * 64,
+                                  xd->diff + x_idx * 8 + y_idx * 32 * 8, 64);
+    } else {
+      vp9_short_iht8x8(xd->dqcoeff + n * 64,
+                       xd->diff + x_idx * 8 + y_idx * 32 * 8, 32, tx_type);
+    }
   }
 }
 
@@ -145,9 +158,15 @@
 
   for (n = 0; n < 64; n++) {
     const int x_idx = n & 7, y_idx = n >> 3;
+    const TX_TYPE tx_type = get_tx_type_4x4(xd, y_idx * 8 + x_idx);
 
-    vp9_inverse_transform_b_4x4(xd, xd->eobs[n], xd->dqcoeff + n * 16,
-                                xd->diff + x_idx * 4 + y_idx * 4 * 32, 64);
+    if (tx_type == DCT_DCT) {
+      vp9_inverse_transform_b_4x4(xd, xd->eobs[n], xd->dqcoeff + n * 16,
+                                  xd->diff + x_idx * 4 + y_idx * 4 * 32, 64);
+    } else {
+      vp9_short_iht4x4(xd->dqcoeff + n * 16,
+                       xd->diff + x_idx * 4 + y_idx * 4 * 32, 32, tx_type);
+    }
   }
 }
 
@@ -206,9 +225,16 @@
 
   for (n = 0; n < 16; n++) {
     const int x_idx = n & 3, y_idx = n >> 2;
+    const TX_TYPE tx_type = get_tx_type_16x16(xd, (y_idx * 16 + x_idx) * 4);
 
-    vp9_inverse_transform_b_16x16(xd->dqcoeff + n * 256,
-                                  xd->diff + x_idx * 16 + y_idx * 64 * 16, 128);
+    if (tx_type == DCT_DCT) {
+      vp9_inverse_transform_b_16x16(xd->dqcoeff + n * 256,
+                                    xd->diff + x_idx * 16 + y_idx * 64 * 16,
+                                    128);
+    } else {
+      vp9_short_iht16x16(xd->dqcoeff + n * 256,
+                         xd->diff + x_idx * 16 + y_idx * 64 * 16, 64, tx_type);
+    }
   }
 }
 
@@ -217,9 +243,15 @@
 
   for (n = 0; n < 64; n++) {
     const int x_idx = n & 7, y_idx = n >> 3;
+    const TX_TYPE tx_type = get_tx_type_8x8(xd, (y_idx * 16 + x_idx) * 2);
 
-    vp9_inverse_transform_b_8x8(xd->dqcoeff + n * 64,
-                                xd->diff + x_idx * 8 + y_idx * 64 * 8, 128);
+    if (tx_type == DCT_DCT) {
+      vp9_inverse_transform_b_8x8(xd->dqcoeff + n * 64,
+                                  xd->diff + x_idx * 8 + y_idx * 64 * 8, 128);
+    } else {
+      vp9_short_iht8x8(xd->dqcoeff + n * 64,
+                       xd->diff + x_idx * 8 + y_idx * 64 * 8, 64, tx_type);
+    }
   }
 }
 
@@ -228,9 +260,15 @@
 
   for (n = 0; n < 256; n++) {
     const int x_idx = n & 15, y_idx = n >> 4;
+    const TX_TYPE tx_type = get_tx_type_4x4(xd, y_idx * 16 + x_idx);
 
-    vp9_inverse_transform_b_4x4(xd, xd->eobs[n], xd->dqcoeff + n * 16,
-                                xd->diff + x_idx * 4 + y_idx * 4 * 64, 128);
+    if (tx_type == DCT_DCT) {
+      vp9_inverse_transform_b_4x4(xd, xd->eobs[n], xd->dqcoeff + n * 16,
+                                  xd->diff + x_idx * 4 + y_idx * 4 * 64, 128);
+    } else {
+      vp9_short_iht4x4(xd->dqcoeff + n * 16,
+                       xd->diff + x_idx * 4 + y_idx * 4 * 64, 64, tx_type);
+    }
   }
 }
 
diff --git a/vp9/common/vp9_maskingmv.c b/vp9/common/vp9_maskingmv.c
index f1151e3..326201b 100644
--- a/vp9/common/vp9_maskingmv.c
+++ b/vp9/common/vp9_maskingmv.c
@@ -11,25 +11,19 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-extern unsigned int vp9_sad16x16_sse3(
+
+unsigned int vp9_sad16x16_sse3(
   unsigned char *src_ptr,
   int  src_stride,
   unsigned char *ref_ptr,
   int  ref_stride,
   int  max_err);
 
-extern void vp9_sad16x16x3_sse3(
-  unsigned char *src_ptr,
-  int  src_stride,
-  unsigned char *ref_ptr,
-  int  ref_stride,
-  int  *results);
-
-extern int vp8_growmaskmb_sse3(
+int vp8_growmaskmb_sse3(
   unsigned char *om,
   unsigned char *nm);
 
-extern void vp8_makemask_sse3(
+void vp8_makemask_sse3(
   unsigned char *y,
   unsigned char *u,
   unsigned char *v,
@@ -238,6 +232,7 @@
   for (i = 0; i < 256; i++)
     ym[i] = nym[i];
 }
+
 void make_mb_mask(unsigned char *y, unsigned char *u, unsigned char *v,
                   unsigned char *ym, unsigned char *uvm,
                   int yp, int uvp,
@@ -283,6 +278,7 @@
 
   return sad;
 }
+
 int unmasked_sad(unsigned char *src, int p, unsigned char *dst, int dp,
                  unsigned char *ym) {
   int i, j;
@@ -294,6 +290,7 @@
 
   return sad;
 }
+
 int masked_motion_search(unsigned char *y, unsigned char *u, unsigned char *v,
                          int yp, int uvp,
                          unsigned char *dy, unsigned char *du, unsigned char *dv,
@@ -802,5 +799,5 @@
   }
   fclose(f);
   fclose(g);
-  return;
+  return 0;
 }
diff --git a/vp9/common/vp9_mbpitch.c b/vp9/common/vp9_mbpitch.c
index ed96292..8d99335 100644
--- a/vp9/common/vp9_mbpitch.c
+++ b/vp9/common/vp9_mbpitch.c
@@ -20,15 +20,15 @@
                         int mv_stride,
                         uint8_t **base,
                         uint8_t **base2,
-                        int Stride,
+                        int stride,
                         int offset,
                         BLOCKSET bs) {
   if (bs == DEST) {
-    b->dst_stride = Stride;
+    b->dst_stride = stride;
     b->dst = offset;
     b->base_dst = base;
   } else {
-    b->pre_stride = Stride;
+    b->pre_stride = stride;
     b->pre = offset;
     b->base_pre = base;
     b->base_second_pre = base2;
diff --git a/vp9/common/vp9_mvref_common.c b/vp9/common/vp9_mvref_common.c
index 25aa53b..2f322a3 100644
--- a/vp9/common/vp9_mvref_common.c
+++ b/vp9/common/vp9_mvref_common.c
@@ -11,23 +11,27 @@
 #include "vp9/common/vp9_mvref_common.h"
 
 #define MVREF_NEIGHBOURS 8
+
 static int mb_mv_ref_search[MVREF_NEIGHBOURS][2] = {
     {0, -1}, {-1, 0}, {-1, -1}, {0, -2},
     {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2}
 };
+
 static int mb_ref_distance_weight[MVREF_NEIGHBOURS] =
   { 3, 3, 2, 1, 1, 1, 1, 1 };
+
 static int sb_mv_ref_search[MVREF_NEIGHBOURS][2] = {
     {0, -1}, {-1, 0}, {1, -1}, {-1, 1},
     {-1, -1}, {0, -2}, {-2, 0}, {-1, -2}
 };
+
 static int sb_ref_distance_weight[MVREF_NEIGHBOURS] =
   { 3, 3, 2, 2, 2, 1, 1, 1 };
 
 // clamp_mv_ref
 #define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units
-static void clamp_mv_ref(const MACROBLOCKD *xd, int_mv *mv) {
 
+static void clamp_mv_ref(const MACROBLOCKD *xd, int_mv *mv) {
   if (mv->as_mv.col < (xd->mb_to_left_edge - MV_BORDER))
     mv->as_mv.col = xd->mb_to_left_edge - MV_BORDER;
   else if (mv->as_mv.col > xd->mb_to_right_edge + MV_BORDER)
@@ -41,11 +45,9 @@
 
 // Gets a candidate refenence motion vector from the given mode info
 // structure if one exists that matches the given reference frame.
-static int get_matching_candidate(
-  const MODE_INFO *candidate_mi,
+static int get_matching_candidate(const MODE_INFO *candidate_mi,
   MV_REFERENCE_FRAME ref_frame,
-  int_mv *c_mv
-) {
+  int_mv *c_mv) {
   int ret_val = TRUE;
 
   if (ref_frame == candidate_mi->mbmi.ref_frame) {
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index 48d19a3..c8369eb 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -58,10 +58,23 @@
   vp9_prob i8x8_mode_prob[VP9_I8X8_MODES - 1];
   vp9_prob sub_mv_ref_prob[SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
   vp9_prob mbsplit_prob[VP9_NUMMBSPLITS - 1];
+
   vp9_coeff_probs coef_probs_4x4[BLOCK_TYPES];
   vp9_coeff_probs coef_probs_8x8[BLOCK_TYPES];
   vp9_coeff_probs coef_probs_16x16[BLOCK_TYPES];
   vp9_coeff_probs coef_probs_32x32[BLOCK_TYPES];
+#if CONFIG_CODE_NONZEROCOUNT
+  vp9_prob nzc_probs_4x4[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
+                        [NZC4X4_NODES];
+  vp9_prob nzc_probs_8x8[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
+                        [NZC8X8_NODES];
+  vp9_prob nzc_probs_16x16[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
+                          [NZC16X16_NODES];
+  vp9_prob nzc_probs_32x32[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
+                          [NZC32X32_NODES];
+  vp9_prob nzc_pcat_probs[MAX_NZC_CONTEXTS]
+                         [NZC_TOKENS_EXTRA][NZC_BITS_EXTRA];
+#endif
 
   nmv_context nmvc;
   nmv_context pre_nmvc;
@@ -84,11 +97,35 @@
   vp9_coeff_probs pre_coef_probs_8x8[BLOCK_TYPES];
   vp9_coeff_probs pre_coef_probs_16x16[BLOCK_TYPES];
   vp9_coeff_probs pre_coef_probs_32x32[BLOCK_TYPES];
+#if CONFIG_CODE_NONZEROCOUNT
+  vp9_prob pre_nzc_probs_4x4[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
+                            [NZC4X4_NODES];
+  vp9_prob pre_nzc_probs_8x8[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
+                            [NZC8X8_NODES];
+  vp9_prob pre_nzc_probs_16x16[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
+                              [NZC16X16_NODES];
+  vp9_prob pre_nzc_probs_32x32[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
+                              [NZC32X32_NODES];
+  vp9_prob pre_nzc_pcat_probs[MAX_NZC_CONTEXTS]
+                             [NZC_TOKENS_EXTRA][NZC_BITS_EXTRA];
+#endif
 
   vp9_coeff_count coef_counts_4x4[BLOCK_TYPES];
   vp9_coeff_count coef_counts_8x8[BLOCK_TYPES];
   vp9_coeff_count coef_counts_16x16[BLOCK_TYPES];
   vp9_coeff_count coef_counts_32x32[BLOCK_TYPES];
+#if CONFIG_CODE_NONZEROCOUNT
+  unsigned int nzc_counts_4x4[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
+                             [NZC4X4_TOKENS];
+  unsigned int nzc_counts_8x8[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
+                             [NZC8X8_TOKENS];
+  unsigned int nzc_counts_16x16[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
+                               [NZC16X16_TOKENS];
+  unsigned int nzc_counts_32x32[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
+                               [NZC32X32_TOKENS];
+  unsigned int nzc_pcat_counts[MAX_NZC_CONTEXTS]
+                              [NZC_TOKENS_EXTRA][NZC_BITS_EXTRA][2];
+#endif
 
   nmv_context_counts NMVcount;
   vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
@@ -300,4 +337,31 @@
   buf[new_idx]++;
 }
 
+// TODO(debargha): merge the two functions
+static void set_mb_row(VP9_COMMON *cm, MACROBLOCKD *xd,
+                       int mb_row, int block_size) {
+  xd->mb_to_top_edge    = -((mb_row * 16) << 3);
+  xd->mb_to_bottom_edge = ((cm->mb_rows - block_size - mb_row) * 16) << 3;
+
+  // Are edges available for intra prediction?
+  xd->up_available    = (mb_row != 0);
+}
+
+static void set_mb_col(VP9_COMMON *cm, MACROBLOCKD *xd,
+                       int mb_col, int block_size) {
+  xd->mb_to_left_edge   = -((mb_col * 16) << 3);
+  xd->mb_to_right_edge  = ((cm->mb_cols - block_size - mb_col) * 16) << 3;
+
+  // Are edges available for intra prediction?
+  xd->left_available  = (mb_col > cm->cur_tile_mb_col_start);
+  xd->right_available = (mb_col + block_size < cm->cur_tile_mb_col_end);
+}
+
+static int get_mb_row(const MACROBLOCKD *xd) {
+  return ((-xd->mb_to_top_edge) >> 7);
+}
+
+static int get_mb_col(const MACROBLOCKD *xd) {
+  return ((-xd->mb_to_left_edge) >> 7);
+}
 #endif  // VP9_COMMON_VP9_ONYXC_INT_H_
diff --git a/vp9/common/vp9_postproc.h b/vp9/common/vp9_postproc.h
index 11f55ab..0a637f0 100644
--- a/vp9/common/vp9_postproc.h
+++ b/vp9/common/vp9_postproc.h
@@ -13,30 +13,26 @@
 #define VP9_COMMON_VP9_POSTPROC_H_
 
 #include "vpx_ports/mem.h"
+
 struct postproc_state {
-  int           last_q;
-  int           last_noise;
-  char          noise[3072];
+  int last_q;
+  int last_noise;
+  char noise[3072];
   DECLARE_ALIGNED(16, char, blackclamp[16]);
   DECLARE_ALIGNED(16, char, whiteclamp[16]);
   DECLARE_ALIGNED(16, char, bothclamp[16]);
 };
+
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_ppflags.h"
+
 int vp9_post_proc_frame(struct VP9Common *oci, YV12_BUFFER_CONFIG *dest,
                         vp9_ppflags_t *flags);
 
+void vp9_de_noise(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *post,
+                  int q, int low_var_thresh, int flag);
 
-void vp9_de_noise(YV12_BUFFER_CONFIG         *source,
-                  YV12_BUFFER_CONFIG         *post,
-                  int                         q,
-                  int                         low_var_thresh,
-                  int                         flag);
-
-void vp9_deblock(YV12_BUFFER_CONFIG         *source,
-                 YV12_BUFFER_CONFIG         *post,
-                 int                         q,
-                 int                         low_var_thresh,
-                 int                         flag);
+void vp9_deblock(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *post,
+                 int q, int low_var_thresh, int flag);
 
 #endif  // VP9_COMMON_VP9_POSTPROC_H_
diff --git a/vp9/common/vp9_pragmas.h b/vp9/common/vp9_pragmas.h
index cbeaf53..f079161 100644
--- a/vp9/common/vp9_pragmas.h
+++ b/vp9/common/vp9_pragmas.h
@@ -14,6 +14,7 @@
 #ifdef __INTEL_COMPILER
 #pragma warning(disable:997 1011 170)
 #endif
+
 #ifdef _MSC_VER
 #pragma warning(disable:4799)
 #endif
diff --git a/vp9/common/vp9_pred_common.c b/vp9/common/vp9_pred_common.c
index 41a4e00..9fe66fc 100644
--- a/vp9/common/vp9_pred_common.c
+++ b/vp9/common/vp9_pred_common.c
@@ -34,7 +34,6 @@
         pred_context += (m - 1)->mbmi.seg_id_predicted;
       break;
 
-
     case PRED_REF:
       pred_context = (m - cm->mode_info_stride)->mbmi.ref_predicted;
       if (xd->left_available)
@@ -101,8 +100,7 @@
       break;
 
     default:
-      // TODO *** add error trap code.
-      pred_context = 0;
+      pred_context = 0;  // *** add error trap code.
       break;
   }
 
@@ -114,39 +112,23 @@
 vp9_prob vp9_get_pred_prob(const VP9_COMMON *const cm,
                           const MACROBLOCKD *const xd,
                           PRED_ID pred_id) {
-  vp9_prob pred_probability;
-  int pred_context;
-
-  // Get the appropriate prediction context
-  pred_context = vp9_get_pred_context(cm, xd, pred_id);
+  const int pred_context = vp9_get_pred_context(cm, xd, pred_id);
 
   switch (pred_id) {
     case PRED_SEG_ID:
-      pred_probability = cm->segment_pred_probs[pred_context];
-      break;
-
+      return cm->segment_pred_probs[pred_context];
     case PRED_REF:
-      pred_probability = cm->ref_pred_probs[pred_context];
-      break;
-
+      return cm->ref_pred_probs[pred_context];
     case PRED_COMP:
       // In keeping with convention elsewhre the probability returned is
       // the probability of a "0" outcome which in this case means the
       // probability of comp pred off.
-      pred_probability = cm->prob_comppred[pred_context];
-      break;
-
+      return cm->prob_comppred[pred_context];
     case PRED_MBSKIP:
-      pred_probability = cm->mbskip_pred_probs[pred_context];
-      break;
-
+      return cm->mbskip_pred_probs[pred_context];
     default:
-      // TODO *** add error trap code.
-      pred_probability = 128;
-      break;
+      return 128;  // *** add error trap code.
   }
-
-  return pred_probability;
 }
 
 // This function returns a context probability ptr for coding a given
@@ -154,71 +136,41 @@
 const vp9_prob *vp9_get_pred_probs(const VP9_COMMON *const cm,
                                    const MACROBLOCKD *const xd,
                                    PRED_ID pred_id) {
-  const vp9_prob *pred_probability;
-  int pred_context;
-
-  // Get the appropriate prediction context
-  pred_context = vp9_get_pred_context(cm, xd, pred_id);
+  const int pred_context = vp9_get_pred_context(cm, xd, pred_id);
 
   switch (pred_id) {
     case PRED_SEG_ID:
-      pred_probability = &cm->segment_pred_probs[pred_context];
-      break;
-
+      return &cm->segment_pred_probs[pred_context];
     case PRED_REF:
-      pred_probability = &cm->ref_pred_probs[pred_context];
-      break;
-
+      return &cm->ref_pred_probs[pred_context];
     case PRED_COMP:
       // In keeping with convention elsewhre the probability returned is
       // the probability of a "0" outcome which in this case means the
       // probability of comp pred off.
-      pred_probability = &cm->prob_comppred[pred_context];
-      break;
-
+      return &cm->prob_comppred[pred_context];
     case PRED_MBSKIP:
-      pred_probability = &cm->mbskip_pred_probs[pred_context];
-      break;
-
+      return &cm->mbskip_pred_probs[pred_context];
     case PRED_SWITCHABLE_INTERP:
-      pred_probability = &cm->fc.switchable_interp_prob[pred_context][0];
-      break;
-
+      return &cm->fc.switchable_interp_prob[pred_context][0];
     default:
-      // TODO *** add error trap code.
-      pred_probability = NULL;
-      break;
+      return NULL;  // *** add error trap code.
   }
-
-  return pred_probability;
 }
 
 // This function returns the status of the given prediction signal.
 // I.e. is the predicted value for the given signal correct.
 unsigned char vp9_get_pred_flag(const MACROBLOCKD *const xd,
                                 PRED_ID pred_id) {
-  unsigned char pred_flag = 0;
-
   switch (pred_id) {
     case PRED_SEG_ID:
-      pred_flag = xd->mode_info_context->mbmi.seg_id_predicted;
-      break;
-
+      return xd->mode_info_context->mbmi.seg_id_predicted;
     case PRED_REF:
-      pred_flag = xd->mode_info_context->mbmi.ref_predicted;
-      break;
-
+      return xd->mode_info_context->mbmi.ref_predicted;
     case PRED_MBSKIP:
-      pred_flag = xd->mode_info_context->mbmi.mb_skip_coeff;
-      break;
-
+      return xd->mode_info_context->mbmi.mb_skip_coeff;
     default:
-      // TODO *** add error trap code.
-      pred_flag = 0;
-      break;
+      return 0;  // *** add error trap code.
   }
-
-  return pred_flag;
 }
 
 // This function sets the status of the given prediction signal.
@@ -280,7 +232,7 @@
       break;
 
     default:
-      // TODO *** add error trap code.
+      // *** add error trap code.
       break;
   }
 }
@@ -325,7 +277,6 @@
   MV_REFERENCE_FRAME pred_ref = LAST_FRAME;
 
   int segment_id = xd->mode_info_context->mbmi.segment_id;
-  int seg_ref_active;
   int i;
 
   unsigned char frame_allowed[MAX_REF_FRAMES] = {1, 1, 1, 1};
@@ -336,7 +287,7 @@
   unsigned char above_left_in_image;
 
   // Is segment coding ennabled
-  seg_ref_active = vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME);
+  int seg_ref_active = vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME);
 
   // Special case treatment if segment coding is enabled.
   // Dont allow prediction of a reference frame that the segment
@@ -389,9 +340,7 @@
 // Functions to computes a set of modified reference frame probabilities
 // to use when the prediction of the reference frame value fails
 void vp9_calc_ref_probs(int *count, vp9_prob *probs) {
-  int tot_count;
-
-  tot_count = count[0] + count[1] + count[2] + count[3];
+  int tot_count = count[0] + count[1] + count[2] + count[3];
   probs[0] = get_prob(count[0], tot_count);
 
   tot_count -= count[0];
@@ -407,19 +356,12 @@
 // they are not allowed for a given segment.
 void vp9_compute_mod_refprobs(VP9_COMMON *const cm) {
   int norm_cnt[MAX_REF_FRAMES];
-  int intra_count;
-  int inter_count;
-  int last_count;
-  int gfarf_count;
-  int gf_count;
-  int arf_count;
-
-  intra_count = cm->prob_intra_coded;
-  inter_count = (255 - intra_count);
-  last_count = (inter_count * cm->prob_last_coded) / 255;
-  gfarf_count = inter_count - last_count;
-  gf_count = (gfarf_count * cm->prob_gf_coded) / 255;
-  arf_count = gfarf_count - gf_count;
+  const int intra_count = cm->prob_intra_coded;
+  const int inter_count = (255 - intra_count);
+  const int last_count = (inter_count * cm->prob_last_coded) / 255;
+  const int gfarf_count = inter_count - last_count;
+  const int gf_count = (gfarf_count * cm->prob_gf_coded) / 255;
+  const int arf_count = gfarf_count - gf_count;
 
   // Work out modified reference frame probabilities to use where prediction
   // of the reference frame fails
diff --git a/vp9/common/vp9_pred_common.h b/vp9/common/vp9_pred_common.h
index 52c4d42..49dcf0a 100644
--- a/vp9/common/vp9_pred_common.h
+++ b/vp9/common/vp9_pred_common.h
@@ -8,48 +8,48 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "vp9/common/vp9_onyxc_int.h"
-#include "vp9/common/vp9_blockd.h"
-
 #ifndef VP9_COMMON_VP9_PRED_COMMON_H_
 #define VP9_COMMON_VP9_PRED_COMMON_H_
 
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_onyxc_int.h"
 
 // Predicted items
 typedef enum {
-  PRED_SEG_ID = 0,               // Segment identifier
+  PRED_SEG_ID = 0,  // Segment identifier
   PRED_REF = 1,
   PRED_COMP = 2,
   PRED_MBSKIP = 3,
   PRED_SWITCHABLE_INTERP = 4
 } PRED_ID;
 
-extern unsigned char vp9_get_pred_context(const VP9_COMMON *const cm,
-                                          const MACROBLOCKD *const xd,
-                                          PRED_ID pred_id);
+unsigned char vp9_get_pred_context(const VP9_COMMON *const cm,
+                                   const MACROBLOCKD *const xd,
+                                   PRED_ID pred_id);
 
-extern vp9_prob vp9_get_pred_prob(const VP9_COMMON *const cm,
-                                  const MACROBLOCKD *const xd,
-                                  PRED_ID pred_id);
+vp9_prob vp9_get_pred_prob(const VP9_COMMON *const cm,
+                           const MACROBLOCKD *const xd,
+                           PRED_ID pred_id);
 
-extern const vp9_prob *vp9_get_pred_probs(const VP9_COMMON *const cm,
-                                          const MACROBLOCKD *const xd,
-                                          PRED_ID pred_id);
+const vp9_prob *vp9_get_pred_probs(const VP9_COMMON *const cm,
+                                   const MACROBLOCKD *const xd,
+                                   PRED_ID pred_id);
 
-extern unsigned char vp9_get_pred_flag(const MACROBLOCKD *const xd,
-                                       PRED_ID pred_id);
+unsigned char vp9_get_pred_flag(const MACROBLOCKD *const xd,
+                                PRED_ID pred_id);
 
-extern void vp9_set_pred_flag(MACROBLOCKD *const xd,
-                              PRED_ID pred_id,
-                              unsigned char pred_flag);
+void vp9_set_pred_flag(MACROBLOCKD *const xd,
+                       PRED_ID pred_id,
+                       unsigned char pred_flag);
 
 
-extern unsigned char vp9_get_pred_mb_segid(const VP9_COMMON *const cm,
-                                           const MACROBLOCKD *const xd,
-                                           int MbIndex);
+unsigned char vp9_get_pred_mb_segid(const VP9_COMMON *const cm,
+                                    const MACROBLOCKD *const xd,
+                                    int MbIndex);
 
-extern MV_REFERENCE_FRAME vp9_get_pred_ref(const VP9_COMMON *const cm,
-                                       const MACROBLOCKD *const xd);
-extern void vp9_compute_mod_refprobs(VP9_COMMON *const cm);
+MV_REFERENCE_FRAME vp9_get_pred_ref(const VP9_COMMON *const cm,
+                                    const MACROBLOCKD *const xd);
+
+void vp9_compute_mod_refprobs(VP9_COMMON *const cm);
 
 #endif  // VP9_COMMON_VP9_PRED_COMMON_H_
diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c
index eb8de21..186532c 100644
--- a/vp9/common/vp9_reconintra.c
+++ b/vp9/common/vp9_reconintra.c
@@ -9,20 +9,19 @@
  */
 
 #include <stdio.h>
+
 #include "./vpx_config.h"
 #include "vp9_rtcd.h"
 #include "vp9/common/vp9_reconintra.h"
 #include "vpx_mem/vpx_mem.h"
 
-/* For skip_recon_mb(), add vp9_build_intra_predictors_mby_s(MACROBLOCKD *xd)
- * and vp9_build_intra_predictors_mbuv_s(MACROBLOCKD *xd).
- */
+// For skip_recon_mb(), add vp9_build_intra_predictors_mby_s(MACROBLOCKD *xd)
+// and vp9_build_intra_predictors_mbuv_s(MACROBLOCKD *xd).
 
-/* Using multiplication and shifting instead of division in diagonal prediction.
- * iscale table is calculated from ((1<<16) + (i+2)/2) / (i+2) and used as
- * ((A + B) * iscale[i] + (1<<15)) >> 16;
- * where A and B are weighted pixel values.
- */
+// Using multiplication and shifting instead of division in diagonal prediction.
+// iscale table is calculated from ((1 << 16) + (i + 2) / 2) / (i + 2), used as
+// ((A + B) * iscale[i] + (1 << 15)) >> 16;
+// where A and B are weighted pixel values.
 static const unsigned int iscale[64] = {
   32768, 21845, 16384, 13107, 10923,  9362,  8192,  7282,
    6554,  5958,  5461,  5041,  4681,  4369,  4096,  3855,
@@ -34,101 +33,107 @@
    1130,  1111,  1092,  1074,  1057,  1040,  1024,  1008,
 };
 
+static INLINE int iscale_round(int value, int i) {
+  return ROUND_POWER_OF_TWO(value * iscale[i], 16);
+}
 
 static void d27_predictor(uint8_t *ypred_ptr, int y_stride, int n,
                           uint8_t *yabove_row, uint8_t *yleft_col) {
-  int r, c, h, w, v;
-  int a, b;
+  int r, c;
+
   r = 0;
   for (c = 0; c < n - 2; c++) {
-    if (c & 1)
-      a = yleft_col[r + 1];
-    else
-      a = (yleft_col[r] + yleft_col[r + 1] + 1) >> 1;
-    b = yabove_row[c + 2];
-    ypred_ptr[c] = ((2 * a + (c + 1) * b) * iscale[1+c] + (1<<15)) >> 16;
+    int a = c & 1 ? yleft_col[r + 1]
+                  : ROUND_POWER_OF_TWO(yleft_col[r] + yleft_col[r + 1], 1);
+    int b = yabove_row[c + 2];
+    ypred_ptr[c] = iscale_round(2 * a + (c + 1) * b, 1 + c);
   }
+
   for (r = 1; r < n / 2 - 1; r++) {
     for (c = 0; c < n - 2 - 2 * r; c++) {
-      if (c & 1)
-        a = yleft_col[r + 1];
-      else
-        a = (yleft_col[r] + yleft_col[r + 1] + 1) >> 1;
-      b = ypred_ptr[(r - 1) * y_stride + c + 2];
-      ypred_ptr[r * y_stride + c] =
-                ((2 * a + (c + 1) * b) * iscale[1+c] + (1<<15)) >> 16;
+      int a = c & 1 ? yleft_col[r + 1]
+                    : ROUND_POWER_OF_TWO(yleft_col[r] + yleft_col[r + 1], 1);
+      int b = ypred_ptr[(r - 1) * y_stride + c + 2];
+      ypred_ptr[r * y_stride + c] = iscale_round(2 * a + (c + 1) * b, 1 + c);
     }
   }
-  for (; r < n - 1; ++r) {
+
+  for (; r < n - 1; r++) {
     for (c = 0; c < n; c++) {
-      v = (c & 1 ? yleft_col[r + 1] : (yleft_col[r] + yleft_col[r + 1] + 1) >> 1);
-      h = r - c / 2;
+      int v = c & 1 ? yleft_col[r + 1]
+                    : ROUND_POWER_OF_TWO(yleft_col[r] + yleft_col[r + 1], 1);
+      int h = r - c / 2;
       ypred_ptr[h * y_stride + c] = v;
     }
   }
+
   c = 0;
   r = n - 1;
-  ypred_ptr[r * y_stride] = (ypred_ptr[(r - 1) * y_stride] +
-                             yleft_col[r] + 1) >> 1;
+  ypred_ptr[r * y_stride] = ROUND_POWER_OF_TWO(ypred_ptr[(r - 1) * y_stride] +
+                                               yleft_col[r], 1);
   for (r = n - 2; r >= n / 2; --r) {
-    w = c + (n - 1 - r) * 2;
-    ypred_ptr[r * y_stride + w] = (ypred_ptr[(r - 1) * y_stride + w] +
-                                   ypred_ptr[r * y_stride + w - 1] + 1) >> 1;
+    int w = c + (n - 1 - r) * 2;
+    ypred_ptr[r * y_stride + w] =
+        ROUND_POWER_OF_TWO(ypred_ptr[(r - 1) * y_stride + w] +
+                           ypred_ptr[r * y_stride + w - 1], 1);
   }
+
   for (c = 1; c < n; c++) {
     for (r = n - 1; r >= n / 2 + c / 2; --r) {
-      w = c + (n - 1 - r) * 2;
-      ypred_ptr[r * y_stride + w] = (ypred_ptr[(r - 1) * y_stride + w] +
-                                     ypred_ptr[r * y_stride + w - 1] + 1) >> 1;
+      int w = c + (n - 1 - r) * 2;
+      ypred_ptr[r * y_stride + w] =
+          ROUND_POWER_OF_TWO(ypred_ptr[(r - 1) * y_stride + w] +
+                             ypred_ptr[r * y_stride + w - 1], 1);
     }
   }
 }
 
 static void d63_predictor(uint8_t *ypred_ptr, int y_stride, int n,
                           uint8_t *yabove_row, uint8_t *yleft_col) {
-  int r, c, h, w, v;
-  int a, b;
+  int r, c;
+
   c = 0;
   for (r = 0; r < n - 2; r++) {
-    if (r & 1)
-      a = yabove_row[c + 1];
-    else
-      a = (yabove_row[c] + yabove_row[c + 1] + 1) >> 1;
-    b = yleft_col[r + 2];
-    ypred_ptr[r * y_stride] = ((2 * a + (r + 1) * b) * iscale[1+r] +
-                              (1<<15)) >> 16;
+    int a = r & 1 ? yabove_row[c + 1]
+                  : ROUND_POWER_OF_TWO(yabove_row[c] + yabove_row[c + 1], 1);
+    int b = yleft_col[r + 2];
+    ypred_ptr[r * y_stride] = iscale_round(2 * a + (r + 1) * b, 1 + r);
   }
+
   for (c = 1; c < n / 2 - 1; c++) {
     for (r = 0; r < n - 2 - 2 * c; r++) {
-      if (r & 1)
-        a = yabove_row[c + 1];
-      else
-        a = (yabove_row[c] + yabove_row[c + 1] + 1) >> 1;
-      b = ypred_ptr[(r + 2) * y_stride + c - 1];
-      ypred_ptr[r * y_stride + c] = ((2 * a + (c + 1) * b) * iscale[1+c] +
-                                    (1<<15)) >> 16;
+      int a = r & 1 ? yabove_row[c + 1]
+                    : ROUND_POWER_OF_TWO(yabove_row[c] + yabove_row[c + 1], 1);
+      int b = ypred_ptr[(r + 2) * y_stride + c - 1];
+      ypred_ptr[r * y_stride + c] = iscale_round(2 * a + (c + 1) * b, 1 + c);
     }
   }
+
   for (; c < n - 1; ++c) {
     for (r = 0; r < n; r++) {
-      v = (r & 1 ? yabove_row[c + 1] : (yabove_row[c] + yabove_row[c + 1] + 1) >> 1);
-      w = c - r / 2;
+      int v = r & 1 ? yabove_row[c + 1]
+                    : ROUND_POWER_OF_TWO(yabove_row[c] + yabove_row[c + 1], 1);
+      int w = c - r / 2;
       ypred_ptr[r * y_stride + w] = v;
     }
   }
+
   r = 0;
   c = n - 1;
-  ypred_ptr[c] = (ypred_ptr[(c - 1)] + yabove_row[c] + 1) >> 1;
+  ypred_ptr[c] = ROUND_POWER_OF_TWO(ypred_ptr[(c - 1)] + yabove_row[c], 1);
   for (c = n - 2; c >= n / 2; --c) {
-    h = r + (n - 1 - c) * 2;
-    ypred_ptr[h * y_stride + c] = (ypred_ptr[h * y_stride + c - 1] +
-                                   ypred_ptr[(h - 1) * y_stride + c] + 1) >> 1;
+    int h = r + (n - 1 - c) * 2;
+    ypred_ptr[h * y_stride + c] =
+        ROUND_POWER_OF_TWO(ypred_ptr[h * y_stride + c - 1] +
+                           ypred_ptr[(h - 1) * y_stride + c], 1);
   }
+
   for (r = 1; r < n; r++) {
     for (c = n - 1; c >= n / 2 + r / 2; --c) {
-      h = r + (n - 1 - c) * 2;
-      ypred_ptr[h * y_stride + c] = (ypred_ptr[h * y_stride + c - 1] +
-                                     ypred_ptr[(h - 1) * y_stride + c] + 1) >> 1;
+      int h = r + (n - 1 - c) * 2;
+      ypred_ptr[h * y_stride + c] =
+          ROUND_POWER_OF_TWO(ypred_ptr[h * y_stride + c - 1] +
+                             ypred_ptr[(h - 1) * y_stride + c], 1);
     }
   }
 }
@@ -136,27 +141,28 @@
 static void d45_predictor(uint8_t *ypred_ptr, int y_stride, int n,
                           uint8_t *yabove_row, uint8_t *yleft_col) {
   int r, c;
+
   for (r = 0; r < n - 1; ++r) {
     for (c = 0; c <= r; ++c) {
-      ypred_ptr[(r - c) * y_stride + c] =
-        ((yabove_row[r + 1] * (c + 1) +
-          yleft_col[r + 1] * (r - c + 1)) * iscale[r] + (1<<15)) >> 16;
+      ypred_ptr[(r - c) * y_stride + c] = iscale_round(
+          yabove_row[r + 1] * (c + 1) + yleft_col[r + 1] * (r - c + 1), r);
     }
   }
+
   for (c = 0; c <= r; ++c) {
     int yabove_ext = yabove_row[r];  // clip_pixel(2 * yabove_row[r] -
                                      //            yabove_row[r - 1]);
     int yleft_ext = yleft_col[r];  // clip_pixel(2 * yleft_col[r] -
                                    //            yleft_col[r-1]);
     ypred_ptr[(r - c) * y_stride + c] =
-      ((yabove_ext * (c + 1) +
-        yleft_ext * (r - c + 1)) * iscale[r] + (1<<15)) >> 16;
+         iscale_round(yabove_ext * (c + 1) + yleft_ext * (r - c + 1), r);
   }
   for (r = 1; r < n; ++r) {
     for (c = n - r; c < n; ++c) {
       const int yabove_ext = ypred_ptr[(r - 1) * y_stride + c];
       const int yleft_ext = ypred_ptr[r * y_stride + c - 1];
-      ypred_ptr[r * y_stride + c] = (yabove_ext + yleft_ext + 1) >> 1;
+      ypred_ptr[r * y_stride + c] =
+          ROUND_POWER_OF_TWO(yabove_ext + yleft_ext, 1);
     }
   }
 }
@@ -165,7 +171,7 @@
                            uint8_t *yabove_row, uint8_t *yleft_col) {
   int r, c;
   for (c = 0; c < n; c++)
-    ypred_ptr[c] = (yabove_row[c - 1] + yabove_row[c] + 1) >> 1;
+    ypred_ptr[c] = ROUND_POWER_OF_TWO(yabove_row[c - 1] + yabove_row[c], 1);
   ypred_ptr += y_stride;
   for (c = 0; c < n; c++)
     ypred_ptr[c] = yabove_row[c - 1];
@@ -199,9 +205,10 @@
 static void d153_predictor(uint8_t *ypred_ptr, int y_stride, int n,
                            uint8_t *yabove_row, uint8_t *yleft_col) {
   int r, c;
-  ypred_ptr[0] = (yabove_row[-1] + yleft_col[0] + 1) >> 1;
+  ypred_ptr[0] = ROUND_POWER_OF_TWO(yabove_row[-1] + yleft_col[0], 1);
   for (r = 1; r < n; r++)
-    ypred_ptr[r * y_stride] = (yleft_col[r - 1] + yleft_col[r] + 1) >> 1;
+    ypred_ptr[r * y_stride] =
+        ROUND_POWER_OF_TWO(yleft_col[r - 1] + yleft_col[r], 1);
   ypred_ptr++;
   ypred_ptr[0] = yabove_row[-1];
   for (r = 1; r < n; r++)
@@ -268,6 +275,20 @@
   }
 }
 
+static INLINE int log2_minus_1(int n) {
+  switch (n) {
+    case 4: return 1;
+    case 8: return 2;
+    case 16: return 3;
+    case 32: return 4;
+    case 64: return 5;
+    default:
+      assert(0);
+      return 0;
+  }
+}
+
+
 void vp9_build_intra_predictors_internal(uint8_t *src, int src_stride,
                                          uint8_t *ypred_ptr,
                                          int y_stride, int mode, int bsize,
@@ -313,22 +334,7 @@
       int i;
       int shift;
       int average = 0;
-      int log2_bsize_minus_1;
-
-      assert(bsize == 4 || bsize == 8 || bsize == 16 || bsize == 32 ||
-             bsize == 64);
-      if (bsize == 4) {
-        log2_bsize_minus_1 = 1;
-      } else if (bsize == 8) {
-        log2_bsize_minus_1 = 2;
-      } else if (bsize == 16) {
-        log2_bsize_minus_1 = 3;
-      } else if (bsize == 32) {
-        log2_bsize_minus_1 = 4;
-      } else {
-        assert(bsize == 64);
-        log2_bsize_minus_1 = 5;
-      }
+      int log2_bsize_minus_1 = log2_minus_1(bsize);
 
       if (up_available || left_available) {
         if (up_available) {
@@ -343,7 +349,7 @@
           }
         }
         shift = log2_bsize_minus_1 + up_available + left_available;
-        expected_dc = (average + (1 << (shift - 1))) >> shift;
+        expected_dc = ROUND_POWER_OF_TWO(average, shift);
       } else {
         expected_dc = 128;
       }
@@ -354,21 +360,19 @@
       }
     }
     break;
-    case V_PRED: {
+    case V_PRED:
       for (r = 0; r < bsize; r++) {
         memcpy(ypred_ptr, yabove_row, bsize);
         ypred_ptr += y_stride;
       }
-    }
-    break;
-    case H_PRED: {
+      break;
+    case H_PRED:
       for (r = 0; r < bsize; r++) {
         vpx_memset(ypred_ptr, yleft_col[r], bsize);
         ypred_ptr += y_stride;
       }
-    }
-    break;
-    case TM_PRED: {
+      break;
+    case TM_PRED:
       for (r = 0; r < bsize; r++) {
         for (c = 0; c < bsize; c++) {
           ypred_ptr[c] = clip_pixel(yleft_col[r] + yabove_row[c] - ytop_left);
@@ -376,32 +380,25 @@
 
         ypred_ptr += y_stride;
       }
-    }
-    break;
-    case D45_PRED: {
+      break;
+    case D45_PRED:
       d45_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);
-    }
-    break;
-    case D135_PRED: {
+      break;
+    case D135_PRED:
       d135_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);
-    }
-    break;
-    case D117_PRED: {
+      break;
+    case D117_PRED:
       d117_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);
-    }
-    break;
-    case D153_PRED: {
+      break;
+    case D153_PRED:
       d153_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);
-    }
-    break;
-    case D27_PRED: {
+      break;
+    case D27_PRED:
       d27_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);
-    }
-    break;
-    case D63_PRED: {
+      break;
+    case D63_PRED:
       d63_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);
-    }
-    break;
+      break;
     case I8X8_PRED:
     case B_PRED:
     case NEARESTMV:
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index dab88a3..04b67b9 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -139,6 +139,29 @@
 prototype void vp9_intra_uv4x4_predict "struct macroblockd *xd, struct blockd *x, int b_mode, uint8_t *predictor"
 specialize vp9_intra_uv4x4_predict;
 
+if [ "$CONFIG_VP9_DECODER" = "yes" ]; then
+prototype void vp9_add_residual_4x4 "const int16_t *diff, const uint8_t *pred, int pitch, uint8_t *dest, int stride"
+specialize vp9_add_residual_4x4 sse2
+
+prototype void vp9_add_residual_8x8 "const int16_t *diff, const uint8_t *pred, int pitch, uint8_t *dest, int stride"
+specialize vp9_add_residual_8x8 sse2
+
+prototype void vp9_add_residual_16x16 "const int16_t *diff, const uint8_t *pred, int pitch, uint8_t *dest, int stride"
+specialize vp9_add_residual_16x16 sse2
+
+prototype void vp9_add_residual_32x32 "const int16_t *diff, const uint8_t *pred, int pitch, uint8_t *dest, int stride"
+specialize vp9_add_residual_32x32 sse2
+
+prototype void vp9_add_constant_residual_8x8 "const int16_t diff, const uint8_t *pred, int pitch, uint8_t *dest, int stride"
+specialize vp9_add_constant_residual_8x8 sse2
+
+prototype void vp9_add_constant_residual_16x16 "const int16_t diff, const uint8_t *pred, int pitch, uint8_t *dest, int stride"
+specialize vp9_add_constant_residual_16x16 sse2
+
+prototype void vp9_add_constant_residual_32x32 "const int16_t diff, const uint8_t *pred, int pitch, uint8_t *dest, int stride"
+specialize vp9_add_constant_residual_32x32 sse2
+fi
+
 #
 # Loopfilter
 #
@@ -299,6 +322,9 @@
 prototype void vp9_short_iht16x16 "int16_t *input, int16_t *output, int pitch, int tx_type"
 specialize vp9_short_iht16x16
 
+prototype void vp9_idct4_1d "int16_t *input, int16_t *output"
+specialize vp9_idct4_1d sse2
+
 # dct and add
 
 prototype void vp9_dc_only_idct_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"
diff --git a/vp9/common/vp9_tile_common.c b/vp9/common/vp9_tile_common.c
index 29f89b6..53a1eb8 100644
--- a/vp9/common/vp9_tile_common.c
+++ b/vp9/common/vp9_tile_common.c
@@ -10,6 +10,11 @@
 
 #include "vp9/common/vp9_tile_common.h"
 
+#define MIN_TILE_WIDTH 256
+#define MAX_TILE_WIDTH 4096
+#define MIN_TILE_WIDTH_SBS (MIN_TILE_WIDTH >> 6)
+#define MAX_TILE_WIDTH_SBS (MAX_TILE_WIDTH >> 6)
+
 static void vp9_get_tile_offsets(VP9_COMMON *cm, int *min_tile_off,
                                  int *max_tile_off, int tile_idx,
                                  int log2_n_tiles, int n_mbs) {
@@ -35,8 +40,6 @@
                        cm->log2_tile_rows, cm->mb_rows);
 }
 
-#define MIN_TILE_WIDTH_SBS (MIN_TILE_WIDTH >> 6)
-#define MAX_TILE_WIDTH_SBS (MAX_TILE_WIDTH >> 6)
 
 void vp9_get_tile_n_bits(VP9_COMMON *cm, int *min_log2_n_tiles_ptr,
                          int *delta_log2_n_tiles) {
diff --git a/vp9/common/vp9_tile_common.h b/vp9/common/vp9_tile_common.h
index ea69356..7ea3772 100644
--- a/vp9/common/vp9_tile_common.h
+++ b/vp9/common/vp9_tile_common.h
@@ -13,9 +13,6 @@
 
 #include "vp9/common/vp9_onyxc_int.h"
 
-#define MIN_TILE_WIDTH 256
-#define MAX_TILE_WIDTH 4096
-
 void vp9_get_tile_col_offsets(VP9_COMMON *cm, int tile_col_idx);
 
 void vp9_get_tile_row_offsets(VP9_COMMON *cm, int tile_row_idx);
diff --git a/vp9/common/x86/vp9_idctllm_x86.c b/vp9/common/x86/vp9_idctllm_x86.c
index 7b3c579..3d7a148 100644
--- a/vp9/common/x86/vp9_idctllm_x86.c
+++ b/vp9/common/x86/vp9_idctllm_x86.c
@@ -77,10 +77,10 @@
 void vp9_short_idct4x4llm_sse2(int16_t *input, int16_t *output, int pitch) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i eight = _mm_set1_epi16(8);
-  const __m128i cst = _mm_setr_epi16((short)cospi_16_64, (short)cospi_16_64,
-                                     (short)cospi_16_64, (short)-cospi_16_64,
-                                     (short)cospi_24_64, (short)-cospi_8_64,
-                                     (short)cospi_8_64, (short)cospi_24_64);
+  const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
+                                     (int16_t)cospi_16_64, (int16_t)-cospi_16_64,
+                                     (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
+                                     (int16_t)cospi_8_64, (int16_t)cospi_24_64);
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const int half_pitch = pitch >> 1;
   __m128i input0, input1, input2, input3;
@@ -198,4 +198,40 @@
   input3 = _mm_srli_si128(input3, 8);
   _mm_storel_epi64((__m128i *)(output + 2 * half_pitch), input3);
 }
+
+void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i c1 = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
+                                    (int16_t)cospi_16_64, (int16_t)-cospi_16_64,
+                                    (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
+                                    (int16_t)cospi_8_64, (int16_t)cospi_24_64);
+  const __m128i c2 = _mm_setr_epi16(1, 1, 1, 1, 1, -1, 1, -1);
+
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  __m128i in, temp;
+
+  // Load input data.
+  in = _mm_loadl_epi64((__m128i *)input);
+
+  // Construct i3, i1, i3, i1, i2, i0, i2, i0
+  in = _mm_shufflelo_epi16(in, 0xd8);
+  in = _mm_unpacklo_epi32(in, in);
+
+  // Stage 1
+  in = _mm_madd_epi16(in, c1);
+  in = _mm_add_epi32(in, rounding);
+  in = _mm_srai_epi32(in, DCT_CONST_BITS);
+  in = _mm_packs_epi32(in, zero);
+
+  // Stage 2
+  temp = _mm_shufflelo_epi16(in, 0x9c);
+  in = _mm_shufflelo_epi16(in, 0xc9);
+  in = _mm_unpacklo_epi64(temp, in);
+  in = _mm_madd_epi16(in, c2);
+  in = _mm_packs_epi32(in, zero);
+
+  // Store results
+  _mm_storel_epi64((__m128i *)output, in);
+}
+
 #endif
diff --git a/vp9/decoder/vp9_dboolhuff.c b/vp9/decoder/vp9_dboolhuff.c
index 99c3664..7e3b464 100644
--- a/vp9/decoder/vp9_dboolhuff.c
+++ b/vp9/decoder/vp9_dboolhuff.c
@@ -8,11 +8,11 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
-#include "vp9/decoder/vp9_dboolhuff.h"
 #include "vpx_ports/mem.h"
 #include "vpx_mem/vpx_mem.h"
 
+#include "vp9/decoder/vp9_dboolhuff.h"
+
 int vp9_start_decode(BOOL_DECODER *br,
                      const unsigned char *source,
                      unsigned int source_sz) {
diff --git a/vp9/decoder/vp9_dboolhuff.h b/vp9/decoder/vp9_dboolhuff.h
index cf31d38..eeb5c35 100644
--- a/vp9/decoder/vp9_dboolhuff.h
+++ b/vp9/decoder/vp9_dboolhuff.h
@@ -13,6 +13,7 @@
 
 #include <stddef.h>
 #include <limits.h>
+
 #include "./vpx_config.h"
 #include "vpx_ports/mem.h"
 #include "vpx/vpx_integer.h"
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 326c802..86dfaf6 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -173,7 +173,6 @@
       m->mbmi.mb_skip_coeff = 0;
   }
 
-
   y_mode = m->mbmi.sb_type ?
       read_kf_sb_ymode(bc,
           pbi->common.sb_kf_ymode_prob[pbi->common.kf_ymode_probs_index]):
@@ -677,22 +676,23 @@
   mbmi->need_to_clamp_secondmv = 0;
   mbmi->second_ref_frame = NONE;
 
-  // Distance of Mb to the various image edges.
-  // These specified to 8th pel as they are always compared to MV values
-  // that are in 1/8th pel units
-  xd->mb_to_left_edge = mb_to_left_edge
-                      = -((mb_col * 16) << 3);
-  mb_to_left_edge -= LEFT_TOP_MARGIN;
-
-  xd->mb_to_right_edge = mb_to_right_edge
-                       = ((pbi->common.mb_cols - mb_size - mb_col) * 16) << 3;
-  mb_to_right_edge += RIGHT_BOTTOM_MARGIN;
-
   // Make sure the MACROBLOCKD mode info pointer is pointed at the
   // correct entry for the current macroblock.
   xd->mode_info_context = mi;
   xd->prev_mode_info_context = prev_mi;
 
+  // Distance of Mb to the various image edges.
+  // These specified to 8th pel as they are always compared to MV values
+  // that are in 1/8th pel units
+  set_mb_row(cm, xd, mb_row, mb_size);
+  set_mb_col(cm, xd, mb_col, mb_size);
+
+  mb_to_left_edge = xd->mb_to_left_edge;
+  mb_to_left_edge -= LEFT_TOP_MARGIN;
+
+  mb_to_right_edge = xd->mb_to_right_edge;
+  mb_to_right_edge += RIGHT_BOTTOM_MARGIN;
+
   // Read the macroblock segment id.
   read_mb_segment_id(pbi, mb_row, mb_col, bc);
 
@@ -750,17 +750,6 @@
 
       vp9_mv_ref_probs(&pbi->common, mv_ref_p,
                        mbmi->mb_mode_context[ref_frame]);
-      /*
-      if (pbi->common.current_video_frame == 1) {
-	int k = mbmi->mb_mode_context[ref_frame];
-	printf("vp9_mode_contexts: [%d %d %d %d] %d %d %d %d\n",
-	       mb_row, mb_col, ref_frame, k,
-	       cm->fc.vp9_mode_contexts[k][0],
-	       cm->fc.vp9_mode_contexts[k][1],
-	       cm->fc.vp9_mode_contexts[k][2],
-	       cm->fc.vp9_mode_contexts[k][3]);
-      }
-      */
 
       // If the segment level skip mode enabled
       if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_SKIP)) {
@@ -1176,20 +1165,274 @@
   vpx_memset(cm->mbskip_pred_probs, 0, sizeof(cm->mbskip_pred_probs));
   if (pbi->common.mb_no_coeff_skip) {
     int k;
-    for (k = 0; k < MBSKIP_CONTEXTS; ++k)
+    for (k = 0; k < MBSKIP_CONTEXTS; ++k) {
       cm->mbskip_pred_probs[k] = (vp9_prob)vp9_read_literal(bc, 8);
+    }
   }
 
   mb_mode_mv_init(pbi, bc);
 }
 
+#if CONFIG_CODE_NONZEROCOUNT
+static uint16_t read_nzc(VP9_COMMON *const cm,
+                         int nzc_context,
+                         TX_SIZE tx_size,
+                         int ref,
+                         int type,
+                         BOOL_DECODER* const bc) {
+  int c, e;
+  uint16_t nzc;
+  if (tx_size == TX_32X32) {
+    c = treed_read(bc, vp9_nzc32x32_tree,
+                   cm->fc.nzc_probs_32x32[nzc_context][ref][type]);
+    cm->fc.nzc_counts_32x32[nzc_context][ref][type][c]++;
+  } else if (tx_size == TX_16X16) {
+    c = treed_read(bc, vp9_nzc16x16_tree,
+                   cm->fc.nzc_probs_16x16[nzc_context][ref][type]);
+    cm->fc.nzc_counts_16x16[nzc_context][ref][type][c]++;
+  } else if (tx_size == TX_8X8) {
+    c = treed_read(bc, vp9_nzc8x8_tree,
+                   cm->fc.nzc_probs_8x8[nzc_context][ref][type]);
+    cm->fc.nzc_counts_8x8[nzc_context][ref][type][c]++;
+  } else if (tx_size == TX_4X4) {
+    c = treed_read(bc, vp9_nzc4x4_tree,
+                   cm->fc.nzc_probs_4x4[nzc_context][ref][type]);
+    cm->fc.nzc_counts_4x4[nzc_context][ref][type][c]++;
+  } else {
+    assert(0);
+  }
+  nzc = vp9_basenzcvalue[c];
+  if ((e = vp9_extranzcbits[c])) {
+    int x = 0;
+    while (e--) {
+      int b = vp9_read(
+          bc, cm->fc.nzc_pcat_probs[nzc_context][c - NZC_TOKENS_NOEXTRA][e]);
+      x |= (b << e);
+      cm->fc.nzc_pcat_counts[nzc_context][c - NZC_TOKENS_NOEXTRA][e][b]++;
+    }
+    nzc += x;
+  }
+  if (tx_size == TX_32X32)
+    assert(nzc <= 1024);
+  else if (tx_size == TX_16X16)
+    assert(nzc <= 256);
+  else if (tx_size == TX_8X8)
+    assert(nzc <= 64);
+  else if (tx_size == TX_4X4)
+    assert(nzc <= 16);
+  return nzc;
+}
+
+static void read_nzcs_sb64(VP9_COMMON *const cm,
+                           MACROBLOCKD* xd,
+                           int mb_row,
+                           int mb_col,
+                           BOOL_DECODER* const bc) {
+  MODE_INFO *m = xd->mode_info_context;
+  MB_MODE_INFO *const mi = &m->mbmi;
+  int j, nzc_context;
+  const int ref = m->mbmi.ref_frame != INTRA_FRAME;
+
+  assert(mb_col == get_mb_col(xd));
+  assert(mb_row == get_mb_row(xd));
+
+  vpx_memset(m->mbmi.nzcs, 0, 384 * sizeof(m->mbmi.nzcs[0]));
+
+  if (mi->mb_skip_coeff)
+    return;
+
+  switch (mi->txfm_size) {
+    case TX_32X32:
+      for (j = 0; j < 256; j += 64) {
+        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_32X32, ref, 0, bc);
+      }
+      for (j = 256; j < 384; j += 64) {
+        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_32X32, ref, 1, bc);
+      }
+      break;
+
+    case TX_16X16:
+      for (j = 0; j < 256; j += 16) {
+        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_16X16, ref, 0, bc);
+      }
+      for (j = 256; j < 384; j += 16) {
+        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_16X16, ref, 1, bc);
+      }
+      break;
+
+    case TX_8X8:
+      for (j = 0; j < 256; j += 4) {
+        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 0, bc);
+      }
+      for (j = 256; j < 384; j += 4) {
+        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 1, bc);
+      }
+      break;
+
+    case TX_4X4:
+      for (j = 0; j < 256; ++j) {
+        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 0, bc);
+      }
+      for (j = 256; j < 384; ++j) {
+        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 1, bc);
+      }
+      break;
+
+    default:
+      break;
+  }
+}
+
+static void read_nzcs_sb32(VP9_COMMON *const cm,
+                           MACROBLOCKD* xd,
+                           int mb_row,
+                           int mb_col,
+                           BOOL_DECODER* const bc) {
+  MODE_INFO *m = xd->mode_info_context;
+  MB_MODE_INFO *const mi = &m->mbmi;
+  int j, nzc_context;
+  const int ref = m->mbmi.ref_frame != INTRA_FRAME;
+
+  assert(mb_col == get_mb_col(xd));
+  assert(mb_row == get_mb_row(xd));
+
+  vpx_memset(m->mbmi.nzcs, 0, 384 * sizeof(m->mbmi.nzcs[0]));
+
+  if (mi->mb_skip_coeff)
+    return;
+
+  switch (mi->txfm_size) {
+    case TX_32X32:
+      for (j = 0; j < 64; j += 64) {
+        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_32X32, ref, 0, bc);
+      }
+      for (j = 64; j < 96; j += 16) {
+        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_16X16, ref, 1, bc);
+      }
+      break;
+
+    case TX_16X16:
+      for (j = 0; j < 64; j += 16) {
+        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_16X16, ref, 0, bc);
+      }
+      for (j = 64; j < 96; j += 16) {
+        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_16X16, ref, 1, bc);
+      }
+      break;
+
+    case TX_8X8:
+      for (j = 0; j < 64; j += 4) {
+        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 0, bc);
+      }
+      for (j = 64; j < 96; j += 4) {
+        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 1, bc);
+      }
+      break;
+
+    case TX_4X4:
+      for (j = 0; j < 64; ++j) {
+        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 0, bc);
+      }
+      for (j = 64; j < 96; ++j) {
+        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 1, bc);
+      }
+      break;
+
+    default:
+      break;
+  }
+}
+
+static void read_nzcs_mb16(VP9_COMMON *const cm,
+                           MACROBLOCKD* xd,
+                           int mb_row,
+                           int mb_col,
+                           BOOL_DECODER* const bc) {
+  MODE_INFO *m = xd->mode_info_context;
+  MB_MODE_INFO *const mi = &m->mbmi;
+  int j, nzc_context;
+  const int ref = m->mbmi.ref_frame != INTRA_FRAME;
+
+  assert(mb_col == get_mb_col(xd));
+  assert(mb_row == get_mb_row(xd));
+
+  vpx_memset(m->mbmi.nzcs, 0, 384 * sizeof(m->mbmi.nzcs[0]));
+
+  if (mi->mb_skip_coeff)
+    return;
+
+  switch (mi->txfm_size) {
+    case TX_16X16:
+      for (j = 0; j < 16; j += 16) {
+        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);
+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_16X16, ref, 0, bc);
+      }
+      for (j = 16; j < 24; j += 4) {
+        nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 1, bc);
+      }
+      break;
+
+    case TX_8X8:
+      for (j = 0; j < 16; j += 4) {
+        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);
+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 0, bc);
+      }
+      if (mi->mode == I8X8_PRED || mi->mode == SPLITMV) {
+        for (j = 16; j < 24; ++j) {
+          nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
+          m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 1, bc);
+        }
+      } else {
+        for (j = 16; j < 24; j += 4) {
+          nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
+          m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 1, bc);
+        }
+      }
+      break;
+
+    case TX_4X4:
+      for (j = 0; j < 16; ++j) {
+        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);
+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 0, bc);
+      }
+      for (j = 16; j < 24; ++j) {
+        nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 1, bc);
+      }
+      break;
+
+    default:
+      break;
+  }
+}
+#endif  // CONFIG_CODE_NONZEROCOUNT
+
 void vp9_decode_mb_mode_mv(VP9D_COMP* const pbi,
                            MACROBLOCKD* const xd,
                            int mb_row,
                            int mb_col,
                            BOOL_DECODER* const bc) {
+  VP9_COMMON *const cm = &pbi->common;
   MODE_INFO *mi = xd->mode_info_context;
   MODE_INFO *prev_mi = xd->prev_mode_info_context;
+  MB_MODE_INFO *const mbmi = &mi->mbmi;
 
   if (pbi->common.frame_type == KEY_FRAME) {
     kfread_modes(pbi, mi, mb_row, mb_col, bc);
@@ -1199,4 +1442,28 @@
                       mi->mbmi.ref_frame - 1, mi->mbmi.second_ref_frame - 1,
                       pbi->common.active_ref_scale);
   }
+#if CONFIG_CODE_NONZEROCOUNT
+  if (mbmi->sb_type == BLOCK_SIZE_SB64X64)
+    read_nzcs_sb64(cm, xd, mb_row, mb_col, bc);
+  else if (mbmi->sb_type == BLOCK_SIZE_SB32X32)
+    read_nzcs_sb32(cm, xd, mb_row, mb_col, bc);
+  else
+    read_nzcs_mb16(cm, xd, mb_row, mb_col, bc);
+#endif  // CONFIG_CODE_NONZEROCOUNT
+
+  if (mbmi->sb_type) {
+    const int n_mbs = 1 << mbmi->sb_type;
+    const int y_mbs = MIN(n_mbs, cm->mb_rows - mb_row);
+    const int x_mbs = MIN(n_mbs, cm->mb_cols - mb_col);
+    const int mis = cm->mode_info_stride;
+    int x, y;
+
+    for (y = 0; y < y_mbs; y++) {
+      for (x = !y; x < x_mbs; x++) {
+        mi[y * mis + x] = *mi;
+      }
+    }
+  } else {
+    update_blockd_bmi(xd);
+  }
 }
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index b44d659..5b3e1bd 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -39,7 +39,7 @@
 
 #define COEFCOUNT_TESTING
 
-//#define DEC_DEBUG
+// #define DEC_DEBUG
 #ifdef DEC_DEBUG
 int dec_debug = 0;
 #endif
@@ -201,8 +201,7 @@
 
 static void decode_16x16(VP9D_COMP *pbi, MACROBLOCKD *xd,
                          BOOL_DECODER* const bc) {
-  BLOCKD *bd = &xd->block[0];
-  TX_TYPE tx_type = get_tx_type_16x16(xd, bd);
+  TX_TYPE tx_type = get_tx_type_16x16(xd, 0);
 #ifdef DEC_DEBUG
   if (dec_debug) {
     int i;
@@ -240,13 +239,13 @@
                        BOOL_DECODER* const bc) {
   // First do Y
   // if the first one is DCT_DCT assume all the rest are as well
-  TX_TYPE tx_type = get_tx_type_8x8(xd, &xd->block[0]);
+  TX_TYPE tx_type = get_tx_type_8x8(xd, 0);
 #ifdef DEC_DEBUG
   if (dec_debug) {
     int i;
     printf("\n");
     printf("qcoeff 8x8\n");
-    for (i = 0; i < 400; i++) {
+    for (i = 0; i < 384; i++) {
       printf("%3d ", xd->qcoeff[i]);
       if (i % 16 == 15) printf("\n");
     }
@@ -267,7 +266,7 @@
         int i8x8mode = b->bmi.as_mode.first;
         vp9_intra8x8_predict(xd, b, i8x8mode, b->predictor);
       }
-      tx_type = get_tx_type_8x8(xd, &xd->block[ib]);
+      tx_type = get_tx_type_8x8(xd, ib);
       if (tx_type != DCT_DCT) {
         vp9_ht_dequant_idct_add_8x8_c(tx_type, q, dq, pre, dst, 16, stride,
                                       xd->eobs[idx]);
@@ -341,7 +340,7 @@
       vp9_intra8x8_predict(xd, b, i8x8mode, b->predictor);
       for (j = 0; j < 4; j++) {
         b = &xd->block[ib + iblock[j]];
-        tx_type = get_tx_type_4x4(xd, b);
+        tx_type = get_tx_type_4x4(xd, ib + iblock[j]);
         if (tx_type != DCT_DCT) {
           vp9_ht_dequant_idct_add_c(tx_type, b->qcoeff,
                                     b->dequant, b->predictor,
@@ -375,7 +374,7 @@
         eobtotal += vp9_decode_coefs_4x4(pbi, xd, bc, PLANE_TYPE_Y_WITH_DC, i);
 
       vp9_intra4x4_predict(xd, b, b_mode, b->predictor);
-      tx_type = get_tx_type_4x4(xd, b);
+      tx_type = get_tx_type_4x4(xd, i);
       if (tx_type != DCT_DCT) {
         vp9_ht_dequant_idct_add_c(tx_type, b->qcoeff,
                                   b->dequant, b->predictor,
@@ -397,7 +396,7 @@
                            xd->dst.v_buffer,
                            xd->dst.uv_stride,
                            xd);
-  } else if (mode == SPLITMV || get_tx_type_4x4(xd, &xd->block[0]) == DCT_DCT) {
+  } else if (mode == SPLITMV || get_tx_type_4x4(xd, 0) == DCT_DCT) {
     xd->itxm_add_y_block(xd->qcoeff,
                           xd->block[0].dequant,
                           xd->predictor,
@@ -431,7 +430,7 @@
 #endif
     for (i = 0; i < 16; i++) {
       BLOCKD *b = &xd->block[i];
-      tx_type = get_tx_type_4x4(xd, b);
+      tx_type = get_tx_type_4x4(xd, i);
       if (tx_type != DCT_DCT) {
         vp9_ht_dequant_idct_add_c(tx_type, b->qcoeff,
                                   b->dequant, b->predictor,
@@ -517,13 +516,24 @@
             xd->block[20].dequant, xd->dst.v_buffer, xd->dst.v_buffer,
             xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[320]);
         break;
-      case TX_16X16:  // FIXME(rbultje): adst
+      case TX_16X16:
         for (n = 0; n < 16; n++) {
           const int x_idx = n & 3, y_idx = n >> 2;
-          vp9_dequant_idct_add_16x16(xd->qcoeff + n * 256, xd->block[0].dequant,
-              xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16,
-              xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16,
-              xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 16]);
+          const TX_TYPE tx_type = get_tx_type_16x16(xd,
+                                                    (y_idx * 16 + x_idx) * 4);
+          if (tx_type == DCT_DCT) {
+            vp9_dequant_idct_add_16x16(xd->qcoeff + n * 256,
+                xd->block[0].dequant,
+                xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16,
+                xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16,
+                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 16]);
+          } else {
+            vp9_ht_dequant_idct_add_16x16_c(tx_type, xd->qcoeff + n * 256,
+                xd->block[0].dequant,
+                xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16,
+                xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16,
+                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 16]);
+          }
         }
         for (n = 0; n < 4; n++) {
           const int x_idx = n & 1, y_idx = n >> 1;
@@ -539,13 +549,23 @@
               xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[320 + n * 16]);
         }
         break;
-      case TX_8X8:  // FIXME(rbultje): adst
+      case TX_8X8:
         for (n = 0; n < 64; n++) {
           const int x_idx = n & 7, y_idx = n >> 3;
-          vp9_dequant_idct_add_8x8_c(xd->qcoeff + n * 64, xd->block[0].dequant,
-              xd->dst.y_buffer + y_idx * 8 * xd->dst.y_stride + x_idx * 8,
-              xd->dst.y_buffer + y_idx * 8 * xd->dst.y_stride + x_idx * 8,
-              xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 4]);
+          const TX_TYPE tx_type = get_tx_type_8x8(xd, (y_idx * 16 + x_idx) * 2);
+          if (tx_type == DCT_DCT) {
+            vp9_dequant_idct_add_8x8_c(xd->qcoeff + n * 64,
+                xd->block[0].dequant,
+                xd->dst.y_buffer + y_idx * 8 * xd->dst.y_stride + x_idx * 8,
+                xd->dst.y_buffer + y_idx * 8 * xd->dst.y_stride + x_idx * 8,
+                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 4]);
+          } else {
+            vp9_ht_dequant_idct_add_8x8_c(tx_type, xd->qcoeff + n * 64,
+                xd->block[0].dequant,
+                xd->dst.y_buffer + y_idx * 8 * xd->dst.y_stride + x_idx * 8,
+                xd->dst.y_buffer + y_idx * 8 * xd->dst.y_stride + x_idx * 8,
+                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 4]);
+          }
         }
         for (n = 0; n < 16; n++) {
           const int x_idx = n & 3, y_idx = n >> 2;
@@ -561,13 +581,22 @@
               xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[320 + n * 4]);
         }
         break;
-      case TX_4X4:  // FIXME(rbultje): adst
+      case TX_4X4:
         for (n = 0; n < 256; n++) {
           const int x_idx = n & 15, y_idx = n >> 4;
-          xd->itxm_add(xd->qcoeff + n * 16, xd->block[0].dequant,
-              xd->dst.y_buffer + y_idx * 4 * xd->dst.y_stride + x_idx * 4,
-              xd->dst.y_buffer + y_idx * 4 * xd->dst.y_stride + x_idx * 4,
-              xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n]);
+          const TX_TYPE tx_type = get_tx_type_4x4(xd, y_idx * 16 + x_idx);
+          if (tx_type == DCT_DCT) {
+            xd->itxm_add(xd->qcoeff + n * 16, xd->block[0].dequant,
+                xd->dst.y_buffer + y_idx * 4 * xd->dst.y_stride + x_idx * 4,
+                xd->dst.y_buffer + y_idx * 4 * xd->dst.y_stride + x_idx * 4,
+                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n]);
+          } else {
+            vp9_ht_dequant_idct_add_c(tx_type, xd->qcoeff + n * 16,
+                xd->block[0].dequant,
+                xd->dst.y_buffer + y_idx * 4 * xd->dst.y_stride + x_idx * 4,
+                xd->dst.y_buffer + y_idx * 4 * xd->dst.y_stride + x_idx * 4,
+                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n]);
+          }
         }
         for (n = 0; n < 64; n++) {
           const int x_idx = n & 7, y_idx = n >> 3;
@@ -649,14 +678,24 @@
                                               xd->dst.v_buffer,
                                               xd->dst.uv_stride, xd);
         break;
-      case TX_16X16:  // FIXME(rbultje): adst
+      case TX_16X16:
         for (n = 0; n < 4; n++) {
           const int x_idx = n & 1, y_idx = n >> 1;
-          vp9_dequant_idct_add_16x16(
-              xd->qcoeff + n * 256, xd->block[0].dequant,
-              xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16,
-              xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16,
-              xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 16]);
+          const TX_TYPE tx_type = get_tx_type_16x16(xd,
+                                                    (y_idx * 8 + x_idx) * 4);
+          if (tx_type == DCT_DCT) {
+            vp9_dequant_idct_add_16x16(
+                xd->qcoeff + n * 256, xd->block[0].dequant,
+                xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16,
+                xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16,
+                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 16]);
+          } else {
+            vp9_ht_dequant_idct_add_16x16_c(tx_type, xd->qcoeff + n * 256,
+                xd->block[0].dequant,
+                xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16,
+                xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16,
+                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 16]);
+          }
         }
         vp9_dequant_idct_add_uv_block_16x16_c(xd->qcoeff + 1024,
                                               xd->block[16].dequant,
@@ -664,13 +703,23 @@
                                               xd->dst.v_buffer,
                                               xd->dst.uv_stride, xd);
         break;
-      case TX_8X8:  // FIXME(rbultje): adst
+      case TX_8X8:
         for (n = 0; n < 16; n++) {
           const int x_idx = n & 3, y_idx = n >> 2;
-          vp9_dequant_idct_add_8x8_c(xd->qcoeff + n * 64, xd->block[0].dequant,
-              xd->dst.y_buffer + y_idx * 8 * xd->dst.y_stride + x_idx * 8,
-              xd->dst.y_buffer + y_idx * 8 * xd->dst.y_stride + x_idx * 8,
-              xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 4]);
+          const TX_TYPE tx_type = get_tx_type_8x8(xd, (y_idx * 8 + x_idx) * 2);
+          if (tx_type == DCT_DCT) {
+            vp9_dequant_idct_add_8x8_c(xd->qcoeff + n * 64,
+                xd->block[0].dequant,
+                xd->dst.y_buffer + y_idx * 8 * xd->dst.y_stride + x_idx * 8,
+                xd->dst.y_buffer + y_idx * 8 * xd->dst.y_stride + x_idx * 8,
+                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 4]);
+          } else {
+            vp9_ht_dequant_idct_add_8x8_c(tx_type, xd->qcoeff + n * 64,
+                xd->block[0].dequant,
+                xd->dst.y_buffer + y_idx * 8 * xd->dst.y_stride + x_idx * 8,
+                xd->dst.y_buffer + y_idx * 8 * xd->dst.y_stride + x_idx * 8,
+                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 4]);
+          }
         }
         for (n = 0; n < 4; n++) {
           const int x_idx = n & 1, y_idx = n >> 1;
@@ -686,13 +735,22 @@
               xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[80 + n * 4]);
         }
         break;
-      case TX_4X4:  // FIXME(rbultje): adst
+      case TX_4X4:
         for (n = 0; n < 64; n++) {
           const int x_idx = n & 7, y_idx = n >> 3;
-          xd->itxm_add(xd->qcoeff + n * 16, xd->block[0].dequant,
-              xd->dst.y_buffer + y_idx * 4 * xd->dst.y_stride + x_idx * 4,
-              xd->dst.y_buffer + y_idx * 4 * xd->dst.y_stride + x_idx * 4,
-              xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n]);
+          const TX_TYPE tx_type = get_tx_type_4x4(xd, y_idx * 8 + x_idx);
+          if (tx_type == DCT_DCT) {
+            xd->itxm_add(xd->qcoeff + n * 16, xd->block[0].dequant,
+                xd->dst.y_buffer + y_idx * 4 * xd->dst.y_stride + x_idx * 4,
+                xd->dst.y_buffer + y_idx * 4 * xd->dst.y_stride + x_idx * 4,
+                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n]);
+          } else {
+            vp9_ht_dequant_idct_add_c(tx_type, xd->qcoeff + n * 16,
+                xd->block[0].dequant,
+                xd->dst.y_buffer + y_idx * 4 * xd->dst.y_stride + x_idx * 4,
+                xd->dst.y_buffer + y_idx * 4 * xd->dst.y_stride + x_idx * 4,
+                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n]);
+          }
         }
         for (n = 0; n < 16; n++) {
           const int x_idx = n & 3, y_idx = n >> 2;
@@ -862,14 +920,9 @@
    * values that are in 1/8th pel units
    */
   block_size >>= 4;  // in mb units
-  xd->mb_to_top_edge = -((mb_row * 16)) << 3;
-  xd->mb_to_left_edge = -((mb_col * 16) << 3);
-  xd->mb_to_bottom_edge = ((cm->mb_rows - block_size - mb_row) * 16) << 3;
-  xd->mb_to_right_edge = ((cm->mb_cols - block_size - mb_col) * 16) << 3;
 
-  xd->up_available    = (mb_row != 0);
-  xd->left_available  = (mb_col > cm->cur_tile_mb_col_start);
-  xd->right_available = (mb_col + block_size < cm->cur_tile_mb_col_end);
+  set_mb_row(cm, xd, mb_row, block_size);
+  set_mb_col(cm, xd, mb_col, block_size);
 
   xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
   xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
@@ -910,20 +963,6 @@
       xd->corrupted |= cm->yv12_fb[second_ref_fb_idx].corrupted;
     }
   }
-
-  if (mbmi->sb_type) {
-    const int n_mbs = 1 << mbmi->sb_type;
-    const int y_mbs = MIN(n_mbs, cm->mb_rows - mb_row);
-    const int x_mbs = MIN(n_mbs, cm->mb_cols - mb_col);
-    const int mis = cm->mode_info_stride;
-    int x, y;
-
-    for (y = 0; y < y_mbs; y++) {
-      for (x = !y; x < x_mbs; x++) {
-        mi[y * mis + x] = *mi;
-      }
-    }
-  }
 }
 
 /* Decode a row of Superblocks (2x2 region of MBs) */
@@ -938,6 +977,11 @@
   for (mb_col = pc->cur_tile_mb_col_start;
        mb_col < pc->cur_tile_mb_col_end; mb_col += 4) {
     if (vp9_read(bc, pc->sb64_coded)) {
+#ifdef DEC_DEBUG
+      dec_debug = (pc->current_video_frame == 1 && mb_row == 0 && mb_col == 0);
+      if (dec_debug)
+        printf("Debug\n");
+#endif
       set_offsets(pbi, 64, mb_row, mb_col);
       vp9_decode_mb_mode_mv(pbi, xd, mb_row, mb_col, bc);
       set_refs(pbi, 64, mb_row, mb_col);
@@ -958,6 +1002,10 @@
         xd->sb_index = j;
 
         if (vp9_read(bc, pc->sb32_coded)) {
+#ifdef DEC_DEBUG
+          dec_debug = (pc->current_video_frame == 1 &&
+                       mb_row + y_idx_sb == 0 && mb_col + x_idx_sb == 0);
+#endif
           set_offsets(pbi, 32, mb_row + y_idx_sb, mb_col + x_idx_sb);
           vp9_decode_mb_mode_mv(pbi,
                                 xd, mb_row + y_idx_sb, mb_col + x_idx_sb, bc);
@@ -978,11 +1026,14 @@
               // MB lies outside frame, skip on to next
               continue;
             }
+#ifdef DEC_DEBUG
+            dec_debug = (pc->current_video_frame == 1 &&
+                         mb_row + y_idx == 0 && mb_col + x_idx == 0);
+#endif
 
             set_offsets(pbi, 16, mb_row + y_idx, mb_col + x_idx);
             xd->mb_index = i;
             vp9_decode_mb_mode_mv(pbi, xd, mb_row + y_idx, mb_col + x_idx, bc);
-            update_blockd_bmi(xd);
             set_refs(pbi, 16, mb_row + y_idx, mb_col + x_idx);
             decode_macroblock(pbi, xd, mb_row + y_idx, mb_col + x_idx, bc);
 
@@ -1073,6 +1124,85 @@
     xd->fullpixel_mask = 0xfffffff8;
 }
 
+#if CONFIG_CODE_NONZEROCOUNT
+static void read_nzc_probs_common(VP9_COMMON *cm,
+                                  BOOL_DECODER* const bc,
+                                  int block_size) {
+  int c, r, b, t;
+  int tokens, nodes;
+  vp9_prob *nzc_probs;
+  vp9_prob upd;
+
+  if (!vp9_read_bit(bc)) return;
+
+  if (block_size == 32) {
+    tokens = NZC32X32_TOKENS;
+    nzc_probs = cm->fc.nzc_probs_32x32[0][0][0];
+    upd = NZC_UPDATE_PROB_32X32;
+  } else if (block_size == 16) {
+    tokens = NZC16X16_TOKENS;
+    nzc_probs = cm->fc.nzc_probs_16x16[0][0][0];
+    upd = NZC_UPDATE_PROB_16X16;
+  } else if (block_size == 8) {
+    tokens = NZC8X8_TOKENS;
+    nzc_probs = cm->fc.nzc_probs_8x8[0][0][0];
+    upd = NZC_UPDATE_PROB_8X8;
+  } else {
+    tokens = NZC4X4_TOKENS;
+    nzc_probs = cm->fc.nzc_probs_4x4[0][0][0];
+    upd = NZC_UPDATE_PROB_4X4;
+  }
+  nodes = tokens - 1;
+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
+    for (r = 0; r < REF_TYPES; ++r) {
+      for (b = 0; b < BLOCK_TYPES; ++b) {
+        int offset = c * REF_TYPES * BLOCK_TYPES + r * BLOCK_TYPES + b;
+        int offset_nodes = offset * nodes;
+        for (t = 0; t < nodes; ++t) {
+          vp9_prob *p = &nzc_probs[offset_nodes + t];
+          if (vp9_read(bc, upd)) {
+            *p = read_prob_diff_update(bc, *p);
+          }
+        }
+      }
+    }
+  }
+}
+
+static void read_nzc_pcat_probs(VP9_COMMON *cm, BOOL_DECODER* const bc) {
+  int c, t, b;
+  vp9_prob upd = NZC_UPDATE_PROB_PCAT;
+  if (!vp9_read_bit(bc)) {
+    return;
+  }
+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
+    for (t = 0; t < NZC_TOKENS_EXTRA; ++t) {
+      int bits = vp9_extranzcbits[t + NZC_TOKENS_NOEXTRA];
+      for (b = 0; b < bits; ++b) {
+        vp9_prob *p = &cm->fc.nzc_pcat_probs[c][t][b];
+        if (vp9_read(bc, upd)) {
+          *p = read_prob_diff_update(bc, *p);
+        }
+      }
+    }
+  }
+}
+
+static void read_nzc_probs(VP9_COMMON *cm,
+                           BOOL_DECODER* const bc) {
+  read_nzc_probs_common(cm, bc, 4);
+  if (cm->txfm_mode != ONLY_4X4)
+    read_nzc_probs_common(cm, bc, 8);
+  if (cm->txfm_mode > ALLOW_8X8)
+    read_nzc_probs_common(cm, bc, 16);
+  if (cm->txfm_mode > ALLOW_16X16)
+    read_nzc_probs_common(cm, bc, 32);
+#ifdef NZC_PCAT_UPDATE
+  read_nzc_pcat_probs(cm, bc);
+#endif
+}
+#endif  // CONFIG_CODE_NONZEROCOUNT
+
 static void read_coef_probs_common(BOOL_DECODER* const bc,
                                    vp9_coeff_probs *coef_probs,
                                    int block_types) {
@@ -1085,7 +1215,7 @@
           for (l = 0; l < PREV_COEF_CONTEXTS; l++) {
             if (l >= 3 && k == 0)
               continue;
-            for (m = 0; m < ENTROPY_NODES; m++) {
+            for (m = CONFIG_CODE_NONZEROCOUNT; m < ENTROPY_NODES; m++) {
               vp9_prob *const p = coef_probs[i][j][k][l] + m;
 
               if (vp9_read(bc, COEF_UPDATE_PROB)) {
@@ -1539,6 +1669,19 @@
   pbi->common.fc.pre_interintra_prob = pbi->common.fc.interintra_prob;
 #endif
   pbi->common.fc.pre_nmvc = pbi->common.fc.nmvc;
+#if CONFIG_CODE_NONZEROCOUNT
+  vp9_copy(pbi->common.fc.pre_nzc_probs_4x4,
+           pbi->common.fc.nzc_probs_4x4);
+  vp9_copy(pbi->common.fc.pre_nzc_probs_8x8,
+           pbi->common.fc.nzc_probs_8x8);
+  vp9_copy(pbi->common.fc.pre_nzc_probs_16x16,
+           pbi->common.fc.nzc_probs_16x16);
+  vp9_copy(pbi->common.fc.pre_nzc_probs_32x32,
+           pbi->common.fc.nzc_probs_32x32);
+  vp9_copy(pbi->common.fc.pre_nzc_pcat_probs,
+           pbi->common.fc.nzc_pcat_probs);
+#endif
+
   vp9_zero(pbi->common.fc.coef_counts_4x4);
   vp9_zero(pbi->common.fc.coef_counts_8x8);
   vp9_zero(pbi->common.fc.coef_counts_16x16);
@@ -1555,8 +1698,18 @@
 #if CONFIG_COMP_INTERINTRA_PRED
   vp9_zero(pbi->common.fc.interintra_counts);
 #endif
+#if CONFIG_CODE_NONZEROCOUNT
+  vp9_zero(pbi->common.fc.nzc_counts_4x4);
+  vp9_zero(pbi->common.fc.nzc_counts_8x8);
+  vp9_zero(pbi->common.fc.nzc_counts_16x16);
+  vp9_zero(pbi->common.fc.nzc_counts_32x32);
+  vp9_zero(pbi->common.fc.nzc_pcat_counts);
+#endif
 
   read_coef_probs(pbi, &header_bc);
+#if CONFIG_CODE_NONZEROCOUNT
+  read_nzc_probs(&pbi->common, &header_bc);
+#endif
 
   /* Initialize xd pointers. Any reference should do for xd->pre, so use 0. */
   vpx_memcpy(&xd->pre, &pc->yv12_fb[pc->active_ref_idx[0]],
@@ -1700,8 +1853,12 @@
   }
 
   if (!pc->error_resilient_mode &&
-      !pc->frame_parallel_decoding_mode)
+      !pc->frame_parallel_decoding_mode) {
     vp9_adapt_coef_probs(pc);
+#if CONFIG_CODE_NONZEROCOUNT
+    vp9_adapt_nzc_probs(pc);
+#endif
+  }
   if (pc->frame_type != KEY_FRAME) {
     if (!pc->error_resilient_mode &&
         !pc->frame_parallel_decoding_mode) {
diff --git a/vp9/decoder/vp9_dequantize.c b/vp9/decoder/vp9_dequantize.c
index eaf9860..92b78ed 100644
--- a/vp9/decoder/vp9_dequantize.c
+++ b/vp9/decoder/vp9_dequantize.c
@@ -15,6 +15,7 @@
 #include "vp9/decoder/vp9_onyxd_int.h"
 #include "vp9/common/vp9_common.h"
 
+
 static void add_residual(const int16_t *diff, const uint8_t *pred, int pitch,
                          uint8_t *dest, int stride, int width, int height) {
   int r, c;
@@ -29,6 +30,26 @@
   }
 }
 
+void vp9_add_residual_4x4_c(const int16_t *diff, const uint8_t *pred, int pitch,
+                         uint8_t *dest, int stride) {
+  add_residual(diff, pred, pitch, dest, stride, 4, 4);
+}
+
+void vp9_add_residual_8x8_c(const int16_t *diff, const uint8_t *pred, int pitch,
+                         uint8_t *dest, int stride) {
+  add_residual(diff, pred, pitch, dest, stride, 8, 8);
+}
+
+void vp9_add_residual_16x16_c(const int16_t *diff, const uint8_t *pred,
+                              int pitch, uint8_t *dest, int stride) {
+  add_residual(diff, pred, pitch, dest, stride, 16, 16);
+}
+
+void vp9_add_residual_32x32_c(const int16_t *diff, const uint8_t *pred,
+                              int pitch, uint8_t *dest, int stride) {
+  add_residual(diff, pred, pitch, dest, stride, 32, 32);
+}
+
 static void add_constant_residual(const int16_t diff, const uint8_t *pred,
                                   int pitch, uint8_t *dest, int stride,
                                   int width, int height) {
@@ -43,26 +64,41 @@
   }
 }
 
+void vp9_add_constant_residual_8x8_c(const int16_t diff, const uint8_t *pred,
+                                     int pitch, uint8_t *dest, int stride) {
+  add_constant_residual(diff, pred, pitch, dest, stride, 8, 8);
+}
+
+void vp9_add_constant_residual_16x16_c(const int16_t diff, const uint8_t *pred,
+                                       int pitch, uint8_t *dest, int stride) {
+  add_constant_residual(diff, pred, pitch, dest, stride, 16, 16);
+}
+
+void vp9_add_constant_residual_32x32_c(const int16_t diff, const uint8_t *pred,
+                                       int pitch, uint8_t *dest, int stride) {
+  add_constant_residual(diff, pred, pitch, dest, stride, 32, 32);
+}
+
 void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, int16_t *input,
                                const int16_t *dq,
                                uint8_t *pred, uint8_t *dest,
                                int pitch, int stride, int eob) {
   int i;
-  int16_t output[16];
+  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
 
   for (i = 0; i < 16; i++)
     input[i] *= dq[i];
 
   vp9_short_iht4x4(input, output, 4, tx_type);
   vpx_memset(input, 0, 32);
-  add_residual(output, pred, pitch, dest, stride, 4, 4);
+  vp9_add_residual_4x4(output, pred, pitch, dest, stride);
 }
 
 void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, int16_t *input,
                                    const int16_t *dq,
                                    uint8_t *pred, uint8_t *dest,
                                    int pitch, int stride, int eob) {
-  int16_t output[64];
+  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 64);
 
   if (eob == 0) {
     // All 0 DCT coefficients
@@ -76,14 +112,14 @@
 
     vp9_short_iht8x8(input, output, 8, tx_type);
     vpx_memset(input, 0, 128);
-    add_residual(output, pred, pitch, dest, stride, 8, 8);
+    vp9_add_residual_8x8(output, pred, pitch, dest, stride);
   }
 }
 
 void vp9_dequant_idct_add_c(int16_t *input, const int16_t *dq, uint8_t *pred,
                             uint8_t *dest, int pitch, int stride, int eob) {
   int i;
-  int16_t output[16];
+  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
 
   if (eob > 1) {
     for (i = 0; i < 16; i++)
@@ -94,7 +130,7 @@
 
     vpx_memset(input, 0, 32);
 
-    add_residual(output, pred, pitch, dest, stride, 4, 4);
+    vp9_add_residual_4x4(output, pred, pitch, dest, stride);
   } else {
     vp9_dc_only_idct_add(input[0]*dq[0], pred, dest, pitch, stride);
     ((int *)input)[0] = 0;
@@ -104,7 +140,7 @@
 void vp9_dequant_dc_idct_add_c(int16_t *input, const int16_t *dq, uint8_t *pred,
                                uint8_t *dest, int pitch, int stride, int dc) {
   int i;
-  int16_t output[16];
+  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
 
   input[0] = dc;
 
@@ -114,14 +150,14 @@
   // the idct halves ( >> 1) the pitch
   vp9_short_idct4x4llm(input, output, 4 << 1);
   vpx_memset(input, 0, 32);
-  add_residual(output, pred, pitch, dest, stride, 4, 4);
+  vp9_add_residual_4x4(output, pred, pitch, dest, stride);
 }
 
 void vp9_dequant_idct_add_lossless_c(int16_t *input, const int16_t *dq,
                                      uint8_t *pred, uint8_t *dest,
                                      int pitch, int stride, int eob) {
   int i;
-  int16_t output[16];
+  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
 
   if (eob > 1) {
     for (i = 0; i < 16; i++)
@@ -131,7 +167,7 @@
 
     vpx_memset(input, 0, 32);
 
-    add_residual(output, pred, pitch, dest, stride, 4, 4);
+    vp9_add_residual_4x4(output, pred, pitch, dest, stride);
   } else {
     vp9_dc_only_inv_walsh_add(input[0]*dq[0], pred, dest, pitch, stride);
     ((int *)input)[0] = 0;
@@ -143,7 +179,7 @@
                                         uint8_t *dest,
                                         int pitch, int stride, int dc) {
   int i;
-  int16_t output[16];
+  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
 
   input[0] = dc;
 
@@ -152,14 +188,13 @@
 
   vp9_short_inv_walsh4x4_x8_c(input, output, 4 << 1);
   vpx_memset(input, 0, 32);
-  add_residual(output, pred, pitch, dest, stride, 4, 4);
+  vp9_add_residual_4x4(output, pred, pitch, dest, stride);
 }
 
 void vp9_dequant_idct_add_8x8_c(int16_t *input, const int16_t *dq,
                                 uint8_t *pred, uint8_t *dest, int pitch,
                                 int stride, int eob) {
-  int16_t output[64];
-
+  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 64);
 
   // If dc is 1, then input[0] is the reconstructed value, do not need
   // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.
@@ -182,7 +217,7 @@
     vp9_short_idct1_8x8_c(&in, &out);
     input[0] = 0;
 
-    add_constant_residual(out, pred, pitch, dest, stride, 8, 8);
+    vp9_add_constant_residual_8x8(out, pred, pitch, dest, stride);
   } else if (eob <= 10) {
     input[1] *= dq[1];
     input[2] *= dq[1];
@@ -201,7 +236,7 @@
     input[16] = input[17] = 0;
     input[24] = 0;
 
-    add_residual(output, pred, pitch, dest, stride, 8, 8);
+    vp9_add_residual_8x8(output, pred, pitch, dest, stride);
   } else {
     int i;
 
@@ -212,7 +247,7 @@
     // the idct halves ( >> 1) the pitch
     vp9_short_idct8x8_c(input, output, 8 << 1);
     vpx_memset(input, 0, 128);
-    add_residual(output, pred, pitch, dest, stride, 8, 8);
+    vp9_add_residual_8x8(output, pred, pitch, dest, stride);
   }
 }
 
@@ -220,7 +255,7 @@
                                      const int16_t *dq, uint8_t *pred,
                                      uint8_t *dest, int pitch, int stride,
                                      int eob) {
-  int16_t output[256];
+  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 256);
 
   if (eob == 0) {
     // All 0 DCT coefficients
@@ -242,14 +277,14 @@
 
     vpx_memset(input, 0, 512);
 
-    add_residual(output, pred, pitch, dest, stride, 16, 16);
+    vp9_add_residual_16x16(output, pred, pitch, dest, stride);
   }
 }
 
 void vp9_dequant_idct_add_16x16_c(int16_t *input, const int16_t *dq,
                                   uint8_t *pred, uint8_t *dest, int pitch,
                                   int stride, int eob) {
-  int16_t output[256];
+  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 256);
 
   /* The calculation can be simplified if there are not many non-zero dct
    * coefficients. Use eobs to separate different cases. */
@@ -265,7 +300,7 @@
     vp9_short_idct1_16x16_c(&in, &out);
     input[0] = 0;
 
-    add_constant_residual(out, pred, pitch, dest, stride, 16, 16);
+    vp9_add_constant_residual_16x16(out, pred, pitch, dest, stride);
   } else if (eob <= 10) {
     input[0] *= dq[0];
 
@@ -287,7 +322,7 @@
     input[32] = input[33] = 0;
     input[48] = 0;
 
-    add_residual(output, pred, pitch, dest, stride, 16, 16);
+    vp9_add_residual_16x16(output, pred, pitch, dest, stride);
   } else {
     int i;
 
@@ -302,20 +337,20 @@
 
     vpx_memset(input, 0, 512);
 
-    add_residual(output, pred, pitch, dest, stride, 16, 16);
+    vp9_add_residual_16x16(output, pred, pitch, dest, stride);
   }
 }
 
 void vp9_dequant_idct_add_32x32_c(int16_t *input, const int16_t *dq,
                                   uint8_t *pred, uint8_t *dest, int pitch,
                                   int stride, int eob) {
-  int16_t output[1024];
+  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 1024);
 
   if (eob) {
     input[0] = input[0] * dq[0] / 2;
     if (eob == 1) {
       vp9_short_idct1_32x32(input, output);
-      add_constant_residual(output[0], pred, pitch, dest, stride, 32, 32);
+      vp9_add_constant_residual_32x32(output[0], pred, pitch, dest, stride);
       input[0] = 0;
     } else if (eob <= 10) {
       input[1] = input[1] * dq[1] / 2;
@@ -336,14 +371,14 @@
       input[64] = input[65] = 0;
       input[96] = 0;
 
-      add_residual(output, pred, pitch, dest, stride, 32, 32);
+      vp9_add_residual_32x32(output, pred, pitch, dest, stride);
     } else {
       int i;
       for (i = 1; i < 1024; i++)
         input[i] = input[i] * dq[1] / 2;
       vp9_short_idct32x32(input, output, 64);
       vpx_memset(input, 0, 2048);
-      add_residual(output, pred, pitch, dest, stride, 32, 32);
+      vp9_add_residual_32x32(output, pred, pitch, dest, stride);
     }
   }
 }
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index a192266..0a584d7 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -69,13 +69,24 @@
     pt = vp9_get_coef_context(&recent_energy, token);         \
   } while (0)
 
+#if CONFIG_CODE_NONZEROCOUNT
 #define WRITE_COEF_CONTINUE(val, token)                       \
   {                                                           \
-    qcoeff_ptr[scan[c]] = (int16_t) get_signed(br, val);        \
+    qcoeff_ptr[scan[c]] = (int16_t) get_signed(br, val);      \
+    INCREMENT_COUNT(token);                                   \
+    c++;                                                      \
+    nzc++;                                           \
+    continue;                                                 \
+  }
+#else
+#define WRITE_COEF_CONTINUE(val, token)                       \
+  {                                                           \
+    qcoeff_ptr[scan[c]] = (int16_t) get_signed(br, val);      \
     INCREMENT_COUNT(token);                                   \
     c++;                                                      \
     continue;                                                 \
   }
+#endif  // CONFIG_CODE_NONZEROCOUNT
 
 #define ADJUST_COEF(prob, bits_count)  \
   do {                                 \
@@ -85,9 +96,8 @@
 
 static int decode_coefs(VP9D_COMP *dx, const MACROBLOCKD *xd,
                         BOOL_DECODER* const br, int block_idx,
-                        PLANE_TYPE type, TX_TYPE tx_type,
-                        int seg_eob, int16_t *qcoeff_ptr,
-                        const int *const scan, TX_SIZE txfm_size) {
+                        PLANE_TYPE type, int seg_eob, int16_t *qcoeff_ptr,
+                        TX_SIZE txfm_size) {
   ENTROPY_CONTEXT* const A0 = (ENTROPY_CONTEXT *) xd->above_context;
   ENTROPY_CONTEXT* const L0 = (ENTROPY_CONTEXT *) xd->left_context;
   int aidx, lidx;
@@ -99,6 +109,11 @@
   vp9_prob *prob;
   vp9_coeff_count *coef_counts;
   const int ref = xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME;
+#if CONFIG_CODE_NONZEROCOUNT
+  uint16_t nzc = 0;
+  uint16_t nzc_expected = xd->mode_info_context->mbmi.nzcs[block_idx];
+#endif
+  const int *scan;
 
   if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64) {
     aidx = vp9_block2above_sb64[txfm_size][block_idx];
@@ -113,19 +128,34 @@
 
   switch (txfm_size) {
     default:
-    case TX_4X4:
+    case TX_4X4: {
+      const TX_TYPE tx_type = get_tx_type_4x4(xd, block_idx);
+      switch (tx_type) {
+        default:
+          scan = vp9_default_zig_zag1d_4x4;
+          break;
+        case ADST_DCT:
+          scan = vp9_row_scan_4x4;
+          break;
+        case DCT_ADST:
+          scan = vp9_col_scan_4x4;
+          break;
+      }
       above_ec = A0[aidx] != 0;
       left_ec = L0[lidx] != 0;
       coef_probs  = fc->coef_probs_4x4;
       coef_counts = fc->coef_counts_4x4;
       break;
+    }
     case TX_8X8:
+      scan = vp9_default_zig_zag1d_8x8;
       coef_probs  = fc->coef_probs_8x8;
       coef_counts = fc->coef_counts_8x8;
       above_ec = (A0[aidx] + A0[aidx + 1]) != 0;
       left_ec  = (L0[lidx] + L0[lidx + 1]) != 0;
       break;
     case TX_16X16:
+      scan = vp9_default_zig_zag1d_16x16;
       coef_probs  = fc->coef_probs_16x16;
       coef_counts = fc->coef_counts_16x16;
       if (type == PLANE_TYPE_UV) {
@@ -139,6 +169,7 @@
       }
       break;
     case TX_32X32:
+      scan = vp9_default_zig_zag1d_32x32;
       coef_probs = fc->coef_probs_32x32;
       coef_counts = fc->coef_counts_32x32;
       if (type == PLANE_TYPE_UV) {
@@ -170,12 +201,24 @@
 
     if (c >= seg_eob)
       break;
+#if CONFIG_CODE_NONZEROCOUNT
+    if (nzc == nzc_expected)
+      break;
+#endif
     prob = coef_probs[type][ref][get_coef_band(txfm_size, c)][pt];
+#if CONFIG_CODE_NONZEROCOUNT == 0
     if (!vp9_read(br, prob[EOB_CONTEXT_NODE]))
       break;
+#endif
 SKIP_START:
     if (c >= seg_eob)
       break;
+#if CONFIG_CODE_NONZEROCOUNT
+    if (nzc == nzc_expected)
+      break;
+    // decode zero node only if there are zeros left
+    if (seg_eob - nzc_expected - c + nzc > 0)
+#endif
     if (!vp9_read(br, prob[ZERO_CONTEXT_NODE])) {
       INCREMENT_COUNT(ZERO_TOKEN);
       ++c;
@@ -242,8 +285,10 @@
     WRITE_COEF_CONTINUE(val, DCT_VAL_CATEGORY6);
   }
 
+#if CONFIG_CODE_NONZEROCOUNT == 0
   if (c < seg_eob)
     coef_counts[type][ref][get_coef_band(txfm_size, c)][pt][DCT_EOB_TOKEN]++;
+#endif
 
   A0[aidx] = L0[lidx] = c > 0;
   if (txfm_size >= TX_8X8) {
@@ -272,7 +317,6 @@
       }
     }
   }
-
   return c;
 }
 
@@ -290,17 +334,15 @@
     case TX_32X32:
       // Luma block
       c = decode_coefs(pbi, xd, bc, 0, PLANE_TYPE_Y_WITH_DC,
-                       DCT_DCT, get_eob(xd, segment_id, 1024),
-                       xd->qcoeff, vp9_default_zig_zag1d_32x32, TX_32X32);
+                       get_eob(xd, segment_id, 1024), xd->qcoeff, TX_32X32);
       xd->eobs[0] = c;
       eobtotal += c;
 
       // 16x16 chroma blocks
       seg_eob = get_eob(xd, segment_id, 256);
       for (i = 64; i < 96; i += 16) {
-        c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, DCT_DCT, seg_eob,
-                         xd->qcoeff + i * 16,
-                         vp9_default_zig_zag1d_16x16, TX_16X16);
+        c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, seg_eob,
+                         xd->qcoeff + i * 16, TX_16X16);
         xd->eobs[i] = c;
         eobtotal += c;
       }
@@ -310,17 +352,15 @@
       seg_eob = get_eob(xd, segment_id, 256);
       for (i = 0; i < 64; i += 16) {
         c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_Y_WITH_DC,
-                         DCT_DCT, seg_eob, xd->qcoeff + i * 16,
-                         vp9_default_zig_zag1d_16x16, TX_16X16);
+                         seg_eob, xd->qcoeff + i * 16, TX_16X16);
         xd->eobs[i] = c;
         eobtotal += c;
       }
 
       // 16x16 chroma blocks
       for (i = 64; i < 96; i += 16) {
-        c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, DCT_DCT, seg_eob,
-                         xd->qcoeff + i * 16,
-                         vp9_default_zig_zag1d_16x16, TX_16X16);
+        c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, seg_eob,
+                         xd->qcoeff + i * 16, TX_16X16);
         xd->eobs[i] = c;
         eobtotal += c;
       }
@@ -330,17 +370,15 @@
       seg_eob = get_eob(xd, segment_id, 64);
       for (i = 0; i < 64; i += 4) {
         c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_Y_WITH_DC,
-                         DCT_DCT, seg_eob, xd->qcoeff + i * 16,
-                         vp9_default_zig_zag1d_8x8, TX_8X8);
+                         seg_eob, xd->qcoeff + i * 16, TX_8X8);
         xd->eobs[i] = c;
         eobtotal += c;
       }
 
       // 8x8 chroma blocks
       for (i = 64; i < 96; i += 4) {
-        c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, DCT_DCT, seg_eob,
-                         xd->qcoeff + i * 16,
-                         vp9_default_zig_zag1d_8x8, TX_8X8);
+        c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, seg_eob,
+                         xd->qcoeff + i * 16, TX_8X8);
         xd->eobs[i] = c;
         eobtotal += c;
       }
@@ -350,17 +388,15 @@
       seg_eob = get_eob(xd, segment_id, 16);
       for (i = 0; i < 64; i++) {
         c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_Y_WITH_DC,
-                         DCT_DCT, seg_eob, xd->qcoeff + i * 16,
-                         vp9_default_zig_zag1d_4x4, TX_4X4);
+                         seg_eob, xd->qcoeff + i * 16, TX_4X4);
         xd->eobs[i] = c;
         eobtotal += c;
       }
 
       // 4x4 chroma blocks
       for (i = 64; i < 96; i++) {
-        c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, DCT_DCT, seg_eob,
-                         xd->qcoeff + i * 16,
-                         vp9_default_zig_zag1d_4x4, TX_4X4);
+        c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, seg_eob,
+                         xd->qcoeff + i * 16, TX_4X4);
         xd->eobs[i] = c;
         eobtotal += c;
       }
@@ -383,17 +419,15 @@
       seg_eob = get_eob(xd, segment_id, 1024);
       for (i = 0; i < 256; i += 64) {
         c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_Y_WITH_DC,
-                         DCT_DCT, seg_eob, xd->qcoeff + i * 16,
-                         vp9_default_zig_zag1d_32x32, TX_32X32);
+                         seg_eob, xd->qcoeff + i * 16, TX_32X32);
         xd->eobs[i] = c;
         eobtotal += c;
       }
 
       // 32x32 chroma blocks
       for (i = 256; i < 384; i += 64) {
-        c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, DCT_DCT, seg_eob,
-                         xd->qcoeff + i * 16,
-                         vp9_default_zig_zag1d_32x32, TX_32X32);
+        c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, seg_eob,
+                         xd->qcoeff + i * 16, TX_32X32);
         xd->eobs[i] = c;
         eobtotal += c;
       }
@@ -403,17 +437,15 @@
       seg_eob = get_eob(xd, segment_id, 256);
       for (i = 0; i < 256; i += 16) {
         c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_Y_WITH_DC,
-                         DCT_DCT, seg_eob, xd->qcoeff + i * 16,
-                         vp9_default_zig_zag1d_16x16, TX_16X16);
+                         seg_eob, xd->qcoeff + i * 16, TX_16X16);
         xd->eobs[i] = c;
         eobtotal += c;
       }
 
       // 16x16 chroma blocks
       for (i = 256; i < 384; i += 16) {
-        c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, DCT_DCT, seg_eob,
-                         xd->qcoeff + i * 16,
-                         vp9_default_zig_zag1d_16x16, TX_16X16);
+        c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, seg_eob,
+                         xd->qcoeff + i * 16, TX_16X16);
         xd->eobs[i] = c;
         eobtotal += c;
       }
@@ -423,17 +455,15 @@
       seg_eob = get_eob(xd, segment_id, 64);
       for (i = 0; i < 256; i += 4) {
         c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_Y_WITH_DC,
-                         DCT_DCT, seg_eob, xd->qcoeff + i * 16,
-                         vp9_default_zig_zag1d_8x8, TX_8X8);
+                         seg_eob, xd->qcoeff + i * 16, TX_8X8);
         xd->eobs[i] = c;
         eobtotal += c;
       }
 
       // 8x8 chroma blocks
       for (i = 256; i < 384; i += 4) {
-        c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, DCT_DCT, seg_eob,
-                         xd->qcoeff + i * 16,
-                         vp9_default_zig_zag1d_8x8, TX_8X8);
+        c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, seg_eob,
+                         xd->qcoeff + i * 16, TX_8X8);
         xd->eobs[i] = c;
         eobtotal += c;
       }
@@ -443,17 +473,15 @@
       seg_eob = get_eob(xd, segment_id, 16);
       for (i = 0; i < 256; i++) {
         c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_Y_WITH_DC,
-                         DCT_DCT, seg_eob, xd->qcoeff + i * 16,
-                         vp9_default_zig_zag1d_4x4, TX_4X4);
+                         seg_eob, xd->qcoeff + i * 16, TX_4X4);
         xd->eobs[i] = c;
         eobtotal += c;
       }
 
       // 4x4 chroma blocks
       for (i = 256; i < 384; i++) {
-        c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, DCT_DCT, seg_eob,
-                         xd->qcoeff + i * 16,
-                         vp9_default_zig_zag1d_4x4, TX_4X4);
+        c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, seg_eob,
+                         xd->qcoeff + i * 16, TX_4X4);
         xd->eobs[i] = c;
         eobtotal += c;
       }
@@ -472,9 +500,7 @@
 
   // Luma block
   int c = decode_coefs(pbi, xd, bc, 0, PLANE_TYPE_Y_WITH_DC,
-                       get_tx_type(xd, &xd->block[0]),
-                       get_eob(xd, segment_id, 256),
-                       xd->qcoeff, vp9_default_zig_zag1d_16x16, TX_16X16);
+                       get_eob(xd, segment_id, 256), xd->qcoeff, TX_16X16);
   xd->eobs[0] = c;
   eobtotal += c;
 
@@ -482,8 +508,7 @@
   seg_eob = get_eob(xd, segment_id, 64);
   for (i = 16; i < 24; i += 4) {
     c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV,
-                     DCT_DCT, seg_eob, xd->block[i].qcoeff,
-                     vp9_default_zig_zag1d_8x8, TX_8X8);
+                     seg_eob, xd->block[i].qcoeff, TX_8X8);
     xd->eobs[i] = c;
     eobtotal += c;
   }
@@ -500,9 +525,7 @@
   seg_eob = get_eob(xd, segment_id, 64);
   for (i = 0; i < 16; i += 4) {
     c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_Y_WITH_DC,
-                     get_tx_type(xd, xd->block + i),
-                     seg_eob, xd->block[i].qcoeff,
-                     vp9_default_zig_zag1d_8x8, TX_8X8);
+                     seg_eob, xd->block[i].qcoeff, TX_8X8);
     xd->eobs[i] = c;
     eobtotal += c;
   }
@@ -514,16 +537,14 @@
     seg_eob = get_eob(xd, segment_id, 16);
     for (i = 16; i < 24; i++) {
       c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV,
-                       DCT_DCT, seg_eob, xd->block[i].qcoeff,
-                       vp9_default_zig_zag1d_4x4, TX_4X4);
+                       seg_eob, xd->block[i].qcoeff, TX_4X4);
       xd->eobs[i] = c;
       eobtotal += c;
     }
   } else {
     for (i = 16; i < 24; i += 4) {
       c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV,
-                       DCT_DCT, seg_eob, xd->block[i].qcoeff,
-                       vp9_default_zig_zag1d_8x8, TX_8X8);
+                       seg_eob, xd->block[i].qcoeff, TX_8X8);
       xd->eobs[i] = c;
       eobtotal += c;
     }
@@ -534,43 +555,20 @@
 
 static int decode_coefs_4x4(VP9D_COMP *dx, MACROBLOCKD *xd,
                             BOOL_DECODER* const bc,
-                            PLANE_TYPE type, int i, int seg_eob,
-                            TX_TYPE tx_type, const int *scan) {
-  int c = decode_coefs(dx, xd, bc, i, type, tx_type, seg_eob,
-                       xd->block[i].qcoeff, scan, TX_4X4);
+                            PLANE_TYPE type, int i, int seg_eob) {
+  int c = decode_coefs(dx, xd, bc, i, type, seg_eob,
+                       xd->block[i].qcoeff, TX_4X4);
   xd->eobs[i] = c;
   return c;
 }
 
-static int decode_coefs_4x4_y(VP9D_COMP *dx, MACROBLOCKD *xd,
-                              BOOL_DECODER* const bc,
-                              PLANE_TYPE type, int i, int seg_eob) {
-  const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
-                          get_tx_type(xd, &xd->block[i]) : DCT_DCT;
-  const int *scan;
-
-  switch (tx_type) {
-    case ADST_DCT:
-      scan = vp9_row_scan_4x4;
-      break;
-    case DCT_ADST:
-      scan = vp9_col_scan_4x4;
-      break;
-    default:
-      scan = vp9_default_zig_zag1d_4x4;
-      break;
-  }
-
-  return decode_coefs_4x4(dx, xd, bc, type, i, seg_eob, tx_type, scan);
-}
-
 int vp9_decode_coefs_4x4(VP9D_COMP *dx, MACROBLOCKD *xd,
                          BOOL_DECODER* const bc,
                          PLANE_TYPE type, int i) {
   const int segment_id = xd->mode_info_context->mbmi.segment_id;
   const int seg_eob = get_eob(xd, segment_id, 16);
 
-  return decode_coefs_4x4_y(dx, xd, bc, type, i, seg_eob);
+  return decode_coefs_4x4(dx, xd, bc, type, i, seg_eob);
 }
 
 static int decode_mb_tokens_4x4_uv(VP9D_COMP* const dx,
@@ -581,8 +579,7 @@
 
   // chroma blocks
   for (i = 16; i < 24; i++) {
-    eobtotal += decode_coefs_4x4(dx, xd, bc, PLANE_TYPE_UV, i, seg_eob,
-                                 DCT_DCT, vp9_default_zig_zag1d_4x4);
+    eobtotal += decode_coefs_4x4(dx, xd, bc, PLANE_TYPE_UV, i, seg_eob);
   }
 
   return eobtotal;
@@ -606,8 +603,7 @@
 
   // luma blocks
   for (i = 0; i < 16; ++i) {
-    eobtotal += decode_coefs_4x4_y(dx, xd, bc,
-                                   PLANE_TYPE_Y_WITH_DC, i, seg_eob);
+    eobtotal += decode_coefs_4x4(dx, xd, bc, PLANE_TYPE_Y_WITH_DC, i, seg_eob);
   }
 
   // chroma blocks
diff --git a/vp9/decoder/vp9_onyxd_if.c b/vp9/decoder/vp9_onyxd_if.c
index ce7958c..87d1d3b 100644
--- a/vp9/decoder/vp9_onyxd_if.c
+++ b/vp9/decoder/vp9_onyxd_if.c
@@ -9,6 +9,9 @@
  */
 
 
+#include <stdio.h>
+#include <assert.h>
+
 #include "vp9/common/vp9_onyxc_int.h"
 #if CONFIG_POSTPROC
 #include "vp9/common/vp9_postproc.h"
@@ -19,8 +22,6 @@
 #include "vp9/common/vp9_alloccommon.h"
 #include "vp9/common/vp9_loopfilter.h"
 #include "vp9/common/vp9_swapyv12buffer.h"
-#include <stdio.h>
-#include <assert.h>
 
 #include "vp9/common/vp9_quant_common.h"
 #include "vpx_scale/vpx_scale.h"
@@ -99,7 +100,7 @@
 }
 #endif
 
-void vp9_initialize_dec(void) {
+void vp9_initialize_dec() {
   static int init_done = 0;
 
   if (!init_done) {
diff --git a/vp9/decoder/vp9_treereader.h b/vp9/decoder/vp9_treereader.h
index cca017d..305dfe5 100644
--- a/vp9/decoder/vp9_treereader.h
+++ b/vp9/decoder/vp9_treereader.h
@@ -13,7 +13,6 @@
 #define VP9_DECODER_VP9_TREEREADER_H_
 
 #include "vp9/common/vp9_treecoder.h"
-
 #include "vp9/decoder/vp9_dboolhuff.h"
 
 typedef BOOL_DECODER vp9_reader;
diff --git a/vp9/decoder/x86/vp9_dequantize_x86.c b/vp9/decoder/x86/vp9_dequantize_x86.c
new file mode 100644
index 0000000..acfae2a
--- /dev/null
+++ b/vp9/decoder/x86/vp9_dequantize_x86.c
@@ -0,0 +1,455 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>  // SSE2
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_idct.h"
+
+#if HAVE_SSE2
+
+void vp9_add_residual_4x4_sse2(const int16_t *diff, const uint8_t *pred,
+                               int pitch, uint8_t *dest, int stride) {
+  const int width = 4;
+  const __m128i zero = _mm_setzero_si128();
+
+  // Diff data
+  const __m128i d0 = _mm_loadl_epi64((const __m128i *)(diff + 0 * width));
+  const __m128i d1 = _mm_loadl_epi64((const __m128i *)(diff + 1 * width));
+  const __m128i d2 = _mm_loadl_epi64((const __m128i *)(diff + 2 * width));
+  const __m128i d3 = _mm_loadl_epi64((const __m128i *)(diff + 3 * width));
+
+  // Prediction data.
+  __m128i p0 = _mm_cvtsi32_si128(*(const int *)(pred + 0 * pitch));
+  __m128i p1 = _mm_cvtsi32_si128(*(const int *)(pred + 1 * pitch));
+  __m128i p2 = _mm_cvtsi32_si128(*(const int *)(pred + 2 * pitch));
+  __m128i p3 = _mm_cvtsi32_si128(*(const int *)(pred + 3 * pitch));
+
+  p0 = _mm_unpacklo_epi8(p0, zero);
+  p1 = _mm_unpacklo_epi8(p1, zero);
+  p2 = _mm_unpacklo_epi8(p2, zero);
+  p3 = _mm_unpacklo_epi8(p3, zero);
+
+  p0 = _mm_add_epi16(p0, d0);
+  p1 = _mm_add_epi16(p1, d1);
+  p2 = _mm_add_epi16(p2, d2);
+  p3 = _mm_add_epi16(p3, d3);
+
+  p0 = _mm_packus_epi16(p0, p1);
+  p2 = _mm_packus_epi16(p2, p3);
+
+  *(int *)dest = _mm_cvtsi128_si32(p0);
+  dest += stride;
+
+  p0 = _mm_srli_si128(p0, 8);
+  *(int *)dest = _mm_cvtsi128_si32(p0);
+  dest += stride;
+
+  *(int *)dest = _mm_cvtsi128_si32(p2);
+  dest += stride;
+
+  p2 = _mm_srli_si128(p2, 8);
+  *(int *)dest = _mm_cvtsi128_si32(p2);
+}
+
+void vp9_add_residual_8x8_sse2(const int16_t *diff, const uint8_t *pred,
+                               int pitch, uint8_t *dest, int stride) {
+  const int width = 8;
+  const __m128i zero = _mm_setzero_si128();
+
+  // Diff data
+  const __m128i d0 = _mm_load_si128((const __m128i *)(diff + 0 * width));
+  const __m128i d1 = _mm_load_si128((const __m128i *)(diff + 1 * width));
+  const __m128i d2 = _mm_load_si128((const __m128i *)(diff + 2 * width));
+  const __m128i d3 = _mm_load_si128((const __m128i *)(diff + 3 * width));
+  const __m128i d4 = _mm_load_si128((const __m128i *)(diff + 4 * width));
+  const __m128i d5 = _mm_load_si128((const __m128i *)(diff + 5 * width));
+  const __m128i d6 = _mm_load_si128((const __m128i *)(diff + 6 * width));
+  const __m128i d7 = _mm_load_si128((const __m128i *)(diff + 7 * width));
+
+  // Prediction data.
+  __m128i p0 = _mm_loadl_epi64((const __m128i *)(pred + 0 * pitch));
+  __m128i p1 = _mm_loadl_epi64((const __m128i *)(pred + 1 * pitch));
+  __m128i p2 = _mm_loadl_epi64((const __m128i *)(pred + 2 * pitch));
+  __m128i p3 = _mm_loadl_epi64((const __m128i *)(pred + 3 * pitch));
+  __m128i p4 = _mm_loadl_epi64((const __m128i *)(pred + 4 * pitch));
+  __m128i p5 = _mm_loadl_epi64((const __m128i *)(pred + 5 * pitch));
+  __m128i p6 = _mm_loadl_epi64((const __m128i *)(pred + 6 * pitch));
+  __m128i p7 = _mm_loadl_epi64((const __m128i *)(pred + 7 * pitch));
+
+  p0 = _mm_unpacklo_epi8(p0, zero);
+  p1 = _mm_unpacklo_epi8(p1, zero);
+  p2 = _mm_unpacklo_epi8(p2, zero);
+  p3 = _mm_unpacklo_epi8(p3, zero);
+  p4 = _mm_unpacklo_epi8(p4, zero);
+  p5 = _mm_unpacklo_epi8(p5, zero);
+  p6 = _mm_unpacklo_epi8(p6, zero);
+  p7 = _mm_unpacklo_epi8(p7, zero);
+
+  p0 = _mm_add_epi16(p0, d0);
+  p1 = _mm_add_epi16(p1, d1);
+  p2 = _mm_add_epi16(p2, d2);
+  p3 = _mm_add_epi16(p3, d3);
+  p4 = _mm_add_epi16(p4, d4);
+  p5 = _mm_add_epi16(p5, d5);
+  p6 = _mm_add_epi16(p6, d6);
+  p7 = _mm_add_epi16(p7, d7);
+
+  p0 = _mm_packus_epi16(p0, p1);
+  p2 = _mm_packus_epi16(p2, p3);
+  p4 = _mm_packus_epi16(p4, p5);
+  p6 = _mm_packus_epi16(p6, p7);
+
+  _mm_storel_epi64((__m128i *)(dest + 0 * stride), p0);
+  p0 = _mm_srli_si128(p0, 8);
+  _mm_storel_epi64((__m128i *)(dest + 1 * stride), p0);
+
+  _mm_storel_epi64((__m128i *)(dest + 2 * stride), p2);
+  p2 = _mm_srli_si128(p2, 8);
+  _mm_storel_epi64((__m128i *)(dest + 3 * stride), p2);
+
+  _mm_storel_epi64((__m128i *)(dest + 4 * stride), p4);
+  p4 = _mm_srli_si128(p4, 8);
+  _mm_storel_epi64((__m128i *)(dest + 5 * stride), p4);
+
+  _mm_storel_epi64((__m128i *)(dest + 6 * stride), p6);
+  p6 = _mm_srli_si128(p6, 8);
+  _mm_storel_epi64((__m128i *)(dest + 7 * stride), p6);
+}
+
+void vp9_add_residual_16x16_sse2(const int16_t *diff, const uint8_t *pred,
+                             int pitch, uint8_t *dest, int stride) {
+  const int width = 16;
+  int i = 4;
+  const __m128i zero = _mm_setzero_si128();
+
+  // Diff data
+  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+  __m128i p0, p1, p2, p3, p4, p5, p6, p7;
+
+  do {
+    d0 = _mm_load_si128((const __m128i *)(diff + 0 * width));
+    d1 = _mm_load_si128((const __m128i *)(diff + 0 * width + 8));
+    d2 = _mm_load_si128((const __m128i *)(diff + 1 * width));
+    d3 = _mm_load_si128((const __m128i *)(diff + 1 * width + 8));
+    d4 = _mm_load_si128((const __m128i *)(diff + 2 * width));
+    d5 = _mm_load_si128((const __m128i *)(diff + 2 * width + 8));
+    d6 = _mm_load_si128((const __m128i *)(diff + 3 * width));
+    d7 = _mm_load_si128((const __m128i *)(diff + 3 * width + 8));
+
+    // Prediction data.
+    p1 = _mm_load_si128((const __m128i *)(pred + 0 * pitch));
+    p3 = _mm_load_si128((const __m128i *)(pred + 1 * pitch));
+    p5 = _mm_load_si128((const __m128i *)(pred + 2 * pitch));
+    p7 = _mm_load_si128((const __m128i *)(pred + 3 * pitch));
+
+    p0 = _mm_unpacklo_epi8(p1, zero);
+    p1 = _mm_unpackhi_epi8(p1, zero);
+    p2 = _mm_unpacklo_epi8(p3, zero);
+    p3 = _mm_unpackhi_epi8(p3, zero);
+    p4 = _mm_unpacklo_epi8(p5, zero);
+    p5 = _mm_unpackhi_epi8(p5, zero);
+    p6 = _mm_unpacklo_epi8(p7, zero);
+    p7 = _mm_unpackhi_epi8(p7, zero);
+
+    p0 = _mm_add_epi16(p0, d0);
+    p1 = _mm_add_epi16(p1, d1);
+    p2 = _mm_add_epi16(p2, d2);
+    p3 = _mm_add_epi16(p3, d3);
+    p4 = _mm_add_epi16(p4, d4);
+    p5 = _mm_add_epi16(p5, d5);
+    p6 = _mm_add_epi16(p6, d6);
+    p7 = _mm_add_epi16(p7, d7);
+
+    p0 = _mm_packus_epi16(p0, p1);
+    p1 = _mm_packus_epi16(p2, p3);
+    p2 = _mm_packus_epi16(p4, p5);
+    p3 = _mm_packus_epi16(p6, p7);
+
+    _mm_store_si128((__m128i *)(dest + 0 * stride), p0);
+    _mm_store_si128((__m128i *)(dest + 1 * stride), p1);
+    _mm_store_si128((__m128i *)(dest + 2 * stride), p2);
+    _mm_store_si128((__m128i *)(dest + 3 * stride), p3);
+
+    diff += 4 * width;
+    pred += 4 * pitch;
+    dest += 4 * stride;
+  } while (--i);
+}
+
+void vp9_add_residual_32x32_sse2(const int16_t *diff, const uint8_t *pred,
+                             int pitch, uint8_t *dest, int stride) {
+  const int width = 32;
+  int i = 16;
+  const __m128i zero = _mm_setzero_si128();
+
+  // Diff data
+  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+  __m128i p0, p1, p2, p3, p4, p5, p6, p7;
+
+  do {
+    d0 = _mm_load_si128((const __m128i *)(diff + 0 * width));
+    d1 = _mm_load_si128((const __m128i *)(diff + 0 * width + 8));
+    d2 = _mm_load_si128((const __m128i *)(diff + 0 * width + 16));
+    d3 = _mm_load_si128((const __m128i *)(diff + 0 * width + 24));
+    d4 = _mm_load_si128((const __m128i *)(diff + 1 * width));
+    d5 = _mm_load_si128((const __m128i *)(diff + 1 * width + 8));
+    d6 = _mm_load_si128((const __m128i *)(diff + 1 * width + 16));
+    d7 = _mm_load_si128((const __m128i *)(diff + 1 * width + 24));
+
+    // Prediction data.
+    p1 = _mm_load_si128((const __m128i *)(pred + 0 * pitch));
+    p3 = _mm_load_si128((const __m128i *)(pred + 0 * pitch + 16));
+    p5 = _mm_load_si128((const __m128i *)(pred + 1 * pitch));
+    p7 = _mm_load_si128((const __m128i *)(pred + 1 * pitch + 16));
+
+    p0 = _mm_unpacklo_epi8(p1, zero);
+    p1 = _mm_unpackhi_epi8(p1, zero);
+    p2 = _mm_unpacklo_epi8(p3, zero);
+    p3 = _mm_unpackhi_epi8(p3, zero);
+    p4 = _mm_unpacklo_epi8(p5, zero);
+    p5 = _mm_unpackhi_epi8(p5, zero);
+    p6 = _mm_unpacklo_epi8(p7, zero);
+    p7 = _mm_unpackhi_epi8(p7, zero);
+
+    p0 = _mm_add_epi16(p0, d0);
+    p1 = _mm_add_epi16(p1, d1);
+    p2 = _mm_add_epi16(p2, d2);
+    p3 = _mm_add_epi16(p3, d3);
+    p4 = _mm_add_epi16(p4, d4);
+    p5 = _mm_add_epi16(p5, d5);
+    p6 = _mm_add_epi16(p6, d6);
+    p7 = _mm_add_epi16(p7, d7);
+
+    p0 = _mm_packus_epi16(p0, p1);
+    p1 = _mm_packus_epi16(p2, p3);
+    p2 = _mm_packus_epi16(p4, p5);
+    p3 = _mm_packus_epi16(p6, p7);
+
+    _mm_store_si128((__m128i *)(dest + 0 * stride), p0);
+    _mm_store_si128((__m128i *)(dest + 0 * stride + 16), p1);
+    _mm_store_si128((__m128i *)(dest + 1 * stride), p2);
+    _mm_store_si128((__m128i *)(dest + 1 * stride + 16), p3);
+
+    diff += 2 * width;
+    pred += 2 * pitch;
+    dest += 2 * stride;
+  } while (--i);
+}
+
+void vp9_add_constant_residual_8x8_sse2(const int16_t diff, const uint8_t *pred,
+                                        int pitch, uint8_t *dest, int stride) {
+  uint8_t abs_diff;
+  __m128i d;
+
+  // Prediction data.
+  __m128i p0 = _mm_loadl_epi64((const __m128i *)(pred + 0 * pitch));
+  __m128i p1 = _mm_loadl_epi64((const __m128i *)(pred + 1 * pitch));
+  __m128i p2 = _mm_loadl_epi64((const __m128i *)(pred + 2 * pitch));
+  __m128i p3 = _mm_loadl_epi64((const __m128i *)(pred + 3 * pitch));
+  __m128i p4 = _mm_loadl_epi64((const __m128i *)(pred + 4 * pitch));
+  __m128i p5 = _mm_loadl_epi64((const __m128i *)(pred + 5 * pitch));
+  __m128i p6 = _mm_loadl_epi64((const __m128i *)(pred + 6 * pitch));
+  __m128i p7 = _mm_loadl_epi64((const __m128i *)(pred + 7 * pitch));
+
+  p0 = _mm_unpacklo_epi64(p0, p1);
+  p2 = _mm_unpacklo_epi64(p2, p3);
+  p4 = _mm_unpacklo_epi64(p4, p5);
+  p6 = _mm_unpacklo_epi64(p6, p7);
+
+  // Clip diff value to [0, 255] range. Then, do addition or subtraction
+  // according to its sign.
+  if (diff >= 0) {
+    abs_diff = (diff > 255) ? 255 : diff;
+    d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);
+
+    p0 = _mm_adds_epu8(p0, d);
+    p2 = _mm_adds_epu8(p2, d);
+    p4 = _mm_adds_epu8(p4, d);
+    p6 = _mm_adds_epu8(p6, d);
+  } else {
+    abs_diff = (diff < -255) ? 255 : -diff;
+    d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);
+
+    p0 = _mm_subs_epu8(p0, d);
+    p2 = _mm_subs_epu8(p2, d);
+    p4 = _mm_subs_epu8(p4, d);
+    p6 = _mm_subs_epu8(p6, d);
+  }
+
+  _mm_storel_epi64((__m128i *)(dest + 0 * stride), p0);
+  p0 = _mm_srli_si128(p0, 8);
+  _mm_storel_epi64((__m128i *)(dest + 1 * stride), p0);
+
+  _mm_storel_epi64((__m128i *)(dest + 2 * stride), p2);
+  p2 = _mm_srli_si128(p2, 8);
+  _mm_storel_epi64((__m128i *)(dest + 3 * stride), p2);
+
+  _mm_storel_epi64((__m128i *)(dest + 4 * stride), p4);
+  p4 = _mm_srli_si128(p4, 8);
+  _mm_storel_epi64((__m128i *)(dest + 5 * stride), p4);
+
+  _mm_storel_epi64((__m128i *)(dest + 6 * stride), p6);
+  p6 = _mm_srli_si128(p6, 8);
+  _mm_storel_epi64((__m128i *)(dest + 7 * stride), p6);
+}
+
+void vp9_add_constant_residual_16x16_sse2(const int16_t diff,
+                                          const uint8_t *pred, int pitch,
+                                          uint8_t *dest, int stride) {
+  uint8_t abs_diff;
+  __m128i d;
+
+  // Prediction data.
+  __m128i p0 = _mm_load_si128((const __m128i *)(pred + 0 * pitch));
+  __m128i p1 = _mm_load_si128((const __m128i *)(pred + 1 * pitch));
+  __m128i p2 = _mm_load_si128((const __m128i *)(pred + 2 * pitch));
+  __m128i p3 = _mm_load_si128((const __m128i *)(pred + 3 * pitch));
+  __m128i p4 = _mm_load_si128((const __m128i *)(pred + 4 * pitch));
+  __m128i p5 = _mm_load_si128((const __m128i *)(pred + 5 * pitch));
+  __m128i p6 = _mm_load_si128((const __m128i *)(pred + 6 * pitch));
+  __m128i p7 = _mm_load_si128((const __m128i *)(pred + 7 * pitch));
+  __m128i p8 = _mm_load_si128((const __m128i *)(pred + 8 * pitch));
+  __m128i p9 = _mm_load_si128((const __m128i *)(pred + 9 * pitch));
+  __m128i p10 = _mm_load_si128((const __m128i *)(pred + 10 * pitch));
+  __m128i p11 = _mm_load_si128((const __m128i *)(pred + 11 * pitch));
+  __m128i p12 = _mm_load_si128((const __m128i *)(pred + 12 * pitch));
+  __m128i p13 = _mm_load_si128((const __m128i *)(pred + 13 * pitch));
+  __m128i p14 = _mm_load_si128((const __m128i *)(pred + 14 * pitch));
+  __m128i p15 = _mm_load_si128((const __m128i *)(pred + 15 * pitch));
+
+  // Clip diff value to [0, 255] range. Then, do addition or subtraction
+  // according to its sign.
+  if (diff >= 0) {
+    abs_diff = (diff > 255) ? 255 : diff;
+    d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);
+
+    p0 = _mm_adds_epu8(p0, d);
+    p1 = _mm_adds_epu8(p1, d);
+    p2 = _mm_adds_epu8(p2, d);
+    p3 = _mm_adds_epu8(p3, d);
+    p4 = _mm_adds_epu8(p4, d);
+    p5 = _mm_adds_epu8(p5, d);
+    p6 = _mm_adds_epu8(p6, d);
+    p7 = _mm_adds_epu8(p7, d);
+    p8 = _mm_adds_epu8(p8, d);
+    p9 = _mm_adds_epu8(p9, d);
+    p10 = _mm_adds_epu8(p10, d);
+    p11 = _mm_adds_epu8(p11, d);
+    p12 = _mm_adds_epu8(p12, d);
+    p13 = _mm_adds_epu8(p13, d);
+    p14 = _mm_adds_epu8(p14, d);
+    p15 = _mm_adds_epu8(p15, d);
+  } else {
+    abs_diff = (diff < -255) ? 255 : -diff;
+    d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);
+
+    p0 = _mm_subs_epu8(p0, d);
+    p1 = _mm_subs_epu8(p1, d);
+    p2 = _mm_subs_epu8(p2, d);
+    p3 = _mm_subs_epu8(p3, d);
+    p4 = _mm_subs_epu8(p4, d);
+    p5 = _mm_subs_epu8(p5, d);
+    p6 = _mm_subs_epu8(p6, d);
+    p7 = _mm_subs_epu8(p7, d);
+    p8 = _mm_subs_epu8(p8, d);
+    p9 = _mm_subs_epu8(p9, d);
+    p10 = _mm_subs_epu8(p10, d);
+    p11 = _mm_subs_epu8(p11, d);
+    p12 = _mm_subs_epu8(p12, d);
+    p13 = _mm_subs_epu8(p13, d);
+    p14 = _mm_subs_epu8(p14, d);
+    p15 = _mm_subs_epu8(p15, d);
+  }
+
+  // Store results
+  _mm_store_si128((__m128i *)(dest + 0 * stride), p0);
+  _mm_store_si128((__m128i *)(dest + 1 * stride), p1);
+  _mm_store_si128((__m128i *)(dest + 2 * stride), p2);
+  _mm_store_si128((__m128i *)(dest + 3 * stride), p3);
+  _mm_store_si128((__m128i *)(dest + 4 * stride), p4);
+  _mm_store_si128((__m128i *)(dest + 5 * stride), p5);
+  _mm_store_si128((__m128i *)(dest + 6 * stride), p6);
+  _mm_store_si128((__m128i *)(dest + 7 * stride), p7);
+  _mm_store_si128((__m128i *)(dest + 8 * stride), p8);
+  _mm_store_si128((__m128i *)(dest + 9 * stride), p9);
+  _mm_store_si128((__m128i *)(dest + 10 * stride), p10);
+  _mm_store_si128((__m128i *)(dest + 11 * stride), p11);
+  _mm_store_si128((__m128i *)(dest + 12 * stride), p12);
+  _mm_store_si128((__m128i *)(dest + 13 * stride), p13);
+  _mm_store_si128((__m128i *)(dest + 14 * stride), p14);
+  _mm_store_si128((__m128i *)(dest + 15 * stride), p15);
+}
+
+void vp9_add_constant_residual_32x32_sse2(const int16_t diff,
+                                          const uint8_t *pred, int pitch,
+                                          uint8_t *dest, int stride) {
+  uint8_t abs_diff;
+  __m128i d;
+  int i = 8;
+
+  if (diff >= 0) {
+    abs_diff = (diff > 255) ? 255 : diff;
+    d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);
+  } else {
+    abs_diff = (diff < -255) ? 255 : -diff;
+    d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);
+  }
+
+  do {
+    // Prediction data.
+    __m128i p0 = _mm_load_si128((const __m128i *)(pred + 0 * pitch));
+    __m128i p1 = _mm_load_si128((const __m128i *)(pred + 0 * pitch + 16));
+    __m128i p2 = _mm_load_si128((const __m128i *)(pred + 1 * pitch));
+    __m128i p3 = _mm_load_si128((const __m128i *)(pred + 1 * pitch + 16));
+    __m128i p4 = _mm_load_si128((const __m128i *)(pred + 2 * pitch));
+    __m128i p5 = _mm_load_si128((const __m128i *)(pred + 2 * pitch + 16));
+    __m128i p6 = _mm_load_si128((const __m128i *)(pred + 3 * pitch));
+    __m128i p7 = _mm_load_si128((const __m128i *)(pred + 3 * pitch + 16));
+
+    // Clip diff value to [0, 255] range. Then, do addition or subtraction
+    // according to its sign.
+    if (diff >= 0) {
+      p0 = _mm_adds_epu8(p0, d);
+      p1 = _mm_adds_epu8(p1, d);
+      p2 = _mm_adds_epu8(p2, d);
+      p3 = _mm_adds_epu8(p3, d);
+      p4 = _mm_adds_epu8(p4, d);
+      p5 = _mm_adds_epu8(p5, d);
+      p6 = _mm_adds_epu8(p6, d);
+      p7 = _mm_adds_epu8(p7, d);
+    } else {
+      p0 = _mm_subs_epu8(p0, d);
+      p1 = _mm_subs_epu8(p1, d);
+      p2 = _mm_subs_epu8(p2, d);
+      p3 = _mm_subs_epu8(p3, d);
+      p4 = _mm_subs_epu8(p4, d);
+      p5 = _mm_subs_epu8(p5, d);
+      p6 = _mm_subs_epu8(p6, d);
+      p7 = _mm_subs_epu8(p7, d);
+    }
+
+    // Store results
+    _mm_store_si128((__m128i *)(dest + 0 * stride), p0);
+    _mm_store_si128((__m128i *)(dest + 0 * stride + 16), p1);
+    _mm_store_si128((__m128i *)(dest + 1 * stride), p2);
+    _mm_store_si128((__m128i *)(dest + 1 * stride + 16), p3);
+    _mm_store_si128((__m128i *)(dest + 2 * stride), p4);
+    _mm_store_si128((__m128i *)(dest + 2 * stride + 16), p5);
+    _mm_store_si128((__m128i *)(dest + 3 * stride), p6);
+    _mm_store_si128((__m128i *)(dest + 3 * stride + 16), p7);
+
+    pred += 4 * pitch;
+    dest += 4 * stride;
+  } while (--i);
+}
+#endif
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index fcbd3a1..558971d 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -50,6 +50,24 @@
 extern unsigned int active_section;
 #endif
 
+#if CONFIG_CODE_NONZEROCOUNT
+#ifdef NZC_STATS
+unsigned int nzc_stats_4x4[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
+                          [NZC4X4_TOKENS];
+unsigned int nzc_stats_8x8[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
+                          [NZC8X8_TOKENS];
+unsigned int nzc_stats_16x16[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
+                          [NZC16X16_TOKENS];
+unsigned int nzc_stats_32x32[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
+                          [NZC32X32_TOKENS];
+unsigned int nzc_pcat_stats[MAX_NZC_CONTEXTS][NZC_TOKENS_EXTRA]
+                          [NZC_BITS_EXTRA][2];
+void init_nzcstats();
+void update_nzcstats(VP9_COMMON *const cm);
+void print_nzcstats();
+#endif
+#endif
+
 #ifdef MODE_STATS
 int count_mb_seg[4] = { 0, 0, 0, 0 };
 #endif
@@ -719,10 +737,9 @@
   // Distance of Mb to the various image edges.
   // These specified to 8th pel as they are always compared to MV
   // values that are in 1/8th pel units
-  xd->mb_to_left_edge = -((mb_col * 16) << 3);
-  xd->mb_to_top_edge = -((mb_row * 16)) << 3;
-  xd->mb_to_right_edge = ((pc->mb_cols - mb_size - mb_col) * 16) << 3;
-  xd->mb_to_bottom_edge = ((pc->mb_rows - mb_size - mb_row) * 16) << 3;
+
+  set_mb_row(pc, xd, mb_row, mb_size);
+  set_mb_col(pc, xd, mb_col, mb_size);
 
 #ifdef ENTROPY_STATS
   active_section = 9;
@@ -751,18 +768,7 @@
   } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {
     skip_coeff = 1;
   } else {
-    const int nmbs = mb_size;
-    const int xmbs = MIN(nmbs, mb_cols_left);
-    const int ymbs = MIN(nmbs, mb_rows_left);
-    int x, y;
-
-    skip_coeff = 1;
-    for (y = 0; y < ymbs; y++) {
-      for (x = 0; x < xmbs; x++) {
-        skip_coeff = skip_coeff && m[y * mis + x].mbmi.mb_skip_coeff;
-      }
-    }
-
+    skip_coeff = m->mbmi.mb_skip_coeff;
     vp9_write(bc, skip_coeff,
               vp9_get_pred_prob(pc, xd, PRED_MBSKIP));
   }
@@ -966,7 +972,7 @@
 }
 
 static void write_mb_modes_kf(const VP9_COMP *cpi,
-                              const MODE_INFO *m,
+                              MODE_INFO *m,
                               vp9_writer *bc,
                               int mb_rows_left, int mb_cols_left) {
   const VP9_COMMON *const c = &cpi->common;
@@ -985,18 +991,7 @@
   } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {
     skip_coeff = 1;
   } else {
-    const int nmbs = 1 << m->mbmi.sb_type;
-    const int xmbs = MIN(nmbs, mb_cols_left);
-    const int ymbs = MIN(nmbs, mb_rows_left);
-    int x, y;
-
-    skip_coeff = 1;
-    for (y = 0; y < ymbs; y++) {
-      for (x = 0; x < xmbs; x++) {
-        skip_coeff = skip_coeff && m[y * mis + x].mbmi.mb_skip_coeff;
-      }
-    }
-
+    skip_coeff = m->mbmi.mb_skip_coeff;
     vp9_write(bc, skip_coeff,
               vp9_get_pred_prob(c, xd, PRED_MBSKIP));
   }
@@ -1054,30 +1049,585 @@
   }
 }
 
+#if CONFIG_CODE_NONZEROCOUNT
+static void write_nzc(VP9_COMMON *const cm,
+                      uint16_t nzc,
+                      int nzc_context,
+                      TX_SIZE tx_size,
+                      int ref,
+                      int type,
+                      vp9_writer* const bc) {
+  int c, e;
+  c = codenzc(nzc);
+  if (tx_size == TX_32X32) {
+    write_token(bc, vp9_nzc32x32_tree,
+                cm->fc.nzc_probs_32x32[nzc_context][ref][type],
+                vp9_nzc32x32_encodings + c);
+    // cm->fc.nzc_counts_32x32[nzc_context][ref][type][c]++;
+  } else if (tx_size == TX_16X16) {
+    write_token(bc, vp9_nzc16x16_tree,
+                cm->fc.nzc_probs_16x16[nzc_context][ref][type],
+                vp9_nzc16x16_encodings + c);
+    // cm->fc.nzc_counts_16x16[nzc_context][ref][type][c]++;
+  } else if (tx_size == TX_8X8) {
+    write_token(bc, vp9_nzc8x8_tree,
+                cm->fc.nzc_probs_8x8[nzc_context][ref][type],
+                vp9_nzc8x8_encodings + c);
+    // cm->fc.nzc_counts_8x8[nzc_context][ref][type][c]++;
+  } else if (tx_size == TX_4X4) {
+    write_token(bc, vp9_nzc4x4_tree,
+                cm->fc.nzc_probs_4x4[nzc_context][ref][type],
+                vp9_nzc4x4_encodings + c);
+    // cm->fc.nzc_counts_4x4[nzc_context][ref][type][c]++;
+  } else {
+    assert(0);
+  }
+
+  if ((e = vp9_extranzcbits[c])) {
+    int x = nzc - vp9_basenzcvalue[c];
+    while (e--) {
+      int b = (x >> e) & 1;
+      vp9_write(bc, b,
+                cm->fc.nzc_pcat_probs[nzc_context][c - NZC_TOKENS_NOEXTRA][e]);
+      // cm->fc.nzc_pcat_counts[nzc_context][c - NZC_TOKENS_NOEXTRA][e][b]++;
+    }
+  }
+}
+
+static void write_nzcs_sb64(VP9_COMP *cpi,
+                            MACROBLOCKD *xd,
+                            int mb_row,
+                            int mb_col,
+                            vp9_writer* const bc) {
+  VP9_COMMON *const cm = &cpi->common;
+  MODE_INFO *m = xd->mode_info_context;
+  MB_MODE_INFO *const mi = &m->mbmi;
+  int j, nzc_context;
+  const int ref = m->mbmi.ref_frame != INTRA_FRAME;
+
+  assert(mb_col == get_mb_col(xd));
+  assert(mb_row == get_mb_row(xd));
+
+  if (mi->mb_skip_coeff)
+    return;
+
+  switch (mi->txfm_size) {
+    case TX_32X32:
+      for (j = 0; j < 256; j += 64) {
+        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_32X32, ref, 0, bc);
+      }
+      for (j = 256; j < 384; j += 64) {
+        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_32X32, ref, 1, bc);
+      }
+      break;
+
+    case TX_16X16:
+      for (j = 0; j < 256; j += 16) {
+        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 0, bc);
+      }
+      for (j = 256; j < 384; j += 16) {
+        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 1, bc);
+      }
+      break;
+
+    case TX_8X8:
+      for (j = 0; j < 256; j += 4) {
+        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 0, bc);
+      }
+      for (j = 256; j < 384; j += 4) {
+        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1, bc);
+      }
+      break;
+
+    case TX_4X4:
+      for (j = 0; j < 256; ++j) {
+        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 0, bc);
+      }
+      for (j = 256; j < 384; ++j) {
+        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1, bc);
+      }
+      break;
+
+    default:
+      break;
+  }
+}
+
+static void write_nzcs_sb32(VP9_COMP *cpi,
+                            MACROBLOCKD *xd,
+                            int mb_row,
+                            int mb_col,
+                            vp9_writer* const bc) {
+  VP9_COMMON *const cm = &cpi->common;
+  MODE_INFO *m = xd->mode_info_context;
+  MB_MODE_INFO *const mi = &m->mbmi;
+  int j, nzc_context;
+  const int ref = m->mbmi.ref_frame != INTRA_FRAME;
+
+  assert(mb_col == get_mb_col(xd));
+  assert(mb_row == get_mb_row(xd));
+
+  if (mi->mb_skip_coeff)
+    return;
+
+  switch (mi->txfm_size) {
+    case TX_32X32:
+      for (j = 0; j < 64; j += 64) {
+        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_32X32, ref, 0, bc);
+      }
+      for (j = 64; j < 96; j += 16) {
+        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 1, bc);
+      }
+      break;
+
+    case TX_16X16:
+      for (j = 0; j < 64; j += 16) {
+        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 0, bc);
+      }
+      for (j = 64; j < 96; j += 16) {
+        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 1, bc);
+      }
+      break;
+
+    case TX_8X8:
+      for (j = 0; j < 64; j += 4) {
+        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 0, bc);
+      }
+      for (j = 64; j < 96; j += 4) {
+        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1, bc);
+      }
+      break;
+
+    case TX_4X4:
+      for (j = 0; j < 64; ++j) {
+        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 0, bc);
+      }
+      for (j = 64; j < 96; ++j) {
+        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1, bc);
+      }
+      break;
+
+    default:
+      break;
+  }
+}
+
+static void write_nzcs_mb16(VP9_COMP *cpi,
+                            MACROBLOCKD *xd,
+                            int mb_row,
+                            int mb_col,
+                            vp9_writer* const bc) {
+  VP9_COMMON *const cm = &cpi->common;
+  MODE_INFO *m = xd->mode_info_context;
+  MB_MODE_INFO *const mi = &m->mbmi;
+  int j, nzc_context;
+  const int ref = m->mbmi.ref_frame != INTRA_FRAME;
+
+  assert(mb_col == get_mb_col(xd));
+  assert(mb_row == get_mb_row(xd));
+
+  if (mi->mb_skip_coeff)
+    return;
+
+  switch (mi->txfm_size) {
+    case TX_16X16:
+      for (j = 0; j < 16; j += 16) {
+        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);
+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 0, bc);
+      }
+      for (j = 16; j < 24; j += 4) {
+        nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1, bc);
+      }
+      break;
+
+    case TX_8X8:
+      for (j = 0; j < 16; j += 4) {
+        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);
+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 0, bc);
+      }
+      if (mi->mode == I8X8_PRED || mi->mode == SPLITMV) {
+        for (j = 16; j < 24; ++j) {
+          nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
+          write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1, bc);
+        }
+      } else {
+        for (j = 16; j < 24; j += 4) {
+          nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
+          write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1, bc);
+        }
+      }
+      break;
+
+    case TX_4X4:
+      for (j = 0; j < 16; ++j) {
+        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);
+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 0, bc);
+      }
+      for (j = 16; j < 24; ++j) {
+        nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1, bc);
+      }
+      break;
+
+    default:
+      break;
+  }
+}
+
+#ifdef NZC_STATS
+void init_nzcstats(void) {
+  vp9_zero(nzc_stats_4x4);
+  vp9_zero(nzc_stats_8x8);
+  vp9_zero(nzc_stats_16x16);
+  vp9_zero(nzc_stats_32x32);
+  vp9_zero(nzc_pcat_stats);
+}
+
+void update_nzcstats(VP9_COMMON *const cm) {
+  int c, r, b, t;
+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
+    for (r = 0; r < REF_TYPES; ++r) {
+      for (b = 0; b < BLOCK_TYPES; ++b) {
+        for (t = 0; t < NZC4X4_TOKENS; ++t) {
+          nzc_stats_4x4[c][r][b][t] += cm->fc.nzc_counts_4x4[c][r][b][t];
+        }
+      }
+    }
+  }
+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
+    for (r = 0; r < REF_TYPES; ++r) {
+      for (b = 0; b < BLOCK_TYPES; ++b) {
+        for (t = 0; t < NZC8X8_TOKENS; ++t) {
+          nzc_stats_8x8[c][r][b][t] += cm->fc.nzc_counts_8x8[c][r][b][t];
+        }
+      }
+    }
+  }
+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
+    for (r = 0; r < REF_TYPES; ++r) {
+      for (b = 0; b < BLOCK_TYPES; ++b) {
+        for (t = 0; t < NZC16X16_TOKENS; ++t) {
+          nzc_stats_16x16[c][r][b][t] += cm->fc.nzc_counts_16x16[c][r][b][t];
+        }
+      }
+    }
+  }
+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
+    for (r = 0; r < REF_TYPES; ++r) {
+      for (b = 0; b < BLOCK_TYPES; ++b) {
+        for (t = 0; t < NZC32X32_TOKENS; ++t) {
+          nzc_stats_32x32[c][r][b][t] += cm->fc.nzc_counts_32x32[c][r][b][t];
+        }
+      }
+    }
+  }
+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
+    for (t = 0; t < NZC_TOKENS_EXTRA; ++t) {
+      int bits = vp9_extranzcbits[t + NZC_TOKENS_NOEXTRA];
+      for (b = 0; b < bits; ++b) {
+        nzc_pcat_stats[c][t][b][0] += cm->fc.nzc_pcat_counts[c][t][b][0];
+        nzc_pcat_stats[c][t][b][1] += cm->fc.nzc_pcat_counts[c][t][b][1];
+      }
+    }
+  }
+}
+
+void print_nzcstats(void) {
+  int c, r, b, t;
+  printf(
+    "static const unsigned int default_nzc_counts_4x4[MAX_NZC_CONTEXTS]\n"
+    "                                                [REF_TYPES]\n"
+    "                                                [BLOCK_TYPES]\n"
+    "                                                [NZC4X4_TOKENS] = {\n");
+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
+    printf("  {\n");
+    for (r = 0; r < REF_TYPES; ++r) {
+      printf("    {\n");
+      for (b = 0; b < BLOCK_TYPES; ++b) {
+        printf("      {");
+        for (t = 0; t < NZC4X4_TOKENS; ++t) {
+          printf(" %-3d,", nzc_stats_4x4[c][r][b][t]);
+        }
+        printf(" },\n");
+      }
+      printf("    },\n");
+    }
+    printf("  },\n");
+  }
+  printf("};\n");
+
+  printf(
+    "static const unsigned int default_nzc_counts_8x8[MAX_NZC_CONTEXTS]\n"
+    "                                                [REF_TYPES]\n"
+    "                                                [BLOCK_TYPES]\n"
+    "                                                [NZC8X8_TOKENS] = {\n");
+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
+    printf("  {\n");
+    for (r = 0; r < REF_TYPES; ++r) {
+      printf("    {\n");
+      for (b = 0; b < BLOCK_TYPES; ++b) {
+        printf("      {");
+        for (t = 0; t < NZC8X8_TOKENS; ++t) {
+          printf(" %-3d,", nzc_stats_8x8[c][r][b][t]);
+        }
+        printf(" },\n");
+      }
+      printf("    },\n");
+    }
+    printf("  },\n");
+  }
+  printf("};\n");
+
+  printf(
+    "static const unsigned int default_nzc_counts_16x16[MAX_NZC_CONTEXTS]\n"
+    "                                                  [REF_TYPES]\n"
+    "                                                  [BLOCK_TYPES]\n"
+    "                                                  [NZC16X16_TOKENS] = {"
+    "\n");
+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
+    printf("  {\n");
+    for (r = 0; r < REF_TYPES; ++r) {
+      printf("    {\n");
+      for (b = 0; b < BLOCK_TYPES; ++b) {
+        printf("      {");
+        for (t = 0; t < NZC16X16_TOKENS; ++t) {
+          printf(" %-3d,", nzc_stats_16x16[c][r][b][t]);
+        }
+        printf(" },\n");
+      }
+      printf("    },\n");
+    }
+    printf("  },\n");
+  }
+  printf("};\n");
+
+  printf(
+    "static const unsigned int default_nzc_counts_32x32[MAX_NZC_CONTEXTS]\n"
+    "                                                  [REF_TYPES]\n"
+    "                                                  [BLOCK_TYPES]\n"
+    "                                                  [NZC32X32_TOKENS] = {"
+    "\n");
+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
+    printf("  {\n");
+    for (r = 0; r < REF_TYPES; ++r) {
+      printf("    {\n");
+      for (b = 0; b < BLOCK_TYPES; ++b) {
+        printf("      {");
+        for (t = 0; t < NZC32X32_TOKENS; ++t) {
+          printf(" %-3d,", nzc_stats_32x32[c][r][b][t]);
+        }
+        printf(" },\n");
+      }
+      printf("    },\n");
+    }
+    printf("  },\n");
+  }
+  printf("};\n");
+
+  printf(
+    "static const unsigned int default_nzc_pcat_counts[MAX_NZC_CONTEXTS]\n"
+    "                                                 [NZC_TOKENS_EXTRA]\n"
+    "                                                 [NZC_BITS_EXTRA] = {\n");
+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
+    printf("  {\n");
+    for (t = 0; t < NZC_TOKENS_EXTRA; ++t) {
+      printf("    {");
+      for (b = 0; b < NZC_BITS_EXTRA; ++b) {
+        printf(" %d/%d,",
+               nzc_pcat_stats[c][t][b][0], nzc_pcat_stats[c][t][b][1]);
+      }
+      printf(" },\n");
+    }
+    printf("  },\n");
+  }
+  printf("};\n");
+
+  printf(
+    "static const vp9_prob default_nzc_probs_4x4[MAX_NZC_CONTEXTS]\n"
+    "                                           [REF_TYPES]\n"
+    "                                           [BLOCK_TYPES]\n"
+    "                                           [NZC4X4_TOKENS] = {\n");
+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
+    printf("  {\n");
+    for (r = 0; r < REF_TYPES; ++r) {
+      printf("    {\n");
+      for (b = 0; b < BLOCK_TYPES; ++b) {
+        vp9_prob probs[NZC4X4_NODES];
+        unsigned int branch_ct[NZC4X4_NODES][2];
+        vp9_tree_probs_from_distribution(NZC4X4_TOKENS,
+                                         vp9_nzc4x4_encodings,
+                                         vp9_nzc4x4_tree,
+                                         probs, branch_ct,
+                                         nzc_stats_4x4[c][r][b]);
+        printf("      {");
+        for (t = 0; t < NZC4X4_NODES; ++t) {
+          printf(" %-3d,", probs[t]);
+        }
+        printf(" },\n");
+      }
+      printf("    },\n");
+    }
+    printf("  },\n");
+  }
+  printf("};\n");
+
+  printf(
+    "static const vp9_prob default_nzc_probs_8x8[MAX_NZC_CONTEXTS]\n"
+    "                                           [REF_TYPES]\n"
+    "                                           [BLOCK_TYPES]\n"
+    "                                           [NZC8X8_TOKENS] = {\n");
+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
+    printf("  {\n");
+    for (r = 0; r < REF_TYPES; ++r) {
+      printf("    {\n");
+      for (b = 0; b < BLOCK_TYPES; ++b) {
+        vp9_prob probs[NZC8X8_NODES];
+        unsigned int branch_ct[NZC8X8_NODES][2];
+        vp9_tree_probs_from_distribution(NZC8X8_TOKENS,
+                                         vp9_nzc8x8_encodings,
+                                         vp9_nzc8x8_tree,
+                                         probs, branch_ct,
+                                         nzc_stats_8x8[c][r][b]);
+        printf("      {");
+        for (t = 0; t < NZC8X8_NODES; ++t) {
+          printf(" %-3d,", probs[t]);
+        }
+        printf(" },\n");
+      }
+      printf("    },\n");
+    }
+    printf("  },\n");
+  }
+  printf("};\n");
+
+  printf(
+    "static const vp9_prob default_nzc_probs_16x16[MAX_NZC_CONTEXTS]\n"
+    "                                             [REF_TYPES]\n"
+    "                                             [BLOCK_TYPES]\n"
+    "                                             [NZC16X16_TOKENS] = {\n");
+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
+    printf("  {\n");
+    for (r = 0; r < REF_TYPES; ++r) {
+      printf("    {\n");
+      for (b = 0; b < BLOCK_TYPES; ++b) {
+        vp9_prob probs[NZC16X16_NODES];
+        unsigned int branch_ct[NZC16X16_NODES][2];
+        vp9_tree_probs_from_distribution(NZC16X16_TOKENS,
+                                         vp9_nzc16x16_encodings,
+                                         vp9_nzc16x16_tree,
+                                         probs, branch_ct,
+                                         nzc_stats_16x16[c][r][b]);
+        printf("      {");
+        for (t = 0; t < NZC16X16_NODES; ++t) {
+          printf(" %-3d,", probs[t]);
+        }
+        printf(" },\n");
+      }
+      printf("    },\n");
+    }
+    printf("  },\n");
+  }
+  printf("};\n");
+
+  printf(
+    "static const vp9_prob default_nzc_probs_32x32[MAX_NZC_CONTEXTS]\n"
+    "                                             [REF_TYPES]\n"
+    "                                             [BLOCK_TYPES]\n"
+    "                                             [NZC32X32_TOKENS] = {\n");
+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
+    printf("  {\n");
+    for (r = 0; r < REF_TYPES; ++r) {
+      printf("    {\n");
+      for (b = 0; b < BLOCK_TYPES; ++b) {
+        vp9_prob probs[NZC32X32_NODES];
+        unsigned int branch_ct[NZC32X32_NODES][2];
+        vp9_tree_probs_from_distribution(NZC32X32_TOKENS,
+                                         vp9_nzc32x32_encodings,
+                                         vp9_nzc32x32_tree,
+                                         probs, branch_ct,
+                                         nzc_stats_32x32[c][r][b]);
+        printf("      {");
+        for (t = 0; t < NZC32X32_NODES; ++t) {
+          printf(" %-3d,", probs[t]);
+        }
+        printf(" },\n");
+      }
+      printf("    },\n");
+    }
+    printf("  },\n");
+  }
+  printf("};\n");
+
+  printf(
+    "static const vp9_prob default_nzc_pcat_probs[MAX_NZC_CONTEXTS]\n"
+    "                                            [NZC_TOKENS_EXTRA]\n"
+    "                                            [NZC_BITS_EXTRA] = {\n");
+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
+    printf("  {\n");
+    for (t = 0; t < NZC_TOKENS_EXTRA; ++t) {
+      printf("    {");
+      for (b = 0; b < NZC_BITS_EXTRA; ++b) {
+        vp9_prob prob = get_binary_prob(nzc_pcat_stats[c][t][b][0],
+                                        nzc_pcat_stats[c][t][b][1]);
+        printf(" %-3d,", prob);
+      }
+      printf(" },\n");
+    }
+    printf("  },\n");
+  }
+  printf("};\n");
+}
+#endif
+
+#endif  // CONFIG_CODE_NONZEROCOUNT
+
 static void write_modes_b(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc,
                           TOKENEXTRA **tok, TOKENEXTRA *tok_end,
                           int mb_row, int mb_col) {
-  VP9_COMMON *const c = &cpi->common;
+  VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
 
   xd->mode_info_context = m;
-  xd->left_available = mb_col > c->cur_tile_mb_col_start;
-  xd->right_available =
-      (mb_col + (1 << m->mbmi.sb_type)) < c->cur_tile_mb_col_end;
-  xd->up_available = mb_row > 0;
-  if (c->frame_type == KEY_FRAME) {
+  set_mb_row(&cpi->common, xd, mb_row, (1 << m->mbmi.sb_type));
+  set_mb_col(&cpi->common, xd, mb_col, (1 << m->mbmi.sb_type));
+  if (cm->frame_type == KEY_FRAME) {
     write_mb_modes_kf(cpi, m, bc,
-                      c->mb_rows - mb_row, c->mb_cols - mb_col);
+                      cm->mb_rows - mb_row, cm->mb_cols - mb_col);
 #ifdef ENTROPY_STATS
     active_section = 8;
 #endif
   } else {
     pack_inter_mode_mvs(cpi, m, bc,
-                        c->mb_rows - mb_row, c->mb_cols - mb_col);
+                        cm->mb_rows - mb_row, cm->mb_cols - mb_col);
 #ifdef ENTROPY_STATS
     active_section = 1;
 #endif
   }
+#if CONFIG_CODE_NONZEROCOUNT
+  if (m->mbmi.sb_type == BLOCK_SIZE_SB64X64)
+    write_nzcs_sb64(cpi, xd, mb_row, mb_col, bc);
+  else if (m->mbmi.sb_type == BLOCK_SIZE_SB32X32)
+    write_nzcs_sb32(cpi, xd, mb_row, mb_col, bc);
+  else
+    write_nzcs_mb16(cpi, xd, mb_row, mb_col, bc);
+#endif
 
   assert(*tok < tok_end);
   pack_mb_tokens(bc, tok, tok_end);
@@ -1230,6 +1780,234 @@
                           cpi->frame_branch_ct_32x32, BLOCK_TYPES);
 }
 
+#if CONFIG_CODE_NONZEROCOUNT
+static void update_nzc_probs_common(VP9_COMP* cpi,
+                                    vp9_writer* const bc,
+                                    int block_size) {
+  VP9_COMMON *cm = &cpi->common;
+  int c, r, b, t;
+  int update[2] = {0, 0};
+  int savings = 0;
+  int tokens, nodes;
+  const vp9_tree_index *nzc_tree;
+  const struct vp9_token_struct *nzc_encodings;
+  vp9_prob *new_nzc_probs;
+  vp9_prob *old_nzc_probs;
+  unsigned int *nzc_counts;
+  unsigned int (*nzc_branch_ct)[2];
+  vp9_prob upd;
+
+  if (block_size == 32) {
+    tokens = NZC32X32_TOKENS;
+    nzc_tree = vp9_nzc32x32_tree;
+    nzc_encodings = vp9_nzc32x32_encodings;
+    old_nzc_probs = cm->fc.nzc_probs_32x32[0][0][0];
+    new_nzc_probs = cpi->frame_nzc_probs_32x32[0][0][0];
+    nzc_counts = cm->fc.nzc_counts_32x32[0][0][0];
+    nzc_branch_ct = cpi->frame_nzc_branch_ct_32x32[0][0][0];
+    upd = NZC_UPDATE_PROB_32X32;
+  } else if (block_size == 16) {
+    tokens = NZC16X16_TOKENS;
+    nzc_tree = vp9_nzc16x16_tree;
+    nzc_encodings = vp9_nzc16x16_encodings;
+    old_nzc_probs = cm->fc.nzc_probs_16x16[0][0][0];
+    new_nzc_probs = cpi->frame_nzc_probs_16x16[0][0][0];
+    nzc_counts = cm->fc.nzc_counts_16x16[0][0][0];
+    nzc_branch_ct = cpi->frame_nzc_branch_ct_16x16[0][0][0];
+    upd = NZC_UPDATE_PROB_16X16;
+  } else if (block_size == 8) {
+    tokens = NZC8X8_TOKENS;
+    nzc_tree = vp9_nzc8x8_tree;
+    nzc_encodings = vp9_nzc8x8_encodings;
+    old_nzc_probs = cm->fc.nzc_probs_8x8[0][0][0];
+    new_nzc_probs = cpi->frame_nzc_probs_8x8[0][0][0];
+    nzc_counts = cm->fc.nzc_counts_8x8[0][0][0];
+    nzc_branch_ct = cpi->frame_nzc_branch_ct_8x8[0][0][0];
+    upd = NZC_UPDATE_PROB_8X8;
+  } else {
+    nzc_tree = vp9_nzc4x4_tree;
+    nzc_encodings = vp9_nzc4x4_encodings;
+    tokens = NZC4X4_TOKENS;
+    old_nzc_probs = cm->fc.nzc_probs_4x4[0][0][0];
+    new_nzc_probs = cpi->frame_nzc_probs_4x4[0][0][0];
+    nzc_counts = cm->fc.nzc_counts_4x4[0][0][0];
+    nzc_branch_ct = cpi->frame_nzc_branch_ct_4x4[0][0][0];
+    upd = NZC_UPDATE_PROB_4X4;
+  }
+  nodes = tokens - 1;
+  // Get the new probabilities and the branch counts
+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
+    for (r = 0; r < REF_TYPES; ++r) {
+      for (b = 0; b < BLOCK_TYPES; ++b) {
+        int offset = c * REF_TYPES * BLOCK_TYPES + r * BLOCK_TYPES + b;
+        int offset_nodes = offset * nodes;
+        int offset_tokens = offset * tokens;
+        vp9_tree_probs_from_distribution(tokens,
+                                         nzc_encodings, nzc_tree,
+                                         new_nzc_probs + offset_nodes,
+                                         nzc_branch_ct + offset_nodes,
+                                         nzc_counts + offset_tokens);
+      }
+    }
+  }
+
+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
+    for (r = 0; r < REF_TYPES; ++r) {
+      for (b = 0; b < BLOCK_TYPES; ++b) {
+        int offset = c * REF_TYPES * BLOCK_TYPES + r * BLOCK_TYPES + b;
+        int offset_nodes = offset * nodes;
+        for (t = 0; t < nodes; ++t) {
+          vp9_prob newp = new_nzc_probs[offset_nodes + t];
+          vp9_prob oldp = old_nzc_probs[offset_nodes + t];
+          int s, u = 0;
+#if defined(SEARCH_NEWP)
+            s = prob_diff_update_savings_search(nzc_branch_ct[offset_nodes],
+                                                oldp, &newp, upd);
+            if (s > 0 && newp != oldp)
+              u = 1;
+            if (u)
+              savings += s - (int)(vp9_cost_zero(upd));
+            else
+              savings -= (int)(vp9_cost_zero(upd));
+#else
+          s = prob_update_savings(nzc_branch_ct[offset_nodes],
+                                  oldp, newp, upd);
+          if (s > 0)
+            u = 1;
+          if (u)
+            savings += s;
+#endif
+          update[u]++;
+        }
+      }
+    }
+  }
+  if (update[1] == 0 || savings < 0) {
+    vp9_write_bit(bc, 0);
+  } else {
+    vp9_write_bit(bc, 1);
+    for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
+      for (r = 0; r < REF_TYPES; ++r) {
+        for (b = 0; b < BLOCK_TYPES; ++b) {
+          int offset = c * REF_TYPES * BLOCK_TYPES + r * BLOCK_TYPES + b;
+          int offset_nodes = offset * nodes;
+          for (t = 0; t < nodes; ++t) {
+            vp9_prob newp = new_nzc_probs[offset_nodes + t];
+            vp9_prob *oldp = &old_nzc_probs[offset_nodes + t];
+            int s, u = 0;
+#if defined(SEARCH_NEWP)
+            s = prob_diff_update_savings_search(nzc_branch_ct[offset_nodes],
+                                                *oldp, &newp, upd);
+            if (s > 0 && newp != *oldp)
+              u = 1;
+#else
+            s = prob_update_savings(nzc_branch_ct[offset_nodes],
+                                    *oldp, newp, upd);
+            if (s > 0)
+              u = 1;
+#endif
+            vp9_write(bc, u, upd);
+            if (u) {
+              /* send/use new probability */
+              write_prob_diff_update(bc, newp, *oldp);
+              *oldp = newp;
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+static void update_nzc_pcat_probs(VP9_COMP *cpi, vp9_writer* const bc) {
+  VP9_COMMON *cm = &cpi->common;
+  int c, t, b;
+  int update[2] = {0, 0};
+  int savings = 0;
+  vp9_prob upd = NZC_UPDATE_PROB_PCAT;
+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
+    for (t = 0; t < NZC_TOKENS_EXTRA; ++t) {
+      int bits = vp9_extranzcbits[t + NZC_TOKENS_NOEXTRA];
+      for (b = 0; b < bits; ++b) {
+        vp9_prob newp = get_binary_prob(cm->fc.nzc_pcat_counts[c][t][b][0],
+                                        cm->fc.nzc_pcat_counts[c][t][b][1]);
+        vp9_prob oldp = cm->fc.nzc_pcat_probs[c][t][b];
+        int s, u = 0;
+#if defined(SEARCH_NEWP)
+        s = prob_diff_update_savings_search(cm->fc.nzc_pcat_counts[c][t][b],
+                                            oldp, &newp, upd);
+        if (s > 0 && newp != oldp)
+          u = 1;
+        if (u)
+          savings += s - (int)(vp9_cost_zero(upd));
+        else
+          savings -= (int)(vp9_cost_zero(upd));
+#else
+        s = prob_update_savings(cm->fc.nzc_pcat_counts[c][t][b],
+                                oldp, newp, upd);
+        if (s > 0)
+          u = 1;
+        if (u)
+          savings += s;
+#endif
+        update[u]++;
+      }
+    }
+  }
+  if (update[1] == 0 || savings < 0) {
+    vp9_write_bit(bc, 0);
+  } else {
+    vp9_write_bit(bc, 1);
+    for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
+      for (t = 0; t < NZC_TOKENS_EXTRA; ++t) {
+        int bits = vp9_extranzcbits[t + NZC_TOKENS_NOEXTRA];
+        for (b = 0; b < bits; ++b) {
+          vp9_prob newp = get_binary_prob(cm->fc.nzc_pcat_counts[c][t][b][0],
+                                          cm->fc.nzc_pcat_counts[c][t][b][1]);
+          vp9_prob *oldp = &cm->fc.nzc_pcat_probs[c][t][b];
+          int s, u = 0;
+#if defined(SEARCH_NEWP)
+          s = prob_diff_update_savings_search(cm->fc.nzc_pcat_counts[c][t][b],
+                                              *oldp, &newp, upd);
+          if (s > 0 && newp != *oldp)
+            u = 1;
+#else
+          s = prob_update_savings(cm->fc.nzc_pcat_counts[c][t][b],
+                                  *oldp, newp, upd);
+          if (s > 0)
+            u = 1;
+#endif
+          vp9_write(bc, u, upd);
+          if (u) {
+            /* send/use new probability */
+            write_prob_diff_update(bc, newp, *oldp);
+            *oldp = newp;
+          }
+        }
+      }
+    }
+  }
+}
+
+static void update_nzc_probs(VP9_COMP* cpi,
+                             vp9_writer* const bc) {
+  update_nzc_probs_common(cpi, bc, 4);
+  if (cpi->common.txfm_mode != ONLY_4X4)
+    update_nzc_probs_common(cpi, bc, 8);
+  if (cpi->common.txfm_mode > ALLOW_8X8)
+    update_nzc_probs_common(cpi, bc, 16);
+  if (cpi->common.txfm_mode > ALLOW_16X16)
+    update_nzc_probs_common(cpi, bc, 32);
+#ifdef NZC_PCAT_UPDATE
+  update_nzc_pcat_probs(cpi, bc);
+#endif
+#ifdef NZC_STATS
+  if (!cpi->dummy_packing)
+    update_nzcstats(&cpi->common);
+#endif
+}
+#endif  // CONFIG_CODE_NONZEROCOUNT
+
 static void update_coef_probs_common(vp9_writer* const bc,
 #ifdef ENTROPY_STATS
                                      VP9_COMP *cpi,
@@ -1251,7 +2029,7 @@
       for (k = 0; k < COEF_BANDS; ++k) {
         int prev_coef_savings[ENTROPY_NODES] = {0};
         for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
-          for (t = 0; t < ENTROPY_NODES; ++t) {
+          for (t = CONFIG_CODE_NONZEROCOUNT; t < ENTROPY_NODES; ++t) {
             vp9_prob newp = new_frame_coef_probs[i][j][k][l][t];
             const vp9_prob oldp = old_frame_coef_probs[i][j][k][l][t];
             const vp9_prob upd = COEF_UPDATE_PROB;
@@ -1297,7 +2075,7 @@
           int prev_coef_savings[ENTROPY_NODES] = {0};
           for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
             // calc probs and branch cts for this frame only
-            for (t = 0; t < ENTROPY_NODES; ++t) {
+            for (t = CONFIG_CODE_NONZEROCOUNT; t < ENTROPY_NODES; ++t) {
               vp9_prob newp = new_frame_coef_probs[i][j][k][l][t];
               vp9_prob *oldp = old_frame_coef_probs[i][j][k][l] + t;
               const vp9_prob upd = COEF_UPDATE_PROB;
@@ -1898,6 +2676,27 @@
            cpi->common.fc.coef_probs_16x16);
   vp9_copy(cpi->common.fc.pre_coef_probs_32x32,
            cpi->common.fc.coef_probs_32x32);
+#if CONFIG_CODE_NONZEROCOUNT
+  vp9_copy(cpi->common.fc.pre_nzc_probs_4x4,
+           cpi->common.fc.nzc_probs_4x4);
+  vp9_copy(cpi->common.fc.pre_nzc_probs_8x8,
+           cpi->common.fc.nzc_probs_8x8);
+  vp9_copy(cpi->common.fc.pre_nzc_probs_16x16,
+           cpi->common.fc.nzc_probs_16x16);
+  vp9_copy(cpi->common.fc.pre_nzc_probs_32x32,
+           cpi->common.fc.nzc_probs_32x32);
+  vp9_copy(cpi->common.fc.pre_nzc_pcat_probs,
+           cpi->common.fc.nzc_pcat_probs);
+  // NOTE that if the counts are reset, we also need to uncomment
+  // the count updates in the write_nzc function
+  /*
+  vp9_zero(cpi->common.fc.nzc_counts_4x4);
+  vp9_zero(cpi->common.fc.nzc_counts_8x8);
+  vp9_zero(cpi->common.fc.nzc_counts_16x16);
+  vp9_zero(cpi->common.fc.nzc_counts_32x32);
+  vp9_zero(cpi->common.fc.nzc_pcat_counts);
+  */
+#endif
   vp9_copy(cpi->common.fc.pre_sb_ymode_prob, cpi->common.fc.sb_ymode_prob);
   vp9_copy(cpi->common.fc.pre_ymode_prob, cpi->common.fc.ymode_prob);
   vp9_copy(cpi->common.fc.pre_uv_mode_prob, cpi->common.fc.uv_mode_prob);
@@ -1914,6 +2713,9 @@
   vp9_zero(cpi->common.fc.mv_ref_ct)
 
   update_coef_probs(cpi, &header_bc);
+#if CONFIG_CODE_NONZEROCOUNT
+  update_nzc_probs(cpi, &header_bc);
+#endif
 
 #ifdef ENTROPY_STATS
   active_section = 2;
@@ -1925,8 +2727,9 @@
     int k;
 
     vp9_update_skip_probs(cpi);
-    for (k = 0; k < MBSKIP_CONTEXTS; ++k)
+    for (k = 0; k < MBSKIP_CONTEXTS; ++k) {
       vp9_write_literal(&header_bc, pc->mbskip_pred_probs[k], 8);
+    }
   }
 
   if (pc->frame_type == KEY_FRAME) {
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 560c371..4390061 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -151,6 +151,12 @@
   unsigned char *active_ptr;
 
   vp9_coeff_count token_costs[TX_SIZE_MAX_SB][BLOCK_TYPES];
+#if CONFIG_CODE_NONZEROCOUNT
+  unsigned int nzc_costs_4x4[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][17];
+  unsigned int nzc_costs_8x8[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][65];
+  unsigned int nzc_costs_16x16[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][257];
+  unsigned int nzc_costs_32x32[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][1025];
+#endif
 
   int optimize;
 
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index a4dbdc5..87d456d 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -630,10 +630,6 @@
   const int idx_map = mb_row * cm->mb_cols + mb_col;
   const int idx_str = xd->mode_info_stride * mb_row + mb_col;
 
-#ifdef ENC_DEBUG
-  enc_debug = (cpi->common.current_video_frame == 2 &&
-               mb_row == 4 && mb_col == 5);
-#endif
   // entropy context structures
   xd->above_context = cm->above_context + mb_col;
   xd->left_context  = cm->left_context + (mb_row & 3);
@@ -668,15 +664,8 @@
   // Set up distance of MB to edge of frame in 1/8th pel units
   block_size >>= 4;  // in macroblock units
   assert(!(mb_col & (block_size - 1)) && !(mb_row & (block_size - 1)));
-  xd->mb_to_top_edge    = -((mb_row * 16) << 3);
-  xd->mb_to_left_edge   = -((mb_col * 16) << 3);
-  xd->mb_to_bottom_edge = ((cm->mb_rows - block_size - mb_row) * 16) << 3;
-  xd->mb_to_right_edge  = ((cm->mb_cols - block_size - mb_col) * 16) << 3;
-
-  // Are edges available for intra prediction?
-  xd->up_available    = (mb_row != 0);
-  xd->left_available  = (mb_col > cm->cur_tile_mb_col_start);
-  xd->right_available = (mb_col + block_size < cm->cur_tile_mb_col_end);
+  set_mb_row(cm, xd, mb_row, block_size);
+  set_mb_col(cm, xd, mb_col, block_size);
 
   /* set up source buffers */
   setup_pred_block(&x->src, cpi->Source, mb_row, mb_col, NULL, NULL);
@@ -891,7 +880,7 @@
   }
 }
 
-static void update_stats(VP9_COMP *cpi) {
+static void update_stats(VP9_COMP *cpi, int mb_row, int mb_col) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -945,6 +934,9 @@
     if ((mbmi->mode == ZEROMV) && (mbmi->ref_frame == LAST_FRAME))
       cpi->inter_zz_count++;
   }
+#if CONFIG_CODE_NONZEROCOUNT
+  vp9_update_nzc_counts(&cpi->common, xd, mb_row, mb_col);
+#endif
 }
 
 static void encode_sb(VP9_COMP *cpi,
@@ -963,8 +955,9 @@
 
     encode_superblock32(cpi, tp,
                         output_enabled, mb_row, mb_col);
-    if (output_enabled)
-      update_stats(cpi);
+    if (output_enabled) {
+      update_stats(cpi, mb_row, mb_col);
+    }
 
     if (output_enabled) {
       (*tp)->Token = EOSB_TOKEN;
@@ -992,12 +985,13 @@
 
       encode_macroblock(cpi, tp,
                         output_enabled, mb_row + y_idx, mb_col + x_idx);
-      if (output_enabled)
-        update_stats(cpi);
+      if (output_enabled) {
+        update_stats(cpi, mb_row + y_idx, mb_col + x_idx);
+      }
 
       if (output_enabled) {
         (*tp)->Token = EOSB_TOKEN;
-        (*tp)++;
+        (*tp)++;
         if (mb_row + y_idx < cm->mb_rows)
           cpi->tplist[mb_row + y_idx].stop = *tp;
       }
@@ -1029,7 +1023,7 @@
     update_state(cpi, &x->sb64_context, 64, 1);
     encode_superblock64(cpi, tp,
                         1, mb_row, mb_col);
-    update_stats(cpi);
+    update_stats(cpi, mb_row, mb_col);
 
     (*tp)->Token = EOSB_TOKEN;
     (*tp)++;
@@ -1247,8 +1241,9 @@
   MACROBLOCKD *const xd = &x->e_mbd;
   int totalrate;
 
-//   fprintf(stderr, "encode_frame_internal frame %d (%d)\n",
-//          cpi->common.current_video_frame, cpi->common.show_frame);
+//   fprintf(stderr, "encode_frame_internal frame %d (%d) type %d\n",
+//            cpi->common.current_video_frame, cpi->common.show_frame,
+//            cm->frame_type);
 
   // Compute a modified set of reference frame probabilities to use when
   // prediction fails. These are based on the current general estimates for
@@ -1286,6 +1281,13 @@
   vp9_zero(cpi->coef_counts_8x8);
   vp9_zero(cpi->coef_counts_16x16);
   vp9_zero(cpi->coef_counts_32x32);
+#if CONFIG_CODE_NONZEROCOUNT
+  vp9_zero(cm->fc.nzc_counts_4x4);
+  vp9_zero(cm->fc.nzc_counts_8x8);
+  vp9_zero(cm->fc.nzc_counts_16x16);
+  vp9_zero(cm->fc.nzc_counts_32x32);
+  vp9_zero(cm->fc.nzc_pcat_counts);
+#endif
 #if CONFIG_NEW_MVREF
   vp9_zero(cpi->mb_mv_ref_count);
 #endif
@@ -1327,30 +1329,34 @@
 
     {
       // Take tiles into account and give start/end MB
-      int tile_col;
+      int tile_col, tile_row;
       TOKENEXTRA *tp = cpi->tok;
-      for (tile_col = 0; tile_col < cm->tile_columns; tile_col++) {
-        TOKENEXTRA *tp_old = tp;
-        // For each row of SBs in the frame
-        vp9_get_tile_col_offsets(cm, tile_col);
 
-        for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 4) {
-          encode_sb_row(cpi, mb_row, &tp, &totalrate);
+      for (tile_row = 0; tile_row < cm->tile_rows; tile_row++) {
+        vp9_get_tile_row_offsets(cm, tile_row);
+
+        for (tile_col = 0; tile_col < cm->tile_columns; tile_col++) {
+          TOKENEXTRA *tp_old = tp;
+
+          // For each row of SBs in the frame
+          vp9_get_tile_col_offsets(cm, tile_col);
+          for (mb_row = cm->cur_tile_mb_row_start;
+               mb_row < cm->cur_tile_mb_row_end; mb_row += 4) {
+            encode_sb_row(cpi, mb_row, &tp, &totalrate);
+          }
+          cpi->tok_count[tile_col] = (unsigned int)(tp - tp_old);
         }
-        cpi->tok_count[tile_col] = (unsigned int)(tp - tp_old);
       }
     }
 
     vpx_usec_timer_mark(&emr_timer);
     cpi->time_encode_mb_row += vpx_usec_timer_elapsed(&emr_timer);
-
   }
 
   // 256 rate units to the bit,
   // projected_frame_size in units of BYTES
   cpi->projected_frame_size = totalrate >> 8;
 
-
 #if 0
   // Keep record of the total distortion this time around for future use
   cpi->last_frame_distortion = cpi->frame_distortion;
@@ -1554,30 +1560,15 @@
       txfm_type = ONLY_4X4;
       cpi->mb.e_mbd.lossless = 1;
     } else
-    /* FIXME (rbultje)
-     * this is a hack (no really), basically to work around the complete
-     * nonsense coefficient cost prediction for keyframes. The probabilities
-     * are reset to defaults, and thus we basically have no idea how expensive
-     * a 4x4 vs. 8x8 will really be. The result is that any estimate at which
-     * of the two is better is utterly bogus.
-     * I'd like to eventually remove this hack, but in order to do that, we
-     * need to move the frame reset code from the frame encode init to the
-     * bitstream write code, or alternatively keep a backup of the previous
-     * keyframe's probabilities as an estimate of what the current keyframe's
-     * coefficient cost distributions may look like. */
-    if (frame_type == 0) {
-      txfm_type = ALLOW_32X32;
-    } else
 #if 0
-    /* FIXME (rbultje)
-     * this code is disabled for a similar reason as the code above; the
-     * problem is that each time we "revert" to 4x4 only (or even 8x8 only),
-     * the coefficient probabilities for 16x16 (and 8x8) start lagging behind,
-     * thus leading to them lagging further behind and not being chosen for
-     * subsequent frames either. This is essentially a local minimum problem
-     * that we can probably fix by estimating real costs more closely within
-     * a frame, perhaps by re-calculating costs on-the-fly as frame encoding
-     * progresses. */
+    /* FIXME (rbultje): this code is disabled until we support cost updates
+     * while a frame is being encoded; the problem is that each time we
+     * "revert" to 4x4 only (or even 8x8 only), the coefficient probabilities
+     * for 16x16 (and 8x8) start lagging behind, thus leading to them lagging
+     * further behind and not being chosen for subsequent frames either. This
+     * is essentially a local minimum problem that we can probably fix by
+     * estimating real costs more closely within a frame, perhaps by re-
+     * calculating costs on-the-fly as frame encoding progresses. */
     if (cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
             cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] &&
         cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
@@ -1930,6 +1921,135 @@
   }
 }
 
+#if CONFIG_CODE_NONZEROCOUNT
+static void gather_nzcs_mb16(VP9_COMMON *const cm,
+                             MACROBLOCKD *xd) {
+  int i;
+  vpx_memset(xd->mode_info_context->mbmi.nzcs, 0,
+             384 * sizeof(xd->mode_info_context->mbmi.nzcs[0]));
+  switch (xd->mode_info_context->mbmi.txfm_size) {
+    case TX_4X4:
+      for (i = 0; i < 24; ++i) {
+        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
+      }
+      break;
+
+    case TX_8X8:
+      for (i = 0; i < 16; i += 4) {
+        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
+      }
+      if (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
+          xd->mode_info_context->mbmi.mode == SPLITMV) {
+        for (i = 16; i < 24; ++i) {
+          xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
+        }
+      } else {
+        for (i = 16; i < 24; i += 4) {
+          xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
+        }
+      }
+      break;
+
+    case TX_16X16:
+      xd->mode_info_context->mbmi.nzcs[0] = xd->nzcs[0];
+      for (i = 16; i < 24; i += 4) {
+        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
+      }
+      break;
+
+    default:
+      break;
+  }
+}
+
+static void gather_nzcs_sb32(VP9_COMMON *const cm,
+                             MACROBLOCKD *xd) {
+  int i, j;
+  MODE_INFO *m = xd->mode_info_context;
+  int mis = cm->mode_info_stride;
+  vpx_memset(m->mbmi.nzcs, 0,
+             384 * sizeof(xd->mode_info_context->mbmi.nzcs[0]));
+  switch (xd->mode_info_context->mbmi.txfm_size) {
+    case TX_4X4:
+      for (i = 0; i < 96; ++i) {
+        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
+      }
+      break;
+
+    case TX_8X8:
+      for (i = 0; i < 96; i += 4) {
+        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
+      }
+      break;
+
+    case TX_16X16:
+      for (i = 0; i < 96; i += 16) {
+        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
+      }
+      break;
+
+    case TX_32X32:
+      xd->mode_info_context->mbmi.nzcs[0] = xd->nzcs[0];
+      for (i = 64; i < 96; i += 16) {
+        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
+      }
+      break;
+
+    default:
+      break;
+  }
+  for (i = 0; i < 2; ++i)
+    for (j = 0; j < 2; ++j) {
+      if (i == 0 && j == 0) continue;
+      vpx_memcpy((m + j + mis * i)->mbmi.nzcs, m->mbmi.nzcs,
+                 384 * sizeof(m->mbmi.nzcs[0]));
+    }
+}
+
+static void gather_nzcs_sb64(VP9_COMMON *const cm,
+                             MACROBLOCKD *xd) {
+  int i, j;
+  MODE_INFO *m = xd->mode_info_context;
+  int mis = cm->mode_info_stride;
+  vpx_memset(xd->mode_info_context->mbmi.nzcs, 0,
+             384 * sizeof(xd->mode_info_context->mbmi.nzcs[0]));
+  switch (xd->mode_info_context->mbmi.txfm_size) {
+    case TX_4X4:
+      for (i = 0; i < 384; ++i) {
+        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
+      }
+      break;
+
+    case TX_8X8:
+      for (i = 0; i < 384; i += 4) {
+        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
+      }
+      break;
+
+    case TX_16X16:
+      for (i = 0; i < 384; i += 16) {
+        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
+      }
+      break;
+
+    case TX_32X32:
+      for (i = 0; i < 384; i += 64) {
+        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
+      }
+      break;
+
+    default:
+      break;
+  }
+  for (i = 0; i < 4; ++i)
+    for (j = 0; j < 4; ++j) {
+      if (i == 0 && j == 0) continue;
+      vpx_memcpy((m + j + mis * i)->mbmi.nzcs, m->mbmi.nzcs,
+                 384 * sizeof(m->mbmi.nzcs[0]));
+    }
+}
+#endif
+
 static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t,
                               int output_enabled,
                               int mb_row, int mb_col) {
@@ -1944,8 +2064,8 @@
   assert(!xd->mode_info_context->mbmi.sb_type);
 
 #ifdef ENC_DEBUG
-  enc_debug = (cpi->common.current_video_frame == 2 &&
-               mb_row == 5 && mb_col == 18);
+  enc_debug = (cpi->common.current_video_frame == 1 &&
+               mb_row == 0 && mb_col == 0 && output_enabled);
   if (enc_debug)
     printf("Encode MB %d %d output %d\n", mb_row, mb_col, output_enabled);
 #endif
@@ -1997,14 +2117,14 @@
     }
 #endif
     if (mbmi->mode == B_PRED) {
-      vp9_encode_intra16x16mbuv(x);
+      vp9_encode_intra16x16mbuv(cm, x);
       vp9_encode_intra4x4mby(x);
     } else if (mbmi->mode == I8X8_PRED) {
       vp9_encode_intra8x8mby(x);
       vp9_encode_intra8x8mbuv(x);
     } else {
-      vp9_encode_intra16x16mbuv(x);
-      vp9_encode_intra16x16mby(x);
+      vp9_encode_intra16x16mbuv(cm, x);
+      vp9_encode_intra16x16mby(cm, x);
     }
 
     if (output_enabled)
@@ -2051,7 +2171,7 @@
     }
 
     if (!x->skip) {
-      vp9_encode_inter16x16(x, mb_row, mb_col);
+      vp9_encode_inter16x16(cm, x, mb_row, mb_col);
 
       // Clear mb_skip_coeff if mb_no_coeff_skip is not set
       if (!cpi->common.mb_no_coeff_skip)
@@ -2079,12 +2199,12 @@
   }
 
   if (!x->skip) {
-#if 0  // def ENC_DEBUG
+#ifdef ENC_DEBUG
     if (enc_debug) {
       int i, j;
       printf("\n");
       printf("qcoeff\n");
-      for (i = 0; i < 400; i++) {
+      for (i = 0; i < 384; i++) {
         printf("%3d ", xd->qcoeff[i]);
         if (i % 16 == 15) printf("\n");
       }
@@ -2131,6 +2251,9 @@
     }
 #endif
 
+#if CONFIG_CODE_NONZEROCOUNT
+    gather_nzcs_mb16(cm, xd);
+#endif
     vp9_tokenize_mb(cpi, xd, t, !output_enabled);
 
   } else {
@@ -2197,6 +2320,12 @@
   unsigned int segment_id = mi->mbmi.segment_id;
   const int mis = cm->mode_info_stride;
 
+#ifdef ENC_DEBUG
+  enc_debug = (cpi->common.current_video_frame == 1 &&
+               mb_row == 0 && mb_col == 0 && output_enabled);
+  if (enc_debug)
+    printf("Encode SB32 %d %d output %d\n", mb_row, mb_col, output_enabled);
+#endif
   if (cm->frame_type == KEY_FRAME) {
     if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
       adjust_act_zbin(cpi, x);
@@ -2294,8 +2423,8 @@
         vp9_quantize_sby_32x32(x);
         vp9_quantize_sbuv_16x16(x);
         if (x->optimize) {
-          vp9_optimize_sby_32x32(x);
-          vp9_optimize_sbuv_16x16(x);
+          vp9_optimize_sby_32x32(cm, x);
+          vp9_optimize_sbuv_16x16(cm, x);
         }
         vp9_inverse_transform_sby_32x32(xd);
         vp9_inverse_transform_sbuv_16x16(xd);
@@ -2306,8 +2435,8 @@
         vp9_quantize_sby_16x16(x);
         vp9_quantize_sbuv_16x16(x);
         if (x->optimize) {
-          vp9_optimize_sby_16x16(x);
-          vp9_optimize_sbuv_16x16(x);
+          vp9_optimize_sby_16x16(cm, x);
+          vp9_optimize_sbuv_16x16(cm, x);
         }
         vp9_inverse_transform_sby_16x16(xd);
         vp9_inverse_transform_sbuv_16x16(xd);
@@ -2318,8 +2447,8 @@
         vp9_quantize_sby_8x8(x);
         vp9_quantize_sbuv_8x8(x);
         if (x->optimize) {
-          vp9_optimize_sby_8x8(x);
-          vp9_optimize_sbuv_8x8(x);
+          vp9_optimize_sby_8x8(cm, x);
+          vp9_optimize_sbuv_8x8(cm, x);
         }
         vp9_inverse_transform_sby_8x8(xd);
         vp9_inverse_transform_sbuv_8x8(xd);
@@ -2330,8 +2459,8 @@
         vp9_quantize_sby_4x4(x);
         vp9_quantize_sbuv_4x4(x);
         if (x->optimize) {
-          vp9_optimize_sby_4x4(x);
-          vp9_optimize_sbuv_4x4(x);
+          vp9_optimize_sby_4x4(cm, x);
+          vp9_optimize_sbuv_4x4(cm, x);
         }
         vp9_inverse_transform_sby_4x4(xd);
         vp9_inverse_transform_sbuv_4x4(xd);
@@ -2340,6 +2469,9 @@
     }
     vp9_recon_sby_s_c(xd, dst);
     vp9_recon_sbuv_s_c(xd, udst, vdst);
+#if CONFIG_CODE_NONZEROCOUNT
+    gather_nzcs_sb32(cm, xd);
+#endif
 
     vp9_tokenize_sb(cpi, xd, t, !output_enabled);
   } else {
@@ -2407,6 +2539,12 @@
   unsigned int segment_id = mi->mbmi.segment_id;
   const int mis = cm->mode_info_stride;
 
+#ifdef ENC_DEBUG
+  enc_debug = (cpi->common.current_video_frame == 1 &&
+               mb_row == 0 && mb_col == 0 && output_enabled);
+  if (enc_debug)
+    printf("Encode SB64 %d %d output %d\n", mb_row, mb_col, output_enabled);
+#endif
   if (cm->frame_type == KEY_FRAME) {
     if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
       adjust_act_zbin(cpi, x);
@@ -2502,8 +2640,8 @@
         vp9_quantize_sb64y_32x32(x);
         vp9_quantize_sb64uv_32x32(x);
         if (x->optimize) {
-          vp9_optimize_sb64y_32x32(x);
-          vp9_optimize_sb64uv_32x32(x);
+          vp9_optimize_sb64y_32x32(cm, x);
+          vp9_optimize_sb64uv_32x32(cm, x);
         }
         vp9_inverse_transform_sb64y_32x32(xd);
         vp9_inverse_transform_sb64uv_32x32(xd);
@@ -2514,8 +2652,8 @@
         vp9_quantize_sb64y_16x16(x);
         vp9_quantize_sb64uv_16x16(x);
         if (x->optimize) {
-          vp9_optimize_sb64y_16x16(x);
-          vp9_optimize_sb64uv_16x16(x);
+          vp9_optimize_sb64y_16x16(cm, x);
+          vp9_optimize_sb64uv_16x16(cm, x);
         }
         vp9_inverse_transform_sb64y_16x16(xd);
         vp9_inverse_transform_sb64uv_16x16(xd);
@@ -2526,8 +2664,8 @@
         vp9_quantize_sb64y_8x8(x);
         vp9_quantize_sb64uv_8x8(x);
         if (x->optimize) {
-          vp9_optimize_sb64y_8x8(x);
-          vp9_optimize_sb64uv_8x8(x);
+          vp9_optimize_sb64y_8x8(cm, x);
+          vp9_optimize_sb64uv_8x8(cm, x);
         }
         vp9_inverse_transform_sb64y_8x8(xd);
         vp9_inverse_transform_sb64uv_8x8(xd);
@@ -2538,8 +2676,8 @@
         vp9_quantize_sb64y_4x4(x);
         vp9_quantize_sb64uv_4x4(x);
         if (x->optimize) {
-          vp9_optimize_sb64y_4x4(x);
-          vp9_optimize_sb64uv_4x4(x);
+          vp9_optimize_sb64y_4x4(cm, x);
+          vp9_optimize_sb64uv_4x4(cm, x);
         }
         vp9_inverse_transform_sb64y_4x4(xd);
         vp9_inverse_transform_sb64uv_4x4(xd);
@@ -2548,7 +2686,9 @@
     }
     vp9_recon_sb64y_s_c(xd, dst);
     vp9_recon_sb64uv_s_c(&x->e_mbd, udst, vdst);
-
+#if CONFIG_CODE_NONZEROCOUNT
+    gather_nzcs_sb64(cm, &x->e_mbd);
+#endif
     vp9_tokenize_sb64(cpi, &x->e_mbd, t, !output_enabled);
   } else {
     // FIXME(rbultje): not tile-aware (mi - 1)
diff --git a/vp9/encoder/vp9_encodeintra.c b/vp9/encoder/vp9_encodeintra.c
index 75c8ea8..3c98d4a 100644
--- a/vp9/encoder/vp9_encodeintra.c
+++ b/vp9/encoder/vp9_encodeintra.c
@@ -25,7 +25,7 @@
     mbmi->uv_mode = DC_PRED;
     mbmi->ref_frame = INTRA_FRAME;
 
-    vp9_encode_intra16x16mby(x);
+    vp9_encode_intra16x16mby(&cpi->common, x);
   } else {
     int i;
 
@@ -50,7 +50,7 @@
   vp9_intra4x4_predict(&x->e_mbd, b, b->bmi.as_mode.first, b->predictor);
   vp9_subtract_b(be, b, 16);
 
-  tx_type = get_tx_type_4x4(&x->e_mbd, b);
+  tx_type = get_tx_type_4x4(&x->e_mbd, ib);
   if (tx_type != DCT_DCT) {
     vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type);
     vp9_ht_quantize_b_4x4(x, ib, tx_type);
@@ -72,7 +72,7 @@
     vp9_encode_intra4x4block(mb, i);
 }
 
-void vp9_encode_intra16x16mby(MACROBLOCK *x) {
+void vp9_encode_intra16x16mby(VP9_COMMON *const cm, MACROBLOCK *x) {
   MACROBLOCKD *xd = &x->e_mbd;
   BLOCK *b = &x->block[0];
   TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
@@ -86,21 +86,21 @@
       vp9_transform_mby_16x16(x);
       vp9_quantize_mby_16x16(x);
       if (x->optimize)
-        vp9_optimize_mby_16x16(x);
+        vp9_optimize_mby_16x16(cm, x);
       vp9_inverse_transform_mby_16x16(xd);
       break;
     case TX_8X8:
       vp9_transform_mby_8x8(x);
       vp9_quantize_mby_8x8(x);
       if (x->optimize)
-        vp9_optimize_mby_8x8(x);
+        vp9_optimize_mby_8x8(cm, x);
       vp9_inverse_transform_mby_8x8(xd);
       break;
     default:
       vp9_transform_mby_4x4(x);
       vp9_quantize_mby_4x4(x);
       if (x->optimize)
-        vp9_optimize_mby_4x4(x);
+        vp9_optimize_mby_4x4(cm, x);
       vp9_inverse_transform_mby_4x4(xd);
       break;
   }
@@ -108,7 +108,7 @@
   vp9_recon_mby(xd);
 }
 
-void vp9_encode_intra16x16mbuv(MACROBLOCK *x) {
+void vp9_encode_intra16x16mbuv(VP9_COMMON *const cm, MACROBLOCK *x) {
   MACROBLOCKD *xd = &x->e_mbd;
   TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
 
@@ -122,14 +122,14 @@
       vp9_transform_mbuv_4x4(x);
       vp9_quantize_mbuv_4x4(x);
       if (x->optimize)
-        vp9_optimize_mbuv_4x4(x);
+        vp9_optimize_mbuv_4x4(cm, x);
       vp9_inverse_transform_mbuv_4x4(xd);
       break;
     default:  // 16x16 or 8x8
       vp9_transform_mbuv_8x8(x);
       vp9_quantize_mbuv_8x8(x);
       if (x->optimize)
-        vp9_optimize_mbuv_8x8(x);
+        vp9_optimize_mbuv_8x8(cm, x);
       vp9_inverse_transform_mbuv_8x8(xd);
       break;
     }
@@ -152,7 +152,7 @@
   if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
     int idx = (ib & 0x02) ? (ib + 2) : ib;
 
-    tx_type = get_tx_type_8x8(xd, &xd->block[ib]);
+    tx_type = get_tx_type_8x8(xd, ib);
     if (tx_type != DCT_DCT) {
       vp9_short_fht8x8(be->src_diff, (x->block + idx)->coeff, 16, tx_type);
       x->quantize_b_8x8(x, idx);
@@ -167,12 +167,13 @@
     for (i = 0; i < 4; i++) {
       b = &xd->block[ib + iblock[i]];
       be = &x->block[ib + iblock[i]];
-      tx_type = get_tx_type_4x4(xd, b);
+      tx_type = get_tx_type_4x4(xd, ib + iblock[i]);
       if (tx_type != DCT_DCT) {
         vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type);
         vp9_ht_quantize_b_4x4(x, ib + iblock[i], tx_type);
         vp9_short_iht4x4(b->dqcoeff, b->diff, 16, tx_type);
-      } else if (!(i & 1) && get_tx_type_4x4(xd, b + 1) == DCT_DCT) {
+      } else if (!(i & 1) &&
+                 get_tx_type_4x4(xd, ib + iblock[i] + 1) == DCT_DCT) {
         x->fwd_txm8x4(be->src_diff, be->coeff, 32);
         x->quantize_b_4x4_pair(x, ib + iblock[i], ib + iblock[i] + 1);
         vp9_inverse_transform_b_4x4(xd, xd->eobs[ib + iblock[i]],
diff --git a/vp9/encoder/vp9_encodeintra.h b/vp9/encoder/vp9_encodeintra.h
index b017673..0b19b56 100644
--- a/vp9/encoder/vp9_encodeintra.h
+++ b/vp9/encoder/vp9_encodeintra.h
@@ -14,8 +14,8 @@
 #include "vp9/encoder/vp9_onyx_int.h"
 
 int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred);
-void vp9_encode_intra16x16mby(MACROBLOCK *x);
-void vp9_encode_intra16x16mbuv(MACROBLOCK *x);
+void vp9_encode_intra16x16mby(VP9_COMMON *const cm, MACROBLOCK *x);
+void vp9_encode_intra16x16mbuv(VP9_COMMON *const cm, MACROBLOCK *x);
 void vp9_encode_intra4x4mby(MACROBLOCK *mb);
 void vp9_encode_intra4x4block(MACROBLOCK *x, int ib);
 void vp9_encode_intra8x8mby(MACROBLOCK *x);
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index b2ee800..dae177a 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -210,10 +210,10 @@
 
   for (i = 0; i < 16; i++) {
     BLOCK *b = &x->block[i];
-    TX_TYPE tx_type = get_tx_type_4x4(xd, &xd->block[i]);
+    TX_TYPE tx_type = get_tx_type_4x4(xd, i);
     if (tx_type != DCT_DCT) {
       vp9_short_fht4x4(b->src_diff, b->coeff, 16, tx_type);
-    } else if (!(i & 1) && get_tx_type_4x4(xd, &xd->block[i + 1]) == DCT_DCT) {
+    } else if (!(i & 1) && get_tx_type_4x4(xd, i + 1) == DCT_DCT) {
       x->fwd_txm8x4(x->block[i].src_diff, x->block[i].coeff, 32);
       i++;
     } else {
@@ -241,7 +241,7 @@
 
   for (i = 0; i < 9; i += 8) {
     BLOCK *b = &x->block[i];
-    tx_type = get_tx_type_8x8(xd, &xd->block[i]);
+    tx_type = get_tx_type_8x8(xd, i);
     if (tx_type != DCT_DCT) {
       vp9_short_fht8x8(b->src_diff, b->coeff, 16, tx_type);
     } else {
@@ -250,7 +250,7 @@
   }
   for (i = 2; i < 11; i += 8) {
     BLOCK *b = &x->block[i];
-    tx_type = get_tx_type_8x8(xd, &xd->block[i]);
+    tx_type = get_tx_type_8x8(xd, i);
     if (tx_type != DCT_DCT) {
       vp9_short_fht8x8(b->src_diff, (b + 2)->coeff, 16, tx_type);
     } else {
@@ -274,7 +274,7 @@
 void vp9_transform_mby_16x16(MACROBLOCK *x) {
   MACROBLOCKD *xd = &x->e_mbd;
   BLOCK *b = &x->block[0];
-  TX_TYPE tx_type = get_tx_type_16x16(xd, &xd->block[0]);
+  TX_TYPE tx_type = get_tx_type_16x16(xd, 0);
   vp9_clear_system_state();
   if (tx_type != DCT_DCT) {
     vp9_short_fht16x16(b->src_diff, b->coeff, 16, tx_type);
@@ -293,35 +293,56 @@
 }
 
 void vp9_transform_sby_16x16(MACROBLOCK *x) {
+  MACROBLOCKD *const xd = &x->e_mbd;
   int n;
 
   for (n = 0; n < 4; n++) {
     const int x_idx = n & 1, y_idx = n >> 1;
+    const TX_TYPE tx_type = get_tx_type_16x16(xd, (y_idx * 8 + x_idx) * 4);
 
-    x->fwd_txm16x16(x->src_diff + y_idx * 32 * 16 + x_idx * 16,
-                    x->coeff + n * 256, 64);
+    if (tx_type != DCT_DCT) {
+      vp9_short_fht16x16(x->src_diff + y_idx * 32 * 16 + x_idx * 16,
+                         x->coeff + n * 256, 32, tx_type);
+    } else {
+      x->fwd_txm16x16(x->src_diff + y_idx * 32 * 16 + x_idx * 16,
+                      x->coeff + n * 256, 64);
+    }
   }
 }
 
 void vp9_transform_sby_8x8(MACROBLOCK *x) {
+  MACROBLOCKD *const xd = &x->e_mbd;
   int n;
 
   for (n = 0; n < 16; n++) {
     const int x_idx = n & 3, y_idx = n >> 2;
+    const TX_TYPE tx_type = get_tx_type_8x8(xd, (y_idx * 8 + x_idx) * 2);
 
-    x->fwd_txm8x8(x->src_diff + y_idx * 32 * 8 + x_idx * 8,
-                  x->coeff + n * 64, 64);
+    if (tx_type != DCT_DCT) {
+      vp9_short_fht8x8(x->src_diff + y_idx * 32 * 8 + x_idx * 8,
+                       x->coeff + n * 64, 32, tx_type);
+    } else {
+      x->fwd_txm8x8(x->src_diff + y_idx * 32 * 8 + x_idx * 8,
+                    x->coeff + n * 64, 64);
+    }
   }
 }
 
 void vp9_transform_sby_4x4(MACROBLOCK *x) {
+  MACROBLOCKD *const xd = &x->e_mbd;
   int n;
 
   for (n = 0; n < 64; n++) {
     const int x_idx = n & 7, y_idx = n >> 3;
+    const TX_TYPE tx_type = get_tx_type_4x4(xd, y_idx * 8 + x_idx);
 
-    x->fwd_txm4x4(x->src_diff + y_idx * 32 * 4 + x_idx * 4,
-                  x->coeff + n * 16, 64);
+    if (tx_type != DCT_DCT) {
+      vp9_short_fht4x4(x->src_diff + y_idx * 32 * 4 + x_idx * 4,
+                       x->coeff + n * 16, 32, tx_type);
+    } else {
+      x->fwd_txm4x4(x->src_diff + y_idx * 32 * 4 + x_idx * 4,
+                    x->coeff + n * 16, 64);
+    }
   }
 }
 
@@ -371,35 +392,56 @@
 }
 
 void vp9_transform_sb64y_16x16(MACROBLOCK *x) {
+  MACROBLOCKD *const xd = &x->e_mbd;
   int n;
 
   for (n = 0; n < 16; n++) {
     const int x_idx = n & 3, y_idx = n >> 2;
+    const TX_TYPE tx_type = get_tx_type_16x16(xd, (y_idx * 16 + x_idx) * 4);
 
-    x->fwd_txm16x16(x->src_diff + y_idx * 64 * 16 + x_idx * 16,
-                    x->coeff + n * 256, 128);
+    if (tx_type != DCT_DCT) {
+      vp9_short_fht16x16(x->src_diff + y_idx * 64 * 16 + x_idx * 16,
+                         x->coeff + n * 256, 64, tx_type);
+    } else {
+      x->fwd_txm16x16(x->src_diff + y_idx * 64 * 16 + x_idx * 16,
+                      x->coeff + n * 256, 128);
+    }
   }
 }
 
 void vp9_transform_sb64y_8x8(MACROBLOCK *x) {
+  MACROBLOCKD *const xd = &x->e_mbd;
   int n;
 
   for (n = 0; n < 64; n++) {
     const int x_idx = n & 7, y_idx = n >> 3;
+    const TX_TYPE tx_type = get_tx_type_8x8(xd, (y_idx * 16 + x_idx) * 2);
 
-    x->fwd_txm8x8(x->src_diff + y_idx * 64 * 8 + x_idx * 8,
-                  x->coeff + n * 64, 128);
+    if (tx_type != DCT_DCT) {
+      vp9_short_fht8x8(x->src_diff + y_idx * 64 * 8 + x_idx * 8,
+                       x->coeff + n * 64, 64, tx_type);
+    } else {
+      x->fwd_txm8x8(x->src_diff + y_idx * 64 * 8 + x_idx * 8,
+                    x->coeff + n * 64, 128);
+    }
   }
 }
 
 void vp9_transform_sb64y_4x4(MACROBLOCK *x) {
+  MACROBLOCKD *const xd = &x->e_mbd;
   int n;
 
   for (n = 0; n < 256; n++) {
     const int x_idx = n & 15, y_idx = n >> 4;
+    const TX_TYPE tx_type = get_tx_type_4x4(xd, y_idx * 16 + x_idx);
 
-    x->fwd_txm4x4(x->src_diff + y_idx * 64 * 4 + x_idx * 4,
-                  x->coeff + n * 16, 128);
+    if (tx_type != DCT_DCT) {
+      vp9_short_fht4x4(x->src_diff + y_idx * 64 * 4 + x_idx * 4,
+                       x->coeff + n * 16, 64, tx_type);
+    } else {
+      x->fwd_txm4x4(x->src_diff + y_idx * 64 * 4 + x_idx * 4,
+                    x->coeff + n * 16, 128);
+    }
   }
 }
 
@@ -491,7 +533,8 @@
   return vp9_get_coef_context(&recent_energy, token);
 }
 
-static void optimize_b(MACROBLOCK *mb, int ib, PLANE_TYPE type,
+static void optimize_b(VP9_COMMON *const cm,
+                       MACROBLOCK *mb, int ib, PLANE_TYPE type,
                        const int16_t *dequant_ptr,
                        ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
                        int tx_size) {
@@ -512,26 +555,61 @@
   int default_eob;
   int const *scan;
   const int mul = 1 + (tx_size == TX_32X32);
+#if CONFIG_CODE_NONZEROCOUNT
+  // TODO(debargha): the dynamic programming approach used in this function
+  // is not compatible with the true rate cost when nzcs are used. Note
+  // the total rate is the sum of the nzc rate and the individual token
+  // rates. The latter part can be optimized in this function, but because
+  // the nzc rate is a function of all the other tokens without a Markov
+  // relationship this rate cannot be considered correctly.
+  // The current implementation uses a suboptimal approach to account for
+  // the nzc rates somewhat, but in reality the optimization approach needs
+  // to change substantially.
+  uint16_t nzc = xd->nzcs[ib];
+  uint16_t nzc0, nzc1;
+  uint16_t final_nzc = 0, final_nzc_exp;
+  int nzc_context = vp9_get_nzc_context(cm, xd, ib);
+  unsigned int *nzc_cost;
+  nzc0 = nzc1 = nzc;
+#endif
 
   switch (tx_size) {
     default:
-    case TX_4X4:
+    case TX_4X4: {
+      const TX_TYPE tx_type = get_tx_type_4x4(xd, ib);
       default_eob = 16;
-      // FIXME(rbultje): although optimize_b currently isn't called for
-      // intra4x4, this should be changed to be adst-compatible
-      scan = vp9_default_zig_zag1d_4x4;
+#if CONFIG_CODE_NONZEROCOUNT
+      nzc_cost = mb->nzc_costs_4x4[nzc_context][ref][type];
+#endif
+      if (tx_type == DCT_ADST) {
+        scan = vp9_col_scan_4x4;
+      } else if (tx_type == ADST_DCT) {
+        scan = vp9_row_scan_4x4;
+      } else {
+        scan = vp9_default_zig_zag1d_4x4;
+      }
       break;
+    }
     case TX_8X8:
       scan = vp9_default_zig_zag1d_8x8;
       default_eob = 64;
+#if CONFIG_CODE_NONZEROCOUNT
+      nzc_cost = mb->nzc_costs_8x8[nzc_context][ref][type];
+#endif
       break;
     case TX_16X16:
       scan = vp9_default_zig_zag1d_16x16;
       default_eob = 256;
+#if CONFIG_CODE_NONZEROCOUNT
+      nzc_cost = mb->nzc_costs_16x16[nzc_context][ref][type];
+#endif
       break;
     case TX_32X32:
       scan = vp9_default_zig_zag1d_32x32;
       default_eob = 1024;
+#if CONFIG_CODE_NONZEROCOUNT
+      nzc_cost = mb->nzc_costs_32x32[nzc_context][ref][type];
+#endif
       break;
   }
 
@@ -542,7 +620,11 @@
   rddiv = mb->rddiv;
   memset(best_index, 0, sizeof(best_index));
   /* Initialize the sentinel node of the trellis. */
+#if CONFIG_CODE_NONZEROCOUNT
+  tokens[eob][0].rate = nzc_cost[nzc];
+#else
   tokens[eob][0].rate = 0;
+#endif
   tokens[eob][0].error = 0;
   tokens[eob][0].next = default_eob;
   tokens[eob][0].token = DCT_EOB_TOKEN;
@@ -551,6 +633,9 @@
   next = eob;
   for (i = eob; i-- > i0;) {
     int base_bits, d2, dx;
+#if CONFIG_CODE_NONZEROCOUNT
+    int new_nzc0, new_nzc1;
+#endif
 
     rc = scan[i];
     x = qcoeff_ptr[rc];
@@ -584,6 +669,10 @@
       tokens[i][0].token = t0;
       tokens[i][0].qc = x;
       best_index[i][0] = best;
+#if CONFIG_CODE_NONZEROCOUNT
+      new_nzc0 = (best ? nzc1 : nzc0);
+#endif
+
       /* Evaluate the second possibility for this state. */
       rate0 = tokens[next][0].rate;
       rate1 = tokens[next][1].rate;
@@ -609,6 +698,12 @@
              DCT_EOB_TOKEN : ZERO_TOKEN;
         t1 = tokens[next][1].token == DCT_EOB_TOKEN ?
              DCT_EOB_TOKEN : ZERO_TOKEN;
+#if CONFIG_CODE_NONZEROCOUNT
+        // Account for rate drop because of the nzc change.
+        // TODO(debargha): Find a better solution
+        rate0 -= nzc_cost[nzc0] - nzc_cost[nzc0 - 1];
+        rate1 -= nzc_cost[nzc1] - nzc_cost[nzc1 - 1];
+#endif
       } else {
         t0 = t1 = (vp9_dct_value_tokens_ptr + x)->Token;
       }
@@ -641,6 +736,11 @@
       tokens[i][1].token = best ? t1 : t0;
       tokens[i][1].qc = x;
       best_index[i][1] = best;
+#if CONFIG_CODE_NONZEROCOUNT
+      new_nzc1 = (best ? nzc1 : nzc0) - (!x);
+      nzc0 = new_nzc0;
+      nzc1 = new_nzc1;
+#endif
       /* Finally, make this the new head of the trellis. */
       next = i;
     }
@@ -679,11 +779,18 @@
   rate1 += mb->token_costs[tx_size][type][ref][band][pt][t1];
   UPDATE_RD_COST();
   best = rd_cost1 < rd_cost0;
+#if CONFIG_CODE_NONZEROCOUNT
+  final_nzc_exp = (best ? nzc1 : nzc0);
+#endif
   final_eob = i0 - 1;
   for (i = next; i < eob; i = next) {
     x = tokens[i][best].qc;
-    if (x)
+    if (x) {
       final_eob = i;
+#if CONFIG_CODE_NONZEROCOUNT
+      ++final_nzc;
+#endif
+    }
     rc = scan[i];
     qcoeff_ptr[rc] = x;
     dqcoeff_ptr[rc] = (x * dequant_ptr[rc != 0]) / mul;
@@ -695,9 +802,13 @@
 
   xd->eobs[ib] = final_eob;
   *a = *l = (final_eob > 0);
+#if CONFIG_CODE_NONZEROCOUNT
+  assert(final_nzc == final_nzc_exp);
+  xd->nzcs[ib] = final_nzc;
+#endif
 }
 
-void vp9_optimize_mby_4x4(MACROBLOCK *x) {
+void vp9_optimize_mby_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {
   int b;
   ENTROPY_CONTEXT_PLANES t_above, t_left;
   ENTROPY_CONTEXT *ta;
@@ -713,13 +824,13 @@
   tl = (ENTROPY_CONTEXT *)&t_left;
 
   for (b = 0; b < 16; b++) {
-    optimize_b(x, b, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[b].dequant,
+    optimize_b(cm, x, b, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[b].dequant,
                ta + vp9_block2above[TX_4X4][b],
                tl + vp9_block2left[TX_4X4][b], TX_4X4);
   }
 }
 
-void vp9_optimize_mbuv_4x4(MACROBLOCK *x) {
+void vp9_optimize_mbuv_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {
   int b;
   ENTROPY_CONTEXT_PLANES t_above, t_left;
   ENTROPY_CONTEXT *ta;
@@ -735,18 +846,18 @@
   tl = (ENTROPY_CONTEXT *)&t_left;
 
   for (b = 16; b < 24; b++) {
-    optimize_b(x, b, PLANE_TYPE_UV, x->e_mbd.block[b].dequant,
+    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[b].dequant,
                ta + vp9_block2above[TX_4X4][b],
                tl + vp9_block2left[TX_4X4][b], TX_4X4);
   }
 }
 
-static void optimize_mb_4x4(MACROBLOCK *x) {
-  vp9_optimize_mby_4x4(x);
-  vp9_optimize_mbuv_4x4(x);
+static void optimize_mb_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {
+  vp9_optimize_mby_4x4(cm, x);
+  vp9_optimize_mbuv_4x4(cm, x);
 }
 
-void vp9_optimize_mby_8x8(MACROBLOCK *x) {
+void vp9_optimize_mby_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {
   int b;
   ENTROPY_CONTEXT_PLANES t_above, t_left;
   ENTROPY_CONTEXT *ta;
@@ -765,14 +876,14 @@
     ENTROPY_CONTEXT *const l = tl + vp9_block2left[TX_8X8][b];
     ENTROPY_CONTEXT above_ec = (a[0] + a[1]) != 0;
     ENTROPY_CONTEXT left_ec = (l[0] + l[1]) != 0;
-    optimize_b(x, b, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[b].dequant,
+    optimize_b(cm, x, b, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[b].dequant,
                &above_ec, &left_ec, TX_8X8);
     a[1] = a[0] = above_ec;
     l[1] = l[0] = left_ec;
   }
 }
 
-void vp9_optimize_mbuv_8x8(MACROBLOCK *x) {
+void vp9_optimize_mbuv_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {
   int b;
   ENTROPY_CONTEXT *const ta = (ENTROPY_CONTEXT *)x->e_mbd.above_context;
   ENTROPY_CONTEXT *const tl = (ENTROPY_CONTEXT *)x->e_mbd.left_context;
@@ -785,17 +896,17 @@
     ENTROPY_CONTEXT *const l = tl + vp9_block2left[TX_8X8][b];
     ENTROPY_CONTEXT above_ec = (a[0] + a[1]) != 0;
     ENTROPY_CONTEXT left_ec = (l[0] + l[1]) != 0;
-    optimize_b(x, b, PLANE_TYPE_UV, x->e_mbd.block[b].dequant,
+    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[b].dequant,
                &above_ec, &left_ec, TX_8X8);
   }
 }
 
-static void optimize_mb_8x8(MACROBLOCK *x) {
-  vp9_optimize_mby_8x8(x);
-  vp9_optimize_mbuv_8x8(x);
+static void optimize_mb_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {
+  vp9_optimize_mby_8x8(cm, x);
+  vp9_optimize_mbuv_8x8(cm, x);
 }
 
-void vp9_optimize_mby_16x16(MACROBLOCK *x) {
+void vp9_optimize_mby_16x16(VP9_COMMON *const cm, MACROBLOCK *x) {
   ENTROPY_CONTEXT_PLANES *const t_above = x->e_mbd.above_context;
   ENTROPY_CONTEXT_PLANES *const t_left = x->e_mbd.left_context;
   ENTROPY_CONTEXT ta, tl;
@@ -805,16 +916,16 @@
 
   ta = (t_above->y1[0] + t_above->y1[1] + t_above->y1[2] + t_above->y1[3]) != 0;
   tl = (t_left->y1[0] + t_left->y1[1] + t_left->y1[2] + t_left->y1[3]) != 0;
-  optimize_b(x, 0, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
+  optimize_b(cm, x, 0, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
              &ta, &tl, TX_16X16);
 }
 
-static void optimize_mb_16x16(MACROBLOCK *x) {
-  vp9_optimize_mby_16x16(x);
-  vp9_optimize_mbuv_8x8(x);
+static void optimize_mb_16x16(VP9_COMMON *const cm, MACROBLOCK *x) {
+  vp9_optimize_mby_16x16(cm, x);
+  vp9_optimize_mbuv_8x8(cm, x);
 }
 
-void vp9_optimize_sby_32x32(MACROBLOCK *x) {
+void vp9_optimize_sby_32x32(VP9_COMMON *const cm, MACROBLOCK *x) {
   ENTROPY_CONTEXT *a = (ENTROPY_CONTEXT *) x->e_mbd.above_context;
   ENTROPY_CONTEXT *a1 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 1);
   ENTROPY_CONTEXT *l = (ENTROPY_CONTEXT *) x->e_mbd.left_context;
@@ -823,11 +934,11 @@
 
   ta = (a[0] + a[1] + a[2] + a[3] + a1[0] + a1[1] + a1[2] + a1[3]) != 0;
   tl = (l[0] + l[1] + l[2] + l[3] + l1[0] + l1[1] + l1[2] + l1[3]) != 0;
-  optimize_b(x, 0, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
+  optimize_b(cm, x, 0, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
              &ta, &tl, TX_32X32);
 }
 
-void vp9_optimize_sby_16x16(MACROBLOCK *x) {
+void vp9_optimize_sby_16x16(VP9_COMMON *const cm, MACROBLOCK *x) {
   ENTROPY_CONTEXT *a = (ENTROPY_CONTEXT *) x->e_mbd.above_context;
   ENTROPY_CONTEXT *a1 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 1);
   ENTROPY_CONTEXT *l = (ENTROPY_CONTEXT *) x->e_mbd.left_context;
@@ -842,12 +953,12 @@
   for (n = 0; n < 4; n++) {
     const int x_idx = n & 1, y_idx = n >> 1;
 
-    optimize_b(x, n * 16, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
+    optimize_b(cm, x, n * 16, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
                ta + x_idx, tl + y_idx, TX_16X16);
   }
 }
 
-void vp9_optimize_sby_8x8(MACROBLOCK *x) {
+void vp9_optimize_sby_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {
   ENTROPY_CONTEXT *a = (ENTROPY_CONTEXT *) x->e_mbd.above_context;
   ENTROPY_CONTEXT *a1 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 1);
   ENTROPY_CONTEXT *l = (ENTROPY_CONTEXT *) x->e_mbd.left_context;
@@ -866,12 +977,12 @@
   for (n = 0; n < 16; n++) {
     const int x_idx = n & 3, y_idx = n >> 2;
 
-    optimize_b(x, n * 4, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
+    optimize_b(cm, x, n * 4, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
                ta + x_idx, tl + y_idx, TX_8X8);
   }
 }
 
-void vp9_optimize_sby_4x4(MACROBLOCK *x) {
+void vp9_optimize_sby_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {
   ENTROPY_CONTEXT ta[8], tl[8];
   int n;
 
@@ -882,12 +993,12 @@
   for (n = 0; n < 64; n++) {
     const int x_idx = n & 7, y_idx = n >> 3;
 
-    optimize_b(x, n, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
+    optimize_b(cm, x, n, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
                ta + x_idx, tl + y_idx, TX_4X4);
   }
 }
 
-void vp9_optimize_sbuv_16x16(MACROBLOCK *x) {
+void vp9_optimize_sbuv_16x16(VP9_COMMON *const cm, MACROBLOCK *x) {
   ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) x->e_mbd.above_context;
   ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) x->e_mbd.left_context;
   ENTROPY_CONTEXT *a, *l, *a1, *l1, above_ec, left_ec;
@@ -901,12 +1012,12 @@
     l1 = l + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
     above_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0;
     left_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0;
-    optimize_b(x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,
+    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,
                &above_ec, &left_ec, TX_16X16);
   }
 }
 
-void vp9_optimize_sbuv_8x8(MACROBLOCK *x) {
+void vp9_optimize_sbuv_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {
   ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
   ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) t_above;
   ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) t_left;
@@ -921,14 +1032,14 @@
     l = tl + vp9_block2left_sb[TX_8X8][b];
     above_ec = (a[0] + a[1]) != 0;
     left_ec = (l[0] + l[1]) != 0;
-    optimize_b(x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,
+    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,
                &above_ec, &left_ec, TX_8X8);
     a[0] = a[1] = above_ec;
     l[0] = l[1] = left_ec;
   }
 }
 
-void vp9_optimize_sbuv_4x4(MACROBLOCK *x) {
+void vp9_optimize_sbuv_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {
   ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
   ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) t_above;
   ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) t_left;
@@ -941,12 +1052,12 @@
     const int cidx = b >= 80 ? 20 : 16;
     a = ta + vp9_block2above_sb[TX_4X4][b];
     l = tl + vp9_block2left_sb[TX_4X4][b];
-    optimize_b(x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,
+    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,
                a, l, TX_4X4);
   }
 }
 
-void vp9_optimize_sb64y_32x32(MACROBLOCK *x) {
+void vp9_optimize_sb64y_32x32(VP9_COMMON *const cm, MACROBLOCK *x) {
   ENTROPY_CONTEXT *a = (ENTROPY_CONTEXT *) x->e_mbd.above_context;
   ENTROPY_CONTEXT *a1 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 1);
   ENTROPY_CONTEXT *a2 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 2);
@@ -965,12 +1076,12 @@
   for (n = 0; n < 4; n++) {
     const int x_idx = n & 1, y_idx = n >> 1;
 
-    optimize_b(x, n * 64, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
+    optimize_b(cm, x, n * 64, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
                ta + x_idx, tl + y_idx, TX_32X32);
   }
 }
 
-void vp9_optimize_sb64y_16x16(MACROBLOCK *x) {
+void vp9_optimize_sb64y_16x16(VP9_COMMON *const cm, MACROBLOCK *x) {
   ENTROPY_CONTEXT *a = (ENTROPY_CONTEXT *) x->e_mbd.above_context;
   ENTROPY_CONTEXT *a1 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 1);
   ENTROPY_CONTEXT *a2 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 2);
@@ -993,12 +1104,12 @@
   for (n = 0; n < 16; n++) {
     const int x_idx = n & 3, y_idx = n >> 2;
 
-    optimize_b(x, n * 16, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
+    optimize_b(cm, x, n * 16, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
                ta + x_idx, tl + y_idx, TX_16X16);
   }
 }
 
-void vp9_optimize_sb64y_8x8(MACROBLOCK *x) {
+void vp9_optimize_sb64y_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {
   ENTROPY_CONTEXT *a = (ENTROPY_CONTEXT *) x->e_mbd.above_context;
   ENTROPY_CONTEXT *a1 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 1);
   ENTROPY_CONTEXT *a2 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 2);
@@ -1029,12 +1140,12 @@
   for (n = 0; n < 64; n++) {
     const int x_idx = n & 7, y_idx = n >> 3;
 
-    optimize_b(x, n * 4, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
+    optimize_b(cm, x, n * 4, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
                ta + x_idx, tl + y_idx, TX_8X8);
   }
 }
 
-void vp9_optimize_sb64y_4x4(MACROBLOCK *x) {
+void vp9_optimize_sb64y_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {
   ENTROPY_CONTEXT ta[16], tl[16];
   int n;
 
@@ -1049,12 +1160,12 @@
   for (n = 0; n < 256; n++) {
     const int x_idx = n & 15, y_idx = n >> 4;
 
-    optimize_b(x, n, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
+    optimize_b(cm, x, n, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
                ta + x_idx, tl + y_idx, TX_4X4);
   }
 }
 
-void vp9_optimize_sb64uv_32x32(MACROBLOCK *x) {
+void vp9_optimize_sb64uv_32x32(VP9_COMMON *const cm, MACROBLOCK *x) {
   ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) x->e_mbd.above_context;
   ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) x->e_mbd.left_context;
   ENTROPY_CONTEXT *a, *l, *a1, *l1, *a2, *l2, *a3, *l3, a_ec, l_ec;
@@ -1072,12 +1183,12 @@
     l3 = l + 3 * sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
     a_ec = (a[0] + a[1] + a1[0] + a1[1] + a2[0] + a2[1] + a3[0] + a3[1]) != 0;
     l_ec = (l[0] + l[1] + l1[0] + l1[1] + l2[0] + l2[1] + l3[0] + l3[1]) != 0;
-    optimize_b(x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,
+    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,
                &a_ec, &l_ec, TX_32X32);
   }
 }
 
-void vp9_optimize_sb64uv_16x16(MACROBLOCK *x) {
+void vp9_optimize_sb64uv_16x16(VP9_COMMON *const cm, MACROBLOCK *x) {
   ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
   ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) t_above;
   ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) t_left;
@@ -1094,14 +1205,14 @@
     l1 = l + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
     above_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0;
     left_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0;
-    optimize_b(x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,
+    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,
                &above_ec, &left_ec, TX_16X16);
     a[0] = a[1] = a1[0] = a1[1] = above_ec;
     l[0] = l[1] = l1[0] = l1[1] = left_ec;
   }
 }
 
-void vp9_optimize_sb64uv_8x8(MACROBLOCK *x) {
+void vp9_optimize_sb64uv_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {
   ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
   ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) t_above;
   ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) t_left;
@@ -1116,14 +1227,14 @@
     l = tl + vp9_block2left_sb64[TX_8X8][b];
     above_ec = (a[0] + a[1]) != 0;
     left_ec = (l[0] + l[1]) != 0;
-    optimize_b(x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,
+    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,
                &above_ec, &left_ec, TX_8X8);
     a[0] = a[1] = above_ec;
     l[0] = l[1] = left_ec;
   }
 }
 
-void vp9_optimize_sb64uv_4x4(MACROBLOCK *x) {
+void vp9_optimize_sb64uv_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {
   ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
   ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) t_above;
   ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) t_left;
@@ -1136,12 +1247,12 @@
     const int cidx = b >= 320 ? 20 : 16;
     a = ta + vp9_block2above_sb64[TX_4X4][b];
     l = tl + vp9_block2left_sb64[TX_4X4][b];
-    optimize_b(x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,
+    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,
                a, l, TX_4X4);
   }
 }
 
-void vp9_fidct_mb(MACROBLOCK *x) {
+void vp9_fidct_mb(VP9_COMMON *const cm, MACROBLOCK *x) {
   MACROBLOCKD *const xd = &x->e_mbd;
   TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
 
@@ -1149,7 +1260,7 @@
     vp9_transform_mb_16x16(x);
     vp9_quantize_mb_16x16(x);
     if (x->optimize)
-      optimize_mb_16x16(x);
+      optimize_mb_16x16(cm, x);
     vp9_inverse_transform_mb_16x16(xd);
   } else if (tx_size == TX_8X8) {
     if (xd->mode_info_context->mbmi.mode == SPLITMV) {
@@ -1159,8 +1270,8 @@
       vp9_quantize_mby_8x8(x);
       vp9_quantize_mbuv_4x4(x);
       if (x->optimize) {
-        vp9_optimize_mby_8x8(x);
-        vp9_optimize_mbuv_4x4(x);
+        vp9_optimize_mby_8x8(cm, x);
+        vp9_optimize_mbuv_4x4(cm, x);
       }
       vp9_inverse_transform_mby_8x8(xd);
       vp9_inverse_transform_mbuv_4x4(xd);
@@ -1168,24 +1279,25 @@
       vp9_transform_mb_8x8(x);
       vp9_quantize_mb_8x8(x);
       if (x->optimize)
-        optimize_mb_8x8(x);
+        optimize_mb_8x8(cm, x);
       vp9_inverse_transform_mb_8x8(xd);
     }
   } else {
     transform_mb_4x4(x);
     vp9_quantize_mb_4x4(x);
     if (x->optimize)
-      optimize_mb_4x4(x);
+      optimize_mb_4x4(cm, x);
     vp9_inverse_transform_mb_4x4(xd);
   }
 }
 
-void vp9_encode_inter16x16(MACROBLOCK *x, int mb_row, int mb_col) {
+void vp9_encode_inter16x16(VP9_COMMON *const cm, MACROBLOCK *x,
+                           int mb_row, int mb_col) {
   MACROBLOCKD *const xd = &x->e_mbd;
 
   vp9_build_inter_predictors_mb(xd, mb_row, mb_col);
   subtract_mb(x);
-  vp9_fidct_mb(x);
+  vp9_fidct_mb(cm, x);
   vp9_recon_mb(xd);
 }
 
diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h
index 917cf8b..242afbe 100644
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h
@@ -13,6 +13,8 @@
 
 #include "./vpx_config.h"
 #include "vp9/encoder/vp9_block.h"
+#include "vp9/encoder/vp9_onyx_int.h"
+#include "vp9/common/vp9_onyxc_int.h"
 
 typedef struct {
   MB_PREDICTION_MODE mode;
@@ -21,60 +23,60 @@
 } MODE_DEFINITION;
 
 
-#include "vp9/encoder/vp9_onyx_int.h"
 struct VP9_ENCODER_RTCD;
-void vp9_encode_inter16x16(MACROBLOCK *x, int mb_row, int mb_col);
+void vp9_encode_inter16x16(VP9_COMMON *const cm, MACROBLOCK *x,
+                           int mb_row, int mb_col);
 
 void vp9_transform_mbuv_4x4(MACROBLOCK *x);
 void vp9_transform_mby_4x4(MACROBLOCK *x);
 
-void vp9_optimize_mby_4x4(MACROBLOCK *x);
-void vp9_optimize_mbuv_4x4(MACROBLOCK *x);
+void vp9_optimize_mby_4x4(VP9_COMMON *const cm, MACROBLOCK *x);
+void vp9_optimize_mbuv_4x4(VP9_COMMON *const cm, MACROBLOCK *x);
 void vp9_encode_inter16x16y(MACROBLOCK *x, int mb_row, int mb_col);
 
 void vp9_transform_mb_8x8(MACROBLOCK *mb);
 void vp9_transform_mby_8x8(MACROBLOCK *x);
 void vp9_transform_mbuv_8x8(MACROBLOCK *x);
-void vp9_optimize_mby_8x8(MACROBLOCK *x);
-void vp9_optimize_mbuv_8x8(MACROBLOCK *x);
+void vp9_optimize_mby_8x8(VP9_COMMON *const cm, MACROBLOCK *x);
+void vp9_optimize_mbuv_8x8(VP9_COMMON *const cm, MACROBLOCK *x);
 
 void vp9_transform_mb_16x16(MACROBLOCK *mb);
 void vp9_transform_mby_16x16(MACROBLOCK *x);
-void vp9_optimize_mby_16x16(MACROBLOCK *x);
+void vp9_optimize_mby_16x16(VP9_COMMON *const cm, MACROBLOCK *x);
 
 void vp9_transform_sby_32x32(MACROBLOCK *x);
-void vp9_optimize_sby_32x32(MACROBLOCK *x);
+void vp9_optimize_sby_32x32(VP9_COMMON *const cm, MACROBLOCK *x);
 void vp9_transform_sby_16x16(MACROBLOCK *x);
-void vp9_optimize_sby_16x16(MACROBLOCK *x);
+void vp9_optimize_sby_16x16(VP9_COMMON *const cm, MACROBLOCK *x);
 void vp9_transform_sby_8x8(MACROBLOCK *x);
-void vp9_optimize_sby_8x8(MACROBLOCK *x);
+void vp9_optimize_sby_8x8(VP9_COMMON *const cm, MACROBLOCK *x);
 void vp9_transform_sby_4x4(MACROBLOCK *x);
-void vp9_optimize_sby_4x4(MACROBLOCK *x);
+void vp9_optimize_sby_4x4(VP9_COMMON *const cm, MACROBLOCK *x);
 void vp9_transform_sbuv_16x16(MACROBLOCK *x);
-void vp9_optimize_sbuv_16x16(MACROBLOCK *x);
+void vp9_optimize_sbuv_16x16(VP9_COMMON *const cm, MACROBLOCK *x);
 void vp9_transform_sbuv_8x8(MACROBLOCK *x);
-void vp9_optimize_sbuv_8x8(MACROBLOCK *x);
+void vp9_optimize_sbuv_8x8(VP9_COMMON *const cm, MACROBLOCK *x);
 void vp9_transform_sbuv_4x4(MACROBLOCK *x);
-void vp9_optimize_sbuv_4x4(MACROBLOCK *x);
+void vp9_optimize_sbuv_4x4(VP9_COMMON *const cm, MACROBLOCK *x);
 
 void vp9_transform_sb64y_32x32(MACROBLOCK *x);
-void vp9_optimize_sb64y_32x32(MACROBLOCK *x);
+void vp9_optimize_sb64y_32x32(VP9_COMMON *const cm, MACROBLOCK *x);
 void vp9_transform_sb64y_16x16(MACROBLOCK *x);
-void vp9_optimize_sb64y_16x16(MACROBLOCK *x);
+void vp9_optimize_sb64y_16x16(VP9_COMMON *const cm, MACROBLOCK *x);
 void vp9_transform_sb64y_8x8(MACROBLOCK *x);
-void vp9_optimize_sb64y_8x8(MACROBLOCK *x);
+void vp9_optimize_sb64y_8x8(VP9_COMMON *const cm, MACROBLOCK *x);
 void vp9_transform_sb64y_4x4(MACROBLOCK *x);
-void vp9_optimize_sb64y_4x4(MACROBLOCK *x);
+void vp9_optimize_sb64y_4x4(VP9_COMMON *const cm, MACROBLOCK *x);
 void vp9_transform_sb64uv_32x32(MACROBLOCK *x);
-void vp9_optimize_sb64uv_32x32(MACROBLOCK *x);
+void vp9_optimize_sb64uv_32x32(VP9_COMMON *const cm, MACROBLOCK *x);
 void vp9_transform_sb64uv_16x16(MACROBLOCK *x);
-void vp9_optimize_sb64uv_16x16(MACROBLOCK *x);
+void vp9_optimize_sb64uv_16x16(VP9_COMMON *const cm, MACROBLOCK *x);
 void vp9_transform_sb64uv_8x8(MACROBLOCK *x);
-void vp9_optimize_sb64uv_8x8(MACROBLOCK *x);
+void vp9_optimize_sb64uv_8x8(VP9_COMMON *const cm, MACROBLOCK *x);
 void vp9_transform_sb64uv_4x4(MACROBLOCK *x);
-void vp9_optimize_sb64uv_4x4(MACROBLOCK *x);
+void vp9_optimize_sb64uv_4x4(VP9_COMMON *const cm, MACROBLOCK *x);
 
-void vp9_fidct_mb(MACROBLOCK *x);
+void vp9_fidct_mb(VP9_COMMON *const cm, MACROBLOCK *x);
 
 void vp9_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch);
 
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 337276d..5e2f323 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -378,6 +378,19 @@
   vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
   int new_mv_mode_penalty = 256;
 
+  int sr = 0;
+  int quart_frm = MIN(cpi->common.Width, cpi->common.Height);
+
+  // refine the motion search range according to the frame dimension
+  // for the first-pass test
+  while ((quart_frm << sr) < MAX_FULL_PEL_VAL)
+    sr++;
+  if (sr)
+    sr--;
+
+  step_param    += sr;
+  further_steps -= sr;
+
   // override the default variance function to use MSE
   v_fn_ptr.vf = vp9_mse16x16;
 
diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c
index 121de65..715d683 100644
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c
@@ -420,7 +420,7 @@
       cpi->static_mb_pct = (ncnt[1] * 100) / cm->MBs;
 
     // This error case should not be reachable as this function should
-    // never be called with the common data structure unititialized.
+    // never be called with the common data structure uninitialized.
     else
       cpi->static_mb_pct = 0;
 
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 300d9f8..5fd1e83 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -21,9 +21,9 @@
 
 void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv) {
   int col_min = (ref_mv->as_mv.col >> 3) - MAX_FULL_PEL_VAL +
-      ((ref_mv->as_mv.col & 7) ? 1 : 0);
+                                 ((ref_mv->as_mv.col & 7) ? 1 : 0);
   int row_min = (ref_mv->as_mv.row >> 3) - MAX_FULL_PEL_VAL +
-      ((ref_mv->as_mv.row & 7) ? 1 : 0);
+                                 ((ref_mv->as_mv.row & 7) ? 1 : 0);
   int col_max = (ref_mv->as_mv.col >> 3) + MAX_FULL_PEL_VAL;
   int row_max = (ref_mv->as_mv.row >> 3) + MAX_FULL_PEL_VAL;
 
@@ -38,6 +38,19 @@
     x->mv_row_max = row_max;
 }
 
+int vp9_init_search_range(int width, int height) {
+  int sr = 0;
+  int frm = MIN(width, height);
+
+  while ((frm << sr) < MAX_FULL_PEL_VAL)
+    sr++;
+
+  if (sr)
+    sr--;
+
+  return sr;
+}
+
 int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvjcost, int *mvcost[2],
                     int weight, int ishp) {
   MV v;
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index 2479d72..d5c7032 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -19,12 +19,17 @@
 void print_mode_context(VP9_COMMON *pc);
 #endif
 
-
-#define MAX_MVSEARCH_STEPS 8                                    // The maximum number of steps in a step search given the largest allowed initial step
-#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS)) - 1)      // Max full pel mv specified in 1 pel units
-#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1))            // Maximum size of the first step in full pel units
+// The maximum number of steps in a step search given the largest
+// allowed initial step
+#define MAX_MVSEARCH_STEPS 10
+// Max full pel mv specified in 1 pel units
+#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS)) - 1)
+// Maximum size of the first step in full pel units
+#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1))
 
 void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv);
+int vp9_init_search_range(int width, int height);
+
 int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvjcost,
                            int *mvcost[2], int weight, int ishp);
 void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride);
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 6335827..5a565fc 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -114,6 +114,13 @@
 extern void print_nmvstats();
 #endif
 
+#if CONFIG_CODE_NONZEROCOUNT
+#ifdef NZC_STATS
+extern void init_nzcstats();
+extern void print_nzcstats();
+#endif
+#endif
+
 #ifdef SPEEDSTATS
 unsigned int frames_at_speed[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
 #endif
@@ -1526,6 +1533,11 @@
 #ifdef NMV_STATS
   init_nmvstats();
 #endif
+#if CONFIG_CODE_NONZEROCOUNT
+#ifdef NZC_STATS
+  init_nzcstats();
+#endif
+#endif
 
   /*Initialize the feed-forward activity masking.*/
   cpi->activity_avg = 90 << 12;
@@ -1697,6 +1709,13 @@
   cpi->common.error.setjmp = 0;
 
   vp9_zero(cpi->y_uv_mode_count)
+#if CONFIG_CODE_NONZEROCOUNT
+  vp9_zero(cm->fc.nzc_counts_4x4);
+  vp9_zero(cm->fc.nzc_counts_8x8);
+  vp9_zero(cm->fc.nzc_counts_16x16);
+  vp9_zero(cm->fc.nzc_counts_32x32);
+  vp9_zero(cm->fc.nzc_pcat_counts);
+#endif
 
   return (VP9_PTR) cpi;
 }
@@ -1724,6 +1743,12 @@
     if (cpi->pass != 1)
       print_nmvstats();
 #endif
+#if CONFIG_CODE_NONZEROCOUNT
+#ifdef NZC_STATS
+    if (cpi->pass != 1)
+      print_nzcstats();
+#endif
+#endif
 
 #if CONFIG_INTERNAL_STATS
 
@@ -2845,7 +2870,15 @@
         cpi->active_best_quality * 15 / 16;
     }
   } else {
+#ifdef ONE_SHOT_Q_ESTIMATE
+#ifdef STRICT_ONE_SHOT_Q
+    cpi->active_best_quality = Q;
+#else
     cpi->active_best_quality = inter_minq[Q];
+#endif
+#else
+    cpi->active_best_quality = inter_minq[Q];
+#endif
 
     // For the constant/constrained quality mode we dont want
     // q to fall below the cq level.
@@ -3332,8 +3365,12 @@
   vp9_copy(cpi->common.fc.coef_counts_16x16, cpi->coef_counts_16x16);
   vp9_copy(cpi->common.fc.coef_counts_32x32, cpi->coef_counts_32x32);
   if (!cpi->common.error_resilient_mode &&
-      !cpi->common.frame_parallel_decoding_mode)
+      !cpi->common.frame_parallel_decoding_mode) {
     vp9_adapt_coef_probs(&cpi->common);
+#if CONFIG_CODE_NONZEROCOUNT
+    vp9_adapt_nzc_probs(&cpi->common);
+#endif
+  }
   if (cpi->common.frame_type != KEY_FRAME) {
     vp9_copy(cpi->common.fc.sb_ymode_counts, cpi->sb_ymode_count);
     vp9_copy(cpi->common.fc.ymode_counts, cpi->ymode_count);
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 3dc4772..300e128 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -31,6 +31,7 @@
 
 // Experimental rate control switches
 // #define ONE_SHOT_Q_ESTIMATE 1
+// #define STRICT_ONE_SHOT_Q 1
 // #define DISABLE_RC_LONG_TERM_MEM 1
 
 // #define SPEEDSTATS 1
@@ -111,6 +112,18 @@
   int mv_ref_ct[INTER_MODE_CONTEXTS][4][2];
   int vp9_mode_contexts[INTER_MODE_CONTEXTS][4];
 
+#if CONFIG_CODE_NONZEROCOUNT
+  vp9_prob nzc_probs_4x4
+           [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC4X4_NODES];
+  vp9_prob nzc_probs_8x8
+           [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC8X8_NODES];
+  vp9_prob nzc_probs_16x16
+           [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC16X16_NODES];
+  vp9_prob nzc_probs_32x32
+           [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC32X32_NODES];
+  vp9_prob nzc_pcat_probs[MAX_NZC_CONTEXTS]
+                         [NZC_TOKENS_EXTRA][NZC_BITS_EXTRA];
+#endif
 } CODING_CONTEXT;
 
 typedef struct {
@@ -480,6 +493,25 @@
   vp9_coeff_probs frame_coef_probs_32x32[BLOCK_TYPES];
   vp9_coeff_stats frame_branch_ct_32x32[BLOCK_TYPES];
 
+#if CONFIG_CODE_NONZEROCOUNT
+  vp9_prob frame_nzc_probs_4x4
+      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC4X4_NODES];
+  unsigned int frame_nzc_branch_ct_4x4
+      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC4X4_NODES][2];
+  vp9_prob frame_nzc_probs_8x8
+      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC8X8_NODES];
+  unsigned int frame_nzc_branch_ct_8x8
+      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC8X8_NODES][2];
+  vp9_prob frame_nzc_probs_16x16
+      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC16X16_NODES];
+  unsigned int frame_nzc_branch_ct_16x16
+      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC16X16_NODES][2];
+  vp9_prob frame_nzc_probs_32x32
+      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC32X32_NODES];
+  unsigned int frame_nzc_branch_ct_32x32
+      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC32X32_NODES][2];
+#endif
+
   int gfu_boost;
   int last_boost;
   int kf_boost;
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index 75f22fa..9ac2c84 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -21,35 +21,46 @@
 extern int enc_debug;
 #endif
 
+static INLINE int plane_idx(MACROBLOCKD *xd, int b_idx) {
+  const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
+  if (b_idx < (16 << (sb_type * 2)))
+    return 0;  // Y
+  else if (b_idx < (20 << (sb_type * 2)))
+    return 16;  // U
+  assert(b_idx < (24 << (sb_type * 2)));
+  return 20;  // V
+}
+
 void vp9_ht_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type) {
   MACROBLOCKD *const xd = &mb->e_mbd;
-  BLOCK *const b = &mb->block[b_idx];
-  BLOCKD *const d = &xd->block[b_idx];
+  BLOCK *const b = &mb->block[0];
+  BLOCKD *const d = &xd->block[0];
   int i, rc, eob;
   int zbin;
   int x, y, z, sz;
+  int16_t *coeff_ptr       = mb->coeff + b_idx * 16;
+  int16_t *qcoeff_ptr      = xd->qcoeff + b_idx * 16;
+  int16_t *dqcoeff_ptr     = xd->dqcoeff + b_idx * 16;
   int16_t *zbin_boost_ptr  = b->zrun_zbin_boost;
-  int16_t *coeff_ptr       = b->coeff;
   int16_t *zbin_ptr        = b->zbin;
   int16_t *round_ptr       = b->round;
   int16_t *quant_ptr       = b->quant;
   uint8_t *quant_shift_ptr = b->quant_shift;
-  int16_t *qcoeff_ptr      = d->qcoeff;
-  int16_t *dqcoeff_ptr     = d->dqcoeff;
   int16_t *dequant_ptr     = d->dequant;
   int zbin_oq_value        = b->zbin_extra;
+  const int *pt_scan;
+#if CONFIG_CODE_NONZEROCOUNT
+  int nzc = 0;
+#endif
 
-  int const *pt_scan ;
-
+  assert(plane_idx(xd, b_idx) == 0);
   switch (tx_type) {
     case ADST_DCT:
       pt_scan = vp9_row_scan_4x4;
       break;
-
     case DCT_ADST:
       pt_scan = vp9_col_scan_4x4;
       break;
-
     default:
       pt_scan = vp9_default_zig_zag1d_4x4;
       break;
@@ -81,6 +92,9 @@
 
         if (y) {
           eob = i;                                // last nonzero coeffs
+#if CONFIG_CODE_NONZEROCOUNT
+          ++nzc;                                  // number of nonzero coeffs
+#endif
           zbin_boost_ptr = b->zrun_zbin_boost;    // reset zero runlength
         }
       }
@@ -88,25 +102,32 @@
   }
 
   xd->eobs[b_idx] = eob + 1;
+#if CONFIG_CODE_NONZEROCOUNT
+  xd->nzcs[b_idx] = nzc;
+#endif
 }
 
 void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx) {
   MACROBLOCKD *const xd = &mb->e_mbd;
-  BLOCK *const b = &mb->block[b_idx];
-  BLOCKD *const d = &xd->block[b_idx];
+  const int c_idx = plane_idx(xd, b_idx);
+  BLOCK *const b = &mb->block[c_idx];
+  BLOCKD *const d = &xd->block[c_idx];
   int i, rc, eob;
   int zbin;
   int x, y, z, sz;
+  int16_t *coeff_ptr       = mb->coeff + b_idx * 16;
+  int16_t *qcoeff_ptr      = xd->qcoeff + b_idx * 16;
+  int16_t *dqcoeff_ptr     = xd->dqcoeff + b_idx * 16;
   int16_t *zbin_boost_ptr  = b->zrun_zbin_boost;
-  int16_t *coeff_ptr       = b->coeff;
   int16_t *zbin_ptr        = b->zbin;
   int16_t *round_ptr       = b->round;
   int16_t *quant_ptr       = b->quant;
   uint8_t *quant_shift_ptr = b->quant_shift;
-  int16_t *qcoeff_ptr      = d->qcoeff;
-  int16_t *dqcoeff_ptr     = d->dqcoeff;
   int16_t *dequant_ptr     = d->dequant;
   int zbin_oq_value        = b->zbin_extra;
+#if CONFIG_CODE_NONZEROCOUNT
+  int nzc = 0;
+#endif
 
   vpx_memset(qcoeff_ptr, 0, 32);
   vpx_memset(dqcoeff_ptr, 0, 32);
@@ -135,6 +156,9 @@
 
         if (y) {
           eob = i;                                // last nonzero coeffs
+#if CONFIG_CODE_NONZEROCOUNT
+          ++nzc;                                  // number of nonzero coeffs
+#endif
           zbin_boost_ptr = b->zrun_zbin_boost;    // reset zero runlength
         }
       }
@@ -142,13 +166,16 @@
   }
 
   xd->eobs[b_idx] = eob + 1;
+#if CONFIG_CODE_NONZEROCOUNT
+  xd->nzcs[b_idx] = nzc;
+#endif
 }
 
-void vp9_quantize_mby_4x4_c(MACROBLOCK *x) {
+void vp9_quantize_mby_4x4(MACROBLOCK *x) {
   int i;
 
   for (i = 0; i < 16; i++) {
-    TX_TYPE tx_type = get_tx_type_4x4(&x->e_mbd, &x->e_mbd.block[i]);
+    TX_TYPE tx_type = get_tx_type_4x4(&x->e_mbd, i);
     if (tx_type != DCT_DCT) {
       vp9_ht_quantize_b_4x4(x, i, tx_type);
     } else {
@@ -157,24 +184,25 @@
   }
 }
 
-void vp9_quantize_mbuv_4x4_c(MACROBLOCK *x) {
+void vp9_quantize_mbuv_4x4(MACROBLOCK *x) {
   int i;
 
   for (i = 16; i < 24; i++)
     x->quantize_b_4x4(x, i);
 }
 
-void vp9_quantize_mb_4x4_c(MACROBLOCK *x) {
-  vp9_quantize_mby_4x4_c(x);
-  vp9_quantize_mbuv_4x4_c(x);
+void vp9_quantize_mb_4x4(MACROBLOCK *x) {
+  vp9_quantize_mby_4x4(x);
+  vp9_quantize_mbuv_4x4(x);
 }
 
 void vp9_regular_quantize_b_8x8(MACROBLOCK *mb, int b_idx) {
   MACROBLOCKD *const xd = &mb->e_mbd;
-  BLOCK *const b = &mb->block[b_idx];
-  BLOCKD *const d = &xd->block[b_idx];
-  int16_t *qcoeff_ptr = d->qcoeff;
-  int16_t *dqcoeff_ptr = d->dqcoeff;
+  int16_t *qcoeff_ptr = xd->qcoeff + 16 * b_idx;
+  int16_t *dqcoeff_ptr = xd->dqcoeff + 16 * b_idx;
+  const int c_idx = plane_idx(xd, b_idx);
+  BLOCK *const b = &mb->block[c_idx];
+  BLOCKD *const d = &xd->block[c_idx];
 
   vpx_memset(qcoeff_ptr, 0, 64 * sizeof(int16_t));
   vpx_memset(dqcoeff_ptr, 0, 64 * sizeof(int16_t));
@@ -185,13 +213,16 @@
     int x, y, z, sz;
     int zero_run;
     int16_t *zbin_boost_ptr = b->zrun_zbin_boost;
-    int16_t *coeff_ptr  = b->coeff;
+    int16_t *coeff_ptr  = mb->coeff + 16 * b_idx;
     int16_t *zbin_ptr   = b->zbin;
     int16_t *round_ptr  = b->round;
     int16_t *quant_ptr  = b->quant;
     uint8_t *quant_shift_ptr = b->quant_shift;
     int16_t *dequant_ptr = d->dequant;
     int zbin_oq_value = b->zbin_extra;
+#if CONFIG_CODE_NONZEROCOUNT
+    int nzc = 0;
+#endif
 
     eob = -1;
 
@@ -215,6 +246,9 @@
 
         if (y) {
           eob = 0;                                   // last nonzero coeffs
+#if CONFIG_CODE_NONZEROCOUNT
+          ++nzc;                                  // number of nonzero coeffs
+#endif
           zero_run = 0;
         }
       }
@@ -241,19 +275,33 @@
 
         if (y) {
           eob = i;                                   // last nonzero coeffs
+#if CONFIG_CODE_NONZEROCOUNT
+          ++nzc;                                     // number of nonzero coeffs
+#endif
           zero_run = 0;
         }
       }
     }
     xd->eobs[b_idx] = eob + 1;
+#if CONFIG_CODE_NONZEROCOUNT
+    xd->nzcs[b_idx] = nzc;
+#endif
   } else {
     xd->eobs[b_idx] = 0;
+#if CONFIG_CODE_NONZEROCOUNT
+    xd->nzcs[b_idx] = 0;
+#endif
   }
 }
 
 void vp9_quantize_mby_8x8(MACROBLOCK *x) {
   int i;
 
+#if CONFIG_CODE_NONZEROCOUNT
+  for (i = 0; i < 16; i ++) {
+    x->e_mbd.nzcs[i] = 0;
+  }
+#endif
   for (i = 0; i < 16; i += 4) {
     x->quantize_b_8x8(x, i);
   }
@@ -262,6 +310,11 @@
 void vp9_quantize_mbuv_8x8(MACROBLOCK *x) {
   int i;
 
+#if CONFIG_CODE_NONZEROCOUNT
+  for (i = 16; i < 24; i ++) {
+    x->e_mbd.nzcs[i] = 0;
+  }
+#endif
   for (i = 16; i < 24; i += 4)
     x->quantize_b_8x8(x, i);
 }
@@ -272,6 +325,12 @@
 }
 
 void vp9_quantize_mby_16x16(MACROBLOCK *x) {
+#if CONFIG_CODE_NONZEROCOUNT
+  int i;
+  for (i = 0; i < 16; i++) {
+    x->e_mbd.nzcs[i] = 0;
+  }
+#endif
   x->quantize_b_16x16(x, 0);
 }
 
@@ -286,12 +345,19 @@
                      uint8_t *quant_shift_ptr,
                      int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
                      int16_t *dequant_ptr, int zbin_oq_value,
-                     uint16_t *eob_ptr, const int *scan, int mul) {
+                     uint16_t *eob_ptr,
+#if CONFIG_CODE_NONZEROCOUNT
+                     uint16_t *nzc_ptr,
+#endif
+                     const int *scan, int mul) {
   int i, rc, eob;
   int zbin;
   int x, y, z, sz;
   int zero_run = 0;
   int16_t *zbin_boost_ptr = zbin_boost_orig_ptr;
+#if CONFIG_CODE_NONZEROCOUNT
+  int nzc = 0;
+#endif
 
   vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
   vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
@@ -320,325 +386,173 @@
         if (y) {
           eob = i;                                  // last nonzero coeffs
           zero_run = 0;
+#if CONFIG_CODE_NONZEROCOUNT
+          ++nzc;                                    // number of nonzero coeffs
+#endif
         }
       }
     }
   }
 
   *eob_ptr = eob + 1;
+#if CONFIG_CODE_NONZEROCOUNT
+  *nzc_ptr = nzc;
+#endif
 }
 
 void vp9_regular_quantize_b_16x16(MACROBLOCK *mb, int b_idx) {
   MACROBLOCKD *const xd = &mb->e_mbd;
-  BLOCK *const b = &mb->block[b_idx];
-  BLOCKD *const d = &xd->block[b_idx];
+  const int c_idx = plane_idx(xd, b_idx);
+  BLOCK *const b = &mb->block[c_idx];
+  BLOCKD *const d = &xd->block[c_idx];
+
   quantize(b->zrun_zbin_boost,
-           b->coeff,
+           mb->coeff + 16 * b_idx,
            256, b->skip_block,
            b->zbin, b->round, b->quant, b->quant_shift,
-           d->qcoeff,
-           d->dqcoeff,
+           xd->qcoeff + 16 * b_idx,
+           xd->dqcoeff + 16 * b_idx,
            d->dequant,
            b->zbin_extra,
-           &xd->eobs[b_idx], vp9_default_zig_zag1d_16x16, 1);
+           &xd->eobs[b_idx],
+#if CONFIG_CODE_NONZEROCOUNT
+           &xd->nzcs[b_idx],
+#endif
+           vp9_default_zig_zag1d_16x16, 1);
 }
 
-void vp9_quantize_sby_32x32(MACROBLOCK *x) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  BLOCK *const b = &x->block[0];
-  BLOCKD *const d = &xd->block[0];
+void vp9_regular_quantize_b_32x32(MACROBLOCK *mb, int b_idx) {
+  MACROBLOCKD *const xd = &mb->e_mbd;
+  const int c_idx = plane_idx(xd, b_idx);
+  BLOCK *const b = &mb->block[c_idx];
+  BLOCKD *const d = &xd->block[c_idx];
 
   quantize(b->zrun_zbin_boost,
-           x->coeff,
+           mb->coeff + b_idx * 16,
            1024, b->skip_block,
            b->zbin,
            b->round, b->quant, b->quant_shift,
-           xd->qcoeff,
-           xd->dqcoeff,
+           xd->qcoeff + b_idx * 16,
+           xd->dqcoeff + b_idx * 16,
            d->dequant,
            b->zbin_extra,
-           &xd->eobs[0],
+           &xd->eobs[b_idx],
+#if CONFIG_CODE_NONZEROCOUNT
+           &xd->nzcs[b_idx],
+#endif
            vp9_default_zig_zag1d_32x32, 2);
 }
 
+void vp9_quantize_sby_32x32(MACROBLOCK *x) {
+  vp9_regular_quantize_b_32x32(x, 0);
+}
+
 void vp9_quantize_sby_16x16(MACROBLOCK *x) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  BLOCK *const b = &x->block[0];
-  BLOCKD *const d = &xd->block[0];
   int n;
 
   for (n = 0; n < 4; n++)
-    quantize(b->zrun_zbin_boost,
-             x->coeff + n * 256,
-             256, b->skip_block,
-             b->zbin,
-             b->round, b->quant, b->quant_shift,
-             xd->qcoeff + n * 256,
-             xd->dqcoeff + n * 256,
-             d->dequant,
-             b->zbin_extra,
-             &xd->eobs[n * 16],
-             vp9_default_zig_zag1d_16x16, 1);
+    x->quantize_b_16x16(x, n * 16);
 }
 
 void vp9_quantize_sby_8x8(MACROBLOCK *x) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  BLOCK *const b = &x->block[0];
-  BLOCKD *const d = &xd->block[0];
   int n;
 
   for (n = 0; n < 16; n++)
-    quantize(b->zrun_zbin_boost,
-             x->coeff + n * 64,
-             64, b->skip_block,
-             b->zbin,
-             b->round, b->quant, b->quant_shift,
-             xd->qcoeff + n * 64,
-             xd->dqcoeff + n * 64,
-             d->dequant,
-             b->zbin_extra,
-             &xd->eobs[n * 4],
-             vp9_default_zig_zag1d_8x8, 1);
+    x->quantize_b_8x8(x, n * 4);
 }
 
 void vp9_quantize_sby_4x4(MACROBLOCK *x) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  BLOCK *const b = &x->block[0];
-  BLOCKD *const d = &xd->block[0];
   int n;
 
-  for (n = 0; n < 64; n++)
-    quantize(b->zrun_zbin_boost,
-             x->coeff + n * 16,
-             16, b->skip_block,
-             b->zbin,
-             b->round, b->quant, b->quant_shift,
-             xd->qcoeff + n * 16,
-             xd->dqcoeff + n * 16,
-             d->dequant,
-             b->zbin_extra,
-             &xd->eobs[n],
-             vp9_default_zig_zag1d_4x4, 1);
+  for (n = 0; n < 64; n++) {
+    const TX_TYPE tx_type = get_tx_type_4x4(xd, n);
+    if (tx_type != DCT_DCT) {
+      vp9_ht_quantize_b_4x4(x, n, tx_type);
+    } else {
+      x->quantize_b_4x4(x, n);
+    }
+  }
 }
 
 void vp9_quantize_sbuv_16x16(MACROBLOCK *x) {
-  int i;
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  for (i = 64; i < 96; i += 16) {
-    int cidx = i < 80 ? 16 : 20;
-    quantize(x->block[cidx].zrun_zbin_boost,
-             x->coeff + i * 16,
-             256, x->block[cidx].skip_block,
-             x->block[cidx].zbin, x->block[cidx].round,
-             x->block[cidx].quant, x->block[cidx].quant_shift,
-             xd->qcoeff + i * 16,
-             xd->dqcoeff + i * 16,
-             xd->block[cidx].dequant,
-             x->block[cidx].zbin_extra,
-             &xd->eobs[i],
-             vp9_default_zig_zag1d_16x16, 1);
-  }
+  x->quantize_b_16x16(x, 64);
+  x->quantize_b_16x16(x, 80);
 }
 
 void vp9_quantize_sbuv_8x8(MACROBLOCK *x) {
   int i;
-  MACROBLOCKD *const xd = &x->e_mbd;
 
-  for (i = 64; i < 96; i += 4) {
-    int cidx = i < 80 ? 16 : 20;
-    quantize(x->block[cidx].zrun_zbin_boost,
-             x->coeff + i * 16,
-             64, x->block[cidx].skip_block,
-             x->block[cidx].zbin, x->block[cidx].round,
-             x->block[cidx].quant, x->block[cidx].quant_shift,
-             xd->qcoeff + i * 16,
-             xd->dqcoeff + i * 16,
-             xd->block[cidx].dequant,
-             x->block[cidx].zbin_extra,
-             &xd->eobs[i],
-             vp9_default_zig_zag1d_8x8, 1);
-  }
+  for (i = 64; i < 96; i += 4)
+    x->quantize_b_8x8(x, i);
 }
 
 void vp9_quantize_sbuv_4x4(MACROBLOCK *x) {
   int i;
-  MACROBLOCKD *const xd = &x->e_mbd;
 
-  for (i = 64; i < 96; i++) {
-    int cidx = i < 80 ? 16 : 20;
-    quantize(x->block[cidx].zrun_zbin_boost,
-             x->coeff + i * 16,
-             16, x->block[cidx].skip_block,
-             x->block[cidx].zbin, x->block[cidx].round,
-             x->block[cidx].quant, x->block[cidx].quant_shift,
-             xd->qcoeff + i * 16,
-             xd->dqcoeff + i * 16,
-             xd->block[cidx].dequant,
-             x->block[cidx].zbin_extra,
-             &xd->eobs[i],
-             vp9_default_zig_zag1d_4x4, 1);
-  }
+  for (i = 64; i < 96; i++)
+    x->quantize_b_4x4(x, i);
 }
 
 void vp9_quantize_sb64y_32x32(MACROBLOCK *x) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  BLOCK *const b = &x->block[0];
-  BLOCKD *const d = &xd->block[0];
   int n;
 
   for (n = 0; n < 4; n++)
-    quantize(b->zrun_zbin_boost,
-             x->coeff + n * 1024,
-             1024, b->skip_block,
-             b->zbin,
-             b->round, b->quant, b->quant_shift,
-             xd->qcoeff + n * 1024,
-             xd->dqcoeff + n * 1024,
-             d->dequant,
-             b->zbin_extra,
-             &xd->eobs[n * 64],
-             vp9_default_zig_zag1d_32x32, 2);
+    vp9_regular_quantize_b_32x32(x, n * 64);
 }
 
 void vp9_quantize_sb64y_16x16(MACROBLOCK *x) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  BLOCK *const b = &x->block[0];
-  BLOCKD *const d = &xd->block[0];
   int n;
 
   for (n = 0; n < 16; n++)
-    quantize(b->zrun_zbin_boost,
-             x->coeff + n * 256,
-             256, b->skip_block,
-             b->zbin,
-             b->round, b->quant, b->quant_shift,
-             xd->qcoeff + n * 256,
-             xd->dqcoeff + n * 256,
-             d->dequant,
-             b->zbin_extra,
-             &xd->eobs[n * 16],
-             vp9_default_zig_zag1d_16x16, 1);
+    x->quantize_b_16x16(x, n * 16);
 }
 
 void vp9_quantize_sb64y_8x8(MACROBLOCK *x) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  BLOCK *const b = &x->block[0];
-  BLOCKD *const d = &xd->block[0];
   int n;
 
   for (n = 0; n < 64; n++)
-    quantize(b->zrun_zbin_boost,
-             x->coeff + n * 64,
-             64, b->skip_block,
-             b->zbin,
-             b->round, b->quant, b->quant_shift,
-             xd->qcoeff + n * 64,
-             xd->dqcoeff + n * 64,
-             d->dequant,
-             b->zbin_extra,
-             &xd->eobs[n * 4],
-             vp9_default_zig_zag1d_8x8, 1);
+    x->quantize_b_8x8(x, n * 4);
 }
 
 void vp9_quantize_sb64y_4x4(MACROBLOCK *x) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  BLOCK *const b = &x->block[0];
-  BLOCKD *const d = &xd->block[0];
   int n;
 
-  for (n = 0; n < 256; n++)
-    quantize(b->zrun_zbin_boost,
-             x->coeff + n * 16,
-             16, b->skip_block,
-             b->zbin,
-             b->round, b->quant, b->quant_shift,
-             xd->qcoeff + n * 16,
-             xd->dqcoeff + n * 16,
-             d->dequant,
-             b->zbin_extra,
-             &xd->eobs[n],
-             vp9_default_zig_zag1d_4x4, 1);
+  for (n = 0; n < 256; n++) {
+    const TX_TYPE tx_type = get_tx_type_4x4(xd, n);
+    if (tx_type != DCT_DCT) {
+      vp9_ht_quantize_b_4x4(x, n, tx_type);
+    } else {
+      x->quantize_b_4x4(x, n);
+    }
+  }
 }
 
 void vp9_quantize_sb64uv_32x32(MACROBLOCK *x) {
-  int i;
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  for (i = 256; i < 384; i += 64) {
-    int cidx = i < 320 ? 16 : 20;
-    quantize(x->block[cidx].zrun_zbin_boost,
-             x->coeff + i * 16,
-             1024, x->block[cidx].skip_block,
-             x->block[cidx].zbin, x->block[cidx].round,
-             x->block[cidx].quant, x->block[cidx].quant_shift,
-             xd->qcoeff + i * 16,
-             xd->dqcoeff + i * 16,
-             xd->block[cidx].dequant,
-             x->block[cidx].zbin_extra,
-             &xd->eobs[i],
-             vp9_default_zig_zag1d_32x32, 2);
-  }
+  vp9_regular_quantize_b_32x32(x, 256);
+  vp9_regular_quantize_b_32x32(x, 320);
 }
 
 void vp9_quantize_sb64uv_16x16(MACROBLOCK *x) {
   int i;
-  MACROBLOCKD *const xd = &x->e_mbd;
 
-  for (i = 256; i < 384; i += 16) {
-    int cidx = i < 320 ? 16 : 20;
-    quantize(x->block[cidx].zrun_zbin_boost,
-             x->coeff + i * 16,
-             256, x->block[cidx].skip_block,
-             x->block[cidx].zbin, x->block[cidx].round,
-             x->block[cidx].quant, x->block[cidx].quant_shift,
-             xd->qcoeff + i * 16,
-             xd->dqcoeff + i * 16,
-             xd->block[cidx].dequant,
-             x->block[cidx].zbin_extra,
-             &xd->eobs[i],
-             vp9_default_zig_zag1d_16x16, 1);
-  }
+  for (i = 256; i < 384; i += 16)
+    x->quantize_b_16x16(x, i);
 }
 
 void vp9_quantize_sb64uv_8x8(MACROBLOCK *x) {
   int i;
-  MACROBLOCKD *const xd = &x->e_mbd;
 
-  for (i = 256; i < 384; i += 4) {
-    int cidx = i < 320 ? 16 : 20;
-    quantize(x->block[cidx].zrun_zbin_boost,
-             x->coeff + i * 16,
-             64, x->block[cidx].skip_block,
-             x->block[cidx].zbin, x->block[cidx].round,
-             x->block[cidx].quant, x->block[cidx].quant_shift,
-             xd->qcoeff + i * 16,
-             xd->dqcoeff + i * 16,
-             xd->block[cidx].dequant,
-             x->block[cidx].zbin_extra,
-             &xd->eobs[i],
-             vp9_default_zig_zag1d_8x8, 1);
-  }
+  for (i = 256; i < 384; i += 4)
+    x->quantize_b_8x8(x, i);
 }
 
 void vp9_quantize_sb64uv_4x4(MACROBLOCK *x) {
   int i;
-  MACROBLOCKD *const xd = &x->e_mbd;
 
-  for (i = 256; i < 384; i++) {
-    int cidx = i < 320 ? 16 : 20;
-    quantize(x->block[cidx].zrun_zbin_boost,
-             x->coeff + i * 16,
-             16, x->block[cidx].skip_block,
-             x->block[cidx].zbin, x->block[cidx].round,
-             x->block[cidx].quant, x->block[cidx].quant_shift,
-             xd->qcoeff + i * 16,
-             xd->dqcoeff + i * 16,
-             xd->block[cidx].dequant,
-             x->block[cidx].zbin_extra,
-             &xd->eobs[i],
-             vp9_default_zig_zag1d_4x4, 1);
-  }
+  for (i = 256; i < 384; i++)
+    x->quantize_b_4x4(x, i);
 }
 
 /* quantize_b_pair function pointer in MACROBLOCK structure is set to one of
@@ -650,8 +564,7 @@
   vp9_regular_quantize_b_4x4(x, b_idx2);
 }
 
-static void invert_quant(int16_t *quant,
-                         uint8_t *shift, int d) {
+static void invert_quant(int16_t *quant, uint8_t *shift, int d) {
   unsigned t;
   int l;
   t = d;
@@ -665,56 +578,52 @@
 void vp9_init_quantizer(VP9_COMP *cpi) {
   int i;
   int quant_val;
-  int Q;
+  int q;
 
   static const int zbin_boost[16] = { 0,  0,  0,  8,  8,  8, 10, 12,
                                      14, 16, 20, 24, 28, 32, 36, 40 };
 
-  for (Q = 0; Q < QINDEX_RANGE; Q++) {
-    int qzbin_factor = (vp9_dc_quant(Q, 0) < 148) ? 84 : 80;
+  for (q = 0; q < QINDEX_RANGE; q++) {
+    int qzbin_factor = (vp9_dc_quant(q, 0) < 148) ? 84 : 80;
     int qrounding_factor = 48;
-    if (Q == 0) {
+    if (q == 0) {
       qzbin_factor = 64;
       qrounding_factor = 64;
     }
     // dc values
-    quant_val = vp9_dc_quant(Q, cpi->common.y1dc_delta_q);
-    invert_quant(cpi->Y1quant[Q] + 0,
-                 cpi->Y1quant_shift[Q] + 0, quant_val);
-    cpi->Y1zbin[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
-    cpi->Y1round[Q][0] = (qrounding_factor * quant_val) >> 7;
-    cpi->common.Y1dequant[Q][0] = quant_val;
-    cpi->zrun_zbin_boost_y1[Q][0] = (quant_val * zbin_boost[0]) >> 7;
+    quant_val = vp9_dc_quant(q, cpi->common.y1dc_delta_q);
+    invert_quant(cpi->Y1quant[q] + 0, cpi->Y1quant_shift[q] + 0, quant_val);
+    cpi->Y1zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
+    cpi->Y1round[q][0] = (qrounding_factor * quant_val) >> 7;
+    cpi->common.Y1dequant[q][0] = quant_val;
+    cpi->zrun_zbin_boost_y1[q][0] = (quant_val * zbin_boost[0]) >> 7;
 
-    quant_val = vp9_dc_uv_quant(Q, cpi->common.uvdc_delta_q);
-    invert_quant(cpi->UVquant[Q] + 0,
-                 cpi->UVquant_shift[Q] + 0, quant_val);
-    cpi->UVzbin[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
-    cpi->UVround[Q][0] = (qrounding_factor * quant_val) >> 7;
-    cpi->common.UVdequant[Q][0] = quant_val;
-    cpi->zrun_zbin_boost_uv[Q][0] = (quant_val * zbin_boost[0]) >> 7;
+    quant_val = vp9_dc_uv_quant(q, cpi->common.uvdc_delta_q);
+    invert_quant(cpi->UVquant[q] + 0, cpi->UVquant_shift[q] + 0, quant_val);
+    cpi->UVzbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
+    cpi->UVround[q][0] = (qrounding_factor * quant_val) >> 7;
+    cpi->common.UVdequant[q][0] = quant_val;
+    cpi->zrun_zbin_boost_uv[q][0] = (quant_val * zbin_boost[0]) >> 7;
 
     // all the 4x4 ac values =;
     for (i = 1; i < 16; i++) {
       int rc = vp9_default_zig_zag1d_4x4[i];
 
-      quant_val = vp9_ac_yquant(Q);
-      invert_quant(cpi->Y1quant[Q] + rc,
-                   cpi->Y1quant_shift[Q] + rc, quant_val);
-      cpi->Y1zbin[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
-      cpi->Y1round[Q][rc] = (qrounding_factor * quant_val) >> 7;
-      cpi->common.Y1dequant[Q][rc] = quant_val;
-      cpi->zrun_zbin_boost_y1[Q][i] =
-        ((quant_val * zbin_boost[i]) + 64) >> 7;
+      quant_val = vp9_ac_yquant(q);
+      invert_quant(cpi->Y1quant[q] + rc, cpi->Y1quant_shift[q] + rc, quant_val);
+      cpi->Y1zbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
+      cpi->Y1round[q][rc] = (qrounding_factor * quant_val) >> 7;
+      cpi->common.Y1dequant[q][rc] = quant_val;
+      cpi->zrun_zbin_boost_y1[q][i] =
+          ROUND_POWER_OF_TWO(quant_val * zbin_boost[i], 7);
 
-      quant_val = vp9_ac_uv_quant(Q, cpi->common.uvac_delta_q);
-      invert_quant(cpi->UVquant[Q] + rc,
-                   cpi->UVquant_shift[Q] + rc, quant_val);
-      cpi->UVzbin[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
-      cpi->UVround[Q][rc] = (qrounding_factor * quant_val) >> 7;
-      cpi->common.UVdequant[Q][rc] = quant_val;
-      cpi->zrun_zbin_boost_uv[Q][i] =
-        ((quant_val * zbin_boost[i]) + 64) >> 7;
+      quant_val = vp9_ac_uv_quant(q, cpi->common.uvac_delta_q);
+      invert_quant(cpi->UVquant[q] + rc, cpi->UVquant_shift[q] + rc, quant_val);
+      cpi->UVzbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
+      cpi->UVround[q][rc] = (qrounding_factor * quant_val) >> 7;
+      cpi->common.UVdequant[q][rc] = quant_val;
+      cpi->zrun_zbin_boost_uv[q][i] =
+          ROUND_POWER_OF_TWO(quant_val * zbin_boost[i], 7);
     }
   }
 }
diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h
index 32eb05a..7392540 100644
--- a/vp9/encoder/vp9_quantize.h
+++ b/vp9/encoder/vp9_quantize.h
@@ -26,52 +26,24 @@
 #include "x86/vp9_quantize_x86.h"
 #endif
 
-#define prototype_quantize_block_type(sym) \
-  void (sym)(MACROBLOCK *mb, int b_ix, TX_TYPE type)
-extern prototype_quantize_block_type(vp9_ht_quantize_b_4x4);
+void vp9_ht_quantize_b_4x4(MACROBLOCK *mb, int b_ix, TX_TYPE type);
+void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx);
+void vp9_regular_quantize_b_4x4_pair(MACROBLOCK *mb, int b_idx1, int b_idx2);
+void vp9_regular_quantize_b_8x8(MACROBLOCK *mb, int b_idx);
+void vp9_regular_quantize_b_16x16(MACROBLOCK *mb, int b_idx);
+void vp9_regular_quantize_b_32x32(MACROBLOCK *mb, int b_idx);
 
-#ifndef vp9_quantize_quantb_4x4
-#define vp9_quantize_quantb_4x4 vp9_regular_quantize_b_4x4
-#endif
-extern prototype_quantize_block(vp9_quantize_quantb_4x4);
-
-#ifndef vp9_quantize_quantb_4x4_pair
-#define vp9_quantize_quantb_4x4_pair vp9_regular_quantize_b_4x4_pair
-#endif
-extern prototype_quantize_block_pair(vp9_quantize_quantb_4x4_pair);
-
-#ifndef vp9_quantize_quantb_8x8
-#define vp9_quantize_quantb_8x8 vp9_regular_quantize_b_8x8
-#endif
-extern prototype_quantize_block(vp9_quantize_quantb_8x8);
-
-#ifndef vp9_quantize_quantb_16x16
-#define vp9_quantize_quantb_16x16 vp9_regular_quantize_b_16x16
-#endif
-extern prototype_quantize_block(vp9_quantize_quantb_16x16);
-
-#ifndef vp9_quantize_mb_4x4
-#define vp9_quantize_mb_4x4 vp9_quantize_mb_4x4_c
-#endif
-extern prototype_quantize_mb(vp9_quantize_mb_4x4);
+void vp9_quantize_mb_4x4(MACROBLOCK *x);
 void vp9_quantize_mb_8x8(MACROBLOCK *x);
 
-#ifndef vp9_quantize_mbuv_4x4
-#define vp9_quantize_mbuv_4x4 vp9_quantize_mbuv_4x4_c
-#endif
-extern prototype_quantize_mb(vp9_quantize_mbuv_4x4);
+void vp9_quantize_mbuv_4x4(MACROBLOCK *x);
+void vp9_quantize_mby_4x4(MACROBLOCK *x);
 
-#ifndef vp9_quantize_mby_4x4
-#define vp9_quantize_mby_4x4 vp9_quantize_mby_4x4_c
-#endif
-extern prototype_quantize_mb(vp9_quantize_mby_4x4);
-
-extern prototype_quantize_mb(vp9_quantize_mby_8x8);
-extern prototype_quantize_mb(vp9_quantize_mbuv_8x8);
+void vp9_quantize_mby_8x8(MACROBLOCK *x);
+void vp9_quantize_mbuv_8x8(MACROBLOCK *x);
 
 void vp9_quantize_mb_16x16(MACROBLOCK *x);
-extern prototype_quantize_block(vp9_quantize_quantb_16x16);
-extern prototype_quantize_mb(vp9_quantize_mby_16x16);
+void vp9_quantize_mby_16x16(MACROBLOCK *x);
 
 void vp9_quantize_sby_32x32(MACROBLOCK *x);
 void vp9_quantize_sby_16x16(MACROBLOCK *x);
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index a2a7957..82bd70b 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -120,11 +120,14 @@
   double q = vp9_convert_qindex_to_q(qindex);
 
   if (frame_type == KEY_FRAME) {
-    enumerator = 4500000;
+    enumerator = 4000000;
   } else {
-    enumerator = 2850000;
+    enumerator = 2500000;
   }
 
+  // Q-based adjustment to baseline enumerator
+  enumerator += (int)(enumerator * q) >> 12;
+
   return (int)(0.5 + (enumerator * correction_factor / q));
 }
 
@@ -182,6 +185,13 @@
 #if CONFIG_COMP_INTERINTRA_PRED
   cc->interintra_prob = cm->fc.interintra_prob;
 #endif
+#if CONFIG_CODE_NONZEROCOUNT
+  vp9_copy(cc->nzc_probs_4x4, cm->fc.nzc_probs_4x4);
+  vp9_copy(cc->nzc_probs_8x8, cm->fc.nzc_probs_8x8);
+  vp9_copy(cc->nzc_probs_16x16, cm->fc.nzc_probs_16x16);
+  vp9_copy(cc->nzc_probs_32x32, cm->fc.nzc_probs_32x32);
+  vp9_copy(cc->nzc_pcat_probs, cm->fc.nzc_pcat_probs);
+#endif
 }
 
 void vp9_restore_coding_context(VP9_COMP *cpi) {
@@ -237,6 +247,13 @@
 #if CONFIG_COMP_INTERINTRA_PRED
   cm->fc.interintra_prob = cc->interintra_prob;
 #endif
+#if CONFIG_CODE_NONZEROCOUNT
+  vp9_copy(cm->fc.nzc_probs_4x4, cc->nzc_probs_4x4);
+  vp9_copy(cm->fc.nzc_probs_8x8, cc->nzc_probs_8x8);
+  vp9_copy(cm->fc.nzc_probs_16x16, cc->nzc_probs_16x16);
+  vp9_copy(cm->fc.nzc_probs_32x32, cc->nzc_probs_32x32);
+  vp9_copy(cm->fc.nzc_pcat_probs, cc->nzc_pcat_probs);
+#endif
 }
 
 void vp9_setup_key_frame(VP9_COMP *cpi) {
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 61379b8..3004d6b 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -156,6 +156,12 @@
     for (j = 0; j < REF_TYPES; j++)
       for (k = 0; k < COEF_BANDS; k++)
         for (l = 0; l < PREV_COEF_CONTEXTS; l++) {
+#if CONFIG_CODE_NONZEROCOUNT
+          // All costs are without the EOB node
+          vp9_cost_tokens_skip((int *)(c[i][j][k][l]),
+                               p[i][j][k][l],
+                               vp9_coef_tree);
+#else
           if (l == 0 && k > 0)
             vp9_cost_tokens_skip((int *)(c[i][j][k][l]),
                                  p[i][j][k][l],
@@ -164,9 +170,63 @@
             vp9_cost_tokens((int *)(c[i][j][k][l]),
                             p[i][j][k][l],
                             vp9_coef_tree);
+#endif
         }
 }
 
+#if CONFIG_CODE_NONZEROCOUNT
+static void fill_nzc_costs(VP9_COMP *cpi, int block_size) {
+  int nzc_context, r, b, nzc, values;
+  int cost[16];
+  values = block_size * block_size + 1;
+
+  for (nzc_context = 0; nzc_context < MAX_NZC_CONTEXTS; ++nzc_context) {
+    for (r = 0; r < REF_TYPES; ++r) {
+      for (b = 0; b < BLOCK_TYPES; ++b) {
+        unsigned int *nzc_costs;
+        if (block_size == 4) {
+          vp9_cost_tokens(cost,
+                          cpi->common.fc.nzc_probs_4x4[nzc_context][r][b],
+                          vp9_nzc4x4_tree);
+          nzc_costs = cpi->mb.nzc_costs_4x4[nzc_context][r][b];
+        } else if (block_size == 8) {
+          vp9_cost_tokens(cost,
+                          cpi->common.fc.nzc_probs_8x8[nzc_context][r][b],
+                          vp9_nzc8x8_tree);
+          nzc_costs = cpi->mb.nzc_costs_8x8[nzc_context][r][b];
+        } else if (block_size == 16) {
+          vp9_cost_tokens(cost,
+                          cpi->common.fc.nzc_probs_16x16[nzc_context][r][b],
+                          vp9_nzc16x16_tree);
+          nzc_costs = cpi->mb.nzc_costs_16x16[nzc_context][r][b];
+        } else {
+          vp9_cost_tokens(cost,
+                          cpi->common.fc.nzc_probs_32x32[nzc_context][r][b],
+                          vp9_nzc32x32_tree);
+          nzc_costs = cpi->mb.nzc_costs_32x32[nzc_context][r][b];
+        }
+
+        for (nzc = 0; nzc < values; ++nzc) {
+          int e, c, totalcost = 0;
+          c = codenzc(nzc);
+          totalcost = cost[c];
+          if ((e = vp9_extranzcbits[c])) {
+            int x = nzc - vp9_basenzcvalue[c];
+            while (e--) {
+              totalcost += vp9_cost_bit(
+                  cpi->common.fc.nzc_pcat_probs[nzc_context]
+                                               [c - NZC_TOKENS_NOEXTRA][e],
+                  ((x >> e) & 1));
+            }
+          }
+          nzc_costs[nzc] = totalcost;
+        }
+      }
+    }
+  }
+}
+#endif
+
 
 static int rd_iifactor[32] =  { 4, 4, 3, 2, 1, 0, 0, 0,
                                 0, 0, 0, 0, 0, 0, 0, 0,
@@ -274,6 +334,12 @@
                    cpi->common.fc.coef_probs_16x16, BLOCK_TYPES);
   fill_token_costs(cpi->mb.token_costs[TX_32X32],
                    cpi->common.fc.coef_probs_32x32, BLOCK_TYPES);
+#if CONFIG_CODE_NONZEROCOUNT
+  fill_nzc_costs(cpi, 4);
+  fill_nzc_costs(cpi, 8);
+  fill_nzc_costs(cpi, 16);
+  fill_nzc_costs(cpi, 32);
+#endif
 
   /*rough estimate for costing*/
   cpi->common.kf_ymode_probs_index = cpi->common.base_qindex >> 4;
@@ -379,25 +445,20 @@
   return sse2;
 }
 
-static INLINE int cost_coeffs(MACROBLOCK *mb,
+static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
                               int ib, PLANE_TYPE type,
                               ENTROPY_CONTEXT *a,
                               ENTROPY_CONTEXT *l,
                               TX_SIZE tx_size) {
   MACROBLOCKD *const xd = &mb->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
-  const BLOCK_SIZE_TYPE sb_type = mbmi->sb_type;
   int pt;
   const int eob = xd->eobs[ib];
   int c = 0;
-  int cost = 0, seg_eob;
-  const int segment_id = mbmi->segment_id;
+  int cost = 0;
   const int *scan;
   const int16_t *qcoeff_ptr = xd->qcoeff + ib * 16;
   const int ref = mbmi->ref_frame != INTRA_FRAME;
-  const TX_TYPE tx_type = (sb_type == BLOCK_SIZE_MB16X16 &&
-                           type == PLANE_TYPE_Y_WITH_DC) ?
-                          get_tx_type(xd, &xd->block[ib]) : DCT_DCT;
   unsigned int (*token_costs)[PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] =
       mb->token_costs[tx_size][type][ref];
   ENTROPY_CONTEXT a_ec, l_ec;
@@ -406,29 +467,59 @@
   ENTROPY_CONTEXT *const l1 = l +
       sizeof(ENTROPY_CONTEXT_PLANES)/sizeof(ENTROPY_CONTEXT);
 
+#if CONFIG_CODE_NONZEROCOUNT
+  int nzc_context = vp9_get_nzc_context(cm, xd, ib);
+  unsigned int *nzc_cost;
+#else
+  int seg_eob;
+  const int segment_id = xd->mode_info_context->mbmi.segment_id;
+#endif
+
+  // Check for consistency of tx_size with mode info
+  if (type == PLANE_TYPE_Y_WITH_DC) {
+    assert(xd->mode_info_context->mbmi.txfm_size == tx_size);
+  } else {
+    TX_SIZE tx_size_uv = get_uv_tx_size(xd);
+    assert(tx_size == tx_size_uv);
+  }
+
   switch (tx_size) {
-    case TX_4X4:
+    case TX_4X4: {
+      const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
+                              get_tx_type_4x4(xd, ib) : DCT_DCT;
       a_ec = *a;
       l_ec = *l;
-      scan = vp9_default_zig_zag1d_4x4;
+#if CONFIG_CODE_NONZEROCOUNT
+      nzc_cost = mb->nzc_costs_4x4[nzc_context][ref][type];
+#else
       seg_eob = 16;
-      if (type == PLANE_TYPE_Y_WITH_DC) {
-        if (tx_type == ADST_DCT) {
-          scan = vp9_row_scan_4x4;
-        } else if (tx_type == DCT_ADST) {
-          scan = vp9_col_scan_4x4;
-        }
+#endif
+      if (tx_type == ADST_DCT) {
+        scan = vp9_row_scan_4x4;
+      } else if (tx_type == DCT_ADST) {
+        scan = vp9_col_scan_4x4;
+      } else {
+        scan = vp9_default_zig_zag1d_4x4;
       }
       break;
+    }
     case TX_8X8:
       a_ec = (a[0] + a[1]) != 0;
       l_ec = (l[0] + l[1]) != 0;
       scan = vp9_default_zig_zag1d_8x8;
+#if CONFIG_CODE_NONZEROCOUNT
+      nzc_cost = mb->nzc_costs_8x8[nzc_context][ref][type];
+#else
       seg_eob = 64;
+#endif
       break;
     case TX_16X16:
       scan = vp9_default_zig_zag1d_16x16;
+#if CONFIG_CODE_NONZEROCOUNT
+      nzc_cost = mb->nzc_costs_16x16[nzc_context][ref][type];
+#else
       seg_eob = 256;
+#endif
       if (type == PLANE_TYPE_UV) {
         a_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0;
         l_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0;
@@ -439,7 +530,11 @@
       break;
     case TX_32X32:
       scan = vp9_default_zig_zag1d_32x32;
+#if CONFIG_CODE_NONZEROCOUNT
+      nzc_cost = mb->nzc_costs_32x32[nzc_context][ref][type];
+#else
       seg_eob = 1024;
+#endif
       if (type == PLANE_TYPE_UV) {
         ENTROPY_CONTEXT *a2, *a3, *l2, *l3;
         a2 = a1 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
@@ -464,21 +559,33 @@
 
   VP9_COMBINEENTROPYCONTEXTS(pt, a_ec, l_ec);
 
+#if CONFIG_CODE_NONZEROCOUNT == 0
   if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP))
     seg_eob = 0;
+#endif
 
   {
     int recent_energy = 0;
+#if CONFIG_CODE_NONZEROCOUNT
+    int nzc = 0;
+#endif
     for (; c < eob; c++) {
       int v = qcoeff_ptr[scan[c]];
       int t = vp9_dct_value_tokens_ptr[v].Token;
+#if CONFIG_CODE_NONZEROCOUNT
+      nzc += (v != 0);
+#endif
       cost += token_costs[get_coef_band(tx_size, c)][pt][t];
       cost += vp9_dct_value_cost_ptr[v];
       pt = vp9_get_coef_context(&recent_energy, t);
     }
+#if CONFIG_CODE_NONZEROCOUNT
+    cost += nzc_cost[nzc];
+#else
     if (c < seg_eob)
       cost += mb->token_costs[tx_size][type][ref][get_coef_band(tx_size, c)]
           [pt][DCT_EOB_TOKEN];
+#endif
   }
 
   // is eob first coefficient;
@@ -501,27 +608,19 @@
   return cost;
 }
 
-static int rdcost_mby_4x4(MACROBLOCK *mb, int backup) {
+static int rdcost_mby_4x4(VP9_COMMON *const cm, MACROBLOCK *mb) {
   int cost = 0;
   int b;
   MACROBLOCKD *xd = &mb->e_mbd;
   ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta;
-  ENTROPY_CONTEXT *tl;
+  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *)&t_above;
+  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *)&t_left;
 
-  if (backup) {
-    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
-    ta = (ENTROPY_CONTEXT *)&t_above;
-    tl = (ENTROPY_CONTEXT *)&t_left;
-  } else {
-    ta = (ENTROPY_CONTEXT *)xd->above_context;
-    tl = (ENTROPY_CONTEXT *)xd->left_context;
-  }
+  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
+  vpx_memcpy(&t_left, xd->left_context, sizeof(t_left));
 
   for (b = 0; b < 16; b++)
-    cost += cost_coeffs(mb, b, PLANE_TYPE_Y_WITH_DC,
+    cost += cost_coeffs(cm, mb, b, PLANE_TYPE_Y_WITH_DC,
                         ta + vp9_block2above[TX_4X4][b],
                         tl + vp9_block2left[TX_4X4][b],
                         TX_4X4);
@@ -529,42 +628,35 @@
   return cost;
 }
 
-static void macro_block_yrd_4x4(MACROBLOCK *mb,
-                                int *Rate,
-                                int *Distortion,
-                                int *skippable, int backup) {
+static void macro_block_yrd_4x4(VP9_COMMON *const cm,
+                                MACROBLOCK *mb,
+                                int *rate,
+                                int *distortion,
+                                int *skippable) {
   MACROBLOCKD *const xd = &mb->e_mbd;
 
   xd->mode_info_context->mbmi.txfm_size = TX_4X4;
   vp9_transform_mby_4x4(mb);
   vp9_quantize_mby_4x4(mb);
 
-  *Distortion = vp9_mbblock_error(mb) >> 2;
-  *Rate = rdcost_mby_4x4(mb, backup);
+  *distortion = vp9_mbblock_error(mb) >> 2;
+  *rate = rdcost_mby_4x4(cm, mb);
   *skippable = vp9_mby_is_skippable_4x4(xd);
 }
 
-static int rdcost_mby_8x8(MACROBLOCK *mb, int backup) {
+static int rdcost_mby_8x8(VP9_COMMON *const cm, MACROBLOCK *mb) {
   int cost = 0;
   int b;
   MACROBLOCKD *xd = &mb->e_mbd;
   ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta;
-  ENTROPY_CONTEXT *tl;
+  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *)&t_above;
+  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *)&t_left;
 
-  if (backup) {
-    vpx_memcpy(&t_above,xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
-    ta = (ENTROPY_CONTEXT *)&t_above;
-    tl = (ENTROPY_CONTEXT *)&t_left;
-  } else {
-    ta = (ENTROPY_CONTEXT *)mb->e_mbd.above_context;
-    tl = (ENTROPY_CONTEXT *)mb->e_mbd.left_context;
-  }
+  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
+  vpx_memcpy(&t_left,  xd->left_context, sizeof(t_left));
 
   for (b = 0; b < 16; b += 4)
-    cost += cost_coeffs(mb, b, PLANE_TYPE_Y_WITH_DC,
+    cost += cost_coeffs(cm, mb, b, PLANE_TYPE_Y_WITH_DC,
                         ta + vp9_block2above[TX_8X8][b],
                         tl + vp9_block2left[TX_8X8][b],
                         TX_8X8);
@@ -572,45 +664,37 @@
   return cost;
 }
 
-static void macro_block_yrd_8x8(MACROBLOCK *mb,
-                                int *Rate,
-                                int *Distortion,
-                                int *skippable, int backup) {
+static void macro_block_yrd_8x8(VP9_COMMON *const cm,
+                                MACROBLOCK *mb,
+                                int *rate,
+                                int *distortion,
+                                int *skippable) {
   MACROBLOCKD *const xd = &mb->e_mbd;
 
   xd->mode_info_context->mbmi.txfm_size = TX_8X8;
   vp9_transform_mby_8x8(mb);
   vp9_quantize_mby_8x8(mb);
 
-  *Distortion = vp9_mbblock_error(mb) >> 2;
-  *Rate = rdcost_mby_8x8(mb, backup);
+  *distortion = vp9_mbblock_error(mb) >> 2;
+  *rate = rdcost_mby_8x8(cm, mb);
   *skippable = vp9_mby_is_skippable_8x8(xd);
 }
 
-static int rdcost_mby_16x16(MACROBLOCK *mb, int backup) {
-  int cost;
-  MACROBLOCKD *xd = &mb->e_mbd;
+static int rdcost_mby_16x16(VP9_COMMON *const cm, MACROBLOCK *mb) {
+  MACROBLOCKD *const xd = &mb->e_mbd;
   ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta, *tl;
+  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *)&t_above;
+  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *)&t_left;
 
-  if (backup) {
-    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
+  vpx_memcpy(&t_left, xd->left_context, sizeof(t_left));
 
-    ta = (ENTROPY_CONTEXT *)&t_above;
-    tl = (ENTROPY_CONTEXT *)&t_left;
-  } else {
-    ta = (ENTROPY_CONTEXT *)xd->above_context;
-    tl = (ENTROPY_CONTEXT *)xd->left_context;
-  }
-
-  cost = cost_coeffs(mb, 0, PLANE_TYPE_Y_WITH_DC, ta, tl, TX_16X16);
-  return cost;
+  return cost_coeffs(cm, mb, 0, PLANE_TYPE_Y_WITH_DC, ta, tl, TX_16X16);
 }
 
-static void macro_block_yrd_16x16(MACROBLOCK *mb, int *Rate, int *Distortion,
-                                  int *skippable, int backup) {
-  MACROBLOCKD *xd = &mb->e_mbd;
+static void macro_block_yrd_16x16(VP9_COMMON *const cm, MACROBLOCK *mb,
+                                  int *rate, int *distortion, int *skippable) {
+  MACROBLOCKD *const xd = &mb->e_mbd;
 
   xd->mode_info_context->mbmi.txfm_size = TX_16X16;
   vp9_transform_mby_16x16(mb);
@@ -620,10 +704,10 @@
   //                optimization in the rate-distortion optimization loop?
   if (mb->optimize &&
       xd->mode_info_context->mbmi.mode < I8X8_PRED)
-    vp9_optimize_mby_16x16(mb);
+    vp9_optimize_mby_16x16(cm, mb);
 
-  *Distortion = vp9_mbblock_error(mb) >> 2;
-  *Rate = rdcost_mby_16x16(mb, backup);
+  *distortion = vp9_mbblock_error(mb) >> 2;
+  *rate = rdcost_mby_16x16(cm, mb);
   *skippable = vp9_mby_is_skippable_16x16(xd);
 }
 
@@ -715,15 +799,16 @@
 static void macro_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
                             int *distortion, int *skippable,
                             int64_t txfm_cache[NB_TXFM_MODES]) {
+  VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   int r[TX_SIZE_MAX_MB][2], d[TX_SIZE_MAX_MB], s[TX_SIZE_MAX_MB];
 
   vp9_subtract_mby(x->src_diff, *(x->block[0].base_src), xd->predictor,
                    x->block[0].src_stride);
 
-  macro_block_yrd_16x16(x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16], 1);
-  macro_block_yrd_8x8(x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8], 1);
-  macro_block_yrd_4x4(x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4], 1);
+  macro_block_yrd_16x16(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16]);
+  macro_block_yrd_8x8(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8]);
+  macro_block_yrd_4x4(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4]);
 
   choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skippable,
                            txfm_cache, TX_16X16);
@@ -738,27 +823,8 @@
   d[12] = p[12];
 }
 
-static int rdcost_sby_32x32(MACROBLOCK *x, int backup) {
-  MACROBLOCKD * const xd = &x->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
-  ENTROPY_CONTEXT *ta, *tl;
-
-  if (backup) {
-    ta = (ENTROPY_CONTEXT *) &t_above,
-    tl = (ENTROPY_CONTEXT *) &t_left;
-
-    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES) * 2);
-    vpx_memcpy(&t_left,  xd->left_context,  sizeof(ENTROPY_CONTEXT_PLANES) * 2);
-  } else {
-    ta = (ENTROPY_CONTEXT *) xd->above_context;
-    tl = (ENTROPY_CONTEXT *) xd->left_context;
-  }
-
-  return cost_coeffs(x, 0, PLANE_TYPE_Y_WITH_DC, ta, tl, TX_32X32);
-}
-
 static int vp9_sb_block_error_c(int16_t *coeff, int16_t *dqcoeff,
-                                int block_size) {
+                                int block_size, int shift) {
   int i;
   int64_t error = 0;
 
@@ -766,32 +832,126 @@
     unsigned int this_diff = coeff[i] - dqcoeff[i];
     error += this_diff * this_diff;
   }
+  error >>= shift;
 
   return error > INT_MAX ? INT_MAX : (int)error;
 }
 
-#define DEBUG_ERROR 0
-static void super_block_yrd_32x32(MACROBLOCK *x,
-                                  int *rate, int *distortion, int *skippable,
-                                  int backup) {
+static int rdcost_sby_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {
+  int cost = 0, b;
   MACROBLOCKD *const xd = &x->e_mbd;
-#if DEBUG_ERROR
-  int16_t out[1024];
-#endif
+  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
+  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;
+  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;
 
+  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
+  vpx_memcpy(&t_left,  xd->left_context,  sizeof(t_left));
+
+  for (b = 0; b < 64; b++)
+    cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC,
+                        ta + vp9_block2above_sb[TX_4X4][b],
+                        tl + vp9_block2left_sb[TX_4X4][b], TX_4X4);
+
+  return cost;
+}
+
+static void super_block_yrd_4x4(VP9_COMMON *const cm, MACROBLOCK *x,
+                                int *rate, int *distortion, int *skippable) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  xd->mode_info_context->mbmi.txfm_size = TX_4X4;
+  vp9_transform_sby_4x4(x);
+  vp9_quantize_sby_4x4(x);
+
+  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 1024, 2);
+  *rate       = rdcost_sby_4x4(cm, x);
+  *skippable  = vp9_sby_is_skippable_4x4(xd);
+}
+
+static int rdcost_sby_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {
+  int cost = 0, b;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
+  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;
+  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;
+
+  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
+  vpx_memcpy(&t_left,  xd->left_context,  sizeof(t_left));
+
+  for (b = 0; b < 64; b += 4)
+    cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC,
+                        ta + vp9_block2above_sb[TX_8X8][b],
+                        tl + vp9_block2left_sb[TX_8X8][b], TX_8X8);
+
+  return cost;
+}
+
+static void super_block_yrd_8x8(VP9_COMMON *const cm, MACROBLOCK *x,
+                                int *rate, int *distortion, int *skippable) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  xd->mode_info_context->mbmi.txfm_size = TX_8X8;
+  vp9_transform_sby_8x8(x);
+  vp9_quantize_sby_8x8(x);
+
+  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 1024, 2);
+  *rate       = rdcost_sby_8x8(cm, x);
+  *skippable  = vp9_sby_is_skippable_8x8(xd);
+}
+
+static int rdcost_sby_16x16(VP9_COMMON *const cm, MACROBLOCK *x) {
+  int cost = 0, b;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
+  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;
+  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;
+
+  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
+  vpx_memcpy(&t_left,  xd->left_context,  sizeof(t_left));
+
+  for (b = 0; b < 64; b += 16)
+    cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC,
+                        ta + vp9_block2above_sb[TX_16X16][b],
+                        tl + vp9_block2left_sb[TX_16X16][b], TX_16X16);
+
+  return cost;
+}
+
+static void super_block_yrd_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
+                                  int *rate, int *distortion, int *skippable) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  xd->mode_info_context->mbmi.txfm_size = TX_16X16;
+  vp9_transform_sby_16x16(x);
+  vp9_quantize_sby_16x16(x);
+
+  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 1024, 2);
+  *rate       = rdcost_sby_16x16(cm, x);
+  *skippable  = vp9_sby_is_skippable_16x16(xd);
+}
+
+static int rdcost_sby_32x32(VP9_COMMON *const cm, MACROBLOCK *x) {
+  MACROBLOCKD * const xd = &x->e_mbd;
+  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
+  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;
+  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;
+
+  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
+  vpx_memcpy(&t_left,  xd->left_context,  sizeof(t_left));
+
+  return cost_coeffs(cm, x, 0, PLANE_TYPE_Y_WITH_DC, ta, tl, TX_32X32);
+}
+
+static void super_block_yrd_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
+                                  int *rate, int *distortion, int *skippable) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  xd->mode_info_context->mbmi.txfm_size = TX_32X32;
   vp9_transform_sby_32x32(x);
   vp9_quantize_sby_32x32(x);
-#if DEBUG_ERROR
-  vp9_short_idct32x32(xd->dqcoeff, out, 64);
-#endif
 
-  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 1024);
-
-#if DEBUG_ERROR
-  printf("IDCT/FDCT error 32x32: %d (d: %d)\n",
-         vp9_block_error_c(x->src_diff, out, 1024), *distortion);
-#endif
-  *rate       = rdcost_sby_32x32(x, backup);
+  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 1024, 0);
+  *rate       = rdcost_sby_32x32(cm, x);
   *skippable  = vp9_sby_is_skippable_32x32(xd);
 }
 
@@ -799,179 +959,166 @@
                             MACROBLOCK *x, int *rate, int *distortion,
                             int *skip,
                             int64_t txfm_cache[NB_TXFM_MODES]) {
+  VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
-  int r[TX_SIZE_MAX_SB][2], d[TX_SIZE_MAX_SB], s[TX_SIZE_MAX_SB], n;
+  int r[TX_SIZE_MAX_SB][2], d[TX_SIZE_MAX_SB], s[TX_SIZE_MAX_SB];
   const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer;
   int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
-  ENTROPY_CONTEXT_PLANES t_above[TX_SIZE_MAX_MB][2],
-                        *orig_above = xd->above_context;
-  ENTROPY_CONTEXT_PLANES t_left[TX_SIZE_MAX_MB][2],
-                        *orig_left = xd->left_context;
 
-  for (n = TX_4X4; n < TX_SIZE_MAX_MB; n++) {
-    vpx_memcpy(t_above[n], xd->above_context, sizeof(t_above[n]));
-    vpx_memcpy(t_left[n], xd->left_context, sizeof(t_left[n]));
-    r[n][0] = 0;
-    d[n] = 0;
-    s[n] = 1;
-  }
+  vp9_subtract_sby_s_c(x->src_diff, src, src_y_stride, dst, dst_y_stride);
+  super_block_yrd_32x32(cm, x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]);
+  super_block_yrd_16x16(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16]);
+  super_block_yrd_8x8(cm, x,   &r[TX_8X8][0],   &d[TX_8X8],   &s[TX_8X8]);
+  super_block_yrd_4x4(cm, x,   &r[TX_4X4][0],   &d[TX_4X4],   &s[TX_4X4]);
 
-  vp9_subtract_sby_s_c(x->src_diff, src, src_y_stride,
-                       dst, dst_y_stride);
-  super_block_yrd_32x32(x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32], 1);
-
-#if DEBUG_ERROR
-  int err[3] = { 0, 0, 0 };
-#endif
-  for (n = 0; n < 4; n++) {
-    int x_idx = n & 1, y_idx = n >> 1;
-    int r_tmp, d_tmp, s_tmp;
-
-    vp9_subtract_mby_s_c(x->src_diff,
-                         src + x_idx * 16 + y_idx * 16 * src_y_stride,
-                         src_y_stride,
-                         dst + x_idx * 16 + y_idx * 16 * dst_y_stride,
-                         dst_y_stride);
-
-    xd->above_context = &t_above[TX_16X16][x_idx];
-    xd->left_context = &t_left[TX_16X16][y_idx];
-    macro_block_yrd_16x16(x, &r_tmp, &d_tmp, &s_tmp, 0);
-    d[TX_16X16] += d_tmp;
-    r[TX_16X16][0] += r_tmp;
-    s[TX_16X16] = s[TX_16X16] && s_tmp;
-#if DEBUG_ERROR
-    vp9_inverse_transform_mby_16x16(xd);
-    err[2] += vp9_block_error_c(xd->diff, x->src_diff, 256);
-#endif
-
-    xd->above_context = &t_above[TX_4X4][x_idx];
-    xd->left_context = &t_left[TX_4X4][y_idx];
-    macro_block_yrd_4x4(x, &r_tmp, &d_tmp, &s_tmp, 0);
-    d[TX_4X4] += d_tmp;
-    r[TX_4X4][0] += r_tmp;
-    s[TX_4X4] = s[TX_4X4] && s_tmp;
-#if DEBUG_ERROR
-    vp9_inverse_transform_mby_4x4(xd);
-    err[0] += vp9_block_error_c(xd->diff, x->src_diff, 256);
-#endif
-
-    xd->above_context = &t_above[TX_8X8][x_idx];
-    xd->left_context = &t_left[TX_8X8][y_idx];
-    macro_block_yrd_8x8(x, &r_tmp, &d_tmp, &s_tmp, 0);
-    d[TX_8X8] += d_tmp;
-    r[TX_8X8][0] += r_tmp;
-    s[TX_8X8] = s[TX_8X8] && s_tmp;
-#if DEBUG_ERROR
-    vp9_inverse_transform_mby_8x8(xd);
-    err[1] += vp9_block_error_c(xd->diff, x->src_diff, 256);
-#endif
-  }
-#if DEBUG_ERROR
-  printf("IDCT/FDCT error 16x16: %d (d: %d)\n", err[2], d[2]);
-  printf("IDCT/FDCT error 8x8: %d (d: %d)\n", err[1], d[1]);
-  printf("IDCT/FDCT error 4x4: %d (d: %d)\n", err[0], d[0]);
-#endif
   choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skip, txfm_cache,
                            TX_SIZE_MAX_SB - 1);
+}
 
-  xd->above_context = orig_above;
-  xd->left_context = orig_left;
+static int rdcost_sb64y_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {
+  int cost = 0, b;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
+  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;
+  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;
+
+  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
+  vpx_memcpy(&t_left,  xd->left_context,  sizeof(t_left));
+
+  for (b = 0; b < 256; b++)
+    cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC,
+                        ta + vp9_block2above_sb64[TX_4X4][b],
+                        tl + vp9_block2left_sb64[TX_4X4][b], TX_4X4);
+
+  return cost;
+}
+
+static void super_block64_yrd_4x4(VP9_COMMON *const cm, MACROBLOCK *x,
+                                  int *rate, int *distortion, int *skippable) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  xd->mode_info_context->mbmi.txfm_size = TX_4X4;
+  vp9_transform_sb64y_4x4(x);
+  vp9_quantize_sb64y_4x4(x);
+
+  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 4096, 2);
+  *rate       = rdcost_sb64y_4x4(cm, x);
+  *skippable  = vp9_sb64y_is_skippable_4x4(xd);
+}
+
+static int rdcost_sb64y_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {
+  int cost = 0, b;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
+  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;
+  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;
+
+  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
+  vpx_memcpy(&t_left,  xd->left_context,  sizeof(t_left));
+
+  for (b = 0; b < 256; b += 4)
+    cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC,
+                        ta + vp9_block2above_sb64[TX_8X8][b],
+                        tl + vp9_block2left_sb64[TX_8X8][b], TX_8X8);
+
+  return cost;
+}
+
+static void super_block64_yrd_8x8(VP9_COMMON *const cm, MACROBLOCK *x,
+                                  int *rate, int *distortion, int *skippable) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  xd->mode_info_context->mbmi.txfm_size = TX_8X8;
+  vp9_transform_sb64y_8x8(x);
+  vp9_quantize_sb64y_8x8(x);
+
+  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 4096, 2);
+  *rate       = rdcost_sb64y_8x8(cm, x);
+  *skippable  = vp9_sb64y_is_skippable_8x8(xd);
+}
+
+static int rdcost_sb64y_16x16(VP9_COMMON *const cm, MACROBLOCK *x) {
+  int cost = 0, b;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
+  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;
+  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;
+
+  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
+  vpx_memcpy(&t_left,  xd->left_context,  sizeof(t_left));
+
+  for (b = 0; b < 256; b += 16)
+    cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC,
+                        ta + vp9_block2above_sb64[TX_16X16][b],
+                        tl + vp9_block2left_sb64[TX_16X16][b], TX_16X16);
+
+  return cost;
+}
+
+static void super_block64_yrd_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
+                                    int *rate, int *distortion,
+                                    int *skippable) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  xd->mode_info_context->mbmi.txfm_size = TX_16X16;
+  vp9_transform_sb64y_16x16(x);
+  vp9_quantize_sb64y_16x16(x);
+
+  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 4096, 2);
+  *rate       = rdcost_sb64y_16x16(cm, x);
+  *skippable  = vp9_sb64y_is_skippable_16x16(xd);
+}
+
+static int rdcost_sb64y_32x32(VP9_COMMON *const cm, MACROBLOCK *x) {
+  int cost = 0, b;
+  MACROBLOCKD * const xd = &x->e_mbd;
+  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
+  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;
+  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;
+
+  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
+  vpx_memcpy(&t_left,  xd->left_context,  sizeof(t_left));
+
+  for (b = 0; b < 256; b += 64)
+    cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC,
+                        ta + vp9_block2above_sb64[TX_32X32][b],
+                        tl + vp9_block2left_sb64[TX_32X32][b], TX_32X32);
+
+  return cost;
+}
+
+static void super_block64_yrd_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
+                                    int *rate, int *distortion,
+                                    int *skippable) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  xd->mode_info_context->mbmi.txfm_size = TX_32X32;
+  vp9_transform_sb64y_32x32(x);
+  vp9_quantize_sb64y_32x32(x);
+
+  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 4096, 0);
+  *rate       = rdcost_sb64y_32x32(cm, x);
+  *skippable  = vp9_sb64y_is_skippable_32x32(xd);
 }
 
 static void super_block_64_yrd(VP9_COMP *cpi,
                                MACROBLOCK *x, int *rate, int *distortion,
                                int *skip,
                                int64_t txfm_cache[NB_TXFM_MODES]) {
+  VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
-  int r[TX_SIZE_MAX_SB][2], d[TX_SIZE_MAX_SB], s[TX_SIZE_MAX_SB], n;
+  int r[TX_SIZE_MAX_SB][2], d[TX_SIZE_MAX_SB], s[TX_SIZE_MAX_SB];
   const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer;
   int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
-  ENTROPY_CONTEXT_PLANES t_above[TX_SIZE_MAX_SB][4],
-                        *orig_above = xd->above_context;
-  ENTROPY_CONTEXT_PLANES t_left[TX_SIZE_MAX_SB][4],
-                        *orig_left = xd->left_context;
 
-  for (n = TX_4X4; n < TX_SIZE_MAX_SB; n++) {
-    vpx_memcpy(t_above[n], xd->above_context, sizeof(t_above[n]));
-    vpx_memcpy(t_left[n], xd->left_context, sizeof(t_left[n]));
-    r[n][0] = 0;
-    d[n] = 0;
-    s[n] = 1;
-  }
+  vp9_subtract_sb64y_s_c(x->src_diff, src, src_y_stride, dst, dst_y_stride);
+  super_block64_yrd_32x32(cm, x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]);
+  super_block64_yrd_16x16(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16]);
+  super_block64_yrd_8x8(cm, x,   &r[TX_8X8][0],   &d[TX_8X8],   &s[TX_8X8]);
+  super_block64_yrd_4x4(cm, x,   &r[TX_4X4][0],   &d[TX_4X4],   &s[TX_4X4]);
 
-  for (n = 0; n < 4; n++) {
-    int x_idx = n & 1, y_idx = n >> 1;
-    int r_tmp, d_tmp, s_tmp;
-
-    xd->above_context = &t_above[TX_32X32][x_idx << 1];
-    xd->left_context = &t_left[TX_32X32][y_idx << 1];
-    vp9_subtract_sby_s_c(x->src_diff,
-                         src + 32 * x_idx + 32 * y_idx * src_y_stride,
-                         src_y_stride,
-                         dst + 32 * x_idx + 32 * y_idx * dst_y_stride,
-                         dst_y_stride);
-    super_block_yrd_32x32(x, &r_tmp, &d_tmp, &s_tmp, 0);
-    r[TX_32X32][0] += r_tmp;
-    d[TX_32X32] += d_tmp;
-    s[TX_32X32] = s[TX_32X32] && s_tmp;
-  }
-
-#if DEBUG_ERROR
-  int err[3] = { 0, 0, 0 };
-#endif
-  for (n = 0; n < 16; n++) {
-    int x_idx = n & 3, y_idx = n >> 2;
-    int r_tmp, d_tmp, s_tmp;
-
-    vp9_subtract_mby_s_c(x->src_diff,
-                         src + x_idx * 16 + y_idx * 16 * src_y_stride,
-                         src_y_stride,
-                         dst + x_idx * 16 + y_idx * 16 * dst_y_stride,
-                         dst_y_stride);
-
-    xd->above_context = &t_above[TX_16X16][x_idx];
-    xd->left_context = &t_left[TX_16X16][y_idx];
-    macro_block_yrd_16x16(x, &r_tmp, &d_tmp, &s_tmp, 0);
-    d[TX_16X16] += d_tmp;
-    r[TX_16X16][0] += r_tmp;
-    s[TX_16X16] = s[TX_16X16] && s_tmp;
-#if DEBUG_ERROR
-    vp9_inverse_transform_mby_16x16(xd);
-    err[2] += vp9_block_error_c(xd->diff, x->src_diff, 256);
-#endif
-
-    xd->above_context = &t_above[TX_4X4][x_idx];
-    xd->left_context = &t_left[TX_4X4][y_idx];
-    macro_block_yrd_4x4(x, &r_tmp, &d_tmp, &s_tmp, 0);
-    d[TX_4X4] += d_tmp;
-    r[TX_4X4][0] += r_tmp;
-    s[TX_4X4] = s[TX_4X4] && s_tmp;
-#if DEBUG_ERROR
-    vp9_inverse_transform_mby_4x4(xd);
-    err[0] += vp9_block_error_c(xd->diff, x->src_diff, 256);
-#endif
-
-    xd->above_context = &t_above[TX_8X8][x_idx];
-    xd->left_context = &t_left[TX_8X8][y_idx];
-    macro_block_yrd_8x8(x, &r_tmp, &d_tmp, &s_tmp, 0);
-    d[TX_8X8] += d_tmp;
-    r[TX_8X8][0] += r_tmp;
-    s[TX_8X8] = s[TX_8X8] && s_tmp;
-#if DEBUG_ERROR
-    vp9_inverse_transform_mby_8x8(xd);
-    err[1] += vp9_block_error_c(xd->diff, x->src_diff, 256);
-#endif
-  }
-#if DEBUG_ERROR
-  printf("IDCT/FDCT error 16x16: %d (d: %d)\n", err[2], d[2]);
-  printf("IDCT/FDCT error 8x8: %d (d: %d)\n", err[1], d[1]);
-  printf("IDCT/FDCT error 4x4: %d (d: %d)\n", err[0], d[0]);
-#endif
   choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skip, txfm_cache,
                            TX_SIZE_MAX_SB - 1);
-
-  xd->above_context = orig_above;
-  xd->left_context = orig_left;
 }
 
 static void copy_predictor_8x8(uint8_t *dst, const uint8_t *predictor) {
@@ -1006,6 +1153,7 @@
   int64_t best_rd = INT64_MAX;
   int rate = 0;
   int distortion;
+  VP9_COMMON *const cm = &cpi->common;
 
   ENTROPY_CONTEXT ta = *a, tempa = *a;
   ENTROPY_CONTEXT tl = *l, templ = *l;
@@ -1022,6 +1170,7 @@
 #if CONFIG_NEWBINTRAMODES
   b->bmi.as_mode.context = vp9_find_bpred_context(b);
 #endif
+  xd->mode_info_context->mbmi.txfm_size = TX_4X4;
   for (mode = B_DC_PRED; mode < LEFT4X4; mode++) {
     int64_t this_rd;
     int ratey;
@@ -1048,7 +1197,7 @@
     vp9_subtract_b(be, b, 16);
 
     b->bmi.as_mode.first = mode;
-    tx_type = get_tx_type_4x4(xd, b);
+    tx_type = get_tx_type_4x4(xd, be - x->block);
     if (tx_type != DCT_DCT) {
       vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type);
       vp9_ht_quantize_b_4x4(x, be - x->block, tx_type);
@@ -1060,7 +1209,7 @@
     tempa = ta;
     templ = tl;
 
-    ratey = cost_coeffs(x, b - xd->block,
+    ratey = cost_coeffs(cm, x, b - xd->block,
                         PLANE_TYPE_Y_WITH_DC, &tempa, &templ, TX_4X4);
     rate += ratey;
     distortion = vp9_block_error(be->coeff, b->dqcoeff, 16) >> 2;
@@ -1311,6 +1460,7 @@
                                      ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
                                      int *bestrate, int *bestratey,
                                      int *bestdistortion) {
+  VP9_COMMON *const cm = &cpi->common;
   MB_PREDICTION_MODE mode;
   MACROBLOCKD *xd = &x->e_mbd;
   int64_t best_rd = INT64_MAX;
@@ -1346,7 +1496,7 @@
     vp9_subtract_4b_c(be, b, 16);
 
     if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
-      TX_TYPE tx_type = get_tx_type_8x8(xd, b);
+      TX_TYPE tx_type = get_tx_type_8x8(xd, ib);
       if (tx_type != DCT_DCT)
         vp9_short_fht8x8(be->src_diff, (x->block + idx)->coeff, 16, tx_type);
       else
@@ -1365,7 +1515,7 @@
       ta1 = ta0 + 1;
       tl1 = tl0 + 1;
 
-      rate_t = cost_coeffs(x, idx, PLANE_TYPE_Y_WITH_DC,
+      rate_t = cost_coeffs(cm, x, idx, PLANE_TYPE_Y_WITH_DC,
                            ta0, tl0, TX_8X8);
 
       rate += rate_t;
@@ -1385,11 +1535,12 @@
         int do_two = 0;
         b = &xd->block[ib + iblock[i]];
         be = &x->block[ib + iblock[i]];
-        tx_type = get_tx_type_4x4(xd, b);
+        tx_type = get_tx_type_4x4(xd, ib + iblock[i]);
         if (tx_type != DCT_DCT) {
           vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type);
           vp9_ht_quantize_b_4x4(x, ib + iblock[i], tx_type);
-        } else if (!(i & 1) && get_tx_type_4x4(xd, b + 1) == DCT_DCT) {
+        } else if (!(i & 1) &&
+                   get_tx_type_4x4(xd, ib + iblock[i] + 1) == DCT_DCT) {
           x->fwd_txm8x4(be->src_diff, be->coeff, 32);
           x->quantize_b_4x4_pair(x, ib + iblock[i], ib + iblock[i] + 1);
           do_two = 1;
@@ -1398,12 +1549,12 @@
           x->quantize_b_4x4(x, ib + iblock[i]);
         }
         distortion += vp9_block_error_c(be->coeff, b->dqcoeff, 16 << do_two);
-        rate_t += cost_coeffs(x, ib + iblock[i], PLANE_TYPE_Y_WITH_DC,
+        rate_t += cost_coeffs(cm, x, ib + iblock[i], PLANE_TYPE_Y_WITH_DC,
                               i&1 ? ta1 : ta0, i&2 ? tl1 : tl0,
                               TX_4X4);
         if (do_two) {
           i++;
-          rate_t += cost_coeffs(x, ib + iblock[i], PLANE_TYPE_Y_WITH_DC,
+          rate_t += cost_coeffs(cm, x, ib + iblock[i], PLANE_TYPE_Y_WITH_DC,
                                 i&1 ? ta1 : ta0, i&2 ? tl1 : tl0,
                                 TX_4X4);
         }
@@ -1491,7 +1642,80 @@
   return RDCOST(mb->rdmult, mb->rddiv, cost, distortion);
 }
 
-static int rd_cost_mbuv_4x4(MACROBLOCK *mb, int backup) {
+static int64_t rd_pick_intra8x8mby_modes_and_txsz(VP9_COMP *cpi, MACROBLOCK *x,
+                                                  int *rate, int *rate_y,
+                                                  int *distortion,
+                                                  int *mode8x8,
+                                                  int64_t best_yrd,
+                                                  int64_t *txfm_cache) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+  int cost0 = vp9_cost_bit(cm->prob_tx[0], 0);
+  int cost1 = vp9_cost_bit(cm->prob_tx[0], 1);
+  int64_t tmp_rd_4x4s, tmp_rd_8x8s;
+  int64_t tmp_rd_4x4, tmp_rd_8x8, tmp_rd;
+  int r4x4, tok4x4, d4x4, r8x8, tok8x8, d8x8;
+
+  mbmi->txfm_size = TX_4X4;
+  tmp_rd_4x4 = rd_pick_intra8x8mby_modes(cpi, x, &r4x4, &tok4x4,
+                                         &d4x4, best_yrd);
+  mode8x8[0] = xd->mode_info_context->bmi[0].as_mode.first;
+  mode8x8[1] = xd->mode_info_context->bmi[2].as_mode.first;
+  mode8x8[2] = xd->mode_info_context->bmi[8].as_mode.first;
+  mode8x8[3] = xd->mode_info_context->bmi[10].as_mode.first;
+  mbmi->txfm_size = TX_8X8;
+  tmp_rd_8x8 = rd_pick_intra8x8mby_modes(cpi, x, &r8x8, &tok8x8,
+                                         &d8x8, best_yrd);
+  txfm_cache[ONLY_4X4]  = tmp_rd_4x4;
+  txfm_cache[ALLOW_8X8] = tmp_rd_8x8;
+  txfm_cache[ALLOW_16X16] = tmp_rd_8x8;
+  tmp_rd_4x4s = tmp_rd_4x4 + RDCOST(x->rdmult, x->rddiv, cost0, 0);
+  tmp_rd_8x8s = tmp_rd_8x8 + RDCOST(x->rdmult, x->rddiv, cost1, 0);
+  txfm_cache[TX_MODE_SELECT] = tmp_rd_4x4s < tmp_rd_8x8s ?
+                               tmp_rd_4x4s : tmp_rd_8x8s;
+  if (cm->txfm_mode == TX_MODE_SELECT) {
+    if (tmp_rd_4x4s < tmp_rd_8x8s) {
+      *rate = r4x4 + cost0;
+      *rate_y = tok4x4 + cost0;
+      *distortion = d4x4;
+      mbmi->txfm_size = TX_4X4;
+      tmp_rd = tmp_rd_4x4s;
+    } else {
+      *rate = r8x8 + cost1;
+      *rate_y = tok8x8 + cost1;
+      *distortion = d8x8;
+      mbmi->txfm_size = TX_8X8;
+      tmp_rd = tmp_rd_8x8s;
+
+      mode8x8[0] = xd->mode_info_context->bmi[0].as_mode.first;
+      mode8x8[1] = xd->mode_info_context->bmi[2].as_mode.first;
+      mode8x8[2] = xd->mode_info_context->bmi[8].as_mode.first;
+      mode8x8[3] = xd->mode_info_context->bmi[10].as_mode.first;
+    }
+  } else if (cm->txfm_mode == ONLY_4X4) {
+    *rate = r4x4;
+    *rate_y = tok4x4;
+    *distortion = d4x4;
+    mbmi->txfm_size = TX_4X4;
+    tmp_rd = tmp_rd_4x4;
+  } else {
+    *rate = r8x8;
+    *rate_y = tok8x8;
+    *distortion = d8x8;
+    mbmi->txfm_size = TX_8X8;
+    tmp_rd = tmp_rd_8x8;
+
+    mode8x8[0] = xd->mode_info_context->bmi[0].as_mode.first;
+    mode8x8[1] = xd->mode_info_context->bmi[2].as_mode.first;
+    mode8x8[2] = xd->mode_info_context->bmi[8].as_mode.first;
+    mode8x8[3] = xd->mode_info_context->bmi[10].as_mode.first;
+  }
+
+  return tmp_rd;
+}
+
+static int rd_cost_mbuv_4x4(VP9_COMMON *const cm, MACROBLOCK *mb, int backup) {
   int b;
   int cost = 0;
   MACROBLOCKD *xd = &mb->e_mbd;
@@ -1510,7 +1734,7 @@
   }
 
   for (b = 16; b < 24; b++)
-    cost += cost_coeffs(mb, b, PLANE_TYPE_UV,
+    cost += cost_coeffs(cm, mb, b, PLANE_TYPE_UV,
                         ta + vp9_block2above[TX_4X4][b],
                         tl + vp9_block2left[TX_4X4][b],
                         TX_4X4);
@@ -1525,14 +1749,14 @@
   vp9_transform_mbuv_4x4(x);
   vp9_quantize_mbuv_4x4(x);
 
-  *rate       = rd_cost_mbuv_4x4(x, do_ctx_backup);
+  *rate       = rd_cost_mbuv_4x4(&cpi->common, x, do_ctx_backup);
   *distortion = vp9_mbuverror(x) / 4;
   *skip       = vp9_mbuv_is_skippable_4x4(&x->e_mbd);
 
   return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
 }
 
-static int rd_cost_mbuv_8x8(MACROBLOCK *mb, int backup) {
+static int rd_cost_mbuv_8x8(VP9_COMMON *const cm, MACROBLOCK *mb, int backup) {
   int b;
   int cost = 0;
   MACROBLOCKD *xd = &mb->e_mbd;
@@ -1551,7 +1775,7 @@
   }
 
   for (b = 16; b < 24; b += 4)
-    cost += cost_coeffs(mb, b, PLANE_TYPE_UV,
+    cost += cost_coeffs(cm, mb, b, PLANE_TYPE_UV,
                         ta + vp9_block2above[TX_8X8][b],
                         tl + vp9_block2left[TX_8X8][b], TX_8X8);
 
@@ -1564,14 +1788,14 @@
   vp9_transform_mbuv_8x8(x);
   vp9_quantize_mbuv_8x8(x);
 
-  *rate       = rd_cost_mbuv_8x8(x, do_ctx_backup);
+  *rate       = rd_cost_mbuv_8x8(&cpi->common, x, do_ctx_backup);
   *distortion = vp9_mbuverror(x) / 4;
   *skip       = vp9_mbuv_is_skippable_8x8(&x->e_mbd);
 
   return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
 }
 
-static int rd_cost_sbuv_16x16(MACROBLOCK *x, int backup) {
+static int rd_cost_sbuv_16x16(VP9_COMMON *const cm, MACROBLOCK *x, int backup) {
   int b;
   int cost = 0;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -1590,24 +1814,24 @@
   }
 
   for (b = 16; b < 24; b += 4)
-    cost += cost_coeffs(x, b * 4, PLANE_TYPE_UV,
+    cost += cost_coeffs(cm, x, b * 4, PLANE_TYPE_UV,
                         ta + vp9_block2above[TX_8X8][b],
                         tl + vp9_block2left[TX_8X8][b], TX_16X16);
 
   return cost;
 }
 
-static void rd_inter32x32_uv_16x16(MACROBLOCK *x, int *rate,
-                                   int *distortion, int *skip,
+static void rd_inter32x32_uv_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
+                                   int *rate, int *distortion, int *skip,
                                    int backup) {
   MACROBLOCKD *const xd = &x->e_mbd;
 
   vp9_transform_sbuv_16x16(x);
   vp9_quantize_sbuv_16x16(x);
 
-  *rate       = rd_cost_sbuv_16x16(x, backup);
-  *distortion = vp9_block_error_c(x->coeff + 1024,
-                                  xd->dqcoeff + 1024, 512) >> 2;
+  *rate       = rd_cost_sbuv_16x16(cm, x, backup);
+  *distortion = vp9_sb_block_error_c(x->coeff + 1024,
+                                     xd->dqcoeff + 1024, 512, 2);
   *skip       = vp9_sbuv_is_skippable_16x16(xd);
 }
 
@@ -1623,7 +1847,7 @@
     vp9_subtract_sbuv_s_c(x->src_diff,
                           usrc, vsrc, src_uv_stride,
                           udst, vdst, dst_uv_stride);
-    rd_inter32x32_uv_16x16(x, rate, distortion, skip, 1);
+    rd_inter32x32_uv_16x16(&cpi->common, x, rate, distortion, skip, 1);
   } else {
     int n, r = 0, d = 0;
     int skippable = 1;
@@ -1671,23 +1895,14 @@
   return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
 }
 
-static void super_block_64_uvrd(MACROBLOCK *x, int *rate,
+static void super_block_64_uvrd(VP9_COMMON *const cm, MACROBLOCK *x, int *rate,
                                 int *distortion, int *skip);
 static int64_t rd_inter64x64_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
                                 int *distortion, int fullpixel, int *skip) {
-  super_block_64_uvrd(x, rate, distortion, skip);
+  super_block_64_uvrd(&cpi->common, x, rate, distortion, skip);
   return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
 }
 
-static int64_t rd_inter4x4_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
-                              int *distortion, int *skip, int fullpixel,
-                              int mb_row, int mb_col) {
-  vp9_build_inter4x4_predictors_mbuv(&x->e_mbd, mb_row, mb_col);
-  vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
-                    x->e_mbd.predictor, x->src.uv_stride);
-  return rd_inter16x16_uv_4x4(cpi, x, rate, distortion, fullpixel, skip, 1);
-}
-
 static void rd_pick_intra_mbuv_mode(VP9_COMP *cpi,
                                     MACROBLOCK *x,
                                     int *rate,
@@ -1702,6 +1917,7 @@
   int UNINITIALIZED_IS_SAFE(d), UNINITIALIZED_IS_SAFE(r);
   int rate_to, UNINITIALIZED_IS_SAFE(skip);
 
+  xd->mode_info_context->mbmi.txfm_size = TX_4X4;
   for (mode = DC_PRED; mode <= TM_PRED; mode++) {
     int rate;
     int distortion;
@@ -1715,7 +1931,7 @@
     vp9_transform_mbuv_4x4(x);
     vp9_quantize_mbuv_4x4(x);
 
-    rate_to = rd_cost_mbuv_4x4(x, 1);
+    rate_to = rd_cost_mbuv_4x4(&cpi->common, x, 1);
     rate = rate_to
            + x->intra_uv_mode_cost[x->e_mbd.frame_type][mbmi->uv_mode];
 
@@ -1754,6 +1970,7 @@
   int UNINITIALIZED_IS_SAFE(d), UNINITIALIZED_IS_SAFE(r);
   int rate_to, UNINITIALIZED_IS_SAFE(skip);
 
+  xd->mode_info_context->mbmi.txfm_size = TX_8X8;
   for (mode = DC_PRED; mode <= TM_PRED; mode++) {
     int rate;
     int distortion;
@@ -1767,7 +1984,7 @@
 
     vp9_quantize_mbuv_8x8(x);
 
-    rate_to = rd_cost_mbuv_8x8(x, 1);
+    rate_to = rd_cost_mbuv_8x8(&cpi->common, x, 1);
     rate = rate_to + x->intra_uv_mode_cost[x->e_mbd.frame_type][mbmi->uv_mode];
 
     distortion = vp9_mbuverror(x) / 4;
@@ -1789,7 +2006,8 @@
 }
 
 // TODO(rbultje) very similar to rd_inter32x32_uv(), merge?
-static void super_block_uvrd(MACROBLOCK *x,
+static void super_block_uvrd(VP9_COMMON *const cm,
+                             MACROBLOCK *x,
                              int *rate,
                              int *distortion,
                              int *skippable) {
@@ -1803,7 +2021,7 @@
     vp9_subtract_sbuv_s_c(x->src_diff,
                           usrc, vsrc, src_uv_stride,
                           udst, vdst, dst_uv_stride);
-    rd_inter32x32_uv_16x16(x, rate, distortion, skippable, 1);
+    rd_inter32x32_uv_16x16(cm, x, rate, distortion, skippable, 1);
   } else {
     int d = 0, r = 0, n, s = 1;
     ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
@@ -1837,9 +2055,9 @@
       xd->above_context = t_above + x_idx;
       xd->left_context = t_left + y_idx;
       if (mbmi->txfm_size == TX_4X4) {
-        r += rd_cost_mbuv_4x4(x, 0);
+        r += rd_cost_mbuv_4x4(cm, x, 0);
       } else {
-        r += rd_cost_mbuv_8x8(x, 0);
+        r += rd_cost_mbuv_8x8(cm, x, 0);
       }
     }
 
@@ -1852,7 +2070,8 @@
   }
 }
 
-static int rd_cost_sb64uv_32x32(MACROBLOCK *x, int backup) {
+static int rd_cost_sb64uv_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
+                                int backup) {
   int b;
   int cost = 0;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -1871,28 +2090,28 @@
   }
 
   for (b = 16; b < 24; b += 4)
-    cost += cost_coeffs(x, b * 16, PLANE_TYPE_UV,
+    cost += cost_coeffs(cm, x, b * 16, PLANE_TYPE_UV,
                         ta + vp9_block2above[TX_8X8][b],
                         tl + vp9_block2left[TX_8X8][b], TX_32X32);
 
   return cost;
 }
 
-static void rd_inter64x64_uv_32x32(MACROBLOCK *x, int *rate,
-                                   int *distortion, int *skip,
+static void rd_inter64x64_uv_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
+                                   int *rate, int *distortion, int *skip,
                                    int backup) {
   MACROBLOCKD *const xd = &x->e_mbd;
 
   vp9_transform_sb64uv_32x32(x);
   vp9_quantize_sb64uv_32x32(x);
 
-  *rate       = rd_cost_sb64uv_32x32(x, backup);
-  *distortion = vp9_block_error_c(x->coeff + 4096,
-                                  xd->dqcoeff + 4096, 2048);
+  *rate       = rd_cost_sb64uv_32x32(cm, x, backup);
+  *distortion = vp9_sb_block_error_c(x->coeff + 4096,
+                                     xd->dqcoeff + 4096, 2048, 0);
   *skip       = vp9_sb64uv_is_skippable_32x32(xd);
 }
 
-static void super_block_64_uvrd(MACROBLOCK *x,
+static void super_block_64_uvrd(VP9_COMMON *const cm, MACROBLOCK *x,
                                 int *rate,
                                 int *distortion,
                                 int *skippable) {
@@ -1913,7 +2132,7 @@
   if (mbmi->txfm_size == TX_32X32) {
     vp9_subtract_sb64uv_s_c(x->src_diff, usrc, vsrc, src_uv_stride,
                             udst, vdst, dst_uv_stride);
-    rd_inter64x64_uv_32x32(x, &r, &d, &s, 1);
+    rd_inter64x64_uv_32x32(cm, x, &r, &d, &s, 1);
   } else if (mbmi->txfm_size == TX_16X16) {
     int n;
 
@@ -1931,7 +2150,7 @@
                             dst_uv_stride);
       xd->above_context = t_above + x_idx * 2;
       xd->left_context = t_left + y_idx * 2;
-      rd_inter32x32_uv_16x16(x, &r_tmp, &d_tmp, &s_tmp, 0);
+      rd_inter32x32_uv_16x16(cm, x, &r_tmp, &d_tmp, &s_tmp, 0);
       r += r_tmp;
       d += d_tmp;
       s = s && s_tmp;
@@ -1961,9 +2180,9 @@
       xd->left_context = t_left + y_idx;
       d += vp9_mbuverror(x) >> 2;
       if (mbmi->txfm_size == TX_4X4) {
-        r += rd_cost_mbuv_4x4(x, 0);
+        r += rd_cost_mbuv_4x4(cm, x, 0);
       } else {
-        r += rd_cost_mbuv_8x8(x, 0);
+        r += rd_cost_mbuv_8x8(cm, x, 0);
       }
     }
   }
@@ -1992,7 +2211,7 @@
     x->e_mbd.mode_info_context->mbmi.uv_mode = mode;
     vp9_build_intra_predictors_sbuv_s(&x->e_mbd);
 
-    super_block_uvrd(x, &this_rate_tokenonly,
+    super_block_uvrd(&cpi->common, x, &this_rate_tokenonly,
                      &this_distortion, &s);
     this_rate = this_rate_tokenonly +
                 x->intra_uv_mode_cost[x->e_mbd.frame_type][mode];
@@ -2029,7 +2248,7 @@
     x->e_mbd.mode_info_context->mbmi.uv_mode = mode;
     vp9_build_intra_predictors_sb64uv_s(&x->e_mbd);
 
-    super_block_64_uvrd(x, &this_rate_tokenonly,
+    super_block_64_uvrd(&cpi->common, x, &this_rate_tokenonly,
                         &this_distortion, &s);
     this_rate = this_rate_tokenonly +
     x->intra_uv_mode_cost[x->e_mbd.frame_type][mode];
@@ -2186,7 +2405,8 @@
   return cost;
 }
 
-static int64_t encode_inter_mb_segment(MACROBLOCK *x,
+static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,
+                                       MACROBLOCK *x,
                                        int const *labels,
                                        int which_label,
                                        int *labelyrate,
@@ -2225,7 +2445,7 @@
       x->quantize_b_4x4(x, i);
       thisdistortion = vp9_block_error(be->coeff, bd->dqcoeff, 16);
       *distortion += thisdistortion;
-      *labelyrate += cost_coeffs(x, i, PLANE_TYPE_Y_WITH_DC,
+      *labelyrate += cost_coeffs(cm, x, i, PLANE_TYPE_Y_WITH_DC,
                                  ta + vp9_block2above[TX_4X4][i],
                                  tl + vp9_block2left[TX_4X4][i], TX_4X4);
     }
@@ -2234,7 +2454,8 @@
   return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
 }
 
-static int64_t encode_inter_mb_segment_8x8(MACROBLOCK *x,
+static int64_t encode_inter_mb_segment_8x8(VP9_COMMON *const cm,
+                                           MACROBLOCK *x,
                                            int const *labels,
                                            int which_label,
                                            int *labelyrate,
@@ -2288,10 +2509,12 @@
           x->quantize_b_8x8(x, idx);
           thisdistortion = vp9_block_error_c(be2->coeff, bd2->dqcoeff, 64);
           otherdist += thisdistortion;
-          othercost += cost_coeffs(x, idx, PLANE_TYPE_Y_WITH_DC,
+          xd->mode_info_context->mbmi.txfm_size = TX_8X8;
+          othercost += cost_coeffs(cm, x, idx, PLANE_TYPE_Y_WITH_DC,
                                    tacp + vp9_block2above[TX_8X8][idx],
                                    tlcp + vp9_block2left[TX_8X8][idx],
                                    TX_8X8);
+          xd->mode_info_context->mbmi.txfm_size = TX_4X4;
         }
         for (j = 0; j < 4; j += 2) {
           bd = &xd->block[ib + iblock[j]];
@@ -2300,15 +2523,17 @@
           x->quantize_b_4x4_pair(x, ib + iblock[j], ib + iblock[j] + 1);
           thisdistortion = vp9_block_error_c(be->coeff, bd->dqcoeff, 32);
           *distortion += thisdistortion;
-          *labelyrate += cost_coeffs(x, ib + iblock[j], PLANE_TYPE_Y_WITH_DC,
-                           ta + vp9_block2above[TX_4X4][ib + iblock[j]],
-                           tl + vp9_block2left[TX_4X4][ib + iblock[j]],
-                           TX_4X4);
-          *labelyrate += cost_coeffs(x, ib + iblock[j] + 1,
-                           PLANE_TYPE_Y_WITH_DC,
-                           ta + vp9_block2above[TX_4X4][ib + iblock[j] + 1],
-                           tl + vp9_block2left[TX_4X4][ib + iblock[j]],
-                           TX_4X4);
+          *labelyrate +=
+              cost_coeffs(cm, x, ib + iblock[j], PLANE_TYPE_Y_WITH_DC,
+                          ta + vp9_block2above[TX_4X4][ib + iblock[j]],
+                          tl + vp9_block2left[TX_4X4][ib + iblock[j]],
+                          TX_4X4);
+          *labelyrate +=
+              cost_coeffs(cm, x, ib + iblock[j] + 1,
+                          PLANE_TYPE_Y_WITH_DC,
+                          ta + vp9_block2above[TX_4X4][ib + iblock[j] + 1],
+                          tl + vp9_block2left[TX_4X4][ib + iblock[j]],
+                          TX_4X4);
         }
       } else /* 8x8 */ {
         if (otherrd) {
@@ -2319,22 +2544,26 @@
             x->quantize_b_4x4_pair(x, ib + iblock[j], ib + iblock[j]);
             thisdistortion = vp9_block_error_c(be->coeff, bd->dqcoeff, 32);
             otherdist += thisdistortion;
-            othercost += cost_coeffs(x, ib + iblock[j], PLANE_TYPE_Y_WITH_DC,
-                           tacp + vp9_block2above[TX_4X4][ib + iblock[j]],
-                           tlcp + vp9_block2left[TX_4X4][ib + iblock[j]],
-                           TX_4X4);
-            othercost += cost_coeffs(x, ib + iblock[j] + 1,
-                           PLANE_TYPE_Y_WITH_DC,
-                           tacp + vp9_block2above[TX_4X4][ib + iblock[j] + 1],
-                           tlcp + vp9_block2left[TX_4X4][ib + iblock[j]],
-                           TX_4X4);
+            xd->mode_info_context->mbmi.txfm_size = TX_4X4;
+            othercost +=
+                cost_coeffs(cm, x, ib + iblock[j], PLANE_TYPE_Y_WITH_DC,
+                            tacp + vp9_block2above[TX_4X4][ib + iblock[j]],
+                            tlcp + vp9_block2left[TX_4X4][ib + iblock[j]],
+                            TX_4X4);
+            othercost +=
+                cost_coeffs(cm, x, ib + iblock[j] + 1,
+                            PLANE_TYPE_Y_WITH_DC,
+                            tacp + vp9_block2above[TX_4X4][ib + iblock[j] + 1],
+                            tlcp + vp9_block2left[TX_4X4][ib + iblock[j]],
+                            TX_4X4);
+            xd->mode_info_context->mbmi.txfm_size = TX_8X8;
           }
         }
         x->fwd_txm8x8(be->src_diff, be2->coeff, 32);
         x->quantize_b_8x8(x, idx);
         thisdistortion = vp9_block_error_c(be2->coeff, bd2->dqcoeff, 64);
         *distortion += thisdistortion;
-        *labelyrate += cost_coeffs(x, idx, PLANE_TYPE_Y_WITH_DC,
+        *labelyrate += cost_coeffs(cm, x, idx, PLANE_TYPE_Y_WITH_DC,
                                    ta + vp9_block2above[TX_8X8][idx],
                                    tl + vp9_block2left[TX_8X8][idx], TX_8X8);
       }
@@ -2574,11 +2803,13 @@
         continue;
 
       if (segmentation == PARTITIONING_4X4) {
-        this_rd = encode_inter_mb_segment(x, labels, i, &labelyrate,
+        this_rd = encode_inter_mb_segment(&cpi->common,
+                                          x, labels, i, &labelyrate,
                                           &distortion, ta_s, tl_s);
         other_rd = this_rd;
       } else {
-        this_rd = encode_inter_mb_segment_8x8(x, labels, i, &labelyrate,
+        this_rd = encode_inter_mb_segment_8x8(&cpi->common,
+                                              x, labels, i, &labelyrate,
                                               &distortion, &other_rd,
                                               ta_s, tl_s);
       }
@@ -3146,7 +3377,9 @@
   // UV cost and distortion
   vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
                     x->e_mbd.predictor, x->src.uv_stride);
-  if (x->e_mbd.mode_info_context->mbmi.txfm_size != TX_4X4)
+  if (x->e_mbd.mode_info_context->mbmi.txfm_size != TX_4X4 &&
+      x->e_mbd.mode_info_context->mbmi.mode != I8X8_PRED &&
+      x->e_mbd.mode_info_context->mbmi.mode != SPLITMV)
     rd_inter16x16_uv_8x8(cpi, x, rate_uv, distortion_uv,
                          cpi->common.full_pixel, &uv_skippable, 1);
   else
@@ -3344,6 +3577,8 @@
 
         vp9_clamp_mv_min_max(x, &ref_mv[0]);
 
+        sr = vp9_init_search_range(cpi->common.Width, cpi->common.Height);
+
         // mvp_full.as_int = ref_mv[0].as_int;
         mvp_full.as_int =
          mbmi->ref_mvs[refs[0]][x->mv_best_ref_index[refs[0]]].as_int;
@@ -3933,7 +4168,10 @@
 #if CONFIG_COMP_INTERINTRA_PRED
   int is_best_interintra = 0;
   int64_t best_intra16_rd = INT64_MAX;
-  int best_intra16_mode = DC_PRED, best_intra16_uv_mode = DC_PRED;
+  int best_intra16_mode = DC_PRED;
+#if SEPARATE_INTERINTRA_UV
+  int best_intra16_uv_mode = DC_PRED;
+#endif
 #endif
   int64_t best_overall_rd = INT64_MAX;
   INTERPOLATIONFILTERTYPE best_filter = SWITCHABLE;
@@ -4015,6 +4253,8 @@
   cpi->zbin_mode_boost = 0;
   vp9_update_zbin_extra(cpi, x);
 
+  xd->mode_info_context->mbmi.mode = DC_PRED;
+
   rd_pick_intra_mbuv_mode(cpi, x, &uv_intra_rate,
                           &uv_intra_rate_tokenonly, &uv_intra_distortion,
                           &uv_intra_skippable);
@@ -4231,65 +4471,11 @@
         }
         break;
         case I8X8_PRED: {
-          int cost0 = vp9_cost_bit(cm->prob_tx[0], 0);
-          int cost1 = vp9_cost_bit(cm->prob_tx[0], 1);
-          int64_t tmp_rd_4x4s, tmp_rd_8x8s;
-          int64_t tmp_rd_4x4, tmp_rd_8x8, tmp_rd;
-          int r4x4, tok4x4, d4x4, r8x8, tok8x8, d8x8;
-          mbmi->txfm_size = TX_4X4;
-          tmp_rd_4x4 = rd_pick_intra8x8mby_modes(cpi, x, &r4x4, &tok4x4,
-                                                 &d4x4, best_yrd);
-          mode8x8[0] = xd->mode_info_context->bmi[0].as_mode.first;
-          mode8x8[1] = xd->mode_info_context->bmi[2].as_mode.first;
-          mode8x8[2] = xd->mode_info_context->bmi[8].as_mode.first;
-          mode8x8[3] = xd->mode_info_context->bmi[10].as_mode.first;
-          mbmi->txfm_size = TX_8X8;
-          tmp_rd_8x8 = rd_pick_intra8x8mby_modes(cpi, x, &r8x8, &tok8x8,
-                                                 &d8x8, best_yrd);
-          txfm_cache[ONLY_4X4]  = tmp_rd_4x4;
-          txfm_cache[ALLOW_8X8] = tmp_rd_8x8;
-          txfm_cache[ALLOW_16X16] = tmp_rd_8x8;
-          tmp_rd_4x4s = tmp_rd_4x4 + RDCOST(x->rdmult, x->rddiv, cost0, 0);
-          tmp_rd_8x8s = tmp_rd_8x8 + RDCOST(x->rdmult, x->rddiv, cost1, 0);
-          txfm_cache[TX_MODE_SELECT] = tmp_rd_4x4s < tmp_rd_8x8s ? tmp_rd_4x4s : tmp_rd_8x8s;
-          if (cm->txfm_mode == TX_MODE_SELECT) {
-            if (tmp_rd_4x4s < tmp_rd_8x8s) {
-              rate = r4x4 + cost0;
-              rate_y = tok4x4 + cost0;
-              distortion = d4x4;
-              mbmi->txfm_size = TX_4X4;
-              tmp_rd = tmp_rd_4x4s;
-            } else {
-              rate = r8x8 + cost1;
-              rate_y = tok8x8 + cost1;
-              distortion = d8x8;
-              mbmi->txfm_size = TX_8X8;
-              tmp_rd = tmp_rd_8x8s;
+          int64_t tmp_rd;
 
-              mode8x8[0] = xd->mode_info_context->bmi[0].as_mode.first;
-              mode8x8[1] = xd->mode_info_context->bmi[2].as_mode.first;
-              mode8x8[2] = xd->mode_info_context->bmi[8].as_mode.first;
-              mode8x8[3] = xd->mode_info_context->bmi[10].as_mode.first;
-            }
-          } else if (cm->txfm_mode == ONLY_4X4) {
-            rate = r4x4;
-            rate_y = tok4x4;
-            distortion = d4x4;
-            mbmi->txfm_size = TX_4X4;
-            tmp_rd = tmp_rd_4x4;
-          } else {
-            rate = r8x8;
-            rate_y = tok8x8;
-            distortion = d8x8;
-            mbmi->txfm_size = TX_8X8;
-            tmp_rd = tmp_rd_8x8;
-
-            mode8x8[0] = xd->mode_info_context->bmi[0].as_mode.first;
-            mode8x8[1] = xd->mode_info_context->bmi[2].as_mode.first;
-            mode8x8[2] = xd->mode_info_context->bmi[8].as_mode.first;
-            mode8x8[3] = xd->mode_info_context->bmi[10].as_mode.first;
-          }
-
+          tmp_rd = rd_pick_intra8x8mby_modes_and_txsz(cpi, x, &rate, &rate_y,
+                                                      &distortion, mode8x8,
+                                                      best_yrd, txfm_cache);
           rate2 += rate;
           rate2 += intra_cost_penalty;
           distortion2 += distortion;
@@ -4330,6 +4516,7 @@
       this_rd_thresh =
           (mbmi->ref_frame == GOLDEN_FRAME) ?
           cpi->rd_threshes[THR_NEWG] : this_rd_thresh;
+      xd->mode_info_context->mbmi.txfm_size = TX_4X4;
 
       for (switchable_filter_index = 0;
            switchable_filter_index < VP9_SWITCHABLE_FILTERS;
@@ -4421,8 +4608,11 @@
       if (tmp_rd < best_yrd) {
         int uv_skippable;
 
-        rd_inter4x4_uv(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
-                       cpi->common.full_pixel, mb_row, mb_col);
+        vp9_build_inter4x4_predictors_mbuv(&x->e_mbd, mb_row, mb_col);
+        vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
+                          x->e_mbd.predictor, x->src.uv_stride);
+        rd_inter16x16_uv_4x4(cpi, x, &rate_uv, &distortion_uv,
+                             cpi->common.full_pixel, &uv_skippable, 1);
         rate2 += rate_uv;
         distortion2 += distortion_uv;
         skippable = skippable && uv_skippable;
@@ -4543,8 +4733,10 @@
         (this_rd < best_intra16_rd)) {
       best_intra16_rd = this_rd;
       best_intra16_mode = this_mode;
+#if SEPARATE_INTERINTRA_UV
       best_intra16_uv_mode = (mbmi->txfm_size != TX_4X4 ?
                               uv_intra_mode_8x8 : uv_intra_mode);
+#endif
     }
 #endif
 
@@ -4792,6 +4984,7 @@
   int64_t txfm_cache[NB_TXFM_MODES], err;
   int i;
 
+  xd->mode_info_context->mbmi.mode = DC_PRED;
   err = rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
                                &dist_y, &y_skip, txfm_cache);
   rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
@@ -4826,6 +5019,7 @@
   int64_t txfm_cache[NB_TXFM_MODES], err;
   int i;
 
+  xd->mode_info_context->mbmi.mode = DC_PRED;
   err = rd_pick_intra_sb64y_mode(cpi, x, &rate_y, &rate_y_tokenonly,
                                  &dist_y, &y_skip, txfm_cache);
   rd_pick_intra_sb64uv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
@@ -4866,77 +5060,96 @@
   int mode16x16;
   int mode8x8[4];
   int dist;
-  int modeuv, uv_intra_skippable, uv_intra_skippable_8x8;
+  int modeuv, modeuv8x8, uv_intra_skippable, uv_intra_skippable_8x8;
   int y_intra16x16_skippable = 0;
-  int64_t txfm_cache[NB_TXFM_MODES];
-  TX_SIZE txfm_size_16x16;
+  int64_t txfm_cache[2][NB_TXFM_MODES];
+  TX_SIZE txfm_size_16x16, txfm_size_8x8;
   int i;
 
   mbmi->ref_frame = INTRA_FRAME;
+  mbmi->mode = DC_PRED;
   rd_pick_intra_mbuv_mode(cpi, x, &rateuv, &rateuv_tokenonly, &distuv,
                           &uv_intra_skippable);
   modeuv = mbmi->uv_mode;
   if (cpi->common.txfm_mode != ONLY_4X4) {
     rd_pick_intra_mbuv_mode_8x8(cpi, x, &rateuv8x8, &rateuv8x8_tokenonly,
                                 &distuv8x8, &uv_intra_skippable_8x8);
+    modeuv8x8 = mbmi->uv_mode;
   } else {
     uv_intra_skippable_8x8 = uv_intra_skippable;
     rateuv8x8 = rateuv;
     distuv8x8 = distuv;
     rateuv8x8_tokenonly = rateuv_tokenonly;
+    modeuv8x8 = modeuv;
   }
 
   // current macroblock under rate-distortion optimization test loop
   error16x16 = rd_pick_intra16x16mby_mode(cpi, x, &rate16x16,
                                           &rate16x16_tokenonly, &dist16x16,
-                                          &y_intra16x16_skippable, txfm_cache);
+                                          &y_intra16x16_skippable,
+                                          txfm_cache[1]);
   mode16x16 = mbmi->mode;
   txfm_size_16x16 = mbmi->txfm_size;
+  if (cpi->common.mb_no_coeff_skip && y_intra16x16_skippable &&
+      ((cm->txfm_mode == ONLY_4X4 && uv_intra_skippable) ||
+       (cm->txfm_mode != ONLY_4X4 && uv_intra_skippable_8x8))) {
+    error16x16 -= RDCOST(x->rdmult, x->rddiv, rate16x16_tokenonly, 0);
+    rate16x16 -= rate16x16_tokenonly;
+  }
+  for (i = 0; i < NB_TXFM_MODES; i++) {
+    txfm_cache[0][i] = error16x16 - txfm_cache[1][cm->txfm_mode] +
+                       txfm_cache[1][i];
+  }
 
-  // FIXME(rbultje) support transform-size selection
-  mbmi->txfm_size = (cm->txfm_mode == ONLY_4X4) ? TX_4X4 : TX_8X8;
-  error8x8 = rd_pick_intra8x8mby_modes(cpi, x, &rate8x8, &rate8x8_tokenonly,
-                                       &dist8x8, error16x16);
-  mode8x8[0]= xd->mode_info_context->bmi[0].as_mode.first;
-  mode8x8[1]= xd->mode_info_context->bmi[2].as_mode.first;
-  mode8x8[2]= xd->mode_info_context->bmi[8].as_mode.first;
-  mode8x8[3]= xd->mode_info_context->bmi[10].as_mode.first;
+  error8x8 = rd_pick_intra8x8mby_modes_and_txsz(cpi, x, &rate8x8,
+                                                &rate8x8_tokenonly,
+                                                &dist8x8, mode8x8,
+                                                error16x16, txfm_cache[1]);
+  txfm_size_8x8 = mbmi->txfm_size;
+  for (i = 0; i < NB_TXFM_MODES; i++) {
+    int64_t tmp_rd = error8x8 - txfm_cache[1][cm->txfm_mode] + txfm_cache[1][i];
+    if (tmp_rd < txfm_cache[0][i])
+      txfm_cache[0][i] = tmp_rd;
+  }
 
   mbmi->txfm_size = TX_4X4;
   error4x4 = rd_pick_intra4x4mby_modes(cpi, x,
                                        &rate4x4, &rate4x4_tokenonly,
                                        &dist4x4, error16x16);
+  for (i = 0; i < NB_TXFM_MODES; i++) {
+    if (error4x4 < txfm_cache[0][i])
+      txfm_cache[0][i] = error4x4;
+  }
 
   mbmi->mb_skip_coeff = 0;
-  if (cpi->common.mb_no_coeff_skip &&
-      y_intra16x16_skippable && uv_intra_skippable_8x8) {
+  if (cpi->common.mb_no_coeff_skip && y_intra16x16_skippable &&
+      ((cm->txfm_mode == ONLY_4X4 && uv_intra_skippable) ||
+       (cm->txfm_mode != ONLY_4X4 && uv_intra_skippable_8x8))) {
     mbmi->mb_skip_coeff = 1;
     mbmi->mode = mode16x16;
-    mbmi->uv_mode = modeuv;
-    rate = rateuv8x8 + rate16x16 - rateuv8x8_tokenonly - rate16x16_tokenonly +
-           vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 1);
-    dist = dist16x16 + (distuv8x8 >> 2);
+    mbmi->uv_mode = (cm->txfm_mode == ONLY_4X4) ? modeuv : modeuv8x8;
+    rate = rate16x16 + vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 1);
+    dist = dist16x16;
+    if (cm->txfm_mode == ONLY_4X4) {
+      rate += rateuv - rateuv_tokenonly;
+      dist += (distuv >> 2);
+    } else {
+      rate += rateuv8x8 - rateuv8x8_tokenonly;
+      dist += (distuv8x8 >> 2);
+    }
 
     mbmi->txfm_size = txfm_size_16x16;
-    memset(x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff, 0,
-           sizeof(x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff));
   } else if (error8x8 > error16x16) {
     if (error4x4 < error16x16) {
       rate = rateuv + rate4x4;
       mbmi->mode = B_PRED;
       mbmi->txfm_size = TX_4X4;
       dist = dist4x4 + (distuv >> 2);
-      memset(x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff, 0,
-             sizeof(x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff));
     } else {
       mbmi->txfm_size = txfm_size_16x16;
       mbmi->mode = mode16x16;
       rate = rate16x16 + rateuv8x8;
       dist = dist16x16 + (distuv8x8 >> 2);
-      for (i = 0; i < NB_TXFM_MODES; i++) {
-        x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff[i] =
-            error16x16 - txfm_cache[i];
-      }
     }
     if (cpi->common.mb_no_coeff_skip)
       rate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
@@ -4946,22 +5159,22 @@
       mbmi->mode = B_PRED;
       mbmi->txfm_size = TX_4X4;
       dist = dist4x4 + (distuv >> 2);
-      memset(x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff, 0,
-             sizeof(x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff));
     } else {
-      // FIXME(rbultje) support transform-size selection
       mbmi->mode = I8X8_PRED;
-      mbmi->txfm_size = (cm->txfm_mode == ONLY_4X4) ? TX_4X4 : TX_8X8;
+      mbmi->txfm_size = txfm_size_8x8;
       set_i8x8_block_modes(x, mode8x8);
       rate = rate8x8 + rateuv;
       dist = dist8x8 + (distuv >> 2);
-      memset(x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff, 0,
-             sizeof(x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff));
     }
     if (cpi->common.mb_no_coeff_skip)
       rate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
   }
 
+  for (i = 0; i < NB_TXFM_MODES; i++) {
+    x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff[i] =
+        txfm_cache[0][cm->txfm_mode] - txfm_cache[0][i];
+  }
+
   *returnrate = rate;
   *returndist = dist;
 }
@@ -5002,7 +5215,10 @@
 #if CONFIG_COMP_INTERINTRA_PRED
   int is_best_interintra = 0;
   int64_t best_intra16_rd = INT64_MAX;
-  int best_intra16_mode = DC_PRED, best_intra16_uv_mode = DC_PRED;
+  int best_intra16_mode = DC_PRED;
+#if SEPARATE_INTERINTRA_UV
+  int best_intra16_uv_mode = DC_PRED;
+#endif
 #endif
   int64_t best_overall_rd = INT64_MAX;
   INTERPOLATIONFILTERTYPE best_filter = SWITCHABLE;
@@ -5334,8 +5550,10 @@
         (this_rd < best_intra16_rd)) {
       best_intra16_rd = this_rd;
       best_intra16_mode = this_mode;
+#if SEPARATE_INTERINTRA_UV
       best_intra16_uv_mode = (mbmi->txfm_size != TX_4X4 ?
                               mode_uv_8x8 : mode_uv_4x4);
+#endif
     }
 #endif
 
diff --git a/vp9/encoder/vp9_segmentation.c b/vp9/encoder/vp9_segmentation.c
index cfaf5f5..a04a20c 100644
--- a/vp9/encoder/vp9_segmentation.c
+++ b/vp9/encoder/vp9_segmentation.c
@@ -219,10 +219,8 @@
   const int segment_id = mi->mbmi.segment_id;
 
   xd->mode_info_context = mi;
-  xd->mb_to_top_edge = -((mb_row * 16) << 3);
-  xd->mb_to_left_edge = -((mb_col * 16) << 3);
-  xd->mb_to_bottom_edge = ((cm->mb_rows - mb_size - mb_row) * 16) << 3;
-  xd->mb_to_right_edge  = ((cm->mb_cols - mb_size - mb_col) * 16) << 3;
+  set_mb_row(cm, xd, mb_row, mb_size);
+  set_mb_col(cm, xd, mb_col, mb_size);
 
   // Count the number of hits on each segment with no prediction
   no_pred_segcounts[segment_id]++;
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index d115fe8..df05bec 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -36,6 +36,21 @@
 extern vp9_coeff_stats tree_update_hist_32x32[BLOCK_TYPES];
 #endif  /* ENTROPY_STATS */
 
+#if CONFIG_CODE_NONZEROCOUNT
+#ifdef NZC_STATS
+unsigned int nzc_counts_4x4[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
+                           [NZC4X4_TOKENS];
+unsigned int nzc_counts_8x8[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
+                           [NZC8X8_TOKENS];
+unsigned int nzc_counts_16x16[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
+                             [NZC16X16_TOKENS];
+unsigned int nzc_counts_32x32[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
+                             [NZC32X32_TOKENS];
+unsigned int nzc_pcat_counts[MAX_NZC_CONTEXTS][NZC_TOKENS_EXTRA]
+                            [NZC_BITS_EXTRA][2];
+#endif
+#endif
+
 static TOKENVALUE dct_value_tokens[DCT_MAX_VALUE * 2];
 const TOKENVALUE *vp9_dct_value_tokens_ptr;
 static int dct_value_cost[DCT_MAX_VALUE * 2];
@@ -114,11 +129,13 @@
   const int *scan;
   vp9_coeff_count *counts;
   vp9_coeff_probs *probs;
-  const TX_TYPE tx_type = (sb_type == BLOCK_SIZE_MB16X16 &&
-                           type == PLANE_TYPE_Y_WITH_DC) ?
-                          get_tx_type(xd, &xd->block[ib]) : DCT_DCT;
   const int ref = mbmi->ref_frame != INTRA_FRAME;
   ENTROPY_CONTEXT *a, *l, *a1, *l1, *a2, *l2, *a3, *l3, a_ec, l_ec;
+#if CONFIG_CODE_NONZEROCOUNT
+  int zerosleft, nzc = 0;
+  if (eob == 0)
+    assert(xd->nzcs[ib] == 0);
+#endif
 
   if (sb_type == BLOCK_SIZE_SB64X64) {
     a = (ENTROPY_CONTEXT *)xd->above_context +
@@ -144,7 +161,9 @@
 
   switch (tx_size) {
     default:
-    case TX_4X4:
+    case TX_4X4: {
+      const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
+                              get_tx_type_4x4(xd, ib) : DCT_DCT;
       a_ec = *a;
       l_ec = *l;
       seg_eob = 16;
@@ -159,6 +178,7 @@
       counts = cpi->coef_counts_4x4;
       probs = cpi->common.fc.coef_probs_4x4;
       break;
+    }
     case TX_8X8:
       a_ec = (a[0] + a[1]) != 0;
       l_ec = (l[0] + l[1]) != 0;
@@ -207,29 +227,47 @@
   do {
     const int band = get_coef_band(tx_size, c);
     int token;
-
+    int v = 0;
+#if CONFIG_CODE_NONZEROCOUNT
+    zerosleft = seg_eob - xd->nzcs[ib] - c + nzc;
+#endif
     if (c < eob) {
       const int rc = scan[c];
-      const int v = qcoeff_ptr[rc];
+      v = qcoeff_ptr[rc];
       assert(-DCT_MAX_VALUE <= v  &&  v < DCT_MAX_VALUE);
 
       t->Extra = vp9_dct_value_tokens_ptr[v].Extra;
       token    = vp9_dct_value_tokens_ptr[v].Token;
     } else {
+#if CONFIG_CODE_NONZEROCOUNT
+      break;
+#else
       token = DCT_EOB_TOKEN;
+#endif
     }
 
     t->Token = token;
     t->context_tree = probs[type][ref][band][pt];
+#if CONFIG_CODE_NONZEROCOUNT
+    // Skip zero node if there are no zeros left
+    t->skip_eob_node = 1 + (zerosleft == 0);
+#else
     t->skip_eob_node = (pt == 0) && (band > 0);
+#endif
     assert(vp9_coef_encodings[t->Token].Len - t->skip_eob_node > 0);
     if (!dry_run) {
       ++counts[type][ref][band][pt][token];
     }
+#if CONFIG_CODE_NONZEROCOUNT
+    nzc += (v != 0);
+#endif
 
     pt = vp9_get_coef_context(&recent_energy, token);
     ++t;
   } while (c < eob && ++c < seg_eob);
+#if CONFIG_CODE_NONZEROCOUNT
+  assert(nzc == xd->nzcs[ib]);
+#endif
 
   *tp = t;
   a_ec = l_ec = (c > 0); /* 0 <-> all coeff data is zero */
@@ -331,7 +369,7 @@
          vp9_sbuv_is_skippable_16x16(xd);
 }
 
-static int sby_is_skippable_16x16(MACROBLOCKD *xd) {
+int vp9_sby_is_skippable_16x16(MACROBLOCKD *xd) {
   int skip = 1;
   int i = 0;
 
@@ -342,10 +380,10 @@
 }
 
 static int sb_is_skippable_16x16(MACROBLOCKD *xd) {
-  return sby_is_skippable_16x16(xd) & vp9_sbuv_is_skippable_16x16(xd);
+  return vp9_sby_is_skippable_16x16(xd) & vp9_sbuv_is_skippable_16x16(xd);
 }
 
-static int sby_is_skippable_8x8(MACROBLOCKD *xd) {
+int vp9_sby_is_skippable_8x8(MACROBLOCKD *xd) {
   int skip = 1;
   int i = 0;
 
@@ -355,7 +393,7 @@
   return skip;
 }
 
-static int sbuv_is_skippable_8x8(MACROBLOCKD *xd) {
+int vp9_sbuv_is_skippable_8x8(MACROBLOCKD *xd) {
   int skip = 1;
   int i = 0;
 
@@ -366,10 +404,10 @@
 }
 
 static int sb_is_skippable_8x8(MACROBLOCKD *xd) {
-  return sby_is_skippable_8x8(xd) & sbuv_is_skippable_8x8(xd);
+  return vp9_sby_is_skippable_8x8(xd) & vp9_sbuv_is_skippable_8x8(xd);
 }
 
-static int sby_is_skippable_4x4(MACROBLOCKD *xd) {
+int vp9_sby_is_skippable_4x4(MACROBLOCKD *xd) {
   int skip = 1;
   int i = 0;
 
@@ -379,7 +417,7 @@
   return skip;
 }
 
-static int sbuv_is_skippable_4x4(MACROBLOCKD *xd) {
+int vp9_sbuv_is_skippable_4x4(MACROBLOCKD *xd) {
   int skip = 1;
   int i = 0;
 
@@ -390,7 +428,7 @@
 }
 
 static int sb_is_skippable_4x4(MACROBLOCKD *xd) {
-  return sby_is_skippable_4x4(xd) & sbuv_is_skippable_4x4(xd);
+  return vp9_sby_is_skippable_4x4(xd) & vp9_sbuv_is_skippable_4x4(xd);
 }
 
 void vp9_tokenize_sb(VP9_COMP *cpi,
@@ -476,7 +514,7 @@
     *t = t_backup;
 }
 
-static int sb64y_is_skippable_32x32(MACROBLOCKD *xd) {
+int vp9_sb64y_is_skippable_32x32(MACROBLOCKD *xd) {
   int skip = 1;
   int i = 0;
 
@@ -491,10 +529,10 @@
 }
 
 static int sb64_is_skippable_32x32(MACROBLOCKD *xd) {
-  return sb64y_is_skippable_32x32(xd) & vp9_sb64uv_is_skippable_32x32(xd);
+  return vp9_sb64y_is_skippable_32x32(xd) & vp9_sb64uv_is_skippable_32x32(xd);
 }
 
-static int sb64y_is_skippable_16x16(MACROBLOCKD *xd) {
+int vp9_sb64y_is_skippable_16x16(MACROBLOCKD *xd) {
   int skip = 1;
   int i = 0;
 
@@ -504,7 +542,7 @@
   return skip;
 }
 
-static int sb64uv_is_skippable_16x16(MACROBLOCKD *xd) {
+int vp9_sb64uv_is_skippable_16x16(MACROBLOCKD *xd) {
   int skip = 1;
   int i = 0;
 
@@ -515,10 +553,10 @@
 }
 
 static int sb64_is_skippable_16x16(MACROBLOCKD *xd) {
-  return sb64y_is_skippable_16x16(xd) & sb64uv_is_skippable_16x16(xd);
+  return vp9_sb64y_is_skippable_16x16(xd) & vp9_sb64uv_is_skippable_16x16(xd);
 }
 
-static int sb64y_is_skippable_8x8(MACROBLOCKD *xd) {
+int vp9_sb64y_is_skippable_8x8(MACROBLOCKD *xd) {
   int skip = 1;
   int i = 0;
 
@@ -528,7 +566,7 @@
   return skip;
 }
 
-static int sb64uv_is_skippable_8x8(MACROBLOCKD *xd) {
+int vp9_sb64uv_is_skippable_8x8(MACROBLOCKD *xd) {
   int skip = 1;
   int i = 0;
 
@@ -539,10 +577,10 @@
 }
 
 static int sb64_is_skippable_8x8(MACROBLOCKD *xd) {
-  return sb64y_is_skippable_8x8(xd) & sb64uv_is_skippable_8x8(xd);
+  return vp9_sb64y_is_skippable_8x8(xd) & vp9_sb64uv_is_skippable_8x8(xd);
 }
 
-static int sb64y_is_skippable_4x4(MACROBLOCKD *xd) {
+int vp9_sb64y_is_skippable_4x4(MACROBLOCKD *xd) {
   int skip = 1;
   int i = 0;
 
@@ -552,7 +590,7 @@
   return skip;
 }
 
-static int sb64uv_is_skippable_4x4(MACROBLOCKD *xd) {
+int vp9_sb64uv_is_skippable_4x4(MACROBLOCKD *xd) {
   int skip = 1;
   int i = 0;
 
@@ -563,7 +601,7 @@
 }
 
 static int sb64_is_skippable_4x4(MACROBLOCKD *xd) {
-  return sb64y_is_skippable_4x4(xd) & sb64uv_is_skippable_4x4(xd);
+  return vp9_sb64y_is_skippable_4x4(xd) & vp9_sb64uv_is_skippable_4x4(xd);
 }
 
 void vp9_tokenize_sb64(VP9_COMP *cpi,
@@ -903,13 +941,15 @@
                     PLANE_TYPE type,
                     TX_SIZE tx_size,
                     int dry_run) {
+  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+  const BLOCK_SIZE_TYPE sb_type = mbmi->sb_type;
+#if CONFIG_CODE_NONZEROCOUNT == 0
   vp9_coeff_count *counts;
   vp9_coeff_probs *probs;
   int pt, band;
   TOKENEXTRA *t = *tp;
-  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
   const int ref = mbmi->ref_frame != INTRA_FRAME;
-  const BLOCK_SIZE_TYPE sb_type = mbmi->sb_type;
+#endif
   ENTROPY_CONTEXT *a, *l, *a1, *l1, *a2, *l2, *a3, *l3, a_ec, l_ec;
 
   if (sb_type == BLOCK_SIZE_SB32X32) {
@@ -939,14 +979,18 @@
     case TX_4X4:
       a_ec = a[0];
       l_ec = l[0];
+#if CONFIG_CODE_NONZEROCOUNT == 0
       counts = cpi->coef_counts_4x4;
       probs = cpi->common.fc.coef_probs_4x4;
+#endif
       break;
     case TX_8X8:
       a_ec = (a[0] + a[1]) != 0;
       l_ec = (l[0] + l[1]) != 0;
+#if CONFIG_CODE_NONZEROCOUNT == 0
       counts = cpi->coef_counts_8x8;
       probs = cpi->common.fc.coef_probs_8x8;
+#endif
       break;
     case TX_16X16:
       if (type != PLANE_TYPE_UV) {
@@ -956,8 +1000,10 @@
         a_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0;
         l_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0;
       }
+#if CONFIG_CODE_NONZEROCOUNT == 0
       counts = cpi->coef_counts_16x16;
       probs = cpi->common.fc.coef_probs_16x16;
+#endif
       break;
     case TX_32X32:
       if (type != PLANE_TYPE_UV) {
@@ -971,19 +1017,25 @@
         l_ec = (l[0] + l[1] + l1[0] + l1[1] +
                 l2[0] + l2[1] + l3[0] + l3[1]) != 0;
       }
+#if CONFIG_CODE_NONZEROCOUNT == 0
       counts = cpi->coef_counts_32x32;
       probs = cpi->common.fc.coef_probs_32x32;
+#endif
       break;
   }
 
+#if CONFIG_CODE_NONZEROCOUNT == 0
   VP9_COMBINEENTROPYCONTEXTS(pt, a_ec, l_ec);
-
   band = get_coef_band(tx_size, 0);
   t->Token = DCT_EOB_TOKEN;
   t->context_tree = probs[type][ref][band][pt];
   t->skip_eob_node = 0;
   ++t;
   *tp = t;
+  if (!dry_run) {
+    ++counts[type][ref][band][pt][DCT_EOB_TOKEN];
+  }
+#endif
   *a = *l = 0;
   if (tx_size == TX_8X8) {
     a[1] = 0;
@@ -1009,10 +1061,6 @@
       l2[0] = l2[1] = l3[0] = l3[1] = l_ec;
     }
   }
-
-  if (!dry_run) {
-    ++counts[type][ref][band][pt][DCT_EOB_TOKEN];
-  }
 }
 
 static void stuff_mb_8x8(VP9_COMP *cpi, MACROBLOCKD *xd,
diff --git a/vp9/encoder/vp9_tokenize.h b/vp9/encoder/vp9_tokenize.h
index 4d6fe63..464d7ca 100644
--- a/vp9/encoder/vp9_tokenize.h
+++ b/vp9/encoder/vp9_tokenize.h
@@ -37,8 +37,20 @@
 int vp9_mbuv_is_skippable_8x8(MACROBLOCKD *xd);
 int vp9_mby_is_skippable_16x16(MACROBLOCKD *xd);
 int vp9_sby_is_skippable_32x32(MACROBLOCKD *xd);
+int vp9_sby_is_skippable_16x16(MACROBLOCKD *xd);
+int vp9_sby_is_skippable_8x8(MACROBLOCKD *xd);
+int vp9_sby_is_skippable_4x4(MACROBLOCKD *xd);
 int vp9_sbuv_is_skippable_16x16(MACROBLOCKD *xd);
+int vp9_sbuv_is_skippable_8x8(MACROBLOCKD *xd);
+int vp9_sbuv_is_skippable_4x4(MACROBLOCKD *xd);
+int vp9_sb64y_is_skippable_32x32(MACROBLOCKD *xd);
+int vp9_sb64y_is_skippable_16x16(MACROBLOCKD *xd);
+int vp9_sb64y_is_skippable_8x8(MACROBLOCKD *xd);
+int vp9_sb64y_is_skippable_4x4(MACROBLOCKD *xd);
 int vp9_sb64uv_is_skippable_32x32(MACROBLOCKD *xd);
+int vp9_sb64uv_is_skippable_16x16(MACROBLOCKD *xd);
+int vp9_sb64uv_is_skippable_8x8(MACROBLOCKD *xd);
+int vp9_sb64uv_is_skippable_4x4(MACROBLOCKD *xd);
 
 struct VP9_COMP;
 
diff --git a/vp9/encoder/vp9_treewriter.c b/vp9/encoder/vp9_treewriter.c
index 8e25281..951ffa7 100644
--- a/vp9/encoder/vp9_treewriter.c
+++ b/vp9/encoder/vp9_treewriter.c
@@ -35,5 +35,6 @@
 }
 
 void vp9_cost_tokens_skip(int *c, const vp9_prob *p, vp9_tree t) {
+  c[0] = 0;
   cost(c, t, p, 2, 0);
 }
diff --git a/vp9/vp9dx.mk b/vp9/vp9dx.mk
index 7622fc0..239ae30 100644
--- a/vp9/vp9dx.mk
+++ b/vp9/vp9dx.mk
@@ -38,5 +38,11 @@
 
 VP9_DX_SRCS-$(HAVE_SSE2) += decoder/x86/vp9_idct_blk_sse2.c
 
+VP9_DX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += decoder/x86/vp9_dequantize_x86.c
+ifeq ($(HAVE_SSE2),yes)
+vp9/decoder/x86/vp9_dequantize_x86.c.o: CFLAGS += -msse2
+vp9/decoder/x86/vp9_dequantize_x86.c.d: CFLAGS += -msse2
+endif
+
 $(eval $(call asm_offsets_template,\
          vp9_asm_dec_offsets.asm, $(VP9_PREFIX)decoder/vp9_asm_dec_offsets.c))
diff --git a/vpx_ports/mem.h b/vpx_ports/mem.h
index b130da8..62b86bb 100644
--- a/vpx_ports/mem.h
+++ b/vpx_ports/mem.h
@@ -11,6 +11,7 @@
 
 #ifndef VPX_PORTS_MEM_H
 #define VPX_PORTS_MEM_H
+
 #include "vpx_config.h"
 #include "vpx/vpx_integer.h"