Merge "Add several new test vectors with small resolution."
diff --git a/test/vp9_error_block_test.cc b/test/vp9_error_block_test.cc
index d779706..77b12ea 100644
--- a/test/vp9_error_block_test.cc
+++ b/test/vp9_error_block_test.cc
@@ -67,12 +67,22 @@
   int64_t ret;
   int64_t ref_ssz;
   int64_t ref_ret;
+  const int msb = bit_depth_ + 8 - 1;
   for (int i = 0; i < kNumIterations; ++i) {
     int err_count = 0;
     block_size = 16 << (i % 9);  // All block sizes from 4x4, 8x4 ..64x64
     for (int j = 0; j < block_size; j++) {
-      coeff[j]   = rnd(2 << 20) - (1 << 20);
-      dqcoeff[j] = rnd(2 << 20) - (1 << 20);
+      // coeff and dqcoeff will always have at least the same sign, and this
+      // can be used for optimization, so generate test input precisely.
+      if (rnd(2)) {
+        // Positive number
+        coeff[j]   = rnd(1 << msb);
+        dqcoeff[j] = rnd(1 << msb);
+      } else {
+        // Negative number
+        coeff[j]   = -rnd(1 << msb);
+        dqcoeff[j] = -rnd(1 << msb);
+      }
     }
     ref_ret = ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz,
                                   bit_depth_);
@@ -85,7 +95,7 @@
     err_count_total += err_count;
   }
   EXPECT_EQ(0, err_count_total)
-      << "Error: Error Block Test, C output doesn't match SSE2 output. "
+      << "Error: Error Block Test, C output doesn't match optimized output. "
       << "First failed at test case " << first_failure;
 }
 
@@ -100,23 +110,36 @@
   int64_t ret;
   int64_t ref_ssz;
   int64_t ref_ret;
-  int max_val = ((1 << 20) - 1);
+  const int msb = bit_depth_ + 8 - 1;
+  int max_val = ((1 << msb) - 1);
   for (int i = 0; i < kNumIterations; ++i) {
     int err_count = 0;
-    int k = (i / 9) % 5;
+    int k = (i / 9) % 9;
 
     // Change the maximum coeff value, to test different bit boundaries
-    if ( k == 4 && (i % 9) == 0 ) {
+    if ( k == 8 && (i % 9) == 0 ) {
       max_val >>= 1;
     }
     block_size = 16 << (i % 9);  // All block sizes from 4x4, 8x4 ..64x64
     for (int j = 0; j < block_size; j++) {
-      if (k < 4) {  // Test at maximum values
-        coeff[j]   = k % 2 ? max_val : -max_val;
-        dqcoeff[j] = (k >> 1) % 2 ? max_val : -max_val;
+      if (k < 4) {
+        // Test at positive maximum values
+        coeff[j]   = k % 2 ? max_val : 0;
+        dqcoeff[j] = (k >> 1) % 2 ? max_val : 0;
+      } else if (k < 8) {
+        // Test at negative maximum values
+        coeff[j]   = k % 2 ? -max_val : 0;
+        dqcoeff[j] = (k >> 1) % 2 ? -max_val : 0;
       } else {
-        coeff[j]   = rnd(2 << 14) - (1 << 14);
-        dqcoeff[j] = rnd(2 << 14) - (1 << 14);
+        if (rnd(2)) {
+          // Positive number
+          coeff[j]   = rnd(1 << 14);
+          dqcoeff[j] = rnd(1 << 14);
+        } else {
+          // Negative number
+          coeff[j]   = -rnd(1 << 14);
+          dqcoeff[j] = -rnd(1 << 14);
+        }
       }
     }
     ref_ret = ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz,
@@ -130,21 +153,13 @@
     err_count_total += err_count;
   }
   EXPECT_EQ(0, err_count_total)
-      << "Error: Error Block Test, C output doesn't match SSE2 output. "
+      << "Error: Error Block Test, C output doesn't match optimized output. "
       << "First failed at test case " << first_failure;
 }
 
 using std::tr1::make_tuple;
 
-#if CONFIG_USE_X86INC && HAVE_SSE2
-int64_t wrap_vp9_highbd_block_error_8bit_sse2(const tran_low_t *coeff,
-                                              const tran_low_t *dqcoeff,
-                                              intptr_t block_size,
-                                              int64_t *ssz, int bps) {
-  assert(bps == 8);
-  return vp9_highbd_block_error_8bit_sse2(coeff, dqcoeff, block_size, ssz);
-}
-
+#if CONFIG_USE_X86INC
 int64_t wrap_vp9_highbd_block_error_8bit_c(const tran_low_t *coeff,
                                            const tran_low_t *dqcoeff,
                                            intptr_t block_size,
@@ -153,6 +168,15 @@
   return vp9_highbd_block_error_8bit_c(coeff, dqcoeff, block_size, ssz);
 }
 
+#if HAVE_SSE2
+int64_t wrap_vp9_highbd_block_error_8bit_sse2(const tran_low_t *coeff,
+                                              const tran_low_t *dqcoeff,
+                                              intptr_t block_size,
+                                              int64_t *ssz, int bps) {
+  assert(bps == 8);
+  return vp9_highbd_block_error_8bit_sse2(coeff, dqcoeff, block_size, ssz);
+}
+
 INSTANTIATE_TEST_CASE_P(
     SSE2, ErrorBlockTest,
     ::testing::Values(
@@ -165,5 +189,23 @@
         make_tuple(&wrap_vp9_highbd_block_error_8bit_sse2,
                    &wrap_vp9_highbd_block_error_8bit_c, VPX_BITS_8)));
 #endif  // HAVE_SSE2
+
+#if HAVE_AVX
+int64_t wrap_vp9_highbd_block_error_8bit_avx(const tran_low_t *coeff,
+                                             const tran_low_t *dqcoeff,
+                                             intptr_t block_size,
+                                             int64_t *ssz, int bps) {
+  assert(bps == 8);
+  return vp9_highbd_block_error_8bit_avx(coeff, dqcoeff, block_size, ssz);
+}
+
+INSTANTIATE_TEST_CASE_P(
+    AVX, ErrorBlockTest,
+    ::testing::Values(
+        make_tuple(&wrap_vp9_highbd_block_error_8bit_avx,
+                   &wrap_vp9_highbd_block_error_8bit_c, VPX_BITS_8)));
+#endif  // HAVE_AVX
+
+#endif  // CONFIG_USE_X86INC
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 }  // namespace
diff --git a/vp10/common/blockd.c b/vp10/common/blockd.c
index 5394b5e..b6f910f 100644
--- a/vp10/common/blockd.c
+++ b/vp10/common/blockd.c
@@ -66,7 +66,7 @@
   for (r = 0; r < max_blocks_high; r += (1 << tx_size)) {
     // Skip visiting the sub blocks that are wholly within the UMV.
     for (c = 0; c < max_blocks_wide; c += (1 << tx_size)) {
-      visit(plane, i, plane_bsize, tx_size, arg);
+      visit(plane, i, r, c, plane_bsize, tx_size, arg);
       i += step;
     }
     i += extra_step;
diff --git a/vp10/common/blockd.h b/vp10/common/blockd.h
index 8454154..b89d791 100644
--- a/vp10/common/blockd.h
+++ b/vp10/common/blockd.h
@@ -283,6 +283,7 @@
 }
 
 typedef void (*foreach_transformed_block_visitor)(int plane, int block,
+                                                  int blk_row, int blk_col,
                                                   BLOCK_SIZE plane_bsize,
                                                   TX_SIZE tx_size,
                                                   void *arg);
@@ -296,17 +297,6 @@
     const MACROBLOCKD* const xd, BLOCK_SIZE bsize,
     foreach_transformed_block_visitor visit, void *arg);
 
-static INLINE void txfrm_block_to_raster_xy(BLOCK_SIZE plane_bsize,
-                                            TX_SIZE tx_size, int block,
-                                            int *x, int *y) {
-  const int bwl = b_width_log2_lookup[plane_bsize];
-  const int tx_cols_log2 = bwl - tx_size;
-  const int tx_cols = 1 << tx_cols_log2;
-  const int raster_mb = block >> (tx_size << 1);
-  *x = (raster_mb & (tx_cols - 1)) << tx_size;
-  *y = (raster_mb >> tx_cols_log2) << tx_size;
-}
-
 void vp10_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
                       BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob,
                       int aoff, int loff);
diff --git a/vp10/common/entropy.c b/vp10/common/entropy.c
index 56dd73a..1676506 100644
--- a/vp10/common/entropy.c
+++ b/vp10/common/entropy.c
@@ -403,7 +403,6 @@
   {255, 241, 243, 255, 236, 255, 252, 254},
   {255, 243, 245, 255, 237, 255, 252, 254},
   {255, 246, 247, 255, 239, 255, 253, 255},
-  {255, 246, 247, 255, 239, 255, 253, 255},
 };
 
 static const vp10_coeff_probs_model default_coef_probs_4x4[PLANE_TYPES] = {
@@ -743,14 +742,16 @@
 };
 
 static void extend_to_full_distribution(vpx_prob *probs, vpx_prob p) {
-  memcpy(probs, vp10_pareto8_full[p = 0 ? 0 : p - 1],
-         MODEL_NODES * sizeof(vpx_prob));
+  memcpy(probs, vp10_pareto8_full[p - 1], MODEL_NODES * sizeof(vpx_prob));
 }
 
 void vp10_model_to_full_probs(const vpx_prob *model, vpx_prob *full) {
   if (full != model)
     memcpy(full, model, sizeof(vpx_prob) * UNCONSTRAINED_NODES);
-  extend_to_full_distribution(&full[UNCONSTRAINED_NODES], model[PIVOT_NODE]);
+  // TODO(aconverse): model[PIVOT_NODE] should never be zero.
+  // https://code.google.com/p/webm/issues/detail?id=1089
+  if (model[PIVOT_NODE] != 0)
+    extend_to_full_distribution(&full[UNCONSTRAINED_NODES], model[PIVOT_NODE]);
 }
 
 void vp10_default_coef_probs(VP10_COMMON *cm) {
diff --git a/vp10/common/entropy.h b/vp10/common/entropy.h
index fba7020..2f93cb3 100644
--- a/vp10/common/entropy.h
+++ b/vp10/common/entropy.h
@@ -153,7 +153,7 @@
 // 1, 3, 5, 7, ..., 253, 255
 // In between probabilities are interpolated linearly
 
-#define COEFF_PROB_MODELS 256
+#define COEFF_PROB_MODELS 255
 
 #define UNCONSTRAINED_NODES         3
 
diff --git a/vp10/encoder/context_tree.c b/vp10/encoder/context_tree.c
index 532e82c..6c056d2 100644
--- a/vp10/encoder/context_tree.c
+++ b/vp10/encoder/context_tree.c
@@ -30,13 +30,13 @@
   for (i = 0; i < MAX_MB_PLANE; ++i) {
     for (k = 0; k < 3; ++k) {
       CHECK_MEM_ERROR(cm, ctx->coeff[i][k],
-                      vpx_memalign(16, num_pix * sizeof(*ctx->coeff[i][k])));
+                      vpx_memalign(32, num_pix * sizeof(*ctx->coeff[i][k])));
       CHECK_MEM_ERROR(cm, ctx->qcoeff[i][k],
-                      vpx_memalign(16, num_pix * sizeof(*ctx->qcoeff[i][k])));
+                      vpx_memalign(32, num_pix * sizeof(*ctx->qcoeff[i][k])));
       CHECK_MEM_ERROR(cm, ctx->dqcoeff[i][k],
-                      vpx_memalign(16, num_pix * sizeof(*ctx->dqcoeff[i][k])));
+                      vpx_memalign(32, num_pix * sizeof(*ctx->dqcoeff[i][k])));
       CHECK_MEM_ERROR(cm, ctx->eobs[i][k],
-                      vpx_memalign(16, num_blk * sizeof(*ctx->eobs[i][k])));
+                      vpx_memalign(32, num_blk * sizeof(*ctx->eobs[i][k])));
       ctx->coeff_pbuf[i][k]   = ctx->coeff[i][k];
       ctx->qcoeff_pbuf[i][k]  = ctx->qcoeff[i][k];
       ctx->dqcoeff_pbuf[i][k] = ctx->dqcoeff[i][k];
diff --git a/vp10/encoder/encodemb.c b/vp10/encoder/encodemb.c
index ff23fee..92ba4dd 100644
--- a/vp10/encoder/encodemb.c
+++ b/vp10/encoder/encodemb.c
@@ -323,195 +323,6 @@
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-void vp10_xform_quant_fp(MACROBLOCK *x, int plane, int block,
-                        BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const struct macroblock_plane *const p = &x->plane[plane];
-  const struct macroblockd_plane *const pd = &xd->plane[plane];
-  PLANE_TYPE plane_type = (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
-  TX_TYPE tx_type = get_tx_type(plane_type, xd, block);
-  const scan_order *const scan_order = get_scan(tx_size, tx_type);
-  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
-  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
-  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-  uint16_t *const eob = &p->eobs[block];
-  const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
-  int i, j;
-  const int16_t *src_diff;
-  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
-  src_diff = &p->src_diff[4 * (j * diff_stride + i)];
-
-#if CONFIG_VP9_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    switch (tx_size) {
-      case TX_32X32:
-        highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
-        vp10_highbd_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin,
-                                     p->round_fp, p->quant_fp, p->quant_shift,
-                                     qcoeff, dqcoeff, pd->dequant,
-                                     eob, scan_order->scan,
-                                     scan_order->iscan);
-        break;
-      case TX_16X16:
-        vpx_highbd_fdct16x16(src_diff, coeff, diff_stride);
-        vp10_highbd_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
-                               p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
-                               pd->dequant, eob,
-                               scan_order->scan, scan_order->iscan);
-        break;
-      case TX_8X8:
-        vpx_highbd_fdct8x8(src_diff, coeff, diff_stride);
-        vp10_highbd_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
-                               p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
-                               pd->dequant, eob,
-                               scan_order->scan, scan_order->iscan);
-        break;
-      case TX_4X4:
-        if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
-          vp10_highbd_fwht4x4(src_diff, coeff, diff_stride);
-        } else {
-          vpx_highbd_fdct4x4(src_diff, coeff, diff_stride);
-        }
-        vp10_highbd_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
-                               p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
-                               pd->dequant, eob,
-                               scan_order->scan, scan_order->iscan);
-        break;
-      default:
-        assert(0);
-    }
-    return;
-  }
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-  switch (tx_size) {
-    case TX_32X32:
-      fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
-      vp10_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin, p->round_fp,
-                            p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
-                            pd->dequant, eob, scan_order->scan,
-                            scan_order->iscan);
-      break;
-    case TX_16X16:
-      vpx_fdct16x16(src_diff, coeff, diff_stride);
-      vp10_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
-                      p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
-                      pd->dequant, eob,
-                      scan_order->scan, scan_order->iscan);
-      break;
-    case TX_8X8:
-      vp10_fdct8x8_quant(src_diff, diff_stride, coeff, 64,
-                        x->skip_block, p->zbin, p->round_fp,
-                        p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
-                        pd->dequant, eob,
-                        scan_order->scan, scan_order->iscan);
-      break;
-    case TX_4X4:
-      if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
-        vp10_fwht4x4(src_diff, coeff, diff_stride);
-      } else {
-        vpx_fdct4x4(src_diff, coeff, diff_stride);
-      }
-      vp10_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
-                      p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
-                      pd->dequant, eob,
-                      scan_order->scan, scan_order->iscan);
-      break;
-    default:
-      assert(0);
-      break;
-  }
-}
-
-void vp10_xform_quant_dc(MACROBLOCK *x, int plane, int block,
-                        BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const struct macroblock_plane *const p = &x->plane[plane];
-  const struct macroblockd_plane *const pd = &xd->plane[plane];
-  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
-  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
-  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-  uint16_t *const eob = &p->eobs[block];
-  const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
-  int i, j;
-  const int16_t *src_diff;
-
-  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
-  src_diff = &p->src_diff[4 * (j * diff_stride + i)];
-
-#if CONFIG_VP9_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    switch (tx_size) {
-      case TX_32X32:
-        vpx_highbd_fdct32x32_1(src_diff, coeff, diff_stride);
-        vpx_highbd_quantize_dc_32x32(coeff, x->skip_block, p->round,
-                                     p->quant_fp[0], qcoeff, dqcoeff,
-                                     pd->dequant[0], eob);
-        break;
-      case TX_16X16:
-        vpx_highbd_fdct16x16_1(src_diff, coeff, diff_stride);
-        vpx_highbd_quantize_dc(coeff, 256, x->skip_block, p->round,
-                               p->quant_fp[0], qcoeff, dqcoeff,
-                               pd->dequant[0], eob);
-        break;
-      case TX_8X8:
-        vpx_highbd_fdct8x8_1(src_diff, coeff, diff_stride);
-        vpx_highbd_quantize_dc(coeff, 64, x->skip_block, p->round,
-                               p->quant_fp[0], qcoeff, dqcoeff,
-                               pd->dequant[0], eob);
-        break;
-      case TX_4X4:
-        if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
-          vp10_highbd_fwht4x4(src_diff, coeff, diff_stride);
-        } else {
-          vpx_highbd_fdct4x4(src_diff, coeff, diff_stride);
-        }
-        vpx_highbd_quantize_dc(coeff, 16, x->skip_block, p->round,
-                               p->quant_fp[0], qcoeff, dqcoeff,
-                               pd->dequant[0], eob);
-        break;
-      default:
-        assert(0);
-    }
-    return;
-  }
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-  switch (tx_size) {
-    case TX_32X32:
-      vpx_fdct32x32_1(src_diff, coeff, diff_stride);
-      vpx_quantize_dc_32x32(coeff, x->skip_block, p->round,
-                            p->quant_fp[0], qcoeff, dqcoeff,
-                            pd->dequant[0], eob);
-      break;
-    case TX_16X16:
-      vpx_fdct16x16_1(src_diff, coeff, diff_stride);
-      vpx_quantize_dc(coeff, 256, x->skip_block, p->round,
-                     p->quant_fp[0], qcoeff, dqcoeff,
-                     pd->dequant[0], eob);
-      break;
-    case TX_8X8:
-      vpx_fdct8x8_1(src_diff, coeff, diff_stride);
-      vpx_quantize_dc(coeff, 64, x->skip_block, p->round,
-                      p->quant_fp[0], qcoeff, dqcoeff,
-                      pd->dequant[0], eob);
-      break;
-    case TX_4X4:
-      if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
-        vp10_fwht4x4(src_diff, coeff, diff_stride);
-      } else {
-        vpx_fdct4x4(src_diff, coeff, diff_stride);
-      }
-      vpx_quantize_dc(coeff, 16, x->skip_block, p->round,
-                      p->quant_fp[0], qcoeff, dqcoeff,
-                      pd->dequant[0], eob);
-      break;
-    default:
-      assert(0);
-      break;
-  }
-}
-
 void vp10_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
                        int diff_stride, TX_TYPE tx_type, int lossless) {
   if (lossless) {
@@ -657,8 +468,9 @@
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-void vp10_xform_quant(MACROBLOCK *x, int plane, int block,
-                     BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
+void vp10_xform_quant_fp(MACROBLOCK *x, int plane, int block,
+                         int blk_row, int blk_col,
+                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
   MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &xd->plane[plane];
@@ -670,10 +482,196 @@
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   uint16_t *const eob = &p->eobs[block];
   const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
-  int i, j;
   const int16_t *src_diff;
-  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
-  src_diff = &p->src_diff[4 * (j * diff_stride + i)];
+  src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    switch (tx_size) {
+      case TX_32X32:
+        highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
+        vp10_highbd_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin,
+                                     p->round_fp, p->quant_fp, p->quant_shift,
+                                     qcoeff, dqcoeff, pd->dequant,
+                                     eob, scan_order->scan,
+                                     scan_order->iscan);
+        break;
+      case TX_16X16:
+        vpx_highbd_fdct16x16(src_diff, coeff, diff_stride);
+        vp10_highbd_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
+                               p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                               pd->dequant, eob,
+                               scan_order->scan, scan_order->iscan);
+        break;
+      case TX_8X8:
+        vpx_highbd_fdct8x8(src_diff, coeff, diff_stride);
+        vp10_highbd_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
+                               p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                               pd->dequant, eob,
+                               scan_order->scan, scan_order->iscan);
+        break;
+      case TX_4X4:
+        if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
+          vp10_highbd_fwht4x4(src_diff, coeff, diff_stride);
+        } else {
+          vpx_highbd_fdct4x4(src_diff, coeff, diff_stride);
+        }
+        vp10_highbd_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
+                               p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                               pd->dequant, eob,
+                               scan_order->scan, scan_order->iscan);
+        break;
+      default:
+        assert(0);
+    }
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  switch (tx_size) {
+    case TX_32X32:
+      fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
+      vp10_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin, p->round_fp,
+                            p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                            pd->dequant, eob, scan_order->scan,
+                            scan_order->iscan);
+      break;
+    case TX_16X16:
+      vpx_fdct16x16(src_diff, coeff, diff_stride);
+      vp10_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
+                      p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                      pd->dequant, eob,
+                      scan_order->scan, scan_order->iscan);
+      break;
+    case TX_8X8:
+      vp10_fdct8x8_quant(src_diff, diff_stride, coeff, 64,
+                        x->skip_block, p->zbin, p->round_fp,
+                        p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                        pd->dequant, eob,
+                        scan_order->scan, scan_order->iscan);
+      break;
+    case TX_4X4:
+      if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
+        vp10_fwht4x4(src_diff, coeff, diff_stride);
+      } else {
+        vpx_fdct4x4(src_diff, coeff, diff_stride);
+      }
+      vp10_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
+                      p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                      pd->dequant, eob,
+                      scan_order->scan, scan_order->iscan);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+}
+
+void vp10_xform_quant_dc(MACROBLOCK *x, int plane, int block,
+                         int blk_row, int blk_col,
+                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  uint16_t *const eob = &p->eobs[block];
+  const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+  const int16_t *src_diff;
+  src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    switch (tx_size) {
+      case TX_32X32:
+        vpx_highbd_fdct32x32_1(src_diff, coeff, diff_stride);
+        vpx_highbd_quantize_dc_32x32(coeff, x->skip_block, p->round,
+                                     p->quant_fp[0], qcoeff, dqcoeff,
+                                     pd->dequant[0], eob);
+        break;
+      case TX_16X16:
+        vpx_highbd_fdct16x16_1(src_diff, coeff, diff_stride);
+        vpx_highbd_quantize_dc(coeff, 256, x->skip_block, p->round,
+                               p->quant_fp[0], qcoeff, dqcoeff,
+                               pd->dequant[0], eob);
+        break;
+      case TX_8X8:
+        vpx_highbd_fdct8x8_1(src_diff, coeff, diff_stride);
+        vpx_highbd_quantize_dc(coeff, 64, x->skip_block, p->round,
+                               p->quant_fp[0], qcoeff, dqcoeff,
+                               pd->dequant[0], eob);
+        break;
+      case TX_4X4:
+        if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
+          vp10_highbd_fwht4x4(src_diff, coeff, diff_stride);
+        } else {
+          vpx_highbd_fdct4x4(src_diff, coeff, diff_stride);
+        }
+        vpx_highbd_quantize_dc(coeff, 16, x->skip_block, p->round,
+                               p->quant_fp[0], qcoeff, dqcoeff,
+                               pd->dequant[0], eob);
+        break;
+      default:
+        assert(0);
+    }
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  switch (tx_size) {
+    case TX_32X32:
+      vpx_fdct32x32_1(src_diff, coeff, diff_stride);
+      vpx_quantize_dc_32x32(coeff, x->skip_block, p->round,
+                            p->quant_fp[0], qcoeff, dqcoeff,
+                            pd->dequant[0], eob);
+      break;
+    case TX_16X16:
+      vpx_fdct16x16_1(src_diff, coeff, diff_stride);
+      vpx_quantize_dc(coeff, 256, x->skip_block, p->round,
+                     p->quant_fp[0], qcoeff, dqcoeff,
+                     pd->dequant[0], eob);
+      break;
+    case TX_8X8:
+      vpx_fdct8x8_1(src_diff, coeff, diff_stride);
+      vpx_quantize_dc(coeff, 64, x->skip_block, p->round,
+                      p->quant_fp[0], qcoeff, dqcoeff,
+                      pd->dequant[0], eob);
+      break;
+    case TX_4X4:
+      if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
+        vp10_fwht4x4(src_diff, coeff, diff_stride);
+      } else {
+        vpx_fdct4x4(src_diff, coeff, diff_stride);
+      }
+      vpx_quantize_dc(coeff, 16, x->skip_block, p->round,
+                      p->quant_fp[0], qcoeff, dqcoeff,
+                      pd->dequant[0], eob);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+}
+
+
+
+void vp10_xform_quant(MACROBLOCK *x, int plane, int block,
+                      int blk_row, int blk_col,
+                      BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  PLANE_TYPE plane_type = (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
+  TX_TYPE tx_type = get_tx_type(plane_type, xd, block);
+  const scan_order *const scan_order = get_scan(tx_size, tx_type);
+  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  uint16_t *const eob = &p->eobs[block];
+  const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+  const int16_t *src_diff;
+  src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -751,7 +749,8 @@
   }
 }
 
-static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
+static void encode_block(int plane, int block, int blk_row, int blk_col,
+                         BLOCK_SIZE plane_bsize,
                          TX_SIZE tx_size, void *arg) {
   struct encode_b_args *const args = arg;
   MACROBLOCK *const x = args->x;
@@ -760,14 +759,12 @@
   struct macroblock_plane *const p = &x->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-  int i, j;
   uint8_t *dst;
   ENTROPY_CONTEXT *a, *l;
   TX_TYPE tx_type = get_tx_type(pd->plane_type, xd, block);
-  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
-  dst = &pd->dst.buf[4 * j * pd->dst.stride + 4 * i];
-  a = &ctx->ta[plane][i];
-  l = &ctx->tl[plane][j];
+  dst = &pd->dst.buf[4 * blk_row * pd->dst.stride + 4 * blk_col];
+  a = &ctx->ta[plane][blk_col];
+  l = &ctx->tl[plane][blk_row];
 
   // TODO(jingning): per transformed block zero forcing only enabled for
   // luma component. will integrate chroma components as well.
@@ -786,17 +783,20 @@
         *a = *l = 0;
         return;
       } else {
-        vp10_xform_quant_fp(x, plane, block, plane_bsize, tx_size);
+        vp10_xform_quant_fp(x, plane, block, blk_row, blk_col,
+                            plane_bsize, tx_size);
       }
     } else {
       if (max_txsize_lookup[plane_bsize] == tx_size) {
         int txfm_blk_index = (plane << 2) + (block >> (tx_size << 1));
         if (x->skip_txfm[txfm_blk_index] == SKIP_TXFM_NONE) {
           // full forward transform and quantization
-          vp10_xform_quant(x, plane, block, plane_bsize, tx_size);
+          vp10_xform_quant(x, plane, block, blk_row, blk_col,
+                           plane_bsize, tx_size);
         } else if (x->skip_txfm[txfm_blk_index] == SKIP_TXFM_AC_ONLY) {
           // fast path forward transform and quantization
-          vp10_xform_quant_dc(x, plane, block, plane_bsize, tx_size);
+          vp10_xform_quant_dc(x, plane, block, blk_row, blk_col,
+                              plane_bsize, tx_size);
         } else {
           // skip forward transform
           p->eobs[block] = 0;
@@ -804,7 +804,8 @@
           return;
         }
       } else {
-        vp10_xform_quant(x, plane, block, plane_bsize, tx_size);
+        vp10_xform_quant(x, plane, block, blk_row, blk_col,
+                         plane_bsize, tx_size);
       }
     }
   }
@@ -879,19 +880,18 @@
   }
 }
 
-static void encode_block_pass1(int plane, int block, BLOCK_SIZE plane_bsize,
+static void encode_block_pass1(int plane, int block, int blk_row, int blk_col,
+                               BLOCK_SIZE plane_bsize,
                                TX_SIZE tx_size, void *arg) {
   MACROBLOCK *const x = (MACROBLOCK *)arg;
   MACROBLOCKD *const xd = &x->e_mbd;
   struct macroblock_plane *const p = &x->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-  int i, j;
   uint8_t *dst;
-  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
-  dst = &pd->dst.buf[4 * j * pd->dst.stride + 4 * i];
+  dst = &pd->dst.buf[4 * blk_row * pd->dst.stride + 4 * blk_col];
 
-  vp10_xform_quant(x, plane, block, plane_bsize, tx_size);
+  vp10_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size);
 
   if (p->eobs[block] > 0) {
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -948,8 +948,9 @@
   }
 }
 
-void vp10_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
-                            TX_SIZE tx_size, void *arg) {
+void vp10_encode_block_intra(int plane, int block, int blk_row, int blk_col,
+                             BLOCK_SIZE plane_bsize,
+                             TX_SIZE tx_size, void *arg) {
   struct encode_b_args* const args = arg;
   MACROBLOCK *const x = args->x;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -971,15 +972,13 @@
   uint16_t *eob = &p->eobs[block];
   const int src_stride = p->src.stride;
   const int dst_stride = pd->dst.stride;
-  int i, j;
-  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
-  dst = &pd->dst.buf[4 * (j * dst_stride + i)];
-  src = &p->src.buf[4 * (j * src_stride + i)];
-  src_diff = &p->src_diff[4 * (j * diff_stride + i)];
+  dst = &pd->dst.buf[4 * (blk_row * dst_stride + blk_col)];
+  src = &p->src.buf[4 * (blk_row * src_stride + blk_col)];
+  src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
 
   mode = plane == 0 ? get_y_mode(xd->mi[0], block) : mbmi->uv_mode;
   vp10_predict_intra_block(xd, bwl, bhl, tx_size, mode, dst, dst_stride,
-                          dst, dst_stride, i, j, plane);
+                          dst, dst_stride, blk_col, blk_row, plane);
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -1130,5 +1129,5 @@
   struct encode_b_args arg = {x, NULL, &xd->mi[0]->mbmi.skip};
 
   vp10_foreach_transformed_block_in_plane(xd, bsize, plane,
-                                         vp10_encode_block_intra, &arg);
+                                          vp10_encode_block_intra, &arg);
 }
diff --git a/vp10/encoder/encodemb.h b/vp10/encoder/encodemb.h
index 62a7db4..2e6516e 100644
--- a/vp10/encoder/encodemb.h
+++ b/vp10/encoder/encodemb.h
@@ -26,16 +26,20 @@
 void vp10_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize);
 void vp10_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize);
 void vp10_xform_quant_fp(MACROBLOCK *x, int plane, int block,
-                        BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
+                         int blk_row, int blk_col,
+                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
 void vp10_xform_quant_dc(MACROBLOCK *x, int plane, int block,
-                        BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
+                         int blk_row, int blk_col,
+                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
 void vp10_xform_quant(MACROBLOCK *x, int plane, int block,
-                     BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
+                      int blk_row, int blk_col,
+                      BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
 
 void vp10_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
 
-void vp10_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
-                            TX_SIZE tx_size, void *arg);
+void vp10_encode_block_intra(int plane, int block, int blk_row, int blk_col,
+                             BLOCK_SIZE plane_bsize,
+                             TX_SIZE tx_size, void *arg);
 
 void vp10_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
 
diff --git a/vp10/encoder/palette.c b/vp10/encoder/palette.c
index b6fd653..522e185 100644
--- a/vp10/encoder/palette.c
+++ b/vp10/encoder/palette.c
@@ -39,12 +39,20 @@
   }
 }
 
+// Generate a random number in the range [0, 32768).
+static unsigned int lcg_rand16(unsigned int *state) {
+  *state = *state * 1103515245 + 12345;
+  return *state / 65536 % 32768;
+}
+
 static void calc_centroids(const double *data, double *centroids,
                            const uint8_t *indices, int n, int k, int dim) {
   int i, j, index;
   int count[PALETTE_MAX_SIZE];
+  unsigned int rand_state = (unsigned int)data[0];
 
-  srand((unsigned int) data[0]);
+  assert(n <= 32768);
+
   memset(count, 0, sizeof(count[0]) * k);
   memset(centroids, 0, sizeof(centroids[0]) * k * dim);
 
@@ -59,8 +67,7 @@
 
   for (i = 0; i < k; ++i) {
     if (count[i] == 0) {
-      // TODO(huisu): replace rand() with something else.
-      memcpy(centroids + i * dim, data + (rand() % n) * dim,
+      memcpy(centroids + i * dim, data + (lcg_rand16(&rand_state) % n) * dim,
                  sizeof(centroids[0]) * dim);
     } else {
       const double norm = 1.0 / count[i];
diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index 2e000af..cecc59c 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c
@@ -451,18 +451,16 @@
   *out_sse = this_sse >> shift;
 }
 
-static int rate_block(int plane, int block, BLOCK_SIZE plane_bsize,
+static int rate_block(int plane, int block, int blk_row, int blk_col,
                       TX_SIZE tx_size, struct rdcost_block_args* args) {
-  int x_idx, y_idx;
-  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x_idx, &y_idx);
-
-  return cost_coeffs(args->x, plane, block, args->t_above + x_idx,
-                     args->t_left + y_idx, tx_size,
+  return cost_coeffs(args->x, plane, block, args->t_above + blk_col,
+                     args->t_left + blk_row, tx_size,
                      args->so->scan, args->so->neighbors,
                      args->use_fast_coef_costing);
 }
 
-static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
+static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
+                          BLOCK_SIZE plane_bsize,
                           TX_SIZE tx_size, void *arg) {
   struct rdcost_block_args *args = arg;
   MACROBLOCK *const x = args->x;
@@ -478,20 +476,23 @@
 
   if (!is_inter_block(mbmi)) {
     struct encode_b_args arg = {x, NULL, &mbmi->skip};
-    vp10_encode_block_intra(plane, block, plane_bsize, tx_size, &arg);
+    vp10_encode_block_intra(plane, block, blk_row, blk_col,
+                            plane_bsize, tx_size, &arg);
     dist_block(x, plane, block, tx_size, &dist, &sse);
   } else if (max_txsize_lookup[plane_bsize] == tx_size) {
     if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] ==
         SKIP_TXFM_NONE) {
       // full forward transform and quantization
-      vp10_xform_quant(x, plane, block, plane_bsize, tx_size);
+      vp10_xform_quant(x, plane, block, blk_row, blk_col,
+                       plane_bsize, tx_size);
       dist_block(x, plane, block, tx_size, &dist, &sse);
     } else if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] ==
                SKIP_TXFM_AC_ONLY) {
       // compute DC coefficient
       tran_low_t *const coeff   = BLOCK_OFFSET(x->plane[plane].coeff, block);
       tran_low_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block);
-      vp10_xform_quant_dc(x, plane, block, plane_bsize, tx_size);
+      vp10_xform_quant_dc(x, plane, block, blk_row, blk_col,
+                          plane_bsize, tx_size);
       sse  = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4;
       dist = sse;
       if (x->plane[plane].eobs[block]) {
@@ -515,7 +516,7 @@
     }
   } else {
     // full forward transform and quantization
-    vp10_xform_quant(x, plane, block, plane_bsize, tx_size);
+    vp10_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size);
     dist_block(x, plane, block, tx_size, &dist, &sse);
   }
 
@@ -525,7 +526,7 @@
     return;
   }
 
-  rate = rate_block(plane, block, plane_bsize, tx_size, args);
+  rate = rate_block(plane, block, blk_row, blk_col, tx_size, args);
   rd1 = RDCOST(x->rdmult, x->rddiv, rate, dist);
   rd2 = RDCOST(x->rdmult, x->rddiv, 0, sse);
 
diff --git a/vp10/encoder/tokenize.c b/vp10/encoder/tokenize.c
index 2c9998b..e568c0b 100644
--- a/vp10/encoder/tokenize.c
+++ b/vp10/encoder/tokenize.c
@@ -443,7 +443,9 @@
   TOKENEXTRA **tp;
 };
 
-static void set_entropy_context_b(int plane, int block, BLOCK_SIZE plane_bsize,
+static void set_entropy_context_b(int plane, int block,
+                                  int blk_row, int blk_col,
+                                  BLOCK_SIZE plane_bsize,
                                   TX_SIZE tx_size, void *arg) {
   struct tokenize_b_args* const args = arg;
   ThreadData *const td = args->td;
@@ -451,10 +453,8 @@
   MACROBLOCKD *const xd = &x->e_mbd;
   struct macroblock_plane *p = &x->plane[plane];
   struct macroblockd_plane *pd = &xd->plane[plane];
-  int aoff, loff;
-  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff);
   vp10_set_contexts(xd, pd, plane_bsize, tx_size, p->eobs[block] > 0,
-                   aoff, loff);
+                    blk_col, blk_row);
 }
 
 static INLINE void add_token(TOKENEXTRA **t, const vpx_prob *context_tree,
@@ -520,7 +520,8 @@
   }
 }
 
-static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
+static void tokenize_b(int plane, int block, int blk_row, int blk_col,
+                       BLOCK_SIZE plane_bsize,
                        TX_SIZE tx_size, void *arg) {
   struct tokenize_b_args* const args = arg;
   VP10_COMP *cpi = args->cpi;
@@ -553,11 +554,8 @@
   const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size);
   int16_t token;
   EXTRABIT extra;
-  int aoff, loff;
-  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff);
-
-  pt = get_entropy_context(tx_size, pd->above_context + aoff,
-                           pd->left_context + loff);
+  pt = get_entropy_context(tx_size, pd->above_context + blk_col,
+                           pd->left_context + blk_row);
   scan = so->scan;
   nb = so->neighbors;
   c = 0;
@@ -597,20 +595,22 @@
 
   *tp = t;
 
-  vp10_set_contexts(xd, pd, plane_bsize, tx_size, c > 0, aoff, loff);
+  vp10_set_contexts(xd, pd, plane_bsize, tx_size, c > 0, blk_col, blk_row);
 }
 
 struct is_skippable_args {
   uint16_t *eobs;
   int *skippable;
 };
-static void is_skippable(int plane, int block,
+static void is_skippable(int plane, int block, int blk_row, int blk_col,
                          BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
                          void *argv) {
   struct is_skippable_args *args = argv;
   (void)plane;
   (void)plane_bsize;
   (void)tx_size;
+  (void)blk_row;
+  (void)blk_col;
   args->skippable[0] &= (!args->eobs[block]);
 }
 
@@ -624,13 +624,15 @@
   return result;
 }
 
-static void has_high_freq_coeff(int plane, int block,
+static void has_high_freq_coeff(int plane, int block, int blk_row, int blk_col,
                                 BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
                                 void *argv) {
   struct is_skippable_args *args = argv;
   int eobs = (tx_size == TX_4X4) ? 3 : 10;
   (void) plane;
   (void) plane_bsize;
+  (void) blk_row;
+  (void) blk_col;
 
   *(args->skippable) |= (args->eobs[block] > eobs);
 }
diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c
index 579857b..719e542 100644
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c
@@ -403,7 +403,6 @@
   {255, 241, 243, 255, 236, 255, 252, 254},
   {255, 243, 245, 255, 237, 255, 252, 254},
   {255, 246, 247, 255, 239, 255, 253, 255},
-  {255, 246, 247, 255, 239, 255, 253, 255},
 };
 
 static const vp9_coeff_probs_model default_coef_probs_4x4[PLANE_TYPES] = {
@@ -743,14 +742,16 @@
 };
 
 static void extend_to_full_distribution(vpx_prob *probs, vpx_prob p) {
-  memcpy(probs, vp9_pareto8_full[p = 0 ? 0 : p - 1],
-         MODEL_NODES * sizeof(vpx_prob));
+  memcpy(probs, vp9_pareto8_full[p - 1], MODEL_NODES * sizeof(vpx_prob));
 }
 
 void vp9_model_to_full_probs(const vpx_prob *model, vpx_prob *full) {
   if (full != model)
     memcpy(full, model, sizeof(vpx_prob) * UNCONSTRAINED_NODES);
-  extend_to_full_distribution(&full[UNCONSTRAINED_NODES], model[PIVOT_NODE]);
+  // TODO(aconverse): model[PIVOT_NODE] should never be zero.
+  // https://code.google.com/p/webm/issues/detail?id=1089
+  if (model[PIVOT_NODE] != 0)
+    extend_to_full_distribution(&full[UNCONSTRAINED_NODES], model[PIVOT_NODE]);
 }
 
 void vp9_default_coef_probs(VP9_COMMON *cm) {
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index 21611ed..63b3bff 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -138,7 +138,7 @@
 // 1, 3, 5, 7, ..., 253, 255
 // In between probabilities are interpolated linearly
 
-#define COEFF_PROB_MODELS 256
+#define COEFF_PROB_MODELS 255
 
 #define UNCONSTRAINED_NODES         3
 
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index ed5f4ca..5bf71ef 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -248,7 +248,7 @@
   specialize qw/vp9_highbd_block_error/, "$sse2_x86inc";
 
   add_proto qw/int64_t vp9_highbd_block_error_8bit/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
-  specialize qw/vp9_highbd_block_error_8bit/, "$sse2_x86inc";
+  specialize qw/vp9_highbd_block_error_8bit/, "$sse2_x86inc", "$avx_x86inc";
 
   add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vp9_quantize_fp/;
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c
index 2cd89c0..bdc95d4 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -486,10 +486,17 @@
   // Account for larger interval on base layer for temporal layers.
   if (cr->percent_refresh > 0 &&
       rc->frames_since_key <  (4 * cpi->svc.number_temporal_layers) *
-      (100 / cr->percent_refresh))
+      (100 / cr->percent_refresh)) {
     cr->rate_ratio_qdelta = 3.0;
-  else
+  } else {
     cr->rate_ratio_qdelta = 2.0;
+#if CONFIG_VP9_TEMPORAL_DENOISING
+  if (cpi->oxcf.noise_sensitivity > 0 &&
+      cpi->denoiser.denoising_level >= kMedium)
+    // Reduce the delta-qp if the estimated source noise is above threshold.
+    cr->rate_ratio_qdelta = 1.5;
+#endif
+  }
   // Adjust some parameters for low resolutions at low bitrates.
   if (cm->width <= 352 &&
       cm->height <= 288 &&
diff --git a/vp9/encoder/vp9_context_tree.c b/vp9/encoder/vp9_context_tree.c
index e87cccb..396ed3f 100644
--- a/vp9/encoder/vp9_context_tree.c
+++ b/vp9/encoder/vp9_context_tree.c
@@ -30,13 +30,13 @@
   for (i = 0; i < MAX_MB_PLANE; ++i) {
     for (k = 0; k < 3; ++k) {
       CHECK_MEM_ERROR(cm, ctx->coeff[i][k],
-                      vpx_memalign(16, num_pix * sizeof(*ctx->coeff[i][k])));
+                      vpx_memalign(32, num_pix * sizeof(*ctx->coeff[i][k])));
       CHECK_MEM_ERROR(cm, ctx->qcoeff[i][k],
-                      vpx_memalign(16, num_pix * sizeof(*ctx->qcoeff[i][k])));
+                      vpx_memalign(32, num_pix * sizeof(*ctx->qcoeff[i][k])));
       CHECK_MEM_ERROR(cm, ctx->dqcoeff[i][k],
-                      vpx_memalign(16, num_pix * sizeof(*ctx->dqcoeff[i][k])));
+                      vpx_memalign(32, num_pix * sizeof(*ctx->dqcoeff[i][k])));
       CHECK_MEM_ERROR(cm, ctx->eobs[i][k],
-                      vpx_memalign(16, num_blk * sizeof(*ctx->eobs[i][k])));
+                      vpx_memalign(32, num_blk * sizeof(*ctx->eobs[i][k])));
       ctx->coeff_pbuf[i][k]   = ctx->coeff[i][k];
       ctx->qcoeff_pbuf[i][k]  = ctx->qcoeff[i][k];
       ctx->dqcoeff_pbuf[i][k] = ctx->dqcoeff[i][k];
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index 678e312..05aa1d2 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -10,6 +10,8 @@
 
 #include <assert.h>
 #include <limits.h>
+#include <math.h>
+
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_scale/yv12config.h"
@@ -17,13 +19,9 @@
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/encoder/vp9_context_tree.h"
 #include "vp9/encoder/vp9_denoiser.h"
+#include "vp9/encoder/vp9_encoder.h"
 
-/* The VP9 denoiser is a work-in-progress. It currently is only designed to work
- * with speed 6, though it (inexplicably) seems to also work with speed 5 (one
- * would need to modify the source code in vp9_pickmode.c and vp9_encoder.c to
- * make the calls to the vp9_denoiser_* functions when in speed 5).
- *
- * The implementation is very similar to that of the VP8 denoiser. While
+/* The VP9 denoiser is similar to that of the VP8 denoiser. While
  * choosing the motion vectors / reference frames, the denoiser is run, and if
  * it did not modify the signal to much, the denoised block is copied to the
  * signal.
@@ -195,7 +193,7 @@
                                                          int mi_row,
                                                          int mi_col,
                                                          PICK_MODE_CONTEXT *ctx,
-                                                         int *motion_magnitude,
+                                                         int motion_magnitude,
                                                          int is_skin) {
   int mv_col, mv_row;
   int sse_diff = ctx->zeromv_sse - ctx->newmv_sse;
@@ -209,18 +207,17 @@
 
   mv_col = ctx->best_sse_mv.as_mv.col;
   mv_row = ctx->best_sse_mv.as_mv.row;
-  *motion_magnitude = mv_row * mv_row + mv_col * mv_col;
   frame = ctx->best_reference_frame;
 
   saved_mbmi = *mbmi;
 
-  if (is_skin && *motion_magnitude > 16)
+  if (is_skin && motion_magnitude > 16)
     return COPY_BLOCK;
 
   // If the best reference frame uses inter-prediction and there is enough of a
   // difference in sum-squared-error, use it.
   if (frame != INTRA_FRAME &&
-      sse_diff > sse_diff_thresh(bs, increase_denoising, *motion_magnitude)) {
+      sse_diff > sse_diff_thresh(bs, increase_denoising, motion_magnitude)) {
     mbmi->ref_frame[0] = ctx->best_reference_frame;
     mbmi->mode = ctx->best_sse_inter_mode;
     mbmi->mv[0] = ctx->best_sse_mv;
@@ -242,7 +239,7 @@
     *mbmi = saved_mbmi;
     return COPY_BLOCK;
   }
-  if (*motion_magnitude >
+  if (motion_magnitude >
      (noise_motion_thresh(bs, increase_denoising) << 3)) {
     // Restore everything to its original state
     *mbmi = saved_mbmi;
@@ -315,6 +312,7 @@
 void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
                           int mi_row, int mi_col, BLOCK_SIZE bs,
                           PICK_MODE_CONTEXT *ctx) {
+  int mv_col, mv_row;
   int motion_magnitude = 0;
   VP9_DENOISER_DECISION decision = COPY_BLOCK;
   YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME];
@@ -325,7 +323,7 @@
   struct buf_2d src = mb->plane[0].src;
   int is_skin = 0;
 
-  if (bs <= BLOCK_16X16) {
+  if (bs <= BLOCK_16X16 && denoiser->denoising_level >= kMedium) {
     // Take center pixel in block to determine is_skin.
     const int y_width_shift = (4 << b_width_log2_lookup[bs]) >> 1;
     const int y_height_shift = (4 << b_height_log2_lookup[bs]) >> 1;
@@ -342,17 +340,28 @@
     is_skin = vp9_skin_pixel(ysource, usource, vsource);
   }
 
-  decision = perform_motion_compensation(denoiser, mb, bs,
-                                         denoiser->increase_denoising,
-                                         mi_row, mi_col, ctx,
-                                         &motion_magnitude,
-                                         is_skin);
+  mv_col = ctx->best_sse_mv.as_mv.col;
+  mv_row = ctx->best_sse_mv.as_mv.row;
+  motion_magnitude = mv_row * mv_row + mv_col * mv_col;
+  if (denoiser->denoising_level == kHigh && motion_magnitude < 16) {
+    denoiser->increase_denoising = 1;
+  } else {
+    denoiser->increase_denoising = 0;
+  }
+
+  if (denoiser->denoising_level >= kMedium)
+    decision = perform_motion_compensation(denoiser, mb, bs,
+                                           denoiser->increase_denoising,
+                                           mi_row, mi_col, ctx,
+                                           motion_magnitude,
+                                           is_skin);
 
   if (decision == FILTER_BLOCK) {
     decision = vp9_denoiser_filter(src.buf, src.stride,
                                  mc_avg_start, mc_avg.y_stride,
                                  avg_start, avg.y_stride,
-                                 0, bs, motion_magnitude);
+                                 denoiser->increase_denoising,
+                                 bs, motion_magnitude);
   }
 
   if (decision == FILTER_BLOCK) {
@@ -499,14 +508,43 @@
     vp9_denoiser_free(denoiser);
     return 1;
   }
+
+  fail = vpx_alloc_frame_buffer(&denoiser->last_source, width, height,
+                                ssx, ssy,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                use_highbitdepth,
+#endif
+                                border, legacy_byte_alignment);
+  if (fail) {
+    vp9_denoiser_free(denoiser);
+    return 1;
+  }
 #ifdef OUTPUT_YUV_DENOISED
   make_grayscale(&denoiser->running_avg_y[i]);
 #endif
   denoiser->increase_denoising = 0;
   denoiser->frame_buffer_initialized = 1;
+  vp9_denoiser_init_noise_estimate(denoiser, width, height);
   return 0;
 }
 
+void vp9_denoiser_init_noise_estimate(VP9_DENOISER *denoiser,
+                                      int width,
+                                      int height) {
+  // Denoiser is off by default, i.e., no denoising is performed.
+  // Noise level is measured periodically, and if observed to be above
+  // thresh_noise_estimate, then denoising is performed.
+  denoiser->denoising_level = kLow;
+  denoiser->noise_estimate = 0;
+  denoiser->noise_estimate_count = 0;
+  denoiser->thresh_noise_estimate = 20;
+  if (width * height >= 1920 * 1080) {
+    denoiser->thresh_noise_estimate = 70;
+  } else if (width * height >= 1280 * 720) {
+    denoiser->thresh_noise_estimate = 40;
+  }
+}
+
 void vp9_denoiser_free(VP9_DENOISER *denoiser) {
   int i;
   denoiser->frame_buffer_initialized = 0;
@@ -517,6 +555,125 @@
     vpx_free_frame_buffer(&denoiser->running_avg_y[i]);
   }
   vpx_free_frame_buffer(&denoiser->mc_running_avg_y);
+  vpx_free_frame_buffer(&denoiser->last_source);
+}
+
+void vp9_denoiser_update_noise_estimate(VP9_COMP *const cpi) {
+  const VP9_COMMON *const cm = &cpi->common;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  int frame_period = 10;
+  int thresh_consec_zeromv = 8;
+  unsigned int thresh_sum_diff = 128;
+  int num_frames_estimate = 20;
+  int min_blocks_estimate = cm->mi_rows * cm->mi_cols >> 7;
+  // Estimate of noise level every frame_period frames.
+  // Estimate is between current source and last source.
+  if (cm->current_video_frame % frame_period != 0 ||
+     cpi->denoiser.last_source.y_buffer == NULL) {
+    copy_frame(&cpi->denoiser.last_source, cpi->Source);
+    return;
+  } else {
+    int num_samples = 0;
+    uint64_t avg_est = 0;
+    int bsize = BLOCK_16X16;
+    static const unsigned char const_source[16] = {
+         128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+         128, 128};
+    // Loop over sub-sample of 16x16 blocks of frame, and for blocks that have
+    // been encoded as zero/small mv at least x consecutive frames, compute
+    // the variance to update estimate of noise in the source.
+    const uint8_t *src_y = cpi->Source->y_buffer;
+    const int src_ystride = cpi->Source->y_stride;
+    const uint8_t *last_src_y = cpi->denoiser.last_source.y_buffer;
+    const int last_src_ystride = cpi->denoiser.last_source.y_stride;
+    const uint8_t *src_u = cpi->Source->u_buffer;
+    const uint8_t *src_v = cpi->Source->v_buffer;
+    const int src_uvstride = cpi->Source->uv_stride;
+    const int y_width_shift = (4 << b_width_log2_lookup[bsize]) >> 1;
+    const int y_height_shift = (4 << b_height_log2_lookup[bsize]) >> 1;
+    const int uv_width_shift = y_width_shift >> 1;
+    const int uv_height_shift = y_height_shift >> 1;
+    int mi_row, mi_col;
+    for (mi_row = 0; mi_row < cm->mi_rows; mi_row ++) {
+      for (mi_col = 0; mi_col < cm->mi_cols; mi_col ++) {
+        // 16x16 blocks, 1/4 sample of frame.
+        if (mi_row % 4 == 0 && mi_col % 4 == 0) {
+          int bl_index = mi_row * cm->mi_cols + mi_col;
+          int bl_index1 = bl_index + 1;
+          int bl_index2 = bl_index + cm->mi_cols;
+          int bl_index3 = bl_index2 + 1;
+          // Only consider blocks that are likely steady background, i.e., have
+          // been encoded as zero/low motion x (= thresh_consec_zeromv) frames
+          // in a row. consec_zero_mv[] defined for 8x8 blocks, so consider all
+          // 4 sub-blocks for 16x16 block. Also, avoid skin blocks.
+          const uint8_t ysource =
+            src_y[y_height_shift * src_ystride + y_width_shift];
+          const uint8_t usource =
+            src_u[uv_height_shift * src_uvstride + uv_width_shift];
+          const uint8_t vsource =
+            src_v[uv_height_shift * src_uvstride + uv_width_shift];
+          int is_skin = vp9_skin_pixel(ysource, usource, vsource);
+          if (cr->consec_zero_mv[bl_index] > thresh_consec_zeromv &&
+              cr->consec_zero_mv[bl_index1] > thresh_consec_zeromv &&
+              cr->consec_zero_mv[bl_index2] > thresh_consec_zeromv &&
+              cr->consec_zero_mv[bl_index3] > thresh_consec_zeromv &&
+              !is_skin) {
+            // Compute variance.
+            unsigned int sse;
+            unsigned int variance = cpi->fn_ptr[bsize].vf(src_y,
+                                                          src_ystride,
+                                                          last_src_y,
+                                                          last_src_ystride,
+                                                          &sse);
+            // Only consider this block as valid for noise measurement if the
+            // average term (sse - variance = N * avg^{2}, N = 16X16) of the
+            // temporal residual is small (avoid effects from lighting change).
+            if ((sse - variance) < thresh_sum_diff) {
+              unsigned int sse2;
+              const unsigned int spatial_variance =
+                  cpi->fn_ptr[bsize].vf(src_y, src_ystride, const_source,
+                                        0, &sse2);
+              avg_est += variance / (10 + spatial_variance);
+              num_samples++;
+            }
+          }
+        }
+        src_y += 8;
+        last_src_y += 8;
+        src_u += 4;
+        src_v += 4;
+      }
+      src_y += (src_ystride << 3) - (cm->mi_cols << 3);
+      last_src_y += (last_src_ystride << 3) - (cm->mi_cols << 3);
+      src_u += (src_uvstride << 2) - (cm->mi_cols << 2);
+      src_v += (src_uvstride << 2) - (cm->mi_cols << 2);
+    }
+    // Update noise estimate if we have at least the minimum number of block
+    // and avg_est > 0 (avg_est == 0 can happen if the application inputs
+    // duplicate frames).
+    if (num_samples > min_blocks_estimate && avg_est > 0) {
+      // Normalize.
+      avg_est = (avg_est << 8) / num_samples;
+      // Update noise estimate.
+      cpi->denoiser.noise_estimate =  (3 * cpi->denoiser.noise_estimate +
+          avg_est) >> 2;
+      cpi->denoiser.noise_estimate_count++;
+      if (cpi->denoiser.noise_estimate_count == num_frames_estimate) {
+        // Reset counter and check noise level condition.
+        cpi->denoiser.noise_estimate_count = 0;
+        if (cpi->denoiser.noise_estimate >
+            (cpi->denoiser.thresh_noise_estimate << 1))
+          cpi->denoiser.denoising_level = kHigh;
+        else
+          if (cpi->denoiser.noise_estimate >
+              cpi->denoiser.thresh_noise_estimate)
+            cpi->denoiser.denoising_level = kMedium;
+          else
+            cpi->denoiser.denoising_level = kLow;
+      }
+    }
+  }
+  copy_frame(&cpi->denoiser.last_source, cpi->Source);
 }
 
 #ifdef OUTPUT_YUV_DENOISED
diff --git a/vp9/encoder/vp9_denoiser.h b/vp9/encoder/vp9_denoiser.h
index ec0b25e..c149e9d 100644
--- a/vp9/encoder/vp9_denoiser.h
+++ b/vp9/encoder/vp9_denoiser.h
@@ -26,13 +26,26 @@
   FILTER_BLOCK
 } VP9_DENOISER_DECISION;
 
+typedef enum vp9_denoiser_level {
+  kLow,
+  kMedium,
+  kHigh
+} VP9_DENOISER_LEVEL;
+
 typedef struct vp9_denoiser {
   YV12_BUFFER_CONFIG running_avg_y[MAX_REF_FRAMES];
   YV12_BUFFER_CONFIG mc_running_avg_y;
+  YV12_BUFFER_CONFIG last_source;
   int increase_denoising;
   int frame_buffer_initialized;
+  VP9_DENOISER_LEVEL denoising_level;
+  int noise_estimate;
+  int thresh_noise_estimate;
+  int noise_estimate_count;
 } VP9_DENOISER;
 
+struct VP9_COMP;
+
 void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser,
                                     YV12_BUFFER_CONFIG src,
                                     FRAME_TYPE frame_type,
@@ -69,6 +82,12 @@
 
 void vp9_denoiser_free(VP9_DENOISER *denoiser);
 
+void vp9_denoiser_init_noise_estimate(VP9_DENOISER *denoiser,
+                                      int width,
+                                      int height);
+
+void vp9_denoiser_update_noise_estimate(struct VP9_COMP *const cpi);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 2333a13..1f5709c 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -481,7 +481,7 @@
   VP9_COMMON *const cm = &cpi->common;
   const int is_key_frame = (cm->frame_type == KEY_FRAME);
   const int threshold_multiplier = is_key_frame ? 20 : 1;
-  const int64_t threshold_base = (int64_t)(threshold_multiplier *
+  int64_t threshold_base = (int64_t)(threshold_multiplier *
       cpi->y_dequant[q][1]);
   if (is_key_frame) {
     thresholds[0] = threshold_base;
@@ -489,6 +489,16 @@
     thresholds[2] = threshold_base >> 2;
     thresholds[3] = threshold_base << 2;
   } else {
+#if CONFIG_VP9_TEMPORAL_DENOISING
+  if (cpi->oxcf.noise_sensitivity > 0) {
+    // Increase base variance threshold if estimated noise level is high.
+    if (cpi->denoiser.denoising_level == kHigh)
+      threshold_base = threshold_base << 2;
+    else
+      if (cpi->denoiser.denoising_level == kMedium)
+        threshold_base = threshold_base << 1;
+  }
+#endif
     thresholds[1] = threshold_base;
     if (cm->width <= 352 && cm->height <= 288) {
       thresholds[0] = threshold_base >> 2;
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 5b75d67..72eafec 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -3250,6 +3250,13 @@
                                              &cpi->scaled_last_source,
                                              (cpi->oxcf.pass == 0));
 
+#if CONFIG_VP9_TEMPORAL_DENOISING
+  if (cpi->oxcf.noise_sensitivity > 0 &&
+      cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+    vp9_denoiser_update_noise_estimate(cpi);
+  }
+#endif
+
   if (cpi->oxcf.pass == 0 &&
       cpi->oxcf.rc_mode == VPX_CBR &&
       cpi->resize_state == 0 &&
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index a6b5ebb..30738b5 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -1183,10 +1183,13 @@
                                      double group_weight_factor) {
   const RATE_CONTROL *const rc = &cpi->rc;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  // Clamp the target rate to VBR min / max limits.
+  const int target_rate =
+      vp9_rc_clamp_pframe_target_size(cpi, section_target_bandwidth);
 
   inactive_zone = fclamp(inactive_zone, 0.0, 1.0);
 
-  if (section_target_bandwidth <= 0) {
+  if (target_rate <= 0) {
     return rc->worst_quality;  // Highest value allowed
   } else {
     const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
@@ -1195,7 +1198,7 @@
     const double av_err_per_mb = section_err / active_mbs;
     const double speed_term = 1.0 + 0.04 * oxcf->speed;
     const double ediv_size_correction = (double)num_mbs / EDIV_SIZE_FACTOR;
-    const int target_norm_bits_per_mb = ((uint64_t)section_target_bandwidth <<
+    const int target_norm_bits_per_mb = ((uint64_t)target_rate <<
                                          BPER_MB_NORMBITS) / active_mbs;
 
     int q;
@@ -2444,7 +2447,7 @@
     if ((i <= rc->max_gf_interval) ||
         ((i <= (rc->max_gf_interval * 4)) && (decay_accumulator > 0.5))) {
       const double frame_boost =
-        calc_frame_boost(cpi, this_frame, 0, KF_MAX_BOOST);
+        calc_frame_boost(cpi, &next_frame, 0, KF_MAX_BOOST);
 
       // How fast is prediction quality decaying.
       if (!detect_flash(twopass, 0)) {
@@ -2737,11 +2740,6 @@
   }
 
   target_rate = gf_group->bit_allocation[gf_group->index];
-  if (cpi->common.frame_type == KEY_FRAME)
-    target_rate = vp9_rc_clamp_iframe_target_size(cpi, target_rate);
-  else
-    target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate);
-
   rc->base_frame_target = target_rate;
 
   {
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index fc4d9ae..af04583 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -1068,6 +1068,21 @@
     {GOLDEN_FRAME, NEWMV}
 };
 
+int set_intra_cost_penalty(const VP9_COMP *const cpi, BLOCK_SIZE bsize) {
+  const VP9_COMMON *const cm = &cpi->common;
+  // Reduce the intra cost penalty for small blocks (<=16x16).
+  int reduction_fac =
+      (bsize <= BLOCK_16X16) ? ((bsize <= BLOCK_8X8) ? 4 : 2) : 0;
+#if CONFIG_VP9_TEMPORAL_DENOISING
+  if (cpi->oxcf.noise_sensitivity > 0 &&
+      cpi->denoiser.denoising_level == kHigh)
+     // Don't reduce intra cost penalty if estimated noise level is high.
+     reduction_fac = 0;
+#endif
+  return vp9_get_intra_cost_penalty(
+      cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth) >> reduction_fac;
+}
+
 // TODO(jingning) placeholder for inter-frame non-RD mode decision.
 // this needs various further optimizations. to be continued..
 void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
@@ -1094,11 +1109,7 @@
   // var_y and sse_y are saved to be used in skipping checking
   unsigned int var_y = UINT_MAX;
   unsigned int sse_y = UINT_MAX;
-  // Reduce the intra cost penalty for small blocks (<=16x16).
-  const int reduction_fac = (bsize <= BLOCK_16X16) ?
-      ((bsize <= BLOCK_8X8) ? 4 : 2) : 0;
-  const int intra_cost_penalty = vp9_get_intra_cost_penalty(
-      cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth) >> reduction_fac;
+  const int intra_cost_penalty = set_intra_cost_penalty(cpi, bsize);
   const int64_t inter_mode_thresh = RDCOST(x->rdmult, x->rddiv,
                                            intra_cost_penalty, 0);
   const int *const rd_threshes = cpi->rd.threshes[mbmi->segment_id][bsize];
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 1d13199..d700685 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -1816,6 +1816,11 @@
   RATE_CONTROL *const rc = &cpi->rc;
   int target_rate = rc->base_frame_target;
 
+  if (cpi->common.frame_type == KEY_FRAME)
+    target_rate = vp9_rc_clamp_iframe_target_size(cpi, target_rate);
+  else
+    target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate);
+
   // Correction to rate target based on prior over or under shoot.
   if (cpi->oxcf.rc_mode == VPX_VBR || cpi->oxcf.rc_mode == VPX_CQ)
     vbr_rate_correction(cpi, &target_rate);
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 1944291..4f3a06e 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -296,30 +296,11 @@
                                       const tran_low_t *dqcoeff,
                                       intptr_t block_size,
                                       int64_t *ssz) {
-  int i;
-  int32_t c, d;
-  int64_t error = 0, sqcoeff = 0;
-  int16_t diff;
-
-  const int32_t hi = 0x00007fff;
-  const int32_t lo = 0xffff8000;
-
-  for (i = 0; i < block_size; i++) {
-    c = coeff[i];
-    d = dqcoeff[i];
-
-    // Saturate to 16 bits
-    c = (c > hi) ? hi : ((c < lo) ? lo : c);
-    d = (d > hi) ? hi : ((d < lo) ? lo : d);
-
-    diff = d - c;
-    error +=  diff * diff;
-    sqcoeff += c * c;
-  }
-  assert(error >= 0 && sqcoeff >= 0);
-
-  *ssz = sqcoeff;
-  return error;
+  // Note that the C versions of these 2 functions (vp9_block_error and
+  // vp9_highbd_block_error_8bit) are the same, but the optimized assembly
+  // routines are not compatible in the non high bitdepth configuration, so
+  // they still cannot share the same name.
+  return vp9_block_error_c(coeff, dqcoeff, block_size, ssz);
 }
 
 static int64_t vp9_highbd_block_error_dispatch(const tran_low_t *coeff,
diff --git a/vp9/encoder/x86/vp9_highbd_error_avx.asm b/vp9/encoder/x86/vp9_highbd_error_avx.asm
new file mode 100644
index 0000000..e476323
--- /dev/null
+++ b/vp9/encoder/x86/vp9_highbd_error_avx.asm
@@ -0,0 +1,261 @@
+;
+;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%define private_prefix vp9
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+ALIGN 16
+
+;
+; int64_t vp9_highbd_block_error_8bit(int32_t *coeff, int32_t *dqcoeff,
+;                                     intptr_t block_size, int64_t *ssz)
+;
+
+INIT_XMM avx
+cglobal highbd_block_error_8bit, 4, 5, 8, uqc, dqc, size, ssz
+  vzeroupper
+
+  ; If only one iteration is required, then handle this as a special case.
+  ; It is the most frequent case, so we can have a significant gain here
+  ; by not setting up a loop and accumulators.
+  cmp    sizeq, 16
+  jne   .generic
+
+  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+  ;; Common case of size == 16
+  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+  ; Load input vectors
+  mova      xm0, [dqcq]
+  packssdw  xm0, [dqcq+16]
+  mova      xm2, [uqcq]
+  packssdw  xm2, [uqcq+16]
+
+  mova      xm1, [dqcq+32]
+  packssdw  xm1, [dqcq+48]
+  mova      xm3, [uqcq+32]
+  packssdw  xm3, [uqcq+48]
+
+  ; Compute the errors.
+  psubw     xm0, xm2
+  psubw     xm1, xm3
+
+  ; Individual errors are max 15bit+sign, so squares are 30bit, and
+  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit).
+  pmaddwd   xm2, xm2
+  pmaddwd   xm3, xm3
+
+  pmaddwd   xm0, xm0
+  pmaddwd   xm1, xm1
+
+  ; Squares are always positive, so we can use unsigned arithmetic after
+  ; squaring. As mentioned earlier 2 sums fit in 31 bits, so 4 sums will
+  ; fit in 32bits
+  paddd     xm2, xm3
+  paddd     xm0, xm1
+
+  ; Accumulate horizontally in 64 bits, there is no chance of overflow here
+  pxor      xm5, xm5
+
+  pblendw   xm3, xm5, xm2, 0x33 ; Zero extended  low of a pair of 32 bits
+  psrlq     xm2, 32             ; Zero extended high of a pair of 32 bits
+
+  pblendw   xm1, xm5, xm0, 0x33 ; Zero extended  low of a pair of 32 bits
+  psrlq     xm0, 32             ; Zero extended high of a pair of 32 bits
+
+  paddq     xm2, xm3
+  paddq     xm0, xm1
+
+  psrldq    xm3, xm2, 8
+  psrldq    xm1, xm0, 8
+
+  paddq     xm2, xm3
+  paddq     xm0, xm1
+
+  ; Store the return value
+%if ARCH_X86_64
+  movq      rax, xm0
+  movq   [sszq], xm2
+%else
+  movd      eax, xm0
+  pextrd    edx, xm0, 1
+  movq   [sszd], xm2
+%endif
+  RET
+
+  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+  ;; Generic case of size != 16, speculative low precision
+  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+  ALIGN 16
+.generic:
+  pxor      xm4, xm4                ; sse accumulator
+  pxor      xm5, xm5                ; overflow detection register for xm4
+  pxor      xm6, xm6                ; ssz accumulator
+  pxor      xm7, xm7                ; overflow detection register for xm6
+  lea      uqcq, [uqcq+sizeq*4]
+  lea      dqcq, [dqcq+sizeq*4]
+  neg     sizeq
+
+  ; Push the negative size as the high precision code might need it
+  push    sizeq
+
+.loop:
+  ; Load input vectors
+  mova      xm0, [dqcq+sizeq*4]
+  packssdw  xm0, [dqcq+sizeq*4+16]
+  mova      xm2, [uqcq+sizeq*4]
+  packssdw  xm2, [uqcq+sizeq*4+16]
+
+  mova      xm1, [dqcq+sizeq*4+32]
+  packssdw  xm1, [dqcq+sizeq*4+48]
+  mova      xm3, [uqcq+sizeq*4+32]
+  packssdw  xm3, [uqcq+sizeq*4+48]
+
+  add     sizeq, 16
+
+  ; Compute the squared errors.
+  ; Individual errors are max 15bit+sign, so squares are 30bit, and
+  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit).
+  psubw     xm0, xm2
+  pmaddwd   xm2, xm2
+  pmaddwd   xm0, xm0
+
+  psubw     xm1, xm3
+  pmaddwd   xm3, xm3
+  pmaddwd   xm1, xm1
+
+  ; Squares are always positive, so we can use unsigned arithmetic after
+  ; squaring. As mentioned earlier 2 sums fit in 31 bits, so 4 sums will
+  ; fit in 32bits
+  paddd     xm2, xm3
+  paddd     xm0, xm1
+
+  ; We accumulate using 32 bit arithmetic, but detect potential overflow
+  ; by checking if the MSB of the accumulators have ever been a set bit.
+  ; If yes, we redo the whole compute at the end on higher precision, but
+  ; this happens extremely rarely, so we still achieve a net gain.
+  paddd     xm4, xm0
+  paddd     xm6, xm2
+  por       xm5, xm4  ; OR in the accumulator for overflow detection
+  por       xm7, xm6  ; OR in the accumulator for overflow detection
+
+  jnz .loop
+
+  ; Add pairs horizontally (still only on 32 bits)
+  phaddd    xm4, xm4
+  por       xm5, xm4  ; OR in the accumulator for overflow detection
+  phaddd    xm6, xm6
+  por       xm7, xm6  ; OR in the accumulator for overflow detection
+
+  ; Check for possibility of overflow by testing if bit 32 of each dword lane
+  ; have ever been set. If they were not, then there was no overflow and the
+  ; final sum will fit in 32 bits. If overflow happened, then
+  ; we redo the whole computation on higher precision.
+  por       xm7, xm5
+  pmovmskb   r4, xm7
+  test       r4, 0x8888
+  jnz .highprec
+
+  phaddd    xm4, xm4
+  phaddd    xm6, xm6
+  pmovzxdq  xm4, xm4
+  pmovzxdq  xm6, xm6
+
+  ; Restore stack
+  pop     sizeq
+
+  ; Store the return value
+%if ARCH_X86_64
+  movq      rax, xm4
+  movq   [sszq], xm6
+%else
+  movd      eax, xm4
+  pextrd    edx, xm4, 1
+  movq   [sszd], xm6
+%endif
+  RET
+
+  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+  ;; Generic case of size != 16, high precision case
+  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+.highprec:
+  pxor      xm4, xm4                 ; sse accumulator
+  pxor      xm5, xm5                 ; dedicated zero register
+  pxor      xm6, xm6                 ; ssz accumulator
+  pop     sizeq
+
+.loophp:
+  mova      xm0, [dqcq+sizeq*4]
+  packssdw  xm0, [dqcq+sizeq*4+16]
+  mova      xm2, [uqcq+sizeq*4]
+  packssdw  xm2, [uqcq+sizeq*4+16]
+
+  mova      xm1, [dqcq+sizeq*4+32]
+  packssdw  xm1, [dqcq+sizeq*4+48]
+  mova      xm3, [uqcq+sizeq*4+32]
+  packssdw  xm3, [uqcq+sizeq*4+48]
+
+  add     sizeq, 16
+
+  ; individual errors are max. 15bit+sign, so squares are 30bit, and
+  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
+
+  psubw     xm0, xm2
+  pmaddwd   xm2, xm2
+  pmaddwd   xm0, xm0
+
+  psubw     xm1, xm3
+  pmaddwd   xm3, xm3
+  pmaddwd   xm1, xm1
+
+  ; accumulate in 64bit
+  punpckldq xm7, xm0, xm5
+  punpckhdq xm0, xm5
+  paddq     xm4, xm7
+
+  punpckldq xm7, xm2, xm5
+  punpckhdq xm2, xm5
+  paddq     xm6, xm7
+
+  punpckldq xm7, xm1, xm5
+  punpckhdq xm1, xm5
+  paddq     xm4, xm7
+
+  punpckldq xm7, xm3, xm5
+  punpckhdq xm3, xm5
+  paddq     xm6, xm7
+
+  paddq     xm4, xm0
+  paddq     xm4, xm1
+  paddq     xm6, xm2
+  paddq     xm6, xm3
+
+  jnz .loophp
+
+  ; Accumulate horizontally
+  movhlps   xm5, xm4
+  movhlps   xm7, xm6
+  paddq     xm4, xm5
+  paddq     xm6, xm7
+
+  ; Store the return value
+%if ARCH_X86_64
+  movq      rax, xm4
+  movq   [sszq], xm6
+%else
+  movd      eax, xm4
+  pextrd    edx, xm4, 1
+  movq   [sszd], xm6
+%endif
+  RET
+
+END
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index a2cbacf..25a176f 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -102,6 +102,7 @@
 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_error_sse2.asm
+VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_highbd_error_avx.asm
 else
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
 endif
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index 31d8c75..9620eaa 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -248,7 +248,8 @@
 endif
 ifeq ($(ARCH_X86_64),yes)
 ifeq ($(CONFIG_USE_X86INC),yes)
-DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3_x86_64.asm
+DSP_SRCS-$(HAVE_SSSE3)  += x86/quantize_ssse3_x86_64.asm
+DSP_SRCS-$(HAVE_AVX)    += x86/quantize_avx_x86_64.asm
 endif
 endif
 endif  # CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 0117f17..b369b05 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -913,25 +913,19 @@
 # Quantization
 #
 if ((vpx_config("CONFIG_VP9_ENCODER") eq "yes") || (vpx_config("CONFIG_VP10_ENCODER") eq "yes")) {
-if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vpx_quantize_b sse2/, "$ssse3_x86_64_x86inc";
+  specialize qw/vpx_quantize_b sse2/, "$ssse3_x86_64_x86inc", "$avx_x86_64_x86inc";
 
   add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vpx_quantize_b_32x32/, "$ssse3_x86_64_x86inc";
+  specialize qw/vpx_quantize_b_32x32/, "$ssse3_x86_64_x86inc", "$avx_x86_64_x86inc";
 
-  add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vpx_highbd_quantize_b sse2/;
+  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+    add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+    specialize qw/vpx_highbd_quantize_b sse2/;
 
-  add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vpx_highbd_quantize_b_32x32 sse2/;
-} else {
-  add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vpx_quantize_b sse2/, "$ssse3_x86_64_x86inc";
-
-  add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vpx_quantize_b_32x32/, "$ssse3_x86_64_x86inc";
-}  # CONFIG_VP9_HIGHBITDEPTH
+    add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+    specialize qw/vpx_highbd_quantize_b_32x32 sse2/;
+  }  # CONFIG_VP9_HIGHBITDEPTH
 }  # CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
 
 if (vpx_config("CONFIG_ENCODERS") eq "yes") {
diff --git a/vpx_dsp/x86/quantize_avx_x86_64.asm b/vpx_dsp/x86/quantize_avx_x86_64.asm
new file mode 100644
index 0000000..01c4129
--- /dev/null
+++ b/vpx_dsp/x86/quantize_avx_x86_64.asm
@@ -0,0 +1,544 @@
+;
+;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro QUANTIZE_FN 2
+cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
+                                shift, qcoeff, dqcoeff, dequant, \
+                                eob, scan, iscan
+
+  vzeroupper
+
+  ; If we can skip this block, then just zero the output
+  cmp                         skipmp, 0
+  jne .blank
+
+%ifnidn %1, b_32x32
+
+  ; Special case for ncoeff == 16, as it is frequent and we can save on
+  ; not setting up a loop.
+  cmp                       ncoeffmp, 16
+  jne .generic
+
+  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+  ;; Special case of ncoeff == 16
+  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+.single:
+
+  movifnidn                   coeffq, coeffmp
+  movifnidn                    zbinq, zbinmp
+  mova                            m0, [zbinq]              ; m0 = zbin
+
+  ; Get DC and first 15 AC coeffs - in this special case, that is all.
+%if CONFIG_VP9_HIGHBITDEPTH
+  ; coeff stored as 32bit numbers but we process them as 16 bit numbers
+  mova                            m9, [coeffq]
+  packssdw                        m9, [coeffq+16]          ; m9 = c[i]
+  mova                           m10, [coeffq+32]
+  packssdw                       m10, [coeffq+48]          ; m10 = c[i]
+%else
+  mova                            m9, [coeffq]             ; m9 = c[i]
+  mova                           m10, [coeffq+16]          ; m10 = c[i]
+%endif
+
+  mov                             r0, eobmp                ; Output pointer
+  mov                             r1, qcoeffmp             ; Output pointer
+  mov                             r2, dqcoeffmp            ; Output pointer
+
+  pxor                            m5, m5                   ; m5 = dedicated zero
+
+  pcmpeqw                         m4, m4                   ; All word lanes -1
+  paddw                           m0, m4                   ; m0 = zbin - 1
+
+  pabsw                           m6, m9                   ; m6 = abs(m9)
+  pabsw                          m11, m10                  ; m11 = abs(m10)
+  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
+  punpckhqdq                      m0, m0
+  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
+
+  ; Check if all coeffs are less than zbin. If yes, we just write zeros
+  ; to the outputs and we are done.
+  por                            m14, m7, m12
+  ptest                          m14, m14
+  jnz .single_nonzero
+
+%if CONFIG_VP9_HIGHBITDEPTH
+  mova                       [r1   ], ymm5
+  mova                       [r1+32], ymm5
+  mova                       [r2   ], ymm5
+  mova                       [r2+32], ymm5
+%else
+  mova                          [r1], ymm5
+  mova                          [r2], ymm5
+%endif
+  mov                           [r0], word 0
+
+  vzeroupper
+  RET
+
+.single_nonzero:
+
+  ; Actual quantization of size 16 block - setup pointers, rounders, etc.
+  movifnidn                       r4, roundmp
+  movifnidn                       r5, quantmp
+  mov                             r3, dequantmp
+  mov                             r6, shiftmp
+  mova                            m1, [r4]              ; m1 = round
+  mova                            m2, [r5]              ; m2 = quant
+  mova                            m3, [r3]              ; m3 = dequant
+  mova                            m4, [r6]              ; m4 = shift
+
+  mov                             r3, iscanmp
+
+  DEFINE_ARGS eob, qcoeff, dqcoeff, iscan
+
+  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+  paddsw                          m6, m1                   ; m6 += round
+  punpckhqdq                      m1, m1
+  paddsw                         m11, m1                   ; m11 += round
+  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
+  punpckhqdq                      m2, m2
+  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
+  paddw                           m8, m6                   ; m8 += m6
+  paddw                          m13, m11                  ; m13 += m11
+  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
+  punpckhqdq                      m4, m4
+  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
+  psignw                          m8, m9                   ; m8 = reinsert sign
+  psignw                         m13, m10                  ; m13 = reinsert sign
+  pand                            m8, m7
+  pand                           m13, m12
+
+%if CONFIG_VP9_HIGHBITDEPTH
+  ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff
+  pcmpgtw                         m6, m5, m8
+  punpckhwd                       m6, m8, m6
+  pmovsxwd                       m11, m8
+  mova                  [qcoeffq   ], m11
+  mova                  [qcoeffq+16], m6
+  pcmpgtw                         m6, m5, m13
+  punpckhwd                       m6, m13, m6
+  pmovsxwd                       m11, m13
+  mova                  [qcoeffq+32], m11
+  mova                  [qcoeffq+48], m6
+%else
+  mova                  [qcoeffq   ], m8
+  mova                  [qcoeffq+16], m13
+%endif
+
+  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
+  punpckhqdq                      m3, m3
+  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
+
+%if CONFIG_VP9_HIGHBITDEPTH
+  ; Store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
+  pcmpgtw                         m6, m5, m8
+  punpckhwd                       m6, m8, m6
+  pmovsxwd                       m11, m8
+  mova                 [dqcoeffq   ], m11
+  mova                 [dqcoeffq+16], m6
+  pcmpgtw                         m6, m5, m13
+  punpckhwd                       m6, m13, m6
+  pmovsxwd                       m11, m13
+  mova                 [dqcoeffq+32], m11
+  mova                 [dqcoeffq+48], m6
+%else
+  mova                 [dqcoeffq   ], m8
+  mova                 [dqcoeffq+16], m13
+%endif
+
+  mova                            m6, [iscanq]            ; m6 = scan[i]
+  mova                           m11, [iscanq+16]         ; m11 = scan[i]
+
+  pcmpeqw                         m8,  m8,  m5            ; m8 = c[i] == 0
+  pcmpeqw                        m13, m13,  m5            ; m13 = c[i] == 0
+  psubw                           m6,  m6,  m7            ; m6 = scan[i] + 1
+  psubw                          m11, m11, m12            ; m11 = scan[i] + 1
+  pandn                           m8,  m8,  m6            ; m8 = max(eob)
+  pandn                          m13, m13, m11            ; m13 = max(eob)
+  pmaxsw                          m8,  m8, m13
+
+  ; Horizontally accumulate/max eobs and write into [eob] memory pointer
+  pshufd                          m7, m8, 0xe
+  pmaxsw                          m8, m7
+  pshuflw                         m7, m8, 0xe
+  pmaxsw                          m8, m7
+  pshuflw                         m7, m8, 0x1
+  pmaxsw                          m8, m7
+  movq                           rax, m8
+  mov                         [eobq], ax
+
+  vzeroupper
+  RET
+
+  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+  ;; Generic case of ncoeff != 16
+  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+.generic:
+
+%endif ; %ifnidn %1, b_32x32
+
+DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
+            qcoeff, dqcoeff, dequant, eob, scan, iscan
+
+  ; Actual quantization loop - setup pointers, rounders, etc.
+  movifnidn                   coeffq, coeffmp
+  movifnidn                  ncoeffq, ncoeffmp
+  mov                             r2, dequantmp
+  movifnidn                    zbinq, zbinmp
+  movifnidn                   roundq, roundmp
+  movifnidn                   quantq, quantmp
+  mova                            m0, [zbinq]              ; m0 = zbin
+  mova                            m1, [roundq]             ; m1 = round
+  mova                            m2, [quantq]             ; m2 = quant
+  mova                            m3, [r2]                 ; m3 = dequant
+  pcmpeqw                         m4, m4                   ; All lanes -1
+%ifidn %1, b_32x32
+  psubw                           m0, m4
+  psubw                           m1, m4
+  psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2
+  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
+%endif
+  paddw                           m0, m4                   ; m0 = m0 + 1
+
+  mov                             r2, shiftmp
+  mov                             r3, qcoeffmp
+  mova                            m4, [r2]                 ; m4 = shift
+  mov                             r4, dqcoeffmp
+  mov                             r5, iscanmp
+%ifidn %1, b_32x32
+  psllw                           m4, 1
+%endif
+  pxor                            m5, m5                   ; m5 = dedicated zero
+
+  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
+
+%if CONFIG_VP9_HIGHBITDEPTH
+  lea                         coeffq, [  coeffq+ncoeffq*4]
+  lea                        qcoeffq, [ qcoeffq+ncoeffq*4]
+  lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]
+%else
+  lea                         coeffq, [  coeffq+ncoeffq*2]
+  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
+  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
+%endif
+  lea                         iscanq, [  iscanq+ncoeffq*2]
+  neg                        ncoeffq
+
+  ; get DC and first 15 AC coeffs
+%if CONFIG_VP9_HIGHBITDEPTH
+  ; coeff stored as 32bit numbers & require 16bit numbers
+  mova                            m9, [coeffq+ncoeffq*4+ 0]
+  packssdw                        m9, [coeffq+ncoeffq*4+16]
+  mova                           m10, [coeffq+ncoeffq*4+32]
+  packssdw                       m10, [coeffq+ncoeffq*4+48]
+%else
+  mova                            m9, [coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+  mova                           m10, [coeffq+ncoeffq*2+16] ; m10 = c[i]
+%endif
+
+  pabsw                           m6, m9                   ; m6 = abs(m9)
+  pabsw                          m11, m10                  ; m11 = abs(m10)
+  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
+  punpckhqdq                      m0, m0
+  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
+
+  ; Check if all coeffs are less than zbin. If yes, skip forward quickly.
+  por                            m14, m7, m12
+  ptest                          m14, m14
+  jnz .first_nonzero
+
+%if CONFIG_VP9_HIGHBITDEPTH
+  mova        [qcoeffq+ncoeffq*4   ], ymm5
+  mova        [qcoeffq+ncoeffq*4+32], ymm5
+  mova       [dqcoeffq+ncoeffq*4   ], ymm5
+  mova       [dqcoeffq+ncoeffq*4+32], ymm5
+%else
+  mova           [qcoeffq+ncoeffq*2], ymm5
+  mova          [dqcoeffq+ncoeffq*2], ymm5
+%endif
+
+  add                        ncoeffq, mmsize
+
+  punpckhqdq                      m1, m1
+  punpckhqdq                      m2, m2
+  punpckhqdq                      m3, m3
+  punpckhqdq                      m4, m4
+  pxor                            m8, m8
+
+  jmp .ac_only_loop
+
+.first_nonzero:
+
+  paddsw                          m6, m1                   ; m6 += round
+  punpckhqdq                      m1, m1
+  paddsw                         m11, m1                   ; m11 += round
+  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
+  punpckhqdq                      m2, m2
+  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
+  paddw                           m8, m6                   ; m8 += m6
+  paddw                          m13, m11                  ; m13 += m11
+  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
+  punpckhqdq                      m4, m4
+  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
+  psignw                          m8, m9                   ; m8 = reinsert sign
+  psignw                         m13, m10                  ; m13 = reinsert sign
+  pand                            m8, m7
+  pand                           m13, m12
+
+%if CONFIG_VP9_HIGHBITDEPTH
+  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
+  pcmpgtw                         m6, m5, m8
+  punpckhwd                       m6, m8, m6
+  pmovsxwd                       m11, m8
+  mova        [qcoeffq+ncoeffq*4+ 0], m11
+  mova        [qcoeffq+ncoeffq*4+16], m6
+  pcmpgtw                         m6, m5, m13
+  punpckhwd                       m6, m13, m6
+  pmovsxwd                       m11, m13
+  mova        [qcoeffq+ncoeffq*4+32], m11
+  mova        [qcoeffq+ncoeffq*4+48], m6
+%else
+  mova        [qcoeffq+ncoeffq*2+ 0], m8
+  mova        [qcoeffq+ncoeffq*2+16], m13
+%endif
+
+%ifidn %1, b_32x32
+  pabsw                           m8, m8
+  pabsw                          m13, m13
+%endif
+  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
+  punpckhqdq                      m3, m3
+  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+  psrlw                           m8, 1
+  psrlw                          m13, 1
+  psignw                          m8, m9
+  psignw                         m13, m10
+%endif
+
+%if CONFIG_VP9_HIGHBITDEPTH
+  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
+  pcmpgtw                         m6, m5, m8
+  punpckhwd                       m6, m8, m6
+  pmovsxwd                       m11, m8
+  mova       [dqcoeffq+ncoeffq*4+ 0], m11
+  mova       [dqcoeffq+ncoeffq*4+16], m6
+  pcmpgtw                         m6, m5, m13
+  punpckhwd                       m6, m13, m6
+  pmovsxwd                       m11, m13
+  mova       [dqcoeffq+ncoeffq*4+32], m11
+  mova       [dqcoeffq+ncoeffq*4+48], m6
+%else
+  mova       [dqcoeffq+ncoeffq*2+ 0], m8
+  mova       [dqcoeffq+ncoeffq*2+16], m13
+%endif
+
+  pcmpeqw                         m8, m5                    ; m8 = c[i] == 0
+  pcmpeqw                        m13, m5                    ; m13 = c[i] == 0
+  mova                            m6, [iscanq+ncoeffq*2]    ; m6 = scan[i]
+  mova                           m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i]
+  psubw                           m6, m7                    ; m6 = scan[i] + 1
+  psubw                          m11, m12                   ; m11 = scan[i] + 1
+  pandn                           m8, m6                    ; m8 = max(eob)
+  pandn                          m13, m11                   ; m13 = max(eob)
+  pmaxsw                          m8, m13
+  add                        ncoeffq, mmsize
+
+.ac_only_loop:
+
+%if CONFIG_VP9_HIGHBITDEPTH
+  ; pack coeff from 32bit to 16bit array
+  mova                            m9, [coeffq+ncoeffq*4+ 0]
+  packssdw                        m9, [coeffq+ncoeffq*4+16]
+  mova                           m10, [coeffq+ncoeffq*4+32]
+  packssdw                       m10, [coeffq+ncoeffq*4+48]
+%else
+  mova                            m9, [coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+  mova                           m10, [coeffq+ncoeffq*2+16] ; m10 = c[i]
+%endif
+
+  pabsw                           m6, m9                   ; m6 = abs(m9)
+  pabsw                          m11, m10                  ; m11 = abs(m10)
+  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
+  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
+
+  ; Check if all coeffs are less than zbin. If yes, skip this iteration.
+  ; And just write zeros as the result would be.
+  por                            m14, m7, m12
+  ptest                          m14, m14
+  jnz .rest_nonzero
+
+%if CONFIG_VP9_HIGHBITDEPTH
+  mova        [qcoeffq+ncoeffq*4+ 0], ymm5
+  mova        [qcoeffq+ncoeffq*4+32], ymm5
+  mova       [dqcoeffq+ncoeffq*4+ 0], ymm5
+  mova       [dqcoeffq+ncoeffq*4+32], ymm5
+%else
+  mova        [qcoeffq+ncoeffq*2+ 0], ymm5
+  mova       [dqcoeffq+ncoeffq*2+ 0], ymm5
+%endif
+  add                        ncoeffq, mmsize
+  jnz .ac_only_loop
+
+  ; Horizontally accumulate/max eobs and write into [eob] memory pointer
+  mov                             r2, eobmp
+  pshufd                          m7, m8, 0xe
+  pmaxsw                          m8, m7
+  pshuflw                         m7, m8, 0xe
+  pmaxsw                          m8, m7
+  pshuflw                         m7, m8, 0x1
+  pmaxsw                          m8, m7
+  movq                           rax, m8
+  mov                           [r2], ax
+  vzeroupper
+  RET
+
+.rest_nonzero:
+  ; At least one coefficient in this group passes the zbin check: perform
+  ; the full quantization qc = sign(c) * (((|c| + round) * q >> 16 + |c|
+  ; + round) * qsh >> 16) on both 8-lane halves.
+  paddsw                          m6, m1                   ; m6 += round
+  paddsw                         m11, m1                   ; m11 += round
+  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
+  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
+  paddw                          m14, m6                   ; m14 += m6
+  paddw                          m13, m11                  ; m13 += m11
+  pmulhw                         m14, m4                   ; m14 = m14*qsh>>16
+  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
+  psignw                         m14, m9                   ; m14 = reinsert sign
+  psignw                         m13, m10                  ; m13 = reinsert sign
+  ; Mask out lanes that failed the zbin check (m7/m12 are 0 there).
+  pand                           m14, m7
+  pand                           m13, m12
+
+%if CONFIG_VP9_HIGHBITDEPTH
+  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
+  ; (sign-extend each word; punpckhwd pairs words with their sign mask)
+  pcmpgtw                         m6, m5, m14
+  punpckhwd                       m6, m14, m6
+  pmovsxwd                       m11, m14
+  mova        [qcoeffq+ncoeffq*4+ 0], m11
+  mova        [qcoeffq+ncoeffq*4+16], m6
+  pcmpgtw                         m6, m5, m13
+  punpckhwd                       m6, m13, m6
+  pmovsxwd                       m11, m13
+  mova        [qcoeffq+ncoeffq*4+32], m11
+  mova        [qcoeffq+ncoeffq*4+48], m6
+%else
+  mova        [qcoeffq+ncoeffq*2+ 0], m14
+  mova        [qcoeffq+ncoeffq*2+16], m13
+%endif
+
+%ifidn %1, b_32x32
+  ; 32x32 path: take magnitudes so the logical >>1 below halves correctly;
+  ; the sign is restored with psignw after the shift.
+  pabsw                          m14, m14
+  pabsw                          m13, m13
+%endif
+  pmullw                         m14, m3                   ; dqc[i] = qc[i] * q
+  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+  psrlw                          m14, 1
+  psrlw                          m13, 1
+  psignw                         m14, m9
+  psignw                         m13, m10
+%endif
+
+%if CONFIG_VP9_HIGHBITDEPTH
+  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
+  pcmpgtw                         m6, m5, m14
+  punpckhwd                       m6, m14, m6
+  pmovsxwd                       m11, m14
+  mova       [dqcoeffq+ncoeffq*4+ 0], m11
+  mova       [dqcoeffq+ncoeffq*4+16], m6
+  pcmpgtw                         m6, m5, m13
+  punpckhwd                       m6, m13, m6
+  pmovsxwd                       m11, m13
+  mova       [dqcoeffq+ncoeffq*4+32], m11
+  mova       [dqcoeffq+ncoeffq*4+48], m6
+%else
+  mova       [dqcoeffq+ncoeffq*2+ 0], m14
+  mova       [dqcoeffq+ncoeffq*2+16], m13
+%endif
+
+  ; Update the running eob max, exactly as after the first group above.
+  pcmpeqw                        m14, m5                    ; m14 = c[i] == 0
+  pcmpeqw                        m13, m5                    ; m13 = c[i] == 0
+  mova                            m6, [iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
+  mova                           m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i]
+  psubw                           m6, m7                    ; m6 = scan[i] + 1
+  psubw                          m11, m12                   ; m11 = scan[i] + 1
+  pandn                          m14, m6                    ; m14 = max(eob)
+  pandn                          m13, m11                   ; m13 = max(eob)
+  pmaxsw                          m8, m14
+  pmaxsw                          m8, m13
+  add                        ncoeffq, mmsize
+  jnz .ac_only_loop
+
+  ; Horizontally accumulate/max eobs and write into [eob] memory pointer.
+  ; The shuffle/pmaxsw pairs fold 8 words -> 4 -> 2 -> 1 maximum in m8[0].
+  mov                             r2, eobmp
+  pshufd                          m7, m8, 0xe
+  pmaxsw                          m8, m7
+  pshuflw                         m7, m8, 0xe
+  pmaxsw                          m8, m7
+  pshuflw                         m7, m8, 0x1
+  pmaxsw                          m8, m7
+  movq                           rax, m8
+  mov                           [r2], ax                    ; store final 16-bit eob
+  vzeroupper
+  RET
+
+  ; Skip-block, i.e. just write all zeroes
+.blank:
+
+  ; Re-declare the full argument list, then load only the pointers this
+  ; path needs (dqcoeff, ncoeff, qcoeff, eob) from the stack/regs.
+DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
+            qcoeff, dqcoeff, dequant, eob, scan, iscan
+
+  mov                             r0, dqcoeffmp
+  movifnidn                  ncoeffq, ncoeffmp
+  mov                             r2, qcoeffmp
+  mov                             r3, eobmp
+
+DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
+
+  ; Point the array pointers at their ends so the loop can count ncoeff up
+  ; from negative to zero (4 bytes/coeff in high bit depth, else 2).
+%if CONFIG_VP9_HIGHBITDEPTH
+  lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]
+  lea                        qcoeffq, [ qcoeffq+ncoeffq*4]
+%else
+  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
+  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
+%endif
+
+  neg                        ncoeffq
+  pxor                            m7, m7                    ; m7 = 0
+
+.blank_loop:
+  ; Clear qcoeff/dqcoeff with wide (ymm) zero stores.
+%if CONFIG_VP9_HIGHBITDEPTH
+  mova       [dqcoeffq+ncoeffq*4+ 0], ymm7
+  mova       [dqcoeffq+ncoeffq*4+32], ymm7
+  mova        [qcoeffq+ncoeffq*4+ 0], ymm7
+  mova        [qcoeffq+ncoeffq*4+32], ymm7
+%else
+  mova       [dqcoeffq+ncoeffq*2+ 0], ymm7
+  mova        [qcoeffq+ncoeffq*2+ 0], ymm7
+%endif
+  add                        ncoeffq, mmsize
+  jl .blank_loop
+
+  mov                         [eobq], word 0                ; eob = 0
+
+  vzeroupper
+  RET
+%endmacro                                                   ; QUANTIZE_FN
+
+; Instantiate the quantizer twice via x86inc: XMM-width code emitted with
+; AVX (VEX) encodings -- one plain variant and one 32x32 variant (which
+; takes the extra abs/>>1/sign path above).
+; NOTE(review): the second macro argument (7) is consumed by the macro
+; header outside this chunk -- presumably a cglobal register count; confirm
+; against the QUANTIZE_FN definition.
+INIT_XMM avx
+QUANTIZE_FN b, 7
+QUANTIZE_FN b_32x32, 7
+
+END