Merge "Minor refactor of decode_block for supertx." into nextgenv2

diff --git a/vp10/common/reconinter.c b/vp10/common/reconinter.c
index d7b63a3..866bda8 100644
--- a/vp10/common/reconinter.c
+++ b/vp10/common/reconinter.c

@@ -333,22 +333,15 @@
   return master;
 }
 
-static const uint8_t *get_wedge_mask(int wedge_index,
-                                     int neg,
-                                     BLOCK_SIZE bsize) {
-  return wedge_params_lookup[bsize].masks[neg][wedge_index];
-}
-
 const uint8_t *vp10_get_soft_mask(int wedge_index,
                                   int wedge_sign,
                                   BLOCK_SIZE sb_type,
                                   int offset_x,
                                   int offset_y) {
-  const int bw = 4 * num_4x4_blocks_wide_lookup[sb_type];
   const uint8_t *mask =
-      get_wedge_mask(wedge_index, wedge_sign, sb_type);
+      get_wedge_mask_inplace(wedge_index, wedge_sign, sb_type);
   if (mask)
-    mask -= (offset_x + offset_y * bw);
+    mask -= (offset_x + offset_y * MASK_MASTER_STRIDE);
   return mask;
 }
 
@@ -469,7 +462,7 @@
   vpx_blend_mask6(dst, dst_stride,
                   src0, src0_stride,
                   src1, src1_stride,
-                  mask, 4 * num_4x4_blocks_wide_lookup[sb_type],
+                  mask, MASK_MASTER_STRIDE,
                   h, w, subh, subw);
 }
 
@@ -489,7 +482,7 @@
   vpx_highbd_blend_mask6(dst_8, dst_stride,
                          src0_8, src0_stride,
                          src1_8, src1_stride,
-                         mask, 4 * num_4x4_blocks_wide_lookup[sb_type],
+                         mask, MASK_MASTER_STRIDE,
                          h, w, subh, subw, bd);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
@@ -506,8 +499,8 @@
   // pass in subsampling factors directly.
   const int subh = (2 << b_height_log2_lookup[sb_type]) == h;
   const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
-  const uint8_t *mask = vp10_get_soft_mask(wedge_index, wedge_sign,
-                                           sb_type, 0, 0);
+  const uint8_t *mask = vp10_get_contiguous_soft_mask(wedge_index, wedge_sign,
+                                                      sb_type);
   vpx_blend_mask6(dst, dst_stride,
                   src0, src0_stride,
                   src1, src1_stride,
@@ -526,8 +519,8 @@
   // pass in subsampling factors directly.
   const int subh = (2 << b_height_log2_lookup[sb_type]) == h;
   const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
-  const uint8_t *mask = vp10_get_soft_mask(wedge_index, wedge_sign,
-                                           sb_type, 0, 0);
+  const uint8_t *mask = vp10_get_contiguous_soft_mask(wedge_index, wedge_sign,
+                                                      sb_type);
   vpx_highbd_blend_mask6(dst_8, dst_stride,
                          src0_8, src0_stride,
                          src1_8, src1_stride,
@@ -713,66 +706,79 @@
 
 #if CONFIG_DUAL_FILTER
   if (mi->mbmi.sb_type < BLOCK_8X8 && plane > 0) {
-    int blk_num = 1 << (pd->subsampling_x + pd->subsampling_y);
-    int chr_idx;
-    int x_base = x;
-    int y_base = y;
-    int x_step = w >> pd->subsampling_x;
-    int y_step = h >> pd->subsampling_y;
+    // block size in log2
+    const int b4_wl = b_width_log2_lookup[mi->mbmi.sb_type];
+    const int b4_hl = b_height_log2_lookup[mi->mbmi.sb_type];
+    const int b8_sl = b_width_log2_lookup[BLOCK_8X8];
 
-    for (chr_idx = 0; chr_idx < blk_num; ++chr_idx) {
-      for (ref = 0; ref < 1 + is_compound; ++ref) {
-        const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
-        struct buf_2d *const pre_buf = &pd->pre[ref];
-        struct buf_2d *const dst_buf = &pd->dst;
-        uint8_t *dst = dst_buf->buf;
-        const MV mv = mi->bmi[chr_idx].as_mv[ref].as_mv;
-        const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh,
-                                                   pd->subsampling_x,
-                                                   pd->subsampling_y);
-        uint8_t *pre;
-        MV32 scaled_mv;
-        int xs, ys, subpel_x, subpel_y;
-        const int is_scaled = vp10_is_scaled(sf);
+    // block size
+    const int b4_w = 1 << b4_wl;
+    const int b4_h = 1 << b4_hl;
+    const int b8_s = 1 << b8_sl;
+    int idx, idy;
 
-        x = x_base + (chr_idx & 0x01) * x_step;
-        y = y_base + (chr_idx >> 1) * y_step;
+    const int x_base = x;
+    const int y_base = y;
 
-        dst += dst_buf->stride * y + x;
+    // processing unit size
+    const int x_step = w >> (b8_sl - b4_wl);
+    const int y_step = h >> (b8_sl - b4_hl);
 
-        if (is_scaled) {
-          pre = pre_buf->buf + scaled_buffer_offset(x, y, pre_buf->stride, sf);
-          scaled_mv = vp10_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
-          xs = sf->x_step_q4;
-          ys = sf->y_step_q4;
-        } else {
-          pre = pre_buf->buf + y * pre_buf->stride + x;
-          scaled_mv.row = mv_q4.row;
-          scaled_mv.col = mv_q4.col;
-          xs = ys = 16;
+    for (idy = 0; idy < b8_s; idy += b4_h) {
+      for (idx = 0; idx < b8_s; idx += b4_w) {
+        const int chr_idx = (idy * 2) + idx;
+        for (ref = 0; ref < 1 + is_compound; ++ref) {
+          const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
+          struct buf_2d *const pre_buf = &pd->pre[ref];
+          struct buf_2d *const dst_buf = &pd->dst;
+          uint8_t *dst = dst_buf->buf;
+          const MV mv = mi->bmi[chr_idx].as_mv[ref].as_mv;
+          const MV mv_q4 = clamp_mv_to_umv_border_sb(
+              xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);
+          uint8_t *pre;
+          MV32 scaled_mv;
+          int xs, ys, subpel_x, subpel_y;
+          const int is_scaled = vp10_is_scaled(sf);
+
+          x = x_base + idx * x_step;
+          y = y_base + idy * y_step;
+
+          dst += dst_buf->stride * y + x;
+
+          if (is_scaled) {
+            pre =
+                pre_buf->buf + scaled_buffer_offset(x, y, pre_buf->stride, sf);
+            scaled_mv = vp10_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
+            xs = sf->x_step_q4;
+            ys = sf->y_step_q4;
+          } else {
+            pre = pre_buf->buf + y * pre_buf->stride + x;
+            scaled_mv.row = mv_q4.row;
+            scaled_mv.col = mv_q4.col;
+            xs = ys = 16;
+          }
+
+          subpel_x = scaled_mv.col & SUBPEL_MASK;
+          subpel_y = scaled_mv.row & SUBPEL_MASK;
+          pre += (scaled_mv.row >> SUBPEL_BITS) * pre_buf->stride +
+                 (scaled_mv.col >> SUBPEL_BITS);
+
+#if CONFIG_EXT_INTER
+          if (ref && is_interinter_wedge_used(mi->mbmi.sb_type) &&
+              mi->mbmi.use_wedge_interinter)
+            vp10_make_masked_inter_predictor(
+                pre, pre_buf->stride, dst, dst_buf->stride, subpel_x, subpel_y,
+                sf, w, h, mi->mbmi.interp_filter, xs, ys,
+#if CONFIG_SUPERTX
+                wedge_offset_x, wedge_offset_y,
+#endif  // CONFIG_SUPERTX
+                xd);
+          else
+#endif  // CONFIG_EXT_INTER
+            vp10_make_inter_predictor(
+                pre, pre_buf->stride, dst, dst_buf->stride, subpel_x, subpel_y,
+                sf, x_step, y_step, ref, mi->mbmi.interp_filter, xs, ys, xd);
         }
-
-        subpel_x = scaled_mv.col & SUBPEL_MASK;
-        subpel_y = scaled_mv.row & SUBPEL_MASK;
-        pre += (scaled_mv.row >> SUBPEL_BITS) * pre_buf->stride
-               + (scaled_mv.col >> SUBPEL_BITS);
-
-    #if CONFIG_EXT_INTER
-        if (ref && is_interinter_wedge_used(mi->mbmi.sb_type) &&
-            mi->mbmi.use_wedge_interinter)
-          vp10_make_masked_inter_predictor(
-              pre, pre_buf->stride, dst, dst_buf->stride,
-              subpel_x, subpel_y, sf, w, h,
-              mi->mbmi.interp_filter, xs, ys,
-    #if CONFIG_SUPERTX
-              wedge_offset_x, wedge_offset_y,
-    #endif  // CONFIG_SUPERTX
-              xd);
-        else
-    #endif  // CONFIG_EXT_INTER
-          vp10_make_inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
-                                    subpel_x, subpel_y, sf, x_step, y_step, ref,
-                                    mi->mbmi.interp_filter, xs, ys, xd);
       }
     }
     return;
@@ -1887,8 +1893,9 @@
 
   if (use_wedge_interintra) {
     if (is_interintra_wedge_used(bsize)) {
-      const uint8_t *mask = vp10_get_soft_mask(wedge_index, wedge_sign,
-                                               bsize, 0, 0);
+      const uint8_t *mask = vp10_get_contiguous_soft_mask(wedge_index,
+                                                          wedge_sign,
+                                                          bsize);
       const int subw = 2 * num_4x4_blocks_wide_lookup[bsize] == bw;
       const int subh = 2 * num_4x4_blocks_high_lookup[bsize] == bh;
       vpx_blend_mask6(comppred, compstride,
@@ -2026,8 +2033,9 @@
 
   if (use_wedge_interintra) {
     if (is_interintra_wedge_used(bsize)) {
-      const uint8_t *mask = vp10_get_soft_mask(wedge_index, wedge_sign,
-                                               bsize, 0, 0);
+      const uint8_t *mask = vp10_get_contiguous_soft_mask(wedge_index,
+                                                          wedge_sign,
+                                                          bsize);
       const int subh = 2 * num_4x4_blocks_high_lookup[bsize] == bh;
       const int subw = 2 * num_4x4_blocks_wide_lookup[bsize] == bw;
       vpx_highbd_blend_mask6(comppred8, compstride,

diff --git a/vp10/common/reconinter.h b/vp10/common/reconinter.h
index e84e20e..537d767 100644
--- a/vp10/common/reconinter.h
+++ b/vp10/common/reconinter.h

@@ -589,6 +589,12 @@
 
 void vp10_init_wedge_masks();
 
+static INLINE const uint8_t *vp10_get_contiguous_soft_mask(int wedge_index,
+                                                           int wedge_sign,
+                                                           BLOCK_SIZE sb_type) {
+  return wedge_params_lookup[sb_type].masks[wedge_sign][wedge_index];
+}
+
 const uint8_t *vp10_get_soft_mask(int wedge_index,
                                   int wedge_sign,
                                   BLOCK_SIZE sb_type,

diff --git a/vp10/encoder/encodemb.c b/vp10/encoder/encodemb.c
index 9d0eb66..bd9dc55 100644
--- a/vp10/encoder/encodemb.c
+++ b/vp10/encoder/encodemb.c

@@ -60,8 +60,10 @@
   tran_low_t    qc;
 } vp10_token_state;
 
-// TODO(jimbankoski): experiment to find optimal RD numbers.
-static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] ={ {9, 7}, {8, 5}, };
+// These numbers are empirically obtained.
+static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = {
+    {10, 6}, {8, 5},
+};
 
 #define UPDATE_RD_COST()\
 {\
@@ -97,17 +99,17 @@
   const int eob = p->eobs[block];
   const PLANE_TYPE type = pd->plane_type;
   const int default_eob = 16 << (tx_size << 1);
-  int mul;
   const int16_t *dequant_ptr = pd->dequant;
-#if CONFIG_NEW_QUANT
-  const dequant_val_type_nuq *dequant_val = pd->dequant_val_nuq;
-#endif  // CONFIG_NEW_QUANT
   const uint8_t *const band_translate = get_band_translate(tx_size);
   TX_TYPE tx_type = get_tx_type(type, xd, block, tx_size);
   const scan_order *const so =
       get_scan(tx_size, tx_type, is_inter_block(&xd->mi[0]->mbmi));
   const int16_t *const scan = so->scan;
   const int16_t *const nb = so->neighbors;
+#if CONFIG_NEW_QUANT
+  const dequant_val_type_nuq *dequant_val = pd->dequant_val_nuq;
+#endif  // CONFIG_NEW_QUANT
+  int shift = get_tx_scale(xd, tx_type, tx_size);
   int next = eob, sz = 0;
   const int64_t rdmult = (mb->rdmult * plane_rd_mult[ref][type]) >> 1;
   const int64_t rddiv = mb->rddiv;
@@ -116,7 +118,6 @@
   int16_t t0, t1;
   EXTRABIT e0;
   int best, band, pt, i, final_eob;
-  int shift = get_tx_scale(xd, tx_type, tx_size);
 #if CONFIG_VP9_HIGHBITDEPTH
   const int *cat6_high_cost = vp10_get_high_cost_table(xd->bd);
 #else
@@ -125,7 +126,6 @@
 
   assert((!type && !plane) || (type && plane));
   assert(eob <= default_eob);
-  mul = 1 << shift;
 
   /* Now set up a Viterbi trellis to evaluate alternative roundings. */
   /* Initialize the sentinel node of the trellis. */
@@ -166,7 +166,8 @@
       /* And pick the best. */
       best = rd_cost1 < rd_cost0;
       base_bits = vp10_get_cost(t0, e0, cat6_high_cost);
-      dx = mul * (dqcoeff[rc] - coeff[rc]);
+
+      dx = (dqcoeff[rc] - coeff[rc]) * (1 << shift);
 #if CONFIG_VP9_HIGHBITDEPTH
       if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
         dx >>= xd->bd - 8;
@@ -188,14 +189,13 @@
       shortcut = (
           (dequant_abscoeff_nuq(
               abs(x), dequant_ptr[rc != 0],
-              dequant_val[band_translate[i]]) > abs(coeff[rc]) * mul) &&
+              dequant_val[band_translate[i]]) > (abs(coeff[rc]) << shift)) &&
           (dequant_abscoeff_nuq(
               abs(x) - 1, dequant_ptr[rc != 0],
-              dequant_val[band_translate[i]]) < abs(coeff[rc]) * mul));
+              dequant_val[band_translate[i]]) < (abs(coeff[rc]) << shift)));
 #else   // CONFIG_NEW_QUANT
-
-      if ((abs(x) * dequant_ptr[rc != 0] > abs(coeff[rc]) * mul) &&
-          (abs(x) * dequant_ptr[rc != 0] < abs(coeff[rc]) * mul +
+      if ((abs(x) * dequant_ptr[rc != 0] > (abs(coeff[rc]) << shift)) &&
+          (abs(x) * dequant_ptr[rc != 0] < (abs(coeff[rc]) << shift) +
                                                dequant_ptr[rc != 0]))
         shortcut = 1;
       else
@@ -205,6 +205,11 @@
       if (shortcut) {
         sz = -(x < 0);
         x -= 2 * sz + 1;
+      } else {
+        tokens[i][1] = tokens[i][0];
+        best_index[i][1] = best_index[i][0];
+        next = i;
+        continue;
       }
 
       /* Consider both possible successor states. */
@@ -242,7 +247,7 @@
 #if CONFIG_NEW_QUANT
         dx = dequant_coeff_nuq(
             x, dequant_ptr[rc != 0],
-            dequant_val[band_translate[i]]) - coeff[rc] * mul;
+            dequant_val[band_translate[i]]) - coeff[rc] * (1 << shift);
 #if CONFIG_VP9_HIGHBITDEPTH
         if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
           dx >>= xd->bd - 8;
@@ -320,7 +325,8 @@
     if (shift) dqcoeff[rc] = ROUND_POWER_OF_TWO(dqcoeff[rc], shift);
     if (x < 0) dqcoeff[rc] = -dqcoeff[rc];
 #else
-    dqcoeff[rc] = (x * dequant_ptr[rc != 0]) / mul;
+    dqcoeff[rc] = (abs(x * dequant_ptr[rc != 0]) >> shift);
+    if (x < 0) dqcoeff[rc] = -dqcoeff[rc];
 #endif  // CONFIG_NEW_QUANT
 
     next = tokens[i][best].next;
@@ -894,7 +900,6 @@
   struct encode_b_args *const args = arg;
   MACROBLOCK *const x = args->x;
   MACROBLOCKD *const xd = &x->e_mbd;
-  struct optimize_ctx *const ctx = args->ctx;
   struct macroblock_plane *const p = &x->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
@@ -906,8 +911,8 @@
   const int bwl = b_width_log2_lookup[plane_bsize];
 #endif
   dst = &pd->dst.buf[4 * blk_row * pd->dst.stride + 4 * blk_col];
-  a = &ctx->ta[plane][blk_col];
-  l = &ctx->tl[plane][blk_row];
+  a = &args->ta[blk_col];
+  l = &args->tl[blk_row];
 
   // TODO(jingning): per transformed block zero forcing only enabled for
   // luma component. will integrate chroma components as well.
@@ -989,7 +994,7 @@
   }
 #endif
 
-  if (x->optimize) {
+  if (x->optimize && p->eobs[block]) {
     int ctx;
 #if CONFIG_VAR_TX
     switch (tx_size) {
@@ -1149,7 +1154,7 @@
   MACROBLOCKD *const xd = &x->e_mbd;
   struct optimize_ctx ctx;
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  struct encode_b_args arg = {x, &ctx, &mbmi->skip};
+  struct encode_b_args arg = {x, &ctx, &mbmi->skip, NULL, NULL};
   int plane;
 
   mbmi->skip = 1;
@@ -1173,6 +1178,9 @@
 #endif
     vp10_subtract_plane(x, bsize, plane);
 
+    arg.ta = ctx.ta[plane];
+    arg.tl = ctx.tl[plane];
+
     if (x->optimize) {
 #if CONFIG_VAR_TX
       vp10_get_entropy_contexts(bsize, TX_4X4, pd,
@@ -1205,7 +1213,7 @@
   MACROBLOCKD *const xd = &x->e_mbd;
   struct optimize_ctx ctx;
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  struct encode_b_args arg = {x, &ctx, &mbmi->skip};
+  struct encode_b_args arg = {x, &ctx, &mbmi->skip, NULL, NULL};
   int plane;
 
   mbmi->skip = 1;
@@ -1218,6 +1226,8 @@
     vp10_subtract_plane(x, bsize, plane);
     vp10_get_entropy_contexts(bsize, tx_size, pd,
                               ctx.ta[plane], ctx.tl[plane]);
+    arg.ta = ctx.ta[plane];
+    arg.tl = ctx.tl[plane];
     vp10_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block,
                                            &arg);
   }
@@ -1246,8 +1256,8 @@
   const int src_stride = p->src.stride;
   const int dst_stride = pd->dst.stride;
   const int tx1d_size = get_tx1d_size(tx_size);
-
   INV_TXFM_PARAM inv_txfm_param;
+  ENTROPY_CONTEXT *a, *l;
 
   dst = &pd->dst.buf[4 * (blk_row * dst_stride + blk_col)];
   src = &p->src.buf[4 * (blk_row * src_stride + blk_col)];
@@ -1274,21 +1284,16 @@
                        tx_size);
 #else  // CONFIG_NEW_QUANT
   vp10_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
-                   VP10_XFORM_QUANT_B);
+                   VP10_XFORM_QUANT_FP);
 #endif  // CONFIG_NEW_QUANT
-
-  if (args->ctx != NULL) {
-    struct optimize_ctx *const ctx = args->ctx;
-    ENTROPY_CONTEXT *a, *l;
-    a = &ctx->ta[plane][blk_col];
-    l = &ctx->tl[plane][blk_row];
-    if (x->optimize) {
-      int ctx;
-      ctx = combine_entropy_contexts(*a, *l);
-      *a = *l = vp10_optimize_b(x, plane, block, tx_size, ctx) > 0;
-    } else {
-      *a = *l = p->eobs[block] > 0;
-    }
+  a = &args->ta[blk_col];
+  l = &args->tl[blk_row];
+  if (x->optimize && p->eobs[block]) {
+    int ctx;
+    ctx = combine_entropy_contexts(*a, *l);
+    *a = *l = vp10_optimize_b(x, plane, block, tx_size, ctx) > 0;
+  } else {
+    *a = *l = p->eobs[block] > 0;
   }
 
   if (*eob) {
@@ -1315,18 +1320,21 @@
 void vp10_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane,
                                    int enable_optimize_b) {
   const MACROBLOCKD *const xd = &x->e_mbd;
-  struct optimize_ctx ctx;
-  struct encode_b_args arg = {x, &ctx, &xd->mi[0]->mbmi.skip};
+  // Zero-initialized so the per-block callback never reads indeterminate
+  // contexts when the enable_optimize_b && x->optimize branch below is not
+  // taken and the arrays are therefore not filled from the plane.
+  ENTROPY_CONTEXT ta[2 * MAX_MIB_SIZE] = { 0 };
+  ENTROPY_CONTEXT tl[2 * MAX_MIB_SIZE] = { 0 };
+
+  struct encode_b_args arg = {x, NULL, &xd->mi[0]->mbmi.skip, ta, tl};
 
   if (enable_optimize_b && x->optimize) {
     const struct macroblockd_plane* const pd = &xd->plane[plane];
     const TX_SIZE tx_size = plane ? get_uv_tx_size(&xd->mi[0]->mbmi, pd) :
         xd->mi[0]->mbmi.tx_size;
-    vp10_get_entropy_contexts(bsize, tx_size, pd,
-                              ctx.ta[plane], ctx.tl[plane]);
-  } else {
-    arg.ctx = NULL;
+    vp10_get_entropy_contexts(bsize, tx_size, pd, ta, tl);
   }
+
   vp10_foreach_transformed_block_in_plane(xd, bsize, plane,
                                           vp10_encode_block_intra, &arg);
 }

diff --git a/vp10/encoder/encodemb.h b/vp10/encoder/encodemb.h
index cef6ccc..c241b00 100644
--- a/vp10/encoder/encodemb.h
+++ b/vp10/encoder/encodemb.h

@@ -22,6 +22,8 @@
   MACROBLOCK *x;
   struct optimize_ctx *ctx;
   int8_t *skip;
+  ENTROPY_CONTEXT *ta;
+  ENTROPY_CONTEXT *tl;
 };
 
 typedef enum VP10_XFORM_QUANT {

diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index b81c561..b000da8 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c

@@ -1213,18 +1213,17 @@
   int rate;
   int64_t dist;
   int64_t sse;
-#if !CONFIG_NEW_QUANT
   ENTROPY_CONTEXT coeff_ctx = combine_entropy_contexts(
       *(args->t_above + blk_col), *(args->t_left + blk_row));
-#endif
 
   if (args->exit_early)
     return;
 
   if (!is_inter_block(mbmi)) {
-    struct encode_b_args arg = {x, NULL, &mbmi->skip};
+    struct encode_b_args intra_arg = {x, NULL, &mbmi->skip, args->t_above,
+                                      args->t_left};
     vp10_encode_block_intra(plane, block, blk_row, blk_col,
-                            plane_bsize, tx_size, &arg);
+                            plane_bsize, tx_size, &intra_arg);
 
     if (args->cpi->sf.use_transform_domain_distortion) {
       dist_block(args->cpi, x, plane, block, blk_row, blk_col,
@@ -1269,9 +1268,10 @@
 #else
       vp10_xform_quant(x, plane, block, blk_row, blk_col,
                        plane_bsize, tx_size, VP10_XFORM_QUANT_FP);
-      vp10_optimize_b(x, plane, block, tx_size, coeff_ctx);
 #endif  // CONFIG_NEW_QUANT
-     dist_block(args->cpi, x, plane, block, blk_row, blk_col,
+      if (x->plane[plane].eobs[block])
+        vp10_optimize_b(x, plane, block, tx_size, coeff_ctx);
+      dist_block(args->cpi, x, plane, block, blk_row, blk_col,
                  tx_size, &dist, &sse);
     } else if (x->skip_txfm[plane][block >> (tx_size << 1)] ==
                SKIP_TXFM_AC_ONLY) {
@@ -1324,8 +1324,9 @@
 #else
     vp10_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
                      VP10_XFORM_QUANT_FP);
-    vp10_optimize_b(x, plane, block, tx_size, coeff_ctx);
 #endif  // CONFIG_NEW_QUANT
+    if (x->plane[plane].eobs[block])
+      vp10_optimize_b(x, plane, block, tx_size, coeff_ctx);
     dist_block(args->cpi, x, plane, block, blk_row, blk_col,
                tx_size, &dist, &sse);
   }
@@ -6512,7 +6513,7 @@
   BLOCK_SIZE sb_type = mbmi->sb_type;
   const uint8_t *mask;
   const int mask_stride = 4 * num_4x4_blocks_wide_lookup[bsize];
-  mask = vp10_get_soft_mask(wedge_index, wedge_sign, sb_type, 0, 0);
+  mask = vp10_get_contiguous_soft_mask(wedge_index, wedge_sign, sb_type);
 
   if (which == 0 || which == 2)
     do_masked_motion_search(cpi, x, mask, mask_stride, bsize,
@@ -6521,7 +6522,7 @@
 
   if (which == 1 || which == 2) {
     // get the negative mask
-    mask = vp10_get_soft_mask(wedge_index, !wedge_sign, sb_type, 0, 0);
+    mask = vp10_get_contiguous_soft_mask(wedge_index, !wedge_sign, sb_type);
     do_masked_motion_search(cpi, x, mask, mask_stride, bsize,
                             mi_row, mi_col, &tmp_mv[1], &rate_mv[1],
                             1, mv_idx[1]);
@@ -6577,6 +6578,13 @@
   uint32_t esq[2][4], var;
   int64_t tl, br;
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    pred0 = CONVERT_TO_BYTEPTR(pred0);
+    pred1 = CONVERT_TO_BYTEPTR(pred1);
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
   var = cpi->fn_ptr[f_index].vf(
       src, src_stride,
       pred0, stride0, &esq[0][0]);
@@ -7577,8 +7585,8 @@
         // Refine motion vector.
         if (have_newmv_in_inter_mode(this_mode) && best_wedge_index > -1) {
           // get negative of mask
-          const uint8_t* mask = vp10_get_soft_mask(
-              best_wedge_index, 1, bsize, 0, 0);
+          const uint8_t* mask = vp10_get_contiguous_soft_mask(
+              best_wedge_index, 1, bsize);
           mbmi->interintra_wedge_index = best_wedge_index;
           mbmi->interintra_wedge_sign = 0;
           do_masked_motion_search(cpi, x, mask, bw, bsize,
@@ -8307,6 +8315,189 @@
       color_map[r * cols + c] = indices[r * cols + c];
 }
 
+#if CONFIG_EXT_INTRA
+static void pick_ext_intra_iframe(VP10_COMP *cpi, MACROBLOCK *x,
+                                  PICK_MODE_CONTEXT *ctx, BLOCK_SIZE bsize,
+                                  int *rate_uv_intra, int *rate_uv_tokenonly,
+                                  int64_t *dist_uv, int *skip_uv,
+                                  PREDICTION_MODE *mode_uv,
+                                  EXT_INTRA_MODE_INFO *ext_intra_mode_info_uv,
+                                  PALETTE_MODE_INFO *pmi_uv,
+                                  int8_t *uv_angle_delta,
+                                  int palette_ctx, int skip_mask,
+                                  unsigned int *ref_costs_single,
+                                  int64_t *best_rd, int64_t *best_intra_rd,
+                                  PREDICTION_MODE *best_intra_mode,
+                                  int *best_mode_index, int *best_skip2,
+                                  int *best_mode_skippable,
+#if CONFIG_SUPERTX
+                                  int *returnrate_nocoef,
+#endif  // CONFIG_SUPERTX
+                                  int64_t *best_pred_rd,
+                                  MB_MODE_INFO *best_mbmode, RD_COST *rd_cost) {
+  VP10_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
+  int rate2 = 0, rate_y = INT_MAX, skippable = 0, rate_uv, rate_dummy, i;
+  int dc_mode_index;
+  const int * const intra_mode_cost =
+      cpi->mbmode_cost[size_group_lookup[bsize]];
+  int64_t distortion2 = 0, distortion_y = 0, this_rd = *best_rd, distortion_uv;
+  TX_SIZE uv_tx;
+
+  for (i = 0; i < MAX_MODES; ++i)
+    if (vp10_mode_order[i].mode == DC_PRED &&
+        vp10_mode_order[i].ref_frame[0] == INTRA_FRAME)
+      break;
+  dc_mode_index = i;
+  assert(i < MAX_MODES);
+
+  // TODO(huisu): use skip_mask for further speedup.
+  (void)skip_mask;
+  mbmi->mode = DC_PRED;
+  mbmi->uv_mode = DC_PRED;
+  mbmi->ref_frame[0] = INTRA_FRAME;
+  mbmi->ref_frame[1] = NONE;
+  memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm));
+  if (!rd_pick_ext_intra_sby(cpi, x, &rate_dummy, &rate_y, &distortion_y,
+                             &skippable, bsize,
+                             intra_mode_cost[mbmi->mode], &this_rd, 0))
+    return;
+  if (rate_y == INT_MAX)
+    return;
+
+  uv_tx = get_uv_tx_size_impl(mbmi->tx_size, bsize,
+                              xd->plane[1].subsampling_x,
+                              xd->plane[1].subsampling_y);
+  if (rate_uv_intra[uv_tx] == INT_MAX) {
+    choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx,
+                         &rate_uv_intra[uv_tx], &rate_uv_tokenonly[uv_tx],
+                         &dist_uv[uv_tx], &skip_uv[uv_tx], &mode_uv[uv_tx]);
+    if (cm->allow_screen_content_tools)
+      pmi_uv[uv_tx] = *pmi;
+    ext_intra_mode_info_uv[uv_tx] = mbmi->ext_intra_mode_info;
+    uv_angle_delta[uv_tx] = mbmi->angle_delta[1];
+  }
+
+  rate_uv = rate_uv_tokenonly[uv_tx];
+  distortion_uv = dist_uv[uv_tx];
+  skippable = skippable && skip_uv[uv_tx];
+  mbmi->uv_mode = mode_uv[uv_tx];
+  if (cm->allow_screen_content_tools) {
+    pmi->palette_size[1] = pmi_uv[uv_tx].palette_size[1];
+    memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
+           pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE,
+           2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0]));
+  }
+  mbmi->angle_delta[1] = uv_angle_delta[uv_tx];
+  mbmi->ext_intra_mode_info.use_ext_intra_mode[1] =
+      ext_intra_mode_info_uv[uv_tx].use_ext_intra_mode[1];
+  if (ext_intra_mode_info_uv[uv_tx].use_ext_intra_mode[1]) {
+    mbmi->ext_intra_mode_info.ext_intra_mode[1] =
+        ext_intra_mode_info_uv[uv_tx].ext_intra_mode[1];
+  }
+
+  rate2 = rate_y + intra_mode_cost[mbmi->mode] + rate_uv +
+      cpi->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode];
+  if (cpi->common.allow_screen_content_tools && mbmi->mode == DC_PRED)
+    rate2 +=
+        vp10_cost_bit(vp10_default_palette_y_mode_prob[bsize - BLOCK_8X8]
+                                                       [palette_ctx], 0);
+
+  if (!xd->lossless[mbmi->segment_id]) {
+    // rate_y from the luma search above is assumed to include the cost of
+    // the tx_size (as with super_block_yrd); for intra blocks, tx_size is
+    // always coded (prediction granularity), so we account for it in the
+    // full rate, not the tokenonly rate.
+    rate_y -=
+        cpi->tx_size_cost[max_tx_size - TX_8X8][get_tx_size_context(xd)]
+                                                [mbmi->tx_size];
+  }
+
+  rate2 += vp10_cost_bit(cm->fc->ext_intra_probs[0],
+                         mbmi->ext_intra_mode_info.use_ext_intra_mode[0]);
+  rate2 += write_uniform_cost(FILTER_INTRA_MODES,
+                              mbmi->ext_intra_mode_info.ext_intra_mode[0]);
+  if (mbmi->uv_mode != DC_PRED && mbmi->uv_mode != TM_PRED) {
+    rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTAS + 1,
+                                MAX_ANGLE_DELTAS +
+                                mbmi->angle_delta[1]);
+  }
+  if (ALLOW_FILTER_INTRA_MODES && mbmi->mode == DC_PRED) {
+    rate2 += vp10_cost_bit(cpi->common.fc->ext_intra_probs[1],
+                           mbmi->ext_intra_mode_info.use_ext_intra_mode[1]);
+    if (mbmi->ext_intra_mode_info.use_ext_intra_mode[1])
+      rate2 +=
+          write_uniform_cost(FILTER_INTRA_MODES,
+                             mbmi->ext_intra_mode_info.ext_intra_mode[1]);
+  }
+  distortion2 = distortion_y + distortion_uv;
+  vp10_encode_intra_block_plane(x, bsize, 0, 0);
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    x->recon_variance =
+        vp10_high_get_sby_perpixel_variance(cpi, &xd->plane[0].dst,
+                                            bsize, xd->bd);
+  } else {
+    x->recon_variance =
+        vp10_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize);
+  }
+#else
+  x->recon_variance =
+      vp10_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  rate2 += ref_costs_single[INTRA_FRAME];
+
+  if (skippable) {
+    rate2 -= (rate_y + rate_uv);
+    rate_y = 0;
+    rate_uv = 0;
+    rate2 += vp10_cost_bit(vp10_get_skip_prob(cm, xd), 1);
+  } else {
+    rate2 += vp10_cost_bit(vp10_get_skip_prob(cm, xd), 0);
+  }
+  this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+  rd_variance_adjustment(x, &this_rd, INTRA_FRAME,
+#if CONFIG_OBMC
+                         is_inter_block(mbmi),
+#endif  // CONFIG_OBMC
+                         x->source_variance);
+
+  if (this_rd < *best_intra_rd) {
+    *best_intra_rd = this_rd;
+    *best_intra_mode = mbmi->mode;
+  }
+  for (i = 0; i < REFERENCE_MODES; ++i)
+    best_pred_rd[i] = VPXMIN(best_pred_rd[i], this_rd);
+
+  if (this_rd < *best_rd) {
+    *best_mode_index = dc_mode_index;
+    mbmi->mv[0].as_int = 0;
+    rd_cost->rate = rate2;
+#if CONFIG_SUPERTX
+    if (x->skip)
+      *returnrate_nocoef = rate2;
+    else
+      *returnrate_nocoef = rate2 - rate_y - rate_uv;
+    *returnrate_nocoef -= vp10_cost_bit(vp10_get_skip_prob(cm, xd), skippable);
+    *returnrate_nocoef -= vp10_cost_bit(vp10_get_intra_inter_prob(cm, xd),
+                                        mbmi->ref_frame[0] != INTRA_FRAME);
+#endif  // CONFIG_SUPERTX
+    rd_cost->dist = distortion2;
+    rd_cost->rdcost = this_rd;
+    *best_rd = this_rd;
+    *best_mbmode = *mbmi;
+    *best_skip2 = 0;
+    *best_mode_skippable = skippable;
+    memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
+           sizeof(ctx->zcoeff_blk[0]) * ctx->num_4x4_blk);
+  }
+}
+#endif  // CONFIG_EXT_INTRA
+
 void vp10_rd_pick_inter_mode_sb(VP10_COMP *cpi,
                                 TileDataEnc *tile_data,
                                 MACROBLOCK *x,
@@ -8373,7 +8564,7 @@
   PALETTE_MODE_INFO pmi_uv[TX_SIZES];
 #if CONFIG_EXT_INTRA
   EXT_INTRA_MODE_INFO ext_intra_mode_info_uv[TX_SIZES];
-  int8_t uv_angle_delta[TX_SIZES];
+  int8_t uv_angle_delta[TX_SIZES], dc_skipped = 1;
   int is_directional_mode, angle_stats_ready = 0;
   int rate_overhead, rate_dummy;
   uint8_t directional_mode_skip_mask[INTRA_MODES];
@@ -8936,28 +9127,6 @@
         mbmi->angle_delta[0] = 0;
         super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable,
                         NULL, bsize, best_rd);
-        if (rate_y == INT_MAX)
-          continue;
-      }
-
-      // TODO(huisu): ext-intra is turned off in lossless mode for now to
-      // avoid a unit test failure
-      if (mbmi->mode == DC_PRED && !xd->lossless[mbmi->segment_id] &&
-          ALLOW_FILTER_INTRA_MODES) {
-        MB_MODE_INFO mbmi_copy = *mbmi;
-
-        if (rate_y != INT_MAX) {
-          int this_rate = rate_y + intra_mode_cost[mbmi->mode] +
-              vp10_cost_bit(cm->fc->ext_intra_probs[0], 0);
-          this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, distortion_y);
-        } else {
-          this_rd = best_rd;
-        }
-
-        if (!rd_pick_ext_intra_sby(cpi, x, &rate_dummy, &rate_y, &distortion_y,
-                                   &skippable, bsize,
-                                   intra_mode_cost[mbmi->mode], &this_rd, 0))
-          *mbmi = mbmi_copy;
       }
 #else
       super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable,
@@ -8966,6 +9135,12 @@
 
       if (rate_y == INT_MAX)
         continue;
+
+#if CONFIG_EXT_INTRA
+      if (mbmi->mode == DC_PRED)
+        dc_skipped = 0;
+#endif  // CONFIG_EXT_INTRA
+
       uv_tx = get_uv_tx_size_impl(mbmi->tx_size, bsize, pd->subsampling_x,
                                   pd->subsampling_y);
       if (rate_uv_intra[uv_tx] == INT_MAX) {
@@ -9033,16 +9208,31 @@
         rate2 += vp10_cost_bit(cm->fc->ext_intra_probs[0],
                                mbmi->ext_intra_mode_info.use_ext_intra_mode[0]);
         if (mbmi->ext_intra_mode_info.use_ext_intra_mode[0]) {
-          EXT_INTRA_MODE ext_intra_mode =
-              mbmi->ext_intra_mode_info.ext_intra_mode[0];
-          rate2 += write_uniform_cost(FILTER_INTRA_MODES, ext_intra_mode);
+          rate2 +=
+              write_uniform_cost(FILTER_INTRA_MODES,
+                                 mbmi->ext_intra_mode_info.ext_intra_mode[0]);
         }
       }
+
+      if (mbmi->uv_mode != DC_PRED && mbmi->uv_mode != TM_PRED) {
+        rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTAS + 1,
+                                    MAX_ANGLE_DELTAS +
+                                    mbmi->angle_delta[1]);
+      }
+
+      if (ALLOW_FILTER_INTRA_MODES && mbmi->mode == DC_PRED) {
+        rate2 += vp10_cost_bit(cpi->common.fc->ext_intra_probs[1],
+                               mbmi->ext_intra_mode_info.use_ext_intra_mode[1]);
+        if (mbmi->ext_intra_mode_info.use_ext_intra_mode[1])
+          rate2 +=
+              write_uniform_cost(FILTER_INTRA_MODES,
+                                 mbmi->ext_intra_mode_info.ext_intra_mode[1]);
+      }
 #endif  // CONFIG_EXT_INTRA
       if (this_mode != DC_PRED && this_mode != TM_PRED)
         rate2 += intra_cost_penalty;
       distortion2 = distortion_y + distortion_uv;
-      vp10_encode_intra_block_plane(x, bsize, 0, 0);
+      vp10_encode_intra_block_plane(x, bsize, 0, 1);
 #if CONFIG_VP9_HIGHBITDEPTH
       if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
         x->recon_variance =
@@ -9505,9 +9695,11 @@
       break;
   }
 
-  if (sf->tx_type_search.fast_inter_tx_type_search == 1 &&
-      xd->lossless[mbmi->segment_id] == 0 &&
-      best_mode_index >= 0) {
+  if (xd->lossless[mbmi->segment_id] == 0 && best_mode_index >= 0 &&
+      ((sf->tx_type_search.fast_inter_tx_type_search == 1 &&
+        is_inter_mode(best_mbmode.mode)) ||
+       (sf->tx_type_search.fast_intra_tx_type_search == 1 &&
+        !is_inter_mode(best_mbmode.mode)))) {
     int rate_y = 0, rate_uv = 0;
     int64_t dist_y = 0, dist_uv = 0;
     int skip_y = 0, skip_uv = 0, skip_blk = 0;
@@ -9706,6 +9898,26 @@
   }
   PALETTE_EXIT:
 
+#if CONFIG_EXT_INTRA
+  // TODO(huisu): The ext-intra search is skipped in lossless mode for now
+  // to avoid a unit test failure.
+  if (!xd->lossless[mbmi->segment_id] &&
+      mbmi->palette_mode_info.palette_size[0] == 0 && !dc_skipped &&
+      best_mode_index >= 0 && (best_intra_rd >> 1)  < best_rd) {
+    pick_ext_intra_iframe(cpi, x, ctx, bsize, rate_uv_intra,
+                          rate_uv_tokenonly, dist_uv, skip_uv,
+                          mode_uv, ext_intra_mode_info_uv,
+                          pmi_uv, uv_angle_delta, palette_ctx, 0,
+                          ref_costs_single, &best_rd, &best_intra_rd,
+                          &best_intra_mode, &best_mode_index,
+                          &best_skip2, &best_mode_skippable,
+#if CONFIG_SUPERTX
+                          returnrate_nocoef,
+#endif  // CONFIG_SUPERTX
+                          best_pred_rd, &best_mbmode, rd_cost);
+  }
+#endif  // CONFIG_EXT_INTRA
+
   // The inter modes' rate costs are not calculated precisely in some cases.
   // Therefore, sometimes, NEWMV is chosen instead of NEARESTMV, NEARMV, and
   // ZEROMV. Here, checks are added for those cases, and the mode decisions

diff --git a/vp10/encoder/speed_features.c b/vp10/encoder/speed_features.c
index bd0cb81..53b8bd7 100644
--- a/vp10/encoder/speed_features.c
+++ b/vp10/encoder/speed_features.c

@@ -132,6 +132,9 @@
 
   sf->adaptive_rd_thresh = 1;
 
+  sf->tx_type_search.fast_intra_tx_type_search = 1;
+  sf->tx_type_search.fast_inter_tx_type_search = 1;
+
   if (speed >= 1) {
     if ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ||
         vp10_internal_image_edge(cpi)) {