Merge "Adds compound wedge prediction modes" into nextgenv2
diff --git a/configure b/configure
index 5eec2a1..e7eb152 100755
--- a/configure
+++ b/configure
@@ -282,6 +282,7 @@
     ans
     loop_restoration
     ext_partition
+    ext_tile
     obmc
     affine_motion
 "
diff --git a/test/test.mk b/test/test.mk
index 5983f42..1f120ce 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -168,6 +168,7 @@
 LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_dct_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_fht4x4_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_fht8x8_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_fht16x16_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ANS)          += vp10_ans_test.cc
 
 LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += sum_squares_test.cc
diff --git a/test/vp10_fht16x16_test.cc b/test/vp10_fht16x16_test.cc
new file mode 100644
index 0000000..d501e10
--- /dev/null
+++ b/test/vp10_fht16x16_test.cc
@@ -0,0 +1,124 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vp10_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/transform_test_base.h"
+#include "test/util.h"
+#include "vpx_ports/mem.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
+                        int tx_type);
+
+using libvpx_test::FhtFunc;
+typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t, int>
+Ht16x16Param;
+
+void fht16x16_ref(const int16_t *in, tran_low_t *out, int stride,
+                int tx_type) {
+  vp10_fht16x16_c(in, out, stride, tx_type);  // installed as fwd_txfm_ref
+}
+
+class VP10Trans16x16HT  // fixture parameterized by Ht16x16Param
+    : public libvpx_test::TransformTestBase,
+      public ::testing::TestWithParam<Ht16x16Param> {
+ public:
+  virtual ~VP10Trans16x16HT() {}
+
+  virtual void SetUp() {
+    fwd_txfm_ = GET_PARAM(0);  // forward transform under test (FhtFunc)
+    inv_txfm_ = GET_PARAM(1);  // inverse transform (IhtFunc)
+    tx_type_  = GET_PARAM(2);  // passed to both transforms as tx_type
+    pitch_    = 16;  // NOTE(review): presumably the block's row stride
+    fwd_txfm_ref = fht16x16_ref;  // wrapper around vp10_fht16x16_c
+    bit_depth_ = GET_PARAM(3);
+    mask_ = (1 << bit_depth_) - 1;  // low bit_depth_ bits set
+    num_coeffs_ = GET_PARAM(4);  // coefficient count per block
+  }
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
+    fwd_txfm_(in, out, stride, tx_type_);  // supplies the fixture's tx_type_
+  }
+
+  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
+    inv_txfm_(out, dst, stride, tx_type_);  // supplies the fixture's tx_type_
+  }
+
+  FhtFunc fwd_txfm_;  // set in SetUp() from test parameter 0
+  IhtFunc inv_txfm_;  // set in SetUp() from test parameter 1
+};
+
+TEST_P(VP10Trans16x16HT, CoeffCheck) {
+  RunCoeffCheck();  // not defined here; presumably from TransformTestBase
+}
+
+using std::tr1::make_tuple;
+
+#if HAVE_SSE2
+// The parameters are collected in an array so that the CONFIG_EXT_TX
+// conditional stays outside of any macro invocation: a preprocessing
+// directive inside the argument list of a function-like macro is undefined
+// behavior. Tuple fields: fwd fn, inv fn, tx_type, bit depth, coeff count.
+const Ht16x16Param kArrayHt16x16Param_sse2[] = {
+  // tx_type 0..3 are exercised in every configuration.
+  make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 0,
+             VPX_BITS_8, 256),
+  make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 1,
+             VPX_BITS_8, 256),
+  make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 2,
+             VPX_BITS_8, 256),
+  make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 3,
+             VPX_BITS_8, 256),
+#if CONFIG_EXT_TX
+  // tx_type 4..15 are exercised only when CONFIG_EXT_TX is enabled.
+  make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 4,
+             VPX_BITS_8, 256),
+  make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 5,
+             VPX_BITS_8, 256),
+  make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 6,
+             VPX_BITS_8, 256),
+  make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 7,
+             VPX_BITS_8, 256),
+  make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 8,
+             VPX_BITS_8, 256),
+  make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 9,
+             VPX_BITS_8, 256),
+  make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 10,
+             VPX_BITS_8, 256),
+  make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 11,
+             VPX_BITS_8, 256),
+  make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 12,
+             VPX_BITS_8, 256),
+  make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 13,
+             VPX_BITS_8, 256),
+  make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 14,
+             VPX_BITS_8, 256),
+  make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 15,
+             VPX_BITS_8, 256),
+#endif  // CONFIG_EXT_TX
+};
+
+INSTANTIATE_TEST_CASE_P(
+    SSE2, VP10Trans16x16HT,
+    ::testing::ValuesIn(kArrayHt16x16Param_sse2));
+#endif  // HAVE_SSE2
+
+}  // namespace
diff --git a/vp10/common/ans.h b/vp10/common/ans.h
index 163a7a9..6bd3012 100644
--- a/vp10/common/ans.h
+++ b/vp10/common/ans.h
@@ -277,8 +277,8 @@
     ++i;
   }
   out->val = i - 1;
-  out->prob = cdf[i] - cdf[i - 1];
-  out->cum_prob = cdf[i - 1];
+  out->prob = (AnsP8)(cdf[i] - cdf[i - 1]);
+  out->cum_prob = (AnsP8)cdf[i - 1];
 }
 
 static INLINE int rans_read(struct AnsDecoder *ans,
diff --git a/vp10/common/entropymode.c b/vp10/common/entropymode.c
index d5c8f44..b939424 100644
--- a/vp10/common/entropymode.c
+++ b/vp10/common/entropymode.c
@@ -329,17 +329,6 @@
 #endif  // CONFIG_EXT_REFS
 };
 
-static const struct tx_probs default_tx_probs = {
-  { { 3, 136, 37 },
-    { 5, 52,  13 } },
-
-  { { 20, 152 },
-    { 15, 101 } },
-
-  { { 100 },
-    { 66  } }
-};
-
 const vpx_tree_index vp10_palette_size_tree[TREE_SIZE(PALETTE_SIZES)] = {
     -TWO_COLORS, 2,
     -THREE_COLORS, 4,
@@ -703,6 +692,34 @@
     9680, 10648, 10890, 13310
 };
 
+const vpx_tree_index vp10_tx_size_tree[TX_SIZES - 1][TREE_SIZE(TX_SIZES)] = {
+    {  // Max tx_size is 8X8; row i pairs with tx_size_probs[i]
+        -TX_4X4, -TX_8X8,
+    },
+    {  // Max tx_size is 16X16
+        -TX_4X4, 2,  // NOTE(review): 2 looks like the index of the next pair
+        -TX_8X8, -TX_16X16,
+    },
+    {  // Max tx_size is 32X32
+        -TX_4X4, 2,
+        -TX_8X8, 4,
+        -TX_16X16, -TX_32X32,
+    },
+};
+
+static const vpx_prob
+default_tx_size_prob[TX_SIZES - 1][TX_SIZE_CONTEXTS][TX_SIZES - 1] = {
+    {  // Max tx_size is 8X8; one inner { } per tx-size context
+        { 100, }, { 66, },
+    },
+    {  // Max tx_size is 16X16; one inner { } per tx-size context
+        { 20, 152, }, { 15, 101, },
+    },
+    {  // Max tx_size is 32X32; one inner { } per tx-size context
+        { 3, 136, 37 }, { 5, 52,  13 },
+    },
+};
+
 int vp10_get_palette_color_context(const uint8_t *color_map, int cols,
                                    int r, int c, int n, int *color_order) {
   int i, j, max, max_idx, temp;
@@ -776,33 +793,6 @@
   return color_ctx;
 }
 
-void vp10_tx_counts_to_branch_counts_32x32(const unsigned int *tx_count_32x32p,
-                                      unsigned int (*ct_32x32p)[2]) {
-  ct_32x32p[0][0] = tx_count_32x32p[TX_4X4];
-  ct_32x32p[0][1] = tx_count_32x32p[TX_8X8] +
-                    tx_count_32x32p[TX_16X16] +
-                    tx_count_32x32p[TX_32X32];
-  ct_32x32p[1][0] = tx_count_32x32p[TX_8X8];
-  ct_32x32p[1][1] = tx_count_32x32p[TX_16X16] +
-                    tx_count_32x32p[TX_32X32];
-  ct_32x32p[2][0] = tx_count_32x32p[TX_16X16];
-  ct_32x32p[2][1] = tx_count_32x32p[TX_32X32];
-}
-
-void vp10_tx_counts_to_branch_counts_16x16(const unsigned int *tx_count_16x16p,
-                                      unsigned int (*ct_16x16p)[2]) {
-  ct_16x16p[0][0] = tx_count_16x16p[TX_4X4];
-  ct_16x16p[0][1] = tx_count_16x16p[TX_8X8] + tx_count_16x16p[TX_16X16];
-  ct_16x16p[1][0] = tx_count_16x16p[TX_8X8];
-  ct_16x16p[1][1] = tx_count_16x16p[TX_16X16];
-}
-
-void vp10_tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p,
-                                    unsigned int (*ct_8x8p)[2]) {
-  ct_8x8p[0][0] = tx_count_8x8p[TX_4X4];
-  ct_8x8p[0][1] = tx_count_8x8p[TX_8X8];
-}
-
 #if CONFIG_VAR_TX
 static const vpx_prob default_txfm_partition_probs[TXFM_PARTITION_CONTEXTS] = {
     192, 128, 64, 192, 128, 64, 192, 128, 64,
@@ -1324,7 +1314,7 @@
   vp10_copy(fc->comp_inter_prob, default_comp_inter_p);
   vp10_copy(fc->comp_ref_prob, default_comp_ref_p);
   vp10_copy(fc->single_ref_prob, default_single_ref_p);
-  fc->tx_probs = default_tx_probs;
+  vp10_copy(fc->tx_size_probs, default_tx_size_prob);
 #if CONFIG_VAR_TX
   vp10_copy(fc->txfm_partition_prob, default_txfm_partition_probs);
 #endif
@@ -1487,32 +1477,18 @@
 }
 
 void vp10_adapt_intra_frame_probs(VP10_COMMON *cm) {
-  int i;
+  int i, j;
   FRAME_CONTEXT *fc = cm->fc;
   const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
   const FRAME_COUNTS *counts = &cm->counts;
 
   if (cm->tx_mode == TX_MODE_SELECT) {
-    int j;
-    unsigned int branch_ct_8x8p[TX_SIZES - 3][2];
-    unsigned int branch_ct_16x16p[TX_SIZES - 2][2];
-    unsigned int branch_ct_32x32p[TX_SIZES - 1][2];
-
-    for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
-      vp10_tx_counts_to_branch_counts_8x8(counts->tx.p8x8[i], branch_ct_8x8p);
-      for (j = 0; j < TX_SIZES - 3; ++j)
-        fc->tx_probs.p8x8[i][j] = mode_mv_merge_probs(
-            pre_fc->tx_probs.p8x8[i][j], branch_ct_8x8p[j]);
-
-      vp10_tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i], branch_ct_16x16p);
-      for (j = 0; j < TX_SIZES - 2; ++j)
-        fc->tx_probs.p16x16[i][j] = mode_mv_merge_probs(
-            pre_fc->tx_probs.p16x16[i][j], branch_ct_16x16p[j]);
-
-      vp10_tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i], branch_ct_32x32p);
-      for (j = 0; j < TX_SIZES - 1; ++j)
-        fc->tx_probs.p32x32[i][j] = mode_mv_merge_probs(
-            pre_fc->tx_probs.p32x32[i][j], branch_ct_32x32p[j]);
+    for (i = 0; i < TX_SIZES - 1; ++i) {
+      for (j = 0; j < TX_SIZE_CONTEXTS; ++j)
+        vpx_tree_merge_probs(vp10_tx_size_tree[i],
+                             pre_fc->tx_size_probs[i][j],
+                             counts->tx_size[i][j],
+                             fc->tx_size_probs[i][j]);
     }
   }
 
@@ -1552,7 +1528,6 @@
   }
 #else
   for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
-    int j;
     for (j = 0; j < TX_TYPES; ++j)
       vpx_tree_merge_probs(vp10_ext_tx_tree,
                            pre_fc->intra_ext_tx_prob[i][j],
diff --git a/vp10/common/entropymode.h b/vp10/common/entropymode.h
index b208dcf..ba36ddb 100644
--- a/vp10/common/entropymode.h
+++ b/vp10/common/entropymode.h
@@ -37,19 +37,6 @@
 
 struct VP10Common;
 
-struct tx_probs {
-  vpx_prob p32x32[TX_SIZE_CONTEXTS][TX_SIZES - 1];
-  vpx_prob p16x16[TX_SIZE_CONTEXTS][TX_SIZES - 2];
-  vpx_prob p8x8[TX_SIZE_CONTEXTS][TX_SIZES - 3];
-};
-
-struct tx_counts {
-  unsigned int p32x32[TX_SIZE_CONTEXTS][TX_SIZES];
-  unsigned int p16x16[TX_SIZE_CONTEXTS][TX_SIZES - 1];
-  unsigned int p8x8[TX_SIZE_CONTEXTS][TX_SIZES - 2];
-  unsigned int tx_totals[TX_SIZES];
-};
-
 struct seg_counts {
   unsigned int tree_total[MAX_SEGMENTS];
   unsigned int tree_mispred[MAX_SEGMENTS];
@@ -91,7 +78,7 @@
   vpx_prob comp_inter_prob[COMP_INTER_CONTEXTS];
   vpx_prob single_ref_prob[REF_CONTEXTS][SINGLE_REFS-1];
   vpx_prob comp_ref_prob[REF_CONTEXTS][COMP_REFS-1];
-  struct tx_probs tx_probs;
+  vpx_prob tx_size_probs[TX_SIZES - 1][TX_SIZE_CONTEXTS][TX_SIZES - 1];
 #if CONFIG_VAR_TX
   vpx_prob txfm_partition_prob[TXFM_PARTITION_CONTEXTS];
 #endif
@@ -155,7 +142,8 @@
   unsigned int comp_inter[COMP_INTER_CONTEXTS][2];
   unsigned int single_ref[REF_CONTEXTS][SINGLE_REFS-1][2];
   unsigned int comp_ref[REF_CONTEXTS][COMP_REFS-1][2];
-  struct tx_counts tx;
+  unsigned int tx_size_totals[TX_SIZES];
+  unsigned int tx_size[TX_SIZES - 1][TX_SIZE_CONTEXTS][TX_SIZES];
 #if CONFIG_VAR_TX
   unsigned int txfm_partition[TXFM_PARTITION_CONTEXTS][2];
 #endif
@@ -209,6 +197,8 @@
 extern const vpx_tree_index vp10_palette_size_tree[TREE_SIZE(PALETTE_SIZES)];
 extern const vpx_tree_index
 vp10_palette_color_tree[PALETTE_MAX_SIZE - 1][TREE_SIZE(PALETTE_COLORS)];
+extern const vpx_tree_index
+vp10_tx_size_tree[TX_SIZES - 1][TREE_SIZE(TX_SIZES)];
 #if CONFIG_EXT_INTRA
 extern const vpx_tree_index vp10_intra_filter_tree[TREE_SIZE(INTRA_FILTERS)];
 #endif  // CONFIG_EXT_INTRA
@@ -227,13 +217,6 @@
 void vp10_adapt_intra_frame_probs(struct VP10Common *cm);
 void vp10_adapt_inter_frame_probs(struct VP10Common *cm);
 
-void vp10_tx_counts_to_branch_counts_32x32(const unsigned int *tx_count_32x32p,
-                                      unsigned int (*ct_32x32p)[2]);
-void vp10_tx_counts_to_branch_counts_16x16(const unsigned int *tx_count_16x16p,
-                                      unsigned int (*ct_16x16p)[2]);
-void vp10_tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p,
-                                    unsigned int (*ct_8x8p)[2]);
-
 static INLINE int vp10_ceil_log2(int n) {
   int i = 1, p = 2;
   while (p < n) {
diff --git a/vp10/common/pred_common.h b/vp10/common/pred_common.h
index 7d2f28a..83a3597 100644
--- a/vp10/common/pred_common.h
+++ b/vp10/common/pred_common.h
@@ -185,48 +185,11 @@
   return (above_ctx + left_ctx) > max_tx_size;
 }
 
-static INLINE const vpx_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx,
-                                           const struct tx_probs *tx_probs) {
-  switch (max_tx_size) {
-    case TX_8X8:
-      return tx_probs->p8x8[ctx];
-    case TX_16X16:
-      return tx_probs->p16x16[ctx];
-    case TX_32X32:
-      return tx_probs->p32x32[ctx];
-    default:
-      assert(0 && "Invalid max_tx_size.");
-      return NULL;
-  }
-}
-
-static INLINE const vpx_prob *get_tx_probs2(TX_SIZE max_tx_size,
-                                            const MACROBLOCKD *xd,
-                                            const struct tx_probs *tx_probs) {
-  return get_tx_probs(max_tx_size, get_tx_size_context(xd), tx_probs);
-}
-
-static INLINE unsigned int *get_tx_counts(TX_SIZE max_tx_size, int ctx,
-                                          struct tx_counts *tx_counts) {
-  switch (max_tx_size) {
-    case TX_8X8:
-      return tx_counts->p8x8[ctx];
-    case TX_16X16:
-      return tx_counts->p16x16[ctx];
-    case TX_32X32:
-      return tx_counts->p32x32[ctx];
-    default:
-      assert(0 && "Invalid max_tx_size.");
-      return NULL;
-  }
-}
-
 #if CONFIG_VAR_TX
 static void update_tx_counts(VP10_COMMON *cm, MACROBLOCKD *xd,
                              MB_MODE_INFO *mbmi, BLOCK_SIZE plane_bsize,
                              TX_SIZE tx_size, int blk_row, int blk_col,
-                             TX_SIZE max_tx_size, int ctx,
-                             struct tx_counts *tx_counts) {
+                             TX_SIZE max_tx_size, int ctx) {
   const struct macroblockd_plane *const pd = &xd->plane[0];
   const BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
   int tx_idx = (blk_row >> (1 - pd->subsampling_y)) * 8 +
@@ -244,7 +207,7 @@
     return;
 
   if (tx_size == plane_tx_size) {
-    ++get_tx_counts(max_tx_size, ctx, tx_counts)[tx_size];
+    ++xd->counts->tx_size[max_tx_size - TX_8X8][ctx][tx_size];
     mbmi->tx_size = tx_size;
   } else {
     int bsl = b_width_log2_lookup[bsize];
@@ -260,8 +223,7 @@
       if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide)
         continue;
       update_tx_counts(cm, xd, mbmi, plane_bsize,
-                       tx_size - 1, offsetr, offsetc,
-                       max_tx_size, ctx, tx_counts);
+                       tx_size - 1, offsetr, offsetc, max_tx_size, ctx);
     }
   }
 }
@@ -270,8 +232,7 @@
                                                MACROBLOCKD *xd,
                                                MB_MODE_INFO *mbmi,
                                                BLOCK_SIZE plane_bsize,
-                                               int ctx,
-                                               struct tx_counts *tx_counts) {
+                                               int ctx) {
   const int mi_width = num_4x4_blocks_wide_lookup[plane_bsize];
   const int mi_height = num_4x4_blocks_high_lookup[plane_bsize];
   TX_SIZE max_tx_size = max_txsize_lookup[plane_bsize];
@@ -282,7 +243,7 @@
   for (idy = 0; idy < mi_height; idy += bh)
     for (idx = 0; idx < mi_width; idx += bh)
       update_tx_counts(cm, xd, mbmi, plane_bsize, max_tx_size, idy, idx,
-                       max_tx_size, ctx, tx_counts);
+                       max_tx_size, ctx);
 }
 #endif
 
diff --git a/vp10/common/scan.c b/vp10/common/scan.c
index 21d291f..e26b40d 100644
--- a/vp10/common/scan.c
+++ b/vp10/common/scan.c
@@ -19,6 +19,20 @@
   7, 14, 11, 15,
 };
 
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x4[16]) = {
+  0, 4, 8, 12,
+  1, 5, 9, 13,
+  2, 6, 10, 14,
+  3, 7, 11, 15,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_4x4[16]) = {
+  0, 1, 2, 3,
+  4, 5, 6, 7,
+  8, 9, 10, 11,
+  12, 13, 14, 15,
+};
+
 DECLARE_ALIGNED(16, static const int16_t, col_scan_4x4[16]) = {
   0,  4,  8,  1,
   12,  5,  9,  2,
@@ -34,7 +48,7 @@
 };
 
 DECLARE_ALIGNED(16, static const int16_t, default_scan_8x8[64]) = {
-  0,  8,  1, 16,  9,  2, 17, 24,
+  0,   8,  1, 16,  9,  2, 17, 24,
   10,  3, 18, 25, 32, 11,  4, 26,
   33, 19, 40, 12, 34, 27,  5, 41,
   20, 48, 13, 35, 42, 28, 21,  6,
@@ -44,6 +58,28 @@
   46, 39, 61, 54, 47, 62, 55, 63,
 };
 
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x8[64]) = {
+  0,   8,  16,  24,  32,  40,  48,  56,
+  1,   9,  17,  25,  33,  41,  49,  57,
+  2,  10,  18,  26,  34,  42,  50,  58,
+  3,  11,  19,  27,  35,  43,  51,  59,
+  4,  12,  20,  28,  36,  44,  52,  60,
+  5,  13,  21,  29,  37,  45,  53,  61,
+  6,  14,  22,  30,  38,  46,  54,  62,
+  7,  15,  23,  31,  39,  47,  55,  63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x8[64]) = {
+  0,    1,   2,   3,   4,   5,   6,   7,
+  8,    9,  10,  11,  12,  13,  14,  15,
+  16,  17,  18,  19,  20,  21,  22,  23,
+  24,  25,  26,  27,  28,  29,  30,  31,
+  32,  33,  34,  35,  36,  37,  38,  39,
+  40,  41,  42,  43,  44,  45,  46,  47,
+  48,  49,  50,  51,  52,  53,  54,  55,
+  56,  57,  58,  59,  60,  61,  62,  63,
+};
+
 DECLARE_ALIGNED(16, static const int16_t, col_scan_8x8[64]) = {
   0,  8, 16,  1, 24,  9, 32, 17,
   2, 40, 25, 10, 33, 18, 48,  3,
@@ -87,6 +123,53 @@
   255,
 };
 
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x16[256]) = {
+  0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240,
+  1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241,
+  2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242,
+  3, 19, 35, 51, 67, 83, 99, 115, 131, 147, 163, 179, 195, 211, 227, 243,
+  4, 20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244,
+  5, 21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245,
+  6, 22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246,
+  7, 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247,
+  8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248,
+  9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249,
+  10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250,
+  11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251,
+  12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252,
+  13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253,
+  14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254,
+  15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_16x16[256]) = {
+  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+  64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+  80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
+  96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
+  112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124,
+  125, 126, 127,
+  128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140,
+  141, 142, 143,
+  144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
+  157, 158, 159,
+  160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172,
+  173, 174, 175,
+  176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188,
+  189, 190, 191,
+  192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204,
+  205, 206, 207,
+  208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220,
+  221, 222, 223,
+  224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236,
+  237, 238, 239,
+  240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252,
+  253, 254, 255,
+};
+
 DECLARE_ALIGNED(16, static const int16_t, col_scan_16x16[256]) = {
   0, 16, 32, 48, 1, 64, 17, 80, 33, 96, 49, 2, 65, 112, 18, 81,
   34, 128, 50, 97, 3, 66, 144, 19, 113, 35, 82, 160, 98, 51, 129, 4,
@@ -130,6 +213,201 @@
   255,
 };
 
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_32x32[1024]) = {
+  0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480,
+  512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864, 896, 928, 960,
+  992,
+  1, 33, 65, 97, 129, 161, 193, 225, 257, 289, 321, 353, 385, 417, 449, 481,
+  513, 545, 577, 609, 641, 673, 705, 737, 769, 801, 833, 865, 897, 929, 961,
+  993,
+  2, 34, 66, 98, 130, 162, 194, 226, 258, 290, 322, 354, 386, 418, 450, 482,
+  514, 546, 578, 610, 642, 674, 706, 738, 770, 802, 834, 866, 898, 930, 962,
+  994,
+  3, 35, 67, 99, 131, 163, 195, 227, 259, 291, 323, 355, 387, 419, 451, 483,
+  515, 547, 579, 611, 643, 675, 707, 739, 771, 803, 835, 867, 899, 931, 963,
+  995,
+  4, 36, 68, 100, 132, 164, 196, 228, 260, 292, 324, 356, 388, 420, 452, 484,
+  516, 548, 580, 612, 644, 676, 708, 740, 772, 804, 836, 868, 900, 932, 964,
+  996,
+  5, 37, 69, 101, 133, 165, 197, 229, 261, 293, 325, 357, 389, 421, 453, 485,
+  517, 549, 581, 613, 645, 677, 709, 741, 773, 805, 837, 869, 901, 933, 965,
+  997,
+  6, 38, 70, 102, 134, 166, 198, 230, 262, 294, 326, 358, 390, 422, 454, 486,
+  518, 550, 582, 614, 646, 678, 710, 742, 774, 806, 838, 870, 902, 934, 966,
+  998,
+  7, 39, 71, 103, 135, 167, 199, 231, 263, 295, 327, 359, 391, 423, 455, 487,
+  519, 551, 583, 615, 647, 679, 711, 743, 775, 807, 839, 871, 903, 935, 967,
+  999,
+  8, 40, 72, 104, 136, 168, 200, 232, 264, 296, 328, 360, 392, 424, 456, 488,
+  520, 552, 584, 616, 648, 680, 712, 744, 776, 808, 840, 872, 904, 936, 968,
+  1000,
+  9, 41, 73, 105, 137, 169, 201, 233, 265, 297, 329, 361, 393, 425, 457, 489,
+  521, 553, 585, 617, 649, 681, 713, 745, 777, 809, 841, 873, 905, 937, 969,
+  1001,
+  10, 42, 74, 106, 138, 170, 202, 234, 266, 298, 330, 362, 394, 426, 458,
+  490, 522, 554, 586, 618, 650, 682, 714, 746, 778, 810, 842, 874, 906,
+  938, 970, 1002,
+  11, 43, 75, 107, 139, 171, 203, 235, 267, 299, 331, 363, 395, 427, 459,
+  491, 523, 555, 587, 619, 651, 683, 715, 747, 779, 811, 843, 875, 907,
+  939, 971, 1003,
+  12, 44, 76, 108, 140, 172, 204, 236, 268, 300, 332, 364, 396, 428, 460,
+  492, 524, 556, 588, 620, 652, 684, 716, 748, 780, 812, 844, 876, 908,
+  940, 972, 1004,
+  13, 45, 77, 109, 141, 173, 205, 237, 269, 301, 333, 365, 397, 429, 461,
+  493, 525, 557, 589, 621, 653, 685, 717, 749, 781, 813, 845, 877, 909,
+  941, 973, 1005,
+  14, 46, 78, 110, 142, 174, 206, 238, 270, 302, 334, 366, 398, 430, 462,
+  494, 526, 558, 590, 622, 654, 686, 718, 750, 782, 814, 846, 878, 910,
+  942, 974, 1006,
+  15, 47, 79, 111, 143, 175, 207, 239, 271, 303, 335, 367, 399, 431, 463,
+  495, 527, 559, 591, 623, 655, 687, 719, 751, 783, 815, 847, 879, 911,
+  943, 975, 1007,
+  16, 48, 80, 112, 144, 176, 208, 240, 272, 304, 336, 368, 400, 432, 464,
+  496, 528, 560, 592, 624, 656, 688, 720, 752, 784, 816, 848, 880, 912,
+  944, 976, 1008,
+  17, 49, 81, 113, 145, 177, 209, 241, 273, 305, 337, 369, 401, 433, 465,
+  497, 529, 561, 593, 625, 657, 689, 721, 753, 785, 817, 849, 881, 913,
+  945, 977, 1009,
+  18, 50, 82, 114, 146, 178, 210, 242, 274, 306, 338, 370, 402, 434, 466,
+  498, 530, 562, 594, 626, 658, 690, 722, 754, 786, 818, 850, 882, 914,
+  946, 978, 1010,
+  19, 51, 83, 115, 147, 179, 211, 243, 275, 307, 339, 371, 403, 435, 467,
+  499, 531, 563, 595, 627, 659, 691, 723, 755, 787, 819, 851, 883, 915,
+  947, 979, 1011,
+  20, 52, 84, 116, 148, 180, 212, 244, 276, 308, 340, 372, 404, 436, 468,
+  500, 532, 564, 596, 628, 660, 692, 724, 756, 788, 820, 852, 884, 916,
+  948, 980, 1012,
+  21, 53, 85, 117, 149, 181, 213, 245, 277, 309, 341, 373, 405, 437, 469,
+  501, 533, 565, 597, 629, 661, 693, 725, 757, 789, 821, 853, 885, 917,
+  949, 981, 1013,
+  22, 54, 86, 118, 150, 182, 214, 246, 278, 310, 342, 374, 406, 438, 470,
+  502, 534, 566, 598, 630, 662, 694, 726, 758, 790, 822, 854, 886, 918,
+  950, 982, 1014,
+  23, 55, 87, 119, 151, 183, 215, 247, 279, 311, 343, 375, 407, 439, 471,
+  503, 535, 567, 599, 631, 663, 695, 727, 759, 791, 823, 855, 887, 919,
+  951, 983, 1015,
+  24, 56, 88, 120, 152, 184, 216, 248, 280, 312, 344, 376, 408, 440, 472,
+  504, 536, 568, 600, 632, 664, 696, 728, 760, 792, 824, 856, 888, 920,
+  952, 984, 1016,
+  25, 57, 89, 121, 153, 185, 217, 249, 281, 313, 345, 377, 409, 441, 473,
+  505, 537, 569, 601, 633, 665, 697, 729, 761, 793, 825, 857, 889, 921,
+  953, 985, 1017,
+  26, 58, 90, 122, 154, 186, 218, 250, 282, 314, 346, 378, 410, 442, 474,
+  506, 538, 570, 602, 634, 666, 698, 730, 762, 794, 826, 858, 890, 922,
+  954, 986, 1018,
+  27, 59, 91, 123, 155, 187, 219, 251, 283, 315, 347, 379, 411, 443, 475,
+  507, 539, 571, 603, 635, 667, 699, 731, 763, 795, 827, 859, 891, 923,
+  955, 987, 1019,
+  28, 60, 92, 124, 156, 188, 220, 252, 284, 316, 348, 380, 412, 444, 476,
+  508, 540, 572, 604, 636, 668, 700, 732, 764, 796, 828, 860, 892, 924,
+  956, 988, 1020,
+  29, 61, 93, 125, 157, 189, 221, 253, 285, 317, 349, 381, 413, 445, 477,
+  509, 541, 573, 605, 637, 669, 701, 733, 765, 797, 829, 861, 893, 925,
+  957, 989, 1021,
+  30, 62, 94, 126, 158, 190, 222, 254, 286, 318, 350, 382, 414, 446, 478,
+  510, 542, 574, 606, 638, 670, 702, 734, 766, 798, 830, 862, 894, 926,
+  958, 990, 1022,
+  31, 63, 95, 127, 159, 191, 223, 255, 287, 319, 351, 383, 415, 447, 479,
+  511, 543, 575, 607, 639, 671, 703, 735, 767, 799, 831, 863, 895, 927,
+  959, 991, 1023,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_32x32[1024]) = {
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
+    19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+    64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+    80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
+    96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
+    110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122,
+    123, 124, 125, 126, 127,
+    128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140,
+    141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153,
+    154, 155, 156, 157, 158, 159,
+    160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172,
+    173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185,
+    186, 187, 188, 189, 190, 191,
+    192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204,
+    205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217,
+    218, 219, 220, 221, 222, 223,
+    224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236,
+    237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249,
+    250, 251, 252, 253, 254, 255,
+    256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268,
+    269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281,
+    282, 283, 284, 285, 286, 287,
+    288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300,
+    301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313,
+    314, 315, 316, 317, 318, 319,
+    320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332,
+    333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345,
+    346, 347, 348, 349, 350, 351,
+    352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364,
+    365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377,
+    378, 379, 380, 381, 382, 383,
+    384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396,
+    397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409,
+    410, 411, 412, 413, 414, 415,
+    416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428,
+    429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441,
+    442, 443, 444, 445, 446, 447,
+    448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460,
+    461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473,
+    474, 475, 476, 477, 478, 479,
+    480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492,
+    493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505,
+    506, 507, 508, 509, 510, 511,
+    512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524,
+    525, 526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537,
+    538, 539, 540, 541, 542, 543,
+    544, 545, 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556,
+    557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569,
+    570, 571, 572, 573, 574, 575,
+    576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588,
+    589, 590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 600, 601,
+    602, 603, 604, 605, 606, 607,
+    608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620,
+    621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, 632, 633,
+    634, 635, 636, 637, 638, 639,
+    640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652,
+    653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665,
+    666, 667, 668, 669, 670, 671,
+    672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684,
+    685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697,
+    698, 699, 700, 701, 702, 703,
+    704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716,
+    717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729,
+    730, 731, 732, 733, 734, 735,
+    736, 737, 738, 739, 740, 741, 742, 743, 744, 745, 746, 747, 748,
+    749, 750, 751, 752, 753, 754, 755, 756, 757, 758, 759, 760, 761,
+    762, 763, 764, 765, 766, 767,
+    768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780,
+    781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793,
+    794, 795, 796, 797, 798, 799,
+    800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812,
+    813, 814, 815, 816, 817, 818, 819, 820, 821, 822, 823, 824, 825,
+    826, 827, 828, 829, 830, 831,
+    832, 833, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844,
+    845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857,
+    858, 859, 860, 861, 862, 863,
+    864, 865, 866, 867, 868, 869, 870, 871, 872, 873, 874, 875, 876,
+    877, 878, 879, 880, 881, 882, 883, 884, 885, 886, 887, 888, 889,
+    890, 891, 892, 893, 894, 895,
+    896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908,
+    909, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921,
+    922, 923, 924, 925, 926, 927,
+    928, 929, 930, 931, 932, 933, 934, 935, 936, 937, 938, 939, 940,
+    941, 942, 943, 944, 945, 946, 947, 948, 949, 950, 951, 952, 953,
+    954, 955, 956, 957, 958, 959,
+    960, 961, 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972,
+    973, 974, 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985,
+    986, 987, 988, 989, 990, 991,
+    992, 993, 994, 995, 996, 997, 998, 999, 1000, 1001, 1002, 1003,
+    1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014,
+    1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
+};
+
 DECLARE_ALIGNED(16, static const int16_t, default_scan_32x32[1024]) = {
   0, 32, 1, 64, 33, 2, 96, 65, 34, 128, 3, 97, 66, 160,
   129, 35, 98, 4, 67, 130, 161, 192, 36, 99, 224, 5, 162, 193,
@@ -512,6 +790,18 @@
 };
 
 DECLARE_ALIGNED(16, static const int16_t,
+                mcol_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 4, 4, 8, 8, 0, 0, 1, 4, 5, 8, 9, 12, 1, 1, 2, 5, 6, 9, 10, 13,
+  2, 2, 3, 6, 7, 10, 11, 14, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                mrow_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 1, 1, 2, 2, 0, 0, 1, 4, 2, 5, 3, 6, 4, 4, 5, 8, 6, 9, 7, 10, 8,
+  8, 9, 12, 10, 13, 11, 14, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
                 col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
   0, 0, 0, 0, 4, 4, 0, 0, 8, 8, 1, 1, 5, 5, 1, 1, 9, 9, 2, 2, 6, 6, 2, 2, 3,
   3, 10, 10, 7, 7, 11, 11, 0, 0,
@@ -535,6 +825,28 @@
 };
 
 DECLARE_ALIGNED(16, static const int16_t,
+                mcol_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 8, 8, 16, 16, 24, 24, 32, 32, 40, 40, 48, 48, 0, 0, 1, 8, 9,
+  16, 17, 24, 25, 32, 33, 40, 41, 48, 49, 56, 1, 1, 2, 9, 10, 17, 18, 25,
+  26, 33, 34, 41, 42, 49, 50, 57, 2, 2, 3, 10, 11, 18, 19, 26, 27, 34, 35,
+  42, 43, 50, 51, 58, 3, 3, 4, 11, 12, 19, 20, 27, 28, 35, 36, 43, 44, 51,
+  52, 59, 4, 4, 5, 12, 13, 20, 21, 28, 29, 36, 37, 44, 45, 52, 53, 60, 5,
+  5, 6, 13, 14, 21, 22, 29, 30, 37, 38, 45, 46, 53, 54, 61, 6, 6, 7, 14,
+  15, 22, 23, 30, 31, 38, 39, 46, 47, 54, 55, 62, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                mrow_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 0, 0, 1, 8, 2, 9, 3, 10,
+  4, 11, 5, 12, 6, 13, 7, 14, 8, 8, 9, 16, 10, 17, 11, 18, 12, 19, 13, 20,
+  14, 21, 15, 22, 16, 16, 17, 24, 18, 25, 19, 26, 20, 27, 21, 28, 22, 29,
+  23, 30, 24, 24, 25, 32, 26, 33, 27, 34, 28, 35, 29, 36, 30, 37, 31, 38,
+  32, 32, 33, 40, 34, 41, 35, 42, 36, 43, 37, 44, 38, 45, 39, 46, 40, 40,
+  41, 48, 42, 49, 43, 50, 44, 51, 45, 52, 46, 53, 47, 54, 48, 48, 49, 56,
+  50, 57, 51, 58, 52, 59, 53, 60, 54, 61, 55, 62, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
                 row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
   0, 0, 0, 0, 1, 1, 0, 0, 8, 8, 2, 2, 8, 8, 9, 9, 3, 3, 16, 16, 10, 10, 16, 16,
   4, 4, 17, 17, 24, 24, 11, 11, 18, 18, 25, 25, 24, 24, 5, 5, 12, 12, 19, 19,
@@ -557,6 +869,114 @@
 };
 
 DECLARE_ALIGNED(16, static const int16_t,
+                mcol_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 16, 16, 32, 32, 48, 48, 64, 64, 80, 80, 96, 96,
+  112, 112, 128, 128, 144, 144, 160, 160, 176, 176, 192, 192, 208,
+  208, 224, 224,
+  0, 0, 1, 16, 17, 32, 33, 48, 49, 64, 65, 80, 81, 96, 97, 112,
+  113, 128, 129, 144, 145, 160, 161, 176, 177, 192, 193, 208, 209,
+  224, 225, 240,
+  1, 1, 2, 17, 18, 33, 34, 49, 50, 65, 66, 81, 82, 97, 98, 113,
+  114, 129, 130, 145, 146, 161, 162, 177, 178, 193, 194, 209, 210,
+  225, 226, 241,
+  2, 2, 3, 18, 19, 34, 35, 50, 51, 66, 67, 82, 83, 98, 99, 114,
+  115, 130, 131, 146, 147, 162, 163, 178, 179, 194, 195, 210, 211,
+  226, 227, 242,
+  3, 3, 4, 19, 20, 35, 36, 51, 52, 67, 68, 83, 84, 99, 100, 115,
+  116, 131, 132, 147, 148, 163, 164, 179, 180, 195, 196, 211, 212,
+  227, 228, 243,
+  4, 4, 5, 20, 21, 36, 37, 52, 53, 68, 69, 84, 85, 100, 101, 116,
+  117, 132, 133, 148, 149, 164, 165, 180, 181, 196, 197, 212, 213,
+  228, 229, 244,
+  5, 5, 6, 21, 22, 37, 38, 53, 54, 69, 70, 85, 86, 101, 102, 117,
+  118, 133, 134, 149, 150, 165, 166, 181, 182, 197, 198, 213, 214,
+  229, 230, 245,
+  6, 6, 7, 22, 23, 38, 39, 54, 55, 70, 71, 86, 87, 102, 103, 118,
+  119, 134, 135, 150, 151, 166, 167, 182, 183, 198, 199, 214, 215,
+  230, 231, 246,
+  7, 7, 8, 23, 24, 39, 40, 55, 56, 71, 72, 87, 88, 103, 104, 119,
+  120, 135, 136, 151, 152, 167, 168, 183, 184, 199, 200, 215, 216,
+  231, 232, 247,
+  8, 8, 9, 24, 25, 40, 41, 56, 57, 72, 73, 88, 89, 104, 105, 120,
+  121, 136, 137, 152, 153, 168, 169, 184, 185, 200, 201, 216, 217,
+  232, 233, 248,
+  9, 9, 10, 25, 26, 41, 42, 57, 58, 73, 74, 89, 90, 105, 106, 121,
+  122, 137, 138, 153, 154, 169, 170, 185, 186, 201, 202, 217, 218,
+  233, 234, 249,
+  10, 10, 11, 26, 27, 42, 43, 58, 59, 74, 75, 90, 91, 106, 107, 122,
+  123, 138, 139, 154, 155, 170, 171, 186, 187, 202, 203, 218, 219,
+  234, 235, 250,
+  11, 11, 12, 27, 28, 43, 44, 59, 60, 75, 76, 91, 92, 107, 108, 123,
+  124, 139, 140, 155, 156, 171, 172, 187, 188, 203, 204, 219, 220,
+  235, 236, 251,
+  12, 12, 13, 28, 29, 44, 45, 60, 61, 76, 77, 92, 93, 108, 109, 124,
+  125, 140, 141, 156, 157, 172, 173, 188, 189, 204, 205, 220, 221,
+  236, 237, 252,
+  13, 13, 14, 29, 30, 45, 46, 61, 62, 77, 78, 93, 94, 109, 110, 125,
+  126, 141, 142, 157, 158, 173, 174, 189, 190, 205, 206, 221, 222,
+  237, 238, 253,
+  14, 14, 15, 30, 31, 46, 47, 62, 63, 78, 79, 94, 95, 110, 111, 126,
+  127, 142, 143, 158, 159, 174, 175, 190, 191, 206, 207, 222, 223,
+  238, 239, 254,
+  0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                mrow_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,
+  7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14,
+  0, 0, 1, 16, 2, 17, 3, 18, 4, 19, 5, 20, 6, 21, 7, 22,
+  8, 23, 9, 24, 10, 25, 11, 26, 12, 27, 13, 28, 14, 29, 15, 30,
+  16, 16, 17, 32, 18, 33, 19, 34, 20, 35, 21, 36, 22, 37, 23, 38,
+  24, 39, 25, 40, 26, 41, 27, 42, 28, 43, 29, 44, 30, 45, 31, 46,
+  32, 32, 33, 48, 34, 49, 35, 50, 36, 51, 37, 52, 38, 53, 39, 54,
+  40, 55, 41, 56, 42, 57, 43, 58, 44, 59, 45, 60, 46, 61, 47, 62,
+  48, 48, 49, 64, 50, 65, 51, 66, 52, 67, 53, 68, 54, 69, 55, 70,
+  56, 71, 57, 72, 58, 73, 59, 74, 60, 75, 61, 76, 62, 77, 63, 78,
+  64, 64, 65, 80, 66, 81, 67, 82, 68, 83, 69, 84, 70, 85, 71, 86,
+  72, 87, 73, 88, 74, 89, 75, 90, 76, 91, 77, 92, 78, 93, 79, 94,
+  80, 80, 81, 96, 82, 97, 83, 98, 84, 99, 85, 100, 86, 101, 87, 102,
+  88, 103, 89, 104, 90, 105, 91, 106, 92, 107, 93, 108, 94, 109, 95, 110,
+  96, 96, 97, 112, 98, 113, 99, 114, 100, 115, 101, 116, 102, 117,
+  103, 118,
+  104, 119, 105, 120, 106, 121, 107, 122, 108, 123, 109, 124, 110,
+  125, 111, 126,
+  112, 112, 113, 128, 114, 129, 115, 130, 116, 131, 117, 132, 118,
+  133, 119, 134,
+  120, 135, 121, 136, 122, 137, 123, 138, 124, 139, 125, 140, 126,
+  141, 127, 142,
+  128, 128, 129, 144, 130, 145, 131, 146, 132, 147, 133, 148, 134,
+  149, 135, 150,
+  136, 151, 137, 152, 138, 153, 139, 154, 140, 155, 141, 156, 142,
+  157, 143, 158,
+  144, 144, 145, 160, 146, 161, 147, 162, 148, 163, 149, 164, 150,
+  165, 151, 166,
+  152, 167, 153, 168, 154, 169, 155, 170, 156, 171, 157, 172, 158,
+  173, 159, 174,
+  160, 160, 161, 176, 162, 177, 163, 178, 164, 179, 165, 180, 166,
+  181, 167, 182,
+  168, 183, 169, 184, 170, 185, 171, 186, 172, 187, 173, 188, 174,
+  189, 175, 190,
+  176, 176, 177, 192, 178, 193, 179, 194, 180, 195, 181, 196, 182,
+  197, 183, 198,
+  184, 199, 185, 200, 186, 201, 187, 202, 188, 203, 189, 204, 190,
+  205, 191, 206,
+  192, 192, 193, 208, 194, 209, 195, 210, 196, 211, 197, 212, 198,
+  213, 199, 214,
+  200, 215, 201, 216, 202, 217, 203, 218, 204, 219, 205, 220, 206,
+  221, 207, 222,
+  208, 208, 209, 224, 210, 225, 211, 226, 212, 227, 213, 228, 214,
+  229, 215, 230,
+  216, 231, 217, 232, 218, 233, 219, 234, 220, 235, 221, 236, 222,
+  237, 223, 238,
+  224, 224, 225, 240, 226, 241, 227, 242, 228, 243, 229, 244, 230,
+  245, 231, 246,
+  232, 247, 233, 248, 234, 249, 235, 250, 236, 251, 237, 252, 238,
+  253, 239, 254,
+  0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
                 col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
   0, 0, 0, 0, 16, 16, 32, 32, 0, 0, 48, 48, 1, 1, 64, 64,
   17, 17, 80, 80, 33, 33, 1, 1, 49, 49, 96, 96, 2, 2, 65, 65,
@@ -668,6 +1088,394 @@
 };
 
 DECLARE_ALIGNED(16, static const int16_t,
+                mcol_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 32, 32, 64, 64, 96, 96, 128, 128, 160, 160, 192, 192,
+  224, 224, 256, 256, 288, 288, 320, 320, 352, 352, 384, 384, 416,
+  416, 448, 448,
+  480, 480, 512, 512, 544, 544, 576, 576, 608, 608, 640, 640, 672,
+  672, 704, 704, 736, 736, 768, 768, 800, 800, 832, 832, 864, 864,
+  896, 896, 928, 928, 960, 960,
+  0, 0, 1, 32, 33, 64, 65, 96, 97, 128, 129, 160, 161, 192, 193,
+  224, 225, 256, 257, 288, 289, 320, 321, 352, 353, 384, 385, 416,
+  417, 448, 449, 480,
+  481, 512, 513, 544, 545, 576, 577, 608, 609, 640, 641, 672, 673,
+  704, 705, 736, 737, 768, 769, 800, 801, 832, 833, 864, 865, 896,
+  897, 928, 929, 960, 961, 992,
+  1, 1, 2, 33, 34, 65, 66, 97, 98, 129, 130, 161, 162, 193, 194,
+  225, 226, 257, 258, 289, 290, 321, 322, 353, 354, 385, 386, 417,
+  418, 449, 450, 481,
+  482, 513, 514, 545, 546, 577, 578, 609, 610, 641, 642, 673, 674,
+  705, 706, 737, 738, 769, 770, 801, 802, 833, 834, 865, 866, 897,
+  898, 929, 930, 961, 962, 993,
+  2, 2, 3, 34, 35, 66, 67, 98, 99, 130, 131, 162, 163, 194, 195,
+  226, 227, 258, 259, 290, 291, 322, 323, 354, 355, 386, 387, 418,
+  419, 450, 451, 482,
+  483, 514, 515, 546, 547, 578, 579, 610, 611, 642, 643, 674, 675,
+  706, 707, 738, 739, 770, 771, 802, 803, 834, 835, 866, 867, 898,
+  899, 930, 931, 962, 963, 994,
+  3, 3, 4, 35, 36, 67, 68, 99, 100, 131, 132, 163, 164, 195, 196,
+  227, 228, 259, 260, 291, 292, 323, 324, 355, 356, 387, 388, 419,
+  420, 451, 452, 483,
+  484, 515, 516, 547, 548, 579, 580, 611, 612, 643, 644, 675, 676,
+  707, 708, 739, 740, 771, 772, 803, 804, 835, 836, 867, 868, 899,
+  900, 931, 932, 963, 964, 995,
+  4, 4, 5, 36, 37, 68, 69, 100, 101, 132, 133, 164, 165, 196, 197,
+  228, 229, 260, 261, 292, 293, 324, 325, 356, 357, 388, 389, 420,
+  421, 452, 453, 484,
+  485, 516, 517, 548, 549, 580, 581, 612, 613, 644, 645, 676, 677,
+  708, 709, 740, 741, 772, 773, 804, 805, 836, 837, 868, 869, 900,
+  901, 932, 933, 964, 965, 996,
+  5, 5, 6, 37, 38, 69, 70, 101, 102, 133, 134, 165, 166, 197, 198,
+  229, 230, 261, 262, 293, 294, 325, 326, 357, 358, 389, 390, 421,
+  422, 453, 454, 485,
+  486, 517, 518, 549, 550, 581, 582, 613, 614, 645, 646, 677, 678,
+  709, 710, 741, 742, 773, 774, 805, 806, 837, 838, 869, 870, 901,
+  902, 933, 934, 965, 966, 997,
+  6, 6, 7, 38, 39, 70, 71, 102, 103, 134, 135, 166, 167, 198, 199,
+  230, 231, 262, 263, 294, 295, 326, 327, 358, 359, 390, 391, 422,
+  423, 454, 455, 486,
+  487, 518, 519, 550, 551, 582, 583, 614, 615, 646, 647, 678, 679,
+  710, 711, 742, 743, 774, 775, 806, 807, 838, 839, 870, 871, 902,
+  903, 934, 935, 966, 967, 998,
+  7, 7, 8, 39, 40, 71, 72, 103, 104, 135, 136, 167, 168, 199, 200,
+  231, 232, 263, 264, 295, 296, 327, 328, 359, 360, 391, 392, 423,
+  424, 455, 456, 487,
+  488, 519, 520, 551, 552, 583, 584, 615, 616, 647, 648, 679, 680,
+  711, 712, 743, 744, 775, 776, 807, 808, 839, 840, 871, 872, 903,
+  904, 935, 936, 967, 968, 999,
+  8, 8, 9, 40, 41, 72, 73, 104, 105, 136, 137, 168, 169, 200, 201,
+  232, 233, 264, 265, 296, 297, 328, 329, 360, 361, 392, 393, 424,
+  425, 456, 457, 488,
+  489, 520, 521, 552, 553, 584, 585, 616, 617, 648, 649, 680, 681,
+  712, 713, 744, 745, 776, 777, 808, 809, 840, 841, 872, 873, 904,
+  905, 936, 937, 968, 969, 1000,
+  9, 9, 10, 41, 42, 73, 74, 105, 106, 137, 138, 169, 170, 201, 202,
+  233, 234, 265, 266, 297, 298, 329, 330, 361, 362, 393, 394, 425,
+  426, 457, 458, 489,
+  490, 521, 522, 553, 554, 585, 586, 617, 618, 649, 650, 681, 682,
+  713, 714, 745, 746, 777, 778, 809, 810, 841, 842, 873, 874, 905,
+  906, 937, 938, 969, 970, 1001,
+  10, 10, 11, 42, 43, 74, 75, 106, 107, 138, 139, 170, 171, 202,
+  203, 234, 235, 266, 267, 298, 299, 330, 331, 362, 363, 394, 395,
+  426, 427, 458, 459, 490,
+  491, 522, 523, 554, 555, 586, 587, 618, 619, 650, 651, 682, 683,
+  714, 715, 746, 747, 778, 779, 810, 811, 842, 843, 874, 875, 906,
+  907, 938, 939, 970, 971, 1002,
+  11, 11, 12, 43, 44, 75, 76, 107, 108, 139, 140, 171, 172, 203,
+  204, 235, 236, 267, 268, 299, 300, 331, 332, 363, 364, 395, 396,
+  427, 428, 459, 460, 491,
+  492, 523, 524, 555, 556, 587, 588, 619, 620, 651, 652, 683, 684,
+  715, 716, 747, 748, 779, 780, 811, 812, 843, 844, 875, 876, 907,
+  908, 939, 940, 971, 972, 1003,
+  12, 12, 13, 44, 45, 76, 77, 108, 109, 140, 141, 172, 173, 204,
+  205, 236, 237, 268, 269, 300, 301, 332, 333, 364, 365, 396, 397,
+  428, 429, 460, 461, 492,
+  493, 524, 525, 556, 557, 588, 589, 620, 621, 652, 653, 684, 685,
+  716, 717, 748, 749, 780, 781, 812, 813, 844, 845, 876, 877, 908,
+  909, 940, 941, 972, 973, 1004,
+  13, 13, 14, 45, 46, 77, 78, 109, 110, 141, 142, 173, 174, 205,
+  206, 237, 238, 269, 270, 301, 302, 333, 334, 365, 366, 397, 398,
+  429, 430, 461, 462, 493,
+  494, 525, 526, 557, 558, 589, 590, 621, 622, 653, 654, 685, 686,
+  717, 718, 749, 750, 781, 782, 813, 814, 845, 846, 877, 878, 909,
+  910, 941, 942, 973, 974, 1005,
+  14, 14, 15, 46, 47, 78, 79, 110, 111, 142, 143, 174, 175, 206,
+  207, 238, 239, 270, 271, 302, 303, 334, 335, 366, 367, 398, 399,
+  430, 431, 462, 463, 494,
+  495, 526, 527, 558, 559, 590, 591, 622, 623, 654, 655, 686, 687,
+  718, 719, 750, 751, 782, 783, 814, 815, 846, 847, 878, 879, 910,
+  911, 942, 943, 974, 975, 1006,
+  15, 15, 16, 47, 48, 79, 80, 111, 112, 143, 144, 175, 176, 207,
+  208, 239, 240, 271, 272, 303, 304, 335, 336, 367, 368, 399, 400,
+  431, 432, 463, 464, 495,
+  496, 527, 528, 559, 560, 591, 592, 623, 624, 655, 656, 687, 688,
+  719, 720, 751, 752, 783, 784, 815, 816, 847, 848, 879, 880, 911,
+  912, 943, 944, 975, 976, 1007,
+  16, 16, 17, 48, 49, 80, 81, 112, 113, 144, 145, 176, 177, 208,
+  209, 240, 241, 272, 273, 304, 305, 336, 337, 368, 369, 400, 401,
+  432, 433, 464, 465, 496,
+  497, 528, 529, 560, 561, 592, 593, 624, 625, 656, 657, 688, 689,
+  720, 721, 752, 753, 784, 785, 816, 817, 848, 849, 880, 881, 912,
+  913, 944, 945, 976, 977, 1008,
+  17, 17, 18, 49, 50, 81, 82, 113, 114, 145, 146, 177, 178, 209,
+  210, 241, 242, 273, 274, 305, 306, 337, 338, 369, 370, 401, 402,
+  433, 434, 465, 466, 497,
+  498, 529, 530, 561, 562, 593, 594, 625, 626, 657, 658, 689, 690,
+  721, 722, 753, 754, 785, 786, 817, 818, 849, 850, 881, 882, 913,
+  914, 945, 946, 977, 978, 1009,
+  18, 18, 19, 50, 51, 82, 83, 114, 115, 146, 147, 178, 179, 210,
+  211, 242, 243, 274, 275, 306, 307, 338, 339, 370, 371, 402, 403,
+  434, 435, 466, 467, 498,
+  499, 530, 531, 562, 563, 594, 595, 626, 627, 658, 659, 690, 691,
+  722, 723, 754, 755, 786, 787, 818, 819, 850, 851, 882, 883, 914,
+  915, 946, 947, 978, 979, 1010,
+  19, 19, 20, 51, 52, 83, 84, 115, 116, 147, 148, 179, 180, 211,
+  212, 243, 244, 275, 276, 307, 308, 339, 340, 371, 372, 403, 404,
+  435, 436, 467, 468, 499,
+  500, 531, 532, 563, 564, 595, 596, 627, 628, 659, 660, 691, 692,
+  723, 724, 755, 756, 787, 788, 819, 820, 851, 852, 883, 884, 915,
+  916, 947, 948, 979, 980, 1011,
+  20, 20, 21, 52, 53, 84, 85, 116, 117, 148, 149, 180, 181, 212,
+  213, 244, 245, 276, 277, 308, 309, 340, 341, 372, 373, 404, 405,
+  436, 437, 468, 469, 500,
+  501, 532, 533, 564, 565, 596, 597, 628, 629, 660, 661, 692, 693,
+  724, 725, 756, 757, 788, 789, 820, 821, 852, 853, 884, 885, 916,
+  917, 948, 949, 980, 981, 1012,
+  21, 21, 22, 53, 54, 85, 86, 117, 118, 149, 150, 181, 182, 213,
+  214, 245, 246, 277, 278, 309, 310, 341, 342, 373, 374, 405, 406,
+  437, 438, 469, 470, 501,
+  502, 533, 534, 565, 566, 597, 598, 629, 630, 661, 662, 693, 694,
+  725, 726, 757, 758, 789, 790, 821, 822, 853, 854, 885, 886, 917,
+  918, 949, 950, 981, 982, 1013,
+  22, 22, 23, 54, 55, 86, 87, 118, 119, 150, 151, 182, 183, 214,
+  215, 246, 247, 278, 279, 310, 311, 342, 343, 374, 375, 406, 407,
+  438, 439, 470, 471, 502,
+  503, 534, 535, 566, 567, 598, 599, 630, 631, 662, 663, 694, 695,
+  726, 727, 758, 759, 790, 791, 822, 823, 854, 855, 886, 887, 918,
+  919, 950, 951, 982, 983, 1014,
+  23, 23, 24, 55, 56, 87, 88, 119, 120, 151, 152, 183, 184, 215,
+  216, 247, 248, 279, 280, 311, 312, 343, 344, 375, 376, 407, 408,
+  439, 440, 471, 472, 503,
+  504, 535, 536, 567, 568, 599, 600, 631, 632, 663, 664, 695, 696,
+  727, 728, 759, 760, 791, 792, 823, 824, 855, 856, 887, 888, 919,
+  920, 951, 952, 983, 984, 1015,
+  24, 24, 25, 56, 57, 88, 89, 120, 121, 152, 153, 184, 185, 216,
+  217, 248, 249, 280, 281, 312, 313, 344, 345, 376, 377, 408, 409,
+  440, 441, 472, 473, 504,
+  505, 536, 537, 568, 569, 600, 601, 632, 633, 664, 665, 696, 697,
+  728, 729, 760, 761, 792, 793, 824, 825, 856, 857, 888, 889, 920,
+  921, 952, 953, 984, 985, 1016,
+  25, 25, 26, 57, 58, 89, 90, 121, 122, 153, 154, 185, 186, 217,
+  218, 249, 250, 281, 282, 313, 314, 345, 346, 377, 378, 409, 410,
+  441, 442, 473, 474, 505,
+  506, 537, 538, 569, 570, 601, 602, 633, 634, 665, 666, 697, 698,
+  729, 730, 761, 762, 793, 794, 825, 826, 857, 858, 889, 890, 921,
+  922, 953, 954, 985, 986, 1017,
+  26, 26, 27, 58, 59, 90, 91, 122, 123, 154, 155, 186, 187, 218,
+  219, 250, 251, 282, 283, 314, 315, 346, 347, 378, 379, 410, 411,
+  442, 443, 474, 475, 506,
+  507, 538, 539, 570, 571, 602, 603, 634, 635, 666, 667, 698, 699,
+  730, 731, 762, 763, 794, 795, 826, 827, 858, 859, 890, 891, 922,
+  923, 954, 955, 986, 987, 1018,
+  27, 27, 28, 59, 60, 91, 92, 123, 124, 155, 156, 187, 188, 219,
+  220, 251, 252, 283, 284, 315, 316, 347, 348, 379, 380, 411, 412,
+  443, 444, 475, 476, 507,
+  508, 539, 540, 571, 572, 603, 604, 635, 636, 667, 668, 699, 700,
+  731, 732, 763, 764, 795, 796, 827, 828, 859, 860, 891, 892, 923,
+  924, 955, 956, 987, 988, 1019,
+  28, 28, 29, 60, 61, 92, 93, 124, 125, 156, 157, 188, 189, 220,
+  221, 252, 253, 284, 285, 316, 317, 348, 349, 380, 381, 412, 413,
+  444, 445, 476, 477, 508,
+  509, 540, 541, 572, 573, 604, 605, 636, 637, 668, 669, 700, 701,
+  732, 733, 764, 765, 796, 797, 828, 829, 860, 861, 892, 893, 924,
+  925, 956, 957, 988, 989, 1020,
+  29, 29, 30, 61, 62, 93, 94, 125, 126, 157, 158, 189, 190, 221,
+  222, 253, 254, 285, 286, 317, 318, 349, 350, 381, 382, 413, 414,
+  445, 446, 477, 478, 509,
+  510, 541, 542, 573, 574, 605, 606, 637, 638, 669, 670, 701, 702,
+  733, 734, 765, 766, 797, 798, 829, 830, 861, 862, 893, 894, 925,
+  926, 957, 958, 989, 990, 1021,
+  30, 30, 31, 62, 63, 94, 95, 126, 127, 158, 159, 190, 191, 222,
+  223, 254, 255, 286, 287, 318, 319, 350, 351, 382, 383, 414, 415,
+  446, 447, 478, 479, 510,
+  511, 542, 543, 574, 575, 606, 607, 638, 639, 670, 671, 702, 703,
+  734, 735, 766, 767, 798, 799, 830, 831, 862, 863, 894, 895, 926,
+  927, 958, 959, 990, 991, 1022,
+  0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                mrow_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9,
+  9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14,
+  15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22,
+  23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30,
+  0, 0, 1, 32, 2, 33, 3, 34, 4, 35, 5, 36, 6, 37, 7, 38, 8, 39, 9,
+  40, 10, 41, 11, 42, 12, 43, 13, 44, 14, 45, 15, 46,
+  16, 47, 17, 48, 18, 49, 19, 50, 20, 51, 21, 52, 22, 53, 23, 54,
+  24, 55, 25, 56, 26, 57, 27, 58, 28, 59, 29, 60, 30, 61, 31, 62,
+  32, 32, 33, 64, 34, 65, 35, 66, 36, 67, 37, 68, 38, 69, 39, 70,
+  40, 71, 41, 72, 42, 73, 43, 74, 44, 75, 45, 76, 46, 77, 47, 78,
+  48, 79, 49, 80, 50, 81, 51, 82, 52, 83, 53, 84, 54, 85, 55, 86,
+  56, 87, 57, 88, 58, 89, 59, 90, 60, 91, 61, 92, 62, 93, 63, 94,
+  64, 64, 65, 96, 66, 97, 67, 98, 68, 99, 69, 100, 70, 101, 71,
+  102, 72, 103, 73, 104, 74, 105, 75, 106, 76, 107, 77, 108, 78,
+  109, 79, 110,
+  80, 111, 81, 112, 82, 113, 83, 114, 84, 115, 85, 116, 86, 117,
+  87, 118, 88, 119, 89, 120, 90, 121, 91, 122, 92, 123, 93, 124,
+  94, 125, 95, 126,
+  96, 96, 97, 128, 98, 129, 99, 130, 100, 131, 101, 132, 102,
+  133, 103, 134, 104, 135, 105, 136, 106, 137, 107, 138, 108,
+  139, 109, 140, 110, 141, 111, 142,
+  112, 143, 113, 144, 114, 145, 115, 146, 116, 147, 117, 148,
+  118, 149, 119, 150, 120, 151, 121, 152, 122, 153, 123, 154,
+  124, 155, 125, 156, 126, 157, 127, 158,
+  128, 128, 129, 160, 130, 161, 131, 162, 132, 163, 133, 164,
+  134, 165, 135, 166, 136, 167, 137, 168, 138, 169, 139, 170,
+  140, 171, 141, 172, 142, 173, 143, 174,
+  144, 175, 145, 176, 146, 177, 147, 178, 148, 179, 149, 180,
+  150, 181, 151, 182, 152, 183, 153, 184, 154, 185, 155, 186,
+  156, 187, 157, 188, 158, 189, 159, 190,
+  160, 160, 161, 192, 162, 193, 163, 194, 164, 195, 165, 196,
+  166, 197, 167, 198, 168, 199, 169, 200, 170, 201, 171, 202,
+  172, 203, 173, 204, 174, 205, 175, 206,
+  176, 207, 177, 208, 178, 209, 179, 210, 180, 211, 181, 212,
+  182, 213, 183, 214, 184, 215, 185, 216, 186, 217, 187, 218,
+  188, 219, 189, 220, 190, 221, 191, 222,
+  192, 192, 193, 224, 194, 225, 195, 226, 196, 227, 197, 228,
+  198, 229, 199, 230, 200, 231, 201, 232, 202, 233, 203, 234,
+  204, 235, 205, 236, 206, 237, 207, 238,
+  208, 239, 209, 240, 210, 241, 211, 242, 212, 243, 213, 244,
+  214, 245, 215, 246, 216, 247, 217, 248, 218, 249, 219, 250,
+  220, 251, 221, 252, 222, 253, 223, 254,
+  224, 224, 225, 256, 226, 257, 227, 258, 228, 259, 229, 260,
+  230, 261, 231, 262, 232, 263, 233, 264, 234, 265, 235, 266,
+  236, 267, 237, 268, 238, 269, 239, 270,
+  240, 271, 241, 272, 242, 273, 243, 274, 244, 275, 245, 276,
+  246, 277, 247, 278, 248, 279, 249, 280, 250, 281, 251, 282,
+  252, 283, 253, 284, 254, 285, 255, 286,
+  256, 256, 257, 288, 258, 289, 259, 290, 260, 291, 261, 292,
+  262, 293, 263, 294, 264, 295, 265, 296, 266, 297, 267, 298,
+  268, 299, 269, 300, 270, 301, 271, 302,
+  272, 303, 273, 304, 274, 305, 275, 306, 276, 307, 277, 308,
+  278, 309, 279, 310, 280, 311, 281, 312, 282, 313, 283, 314,
+  284, 315, 285, 316, 286, 317, 287, 318,
+  288, 288, 289, 320, 290, 321, 291, 322, 292, 323, 293, 324,
+  294, 325, 295, 326, 296, 327, 297, 328, 298, 329, 299, 330,
+  300, 331, 301, 332, 302, 333, 303, 334,
+  304, 335, 305, 336, 306, 337, 307, 338, 308, 339, 309, 340,
+  310, 341, 311, 342, 312, 343, 313, 344, 314, 345, 315, 346,
+  316, 347, 317, 348, 318, 349, 319, 350,
+  320, 320, 321, 352, 322, 353, 323, 354, 324, 355, 325, 356,
+  326, 357, 327, 358, 328, 359, 329, 360, 330, 361, 331, 362,
+  332, 363, 333, 364, 334, 365, 335, 366,
+  336, 367, 337, 368, 338, 369, 339, 370, 340, 371, 341, 372,
+  342, 373, 343, 374, 344, 375, 345, 376, 346, 377, 347, 378,
+  348, 379, 349, 380, 350, 381, 351, 382,
+  352, 352, 353, 384, 354, 385, 355, 386, 356, 387, 357, 388,
+  358, 389, 359, 390, 360, 391, 361, 392, 362, 393, 363, 394,
+  364, 395, 365, 396, 366, 397, 367, 398,
+  368, 399, 369, 400, 370, 401, 371, 402, 372, 403, 373, 404,
+  374, 405, 375, 406, 376, 407, 377, 408, 378, 409, 379, 410,
+  380, 411, 381, 412, 382, 413, 383, 414,
+  384, 384, 385, 416, 386, 417, 387, 418, 388, 419, 389, 420,
+  390, 421, 391, 422, 392, 423, 393, 424, 394, 425, 395, 426,
+  396, 427, 397, 428, 398, 429, 399, 430,
+  400, 431, 401, 432, 402, 433, 403, 434, 404, 435, 405, 436,
+  406, 437, 407, 438, 408, 439, 409, 440, 410, 441, 411, 442,
+  412, 443, 413, 444, 414, 445, 415, 446,
+  416, 416, 417, 448, 418, 449, 419, 450, 420, 451, 421, 452,
+  422, 453, 423, 454, 424, 455, 425, 456, 426, 457, 427, 458,
+  428, 459, 429, 460, 430, 461, 431, 462,
+  432, 463, 433, 464, 434, 465, 435, 466, 436, 467, 437, 468,
+  438, 469, 439, 470, 440, 471, 441, 472, 442, 473, 443, 474,
+  444, 475, 445, 476, 446, 477, 447, 478,
+  448, 448, 449, 480, 450, 481, 451, 482, 452, 483, 453, 484,
+  454, 485, 455, 486, 456, 487, 457, 488, 458, 489, 459, 490,
+  460, 491, 461, 492, 462, 493, 463, 494,
+  464, 495, 465, 496, 466, 497, 467, 498, 468, 499, 469, 500,
+  470, 501, 471, 502, 472, 503, 473, 504, 474, 505, 475, 506,
+  476, 507, 477, 508, 478, 509, 479, 510,
+  480, 480, 481, 512, 482, 513, 483, 514, 484, 515, 485, 516,
+  486, 517, 487, 518, 488, 519, 489, 520, 490, 521, 491, 522,
+  492, 523, 493, 524, 494, 525, 495, 526,
+  496, 527, 497, 528, 498, 529, 499, 530, 500, 531, 501, 532,
+  502, 533, 503, 534, 504, 535, 505, 536, 506, 537, 507, 538,
+  508, 539, 509, 540, 510, 541, 511, 542,
+  512, 512, 513, 544, 514, 545, 515, 546, 516, 547, 517, 548,
+  518, 549, 519, 550, 520, 551, 521, 552, 522, 553, 523, 554,
+  524, 555, 525, 556, 526, 557, 527, 558,
+  528, 559, 529, 560, 530, 561, 531, 562, 532, 563, 533, 564,
+  534, 565, 535, 566, 536, 567, 537, 568, 538, 569, 539, 570,
+  540, 571, 541, 572, 542, 573, 543, 574,
+  544, 544, 545, 576, 546, 577, 547, 578, 548, 579, 549, 580,
+  550, 581, 551, 582, 552, 583, 553, 584, 554, 585, 555, 586,
+  556, 587, 557, 588, 558, 589, 559, 590,
+  560, 591, 561, 592, 562, 593, 563, 594, 564, 595, 565, 596,
+  566, 597, 567, 598, 568, 599, 569, 600, 570, 601, 571, 602,
+  572, 603, 573, 604, 574, 605, 575, 606,
+  576, 576, 577, 608, 578, 609, 579, 610, 580, 611, 581, 612,
+  582, 613, 583, 614, 584, 615, 585, 616, 586, 617, 587, 618,
+  588, 619, 589, 620, 590, 621, 591, 622,
+  592, 623, 593, 624, 594, 625, 595, 626, 596, 627, 597, 628,
+  598, 629, 599, 630, 600, 631, 601, 632, 602, 633, 603, 634,
+  604, 635, 605, 636, 606, 637, 607, 638,
+  608, 608, 609, 640, 610, 641, 611, 642, 612, 643, 613, 644,
+  614, 645, 615, 646, 616, 647, 617, 648, 618, 649, 619, 650,
+  620, 651, 621, 652, 622, 653, 623, 654,
+  624, 655, 625, 656, 626, 657, 627, 658, 628, 659, 629, 660,
+  630, 661, 631, 662, 632, 663, 633, 664, 634, 665, 635, 666,
+  636, 667, 637, 668, 638, 669, 639, 670,
+  640, 640, 641, 672, 642, 673, 643, 674, 644, 675, 645, 676,
+  646, 677, 647, 678, 648, 679, 649, 680, 650, 681, 651, 682,
+  652, 683, 653, 684, 654, 685, 655, 686,
+  656, 687, 657, 688, 658, 689, 659, 690, 660, 691, 661, 692,
+  662, 693, 663, 694, 664, 695, 665, 696, 666, 697, 667, 698,
+  668, 699, 669, 700, 670, 701, 671, 702,
+  672, 672, 673, 704, 674, 705, 675, 706, 676, 707, 677, 708,
+  678, 709, 679, 710, 680, 711, 681, 712, 682, 713, 683, 714,
+  684, 715, 685, 716, 686, 717, 687, 718,
+  688, 719, 689, 720, 690, 721, 691, 722, 692, 723, 693, 724,
+  694, 725, 695, 726, 696, 727, 697, 728, 698, 729, 699, 730,
+  700, 731, 701, 732, 702, 733, 703, 734,
+  704, 704, 705, 736, 706, 737, 707, 738, 708, 739, 709, 740,
+  710, 741, 711, 742, 712, 743, 713, 744, 714, 745, 715, 746,
+  716, 747, 717, 748, 718, 749, 719, 750,
+  720, 751, 721, 752, 722, 753, 723, 754, 724, 755, 725, 756,
+  726, 757, 727, 758, 728, 759, 729, 760, 730, 761, 731, 762,
+  732, 763, 733, 764, 734, 765, 735, 766,
+  736, 736, 737, 768, 738, 769, 739, 770, 740, 771, 741, 772,
+  742, 773, 743, 774, 744, 775, 745, 776, 746, 777, 747, 778,
+  748, 779, 749, 780, 750, 781, 751, 782,
+  752, 783, 753, 784, 754, 785, 755, 786, 756, 787, 757, 788,
+  758, 789, 759, 790, 760, 791, 761, 792, 762, 793, 763, 794,
+  764, 795, 765, 796, 766, 797, 767, 798,
+  768, 768, 769, 800, 770, 801, 771, 802, 772, 803, 773, 804,
+  774, 805, 775, 806, 776, 807, 777, 808, 778, 809, 779, 810,
+  780, 811, 781, 812, 782, 813, 783, 814,
+  784, 815, 785, 816, 786, 817, 787, 818, 788, 819, 789, 820,
+  790, 821, 791, 822, 792, 823, 793, 824, 794, 825, 795, 826,
+  796, 827, 797, 828, 798, 829, 799, 830,
+  800, 800, 801, 832, 802, 833, 803, 834, 804, 835, 805, 836,
+  806, 837, 807, 838, 808, 839, 809, 840, 810, 841, 811, 842,
+  812, 843, 813, 844, 814, 845, 815, 846,
+  816, 847, 817, 848, 818, 849, 819, 850, 820, 851, 821, 852,
+  822, 853, 823, 854, 824, 855, 825, 856, 826, 857, 827, 858,
+  828, 859, 829, 860, 830, 861, 831, 862,
+  832, 832, 833, 864, 834, 865, 835, 866, 836, 867, 837, 868,
+  838, 869, 839, 870, 840, 871, 841, 872, 842, 873, 843, 874,
+  844, 875, 845, 876, 846, 877, 847, 878,
+  848, 879, 849, 880, 850, 881, 851, 882, 852, 883, 853, 884,
+  854, 885, 855, 886, 856, 887, 857, 888, 858, 889, 859, 890,
+  860, 891, 861, 892, 862, 893, 863, 894,
+  864, 864, 865, 896, 866, 897, 867, 898, 868, 899, 869, 900,
+  870, 901, 871, 902, 872, 903, 873, 904, 874, 905, 875, 906,
+  876, 907, 877, 908, 878, 909, 879, 910,
+  880, 911, 881, 912, 882, 913, 883, 914, 884, 915, 885, 916,
+  886, 917, 887, 918, 888, 919, 889, 920, 890, 921, 891, 922,
+  892, 923, 893, 924, 894, 925, 895, 926,
+  896, 896, 897, 928, 898, 929, 899, 930, 900, 931, 901, 932,
+  902, 933, 903, 934, 904, 935, 905, 936, 906, 937, 907, 938,
+  908, 939, 909, 940, 910, 941, 911, 942,
+  912, 943, 913, 944, 914, 945, 915, 946, 916, 947, 917, 948,
+  918, 949, 919, 950, 920, 951, 921, 952, 922, 953, 923, 954,
+  924, 955, 925, 956, 926, 957, 927, 958,
+  928, 928, 929, 960, 930, 961, 931, 962, 932, 963, 933, 964,
+  934, 965, 935, 966, 936, 967, 937, 968, 938, 969, 939, 970,
+  940, 971, 941, 972, 942, 973, 943, 974,
+  944, 975, 945, 976, 946, 977, 947, 978, 948, 979, 949, 980,
+  950, 981, 951, 982, 952, 983, 953, 984, 954, 985, 955, 986,
+  956, 987, 957, 988, 958, 989, 959, 990,
+  960, 960, 961, 992, 962, 993, 963, 994, 964, 995, 965, 996,
+  966, 997, 967, 998, 968, 999, 969, 1000, 970, 1001, 971, 1002,
+  972, 1003, 973, 1004, 974, 1005, 975, 1006,
+  976, 1007, 977, 1008, 978, 1009, 979, 1010, 980, 1011, 981,
+  1012, 982, 1013, 983, 1014, 984, 1015, 985, 1016, 986, 1017,
+  987, 1018, 988, 1019, 989, 1020, 990, 1021, 991, 1022,
+  0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
                 default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = {
   0, 0, 0, 0, 0, 0, 32, 32, 1, 32, 1, 1, 64, 64, 33, 64,
   2, 33, 96, 96, 2, 2, 65, 96, 34, 65, 128, 128, 97, 128, 3, 34,
@@ -1335,6 +2143,14 @@
   0, 2, 5, 8, 1, 3, 9, 12, 4, 7, 11, 14, 6, 10, 13, 15,
 };
 
+DECLARE_ALIGNED(16, static const int16_t, vp10_mcol_iscan_4x4[16]) = {
+  0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, vp10_mrow_iscan_4x4[16]) = {
+  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+};
+
 DECLARE_ALIGNED(16, static const int16_t, vp10_col_iscan_4x4[16]) = {
   0, 3, 7, 11, 1, 5, 9, 12, 2, 6, 10, 14, 4, 8, 13, 15,
 };
@@ -1343,6 +2159,20 @@
   0, 1, 3, 5, 2, 4, 6, 9, 7, 8, 11, 13, 10, 12, 14, 15,
 };
 
+DECLARE_ALIGNED(16, static const int16_t, vp10_mcol_iscan_8x8[64]) = {
+  0, 8, 16, 24, 32, 40, 48, 56, 1, 9, 17, 25, 33, 41, 49, 57, 2, 10,
+  18, 26, 34, 42, 50, 58, 3, 11, 19, 27, 35, 43, 51, 59, 4, 12, 20,
+  28, 36, 44, 52, 60, 5, 13, 21, 29, 37, 45, 53, 61, 6, 14, 22, 30,
+  38, 46, 54, 62, 7, 15, 23, 31, 39, 47, 55, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, vp10_mrow_iscan_8x8[64]) = {
+  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+  20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
+  37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
+  54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+};
+
 DECLARE_ALIGNED(16, static const int16_t, vp10_col_iscan_8x8[64]) = {
   0, 3, 8, 15, 22, 32, 40, 47, 1, 5, 11, 18, 26, 34, 44, 51,
   2, 7, 13, 20, 28, 38, 46, 54, 4, 10, 16, 24, 31, 41, 50, 56,
@@ -1364,6 +2194,53 @@
   25, 32, 39, 45, 50, 55, 59, 62, 33, 40, 46, 51, 54, 58, 61, 63,
 };
 
+DECLARE_ALIGNED(16, static const int16_t, vp10_mcol_iscan_16x16[256]) = {
+  0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240,
+  1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241,
+  2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242,
+  3, 19, 35, 51, 67, 83, 99, 115, 131, 147, 163, 179, 195, 211, 227, 243,
+  4, 20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244,
+  5, 21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245,
+  6, 22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246,
+  7, 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247,
+  8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248,
+  9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249,
+  10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250,
+  11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251,
+  12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252,
+  13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253,
+  14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254,
+  15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255,
+};  // Entry [16 * r + c] holds 16 * c + r (transposed raster index).
+
+DECLARE_ALIGNED(16, static const int16_t, vp10_mrow_iscan_16x16[256]) = {
+  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+  64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+  80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
+  96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
+  112, 113, 114, 115, 116, 117, 118, 119,
+  120, 121, 122, 123, 124, 125, 126, 127,
+  128, 129, 130, 131, 132, 133, 134, 135,
+  136, 137, 138, 139, 140, 141, 142, 143,
+  144, 145, 146, 147, 148, 149, 150, 151,
+  152, 153, 154, 155, 156, 157, 158, 159,
+  160, 161, 162, 163, 164, 165, 166, 167,
+  168, 169, 170, 171, 172, 173, 174, 175,
+  176, 177, 178, 179, 180, 181, 182, 183,
+  184, 185, 186, 187, 188, 189, 190, 191,
+  192, 193, 194, 195, 196, 197, 198, 199,
+  200, 201, 202, 203, 204, 205, 206, 207,
+  208, 209, 210, 211, 212, 213, 214, 215,
+  216, 217, 218, 219, 220, 221, 222, 223,
+  224, 225, 226, 227, 228, 229, 230, 231,
+  232, 233, 234, 235, 236, 237, 238, 239,
+  240, 241, 242, 243, 244, 245, 246, 247,
+  248, 249, 250, 251, 252, 253, 254, 255,
+};  // Identity mapping: entry [i] holds i.
+
 DECLARE_ALIGNED(16, static const int16_t, vp10_col_iscan_16x16[256]) = {
   0, 4, 11, 20, 31, 43, 59, 75, 85, 109, 130, 150, 165, 181, 195, 198,
   1, 6, 14, 23, 34, 47, 64, 81, 95, 114, 135, 153, 171, 188, 201, 212,
@@ -1423,6 +2300,204 @@
   249, 253, 255,
 };
 
+DECLARE_ALIGNED(16, static const int16_t, vp10_mcol_iscan_32x32[1024]) = {
+  0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416,
+  448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832,
+  864, 896, 928, 960, 992,
+  1, 33, 65, 97, 129, 161, 193, 225, 257, 289, 321, 353, 385, 417,
+  449, 481, 513, 545, 577, 609, 641, 673, 705, 737, 769, 801, 833,
+  865, 897, 929, 961, 993,
+  2, 34, 66, 98, 130, 162, 194, 226, 258, 290, 322, 354, 386, 418,
+  450, 482, 514, 546, 578, 610, 642, 674, 706, 738, 770, 802, 834,
+  866, 898, 930, 962, 994,
+  3, 35, 67, 99, 131, 163, 195, 227, 259, 291, 323, 355, 387, 419,
+  451, 483, 515, 547, 579, 611, 643, 675, 707, 739, 771, 803, 835,
+  867, 899, 931, 963, 995,
+  4, 36, 68, 100, 132, 164, 196, 228, 260, 292, 324, 356, 388, 420,
+  452, 484, 516, 548, 580, 612, 644, 676, 708, 740, 772, 804, 836,
+  868, 900, 932, 964, 996,
+  5, 37, 69, 101, 133, 165, 197, 229, 261, 293, 325, 357, 389, 421,
+  453, 485, 517, 549, 581, 613, 645, 677, 709, 741, 773, 805, 837,
+  869, 901, 933, 965, 997,
+  6, 38, 70, 102, 134, 166, 198, 230, 262, 294, 326, 358, 390, 422,
+  454, 486, 518, 550, 582, 614, 646, 678, 710, 742, 774, 806, 838,
+  870, 902, 934, 966, 998,
+  7, 39, 71, 103, 135, 167, 199, 231, 263, 295, 327, 359, 391, 423,
+  455, 487, 519, 551, 583, 615, 647, 679, 711, 743, 775, 807, 839,
+  871, 903, 935, 967, 999,
+  8, 40, 72, 104, 136, 168, 200, 232, 264, 296, 328, 360, 392, 424,
+  456, 488, 520, 552, 584, 616, 648, 680, 712, 744, 776, 808, 840,
+  872, 904, 936, 968, 1000,
+  9, 41, 73, 105, 137, 169, 201, 233, 265, 297, 329, 361, 393, 425,
+  457, 489, 521, 553, 585, 617, 649, 681, 713, 745, 777, 809, 841,
+  873, 905, 937, 969, 1001,
+  10, 42, 74, 106, 138, 170, 202, 234, 266, 298, 330, 362, 394,
+  426, 458, 490, 522, 554, 586, 618, 650, 682, 714, 746, 778, 810,
+  842, 874, 906, 938, 970, 1002,
+  11, 43, 75, 107, 139, 171, 203, 235, 267, 299, 331, 363, 395,
+  427, 459, 491, 523, 555, 587, 619, 651, 683, 715, 747, 779, 811,
+  843, 875, 907, 939, 971, 1003,
+  12, 44, 76, 108, 140, 172, 204, 236, 268, 300, 332, 364, 396,
+  428, 460, 492, 524, 556, 588, 620, 652, 684, 716, 748, 780,
+  812, 844, 876, 908, 940, 972, 1004,
+  13, 45, 77, 109, 141, 173, 205, 237, 269, 301, 333, 365, 397,
+  429, 461, 493, 525, 557, 589, 621, 653, 685, 717, 749, 781,
+  813, 845, 877, 909, 941, 973, 1005,
+  14, 46, 78, 110, 142, 174, 206, 238, 270, 302, 334, 366, 398,
+  430, 462, 494, 526, 558, 590, 622, 654, 686, 718, 750, 782,
+  814, 846, 878, 910, 942, 974, 1006,
+  15, 47, 79, 111, 143, 175, 207, 239, 271, 303, 335, 367, 399,
+  431, 463, 495, 527, 559, 591, 623, 655, 687, 719, 751, 783,
+  815, 847, 879, 911, 943, 975, 1007,
+  16, 48, 80, 112, 144, 176, 208, 240, 272, 304, 336, 368, 400,
+  432, 464, 496, 528, 560, 592, 624, 656, 688, 720, 752, 784,
+  816, 848, 880, 912, 944, 976, 1008,
+  17, 49, 81, 113, 145, 177, 209, 241, 273, 305, 337, 369, 401,
+  433, 465, 497, 529, 561, 593, 625, 657, 689, 721, 753, 785,
+  817, 849, 881, 913, 945, 977, 1009,
+  18, 50, 82, 114, 146, 178, 210, 242, 274, 306, 338, 370, 402,
+  434, 466, 498, 530, 562, 594, 626, 658, 690, 722, 754, 786,
+  818, 850, 882, 914, 946, 978, 1010,
+  19, 51, 83, 115, 147, 179, 211, 243, 275, 307, 339, 371, 403,
+  435, 467, 499, 531, 563, 595, 627, 659, 691, 723, 755, 787,
+  819, 851, 883, 915, 947, 979, 1011,
+  20, 52, 84, 116, 148, 180, 212, 244, 276, 308, 340, 372, 404,
+  436, 468, 500, 532, 564, 596, 628, 660, 692, 724, 756, 788,
+  820, 852, 884, 916, 948, 980, 1012,
+  21, 53, 85, 117, 149, 181, 213, 245, 277, 309, 341, 373, 405,
+  437, 469, 501, 533, 565, 597, 629, 661, 693, 725, 757, 789,
+  821, 853, 885, 917, 949, 981, 1013,
+  22, 54, 86, 118, 150, 182, 214, 246, 278, 310, 342, 374, 406,
+  438, 470, 502, 534, 566, 598, 630, 662, 694, 726, 758, 790,
+  822, 854, 886, 918, 950, 982, 1014,
+  23, 55, 87, 119, 151, 183, 215, 247, 279, 311, 343, 375, 407,
+  439, 471, 503, 535, 567, 599, 631, 663, 695, 727, 759, 791,
+  823, 855, 887, 919, 951, 983, 1015,
+  24, 56, 88, 120, 152, 184, 216, 248, 280, 312, 344, 376, 408,
+  440, 472, 504, 536, 568, 600, 632, 664, 696, 728, 760, 792,
+  824, 856, 888, 920, 952, 984, 1016,
+  25, 57, 89, 121, 153, 185, 217, 249, 281, 313, 345, 377, 409,
+  441, 473, 505, 537, 569, 601, 633, 665, 697, 729, 761, 793,
+  825, 857, 889, 921, 953, 985, 1017,
+  26, 58, 90, 122, 154, 186, 218, 250, 282, 314, 346, 378, 410,
+  442, 474, 506, 538, 570, 602, 634, 666, 698, 730, 762, 794,
+  826, 858, 890, 922, 954, 986, 1018,
+  27, 59, 91, 123, 155, 187, 219, 251, 283, 315, 347, 379, 411,
+  443, 475, 507, 539, 571, 603, 635, 667, 699, 731, 763, 795,
+  827, 859, 891, 923, 955, 987, 1019,
+  28, 60, 92, 124, 156, 188, 220, 252, 284, 316, 348, 380, 412,
+  444, 476, 508, 540, 572, 604, 636, 668, 700, 732, 764, 796,
+  828, 860, 892, 924, 956, 988, 1020,
+  29, 61, 93, 125, 157, 189, 221, 253, 285, 317, 349, 381, 413,
+  445, 477, 509, 541, 573, 605, 637, 669, 701, 733, 765, 797,
+  829, 861, 893, 925, 957, 989, 1021,
+  30, 62, 94, 126, 158, 190, 222, 254, 286, 318, 350, 382, 414,
+  446, 478, 510, 542, 574, 606, 638, 670, 702, 734, 766, 798,
+  830, 862, 894, 926, 958, 990, 1022,
+  31, 63, 95, 127, 159, 191, 223, 255, 287, 319, 351, 383, 415,
+  447, 479, 511, 543, 575, 607, 639, 671, 703, 735, 767, 799,
+  831, 863, 895, 927, 959, 991, 1023,
+};  // Entry [32 * r + c] holds 32 * c + r (transposed raster index).
+
+DECLARE_ALIGNED(16, static const int16_t, vp10_mrow_iscan_32x32[1024]) = {
+  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+  17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
+  46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+  60, 61, 62, 63,
+  64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
+  78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91,
+  92, 93, 94, 95,
+  96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
+  108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118,
+  119, 120, 121, 122, 123, 124, 125, 126, 127,
+  128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138,
+  139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+  150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
+  160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170,
+  171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181,
+  182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
+  192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202,
+  203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213,
+  214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
+  224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234,
+  235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245,
+  246, 247, 248, 249, 250, 251, 252, 253, 254, 255,
+  256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266,
+  267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277,
+  278, 279, 280, 281, 282, 283, 284, 285, 286, 287,
+  288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298,
+  299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309,
+  310, 311, 312, 313, 314, 315, 316, 317, 318, 319,
+  320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330,
+  331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341,
+  342, 343, 344, 345, 346, 347, 348, 349, 350, 351,
+  352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362,
+  363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373,
+  374, 375, 376, 377, 378, 379, 380, 381, 382, 383,
+  384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394,
+  395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405,
+  406, 407, 408, 409, 410, 411, 412, 413, 414, 415,
+  416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426,
+  427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437,
+  438, 439, 440, 441, 442, 443, 444, 445, 446, 447,
+  448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458,
+  459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469,
+  470, 471, 472, 473, 474, 475, 476, 477, 478, 479,
+  480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490,
+  491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501,
+  502, 503, 504, 505, 506, 507, 508, 509, 510, 511,
+  512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522,
+  523, 524, 525, 526, 527, 528, 529, 530, 531, 532, 533,
+  534, 535, 536, 537, 538, 539, 540, 541, 542, 543,
+  544, 545, 546, 547, 548, 549, 550, 551, 552, 553, 554,
+  555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565,
+  566, 567, 568, 569, 570, 571, 572, 573, 574, 575,
+  576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586,
+  587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597,
+  598, 599, 600, 601, 602, 603, 604, 605, 606, 607,
+  608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618,
+  619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629,
+  630, 631, 632, 633, 634, 635, 636, 637, 638, 639,
+  640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650,
+  651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661,
+  662, 663, 664, 665, 666, 667, 668, 669, 670, 671,
+  672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682,
+  683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693,
+  694, 695, 696, 697, 698, 699, 700, 701, 702, 703,
+  704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714,
+  715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725,
+  726, 727, 728, 729, 730, 731, 732, 733, 734, 735,
+  736, 737, 738, 739, 740, 741, 742, 743, 744, 745, 746,
+  747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 757,
+  758, 759, 760, 761, 762, 763, 764, 765, 766, 767,
+  768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778,
+  779, 780, 781, 782, 783, 784, 785, 786, 787, 788, 789,
+  790, 791, 792, 793, 794, 795, 796, 797, 798, 799,
+  800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810,
+  811, 812, 813, 814, 815, 816, 817, 818, 819, 820, 821,
+  822, 823, 824, 825, 826, 827, 828, 829, 830, 831,
+  832, 833, 834, 835, 836, 837, 838, 839, 840, 841, 842,
+  843, 844, 845, 846, 847, 848, 849, 850, 851, 852, 853,
+  854, 855, 856, 857, 858, 859, 860, 861, 862, 863,
+  864, 865, 866, 867, 868, 869, 870, 871, 872, 873, 874,
+  875, 876, 877, 878, 879, 880, 881, 882, 883, 884, 885,
+  886, 887, 888, 889, 890, 891, 892, 893, 894, 895,
+  896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906,
+  907, 908, 909, 910, 911, 912, 913, 914, 915, 916, 917,
+  918, 919, 920, 921, 922, 923, 924, 925, 926, 927,
+  928, 929, 930, 931, 932, 933, 934, 935, 936, 937, 938,
+  939, 940, 941, 942, 943, 944, 945, 946, 947, 948, 949,
+  950, 951, 952, 953, 954, 955, 956, 957, 958, 959,
+  960, 961, 962, 963, 964, 965, 966, 967, 968, 969, 970,
+  971, 972, 973, 974, 975, 976, 977, 978, 979, 980, 981,
+  982, 983, 984, 985, 986, 987, 988, 989, 990, 991,
+  992, 993, 994, 995, 996, 997, 998, 999, 1000, 1001,
+  1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010,
+  1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019,
+  1020, 1021, 1022, 1023,
+};  // Identity mapping: entry [i] holds i.
+
 DECLARE_ALIGNED(16, static const int16_t, vp10_default_iscan_32x32[1024]) = {
   0, 2, 5, 10, 17, 25, 38, 47, 62, 83, 101, 121, 145, 170, 193, 204,
   210, 219, 229, 233, 245, 257, 275, 299, 342, 356, 377, 405, 455, 471, 495,
@@ -1908,8 +2983,8 @@
     {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
     {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
     {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
-    {row_scan_4x4,     vp10_row_iscan_4x4,     row_scan_4x4_neighbors},
-    {col_scan_4x4,     vp10_col_iscan_4x4,     col_scan_4x4_neighbors},
+    {mrow_scan_4x4,    vp10_mrow_iscan_4x4,    mrow_scan_4x4_neighbors},
+    {mcol_scan_4x4,    vp10_mcol_iscan_4x4,    mcol_scan_4x4_neighbors},
   }, {  // TX_8X8
     {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
     {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
@@ -1928,8 +3003,8 @@
     {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
     {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
     {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
-    {row_scan_8x8,     vp10_row_iscan_8x8,     row_scan_8x8_neighbors},
-    {col_scan_8x8,     vp10_col_iscan_8x8,     col_scan_8x8_neighbors},
+    {mrow_scan_8x8,    vp10_mrow_iscan_8x8,    mrow_scan_8x8_neighbors},
+    {mcol_scan_8x8,    vp10_mcol_iscan_8x8,    mcol_scan_8x8_neighbors},
   }, {  // TX_16X16
     {default_scan_16x16, vp10_default_iscan_16x16,
      default_scan_16x16_neighbors},
@@ -1965,8 +3040,8 @@
      default_scan_16x16_neighbors},
     {default_scan_16x16, vp10_default_iscan_16x16,
      default_scan_16x16_neighbors},
-     {row_scan_16x16,     vp10_row_iscan_16x16,     row_scan_16x16_neighbors},
-     {col_scan_16x16,     vp10_col_iscan_16x16,     col_scan_16x16_neighbors},
+    {mrow_scan_16x16,  vp10_mrow_iscan_16x16,  mrow_scan_16x16_neighbors},
+    {mcol_scan_16x16,  vp10_mcol_iscan_16x16,  mcol_scan_16x16_neighbors},
   }, {  // TX_32X32
     {default_scan_32x32, vp10_default_iscan_32x32,
      default_scan_32x32_neighbors},
@@ -2002,10 +3077,8 @@
      qtr_scan_32x32_neighbors},
     {default_scan_32x32, vp10_default_iscan_32x32,
      default_scan_32x32_neighbors},
-     {h2_scan_32x32, vp10_h2_iscan_32x32,
-      h2_scan_32x32_neighbors},
-     {v2_scan_32x32, vp10_v2_iscan_32x32,
-      v2_scan_32x32_neighbors},
+    {mrow_scan_32x32,  vp10_mrow_iscan_32x32,  mrow_scan_32x32_neighbors},
+    {mcol_scan_32x32,  vp10_mcol_iscan_32x32,  mcol_scan_32x32_neighbors},
   }
 };
 
diff --git a/vp10/common/thread_common.c b/vp10/common/thread_common.c
index 963eed1..f8bfc89 100644
--- a/vp10/common/thread_common.c
+++ b/vp10/common/thread_common.c
@@ -439,19 +439,13 @@
       for (k = 0; k < 2; k++)
         cm->counts.comp_ref[i][j][k] += counts->comp_ref[i][j][k];
 
-  for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
-    for (j = 0; j < TX_SIZES; j++)
-      cm->counts.tx.p32x32[i][j] += counts->tx.p32x32[i][j];
+  for (i = 0; i < TX_SIZES - 1; ++i)
+    for (j = 0; j < TX_SIZE_CONTEXTS; ++j)
+      for (k = 0; k < i + 2; ++k)
+        cm->counts.tx_size[i][j][k] += counts->tx_size[i][j][k];
 
-    for (j = 0; j < TX_SIZES - 1; j++)
-      cm->counts.tx.p16x16[i][j] += counts->tx.p16x16[i][j];
-
-    for (j = 0; j < TX_SIZES - 2; j++)
-      cm->counts.tx.p8x8[i][j] += counts->tx.p8x8[i][j];
-  }
-
-  for (i = 0; i < TX_SIZES; i++)
-    cm->counts.tx.tx_totals[i] += counts->tx.tx_totals[i];
+  for (i = 0; i < TX_SIZES; ++i)
+    cm->counts.tx_size_totals[i] += counts->tx_size_totals[i];
 
 #if CONFIG_VAR_TX
   for (i = 0; i < TXFM_PARTITION_CONTEXTS; ++i)
diff --git a/vp10/common/vp10_rtcd_defs.pl b/vp10/common/vp10_rtcd_defs.pl
index 1e2ef58..f617ff6 100644
--- a/vp10/common/vp10_rtcd_defs.pl
+++ b/vp10/common/vp10_rtcd_defs.pl
@@ -155,7 +155,7 @@
     specialize qw/vp10_iht8x8_64_add sse2/;
 
     add_proto qw/void vp10_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
-    specialize qw/vp10_iht16x16_256_add/;
+    specialize qw/vp10_iht16x16_256_add sse2/;
 
     add_proto qw/void vp10_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
     specialize qw/vp10_fdct4x4 sse2/;
diff --git a/vp10/decoder/decodeframe.c b/vp10/decoder/decodeframe.c
index 98d2910..320e66e 100644
--- a/vp10/decoder/decodeframe.c
+++ b/vp10/decoder/decodeframe.c
@@ -100,22 +100,6 @@
   return vpx_rb_read_bit(rb) ? TX_MODE_SELECT : vpx_rb_read_literal(rb, 2);
 }
 
-static void read_tx_mode_probs(struct tx_probs *tx_probs, vpx_reader *r) {
-  int i, j;
-
-  for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
-    for (j = 0; j < TX_SIZES - 3; ++j)
-      vp10_diff_update_prob(r, &tx_probs->p8x8[i][j]);
-
-  for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
-    for (j = 0; j < TX_SIZES - 2; ++j)
-      vp10_diff_update_prob(r, &tx_probs->p16x16[i][j]);
-
-  for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
-    for (j = 0; j < TX_SIZES - 1; ++j)
-      vp10_diff_update_prob(r, &tx_probs->p32x32[i][j]);
-}
-
 static void read_switchable_interp_probs(FRAME_CONTEXT *fc, vpx_reader *r) {
   int i, j;
   for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
@@ -292,10 +276,15 @@
     } else {
       if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10)
         memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));
+#if CONFIG_EXT_TX
+      else
+        memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0]));
+#else
       else if (tx_size == TX_32X32 && eob <= 34)
         memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0]));
       else
         memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0]));
+#endif
     }
   }
 }
@@ -3700,8 +3689,13 @@
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate bool decoder 0");
 
-  if (cm->tx_mode == TX_MODE_SELECT)
-    read_tx_mode_probs(&fc->tx_probs, &r);
+  if (cm->tx_mode == TX_MODE_SELECT) {
+    for (i = 0; i < TX_SIZES - 1; ++i)
+      for (j = 0; j < TX_SIZE_CONTEXTS; ++j)
+        for (k = 0; k < i + 1; ++k)
+          vp10_diff_update_prob(&r, &fc->tx_size_probs[i][j][k]);
+  }
+
   read_coef_probs(fc, cm->tx_mode, &r);
 
 #if CONFIG_VAR_TX
@@ -3853,7 +3847,8 @@
                  sizeof(cm->counts.single_ref)));
   assert(!memcmp(cm->counts.comp_ref, zero_counts.comp_ref,
                  sizeof(cm->counts.comp_ref)));
-  assert(!memcmp(&cm->counts.tx, &zero_counts.tx, sizeof(cm->counts.tx)));
+  assert(!memcmp(&cm->counts.tx_size, &zero_counts.tx_size,
+                 sizeof(cm->counts.tx_size)));
   assert(!memcmp(cm->counts.skip, zero_counts.skip, sizeof(cm->counts.skip)));
 #if CONFIG_REF_MV
   assert(!memcmp(&cm->counts.mv[0], &zero_counts.mv[0],
diff --git a/vp10/decoder/decodemv.c b/vp10/decoder/decodemv.c
index a42d08b..4cd6d1d 100644
--- a/vp10/decoder/decodemv.c
+++ b/vp10/decoder/decodemv.c
@@ -273,16 +273,11 @@
                                      TX_SIZE max_tx_size, vpx_reader *r) {
   FRAME_COUNTS *counts = xd->counts;
   const int ctx = get_tx_size_context(xd);
-  const vpx_prob *tx_probs = get_tx_probs(max_tx_size, ctx, &cm->fc->tx_probs);
-  int tx_size = vpx_read(r, tx_probs[0]);
-  if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) {
-    tx_size += vpx_read(r, tx_probs[1]);
-    if (tx_size != TX_8X8 && max_tx_size >= TX_32X32)
-      tx_size += vpx_read(r, tx_probs[2]);
-  }
-
+  const int tx_size_cat = max_tx_size - TX_8X8;
+  int tx_size = vpx_read_tree(r, vp10_tx_size_tree[tx_size_cat],
+                              cm->fc->tx_size_probs[tx_size_cat][ctx]);
   if (counts)
-    ++get_tx_counts(max_tx_size, ctx, &counts->tx)[tx_size];
+    ++counts->tx_size[tx_size_cat][ctx][tx_size];
   return (TX_SIZE)tx_size;
 }
 
@@ -1537,7 +1532,7 @@
                              idy, idx, r);
       if (xd->counts) {
         const int ctx = get_tx_size_context(xd);
-        ++get_tx_counts(max_tx_size, ctx, &xd->counts->tx)[mbmi->tx_size];
+        ++xd->counts->tx_size[max_tx_size - TX_8X8][ctx][mbmi->tx_size];
       }
     } else {
       mbmi->tx_size = read_tx_size(cm, xd, !mbmi->skip || !inter_block, r);
diff --git a/vp10/encoder/bitstream.c b/vp10/encoder/bitstream.c
index ed9d2a9..24a9366 100644
--- a/vp10/encoder/bitstream.c
+++ b/vp10/encoder/bitstream.c
@@ -81,6 +81,13 @@
         {30, 5}, {62, 6}, {126, 7}, {127, 7}},  // 8 colors
 };
 
+static const struct vp10_token  // Indexed [max_tx_size - TX_8X8][tx_size].
+tx_size_encodings[TX_SIZES - 1][TX_SIZES] = {
+    {{0, 1}, {1, 1}},  // Max tx_size is 8X8
+    {{0, 1}, {2, 2}, {3, 2}},  // Max tx_size is 16X16
+    {{0, 1}, {2, 2}, {6, 3}, {7, 3}},  // Max tx_size is 32X32
+};
+
 static INLINE void write_uniform(vpx_writer *w, int n, int v) {
   int l = get_unsigned_bits(n);
   int m = (1 << l) - n;
@@ -314,13 +321,11 @@
   TX_SIZE tx_size = xd->mi[0]->mbmi.tx_size;
   BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
   const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
-  const vpx_prob *const tx_probs = get_tx_probs2(max_tx_size, xd,
-                                                 &cm->fc->tx_probs);
-  vpx_write(w, tx_size != TX_4X4, tx_probs[0]);
-  if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) {
-    vpx_write(w, tx_size != TX_8X8, tx_probs[1]);
-    if (tx_size != TX_8X8 && max_tx_size >= TX_32X32)
-      vpx_write(w, tx_size != TX_16X16, tx_probs[2]);
+  if (max_tx_size > TX_4X4) {
+    vp10_write_token(w, vp10_tx_size_tree[max_tx_size - TX_8X8],
+                     cm->fc->tx_size_probs[max_tx_size - TX_8X8]
+                                          [get_tx_size_context(xd)],
+                     &tx_size_encodings[max_tx_size - TX_8X8][tx_size]);
   }
 }
 
@@ -1871,7 +1876,7 @@
   for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size) {
     vp10_coeff_stats frame_branch_ct[PLANE_TYPES];
     vp10_coeff_probs_model frame_coef_probs[PLANE_TYPES];
-    if (cpi->td.counts->tx.tx_totals[tx_size] <= 20 ||
+    if (cpi->td.counts->tx_size_totals[tx_size] <= 20 ||
         (tx_size >= TX_16X16 && cpi->sf.tx_size_search_method == USE_TX_8X8)) {
       vpx_write_bit(w, 0);
     } else {
@@ -2052,30 +2057,11 @@
                               FRAME_COUNTS *counts) {
   if (cm->tx_mode == TX_MODE_SELECT) {
     int i, j;
-    unsigned int ct_8x8p[TX_SIZES - 3][2];
-    unsigned int ct_16x16p[TX_SIZES - 2][2];
-    unsigned int ct_32x32p[TX_SIZES - 1][2];
-
-
-    for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
-      vp10_tx_counts_to_branch_counts_8x8(counts->tx.p8x8[i], ct_8x8p);
-      for (j = 0; j < TX_SIZES - 3; j++)
-        vp10_cond_prob_diff_update(w, &cm->fc->tx_probs.p8x8[i][j], ct_8x8p[j]);
-    }
-
-    for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
-      vp10_tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i], ct_16x16p);
-      for (j = 0; j < TX_SIZES - 2; j++)
-        vp10_cond_prob_diff_update(w, &cm->fc->tx_probs.p16x16[i][j],
-                                  ct_16x16p[j]);
-    }
-
-    for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
-      vp10_tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i], ct_32x32p);
-      for (j = 0; j < TX_SIZES - 1; j++)
-        vp10_cond_prob_diff_update(w, &cm->fc->tx_probs.p32x32[i][j],
-                                  ct_32x32p[j]);
-    }
+    for (i = 0; i < TX_SIZES - 1; ++i)
+      for (j = 0; j < TX_SIZE_CONTEXTS; ++j)
+        prob_diff_update(vp10_tx_size_tree[i],
+                         cm->fc->tx_size_probs[i][j],
+                         counts->tx_size[i][j], i + 2, w);
   }
 }
 
diff --git a/vp10/encoder/encodeframe.c b/vp10/encoder/encodeframe.c
index bea0157..7f55295 100644
--- a/vp10/encoder/encodeframe.c
+++ b/vp10/encoder/encodeframe.c
@@ -4169,19 +4169,18 @@
       int count8x8_lp = 0, count8x8_8x8p = 0;
       int count16x16_16x16p = 0, count16x16_lp = 0;
       int count32x32 = 0;
-
       for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
-        count4x4 += counts->tx.p32x32[i][TX_4X4];
-        count4x4 += counts->tx.p16x16[i][TX_4X4];
-        count4x4 += counts->tx.p8x8[i][TX_4X4];
+        count4x4 += counts->tx_size[0][i][TX_4X4];
+        count4x4 += counts->tx_size[1][i][TX_4X4];
+        count4x4 += counts->tx_size[2][i][TX_4X4];
 
-        count8x8_lp += counts->tx.p32x32[i][TX_8X8];
-        count8x8_lp += counts->tx.p16x16[i][TX_8X8];
-        count8x8_8x8p += counts->tx.p8x8[i][TX_8X8];
+        count8x8_lp += counts->tx_size[1][i][TX_8X8];
+        count8x8_lp += counts->tx_size[2][i][TX_8X8];
+        count8x8_8x8p += counts->tx_size[0][i][TX_8X8];
 
-        count16x16_16x16p += counts->tx.p16x16[i][TX_16X16];
-        count16x16_lp += counts->tx.p32x32[i][TX_16X16];
-        count32x32 += counts->tx.p32x32[i][TX_32X32];
+        count16x16_16x16p += counts->tx_size[1][i][TX_16X16];
+        count16x16_lp += counts->tx_size[2][i][TX_16X16];
+        count32x32 += counts->tx_size[2][i][TX_32X32];
       }
       if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 &&
 #if CONFIG_SUPERTX
@@ -4546,8 +4545,8 @@
       if (is_inter_block(mbmi))
         tx_partition_count_update(cm, xd, bsize, mi_row, mi_col, td->counts);
 #endif
-      ++get_tx_counts(max_txsize_lookup[bsize], get_tx_size_context(xd),
-                      &td->counts->tx)[mbmi->tx_size];
+      ++td->counts->tx_size[max_txsize_lookup[bsize] - TX_8X8]
+                           [get_tx_size_context(xd)][mbmi->tx_size];
     } else {
       int x, y;
       TX_SIZE tx_size;
@@ -4563,8 +4562,8 @@
           if (mi_col + x < cm->mi_cols && mi_row + y < cm->mi_rows)
             mi_8x8[mis * y + x]->mbmi.tx_size = tx_size;
     }
-    ++td->counts->tx.tx_totals[mbmi->tx_size];
-    ++td->counts->tx.tx_totals[get_uv_tx_size(mbmi, &xd->plane[1])];
+    ++td->counts->tx_size_totals[mbmi->tx_size];
+    ++td->counts->tx_size_totals[get_uv_tx_size(mbmi, &xd->plane[1])];
 #if CONFIG_EXT_TX
     if (get_ext_tx_types(mbmi->tx_size, bsize, is_inter_block(mbmi)) > 1 &&
         cm->base_qindex > 0 && !mbmi->skip &&
diff --git a/vp10/encoder/encoder.h b/vp10/encoder/encoder.h
index 49cac0c..b2c242c 100644
--- a/vp10/encoder/encoder.h
+++ b/vp10/encoder/encoder.h
@@ -516,6 +516,7 @@
                                                  [PALETTE_COLORS];
   int palette_uv_color_cost[PALETTE_MAX_SIZE - 1][PALETTE_COLOR_CONTEXTS]
                                                   [PALETTE_COLORS];
+  int tx_size_cost[TX_SIZES - 1][TX_SIZE_CONTEXTS][TX_SIZES];
 #if CONFIG_EXT_TX
   int inter_tx_type_costs[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES];
   int intra_tx_type_costs[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES]
diff --git a/vp10/encoder/hybrid_fwd_txfm.c b/vp10/encoder/hybrid_fwd_txfm.c
index 029240f..ffc925c 100644
--- a/vp10/encoder/hybrid_fwd_txfm.c
+++ b/vp10/encoder/hybrid_fwd_txfm.c
@@ -134,8 +134,6 @@
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
-      vp10_fht16x16(src_diff, coeff, diff_stride, tx_type);
-      break;
     case DST_DST:
     case DCT_DST:
     case DST_DCT:
@@ -143,8 +141,7 @@
     case ADST_DST:
     case DST_FLIPADST:
     case FLIPADST_DST:
-      // Use C version since DST exists only in C
-      vp10_fht16x16_c(src_diff, coeff, diff_stride, tx_type);
+      vp10_fht16x16(src_diff, coeff, diff_stride, tx_type);
       break;
     case H_DCT:
     case V_DCT:
diff --git a/vp10/encoder/rd.c b/vp10/encoder/rd.c
index 3f60a1b..78e8e9a 100644
--- a/vp10/encoder/rd.c
+++ b/vp10/encoder/rd.c
@@ -104,6 +104,12 @@
                        vp10_default_palette_uv_color_prob[i][j],
                        vp10_palette_color_tree[i]);
     }
+
+  for (i = 0; i < TX_SIZES - 1; ++i)
+    for (j = 0; j < TX_SIZE_CONTEXTS; ++j)
+      vp10_cost_tokens(cpi->tx_size_cost[i][j], fc->tx_size_probs[i][j],
+                       vp10_tx_size_tree[i]);
+
 #if CONFIG_EXT_TX
   for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
     int s;
diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index c65bdf1..1dbac3d 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c
@@ -1247,21 +1247,6 @@
                    mbmi->tx_size, cpi->sf.use_fast_coef_costing);
 }
 
-static INLINE int vp10_cost_tx_size(TX_SIZE tx_size, TX_SIZE max_tx_size,
-                                    const vpx_prob *tx_probs) {
-  int m;
-  int r_tx_size = 0;
-
-  for (m = 0; m <= tx_size - (tx_size == max_tx_size); ++m) {
-    if (m == tx_size)
-      r_tx_size += vp10_cost_zero(tx_probs[m]);
-    else
-      r_tx_size += vp10_cost_one(tx_probs[m]);
-  }
-
-  return r_tx_size;
-}
-
 static void choose_tx_size_from_rd(VP10_COMP *cpi, MACROBLOCK *x,
                                    int *rate,
                                    int64_t *distortion,
@@ -1284,7 +1269,6 @@
   int start_tx, end_tx;
   const int tx_select = cm->tx_mode == TX_MODE_SELECT;
   const int is_inter = is_inter_block(mbmi);
-  const vpx_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc->tx_probs);
   TX_TYPE tx_type, best_tx_type = DCT_DCT;
   int prune = 0;
 #if CONFIG_EXT_TX
@@ -1316,7 +1300,8 @@
   for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) {
     last_rd = INT64_MAX;
     for (n = start_tx; n >= end_tx; --n) {
-      const int r_tx_size = vp10_cost_tx_size(n, max_tx_size, tx_probs);
+      const int r_tx_size =
+          cpi->tx_size_cost[max_tx_size - TX_8X8][get_tx_size_context(xd)][n];
       if (FIXED_TX_TYPE && tx_type != get_default_tx_type(0, xd, 0, n))
           continue;
 #if CONFIG_EXT_TX
@@ -1434,8 +1419,12 @@
 
   mbmi->tx_size = best_tx;
   mbmi->tx_type = best_tx_type;
+
+#if !CONFIG_EXT_TX
   if (mbmi->tx_size >= TX_32X32)
     assert(mbmi->tx_type == DCT_DCT);
+#endif
+
   txfm_rd_in_plane(x,
                    cpi,
                    &r, &d, &s,
@@ -2145,14 +2134,15 @@
       p_angle = mode_to_angle_map[mbmi->mode] +
           mbmi->angle_delta[0] * ANGLE_STEP;
       for (filter = INTRA_FILTER_LINEAR; filter < INTRA_FILTERS; ++filter) {
+        int64_t tmp_best_rd;
         if ((FILTER_FAST_SEARCH || !pick_intra_filter(p_angle)) &&
             filter != INTRA_FILTER_LINEAR)
           continue;
         mic->mbmi.intra_filter = filter;
+        tmp_best_rd = (i == 0 && filter == INTRA_FILTER_LINEAR &&
+            best_rd < INT64_MAX) ? (int64_t)(best_rd * rd_adjust) : best_rd;
         super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
-                        &s, NULL, bsize,
-                        (i == 0 && filter == INTRA_FILTER_LINEAR &&
-                         best_rd < INT64_MAX) ? best_rd * rd_adjust : best_rd);
+                        &s, NULL, bsize, tmp_best_rd);
         if (this_rate_tokenonly == INT_MAX) {
           if (i == 0 && filter == INTRA_FILTER_LINEAR)
             return best_rd;
@@ -2257,7 +2247,7 @@
   return best_rd;
 }
 
-static inline int get_angle_index(double angle) {
+static INLINE int get_angle_index(double angle) {
   const double step = 22.5, base = 45;
   return (int)round((angle - base) / step);
 }
@@ -2388,8 +2378,6 @@
   const PREDICTION_MODE A = vp10_above_block_mode(mic, above_mi, 0);
   const PREDICTION_MODE L = vp10_left_block_mode(mic, left_mi, 0);
   const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
-  const vpx_prob *tx_probs = get_tx_probs2(max_tx_size, xd,
-                                           &cpi->common.fc->tx_probs);
   bmode_costs = cpi->y_mode_costs[A][L];
 
 #if CONFIG_EXT_INTRA
@@ -2466,8 +2454,9 @@
       // tokenonly rate, but for intra blocks, tx_size is always coded
       // (prediction granularity), so we account for it in the full rate,
       // not the tokenonly rate.
-      this_rate_tokenonly -= vp10_cost_tx_size(mic->mbmi.tx_size, max_tx_size,
-                                               tx_probs);
+      this_rate_tokenonly -=
+          cpi->tx_size_cost[max_tx_size - TX_8X8][get_tx_size_context(xd)]
+                                                 [mic->mbmi.tx_size];
     }
     if (cpi->common.allow_screen_content_tools && mode == DC_PRED)
       this_rate +=
@@ -3399,11 +3388,12 @@
     int i, j, best_i = -1;
 
     for (i = 0; i < level1; ++i) {
+      int64_t tmp_best_rd;
       mbmi->angle_delta[1] = deltas_level1[i];
-      if (!super_block_uvrd(cpi, x, &this_rate_tokenonly,
-                            &this_distortion, &s, &this_sse, bsize,
-                            (i == 0 && best_rd < INT64_MAX) ?
-                                best_rd * rd_adjust : best_rd)) {
+      tmp_best_rd = (i == 0 && best_rd < INT64_MAX) ?
+          (int64_t)(best_rd * rd_adjust) : best_rd;
+      if (!super_block_uvrd(cpi, x, &this_rate_tokenonly, &this_distortion,
+                            &s, &this_sse, bsize, tmp_best_rd)) {
         if (i == 0)
           break;
         else
@@ -7146,7 +7136,6 @@
   int64_t mask_filter = 0;
   int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS];
   const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
-  const vpx_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc->tx_probs);
 #if CONFIG_OBMC
 #if CONFIG_VP9_HIGHBITDEPTH
   DECLARE_ALIGNED(16, uint8_t, tmp_buf1[2 * MAX_MB_PLANE * 64 * 64]);
@@ -7702,7 +7691,9 @@
         // tokenonly rate, but for intra blocks, tx_size is always coded
         // (prediction granularity), so we account for it in the full rate,
         // not the tokenonly rate.
-        rate_y -= vp10_cost_tx_size(mbmi->tx_size, max_tx_size, tx_probs);
+        rate_y -=
+            cpi->tx_size_cost[max_tx_size - TX_8X8][get_tx_size_context(xd)]
+                                                   [mbmi->tx_size];
       }
 #if CONFIG_EXT_INTRA
       if (is_directional_mode) {
diff --git a/vp10/encoder/x86/dct_sse2.c b/vp10/encoder/x86/dct_sse2.c
index aaf1e6a..8ff7c9c 100644
--- a/vp10/encoder/x86/dct_sse2.c
+++ b/vp10/encoder/x86/dct_sse2.c
@@ -2420,6 +2420,351 @@
   in[15] = _mm_sub_epi16(kZero, s[1]);
 }
 
+#if CONFIG_EXT_TX
+static void fdst16_8col(__m128i *in) {  // transforms in[0..15] in place
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t) cospi_16_64);
+  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+
+  const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t) -cospi_16_64);
+  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
+  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
+  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
+  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+
+  const __m128i k__cospi_m08_m24 = pair_set_epi16(-cospi_8_64, -cospi_24_64);
+  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+
+  const __m128i k__cospi_m30_p02 = pair_set_epi16(-cospi_30_64, cospi_2_64);
+  const __m128i k__cospi_m14_p18 = pair_set_epi16(-cospi_14_64, cospi_18_64);
+  const __m128i k__cospi_m22_p10 = pair_set_epi16(-cospi_22_64, cospi_10_64);
+  const __m128i k__cospi_m06_p26 = pair_set_epi16(-cospi_6_64, cospi_26_64);
+  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
+  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
+  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
+  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
+
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+  __m128i x0, x1, x2, x3, t0, t1, t2, t3;
+  __m128i y0, y1, y2, y3, y4, y5, y6, y7;
+  __m128i w0, w1, w2, w3, w4, w5, w6, w7;
+
+  //  (1) u[i] = in[i] - in[15-i], v[7-i] = in[i] + in[15-i]; -> in[3,7,11,15]
+  u0 = _mm_sub_epi16(in[0], in[15]);
+  v7 = _mm_add_epi16(in[0], in[15]);
+
+  u1 = _mm_sub_epi16(in[1], in[14]);  // -u1
+  v6 = _mm_add_epi16(in[1], in[14]);  // -v6
+
+  u2 = _mm_sub_epi16(in[2], in[13]);
+  v5 = _mm_add_epi16(in[2], in[13]);
+
+  u3 = _mm_sub_epi16(in[3], in[12]);  // -u3
+  v4 = _mm_add_epi16(in[3], in[12]);  // -v4
+
+  u4 = _mm_sub_epi16(in[4], in[11]);
+  v3 = _mm_add_epi16(in[4], in[11]);
+
+  u5 = _mm_sub_epi16(in[5], in[10]);  // -u5
+  v2 = _mm_add_epi16(in[5], in[10]);  // -v2
+
+  u6 = _mm_sub_epi16(in[6], in[9]);
+  v1 = _mm_add_epi16(in[6], in[9]);
+
+  u7 = _mm_sub_epi16(in[7], in[8]);   // -u7
+  v0 = _mm_add_epi16(in[7], in[8]);   // -v0
+
+  s0 = _mm_sub_epi16(u0, u7);
+  s1 = _mm_sub_epi16(u1, u6);  // -s1
+  s2 = _mm_sub_epi16(u2, u5);
+  s3 = _mm_sub_epi16(u3, u4);  // -s3
+  s4 = _mm_add_epi16(u3, u4);  // -s4
+  s5 = _mm_add_epi16(u2, u5);
+  s6 = _mm_add_epi16(u1, u6);  // -s6
+  s7 = _mm_add_epi16(u0, u7);
+
+  x0 = _mm_sub_epi16(s0, s3);
+  x1 = _mm_sub_epi16(s1, s2);  // -x1
+  x2 = _mm_add_epi16(s1, s2);  // -x2
+  x3 = _mm_add_epi16(s0, s3);
+
+  y0 = _mm_unpacklo_epi16(x0, x1);  // interleave so madd pairs x0 with x1
+  y1 = _mm_unpackhi_epi16(x0, x1);
+  y2 = _mm_unpacklo_epi16(x2, x3);
+  y3 = _mm_unpackhi_epi16(x2, x3);
+
+  t0 = _mm_madd_epi16(y0, k__cospi_p16_m16);
+  t1 = _mm_madd_epi16(y1, k__cospi_p16_m16);
+  t2 = _mm_madd_epi16(y0, k__cospi_p16_p16);
+  t3 = _mm_madd_epi16(y1, k__cospi_p16_p16);
+  x0 = _mm_madd_epi16(y2, k__cospi_m24_p08);
+  x1 = _mm_madd_epi16(y3, k__cospi_m24_p08);
+  x2 = _mm_madd_epi16(y2, k__cospi_p08_p24);
+  x3 = _mm_madd_epi16(y3, k__cospi_p08_p24);
+
+  y0 = _mm_add_epi32(t0, k__DCT_CONST_ROUNDING);  // rounding offset
+  y1 = _mm_add_epi32(t1, k__DCT_CONST_ROUNDING);
+  y2 = _mm_add_epi32(t2, k__DCT_CONST_ROUNDING);
+  y3 = _mm_add_epi32(t3, k__DCT_CONST_ROUNDING);
+  y4 = _mm_add_epi32(x0, k__DCT_CONST_ROUNDING);
+  y5 = _mm_add_epi32(x1, k__DCT_CONST_ROUNDING);
+  y6 = _mm_add_epi32(x2, k__DCT_CONST_ROUNDING);
+  y7 = _mm_add_epi32(x3, k__DCT_CONST_ROUNDING);
+
+  t0 = _mm_srai_epi32(y0, DCT_CONST_BITS);  // arithmetic shift after rounding
+  t1 = _mm_srai_epi32(y1, DCT_CONST_BITS);
+  t2 = _mm_srai_epi32(y2, DCT_CONST_BITS);
+  t3 = _mm_srai_epi32(y3, DCT_CONST_BITS);
+  x0 = _mm_srai_epi32(y4, DCT_CONST_BITS);
+  x1 = _mm_srai_epi32(y5, DCT_CONST_BITS);
+  x2 = _mm_srai_epi32(y6, DCT_CONST_BITS);
+  x3 = _mm_srai_epi32(y7, DCT_CONST_BITS);
+
+  in[15] = _mm_packs_epi32(t0, t1);  // saturating pack to 16-bit lanes
+  in[11] = _mm_packs_epi32(x0, x1);
+  in[7] = _mm_packs_epi32(t2, t3);
+  in[3] = _mm_packs_epi32(x2, x3);
+
+  //  (2) s5/s6 scaled by +-cospi_16_64, merged with s4/s7 -> in[1,5,9,13]
+  t0 = _mm_unpacklo_epi16(s6, s5);
+  t1 = _mm_unpackhi_epi16(s6, s5);
+
+  y0 = _mm_madd_epi16(t0, k__cospi_m16_m16);
+  y1 = _mm_madd_epi16(t1, k__cospi_m16_m16);
+  y2 = _mm_madd_epi16(t0, k__cospi_m16_p16);
+  y3 = _mm_madd_epi16(t1, k__cospi_m16_p16);
+
+  x0 = _mm_add_epi32(y0, k__DCT_CONST_ROUNDING);
+  x1 = _mm_add_epi32(y1, k__DCT_CONST_ROUNDING);
+  x2 = _mm_add_epi32(y2, k__DCT_CONST_ROUNDING);
+  x3 = _mm_add_epi32(y3, k__DCT_CONST_ROUNDING);
+
+  y4 = _mm_srai_epi32(x0, DCT_CONST_BITS);
+  y5 = _mm_srai_epi32(x1, DCT_CONST_BITS);
+  y6 = _mm_srai_epi32(x2, DCT_CONST_BITS);
+  y7 = _mm_srai_epi32(x3, DCT_CONST_BITS);
+
+  t2 = _mm_packs_epi32(y4, y5);
+  t3 = _mm_packs_epi32(y6, y7);
+
+  x0 = _mm_sub_epi16(s4, t2);  // -x0
+  x1 = _mm_add_epi16(s4, t2);  // -x1
+  x2 = _mm_sub_epi16(s7, t3);
+  x3 = _mm_add_epi16(s7, t3);
+
+  y0 = _mm_unpacklo_epi16(x0, x3);
+  y1 = _mm_unpackhi_epi16(x0, x3);
+  y2 = _mm_unpacklo_epi16(x1, x2);
+  y3 = _mm_unpackhi_epi16(x1, x2);
+
+  w0 = _mm_madd_epi16(y0, k__cospi_m28_p04);
+  w1 = _mm_madd_epi16(y1, k__cospi_m28_p04);
+  w2 = _mm_madd_epi16(y2, k__cospi_m12_p20);
+  w3 = _mm_madd_epi16(y3, k__cospi_m12_p20);
+  w4 = _mm_madd_epi16(y2, k__cospi_p20_p12);
+  w5 = _mm_madd_epi16(y3, k__cospi_p20_p12);
+  w6 = _mm_madd_epi16(y0, k__cospi_p04_p28);
+  w7 = _mm_madd_epi16(y1, k__cospi_p04_p28);
+
+  u0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
+  u1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
+  u2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
+  u3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
+  u4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
+  u5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
+  u6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
+  u7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
+
+  y0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+  y1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+  y2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+  y3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+  y4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
+  y5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
+  y6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
+  y7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
+
+  in[13] = _mm_packs_epi32(y0, y1);
+  in[9] = _mm_packs_epi32(y4, y5);
+  in[5] = _mm_packs_epi32(y2, y3);
+  in[1] = _mm_packs_epi32(y6, y7);
+
+  //  (3) From the v sums: writes even outputs in[0], in[2], ..., in[14]
+  y0 = _mm_unpacklo_epi16(v5, v2);
+  y1 = _mm_unpackhi_epi16(v5, v2);
+  y2 = _mm_unpacklo_epi16(v4, v3);
+  y3 = _mm_unpackhi_epi16(v4, v3);
+
+  u0 = _mm_madd_epi16(y0, k__cospi_p16_p16);
+  u1 = _mm_madd_epi16(y1, k__cospi_p16_p16);
+  u2 = _mm_madd_epi16(y2, k__cospi_m16_m16);
+  u3 = _mm_madd_epi16(y3, k__cospi_m16_m16);
+  u4 = _mm_madd_epi16(y2, k__cospi_m16_p16);
+  u5 = _mm_madd_epi16(y3, k__cospi_m16_p16);
+  u6 = _mm_madd_epi16(y0, k__cospi_p16_m16);
+  u7 = _mm_madd_epi16(y1, k__cospi_p16_m16);
+
+  w0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+  w1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+  w2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+  w3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+  w4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+  w5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+  w6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+  w7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+
+  s0 = _mm_srai_epi32(w0, DCT_CONST_BITS);
+  s1 = _mm_srai_epi32(w1, DCT_CONST_BITS);
+  s2 = _mm_srai_epi32(w2, DCT_CONST_BITS);
+  s3 = _mm_srai_epi32(w3, DCT_CONST_BITS);
+  s4 = _mm_srai_epi32(w4, DCT_CONST_BITS);
+  s5 = _mm_srai_epi32(w5, DCT_CONST_BITS);
+  s6 = _mm_srai_epi32(w6, DCT_CONST_BITS);
+  s7 = _mm_srai_epi32(w7, DCT_CONST_BITS);
+
+  y2 = _mm_packs_epi32(s0, s1);
+  y3 = _mm_packs_epi32(s2, s3);
+  y4 = _mm_packs_epi32(s4, s5);
+  y5 = _mm_packs_epi32(s6, s7);
+
+  //  step 3
+  w0 = _mm_sub_epi16(v0, y3);  // -w0
+  w1 = _mm_add_epi16(v1, y2);
+  w2 = _mm_sub_epi16(v1, y2);
+  w3 = _mm_add_epi16(v0, y3);  // -w3
+  w4 = _mm_sub_epi16(v7, y4);
+  w5 = _mm_add_epi16(v6, y5);  // -w5
+  w6 = _mm_sub_epi16(v6, y5);  // -w6
+  w7 = _mm_add_epi16(v7, y4);
+
+  //  step 4
+  x0 = _mm_unpacklo_epi16(w1, w6);
+  x1 = _mm_unpackhi_epi16(w1, w6);
+  x2 = _mm_unpacklo_epi16(w2, w5);
+  x3 = _mm_unpackhi_epi16(w2, w5);
+
+  u0 = _mm_madd_epi16(x0, k__cospi_m08_m24);
+  u1 = _mm_madd_epi16(x1, k__cospi_m08_m24);
+  u2 = _mm_madd_epi16(x2, k__cospi_p24_m08);
+  u3 = _mm_madd_epi16(x3, k__cospi_p24_m08);
+  u4 = _mm_madd_epi16(x2, k__cospi_p08_p24);
+  u5 = _mm_madd_epi16(x3, k__cospi_p08_p24);
+  u6 = _mm_madd_epi16(x0, k__cospi_p24_m08);
+  u7 = _mm_madd_epi16(x1, k__cospi_p24_m08);
+
+  s0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+  s1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+  s2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+  s3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+  s4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+  s5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+  s6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+  s7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+
+  u0 = _mm_srai_epi32(s0, DCT_CONST_BITS);
+  u1 = _mm_srai_epi32(s1, DCT_CONST_BITS);
+  u2 = _mm_srai_epi32(s2, DCT_CONST_BITS);
+  u3 = _mm_srai_epi32(s3, DCT_CONST_BITS);
+  u4 = _mm_srai_epi32(s4, DCT_CONST_BITS);
+  u5 = _mm_srai_epi32(s5, DCT_CONST_BITS);
+  u6 = _mm_srai_epi32(s6, DCT_CONST_BITS);
+  u7 = _mm_srai_epi32(s7, DCT_CONST_BITS);
+
+  y1 = _mm_packs_epi32(u0, u1);
+  y2 = _mm_packs_epi32(u2, u3);
+  y5 = _mm_packs_epi32(u4, u5);
+  y6 = _mm_packs_epi32(u6, u7);
+
+  //  step 5
+  v0 = _mm_sub_epi16(w0, y1);  // -v0
+  v1 = _mm_add_epi16(w0, y1);  // -v1
+  v2 = _mm_sub_epi16(w3, y2);  // -v2
+  v3 = _mm_add_epi16(w3, y2);  // -v3
+  v4 = _mm_sub_epi16(w4, y5);
+  v5 = _mm_add_epi16(w4, y5);
+  v6 = _mm_sub_epi16(w7, y6);
+  v7 = _mm_add_epi16(w7, y6);
+
+  u0 = _mm_unpacklo_epi16(v0, v7);
+  u1 = _mm_unpackhi_epi16(v0, v7);
+  u2 = _mm_unpacklo_epi16(v1, v6);
+  u3 = _mm_unpackhi_epi16(v1, v6);
+  u4 = _mm_unpacklo_epi16(v2, v5);
+  u5 = _mm_unpackhi_epi16(v2, v5);
+  u6 = _mm_unpacklo_epi16(v3, v4);
+  u7 = _mm_unpackhi_epi16(v3, v4);
+
+  s0 = _mm_madd_epi16(u0, k__cospi_m30_p02);  // x0
+  s1 = _mm_madd_epi16(u1, k__cospi_m30_p02);
+  s2 = _mm_madd_epi16(u2, k__cospi_m14_p18);  // x1
+  s3 = _mm_madd_epi16(u3, k__cospi_m14_p18);
+  s4 = _mm_madd_epi16(u4, k__cospi_m22_p10);  // x2
+  s5 = _mm_madd_epi16(u5, k__cospi_m22_p10);
+  s6 = _mm_madd_epi16(u6, k__cospi_m06_p26);  // x3
+  s7 = _mm_madd_epi16(u7, k__cospi_m06_p26);
+
+  w0 = _mm_madd_epi16(u6, k__cospi_p26_p06);  // x4
+  w1 = _mm_madd_epi16(u7, k__cospi_p26_p06);
+  w2 = _mm_madd_epi16(u4, k__cospi_p10_p22);  // x5
+  w3 = _mm_madd_epi16(u5, k__cospi_p10_p22);
+  w4 = _mm_madd_epi16(u2, k__cospi_p18_p14);  // x6
+  w5 = _mm_madd_epi16(u3, k__cospi_p18_p14);
+  w6 = _mm_madd_epi16(u0, k__cospi_p02_p30);  // x7
+  w7 = _mm_madd_epi16(u1, k__cospi_p02_p30);
+
+  v0 = _mm_add_epi32(s0, k__DCT_CONST_ROUNDING);
+  v1 = _mm_add_epi32(s1, k__DCT_CONST_ROUNDING);
+  v2 = _mm_add_epi32(s2, k__DCT_CONST_ROUNDING);
+  v3 = _mm_add_epi32(s3, k__DCT_CONST_ROUNDING);
+  v4 = _mm_add_epi32(s4, k__DCT_CONST_ROUNDING);
+  v5 = _mm_add_epi32(s5, k__DCT_CONST_ROUNDING);
+  v6 = _mm_add_epi32(s6, k__DCT_CONST_ROUNDING);
+  v7 = _mm_add_epi32(s7, k__DCT_CONST_ROUNDING);
+
+  y0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
+  y1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
+  y2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
+  y3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
+  y4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
+  y5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
+  y6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
+  y7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
+
+  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+
+  s0 = _mm_srai_epi32(y0, DCT_CONST_BITS);
+  s1 = _mm_srai_epi32(y1, DCT_CONST_BITS);
+  s2 = _mm_srai_epi32(y2, DCT_CONST_BITS);
+  s3 = _mm_srai_epi32(y3, DCT_CONST_BITS);
+  s4 = _mm_srai_epi32(y4, DCT_CONST_BITS);
+  s5 = _mm_srai_epi32(y5, DCT_CONST_BITS);
+  s6 = _mm_srai_epi32(y6, DCT_CONST_BITS);
+  s7 = _mm_srai_epi32(y7, DCT_CONST_BITS);
+
+  in[14] = _mm_packs_epi32(u0, u1);
+  in[6] = _mm_packs_epi32(u2, u3);
+  in[10] = _mm_packs_epi32(u4, u5);
+  in[2] = _mm_packs_epi32(u6, u7);
+  in[12] = _mm_packs_epi32(s0, s1);
+  in[4] = _mm_packs_epi32(s2, s3);
+  in[8] = _mm_packs_epi32(s4, s5);
+  in[0] = _mm_packs_epi32(s6, s7);
+}
+#endif  // CONFIG_EXT_TX
+
 static void fdct16_sse2(__m128i *in0, __m128i *in1) {
   fdct16_8col(in0);
   fdct16_8col(in1);
@@ -2432,6 +2777,14 @@
   array_transpose_16x16(in0, in1);
 }
 
+#if CONFIG_EXT_TX
+static void fdst16_sse2(__m128i *in0, __m128i *in1) {
+  fdst16_8col(in0);  // in-place 16-point pass on each half
+  fdst16_8col(in1);
+  array_transpose_16x16(in0, in1);  // transpose across both halves
+}
+#endif  // CONFIG_EXT_TX
+
 void vp10_fht16x16_sse2(const int16_t *input, tran_low_t *output,
                        int stride, int tx_type) {
   __m128i in0[16], in1[16];
@@ -2497,6 +2850,55 @@
       fadst16_sse2(in0, in1);
       write_buffer_16x16(output, in0, in1, 16);
       break;
+    case DST_DST:
+      load_buffer_16x16(input, in0, in1, stride, 0, 0);
+      fdst16_sse2(in0, in1);
+      right_shift_16x16(in0, in1);
+      fdst16_sse2(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
+    case DCT_DST:
+      load_buffer_16x16(input, in0, in1, stride, 0, 0);
+      fdct16_sse2(in0, in1);
+      right_shift_16x16(in0, in1);
+      fdst16_sse2(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
+    case DST_DCT:
+      load_buffer_16x16(input, in0, in1, stride, 0, 0);
+      fdst16_sse2(in0, in1);
+      right_shift_16x16(in0, in1);
+      fdct16_sse2(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
+    case DST_ADST:
+      load_buffer_16x16(input, in0, in1, stride, 0, 0);
+      fdst16_sse2(in0, in1);
+      right_shift_16x16(in0, in1);
+      fadst16_sse2(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
+    case ADST_DST:
+      load_buffer_16x16(input, in0, in1, stride, 0, 0);
+      fadst16_sse2(in0, in1);
+      right_shift_16x16(in0, in1);
+      fdst16_sse2(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
+    case DST_FLIPADST:
+      load_buffer_16x16(input, in0, in1, stride, 0, 1);
+      fdst16_sse2(in0, in1);
+      right_shift_16x16(in0, in1);
+      fadst16_sse2(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
+    case FLIPADST_DST:
+      load_buffer_16x16(input, in0, in1, stride, 1, 0);
+      fadst16_sse2(in0, in1);
+      right_shift_16x16(in0, in1);
+      fdst16_sse2(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
 #endif  // CONFIG_EXT_TX
     default:
       assert(0);