Rectangular transforms 4x8 & 8x4

Added a new expt rect-tx to be used in conjunction with ext-tx.
[rect-tx is a temporary config flag and will eventually be
merged into ext-tx once it works correctly with all other
experiments].

Added 4x8 and 8x4 transforms for use initially with rectangular
sub8x8 y blocks as part of this experiment.

There is about a -0.2% BDRATE improvement on lowres, others pending.

When var-tx is on, rectangular transforms are currently not used.
That will be enabled in a subsequent patch.

Change-Id: Iaf3f88ede2740ffe6a0ffb1ef5fc01a16cd0283a
diff --git a/configure b/configure
index cf6a7c3..473d35e 100755
--- a/configure
+++ b/configure
@@ -267,6 +267,7 @@
     fp_mb_stats
     emulate_hardware
     var_tx
+    rect_tx
     ref_mv
     dual_filter
     ext_tx
diff --git a/vp10/common/blockd.c b/vp10/common/blockd.c
index 5ca5c05..6062917 100644
--- a/vp10/common/blockd.c
+++ b/vp10/common/blockd.c
@@ -53,7 +53,9 @@
   const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
   const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
   const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
-  const int step = 1 << (tx_size << 1);
+  const uint8_t num_4x4_tw = num_4x4_blocks_wide_txsize_lookup[tx_size];
+  const uint8_t num_4x4_th = num_4x4_blocks_high_txsize_lookup[tx_size];
+  const int step = num_4x4_tw * num_4x4_th;
   int i = 0, r, c;
 
   // If mb_to_right_edge is < 0 we are in a situation in which
@@ -63,13 +65,15 @@
       xd->mb_to_right_edge >> (5 + pd->subsampling_x));
   const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 :
       xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
-  const int extra_step = ((num_4x4_w - max_blocks_wide) >> tx_size) * step;
+  const int extra_step =
+      ((num_4x4_w - max_blocks_wide) >>
+       num_4x4_blocks_wide_txsize_log2_lookup[tx_size]) * step;
 
   // Keep track of the row and column of the blocks we use so that we know
   // if we are in the unrestricted motion border.
-  for (r = 0; r < max_blocks_high; r += (1 << tx_size)) {
+  for (r = 0; r < max_blocks_high; r += num_4x4_th) {
     // Skip visiting the sub blocks that are wholly within the UMV.
-    for (c = 0; c < max_blocks_wide; c += (1 << tx_size)) {
+    for (c = 0; c < max_blocks_wide; c += num_4x4_tw) {
       visit(plane, i, r, c, plane_bsize, tx_size, arg);
       i += step;
     }
@@ -82,33 +86,33 @@
                                    foreach_transformed_block_visitor visit,
                                    void *arg) {
   int plane;
-
   for (plane = 0; plane < MAX_MB_PLANE; ++plane)
     vp10_foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg);
 }
 
 void vp10_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
-                      BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob,
-                      int aoff, int loff) {
+                       BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob,
+                       int aoff, int loff) {
   ENTROPY_CONTEXT *const a = pd->above_context + aoff;
   ENTROPY_CONTEXT *const l = pd->left_context + loff;
-  const int tx_size_in_blocks = 1 << tx_size;
+  const int tx_w_in_blocks = num_4x4_blocks_wide_txsize_lookup[tx_size];
+  const int tx_h_in_blocks = num_4x4_blocks_high_txsize_lookup[tx_size];
 
   // above
   if (has_eob && xd->mb_to_right_edge < 0) {
     int i;
     const int blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize] +
                             (xd->mb_to_right_edge >> (5 + pd->subsampling_x));
-    int above_contexts = tx_size_in_blocks;
+    int above_contexts = tx_w_in_blocks;
     if (above_contexts + aoff > blocks_wide)
       above_contexts = blocks_wide - aoff;
 
     for (i = 0; i < above_contexts; ++i)
       a[i] = has_eob;
-    for (i = above_contexts; i < tx_size_in_blocks; ++i)
+    for (i = above_contexts; i < tx_w_in_blocks; ++i)
       a[i] = 0;
   } else {
-    memset(a, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
+    memset(a, has_eob, sizeof(ENTROPY_CONTEXT) * tx_w_in_blocks);
   }
 
   // left
@@ -116,16 +120,16 @@
     int i;
     const int blocks_high = num_4x4_blocks_high_lookup[plane_bsize] +
                             (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
-    int left_contexts = tx_size_in_blocks;
+    int left_contexts = tx_h_in_blocks;
     if (left_contexts + loff > blocks_high)
       left_contexts = blocks_high - loff;
 
     for (i = 0; i < left_contexts; ++i)
       l[i] = has_eob;
-    for (i = left_contexts; i < tx_size_in_blocks; ++i)
+    for (i = left_contexts; i < tx_h_in_blocks; ++i)
       l[i] = 0;
   } else {
-    memset(l, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
+    memset(l, has_eob, sizeof(ENTROPY_CONTEXT) * tx_h_in_blocks);
   }
 }
 
diff --git a/vp10/common/blockd.h b/vp10/common/blockd.h
index 4c46cbb..399fefe 100644
--- a/vp10/common/blockd.h
+++ b/vp10/common/blockd.h
@@ -422,6 +422,18 @@
 }
 #endif  // CONFIG_SUPERTX
 
+static INLINE int get_tx1d_width(TX_SIZE tx_size) {
+  return num_4x4_blocks_wide_txsize_lookup[tx_size] << 2;
+}
+
+static INLINE int get_tx1d_height(TX_SIZE tx_size) {
+  return num_4x4_blocks_high_txsize_lookup[tx_size] << 2;
+}
+
+static INLINE int get_tx2d_size(TX_SIZE tx_size) {
+  return num_4x4_blocks_txsize_lookup[tx_size] << 4;
+}
+
 #if CONFIG_EXT_TX
 #define ALLOW_INTRA_EXT_TX          1
 // whether masked transforms are used for 32X32
@@ -438,6 +450,7 @@
 #if EXT_TX_SIZES == 4
 static INLINE int get_ext_tx_set(TX_SIZE tx_size, BLOCK_SIZE bs,
                                  int is_inter) {
+  tx_size = txsize_sqr_map[tx_size];
   if (tx_size > TX_32X32 || bs < BLOCK_8X8) return 0;
 #if USE_REDUCED_TXSET_FOR_16X16
   if (tx_size == TX_32X32)
@@ -468,6 +481,7 @@
 static INLINE int get_ext_tx_set(TX_SIZE tx_size, BLOCK_SIZE bs,
                                  int is_inter) {
   (void) is_inter;
+  tx_size = txsize_sqr_map[tx_size];
   if (tx_size > TX_32X32 || bs < BLOCK_8X8) return 0;
   if (tx_size == TX_32X32) return 0;
 #if USE_REDUCED_TXSET_FOR_16X16
@@ -622,10 +636,11 @@
 
 #if CONFIG_EXT_TX
 #if EXT_TX_SIZES == 4
-  if (xd->lossless[mbmi->segment_id] || tx_size > TX_32X32 ||
-      (tx_size >= TX_32X32 && !is_inter_block(mbmi)))
+  if (xd->lossless[mbmi->segment_id] ||
+      txsize_sqr_map[tx_size] > TX_32X32 ||
+      (txsize_sqr_map[tx_size] >= TX_32X32 && !is_inter_block(mbmi)))
 #else
-  if (xd->lossless[mbmi->segment_id] || tx_size >= TX_32X32)
+  if (xd->lossless[mbmi->segment_id] || txsize_sqr_map[tx_size] >= TX_32X32)
 #endif
     return DCT_DCT;
   if (mbmi->sb_type >= BLOCK_8X8) {
@@ -637,8 +652,8 @@
     }
     if (is_inter_block(mbmi))
       // UV Inter only
-      return (mbmi->tx_type == IDTX && tx_size == TX_32X32 ?
-              DCT_DCT : mbmi->tx_type);
+      return (mbmi->tx_type == IDTX && txsize_sqr_map[tx_size] == TX_32X32) ?
+              DCT_DCT : mbmi->tx_type;
   }
 
   // Sub8x8-Inter/Intra OR UV-Intra
@@ -647,10 +662,10 @@
   else  // Sub8x8 Intra OR UV-Intra
     return intra_mode_to_tx_type_context[plane_type == PLANE_TYPE_Y ?
         get_y_mode(mi, block_idx) : mbmi->uv_mode];
-#else
+#else   // CONFIG_EXT_TX
   (void) block_idx;
   if (plane_type != PLANE_TYPE_Y || xd->lossless[mbmi->segment_id] ||
-      tx_size >= TX_32X32)
+      txsize_sqr_map[tx_size] >= TX_32X32)
     return DCT_DCT;
   return mbmi->tx_type;
 #endif  // CONFIG_EXT_TX
diff --git a/vp10/common/common_data.h b/vp10/common/common_data.h
index 44ebff2..2506986 100644
--- a/vp10/common/common_data.h
+++ b/vp10/common/common_data.h
@@ -50,6 +50,46 @@
 static const uint8_t num_16x16_blocks_high_lookup[BLOCK_SIZES] =
   {1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 4, 2, 4, IF_EXT_PARTITION(8, 4, 8)};
 
+static const uint8_t num_4x4_blocks_txsize_lookup[TX_SIZES_ALL] = {
+  1, 4, 16, 64,
+#if CONFIG_EXT_TX
+  2, 2
+#endif  // CONFIG_EXT_TX
+};
+static const uint8_t num_4x4_blocks_wide_txsize_lookup[TX_SIZES_ALL] = {
+  1, 2, 4, 8,
+#if CONFIG_EXT_TX
+  1, 2
+#endif  // CONFIG_EXT_TX
+};
+static const uint8_t num_4x4_blocks_high_txsize_lookup[TX_SIZES_ALL] = {
+  1, 2, 4, 8,
+#if CONFIG_EXT_TX
+  2, 1
+#endif  // CONFIG_EXT_TX
+};
+
+static const uint8_t num_4x4_blocks_txsize_log2_lookup[TX_SIZES_ALL] = {
+  0, 2, 4, 6,
+#if CONFIG_EXT_TX
+  1, 1
+#endif  // CONFIG_EXT_TX
+};
+static const uint8_t num_4x4_blocks_wide_txsize_log2_lookup
+    [TX_SIZES_ALL] = {
+  0, 1, 2, 3,
+#if CONFIG_EXT_TX
+  0, 1
+#endif  // CONFIG_EXT_TX
+};
+static const uint8_t num_4x4_blocks_high_txsize_log2_lookup
+    [TX_SIZES_ALL] = {
+  0, 1, 2, 3,
+#if CONFIG_EXT_TX
+  1, 0
+#endif  // CONFIG_EXT_TX
+};
+
 // VPXMIN(3, VPXMIN(b_width_log2(bsize), b_height_log2(bsize)))
 static const uint8_t size_group_lookup[BLOCK_SIZES] =
   {0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, IF_EXT_PARTITION(3, 3, 3)};
@@ -297,12 +337,58 @@
 #endif  // CONFIG_EXT_PARTITION
 };
 
-static const BLOCK_SIZE txsize_to_bsize[TX_SIZES] = {
-    BLOCK_4X4,  // TX_4X4
-    BLOCK_8X8,  // TX_8X8
-    BLOCK_16X16,  // TX_16X16
-    BLOCK_32X32,  // TX_32X32
+#if CONFIG_EXT_TX
+static const TX_SIZE max_txsize_rect_lookup[BLOCK_SIZES] = {
+  //                   4X4
+                       TX_4X4,
+  // 4X8,    8X4,      8X8
+  TX_4X8,    TX_8X4,   TX_8X8,
+  // 8X16,   16X8,     16X16
+  TX_8X8,    TX_8X8,   TX_16X16,
+  // 16X32,  32X16,    32X32
+  TX_16X16,  TX_16X16, TX_32X32,
+  // 32X64,  64X32,    64X64
+  TX_32X32,  TX_32X32, TX_32X32,
+#if CONFIG_EXT_PARTITION
+  // 64x128, 128x64,   128x128
+  TX_32X32,  TX_32X32, TX_32X32,
+#endif  // CONFIG_EXT_PARTITION
 };
+#endif  // CONFIG_EXT_TX
+
+static const BLOCK_SIZE txsize_to_bsize[TX_SIZES_ALL] = {
+  BLOCK_4X4,    // TX_4X4
+  BLOCK_8X8,    // TX_8X8
+  BLOCK_16X16,  // TX_16X16
+  BLOCK_32X32,  // TX_32X32
+#if CONFIG_EXT_TX
+  BLOCK_4X8,    // TX_4X8
+  BLOCK_8X4,    // TX_8X4
+#endif  // CONFIG_EXT_TX
+};
+
+static const TX_SIZE txsize_sqr_map[TX_SIZES_ALL] = {
+  TX_4X4,    // TX_4X4
+  TX_8X8,    // TX_8X8
+  TX_16X16,  // TX_16X16
+  TX_32X32,  // TX_32X32
+#if CONFIG_EXT_TX
+  TX_4X4,    // TX_4X8
+  TX_4X4,    // TX_8X4
+#endif  // CONFIG_EXT_TX
+};
+
+static const TX_SIZE txsize_sqr_up_map[TX_SIZES_ALL] = {
+  TX_4X4,    // TX_4X4
+  TX_8X8,    // TX_8X8
+  TX_16X16,  // TX_16X16
+  TX_32X32,  // TX_32X32
+#if CONFIG_EXT_TX
+  TX_8X8,    // TX_4X8
+  TX_8X8,    // TX_8X4
+#endif  // CONFIG_EXT_TX
+};
+
 
 static const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = {
   TX_4X4,  // ONLY_4X4
diff --git a/vp10/common/entropy.c b/vp10/common/entropy.c
index eea552c..1ce801a 100644
--- a/vp10/common/entropy.c
+++ b/vp10/common/entropy.c
@@ -56,11 +56,33 @@
 };
 #endif
 
+const uint16_t band_count_table[TX_SIZES_ALL][8] = {
+  { 1, 2, 3, 4,  3,   16 - 13, 0 },
+  { 1, 2, 3, 4, 11,   64 - 21, 0 },
+  { 1, 2, 3, 4, 11,  256 - 21, 0 },
+  { 1, 2, 3, 4, 11, 1024 - 21, 0 },
+#if CONFIG_EXT_TX
+  { 1, 2, 3, 4,  8,   32 - 18, 0 },
+  { 1, 2, 3, 4,  8,   32 - 18, 0 },
+#endif  // CONFIG_EXT_TX
+};
+
+const uint16_t band_cum_count_table[TX_SIZES_ALL][8] = {
+  { 0, 1, 3, 6, 10, 13, 16, 0 },
+  { 0, 1, 3, 6, 10, 21, 64, 0 },
+  { 0, 1, 3, 6, 10, 21, 256, 0 },
+  { 0, 1, 3, 6, 10, 21, 1024, 0 },
+#if CONFIG_EXT_TX
+  { 0, 1, 3, 6, 10, 18, 32, 0 },
+  { 0, 1, 3, 6, 10, 18, 32, 0 },
+#endif  // CONFIG_EXT_TX
+};
+
 const uint8_t vp10_coefband_trans_8x8plus[1024] = {
   0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4,
   4, 4, 4, 4, 4, 5,
   // beyond MAXBAND_INDEX+1 all values are filled as 5
-                    5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
@@ -125,6 +147,13 @@
   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 };
 
+#if CONFIG_EXT_TX
+const uint8_t vp10_coefband_trans_8x4_4x8[32] = {
+  0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4,
+  4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+};
+#endif  // CONFIG_EXT_TX
+
 const uint8_t vp10_coefband_trans_4x4[16] = {
   0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5,
 };
diff --git a/vp10/common/entropy.h b/vp10/common/entropy.h
index d0ca880..baaa515 100644
--- a/vp10/common/entropy.h
+++ b/vp10/common/entropy.h
@@ -155,11 +155,28 @@
 #define MAXBAND_INDEX 21
 
 DECLARE_ALIGNED(16, extern const uint8_t, vp10_coefband_trans_8x8plus[1024]);
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, extern const uint8_t, vp10_coefband_trans_8x4_4x8[32]);
+#endif  // CONFIG_EXT_TX
 DECLARE_ALIGNED(16, extern const uint8_t, vp10_coefband_trans_4x4[16]);
 
+DECLARE_ALIGNED(16, extern const uint16_t,
+                band_count_table[TX_SIZES_ALL][8]);
+DECLARE_ALIGNED(16, extern const uint16_t,
+                band_cum_count_table[TX_SIZES_ALL][8]);
+
 static INLINE const uint8_t *get_band_translate(TX_SIZE tx_size) {
-  return tx_size == TX_4X4 ? vp10_coefband_trans_4x4
-                           : vp10_coefband_trans_8x8plus;
+  switch (tx_size) {
+    case TX_4X4:
+      return vp10_coefband_trans_4x4;
+#if CONFIG_EXT_TX
+    case TX_4X8:
+    case TX_8X4:
+      return vp10_coefband_trans_8x4_4x8;
+#endif  // CONFIG_EXT_TX
+    default:
+      return vp10_coefband_trans_8x8plus;
+  }
 }
 
 // 128 lists of probabilities are stored for the following ONE node probs:
@@ -198,7 +215,8 @@
   return (a != 0) + (b != 0);
 }
 
-static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
+static INLINE int get_entropy_context(TX_SIZE tx_size,
+                                      const ENTROPY_CONTEXT *a,
                                       const ENTROPY_CONTEXT *l) {
   ENTROPY_CONTEXT above_ec = 0, left_ec = 0;
 
@@ -207,6 +225,16 @@
       above_ec = a[0] != 0;
       left_ec = l[0] != 0;
       break;
+#if CONFIG_EXT_TX
+    case TX_4X8:
+      above_ec = a[0] != 0;
+      left_ec = !!*(const uint16_t *)l;
+      break;
+    case TX_8X4:
+      above_ec = !!*(const uint16_t *)a;
+      left_ec = l[0] != 0;
+      break;
+#endif  // CONFIG_EXT_TX
     case TX_8X8:
       above_ec = !!*(const uint16_t *)a;
       left_ec  = !!*(const uint16_t *)l;
@@ -223,7 +251,6 @@
       assert(0 && "Invalid transform size.");
       break;
   }
-
   return combine_entropy_contexts(above_ec, left_ec);
 }
 
diff --git a/vp10/common/enums.h b/vp10/common/enums.h
index d1ce121..a93becc 100644
--- a/vp10/common/enums.h
+++ b/vp10/common/enums.h
@@ -137,6 +137,14 @@
 #define TX_32X32 ((TX_SIZE)3)   // 32x32 transform
 #define TX_SIZES ((TX_SIZE)4)
 
+#if CONFIG_EXT_TX
+#define TX_4X8   ((TX_SIZE)4)      // 4x8 transform
+#define TX_8X4   ((TX_SIZE)5)      // 8x4 transform
+#define TX_SIZES_ALL ((TX_SIZE)6)  // Includes rectangular transforms
+#else
+#define TX_SIZES_ALL ((TX_SIZE)4)
+#endif  // CONFIG_EXT_TX
+
 #define MAX_TX_SIZE_LOG2  5
 #define MAX_TX_SIZE       (1 << MAX_TX_SIZE_LOG2)
 #define MIN_TX_SIZE_LOG2  2
@@ -170,10 +178,10 @@
 } TX_TYPE_1D;
 
 typedef enum {
-  DCT_DCT   = 0,                      // DCT  in both horizontal and vertical
-  ADST_DCT  = 1,                      // ADST in vertical, DCT in horizontal
-  DCT_ADST  = 2,                      // DCT  in vertical, ADST in horizontal
-  ADST_ADST = 3,                      // ADST in both directions
+  DCT_DCT   = 0,                  // DCT  in both horizontal and vertical
+  ADST_DCT  = 1,                  // ADST in vertical, DCT in horizontal
+  DCT_ADST  = 2,                  // DCT  in vertical, ADST in horizontal
+  ADST_ADST = 3,                  // ADST in both directions
 #if CONFIG_EXT_TX
   FLIPADST_DCT = 4,
   DCT_FLIPADST = 5,
diff --git a/vp10/common/idct.c b/vp10/common/idct.c
index 1a573bd..9b70857 100644
--- a/vp10/common/idct.c
+++ b/vp10/common/idct.c
@@ -144,7 +144,7 @@
 
 static void maybe_flip_strides(uint8_t **dst, int *dstride,
                                tran_low_t **src, int *sstride,
-                               int tx_type, int size) {
+                               int tx_type, int sizey, int sizex) {
   // Note that the transpose of src will be added to dst. In order to LR
   // flip the addends (in dst coordinates), we UD flip the src. To UD flip
   // the addends, we UD flip the dst.
@@ -163,19 +163,19 @@
     case FLIPADST_ADST:
     case V_FLIPADST:
       // flip UD
-      FLIPUD_PTR(*dst, *dstride, size);
+      FLIPUD_PTR(*dst, *dstride, sizey);
       break;
     case DCT_FLIPADST:
     case ADST_FLIPADST:
     case H_FLIPADST:
       // flip LR
-      FLIPUD_PTR(*src, *sstride, size);
+      FLIPUD_PTR(*src, *sstride, sizex);
       break;
     case FLIPADST_FLIPADST:
       // flip UD
-      FLIPUD_PTR(*dst, *dstride, size);
+      FLIPUD_PTR(*dst, *dstride, sizey);
       // flip LR
-      FLIPUD_PTR(*src, *sstride, size);
+      FLIPUD_PTR(*src, *sstride, sizex);
       break;
     default:
       assert(0);
@@ -445,7 +445,7 @@
 
 static void maybe_flip_strides16(uint16_t **dst, int *dstride,
                                  tran_low_t **src, int *sstride,
-                                 int tx_type, int size) {
+                                 int tx_type, int sizey, int sizex) {
   // Note that the transpose of src will be added to dst. In order to LR
   // flip the addends (in dst coordinates), we UD flip the src. To UD flip
   // the addends, we UD flip the dst.
@@ -464,19 +464,19 @@
     case FLIPADST_ADST:
     case V_FLIPADST:
       // flip UD
-      FLIPUD_PTR(*dst, *dstride, size);
+      FLIPUD_PTR(*dst, *dstride, sizey);
       break;
     case DCT_FLIPADST:
     case ADST_FLIPADST:
     case H_FLIPADST:
       // flip LR
-      FLIPUD_PTR(*src, *sstride, size);
+      FLIPUD_PTR(*src, *sstride, sizex);
       break;
     case FLIPADST_FLIPADST:
       // flip UD
-      FLIPUD_PTR(*dst, *dstride, size);
+      FLIPUD_PTR(*dst, *dstride, sizey);
       // flip LR
-      FLIPUD_PTR(*src, *sstride, size);
+      FLIPUD_PTR(*src, *sstride, sizex);
       break;
     default:
       assert(0);
@@ -536,7 +536,7 @@
   }
 
 #if CONFIG_EXT_TX
-  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 4);
+  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 4, 4);
 #endif
 
   // Sum with the destination
@@ -549,6 +549,116 @@
   }
 }
 
+void vp10_iht4x8_32_add_c(const tran_low_t *input, uint8_t *dest, int stride,
+                          int tx_type) {
+  static const transform_2d IHT_4x8[] = {
+    { idct8_c,  idct4_c  },  // DCT_DCT
+    { iadst8_c, idct4_c  },  // ADST_DCT
+    { idct8_c,  iadst4_c },  // DCT_ADST
+    { iadst8_c, iadst4_c },  // ADST_ADST
+#if CONFIG_EXT_TX
+    { iadst8_c, idct4_c  },  // FLIPADST_DCT
+    { idct8_c,  iadst4_c },  // DCT_FLIPADST
+    { iadst8_c, iadst4_c },  // FLIPADST_FLIPADST
+    { iadst8_c, iadst4_c },  // ADST_FLIPADST
+    { iadst8_c, iadst4_c },  // FLIPADST_ADST
+    { iidtx8_c, iidtx4_c },  // IDTX
+    { idct8_c,  iidtx4_c },  // V_DCT
+    { iidtx8_c, idct4_c  },  // H_DCT
+    { iadst8_c, iidtx4_c },  // V_ADST
+    { iidtx8_c, iadst4_c },  // H_ADST
+    { iadst8_c, iidtx4_c },  // V_FLIPADST
+    { iidtx8_c, iadst4_c },  // H_FLIPADST
+#endif  // CONFIG_EXT_TX
+  };
+
+  int i, j;
+  tran_low_t out[4][8], outtmp[4];
+  tran_low_t *outp = &out[0][0];
+  int outstride = 8;
+
+  // inverse transform row vectors and transpose
+  for (i = 0; i < 8; ++i) {
+    IHT_4x8[tx_type].rows(input, outtmp);
+    for (j = 0; j < 4; ++j)
+      out[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
+    input  += 4;
+  }
+
+  // inverse transform column vectors
+  for (i = 0; i < 4; ++i) {
+    IHT_4x8[tx_type].cols(out[i], out[i]);
+  }
+
+#if CONFIG_EXT_TX
+  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 8, 4);
+#endif
+
+  // Sum with the destination
+  for (i = 0; i < 8; ++i) {
+    for (j = 0; j < 4; ++j) {
+      int d = i * stride + j;
+      int s = j * outstride + i;
+      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
+    }
+  }
+}
+
+void vp10_iht8x4_32_add_c(const tran_low_t *input, uint8_t *dest, int stride,
+                          int tx_type) {
+  static const transform_2d IHT_8x4[] = {
+    { idct4_c,  idct8_c  },  // DCT_DCT
+    { iadst4_c, idct8_c  },  // ADST_DCT
+    { idct4_c,  iadst8_c },  // DCT_ADST
+    { iadst4_c, iadst8_c },  // ADST_ADST
+#if CONFIG_EXT_TX
+    { iadst4_c, idct8_c  },  // FLIPADST_DCT
+    { idct4_c,  iadst8_c },  // DCT_FLIPADST
+    { iadst4_c, iadst8_c },  // FLIPADST_FLIPADST
+    { iadst4_c, iadst8_c },  // ADST_FLIPADST
+    { iadst4_c, iadst8_c },  // FLIPADST_ADST
+    { iidtx4_c, iidtx8_c },  // IDTX
+    { idct4_c,  iidtx8_c },  // V_DCT
+    { iidtx4_c, idct8_c  },  // H_DCT
+    { iadst4_c, iidtx8_c },  // V_ADST
+    { iidtx4_c, iadst8_c },  // H_ADST
+    { iadst4_c, iidtx8_c },  // V_FLIPADST
+    { iidtx4_c, iadst8_c },  // H_FLIPADST
+#endif  // CONFIG_EXT_TX
+  };
+
+  int i, j;
+  tran_low_t out[8][4], outtmp[8];
+  tran_low_t *outp = &out[0][0];
+  int outstride = 4;
+
+  // inverse transform row vectors and transpose
+  for (i = 0; i < 4; ++i) {
+    IHT_8x4[tx_type].rows(input, outtmp);
+    for (j = 0; j < 8; ++j)
+      out[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
+    input  += 8;
+  }
+
+  // inverse transform column vectors
+  for (i = 0; i < 8; ++i) {
+    IHT_8x4[tx_type].cols(out[i], out[i]);
+  }
+
+#if CONFIG_EXT_TX
+  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 4, 8);
+#endif
+
+  // Sum with the destination
+  for (i = 0; i < 4; ++i) {
+    for (j = 0; j < 8; ++j) {
+      int d = i * stride + j;
+      int s = j * outstride + i;
+      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
+    }
+  }
+}
+
 void vp10_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                          int tx_type) {
   static const transform_2d IHT_8[] = {
@@ -599,7 +709,7 @@
   }
 
 #if CONFIG_EXT_TX
-  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 8);
+  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 8, 8);
 #endif
 
   // Sum with the destination
@@ -662,7 +772,7 @@
   }
 
 #if CONFIG_EXT_TX
-  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 16);
+  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 16, 16);
 #endif
 
   // Sum with the destination
@@ -723,7 +833,7 @@
     IHT_32[tx_type].cols(out[i], out[i]);
   }
 
-  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 32);
+  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 32, 32);
 
   // Sum with the destination
   for (i = 0; i < 32; ++i) {
@@ -840,6 +950,20 @@
   }
 }
 
+#if CONFIG_EXT_TX
+void vp10_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest,
+                           int stride, int eob, TX_TYPE tx_type) {
+  (void) eob;
+  vp10_iht8x4_32_add(input, dest, stride, tx_type);
+}
+
+void vp10_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest,
+                           int stride, int eob, TX_TYPE tx_type) {
+  (void) eob;
+  vp10_iht4x8_32_add(input, dest, stride, tx_type);
+}
+#endif  // CONFIG_EXT_TX
+
 void vp10_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest,
                            int stride, int eob, TX_TYPE tx_type) {
   switch (tx_type) {
@@ -1002,7 +1126,7 @@
   }
 
 #if CONFIG_EXT_TX
-  maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 4);
+  maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 4, 4);
 #endif
 
   // Sum with the destination
@@ -1016,6 +1140,118 @@
   }
 }
 
+#if CONFIG_EXT_TX
+void vp10_highbd_iht4x8_32_add_c(const tran_low_t *input, uint8_t *dest8,
+                                int stride, int tx_type, int bd) {
+  static const highbd_transform_2d HIGH_IHT_4x8[] = {
+    { vpx_highbd_idct8_c,  vpx_highbd_idct4_c  },  // DCT_DCT
+    { vpx_highbd_iadst8_c, vpx_highbd_idct4_c  },  // ADST_DCT
+    { vpx_highbd_idct8_c,  vpx_highbd_iadst4_c },  // DCT_ADST
+    { vpx_highbd_iadst8_c, vpx_highbd_iadst4_c },  // ADST_ADST
+    { vpx_highbd_iadst8_c, vpx_highbd_idct4_c  },  // FLIPADST_DCT
+    { vpx_highbd_idct8_c,  vpx_highbd_iadst4_c },  // DCT_FLIPADST
+    { vpx_highbd_iadst8_c, vpx_highbd_iadst4_c },  // FLIPADST_FLIPADST
+    { vpx_highbd_iadst8_c, vpx_highbd_iadst4_c },  // ADST_FLIPADST
+    { vpx_highbd_iadst8_c, vpx_highbd_iadst4_c },  // FLIPADST_ADST
+    {     highbd_iidtx8_c,     highbd_iidtx4_c },  // IDTX
+    { vpx_highbd_idct8_c,      highbd_iidtx4_c },  // V_DCT
+    {     highbd_iidtx8_c, vpx_highbd_idct4_c  },  // H_DCT
+    { vpx_highbd_iadst8_c,     highbd_iidtx4_c },  // V_ADST
+    {     highbd_iidtx8_c, vpx_highbd_iadst4_c },  // H_ADST
+    { vpx_highbd_iadst8_c,     highbd_iidtx4_c },  // V_FLIPADST
+    {     highbd_iidtx8_c, vpx_highbd_iadst4_c },  // H_FLIPADST
+  };
+
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  int i, j;
+  tran_low_t out[4][8], outtmp[4];
+  tran_low_t *outp = &out[0][0];
+  int outstride = 8;
+
+  // inverse transform row vectors, and transpose
+  for (i = 0; i < 8; ++i) {
+    HIGH_IHT_4x8[tx_type].rows(input, outtmp, bd);
+    for (j = 0; j < 4; ++j)
+      out[j][i] = (tran_low_t)highbd_dct_const_round_shift(outtmp[j] * Sqrt2,
+                                                           bd);
+    input  += 4;
+  }
+
+  // inverse transform column vectors
+  for (i = 0; i < 4; ++i) {
+    HIGH_IHT_4x8[tx_type].cols(out[i], out[i], bd);
+  }
+
+  maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 8, 4);
+
+  // Sum with the destination
+  for (i = 0; i < 8; ++i) {
+    for (j = 0; j < 4; ++j) {
+      int d = i * stride + j;
+      int s = j * outstride + i;
+      dest[d] = highbd_clip_pixel_add(dest[d],
+                                      ROUND_POWER_OF_TWO(outp[s], 5), bd);
+    }
+  }
+}
+
+void vp10_highbd_iht8x4_32_add_c(const tran_low_t *input, uint8_t *dest8,
+                                int stride, int tx_type, int bd) {
+  static const highbd_transform_2d HIGH_IHT_8x4[] = {
+    { vpx_highbd_idct4_c,  vpx_highbd_idct8_c  },  // DCT_DCT
+    { vpx_highbd_iadst4_c, vpx_highbd_idct8_c  },  // ADST_DCT
+    { vpx_highbd_idct4_c,  vpx_highbd_iadst8_c },  // DCT_ADST
+    { vpx_highbd_iadst4_c, vpx_highbd_iadst8_c },  // ADST_ADST
+    { vpx_highbd_iadst4_c, vpx_highbd_idct8_c  },  // FLIPADST_DCT
+    { vpx_highbd_idct4_c,  vpx_highbd_iadst8_c },  // DCT_FLIPADST
+    { vpx_highbd_iadst4_c, vpx_highbd_iadst8_c },  // FLIPADST_FLIPADST
+    { vpx_highbd_iadst4_c, vpx_highbd_iadst8_c },  // ADST_FLIPADST
+    { vpx_highbd_iadst4_c, vpx_highbd_iadst8_c },  // FLIPADST_ADST
+    {     highbd_iidtx4_c,     highbd_iidtx8_c },  // IDTX
+    { vpx_highbd_idct4_c,      highbd_iidtx8_c },  // V_DCT
+    {     highbd_iidtx4_c, vpx_highbd_idct8_c  },  // H_DCT
+    { vpx_highbd_iadst4_c,     highbd_iidtx8_c },  // V_ADST
+    {     highbd_iidtx4_c, vpx_highbd_iadst8_c },  // H_ADST
+    { vpx_highbd_iadst4_c,     highbd_iidtx8_c },  // V_FLIPADST
+    {     highbd_iidtx4_c, vpx_highbd_iadst8_c },  // H_FLIPADST
+  };
+
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  int i, j;
+  tran_low_t out[8][4], outtmp[8];
+  tran_low_t *outp = &out[0][0];
+  int outstride = 4;
+
+  // inverse transform row vectors, and transpose
+  for (i = 0; i < 4; ++i) {
+    HIGH_IHT_8x4[tx_type].rows(input, outtmp, bd);
+    for (j = 0; j < 8; ++j)
+      out[j][i] = (tran_low_t)highbd_dct_const_round_shift(outtmp[j] * Sqrt2,
+                                                           bd);
+    input  += 8;
+  }
+
+  // inverse transform column vectors
+  for (i = 0; i < 8; ++i) {
+    HIGH_IHT_8x4[tx_type].cols(out[i], out[i], bd);
+  }
+
+  maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 4, 8);
+
+  // Sum with the destination
+  for (i = 0; i < 4; ++i) {
+    for (j = 0; j < 8; ++j) {
+      int d = i * stride + j;
+      int s = j * outstride + i;
+      dest[d] = highbd_clip_pixel_add(dest[d],
+                                      ROUND_POWER_OF_TWO(outp[s], 5), bd);
+    }
+  }
+}
+#endif  // CONFIG_EXT_TX
+
 void vp10_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int tx_type, int bd) {
   static const highbd_transform_2d HIGH_IHT_8[] = {
@@ -1068,7 +1304,7 @@
   }
 
 #if CONFIG_EXT_TX
-  maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 8);
+  maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 8, 8);
 #endif
 
   // Sum with the destination
@@ -1134,7 +1370,7 @@
   }
 
 #if CONFIG_EXT_TX
-  maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 16);
+  maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 16, 16);
 #endif
 
   // Sum with the destination
@@ -1198,7 +1434,7 @@
     HIGH_IHT_32[tx_type].cols(out[i], out[i], bd);
   }
 
-  maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 32);
+  maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 32, 32);
 
   // Sum with the destination
   for (i = 0; i < 32; ++i) {
@@ -1320,6 +1556,22 @@
   }
 }
 
+#if CONFIG_EXT_TX
+void vp10_highbd_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest,
+                                  int stride, int eob, int bd,
+                                  TX_TYPE tx_type) {
+  (void) eob;
+  vp10_highbd_iht8x4_32_add_c(input, dest, stride, tx_type, bd);
+}
+
+void vp10_highbd_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest,
+                                  int stride, int eob, int bd,
+                                  TX_TYPE tx_type) {
+  (void) eob;
+  vp10_highbd_iht4x8_32_add_c(input, dest, stride, tx_type, bd);
+}
+#endif  // CONFIG_EXT_TX
+
 void vp10_highbd_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest,
                                   int stride, int eob, int bd,
                                   TX_TYPE tx_type) {
@@ -1454,6 +1706,14 @@
     case TX_8X8:
       vp10_inv_txfm_add_8x8(input, dest, stride, eob, tx_type);
       break;
+#if CONFIG_EXT_TX
+    case TX_4X8:
+      vp10_inv_txfm_add_4x8(input, dest, stride, eob, tx_type);
+      break;
+    case TX_8X4:
+      vp10_inv_txfm_add_8x4(input, dest, stride, eob, tx_type);
+      break;
+#endif  // CONFIG_EXT_TX
     case TX_4X4:
       // this is like vp10_short_idct4x4 but has a special case around eob<=1
       // which is significant (not just an optimization) for the lossless
@@ -1486,6 +1746,14 @@
     case TX_8X8:
       vp10_highbd_inv_txfm_add_8x8(input, dest, stride, eob, bd, tx_type);
       break;
+#if CONFIG_EXT_TX
+    case TX_4X8:
+      vp10_highbd_inv_txfm_add_4x8(input, dest, stride, eob, bd, tx_type);
+      break;
+    case TX_8X4:
+      vp10_highbd_inv_txfm_add_8x4(input, dest, stride, eob, bd, tx_type);
+      break;
+#endif  // CONFIG_EXT_TX
     case TX_4X4:
       // this is like vp10_short_idct4x4 but has a special case around eob<=1
       // which is significant (not just an optimization) for the lossless
diff --git a/vp10/common/idct.h b/vp10/common/idct.h
index 5d52314..f20a154 100644
--- a/vp10/common/idct.h
+++ b/vp10/common/idct.h
@@ -66,6 +66,12 @@
 
 void vp10_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest,
                            int stride, int eob, TX_TYPE tx_type, int lossless);
+#if CONFIG_EXT_TX
+void vp10_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest,
+                           int stride, int eob, TX_TYPE tx_type);
+void vp10_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest,
+                           int stride, int eob, TX_TYPE tx_type);
+#endif  // CONFIG_EXT_TX
 void vp10_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest,
                            int stride, int eob, TX_TYPE tx_type);
 void vp10_inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest,
@@ -88,6 +94,12 @@
 void vp10_highbd_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest,
                                   int stride, int eob, int bd, TX_TYPE tx_type,
                                   int lossless);
+#if CONFIG_EXT_TX
+void vp10_highbd_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest,
+                                  int stride, int eob, int bd, TX_TYPE tx_type);
+void vp10_highbd_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest,
+                                  int stride, int eob, int bd, TX_TYPE tx_type);
+#endif  // CONFIG_EXT_TX
 void vp10_highbd_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest,
                                   int stride, int eob, int bd, TX_TYPE tx_type);
 void vp10_highbd_inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest,
diff --git a/vp10/common/loopfilter.c b/vp10/common/loopfilter.c
index 55715d7..59446c2 100644
--- a/vp10/common/loopfilter.c
+++ b/vp10/common/loopfilter.c
@@ -722,8 +722,11 @@
                         LOOP_FILTER_MASK *lfm) {
   const MB_MODE_INFO *mbmi = &mi->mbmi;
   const BLOCK_SIZE block_size = mbmi->sb_type;
-  const TX_SIZE tx_size_y = mbmi->tx_size;
-  const TX_SIZE tx_size_uv = get_uv_tx_size_impl(tx_size_y, block_size, 1, 1);
+  // TODO(debargha): Check if masks can be set up correctly when
+  // rectangular transforms are used with the EXT_TX expt.
+  const TX_SIZE tx_size_y = txsize_sqr_map[mbmi->tx_size];
+  const TX_SIZE tx_size_uv =
+      get_uv_tx_size_impl(mbmi->tx_size, block_size, 1, 1);
   const int filter_level = get_filter_level(lfi_n, mbmi);
   uint64_t *const left_y = &lfm->left_y[tx_size_y];
   uint64_t *const above_y = &lfm->above_y[tx_size_y];
@@ -803,7 +806,7 @@
 #endif  // CONFIG_SUPERTX
                          LOOP_FILTER_MASK *lfm) {
   const MB_MODE_INFO *mbmi = &mi->mbmi;
-  const TX_SIZE tx_size_y = mbmi->tx_size;
+  const TX_SIZE tx_size_y = txsize_sqr_map[mbmi->tx_size];
 #if CONFIG_SUPERTX
   const BLOCK_SIZE block_size =
       supertx_enabled ? (BLOCK_SIZE)(3 * tx_size_y) : mbmi->sb_type;
@@ -1267,8 +1270,8 @@
       const int skip_border_4x4_c = ss_x && mi_col + c == cm->mi_cols - 1;
       const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
 
-      TX_SIZE tx_size_c = tx_size;
-      TX_SIZE tx_size_r = tx_size;
+      TX_SIZE tx_size_c = num_4x4_blocks_wide_txsize_log2_lookup[tx_size];
+      TX_SIZE tx_size_r = num_4x4_blocks_high_txsize_log2_lookup[tx_size];
 
       int tx_size_mask = 0;
       // Filter level can vary per MI
diff --git a/vp10/common/pred_common.h b/vp10/common/pred_common.h
index d4ae980..9b73eb2 100644
--- a/vp10/common/pred_common.h
+++ b/vp10/common/pred_common.h
@@ -177,10 +177,11 @@
   const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
   const int has_above = xd->up_available;
   const int has_left = xd->left_available;
-  int above_ctx = (has_above && !above_mbmi->skip) ? (int)above_mbmi->tx_size
-                                                   : max_tx_size;
-  int left_ctx = (has_left && !left_mbmi->skip) ? (int)left_mbmi->tx_size
-                                                : max_tx_size;
+  int above_ctx = (has_above && !above_mbmi->skip) ?
+      (int)txsize_sqr_map[above_mbmi->tx_size] : max_tx_size;
+  int left_ctx = (has_left && !left_mbmi->skip) ?
+      (int)txsize_sqr_map[left_mbmi->tx_size] : max_tx_size;
+  assert(xd->mi[0]->mbmi.sb_type >= BLOCK_8X8);
   if (!has_left)
     left_ctx = above_ctx;
 
diff --git a/vp10/common/reconintra.c b/vp10/common/reconintra.c
index 89ff13b..fe98373 100644
--- a/vp10/common/reconintra.c
+++ b/vp10/common/reconintra.c
@@ -673,7 +673,7 @@
                          INTRA_FILTER filter_type) {
   const int dx = (int)dr_intra_derivative[angle][0];
   const int dy = (int)dr_intra_derivative[angle][1];
-  const int bs = 4 << tx_size;
+  const int bs = 4 * num_4x4_blocks_wide_txsize_lookup[tx_size];
   assert(angle > 0 && angle < 270);
 
   if (angle > 0 && angle < 90) {
@@ -1159,7 +1159,7 @@
   DECLARE_ALIGNED(16, uint16_t, above_data[MAX_SB_SIZE + 16]);
   uint16_t *above_row = above_data + 16;
   const uint16_t *const_above_row = above_row;
-  const int bs = 4 << tx_size;
+  const int bs = 4 * num_4x4_blocks_wide_txsize_lookup[tx_size];
   int need_left = extend_modes[mode] & NEED_LEFT;
   int need_above = extend_modes[mode] & NEED_ABOVE;
   const uint16_t *above_ref = ref - ref_stride;
@@ -1331,7 +1331,7 @@
   DECLARE_ALIGNED(16, uint8_t, above_data[MAX_SB_SIZE + 16]);
   uint8_t *above_row = above_data + 16;
   const uint8_t *const_above_row = above_row;
-  const int bs = 4 << tx_size;
+  const int bs = 4 * num_4x4_blocks_wide_txsize_lookup[tx_size];
   int need_left = extend_modes[mode] & NEED_LEFT;
   int need_above = extend_modes[mode] & NEED_ABOVE;
 #if CONFIG_EXT_INTRA
@@ -1491,7 +1491,7 @@
                               const uint8_t *ref, int ref_stride,
                               uint8_t *dst, int dst_stride,
                               int col_off, int row_off, int plane) {
-  const int txw = (1 << tx_size);
+  const int txw = num_4x4_blocks_wide_txsize_lookup[tx_size];
   const int have_top = row_off || xd->up_available;
   const int have_left = col_off || xd->left_available;
   const int x = col_off * 4;
@@ -1531,7 +1531,7 @@
       (hpx - y - txpx);
 
   if (xd->mi[0]->mbmi.palette_mode_info.palette_size[plane != 0] > 0) {
-    const int bs = 4 * (1 << tx_size);
+    const int bs = 4 * num_4x4_blocks_wide_txsize_lookup[tx_size];
     const int stride = 4 * (1 << bwl_in);
     int r, c;
     uint8_t *map = NULL;
diff --git a/vp10/common/scan.c b/vp10/common/scan.c
index 8cfeb97..4c176d3 100644
--- a/vp10/common/scan.c
+++ b/vp10/common/scan.c
@@ -49,6 +49,50 @@
   13, 11, 14, 15,
 };
 
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t, default_scan_4x8[32]) = {
+  0, 1, 4, 5, 2, 8, 6, 9,
+  10,  3, 12,  7, 13, 11, 14, 16,
+  17, 15, 18, 19, 20, 21, 22, 23,
+  24, 25, 26, 27, 28, 29, 30, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x8[32]) = {
+  0,  4,  8, 12, 16, 20, 24, 28,
+  1,  5,  9, 13, 17, 21, 25, 29,
+  2,  6, 10, 14, 18, 22, 26, 30,
+  3,  7, 11, 15, 19, 23, 27, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_4x8[32]) = {
+  0, 1, 2, 3, 4, 5, 6, 7,
+  8,  9, 10, 11, 12, 13, 14, 15,
+  16, 17, 18, 19, 20, 21, 22, 23,
+  24, 25, 26, 27, 28, 29, 30, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_8x4[32]) = {
+  0,  1,  8,  9,  2, 16, 10, 17,
+  18,  3, 24, 11, 25, 19, 26,  4,
+  12, 27, 20,  5, 28, 13, 21, 29,
+  6, 14, 22, 30,  7, 15, 23, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x4[32]) = {
+  0,  8, 16, 24,  1,  9, 17, 25,
+  2, 10, 18, 26,  3, 11, 19, 27,
+  4, 12, 20, 28,  5, 13, 21, 29,
+  6, 14, 22, 30,  7, 15, 23, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x4[32]) = {
+  0, 1, 2, 3, 4, 5, 6, 7,
+  8,  9, 10, 11, 12, 13, 14, 15,
+  16, 17, 18, 19, 20, 21, 22, 23,
+  24, 25, 26, 27, 28, 29, 30, 31,
+};
+#endif  // CONFIG_EXT_TX
+
 DECLARE_ALIGNED(16, static const int16_t, default_scan_8x8[64]) = {
   0,   8,  1, 16,  9,  2, 17, 24,
   10,  3, 18, 25, 32, 11,  4, 26,
@@ -824,6 +868,86 @@
   9,  12,   7,  10,  10,  13,  11,  14,   0,   0,
 };
 
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t,
+                default_scan_4x8_neighbors[33 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 0, 0, 1, 4,
+  1, 1, 4, 4, 2, 5, 5, 8,
+  6, 9, 2, 2, 8, 8, 3, 6,
+  9, 12,  7, 10, 10, 13, 12, 12,
+  13, 16, 11, 14, 14, 17, 15, 18,
+  16, 16, 17, 20, 18, 21, 19, 22,
+  20, 20, 21, 24, 22, 25, 23, 26,
+  24, 24, 25, 28, 26, 29, 27, 30,
+  0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                mcol_scan_4x8_neighbors[33 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 4, 4, 8, 8,
+  12, 12, 16, 16, 20, 20, 24, 24,
+  0,  0,  1,  4,  5,  8,  9, 12,
+  13, 16, 17, 20, 21, 24, 25, 28,
+  1,  1,  2,  5,  6,  9, 10, 13,
+  14, 17, 18, 21, 22, 25, 26, 29,
+  2,  2,  3,  6,  7, 10, 11, 14,
+  15, 18, 19, 22, 23, 26, 27, 30,
+  0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                mrow_scan_4x8_neighbors[33 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 1, 1, 2, 2,
+  0, 0, 1, 4, 2, 5, 3, 6,
+  4,  4,  5,  8,  6,  9,  7, 10,
+  8,  8,  9, 12, 10, 13, 11, 14,
+  12, 12, 13, 16, 14, 17, 15, 18,
+  16, 16, 17, 20, 18, 21, 19, 22,
+  20, 20, 21, 24, 22, 25, 23, 26,
+  24, 24, 25, 28, 26, 29, 27, 30,
+  0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                default_scan_8x4_neighbors[33 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 0, 0, 1, 8,
+  1,  1,  8,  8,  2,  9,  9, 16,
+  10, 17,  2,  2, 16, 16,  3, 10,
+  17, 24, 11, 18, 18, 25,  3,  3,
+  4, 11, 19, 26, 12, 19,  4,  4,
+  20, 27,  5, 12, 13, 20, 21, 28,
+  5,  5,  6, 13, 14, 21, 22, 29,
+  6,  6,  7, 14, 15, 22, 23, 30,
+  0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                mcol_scan_8x4_neighbors[33 * MAX_NEIGHBORS]) = {
+  0,  0,  0,  0,  8,  8, 16, 16,
+  0,  0,  1,  8,  9, 16, 17, 24,
+  1,  1,  2,  9, 10, 17, 18, 25,
+  2,  2,  3, 10, 11, 18, 19, 26,
+  3,  3,  4, 11, 12, 19, 20, 27,
+  4,  4,  5, 12, 13, 20, 21, 28,
+  5,  5,  6, 13, 14, 21, 22, 29,
+  6,  6,  7, 14, 15, 22, 23, 30,
+  0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                mrow_scan_8x4_neighbors[33 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 1, 1, 2, 2,
+  3, 3, 4, 4, 5, 5, 6, 6,
+  0,  0,  1,  8,  2,  9,  3, 10,
+  4, 11,  5, 12,  6, 13,  7, 14,
+  8,  8,  9, 16, 10, 17, 11, 18,
+  12, 19, 13, 20, 14, 21, 15, 22,
+  16, 16, 17, 24, 18, 25, 19, 26,
+  20, 27, 21, 28, 22, 29, 23, 30,
+  0, 0
+};
+#endif  // CONFIG_EXT_TX
+
 DECLARE_ALIGNED(16, static const int16_t,
                 col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
   0,   0,   0,   0,   8,   8,   8,   0,  16,  16,   1,   8,
@@ -2259,6 +2383,50 @@
 };
 
 #if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t, vp10_default_iscan_4x8[32]) = {
+  0,  1,  4,  9,  2,  3,  6, 11,
+  5,  7,  8, 13, 10, 12, 14, 17,
+  15, 16, 18, 19, 20, 21, 22, 23,
+  24, 25, 26, 27, 28, 29, 30, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, vp10_mcol_iscan_4x8[32]) = {
+  0,  8, 16, 24,  1,  9, 17, 25,
+  2, 10, 18, 26,  3, 11, 19, 27,
+  4, 12, 20, 28,  5, 13, 21, 29,
+  6, 14, 22, 30,  7, 15, 23, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, vp10_mrow_iscan_4x8[32]) = {
+  0, 1, 2, 3, 4, 5, 6, 7,
+  8,  9, 10, 11, 12, 13, 14, 15,
+  16, 17, 18, 19, 20, 21, 22, 23,
+  24, 25, 26, 27, 28, 29, 30, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, vp10_default_iscan_8x4[32]) = {
+  0,  1,  4,  9, 15, 19, 24, 28,
+  2,  3,  6, 11, 16, 21, 25, 29,
+  5,  7,  8, 13, 18, 22, 26, 30,
+  10, 12, 14, 17, 20, 23, 27, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, vp10_mcol_iscan_8x4[32]) = {
+  0,  4,  8, 12, 16, 20, 24, 28,
+  1,  5,  9, 13, 17, 21, 25, 29,
+  2,  6, 10, 14, 18, 22, 26, 30,
+  3,  7, 11, 15, 19, 23, 27, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, vp10_mrow_iscan_8x4[32]) = {
+  0, 1, 2, 3, 4, 5, 6, 7,
+  8,  9, 10, 11, 12, 13, 14, 15,
+  16, 17, 18, 19, 20, 21, 22, 23,
+  24, 25, 26, 27, 28, 29, 30, 31,
+};
+#endif  // CONFIG_EXT_TX
+
+#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t, vp10_mcol_iscan_8x8[64]) = {
   0, 8, 16, 24, 32, 40, 48, 56, 1, 9, 17, 25, 33, 41, 49, 57, 2, 10,
   18, 26, 34, 42, 50, 58, 3, 11, 19, 27, 35, 43, 51, 59, 4, 12, 20,
@@ -2943,13 +3111,6 @@
 };
 #endif  // CONFIG_EXT_TX
 
-const scan_order vp10_default_scan_orders[TX_SIZES] = {
-  {default_scan_4x4,   vp10_default_iscan_4x4,   default_scan_4x4_neighbors},
-  {default_scan_8x8,   vp10_default_iscan_8x8,   default_scan_8x8_neighbors},
-  {default_scan_16x16, vp10_default_iscan_16x16, default_scan_16x16_neighbors},
-  {default_scan_32x32, vp10_default_iscan_32x32, default_scan_32x32_neighbors},
-};
-
 #if CONFIG_EXT_TX
 const scan_order vp10_intra_scan_orders[TX_SIZES][TX_TYPES] = {
   {  // TX_4X4
@@ -3039,7 +3200,7 @@
   }
 };
 
-const scan_order vp10_inter_scan_orders[TX_SIZES][TX_TYPES] = {
+const scan_order vp10_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = {
   {  // TX_4X4
     {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
     {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
@@ -3126,6 +3287,40 @@
     {mcol_scan_32x32,  vp10_mcol_iscan_32x32,  mcol_scan_32x32_neighbors},
     {mrow_scan_32x32,  vp10_mrow_iscan_32x32,  mrow_scan_32x32_neighbors},
     {mcol_scan_32x32,  vp10_mcol_iscan_32x32,  mcol_scan_32x32_neighbors},
+  }, {  // TX_4X8
+    {default_scan_4x8, vp10_default_iscan_4x8, default_scan_4x8_neighbors},
+    {default_scan_4x8, vp10_default_iscan_4x8, default_scan_4x8_neighbors},
+    {default_scan_4x8, vp10_default_iscan_4x8, default_scan_4x8_neighbors},
+    {default_scan_4x8, vp10_default_iscan_4x8, default_scan_4x8_neighbors},
+    {default_scan_4x8, vp10_default_iscan_4x8, default_scan_4x8_neighbors},
+    {default_scan_4x8, vp10_default_iscan_4x8, default_scan_4x8_neighbors},
+    {default_scan_4x8, vp10_default_iscan_4x8, default_scan_4x8_neighbors},
+    {default_scan_4x8, vp10_default_iscan_4x8, default_scan_4x8_neighbors},
+    {default_scan_4x8, vp10_default_iscan_4x8, default_scan_4x8_neighbors},
+    {mrow_scan_4x8,    vp10_mrow_iscan_4x8,    mrow_scan_4x8_neighbors},
+    {mrow_scan_4x8,    vp10_mrow_iscan_4x8,    mrow_scan_4x8_neighbors},
+    {mcol_scan_4x8,    vp10_mcol_iscan_4x8,    mcol_scan_4x8_neighbors},
+    {mrow_scan_4x8,    vp10_mrow_iscan_4x8,    mrow_scan_4x8_neighbors},
+    {mcol_scan_4x8,    vp10_mcol_iscan_4x8,    mcol_scan_4x8_neighbors},
+    {mrow_scan_4x8,    vp10_mrow_iscan_4x8,    mrow_scan_4x8_neighbors},
+    {mcol_scan_4x8,    vp10_mcol_iscan_4x8,    mcol_scan_4x8_neighbors},
+  }, {  // TX_8X4
+    {default_scan_8x4, vp10_default_iscan_8x4, default_scan_8x4_neighbors},
+    {default_scan_8x4, vp10_default_iscan_8x4, default_scan_8x4_neighbors},
+    {default_scan_8x4, vp10_default_iscan_8x4, default_scan_8x4_neighbors},
+    {default_scan_8x4, vp10_default_iscan_8x4, default_scan_8x4_neighbors},
+    {default_scan_8x4, vp10_default_iscan_8x4, default_scan_8x4_neighbors},
+    {default_scan_8x4, vp10_default_iscan_8x4, default_scan_8x4_neighbors},
+    {default_scan_8x4, vp10_default_iscan_8x4, default_scan_8x4_neighbors},
+    {default_scan_8x4, vp10_default_iscan_8x4, default_scan_8x4_neighbors},
+    {default_scan_8x4, vp10_default_iscan_8x4, default_scan_8x4_neighbors},
+    {mrow_scan_8x4,    vp10_mrow_iscan_8x4,    mrow_scan_8x4_neighbors},
+    {mrow_scan_8x4,    vp10_mrow_iscan_8x4,    mrow_scan_8x4_neighbors},
+    {mcol_scan_8x4,    vp10_mcol_iscan_8x4,    mcol_scan_8x4_neighbors},
+    {mrow_scan_8x4,    vp10_mrow_iscan_8x4,    mrow_scan_8x4_neighbors},
+    {mcol_scan_8x4,    vp10_mcol_iscan_8x4,    mcol_scan_8x4_neighbors},
+    {mrow_scan_8x4,    vp10_mrow_iscan_8x4,    mrow_scan_8x4_neighbors},
+    {mcol_scan_8x4,    vp10_mcol_iscan_8x4,    mcol_scan_8x4_neighbors},
   }
 };
 
diff --git a/vp10/common/scan.h b/vp10/common/scan.h
index aadae40..92a8e6b 100644
--- a/vp10/common/scan.h
+++ b/vp10/common/scan.h
@@ -29,7 +29,6 @@
   const int16_t *neighbors;
 } scan_order;
 
-extern const scan_order vp10_default_scan_orders[TX_SIZES];
 extern const scan_order vp10_intra_scan_orders[TX_SIZES][TX_TYPES];
 
 static INLINE int get_coef_context(const int16_t *neighbors,
@@ -44,7 +43,7 @@
 }
 
 #if CONFIG_EXT_TX
-extern const scan_order vp10_inter_scan_orders[TX_SIZES][TX_TYPES];
+extern const scan_order vp10_inter_scan_orders[TX_SIZES_ALL][TX_TYPES];
 
 static INLINE const scan_order *get_inter_scan(TX_SIZE tx_size,
                                                TX_TYPE tx_type) {
diff --git a/vp10/common/vp10_inv_txfm2d.c b/vp10/common/vp10_inv_txfm2d.c
index 85a33ba..071419e 100644
--- a/vp10/common/vp10_inv_txfm2d.c
+++ b/vp10/common/vp10_inv_txfm2d.c
@@ -82,7 +82,7 @@
 }
 
 TXFM_2D_FLIP_CFG vp10_get_inv_txfm_64x64_cfg(int tx_type) {
-  TXFM_2D_FLIP_CFG cfg;
+  TXFM_2D_FLIP_CFG cfg = {0, 0, NULL};
   switch (tx_type) {
     case DCT_DCT:
       cfg.cfg = &inv_txfm_2d_cfg_dct_dct_64;
diff --git a/vp10/common/vp10_rtcd_defs.pl b/vp10/common/vp10_rtcd_defs.pl
index 8f87b02..ab2fa16 100644
--- a/vp10/common/vp10_rtcd_defs.pl
+++ b/vp10/common/vp10_rtcd_defs.pl
@@ -83,6 +83,12 @@
     add_proto qw/void vp10_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
     specialize qw/vp10_iht4x4_16_add/;
 
+    add_proto qw/void vp10_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    specialize qw/vp10_iht8x4_32_add/;
+
+    add_proto qw/void vp10_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    specialize qw/vp10_iht4x8_32_add/;
+
     add_proto qw/void vp10_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
     specialize qw/vp10_iht8x8_64_add/;
 
@@ -143,6 +149,12 @@
     add_proto qw/void vp10_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
     specialize qw/vp10_iht4x4_16_add sse2/;
 
+    add_proto qw/void vp10_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    specialize qw/vp10_iht8x4_32_add/;
+
+    add_proto qw/void vp10_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    specialize qw/vp10_iht4x8_32_add/;
+
     add_proto qw/void vp10_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
     specialize qw/vp10_iht8x8_64_add sse2/;
 
@@ -206,6 +218,12 @@
     add_proto qw/void vp10_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
     specialize qw/vp10_iht4x4_16_add/;
 
+    add_proto qw/void vp10_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    specialize qw/vp10_iht8x4_32_add/;
+
+    add_proto qw/void vp10_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    specialize qw/vp10_iht4x8_32_add/;
+
     add_proto qw/void vp10_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
     specialize qw/vp10_iht8x8_64_add/;
 
@@ -242,6 +260,12 @@
     add_proto qw/void vp10_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
     specialize qw/vp10_iht4x4_16_add sse2 neon dspr2/;
 
+    add_proto qw/void vp10_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    specialize qw/vp10_iht8x4_32_add/;
+
+    add_proto qw/void vp10_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    specialize qw/vp10_iht4x8_32_add/;
+
     add_proto qw/void vp10_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
     specialize qw/vp10_iht8x8_64_add sse2 neon dspr2/;
 
@@ -348,6 +372,12 @@
   add_proto qw/void vp10_highbd_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
   specialize qw/vp10_highbd_iht4x4_16_add/;
 
+  add_proto qw/void vp10_highbd_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+  specialize qw/vp10_highbd_iht8x4_32_add/;
+
+  add_proto qw/void vp10_highbd_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+  specialize qw/vp10_highbd_iht4x8_32_add/;
+
   add_proto qw/void vp10_highbd_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
   specialize qw/vp10_highbd_iht8x8_64_add/;
 
@@ -407,6 +437,12 @@
   add_proto qw/void vp10_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
   specialize qw/vp10_fht4x4 sse2/;
 
+  add_proto qw/void vp10_fht8x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp10_fht8x4/;
+
+  add_proto qw/void vp10_fht4x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp10_fht4x8/;
+
   add_proto qw/void vp10_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
   specialize qw/vp10_fht8x8 sse2/;
 
@@ -422,6 +458,12 @@
   add_proto qw/void vp10_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
   specialize qw/vp10_fht4x4 sse2/;
 
+  add_proto qw/void vp10_fht8x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp10_fht8x4/;
+
+  add_proto qw/void vp10_fht4x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp10_fht4x8/;
+
   add_proto qw/void vp10_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
   specialize qw/vp10_fht8x8 sse2/;
 
@@ -699,6 +741,12 @@
   add_proto qw/void vp10_highbd_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
   specialize qw/vp10_highbd_fht4x4 sse4_1/;
 
+  add_proto qw/void vp10_highbd_fht8x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp10_highbd_fht8x4/;
+
+  add_proto qw/void vp10_highbd_fht4x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp10_highbd_fht4x8/;
+
   add_proto qw/void vp10_highbd_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
   specialize qw/vp10_highbd_fht8x8/;
 
diff --git a/vp10/decoder/decodeframe.c b/vp10/decoder/decodeframe.c
index 66b44a3..6eab340 100644
--- a/vp10/decoder/decodeframe.c
+++ b/vp10/decoder/decodeframe.c
@@ -249,15 +249,16 @@
       dqcoeff[0] = 0;
     } else {
       if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10)
-        memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));
+        memset(dqcoeff, 0, 4 * 4 * num_4x4_blocks_wide_txsize_lookup[tx_size] *
+               sizeof(dqcoeff[0]));
 #if CONFIG_EXT_TX
       else
-        memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0]));
+        memset(dqcoeff, 0, get_tx2d_size(tx_size) * sizeof(dqcoeff[0]));
 #else
       else if (tx_size == TX_32X32 && eob <= 34)
         memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0]));
       else
-        memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0]));
+        memset(dqcoeff, 0, get_tx2d_size(tx_size) * sizeof(dqcoeff[0]));
 #endif
     }
   }
@@ -285,8 +286,8 @@
       mode = xd->mi[0]->bmi[(row << 1) + col].as_mode;
 
   vp10_predict_intra_block(xd, pd->n4_wl, pd->n4_hl, tx_size, mode,
-                          dst, pd->dst.stride, dst, pd->dst.stride,
-                          col, row, plane);
+                           dst, pd->dst.stride, dst, pd->dst.stride,
+                           col, row, plane);
 
   if (!mbmi->skip) {
     TX_TYPE tx_type = get_tx_type(plane_type, xd, block_idx, tx_size);
@@ -323,14 +324,18 @@
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide)
     return;
 
-  if (tx_size == plane_tx_size) {
+  if (tx_size == plane_tx_size
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+      || plane_tx_size >= TX_SIZES
+#endif  // CONFIG_EXT_TX && CONFIG_RECT_TX
+      ) {
     PLANE_TYPE plane_type = (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
-    TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
-    const scan_order *sc = get_scan(tx_size, tx_type, 1);
+    TX_TYPE tx_type = get_tx_type(plane_type, xd, block, plane_tx_size);
+    const scan_order *sc = get_scan(plane_tx_size, tx_type, 1);
     const int eob = vp10_decode_block_tokens(xd, plane, sc,
-                                             blk_col, blk_row, tx_size,
+                                             blk_col, blk_row, plane_tx_size,
                                              tx_type, r, mbmi->segment_id);
-    inverse_transform_block(xd, plane, tx_type, tx_size,
+    inverse_transform_block(xd, plane, tx_type, plane_tx_size,
         &pd->dst.buf[4 * blk_row * pd->dst.stride + 4 * blk_col],
         pd->dst.stride, eob);
     *eob_total += eob;
@@ -344,7 +349,7 @@
     for (i = 0; i < 4; ++i) {
       const int offsetr = blk_row + ((i >> 1) << bsl);
       const int offsetc = blk_col + ((i & 0x01) << bsl);
-      int step = 1 << (2 * (tx_size - 1));
+      int step = num_4x4_blocks_txsize_lookup[tx_size - 1];
 
       if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide)
         continue;
@@ -431,7 +436,6 @@
 
   set_skip_context(xd, mi_row, mi_col);
 
-
 #if CONFIG_VAR_TX
   xd->max_tx_size = max_txsize_lookup[bsize];
 #endif
@@ -1321,7 +1325,8 @@
           : mbmi->tx_size;
       const int num_4x4_w = pd->n4_w;
       const int num_4x4_h = pd->n4_h;
-      const int step = (1 << tx_size);
+      const int stepr = num_4x4_blocks_high_txsize_lookup[tx_size];
+      const int stepc = num_4x4_blocks_wide_txsize_lookup[tx_size];
       int row, col;
       const int max_blocks_wide = num_4x4_w +
           (xd->mb_to_right_edge >= 0 ?
@@ -1330,8 +1335,8 @@
           (xd->mb_to_bottom_edge >= 0 ?
            0 : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
 
-      for (row = 0; row < max_blocks_high; row += step)
-        for (col = 0; col < max_blocks_wide; col += step)
+      for (row = 0; row < max_blocks_high; row += stepr)
+        for (col = 0; col < max_blocks_wide; col += stepc)
           predict_and_reconstruct_intra_block(xd,
                                               r,
                                               mbmi, plane,
@@ -1409,15 +1414,20 @@
         int row, col;
 #if CONFIG_VAR_TX
         // TODO(jingning): This can be simplified for decoder performance.
-        const BLOCK_SIZE plane_bsize =
-            get_plane_block_size(VPXMAX(bsize, BLOCK_8X8), pd);
+        const BLOCK_SIZE plane_bsize = get_plane_block_size(
+            VPXMAX(bsize, BLOCK_8X8), pd);
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+        const TX_SIZE max_tx_size = plane ?
+            max_txsize_lookup[plane_bsize] : max_txsize_rect_lookup[plane_bsize];
+#else
         const TX_SIZE max_tx_size = max_txsize_lookup[plane_bsize];
-        const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
-        int bw = num_4x4_blocks_wide_lookup[txb_size];
+#endif  // CONFIG_EXT_TX && CONFIG_RECT_TX
+        int bw = num_4x4_blocks_wide_txsize_lookup[max_tx_size];
+        int bh = num_4x4_blocks_high_txsize_lookup[max_tx_size];
+        const int step = num_4x4_blocks_txsize_lookup[max_tx_size];
         int block = 0;
-        const int step = 1 << (max_tx_size << 1);
 
-        for (row = 0; row < num_4x4_h; row += bw) {
+        for (row = 0; row < num_4x4_h; row += bh) {
           for (col = 0; col < num_4x4_w; col += bw) {
             decode_reconstruct_tx(xd, r, mbmi, plane, plane_bsize,
                                   block, row, col, max_tx_size, &eobtotal);
@@ -1428,7 +1438,8 @@
         const TX_SIZE tx_size =
             plane ? dec_get_uv_tx_size(mbmi, pd->n4_wl, pd->n4_hl)
             : mbmi->tx_size;
-        const int step = (1 << tx_size);
+        const int stepr = num_4x4_blocks_high_txsize_lookup[tx_size];
+        const int stepc = num_4x4_blocks_wide_txsize_lookup[tx_size];
         const int max_blocks_wide = num_4x4_w +
             (xd->mb_to_right_edge >= 0 ?
              0 : xd->mb_to_right_edge >> (5 + pd->subsampling_x));
@@ -1436,8 +1447,8 @@
             (xd->mb_to_bottom_edge >= 0 ?
              0 : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
 
-        for (row = 0; row < max_blocks_high; row += step)
-          for (col = 0; col < max_blocks_wide; col += step)
+        for (row = 0; row < max_blocks_high; row += stepr)
+          for (col = 0; col < max_blocks_wide; col += stepc)
             eobtotal += reconstruct_inter_block(xd,
                                                 r,
                                                 mbmi->segment_id,
@@ -1831,7 +1842,8 @@
         const TX_SIZE tx_size =
             i ? dec_get_uv_tx_size(mbmi, pd->n4_wl, pd->n4_hl)
             : mbmi->tx_size;
-        const int step = (1 << tx_size);
+        const int stepr = num_4x4_blocks_high_txsize_lookup[tx_size];
+        const int stepc = num_4x4_blocks_wide_txsize_lookup[tx_size];
         const int max_blocks_wide = num_4x4_w +
             (xd->mb_to_right_edge >= 0 ?
              0 : xd->mb_to_right_edge >> (5 + pd->subsampling_x));
@@ -1839,8 +1851,8 @@
             (xd->mb_to_bottom_edge >= 0 ?
              0 : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
 
-        for (row = 0; row < max_blocks_high; row += step)
-          for (col = 0; col < max_blocks_wide; col += step)
+        for (row = 0; row < max_blocks_high; row += stepr)
+          for (col = 0; col < max_blocks_wide; col += stepc)
             eobtotal += reconstruct_inter_block(xd,
                                                 r,
                                                 mbmi->segment_id_supertx,
diff --git a/vp10/decoder/decodemv.c b/vp10/decoder/decodemv.c
index 8528370..e036ceb 100644
--- a/vp10/decoder/decodemv.c
+++ b/vp10/decoder/decodemv.c
@@ -233,7 +233,7 @@
 }
 
 #if CONFIG_VAR_TX
-static void read_tx_size_inter(VP10_COMMON *cm, MACROBLOCKD *xd,
+static void read_tx_size_vartx(VP10_COMMON *cm, MACROBLOCKD *xd,
                                MB_MODE_INFO *mbmi, FRAME_COUNTS *counts,
                                TX_SIZE tx_size, int blk_row, int blk_col,
                                vp10_reader *r) {
@@ -279,14 +279,14 @@
     for (i = 0; i < 4; ++i) {
       int offsetr = blk_row + ((i >> 1) << bsl);
       int offsetc = blk_col + ((i & 0x01) << bsl);
-      read_tx_size_inter(cm, xd, mbmi, counts,
+      read_tx_size_vartx(cm, xd, mbmi, counts,
                          tx_size - 1, offsetr, offsetc, r);
     }
   } else {
     int idx, idy;
     inter_tx_size[0][0] = tx_size;
-    for (idy = 0; idy < (1 << tx_size) / 2; ++idy)
-      for (idx = 0; idx < (1 << tx_size) / 2; ++idx)
+    for (idy = 0; idy < num_4x4_blocks_high_txsize_lookup[tx_size] / 2; ++idy)
+      for (idx = 0; idx < num_4x4_blocks_wide_txsize_lookup[tx_size] / 2; ++idx)
         inter_tx_size[idy][idx] = tx_size;
     mbmi->tx_size = tx_size;
     if (counts)
@@ -309,17 +309,44 @@
   return (TX_SIZE)tx_size;
 }
 
-static TX_SIZE read_tx_size(VP10_COMMON *cm, MACROBLOCKD *xd,
-                            int allow_select, vp10_reader *r) {
+static TX_SIZE read_tx_size_intra(VP10_COMMON *cm, MACROBLOCKD *xd,
+                                  vp10_reader *r) {
   TX_MODE tx_mode = cm->tx_mode;
   BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
-  const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
   if (xd->lossless[xd->mi[0]->mbmi.segment_id])
     return TX_4X4;
-  if (allow_select && tx_mode == TX_MODE_SELECT && bsize >= BLOCK_8X8)
-    return read_selected_tx_size(cm, xd, max_tx_size, r);
-  else
-    return VPXMIN(max_tx_size, tx_mode_to_biggest_tx_size[tx_mode]);
+  if (bsize >= BLOCK_8X8) {
+    const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
+    if (tx_mode == TX_MODE_SELECT) {
+      return read_selected_tx_size(cm, xd, max_tx_size, r);
+    } else {
+      return VPXMIN(max_tx_size, tx_mode_to_biggest_tx_size[tx_mode]);
+    }
+  } else {
+    return TX_4X4;
+  }
+}
+
+static TX_SIZE read_tx_size_inter(VP10_COMMON *cm, MACROBLOCKD *xd,
+                                  int allow_select, vp10_reader *r) {
+  TX_MODE tx_mode = cm->tx_mode;
+  BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+  if (xd->lossless[xd->mi[0]->mbmi.segment_id])
+    return TX_4X4;
+  if (bsize >= BLOCK_8X8) {
+    const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
+    if (allow_select && tx_mode == TX_MODE_SELECT) {
+      return read_selected_tx_size(cm, xd, max_tx_size, r);
+    } else {
+      return VPXMIN(max_tx_size, tx_mode_to_biggest_tx_size[tx_mode]);
+    }
+  } else {
+#if CONFIG_EXT_TX && CONFIG_RECT_TX && !CONFIG_VAR_TX
+    return max_txsize_rect_lookup[bsize];
+#else
+    return TX_4X4;
+#endif  // CONFIG_EXT_TX && CONFIG_RECT_TX && !CONFIG_VAR_TX
+  }
 }
 
 static int dec_get_segment_id(const VP10_COMMON *cm, const uint8_t *segment_ids,
@@ -577,7 +604,7 @@
 
   mbmi->segment_id = read_intra_segment_id(cm, xd, mi_offset, x_mis, y_mis, r);
   mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r);
-  mbmi->tx_size = read_tx_size(cm, xd, 1, r);
+  mbmi->tx_size = read_tx_size_intra(cm, xd, r);
   mbmi->ref_frame[0] = INTRA_FRAME;
   mbmi->ref_frame[1] = NONE;
 
@@ -1670,14 +1697,18 @@
       int idx, idy;
       for (idy = 0; idy < height; idy += bs)
         for (idx = 0; idx < width; idx += bs)
-          read_tx_size_inter(cm, xd, mbmi, xd->counts, max_tx_size,
+          read_tx_size_vartx(cm, xd, mbmi, xd->counts, max_tx_size,
                              idy, idx, r);
       if (xd->counts) {
         const int ctx = get_tx_size_context(xd);
         ++xd->counts->tx_size[max_tx_size - TX_8X8][ctx][mbmi->tx_size];
       }
     } else {
-      mbmi->tx_size = read_tx_size(cm, xd, !mbmi->skip || !inter_block, r);
+      if (inter_block)
+        mbmi->tx_size = read_tx_size_inter(cm, xd, !mbmi->skip, r);
+      else
+        mbmi->tx_size = read_tx_size_intra(cm, xd, r);
+
       if (inter_block) {
         const int width  = num_4x4_blocks_wide_lookup[bsize];
         const int height = num_4x4_blocks_high_lookup[bsize];
@@ -1691,7 +1722,10 @@
       set_txfm_ctx(xd->above_txfm_context, mbmi->tx_size, xd->n8_w);
     }
 #else
-    mbmi->tx_size = read_tx_size(cm, xd, !mbmi->skip || !inter_block, r);
+    if (inter_block)
+      mbmi->tx_size = read_tx_size_inter(cm, xd, !mbmi->skip, r);
+    else
+      mbmi->tx_size = read_tx_size_intra(cm, xd, r);
 #endif  // CONFIG_VAR_TX
 #if CONFIG_SUPERTX
   }
diff --git a/vp10/decoder/detokenize.c b/vp10/decoder/detokenize.c
index cc3b18b..7cbf01e 100644
--- a/vp10/decoder/detokenize.c
+++ b/vp10/decoder/detokenize.c
@@ -55,12 +55,13 @@
                         int ctx, const int16_t *scan, const int16_t *nb,
                         vp10_reader *r) {
   FRAME_COUNTS *counts = xd->counts;
-  const int max_eob = 16 << (tx_size << 1);
+  const int max_eob = get_tx2d_size(tx_size);
   const FRAME_CONTEXT *const fc = xd->fc;
   const int ref = is_inter_block(&xd->mi[0]->mbmi);
   int band, c = 0;
+  const int tx_size_ctx = txsize_sqr_map[tx_size];
   const vpx_prob (*coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
-      fc->coef_probs[tx_size][type][ref];
+      fc->coef_probs[tx_size_ctx][type][ref];
   const vpx_prob *prob;
   unsigned int (*coef_counts)[COEFF_CONTEXTS][UNCONSTRAINED_NODES + 1];
   unsigned int (*eob_branch_count)[COEFF_CONTEXTS];
@@ -80,8 +81,8 @@
   const uint8_t *cat6_prob;
 
   if (counts) {
-    coef_counts = counts->coef[tx_size][type][ref];
-    eob_branch_count = counts->eob_branch[tx_size][type][ref];
+    coef_counts = counts->coef[tx_size_ctx][type][ref];
+    eob_branch_count = counts->eob_branch[tx_size_ctx][type][ref];
   }
 
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -249,15 +250,16 @@
                             int ctx, const int16_t *scan, const int16_t *nb,
                             struct AnsDecoder *const ans) {
   FRAME_COUNTS *counts = xd->counts;
-  const int max_eob = 16 << (tx_size << 1);
+  const int max_eob = get_tx2d_size(tx_size);
   const FRAME_CONTEXT *const fc = xd->fc;
   const int ref = is_inter_block(&xd->mi[0]->mbmi);
   int band, c = 0;
   int skip_eob = 0;
+  const int tx_size_ctx = txsize_sqr_map[tx_size];
   const vpx_prob (*coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
-      fc->coef_probs[tx_size][type][ref];
+      fc->coef_probs[tx_size_ctx][type][ref];
   const rans_dec_lut(*coef_cdfs)[COEFF_CONTEXTS] =
-      fc->coef_cdfs[tx_size][type][ref];
+      fc->coef_cdfs[tx_size_ctx][type][ref];
   const vpx_prob *prob;
   const rans_dec_lut *cdf;
   unsigned int (*coef_counts)[COEFF_CONTEXTS][UNCONSTRAINED_NODES + 1];
@@ -280,8 +282,8 @@
   dq_shift = get_tx_scale(xd, tx_type, tx_size);
 
   if (counts) {
-    coef_counts = counts->coef[tx_size][type][ref];
-    eob_branch_count = counts->eob_branch[tx_size][type][ref];
+    coef_counts = counts->coef[tx_size_ctx][type][ref];
+    eob_branch_count = counts->eob_branch[tx_size_ctx][type][ref];
   }
 
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -425,23 +427,24 @@
                       int aoff, int loff) {
   ENTROPY_CONTEXT *const a = pd->above_context + aoff;
   ENTROPY_CONTEXT *const l = pd->left_context + loff;
-  const int tx_size_in_blocks = 1 << tx_size;
+  const int tx_w_in_blocks = num_4x4_blocks_wide_txsize_lookup[tx_size];
+  const int tx_h_in_blocks = num_4x4_blocks_high_txsize_lookup[tx_size];
 
   // above
   if (has_eob && xd->mb_to_right_edge < 0) {
     int i;
     const int blocks_wide = pd->n4_w +
                             (xd->mb_to_right_edge >> (5 + pd->subsampling_x));
-    int above_contexts = tx_size_in_blocks;
+    int above_contexts = tx_w_in_blocks;
     if (above_contexts + aoff > blocks_wide)
       above_contexts = blocks_wide - aoff;
 
     for (i = 0; i < above_contexts; ++i)
       a[i] = has_eob;
-    for (i = above_contexts; i < tx_size_in_blocks; ++i)
+    for (i = above_contexts; i < tx_w_in_blocks; ++i)
       a[i] = 0;
   } else {
-    memset(a, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
+    memset(a, has_eob, sizeof(ENTROPY_CONTEXT) * tx_w_in_blocks);
   }
 
   // left
@@ -449,16 +452,16 @@
     int i;
     const int blocks_high = pd->n4_h +
                             (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
-    int left_contexts = tx_size_in_blocks;
+    int left_contexts = tx_h_in_blocks;
     if (left_contexts + loff > blocks_high)
       left_contexts = blocks_high - loff;
 
     for (i = 0; i < left_contexts; ++i)
       l[i] = has_eob;
-    for (i = left_contexts; i < tx_size_in_blocks; ++i)
+    for (i = left_contexts; i < tx_h_in_blocks; ++i)
       l[i] = 0;
   } else {
-    memset(l, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
+    memset(l, has_eob, sizeof(ENTROPY_CONTEXT) * tx_h_in_blocks);
   }
 }
 
@@ -528,7 +531,10 @@
                                    ctx, sc->scan, sc->neighbors, r);
 #endif  // !CONFIG_ANS
   dec_set_contexts(xd, pd, tx_size, eob > 0, x, y);
+  /* TODO: replace dec_set_contexts with vp10_set_contexts once it handles rect-tx sizes:
+  vp10_set_contexts(xd, pd,
+                    get_plane_block_size(xd->mi[0]->mbmi.sb_type, pd),
+                    tx_size, eob > 0, x, y);
+                    */
   return eob;
 }
-
-
diff --git a/vp10/encoder/bitstream.c b/vp10/encoder/bitstream.c
index 6430a710..d63c5d3 100644
--- a/vp10/encoder/bitstream.c
+++ b/vp10/encoder/bitstream.c
@@ -375,7 +375,8 @@
   TX_SIZE tx_size = xd->mi[0]->mbmi.tx_size;
   BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
   const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
-  if (max_tx_size > TX_4X4) {
+  // For sub8x8 blocks the tx_size symbol does not need to be sent
+  if (bsize >= BLOCK_8X8) {
     vp10_write_token(w, vp10_tx_size_tree[max_tx_size - TX_8X8],
                      cm->fc->tx_size_probs[max_tx_size - TX_8X8]
                                           [get_tx_size_context(xd)],
@@ -801,7 +802,7 @@
     for (i = 0; i < 4; ++i) {
       const int offsetr = blk_row + ((i >> 1) << bsl);
       const int offsetc = blk_col + ((i & 0x01) << bsl);
-      int step = 1 << (2 * (tx_size - 1));
+      int step = num_4x4_blocks_txsize_lookup[tx_size - 1];
 
       if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide)
         continue;
@@ -1662,7 +1663,7 @@
         const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
         int bw = num_4x4_blocks_wide_lookup[txb_size];
         int block = 0;
-        const int step = 1 << (max_tx_size << 1);
+        const int step = num_4x4_blocks_txsize_lookup[max_tx_size];
         for (row = 0; row < num_4x4_h; row += bw) {
           for (col = 0; col < num_4x4_w; col += bw) {
             pack_txb_tokens(w, tok, tok_end, xd, mbmi, plane, plane_bsize,
diff --git a/vp10/encoder/dct.c b/vp10/encoder/dct.c
index 11d4a8e..46bcd0b 100644
--- a/vp10/encoder/dct.c
+++ b/vp10/encoder/dct.c
@@ -1038,29 +1038,29 @@
   // Note overall scaling factor is 4 times orthogonal
 }
 
-static void copy_block(const int16_t *src, int src_stride, int l,
+static void copy_block(const int16_t *src, int src_stride, int l, int w,
                        int16_t *dest, int dest_stride) {
   int i;
   for (i = 0; i < l; ++i) {
     memcpy(dest + dest_stride * i, src + src_stride * i,
-           l * sizeof(int16_t));
+           w * sizeof(int16_t));
   }
 }
 
-static void fliplr(int16_t *dest, int stride, int l) {
+static void fliplr(int16_t *dest, int stride, int l, int w) {
   int i, j;
   for (i = 0; i < l; ++i) {
-    for (j = 0; j < l / 2; ++j) {
+    for (j = 0; j < w / 2; ++j) {
       const int16_t tmp = dest[i * stride + j];
-      dest[i * stride + j] = dest[i * stride + l - 1 - j];
-      dest[i * stride + l - 1 - j] = tmp;
+      dest[i * stride + j] = dest[i * stride + w - 1 - j];
+      dest[i * stride + w - 1 - j] = tmp;
     }
   }
 }
 
-static void flipud(int16_t *dest, int stride, int l) {
+static void flipud(int16_t *dest, int stride, int l, int w) {
   int i, j;
-  for (j = 0; j < l; ++j) {
+  for (j = 0; j < w; ++j) {
     for (i = 0; i < l / 2; ++i) {
       const int16_t tmp = dest[i * stride + j];
       dest[i * stride + j] = dest[(l - 1 - i) * stride + j];
@@ -1069,36 +1069,40 @@
   }
 }
 
-static void fliplrud(int16_t *dest, int stride, int l) {
+static void fliplrud(int16_t *dest, int stride, int l, int w) {
   int i, j;
   for (i = 0; i < l / 2; ++i) {
-    for (j = 0; j < l; ++j) {
+    for (j = 0; j < w; ++j) {
       const int16_t tmp = dest[i * stride + j];
-      dest[i * stride + j] = dest[(l - 1 - i) * stride + l - 1 - j];
-      dest[(l - 1 - i) * stride + l - 1 - j] = tmp;
+      dest[i * stride + j] = dest[(l - 1 - i) * stride + w - 1 - j];
+      dest[(l - 1 - i) * stride + w - 1 - j] = tmp;
     }
   }
 }
 
-static void copy_fliplr(const int16_t *src, int src_stride, int l,
+static void copy_fliplr(const int16_t *src, int src_stride,
+                        int l, int w,
+                        int16_t *dest, int dest_stride) {
+  copy_block(src, src_stride, l, w, dest, dest_stride);
+  fliplr(dest, dest_stride, l, w);
+}
+
+static void copy_flipud(const int16_t *src, int src_stride,
+                        int l, int w,
+                        int16_t *dest, int dest_stride) {
+  copy_block(src, src_stride, l, w, dest, dest_stride);
+  flipud(dest, dest_stride, l, w);
+}
+
+static void copy_fliplrud(const int16_t *src, int src_stride,
+                          int l, int w,
                           int16_t *dest, int dest_stride) {
-  copy_block(src, src_stride, l, dest, dest_stride);
-  fliplr(dest, dest_stride, l);
+  copy_block(src, src_stride, l, w, dest, dest_stride);
+  fliplrud(dest, dest_stride, l, w);
 }
 
-static void copy_flipud(const int16_t *src, int src_stride, int l,
-                          int16_t *dest, int dest_stride) {
-  copy_block(src, src_stride, l, dest, dest_stride);
-  flipud(dest, dest_stride, l);
-}
-
-static void copy_fliplrud(const int16_t *src, int src_stride, int l,
-                            int16_t *dest, int dest_stride) {
-  copy_block(src, src_stride, l, dest, dest_stride);
-  fliplrud(dest, dest_stride, l);
-}
-
-static void maybe_flip_input(const int16_t **src, int *src_stride, int l,
+static void maybe_flip_input(const int16_t **src, int *src_stride,
+                             int l, int w,
                              int16_t *buff, int tx_type) {
   switch (tx_type) {
     case DCT_DCT:
@@ -1114,21 +1118,21 @@
     case FLIPADST_DCT:
     case FLIPADST_ADST:
     case V_FLIPADST:
-      copy_flipud(*src, *src_stride, l, buff, l);
+      copy_flipud(*src, *src_stride, l, w, buff, w);
       *src = buff;
-      *src_stride = l;
+      *src_stride = w;
       break;
     case DCT_FLIPADST:
     case ADST_FLIPADST:
     case H_FLIPADST:
-      copy_fliplr(*src, *src_stride, l, buff, l);
+      copy_fliplr(*src, *src_stride, l, w, buff, w);
       *src = buff;
-      *src_stride = l;
+      *src_stride = w;
       break;
     case FLIPADST_FLIPADST:
-      copy_fliplrud(*src, *src_stride, l, buff, l);
+      copy_fliplrud(*src, *src_stride, l, w, buff, w);
       *src = buff;
-      *src_stride = l;
+      *src_stride = w;
       break;
     default:
       assert(0);
@@ -1219,6 +1223,44 @@
   { fhalfright32, fidtx32 },           // V_FLIPADST
   { fidtx32, fhalfright32 },           // H_FLIPADST
 };
+
+static const transform_2d FHT_4x8[] = {
+  { fdct8,  fdct4  },  // DCT_DCT
+  { fadst8, fdct4  },  // ADST_DCT
+  { fdct8,  fadst4 },  // DCT_ADST
+  { fadst8, fadst4 },  // ADST_ADST
+  { fadst8, fdct4  },  // FLIPADST_DCT
+  { fdct8,  fadst4 },  // DCT_FLIPADST
+  { fadst8, fadst4 },  // FLIPADST_FLIPADST
+  { fadst8, fadst4 },  // ADST_FLIPADST
+  { fadst8, fadst4 },  // FLIPADST_ADST
+  { fidtx8, fidtx4 },  // IDTX
+  { fdct8,  fidtx4 },  // V_DCT
+  { fidtx8, fdct4  },  // H_DCT
+  { fadst8, fidtx4 },  // V_ADST
+  { fidtx8, fadst4 },  // H_ADST
+  { fadst8, fidtx4 },  // V_FLIPADST
+  { fidtx8, fadst4 },  // H_FLIPADST
+};
+
+static const transform_2d FHT_8x4[] = {
+  { fdct4,  fdct8  },  // DCT_DCT
+  { fadst4, fdct8  },  // ADST_DCT
+  { fdct4,  fadst8 },  // DCT_ADST
+  { fadst4, fadst8 },  // ADST_ADST
+  { fadst4, fdct8  },  // FLIPADST_DCT
+  { fdct4,  fadst8 },  // DCT_FLIPADST
+  { fadst4, fadst8 },  // FLIPADST_FLIPADST
+  { fadst4, fadst8 },  // ADST_FLIPADST
+  { fadst4, fadst8 },  // FLIPADST_ADST
+  { fidtx4, fidtx8 },  // IDTX
+  { fdct4,  fidtx8 },  // V_DCT
+  { fidtx4, fdct8  },  // H_DCT
+  { fadst4, fidtx8 },  // V_ADST
+  { fidtx4, fadst8 },  // H_ADST
+  { fadst4, fidtx8 },  // V_FLIPADST
+  { fidtx4, fadst8 },  // H_FLIPADST
+};
 #endif  // CONFIG_EXT_TX
 
 void vp10_fht4x4_c(const int16_t *input, tran_low_t *output,
@@ -1233,7 +1275,7 @@
 
 #if CONFIG_EXT_TX
     int16_t flipped_input[4 * 4];
-    maybe_flip_input(&input, &stride, 4, flipped_input, tx_type);
+    maybe_flip_input(&input, &stride, 4, 4, flipped_input, tx_type);
 #endif
 
     // Columns
@@ -1258,6 +1300,70 @@
   }
 }
 
+#if CONFIG_EXT_TX
+void vp10_fht4x8_c(const int16_t *input, tran_low_t *output,
+                   int stride, int tx_type) {
+  const int n = 4;
+  const int n2 = 8;
+  tran_low_t out[8 * 4];
+  tran_low_t temp_in[8], temp_out[8];
+  int i, j;
+  const transform_2d ht = FHT_4x8[tx_type];
+  int16_t flipped_input[8 * 4];
+  maybe_flip_input(&input, &stride, n2, n, flipped_input, tx_type);
+
+  // Columns
+  for (i = 0; i < n; ++i) {
+    for (j = 0; j < n2; ++j)
+      temp_in[j] = input[j * stride + i] * 8;
+    ht.cols(temp_in, temp_out);
+    for (j = 0; j < n2; ++j)
+      out[j * n + i] = (tran_low_t)fdct_round_shift(temp_out[j] * Sqrt2);
+  }
+
+  // Rows
+  for (i = 0; i < n2; ++i) {
+    for (j = 0; j < n; ++j)
+      temp_in[j] = out[j + i * n];
+    ht.rows(temp_in, temp_out);
+    for (j = 0; j < n; ++j)
+      output[j + i * n] = (temp_out[j] + 1) >> 2;
+  }
+  // Note: overall scale factor of transform is 8 times unitary
+}
+
+void vp10_fht8x4_c(const int16_t *input, tran_low_t *output,
+                   int stride, int tx_type) {
+  const int n = 4;
+  const int n2 = 8;
+  tran_low_t out[8 * 4];
+  tran_low_t temp_in[8], temp_out[8];
+  int i, j;
+  const transform_2d ht = FHT_8x4[tx_type];
+  int16_t flipped_input[8 * 4];
+  maybe_flip_input(&input, &stride, n, n2, flipped_input, tx_type);
+
+  // Columns
+  for (i = 0; i < n2; ++i) {
+    for (j = 0; j < n; ++j)
+      temp_in[j] = input[j * stride + i] * 8;
+    ht.cols(temp_in, temp_out);
+    for (j = 0; j < n; ++j)
+      out[j * n2 + i] = (tran_low_t)fdct_round_shift(temp_out[j] * Sqrt2);
+  }
+
+  // Rows
+  for (i = 0; i < n; ++i) {
+    for (j = 0; j < n2; ++j)
+      temp_in[j] = out[j + i * n2];
+    ht.rows(temp_in, temp_out);
+    for (j = 0; j < n2; ++j)
+      output[j + i * n2] = (temp_out[j] + 1) >> 2;
+  }
+  // Note: overall scale factor of transform is 8 times unitary
+}
+#endif  // CONFIG_EXT_TX
+
 void vp10_fdct8x8_quant_c(const int16_t *input, int stride,
                           tran_low_t *coeff_ptr, intptr_t n_coeffs,
                           int skip_block,
@@ -1382,7 +1488,7 @@
 
 #if CONFIG_EXT_TX
     int16_t flipped_input[8 * 8];
-    maybe_flip_input(&input, &stride, 8, flipped_input, tx_type);
+    maybe_flip_input(&input, &stride, 8, 8, flipped_input, tx_type);
 #endif
 
     // Columns
@@ -1473,7 +1579,7 @@
 
 #if CONFIG_EXT_TX
     int16_t flipped_input[16 * 16];
-    maybe_flip_input(&input, &stride, 16, flipped_input, tx_type);
+    maybe_flip_input(&input, &stride, 16, 16, flipped_input, tx_type);
 #endif
 
     // Columns
@@ -1498,17 +1604,29 @@
 
 #if CONFIG_VP9_HIGHBITDEPTH
 void vp10_highbd_fht4x4_c(const int16_t *input, tran_low_t *output,
-                         int stride, int tx_type) {
+                          int stride, int tx_type) {
   vp10_fht4x4_c(input, output, stride, tx_type);
 }
 
+#if CONFIG_EXT_TX
+void vp10_highbd_fht8x4_c(const int16_t *input, tran_low_t *output,
+                          int stride, int tx_type) {
+  vp10_fht8x4_c(input, output, stride, tx_type);
+}
+
+void vp10_highbd_fht4x8_c(const int16_t *input, tran_low_t *output,
+                          int stride, int tx_type) {
+  vp10_fht4x8_c(input, output, stride, tx_type);
+}
+#endif  // CONFIG_EXT_TX
+
 void vp10_highbd_fht8x8_c(const int16_t *input, tran_low_t *output,
-                         int stride, int tx_type) {
+                          int stride, int tx_type) {
   vp10_fht8x8_c(input, output, stride, tx_type);
 }
 
 void vp10_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output,
-                          int stride) {
+                           int stride) {
   vp10_fwht4x4_c(input, output, stride);
 }
 
@@ -1530,7 +1648,7 @@
     const transform_2d ht = FHT_32[tx_type];
 
     int16_t flipped_input[32 * 32];
-    maybe_flip_input(&input, &stride, 32, flipped_input, tx_type);
+    maybe_flip_input(&input, &stride, 32, 32, flipped_input, tx_type);
 
     // Columns
     for (i = 0; i < 32; ++i) {
diff --git a/vp10/encoder/encodemb.c b/vp10/encoder/encodemb.c
index aceb10f..aa8b402 100644
--- a/vp10/encoder/encodemb.c
+++ b/vp10/encoder/encodemb.c
@@ -67,20 +67,6 @@
   rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);\
 }
 
-static const int16_t band_count_table[TX_SIZES][8] = {
-  { 1, 2, 3, 4,  3,   16 - 13, 0 },
-  { 1, 2, 3, 4, 11,   64 - 21, 0 },
-  { 1, 2, 3, 4, 11,  256 - 21, 0 },
-  { 1, 2, 3, 4, 11, 1024 - 21, 0 },
-};
-
-static const int16_t band_cum_count_table[TX_SIZES][8] = {
-  { 0, 1, 3, 6, 10, 13, 16, 0 },
-  { 0, 1, 3, 6, 10, 21, 64, 0 },
-  { 0, 1, 3, 6, 10, 21, 256, 0 },
-  { 0, 1, 3, 6, 10, 21, 1024, 0 },
-};
-
 int vp10_optimize_b(MACROBLOCK *mb, int plane, int block,
                     TX_SIZE tx_size, int ctx) {
   MACROBLOCKD *const xd = &mb->e_mbd;
@@ -95,7 +81,7 @@
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   const int eob = p->eobs[block];
   const PLANE_TYPE type = pd->plane_type;
-  const int default_eob = 16 << (tx_size << 1);
+  const int default_eob = get_tx2d_size(tx_size);
   const int16_t* const dequant_ptr = pd->dequant;
   const uint8_t* const band_translate = get_band_translate(tx_size);
   TX_TYPE tx_type = get_tx_type(type, xd, block, tx_size);
@@ -125,9 +111,9 @@
   const int *cat6_high_cost = vp10_get_high_cost_table(8);
 #endif
   unsigned int (*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
-                   mb->token_costs[tx_size][type][ref];
-  const int16_t *band_counts = &band_count_table[tx_size][band];
-  int16_t band_left = eob - band_cum_count_table[tx_size][band] + 1;
+                   mb->token_costs[txsize_sqr_map[tx_size]][type][ref];
+  const uint16_t *band_counts = &band_count_table[tx_size][band];
+  uint16_t band_left = eob - band_cum_count_table[tx_size][band] + 1;
   int shortcut = 0;
   int next_shortcut = 0;
 
@@ -444,8 +430,7 @@
   uint16_t *const eob = &p->eobs[block];
   const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
   const int16_t *src_diff;
-  const int tx1d_size = get_tx1d_size(tx_size);
-  const int tx2d_size = tx1d_size * tx1d_size;
+  const int tx2d_size = get_tx2d_size(tx_size);
 
   FWD_TXFM_PARAM fwd_txfm_param;
   QUANT_PARAM qparam;
@@ -524,89 +509,44 @@
   fwd_txfm_param.bd = xd->bd;
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     highbd_fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
-    switch (tx_size) {
-      case TX_32X32:
-        highbd_quantize_32x32_nuq(coeff, 1024, x->skip_block,
-                                  p->quant, p->quant_shift, pd->dequant,
-                                  (const cuml_bins_type_nuq *)
-                                      p->cuml_bins_nuq[dq],
-                                  (const dequant_val_type_nuq *)
-                                      pd->dequant_val_nuq[dq],
-                                  qcoeff, dqcoeff, eob,
-                                  scan_order->scan, band);
-        break;
-      case TX_16X16:
-        highbd_quantize_nuq(coeff, 256, x->skip_block,
-                            p->quant, p->quant_shift, pd->dequant,
-                            (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
-                            (const dequant_val_type_nuq *)
+    if (tx_size == TX_32X32) {
+      highbd_quantize_32x32_nuq(coeff, get_tx2d_size(tx_size), x->skip_block,
+                                p->quant, p->quant_shift, pd->dequant,
+                                (const cuml_bins_type_nuq *)
+                                p->cuml_bins_nuq[dq],
+                                (const dequant_val_type_nuq *)
                                 pd->dequant_val_nuq[dq],
-                            qcoeff, dqcoeff, eob,
-                            scan_order->scan, band);
-        break;
-      case TX_8X8:
-        highbd_quantize_nuq(coeff, 64, x->skip_block,
-                            p->quant, p->quant_shift, pd->dequant,
-                            (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
-                            (const dequant_val_type_nuq *)
-                                pd->dequant_val_nuq[dq],
-                            qcoeff, dqcoeff, eob,
-                            scan_order->scan, band);
-        break;
-      case TX_4X4:
-        highbd_quantize_nuq(coeff, 16, x->skip_block,
-                            p->quant, p->quant_shift, pd->dequant,
-                            (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
-                            (const dequant_val_type_nuq *)
-                                pd->dequant_val_nuq[dq],
-                            qcoeff, dqcoeff, eob,
-                            scan_order->scan, band);
-        break;
-      default:
-        assert(0);
+                                qcoeff, dqcoeff, eob,
+                                scan_order->scan, band);
+    } else {
+      highbd_quantize_nuq(coeff, get_tx2d_size(tx_size), x->skip_block,
+                          p->quant, p->quant_shift, pd->dequant,
+                          (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+                          (const dequant_val_type_nuq *)
+                          pd->dequant_val_nuq[dq],
+                          qcoeff, dqcoeff, eob,
+                          scan_order->scan, band);
     }
     return;
   }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
   fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
-  switch (tx_size) {
-    case TX_32X32:
-      quantize_32x32_nuq(coeff, 1024, x->skip_block,
-                         p->quant, p->quant_shift, pd->dequant,
-                         (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
-                         (const dequant_val_type_nuq *)
-                         pd->dequant_val_nuq[dq],
-                         qcoeff, dqcoeff, eob,
-                         scan_order->scan, band);
-      break;
-    case TX_16X16:
-      quantize_nuq(coeff, 256, x->skip_block,
-                   p->quant, p->quant_shift, pd->dequant,
-                   (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
-                   (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq],
-                   qcoeff, dqcoeff, eob,
-                   scan_order->scan, band);
-      break;
-    case TX_8X8:
-      quantize_nuq(coeff, 64, x->skip_block,
-                   p->quant, p->quant_shift, pd->dequant,
-                   (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
-                   (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq],
-                   qcoeff, dqcoeff, eob,
-                   scan_order->scan, band);
-      break;
-    case TX_4X4:
-      quantize_nuq(coeff, 16, x->skip_block,
-                   p->quant, p->quant_shift, pd->dequant,
-                   (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
-                   (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq],
-                   qcoeff, dqcoeff, eob,
-                   scan_order->scan, band);
-      break;
-    default:
-      assert(0);
-      break;
+  if (tx_size == TX_32X32) {
+    quantize_32x32_nuq(coeff, 1024, x->skip_block,
+                       p->quant, p->quant_shift, pd->dequant,
+                       (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+                       (const dequant_val_type_nuq *)
+                       pd->dequant_val_nuq[dq],
+                       qcoeff, dqcoeff, eob,
+                       scan_order->scan, band);
+  } else {
+    quantize_nuq(coeff, get_tx2d_size(tx_size), x->skip_block,
+                 p->quant, p->quant_shift, pd->dequant,
+                 (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+                 (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq],
+                 qcoeff, dqcoeff, eob,
+                 scan_order->scan, band);
   }
 }
 
@@ -645,99 +585,48 @@
   fwd_txfm_param.bd = xd->bd;
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     highbd_fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
-    switch (tx_size) {
-      case TX_32X32:
-        highbd_quantize_32x32_fp_nuq(coeff, 1024, x->skip_block,
-                                     p->quant_fp, pd->dequant,
-                                     (const cuml_bins_type_nuq *)
-                                         p->cuml_bins_nuq[dq],
-                                     (const dequant_val_type_nuq *)
-                                         pd->dequant_val_nuq[dq],
-                                     qcoeff, dqcoeff, eob,
-                                     scan_order->scan, band);
-        break;
-      case TX_16X16:
-        highbd_quantize_fp_nuq(coeff, 256, x->skip_block,
-                               p->quant_fp, pd->dequant,
-                               (const cuml_bins_type_nuq *)
-                                  p->cuml_bins_nuq[dq],
-                               (const dequant_val_type_nuq *)
-                                   pd->dequant_val_nuq[dq],
-                               qcoeff, dqcoeff, eob,
-                               scan_order->scan, band);
-        break;
-      case TX_8X8:
-        highbd_quantize_fp_nuq(coeff, 64, x->skip_block,
-                               p->quant_fp, pd->dequant,
-                               (const cuml_bins_type_nuq *)
-                                  p->cuml_bins_nuq[dq],
-                               (const dequant_val_type_nuq *)
-                                   pd->dequant_val_nuq[dq],
-                               qcoeff, dqcoeff, eob,
-                               scan_order->scan, band);
-        break;
-      case TX_4X4:
-        highbd_quantize_fp_nuq(coeff, 16, x->skip_block,
-                               p->quant_fp, pd->dequant,
-                               (const cuml_bins_type_nuq *)
+    if (tx_size == TX_32X32) {
+      highbd_quantize_32x32_fp_nuq(coeff, get_tx2d_size(tx_size), x->skip_block,
+                                   p->quant_fp, pd->dequant,
+                                   (const cuml_bins_type_nuq *)
                                    p->cuml_bins_nuq[dq],
-                               (const dequant_val_type_nuq *)
+                                   (const dequant_val_type_nuq *)
                                    pd->dequant_val_nuq[dq],
-                               qcoeff, dqcoeff, eob,
-                               scan_order->scan, band);
-        break;
-      default:
-        assert(0);
+                                   qcoeff, dqcoeff, eob,
+                                   scan_order->scan, band);
+    } else {
+      highbd_quantize_fp_nuq(coeff, get_tx2d_size(tx_size), x->skip_block,
+                             p->quant_fp, pd->dequant,
+                             (const cuml_bins_type_nuq *)
+                             p->cuml_bins_nuq[dq],
+                             (const dequant_val_type_nuq *)
+                             pd->dequant_val_nuq[dq],
+                             qcoeff, dqcoeff, eob,
+                             scan_order->scan, band);
     }
     return;
   }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
   fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
-  switch (tx_size) {
-    case TX_32X32:
-      quantize_32x32_fp_nuq(coeff, 1024, x->skip_block,
-                            p->quant_fp, pd->dequant,
-                            (const cuml_bins_type_nuq *)
-                                p->cuml_bins_nuq[dq],
-                            (const dequant_val_type_nuq *)
-                                pd->dequant_val_nuq[dq],
-                            qcoeff, dqcoeff, eob,
-                            scan_order->scan, band);
-      break;
-    case TX_16X16:
-      quantize_fp_nuq(coeff, 256, x->skip_block,
-                      p->quant_fp, pd->dequant,
-                      (const cuml_bins_type_nuq *)
+  if (tx_size == TX_32X32) {
+    quantize_32x32_fp_nuq(coeff, get_tx2d_size(tx_size), x->skip_block,
+                          p->quant_fp, pd->dequant,
+                          (const cuml_bins_type_nuq *)
                           p->cuml_bins_nuq[dq],
-                      (const dequant_val_type_nuq *)
+                          (const dequant_val_type_nuq *)
                           pd->dequant_val_nuq[dq],
-                      qcoeff, dqcoeff, eob,
-                      scan_order->scan, band);
-      break;
-    case TX_8X8:
-      quantize_fp_nuq(coeff, 64, x->skip_block,
-                      p->quant_fp, pd->dequant,
-                      (const cuml_bins_type_nuq *)
-                          p->cuml_bins_nuq[dq],
-                      (const dequant_val_type_nuq *)
-                          pd->dequant_val_nuq[dq],
-                      qcoeff, dqcoeff, eob,
-                      scan_order->scan, band);
-      break;
-    case TX_4X4:
-      quantize_fp_nuq(coeff, 16, x->skip_block,
-                      p->quant_fp, pd->dequant,
-                      (const cuml_bins_type_nuq *)
-                          p->cuml_bins_nuq[dq],
-                      (const dequant_val_type_nuq *)
-                          pd->dequant_val_nuq[dq],
-                      qcoeff, dqcoeff, eob,
-                      scan_order->scan, band);
-      break;
-    default:
-      assert(0);
-      break;
+                          qcoeff, dqcoeff, eob,
+                          scan_order->scan, band);
+  } else {
+    quantize_fp_nuq(coeff, get_tx2d_size(tx_size), x->skip_block,
+                    p->quant_fp, pd->dequant,
+                    (const cuml_bins_type_nuq *)
+                    p->cuml_bins_nuq[dq],
+                    (const dequant_val_type_nuq *)
+                    pd->dequant_val_nuq[dq],
+                    qcoeff, dqcoeff, eob,
+                    scan_order->scan, band);
   }
 }
 
@@ -773,79 +662,38 @@
   fwd_txfm_param.bd = xd->bd;
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     highbd_fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
-    switch (tx_size) {
-      case TX_32X32:
-        highbd_quantize_dc_32x32_nuq(coeff, 1024, x->skip_block,
-                                     p->quant[0], p->quant_shift[0],
-                                     pd->dequant[0],
-                                     p->cuml_bins_nuq[dq][0],
-                                     pd->dequant_val_nuq[dq][0],
-                                     qcoeff, dqcoeff, eob);
-        break;
-      case TX_16X16:
-        highbd_quantize_dc_nuq(coeff, 256, x->skip_block,
-                               p->quant[0], p->quant_shift[0],
-                               pd->dequant[0],
-                               p->cuml_bins_nuq[dq][0],
-                               pd->dequant_val_nuq[dq][0],
-                               qcoeff, dqcoeff, eob);
-        break;
-      case TX_8X8:
-        highbd_quantize_dc_nuq(coeff, 64, x->skip_block,
-                               p->quant[0], p->quant_shift[0],
-                               pd->dequant[0],
-                               p->cuml_bins_nuq[dq][0],
-                               pd->dequant_val_nuq[dq][0],
-                               qcoeff, dqcoeff, eob);
-        break;
-      case TX_4X4:
-        highbd_quantize_dc_nuq(coeff, 16, x->skip_block,
-                               p->quant[0], p->quant_shift[0],
-                               pd->dequant[0],
-                               p->cuml_bins_nuq[dq][0],
-                               pd->dequant_val_nuq[dq][0],
-                               qcoeff, dqcoeff, eob);
-        break;
-      default:
-        assert(0);
+    if (tx_size == TX_32X32) {
+      highbd_quantize_dc_32x32_nuq(coeff, get_tx2d_size(tx_size), x->skip_block,
+                                   p->quant[0], p->quant_shift[0],
+                                   pd->dequant[0],
+                                   p->cuml_bins_nuq[dq][0],
+                                   pd->dequant_val_nuq[dq][0],
+                                   qcoeff, dqcoeff, eob);
+    } else {
+      highbd_quantize_dc_nuq(coeff, get_tx2d_size(tx_size), x->skip_block,
+                             p->quant[0], p->quant_shift[0],
+                             pd->dequant[0],
+                             p->cuml_bins_nuq[dq][0],
+                             pd->dequant_val_nuq[dq][0],
+                             qcoeff, dqcoeff, eob);
     }
     return;
   }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
   fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
-  switch (tx_size) {
-    case TX_32X32:
-      quantize_dc_32x32_nuq(coeff, 1024, x->skip_block,
-                            p->quant[0], p->quant_shift[0], pd->dequant[0],
-                            p->cuml_bins_nuq[dq][0],
-                            pd->dequant_val_nuq[dq][0],
-                            qcoeff, dqcoeff, eob);
-      break;
-    case TX_16X16:
-      quantize_dc_nuq(coeff, 256, x->skip_block,
-                      p->quant[0], p->quant_shift[0], pd->dequant[0],
-                      p->cuml_bins_nuq[dq][0],
-                      pd->dequant_val_nuq[dq][0],
-                      qcoeff, dqcoeff, eob);
-      break;
-    case TX_8X8:
-      quantize_dc_nuq(coeff, 64, x->skip_block,
-                      p->quant[0], p->quant_shift[0], pd->dequant[0],
-                      p->cuml_bins_nuq[dq][0],
-                      pd->dequant_val_nuq[dq][0],
-                      qcoeff, dqcoeff, eob);
-      break;
-    case TX_4X4:
-      quantize_dc_nuq(coeff, 16, x->skip_block,
-                      p->quant[0], p->quant_shift[0], pd->dequant[0],
-                      p->cuml_bins_nuq[dq][0],
-                      pd->dequant_val_nuq[dq][0],
-                      qcoeff, dqcoeff, eob);
-      break;
-    default:
-      assert(0);
-      break;
+  if (tx_size == TX_32X32) {
+    quantize_dc_32x32_nuq(coeff, get_tx2d_size(tx_size), x->skip_block,
+                          p->quant[0], p->quant_shift[0], pd->dequant[0],
+                          p->cuml_bins_nuq[dq][0],
+                          pd->dequant_val_nuq[dq][0],
+                          qcoeff, dqcoeff, eob);
+  } else {
+    quantize_dc_nuq(coeff, get_tx2d_size(tx_size), x->skip_block,
+                    p->quant[0], p->quant_shift[0], pd->dequant[0],
+                    p->cuml_bins_nuq[dq][0],
+                    pd->dequant_val_nuq[dq][0],
+                    qcoeff, dqcoeff, eob);
   }
 }
 
@@ -882,76 +730,37 @@
   fwd_txfm_param.bd = xd->bd;
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     highbd_fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
-    switch (tx_size) {
-      case TX_32X32:
-        highbd_quantize_dc_32x32_fp_nuq(coeff, 1024, x->skip_block,
-                                        p->quant_fp[0], pd->dequant[0],
-                                        p->cuml_bins_nuq[dq][0],
-                                        pd->dequant_val_nuq[dq][0],
-                                        qcoeff, dqcoeff, eob);
-        break;
-      case TX_16X16:
-        highbd_quantize_dc_fp_nuq(coeff, 256, x->skip_block,
-                                  p->quant_fp[0], pd->dequant[0],
-                                  p->cuml_bins_nuq[dq][0],
-                                  pd->dequant_val_nuq[dq][0],
-                                  qcoeff, dqcoeff, eob);
-        break;
-      case TX_8X8:
-        highbd_quantize_dc_fp_nuq(coeff, 64, x->skip_block,
-                                  p->quant_fp[0], pd->dequant[0],
-                                  p->cuml_bins_nuq[dq][0],
-                                  pd->dequant_val_nuq[dq][0],
-                                  qcoeff, dqcoeff, eob);
-        break;
-      case TX_4X4:
-        highbd_quantize_dc_fp_nuq(coeff, 16, x->skip_block,
-                                  p->quant_fp[0], pd->dequant[0],
-                                  p->cuml_bins_nuq[dq][0],
-                                  pd->dequant_val_nuq[dq][0],
-                                  qcoeff, dqcoeff, eob);
-        break;
-      default:
-        assert(0);
+    if (tx_size == TX_32X32) {
+      highbd_quantize_dc_32x32_fp_nuq(coeff, get_tx2d_size(tx_size),
+                                      x->skip_block,
+                                      p->quant_fp[0], pd->dequant[0],
+                                      p->cuml_bins_nuq[dq][0],
+                                      pd->dequant_val_nuq[dq][0],
+                                      qcoeff, dqcoeff, eob);
+    } else {
+      highbd_quantize_dc_fp_nuq(coeff, get_tx2d_size(tx_size), x->skip_block,
+                                p->quant_fp[0], pd->dequant[0],
+                                p->cuml_bins_nuq[dq][0],
+                                pd->dequant_val_nuq[dq][0],
+                                qcoeff, dqcoeff, eob);
     }
     return;
   }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
   fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
-  switch (tx_size) {
-    case TX_32X32:
-      quantize_dc_32x32_fp_nuq(coeff, 1024, x->skip_block,
-                               p->quant_fp[0], pd->dequant[0],
-                               p->cuml_bins_nuq[dq][0],
-                               pd->dequant_val_nuq[dq][0],
-                               qcoeff, dqcoeff, eob);
-      break;
-    case TX_16X16:
-      quantize_dc_fp_nuq(coeff, 256, x->skip_block,
-                         p->quant_fp[0], pd->dequant[0],
-                         p->cuml_bins_nuq[dq][0],
-                         pd->dequant_val_nuq[dq][0],
-                         qcoeff, dqcoeff, eob);
-
-      break;
-    case TX_8X8:
-      quantize_dc_fp_nuq(coeff, 64, x->skip_block,
-                         p->quant_fp[0], pd->dequant[0],
-                         p->cuml_bins_nuq[dq][0],
-                         pd->dequant_val_nuq[dq][0],
-                         qcoeff, dqcoeff, eob);
-      break;
-    case TX_4X4:
-      quantize_dc_fp_nuq(coeff, 16, x->skip_block,
-                         p->quant_fp[0], pd->dequant[0],
-                         p->cuml_bins_nuq[dq][0],
-                         pd->dequant_val_nuq[dq][0],
-                         qcoeff, dqcoeff, eob);
-      break;
-    default:
-      assert(0);
-      break;
+  if (tx_size == TX_32X32) {
+    quantize_dc_32x32_fp_nuq(coeff, get_tx2d_size(tx_size), x->skip_block,
+                             p->quant_fp[0], pd->dequant[0],
+                             p->cuml_bins_nuq[dq][0],
+                             pd->dequant_val_nuq[dq][0],
+                             qcoeff, dqcoeff, eob);
+  } else {
+    quantize_dc_fp_nuq(coeff, get_tx2d_size(tx_size), x->skip_block,
+                       p->quant_fp[0], pd->dequant[0],
+                       p->cuml_bins_nuq[dq][0],
+                       pd->dequant_val_nuq[dq][0],
+                       qcoeff, dqcoeff, eob);
   }
 }
 #endif  // CONFIG_NEW_QUANT
@@ -1011,8 +820,10 @@
   }
 
 #if CONFIG_VAR_TX
-  for (i = 0; i < (1 << tx_size); ++i) {
+  for (i = 0; i < num_4x4_blocks_wide_txsize_lookup[tx_size]; ++i) {
     a[i] = a[0];
+  }
+  for (i = 0; i < num_4x4_blocks_high_txsize_lookup[tx_size]; ++i) {
     l[i] = l[0];
   }
 #endif
@@ -1076,10 +887,14 @@
     assert(bsl > 0);
     --bsl;
 
+#if CONFIG_EXT_TX
+    assert(tx_size < TX_SIZES);
+#endif  // CONFIG_EXT_TX
+
     for (i = 0; i < 4; ++i) {
       const int offsetr = blk_row + ((i >> 1) << bsl);
       const int offsetc = blk_col + ((i & 0x01) << bsl);
-      int step = 1 << (2 * (tx_size - 1));
+      int step = num_4x4_blocks_txsize_lookup[tx_size - 1];
 
       if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide)
         continue;
@@ -1165,7 +980,7 @@
     const int bh = num_4x4_blocks_wide_lookup[txb_size];
     int idx, idy;
     int block = 0;
-    int step = 1 << (max_tx_size * 2);
+    int step = num_4x4_blocks_txsize_lookup[max_tx_size];
     vp10_get_entropy_contexts(bsize, TX_4X4, pd, ctx.ta[plane], ctx.tl[plane]);
 #else
     const struct macroblockd_plane* const pd = &xd->plane[plane];
@@ -1242,12 +1057,15 @@
   uint16_t *eob = &p->eobs[block];
   const int src_stride = p->src.stride;
   const int dst_stride = pd->dst.stride;
-  const int tx1d_size = get_tx1d_size(tx_size);
+  const int tx1d_width = num_4x4_blocks_wide_txsize_lookup[tx_size] << 2;
+  const int tx1d_height = num_4x4_blocks_high_txsize_lookup[tx_size] << 2;
   ENTROPY_CONTEXT *a = NULL, *l = NULL;
   int ctx;
 
   INV_TXFM_PARAM inv_txfm_param;
 
+  assert(tx1d_width == tx1d_height);
+
   dst = &pd->dst.buf[4 * (blk_row * dst_stride + blk_col)];
   src = &p->src.buf[4 * (blk_row * src_stride + blk_col)];
   src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
@@ -1257,14 +1075,14 @@
                            dst_stride, blk_col, blk_row, plane);
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    vpx_highbd_subtract_block(tx1d_size, tx1d_size, src_diff, diff_stride, src,
-                              src_stride, dst, dst_stride, xd->bd);
+    vpx_highbd_subtract_block(tx1d_height, tx1d_width, src_diff, diff_stride,
+                              src, src_stride, dst, dst_stride, xd->bd);
   } else {
-    vpx_subtract_block(tx1d_size, tx1d_size, src_diff, diff_stride, src,
+    vpx_subtract_block(tx1d_height, tx1d_width, src_diff, diff_stride, src,
                        src_stride, dst, dst_stride);
   }
 #else
-  vpx_subtract_block(tx1d_size, tx1d_size, src_diff, diff_stride, src,
+  vpx_subtract_block(tx1d_height, tx1d_width, src_diff, diff_stride, src,
                      src_stride, dst, dst_stride);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
@@ -1274,8 +1092,8 @@
 
   if (args->enable_optimize_b) {
 #if CONFIG_NEW_QUANT
-  vp10_xform_quant_fp_nuq(x, plane, block, blk_row, blk_col, plane_bsize,
-                          tx_size, ctx);
+    vp10_xform_quant_fp_nuq(x, plane, block, blk_row, blk_col, plane_bsize,
+                            tx_size, ctx);
 #else  // CONFIG_NEW_QUANT
     vp10_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
                      VP10_XFORM_QUANT_FP);
diff --git a/vp10/encoder/hybrid_fwd_txfm.c b/vp10/encoder/hybrid_fwd_txfm.c
index a0e0fdc..d5cf827 100644
--- a/vp10/encoder/hybrid_fwd_txfm.c
+++ b/vp10/encoder/hybrid_fwd_txfm.c
@@ -61,6 +61,22 @@
   }
 }
 
+#if CONFIG_EXT_TX
+static void fwd_txfm_8x4(const int16_t *src_diff, tran_low_t *coeff,
+                         int diff_stride, TX_TYPE tx_type,
+                         FWD_TXFM_OPT fwd_txfm_opt) {
+  (void) fwd_txfm_opt;
+  vp10_fht8x4(src_diff, coeff, diff_stride, tx_type);
+}
+
+static void fwd_txfm_4x8(const int16_t *src_diff, tran_low_t *coeff,
+                         int diff_stride, TX_TYPE tx_type,
+                         FWD_TXFM_OPT fwd_txfm_opt) {
+  (void) fwd_txfm_opt;
+  vp10_fht4x8(src_diff, coeff, diff_stride, tx_type);
+}
+#endif  // CONFIG_EXT_TX
+
 static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
                          int diff_stride, TX_TYPE tx_type,
                          FWD_TXFM_OPT fwd_txfm_opt) {
@@ -214,6 +230,24 @@
   }
 }
 
+#if CONFIG_EXT_TX
+static void highbd_fwd_txfm_8x4(const int16_t *src_diff, tran_low_t *coeff,
+                                int diff_stride, TX_TYPE tx_type,
+                                FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
+  (void) fwd_txfm_opt;
+  (void) bd;
+  vp10_highbd_fht8x4(src_diff, coeff, diff_stride, tx_type);
+}
+
+static void highbd_fwd_txfm_4x8(const int16_t *src_diff, tran_low_t *coeff,
+                                int diff_stride, TX_TYPE tx_type,
+                                FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
+  (void) fwd_txfm_opt;
+  (void) bd;
+  vp10_highbd_fht4x8(src_diff, coeff, diff_stride, tx_type);
+}
+#endif  // CONFIG_EXT_TX
+
 static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
                                 int diff_stride, TX_TYPE tx_type,
                                 FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
@@ -344,6 +378,14 @@
     case TX_8X8:
       fwd_txfm_8x8(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt);
       break;
+#if CONFIG_EXT_TX
+    case TX_4X8:
+      fwd_txfm_4x8(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt);
+      break;
+    case TX_8X4:
+      fwd_txfm_8x4(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt);
+      break;
+#endif  // CONFIG_EXT_TX
     case TX_4X4:
       fwd_txfm_4x4(src_diff, coeff, diff_stride, tx_type, lossless);
       break;
@@ -375,6 +417,16 @@
       highbd_fwd_txfm_8x8(src_diff, coeff, diff_stride, tx_type,
                           fwd_txfm_opt, bd);
       break;
+#if CONFIG_EXT_TX
+    case TX_4X8:
+      highbd_fwd_txfm_4x8(src_diff, coeff, diff_stride, tx_type,
+                          fwd_txfm_opt, bd);
+      break;
+    case TX_8X4:
+      highbd_fwd_txfm_8x4(src_diff, coeff, diff_stride, tx_type,
+                          fwd_txfm_opt, bd);
+      break;
+#endif  // CONFIG_EXT_TX
     case TX_4X4:
       highbd_fwd_txfm_4x4(src_diff, coeff, diff_stride, tx_type,
                                lossless, bd);
diff --git a/vp10/encoder/hybrid_fwd_txfm.h b/vp10/encoder/hybrid_fwd_txfm.h
index cd028bc..07b832c 100644
--- a/vp10/encoder/hybrid_fwd_txfm.h
+++ b/vp10/encoder/hybrid_fwd_txfm.h
@@ -38,22 +38,6 @@
                      int diff_stride, FWD_TXFM_PARAM *fwd_txfm_param);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-static INLINE int get_tx1d_size(TX_SIZE tx_size) {
-  switch (tx_size) {
-    case TX_32X32:
-      return 32;
-    case TX_16X16:
-      return 16;
-    case TX_8X8:
-      return 8;
-    case TX_4X4:
-      return 4;
-    default:
-      assert(0);
-      return -1;
-  }
-}
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp10/encoder/rd.c b/vp10/encoder/rd.c
index 028d578..cbdcc94 100644
--- a/vp10/encoder/rd.c
+++ b/vp10/encoder/rd.c
@@ -597,6 +597,18 @@
       memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w);
       memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h);
       break;
+#if CONFIG_EXT_TX
+    case TX_4X8:
+      memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w);
+      for (i = 0; i < num_4x4_h; i += 2)
+        t_left[i] = !!*(const uint16_t *)&left[i];
+      break;
+    case TX_8X4:
+      for (i = 0; i < num_4x4_w; i += 2)
+        t_above[i] = !!*(const uint16_t *)&above[i];
+      memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h);
+      break;
+#endif  // CONFIG_EXT_TX
     case TX_8X8:
       for (i = 0; i < num_4x4_w; i += 2)
         t_above[i] = !!*(const uint16_t *)&above[i];
@@ -622,9 +634,9 @@
 }
 
 void vp10_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
-                              const struct macroblockd_plane *pd,
-                              ENTROPY_CONTEXT t_above[2 * MAX_MIB_SIZE],
-                              ENTROPY_CONTEXT t_left[2 * MAX_MIB_SIZE]) {
+                               const struct macroblockd_plane *pd,
+                               ENTROPY_CONTEXT t_above[2 * MAX_MIB_SIZE],
+                               ENTROPY_CONTEXT t_left[2 * MAX_MIB_SIZE]) {
   const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
   get_entropy_contexts_plane(plane_bsize, tx_size, pd, t_above, t_left);
 }
diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index 8177212..97b6a6ff 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c
@@ -21,6 +21,7 @@
 #include "vpx_ports/system_state.h"
 
 #include "vp10/common/common.h"
+#include "vp10/common/common_data.h"
 #include "vp10/common/entropy.h"
 #include "vp10/common/entropymode.h"
 #include "vp10/common/idct.h"
@@ -927,12 +928,6 @@
  * can skip this if the last coefficient in this transform block, e.g. the
  * 16th coefficient in a 4x4 block or the 64th coefficient in a 8x8 block,
  * were non-zero). */
-static const int16_t band_counts[TX_SIZES][8] = {
-  { 1, 2, 3, 4,  3,   16 - 13, 0 },
-  { 1, 2, 3, 4, 11,   64 - 21, 0 },
-  { 1, 2, 3, 4, 11,  256 - 21, 0 },
-  { 1, 2, 3, 4, 11, 1024 - 21, 0 },
-};
 static int cost_coeffs(MACROBLOCK *x,
                        int plane, int block,
 #if CONFIG_VAR_TX
@@ -948,11 +943,12 @@
   const struct macroblock_plane *p = &x->plane[plane];
   const struct macroblockd_plane *pd = &xd->plane[plane];
   const PLANE_TYPE type = pd->plane_type;
-  const int16_t *band_count = &band_counts[tx_size][1];
+  const uint16_t *band_count = &band_count_table[tx_size][1];
   const int eob = p->eobs[block];
   const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  const int tx_size_ctx = txsize_sqr_map[tx_size];
   unsigned int (*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
-                   x->token_costs[tx_size][type][is_inter_block(mbmi)];
+                   x->token_costs[tx_size_ctx][type][is_inter_block(mbmi)];
   uint8_t token_cache[MAX_TX_SQUARE];
 #if CONFIG_VAR_TX
   int pt = coeff_ctx;
@@ -1064,7 +1060,7 @@
   if (cpi->sf.use_transform_domain_distortion) {
     // Transform domain distortion computation is more accurate as it does
     // not involve an inverse transform, but it is less accurate.
-    const int ss_txfrm_size = tx_size << 1;
+    const int ss_txfrm_size = num_4x4_blocks_txsize_log2_lookup[tx_size];
     int64_t this_sse;
     int tx_type = get_tx_type(pd->plane_type, xd, block, tx_size);
     int shift = (MAX_TX_SCALE - get_tx_scale(xd, tx_type, tx_size)) * 2;
@@ -1081,7 +1077,8 @@
     *out_sse = this_sse >> shift;
   } else {
     const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
-    const int bs = 4*num_4x4_blocks_wide_lookup[tx_bsize];
+    const int bsw = 4 * num_4x4_blocks_wide_lookup[tx_bsize];
+    const int bsh = 4 * num_4x4_blocks_high_lookup[tx_bsize];
     const int src_stride = x->plane[plane].src.stride;
     const int dst_stride = xd->plane[plane].dst.stride;
     const int src_idx = 4 * (blk_row * src_stride + blk_col);
@@ -1121,13 +1118,13 @@
         recon = CONVERT_TO_BYTEPTR(recon);
         inv_txfm_param.bd = xd->bd;
         vpx_highbd_convolve_copy(dst, dst_stride, recon, MAX_TX_SIZE,
-                                 NULL, 0, NULL, 0, bs, bs, xd->bd);
+                                 NULL, 0, NULL, 0, bsw, bsh, xd->bd);
         highbd_inv_txfm_add(dqcoeff, recon, MAX_TX_SIZE, &inv_txfm_param);
       } else
 #endif  // CONFIG_VP9_HIGHBITDEPTH
       {
         vpx_convolve_copy(dst, dst_stride, recon, MAX_TX_SIZE,
-                          NULL, 0, NULL, 0, bs, bs);
+                          NULL, 0, NULL, 0, bsw, bsh);
         inv_txfm_add(dqcoeff, recon, MAX_TX_SIZE, &inv_txfm_param);
       }
 
@@ -1159,6 +1156,29 @@
 #endif  // CONFIG_VAR_TX
 }
 
+static uint64_t sum_squares_2d(const int16_t *diff, int diff_stride,
+                               TX_SIZE tx_size) {
+  uint64_t sse;
+  switch (tx_size) {
+#if CONFIG_EXT_TX
+    case TX_4X8:
+      sse = vpx_sum_squares_2d_i16(diff, diff_stride, 4) +
+          vpx_sum_squares_2d_i16(diff + 4 * diff_stride, diff_stride, 4);
+      break;
+    case TX_8X4:
+      sse = vpx_sum_squares_2d_i16(diff, diff_stride, 4) +
+          vpx_sum_squares_2d_i16(diff + 4, diff_stride, 4);
+      break;
+#endif  // CONFIG_EXT_TX
+    default:
+      assert(tx_size < TX_SIZES);
+      sse = vpx_sum_squares_2d_i16(
+          diff, diff_stride, num_4x4_blocks_wide_txsize_lookup[tx_size] << 2);
+      break;
+  }
+  return sse;
+}
+
 static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
                           BLOCK_SIZE plane_bsize,
                           TX_SIZE tx_size, void *arg) {
@@ -1188,7 +1208,6 @@
     } else {
       // Note that the encode block_intra call above already calls
       // inv_txfm_add, so we can't just call dist_block here.
-      const int bs = 4 << tx_size;
       const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
       const vpx_variance_fn_t variance = args->cpi->fn_ptr[tx_bsize].vf;
 
@@ -1204,8 +1223,8 @@
       const int16_t *diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
 
       unsigned int tmp;
+      sse = sum_squares_2d(diff, diff_stride, tx_size);
 
-      sse = vpx_sum_squares_2d_i16(diff, diff_stride, bs);
 #if CONFIG_VP9_HIGHBITDEPTH
       if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
         sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
@@ -1316,6 +1335,10 @@
   args.best_rd = ref_best_rd;
   args.use_fast_coef_costing = use_fast_coef_casting;
 
+#if CONFIG_EXT_TX
+  assert(tx_size < TX_SIZES);
+#endif  // CONFIG_EXT_TX
+
   if (plane == 0)
     xd->mi[0]->mbmi.tx_size = tx_size;
 
@@ -1361,6 +1384,7 @@
 #endif  // CONFIG_EXT_TX
 
   assert(skip_prob > 0);
+
   s0 = vp10_cost_bit(skip_prob, 0);
   s1 = vp10_cost_bit(skip_prob, 1);
 
@@ -2955,6 +2979,10 @@
   int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
   int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
 
+#if CONFIG_EXT_TX
+  assert(tx_size < TX_SIZES);
+#endif  // CONFIG_EXT_TX
+
   if (xd->mb_to_bottom_edge < 0)
     max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y);
   if (xd->mb_to_right_edge < 0)
@@ -3087,6 +3115,10 @@
   int tmp_eob = 0;
   int zero_blk_rate;
 
+#if CONFIG_EXT_TX
+  assert(tx_size < TX_SIZES);
+#endif  // CONFIG_EXT_TX
+
   if (ref_best_rd < 0) {
     *is_cost_valid = 0;
     return;
@@ -3158,7 +3190,7 @@
   if (tx_size > TX_4X4) {
     BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
     int bsl = b_height_log2_lookup[bsize];
-    int sub_step = 1 << (2 * (tx_size - 1));
+    int sub_step = num_4x4_blocks_txsize_lookup[tx_size - 1];
     int i;
     int this_rate;
     int64_t this_dist;
@@ -3167,6 +3199,9 @@
     int this_cost_valid = 1;
     int64_t tmp_rd = 0;
 
+#if CONFIG_EXT_TX
+    assert(tx_size < TX_SIZES);
+#endif  // CONFIG_EXT_TX
     --bsl;
     for (i = 0; i < 4 && this_cost_valid; ++i) {
       int offsetr = (i >> 1) << bsl;
@@ -3191,13 +3226,15 @@
 
   if (this_rd < sum_rd) {
     int idx, idy;
-    for (i = 0; i < (1 << tx_size); ++i)
-      pta[i] = ptl[i] = !(tmp_eob == 0);
+    for (i = 0; i < num_4x4_blocks_wide_txsize_lookup[tx_size]; ++i)
+      pta[i] = !(tmp_eob == 0);
+    for (i = 0; i < num_4x4_blocks_high_txsize_lookup[tx_size]; ++i)
+      ptl[i] = !(tmp_eob == 0);
     txfm_partition_update(tx_above + (blk_col >> 1),
                           tx_left + (blk_row >> 1), tx_size);
     inter_tx_size[0][0] = tx_size;
-    for (idy = 0; idy < (1 << tx_size) / 2; ++idy)
-      for (idx = 0; idx < (1 << tx_size) / 2; ++idx)
+    for (idy = 0; idy < num_4x4_blocks_high_txsize_lookup[tx_size] / 2; ++idy)
+      for (idx = 0; idx < num_4x4_blocks_wide_txsize_lookup[tx_size] / 2; ++idx)
         inter_tx_size[idy][idx] = tx_size;
     mbmi->tx_size = tx_size;
     if (this_rd == INT64_MAX)
@@ -3453,6 +3490,10 @@
   int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
   int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
 
+#if CONFIG_EXT_TX
+  assert(tx_size < TX_SIZES);
+#endif  // CONFIG_EXT_TX
+
   if (xd->mb_to_bottom_edge < 0)
     max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y);
   if (xd->mb_to_right_edge < 0)
@@ -3487,13 +3528,13 @@
     coeff_ctx = combine_entropy_contexts(ta[0], tl[0]);
     vp10_tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block,
                        plane_bsize, coeff_ctx, rate, dist, bsse, skip);
-    for (i = 0; i < (1 << tx_size); ++i) {
+    for (i = 0; i < num_4x4_blocks_wide_txsize_lookup[tx_size]; ++i)
       ta[i] = !(p->eobs[block] == 0);
+    for (i = 0; i < num_4x4_blocks_high_txsize_lookup[tx_size]; ++i)
       tl[i] = !(p->eobs[block] == 0);
-    }
   } else {
     int bsl = b_width_log2_lookup[bsize];
-    int step = 1 << (2 * (tx_size - 1));
+    int step = num_4x4_blocks_txsize_lookup[tx_size - 1];
     int i;
 
     assert(bsl > 0);
@@ -3590,7 +3631,7 @@
 
   return is_cost_valid;
 }
-#endif
+#endif  // CONFIG_VAR_TX
 
 // Return value 0: early termination triggered, no valid rd cost available;
 //              1: rd cost values are valid.
@@ -4402,11 +4443,22 @@
   const uint8_t *const src =
       &p->src.buf[vp10_raster_block_offset(BLOCK_8X8, i, p->src.stride)];
   uint8_t *const dst = &pd->dst.buf[vp10_raster_block_offset(BLOCK_8X8, i,
-                                                            pd->dst.stride)];
+                                                             pd->dst.stride)];
   int64_t thisdistortion = 0, thissse = 0;
   int thisrate = 0;
-  TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, i, TX_4X4);
-  const scan_order *so = get_scan(TX_4X4, tx_type, 1);
+  TX_SIZE tx_size = mi->mbmi.tx_size;
+
+  TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, i, tx_size);
+  const scan_order *so = get_scan(tx_size, tx_type, 1);
+  const int num_4x4_w = num_4x4_blocks_wide_txsize_lookup[tx_size];
+  const int num_4x4_h = num_4x4_blocks_high_txsize_lookup[tx_size];
+
+#if CONFIG_EXT_TX && CONFIG_RECT_TX && !CONFIG_VAR_TX
+  assert(tx_size == max_txsize_rect_lookup[mi->mbmi.sb_type]);
+#else
+  assert(tx_size == TX_4X4);
+#endif  // CONFIG_EXT_TX && CONFIG_RECT_TX && !CONFIG_VAR_TX
+  assert(tx_type == DCT_DCT);
 
   vp10_build_inter_predictor_sub8x8(xd, 0, i, ir, ic, mi_row, mi_col);
 
@@ -4427,39 +4479,51 @@
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
   k = i;
-  for (idy = 0; idy < height / 4; ++idy) {
-    for (idx = 0; idx < width / 4; ++idx) {
-      int64_t dist, ssz, rd, rd1, rd2;
+  for (idy = 0; idy < height / 4; idy += num_4x4_h) {
+    for (idx = 0; idx < width / 4; idx += num_4x4_w) {
+      int64_t dist, ssz, rd, rd1, rd2, block;
       int coeff_ctx;
       k += (idy * 2 + idx);
+      if (tx_size == TX_4X4)
+        block = k;
+      else
+        block = (i ? 2 : 0);
       coeff_ctx = combine_entropy_contexts(*(ta + (k & 1)),
                                            *(tl + (k >> 1)));
 #if CONFIG_NEW_QUANT
-      vp10_xform_quant_fp_nuq(x, 0, k, idy + (i >> 1), idx + (i & 0x01),
-                              BLOCK_8X8, TX_4X4, coeff_ctx);
+      vp10_xform_quant_fp_nuq(x, 0, block, idy + (i >> 1), idx + (i & 0x01),
+                              BLOCK_8X8, tx_size, coeff_ctx);
 #else
-      vp10_xform_quant(x, 0, k, idy + (i >> 1), idx + (i & 0x01), BLOCK_8X8,
-                       TX_4X4, VP10_XFORM_QUANT_FP);
+      vp10_xform_quant(x, 0, block, idy + (i >> 1), idx + (i & 0x01), BLOCK_8X8,
+                       tx_size, VP10_XFORM_QUANT_FP);
 #endif  // CONFIG_NEW_QUANT
       if (xd->lossless[xd->mi[0]->mbmi.segment_id] == 0)
-        vp10_optimize_b(x, 0, k, TX_4X4, coeff_ctx);
-      dist_block(cpi, x, 0, k, idy + (i >> 1), idx + (i & 0x1), TX_4X4,
+        vp10_optimize_b(x, 0, block, tx_size, coeff_ctx);
+      dist_block(cpi, x, 0, block, idy + (i >> 1), idx + (i & 0x1), tx_size,
                  &dist, &ssz);
       thisdistortion += dist;
       thissse += ssz;
 #if CONFIG_VAR_TX
-      thisrate += cost_coeffs(x, 0, k, coeff_ctx,
-                              TX_4X4,
+      thisrate += cost_coeffs(x, 0, block, coeff_ctx,
+                              tx_size,
                               so->scan, so->neighbors,
                               cpi->sf.use_fast_coef_costing);
-      *(ta + (k & 1)) = !(p->eobs[k] == 0);
-      *(tl + (k >> 1)) = !(p->eobs[k] == 0);
+      *(ta + (k & 1)) = !(p->eobs[block] == 0);
+      *(tl + (k >> 1)) = !(p->eobs[block] == 0);
 #else
-      thisrate += cost_coeffs(x, 0, k, ta + (k & 1), tl + (k >> 1),
-                              TX_4X4,
+      thisrate += cost_coeffs(x, 0, block, ta + (k & 1), tl + (k >> 1),
+                              tx_size,
                               so->scan, so->neighbors,
                               cpi->sf.use_fast_coef_costing);
-#endif
+#if CONFIG_EXT_TX
+      if (tx_size == TX_8X4) {
+        *(ta + (k & 1) + 1) = *(ta + (k & 1));
+      }
+      if (tx_size == TX_4X8) {
+        *(tl + (k >> 1) + 1) = *(tl + (k >> 1));
+      }
+#endif  // CONFIG_EXT_TX
+#endif  // CONFIG_VAR_TX
       rd1 = RDCOST(x->rdmult, x->rddiv, thisrate, thisdistortion);
       rd2 = RDCOST(x->rdmult, x->rddiv, 0, thissse);
       rd = VPXMIN(rd1, rd2);
@@ -4951,6 +5015,11 @@
   const int has_second_rf = has_second_ref(mbmi);
   const int inter_mode_mask = cpi->sf.inter_mode_mask[bsize];
   MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+#if CONFIG_EXT_TX && CONFIG_RECT_TX && !CONFIG_VAR_TX
+  mbmi->tx_size = max_txsize_rect_lookup[bsize];
+#else
+  mbmi->tx_size = TX_4X4;
+#endif  // CONFIG_EXT_TX && CONFIG_RECT_TX && !CONFIG_VAR_TX
 
   vp10_zero(*bsi);
 
@@ -5020,8 +5089,8 @@
 #if CONFIG_EXT_INTER
                                        mv_ref_list,
 #endif  // CONFIG_EXT_INTER
-                                      &frame_mv[NEARESTMV][frame],
-                                      &frame_mv[NEARMV][frame]);
+                                       &frame_mv[NEARESTMV][frame],
+                                       &frame_mv[NEARMV][frame]);
 
 #if CONFIG_REF_MV
         tmp_ref_mv[ref] = frame_mv[NEARESTMV][mbmi->ref_frame[ref]];
@@ -5072,10 +5141,11 @@
 #if CONFIG_EXT_INTER
       for (this_mode = (has_second_rf ? NEAREST_NEARESTMV : NEARESTMV);
            this_mode <= (has_second_rf ? NEW_NEWMV : NEWFROMNEARMV);
-           ++this_mode) {
+           ++this_mode)
 #else
-      for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
+      for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode)
 #endif  // CONFIG_EXT_INTER
+      {
         const struct buf_2d orig_src = x->plane[0].src;
         struct buf_2d orig_pre[2];
         // This flag controls if the motion estimation will kick off. When it
@@ -5342,10 +5412,11 @@
             this_mode == NEWMV &&
 #endif  // CONFIG_EXT_INTER
 #if CONFIG_DUAL_FILTER
-            (mbmi->interp_filter[0] == EIGHTTAP_REGULAR || run_mv_search)) {
+            (mbmi->interp_filter[0] == EIGHTTAP_REGULAR || run_mv_search))
 #else
-            (mbmi->interp_filter == EIGHTTAP_REGULAR || run_mv_search)) {
+            (mbmi->interp_filter == EIGHTTAP_REGULAR || run_mv_search))
 #endif
+            {
           // adjust src pointers
           mi_buf_shift(x, i);
           if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
diff --git a/vp10/encoder/tokenize.c b/vp10/encoder/tokenize.c
index c25f8bc..734ae8b 100644
--- a/vp10/encoder/tokenize.c
+++ b/vp10/encoder/tokenize.c
@@ -393,7 +393,7 @@
 
 static INLINE int get_tx_eob(const struct segmentation *seg, int segment_id,
                              TX_SIZE tx_size) {
-  const int eob_max = 16 << (tx_size << 1);
+  const int eob_max = num_4x4_blocks_txsize_lookup[tx_size] << 4;  // 16 coeffs per 4x4 unit; table lookup so rect tx sizes (4x8/8x4) get the right count
   return segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
 }
 
@@ -463,21 +463,21 @@
   const scan_order *const so = get_scan(tx_size, tx_type, is_inter_block(mbmi));
   const int ref = is_inter_block(mbmi);
   unsigned int (*const counts)[COEFF_CONTEXTS][ENTROPY_TOKENS] =
-      td->rd_counts.coef_counts[tx_size][type][ref];
+      td->rd_counts.coef_counts[txsize_sqr_map[tx_size]][type][ref];
 #if CONFIG_ENTROPY
   vpx_prob (*coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
       cpi->subframe_stats.coef_probs_buf[cpi->common.coef_probs_update_idx]
-                                        [tx_size][type][ref];
+                                        [txsize_sqr_map[tx_size]][type][ref];
 #else
   vpx_prob (*const coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
-      cpi->common.fc->coef_probs[tx_size][type][ref];
+      cpi->common.fc->coef_probs[txsize_sqr_map[tx_size]][type][ref];
 #endif  // CONFIG_ENTROPY
 #if CONFIG_ANS
   rans_dec_lut (*const coef_cdfs)[COEFF_CONTEXTS] =
-      cpi->common.fc->coef_cdfs[tx_size][type][ref];
+      cpi->common.fc->coef_cdfs[txsize_sqr_map[tx_size]][type][ref];
 #endif  // CONFIG_ANS
   unsigned int (*const eob_branch)[COEFF_CONTEXTS] =
-      td->counts->eob_branch[tx_size][type][ref];
+      td->counts->eob_branch[txsize_sqr_map[tx_size]][type][ref];
   const uint8_t *const band = get_band_translate(tx_size);
   const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size);
   int skip_eob = 0;
@@ -539,7 +539,7 @@
   int result = 1;
   struct is_skippable_args args = {x->plane[plane].eobs, &result};
   vp10_foreach_transformed_block_in_plane(&x->e_mbd, bsize, plane, is_skippable,
-                                         &args);
+                                          &args);
   return result;
 }
 
@@ -560,7 +560,7 @@
   int result = 0;
   struct is_skippable_args args = {x->plane[plane].eobs, &result};
   vp10_foreach_transformed_block_in_plane(&x->e_mbd, bsize, plane,
-                                         has_high_freq_coeff, &args);
+                                          has_high_freq_coeff, &args);
   return result;
 }
 
@@ -582,6 +582,9 @@
 
   int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
   int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
+
+  assert(tx_size < TX_SIZES);
+
   if (xd->mb_to_bottom_edge < 0)
     max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y);
   if (xd->mb_to_right_edge < 0)
@@ -608,7 +611,7 @@
     for (i = 0; i < 4; ++i) {
       const int offsetr = blk_row + ((i >> 1) << bsl);
       const int offsetc = blk_col + ((i & 0x01) << bsl);
-      int step = 1 << (2 * (tx_size - 1));
+      int step = num_4x4_blocks_txsize_lookup[tx_size - 1];
 
       if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide)
         continue;
@@ -659,7 +662,7 @@
     int bh = num_4x4_blocks_wide_lookup[txb_size];
     int idx, idy;
     int block = 0;
-    int step = 1 << (max_tx_size * 2);
+    int step = num_4x4_blocks_txsize_lookup[max_tx_size];
     for (idy = 0; idy < mi_height; idy += bh) {
       for (idx = 0; idx < mi_width; idx += bh) {
         tokenize_tx(td, t, dry_run, max_tx_size, plane_bsize, idy, idx,
@@ -674,7 +677,7 @@
     }
   }
 }
-#endif
+#endif  // CONFIG_VAR_TX
 
 void vp10_tokenize_sb(VP10_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
                      int dry_run, BLOCK_SIZE bsize) {