Merge "No need to store the deringing filter direction variance in an array" into nextgenv2
diff --git a/av1/common/av1_fwd_txfm2d_cfg.h b/av1/common/av1_fwd_txfm2d_cfg.h
index 49d324d..5a7c218 100644
--- a/av1/common/av1_fwd_txfm2d_cfg.h
+++ b/av1/common/av1_fwd_txfm2d_cfg.h
@@ -109,7 +109,7 @@
 };  // .txfm_type_row
 
 //  ---------------- config fwd_dct_dct_64 ----------------
-static const int8_t fwd_shift_dct_dct_64[3] = { 2, -2, -2 };
+static const int8_t fwd_shift_dct_dct_64[3] = { 0, -2, -2 };
 static const int8_t fwd_stage_range_col_dct_dct_64[12] = {
   13, 14, 15, 16, 17, 18, 19, 19, 19, 19, 19, 19
 };
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index f3b1328..597d5b2 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -61,25 +61,23 @@
     add_proto qw/void av1_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
     specialize qw/av1_iht4x4_16_add/;
 
-    if (aom_config("CONFIG_EXT_TX") eq "yes") {
-      add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht4x8_32_add/;
 
-      add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht8x4_32_add/;
 
-      add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht8x16_128_add/;
 
-      add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht16x8_128_add/;
 
-      add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht16x32_512_add/;
 
-      add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht32x16_512_add/;
-    }
 
     add_proto qw/void av1_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
     specialize qw/av1_iht8x8_64_add/;
@@ -90,25 +88,23 @@
     add_proto qw/void av1_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
     specialize qw/av1_iht4x4_16_add sse2/;
 
-    if (aom_config("CONFIG_EXT_TX") eq "yes") {
-      add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht4x8_32_add sse2/;
 
-      add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht8x4_32_add sse2/;
 
-      add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht8x16_128_add sse2/;
 
-      add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht16x8_128_add sse2/;
 
-      add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht16x32_512_add sse2/;
 
-      add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht32x16_512_add sse2/;
-    }
 
     add_proto qw/void av1_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
     specialize qw/av1_iht8x8_64_add sse2/;
@@ -122,25 +118,23 @@
     add_proto qw/void av1_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
     specialize qw/av1_iht4x4_16_add/;
 
-    if (aom_config("CONFIG_EXT_TX") eq "yes") {
-      add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht4x8_32_add/;
 
-      add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht8x4_32_add/;
 
-      add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht8x16_128_add/;
 
-      add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht16x8_128_add/;
 
-      add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht16x32_512_add/;
 
-      add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht32x16_512_add/;
-    }
 
     add_proto qw/void av1_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
     specialize qw/av1_iht8x8_64_add/;
@@ -151,25 +145,23 @@
     add_proto qw/void av1_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
     specialize qw/av1_iht4x4_16_add sse2 neon dspr2/;
 
-    if (aom_config("CONFIG_EXT_TX") eq "yes") {
-      add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht4x8_32_add sse2/;
 
-      add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht8x4_32_add sse2/;
 
-      add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht8x16_128_add sse2/;
 
-      add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht16x8_128_add sse2/;
 
-      add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht16x32_512_add sse2/;
 
-      add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht32x16_512_add sse2/;
-    }
 
     add_proto qw/void av1_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
     specialize qw/av1_iht8x8_64_add sse2 neon dspr2/;
@@ -283,25 +275,23 @@
   add_proto qw/void av1_highbd_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
   specialize qw/av1_highbd_iht4x4_16_add/;
 
-  if (aom_config("CONFIG_EXT_TX") eq "yes") {
-    add_proto qw/void av1_highbd_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+  add_proto qw/void av1_highbd_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
     specialize qw/av1_highbd_iht4x8_32_add/;
 
-    add_proto qw/void av1_highbd_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+  add_proto qw/void av1_highbd_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
     specialize qw/av1_highbd_iht8x4_32_add/;
 
-    add_proto qw/void av1_highbd_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+  add_proto qw/void av1_highbd_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
     specialize qw/av1_highbd_iht8x16_128_add/;
 
-    add_proto qw/void av1_highbd_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+  add_proto qw/void av1_highbd_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
     specialize qw/av1_highbd_iht16x8_128_add/;
 
-    add_proto qw/void av1_highbd_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+  add_proto qw/void av1_highbd_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
     specialize qw/av1_highbd_iht16x32_512_add/;
 
-    add_proto qw/void av1_highbd_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+  add_proto qw/void av1_highbd_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
     specialize qw/av1_highbd_iht32x16_512_add/;
-  }
 
   add_proto qw/void av1_highbd_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
   specialize qw/av1_highbd_iht8x8_64_add/;
@@ -394,26 +384,29 @@
 add_proto qw/void av1_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
 specialize qw/av1_fht32x32 avx2/;
 
-if (aom_config("CONFIG_EXT_TX") eq "yes") {
-  add_proto qw/void av1_fht4x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/av1_fht4x8 sse2/;
-
-  add_proto qw/void av1_fht8x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/av1_fht8x4 sse2/;
-
-  add_proto qw/void av1_fht8x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/av1_fht8x16 sse2/;
-
-  add_proto qw/void av1_fht16x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/av1_fht16x8 sse2/;
-
-  add_proto qw/void av1_fht16x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/av1_fht16x32 sse2/;
-
-  add_proto qw/void av1_fht32x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/av1_fht32x16 sse2/;
+if (aom_config("CONFIG_TX64X64") eq "yes") {
+  add_proto qw/void av1_fht64x64/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/av1_fht64x64/;
 }
 
+add_proto qw/void av1_fht4x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+specialize qw/av1_fht4x8 sse2/;
+
+add_proto qw/void av1_fht8x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+specialize qw/av1_fht8x4 sse2/;
+
+add_proto qw/void av1_fht8x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+specialize qw/av1_fht8x16 sse2/;
+
+add_proto qw/void av1_fht16x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+specialize qw/av1_fht16x8 sse2/;
+
+add_proto qw/void av1_fht16x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+specialize qw/av1_fht16x32 sse2/;
+
+add_proto qw/void av1_fht32x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+specialize qw/av1_fht32x16 sse2/;
+
 if (aom_config("CONFIG_AOM_HIGHBITDEPTH") ne "yes") {
   if (aom_config("CONFIG_EXT_TX") ne "yes") {
     specialize qw/av1_fht4x4 msa/;
@@ -536,6 +529,11 @@
   add_proto qw/void av1_highbd_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
   specialize qw/av1_highbd_fht32x32/;
 
+  if (aom_config("CONFIG_TX64X64") eq "yes") {
+    add_proto qw/void av1_highbd_fht64x64/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+    specialize qw/av1_highbd_fht64x64/;
+  }
+
   add_proto qw/void av1_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/av1_highbd_fwht4x4/;
 
diff --git a/av1/common/enums.h b/av1/common/enums.h
index 35b9e5f..2ec83ec 100644
--- a/av1/common/enums.h
+++ b/av1/common/enums.h
@@ -134,18 +134,16 @@
 #if CONFIG_CB4X4
   TX_2X2,  // 2x2 transform
 #endif
-  TX_4X4,    // 4x4 transform
-  TX_8X8,    // 8x8 transform
-  TX_16X16,  // 16x16 transform
-  TX_32X32,  // 32x32 transform
-#if CONFIG_EXT_TX
+  TX_4X4,                   // 4x4 transform
+  TX_8X8,                   // 8x8 transform
+  TX_16X16,                 // 16x16 transform
+  TX_32X32,                 // 32x32 transform
   TX_4X8,                   // 4x8 transform
   TX_8X4,                   // 8x4 transform
   TX_8X16,                  // 8x16 transform
   TX_16X8,                  // 16x8 transform
   TX_16X32,                 // 16x32 transform
   TX_32X16,                 // 32x16 transform
-#endif                      // CONFIG_EXT_TX
   TX_SIZES_ALL,             // Includes rectangular transforms
   TX_SIZES = TX_32X32 + 1,  // Does NOT include rectangular transforms
   TX_INVALID = 255          // Invalid transform size
diff --git a/av1/common/idct.c b/av1/common/idct.c
index 96b34ee..223c577 100644
--- a/av1/common/idct.c
+++ b/av1/common/idct.c
@@ -58,6 +58,7 @@
   int i;
   for (i = 0; i < 32; ++i) output[i] = input[i] * 4;
 }
+#endif  // CONFIG_EXT_TX
 
 // For use in lieu of ADST
 static void ihalfright32_c(const tran_low_t *input, tran_low_t *output) {
@@ -74,7 +75,49 @@
   // Note overall scaling factor is 4 times orthogonal
 }
 
+#if CONFIG_TX64X64
+static void idct64_col_c(const tran_low_t *input, tran_low_t *output) {
+  int32_t in[64], out[64];
+  int i;
+  for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i];
+  av1_idct64_new(in, out, inv_cos_bit_col_dct_dct_64,
+                 inv_stage_range_col_dct_dct_64);
+  for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
+}
+
+static void idct64_row_c(const tran_low_t *input, tran_low_t *output) {
+  int32_t in[64], out[64];
+  int i;
+  for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i];
+  av1_idct64_new(in, out, inv_cos_bit_row_dct_dct_64,
+                 inv_stage_range_row_dct_dct_64);
+  for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
+}
+
+static void iidtx64_c(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  for (i = 0; i < 64; ++i)
+    output[i] = (tran_low_t)dct_const_round_shift(input[i] * 4 * Sqrt2);
+}
+
+// For use in lieu of ADST
+static void ihalfright64_c(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  tran_low_t inputhalf[32];
+  // Multiply input by sqrt(2)
+  for (i = 0; i < 32; ++i) {
+    inputhalf[i] = (tran_low_t)dct_const_round_shift(input[i] * Sqrt2);
+  }
+  for (i = 0; i < 32; ++i) {
+    output[i] = (tran_low_t)dct_const_round_shift(input[32 + i] * 4 * Sqrt2);
+  }
+  aom_idct32_c(inputhalf, output + 32);
+  // Note overall scaling factor is 4 * sqrt(2) times orthogonal
+}
+#endif  // CONFIG_TX64X64
+
 #if CONFIG_AOM_HIGHBITDEPTH
+#if CONFIG_EXT_TX
 static void highbd_iidtx4_c(const tran_low_t *input, tran_low_t *output,
                             int bd) {
   int i;
@@ -104,6 +147,7 @@
   (void)bd;
   for (i = 0; i < 32; ++i) output[i] = input[i] * 4;
 }
+#endif  // CONFIG_EXT_TX
 
 static void highbd_ihalfright32_c(const tran_low_t *input, tran_low_t *output,
                                   int bd) {
@@ -120,9 +164,62 @@
   aom_highbd_idct16_c(inputhalf, output + 16, bd);
   // Note overall scaling factor is 4 times orthogonal
 }
+
+#if CONFIG_EXT_TX
+#if CONFIG_TX64X64
+static void highbd_iidtx64_c(const tran_low_t *input, tran_low_t *output,
+                             int bd) {
+  int i;
+  for (i = 0; i < 64; ++i)
+    output[i] =
+        HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[i] * 4 * Sqrt2), bd);
+}
+
+// For use in lieu of ADST
+static void highbd_ihalfright64_c(const tran_low_t *input, tran_low_t *output,
+                                  int bd) {
+  int i;
+  tran_low_t inputhalf[32];
+  // Multiply input by sqrt(2)
+  for (i = 0; i < 32; ++i) {
+    inputhalf[i] =
+        HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[i] * Sqrt2), bd);
+  }
+  for (i = 0; i < 32; ++i) {
+    output[i] = HIGHBD_WRAPLOW(
+        highbd_dct_const_round_shift(input[32 + i] * 4 * Sqrt2), bd);
+  }
+  aom_highbd_idct32_c(inputhalf, output + 32, bd);
+  // Note overall scaling factor is 4 * sqrt(2) times orthogonal
+}
+
+static void highbd_idct64_col_c(const tran_low_t *input, tran_low_t *output,
+                                int bd) {
+  int32_t in[64], out[64];
+  int i;
+  (void)bd;
+  for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i];
+  av1_idct64_new(in, out, inv_cos_bit_col_dct_dct_64,
+                 inv_stage_range_col_dct_dct_64);
+  for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
+}
+
+static void highbd_idct64_row_c(const tran_low_t *input, tran_low_t *output,
+                                int bd) {
+  int32_t in[64], out[64];
+  int i;
+  (void)bd;
+  for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i];
+  av1_idct64_new(in, out, inv_cos_bit_row_dct_dct_64,
+                 inv_stage_range_row_dct_dct_64);
+  for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
+}
+#endif  // CONFIG_TX64X64
+#endif  // CONFIG_EXT_TX
 #endif  // CONFIG_AOM_HIGHBITDEPTH
 
 // Inverse identity transform and add.
+#if CONFIG_EXT_TX
 static void inv_idtx_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                            int bs, int tx_type) {
   int r, c;
@@ -136,6 +233,7 @@
     }
   }
 }
+#endif  // CONFIG_EXT_TX
 
 #define FLIPUD_PTR(dest, stride, size)       \
   do {                                       \
@@ -143,6 +241,7 @@
     (stride) = -(stride);                    \
   } while (0)
 
+#if CONFIG_EXT_TX
 static void maybe_flip_strides(uint8_t **dst, int *dstride, tran_low_t **src,
                                int *sstride, int tx_type, int sizey,
                                int sizex) {
@@ -180,8 +279,10 @@
     default: assert(0); break;
   }
 }
+#endif  // CONFIG_EXT_TX
 
 #if CONFIG_AOM_HIGHBITDEPTH
+#if CONFIG_EXT_TX
 static void highbd_inv_idtx_add_c(const tran_low_t *input, uint8_t *dest8,
                                   int stride, int bs, int tx_type, int bd) {
   int r, c;
@@ -235,8 +336,8 @@
     default: assert(0); break;
   }
 }
-#endif  // CONFIG_AOM_HIGHBITDEPTH
 #endif  // CONFIG_EXT_TX
+#endif  // CONFIG_AOM_HIGHBITDEPTH
 
 void av1_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                          int tx_type) {
@@ -301,7 +402,6 @@
   }
 }
 
-#if CONFIG_EXT_TX
 void av1_iht4x8_32_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                          int tx_type) {
   static const transform_2d IHT_4x8[] = {
@@ -309,6 +409,7 @@
     { aom_iadst8_c, aom_idct4_c },   // ADST_DCT
     { aom_idct8_c, aom_iadst4_c },   // DCT_ADST
     { aom_iadst8_c, aom_iadst4_c },  // ADST_ADST
+#if CONFIG_EXT_TX
     { aom_iadst8_c, aom_idct4_c },   // FLIPADST_DCT
     { aom_idct8_c, aom_iadst4_c },   // DCT_FLIPADST
     { aom_iadst8_c, aom_iadst4_c },  // FLIPADST_FLIPADST
@@ -321,6 +422,7 @@
     { iidtx8_c, aom_iadst4_c },      // H_ADST
     { aom_iadst8_c, iidtx4_c },      // V_FLIPADST
     { iidtx8_c, aom_iadst4_c },      // H_FLIPADST
+#endif
   };
 
   const int n = 4;
@@ -343,7 +445,9 @@
     IHT_4x8[tx_type].cols(out[i], out[i]);
   }
 
+#if CONFIG_EXT_TX
   maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n2, n);
+#endif
 
   // Sum with the destination
   for (i = 0; i < n2; ++i) {
@@ -362,6 +466,7 @@
     { aom_iadst4_c, aom_idct8_c },   // ADST_DCT
     { aom_idct4_c, aom_iadst8_c },   // DCT_ADST
     { aom_iadst4_c, aom_iadst8_c },  // ADST_ADST
+#if CONFIG_EXT_TX
     { aom_iadst4_c, aom_idct8_c },   // FLIPADST_DCT
     { aom_idct4_c, aom_iadst8_c },   // DCT_FLIPADST
     { aom_iadst4_c, aom_iadst8_c },  // FLIPADST_FLIPADST
@@ -374,6 +479,7 @@
     { iidtx4_c, aom_iadst8_c },      // H_ADST
     { aom_iadst4_c, iidtx8_c },      // V_FLIPADST
     { iidtx4_c, aom_iadst8_c },      // H_FLIPADST
+#endif
   };
   const int n = 4;
   const int n2 = 8;
@@ -396,7 +502,9 @@
     IHT_8x4[tx_type].cols(out[i], out[i]);
   }
 
+#if CONFIG_EXT_TX
   maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n, n2);
+#endif
 
   // Sum with the destination
   for (i = 0; i < n; ++i) {
@@ -415,6 +523,7 @@
     { aom_iadst16_c, aom_idct8_c },   // ADST_DCT
     { aom_idct16_c, aom_iadst8_c },   // DCT_ADST
     { aom_iadst16_c, aom_iadst8_c },  // ADST_ADST
+#if CONFIG_EXT_TX
     { aom_iadst16_c, aom_idct8_c },   // FLIPADST_DCT
     { aom_idct16_c, aom_iadst8_c },   // DCT_FLIPADST
     { aom_iadst16_c, aom_iadst8_c },  // FLIPADST_FLIPADST
@@ -427,6 +536,7 @@
     { iidtx16_c, aom_iadst8_c },      // H_ADST
     { aom_iadst16_c, iidtx8_c },      // V_FLIPADST
     { iidtx16_c, aom_iadst8_c },      // H_FLIPADST
+#endif
   };
 
   const int n = 8;
@@ -449,7 +559,9 @@
     IHT_8x16[tx_type].cols(out[i], out[i]);
   }
 
+#if CONFIG_EXT_TX
   maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n2, n);
+#endif
 
   // Sum with the destination
   for (i = 0; i < n2; ++i) {
@@ -468,6 +580,7 @@
     { aom_iadst8_c, aom_idct16_c },   // ADST_DCT
     { aom_idct8_c, aom_iadst16_c },   // DCT_ADST
     { aom_iadst8_c, aom_iadst16_c },  // ADST_ADST
+#if CONFIG_EXT_TX
     { aom_iadst8_c, aom_idct16_c },   // FLIPADST_DCT
     { aom_idct8_c, aom_iadst16_c },   // DCT_FLIPADST
     { aom_iadst8_c, aom_iadst16_c },  // FLIPADST_FLIPADST
@@ -480,6 +593,7 @@
     { iidtx8_c, aom_iadst16_c },      // H_ADST
     { aom_iadst8_c, iidtx16_c },      // V_FLIPADST
     { iidtx8_c, aom_iadst16_c },      // H_FLIPADST
+#endif
   };
   const int n = 8;
   const int n2 = 16;
@@ -502,7 +616,9 @@
     IHT_16x8[tx_type].cols(out[i], out[i]);
   }
 
+#if CONFIG_EXT_TX
   maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n, n2);
+#endif
 
   // Sum with the destination
   for (i = 0; i < n; ++i) {
@@ -521,6 +637,7 @@
     { ihalfright32_c, aom_idct16_c },   // ADST_DCT
     { aom_idct32_c, aom_iadst16_c },    // DCT_ADST
     { ihalfright32_c, aom_iadst16_c },  // ADST_ADST
+#if CONFIG_EXT_TX
     { ihalfright32_c, aom_idct16_c },   // FLIPADST_DCT
     { aom_idct32_c, aom_iadst16_c },    // DCT_FLIPADST
     { ihalfright32_c, aom_iadst16_c },  // FLIPADST_FLIPADST
@@ -533,6 +650,7 @@
     { iidtx32_c, aom_iadst16_c },       // H_ADST
     { ihalfright32_c, iidtx16_c },      // V_FLIPADST
     { iidtx32_c, aom_iadst16_c },       // H_FLIPADST
+#endif
   };
 
   const int n = 16;
@@ -555,7 +673,9 @@
     IHT_16x32[tx_type].cols(out[i], out[i]);
   }
 
+#if CONFIG_EXT_TX
   maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n2, n);
+#endif
 
   // Sum with the destination
   for (i = 0; i < n2; ++i) {
@@ -574,6 +694,7 @@
     { aom_iadst16_c, aom_idct32_c },    // ADST_DCT
     { aom_idct16_c, ihalfright32_c },   // DCT_ADST
     { aom_iadst16_c, ihalfright32_c },  // ADST_ADST
+#if CONFIG_EXT_TX
     { aom_iadst16_c, aom_idct32_c },    // FLIPADST_DCT
     { aom_idct16_c, ihalfright32_c },   // DCT_FLIPADST
     { aom_iadst16_c, ihalfright32_c },  // FLIPADST_FLIPADST
@@ -586,6 +707,7 @@
     { iidtx16_c, ihalfright32_c },      // H_ADST
     { aom_iadst16_c, iidtx32_c },       // V_FLIPADST
     { iidtx16_c, ihalfright32_c },      // H_FLIPADST
+#endif
   };
   const int n = 16;
   const int n2 = 32;
@@ -608,7 +730,9 @@
     IHT_32x16[tx_type].cols(out[i], out[i]);
   }
 
+#if CONFIG_EXT_TX
   maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n, n2);
+#endif
 
   // Sum with the destination
   for (i = 0; i < n; ++i) {
@@ -619,7 +743,6 @@
     }
   }
 }
-#endif  // CONFIG_EXT_TX
 
 void av1_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                          int tx_type) {
@@ -763,10 +886,10 @@
     { iidtx32_c, iidtx32_c },            // IDTX
     { aom_idct32_c, iidtx32_c },         // V_DCT
     { iidtx32_c, aom_idct32_c },         // H_DCT
-    { ihalfright32_c, iidtx16_c },       // V_ADST
-    { iidtx16_c, ihalfright32_c },       // H_ADST
-    { ihalfright32_c, iidtx16_c },       // V_FLIPADST
-    { iidtx16_c, ihalfright32_c },       // H_FLIPADST
+    { ihalfright32_c, iidtx32_c },       // V_ADST
+    { iidtx32_c, ihalfright32_c },       // H_ADST
+    { ihalfright32_c, iidtx32_c },       // V_FLIPADST
+    { iidtx32_c, ihalfright32_c },       // H_FLIPADST
   };
 
   int i, j;
@@ -806,6 +929,68 @@
     }
   }
 }
+
+#if CONFIG_TX64X64
+void av1_iht64x64_4096_add_c(const tran_low_t *input, uint8_t *dest, int stride,
+                             int tx_type) {
+  static const transform_2d IHT_64[] = {
+    { idct64_col_c, idct64_row_c },      // DCT_DCT
+    { ihalfright64_c, idct64_row_c },    // ADST_DCT
+    { idct64_col_c, ihalfright64_c },    // DCT_ADST
+    { ihalfright64_c, ihalfright64_c },  // ADST_ADST
+    { ihalfright64_c, idct64_row_c },    // FLIPADST_DCT
+    { idct64_col_c, ihalfright64_c },    // DCT_FLIPADST
+    { ihalfright64_c, ihalfright64_c },  // FLIPADST_FLIPADST
+    { ihalfright64_c, ihalfright64_c },  // ADST_FLIPADST
+    { ihalfright64_c, ihalfright64_c },  // FLIPADST_ADST
+    { iidtx64_c, iidtx64_c },            // IDTX
+    { idct64_col_c, iidtx64_c },         // V_DCT
+    { iidtx64_c, idct64_row_c },         // H_DCT
+    { ihalfright64_c, iidtx64_c },       // V_ADST
+    { iidtx64_c, ihalfright64_c },       // H_ADST
+    { ihalfright64_c, iidtx64_c },       // V_FLIPADST
+    { iidtx64_c, ihalfright64_c },       // H_FLIPADST
+  };
+
+  int i, j;
+  tran_low_t tmp;
+  tran_low_t out[64][64];
+  tran_low_t *outp = &out[0][0];
+  int outstride = 64;
+
+  // inverse transform row vectors
+  for (i = 0; i < 64; ++i) {
+    IHT_64[tx_type].rows(input, out[i]);
+    for (j = 0; j < 64; ++j) out[i][j] = ROUND_POWER_OF_TWO(out[i][j], 1);
+    input += 64;
+  }
+
+  // transpose
+  for (i = 1; i < 64; i++) {
+    for (j = 0; j < i; j++) {
+      tmp = out[i][j];
+      out[i][j] = out[j][i];
+      out[j][i] = tmp;
+    }
+  }
+
+  // inverse transform column vectors
+  for (i = 0; i < 64; ++i) {
+    IHT_64[tx_type].cols(out[i], out[i]);
+  }
+
+  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 64, 64);
+
+  // Sum with the destination
+  for (i = 0; i < 64; ++i) {
+    for (j = 0; j < 64; ++j) {
+      int d = i * stride + j;
+      int s = j * outstride + i;
+      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
+    }
+  }
+}
+#endif  // CONFIG_TX64X64
 #endif  // CONFIG_EXT_TX
 
 // idct
@@ -905,7 +1090,6 @@
   }
 }
 
-#if CONFIG_EXT_TX
 void av1_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest, int stride,
                           int eob, TX_TYPE tx_type) {
   (void)eob;
@@ -941,7 +1125,6 @@
   (void)eob;
   av1_iht32x16_512_add(input, dest, stride, tx_type);
 }
-#endif  // CONFIG_EXT_TX
 
 void av1_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest, int stride,
                           int eob, TX_TYPE tx_type) {
@@ -1090,7 +1273,6 @@
   }
 }
 
-#if CONFIG_EXT_TX
 void av1_highbd_iht4x8_32_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int tx_type, int bd) {
   static const highbd_transform_2d HIGH_IHT_4x8[] = {
@@ -1098,6 +1280,7 @@
     { aom_highbd_iadst8_c, aom_highbd_idct4_c },   // ADST_DCT
     { aom_highbd_idct8_c, aom_highbd_iadst4_c },   // DCT_ADST
     { aom_highbd_iadst8_c, aom_highbd_iadst4_c },  // ADST_ADST
+#if CONFIG_EXT_TX
     { aom_highbd_iadst8_c, aom_highbd_idct4_c },   // FLIPADST_DCT
     { aom_highbd_idct8_c, aom_highbd_iadst4_c },   // DCT_FLIPADST
     { aom_highbd_iadst8_c, aom_highbd_iadst4_c },  // FLIPADST_FLIPADST
@@ -1110,6 +1293,7 @@
     { highbd_iidtx8_c, aom_highbd_iadst4_c },      // H_ADST
     { aom_highbd_iadst8_c, highbd_iidtx4_c },      // V_FLIPADST
     { highbd_iidtx8_c, aom_highbd_iadst4_c },      // H_FLIPADST
+#endif                                             // CONFIG_EXT_TX
   };
   const int n = 4;
   const int n2 = 8;
@@ -1136,7 +1320,9 @@
     HIGH_IHT_4x8[tx_type].cols(out[i], out[i], bd);
   }
 
+#if CONFIG_EXT_TX
   maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, n2, n);
+#endif  // CONFIG_EXT_TX
 
   // Sum with the destination
   for (i = 0; i < n2; ++i) {
@@ -1156,6 +1342,7 @@
     { aom_highbd_iadst4_c, aom_highbd_idct8_c },   // ADST_DCT
     { aom_highbd_idct4_c, aom_highbd_iadst8_c },   // DCT_ADST
     { aom_highbd_iadst4_c, aom_highbd_iadst8_c },  // ADST_ADST
+#if CONFIG_EXT_TX
     { aom_highbd_iadst4_c, aom_highbd_idct8_c },   // FLIPADST_DCT
     { aom_highbd_idct4_c, aom_highbd_iadst8_c },   // DCT_FLIPADST
     { aom_highbd_iadst4_c, aom_highbd_iadst8_c },  // FLIPADST_FLIPADST
@@ -1168,6 +1355,7 @@
     { highbd_iidtx4_c, aom_highbd_iadst8_c },      // H_ADST
     { aom_highbd_iadst4_c, highbd_iidtx8_c },      // V_FLIPADST
     { highbd_iidtx4_c, aom_highbd_iadst8_c },      // H_FLIPADST
+#endif                                             // CONFIG_EXT_TX
   };
   const int n = 4;
   const int n2 = 8;
@@ -1194,7 +1382,9 @@
     HIGH_IHT_8x4[tx_type].cols(out[i], out[i], bd);
   }
 
+#if CONFIG_EXT_TX
   maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, n, n2);
+#endif  // CONFIG_EXT_TX
 
   // Sum with the destination
   for (i = 0; i < n; ++i) {
@@ -1214,6 +1404,7 @@
     { aom_highbd_iadst16_c, aom_highbd_idct8_c },   // ADST_DCT
     { aom_highbd_idct16_c, aom_highbd_iadst8_c },   // DCT_ADST
     { aom_highbd_iadst16_c, aom_highbd_iadst8_c },  // ADST_ADST
+#if CONFIG_EXT_TX
     { aom_highbd_iadst16_c, aom_highbd_idct8_c },   // FLIPADST_DCT
     { aom_highbd_idct16_c, aom_highbd_iadst8_c },   // DCT_FLIPADST
     { aom_highbd_iadst16_c, aom_highbd_iadst8_c },  // FLIPADST_FLIPADST
@@ -1226,6 +1417,7 @@
     { highbd_iidtx16_c, aom_highbd_iadst8_c },      // H_ADST
     { aom_highbd_iadst16_c, highbd_iidtx8_c },      // V_FLIPADST
     { highbd_iidtx16_c, aom_highbd_iadst8_c },      // H_FLIPADST
+#endif                                              // CONFIG_EXT_TX
   };
   const int n = 8;
   const int n2 = 16;
@@ -1251,7 +1443,9 @@
     HIGH_IHT_8x16[tx_type].cols(out[i], out[i], bd);
   }
 
+#if CONFIG_EXT_TX
   maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, n2, n);
+#endif  // CONFIG_EXT_TX
 
   // Sum with the destination
   for (i = 0; i < n2; ++i) {
@@ -1271,6 +1465,7 @@
     { aom_highbd_iadst8_c, aom_highbd_idct16_c },   // ADST_DCT
     { aom_highbd_idct8_c, aom_highbd_iadst16_c },   // DCT_ADST
     { aom_highbd_iadst8_c, aom_highbd_iadst16_c },  // ADST_ADST
+#if CONFIG_EXT_TX
     { aom_highbd_iadst8_c, aom_highbd_idct16_c },   // FLIPADST_DCT
     { aom_highbd_idct8_c, aom_highbd_iadst16_c },   // DCT_FLIPADST
     { aom_highbd_iadst8_c, aom_highbd_iadst16_c },  // FLIPADST_FLIPADST
@@ -1283,6 +1478,7 @@
     { highbd_iidtx8_c, aom_highbd_iadst16_c },      // H_ADST
     { aom_highbd_iadst8_c, highbd_iidtx16_c },      // V_FLIPADST
     { highbd_iidtx8_c, aom_highbd_iadst16_c },      // H_FLIPADST
+#endif                                              // CONFIG_EXT_TX
   };
   const int n = 8;
   const int n2 = 16;
@@ -1308,7 +1504,9 @@
     HIGH_IHT_16x8[tx_type].cols(out[i], out[i], bd);
   }
 
+#if CONFIG_EXT_TX
   maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, n, n2);
+#endif  // CONFIG_EXT_TX
 
   // Sum with the destination
   for (i = 0; i < n; ++i) {
@@ -1328,6 +1526,7 @@
     { highbd_ihalfright32_c, aom_highbd_idct16_c },   // ADST_DCT
     { aom_highbd_idct32_c, aom_highbd_iadst16_c },    // DCT_ADST
     { highbd_ihalfright32_c, aom_highbd_iadst16_c },  // ADST_ADST
+#if CONFIG_EXT_TX
     { highbd_ihalfright32_c, aom_highbd_idct16_c },   // FLIPADST_DCT
     { aom_highbd_idct32_c, aom_highbd_iadst16_c },    // DCT_FLIPADST
     { highbd_ihalfright32_c, aom_highbd_iadst16_c },  // FLIPADST_FLIPADST
@@ -1340,6 +1539,7 @@
     { highbd_iidtx32_c, aom_highbd_iadst16_c },       // H_ADST
     { highbd_ihalfright32_c, highbd_iidtx16_c },      // V_FLIPADST
     { highbd_iidtx32_c, aom_highbd_iadst16_c },       // H_FLIPADST
+#endif                                                // CONFIG_EXT_TX
   };
   const int n = 16;
   const int n2 = 32;
@@ -1365,7 +1565,9 @@
     HIGH_IHT_16x32[tx_type].cols(out[i], out[i], bd);
   }
 
+#if CONFIG_EXT_TX
   maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, n2, n);
+#endif  // CONFIG_EXT_TX
 
   // Sum with the destination
   for (i = 0; i < n2; ++i) {
@@ -1385,6 +1587,7 @@
     { aom_highbd_iadst16_c, aom_highbd_idct32_c },    // ADST_DCT
     { aom_highbd_idct16_c, highbd_ihalfright32_c },   // DCT_ADST
     { aom_highbd_iadst16_c, highbd_ihalfright32_c },  // ADST_ADST
+#if CONFIG_EXT_TX
     { aom_highbd_iadst16_c, aom_highbd_idct32_c },    // FLIPADST_DCT
     { aom_highbd_idct16_c, highbd_ihalfright32_c },   // DCT_FLIPADST
     { aom_highbd_iadst16_c, highbd_ihalfright32_c },  // FLIPADST_FLIPADST
@@ -1397,6 +1600,7 @@
     { highbd_iidtx16_c, highbd_ihalfright32_c },      // H_ADST
     { aom_highbd_iadst16_c, highbd_iidtx32_c },       // V_FLIPADST
     { highbd_iidtx16_c, highbd_ihalfright32_c },      // H_FLIPADST
+#endif                                                // CONFIG_EXT_TX
   };
   const int n = 16;
   const int n2 = 32;
@@ -1422,7 +1626,9 @@
     HIGH_IHT_32x16[tx_type].cols(out[i], out[i], bd);
   }
 
+#if CONFIG_EXT_TX
   maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, n, n2);
+#endif  // CONFIG_EXT_TX
 
   // Sum with the destination
   for (i = 0; i < n; ++i) {
@@ -1434,7 +1640,6 @@
     }
   }
 }
-#endif  // CONFIG_EXT_TX
 
 void av1_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int tx_type, int bd) {
@@ -1630,6 +1835,71 @@
     }
   }
 }
+
+#if CONFIG_TX64X64
+void av1_highbd_iht64x64_4096_add_c(const tran_low_t *input, uint8_t *dest8,
+                                    int stride, int tx_type, int bd) {
+  static const highbd_transform_2d HIGH_IHT_64[] = {
+    { highbd_idct64_col_c, highbd_idct64_row_c },      // DCT_DCT
+    { highbd_ihalfright64_c, highbd_idct64_row_c },    // ADST_DCT
+    { highbd_idct64_col_c, highbd_ihalfright64_c },    // DCT_ADST
+    { highbd_ihalfright64_c, highbd_ihalfright64_c },  // ADST_ADST
+    { highbd_ihalfright64_c, highbd_idct64_row_c },    // FLIPADST_DCT
+    { highbd_idct64_col_c, highbd_ihalfright64_c },    // DCT_FLIPADST
+    { highbd_ihalfright64_c, highbd_ihalfright64_c },  // FLIPADST_FLIPADST
+    { highbd_ihalfright64_c, highbd_ihalfright64_c },  // ADST_FLIPADST
+    { highbd_ihalfright64_c, highbd_ihalfright64_c },  // FLIPADST_ADST
+    { highbd_iidtx64_c, highbd_iidtx64_c },            // IDTX
+    { highbd_idct64_col_c, highbd_iidtx64_c },         // V_DCT
+    { highbd_iidtx64_c, highbd_idct64_row_c },         // H_DCT
+    { highbd_ihalfright64_c, highbd_iidtx64_c },       // V_ADST
+    { highbd_iidtx64_c, highbd_ihalfright64_c },       // H_ADST
+    { highbd_ihalfright64_c, highbd_iidtx64_c },       // V_FLIPADST
+    { highbd_iidtx64_c, highbd_ihalfright64_c },       // H_FLIPADST
+  };
+
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  int i, j;
+  tran_low_t tmp;
+  tran_low_t out[64][64];
+  tran_low_t *outp = &out[0][0];
+  int outstride = 64;
+
+  // inverse transform row vectors
+  for (i = 0; i < 64; ++i) {
+    HIGH_IHT_64[tx_type].rows(input, out[i], bd);
+    for (j = 0; j < 64; ++j) out[i][j] = ROUND_POWER_OF_TWO(out[i][j], 1);
+    input += 64;
+  }
+
+  // transpose
+  for (i = 1; i < 64; i++) {
+    for (j = 0; j < i; j++) {
+      tmp = out[i][j];
+      out[i][j] = out[j][i];
+      out[j][i] = tmp;
+    }
+  }
+
+  // inverse transform column vectors
+  for (i = 0; i < 64; ++i) {
+    HIGH_IHT_64[tx_type].cols(out[i], out[i], bd);
+  }
+
+  maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 64, 64);
+
+  // Sum with the destination
+  for (i = 0; i < 64; ++i) {
+    for (j = 0; j < 64; ++j) {
+      int d = i * stride + j;
+      int s = j * outstride + i;
+      dest[d] =
+          highbd_clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5), bd);
+    }
+  }
+}
+#endif  // CONFIG_TX64X64
 #endif  // CONFIG_EXT_TX
 
 // idct
@@ -1737,7 +2007,6 @@
   }
 }
 
-#if CONFIG_EXT_TX
 void av1_highbd_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest,
                                  int stride, int eob, int bd, TX_TYPE tx_type) {
   (void)eob;
@@ -1777,7 +2046,6 @@
   (void)eob;
   av1_highbd_iht32x16_512_add_c(input, dest, stride, tx_type, bd);
 }
-#endif  // CONFIG_EXT_TX
 
 void av1_highbd_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest,
                                  int stride, int eob, int bd, TX_TYPE tx_type) {
@@ -1904,7 +2172,6 @@
       av1_inv_txfm_add_16x16(input, dest, stride, eob, tx_type);
       break;
     case TX_8X8: av1_inv_txfm_add_8x8(input, dest, stride, eob, tx_type); break;
-#if CONFIG_EXT_TX
     case TX_4X8: av1_inv_txfm_add_4x8(input, dest, stride, eob, tx_type); break;
     case TX_8X4: av1_inv_txfm_add_8x4(input, dest, stride, eob, tx_type); break;
     case TX_8X16:
@@ -1919,7 +2186,6 @@
     case TX_32X16:
       av1_inv_txfm_add_32x16(input, dest, stride, eob, tx_type);
       break;
-#endif  // CONFIG_EXT_TX
     case TX_4X4:
       // this is like av1_short_idct4x4 but has a special case around eob<=1
       // which is significant (not just an optimization) for the lossless
@@ -1949,7 +2215,6 @@
     case TX_8X8:
       av1_highbd_inv_txfm_add_8x8(input, dest, stride, eob, bd, tx_type);
       break;
-#if CONFIG_EXT_TX
     case TX_4X8:
       av1_highbd_inv_txfm_add_4x8(input, dest, stride, eob, bd, tx_type);
       break;
@@ -1968,7 +2233,6 @@
     case TX_32X16:
       av1_highbd_inv_txfm_add_32x16(input, dest, stride, eob, bd, tx_type);
       break;
-#endif  // CONFIG_EXT_TX
     case TX_4X4:
       // this is like av1_short_idct4x4 but has a special case around eob<=1
       // which is significant (not just an optimization) for the lossless
diff --git a/av1/common/idct.h b/av1/common/idct.h
index 1acc825..db9a6e2 100644
--- a/av1/common/idct.h
+++ b/av1/common/idct.h
@@ -67,12 +67,10 @@
 
 void av1_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest, int stride,
                           int eob, TX_TYPE tx_type, int lossless);
-#if CONFIG_EXT_TX
 void av1_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest, int stride,
                           int eob, TX_TYPE tx_type);
 void av1_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest, int stride,
                           int eob, TX_TYPE tx_type);
-#endif  // CONFIG_EXT_TX
 void av1_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest, int stride,
                           int eob, TX_TYPE tx_type);
 void av1_inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest, int stride,
@@ -95,12 +93,10 @@
 void av1_highbd_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest,
                                  int stride, int eob, int bd, TX_TYPE tx_type,
                                  int lossless);
-#if CONFIG_EXT_TX
 void av1_highbd_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest,
                                  int stride, int eob, int bd, TX_TYPE tx_type);
 void av1_highbd_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest,
                                  int stride, int eob, int bd, TX_TYPE tx_type);
-#endif  // CONFIG_EXT_TX
 void av1_highbd_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest,
                                  int stride, int eob, int bd, TX_TYPE tx_type);
 void av1_highbd_inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest,
diff --git a/av1/common/scan.c b/av1/common/scan.c
index cab3729..b2386b9 100644
--- a/av1/common/scan.c
+++ b/av1/common/scan.c
@@ -36,12 +36,12 @@
   0, 1, 4, 2, 5, 3, 6, 8, 9, 7, 12, 10, 13, 11, 14, 15,
 };
 
-#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t, default_scan_4x8[32]) = {
   0,  1,  4,  5,  2,  8,  6,  9,  10, 3,  12, 7,  13, 11, 14, 16,
   17, 15, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
 };
 
+#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x8[32]) = {
   0, 4, 8,  12, 16, 20, 24, 28, 1, 5, 9,  13, 17, 21, 25, 29,
   2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15, 19, 23, 27, 31,
@@ -51,12 +51,14 @@
   0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
   16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
 };
+#endif
 
 DECLARE_ALIGNED(16, static const int16_t, default_scan_8x4[32]) = {
   0,  1,  8,  9, 2,  16, 10, 17, 18, 3,  24, 11, 25, 19, 26, 4,
   12, 27, 20, 5, 28, 13, 21, 29, 6,  14, 22, 30, 7,  15, 23, 31,
 };
 
+#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x4[32]) = {
   0, 8,  16, 24, 1, 9,  17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
   4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31,
@@ -66,7 +68,7 @@
   0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
   16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
 };
-#endif  // CONFIG_EXT_TX
+#endif
 
 DECLARE_ALIGNED(16, static const int16_t, default_scan_8x8[64]) = {
   0,  8,  1,  16, 9,  2,  17, 24, 10, 3,  18, 25, 32, 11, 4,  26,
@@ -105,7 +107,6 @@
   58, 45, 38, 52, 31, 59, 53, 46, 60, 39, 61, 47, 54, 55, 62, 63,
 };
 
-#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t, default_scan_8x16[128]) = {
   0,   1,   8,   2,   9,   16,  3,   10,  17,  24,  4,   11,  18,  25,  32,
   5,   12,  19,  26,  33,  40,  6,   13,  20,  27,  34,  41,  48,  7,   14,
@@ -129,6 +130,7 @@
   122, 63, 78,  93,  108, 123, 79, 94, 109, 124, 95,  110, 125, 111, 126, 127,
 };
 
+#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x16[128]) = {
   0, 8,  16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96,  104, 112, 120,
   1, 9,  17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97,  105, 113, 121,
@@ -174,6 +176,7 @@
   105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
   120, 121, 122, 123, 124, 125, 126, 127,
 };
+#endif
 
 DECLARE_ALIGNED(16, static const int16_t, default_scan_16x32[512]) = {
   0,   1,   16,  2,   17,  32,  3,   18,  33,  48,  4,   19,  34,  49,  64,
@@ -251,6 +254,7 @@
   510, 511,
 };
 
+#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x32[512]) = {
   0,   16,  32,  48,  64,  80,  96,  112, 128, 144, 160, 176, 192, 208, 224,
   240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464,
@@ -1034,7 +1038,6 @@
   8, 3, 6, 8, 9, 6, 9, 9, 12, 7, 10, 10, 13, 11, 14, 0, 0,
 };
 
-#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t,
                 default_scan_4x8_neighbors[33 * MAX_NEIGHBORS]) = {
   0,  0,  0,  0,  0,  0,  1,  4,  1,  1,  4,  4,  2,  5,  5,  8,  6,
@@ -1043,6 +1046,7 @@
   24, 22, 25, 23, 26, 24, 24, 25, 28, 26, 29, 27, 30, 0,  0
 };
 
+#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t,
                 mcol_scan_4x8_neighbors[33 * MAX_NEIGHBORS]) = {
   0, 0, 0,  0,  4,  4,  8,  8,  12, 12, 16, 16, 20, 20, 24, 24, 0,
@@ -1058,6 +1062,7 @@
   13, 16, 14, 17, 15, 18, 16, 16, 17, 20, 18, 21, 19, 22, 20, 20, 21,
   24, 22, 25, 23, 26, 24, 24, 25, 28, 26, 29, 27, 30, 0,  0
 };
+#endif
 
 DECLARE_ALIGNED(16, static const int16_t,
                 default_scan_8x4_neighbors[33 * MAX_NEIGHBORS]) = {
@@ -1067,6 +1072,7 @@
   13, 14, 21, 22, 29, 6, 6,  7,  14, 15, 22, 23, 30, 0,  0
 };
 
+#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t,
                 mcol_scan_8x4_neighbors[33 * MAX_NEIGHBORS]) = {
   0,  0,  0,  0,  8,  8,  16, 16, 0,  0,  1,  8,  9,  16, 17, 24, 1,
@@ -1141,7 +1147,6 @@
   31, 38, 53, 60, 46, 53, 39, 46, 54, 61, 47, 54, 55, 62, 0,  0,
 };
 
-#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t,
                 default_scan_8x16_neighbors[129 * MAX_NEIGHBORS]) = {
   0,   0,   0,   0,   0,   0,   1,   1,   1,   8,   8,   8,   2,   2,   2,
@@ -1186,6 +1191,7 @@
   126, 0,   0
 };
 
+#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t,
                 mcol_scan_8x16_neighbors[129 * MAX_NEIGHBORS]) = {
   0,  0,  0,  0,  8,  8,  16, 16, 24, 24,  32,  32,  40,  40,  48,  48,
@@ -1271,6 +1277,7 @@
   104, 119, 105, 120, 106, 121, 107, 122, 108, 123, 109, 124, 110, 125, 111,
   126, 0,   0
 };
+#endif
 
 DECLARE_ALIGNED(16, static const int16_t,
                 default_scan_16x32_neighbors[513 * MAX_NEIGHBORS]) = {
@@ -1418,6 +1425,7 @@
   478, 509, 479, 510, 0,   0
 };
 
+#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t,
                 mcol_scan_16x32_neighbors[513 * MAX_NEIGHBORS]) = {
   0,   0,   0,   0,   16,  16,  32,  32,  48,  48,  64,  64,  80,  80,  96,
@@ -2841,12 +2849,12 @@
   0, 1, 3, 5, 2, 4, 6, 9, 7, 8, 11, 13, 10, 12, 14, 15,
 };
 
-#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_4x8[32]) = {
   0,  1,  4,  9,  2,  3,  6,  11, 5,  7,  8,  13, 10, 12, 14, 17,
   15, 16, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
 };
 
+#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_4x8[32]) = {
   0, 8,  16, 24, 1, 9,  17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
   4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31,
@@ -2856,12 +2864,14 @@
   0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
   16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
 };
+#endif
 
 DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x4[32]) = {
   0, 1, 4, 9,  15, 19, 24, 28, 2,  3,  6,  11, 16, 21, 25, 29,
   5, 7, 8, 13, 18, 22, 26, 30, 10, 12, 14, 17, 20, 23, 27, 31,
 };
 
+#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x4[32]) = {
   0, 4, 8,  12, 16, 20, 24, 28, 1, 5, 9,  13, 17, 21, 25, 29,
   2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15, 19, 23, 27, 31,
@@ -2910,7 +2920,6 @@
   25, 32, 39, 45, 50, 55, 59, 62, 33, 40, 46, 51, 54, 58, 61, 63,
 };
 
-#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x16[128]) = {
   0,  1,  3,   6,   10,  15,  21,  28,  2,  4,   7,   11,  16,  22,  29,  36,
   5,  8,  12,  17,  23,  30,  37,  44,  9,  13,  18,  24,  31,  38,  45,  52,
@@ -2933,6 +2942,7 @@
   35, 43, 51, 59, 67, 75, 83, 91, 99, 106, 112, 117, 121, 124, 126, 127,
 };
 
+#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x16[128]) = {
   0,  16, 32, 48, 64, 80, 96,  112, 1,  17, 33, 49, 65, 81, 97,  113,
   2,  18, 34, 50, 66, 82, 98,  114, 3,  19, 35, 51, 67, 83, 99,  115,
@@ -2978,6 +2988,7 @@
   105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
   120, 121, 122, 123, 124, 125, 126, 127,
 };
+#endif
 
 DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x32[512]) = {
   0,   1,   3,   6,   10,  15,  21,  28,  36,  45,  55,  66,  78,  91,  105,
@@ -3055,6 +3066,7 @@
   510, 511,
 };
 
+#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x32[512]) = {
   0,  32, 64, 96,  128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480,
   1,  33, 65, 97,  129, 161, 193, 225, 257, 289, 321, 353, 385, 417, 449, 481,
@@ -3810,7 +3822,6 @@
   { default_scan_32x32, av1_default_iscan_32x32, default_scan_32x32_neighbors },
 };
 
-#if CONFIG_EXT_TX
 const SCAN_ORDER av1_intra_scan_orders[TX_SIZES][TX_TYPES] = {
 #if CONFIG_CB4X4
   {
@@ -3819,6 +3830,7 @@
       { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors },
       { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
       { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+#if CONFIG_EXT_TX
       { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
       { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
       { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
@@ -3831,6 +3843,7 @@
       { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
       { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors },
       { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
+#endif  // CONFIG_EXT_TX
   },
 #endif
   {
@@ -3839,6 +3852,7 @@
       { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors },
       { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
       { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+#if CONFIG_EXT_TX
       { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
       { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
       { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
@@ -3851,6 +3865,7 @@
       { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
       { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors },
       { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
+#endif  // CONFIG_EXT_TX
   },
   {
       // TX_8X8
@@ -3858,6 +3873,7 @@
       { row_scan_8x8, av1_row_iscan_8x8, row_scan_8x8_neighbors },
       { col_scan_8x8, av1_col_iscan_8x8, col_scan_8x8_neighbors },
       { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+#if CONFIG_EXT_TX
       { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
       { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
       { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
@@ -3870,6 +3886,7 @@
       { col_scan_8x8, av1_col_iscan_8x8, col_scan_8x8_neighbors },
       { row_scan_8x8, av1_row_iscan_8x8, row_scan_8x8_neighbors },
       { col_scan_8x8, av1_col_iscan_8x8, col_scan_8x8_neighbors },
+#endif  // CONFIG_EXT_TX
   },
   {
       // TX_16X16
@@ -3879,6 +3896,7 @@
       { col_scan_16x16, av1_col_iscan_16x16, col_scan_16x16_neighbors },
       { default_scan_16x16, av1_default_iscan_16x16,
         default_scan_16x16_neighbors },
+#if CONFIG_EXT_TX
       { default_scan_16x16, av1_default_iscan_16x16,
         default_scan_16x16_neighbors },
       { default_scan_16x16, av1_default_iscan_16x16,
@@ -3896,11 +3914,13 @@
       { col_scan_16x16, av1_col_iscan_16x16, col_scan_16x16_neighbors },
       { row_scan_16x16, av1_row_iscan_16x16, row_scan_16x16_neighbors },
       { col_scan_16x16, av1_col_iscan_16x16, col_scan_16x16_neighbors },
+#endif  // CONFIG_EXT_TX
   },
   {
       // TX_32X32
       { default_scan_32x32, av1_default_iscan_32x32,
         default_scan_32x32_neighbors },
+#if CONFIG_EXT_TX
       { h2_scan_32x32, av1_h2_iscan_32x32, h2_scan_32x32_neighbors },
       { v2_scan_32x32, av1_v2_iscan_32x32, v2_scan_32x32_neighbors },
       { qtr_scan_32x32, av1_qtr_iscan_32x32, qtr_scan_32x32_neighbors },
@@ -3916,6 +3936,7 @@
       { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
       { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
       { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+#endif  // CONFIG_EXT_TX
   }
 };
 
@@ -3927,6 +3948,7 @@
       { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
       { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
       { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+#if CONFIG_EXT_TX
       { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
       { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
       { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
@@ -3939,6 +3961,7 @@
       { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors },
       { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
       { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors },
+#endif  // CONFIG_EXT_TX
   },
 #endif
   {
@@ -3947,6 +3970,7 @@
       { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
       { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
       { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+#if CONFIG_EXT_TX
       { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
       { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
       { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
@@ -3959,6 +3983,7 @@
       { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors },
       { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
       { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors },
+#endif  // CONFIG_EXT_TX
   },
   {
       // TX_8X8
@@ -3966,6 +3991,7 @@
       { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
       { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
       { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+#if CONFIG_EXT_TX
       { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
       { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
       { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
@@ -3978,6 +4004,7 @@
       { mcol_scan_8x8, av1_mcol_iscan_8x8, mcol_scan_8x8_neighbors },
       { mrow_scan_8x8, av1_mrow_iscan_8x8, mrow_scan_8x8_neighbors },
       { mcol_scan_8x8, av1_mcol_iscan_8x8, mcol_scan_8x8_neighbors },
+#endif  // CONFIG_EXT_TX
   },
   {
       // TX_16X16
@@ -3989,6 +4016,7 @@
         default_scan_16x16_neighbors },
       { default_scan_16x16, av1_default_iscan_16x16,
         default_scan_16x16_neighbors },
+#if CONFIG_EXT_TX
       { default_scan_16x16, av1_default_iscan_16x16,
         default_scan_16x16_neighbors },
       { default_scan_16x16, av1_default_iscan_16x16,
@@ -4006,11 +4034,13 @@
       { mcol_scan_16x16, av1_mcol_iscan_16x16, mcol_scan_16x16_neighbors },
       { mrow_scan_16x16, av1_mrow_iscan_16x16, mrow_scan_16x16_neighbors },
       { mcol_scan_16x16, av1_mcol_iscan_16x16, mcol_scan_16x16_neighbors },
+#endif  // CONFIG_EXT_TX
   },
   {
       // TX_32X32
       { default_scan_32x32, av1_default_iscan_32x32,
         default_scan_32x32_neighbors },
+#if CONFIG_EXT_TX
       { h2_scan_32x32, av1_h2_iscan_32x32, h2_scan_32x32_neighbors },
       { v2_scan_32x32, av1_v2_iscan_32x32, v2_scan_32x32_neighbors },
       { qtr_scan_32x32, av1_qtr_iscan_32x32, qtr_scan_32x32_neighbors },
@@ -4026,6 +4056,7 @@
       { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
       { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
       { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+#endif  // CONFIG_EXT_TX
   },
   {
       // TX_4X8
@@ -4033,6 +4064,7 @@
       { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
       { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
       { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+#if CONFIG_EXT_TX
       { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
       { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
       { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
@@ -4045,6 +4077,7 @@
       { mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors },
       { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors },
       { mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors },
+#endif  // CONFIG_EXT_TX
   },
   {
       // TX_8X4
@@ -4052,6 +4085,7 @@
       { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
       { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
       { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+#if CONFIG_EXT_TX
       { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
       { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
       { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
@@ -4064,6 +4098,7 @@
       { mcol_scan_8x4, av1_mcol_iscan_8x4, mcol_scan_8x4_neighbors },
       { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors },
       { mcol_scan_8x4, av1_mcol_iscan_8x4, mcol_scan_8x4_neighbors },
+#endif  // CONFIG_EXT_TX
   },
   {
       // TX_8X16
@@ -4075,6 +4110,7 @@
         default_scan_8x16_neighbors },
       { default_scan_8x16, av1_default_iscan_8x16,
         default_scan_8x16_neighbors },
+#if CONFIG_EXT_TX
       { default_scan_8x16, av1_default_iscan_8x16,
         default_scan_8x16_neighbors },
       { default_scan_8x16, av1_default_iscan_8x16,
@@ -4092,6 +4128,7 @@
       { mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors },
       { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors },
       { mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors },
+#endif  // CONFIG_EXT_TX
   },
   {
       // TX_16X8
@@ -4103,6 +4140,7 @@
         default_scan_16x8_neighbors },
       { default_scan_16x8, av1_default_iscan_16x8,
         default_scan_16x8_neighbors },
+#if CONFIG_EXT_TX
       { default_scan_16x8, av1_default_iscan_16x8,
         default_scan_16x8_neighbors },
       { default_scan_16x8, av1_default_iscan_16x8,
@@ -4120,6 +4158,7 @@
       { mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors },
       { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors },
       { mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors },
+#endif  // CONFIG_EXT_TX
   },
   {
       // TX_16X32
@@ -4131,6 +4170,7 @@
         default_scan_16x32_neighbors },
       { default_scan_16x32, av1_default_iscan_16x32,
         default_scan_16x32_neighbors },
+#if CONFIG_EXT_TX
       { default_scan_16x32, av1_default_iscan_16x32,
         default_scan_16x32_neighbors },
       { default_scan_16x32, av1_default_iscan_16x32,
@@ -4148,6 +4188,7 @@
       { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
       { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
       { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
+#endif  // CONFIG_EXT_TX
   },
   {
       // TX_32X16
@@ -4159,6 +4200,7 @@
         default_scan_32x16_neighbors },
       { default_scan_32x16, av1_default_iscan_32x16,
         default_scan_32x16_neighbors },
+#if CONFIG_EXT_TX
       { default_scan_32x16, av1_default_iscan_32x16,
         default_scan_32x16_neighbors },
       { default_scan_32x16, av1_default_iscan_32x16,
@@ -4176,49 +4218,9 @@
       { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
       { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
       { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
-  }
-};
-
-#else  // CONFIG_EXT_TX
-
-const SCAN_ORDER av1_intra_scan_orders[TX_SIZES][TX_TYPES] = {
-#if CONFIG_CB4X4
-  { // TX_2X2
-    { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
-    { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors },
-    { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
-    { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors } },
-#endif
-  { // TX_4X4
-    { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
-    { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors },
-    { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
-    { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors } },
-  { // TX_8X8
-    { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
-    { row_scan_8x8, av1_row_iscan_8x8, row_scan_8x8_neighbors },
-    { col_scan_8x8, av1_col_iscan_8x8, col_scan_8x8_neighbors },
-    { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors } },
-  { // TX_16X16
-    { default_scan_16x16, av1_default_iscan_16x16,
-      default_scan_16x16_neighbors },
-    { row_scan_16x16, av1_row_iscan_16x16, row_scan_16x16_neighbors },
-    { col_scan_16x16, av1_col_iscan_16x16, col_scan_16x16_neighbors },
-    { default_scan_16x16, av1_default_iscan_16x16,
-      default_scan_16x16_neighbors } },
-  {
-      // TX_32X32
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-  }
-};
 #endif  // CONFIG_EXT_TX
+  }
+};
 
 #if CONFIG_ADAPT_SCAN
 // TX_32X32 will has 1024 coefficients whose indexes can be represented in 10
diff --git a/av1/common/scan.h b/av1/common/scan.h
index af39993..2078e99 100644
--- a/av1/common/scan.h
+++ b/av1/common/scan.h
@@ -27,6 +27,7 @@
 
 extern const SCAN_ORDER av1_default_scan_orders[TX_SIZES];
 extern const SCAN_ORDER av1_intra_scan_orders[TX_SIZES][TX_TYPES];
+extern const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES];
 
 #if CONFIG_ADAPT_SCAN
 void av1_update_scan_prob(AV1_COMMON *cm, TX_SIZE tx_size, TX_TYPE tx_type,
@@ -87,7 +88,7 @@
   return &cm->fc->sc[tx_size][tx_type];
 #else  // CONFIG_ADAPT_SCAN
   (void)cm;
-#if CONFIG_EXT_TX
+#if CONFIG_EXT_TX || CONFIG_VAR_TX
   return is_inter ? &av1_inter_scan_orders[tx_size][tx_type]
                   : &av1_intra_scan_orders[tx_size][tx_type];
 #else
diff --git a/av1/common/warped_motion.c b/av1/common/warped_motion.c
index e5ed39d..f73e777 100644
--- a/av1/common/warped_motion.c
+++ b/av1/common/warped_motion.c
@@ -35,18 +35,18 @@
     const int x = *(points++), y = *(points++);
     if (subsampling_x)
       *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
-          ((x << (WARPEDMODEL_PREC_BITS + 1)) + mat[1]),
+          ((x * (1 << (WARPEDMODEL_PREC_BITS + 1))) + mat[1]),
           WARPEDDIFF_PREC_BITS + 1);
     else
       *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
-          ((x << WARPEDMODEL_PREC_BITS) + mat[1]), WARPEDDIFF_PREC_BITS);
+          ((x * (1 << WARPEDMODEL_PREC_BITS)) + mat[1]), WARPEDDIFF_PREC_BITS);
     if (subsampling_y)
       *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
-          ((y << (WARPEDMODEL_PREC_BITS + 1)) + mat[0]),
+          ((y * (1 << (WARPEDMODEL_PREC_BITS + 1))) + mat[0]),
           WARPEDDIFF_PREC_BITS + 1);
     else
       *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
-          ((y << WARPEDMODEL_PREC_BITS)) + mat[0], WARPEDDIFF_PREC_BITS);
+          ((y * (1 << WARPEDMODEL_PREC_BITS))) + mat[0], WARPEDDIFF_PREC_BITS);
     points += stride_points - 2;
     proj += stride_proj - 2;
   }
@@ -119,12 +119,12 @@
     y = (subsampling_y ? 4 * y + 1 : 2 * y);
 
     Z = (mat[7] * x + mat[6] * y + (1 << (WARPEDMODEL_ROW3HOMO_PREC_BITS + 1)));
-    xp = (mat[1] * x + mat[0] * y + 2 * mat[3])
-         << (WARPEDPIXEL_PREC_BITS + WARPEDMODEL_ROW3HOMO_PREC_BITS -
-             WARPEDMODEL_PREC_BITS);
-    yp = (mat[2] * x + mat[5] * y + 2 * mat[4])
-         << (WARPEDPIXEL_PREC_BITS + WARPEDMODEL_ROW3HOMO_PREC_BITS -
-             WARPEDMODEL_PREC_BITS);
+    xp = (mat[1] * x + mat[0] * y + 2 * mat[3]) *
+         (1 << (WARPEDPIXEL_PREC_BITS + WARPEDMODEL_ROW3HOMO_PREC_BITS -
+                WARPEDMODEL_PREC_BITS));
+    yp = (mat[2] * x + mat[5] * y + 2 * mat[4]) *
+         (1 << (WARPEDPIXEL_PREC_BITS + WARPEDMODEL_ROW3HOMO_PREC_BITS -
+                WARPEDMODEL_PREC_BITS));
 
     xp = xp > 0 ? (xp + Z / 2) / Z : (xp - Z / 2) / Z;
     yp = yp > 0 ? (yp + Z / 2) / Z : (yp - Z / 2) / Z;
@@ -220,9 +220,9 @@
     const int64_t v3 = x * (p[1] - p[-1]);
     const int64_t v4 = 2 * p[0];
     return (int32_t)ROUND_POWER_OF_TWO_SIGNED(
-        (v4 << (3 * WARPEDPIXEL_PREC_BITS)) +
-            (v3 << (2 * WARPEDPIXEL_PREC_BITS)) +
-            (v2 << WARPEDPIXEL_PREC_BITS) + v1,
+        (v4 * (1 << (3 * WARPEDPIXEL_PREC_BITS))) +
+            (v3 * (1 << (2 * WARPEDPIXEL_PREC_BITS))) +
+            (v2 * (1 << WARPEDPIXEL_PREC_BITS)) + v1,
         3 * WARPEDPIXEL_PREC_BITS + 1 - WARPEDPIXEL_FILTER_BITS);
   }
 }
@@ -246,10 +246,10 @@
                   i + k + 1 - WARPEDPIXEL_FILTER_TAPS / 2,
                   j + 1 - WARPEDPIXEL_FILTER_TAPS / 2);
     arr[k] = do_ntap_filter(arr_temp + WARPEDPIXEL_FILTER_TAPS / 2 - 1,
-                            y - (j << WARPEDPIXEL_PREC_BITS));
+                            y - (j * (1 << WARPEDPIXEL_PREC_BITS)));
   }
   val = do_ntap_filter(arr + WARPEDPIXEL_FILTER_TAPS / 2 - 1,
-                       x - (i << WARPEDPIXEL_PREC_BITS));
+                       x - (i * (1 << WARPEDPIXEL_PREC_BITS)));
   val = ROUND_POWER_OF_TWO_SIGNED(val, WARPEDPIXEL_FILTER_BITS * 2);
   return (uint8_t)clip_pixel(val);
 }
@@ -262,9 +262,10 @@
   for (k = 0; k < 4; ++k) {
     int32_t arr_temp[4];
     get_subcolumn(4, ref, arr_temp, stride, i + k - 1, j - 1);
-    arr[k] = do_cubic_filter(arr_temp + 1, y - (j << WARPEDPIXEL_PREC_BITS));
+    arr[k] =
+        do_cubic_filter(arr_temp + 1, y - (j * (1 << WARPEDPIXEL_PREC_BITS)));
   }
-  val = do_cubic_filter(arr + 1, x - (i << WARPEDPIXEL_PREC_BITS));
+  val = do_cubic_filter(arr + 1, x - (i * (1 << WARPEDPIXEL_PREC_BITS)));
   val = ROUND_POWER_OF_TWO_SIGNED(val, WARPEDPIXEL_FILTER_BITS * 2);
   return (uint8_t)clip_pixel(val);
 }
@@ -272,8 +273,8 @@
 static uint8_t bi_linear_filter(uint8_t *ref, int x, int y, int stride) {
   const int ix = x >> WARPEDPIXEL_PREC_BITS;
   const int iy = y >> WARPEDPIXEL_PREC_BITS;
-  const int sx = x - (ix << WARPEDPIXEL_PREC_BITS);
-  const int sy = y - (iy << WARPEDPIXEL_PREC_BITS);
+  const int sx = x - (ix * (1 << WARPEDPIXEL_PREC_BITS));
+  const int sy = y - (iy * (1 << WARPEDPIXEL_PREC_BITS));
   int32_t val;
   val = ROUND_POWER_OF_TWO_SIGNED(
       ref[iy * stride + ix] * (WARPEDPIXEL_PREC_SHIFTS - sy) *
@@ -289,8 +290,8 @@
                                 int height, int stride) {
   int ix = x >> WARPEDPIXEL_PREC_BITS;
   int iy = y >> WARPEDPIXEL_PREC_BITS;
-  int sx = x - (ix << WARPEDPIXEL_PREC_BITS);
-  int sy = y - (iy << WARPEDPIXEL_PREC_BITS);
+  int sx = x - (ix * (1 << WARPEDPIXEL_PREC_BITS));
+  int sy = y - (iy * (1 << WARPEDPIXEL_PREC_BITS));
   int32_t v;
 
   if (ix < 0 && iy < 0)
@@ -357,10 +358,10 @@
                          i + k + 1 - WARPEDPIXEL_FILTER_TAPS / 2,
                          j + 1 - WARPEDPIXEL_FILTER_TAPS / 2);
     arr[k] = do_ntap_filter(arr_temp + WARPEDPIXEL_FILTER_TAPS / 2 - 1,
-                            y - (j << WARPEDPIXEL_PREC_BITS));
+                            y - (j * (1 << WARPEDPIXEL_PREC_BITS)));
   }
   val = do_ntap_filter(arr + WARPEDPIXEL_FILTER_TAPS / 2 - 1,
-                       x - (i << WARPEDPIXEL_PREC_BITS));
+                       x - (i * (1 << WARPEDPIXEL_PREC_BITS)));
   val = ROUND_POWER_OF_TWO_SIGNED(val, WARPEDPIXEL_FILTER_BITS * 2);
   return (uint16_t)clip_pixel_highbd(val, bd);
 }
@@ -374,9 +375,10 @@
   for (k = 0; k < 4; ++k) {
     int32_t arr_temp[4];
     highbd_get_subcolumn(4, ref, arr_temp, stride, i + k - 1, j - 1);
-    arr[k] = do_cubic_filter(arr_temp + 1, y - (j << WARPEDPIXEL_PREC_BITS));
+    arr[k] =
+        do_cubic_filter(arr_temp + 1, y - (j * (1 << WARPEDPIXEL_PREC_BITS)));
   }
-  val = do_cubic_filter(arr + 1, x - (i << WARPEDPIXEL_PREC_BITS));
+  val = do_cubic_filter(arr + 1, x - (i * (1 << WARPEDPIXEL_PREC_BITS)));
   val = ROUND_POWER_OF_TWO_SIGNED(val, WARPEDPIXEL_FILTER_BITS * 2);
   return (uint16_t)clip_pixel_highbd(val, bd);
 }
@@ -385,8 +387,8 @@
                                         int bd) {
   const int ix = x >> WARPEDPIXEL_PREC_BITS;
   const int iy = y >> WARPEDPIXEL_PREC_BITS;
-  const int sx = x - (ix << WARPEDPIXEL_PREC_BITS);
-  const int sy = y - (iy << WARPEDPIXEL_PREC_BITS);
+  const int sx = x - (ix * (1 << WARPEDPIXEL_PREC_BITS));
+  const int sy = y - (iy * (1 << WARPEDPIXEL_PREC_BITS));
   int32_t val;
   val = ROUND_POWER_OF_TWO_SIGNED(
       ref[iy * stride + ix] * (WARPEDPIXEL_PREC_SHIFTS - sy) *
@@ -402,8 +404,8 @@
                                         int height, int stride, int bd) {
   int ix = x >> WARPEDPIXEL_PREC_BITS;
   int iy = y >> WARPEDPIXEL_PREC_BITS;
-  int sx = x - (ix << WARPEDPIXEL_PREC_BITS);
-  int sy = y - (iy << WARPEDPIXEL_PREC_BITS);
+  int sx = x - (ix * (1 << WARPEDPIXEL_PREC_BITS));
+  int sy = y - (iy * (1 << WARPEDPIXEL_PREC_BITS));
   int32_t v;
 
   if (ix < 0 && iy < 0)
diff --git a/av1/common/x86/idct_intrin_sse2.c b/av1/common/x86/idct_intrin_sse2.c
index 5d28a28..a6b6e1e 100644
--- a/av1/common/x86/idct_intrin_sse2.c
+++ b/av1/common/x86/idct_intrin_sse2.c
@@ -571,6 +571,7 @@
   in[6] = mm_reverse_epi16(in[6]);
   in[7] = mm_reverse_epi16(in[7]);
 }
+#endif  // CONFIG_EXT_TX
 
 static INLINE void scale_sqrt2_8x4(__m128i *in) {
   // Implements 'ROUND_POWER_OF_TWO(input * Sqrt2, DCT_CONST_BITS)'
@@ -693,8 +694,10 @@
   switch (tx_type) {
     case DCT_DCT:
     case ADST_DCT:
+#if CONFIG_EXT_TX
     case FLIPADST_DCT:
     case H_DCT:
+#endif
       aom_idct8_sse2(in);
       array_transpose_8x8(in, in);
       aom_idct8_sse2(in + 8);
@@ -702,17 +705,20 @@
       break;
     case DCT_ADST:
     case ADST_ADST:
+#if CONFIG_EXT_TX
     case DCT_FLIPADST:
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
     case H_ADST:
     case H_FLIPADST:
+#endif
       aom_iadst8_sse2(in);
       array_transpose_8x8(in, in);
       aom_iadst8_sse2(in + 8);
       array_transpose_8x8(in + 8, in + 8);
       break;
+#if CONFIG_EXT_TX
     case V_FLIPADST:
     case V_ADST:
     case V_DCT:
@@ -720,6 +726,7 @@
       iidtx8_sse2(in);
       iidtx8_sse2(in + 8);
       break;
+#endif
     default: assert(0); break;
   }
   scale_sqrt2_8x8(in);
@@ -729,33 +736,50 @@
   switch (tx_type) {
     case DCT_DCT:
     case DCT_ADST:
+#if CONFIG_EXT_TX
     case DCT_FLIPADST:
-    case V_DCT: idct16_8col(in); break;
+    case V_DCT:
+#endif
+      idct16_8col(in);
+      break;
     case ADST_DCT:
     case ADST_ADST:
+#if CONFIG_EXT_TX
     case FLIPADST_ADST:
     case ADST_FLIPADST:
     case FLIPADST_FLIPADST:
     case FLIPADST_DCT:
     case V_ADST:
-    case V_FLIPADST: iadst16_8col(in); break;
+    case V_FLIPADST:
+#endif
+      iadst16_8col(in);
+      break;
+#if CONFIG_EXT_TX
     case H_DCT:
     case H_ADST:
     case H_FLIPADST:
     case IDTX: iidtx16_8col(in); break;
+#endif
     default: assert(0); break;
   }
 
   switch (tx_type) {
     case DCT_DCT:
     case ADST_DCT:
+#if CONFIG_EXT_TX
     case H_DCT:
+#endif
     case DCT_ADST:
     case ADST_ADST:
+#if CONFIG_EXT_TX
     case H_ADST:
     case V_ADST:
     case V_DCT:
-    case IDTX: write_buffer_8x16(dest, in, stride); break;
+    case IDTX:
+#endif
+      write_buffer_8x16(dest, in, stride);
+      break;
+#if CONFIG_EXT_TX
     case FLIPADST_DCT:
     case FLIPADST_ADST:
     case V_FLIPADST: write_buffer_8x16(dest + stride * 15, in, -stride); break;
@@ -771,6 +795,7 @@
       flip_buffer_lr_8x8(in + 8);
       write_buffer_8x16(dest + stride * 15, in, -stride);
       break;
+#endif
     default: assert(0); break;
   }
 }
@@ -837,20 +862,30 @@
   switch (tx_type) {
     case DCT_DCT:
     case ADST_DCT:
+#if CONFIG_EXT_TX
     case FLIPADST_DCT:
-    case H_DCT: idct16_8col(in); break;
+    case H_DCT:
+#endif
+      idct16_8col(in);
+      break;
     case DCT_ADST:
     case ADST_ADST:
+#if CONFIG_EXT_TX
     case DCT_FLIPADST:
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
     case H_ADST:
-    case H_FLIPADST: iadst16_8col(in); break;
+    case H_FLIPADST:
+#endif
+      iadst16_8col(in);
+      break;
+#if CONFIG_EXT_TX
     case V_FLIPADST:
     case V_ADST:
     case V_DCT:
     case IDTX: iidtx16_8col(in); break;
+#endif
     default: assert(0); break;
   }
 
@@ -862,22 +897,27 @@
   switch (tx_type) {
     case DCT_DCT:
     case DCT_ADST:
+#if CONFIG_EXT_TX
     case DCT_FLIPADST:
     case V_DCT:
+#endif
       aom_idct8_sse2(in);
       aom_idct8_sse2(in + 8);
       break;
     case ADST_DCT:
     case ADST_ADST:
+#if CONFIG_EXT_TX
     case FLIPADST_ADST:
     case ADST_FLIPADST:
     case FLIPADST_FLIPADST:
     case FLIPADST_DCT:
     case V_ADST:
     case V_FLIPADST:
+#endif
       aom_iadst8_sse2(in);
       aom_iadst8_sse2(in + 8);
       break;
+#if CONFIG_EXT_TX
     case H_DCT:
     case H_ADST:
     case H_FLIPADST:
@@ -887,22 +927,26 @@
       iidtx8_sse2(in);
       iidtx8_sse2(in + 8);
       break;
+#endif
     default: assert(0); break;
   }
 
   switch (tx_type) {
     case DCT_DCT:
     case ADST_DCT:
-    case H_DCT:
     case DCT_ADST:
     case ADST_ADST:
+#if CONFIG_EXT_TX
+    case H_DCT:
     case H_ADST:
     case V_ADST:
     case V_DCT:
     case IDTX:
+#endif
       write_buffer_8x8_round6(dest, in, stride);
       write_buffer_8x8_round6(dest + 8, in + 8, stride);
       break;
+#if CONFIG_EXT_TX
     case FLIPADST_DCT:
     case FLIPADST_ADST:
     case V_FLIPADST:
@@ -923,6 +967,7 @@
       write_buffer_8x8_round6(dest + stride * 7, in + 8, -stride);
       write_buffer_8x8_round6(dest + stride * 7 + 8, in, -stride);
       break;
+#endif
     default: assert(0); break;
   }
 }
@@ -961,10 +1006,15 @@
   switch (tx_type) {
     case DCT_DCT:
     case ADST_DCT:
+#if CONFIG_EXT_TX
     case FLIPADST_DCT:
-    case H_DCT: aom_idct8_sse2(in); break;
+    case H_DCT:
+#endif
+      aom_idct8_sse2(in);
+      break;
     case DCT_ADST:
     case ADST_ADST:
+#if CONFIG_EXT_TX
     case DCT_FLIPADST:
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
@@ -974,9 +1024,8 @@
     case V_FLIPADST:
     case V_ADST:
     case V_DCT:
-    case IDTX:
-      iidtx8_sse2(in);
-      array_transpose_8x8(in, in);
+    case IDTX: iidtx8_sse2(in); array_transpose_8x8(in, in);
+#endif
       break;
     default: assert(0); break;
   }
@@ -995,22 +1044,27 @@
   switch (tx_type) {
     case DCT_DCT:
     case DCT_ADST:
+#if CONFIG_EXT_TX
     case DCT_FLIPADST:
     case V_DCT:
+#endif
       aom_idct4_sse2(in + 4);
       aom_idct4_sse2(in + 6);
       break;
     case ADST_DCT:
     case ADST_ADST:
+#if CONFIG_EXT_TX
     case FLIPADST_ADST:
     case ADST_FLIPADST:
     case FLIPADST_FLIPADST:
     case FLIPADST_DCT:
     case V_ADST:
     case V_FLIPADST:
+#endif
       aom_iadst4_sse2(in + 4);
       aom_iadst4_sse2(in + 6);
       break;
+#if CONFIG_EXT_TX
     case H_DCT:
     case H_ADST:
     case H_FLIPADST:
@@ -1020,6 +1074,7 @@
       iidtx4_sse2(in + 6);
       array_transpose_4x4(in + 6);
       break;
+#endif
     default: assert(0); break;
   }
 
@@ -1032,9 +1087,10 @@
   switch (tx_type) {
     case DCT_DCT:
     case ADST_DCT:
-    case H_DCT:
     case DCT_ADST:
     case ADST_ADST:
+#if CONFIG_EXT_TX
+    case H_DCT:
     case H_ADST:
     case V_ADST:
     case V_DCT:
@@ -1056,6 +1112,7 @@
       in[2] = mm_reverse_epi16(in[2]);
       in[3] = mm_reverse_epi16(in[3]);
       FLIPUD_PTR(dest, stride, 4);
+#endif
       break;
     default: assert(0); break;
   }
@@ -1139,22 +1196,27 @@
   switch (tx_type) {
     case DCT_DCT:
     case ADST_DCT:
+#if CONFIG_EXT_TX
     case FLIPADST_DCT:
     case H_DCT:
+#endif
       aom_idct4_sse2(in + 4);
       aom_idct4_sse2(in + 6);
       break;
     case DCT_ADST:
     case ADST_ADST:
+#if CONFIG_EXT_TX
     case DCT_FLIPADST:
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
     case H_ADST:
     case H_FLIPADST:
+#endif
       aom_iadst4_sse2(in + 4);
       aom_iadst4_sse2(in + 6);
       break;
+#if CONFIG_EXT_TX
     case V_FLIPADST:
     case V_ADST:
     case V_DCT:
@@ -1164,6 +1226,7 @@
       iidtx4_sse2(in + 6);
       array_transpose_4x4(in + 6);
       break;
+#endif
     default: assert(0); break;
   }
 
@@ -1177,16 +1240,25 @@
   switch (tx_type) {
     case DCT_DCT:
     case DCT_ADST:
+#if CONFIG_EXT_TX
     case DCT_FLIPADST:
-    case V_DCT: aom_idct8_sse2(in); break;
+    case V_DCT:
+#endif
+      aom_idct8_sse2(in);
+      break;
     case ADST_DCT:
     case ADST_ADST:
+#if CONFIG_EXT_TX
     case FLIPADST_ADST:
     case ADST_FLIPADST:
     case FLIPADST_FLIPADST:
     case FLIPADST_DCT:
     case V_ADST:
-    case V_FLIPADST: aom_iadst8_sse2(in); break;
+    case V_FLIPADST:
+#endif
+      aom_iadst8_sse2(in);
+      break;
+#if CONFIG_EXT_TX
     case H_DCT:
     case H_ADST:
     case H_FLIPADST:
@@ -1194,19 +1266,24 @@
       iidtx8_sse2(in);
       array_transpose_8x8(in, in);
       break;
+#endif
     default: assert(0); break;
   }
 
   switch (tx_type) {
     case DCT_DCT:
     case ADST_DCT:
-    case H_DCT:
     case DCT_ADST:
     case ADST_ADST:
+#if CONFIG_EXT_TX
+    case H_DCT:
     case H_ADST:
     case V_ADST:
     case V_DCT:
-    case IDTX: break;
+    case IDTX:
+#endif
+      break;
+#if CONFIG_EXT_TX
     case FLIPADST_DCT:
     case FLIPADST_ADST:
     case V_FLIPADST: FLIPUD_PTR(dest, stride, 8); break;
@@ -1233,6 +1310,7 @@
       in[7] = _mm_shufflelo_epi16(in[7], 0x1b);
       FLIPUD_PTR(dest, stride, 8);
       break;
+#endif
     default: assert(0); break;
   }
   in[0] = _mm_unpacklo_epi64(in[0], in[1]);
@@ -1283,6 +1361,7 @@
   aom_idct16_sse2(bl, br);  // Includes a transposition
 }
 
+#if CONFIG_EXT_TX
 static INLINE void iidtx32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
                                  __m128i *br) {
   int i;
@@ -1295,6 +1374,7 @@
     br[i] = _mm_slli_epi16(br[i], 2);
   }
 }
+#endif  // CONFIG_EXT_TX
 
 static INLINE void write_buffer_16x32_round6(uint8_t *dest, __m128i *intl,
                                              __m128i *intr, __m128i *inbl,
@@ -1335,22 +1415,27 @@
   switch (tx_type) {
     case DCT_DCT:
     case ADST_DCT:
+#if CONFIG_EXT_TX
     case FLIPADST_DCT:
     case H_DCT:
+#endif
       aom_idct16_sse2(intl, intr);
       aom_idct16_sse2(inbl, inbr);
       break;
     case DCT_ADST:
     case ADST_ADST:
+#if CONFIG_EXT_TX
     case DCT_FLIPADST:
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
     case H_ADST:
     case H_FLIPADST:
+#endif
       aom_iadst16_sse2(intl, intr);
       aom_iadst16_sse2(inbl, inbr);
       break;
+#if CONFIG_EXT_TX
     case V_FLIPADST:
     case V_ADST:
     case V_DCT:
@@ -1358,6 +1443,7 @@
       iidtx16_sse2(intl, intr);
       iidtx16_sse2(inbl, inbr);
       break;
+#endif
     default: assert(0); break;
   }
 
@@ -1370,33 +1456,47 @@
   switch (tx_type) {
     case DCT_DCT:
     case DCT_ADST:
+#if CONFIG_EXT_TX
     case DCT_FLIPADST:
-    case V_DCT: idct32_16col(intl, intr, inbl, inbr); break;
+    case V_DCT:
+#endif
+      idct32_16col(intl, intr, inbl, inbr);
+      break;
     case ADST_DCT:
     case ADST_ADST:
+#if CONFIG_EXT_TX
     case FLIPADST_ADST:
     case ADST_FLIPADST:
     case FLIPADST_FLIPADST:
     case FLIPADST_DCT:
     case V_ADST:
-    case V_FLIPADST: ihalfright32_16col(intl, intr, inbl, inbr); break;
+    case V_FLIPADST:
+#endif
+      ihalfright32_16col(intl, intr, inbl, inbr);
+      break;
+#if CONFIG_EXT_TX
     case H_DCT:
     case H_ADST:
     case H_FLIPADST:
     case IDTX: iidtx32_16col(intl, intr, inbl, inbr); break;
+#endif
     default: assert(0); break;
   }
 
   switch (tx_type) {
     case DCT_DCT:
     case ADST_DCT:
-    case H_DCT:
     case DCT_ADST:
     case ADST_ADST:
+#if CONFIG_EXT_TX
+    case H_DCT:
     case H_ADST:
     case V_ADST:
     case V_DCT:
-    case IDTX: break;
+    case IDTX:
+#endif
+      break;
+#if CONFIG_EXT_TX
     case FLIPADST_DCT:
     case FLIPADST_ADST:
     case V_FLIPADST: FLIPUD_PTR(dest, stride, 32); break;
@@ -1423,6 +1523,7 @@
       }
       FLIPUD_PTR(dest, stride, 32);
       break;
+#endif
     default: assert(0); break;
   }
   write_buffer_16x32_round6(dest, intl, intr, inbl, inbr, stride);
@@ -1467,20 +1568,30 @@
   switch (tx_type) {
     case DCT_DCT:
     case ADST_DCT:
+#if CONFIG_EXT_TX
     case FLIPADST_DCT:
-    case H_DCT: idct32_16col(in0, in1, in2, in3); break;
+    case H_DCT:
+#endif
+      idct32_16col(in0, in1, in2, in3);
+      break;
     case DCT_ADST:
     case ADST_ADST:
+#if CONFIG_EXT_TX
     case DCT_FLIPADST:
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
     case H_ADST:
-    case H_FLIPADST: ihalfright32_16col(in0, in1, in2, in3); break;
+    case H_FLIPADST:
+#endif
+      ihalfright32_16col(in0, in1, in2, in3);
+      break;
+#if CONFIG_EXT_TX
     case V_FLIPADST:
     case V_ADST:
     case V_DCT:
     case IDTX: iidtx32_16col(in0, in1, in2, in3); break;
+#endif
     default: assert(0); break;
   }
 
@@ -1493,22 +1604,27 @@
   switch (tx_type) {
     case DCT_DCT:
     case DCT_ADST:
+#if CONFIG_EXT_TX
     case DCT_FLIPADST:
     case V_DCT:
+#endif
       aom_idct16_sse2(in0, in1);
       aom_idct16_sse2(in2, in3);
       break;
     case ADST_DCT:
     case ADST_ADST:
+#if CONFIG_EXT_TX
     case FLIPADST_ADST:
     case ADST_FLIPADST:
     case FLIPADST_FLIPADST:
     case FLIPADST_DCT:
     case V_ADST:
     case V_FLIPADST:
+#endif
       aom_iadst16_sse2(in0, in1);
       aom_iadst16_sse2(in2, in3);
       break;
+#if CONFIG_EXT_TX
     case H_DCT:
     case H_ADST:
     case H_FLIPADST:
@@ -1516,19 +1632,24 @@
       iidtx16_sse2(in0, in1);
       iidtx16_sse2(in2, in3);
       break;
+#endif
     default: assert(0); break;
   }
 
   switch (tx_type) {
     case DCT_DCT:
     case ADST_DCT:
-    case H_DCT:
     case DCT_ADST:
     case ADST_ADST:
+#if CONFIG_EXT_TX
+    case H_DCT:
     case H_ADST:
     case V_ADST:
     case V_DCT:
-    case IDTX: break;
+    case IDTX:
+#endif
+      break;
+#if CONFIG_EXT_TX
     case FLIPADST_DCT:
     case FLIPADST_ADST:
     case V_FLIPADST: FLIPUD_PTR(dest, stride, 16); break;
@@ -1555,8 +1676,8 @@
       }
       FLIPUD_PTR(dest, stride, 16);
       break;
+#endif
     default: assert(0); break;
   }
   write_buffer_32x16_round6(dest, in0, in1, in2, in3, stride);
 }
-#endif  // CONFIG_EXT_TX
diff --git a/av1/encoder/dct.c b/av1/encoder/dct.c
index 221e3cd..c002dab 100644
--- a/av1/encoder/dct.c
+++ b/av1/encoder/dct.c
@@ -18,6 +18,8 @@
 #include "aom_dsp/fwd_txfm.h"
 #include "aom_ports/mem.h"
 #include "av1/common/blockd.h"
+#include "av1/common/av1_fwd_txfm1d.h"
+#include "av1/common/av1_fwd_txfm2d_cfg.h"
 #include "av1/common/idct.h"
 
 static INLINE void range_check(const tran_low_t *input, const int size,
@@ -997,6 +999,21 @@
   output[15] = (tran_low_t)-x1;
 }
 
+// For use in lieu of ADST
+static void fhalfright32(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  tran_low_t inputhalf[16];
+  for (i = 0; i < 16; ++i) {
+    output[16 + i] = input[i] * 4;
+  }
+  // Multiply input by sqrt(2)
+  for (i = 0; i < 16; ++i) {
+    inputhalf[i] = (tran_low_t)fdct_round_shift(input[i + 16] * Sqrt2);
+  }
+  fdct16(inputhalf, output);
+  // Note overall scaling factor is 4 times orthogonal
+}
+
 #if CONFIG_EXT_TX
 static void fidtx4(const tran_low_t *input, tran_low_t *output) {
   int i;
@@ -1020,21 +1037,6 @@
   for (i = 0; i < 32; ++i) output[i] = input[i] * 4;
 }
 
-// For use in lieu of ADST
-static void fhalfright32(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  tran_low_t inputhalf[16];
-  for (i = 0; i < 16; ++i) {
-    output[16 + i] = input[i] * 4;
-  }
-  // Multiply input by sqrt(2)
-  for (i = 0; i < 16; ++i) {
-    inputhalf[i] = (tran_low_t)fdct_round_shift(input[i + 16] * Sqrt2);
-  }
-  fdct16(inputhalf, output);
-  // Note overall scaling factor is 4 times orthogonal
-}
-
 static void copy_block(const int16_t *src, int src_stride, int l, int w,
                        int16_t *dest, int dest_stride) {
   int i;
@@ -1182,7 +1184,6 @@
   }
 }
 
-#if CONFIG_EXT_TX
 void av1_fht4x8_c(const int16_t *input, tran_low_t *output, int stride,
                   int tx_type) {
   static const transform_2d FHT[] = {
@@ -1190,6 +1191,7 @@
     { fadst8, fdct4 },   // ADST_DCT
     { fdct8, fadst4 },   // DCT_ADST
     { fadst8, fadst4 },  // ADST_ADST
+#if CONFIG_EXT_TX
     { fadst8, fdct4 },   // FLIPADST_DCT
     { fdct8, fadst4 },   // DCT_FLIPADST
     { fadst8, fadst4 },  // FLIPADST_FLIPADST
@@ -1202,6 +1204,7 @@
     { fidtx8, fadst4 },  // H_ADST
     { fadst8, fidtx4 },  // V_FLIPADST
     { fidtx8, fadst4 },  // H_FLIPADST
+#endif
   };
   const transform_2d ht = FHT[tx_type];
   const int n = 4;
@@ -1209,8 +1212,10 @@
   tran_low_t out[8 * 4];
   tran_low_t temp_in[8], temp_out[8];
   int i, j;
+#if CONFIG_EXT_TX
   int16_t flipped_input[8 * 4];
   maybe_flip_input(&input, &stride, n2, n, flipped_input, tx_type);
+#endif
 
   // Columns
   for (i = 0; i < n; ++i) {
@@ -1237,6 +1242,7 @@
     { fadst4, fdct8 },   // ADST_DCT
     { fdct4, fadst8 },   // DCT_ADST
     { fadst4, fadst8 },  // ADST_ADST
+#if CONFIG_EXT_TX
     { fadst4, fdct8 },   // FLIPADST_DCT
     { fdct4, fadst8 },   // DCT_FLIPADST
     { fadst4, fadst8 },  // FLIPADST_FLIPADST
@@ -1249,6 +1255,7 @@
     { fidtx4, fadst8 },  // H_ADST
     { fadst4, fidtx8 },  // V_FLIPADST
     { fidtx4, fadst8 },  // H_FLIPADST
+#endif
   };
   const transform_2d ht = FHT[tx_type];
   const int n = 4;
@@ -1256,8 +1263,10 @@
   tran_low_t out[8 * 4];
   tran_low_t temp_in[8], temp_out[8];
   int i, j;
+#if CONFIG_EXT_TX
   int16_t flipped_input[8 * 4];
   maybe_flip_input(&input, &stride, n, n2, flipped_input, tx_type);
+#endif
 
   // Columns
   for (i = 0; i < n2; ++i) {
@@ -1284,6 +1293,7 @@
     { fadst16, fdct8 },   // ADST_DCT
     { fdct16, fadst8 },   // DCT_ADST
     { fadst16, fadst8 },  // ADST_ADST
+#if CONFIG_EXT_TX
     { fadst16, fdct8 },   // FLIPADST_DCT
     { fdct16, fadst8 },   // DCT_FLIPADST
     { fadst16, fadst8 },  // FLIPADST_FLIPADST
@@ -1296,6 +1306,7 @@
     { fidtx16, fadst8 },  // H_ADST
     { fadst16, fidtx8 },  // V_FLIPADST
     { fidtx16, fadst8 },  // H_FLIPADST
+#endif
   };
   const transform_2d ht = FHT[tx_type];
   const int n = 8;
@@ -1303,8 +1314,10 @@
   tran_low_t out[16 * 8];
   tran_low_t temp_in[16], temp_out[16];
   int i, j;
+#if CONFIG_EXT_TX
   int16_t flipped_input[16 * 8];
   maybe_flip_input(&input, &stride, n2, n, flipped_input, tx_type);
+#endif
 
   // Columns
   for (i = 0; i < n; ++i) {
@@ -1332,6 +1345,7 @@
     { fadst8, fdct16 },   // ADST_DCT
     { fdct8, fadst16 },   // DCT_ADST
     { fadst8, fadst16 },  // ADST_ADST
+#if CONFIG_EXT_TX
     { fadst8, fdct16 },   // FLIPADST_DCT
     { fdct8, fadst16 },   // DCT_FLIPADST
     { fadst8, fadst16 },  // FLIPADST_FLIPADST
@@ -1344,6 +1358,7 @@
     { fidtx8, fadst16 },  // H_ADST
     { fadst8, fidtx16 },  // V_FLIPADST
     { fidtx8, fadst16 },  // H_FLIPADST
+#endif
   };
   const transform_2d ht = FHT[tx_type];
   const int n = 8;
@@ -1351,8 +1366,10 @@
   tran_low_t out[16 * 8];
   tran_low_t temp_in[16], temp_out[16];
   int i, j;
+#if CONFIG_EXT_TX
   int16_t flipped_input[16 * 8];
   maybe_flip_input(&input, &stride, n, n2, flipped_input, tx_type);
+#endif
 
   // Columns
   for (i = 0; i < n2; ++i) {
@@ -1380,6 +1397,7 @@
     { fhalfright32, fdct16 },   // ADST_DCT
     { fdct32, fadst16 },        // DCT_ADST
     { fhalfright32, fadst16 },  // ADST_ADST
+#if CONFIG_EXT_TX
     { fhalfright32, fdct16 },   // FLIPADST_DCT
     { fdct32, fadst16 },        // DCT_FLIPADST
     { fhalfright32, fadst16 },  // FLIPADST_FLIPADST
@@ -1392,6 +1410,7 @@
     { fidtx32, fadst16 },       // H_ADST
     { fhalfright32, fidtx16 },  // V_FLIPADST
     { fidtx32, fadst16 },       // H_FLIPADST
+#endif
   };
   const transform_2d ht = FHT[tx_type];
   const int n = 16;
@@ -1399,8 +1418,10 @@
   tran_low_t out[32 * 16];
   tran_low_t temp_in[32], temp_out[32];
   int i, j;
+#if CONFIG_EXT_TX
   int16_t flipped_input[32 * 16];
   maybe_flip_input(&input, &stride, n2, n, flipped_input, tx_type);
+#endif
 
   // Columns
   for (i = 0; i < n; ++i) {
@@ -1428,6 +1449,7 @@
     { fadst16, fdct32 },        // ADST_DCT
     { fdct16, fhalfright32 },   // DCT_ADST
     { fadst16, fhalfright32 },  // ADST_ADST
+#if CONFIG_EXT_TX
     { fadst16, fdct32 },        // FLIPADST_DCT
     { fdct16, fhalfright32 },   // DCT_FLIPADST
     { fadst16, fhalfright32 },  // FLIPADST_FLIPADST
@@ -1440,6 +1462,7 @@
     { fidtx16, fhalfright32 },  // H_ADST
     { fadst16, fidtx32 },       // V_FLIPADST
     { fidtx16, fhalfright32 },  // H_FLIPADST
+#endif
   };
   const transform_2d ht = FHT[tx_type];
   const int n = 16;
@@ -1447,8 +1470,10 @@
   tran_low_t out[32 * 16];
   tran_low_t temp_in[32], temp_out[32];
   int i, j;
+#if CONFIG_EXT_TX
   int16_t flipped_input[32 * 16];
   maybe_flip_input(&input, &stride, n, n2, flipped_input, tx_type);
+#endif
 
   // Columns
   for (i = 0; i < n2; ++i) {
@@ -1469,8 +1494,6 @@
   // Note: overall scale factor of transform is 4 times unitary
 }
 
-#endif  // CONFIG_EXT_TX
-
 void av1_fdct8x8_quant_c(const int16_t *input, int stride,
                          tran_low_t *coeff_ptr, intptr_t n_coeffs,
                          int skip_block, const int16_t *zbin_ptr,
@@ -1759,7 +1782,6 @@
   av1_fht4x4_c(input, output, stride, tx_type);
 }
 
-#if CONFIG_EXT_TX
 void av1_highbd_fht4x8_c(const int16_t *input, tran_low_t *output, int stride,
                          int tx_type) {
   av1_fht4x8_c(input, output, stride, tx_type);
@@ -1789,7 +1811,6 @@
                            int tx_type) {
   av1_fht32x16_c(input, output, stride, tx_type);
 }
-#endif  // CONFIG_EXT_TX
 
 void av1_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride,
                          int tx_type) {
@@ -1874,12 +1895,103 @@
   }
 }
 
+#if CONFIG_TX64X64
+#if CONFIG_EXT_TX
+static void fidtx64(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  for (i = 0; i < 64; ++i)
+    output[i] = (tran_low_t)fdct_round_shift(input[i] * 4 * Sqrt2);
+}
+
+// For use in lieu of ADST
+static void fhalfright64(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  tran_low_t inputhalf[32];
+  for (i = 0; i < 32; ++i) {
+    output[32 + i] = (tran_low_t)fdct_round_shift(input[i] * 4 * Sqrt2);
+  }
+  // Multiply input by sqrt(2)
+  for (i = 0; i < 32; ++i) {
+    inputhalf[i] = (tran_low_t)fdct_round_shift(input[i + 32] * Sqrt2);
+  }
+  fdct32(inputhalf, output);
+  // Note overall scaling factor is 2 times unitary
+}
+#endif  // CONFIG_EXT_TX
+
+static void fdct64_col(const tran_low_t *input, tran_low_t *output) {
+  int32_t in[64], out[64];
+  int i;
+  for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i];
+  av1_fdct64_new(in, out, fwd_cos_bit_col_dct_dct_64,
+                 fwd_stage_range_col_dct_dct_64);
+  for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
+}
+
+static void fdct64_row(const tran_low_t *input, tran_low_t *output) {
+  int32_t in[64], out[64];
+  int i;
+  for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i];
+  av1_fdct64_new(in, out, fwd_cos_bit_row_dct_dct_64,
+                 fwd_stage_range_row_dct_dct_64);
+  for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
+}
+
+void av1_fht64x64_c(const int16_t *input, tran_low_t *output, int stride,
+                    int tx_type) {
+  static const transform_2d FHT[] = {
+    { fdct64_col, fdct64_row },  // DCT_DCT
+#if CONFIG_EXT_TX
+    { fhalfright64, fdct64_row },    // ADST_DCT
+    { fdct64_col, fhalfright64 },    // DCT_ADST
+    { fhalfright64, fhalfright64 },  // ADST_ADST
+    { fhalfright64, fdct64_row },    // FLIPADST_DCT
+    { fdct64_col, fhalfright64 },    // DCT_FLIPADST
+    { fhalfright64, fhalfright64 },  // FLIPADST_FLIPADST
+    { fhalfright64, fhalfright64 },  // ADST_FLIPADST
+    { fhalfright64, fhalfright64 },  // FLIPADST_ADST
+    { fidtx64, fidtx64 },            // IDTX
+    { fdct64_col, fidtx64 },         // V_DCT
+    { fidtx64, fdct64_row },         // H_DCT
+    { fhalfright64, fidtx64 },       // V_ADST
+    { fidtx64, fhalfright64 },       // H_ADST
+    { fhalfright64, fidtx64 },       // V_FLIPADST
+    { fidtx64, fhalfright64 },       // H_FLIPADST
+#endif
+  };
+  const transform_2d ht = FHT[tx_type];
+  tran_low_t out[4096];
+  int i, j;
+  tran_low_t temp_in[64], temp_out[64];
+#if CONFIG_EXT_TX
+  int16_t flipped_input[64 * 64];
+  maybe_flip_input(&input, &stride, 64, 64, flipped_input, tx_type);
+#endif
+  // Columns
+  for (i = 0; i < 64; ++i) {
+    for (j = 0; j < 64; ++j) temp_in[j] = input[j * stride + i];
+    ht.cols(temp_in, temp_out);
+    for (j = 0; j < 64; ++j)
+      out[j * 64 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+  }
+
+  // Rows
+  for (i = 0; i < 64; ++i) {
+    for (j = 0; j < 64; ++j) temp_in[j] = out[j + i * 64];
+    ht.rows(temp_in, temp_out);
+    for (j = 0; j < 64; ++j)
+      output[j + i * 64] =
+          (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
+  }
+}
+#endif  // CONFIG_TX64X64
+
 #if CONFIG_EXT_TX
 // Forward identity transform.
 void av1_fwd_idtx_c(const int16_t *src_diff, tran_low_t *coeff, int stride,
                     int bs, int tx_type) {
   int r, c;
-  const int shift = bs < 32 ? 3 : 2;
+  const int shift = bs < 32 ? 3 : (bs < 64 ? 2 : 1);
   if (tx_type == IDTX) {
     for (r = 0; r < bs; ++r) {
       for (c = 0; c < bs; ++c) coeff[c] = src_diff[c] * (1 << shift);
@@ -1894,5 +2006,12 @@
                            int tx_type) {
   av1_fht32x32_c(input, output, stride, tx_type);
 }
+
+#if CONFIG_TX64X64
+void av1_highbd_fht64x64_c(const int16_t *input, tran_low_t *output, int stride,
+                           int tx_type) {
+  av1_fht64x64_c(input, output, stride, tx_type);
+}
+#endif  // CONFIG_TX64X64
 #endif  // CONFIG_AOM_HIGHBITDEPTH
 #endif  // CONFIG_EXT_TX
diff --git a/av1/encoder/hybrid_fwd_txfm.c b/av1/encoder/hybrid_fwd_txfm.c
index 6d5eccd..ff03516 100644
--- a/av1/encoder/hybrid_fwd_txfm.c
+++ b/av1/encoder/hybrid_fwd_txfm.c
@@ -55,7 +55,6 @@
   }
 }
 
-#if CONFIG_EXT_TX
 static void fwd_txfm_4x8(const int16_t *src_diff, tran_low_t *coeff,
                          int diff_stride, TX_TYPE tx_type,
                          FWD_TXFM_OPT fwd_txfm_opt) {
@@ -97,7 +96,6 @@
   (void)fwd_txfm_opt;
   av1_fht32x16(src_diff, coeff, diff_stride, tx_type);
 }
-#endif  // CONFIG_EXT_TX
 
 static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
                          int diff_stride, TX_TYPE tx_type,
@@ -233,7 +231,6 @@
   }
 }
 
-#if CONFIG_EXT_TX
 static void highbd_fwd_txfm_4x8(const int16_t *src_diff, tran_low_t *coeff,
                                 int diff_stride, TX_TYPE tx_type,
                                 FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
@@ -281,7 +278,6 @@
   (void)bd;
   av1_highbd_fht32x16(src_diff, coeff, diff_stride, tx_type);
 }
-#endif  // CONFIG_EXT_TX
 
 static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
                                 int diff_stride, TX_TYPE tx_type,
@@ -403,7 +399,6 @@
     case TX_8X8:
       fwd_txfm_8x8(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt);
       break;
-#if CONFIG_EXT_TX
     case TX_4X8:
       fwd_txfm_4x8(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt);
       break;
@@ -422,7 +417,6 @@
     case TX_32X16:
       fwd_txfm_32x16(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt);
       break;
-#endif  // CONFIG_EXT_TX
     case TX_4X4:
       fwd_txfm_4x4(src_diff, coeff, diff_stride, tx_type, lossless);
       break;
@@ -452,7 +446,6 @@
       highbd_fwd_txfm_8x8(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt,
                           bd);
       break;
-#if CONFIG_EXT_TX
     case TX_4X8:
       highbd_fwd_txfm_4x8(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt,
                           bd);
@@ -477,7 +470,6 @@
       highbd_fwd_txfm_32x16(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt,
                             bd);
       break;
-#endif  // CONFIG_EXT_TX
     case TX_4X4:
       highbd_fwd_txfm_4x4(src_diff, coeff, diff_stride, tx_type, lossless, bd);
       break;
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index b37714c..8682a3e 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -6745,24 +6745,24 @@
           av1_set_mvcost(x, mbmi->ref_frame[0], 0, mbmi->ref_mv_idx);
 #endif  // CONFIG_REF_MV
           rate_mv = av1_mv_bit_cost(&frame_mv[refs[0]].as_mv,
-                                    &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv,
+                                    &mbmi_ext->ref_mvs[refs[0]][0].as_mv,
                                     x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
 #if CONFIG_REF_MV
           av1_set_mvcost(x, mbmi->ref_frame[1], 1, mbmi->ref_mv_idx);
 #endif  // CONFIG_REF_MV
           rate_mv += av1_mv_bit_cost(
-              &frame_mv[refs[1]].as_mv, &x->mbmi_ext->ref_mvs[refs[1]][0].as_mv,
+              &frame_mv[refs[1]].as_mv, &mbmi_ext->ref_mvs[refs[1]][0].as_mv,
               x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
         }
       } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) {
         frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
         rate_mv = av1_mv_bit_cost(&frame_mv[refs[1]].as_mv,
-                                  &x->mbmi_ext->ref_mvs[refs[1]][0].as_mv,
+                                  &mbmi_ext->ref_mvs[refs[1]][0].as_mv,
                                   x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
       } else {
         frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
         rate_mv = av1_mv_bit_cost(&frame_mv[refs[0]].as_mv,
-                                  &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv,
+                                  &mbmi_ext->ref_mvs[refs[0]][0].as_mv,
                                   x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
       }
 #else
@@ -6778,13 +6778,13 @@
         av1_set_mvcost(x, mbmi->ref_frame[0], 0, mbmi->ref_mv_idx);
 #endif  // CONFIG_REF_MV
         rate_mv = av1_mv_bit_cost(&frame_mv[refs[0]].as_mv,
-                                  &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv,
+                                  &mbmi_ext->ref_mvs[refs[0]][0].as_mv,
                                   x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
 #if CONFIG_REF_MV
         av1_set_mvcost(x, mbmi->ref_frame[1], 1, mbmi->ref_mv_idx);
 #endif  // CONFIG_REF_MV
         rate_mv += av1_mv_bit_cost(&frame_mv[refs[1]].as_mv,
-                                   &x->mbmi_ext->ref_mvs[refs[1]][0].as_mv,
+                                   &mbmi_ext->ref_mvs[refs[1]][0].as_mv,
                                    x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
       }
 #endif  // CONFIG_EXT_INTER
diff --git a/av1/encoder/x86/dct_intrin_sse2.c b/av1/encoder/x86/dct_intrin_sse2.c
index 22cc342..3596292 100644
--- a/av1/encoder/x86/dct_intrin_sse2.c
+++ b/av1/encoder/x86/dct_intrin_sse2.c
@@ -2592,7 +2592,6 @@
   }
 }
 
-#if CONFIG_EXT_TX
 static INLINE void scale_sqrt2_8x4(__m128i *in) {
   // Implements fdct_round_shift(input * Sqrt2), which is equivalent to
   // ROUND_POWER_OF_TWO(input * Sqrt2, DCT_CONST_BITS),
@@ -2767,9 +2766,9 @@
                      int tx_type) {
   __m128i in[8];
 
+  load_buffer_4x8(input, in, stride, 0, 0);
   switch (tx_type) {
     case DCT_DCT:
-      load_buffer_4x8(input, in, stride, 0, 0);
       fdct8_sse2(in);
       // Repack data into two 4x4 blocks so we can reuse the 4x4 transforms
       // The other cases (and the 8x4 transforms) all behave similarly
@@ -2781,7 +2780,6 @@
       fdct4_sse2(in + 4);
       break;
     case ADST_DCT:
-      load_buffer_4x8(input, in, stride, 0, 0);
       fadst8_sse2(in);
       in[4] = _mm_shuffle_epi32(in[0], 0xe);
       in[5] = _mm_shuffle_epi32(in[1], 0xe);
@@ -2791,7 +2789,6 @@
       fdct4_sse2(in + 4);
       break;
     case DCT_ADST:
-      load_buffer_4x8(input, in, stride, 0, 0);
       fdct8_sse2(in);
       in[4] = _mm_shuffle_epi32(in[0], 0xe);
       in[5] = _mm_shuffle_epi32(in[1], 0xe);
@@ -2801,7 +2798,6 @@
       fadst4_sse2(in + 4);
       break;
     case ADST_ADST:
-      load_buffer_4x8(input, in, stride, 0, 0);
       fadst8_sse2(in);
       in[4] = _mm_shuffle_epi32(in[0], 0xe);
       in[5] = _mm_shuffle_epi32(in[1], 0xe);
@@ -2810,6 +2806,7 @@
       fadst4_sse2(in);
       fadst4_sse2(in + 4);
       break;
+#if CONFIG_EXT_TX
     case FLIPADST_DCT:
       load_buffer_4x8(input, in, stride, 1, 0);
       fadst8_sse2(in);
@@ -2930,6 +2927,7 @@
       fadst4_sse2(in);
       fadst4_sse2(in + 4);
       break;
+#endif  // CONFIG_EXT_TX
     default: assert(0); break;
   }
   write_buffer_4x8(output, in);
@@ -3023,6 +3021,7 @@
       fadst4_sse2(in + 4);
       fadst8_sse2(in);
       break;
+#if CONFIG_EXT_TX
     case FLIPADST_DCT:
       load_buffer_8x4(input, in, stride, 1, 0);
       fadst4_sse2(in);
@@ -3095,6 +3094,7 @@
       fidtx4_sse2(in + 4);
       fadst8_sse2(in);
       break;
+#endif  // CONFIG_EXT_TX
     default: assert(0); break;
   }
   write_buffer_8x4(output, in);
@@ -3158,6 +3158,7 @@
       fadst8_sse2(t);
       fadst8_sse2(b);
       break;
+#if CONFIG_EXT_TX
     case FLIPADST_DCT:
       load_buffer_8x16(input, in, stride, 1, 0);
       fadst16_8col(in);
@@ -3254,6 +3255,7 @@
       fadst8_sse2(t);
       fadst8_sse2(b);
       break;
+#endif  // CONFIG_EXT_TX
     default: assert(0); break;
   }
   right_shift_8x8(t, 2);
@@ -3314,6 +3316,7 @@
       fadst8_sse2(r);
       fadst16_8col(in);
       break;
+#if CONFIG_EXT_TX
     case FLIPADST_DCT:
       load_buffer_16x8(input, in, stride, 1, 0);
       fadst8_sse2(l);
@@ -3386,6 +3389,7 @@
       fidtx8_sse2(r);
       fadst16_8col(in);
       break;
+#endif  // CONFIG_EXT_TX
     default: assert(0); break;
   }
   array_transpose_8x8(l, l);
@@ -3436,6 +3440,7 @@
   fdct16_sse2(tl, tr);
 }
 
+#if CONFIG_EXT_TX
 static INLINE void fidtx32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
                                  __m128i *br) {
   int i;
@@ -3448,6 +3453,7 @@
   array_transpose_16x16(tl, tr);
   array_transpose_16x16(bl, br);
 }
+#endif  // CONFIG_EXT_TX
 
 static INLINE void load_buffer_16x32(const int16_t *input, __m128i *intl,
                                      __m128i *intr, __m128i *inbl,
@@ -3507,7 +3513,7 @@
   }
 }
 
-// Note on data layout, for both this and the 32x16 tranforms:
+// Note on data layout, for both this and the 32x16 transforms:
 // So that we can reuse the 16-element transforms easily,
 // we want to split the input into 8x16 blocks.
 // For 16x32, this means the input is a 2x2 grid of such blocks.
@@ -3541,6 +3547,7 @@
       fadst16_sse2(intl, intr);
       fadst16_sse2(inbl, inbr);
       break;
+#if CONFIG_EXT_TX
     case FLIPADST_DCT:
       load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0);
       fhalfright32_16col(intl, intr, inbl, inbr);
@@ -3613,6 +3620,7 @@
       fadst16_sse2(intl, intr);
       fadst16_sse2(inbl, inbr);
       break;
+#endif  // CONFIG_EXT_TX
     default: assert(0); break;
   }
   write_buffer_16x32(output, intl, intr, inbl, inbr);
@@ -3671,31 +3679,29 @@
                        int tx_type) {
   __m128i in0[16], in1[16], in2[16], in3[16];
 
+  load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
   switch (tx_type) {
     case DCT_DCT:
-      load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
       fdct16_sse2(in0, in1);
       fdct16_sse2(in2, in3);
       fdct32_16col(in0, in1, in2, in3);
       break;
     case ADST_DCT:
-      load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
       fadst16_sse2(in0, in1);
       fadst16_sse2(in2, in3);
       fdct32_16col(in0, in1, in2, in3);
       break;
     case DCT_ADST:
-      load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
       fdct16_sse2(in0, in1);
       fdct16_sse2(in2, in3);
       fhalfright32_16col(in0, in1, in2, in3);
       break;
     case ADST_ADST:
-      load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
       fadst16_sse2(in0, in1);
       fadst16_sse2(in2, in3);
       fhalfright32_16col(in0, in1, in2, in3);
       break;
+#if CONFIG_EXT_TX
     case FLIPADST_DCT:
       load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0);
       fadst16_sse2(in0, in1);
@@ -3768,8 +3774,8 @@
       fidtx16_sse2(in2, in3);
       fhalfright32_16col(in0, in1, in2, in3);
       break;
+#endif  // CONFIG_EXT_TX
     default: assert(0); break;
   }
   write_buffer_32x16(output, in0, in1, in2, in3);
 }
-#endif  // CONFIG_EXT_TX