Hybrid 1-D/2-D transform coding

This commit enables a hybrid 1-D/2-D transform coding scheme and
the accompanying entropy coding system. It currently uses hybrid
1-D/2-D DCT transform coding. It provides coding performance gains:

lowres_all  0.55%
hdres_all   0.43%

Change-Id: I2b30dcafd21eb2bb3371f6e854cbab440a4dfa78
diff --git a/vp10/common/blockd.h b/vp10/common/blockd.h
index f96aa2e..3cb73e3 100644
--- a/vp10/common/blockd.h
+++ b/vp10/common/blockd.h
@@ -360,7 +360,7 @@
 #define USE_MSKTX_FOR_32X32      1
 
 static const int num_ext_tx_set_inter[EXT_TX_SETS_INTER] = {
-  1, 17, 10, 2
+  1, 19, 12, 2
 };
 static const int num_ext_tx_set_intra[EXT_TX_SETS_INTRA] = {
   1, 17, 10
@@ -421,10 +421,10 @@
 
 // Transform types used in each inter set
 static const int ext_tx_used_inter[EXT_TX_SETS_INTER][TX_TYPES] = {
-  { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
-  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, },
-  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, },
-  { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, },
+  { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  // 1 type
+  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  // 19 types
+  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1 },  // 12 types
+  { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  // 2 types
 };
 
 static INLINE int get_ext_tx_types(TX_SIZE tx_size, BLOCK_SIZE bs,
diff --git a/vp10/common/entropymode.c b/vp10/common/entropymode.c
index 8bb653c..e4c27a7 100644
--- a/vp10/common/entropymode.c
+++ b/vp10/common/entropymode.c
@@ -838,46 +838,52 @@
                                            [TREE_SIZE(TX_TYPES)] = {
   { // ToDo(yaowu): remove used entry 0.
     -IDTX, 2,
-    -DCT_DCT, 4,
-    -DST_DST, 6,
-    8, 18,
-    10, 12,
-    -DST_DCT, -DCT_DST,
+    -V_DCT, 4,
+    -H_DCT, 6,
+    -DCT_DCT, 8,
+    -DST_DST, 10,
+    12, 22,
     14, 16,
+    -DST_DCT, -DCT_DST,
+    18, 20,
     -ADST_DCT, -DCT_ADST,
     -FLIPADST_DCT, -DCT_FLIPADST,
-    20, 26,
-    22, 24,
+    24, 30,
+    26, 28,
     -DST_ADST, -ADST_DST,
     -DST_FLIPADST, -FLIPADST_DST,
-    28, 30,
+    32, 34,
     -ADST_ADST, -FLIPADST_FLIPADST,
     -ADST_FLIPADST, -FLIPADST_ADST,
   }, {
     -IDTX, 2,
-    -DCT_DCT, 4,
-    -DST_DST, 6,
-    8, 18,
-    10, 12,
-    -DST_DCT, -DCT_DST,
+    -V_DCT, 4,
+    -H_DCT, 6,
+    -DCT_DCT, 8,
+    -DST_DST, 10,
+    12, 22,
     14, 16,
+    -DST_DCT, -DCT_DST,
+    18, 20,
     -ADST_DCT, -DCT_ADST,
     -FLIPADST_DCT, -DCT_FLIPADST,
-    20, 26,
-    22, 24,
+    24, 30,
+    26, 28,
     -DST_ADST, -ADST_DST,
     -DST_FLIPADST, -FLIPADST_DST,
-    28, 30,
+    32, 34,
     -ADST_ADST, -FLIPADST_FLIPADST,
     -ADST_FLIPADST, -FLIPADST_ADST,
   }, {
     -IDTX, 2,
-    -DCT_DCT, 4,
-    6, 12,
-    8, 10,
+    -V_DCT, 4,
+    -H_DCT, 6,
+    -DCT_DCT, 8,
+    10, 16,
+    12, 14,
     -ADST_DCT, -DCT_ADST,
     -FLIPADST_DCT, -DCT_FLIPADST,
-    14, 16,
+    18, 20,
     -ADST_ADST, -FLIPADST_FLIPADST,
     -ADST_FLIPADST, -FLIPADST_ADST
   }, {
@@ -937,33 +943,33 @@
 static const vpx_prob
 default_inter_ext_tx_prob[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES - 1] = {
   { // ToDo(yaowu): remove unused entry 0.
-    { 12, 112, 16, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-    128 },
-    { 12, 112, 16, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-    128 },
-    { 12, 112, 16, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-    128 },
+    { 12, 15, 15, 112, 16, 128, 128, 128, 128, 128,
+      128, 128, 128, 128, 128, 128, 128, 128 },
+    { 12, 15, 15, 112, 16, 128, 128, 128, 128, 128,
+      128, 128, 128, 128, 128, 128, 128, 128 },
+    { 12, 15, 15, 112, 16, 128, 128, 128, 128, 128,
+      128, 128, 128, 128, 128, 128, 128, 128 },
 #if EXT_TX_SIZES == 4
-    { 12, 112, 16, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-    128 },
+    { 12, 15, 15, 112, 16, 128, 128, 128, 128, 128,
+      128, 128, 128, 128, 128, 128, 128, 128 },
 #endif
   }, {
-    { 12, 112, 16, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-      128 },
-    { 12, 112, 16, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-      128 },
-    { 12, 112, 16, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-      128 },
+    { 12, 15, 15, 112, 16, 128, 128, 128, 128, 128,
+      128, 128, 128, 128, 128, 128, 128, 128 },
+    { 12, 15, 15, 112, 16, 128, 128, 128, 128, 128,
+      128, 128, 128, 128, 128, 128, 128, 128 },
+    { 12, 15, 15, 112, 16, 128, 128, 128, 128, 128,
+      128, 128, 128, 128, 128, 128, 128, 128 },
 #if EXT_TX_SIZES == 4
-    { 12, 160, 16, 144, 160, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-      128 },
+    { 12, 15, 15, 160, 16, 144, 160, 128, 128, 128,
+      128, 128, 128, 128, 128, 128, 128, 128 },
 #endif
   }, {
-    { 12, 112, 128, 128, 128, 128, 128, 128, 128 },
-    { 12, 112, 128, 128, 128, 128, 128, 128, 128 },
-    { 12, 112, 128, 128, 128, 128, 128, 128, 128 },
+    { 12, 15, 15, 112, 128, 128, 128, 128, 128, 128, 128 },
+    { 12, 15, 15, 112, 128, 128, 128, 128, 128, 128, 128 },
+    { 12, 15, 15, 112, 128, 128, 128, 128, 128, 128, 128 },
 #if EXT_TX_SIZES == 4
-    { 12, 160, 128, 128, 128, 128, 128, 128, 128 },
+    { 12, 15, 15, 160, 128, 128, 128, 128, 128, 128, 128 },
 #endif
   }, {
     { 12, },
diff --git a/vp10/common/enums.h b/vp10/common/enums.h
index 2233649..4e3a5b1 100644
--- a/vp10/common/enums.h
+++ b/vp10/common/enums.h
@@ -108,6 +108,8 @@
   FLIPADST_DST = 14,
   DST_DST = 15,
   IDTX = 16,
+  V_DCT = 17,
+  H_DCT = 18,
 #endif  // CONFIG_EXT_TX
   TX_TYPES,
 } TX_TYPE;
diff --git a/vp10/common/idct.c b/vp10/common/idct.c
index dbb50fb..a941f64 100644
--- a/vp10/common/idct.c
+++ b/vp10/common/idct.c
@@ -326,11 +326,79 @@
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // CONFIG_EXT_TX
 
-// Inverse identiy transform and add.
+// Inverse identity (IDTX) or 1-D DCT (V_DCT / H_DCT) transform and add.
 static void inv_idtx_add_c(const tran_low_t *input, uint8_t *dest, int stride,
-                           int bs) {
+                           int bs, int tx_type) {
   int r, c;
   const int shift = bs < 32 ? 3 : 2;
+
+  tran_low_t temp_in[32], temp_out[32];
+  transform_2d ht = {idct4_c, idct4_c};
+  int out_scale = 1;
+  int coeff_stride = 0;
+
+  switch (bs) {
+    case 4:
+      ht.cols = idct4_c;
+      ht.rows = idct4_c;
+      out_scale = cospi_16_64 >> 3;
+      coeff_stride = 4;
+      break;
+    case 8:
+      ht.cols = idct8_c;
+      ht.rows = idct8_c;
+      out_scale = (1 << (DCT_CONST_BITS - 4));
+      coeff_stride = 8;
+      break;
+    case 16:
+      ht.cols = idct16_c;
+      ht.rows = idct16_c;
+      out_scale = cospi_16_64 >> 4;
+      coeff_stride = 16;
+      break;
+    case 32:
+      ht.cols = idct32_c;
+      ht.rows = idct32_c;
+      out_scale = (1 << (DCT_CONST_BITS - 4));
+      coeff_stride = 32;
+      break;
+    default:
+      assert(0);
+  }
+
+  // Columns
+  if (tx_type == V_DCT) {
+    for (c = 0; c < bs; ++c) {
+      for (r = 0; r < bs; ++r)
+        temp_in[r] = input[r * coeff_stride + c];
+      ht.cols(temp_in, temp_out);
+
+      for (r = 0; r < bs; ++r) {
+        tran_high_t temp = (tran_high_t)temp_out[r] * out_scale;
+        temp >>= DCT_CONST_BITS;
+        dest[r * stride + c] = clip_pixel_add(dest[r * stride + c],
+                                              (tran_low_t)temp);
+      }
+    }
+    return;
+  }
+
+  if (tx_type == H_DCT) {
+    for (r = 0; r < bs; ++r) {
+      for (c = 0; c < bs; ++c)
+        temp_in[c] = input[r * coeff_stride + c];
+      ht.rows(temp_in, temp_out);
+
+      for (c = 0; c < bs; ++c) {
+        tran_high_t temp = (tran_high_t)temp_out[c] * out_scale;
+        temp >>= DCT_CONST_BITS;
+        dest[r * stride + c] = clip_pixel_add(dest[r * stride + c],
+                                              (tran_low_t)temp);
+      }
+    }
+    return;
+  }
+
   for (r = 0; r < bs; ++r) {
     for (c = 0; c < bs; ++c)
       dest[c] = clip_pixel_add(dest[c], input[c] >> shift);
@@ -360,6 +428,8 @@
     case DST_DCT:
     case DST_ADST:
     case ADST_DST:
+    case V_DCT:
+    case H_DCT:
       break;
     case FLIPADST_DCT:
     case FLIPADST_ADST:
@@ -1031,8 +1101,10 @@
       // Use C version since DST only exists in C code
       vp10_iht4x4_16_add_c(input, dest, stride, tx_type);
       break;
+    case H_DCT:
+    case V_DCT:
     case IDTX:
-      inv_idtx_add_c(input, dest, stride, 4);
+      inv_idtx_add_c(input, dest, stride, 4, tx_type);
       break;
 #endif  // CONFIG_EXT_TX
     default:
@@ -1070,8 +1142,10 @@
       // Use C version since DST only exists in C code
       vp10_iht8x8_64_add_c(input, dest, stride, tx_type);
       break;
+    case H_DCT:
+    case V_DCT:
     case IDTX:
-      inv_idtx_add_c(input, dest, stride, 8);
+      inv_idtx_add_c(input, dest, stride, 8, tx_type);
       break;
 #endif  // CONFIG_EXT_TX
     default:
@@ -1109,8 +1183,10 @@
       // Use C version since DST only exists in C code
       vp10_iht16x16_256_add_c(input, dest, stride, tx_type);
       break;
+    case H_DCT:
+    case V_DCT:
     case IDTX:
-      inv_idtx_add_c(input, dest, stride, 16);
+      inv_idtx_add_c(input, dest, stride, 16, tx_type);
       break;
 #endif  // CONFIG_EXT_TX
     default:
@@ -1143,8 +1219,10 @@
     case DST_FLIPADST:
       vp10_iht32x32_1024_add_c(input, dest, stride, tx_type);
       break;
+    case H_DCT:
+    case V_DCT:
     case IDTX:
-      inv_idtx_add_c(input, dest, stride, 32);
+      inv_idtx_add_c(input, dest, stride, 32, tx_type);
       break;
 #endif  // CONFIG_EXT_TX
     default:
diff --git a/vp10/common/scan.c b/vp10/common/scan.c
index 672ac1d..21d291f 100644
--- a/vp10/common/scan.c
+++ b/vp10/common/scan.c
@@ -1790,6 +1790,8 @@
     {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
     {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
     {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {row_scan_4x4,     vp10_row_iscan_4x4,     row_scan_4x4_neighbors},
+    {col_scan_4x4,     vp10_col_iscan_4x4,     col_scan_4x4_neighbors},
   }, {  // TX_8X8
     {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
     {row_scan_8x8,     vp10_row_iscan_8x8,     row_scan_8x8_neighbors},
@@ -1808,6 +1810,8 @@
     {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
     {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
     {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {row_scan_8x8,     vp10_row_iscan_8x8,     row_scan_8x8_neighbors},
+    {col_scan_8x8,     vp10_col_iscan_8x8,     col_scan_8x8_neighbors},
   }, {  // TX_16X16
     {default_scan_16x16, vp10_default_iscan_16x16,
      default_scan_16x16_neighbors},
@@ -1841,6 +1845,8 @@
      default_scan_16x16_neighbors},
     {default_scan_16x16, vp10_default_iscan_16x16,
      default_scan_16x16_neighbors},
+     {row_scan_16x16,     vp10_row_iscan_16x16,     row_scan_16x16_neighbors},
+     {col_scan_16x16,     vp10_col_iscan_16x16,     col_scan_16x16_neighbors},
   }, {  // TX_32X32
     {default_scan_32x32, vp10_default_iscan_32x32,
      default_scan_32x32_neighbors},
@@ -1876,6 +1882,10 @@
      qtr_scan_32x32_neighbors},
     {default_scan_32x32, vp10_default_iscan_32x32,
      default_scan_32x32_neighbors},
+     {h2_scan_32x32, vp10_h2_iscan_32x32,
+      h2_scan_32x32_neighbors},
+     {v2_scan_32x32, vp10_v2_iscan_32x32,
+      v2_scan_32x32_neighbors},
   }
 };
 
@@ -1898,6 +1908,8 @@
     {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
     {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
     {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {row_scan_4x4,     vp10_row_iscan_4x4,     row_scan_4x4_neighbors},
+    {col_scan_4x4,     vp10_col_iscan_4x4,     col_scan_4x4_neighbors},
   }, {  // TX_8X8
     {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
     {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
@@ -1916,6 +1928,8 @@
     {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
     {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
     {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {row_scan_8x8,     vp10_row_iscan_8x8,     row_scan_8x8_neighbors},
+    {col_scan_8x8,     vp10_col_iscan_8x8,     col_scan_8x8_neighbors},
   }, {  // TX_16X16
     {default_scan_16x16, vp10_default_iscan_16x16,
      default_scan_16x16_neighbors},
@@ -1951,6 +1965,8 @@
      default_scan_16x16_neighbors},
     {default_scan_16x16, vp10_default_iscan_16x16,
      default_scan_16x16_neighbors},
+     {row_scan_16x16,     vp10_row_iscan_16x16,     row_scan_16x16_neighbors},
+     {col_scan_16x16,     vp10_col_iscan_16x16,     col_scan_16x16_neighbors},
   }, {  // TX_32X32
     {default_scan_32x32, vp10_default_iscan_32x32,
      default_scan_32x32_neighbors},
@@ -1986,6 +2002,10 @@
      qtr_scan_32x32_neighbors},
     {default_scan_32x32, vp10_default_iscan_32x32,
      default_scan_32x32_neighbors},
+     {h2_scan_32x32, vp10_h2_iscan_32x32,
+      h2_scan_32x32_neighbors},
+     {v2_scan_32x32, vp10_v2_iscan_32x32,
+      v2_scan_32x32_neighbors},
   }
 };
 
diff --git a/vp10/common/vp10_rtcd_defs.pl b/vp10/common/vp10_rtcd_defs.pl
index c9f0295..2344ce2 100644
--- a/vp10/common/vp10_rtcd_defs.pl
+++ b/vp10/common/vp10_rtcd_defs.pl
@@ -426,6 +426,9 @@
   specialize qw/vp10_fwht4x4 msa/, "$mmx_x86inc";
 }
 
+add_proto qw/void vp10_fwd_idtx/, "const int16_t *src_diff, tran_low_t *coeff, int stride, int bs, int tx_type";
+  specialize qw/vp10_fwd_idtx/;
+
 # Inverse transform
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   # Note as optimized versions of these functions are added we need to add a check to ensure
diff --git a/vp10/encoder/dct.c b/vp10/encoder/dct.c
index 333adbb..31a4c87 100644
--- a/vp10/encoder/dct.c
+++ b/vp10/encoder/dct.c
@@ -1315,6 +1315,8 @@
     case DST_DCT:
     case DST_ADST:
     case ADST_DST:
+    case H_DCT:
+    case V_DCT:
       break;
     case FLIPADST_DCT:
     case FLIPADST_ADST:
@@ -1758,6 +1760,95 @@
   }
 }
 
+// Forward transform for IDTX (identity), V_DCT and H_DCT (1-D DCT only).
+void vp10_fwd_idtx_c(const int16_t *src_diff, tran_low_t *coeff, int stride,
+                     int bs, int tx_type) {
+  int r, c, i, j;
+  const int shift = bs < 32 ? 3 : 2;  // IDTX gain: 8x, or 4x when bs is 32.
+
+  const int16_t *input = src_diff;
+  tran_low_t *output = coeff;
+
+  // Scratch for one column/row of input and its 1-D transform.
+  tran_low_t temp_in[32], temp_out[32];
+  transform_2d ht = {fdct4, fdct4};
+  int in_scale = 1;
+  int out_scale = 1;
+  int coeff_stride = 0;
+
+  switch (bs) {
+    case 4:
+      ht.cols = fdct4;
+      ht.rows = fdct4;
+      in_scale = 16;
+      out_scale = cospi_16_64 >> 1;
+      coeff_stride = 4;
+      break;
+    case 8:
+      ht.cols = fdct8;
+      ht.rows = fdct8;
+      in_scale = 4;
+      out_scale = (1 << DCT_CONST_BITS);
+      coeff_stride = 8;
+      break;
+    case 16:
+      ht.cols = fdct16;
+      ht.rows = fdct16;
+      in_scale = 4;
+      out_scale = cospi_16_64;
+      coeff_stride = 16;
+      break;
+    case 32:
+      ht.cols = fdct32;
+      ht.rows = fdct32;
+      in_scale = 4;
+      out_scale = (1 << (DCT_CONST_BITS - 2));
+      coeff_stride = 32;
+      break;
+    default:
+      assert(0);
+  }
+
+  // V_DCT: 1-D transform down each column; nothing is applied across rows.
+  if (tx_type == V_DCT) {
+    for (i = 0; i < bs; ++i) {
+      for (j = 0; j < bs; ++j)
+        temp_in[j] = input[j * stride + i] * in_scale;
+      ht.cols(temp_in, temp_out);
+
+      for (j = 0; j < bs; ++j) {
+        tran_high_t temp = (tran_high_t)temp_out[j] * out_scale;
+        temp >>= DCT_CONST_BITS;
+        output[j * coeff_stride + i] = (tran_low_t)temp;
+      }
+    }
+    return;
+  }
+
+  // H_DCT: 1-D transform along each row; nothing is applied down columns.
+  if (tx_type == H_DCT) {
+    for (j = 0; j < bs; ++j) {
+      for (i = 0; i < bs; ++i)
+        temp_in[i] = input[j * stride + i] * in_scale;
+      ht.rows(temp_in, temp_out);
+
+      for (i = 0; i < bs; ++i) {
+        tran_high_t temp = (tran_high_t)temp_out[i] * out_scale;
+        temp >>= DCT_CONST_BITS;
+        output[j * coeff_stride + i] = (tran_low_t)temp;
+      }
+    }
+    return;
+  }
+
+  // IDTX: scale only. Multiply, as << of a negative value is undefined.
+  for (r = 0; r < bs; ++r) {
+    for (c = 0; c < bs; ++c) coeff[c] = src_diff[c] * (1 << shift);
+    src_diff += stride;
+    coeff += bs;
+  }
+}
+
 #if CONFIG_VP9_HIGHBITDEPTH
 void vp10_highbd_fht32x32_c(const int16_t *input, tran_low_t *output,
                             int stride, int tx_type) {
diff --git a/vp10/encoder/hybrid_fwd_txfm.c b/vp10/encoder/hybrid_fwd_txfm.c
index 03d9c6d..029240f 100644
--- a/vp10/encoder/hybrid_fwd_txfm.c
+++ b/vp10/encoder/hybrid_fwd_txfm.c
@@ -33,21 +33,6 @@
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-#if CONFIG_EXT_TX
-// Forward identity transform.
-static void fwd_idtx_c(const int16_t *src_diff, tran_low_t *coeff, int stride,
-                       int bs) {
-  int r, c;
-  const int shift = bs < 32 ? 3 : 2;
-
-  for (r = 0; r < bs; ++r) {
-    for (c = 0; c < bs; ++c) coeff[c] = src_diff[c] << shift;
-    src_diff += stride;
-    coeff += bs;
-  }
-}
-#endif  // CONFIG_EXT_TX
-
 void vp10_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
                        int diff_stride, TX_TYPE tx_type, int lossless) {
   if (lossless) {
@@ -78,8 +63,10 @@
     case FLIPADST_DST:
       vp10_fht4x4(src_diff, coeff, diff_stride, tx_type);
       break;
+    case H_DCT:
+    case V_DCT:
     case IDTX:
-      fwd_idtx_c(src_diff, coeff, diff_stride, 4);
+      vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 4, tx_type);
       break;
 #endif  // CONFIG_EXT_TX
     default:
@@ -116,8 +103,10 @@
     case FLIPADST_DST:
       vp10_fht8x8(src_diff, coeff, diff_stride, tx_type);
       break;
+    case H_DCT:
+    case V_DCT:
     case IDTX:
-      fwd_idtx_c(src_diff, coeff, diff_stride, 8);
+      vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 8, tx_type);
       break;
 #endif  // CONFIG_EXT_TX
     default:
@@ -157,8 +146,10 @@
       // Use C version since DST exists only in C
       vp10_fht16x16_c(src_diff, coeff, diff_stride, tx_type);
       break;
+    case H_DCT:
+    case V_DCT:
     case IDTX:
-      fwd_idtx_c(src_diff, coeff, diff_stride, 16);
+      vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 16, tx_type);
       break;
 #endif  // CONFIG_EXT_TX
     default:
@@ -195,8 +186,10 @@
     case FLIPADST_DST:
       vp10_fht32x32_c(src_diff, coeff, diff_stride, tx_type);
       break;
+    case H_DCT:
+    case V_DCT:
     case IDTX:
-      fwd_idtx_c(src_diff, coeff, diff_stride, 32);
+      vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 32, tx_type);
       break;
 #endif  // CONFIG_EXT_TX
     default:
@@ -240,7 +233,7 @@
       vp10_highbd_fht4x4_c(src_diff, coeff, diff_stride, tx_type);
       break;
     case IDTX:
-      fwd_idtx_c(src_diff, coeff, diff_stride, 4);
+      vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 4, tx_type);
       break;
 #endif  // CONFIG_EXT_TX
     default:
@@ -282,7 +275,7 @@
       vp10_highbd_fht8x8_c(src_diff, coeff, diff_stride, tx_type);
       break;
     case IDTX:
-      fwd_idtx_c(src_diff, coeff, diff_stride, 8);
+      vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 8, tx_type);
       break;
 #endif  // CONFIG_EXT_TX
     default:
@@ -324,7 +317,7 @@
       vp10_highbd_fht16x16_c(src_diff, coeff, diff_stride, tx_type);
       break;
     case IDTX:
-      fwd_idtx_c(src_diff, coeff, diff_stride, 16);
+      vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 16, tx_type);
       break;
 #endif  // CONFIG_EXT_TX
     default:
@@ -362,7 +355,7 @@
       vp10_highbd_fht32x32_c(src_diff, coeff, diff_stride, tx_type);
       break;
     case IDTX:
-      fwd_idtx_c(src_diff, coeff, diff_stride, 32);
+      vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 32, tx_type);
       break;
 #endif  // CONFIG_EXT_TX
     default: