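Add the missing IDTX type to the SIMD hybrid transforms

Handle IDTX in the SSE2 and AVX2 forward and inverse hybrid transform
switches. Drop the CONFIG_EXT_TX special case in hybrid_fwd_txfm.c that
routed IDTX to av1_fwd_idtx_c, so that IDTX now goes through av1_fht*
like the other transform types. Add the corresponding entries (value 9)
to the fht test parameter lists.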

Change-Id: I99b15e5270bfefe2eb3e982aeba06ed564540d73
diff --git a/av1/common/x86/hybrid_inv_txfm_avx2.c b/av1/common/x86/hybrid_inv_txfm_avx2.c
index 754152c..9f69959 100644
--- a/av1/common/x86/hybrid_inv_txfm_avx2.c
+++ b/av1/common/x86/hybrid_inv_txfm_avx2.c
@@ -470,6 +470,10 @@
       iadst16(in);
       flip_col(&dest, &stride, 16);
       break;
+    case IDTX:
+      iidtx16(in);
+      iidtx16(in);
+      break;
     case V_DCT:
       iidtx16(in);
       idct16(in);
diff --git a/av1/common/x86/idct_intrin_sse2.c b/av1/common/x86/idct_intrin_sse2.c
index 62c2731..293fedd 100644
--- a/av1/common/x86/idct_intrin_sse2.c
+++ b/av1/common/x86/idct_intrin_sse2.c
@@ -494,6 +494,10 @@
       aom_iadst16_sse2(in0, in1);
       FLIPUD_PTR(dest, stride, 16);
       break;
+    case IDTX:
+      iidtx16_sse2(in0, in1);
+      iidtx16_sse2(in0, in1);
+      break;
     case V_DCT:
       iidtx16_sse2(in0, in1);
       aom_idct16_sse2(in0, in1);
diff --git a/av1/encoder/hybrid_fwd_txfm.c b/av1/encoder/hybrid_fwd_txfm.c
index fbc0f07..fe20c55 100644
--- a/av1/encoder/hybrid_fwd_txfm.c
+++ b/av1/encoder/hybrid_fwd_txfm.c
@@ -52,12 +52,7 @@
     return;
   }
 
-#if CONFIG_EXT_TX
-  if (tx_type == IDTX)
-    av1_fwd_idtx_c(src_diff, coeff, diff_stride, 4, tx_type);
-  else
-#endif
-    av1_fht4x4(src_diff, coeff, diff_stride, tx_type);
+  av1_fht4x4(src_diff, coeff, diff_stride, tx_type);
 }
 
 static void fwd_txfm_4x8(const int16_t *src_diff, tran_low_t *coeff,
@@ -106,36 +101,21 @@
                          int diff_stride, TX_TYPE tx_type,
                          FWD_TXFM_OPT fwd_txfm_opt) {
   (void)fwd_txfm_opt;
-#if CONFIG_EXT_TX
-  if (tx_type == IDTX)
-    av1_fwd_idtx_c(src_diff, coeff, diff_stride, 8, tx_type);
-  else
-#endif
-    av1_fht8x8(src_diff, coeff, diff_stride, tx_type);
+  av1_fht8x8(src_diff, coeff, diff_stride, tx_type);
 }
 
 static void fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
                            int diff_stride, TX_TYPE tx_type,
                            FWD_TXFM_OPT fwd_txfm_opt) {
   (void)fwd_txfm_opt;
-#if CONFIG_EXT_TX
-  if (tx_type == IDTX)
-    av1_fwd_idtx_c(src_diff, coeff, diff_stride, 16, tx_type);
-  else
-#endif
-    av1_fht16x16(src_diff, coeff, diff_stride, tx_type);
+  av1_fht16x16(src_diff, coeff, diff_stride, tx_type);
 }
 
 static void fwd_txfm_32x32(const int16_t *src_diff, tran_low_t *coeff,
                            int diff_stride, TX_TYPE tx_type,
                            FWD_TXFM_OPT fwd_txfm_opt) {
   (void)fwd_txfm_opt;
-#if CONFIG_EXT_TX
-  if (tx_type == IDTX)
-    av1_fwd_idtx_c(src_diff, coeff, diff_stride, 32, tx_type);
-  else
-#endif
-    av1_fht32x32(src_diff, coeff, diff_stride, tx_type);
+  av1_fht32x32(src_diff, coeff, diff_stride, tx_type);
 }
 
 #if CONFIG_TX64X64
diff --git a/av1/encoder/x86/dct_intrin_sse2.c b/av1/encoder/x86/dct_intrin_sse2.c
index 6bba11a..f613dc0 100644
--- a/av1/encoder/x86/dct_intrin_sse2.c
+++ b/av1/encoder/x86/dct_intrin_sse2.c
@@ -257,6 +257,12 @@
       fadst4_sse2(in);
       write_buffer_4x4(output, in);
       break;
+    case IDTX:
+      load_buffer_4x4(input, in, stride, 0, 0);
+      fidtx4_sse2(in);
+      fidtx4_sse2(in);
+      write_buffer_4x4(output, in);
+      break;
     case V_DCT:
       load_buffer_4x4(input, in, stride, 0, 0);
       fdct4_sse2(in);
@@ -1357,6 +1363,13 @@
       right_shift_8x8(in, 1);
       write_buffer_8x8(output, in, 8);
       break;
+    case IDTX:
+      load_buffer_8x8(input, in, stride, 0, 0);
+      fidtx8_sse2(in);
+      fidtx8_sse2(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
+      break;
     case V_DCT:
       load_buffer_8x8(input, in, stride, 0, 0);
       fdct8_sse2(in);
@@ -2579,6 +2592,13 @@
       fadst16_sse2(in0, in1);
       write_buffer_16x16(output, in0, in1, 16);
       break;
+    case IDTX:
+      load_buffer_16x16(input, in0, in1, stride, 0, 0);
+      fidtx16_sse2(in0, in1);
+      right_shift_16x16(in0, in1);
+      fidtx16_sse2(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
     case V_DCT:
       load_buffer_16x16(input, in0, in1, stride, 0, 0);
       fdct16_sse2(in0, in1);
diff --git a/av1/encoder/x86/hybrid_fwd_txfm_avx2.c b/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
index 0b7b1b6..53e970f 100644
--- a/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
+++ b/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
@@ -1025,6 +1025,13 @@
       right_shift_16x16(in);
       fadst16_avx2(in);
       break;
+    case IDTX:
+      load_buffer_16x16(input, stride, 0, 0, in);
+      fidtx16_avx2(in);
+      mm256_transpose_16x16(in);
+      right_shift_16x16(in);
+      fidtx16_avx2(in);
+      break;
     case V_DCT:
       load_buffer_16x16(input, stride, 0, 0, in);
       fdct16_avx2(in);
@@ -1621,6 +1628,12 @@
       right_shift_32x32(in0, in1);
       fhalfright32_avx2(in0, in1);
       break;
+    case IDTX:
+      load_buffer_32x32(input, stride, 0, 0, in0, in1);
+      fidtx32_avx2(in0, in1);
+      right_shift_32x32(in0, in1);
+      fidtx32_avx2(in0, in1);
+      break;
     case V_DCT:
       load_buffer_32x32(input, stride, 0, 0, in0, in1);
       fdct32_avx2(in0, in1);
diff --git a/test/av1_fht16x16_test.cc b/test/av1_fht16x16_test.cc
index 0eee35b..b1c4b53 100644
--- a/test/av1_fht16x16_test.cc
+++ b/test/av1_fht16x16_test.cc
@@ -184,6 +184,8 @@
              256),
   make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, 8, AOM_BITS_8,
              256),
+  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, 9, AOM_BITS_8,
+             256),
   make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, 10, AOM_BITS_8,
              256),
   make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, 11, AOM_BITS_8,
@@ -223,6 +225,8 @@
              256),
   make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, 8, AOM_BITS_8,
              256),
+  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, 9, AOM_BITS_8,
+             256),
   make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, 10, AOM_BITS_8,
              256),
   make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, 11, AOM_BITS_8,
diff --git a/test/av1_fht4x4_test.cc b/test/av1_fht4x4_test.cc
index 611b867..bf0f0d6 100644
--- a/test/av1_fht4x4_test.cc
+++ b/test/av1_fht4x4_test.cc
@@ -177,6 +177,7 @@
   make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 6, AOM_BITS_8, 16),
   make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 7, AOM_BITS_8, 16),
   make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 8, AOM_BITS_8, 16),
+  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 9, AOM_BITS_8, 16),
   make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 10, AOM_BITS_8, 16),
   make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 11, AOM_BITS_8, 16),
   make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 12, AOM_BITS_8, 16),
diff --git a/test/av1_fht8x8_test.cc b/test/av1_fht8x8_test.cc
index 29ca004..72c3766 100644
--- a/test/av1_fht8x8_test.cc
+++ b/test/av1_fht8x8_test.cc
@@ -177,6 +177,7 @@
   make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 6, AOM_BITS_8, 64),
   make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 7, AOM_BITS_8, 64),
   make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 8, AOM_BITS_8, 64),
+  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 9, AOM_BITS_8, 64),
   make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 10, AOM_BITS_8, 64),
   make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 11, AOM_BITS_8, 64),
   make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 12, AOM_BITS_8, 64),
diff --git a/test/fht32x32_test.cc b/test/fht32x32_test.cc
index 4589dc7..5a6fda9 100644
--- a/test/fht32x32_test.cc
+++ b/test/fht32x32_test.cc
@@ -201,6 +201,7 @@
   make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, 6, AOM_BITS_8, 1024),
   make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, 7, AOM_BITS_8, 1024),
   make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, 8, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, 9, AOM_BITS_8, 1024),
   make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, 10, AOM_BITS_8, 1024),
   make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, 11, AOM_BITS_8, 1024),
   make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, 12, AOM_BITS_8, 1024),