Add the missing IDTX transform type to the SSE2/AVX2 hybrid transform (txfm) optimizations
Change-Id: I99b15e5270bfefe2eb3e982aeba06ed564540d73
diff --git a/av1/common/x86/hybrid_inv_txfm_avx2.c b/av1/common/x86/hybrid_inv_txfm_avx2.c
index 754152c..9f69959 100644
--- a/av1/common/x86/hybrid_inv_txfm_avx2.c
+++ b/av1/common/x86/hybrid_inv_txfm_avx2.c
@@ -470,6 +470,10 @@
iadst16(in);
flip_col(&dest, &stride, 16);
break;
+ case IDTX:
+ iidtx16(in);
+ iidtx16(in);
+ break;
case V_DCT:
iidtx16(in);
idct16(in);
diff --git a/av1/common/x86/idct_intrin_sse2.c b/av1/common/x86/idct_intrin_sse2.c
index 62c2731..293fedd 100644
--- a/av1/common/x86/idct_intrin_sse2.c
+++ b/av1/common/x86/idct_intrin_sse2.c
@@ -494,6 +494,10 @@
aom_iadst16_sse2(in0, in1);
FLIPUD_PTR(dest, stride, 16);
break;
+ case IDTX:
+ iidtx16_sse2(in0, in1);
+ iidtx16_sse2(in0, in1);
+ break;
case V_DCT:
iidtx16_sse2(in0, in1);
aom_idct16_sse2(in0, in1);
diff --git a/av1/encoder/hybrid_fwd_txfm.c b/av1/encoder/hybrid_fwd_txfm.c
index fbc0f07..fe20c55 100644
--- a/av1/encoder/hybrid_fwd_txfm.c
+++ b/av1/encoder/hybrid_fwd_txfm.c
@@ -52,12 +52,7 @@
return;
}
-#if CONFIG_EXT_TX
- if (tx_type == IDTX)
- av1_fwd_idtx_c(src_diff, coeff, diff_stride, 4, tx_type);
- else
-#endif
- av1_fht4x4(src_diff, coeff, diff_stride, tx_type);
+ av1_fht4x4(src_diff, coeff, diff_stride, tx_type);
}
static void fwd_txfm_4x8(const int16_t *src_diff, tran_low_t *coeff,
@@ -106,36 +101,21 @@
int diff_stride, TX_TYPE tx_type,
FWD_TXFM_OPT fwd_txfm_opt) {
(void)fwd_txfm_opt;
-#if CONFIG_EXT_TX
- if (tx_type == IDTX)
- av1_fwd_idtx_c(src_diff, coeff, diff_stride, 8, tx_type);
- else
-#endif
- av1_fht8x8(src_diff, coeff, diff_stride, tx_type);
+ av1_fht8x8(src_diff, coeff, diff_stride, tx_type);
}
static void fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
int diff_stride, TX_TYPE tx_type,
FWD_TXFM_OPT fwd_txfm_opt) {
(void)fwd_txfm_opt;
-#if CONFIG_EXT_TX
- if (tx_type == IDTX)
- av1_fwd_idtx_c(src_diff, coeff, diff_stride, 16, tx_type);
- else
-#endif
- av1_fht16x16(src_diff, coeff, diff_stride, tx_type);
+ av1_fht16x16(src_diff, coeff, diff_stride, tx_type);
}
static void fwd_txfm_32x32(const int16_t *src_diff, tran_low_t *coeff,
int diff_stride, TX_TYPE tx_type,
FWD_TXFM_OPT fwd_txfm_opt) {
(void)fwd_txfm_opt;
-#if CONFIG_EXT_TX
- if (tx_type == IDTX)
- av1_fwd_idtx_c(src_diff, coeff, diff_stride, 32, tx_type);
- else
-#endif
- av1_fht32x32(src_diff, coeff, diff_stride, tx_type);
+ av1_fht32x32(src_diff, coeff, diff_stride, tx_type);
}
#if CONFIG_TX64X64
diff --git a/av1/encoder/x86/dct_intrin_sse2.c b/av1/encoder/x86/dct_intrin_sse2.c
index 6bba11a..f613dc0 100644
--- a/av1/encoder/x86/dct_intrin_sse2.c
+++ b/av1/encoder/x86/dct_intrin_sse2.c
@@ -257,6 +257,12 @@
fadst4_sse2(in);
write_buffer_4x4(output, in);
break;
+ case IDTX:
+ load_buffer_4x4(input, in, stride, 0, 0);
+ fidtx4_sse2(in);
+ fidtx4_sse2(in);
+ write_buffer_4x4(output, in);
+ break;
case V_DCT:
load_buffer_4x4(input, in, stride, 0, 0);
fdct4_sse2(in);
@@ -1357,6 +1363,13 @@
right_shift_8x8(in, 1);
write_buffer_8x8(output, in, 8);
break;
+ case IDTX:
+ load_buffer_8x8(input, in, stride, 0, 0);
+ fidtx8_sse2(in);
+ fidtx8_sse2(in);
+ right_shift_8x8(in, 1);
+ write_buffer_8x8(output, in, 8);
+ break;
case V_DCT:
load_buffer_8x8(input, in, stride, 0, 0);
fdct8_sse2(in);
@@ -2579,6 +2592,13 @@
fadst16_sse2(in0, in1);
write_buffer_16x16(output, in0, in1, 16);
break;
+ case IDTX:
+ load_buffer_16x16(input, in0, in1, stride, 0, 0);
+ fidtx16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fidtx16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
case V_DCT:
load_buffer_16x16(input, in0, in1, stride, 0, 0);
fdct16_sse2(in0, in1);
diff --git a/av1/encoder/x86/hybrid_fwd_txfm_avx2.c b/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
index 0b7b1b6..53e970f 100644
--- a/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
+++ b/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
@@ -1025,6 +1025,13 @@
right_shift_16x16(in);
fadst16_avx2(in);
break;
+ case IDTX:
+ load_buffer_16x16(input, stride, 0, 0, in);
+ fidtx16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fidtx16_avx2(in);
+ break;
case V_DCT:
load_buffer_16x16(input, stride, 0, 0, in);
fdct16_avx2(in);
@@ -1621,6 +1628,12 @@
right_shift_32x32(in0, in1);
fhalfright32_avx2(in0, in1);
break;
+ case IDTX:
+ load_buffer_32x32(input, stride, 0, 0, in0, in1);
+ fidtx32_avx2(in0, in1);
+ right_shift_32x32(in0, in1);
+ fidtx32_avx2(in0, in1);
+ break;
case V_DCT:
load_buffer_32x32(input, stride, 0, 0, in0, in1);
fdct32_avx2(in0, in1);
diff --git a/test/av1_fht16x16_test.cc b/test/av1_fht16x16_test.cc
index 0eee35b..b1c4b53 100644
--- a/test/av1_fht16x16_test.cc
+++ b/test/av1_fht16x16_test.cc
@@ -184,6 +184,8 @@
256),
make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, 8, AOM_BITS_8,
256),
+ make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, 9, AOM_BITS_8,
+ 256),
make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, 10, AOM_BITS_8,
256),
make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, 11, AOM_BITS_8,
@@ -223,6 +225,8 @@
256),
make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, 8, AOM_BITS_8,
256),
+ make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, 9, AOM_BITS_8,
+ 256),
make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, 10, AOM_BITS_8,
256),
make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, 11, AOM_BITS_8,
diff --git a/test/av1_fht4x4_test.cc b/test/av1_fht4x4_test.cc
index 611b867..bf0f0d6 100644
--- a/test/av1_fht4x4_test.cc
+++ b/test/av1_fht4x4_test.cc
@@ -177,6 +177,7 @@
make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 6, AOM_BITS_8, 16),
make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 7, AOM_BITS_8, 16),
make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 8, AOM_BITS_8, 16),
+ make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 9, AOM_BITS_8, 16),
make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 10, AOM_BITS_8, 16),
make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 11, AOM_BITS_8, 16),
make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 12, AOM_BITS_8, 16),
diff --git a/test/av1_fht8x8_test.cc b/test/av1_fht8x8_test.cc
index 29ca004..72c3766 100644
--- a/test/av1_fht8x8_test.cc
+++ b/test/av1_fht8x8_test.cc
@@ -177,6 +177,7 @@
make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 6, AOM_BITS_8, 64),
make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 7, AOM_BITS_8, 64),
make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 8, AOM_BITS_8, 64),
+ make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 9, AOM_BITS_8, 64),
make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 10, AOM_BITS_8, 64),
make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 11, AOM_BITS_8, 64),
make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 12, AOM_BITS_8, 64),
diff --git a/test/fht32x32_test.cc b/test/fht32x32_test.cc
index 4589dc7..5a6fda9 100644
--- a/test/fht32x32_test.cc
+++ b/test/fht32x32_test.cc
@@ -201,6 +201,7 @@
make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, 6, AOM_BITS_8, 1024),
make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, 7, AOM_BITS_8, 1024),
make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, 8, AOM_BITS_8, 1024),
+ make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, 9, AOM_BITS_8, 1024),
make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, 10, AOM_BITS_8, 1024),
make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, 11, AOM_BITS_8, 1024),
make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, 12, AOM_BITS_8, 1024),