Add SSE2 av1_fht32x32
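
Add an SSE2 implementation of the 32x32 forward hybrid transform,
covering the DCT/ADST combinations plus the FLIPADST and identity
variants under CONFIG_EXT_TX. Hook it up in av1_rtcd_defs.pl and
enable the corresponding SSE2 cases in fht32x32_test.
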
BUG=aomedia:407
Change-Id: I27a7a230bbc701920a996d1e22ae4d22ca8cfead
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 8247523..e79fa71 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -514,7 +514,7 @@
specialize qw/av1_fht16x16 sse2 avx2/;
add_proto qw/void av1_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-specialize qw/av1_fht32x32 avx2/;
+specialize qw/av1_fht32x32 sse2 avx2/;
if (aom_config("CONFIG_TX64X64") eq "yes") {
add_proto qw/void av1_fht64x64/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
diff --git a/av1/encoder/x86/dct_intrin_sse2.c b/av1/encoder/x86/dct_intrin_sse2.c
index 77a1ded..6bba11a 100644
--- a/av1/encoder/x86/dct_intrin_sse2.c
+++ b/av1/encoder/x86/dct_intrin_sse2.c
@@ -3947,3 +3947,206 @@
}
write_buffer_32x16(output, in0, in1, in2, in3);
}
+
+// Note:
+// 32x32 hybrid forward transform
+// The 32x32 block is processed as a 4x2 grid of 8x16 blocks; each 8x16 block
+// is represented by 16 __m128i registers, so in0..in3 each hold one 8-column
+// strip of 32 rows (two stacked 8x16 blocks).
+// load_buffer_32x32() pre-scales the input by 4 (<< 2); flipud/fliplr mirror
+// the input vertically/horizontally for the FLIPADST transform types.
+static INLINE void load_buffer_32x32(const int16_t *input,
+ __m128i *in0 /*in0[32]*/,
+ __m128i *in1 /*in1[32]*/,
+ __m128i *in2 /*in2[32]*/,
+ __m128i *in3 /*in3[32]*/, int stride,
+ int flipud, int fliplr) {
+  int i;
+  if (flipud) {
+    input += 31 * stride;
+    stride = -stride;
+  }
+
+  for (i = 0; i < 32; ++i) {
+ in0[i] = _mm_slli_epi16(
+ _mm_load_si128((const __m128i *)(input + i * stride + 0)), 2);
+ in1[i] = _mm_slli_epi16(
+ _mm_load_si128((const __m128i *)(input + i * stride + 8)), 2);
+ in2[i] = _mm_slli_epi16(
+ _mm_load_si128((const __m128i *)(input + i * stride + 16)), 2);
+ in3[i] = _mm_slli_epi16(
+ _mm_load_si128((const __m128i *)(input + i * stride + 24)), 2);
+ }
+
+ if (fliplr) {
+ for (i = 0; i < 32; ++i) {
+ __m128i tmp1 = in0[i];
+ __m128i tmp2 = in1[i];
+ in0[i] = mm_reverse_epi16(in3[i]);
+ in1[i] = mm_reverse_epi16(in2[i]);
+ in2[i] = mm_reverse_epi16(tmp2);
+ in3[i] = mm_reverse_epi16(tmp1);
+ }
+ }
+}
+
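+// Swap the contents of the 16-register blocks (b0l, b0r) and (b1l, b1r),
+// i.e. exchange two 16x16 sub-blocks of the 32x32 layout.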
+static INLINE void swap_16x16(__m128i *b0l /*b0l[16]*/,
+ __m128i *b0r /*b0r[16]*/,
+ __m128i *b1l /*b1l[16]*/,
+ __m128i *b1r /*b1r[16]*/) {
+ int i;
+ for (i = 0; i < 16; ++i) {
+ __m128i tmp0 = b1l[i];
+ __m128i tmp1 = b1r[i];
+ b1l[i] = b0l[i];
+ b1r[i] = b0r[i];
+ b0l[i] = tmp0;
+ b0r[i] = tmp1;
+ }
+}
+
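+// Forward DCT pass over the full 32x32 block: run the 32-point DCT down each
+// 8-column strip, then transpose each 16x16 quadrant and swap the two
+// off-diagonal quadrants so the result is stored transposed for the next pass.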
+static INLINE void fdct32(__m128i *in0, __m128i *in1, __m128i *in2,
+ __m128i *in3) {
+ fdct32_8col(in0, &in0[16]);
+ fdct32_8col(in1, &in1[16]);
+ fdct32_8col(in2, &in2[16]);
+ fdct32_8col(in3, &in3[16]);
+
+ array_transpose_16x16(in0, in1);
+ array_transpose_16x16(&in0[16], &in1[16]);
+ array_transpose_16x16(in2, in3);
+ array_transpose_16x16(&in2[16], &in3[16]);
+
+ swap_16x16(&in0[16], &in1[16], in2, in3);
+}
+
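+// ADST stage for 32 points, implemented with the half-right transform:
+// transform the left and right 16-column halves, then swap the off-diagonal
+// 16x16 quadrants, as in fdct32().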
+static INLINE void fhalfright32(__m128i *in0, __m128i *in1, __m128i *in2,
+ __m128i *in3) {
+ fhalfright32_16col(in0, in1, &in0[16], &in1[16], no_transpose);
+ fhalfright32_16col(in2, in3, &in2[16], &in3[16], no_transpose);
+ swap_16x16(&in0[16], &in1[16], in2, in3);
+}
+
+#if CONFIG_EXT_TX
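+// Identity (IDTX) stage, applied per 16-column half, followed by the same
+// off-diagonal quadrant swap as the other stages.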
+static INLINE void fidtx32(__m128i *in0, __m128i *in1, __m128i *in2,
+ __m128i *in3) {
+ fidtx32_16col(in0, in1, &in0[16], &in1[16]);
+ fidtx32_16col(in2, in3, &in2[16], &in3[16]);
+ swap_16x16(&in0[16], &in1[16], in2, in3);
+}
+#endif  // CONFIG_EXT_TX
+
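+// Intermediate rounding (round_signed_16x16) applied to all four 16x16
+// quadrants between the two transform passes.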
+static INLINE void round_signed_32x32(__m128i *in0, __m128i *in1, __m128i *in2,
+ __m128i *in3) {
+ round_signed_16x16(in0, in1);
+ round_signed_16x16(&in0[16], &in1[16]);
+ round_signed_16x16(in2, in3);
+ round_signed_16x16(&in2[16], &in3[16]);
+}
+
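+// Store the 32x32 result to the output buffer, one row of four 8-wide stores
+// at a time.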
+static INLINE void write_buffer_32x32(__m128i *in0, __m128i *in1, __m128i *in2,
+ __m128i *in3, tran_low_t *output) {
+ int i;
+ for (i = 0; i < 32; ++i) {
+ store_output(&in0[i], output + i * 32 + 0);
+ store_output(&in1[i], output + i * 32 + 8);
+ store_output(&in2[i], output + i * 32 + 16);
+ store_output(&in3[i], output + i * 32 + 24);
+ }
+}
+
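+// 32x32 forward hybrid transform. Every tx_type runs a first transform pass,
+// the intermediate rounding, then a second pass; the FLIPADST variants reload
+// the input mirrored along the corresponding axis before transforming.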
+void av1_fht32x32_sse2(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ __m128i in0[32], in1[32], in2[32], in3[32];
+
+ load_buffer_32x32(input, in0, in1, in2, in3, stride, 0, 0);
+ switch (tx_type) {
+ case DCT_DCT:
+ fdct32(in0, in1, in2, in3);
+ round_signed_32x32(in0, in1, in2, in3);
+ fdct32(in0, in1, in2, in3);
+ break;
+ case ADST_DCT:
+ fhalfright32(in0, in1, in2, in3);
+ round_signed_32x32(in0, in1, in2, in3);
+ fdct32(in0, in1, in2, in3);
+ break;
+ case DCT_ADST:
+ fdct32(in0, in1, in2, in3);
+ round_signed_32x32(in0, in1, in2, in3);
+ fhalfright32(in0, in1, in2, in3);
+ break;
+ case ADST_ADST:
+ fhalfright32(in0, in1, in2, in3);
+ round_signed_32x32(in0, in1, in2, in3);
+ fhalfright32(in0, in1, in2, in3);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ load_buffer_32x32(input, in0, in1, in2, in3, stride, 1, 0);
+ fhalfright32(in0, in1, in2, in3);
+ round_signed_32x32(in0, in1, in2, in3);
+ fdct32(in0, in1, in2, in3);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_32x32(input, in0, in1, in2, in3, stride, 0, 1);
+ fdct32(in0, in1, in2, in3);
+ round_signed_32x32(in0, in1, in2, in3);
+ fhalfright32(in0, in1, in2, in3);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_32x32(input, in0, in1, in2, in3, stride, 1, 1);
+ fhalfright32(in0, in1, in2, in3);
+ round_signed_32x32(in0, in1, in2, in3);
+ fhalfright32(in0, in1, in2, in3);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_32x32(input, in0, in1, in2, in3, stride, 0, 1);
+ fhalfright32(in0, in1, in2, in3);
+ round_signed_32x32(in0, in1, in2, in3);
+ fhalfright32(in0, in1, in2, in3);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_32x32(input, in0, in1, in2, in3, stride, 1, 0);
+ fhalfright32(in0, in1, in2, in3);
+ round_signed_32x32(in0, in1, in2, in3);
+ fhalfright32(in0, in1, in2, in3);
+ break;
+ case IDTX:
+ fidtx32(in0, in1, in2, in3);
+ round_signed_32x32(in0, in1, in2, in3);
+ fidtx32(in0, in1, in2, in3);
+ break;
+ case V_DCT:
+ fdct32(in0, in1, in2, in3);
+ round_signed_32x32(in0, in1, in2, in3);
+ fidtx32(in0, in1, in2, in3);
+ break;
+ case H_DCT:
+ fidtx32(in0, in1, in2, in3);
+ round_signed_32x32(in0, in1, in2, in3);
+ fdct32(in0, in1, in2, in3);
+ break;
+ case V_ADST:
+ fhalfright32(in0, in1, in2, in3);
+ round_signed_32x32(in0, in1, in2, in3);
+ fidtx32(in0, in1, in2, in3);
+ break;
+ case H_ADST:
+ fidtx32(in0, in1, in2, in3);
+ round_signed_32x32(in0, in1, in2, in3);
+ fhalfright32(in0, in1, in2, in3);
+ break;
+ case V_FLIPADST:
+ load_buffer_32x32(input, in0, in1, in2, in3, stride, 1, 0);
+ fhalfright32(in0, in1, in2, in3);
+ round_signed_32x32(in0, in1, in2, in3);
+ fidtx32(in0, in1, in2, in3);
+ break;
+ case H_FLIPADST:
+ load_buffer_32x32(input, in0, in1, in2, in3, stride, 0, 1);
+ fidtx32(in0, in1, in2, in3);
+ round_signed_32x32(in0, in1, in2, in3);
+ fhalfright32(in0, in1, in2, in3);
+ break;
+#endif  // CONFIG_EXT_TX
+ default: assert(0);
+ }
+ write_buffer_32x32(in0, in1, in2, in3, output);
+}
diff --git a/test/fht32x32_test.cc b/test/fht32x32_test.cc
index 160bd5b..4589dc7 100644
--- a/test/fht32x32_test.cc
+++ b/test/fht32x32_test.cc
@@ -164,6 +164,31 @@
using std::tr1::make_tuple;
+#if HAVE_SSE2
+const Ht32x32Param kArrayHt32x32Param_sse2[] = {
+ make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, 0, AOM_BITS_8, 1024),
+ make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, 1, AOM_BITS_8, 1024),
+ make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, 2, AOM_BITS_8, 1024),
+ make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, 3, AOM_BITS_8, 1024),
+#if CONFIG_EXT_TX
+ make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, 4, AOM_BITS_8, 1024),
+ make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, 5, AOM_BITS_8, 1024),
+ make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, 6, AOM_BITS_8, 1024),
+ make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, 7, AOM_BITS_8, 1024),
+ make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, 8, AOM_BITS_8, 1024),
+ make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, 9, AOM_BITS_8, 1024),
+ make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, 10, AOM_BITS_8, 1024),
+ make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, 11, AOM_BITS_8, 1024),
+ make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, 12, AOM_BITS_8, 1024),
+ make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, 13, AOM_BITS_8, 1024),
+ make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, 14, AOM_BITS_8, 1024),
+ make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, 15, AOM_BITS_8, 1024)
+#endif // CONFIG_EXT_TX
+};
+INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans32x32HT,
+ ::testing::ValuesIn(kArrayHt32x32Param_sse2));
+#endif // HAVE_SSE2
+
#if HAVE_AVX2
const Ht32x32Param kArrayHt32x32Param_avx2[] = {
make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, 0, AOM_BITS_8, 1024),
diff --git a/test/test.mk b/test/test.mk
index 0a4a288..7096d23 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -1,3 +1,14 @@
+##
+## Copyright (c) 2017, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+
LIBAOM_TEST_SRCS-yes += acm_random.h
LIBAOM_TEST_SRCS-yes += clear_system_state.h
LIBAOM_TEST_SRCS-yes += codec_factory.h