8x8/16x16 HT types V_DCT to H_FLIPADST SSE2 optimization - Added functions fidtx8_sse2() and fidtx16_sse2(). - Turned on vp10_fht8x8_sse2()/vp10_fht16x16_sse2() for the new types. - Updated 8x8/16x16 unit tests for accuracy/speed. - Running each transform 20K times on random input, cycling through the tx types from V_DCT to H_FLIPADST, the SSE2 speed improvement over the C version is: 8x8: ~131% 16x16: ~66% Change-Id: Ibbb707e932a08fec3b1f423a7dab280a1d696c9a
diff --git a/test/vp10_fht16x16_test.cc b/test/vp10_fht16x16_test.cc index 3967149..7994d7f 100644 --- a/test/vp10_fht16x16_test.cc +++ b/test/vp10_fht16x16_test.cc
@@ -70,6 +70,61 @@ RunCoeffCheck(); } +#if CONFIG_EXT_TX && !CONFIG_VP9_HIGHBITDEPTH +TEST(VP10Trans16x16HTSpeedTest, C_version) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 20000; + int bit_depth = 8; + int mask = (1 << bit_depth) - 1; + const int num_coeffs = 256; + int16_t *input = new int16_t[num_coeffs]; + tran_low_t *output = new tran_low_t[num_coeffs]; + const int stride = 16; + int tx_type; + + for (int i = 0; i < count_test_block; ++i) { + for (int j = 0; j < num_coeffs; ++j) { + input[j] = (rnd.Rand8() & mask) - (rnd.Rand8() & mask); + } + for (tx_type = V_DCT; tx_type <= H_FLIPADST; ++tx_type) { + vp10_fht16x16_c(input, output, stride, tx_type); + } + } + + delete[] input; + delete[] output; +} +#endif // CONFIG_EXT_TX && !CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_SSE2 && CONFIG_EXT_TX && !CONFIG_VP9_HIGHBITDEPTH +TEST(VP10Trans16x16HTSpeedTest, SSE2_version) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 20000; + int bit_depth = 8; + int mask = (1 << bit_depth) - 1; + const int num_coeffs = 256; + int16_t *input = reinterpret_cast<int16_t *> + (vpx_memalign(16, sizeof(int16_t) * num_coeffs)); + tran_low_t *output = reinterpret_cast<tran_low_t *> + (vpx_memalign(16, sizeof(tran_low_t) * num_coeffs)); + + const int stride = 16; + int tx_type; + + for (int i = 0; i < count_test_block; ++i) { + for (int j = 0; j < num_coeffs; ++j) { + input[j] = (rnd.Rand8() & mask) - (rnd.Rand8() & mask); + } + for (tx_type = V_DCT; tx_type <= H_FLIPADST; ++tx_type) { + vp10_fht16x16_sse2(input, output, stride, tx_type); + } + } + + vpx_free(input); + vpx_free(output); +} +#endif // HAVE_SSE2 && CONFIG_EXT_TX && !CONFIG_VP9_HIGHBITDEPTH + using std::tr1::make_tuple; #if HAVE_SSE2 @@ -103,6 +158,18 @@ make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 7, VPX_BITS_8, 256), make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 8, + VPX_BITS_8, 256), + make_tuple(&vp10_fht16x16_sse2, 
&vp10_iht16x16_256_add_sse2, 10, + VPX_BITS_8, 256), + make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 11, + VPX_BITS_8, 256), + make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 12, + VPX_BITS_8, 256), + make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 13, + VPX_BITS_8, 256), + make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 14, + VPX_BITS_8, 256), + make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 15, VPX_BITS_8, 256))); #endif // !CONFIG_EXT_TX #endif // HAVE_SSE2
diff --git a/test/vp10_fht8x8_test.cc b/test/vp10_fht8x8_test.cc index 96f5632..07ab61d 100644 --- a/test/vp10_fht8x8_test.cc +++ b/test/vp10_fht8x8_test.cc
@@ -69,6 +69,61 @@ RunCoeffCheck(); } +#if CONFIG_EXT_TX && !CONFIG_VP9_HIGHBITDEPTH +TEST(VP10Trans8x8HTSpeedTest, C_version) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 20000; + int bit_depth = 8; + int mask = (1 << bit_depth) - 1; + const int num_coeffs = 64; + int16_t *input = new int16_t[num_coeffs]; + tran_low_t *output = new tran_low_t[num_coeffs]; + const int stride = 8; + int tx_type; + + for (int i = 0; i < count_test_block; ++i) { + for (int j = 0; j < num_coeffs; ++j) { + input[j] = (rnd.Rand8() & mask) - (rnd.Rand8() & mask); + } + for (tx_type = V_DCT; tx_type <= H_FLIPADST; ++tx_type) { + vp10_fht8x8_c(input, output, stride, tx_type); + } + } + + delete[] input; + delete[] output; +} +#endif // CONFIG_EXT_TX && !CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_SSE2 && CONFIG_EXT_TX && !CONFIG_VP9_HIGHBITDEPTH +TEST(VP10Trans8x8HTSpeedTest, SSE2_version) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 20000; + int bit_depth = 8; + int mask = (1 << bit_depth) - 1; + const int num_coeffs = 64; + int16_t *input = reinterpret_cast<int16_t *> + (vpx_memalign(16, sizeof(int16_t) * num_coeffs)); + tran_low_t *output = reinterpret_cast<tran_low_t *> + (vpx_memalign(16, sizeof(tran_low_t) * num_coeffs)); + + const int stride = 8; + int tx_type; + + for (int i = 0; i < count_test_block; ++i) { + for (int j = 0; j < num_coeffs; ++j) { + input[j] = (rnd.Rand8() & mask) - (rnd.Rand8() & mask); + } + for (tx_type = V_DCT; tx_type <= H_FLIPADST; ++tx_type) { + vp10_fht8x8_sse2(input, output, stride, tx_type); + } + } + + vpx_free(input); + vpx_free(output); +} +#endif // HAVE_SSE2 && CONFIG_EXT_TX && !CONFIG_VP9_HIGHBITDEPTH + using std::tr1::make_tuple; #if HAVE_SSE2 @@ -102,6 +157,18 @@ make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 7, VPX_BITS_8, 64), make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 8, + VPX_BITS_8, 64), + make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 10, + 
VPX_BITS_8, 64), + make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 11, + VPX_BITS_8, 64), + make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 12, + VPX_BITS_8, 64), + make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 13, + VPX_BITS_8, 64), + make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 14, + VPX_BITS_8, 64), + make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 15, VPX_BITS_8, 64))); #endif // !CONFIG_EXT_TX #endif // HAVE_SSE2
diff --git a/vp10/encoder/hybrid_fwd_txfm.c b/vp10/encoder/hybrid_fwd_txfm.c index fd1cff2..2018960 100644 --- a/vp10/encoder/hybrid_fwd_txfm.c +++ b/vp10/encoder/hybrid_fwd_txfm.c
@@ -54,8 +54,6 @@ case FLIPADST_FLIPADST: case ADST_FLIPADST: case FLIPADST_ADST: - vp10_fht4x4(src_diff, coeff, diff_stride, tx_type); - break; case V_DCT: case H_DCT: case V_ADST: @@ -70,7 +68,6 @@ #endif // CONFIG_EXT_TX default: assert(0); - break; } } @@ -93,15 +90,13 @@ case FLIPADST_FLIPADST: case ADST_FLIPADST: case FLIPADST_ADST: - vp10_fht8x8(src_diff, coeff, diff_stride, tx_type); - break; case V_DCT: case H_DCT: case V_ADST: case H_ADST: case V_FLIPADST: case H_FLIPADST: - vp10_fht8x8_c(src_diff, coeff, diff_stride, tx_type); + vp10_fht8x8(src_diff, coeff, diff_stride, tx_type); break; case IDTX: vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 8, tx_type); @@ -109,7 +104,6 @@ #endif // CONFIG_EXT_TX default: assert(0); - break; } } @@ -132,15 +126,13 @@ case FLIPADST_FLIPADST: case ADST_FLIPADST: case FLIPADST_ADST: - vp10_fht16x16(src_diff, coeff, diff_stride, tx_type); - break; case V_DCT: case H_DCT: case V_ADST: case H_ADST: case V_FLIPADST: case H_FLIPADST: - vp10_fht16x16_c(src_diff, coeff, diff_stride, tx_type); + vp10_fht16x16(src_diff, coeff, diff_stride, tx_type); break; case IDTX: vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 16, tx_type); @@ -148,7 +140,6 @@ #endif // CONFIG_EXT_TX default: assert(0); - break; } }
diff --git a/vp10/encoder/x86/dct_sse2.c b/vp10/encoder/x86/dct_sse2.c index 0e568dd..47422ad 100644 --- a/vp10/encoder/x86/dct_sse2.c +++ b/vp10/encoder/x86/dct_sse2.c
@@ -1280,6 +1280,21 @@ array_transpose_8x8(in, in); } +#if CONFIG_EXT_TX +static void fidtx8_sse2(__m128i *in) { + in[0] = _mm_slli_epi16(in[0], 1); + in[1] = _mm_slli_epi16(in[1], 1); + in[2] = _mm_slli_epi16(in[2], 1); + in[3] = _mm_slli_epi16(in[3], 1); + in[4] = _mm_slli_epi16(in[4], 1); + in[5] = _mm_slli_epi16(in[5], 1); + in[6] = _mm_slli_epi16(in[6], 1); + in[7] = _mm_slli_epi16(in[7], 1); + + array_transpose_8x8(in, in); +} +#endif // CONFIG_EXT_TX + void vp10_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type) { __m128i in[8]; @@ -1345,10 +1360,51 @@ right_shift_8x8(in, 1); write_buffer_8x8(output, in, 8); break; + case V_DCT: + load_buffer_8x8(input, in, stride, 0, 0); + fdct8_sse2(in); + fidtx8_sse2(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); + break; + case H_DCT: + load_buffer_8x8(input, in, stride, 0, 0); + fidtx8_sse2(in); + fdct8_sse2(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); + break; + case V_ADST: + load_buffer_8x8(input, in, stride, 0, 0); + fadst8_sse2(in); + fidtx8_sse2(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); + break; + case H_ADST: + load_buffer_8x8(input, in, stride, 0, 0); + fidtx8_sse2(in); + fadst8_sse2(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); + break; + case V_FLIPADST: + load_buffer_8x8(input, in, stride, 1, 0); + fadst8_sse2(in); + fidtx8_sse2(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); + break; + case H_FLIPADST: + load_buffer_8x8(input, in, stride, 0, 1); + fidtx8_sse2(in); + fadst8_sse2(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); + break; #endif // CONFIG_EXT_TX default: assert(0); - break; } } @@ -2226,6 +2282,204 @@ array_transpose_16x16(in0, in1); } +#if CONFIG_EXT_TX +static void fidtx16_8col(__m128i *in) { + const __m128i k__zero_epi16 = _mm_set1_epi16((int16_t)0); + const __m128i k__sqrt2_epi16 = _mm_set1_epi16((int16_t)Sqrt2); + const __m128i 
k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + + __m128i v0, v1, v2, v3, v4, v5, v6, v7; + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i y0, y1, y2, y3, y4, y5, y6, y7; + + in[0] = _mm_slli_epi16(in[0], 1); + in[1] = _mm_slli_epi16(in[1], 1); + in[2] = _mm_slli_epi16(in[2], 1); + in[3] = _mm_slli_epi16(in[3], 1); + in[4] = _mm_slli_epi16(in[4], 1); + in[5] = _mm_slli_epi16(in[5], 1); + in[6] = _mm_slli_epi16(in[6], 1); + in[7] = _mm_slli_epi16(in[7], 1); + in[8] = _mm_slli_epi16(in[8], 1); + in[9] = _mm_slli_epi16(in[9], 1); + in[10] = _mm_slli_epi16(in[10], 1); + in[11] = _mm_slli_epi16(in[11], 1); + in[12] = _mm_slli_epi16(in[12], 1); + in[13] = _mm_slli_epi16(in[13], 1); + in[14] = _mm_slli_epi16(in[14], 1); + in[15] = _mm_slli_epi16(in[15], 1); + + v0 = _mm_unpacklo_epi16(in[0], k__zero_epi16); + v1 = _mm_unpacklo_epi16(in[1], k__zero_epi16); + v2 = _mm_unpacklo_epi16(in[2], k__zero_epi16); + v3 = _mm_unpacklo_epi16(in[3], k__zero_epi16); + v4 = _mm_unpacklo_epi16(in[4], k__zero_epi16); + v5 = _mm_unpacklo_epi16(in[5], k__zero_epi16); + v6 = _mm_unpacklo_epi16(in[6], k__zero_epi16); + v7 = _mm_unpacklo_epi16(in[7], k__zero_epi16); + + u0 = _mm_unpacklo_epi16(in[8], k__zero_epi16); + u1 = _mm_unpacklo_epi16(in[9], k__zero_epi16); + u2 = _mm_unpacklo_epi16(in[10], k__zero_epi16); + u3 = _mm_unpacklo_epi16(in[11], k__zero_epi16); + u4 = _mm_unpacklo_epi16(in[12], k__zero_epi16); + u5 = _mm_unpacklo_epi16(in[13], k__zero_epi16); + u6 = _mm_unpacklo_epi16(in[14], k__zero_epi16); + u7 = _mm_unpacklo_epi16(in[15], k__zero_epi16); + + x0 = _mm_unpackhi_epi16(in[0], k__zero_epi16); + x1 = _mm_unpackhi_epi16(in[1], k__zero_epi16); + x2 = _mm_unpackhi_epi16(in[2], k__zero_epi16); + x3 = _mm_unpackhi_epi16(in[3], k__zero_epi16); + x4 = _mm_unpackhi_epi16(in[4], k__zero_epi16); + x5 = _mm_unpackhi_epi16(in[5], k__zero_epi16); + x6 = _mm_unpackhi_epi16(in[6], k__zero_epi16); + x7 = _mm_unpackhi_epi16(in[7], 
k__zero_epi16); + + y0 = _mm_unpackhi_epi16(in[8], k__zero_epi16); + y1 = _mm_unpackhi_epi16(in[9], k__zero_epi16); + y2 = _mm_unpackhi_epi16(in[10], k__zero_epi16); + y3 = _mm_unpackhi_epi16(in[11], k__zero_epi16); + y4 = _mm_unpackhi_epi16(in[12], k__zero_epi16); + y5 = _mm_unpackhi_epi16(in[13], k__zero_epi16); + y6 = _mm_unpackhi_epi16(in[14], k__zero_epi16); + y7 = _mm_unpackhi_epi16(in[15], k__zero_epi16); + + v0 = _mm_madd_epi16(v0, k__sqrt2_epi16); + v1 = _mm_madd_epi16(v1, k__sqrt2_epi16); + v2 = _mm_madd_epi16(v2, k__sqrt2_epi16); + v3 = _mm_madd_epi16(v3, k__sqrt2_epi16); + v4 = _mm_madd_epi16(v4, k__sqrt2_epi16); + v5 = _mm_madd_epi16(v5, k__sqrt2_epi16); + v6 = _mm_madd_epi16(v6, k__sqrt2_epi16); + v7 = _mm_madd_epi16(v7, k__sqrt2_epi16); + + x0 = _mm_madd_epi16(x0, k__sqrt2_epi16); + x1 = _mm_madd_epi16(x1, k__sqrt2_epi16); + x2 = _mm_madd_epi16(x2, k__sqrt2_epi16); + x3 = _mm_madd_epi16(x3, k__sqrt2_epi16); + x4 = _mm_madd_epi16(x4, k__sqrt2_epi16); + x5 = _mm_madd_epi16(x5, k__sqrt2_epi16); + x6 = _mm_madd_epi16(x6, k__sqrt2_epi16); + x7 = _mm_madd_epi16(x7, k__sqrt2_epi16); + + u0 = _mm_madd_epi16(u0, k__sqrt2_epi16); + u1 = _mm_madd_epi16(u1, k__sqrt2_epi16); + u2 = _mm_madd_epi16(u2, k__sqrt2_epi16); + u3 = _mm_madd_epi16(u3, k__sqrt2_epi16); + u4 = _mm_madd_epi16(u4, k__sqrt2_epi16); + u5 = _mm_madd_epi16(u5, k__sqrt2_epi16); + u6 = _mm_madd_epi16(u6, k__sqrt2_epi16); + u7 = _mm_madd_epi16(u7, k__sqrt2_epi16); + + y0 = _mm_madd_epi16(y0, k__sqrt2_epi16); + y1 = _mm_madd_epi16(y1, k__sqrt2_epi16); + y2 = _mm_madd_epi16(y2, k__sqrt2_epi16); + y3 = _mm_madd_epi16(y3, k__sqrt2_epi16); + y4 = _mm_madd_epi16(y4, k__sqrt2_epi16); + y5 = _mm_madd_epi16(y5, k__sqrt2_epi16); + y6 = _mm_madd_epi16(y6, k__sqrt2_epi16); + y7 = _mm_madd_epi16(y7, k__sqrt2_epi16); + + v0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); + v1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); + v2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); + v3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); + 
v4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); + v5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); + v6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); + v7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); + + x0 = _mm_add_epi32(x0, k__DCT_CONST_ROUNDING); + x1 = _mm_add_epi32(x1, k__DCT_CONST_ROUNDING); + x2 = _mm_add_epi32(x2, k__DCT_CONST_ROUNDING); + x3 = _mm_add_epi32(x3, k__DCT_CONST_ROUNDING); + x4 = _mm_add_epi32(x4, k__DCT_CONST_ROUNDING); + x5 = _mm_add_epi32(x5, k__DCT_CONST_ROUNDING); + x6 = _mm_add_epi32(x6, k__DCT_CONST_ROUNDING); + x7 = _mm_add_epi32(x7, k__DCT_CONST_ROUNDING); + + u0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + u1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + u2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + u3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + u4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); + u5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); + u6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); + u7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); + + y0 = _mm_add_epi32(y0, k__DCT_CONST_ROUNDING); + y1 = _mm_add_epi32(y1, k__DCT_CONST_ROUNDING); + y2 = _mm_add_epi32(y2, k__DCT_CONST_ROUNDING); + y3 = _mm_add_epi32(y3, k__DCT_CONST_ROUNDING); + y4 = _mm_add_epi32(y4, k__DCT_CONST_ROUNDING); + y5 = _mm_add_epi32(y5, k__DCT_CONST_ROUNDING); + y6 = _mm_add_epi32(y6, k__DCT_CONST_ROUNDING); + y7 = _mm_add_epi32(y7, k__DCT_CONST_ROUNDING); + + v0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + v1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + v2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + v3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + v4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + v5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + v6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + v7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + + x0 = _mm_srai_epi32(x0, DCT_CONST_BITS); + x1 = _mm_srai_epi32(x1, DCT_CONST_BITS); + x2 = _mm_srai_epi32(x2, DCT_CONST_BITS); + x3 = _mm_srai_epi32(x3, DCT_CONST_BITS); + x4 = _mm_srai_epi32(x4, DCT_CONST_BITS); + x5 = _mm_srai_epi32(x5, DCT_CONST_BITS); + x6 = 
_mm_srai_epi32(x6, DCT_CONST_BITS); + x7 = _mm_srai_epi32(x7, DCT_CONST_BITS); + + u0 = _mm_srai_epi32(u0, DCT_CONST_BITS); + u1 = _mm_srai_epi32(u1, DCT_CONST_BITS); + u2 = _mm_srai_epi32(u2, DCT_CONST_BITS); + u3 = _mm_srai_epi32(u3, DCT_CONST_BITS); + u4 = _mm_srai_epi32(u4, DCT_CONST_BITS); + u5 = _mm_srai_epi32(u5, DCT_CONST_BITS); + u6 = _mm_srai_epi32(u6, DCT_CONST_BITS); + u7 = _mm_srai_epi32(u7, DCT_CONST_BITS); + + y0 = _mm_srai_epi32(y0, DCT_CONST_BITS); + y1 = _mm_srai_epi32(y1, DCT_CONST_BITS); + y2 = _mm_srai_epi32(y2, DCT_CONST_BITS); + y3 = _mm_srai_epi32(y3, DCT_CONST_BITS); + y4 = _mm_srai_epi32(y4, DCT_CONST_BITS); + y5 = _mm_srai_epi32(y5, DCT_CONST_BITS); + y6 = _mm_srai_epi32(y6, DCT_CONST_BITS); + y7 = _mm_srai_epi32(y7, DCT_CONST_BITS); + + in[0] = _mm_packs_epi32(v0, x0); + in[1] = _mm_packs_epi32(v1, x1); + in[2] = _mm_packs_epi32(v2, x2); + in[3] = _mm_packs_epi32(v3, x3); + in[4] = _mm_packs_epi32(v4, x4); + in[5] = _mm_packs_epi32(v5, x5); + in[6] = _mm_packs_epi32(v6, x6); + in[7] = _mm_packs_epi32(v7, x7); + + in[8] = _mm_packs_epi32(u0, y0); + in[9] = _mm_packs_epi32(u1, y1); + in[10] = _mm_packs_epi32(u2, y2); + in[11] = _mm_packs_epi32(u3, y3); + in[12] = _mm_packs_epi32(u4, y4); + in[13] = _mm_packs_epi32(u5, y5); + in[14] = _mm_packs_epi32(u6, y6); + in[15] = _mm_packs_epi32(u7, y7); +} + +static void fidtx16_sse2(__m128i *in0, __m128i *in1) { + fidtx16_8col(in0); + fidtx16_8col(in1); + array_transpose_16x16(in0, in1); +} +#endif // CONFIG_EXT_TX + void vp10_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type) { __m128i in0[16], in1[16]; @@ -2291,6 +2545,48 @@ fadst16_sse2(in0, in1); write_buffer_16x16(output, in0, in1, 16); break; + case V_DCT: + load_buffer_16x16(input, in0, in1, stride, 0, 0); + fdct16_sse2(in0, in1); + right_shift_16x16(in0, in1); + fidtx16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; + case H_DCT: + load_buffer_16x16(input, in0, in1, stride, 0, 0); + 
fidtx16_sse2(in0, in1); + right_shift_16x16(in0, in1); + fdct16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; + case V_ADST: + load_buffer_16x16(input, in0, in1, stride, 0, 0); + fadst16_sse2(in0, in1); + right_shift_16x16(in0, in1); + fidtx16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; + case H_ADST: + load_buffer_16x16(input, in0, in1, stride, 0, 0); + fidtx16_sse2(in0, in1); + right_shift_16x16(in0, in1); + fadst16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; + case V_FLIPADST: + load_buffer_16x16(input, in0, in1, stride, 1, 0); + fadst16_sse2(in0, in1); + right_shift_16x16(in0, in1); + fidtx16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; + case H_FLIPADST: + load_buffer_16x16(input, in0, in1, stride, 0, 1); + fidtx16_sse2(in0, in1); + right_shift_16x16(in0, in1); + fadst16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; #endif // CONFIG_EXT_TX default: assert(0);