Refactor inv txfm sse2 for sizes with 4 Add a set of 1D inv txfm functions which process 4 pixels at a time, which are faster than those that process 8 pixels at a time, for transform sizes with a dimension of 4. They are used in 2D inv txfm such as 4x4, 4x8, 4x16, 8x4 and 16x4. The unittests show 10%~30% speedup for these tx sizes. Change-Id: Ic9a416362bb42a8bab5b9e2067bd731ab97d4575
diff --git a/av1/common/x86/av1_inv_txfm_sse2.c b/av1/common/x86/av1_inv_txfm_sse2.c index 8ba8210..60a2efd 100644 --- a/av1/common/x86/av1_inv_txfm_sse2.c +++ b/av1/common/x86/av1_inv_txfm_sse2.c
@@ -43,6 +43,35 @@ output[2] = _mm_subs_epi16(x2[1], x2[2]); } +void idct4_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { + (void)(cos_bit); + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + + // stage 1 + __m128i x1[4]; + x1[0] = input[0]; + x1[1] = input[2]; + x1[2] = input[1]; + x1[3] = input[3]; + + // stage 2 + __m128i x2[4]; + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x1[0], x1[1], x2[0], x2[1]); + btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x1[2], x1[3], x2[2], x2[3]); + + // stage 3 + output[0] = _mm_adds_epi16(x2[0], x2[3]); + output[3] = _mm_subs_epi16(x2[0], x2[3]); + output[1] = _mm_adds_epi16(x2[1], x2[2]); + output[2] = _mm_subs_epi16(x2[1], x2[2]); +} + void idct8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { (void)(cos_bit); const int32_t *cospi = cospi_arr(INV_COS_BIT); @@ -108,6 +137,71 @@ output[4] = _mm_subs_epi16(x4[3], x4[4]); } +void idct8_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { + (void)(cos_bit); + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); + __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + __m128i cospi_m32_p32 = 
pair_set_epi16(-cospi[32], cospi[32]); + + // stage 1 + __m128i x1[8]; + x1[0] = input[0]; + x1[1] = input[4]; + x1[2] = input[2]; + x1[3] = input[6]; + x1[4] = input[1]; + x1[5] = input[5]; + x1[6] = input[3]; + x1[7] = input[7]; + + // stage 2 + __m128i x2[8]; + x2[0] = x1[0]; + x2[1] = x1[1]; + x2[2] = x1[2]; + x2[3] = x1[3]; + btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x1[4], x1[7], x2[4], x2[7]); + btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x1[5], x1[6], x2[5], x2[6]); + + // stage 3 + __m128i x3[8]; + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x2[0], x2[1], x3[0], x3[1]); + btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x2[2], x2[3], x3[2], x3[3]); + x3[4] = _mm_adds_epi16(x2[4], x2[5]); + x3[5] = _mm_subs_epi16(x2[4], x2[5]); + x3[6] = _mm_subs_epi16(x2[7], x2[6]); + x3[7] = _mm_adds_epi16(x2[6], x2[7]); + + // stage 4 + __m128i x4[8]; + x4[0] = _mm_adds_epi16(x3[0], x3[3]); + x4[3] = _mm_subs_epi16(x3[0], x3[3]); + x4[1] = _mm_adds_epi16(x3[1], x3[2]); + x4[2] = _mm_subs_epi16(x3[1], x3[2]); + x4[4] = x3[4]; + btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x3[5], x3[6], x4[5], x4[6]); + x4[7] = x3[7]; + + // stage 5 + output[0] = _mm_adds_epi16(x4[0], x4[7]); + output[7] = _mm_subs_epi16(x4[0], x4[7]); + output[1] = _mm_adds_epi16(x4[1], x4[6]); + output[6] = _mm_subs_epi16(x4[1], x4[6]); + output[2] = _mm_adds_epi16(x4[2], x4[5]); + output[5] = _mm_subs_epi16(x4[2], x4[5]); + output[3] = _mm_adds_epi16(x4[3], x4[4]); + output[4] = _mm_subs_epi16(x4[3], x4[4]); +} + void idct16_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { (void)(cos_bit); const int32_t *cospi = cospi_arr(INV_COS_BIT); @@ -254,6 +348,152 @@ output[8] = _mm_subs_epi16(x6[7], x6[8]); } +void idct16_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { + (void)(cos_bit); + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); + 
__m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); + __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); + __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); + __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); + __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); + __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); + __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); + __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); + __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + + // stage 1 + __m128i x1[16]; + x1[0] = input[0]; + x1[1] = input[8]; + x1[2] = input[4]; + x1[3] = input[12]; + x1[4] = input[2]; + x1[5] = input[10]; + x1[6] = input[6]; + x1[7] = input[14]; + x1[8] = input[1]; + x1[9] = input[9]; + x1[10] = input[5]; + x1[11] = input[13]; + x1[12] = input[3]; + x1[13] = input[11]; + x1[14] = input[7]; + x1[15] = input[15]; + + // stage 2 + __m128i x2[16]; + x2[0] = x1[0]; + x2[1] = x1[1]; + x2[2] = x1[2]; + x2[3] = x1[3]; + x2[4] = x1[4]; + x2[5] = x1[5]; + x2[6] = x1[6]; + x2[7] = x1[7]; + btf_16_4p_sse2(cospi_p60_m04, cospi_p04_p60, x1[8], x1[15], x2[8], x2[15]); + btf_16_4p_sse2(cospi_p28_m36, cospi_p36_p28, x1[9], x1[14], x2[9], x2[14]); + btf_16_4p_sse2(cospi_p44_m20, cospi_p20_p44, x1[10], x1[13], x2[10], x2[13]); + 
btf_16_4p_sse2(cospi_p12_m52, cospi_p52_p12, x1[11], x1[12], x2[11], x2[12]); + + // stage 3 + __m128i x3[16]; + x3[0] = x2[0]; + x3[1] = x2[1]; + x3[2] = x2[2]; + x3[3] = x2[3]; + btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x2[4], x2[7], x3[4], x3[7]); + btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x2[5], x2[6], x3[5], x3[6]); + x3[8] = _mm_adds_epi16(x2[8], x2[9]); + x3[9] = _mm_subs_epi16(x2[8], x2[9]); + x3[10] = _mm_subs_epi16(x2[11], x2[10]); + x3[11] = _mm_adds_epi16(x2[10], x2[11]); + x3[12] = _mm_adds_epi16(x2[12], x2[13]); + x3[13] = _mm_subs_epi16(x2[12], x2[13]); + x3[14] = _mm_subs_epi16(x2[15], x2[14]); + x3[15] = _mm_adds_epi16(x2[14], x2[15]); + + // stage 4 + __m128i x4[16]; + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x3[0], x3[1], x4[0], x4[1]); + btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x3[2], x3[3], x4[2], x4[3]); + x4[4] = _mm_adds_epi16(x3[4], x3[5]); + x4[5] = _mm_subs_epi16(x3[4], x3[5]); + x4[6] = _mm_subs_epi16(x3[7], x3[6]); + x4[7] = _mm_adds_epi16(x3[6], x3[7]); + x4[8] = x3[8]; + btf_16_4p_sse2(cospi_m16_p48, cospi_p48_p16, x3[9], x3[14], x4[9], x4[14]); + btf_16_4p_sse2(cospi_m48_m16, cospi_m16_p48, x3[10], x3[13], x4[10], x4[13]); + x4[11] = x3[11]; + x4[12] = x3[12]; + x4[15] = x3[15]; + + // stage 5 + __m128i x5[16]; + x5[0] = _mm_adds_epi16(x4[0], x4[3]); + x5[3] = _mm_subs_epi16(x4[0], x4[3]); + x5[1] = _mm_adds_epi16(x4[1], x4[2]); + x5[2] = _mm_subs_epi16(x4[1], x4[2]); + x5[4] = x4[4]; + btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x4[5], x4[6], x5[5], x5[6]); + x5[7] = x4[7]; + x5[8] = _mm_adds_epi16(x4[8], x4[11]); + x5[11] = _mm_subs_epi16(x4[8], x4[11]); + x5[9] = _mm_adds_epi16(x4[9], x4[10]); + x5[10] = _mm_subs_epi16(x4[9], x4[10]); + x5[12] = _mm_subs_epi16(x4[15], x4[12]); + x5[15] = _mm_adds_epi16(x4[12], x4[15]); + x5[13] = _mm_subs_epi16(x4[14], x4[13]); + x5[14] = _mm_adds_epi16(x4[13], x4[14]); + + // stage 6 + __m128i x6[16]; + x6[0] = _mm_adds_epi16(x5[0], x5[7]); + x6[7] = _mm_subs_epi16(x5[0], x5[7]); 
+ x6[1] = _mm_adds_epi16(x5[1], x5[6]); + x6[6] = _mm_subs_epi16(x5[1], x5[6]); + x6[2] = _mm_adds_epi16(x5[2], x5[5]); + x6[5] = _mm_subs_epi16(x5[2], x5[5]); + x6[3] = _mm_adds_epi16(x5[3], x5[4]); + x6[4] = _mm_subs_epi16(x5[3], x5[4]); + x6[8] = x5[8]; + x6[9] = x5[9]; + btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x5[10], x5[13], x6[10], x6[13]); + btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x5[11], x5[12], x6[11], x6[12]); + x6[14] = x5[14]; + x6[15] = x5[15]; + + // stage 7 + output[0] = _mm_adds_epi16(x6[0], x6[15]); + output[15] = _mm_subs_epi16(x6[0], x6[15]); + output[1] = _mm_adds_epi16(x6[1], x6[14]); + output[14] = _mm_subs_epi16(x6[1], x6[14]); + output[2] = _mm_adds_epi16(x6[2], x6[13]); + output[13] = _mm_subs_epi16(x6[2], x6[13]); + output[3] = _mm_adds_epi16(x6[3], x6[12]); + output[12] = _mm_subs_epi16(x6[3], x6[12]); + output[4] = _mm_adds_epi16(x6[4], x6[11]); + output[11] = _mm_subs_epi16(x6[4], x6[11]); + output[5] = _mm_adds_epi16(x6[5], x6[10]); + output[10] = _mm_subs_epi16(x6[5], x6[10]); + output[6] = _mm_adds_epi16(x6[6], x6[9]); + output[9] = _mm_subs_epi16(x6[6], x6[9]); + output[7] = _mm_adds_epi16(x6[7], x6[8]); + output[8] = _mm_subs_epi16(x6[7], x6[8]); +} + void idct32_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { (void)(cos_bit); const int32_t *cospi = cospi_arr(INV_COS_BIT); @@ -1399,6 +1639,58 @@ } } +// TODO(binpengsmail@gmail.com): +// To explore the reuse of VP9 versions of corresponding SSE2 functions and +// evaluate whether there is a possibility for further speedup. 
+void iadst4_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { + (void)(cos_bit); + const int32_t *sinpi = sinpi_arr(INV_COS_BIT); + const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]); + const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]); + const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]); + const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]); + const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]); + const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]); + const __m128i sinpi_0_p02 = pair_set_epi16(0, sinpi[2]); + const __m128i sinpi_p03_p04 = pair_set_epi16(sinpi[3], sinpi[4]); + __m128i x0[4]; + x0[0] = input[0]; + x0[1] = input[1]; + x0[2] = input[2]; + x0[3] = input[3]; + + __m128i u[2]; + u[0] = _mm_unpacklo_epi16(x0[0], x0[2]); + u[1] = _mm_unpacklo_epi16(x0[1], x0[3]); + + __m128i x1[8]; + x1[0] = _mm_madd_epi16(u[0], sinpi_p01_p04); // x0*sin1 + x2*sin4 + x1[1] = _mm_madd_epi16(u[0], sinpi_p02_m01); // x0*sin2 - x2*sin1 + x1[2] = _mm_madd_epi16(u[1], sinpi_p03_p02); // x1*sin3 + x3*sin2 + x1[3] = _mm_madd_epi16(u[1], sinpi_p03_m04); // x1*sin3 - x3*sin4 + x1[4] = _mm_madd_epi16(u[0], sinpi_p03_m03); // x0*sin3 - x2*sin3 + x1[5] = _mm_madd_epi16(u[1], sinpi_0_p03); // x2*sin3 + x1[6] = _mm_madd_epi16(u[1], sinpi_0_p02); // x3*sin2 + x1[7] = _mm_madd_epi16(u[1], sinpi_p03_p04); // x1*sin3 + x3*sin4 + + __m128i x2[4]; + x2[0] = _mm_add_epi32(x1[0], x1[2]); // x0*sin1 + x2*sin4 + x1*sin3 + x3*sin2 + x2[1] = _mm_add_epi32(x1[1], x1[3]); // x0*sin2 - x2*sin1 +x1*sin3 - x3*sin4 + x2[2] = _mm_add_epi32(x1[4], x1[5]); // x0*sin3 - x2*sin3 +x3*sin3 + x2[3] = _mm_add_epi32(x1[0], x1[1]); // x0*sin1 + x2*sin4 + x0*sin2 - x2*sin1 + // x0*sin1 + x2*sin4 + x3*sin2 + x0*sin2 - x2*sin1 + x2[3] = _mm_add_epi32(x2[3], x1[6]); + // x0*sin1 + x2*sin4 + x3*sin2 + x0*sin2 - x2*sin1 + x2[3] = _mm_sub_epi32(x2[3], x1[7]); + + const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); 
+ for (int i = 0; i < 4; ++i) { + __m128i out0 = _mm_add_epi32(x2[i], rounding); + out0 = _mm_srai_epi32(out0, INV_COS_BIT); + output[i] = _mm_packs_epi32(out0, out0); + } +} + void iadst8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { (void)(cos_bit); const int32_t *cospi = cospi_arr(INV_COS_BIT); @@ -1488,6 +1780,95 @@ output[7] = _mm_subs_epi16(__zero, x6[1]); } +void iadst8_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { + (void)(cos_bit); + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __zero = _mm_setzero_si128(); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); + __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); + __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); + __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); + __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); + __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); + __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); + __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); + __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + + // stage 1 + __m128i x1[8]; + x1[0] = input[7]; + x1[1] = input[0]; + x1[2] = input[5]; + x1[3] = input[2]; + x1[4] = input[3]; + x1[5] = input[4]; + x1[6] = input[1]; + x1[7] = input[6]; + + // stage 2 + __m128i x2[8]; + btf_16_4p_sse2(cospi_p04_p60, cospi_p60_m04, x1[0], x1[1], x2[0], x2[1]); + btf_16_4p_sse2(cospi_p20_p44, cospi_p44_m20, x1[2], x1[3], x2[2], x2[3]); + btf_16_4p_sse2(cospi_p36_p28, cospi_p28_m36, x1[4], x1[5], x2[4], x2[5]); + btf_16_4p_sse2(cospi_p52_p12, cospi_p12_m52, x1[6], 
x1[7], x2[6], x2[7]); + + // stage 3 + __m128i x3[8]; + x3[0] = _mm_adds_epi16(x2[0], x2[4]); + x3[4] = _mm_subs_epi16(x2[0], x2[4]); + x3[1] = _mm_adds_epi16(x2[1], x2[5]); + x3[5] = _mm_subs_epi16(x2[1], x2[5]); + x3[2] = _mm_adds_epi16(x2[2], x2[6]); + x3[6] = _mm_subs_epi16(x2[2], x2[6]); + x3[3] = _mm_adds_epi16(x2[3], x2[7]); + x3[7] = _mm_subs_epi16(x2[3], x2[7]); + + // stage 4 + __m128i x4[8]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[2] = x3[2]; + x4[3] = x3[3]; + btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x3[4], x3[5], x4[4], x4[5]); + btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x3[6], x3[7], x4[6], x4[7]); + + // stage 5 + __m128i x5[8]; + x5[0] = _mm_adds_epi16(x4[0], x4[2]); + x5[2] = _mm_subs_epi16(x4[0], x4[2]); + x5[1] = _mm_adds_epi16(x4[1], x4[3]); + x5[3] = _mm_subs_epi16(x4[1], x4[3]); + x5[4] = _mm_adds_epi16(x4[4], x4[6]); + x5[6] = _mm_subs_epi16(x4[4], x4[6]); + x5[5] = _mm_adds_epi16(x4[5], x4[7]); + x5[7] = _mm_subs_epi16(x4[5], x4[7]); + + // stage 6 + __m128i x6[8]; + x6[0] = x5[0]; + x6[1] = x5[1]; + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x5[2], x5[3], x6[2], x6[3]); + x6[4] = x5[4]; + x6[5] = x5[5]; + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x5[6], x5[7], x6[6], x6[7]); + + // stage 7 + output[0] = x6[0]; + output[1] = _mm_subs_epi16(__zero, x6[4]); + output[2] = x6[6]; + output[3] = _mm_subs_epi16(__zero, x6[2]); + output[4] = x6[3]; + output[5] = _mm_subs_epi16(__zero, x6[7]); + output[6] = x6[5]; + output[7] = _mm_subs_epi16(__zero, x6[1]); +} + void iadst16_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { (void)(cos_bit); const int32_t *cospi = cospi_arr(INV_COS_BIT); @@ -1673,6 +2054,192 @@ output[15] = _mm_subs_epi16(__zero, x8[1]); } +void iadst16_w4_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)(cos_bit); + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __zero = _mm_setzero_si128(); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + 
__m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]); + __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]); + __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]); + __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]); + __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]); + __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]); + __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]); + __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]); + __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]); + __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]); + __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]); + __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]); + __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]); + __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]); + __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]); + __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]); + __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); + __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]); + __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]); + __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + + // stage 1 + __m128i x1[16]; + x1[0] = input[15]; + x1[1] = input[0]; + x1[2] = input[13]; + x1[3] = input[2]; + x1[4] = input[11]; + x1[5] = input[4]; + x1[6] = input[9]; + x1[7] = input[6]; + x1[8] = input[7]; + x1[9] = input[8]; + x1[10] = input[5]; + x1[11] 
= input[10]; + x1[12] = input[3]; + x1[13] = input[12]; + x1[14] = input[1]; + x1[15] = input[14]; + + // stage 2 + __m128i x2[16]; + btf_16_4p_sse2(cospi_p02_p62, cospi_p62_m02, x1[0], x1[1], x2[0], x2[1]); + btf_16_4p_sse2(cospi_p10_p54, cospi_p54_m10, x1[2], x1[3], x2[2], x2[3]); + btf_16_4p_sse2(cospi_p18_p46, cospi_p46_m18, x1[4], x1[5], x2[4], x2[5]); + btf_16_4p_sse2(cospi_p26_p38, cospi_p38_m26, x1[6], x1[7], x2[6], x2[7]); + btf_16_4p_sse2(cospi_p34_p30, cospi_p30_m34, x1[8], x1[9], x2[8], x2[9]); + btf_16_4p_sse2(cospi_p42_p22, cospi_p22_m42, x1[10], x1[11], x2[10], x2[11]); + btf_16_4p_sse2(cospi_p50_p14, cospi_p14_m50, x1[12], x1[13], x2[12], x2[13]); + btf_16_4p_sse2(cospi_p58_p06, cospi_p06_m58, x1[14], x1[15], x2[14], x2[15]); + + // stage 3 + __m128i x3[16]; + x3[0] = _mm_adds_epi16(x2[0], x2[8]); + x3[8] = _mm_subs_epi16(x2[0], x2[8]); + x3[1] = _mm_adds_epi16(x2[1], x2[9]); + x3[9] = _mm_subs_epi16(x2[1], x2[9]); + x3[2] = _mm_adds_epi16(x2[2], x2[10]); + x3[10] = _mm_subs_epi16(x2[2], x2[10]); + x3[3] = _mm_adds_epi16(x2[3], x2[11]); + x3[11] = _mm_subs_epi16(x2[3], x2[11]); + x3[4] = _mm_adds_epi16(x2[4], x2[12]); + x3[12] = _mm_subs_epi16(x2[4], x2[12]); + x3[5] = _mm_adds_epi16(x2[5], x2[13]); + x3[13] = _mm_subs_epi16(x2[5], x2[13]); + x3[6] = _mm_adds_epi16(x2[6], x2[14]); + x3[14] = _mm_subs_epi16(x2[6], x2[14]); + x3[7] = _mm_adds_epi16(x2[7], x2[15]); + x3[15] = _mm_subs_epi16(x2[7], x2[15]); + + // stage 4 + __m128i x4[16]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[2] = x3[2]; + x4[3] = x3[3]; + x4[4] = x3[4]; + x4[5] = x3[5]; + x4[6] = x3[6]; + x4[7] = x3[7]; + btf_16_4p_sse2(cospi_p08_p56, cospi_p56_m08, x3[8], x3[9], x4[8], x4[9]); + btf_16_4p_sse2(cospi_p40_p24, cospi_p24_m40, x3[10], x3[11], x4[10], x4[11]); + btf_16_4p_sse2(cospi_m56_p08, cospi_p08_p56, x3[12], x3[13], x4[12], x4[13]); + btf_16_4p_sse2(cospi_m24_p40, cospi_p40_p24, x3[14], x3[15], x4[14], x4[15]); + + // stage 5 + __m128i x5[16]; + x5[0] = _mm_adds_epi16(x4[0], 
x4[4]); + x5[4] = _mm_subs_epi16(x4[0], x4[4]); + x5[1] = _mm_adds_epi16(x4[1], x4[5]); + x5[5] = _mm_subs_epi16(x4[1], x4[5]); + x5[2] = _mm_adds_epi16(x4[2], x4[6]); + x5[6] = _mm_subs_epi16(x4[2], x4[6]); + x5[3] = _mm_adds_epi16(x4[3], x4[7]); + x5[7] = _mm_subs_epi16(x4[3], x4[7]); + x5[8] = _mm_adds_epi16(x4[8], x4[12]); + x5[12] = _mm_subs_epi16(x4[8], x4[12]); + x5[9] = _mm_adds_epi16(x4[9], x4[13]); + x5[13] = _mm_subs_epi16(x4[9], x4[13]); + x5[10] = _mm_adds_epi16(x4[10], x4[14]); + x5[14] = _mm_subs_epi16(x4[10], x4[14]); + x5[11] = _mm_adds_epi16(x4[11], x4[15]); + x5[15] = _mm_subs_epi16(x4[11], x4[15]); + + // stage 6 + __m128i x6[16]; + x6[0] = x5[0]; + x6[1] = x5[1]; + x6[2] = x5[2]; + x6[3] = x5[3]; + btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x5[4], x5[5], x6[4], x6[5]); + btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x5[6], x5[7], x6[6], x6[7]); + x6[8] = x5[8]; + x6[9] = x5[9]; + x6[10] = x5[10]; + x6[11] = x5[11]; + btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x5[12], x5[13], x6[12], x6[13]); + btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x5[14], x5[15], x6[14], x6[15]); + + // stage 7 + __m128i x7[16]; + x7[0] = _mm_adds_epi16(x6[0], x6[2]); + x7[2] = _mm_subs_epi16(x6[0], x6[2]); + x7[1] = _mm_adds_epi16(x6[1], x6[3]); + x7[3] = _mm_subs_epi16(x6[1], x6[3]); + x7[4] = _mm_adds_epi16(x6[4], x6[6]); + x7[6] = _mm_subs_epi16(x6[4], x6[6]); + x7[5] = _mm_adds_epi16(x6[5], x6[7]); + x7[7] = _mm_subs_epi16(x6[5], x6[7]); + x7[8] = _mm_adds_epi16(x6[8], x6[10]); + x7[10] = _mm_subs_epi16(x6[8], x6[10]); + x7[9] = _mm_adds_epi16(x6[9], x6[11]); + x7[11] = _mm_subs_epi16(x6[9], x6[11]); + x7[12] = _mm_adds_epi16(x6[12], x6[14]); + x7[14] = _mm_subs_epi16(x6[12], x6[14]); + x7[13] = _mm_adds_epi16(x6[13], x6[15]); + x7[15] = _mm_subs_epi16(x6[13], x6[15]); + + // stage 8 + __m128i x8[16]; + x8[0] = x7[0]; + x8[1] = x7[1]; + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x7[2], x7[3], x8[2], x8[3]); + x8[4] = x7[4]; + x8[5] = x7[5]; + 
btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x7[6], x7[7], x8[6], x8[7]); + x8[8] = x7[8]; + x8[9] = x7[9]; + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x7[10], x7[11], x8[10], x8[11]); + x8[12] = x7[12]; + x8[13] = x7[13]; + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x7[14], x7[15], x8[14], x8[15]); + + // stage 9 + output[0] = x8[0]; + output[1] = _mm_subs_epi16(__zero, x8[8]); + output[2] = x8[12]; + output[3] = _mm_subs_epi16(__zero, x8[4]); + output[4] = x8[6]; + output[5] = _mm_subs_epi16(__zero, x8[14]); + output[6] = x8[10]; + output[7] = _mm_subs_epi16(__zero, x8[2]); + output[8] = x8[3]; + output[9] = _mm_subs_epi16(__zero, x8[11]); + output[10] = x8[15]; + output[11] = _mm_subs_epi16(__zero, x8[7]); + output[12] = x8[5]; + output[13] = _mm_subs_epi16(__zero, x8[13]); + output[14] = x8[9]; + output[15] = _mm_subs_epi16(__zero, x8[1]); +} + static void iidentity4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { (void)cos_bit; @@ -1691,6 +2258,21 @@ } } +static void iidentity4_w4_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const __m128i scale = _mm_set1_epi16(NewSqrt2); + const __m128i rounding = _mm_set1_epi16(1 << (NewSqrt2Bits - 1)); + const __m128i one = _mm_set1_epi16(1); + const __m128i scale_rounding = _mm_unpacklo_epi16(scale, rounding); + for (int i = 0; i < 4; ++i) { + __m128i a_lo = _mm_unpacklo_epi16(input[i], one); + __m128i b_lo = _mm_madd_epi16(a_lo, scale_rounding); + __m128i c_lo = _mm_srai_epi32(b_lo, NewSqrt2Bits); + output[i] = _mm_packs_epi32(c_lo, c_lo); + } +} + static void iidentity8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { (void)cos_bit; @@ -1717,6 +2299,21 @@ } } +static void iidentity16_w4_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const __m128i scale = _mm_set1_epi16(2 * NewSqrt2); + const __m128i rounding = _mm_set1_epi16(1 << (NewSqrt2Bits - 1)); + const __m128i one = _mm_set1_epi16(1); + const 
__m128i scale_rounding = _mm_unpacklo_epi16(scale, rounding); + for (int i = 0; i < 16; ++i) { + __m128i a_lo = _mm_unpacklo_epi16(input[i], one); + __m128i b_lo = _mm_madd_epi16(a_lo, scale_rounding); + __m128i c_lo = _mm_srai_epi32(b_lo, NewSqrt2Bits); + output[i] = _mm_packs_epi32(c_lo, c_lo); + } +} + static void iidentity32_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { (void)cos_bit; @@ -1777,17 +2374,50 @@ } } -static const transform_1d_sse2 lowbd_txfm_all_1d_arr[TX_SIZES][TX_TYPES_1D] = { - { idct4_new_sse2, iadst4_new_sse2, iadst4_new_sse2, iidentity4_new_sse2 }, - { idct8_new_sse2, iadst8_new_sse2, iadst8_new_sse2, iidentity8_new_sse2 }, - { idct16_new_sse2, iadst16_new_sse2, iadst16_new_sse2, iidentity16_new_sse2 }, - { idct32_new_sse2, NULL, NULL, iidentity32_new_sse2 }, - { idct64_new_sse2, NULL, NULL, iidentity64_new_sse2 }, +// 1D itx types +typedef enum ATTRIBUTE_PACKED { + IDCT_1D, + IADST_1D, + IFLIPADST_1D = IADST_1D, + IIDENTITY_1D, + ITX_TYPES_1D, +} ITX_TYPE_1D; + +static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = { + IDCT_1D, IADST_1D, IDCT_1D, IADST_1D, + IFLIPADST_1D, IDCT_1D, IFLIPADST_1D, IADST_1D, + IFLIPADST_1D, IIDENTITY_1D, IDCT_1D, IIDENTITY_1D, + IADST_1D, IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D, }; -// TODO(binpengsmail@gmail.com): Replace 1D txfm functions with functions which -// process 4 pixels at one time. Currently use functions which process 8 pixels -// at one time. +static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = { + IDCT_1D, IDCT_1D, IADST_1D, IADST_1D, + IDCT_1D, IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D, + IADST_1D, IIDENTITY_1D, IIDENTITY_1D, IDCT_1D, + IIDENTITY_1D, IADST_1D, IIDENTITY_1D, IFLIPADST_1D, +}; + +// 1D functions process 8 pixels at one time.
+static const transform_1d_sse2 + lowbd_txfm_all_1d_w8_arr[TX_SIZES][ITX_TYPES_1D] = { + { idct4_new_sse2, iadst4_new_sse2, iidentity4_new_sse2 }, + { idct8_new_sse2, iadst8_new_sse2, iidentity8_new_sse2 }, + { idct16_new_sse2, iadst16_new_sse2, iidentity16_new_sse2 }, + { idct32_new_sse2, NULL, iidentity32_new_sse2 }, + { idct64_new_sse2, NULL, iidentity64_new_sse2 }, + }; + +// 1D functions process 4 pixels at one time. +// used in 4x4, 4x8, 4x16, 8x4, 16x4 +static const transform_1d_sse2 + lowbd_txfm_all_1d_w4_arr[TX_SIZES][ITX_TYPES_1D] = { + { idct4_w4_new_sse2, iadst4_w4_new_sse2, iidentity4_w4_new_sse2 }, + { idct8_w4_new_sse2, iadst8_w4_new_sse2, iidentity8_new_sse2 }, + { idct16_w4_new_sse2, iadst16_w4_new_sse2, iidentity16_w4_new_sse2 }, + { idct32_new_sse2, NULL, iidentity32_new_sse2 }, + { idct64_new_sse2, NULL, iidentity64_new_sse2 }, + }; + void av1_lowbd_inv_txfm2d_add_4x4_sse2(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; @@ -1802,9 +2432,9 @@ const int txfm_size_row = tx_size_high[tx_size]; const transform_1d_sse2 row_txfm = - lowbd_txfm_all_1d_arr[txw_idx][htx_tab[tx_type]]; + lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]]; const transform_1d_sse2 col_txfm = - lowbd_txfm_all_1d_arr[txh_idx][vtx_tab[tx_type]]; + lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); @@ -1837,9 +2467,9 @@ const int txfm_size_row = tx_size_high[tx_size]; const transform_1d_sse2 row_txfm = - lowbd_txfm_all_1d_arr[txw_idx][htx_tab[tx_type]]; + lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]]; const transform_1d_sse2 col_txfm = - lowbd_txfm_all_1d_arr[txh_idx][vtx_tab[tx_type]]; + lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); @@ -1915,9 +2545,9 @@ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); const transform_1d_sse2 row_txfm = - 
lowbd_txfm_all_1d_arr[txw_idx][htx_tab[tx_type]]; + lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]]; const transform_1d_sse2 col_txfm = - lowbd_txfm_all_1d_arr[txh_idx][vtx_tab[tx_type]]; + lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]]; assert(col_txfm != NULL); assert(row_txfm != NULL); @@ -2012,9 +2642,9 @@ const int txfm_size_row = tx_size_high[tx_size]; const transform_1d_sse2 row_txfm = - lowbd_txfm_all_1d_arr[txw_idx][htx_tab[tx_type]]; + lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]]; const transform_1d_sse2 col_txfm = - lowbd_txfm_all_1d_arr[txh_idx][vtx_tab[tx_type]]; + lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); @@ -2049,9 +2679,9 @@ const int txfm_size_row = tx_size_high[tx_size]; const transform_1d_sse2 row_txfm = - lowbd_txfm_all_1d_arr[txw_idx][htx_tab[tx_type]]; + lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]]; const transform_1d_sse2 col_txfm = - lowbd_txfm_all_1d_arr[txh_idx][vtx_tab[tx_type]]; + lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); @@ -2138,9 +2768,9 @@ const int txfm_size_row = tx_size_high[tx_size]; const transform_1d_sse2 row_txfm = - lowbd_txfm_all_1d_arr[txw_idx][htx_tab[tx_type]]; + lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]]; const transform_1d_sse2 col_txfm = - lowbd_txfm_all_1d_arr[txh_idx][vtx_tab[tx_type]]; + lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); @@ -2182,9 +2812,9 @@ const int buf_size_w_div8 = txfm_size_col >> 3; const transform_1d_sse2 row_txfm = - lowbd_txfm_all_1d_arr[txw_idx][htx_tab[tx_type]]; + lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]]; const transform_1d_sse2 col_txfm = - lowbd_txfm_all_1d_arr[txh_idx][vtx_tab[tx_type]]; + lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]]; int ud_flip, lr_flip; get_flip_cfg(tx_type, 
&ud_flip, &lr_flip);
diff --git a/av1/common/x86/av1_txfm_sse2.h b/av1/common/x86/av1_txfm_sse2.h index 2707bde..0f88f05 100644 --- a/av1/common/x86/av1_txfm_sse2.h +++ b/av1/common/x86/av1_txfm_sse2.h
@@ -40,6 +40,22 @@ *out1 = _mm_packs_epi32(d0, c0); } +#define btf_16_4p_sse2(w0, w1, in0, in1, out0, out1) \ + { \ + __m128i t0 = _mm_unpacklo_epi16(in0, in1); \ + __m128i u0 = _mm_madd_epi16(t0, w0); \ + __m128i v0 = _mm_madd_epi16(t0, w1); \ + \ + __m128i a0 = _mm_add_epi32(u0, __rounding); \ + __m128i b0 = _mm_add_epi32(v0, __rounding); \ + \ + __m128i c0 = _mm_srai_epi32(a0, cos_bit); \ + __m128i d0 = _mm_srai_epi32(b0, cos_bit); \ + \ + out0 = _mm_packs_epi32(c0, c0); \ + out1 = _mm_packs_epi32(d0, d0); \ + } + #define btf_16_sse2(w0, w1, in0, in1, out0, out1) \ { \ __m128i t0 = _mm_unpacklo_epi16(in0, in1); \