Optimize highbd 64x64 fwd_txfm Added sse4_1 variant for highbd 64x64 fwd_txfm. When tested for 20 frames of crowd_run_360p_10 at 750 kbps, observed ~3.7% reduction in encoder time for speed=1 preset. Achieved module level gains of 4x w.r.t. C function. Change-Id: Id9da2231a7a5c0eebe81f5062f8c2d5a7feb3227
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index a61f1d7..c605906 100755 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl
@@ -217,6 +217,7 @@ specialize qw/av1_fwd_txfm2d_32x32 sse4_1/; add_proto qw/void av1_fwd_txfm2d_64x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; + specialize qw/av1_fwd_txfm2d_64x64 sse4_1/; add_proto qw/void av1_fwd_txfm2d_32x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; add_proto qw/void av1_fwd_txfm2d_64x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; add_proto qw/void av1_fwd_txfm2d_16x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
diff --git a/av1/common/x86/highbd_txfm_utility_sse4.h b/av1/common/x86/highbd_txfm_utility_sse4.h index 0636555..6f24e59 100644 --- a/av1/common/x86/highbd_txfm_utility_sse4.h +++ b/av1/common/x86/highbd_txfm_utility_sse4.h
@@ -75,6 +75,17 @@ out[63]); } +static INLINE void transpose_32x32(const __m128i *input, __m128i *output) { + for (int j = 0; j < 8; j++) { + for (int i = 0; i < 8; i++) { + TRANSPOSE_4X4(input[i * 32 + j + 0], input[i * 32 + j + 8], + input[i * 32 + j + 16], input[i * 32 + j + 24], + output[j * 32 + i + 0], output[j * 32 + i + 8], + output[j * 32 + i + 16], output[j * 32 + i + 24]); + } + } +} + // Note: // rounding = 1 << (bit - 1) static INLINE __m128i half_btf_sse4_1(const __m128i *w0, const __m128i *n0,
diff --git a/av1/encoder/x86/av1_fwd_txfm1d_sse4.c b/av1/encoder/x86/av1_fwd_txfm1d_sse4.c index c71f2e7..0761554 100644 --- a/av1/encoder/x86/av1_fwd_txfm1d_sse4.c +++ b/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
@@ -395,7 +395,8 @@ } void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output, - int8_t cos_bit) { + int8_t cos_bit, const int instride, + const int outstride) { const int32_t *cospi = cospi_arr(cos_bit); const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); @@ -480,70 +481,70 @@ // stage 1 __m128i x1[64]; - x1[0] = _mm_add_epi32(input[0], input[63]); - x1[63] = _mm_sub_epi32(input[0], input[63]); - x1[1] = _mm_add_epi32(input[1], input[62]); - x1[62] = _mm_sub_epi32(input[1], input[62]); - x1[2] = _mm_add_epi32(input[2], input[61]); - x1[61] = _mm_sub_epi32(input[2], input[61]); - x1[3] = _mm_add_epi32(input[3], input[60]); - x1[60] = _mm_sub_epi32(input[3], input[60]); - x1[4] = _mm_add_epi32(input[4], input[59]); - x1[59] = _mm_sub_epi32(input[4], input[59]); - x1[5] = _mm_add_epi32(input[5], input[58]); - x1[58] = _mm_sub_epi32(input[5], input[58]); - x1[6] = _mm_add_epi32(input[6], input[57]); - x1[57] = _mm_sub_epi32(input[6], input[57]); - x1[7] = _mm_add_epi32(input[7], input[56]); - x1[56] = _mm_sub_epi32(input[7], input[56]); - x1[8] = _mm_add_epi32(input[8], input[55]); - x1[55] = _mm_sub_epi32(input[8], input[55]); - x1[9] = _mm_add_epi32(input[9], input[54]); - x1[54] = _mm_sub_epi32(input[9], input[54]); - x1[10] = _mm_add_epi32(input[10], input[53]); - x1[53] = _mm_sub_epi32(input[10], input[53]); - x1[11] = _mm_add_epi32(input[11], input[52]); - x1[52] = _mm_sub_epi32(input[11], input[52]); - x1[12] = _mm_add_epi32(input[12], input[51]); - x1[51] = _mm_sub_epi32(input[12], input[51]); - x1[13] = _mm_add_epi32(input[13], input[50]); - x1[50] = _mm_sub_epi32(input[13], input[50]); - x1[14] = _mm_add_epi32(input[14], input[49]); - x1[49] = _mm_sub_epi32(input[14], input[49]); - x1[15] = _mm_add_epi32(input[15], input[48]); - x1[48] = _mm_sub_epi32(input[15], input[48]); - x1[16] = _mm_add_epi32(input[16], input[47]); - x1[47] = _mm_sub_epi32(input[16], input[47]); - x1[17] = _mm_add_epi32(input[17], input[46]); - x1[46] = _mm_sub_epi32(input[17], input[46]); - x1[18] = _mm_add_epi32(input[18], input[45]); - x1[45] = _mm_sub_epi32(input[18], input[45]); - x1[19] = _mm_add_epi32(input[19], input[44]); - x1[44] = _mm_sub_epi32(input[19], input[44]); - x1[20] = _mm_add_epi32(input[20], input[43]); - x1[43] = _mm_sub_epi32(input[20], input[43]); - x1[21] = _mm_add_epi32(input[21], input[42]); - x1[42] = _mm_sub_epi32(input[21], input[42]); - x1[22] = _mm_add_epi32(input[22], input[41]); - x1[41] = _mm_sub_epi32(input[22], input[41]); - x1[23] = _mm_add_epi32(input[23], input[40]); - x1[40] = _mm_sub_epi32(input[23], input[40]); - x1[24] = _mm_add_epi32(input[24], input[39]); - x1[39] = _mm_sub_epi32(input[24], input[39]); - x1[25] = _mm_add_epi32(input[25], input[38]); - x1[38] = _mm_sub_epi32(input[25], input[38]); - x1[26] = _mm_add_epi32(input[26], input[37]); - x1[37] = _mm_sub_epi32(input[26], input[37]); - x1[27] = _mm_add_epi32(input[27], input[36]); - x1[36] = _mm_sub_epi32(input[27], input[36]); - x1[28] = _mm_add_epi32(input[28], input[35]); - x1[35] = _mm_sub_epi32(input[28], input[35]); - x1[29] = _mm_add_epi32(input[29], input[34]); - x1[34] = _mm_sub_epi32(input[29], input[34]); - x1[30] = _mm_add_epi32(input[30], input[33]); - x1[33] = _mm_sub_epi32(input[30], input[33]); - x1[31] = _mm_add_epi32(input[31], input[32]); - x1[32] = _mm_sub_epi32(input[31], input[32]); + x1[0] = _mm_add_epi32(input[0 * instride], input[63 * instride]); + x1[63] = _mm_sub_epi32(input[0 * instride], input[63 * instride]); + x1[1] = _mm_add_epi32(input[1 * instride], input[62 * instride]); + x1[62] = _mm_sub_epi32(input[1 * instride], input[62 * instride]); + x1[2] = _mm_add_epi32(input[2 * instride], input[61 * instride]); + x1[61] = _mm_sub_epi32(input[2 * instride], input[61 * instride]); + x1[3] = _mm_add_epi32(input[3 * instride], input[60 * instride]); + x1[60] = _mm_sub_epi32(input[3 * instride], input[60 * instride]); + x1[4] = _mm_add_epi32(input[4 * instride], input[59 * instride]); + x1[59] = _mm_sub_epi32(input[4 * instride], input[59 * instride]); + x1[5] = _mm_add_epi32(input[5 * instride], input[58 * instride]); + x1[58] = _mm_sub_epi32(input[5 * instride], input[58 * instride]); + x1[6] = _mm_add_epi32(input[6 * instride], input[57 * instride]); + x1[57] = _mm_sub_epi32(input[6 * instride], input[57 * instride]); + x1[7] = _mm_add_epi32(input[7 * instride], input[56 * instride]); + x1[56] = _mm_sub_epi32(input[7 * instride], input[56 * instride]); + x1[8] = _mm_add_epi32(input[8 * instride], input[55 * instride]); + x1[55] = _mm_sub_epi32(input[8 * instride], input[55 * instride]); + x1[9] = _mm_add_epi32(input[9 * instride], input[54 * instride]); + x1[54] = _mm_sub_epi32(input[9 * instride], input[54 * instride]); + x1[10] = _mm_add_epi32(input[10 * instride], input[53 * instride]); + x1[53] = _mm_sub_epi32(input[10 * instride], input[53 * instride]); + x1[11] = _mm_add_epi32(input[11 * instride], input[52 * instride]); + x1[52] = _mm_sub_epi32(input[11 * instride], input[52 * instride]); + x1[12] = _mm_add_epi32(input[12 * instride], input[51 * instride]); + x1[51] = _mm_sub_epi32(input[12 * instride], input[51 * instride]); + x1[13] = _mm_add_epi32(input[13 * instride], input[50 * instride]); + x1[50] = _mm_sub_epi32(input[13 * instride], input[50 * instride]); + x1[14] = _mm_add_epi32(input[14 * instride], input[49 * instride]); + x1[49] = _mm_sub_epi32(input[14 * instride], input[49 * instride]); + x1[15] = _mm_add_epi32(input[15 * instride], input[48 * instride]); + x1[48] = _mm_sub_epi32(input[15 * instride], input[48 * instride]); + x1[16] = _mm_add_epi32(input[16 * instride], input[47 * instride]); + x1[47] = _mm_sub_epi32(input[16 * instride], input[47 * instride]); + x1[17] = _mm_add_epi32(input[17 * instride], input[46 * instride]); + x1[46] = _mm_sub_epi32(input[17 * instride], input[46 * instride]); + x1[18] = _mm_add_epi32(input[18 * instride], input[45 * instride]); + x1[45] = _mm_sub_epi32(input[18 * instride], input[45 * instride]); + x1[19] = _mm_add_epi32(input[19 * instride], input[44 * instride]); + x1[44] = _mm_sub_epi32(input[19 * instride], input[44 * instride]); + x1[20] = _mm_add_epi32(input[20 * instride], input[43 * instride]); + x1[43] = _mm_sub_epi32(input[20 * instride], input[43 * instride]); + x1[21] = _mm_add_epi32(input[21 * instride], input[42 * instride]); + x1[42] = _mm_sub_epi32(input[21 * instride], input[42 * instride]); + x1[22] = _mm_add_epi32(input[22 * instride], input[41 * instride]); + x1[41] = _mm_sub_epi32(input[22 * instride], input[41 * instride]); + x1[23] = _mm_add_epi32(input[23 * instride], input[40 * instride]); + x1[40] = _mm_sub_epi32(input[23 * instride], input[40 * instride]); + x1[24] = _mm_add_epi32(input[24 * instride], input[39 * instride]); + x1[39] = _mm_sub_epi32(input[24 * instride], input[39 * instride]); + x1[25] = _mm_add_epi32(input[25 * instride], input[38 * instride]); + x1[38] = _mm_sub_epi32(input[25 * instride], input[38 * instride]); + x1[26] = _mm_add_epi32(input[26 * instride], input[37 * instride]); + x1[37] = _mm_sub_epi32(input[26 * instride], input[37 * instride]); + x1[27] = _mm_add_epi32(input[27 * instride], input[36 * instride]); + x1[36] = _mm_sub_epi32(input[27 * instride], input[36 * instride]); + x1[28] = _mm_add_epi32(input[28 * instride], input[35 * instride]); + x1[35] = _mm_sub_epi32(input[28 * instride], input[35 * instride]); + x1[29] = _mm_add_epi32(input[29 * instride], input[34 * instride]); + x1[34] = _mm_sub_epi32(input[29 * instride], input[34 * instride]); + x1[30] = _mm_add_epi32(input[30 * instride], input[33 * instride]); + x1[33] = _mm_sub_epi32(input[30 * instride], input[33 * instride]); + x1[31] = _mm_add_epi32(input[31 * instride], input[32 * instride]); + x1[32] = _mm_sub_epi32(input[31 * instride], input[32 * instride]); // stage 2 __m128i x2[64]; @@ -1149,68 +1150,68 @@ x10[48], __rounding, cos_bit); // stage 11 - output[0] = x10[0]; - output[1] = x10[32]; - output[2] = x10[16]; - output[3] = x10[48]; - output[4] = x10[8]; - output[5] = x10[40]; - output[6] = x10[24]; - output[7] = x10[56]; - output[8] = x10[4]; - output[9] = x10[36]; - output[10] = x10[20]; - output[11] = x10[52]; - output[12] = x10[12]; - output[13] = x10[44]; - output[14] = x10[28]; - output[15] = x10[60]; - output[16] = x10[2]; - output[17] = x10[34]; - output[18] = x10[18]; - output[19] = x10[50]; - output[20] = x10[10]; - output[21] = x10[42]; - output[22] = x10[26]; - output[23] = x10[58]; - output[24] = x10[6]; - output[25] = x10[38]; - output[26] = x10[22]; - output[27] = x10[54]; - output[28] = x10[14]; - output[29] = x10[46]; - output[30] = x10[30]; - output[31] = x10[62]; - output[32] = x10[1]; - output[33] = x10[33]; - output[34] = x10[17]; - output[35] = x10[49]; - output[36] = x10[9]; - output[37] = x10[41]; - output[38] = x10[25]; - output[39] = x10[57]; - output[40] = x10[5]; - output[41] = x10[37]; - output[42] = x10[21]; - output[43] = x10[53]; - output[44] = x10[13]; - output[45] = x10[45]; - output[46] = x10[29]; - output[47] = x10[61]; - output[48] = x10[3]; - output[49] = x10[35]; - output[50] = x10[19]; - output[51] = x10[51]; - output[52] = x10[11]; - output[53] = x10[43]; - output[54] = x10[27]; - output[55] = x10[59]; - output[56] = x10[7]; - output[57] = x10[39]; - output[58] = x10[23]; - output[59] = x10[55]; - output[60] = x10[15]; - output[61] = x10[47]; - output[62] = x10[31]; - output[63] = x10[63]; + output[0 * outstride] = x10[0]; + output[1 * outstride] = x10[32]; + output[2 * outstride] = x10[16]; + output[3 * outstride] = x10[48]; + output[4 * outstride] = x10[8]; + output[5 * outstride] = x10[40]; + output[6 * outstride] = x10[24]; + output[7 * outstride] = x10[56]; + output[8 * outstride] = x10[4]; + output[9 * outstride] = x10[36]; + output[10 * outstride] = x10[20]; + output[11 * outstride] = x10[52]; + output[12 * outstride] = x10[12]; + output[13 * outstride] = x10[44]; + output[14 * outstride] = x10[28]; + output[15 * outstride] = x10[60]; + output[16 * outstride] = x10[2]; + output[17 * outstride] = x10[34]; + output[18 * outstride] = x10[18]; + output[19 * outstride] = x10[50]; + output[20 * outstride] = x10[10]; + output[21 * outstride] = x10[42]; + output[22 * outstride] = x10[26]; + output[23 * outstride] = x10[58]; + output[24 * outstride] = x10[6]; + output[25 * outstride] = x10[38]; + output[26 * outstride] = x10[22]; + output[27 * outstride] = x10[54]; + output[28 * outstride] = x10[14]; + output[29 * outstride] = x10[46]; + output[30 * outstride] = x10[30]; + output[31 * outstride] = x10[62]; + output[32 * outstride] = x10[1]; + output[33 * outstride] = x10[33]; + output[34 * outstride] = x10[17]; + output[35 * outstride] = x10[49]; + output[36 * outstride] = x10[9]; + output[37 * outstride] = x10[41]; + output[38 * outstride] = x10[25]; + output[39 * outstride] = x10[57]; + output[40 * outstride] = x10[5]; + output[41 * outstride] = x10[37]; + output[42 * outstride] = x10[21]; + output[43 * outstride] = x10[53]; + output[44 * outstride] = x10[13]; + output[45 * outstride] = x10[45]; + output[46 * outstride] = x10[29]; + output[47 * outstride] = x10[61]; + output[48 * outstride] = x10[3]; + output[49 * outstride] = x10[35]; + output[50 * outstride] = x10[19]; + output[51 * outstride] = x10[51]; + output[52 * outstride] = x10[11]; + output[53 * outstride] = x10[43]; + output[54 * outstride] = x10[27]; + output[55 * outstride] = x10[59]; + output[56 * outstride] = x10[7]; + output[57 * outstride] = x10[39]; + output[58 * outstride] = x10[23]; + output[59 * outstride] = x10[55]; + output[60 * outstride] = x10[15]; + output[61 * outstride] = x10[47]; + output[62 * outstride] = x10[31]; + output[63 * outstride] = x10[63]; }
diff --git a/av1/encoder/x86/av1_fwd_txfm2d_sse4.c b/av1/encoder/x86/av1_fwd_txfm2d_sse4.c index 968f900..8ec0256 100644 --- a/av1/encoder/x86/av1_fwd_txfm2d_sse4.c +++ b/av1/encoder/x86/av1_fwd_txfm2d_sse4.c
@@ -14,6 +14,7 @@ #include "av1/common/enums.h" #include "av1/common/av1_txfm.h" #include "av1/common/x86/av1_txfm_sse2.h" +#include "av1/common/x86/highbd_txfm_utility_sse4.h" #include "av1/encoder/av1_fwd_txfm1d_cfg.h" #include "av1/encoder/x86/av1_txfm1d_sse4.h" #include "av1/encoder/x86/av1_fwd_txfm_sse2.h" @@ -52,9 +53,22 @@ } } +static void fdct64_new_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range) { + const int txfm_size = 64; + const int num_per_128 = 4; + int col_num = txfm_size / num_per_128; + (void)stage_range; + for (int col = 0; col < col_num; col++) { + av1_fdct64_new_sse4_1((input + col), (output + col), cos_bit, col_num, + col_num); + } +} + static INLINE TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) { switch (txfm_type) { case TXFM_TYPE_DCT32: return fdct32_new_sse4_1; break; + case TXFM_TYPE_DCT64: return fdct64_new_sse4_1; break; default: assert(0); } return NULL; @@ -95,6 +109,42 @@ transpose_32(txfm_size, buf_128, out_128); } +static INLINE void fwd_txfm2d_64x64_sse4_1(const int16_t *input, + int32_t *output, const int stride, + const TXFM_2D_FLIP_CFG *cfg, + int32_t *txfm_buf) { + assert(cfg->tx_size < TX_SIZES); + const int txfm_size = tx_size_wide[cfg->tx_size]; + const int8_t *shift = cfg->shift; + const int8_t *stage_range_col = cfg->stage_range_col; + const int8_t cos_bit_col = cfg->cos_bit_col; + const int8_t cos_bit_row = cfg->cos_bit_row; + const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col); + __m128i *buf_128 = (__m128i *)txfm_buf; + __m128i *out_128 = (__m128i *)output; + + const int num_per_128 = 4; + int txfm2d_size_128 = txfm_size * txfm_size / num_per_128; + int col_num = txfm_size / num_per_128; + + int16_array_with_stride_to_int32_array_without_stride(input, stride, output, + txfm_size); + /*col wise transform*/ + txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col); + av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]); + transpose_32(txfm_size, out_128, buf_128); + + /*row wise transform*/ + for (int col = 0; col < (col_num >> 1); col++) { + av1_fdct64_new_sse4_1((buf_128 + col), (out_128 + col), cos_bit_row, + col_num, (col_num >> 1)); + } + + txfm2d_size_128 = (col_num >> 1) * (txfm_size >> 1); + av1_round_shift_array_32_sse4_1(out_128, buf_128, txfm2d_size_128, -shift[2]); + transpose_32x32(buf_128, out_128); +} + void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { DECLARE_ALIGNED(16, int32_t, txfm_buf[1024]); @@ -104,6 +154,15 @@ fwd_txfm2d_sse4_1(input, output, stride, &cfg, txfm_buf); } +void av1_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(16, int32_t, txfm_buf[4096]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_64X64, &cfg); + (void)bd; + fwd_txfm2d_64x64_sse4_1(input, output, stride, &cfg, txfm_buf); +} + static INLINE void transpose_32_4x4x2(int stride, const __m128i *inputA, const __m128i *inputB, __m128i *output) { __m128i temp0 = _mm_unpacklo_epi32(inputA[0], inputA[2]); @@ -162,8 +221,8 @@ bufA[j] = _mm_cvtepi16_epi32(buf[j]); bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j])); } - av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row); - av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row); + av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row, 1, 1); + av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row, 1, 1); av1_round_shift_array_32_sse4_1(bufA, bufA, 32, -shift[2]); av1_round_shift_array_32_sse4_1(bufB, bufB, 32, -shift[2]); @@ -209,8 +268,8 @@ bufA[j] = _mm_cvtepi16_epi32(buf[j]); bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j])); } - av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row); - av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row); + av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row, 1, 1); + av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row, 1, 1); av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2], NewSqrt2); av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2], NewSqrt2);
diff --git a/av1/encoder/x86/av1_txfm1d_sse4.h b/av1/encoder/x86/av1_txfm1d_sse4.h index 17ca8fc..6df2a8b 100644 --- a/av1/encoder/x86/av1_txfm1d_sse4.h +++ b/av1/encoder/x86/av1_txfm1d_sse4.h
@@ -29,7 +29,8 @@ void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output, int8_t cos_bit); void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output, - int8_t cos_bit); + int8_t cos_bit, const int instride, + const int outstride); void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output, const int8_t cos_bit, const int8_t *stage_range);