Optimize highbd 4x16 and 16x4 inv_txfm. Enabled sse4_1 optimizations for tx_sizes 4x16 and 16x4. Module-level gains (Tx_size: gain w.r.t. C): 4x16: 6.01x; 16x4: 7.36x. When tested on 20 frames of crowd_run_360p_10 at 1 Mbps with the speed=1 preset, a ~0.5% reduction in encoder time was observed. Change-Id: I0c5d6530c666150be0de062a7084c7a2bf61410f
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index a6d5138..033eeb7 100755 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl
@@ -135,6 +135,10 @@ specialize qw/av1_highbd_inv_txfm_add_4x8 sse4_1/; add_proto qw/void av1_highbd_inv_txfm_add_8x4/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; specialize qw/av1_highbd_inv_txfm_add_8x4 sse4_1/; +add_proto qw/void av1_highbd_inv_txfm_add_4x16/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; +specialize qw/av1_highbd_inv_txfm_add_4x16 sse4_1/; +add_proto qw/void av1_highbd_inv_txfm_add_16x4/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; +specialize qw/av1_highbd_inv_txfm_add_16x4 sse4_1/; add_proto qw/void av1_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; add_proto qw/void av1_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
diff --git a/av1/common/idct.c b/av1/common/idct.c index 0261f6f..55925a5 100644 --- a/av1/common/idct.c +++ b/av1/common/idct.c
@@ -86,15 +86,15 @@ txfm_param->tx_type, txfm_param->bd); } -void av1_highbd_inv_txfm_add_16x4(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_16x4_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_16x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } -void av1_highbd_inv_txfm_add_4x16(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_4x16_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_4x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); @@ -263,10 +263,10 @@ av1_highbd_inv_txfm_add_4x4_c(input, dest, stride, txfm_param); break; case TX_16X4: - av1_highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_16x4_c(input, dest, stride, txfm_param); break; case TX_4X16: - av1_highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_4x16_c(input, dest, stride, txfm_param); break; case TX_8X32: av1_highbd_inv_txfm_add_8x32_c(input, dest, stride, txfm_param);
diff --git a/av1/common/idct.h b/av1/common/idct.h index 00a55f9..004d25d 100644 --- a/av1/common/idct.h +++ b/av1/common/idct.h
@@ -44,12 +44,6 @@ return (const int32_t *)input; } -typedef void(highbd_inv_txfm_add)(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *param); - -highbd_inv_txfm_add av1_highbd_inv_txfm_add_16x4; -highbd_inv_txfm_add av1_highbd_inv_txfm_add_4x16; - #ifdef __cplusplus } // extern "C" #endif
diff --git a/av1/common/x86/highbd_inv_txfm_avx2.c b/av1/common/x86/highbd_inv_txfm_avx2.c index 7e53525..9a1224c 100644 --- a/av1/common/x86/highbd_inv_txfm_avx2.c +++ b/av1/common/x86/highbd_inv_txfm_avx2.c
@@ -1320,10 +1320,10 @@ av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param); break; case TX_16X4: - av1_highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_16x4_sse4_1(input, dest, stride, txfm_param); break; case TX_4X16: - av1_highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_4x16_sse4_1(input, dest, stride, txfm_param); break; case TX_8X32: av1_highbd_inv_txfm_add_8x32_sse4_1(input, dest, stride, txfm_param);
diff --git a/av1/common/x86/highbd_inv_txfm_sse4.c b/av1/common/x86/highbd_inv_txfm_sse4.c index 254abcd..41afae0 100644 --- a/av1/common/x86/highbd_inv_txfm_sse4.c +++ b/av1/common/x86/highbd_inv_txfm_sse4.c
@@ -5480,6 +5480,121 @@ txfm_size_row, bd); } +static void highbd_inv_txfm2d_add_4x16_sse4_1(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd) { + (void)eob; + __m128i buf1[16]; + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_h_div8 = txfm_size_row >> 2; + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][2]; + const int input_stride = AOMMIN(32, txfm_size_col); + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: column transform + __m128i buf0[16]; + const int32_t *input_row = input; + __m128i *buf0_cur = buf0; + load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_row); + for (int i = 0; i < (txfm_size_row >> 2); i++) { + row_txfm(buf0 + (i << 2), buf0 + (i << 2), + inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); + } + + av1_round_shift_array_32_sse4_1(buf0, buf0, txfm_size_row, -shift[0]); + + if (lr_flip) { + for (int j = 0; j < buf_size_h_div8; ++j) { + TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1], + buf0[4 * j], buf1[4 * j], buf1[4 * j + 1], buf1[4 * j + 2], + buf1[4 * j + 3]); + } + } else { + for (int j = 0; j < buf_size_h_div8; ++j) { + TRANSPOSE_4X4(buf0[4 * j], buf0[4 * j + 1], buf0[4 * j + 2], + buf0[4 * j + 3], buf1[4 * j], buf1[4 * j + 1], + buf1[4 * j + 2], buf1[4 * j + 3]); + } + } + + // 2nd stage: column transform + col_txfm(buf1, buf1, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + + av1_round_shift_array_32_sse4_1(buf1, buf1, txfm_size_row, -shift[1]); + + // write to buffer + 
highbd_write_buffer_4xn_sse4_1(buf1, output, stride, ud_flip, txfm_size_row, + bd); +} + +static void highbd_inv_txfm2d_add_16x4_sse4_1(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd) { + (void)eob; + __m128i buf1[16]; + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div8 = txfm_size_col >> 2; + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][2]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: column transform + __m128i buf0[16]; + const int32_t *input_row = input; + load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col); + + for (int j = 0; j < buf_size_w_div8; j++) { + TRANSPOSE_4X4(buf0[j], buf0[j + 4], buf0[j + 8], buf0[j + 12], buf1[4 * j], + buf1[4 * j + 1], buf1[4 * j + 2], buf1[4 * j + 3]); + } + row_txfm(buf1, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); + + __m128i *buf1_ptr; + if (lr_flip) { + flip_buf_sse2(buf0, buf1, txfm_size_col); + buf1_ptr = buf1; + } else { + buf1_ptr = buf0; + } + + // 2nd stage: column transform + for (int i = 0; i < buf_size_w_div8; i++) { + col_txfm(buf1_ptr + i * txfm_size_row, buf1_ptr + i * txfm_size_row, + inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + } + av1_round_shift_array_32_sse4_1(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]); + + // write to buffer + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_sse4_1(buf1_ptr + i * txfm_size_row * 2, + output + 8 * i, stride, ud_flip, + txfm_size_row, bd); + } +} + void 
av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, @@ -5558,6 +5673,62 @@ } } +void av1_highbd_inv_txfm_add_4x16_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const TX_SIZE tx_size = txfm_param->tx_size; + const int32_t *src = cast_to_int32(input); + int eob = txfm_param->eob; + switch (tx_type) { + // Assembly version doesn't support some transform types, so use C version + // for those. + case V_DCT: + case H_DCT: + case V_ADST: + case H_ADST: + case V_FLIPADST: + case H_FLIPADST: + case IDTX: + av1_inv_txfm2d_add_4x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, + bd); + break; + default: + highbd_inv_txfm2d_add_4x16_sse4_1(input, CONVERT_TO_SHORTPTR(dest), + stride, tx_type, tx_size, eob, bd); + break; + } +} + +void av1_highbd_inv_txfm_add_16x4_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const TX_SIZE tx_size = txfm_param->tx_size; + const int32_t *src = cast_to_int32(input); + int eob = txfm_param->eob; + switch (tx_type) { + // Assembly version doesn't support some transform types, so use C version + // for those. 
+ case V_DCT: + case H_DCT: + case V_ADST: + case H_ADST: + case V_FLIPADST: + case H_FLIPADST: + case IDTX: + av1_inv_txfm2d_add_16x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, + bd); + break; + default: + highbd_inv_txfm2d_add_16x4_sse4_1(input, CONVERT_TO_SHORTPTR(dest), + stride, tx_type, tx_size, eob, bd); + break; + } +} + void av1_highbd_inv_txfm_add_sse4_1(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); @@ -5594,10 +5765,10 @@ av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param); break; case TX_16X4: - av1_highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_16x4_sse4_1(input, dest, stride, txfm_param); break; case TX_4X16: - av1_highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_4x16_sse4_1(input, dest, stride, txfm_param); break; case TX_8X32: av1_highbd_inv_txfm_add_8x32_sse4_1(input, dest, stride, txfm_param);