Optimize highbd inv_txfm modules

Added SSE4_1 variants for vertical identity txfm types.

Module level gains:
Tx_size    Gain w.r.t. C
8x8        6.65x
8x16       6.15x
16x8       9.59x
16x16      10.07x

When tested on multiple test cases, a 1.1% average reduction in encoder time was observed for the speed = 1 preset.

Change-Id: I586db59840646fa6830647996ea4d84c0e6713dc
diff --git a/av1/common/x86/highbd_inv_txfm_avx2.c b/av1/common/x86/highbd_inv_txfm_avx2.c index 2fe5777..4de31b6 100644 --- a/av1/common/x86/highbd_inv_txfm_avx2.c +++ b/av1/common/x86/highbd_inv_txfm_avx2.c
@@ -4322,13 +4322,13 @@ switch (tx_type) { // Assembly version doesn't support some transform types, so use C version // for those. - case H_DCT: - case H_ADST: - case H_FLIPADST: case IDTX: av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd); break; + case H_DCT: + case H_ADST: + case H_FLIPADST: case V_DCT: case V_ADST: case V_FLIPADST: @@ -4413,13 +4413,13 @@ switch (tx_type) { // Assembly version doesn't support some transform types, so use C version // for those. - case H_DCT: - case H_ADST: - case H_FLIPADST: case IDTX: av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd); break; + case H_DCT: + case H_ADST: + case H_FLIPADST: case V_DCT: case V_ADST: case V_FLIPADST: @@ -4482,13 +4482,13 @@ switch (tx_type) { // Assembly version doesn't support some transform types, so use C version // for those. - case H_DCT: - case H_ADST: - case H_FLIPADST: case IDTX: av1_inv_txfm2d_add_16x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); break; + case H_DCT: + case H_ADST: + case H_FLIPADST: case V_DCT: case V_ADST: case V_FLIPADST: @@ -4513,13 +4513,13 @@ switch (tx_type) { // Assembly version doesn't support some transform types, so use C version // for those. - case H_DCT: - case H_ADST: - case H_FLIPADST: case IDTX: av1_inv_txfm2d_add_8x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); break; + case H_DCT: + case H_ADST: + case H_FLIPADST: case V_DCT: case V_ADST: case V_FLIPADST:
diff --git a/av1/common/x86/highbd_inv_txfm_sse4.c b/av1/common/x86/highbd_inv_txfm_sse4.c index f546adb..b38bbe9 100644 --- a/av1/common/x86/highbd_inv_txfm_sse4.c +++ b/av1/common/x86/highbd_inv_txfm_sse4.c
@@ -5148,13 +5148,13 @@ switch (tx_type) { // Assembly version doesn't support some transform types, so use C version // for those. - case H_DCT: - case H_ADST: - case H_FLIPADST: case IDTX: av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd); break; + case H_DCT: + case H_ADST: + case H_FLIPADST: case V_DCT: case V_ADST: case V_FLIPADST: @@ -5178,9 +5178,6 @@ switch (tx_type) { // Assembly version doesn't support some transform types, so use C version // for those. - case H_DCT: - case H_ADST: - case H_FLIPADST: case IDTX: av1_inv_txfm2d_add_16x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); @@ -5202,9 +5199,6 @@ switch (tx_type) { // Assembly version doesn't support some transform types, so use C version // for those. - case H_DCT: - case H_ADST: - case H_FLIPADST: case IDTX: av1_inv_txfm2d_add_8x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); @@ -5226,9 +5220,6 @@ switch (tx_type) { // Assembly version doesn't support some transform types, so use C version // for those. 
- case H_DCT: - case H_ADST: - case H_FLIPADST: case IDTX: av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd); @@ -5464,6 +5455,85 @@ stride, ud_flip, txfm_size_row, bd); } } +static void highbd_inv_txfm2d_add_v_identity_ssse41(const int32_t *input, + uint16_t *output, + int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob, + const int bd) { + __m128i buf1[64]; + int eobx, eoby; + get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob); + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int input_stride = AOMMIN(32, txfm_size_col); + const int buf_size_w_div8 = input_stride >> 2; + const int row_max = AOMMIN(32, txfm_size_row); + const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx]; + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < (row_max >> 2); ++i) { + __m128i buf0[16]; + const int32_t *input_row = input + i * input_stride * 4; + for (int j = 0; j < (buf_size_nonzero_w_div8 << 1); ++j) { + __m128i *buf0_cur = buf0 + j * 4; + load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4); + + TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3], + buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]); + } + if (rect_type == 1 || rect_type == -1) { + av1_round_shift_rect_array_32_sse4_1( + buf0, buf0, (buf_size_nonzero_w_div8 << 3), 0, NewInvSqrt2); + } + row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 
-shift[0]); + + __m128i *_buf1 = buf1 + i * 4; + if (lr_flip) { + for (int j = 0; j < buf_size_w_div8; ++j) { + TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1], + buf0[4 * j], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 0], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 1], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 2], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 3]); + } + } else { + for (int j = 0; j < buf_size_w_div8; ++j) { + TRANSPOSE_4X4( + buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3], + _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1], + _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]); + } + } + } + for (int i = 0; i < buf_size_w_div8; i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, + inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + + av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row, + buf1 + i * txfm_size_row, txfm_size_row, + -shift[1]); + } + + // write to buffer + { + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2, + output + 8 * i, stride, ud_flip, + txfm_size_row, bd); + } + } +} static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, @@ -5802,6 +5872,13 @@ input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob, bd); break; + case H_DCT: + case H_ADST: + case H_FLIPADST: + highbd_inv_txfm2d_add_v_identity_ssse41( + input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob, + bd); + break; default: assert(0); break; } }