Optimize highbd inv_txfm modules
Added SSE4_1 variants for the vertical identity txfm types (H_DCT, H_ADST and H_FLIPADST).
Module level gains:
Tx_size Gain w.r.t. C
8x8 6.65x
8x16 6.15x
16x8 9.59x
16x16 10.07x
Across multiple test cases, an average reduction of 1.1% in
encoder time was observed for the speed = 1 preset.
Change-Id: I586db59840646fa6830647996ea4d84c0e6713dc
diff --git a/av1/common/x86/highbd_inv_txfm_avx2.c b/av1/common/x86/highbd_inv_txfm_avx2.c
index 2fe5777..4de31b6 100644
--- a/av1/common/x86/highbd_inv_txfm_avx2.c
+++ b/av1/common/x86/highbd_inv_txfm_avx2.c
@@ -4322,13 +4322,13 @@
switch (tx_type) {
// Assembly version doesn't support some transform types, so use C version
// for those.
- case H_DCT:
- case H_ADST:
- case H_FLIPADST:
case IDTX:
av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
tx_type, bd);
break;
+ case H_DCT:
+ case H_ADST:
+ case H_FLIPADST:
case V_DCT:
case V_ADST:
case V_FLIPADST:
@@ -4413,13 +4413,13 @@
switch (tx_type) {
// Assembly version doesn't support some transform types, so use C version
// for those.
- case H_DCT:
- case H_ADST:
- case H_FLIPADST:
case IDTX:
av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
bd);
break;
+ case H_DCT:
+ case H_ADST:
+ case H_FLIPADST:
case V_DCT:
case V_ADST:
case V_FLIPADST:
@@ -4482,13 +4482,13 @@
switch (tx_type) {
// Assembly version doesn't support some transform types, so use C version
// for those.
- case H_DCT:
- case H_ADST:
- case H_FLIPADST:
case IDTX:
av1_inv_txfm2d_add_16x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
txfm_param->tx_type, txfm_param->bd);
break;
+ case H_DCT:
+ case H_ADST:
+ case H_FLIPADST:
case V_DCT:
case V_ADST:
case V_FLIPADST:
@@ -4513,13 +4513,13 @@
switch (tx_type) {
// Assembly version doesn't support some transform types, so use C version
// for those.
- case H_DCT:
- case H_ADST:
- case H_FLIPADST:
case IDTX:
av1_inv_txfm2d_add_8x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
txfm_param->tx_type, txfm_param->bd);
break;
+ case H_DCT:
+ case H_ADST:
+ case H_FLIPADST:
case V_DCT:
case V_ADST:
case V_FLIPADST:
diff --git a/av1/common/x86/highbd_inv_txfm_sse4.c b/av1/common/x86/highbd_inv_txfm_sse4.c
index f546adb..b38bbe9 100644
--- a/av1/common/x86/highbd_inv_txfm_sse4.c
+++ b/av1/common/x86/highbd_inv_txfm_sse4.c
@@ -5148,13 +5148,13 @@
switch (tx_type) {
// Assembly version doesn't support some transform types, so use C version
// for those.
- case H_DCT:
- case H_ADST:
- case H_FLIPADST:
case IDTX:
av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
bd);
break;
+ case H_DCT:
+ case H_ADST:
+ case H_FLIPADST:
case V_DCT:
case V_ADST:
case V_FLIPADST:
@@ -5178,9 +5178,6 @@
switch (tx_type) {
// Assembly version doesn't support some transform types, so use C version
// for those.
- case H_DCT:
- case H_ADST:
- case H_FLIPADST:
case IDTX:
av1_inv_txfm2d_add_16x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
txfm_param->tx_type, txfm_param->bd);
@@ -5202,9 +5199,6 @@
switch (tx_type) {
// Assembly version doesn't support some transform types, so use C version
// for those.
- case H_DCT:
- case H_ADST:
- case H_FLIPADST:
case IDTX:
av1_inv_txfm2d_add_8x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
txfm_param->tx_type, txfm_param->bd);
@@ -5226,9 +5220,6 @@
switch (tx_type) {
// Assembly version doesn't support some transform types, so use C version
// for those.
- case H_DCT:
- case H_ADST:
- case H_FLIPADST:
case IDTX:
av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
tx_type, bd);
@@ -5464,6 +5455,85 @@
stride, ud_flip, txfm_size_row, bd);
}
}
+static void highbd_inv_txfm2d_add_v_identity_ssse41(const int32_t *input,
+ uint16_t *output,
+ int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob,
+ const int bd) {
+ __m128i buf1[64];  // Intermediate buffer shared by the row and column passes.
+ int eobx, eoby;  // Filled by the scan helper below; only eobx is used here.
+ get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int input_stride = AOMMIN(32, txfm_size_col);
+ const int buf_size_w_div8 = input_stride >> 2;  // i.e. / 4, not / 8
+ const int row_max = AOMMIN(32, txfm_size_row);
+ const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const transform_1d_sse4_1 row_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx];
+ const transform_1d_sse4_1 col_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];  // fixed index 0 (row uses fun_idx)
+ int ud_flip, lr_flip;  // Filled in by get_flip_cfg() below.
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ // Row pass: each iteration starts 4 * input_stride further into input.
+ for (int i = 0; i < (row_max >> 2); ++i) {
+ __m128i buf0[16];  // Scratch for one iteration of the row pass.
+ const int32_t *input_row = input + i * input_stride * 4;
+ for (int j = 0; j < (buf_size_nonzero_w_div8 << 1); ++j) {
+ __m128i *buf0_cur = buf0 + j * 4;
+ load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);
+ // Transpose the four loaded vectors in place.
+ TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
+ buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
+ }
+ if (rect_type == 1 || rect_type == -1) {  // presumably 2:1 or 1:2 blocks
+ av1_round_shift_rect_array_32_sse4_1(
+ buf0, buf0, (buf_size_nonzero_w_div8 << 3), 0, NewInvSqrt2);
+ }
+ row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+ // Transpose 4x4 tiles of buf0 into buf1; lr_flip reverses their order.
+ __m128i *_buf1 = buf1 + i * 4;
+ if (lr_flip) {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
+ buf0[4 * j],
+ _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 0],
+ _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 1],
+ _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 2],
+ _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 3]);
+ }
+ } else {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ TRANSPOSE_4X4(
+ buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
+ _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
+ _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
+ }
+ }
+ }
+ for (int i = 0; i < buf_size_w_div8; i++) {  // column pass on buf1
+ col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
+ inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ // Round-shift the column result in place by -shift[1].
+ av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
+ buf1 + i * txfm_size_row, txfm_size_row,
+ -shift[1]);
+ }
+
+ // Write buf1 to output, one call per 8-wide step (output + 8 * i).
+ {
+ for (int i = 0; i < (txfm_size_col >> 3); i++) {
+ highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2,
+ output + 8 * i, stride, ud_flip,
+ txfm_size_row, bd);
+ }
+ }
+}
static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input,
uint16_t *output,
int stride, TX_TYPE tx_type,
@@ -5802,6 +5872,13 @@
input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
bd);
break;
+ case H_DCT:
+ case H_ADST:
+ case H_FLIPADST:
+ highbd_inv_txfm2d_add_v_identity_ssse41(
+ input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
+ bd);
+ break;
default: assert(0); break;
}
}