Skip inv_txfm of all-zero blocks based on the eob value
When looping through each 16x16 sub-block within a larger block,
only the top-left region may need processing. From the eob value
we can determine how many sub-blocks actually need to be
processed, and treat all remaining sub-blocks as zero-valued.
Summary of the partial inv_txfm based on eob for avx2.
https://aomedia-review.googlesource.com/c/aom/+/52941
https://aomedia-review.googlesource.com/c/aom/+/52961
https://aomedia-review.googlesource.com/c/aom/+/53041
https://aomedia-review.googlesource.com/c/aom/+/53061
https://aomedia-review.googlesource.com/c/aom/+/53101
For the decoder, profiling results show that the cpu time
of inv txfm drops from 4.34% to 3.27%.
Base Commit ID: f9db0e090
Sequence: 150 frames of crew_720p30
Config: Default (-DCONFIG_LOWBITDEPTH=1)
Change-Id: I1f73d02cae38802d00b7ba45a7d1fa97dfe63efe
diff --git a/av1/common/x86/av1_inv_txfm_avx2.c b/av1/common/x86/av1_inv_txfm_avx2.c
index 4dbbfe9..ed763c3 100644
--- a/av1/common/x86/av1_inv_txfm_avx2.c
+++ b/av1/common/x86/av1_inv_txfm_avx2.c
@@ -1642,7 +1642,8 @@
const int txfm_size_col = tx_size_wide[tx_size];
const int txfm_size_row = tx_size_high[tx_size];
const int buf_size_w_div16 = txfm_size_col >> 4;
- const int buf_size_h = AOMMIN(32, txfm_size_row);
+ const int buf_size_nonzero_w_div16 = (eobx + 16) >> 4;
+ const int buf_size_nonzero_h_div16 = (eoby + 16) >> 4;
const int input_stride = AOMMIN(32, txfm_size_col);
const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
@@ -1657,10 +1658,10 @@
assert(row_txfm != NULL);
int ud_flip, lr_flip;
get_flip_cfg(tx_type, &ud_flip, &lr_flip);
- for (int i = 0; i < buf_size_h; i += 16) {
+ for (int i = 0; i < buf_size_nonzero_h_div16; i++) {
__m256i buf0[64];
- const int32_t *input_row = input + i * input_stride;
- for (int j = 0; j < AOMMIN(2, buf_size_w_div16); ++j) {
+ const int32_t *input_row = input + (i << 4) * input_stride;
+ for (int j = 0; j < buf_size_nonzero_w_div16; ++j) {
__m256i *buf0_cur = buf0 + j * 16;
const int32_t *input_cur = input_row + j * 16;
load_buffer_32bit_to_16bit_w16_avx2(input_cur, input_stride, buf0_cur,
@@ -1673,7 +1674,7 @@
row_txfm(buf0, buf0, cos_bit_row);
round_shift_16bit_w16_avx2(buf0, txfm_size_col, shift[0]);
- __m256i *buf1_cur = buf1 + i;
+ __m256i *buf1_cur = buf1 + (i << 4);
if (lr_flip) {
for (int j = 0; j < buf_size_w_div16; ++j) {
__m256i temp[16];
@@ -1796,8 +1797,8 @@
const int txfm_size_col = tx_size_wide[tx_size];
const int txfm_size_row = tx_size_high[tx_size];
const int txfm_size_col_notzero = AOMMIN(32, txfm_size_col);
- const int txfm_size_row_notzero = AOMMIN(32, txfm_size_row);
const int input_stride = txfm_size_col_notzero;
+ const int buf_size_w_div16 = (eobx + 16) >> 4;
const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
@@ -1808,17 +1809,17 @@
int ud_flip, lr_flip;
get_flip_cfg(tx_type, &ud_flip, &lr_flip);
- for (int i = 0; i < txfm_size_col_notzero; i += 16) {
+ for (int i = 0; i < buf_size_w_div16; i++) {
__m256i buf0[64];
- iidentity_row_16xn_avx2(buf0, input + i, input_stride, shift[0],
- txfm_size_row_notzero, txw_idx, rect_type);
+ iidentity_row_16xn_avx2(buf0, input + (i << 4), input_stride, shift[0],
+ eoby + 1, txw_idx, rect_type);
col_txfm(buf0, buf0, cos_bit_col);
__m256i mshift = _mm256_set1_epi16(1 << (15 + shift[1]));
int k = ud_flip ? (txfm_size_row - 1) : 0;
const int step = ud_flip ? -1 : 1;
for (int j = 0; j < txfm_size_row; ++j, k += step) {
__m256i res = _mm256_mulhrs_epi16(buf0[k], mshift);
- write_recon_w16_avx2(res, output + i + j * stride);
+ write_recon_w16_avx2(res, output + (i << 4) + j * stride);
}
}
}
@@ -1836,7 +1837,7 @@
const int txfm_size_col = tx_size_wide[tx_size];
const int txfm_size_row = tx_size_high[tx_size];
const int buf_size_w_div16 = txfm_size_col >> 4;
- const int buf_size_h_div16 = AOMMIN(32, txfm_size_row) >> 4;
+ const int buf_size_h_div16 = (eoby + 16) >> 4;
const int input_stride = AOMMIN(32, txfm_size_col);
const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);