Skip inv_txfm of all-zero blocks based on eob value

When looping over the 16x16 blocks within a larger block,
we may only need to process the top-left corner. The eob
value tells us how many blocks need to be processed; all
the remaining blocks are treated as all-zero.

Summary of the partial inv_txfm based on eob for avx2.

https://aomedia-review.googlesource.com/c/aom/+/52941
https://aomedia-review.googlesource.com/c/aom/+/52961
https://aomedia-review.googlesource.com/c/aom/+/53041
https://aomedia-review.googlesource.com/c/aom/+/53061
https://aomedia-review.googlesource.com/c/aom/+/53101

For the decoder, profiling results show that the CPU time
of inv txfm drops from 4.34% to 3.27%.
Base Commit ID: f9db0e090
Sequence: 150 frames of crew_720p30
Config: Default (-DCONFIG_LOWBITDEPTH=1)

Change-Id: I1f73d02cae38802d00b7ba45a7d1fa97dfe63efe
diff --git a/av1/common/x86/av1_inv_txfm_avx2.c b/av1/common/x86/av1_inv_txfm_avx2.c
index 4dbbfe9..ed763c3 100644
--- a/av1/common/x86/av1_inv_txfm_avx2.c
+++ b/av1/common/x86/av1_inv_txfm_avx2.c
@@ -1642,7 +1642,8 @@
   const int txfm_size_col = tx_size_wide[tx_size];
   const int txfm_size_row = tx_size_high[tx_size];
   const int buf_size_w_div16 = txfm_size_col >> 4;
-  const int buf_size_h = AOMMIN(32, txfm_size_row);
+  const int buf_size_nonzero_w_div16 = (eobx + 16) >> 4;
+  const int buf_size_nonzero_h_div16 = (eoby + 16) >> 4;
   const int input_stride = AOMMIN(32, txfm_size_col);
   const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
 
@@ -1657,10 +1658,10 @@
   assert(row_txfm != NULL);
   int ud_flip, lr_flip;
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-  for (int i = 0; i < buf_size_h; i += 16) {
+  for (int i = 0; i < buf_size_nonzero_h_div16; i++) {
     __m256i buf0[64];
-    const int32_t *input_row = input + i * input_stride;
-    for (int j = 0; j < AOMMIN(2, buf_size_w_div16); ++j) {
+    const int32_t *input_row = input + (i << 4) * input_stride;
+    for (int j = 0; j < buf_size_nonzero_w_div16; ++j) {
       __m256i *buf0_cur = buf0 + j * 16;
       const int32_t *input_cur = input_row + j * 16;
       load_buffer_32bit_to_16bit_w16_avx2(input_cur, input_stride, buf0_cur,
@@ -1673,7 +1674,7 @@
     row_txfm(buf0, buf0, cos_bit_row);
     round_shift_16bit_w16_avx2(buf0, txfm_size_col, shift[0]);
 
-    __m256i *buf1_cur = buf1 + i;
+    __m256i *buf1_cur = buf1 + (i << 4);
     if (lr_flip) {
       for (int j = 0; j < buf_size_w_div16; ++j) {
         __m256i temp[16];
@@ -1796,8 +1797,8 @@
   const int txfm_size_col = tx_size_wide[tx_size];
   const int txfm_size_row = tx_size_high[tx_size];
   const int txfm_size_col_notzero = AOMMIN(32, txfm_size_col);
-  const int txfm_size_row_notzero = AOMMIN(32, txfm_size_row);
   const int input_stride = txfm_size_col_notzero;
+  const int buf_size_w_div16 = (eobx + 16) >> 4;
   const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
 
   const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
@@ -1808,17 +1809,17 @@
 
   int ud_flip, lr_flip;
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-  for (int i = 0; i < txfm_size_col_notzero; i += 16) {
+  for (int i = 0; i < buf_size_w_div16; i++) {
     __m256i buf0[64];
-    iidentity_row_16xn_avx2(buf0, input + i, input_stride, shift[0],
-                            txfm_size_row_notzero, txw_idx, rect_type);
+    iidentity_row_16xn_avx2(buf0, input + (i << 4), input_stride, shift[0],
+                            eoby + 1, txw_idx, rect_type);
     col_txfm(buf0, buf0, cos_bit_col);
     __m256i mshift = _mm256_set1_epi16(1 << (15 + shift[1]));
     int k = ud_flip ? (txfm_size_row - 1) : 0;
     const int step = ud_flip ? -1 : 1;
     for (int j = 0; j < txfm_size_row; ++j, k += step) {
       __m256i res = _mm256_mulhrs_epi16(buf0[k], mshift);
-      write_recon_w16_avx2(res, output + i + j * stride);
+      write_recon_w16_avx2(res, output + (i << 4) + j * stride);
     }
   }
 }
@@ -1836,7 +1837,7 @@
   const int txfm_size_col = tx_size_wide[tx_size];
   const int txfm_size_row = tx_size_high[tx_size];
   const int buf_size_w_div16 = txfm_size_col >> 4;
-  const int buf_size_h_div16 = AOMMIN(32, txfm_size_row) >> 4;
+  const int buf_size_h_div16 = (eoby + 16) >> 4;
   const int input_stride = AOMMIN(32, txfm_size_col);
   const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);