Revert "Revert "Remove second transpose from transforms""
This reverts commit fe9647fcd4e899330cb46df5267ff5a82806adfd.
Reason For Revert:
The previously failing asserts were fixed, and the unit tests now pass on ARM.
Bug fixed: aomedia:3360
Change-Id: I397e55b1653a473977e0d77d7fb5fa1d4c353c89
diff --git a/av1/encoder/x86/highbd_fwd_txfm_avx2.c b/av1/encoder/x86/highbd_fwd_txfm_avx2.c
index 1faa412..9cdf21f 100644
--- a/av1/encoder/x86/highbd_fwd_txfm_avx2.c
+++ b/av1/encoder/x86/highbd_fwd_txfm_avx2.c
@@ -561,8 +561,7 @@
fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
fdct8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
width_div8);
- fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
- store_buffer_avx2(in, coeff, 8, 8);
+ store_buffer_avx2(out, coeff, 8, 8);
break;
case ADST_DCT:
load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
@@ -572,8 +571,7 @@
fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
fdct8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
width_div8);
- fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
- store_buffer_avx2(in, coeff, 8, 8);
+ store_buffer_avx2(out, coeff, 8, 8);
break;
case DCT_ADST:
load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
@@ -583,8 +581,7 @@
fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
width_div8);
- fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
- store_buffer_avx2(in, coeff, 8, 8);
+ store_buffer_avx2(out, coeff, 8, 8);
break;
case ADST_ADST:
load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
@@ -594,8 +591,7 @@
fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
width_div8);
- fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
- store_buffer_avx2(in, coeff, 8, 8);
+ store_buffer_avx2(out, coeff, 8, 8);
break;
case FLIPADST_DCT:
load_buffer_8x8_avx2(input, in, stride, 1, 0, shift[0]);
@@ -605,8 +601,7 @@
fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
fdct8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
width_div8);
- fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
- store_buffer_avx2(in, coeff, 8, 8);
+ store_buffer_avx2(out, coeff, 8, 8);
break;
case DCT_FLIPADST:
load_buffer_8x8_avx2(input, in, stride, 0, 1, shift[0]);
@@ -616,8 +611,7 @@
fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
width_div8);
- fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
- store_buffer_avx2(in, coeff, 8, 8);
+ store_buffer_avx2(out, coeff, 8, 8);
break;
case FLIPADST_FLIPADST:
load_buffer_8x8_avx2(input, in, stride, 1, 1, shift[0]);
@@ -627,8 +621,7 @@
fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
width_div8);
- fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
- store_buffer_avx2(in, coeff, 8, 8);
+ store_buffer_avx2(out, coeff, 8, 8);
break;
case ADST_FLIPADST:
load_buffer_8x8_avx2(input, in, stride, 0, 1, shift[0]);
@@ -638,8 +631,7 @@
fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
width_div8);
- fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
- store_buffer_avx2(in, coeff, 8, 8);
+ store_buffer_avx2(out, coeff, 8, 8);
break;
case FLIPADST_ADST:
load_buffer_8x8_avx2(input, in, stride, 1, 0, shift[0]);
@@ -649,26 +641,27 @@
fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
width_div8);
- fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
- store_buffer_avx2(in, coeff, 8, 8);
+ store_buffer_avx2(out, coeff, 8, 8);
break;
case IDTX:
load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
width_div8);
col_txfm_8x8_rounding(out, -shift[1]);
- idtx8_avx2(out, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
width_div8);
- store_buffer_avx2(in, coeff, 8, 8);
+ store_buffer_avx2(out, coeff, 8, 8);
break;
case V_DCT:
load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
width_div8);
col_txfm_8x8_rounding(out, -shift[1]);
- idtx8_avx2(out, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
width_div8);
- store_buffer_avx2(in, coeff, 8, 8);
+ store_buffer_avx2(out, coeff, 8, 8);
break;
case H_DCT:
load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
@@ -678,17 +671,17 @@
fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
width_div8);
- fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
- store_buffer_avx2(in, coeff, 8, 8);
+ store_buffer_avx2(out, coeff, 8, 8);
break;
case V_ADST:
load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
width_div8);
col_txfm_8x8_rounding(out, -shift[1]);
- idtx8_avx2(out, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
width_div8);
- store_buffer_avx2(in, coeff, 8, 8);
+ store_buffer_avx2(out, coeff, 8, 8);
break;
case H_ADST:
load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
@@ -698,17 +691,17 @@
fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
width_div8);
- fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
- store_buffer_avx2(in, coeff, 8, 8);
+ store_buffer_avx2(out, coeff, 8, 8);
break;
case V_FLIPADST:
load_buffer_8x8_avx2(input, in, stride, 1, 0, shift[0]);
fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
width_div8);
col_txfm_8x8_rounding(out, -shift[1]);
- idtx8_avx2(out, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
width_div8);
- store_buffer_avx2(in, coeff, 8, 8);
+ store_buffer_avx2(out, coeff, 8, 8);
break;
case H_FLIPADST:
load_buffer_8x8_avx2(input, in, stride, 0, 1, shift[0]);
@@ -718,8 +711,7 @@
fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
width_div8);
- fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
- store_buffer_avx2(in, coeff, 8, 8);
+ store_buffer_avx2(out, coeff, 8, 8);
break;
default: assert(0);
}
@@ -1333,9 +1325,7 @@
fwd_txfm_transpose_8x8_avx2(out, in, 1, 2);
fwd_txfm_transpose_8x8_avx2(&out[8], &in[1], 1, 2);
row_txfm(in, out, bit, 2, 2);
- fwd_txfm_transpose_8x8_avx2(out, in, 2, 1);
- fwd_txfm_transpose_8x8_avx2(&out[1], &in[8], 2, 1);
- round_shift_rect_array_32_avx2(in, in, 16, -shift[2], NewSqrt2);
+ round_shift_rect_array_32_avx2(out, in, 16, -shift[2], NewSqrt2);
store_buffer_avx2(in, coeff, 8, 16);
(void)bd;
}
@@ -1394,10 +1384,8 @@
fwd_txfm_transpose_8x8_avx2(out, in, 2, 1);
fwd_txfm_transpose_8x8_avx2(&out[1], &in[8], 2, 1);
row_txfm(in, out, bit, 1, 1);
- fwd_txfm_transpose_8x8_avx2(out, in, 1, 2);
- fwd_txfm_transpose_8x8_avx2(&out[8], &in[1], 1, 2);
- round_shift_rect_array_32_avx2(in, in, 16, -shift[2], NewSqrt2);
- store_buffer_avx2(in, coeff, 8, 16);
+ round_shift_rect_array_32_avx2(out, out, 16, -shift[2], NewSqrt2);
+ store_buffer_avx2(out, coeff, 8, 16);
(void)bd;
}
void av1_fwd_txfm2d_16x16_avx2(const int16_t *input, int32_t *coeff, int stride,
@@ -1422,8 +1410,7 @@
fwd_txfm_transpose_16x16_avx2(out, in);
fdct16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
width_div8);
- fwd_txfm_transpose_16x16_avx2(out, in);
- store_buffer_avx2(in, coeff, 8, 32);
+ store_buffer_avx2(out, coeff, 8, 32);
break;
case ADST_DCT:
load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
@@ -1434,8 +1421,7 @@
fwd_txfm_transpose_16x16_avx2(out, in);
fdct16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
width_div8);
- fwd_txfm_transpose_16x16_avx2(out, in);
- store_buffer_avx2(in, coeff, 8, 32);
+ store_buffer_avx2(out, coeff, 8, 32);
break;
case DCT_ADST:
load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
@@ -1446,8 +1432,7 @@
fwd_txfm_transpose_16x16_avx2(out, in);
fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
width_div8);
- fwd_txfm_transpose_16x16_avx2(out, in);
- store_buffer_avx2(in, coeff, 8, 32);
+ store_buffer_avx2(out, coeff, 8, 32);
break;
case ADST_ADST:
load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
@@ -1458,8 +1443,7 @@
fwd_txfm_transpose_16x16_avx2(out, in);
fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
width_div8);
- fwd_txfm_transpose_16x16_avx2(out, in);
- store_buffer_avx2(in, coeff, 8, 32);
+ store_buffer_avx2(out, coeff, 8, 32);
break;
case FLIPADST_DCT:
load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 0);
@@ -1470,8 +1454,7 @@
fwd_txfm_transpose_16x16_avx2(out, in);
fdct16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
width_div8);
- fwd_txfm_transpose_16x16_avx2(out, in);
- store_buffer_avx2(in, coeff, 8, 32);
+ store_buffer_avx2(out, coeff, 8, 32);
break;
case DCT_FLIPADST:
load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 1);
@@ -1482,8 +1465,7 @@
fwd_txfm_transpose_16x16_avx2(out, in);
fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
width_div8);
- fwd_txfm_transpose_16x16_avx2(out, in);
- store_buffer_avx2(in, coeff, 8, 32);
+ store_buffer_avx2(out, coeff, 8, 32);
break;
case FLIPADST_FLIPADST:
load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 1);
@@ -1494,8 +1476,7 @@
fwd_txfm_transpose_16x16_avx2(out, in);
fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
width_div8);
- fwd_txfm_transpose_16x16_avx2(out, in);
- store_buffer_avx2(in, coeff, 8, 32);
+ store_buffer_avx2(out, coeff, 8, 32);
break;
case ADST_FLIPADST:
load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 1);
@@ -1506,8 +1487,7 @@
fwd_txfm_transpose_16x16_avx2(out, in);
fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
width_div8);
- fwd_txfm_transpose_16x16_avx2(out, in);
- store_buffer_avx2(in, coeff, 8, 32);
+ store_buffer_avx2(out, coeff, 8, 32);
break;
case FLIPADST_ADST:
load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 0);
@@ -1518,8 +1498,7 @@
fwd_txfm_transpose_16x16_avx2(out, in);
fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
width_div8);
- fwd_txfm_transpose_16x16_avx2(out, in);
- store_buffer_avx2(in, coeff, 8, 32);
+ store_buffer_avx2(out, coeff, 8, 32);
break;
case IDTX:
load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
@@ -1527,9 +1506,10 @@
idtx16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
width_div8);
round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
- idtx16_avx2(out, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ fwd_txfm_transpose_16x16_avx2(out, in);
+ idtx16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
width_div8);
- store_buffer_avx2(in, coeff, 8, 32);
+ store_buffer_avx2(out, coeff, 8, 32);
break;
case V_DCT:
load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
@@ -1537,9 +1517,10 @@
fdct16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
width_div8);
round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
- idtx16_avx2(out, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ fwd_txfm_transpose_16x16_avx2(out, in);
+ idtx16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
width_div8);
- store_buffer_avx2(in, coeff, 8, 32);
+ store_buffer_avx2(out, coeff, 8, 32);
break;
case H_DCT:
load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
@@ -1550,8 +1531,7 @@
fwd_txfm_transpose_16x16_avx2(out, in);
fdct16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
width_div8);
- fwd_txfm_transpose_16x16_avx2(out, in);
- store_buffer_avx2(in, coeff, 8, 32);
+ store_buffer_avx2(out, coeff, 8, 32);
break;
case V_ADST:
load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
@@ -1559,9 +1539,10 @@
fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
width_div8);
round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
- idtx16_avx2(out, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ fwd_txfm_transpose_16x16_avx2(out, in);
+ idtx16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
width_div8);
- store_buffer_avx2(in, coeff, 8, 32);
+ store_buffer_avx2(out, coeff, 8, 32);
break;
case H_ADST:
load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
@@ -1572,8 +1553,7 @@
fwd_txfm_transpose_16x16_avx2(out, in);
fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
width_div8);
- fwd_txfm_transpose_16x16_avx2(out, in);
- store_buffer_avx2(in, coeff, 8, 32);
+ store_buffer_avx2(out, coeff, 8, 32);
break;
case V_FLIPADST:
load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 0);
@@ -1581,9 +1561,10 @@
fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
width_div8);
round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
- idtx16_avx2(out, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ fwd_txfm_transpose_16x16_avx2(out, in);
+ idtx16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
width_div8);
- store_buffer_avx2(in, coeff, 8, 32);
+ store_buffer_avx2(out, coeff, 8, 32);
break;
case H_FLIPADST:
load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 1);
@@ -1594,8 +1575,7 @@
fwd_txfm_transpose_16x16_avx2(out, in);
fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
width_div8);
- fwd_txfm_transpose_16x16_avx2(out, in);
- store_buffer_avx2(in, coeff, 8, 32);
+ store_buffer_avx2(out, coeff, 8, 32);
break;
default: assert(0);
}
@@ -2091,15 +2071,7 @@
round_shift_32_8xn_avx2(&buf1[(i << 1) + 1], height, shift[2], width_div8);
}
- for (r = 0; r < height; r += 8) {
- for (c = 0; c < width_div8; c++) {
- fwd_txfm_transpose_8x8_avx2(&buf1[r * width_div8 + c],
- &buf0[c * 8 * width_div8 + (r >> 3)],
- width_div8, width_div8);
- }
- }
-
- store_buffer_avx2(buf0, output, 8, 128);
+ store_buffer_avx2(buf1, output, 8, 128);
}
static INLINE void fdct64_stage2_avx2(__m256i *x1, __m256i *x2,
__m256i *cospi_m32, __m256i *cospi_p32,
@@ -3156,12 +3128,5 @@
width_div16);
}
- for (r = 0; r < (height >> 1); r += 8) {
- for (c = 0; c < width_div16; c++) {
- fwd_txfm_transpose_8x8_avx2(&buf0[r * width_div16 + c],
- &buf1[c * 8 * width_div16 + (r >> 3)],
- width_div16, width_div16);
- }
- }
- store_buffer_avx2(buf1, output, 8, 128);
+ store_buffer_avx2(buf0, output, 8, 128);
}