Optimize highbd fwd_txfm module
Added AVX2 variant for 32x32 txfm blk_size.
When tested across multiple test cases, observed a 0.75%
average reduction in encoder time for the speed = 1 preset.
Module-level gains improved by a factor of ~1.8
on average w.r.t. the SSE4_1 module.
Change-Id: I986bcdf82d20f80986bb9497c207875115b3f31a
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 2c7a5a0..0228604 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -234,7 +234,7 @@
add_proto qw/void av1_fwd_txfm2d_16x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
specialize qw/av1_fwd_txfm2d_16x16 sse4_1/;
add_proto qw/void av1_fwd_txfm2d_32x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
- specialize qw/av1_fwd_txfm2d_32x32 sse4_1/;
+ specialize qw/av1_fwd_txfm2d_32x32 sse4_1 avx2/;
add_proto qw/void av1_fwd_txfm2d_64x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
specialize qw/av1_fwd_txfm2d_64x64 sse4_1 avx2/;
diff --git a/av1/encoder/x86/highbd_fwd_txfm_avx2.c b/av1/encoder/x86/highbd_fwd_txfm_avx2.c
index 1bcf472..85fb3be 100644
--- a/av1/encoder/x86/highbd_fwd_txfm_avx2.c
+++ b/av1/encoder/x86/highbd_fwd_txfm_avx2.c
@@ -92,6 +92,20 @@
out += stride;
}
}
+#define btf_32_avx2_type0(w0, w1, in0, in1, out0, out1, bit) \
+ do { \
+ const __m256i ww0 = _mm256_set1_epi32(w0); \
+ const __m256i ww1 = _mm256_set1_epi32(w1); \
+ const __m256i in0_w0 = _mm256_mullo_epi32(in0, ww0); \
+ const __m256i in1_w1 = _mm256_mullo_epi32(in1, ww1); \
+ out0 = _mm256_add_epi32(in0_w0, in1_w1); \
+ av1_round_shift_32_8xn_avx2(&out0, 1, -bit, 1); \
+ const __m256i in0_w1 = _mm256_mullo_epi32(in0, ww1); \
+ const __m256i in1_w0 = _mm256_mullo_epi32(in1, ww0); \
+ out1 = _mm256_sub_epi32(in0_w1, in1_w0); \
+ av1_round_shift_32_8xn_avx2(&out1, 1, -bit, 1); \
+ } while (0)
+
#define btf_32_type0_avx2_new(ww0, ww1, in0, in1, out0, out1, r, bit) \
do { \
const __m256i in0_w0 = _mm256_mullo_epi32(in0, ww0); \
@@ -111,6 +125,519 @@
const int8_t *stage_range, int instride,
int outstride);
+static INLINE void av1_fdct32_avx2(__m256i *input, __m256i *output,
+ const int8_t cos_bit,
+ const int8_t *stage_range,
+ const int instride, const int outstride) {
+ (void)stage_range;
+ __m256i buf0[32];
+ __m256i buf1[32];
+ const int32_t *cospi;
+ int startidx = 0 * instride;
+ int endidx = 31 * instride;
+ // stage 0
+ // stage 1
+ buf1[0] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[31] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[1] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[30] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[2] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[29] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[3] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[28] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[4] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[27] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[5] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[26] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[6] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[25] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[7] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[24] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[8] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[23] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[9] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[22] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[10] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[21] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[11] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[20] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[12] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[19] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[13] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[18] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[14] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[17] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[15] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[16] = _mm256_sub_epi32(input[startidx], input[endidx]);
+
+ // stage 2
+ cospi = cospi_arr(cos_bit);
+ buf0[0] = _mm256_add_epi32(buf1[0], buf1[15]);
+ buf0[15] = _mm256_sub_epi32(buf1[0], buf1[15]);
+ buf0[1] = _mm256_add_epi32(buf1[1], buf1[14]);
+ buf0[14] = _mm256_sub_epi32(buf1[1], buf1[14]);
+ buf0[2] = _mm256_add_epi32(buf1[2], buf1[13]);
+ buf0[13] = _mm256_sub_epi32(buf1[2], buf1[13]);
+ buf0[3] = _mm256_add_epi32(buf1[3], buf1[12]);
+ buf0[12] = _mm256_sub_epi32(buf1[3], buf1[12]);
+ buf0[4] = _mm256_add_epi32(buf1[4], buf1[11]);
+ buf0[11] = _mm256_sub_epi32(buf1[4], buf1[11]);
+ buf0[5] = _mm256_add_epi32(buf1[5], buf1[10]);
+ buf0[10] = _mm256_sub_epi32(buf1[5], buf1[10]);
+ buf0[6] = _mm256_add_epi32(buf1[6], buf1[9]);
+ buf0[9] = _mm256_sub_epi32(buf1[6], buf1[9]);
+ buf0[7] = _mm256_add_epi32(buf1[7], buf1[8]);
+ buf0[8] = _mm256_sub_epi32(buf1[7], buf1[8]);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ buf0[18] = buf1[18];
+ buf0[19] = buf1[19];
+ btf_32_avx2_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20],
+ buf0[27], cos_bit);
+ btf_32_avx2_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21],
+ buf0[26], cos_bit);
+ btf_32_avx2_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22],
+ buf0[25], cos_bit);
+ btf_32_avx2_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23],
+ buf0[24], cos_bit);
+ buf0[28] = buf1[28];
+ buf0[29] = buf1[29];
+ buf0[30] = buf1[30];
+ buf0[31] = buf1[31];
+
+ // stage 3
+ cospi = cospi_arr(cos_bit);
+ buf1[0] = _mm256_add_epi32(buf0[0], buf0[7]);
+ buf1[7] = _mm256_sub_epi32(buf0[0], buf0[7]);
+ buf1[1] = _mm256_add_epi32(buf0[1], buf0[6]);
+ buf1[6] = _mm256_sub_epi32(buf0[1], buf0[6]);
+ buf1[2] = _mm256_add_epi32(buf0[2], buf0[5]);
+ buf1[5] = _mm256_sub_epi32(buf0[2], buf0[5]);
+ buf1[3] = _mm256_add_epi32(buf0[3], buf0[4]);
+ buf1[4] = _mm256_sub_epi32(buf0[3], buf0[4]);
+ buf1[8] = buf0[8];
+ buf1[9] = buf0[9];
+ btf_32_avx2_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10],
+ buf1[13], cos_bit);
+ btf_32_avx2_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11],
+ buf1[12], cos_bit);
+ buf1[14] = buf0[14];
+ buf1[15] = buf0[15];
+ buf1[16] = _mm256_add_epi32(buf0[16], buf0[23]);
+ buf1[23] = _mm256_sub_epi32(buf0[16], buf0[23]);
+ buf1[17] = _mm256_add_epi32(buf0[17], buf0[22]);
+ buf1[22] = _mm256_sub_epi32(buf0[17], buf0[22]);
+ buf1[18] = _mm256_add_epi32(buf0[18], buf0[21]);
+ buf1[21] = _mm256_sub_epi32(buf0[18], buf0[21]);
+ buf1[19] = _mm256_add_epi32(buf0[19], buf0[20]);
+ buf1[20] = _mm256_sub_epi32(buf0[19], buf0[20]);
+ buf1[24] = _mm256_sub_epi32(buf0[31], buf0[24]);
+ buf1[31] = _mm256_add_epi32(buf0[31], buf0[24]);
+ buf1[25] = _mm256_sub_epi32(buf0[30], buf0[25]);
+ buf1[30] = _mm256_add_epi32(buf0[30], buf0[25]);
+ buf1[26] = _mm256_sub_epi32(buf0[29], buf0[26]);
+ buf1[29] = _mm256_add_epi32(buf0[29], buf0[26]);
+ buf1[27] = _mm256_sub_epi32(buf0[28], buf0[27]);
+ buf1[28] = _mm256_add_epi32(buf0[28], buf0[27]);
+
+ // stage 4
+ cospi = cospi_arr(cos_bit);
+ buf0[0] = _mm256_add_epi32(buf1[0], buf1[3]);
+ buf0[3] = _mm256_sub_epi32(buf1[0], buf1[3]);
+ buf0[1] = _mm256_add_epi32(buf1[1], buf1[2]);
+ buf0[2] = _mm256_sub_epi32(buf1[1], buf1[2]);
+ buf0[4] = buf1[4];
+ btf_32_avx2_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], buf0[6],
+ cos_bit);
+ buf0[7] = buf1[7];
+ buf0[8] = _mm256_add_epi32(buf1[8], buf1[11]);
+ buf0[11] = _mm256_sub_epi32(buf1[8], buf1[11]);
+ buf0[9] = _mm256_add_epi32(buf1[9], buf1[10]);
+ buf0[10] = _mm256_sub_epi32(buf1[9], buf1[10]);
+ buf0[12] = _mm256_sub_epi32(buf1[15], buf1[12]);
+ buf0[15] = _mm256_add_epi32(buf1[15], buf1[12]);
+ buf0[13] = _mm256_sub_epi32(buf1[14], buf1[13]);
+ buf0[14] = _mm256_add_epi32(buf1[14], buf1[13]);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ btf_32_avx2_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18],
+ buf0[29], cos_bit);
+ btf_32_avx2_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19],
+ buf0[28], cos_bit);
+ btf_32_avx2_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20],
+ buf0[27], cos_bit);
+ btf_32_avx2_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21],
+ buf0[26], cos_bit);
+ buf0[22] = buf1[22];
+ buf0[23] = buf1[23];
+ buf0[24] = buf1[24];
+ buf0[25] = buf1[25];
+ buf0[30] = buf1[30];
+ buf0[31] = buf1[31];
+
+ // stage 5
+ cospi = cospi_arr(cos_bit);
+ btf_32_avx2_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1],
+ cos_bit);
+ btf_32_avx2_type0(cospi[16], cospi[48], buf0[3], buf0[2], buf1[2], buf1[3],
+ cos_bit);
+ buf1[4] = _mm256_add_epi32(buf0[4], buf0[5]);
+ buf1[5] = _mm256_sub_epi32(buf0[4], buf0[5]);
+ buf1[6] = _mm256_sub_epi32(buf0[7], buf0[6]);
+ buf1[7] = _mm256_add_epi32(buf0[7], buf0[6]);
+ buf1[8] = buf0[8];
+ btf_32_avx2_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9], buf1[14],
+ cos_bit);
+ btf_32_avx2_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10],
+ buf1[13], cos_bit);
+ buf1[11] = buf0[11];
+ buf1[12] = buf0[12];
+ buf1[15] = buf0[15];
+ buf1[16] = _mm256_add_epi32(buf0[16], buf0[19]);
+ buf1[19] = _mm256_sub_epi32(buf0[16], buf0[19]);
+ buf1[17] = _mm256_add_epi32(buf0[17], buf0[18]);
+ buf1[18] = _mm256_sub_epi32(buf0[17], buf0[18]);
+ buf1[20] = _mm256_sub_epi32(buf0[23], buf0[20]);
+ buf1[23] = _mm256_add_epi32(buf0[23], buf0[20]);
+ buf1[21] = _mm256_sub_epi32(buf0[22], buf0[21]);
+ buf1[22] = _mm256_add_epi32(buf0[22], buf0[21]);
+ buf1[24] = _mm256_add_epi32(buf0[24], buf0[27]);
+ buf1[27] = _mm256_sub_epi32(buf0[24], buf0[27]);
+ buf1[25] = _mm256_add_epi32(buf0[25], buf0[26]);
+ buf1[26] = _mm256_sub_epi32(buf0[25], buf0[26]);
+ buf1[28] = _mm256_sub_epi32(buf0[31], buf0[28]);
+ buf1[31] = _mm256_add_epi32(buf0[31], buf0[28]);
+ buf1[29] = _mm256_sub_epi32(buf0[30], buf0[29]);
+ buf1[30] = _mm256_add_epi32(buf0[30], buf0[29]);
+
+ // stage 6
+ cospi = cospi_arr(cos_bit);
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+ btf_32_avx2_type0(cospi[8], cospi[56], buf1[7], buf1[4], buf0[4], buf0[7],
+ cos_bit);
+ btf_32_avx2_type0(cospi[40], cospi[24], buf1[6], buf1[5], buf0[5], buf0[6],
+ cos_bit);
+ buf0[8] = _mm256_add_epi32(buf1[8], buf1[9]);
+ buf0[9] = _mm256_sub_epi32(buf1[8], buf1[9]);
+ buf0[10] = _mm256_sub_epi32(buf1[11], buf1[10]);
+ buf0[11] = _mm256_add_epi32(buf1[11], buf1[10]);
+ buf0[12] = _mm256_add_epi32(buf1[12], buf1[13]);
+ buf0[13] = _mm256_sub_epi32(buf1[12], buf1[13]);
+ buf0[14] = _mm256_sub_epi32(buf1[15], buf1[14]);
+ buf0[15] = _mm256_add_epi32(buf1[15], buf1[14]);
+ buf0[16] = buf1[16];
+ btf_32_avx2_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17],
+ buf0[30], cos_bit);
+ btf_32_avx2_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18],
+ buf0[29], cos_bit);
+ buf0[19] = buf1[19];
+ buf0[20] = buf1[20];
+ btf_32_avx2_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21],
+ buf0[26], cos_bit);
+ btf_32_avx2_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22],
+ buf0[25], cos_bit);
+ buf0[23] = buf1[23];
+ buf0[24] = buf1[24];
+ buf0[27] = buf1[27];
+ buf0[28] = buf1[28];
+ buf0[31] = buf1[31];
+
+ // stage 7
+ cospi = cospi_arr(cos_bit);
+ buf1[0] = buf0[0];
+ buf1[1] = buf0[1];
+ buf1[2] = buf0[2];
+ buf1[3] = buf0[3];
+ buf1[4] = buf0[4];
+ buf1[5] = buf0[5];
+ buf1[6] = buf0[6];
+ buf1[7] = buf0[7];
+ btf_32_avx2_type0(cospi[4], cospi[60], buf0[15], buf0[8], buf1[8], buf1[15],
+ cos_bit);
+ btf_32_avx2_type0(cospi[36], cospi[28], buf0[14], buf0[9], buf1[9], buf1[14],
+ cos_bit);
+ btf_32_avx2_type0(cospi[20], cospi[44], buf0[13], buf0[10], buf1[10],
+ buf1[13], cos_bit);
+ btf_32_avx2_type0(cospi[52], cospi[12], buf0[12], buf0[11], buf1[11],
+ buf1[12], cos_bit);
+ buf1[16] = _mm256_add_epi32(buf0[16], buf0[17]);
+ buf1[17] = _mm256_sub_epi32(buf0[16], buf0[17]);
+ buf1[18] = _mm256_sub_epi32(buf0[19], buf0[18]);
+ buf1[19] = _mm256_add_epi32(buf0[19], buf0[18]);
+ buf1[20] = _mm256_add_epi32(buf0[20], buf0[21]);
+ buf1[21] = _mm256_sub_epi32(buf0[20], buf0[21]);
+ buf1[22] = _mm256_sub_epi32(buf0[23], buf0[22]);
+ buf1[23] = _mm256_add_epi32(buf0[23], buf0[22]);
+ buf1[24] = _mm256_add_epi32(buf0[24], buf0[25]);
+ buf1[25] = _mm256_sub_epi32(buf0[24], buf0[25]);
+ buf1[26] = _mm256_sub_epi32(buf0[27], buf0[26]);
+ buf1[27] = _mm256_add_epi32(buf0[27], buf0[26]);
+ buf1[28] = _mm256_add_epi32(buf0[28], buf0[29]);
+ buf1[29] = _mm256_sub_epi32(buf0[28], buf0[29]);
+ buf1[30] = _mm256_sub_epi32(buf0[31], buf0[30]);
+ buf1[31] = _mm256_add_epi32(buf0[31], buf0[30]);
+
+ // stage 8
+ cospi = cospi_arr(cos_bit);
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+ buf0[4] = buf1[4];
+ buf0[5] = buf1[5];
+ buf0[6] = buf1[6];
+ buf0[7] = buf1[7];
+ buf0[8] = buf1[8];
+ buf0[9] = buf1[9];
+ buf0[10] = buf1[10];
+ buf0[11] = buf1[11];
+ buf0[12] = buf1[12];
+ buf0[13] = buf1[13];
+ buf0[14] = buf1[14];
+ buf0[15] = buf1[15];
+ btf_32_avx2_type0(cospi[2], cospi[62], buf1[31], buf1[16], buf0[16], buf0[31],
+ cos_bit);
+ btf_32_avx2_type0(cospi[34], cospi[30], buf1[30], buf1[17], buf0[17],
+ buf0[30], cos_bit);
+ btf_32_avx2_type0(cospi[18], cospi[46], buf1[29], buf1[18], buf0[18],
+ buf0[29], cos_bit);
+ btf_32_avx2_type0(cospi[50], cospi[14], buf1[28], buf1[19], buf0[19],
+ buf0[28], cos_bit);
+ btf_32_avx2_type0(cospi[10], cospi[54], buf1[27], buf1[20], buf0[20],
+ buf0[27], cos_bit);
+ btf_32_avx2_type0(cospi[42], cospi[22], buf1[26], buf1[21], buf0[21],
+ buf0[26], cos_bit);
+ btf_32_avx2_type0(cospi[26], cospi[38], buf1[25], buf1[22], buf0[22],
+ buf0[25], cos_bit);
+ btf_32_avx2_type0(cospi[58], cospi[6], buf1[24], buf1[23], buf0[23], buf0[24],
+ cos_bit);
+
+ startidx = 0 * outstride;
+ endidx = 31 * outstride;
+ // stage 9
+ output[startidx] = buf0[0];
+ output[endidx] = buf0[31];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[16];
+ output[endidx] = buf0[15];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[8];
+ output[endidx] = buf0[23];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[24];
+ output[endidx] = buf0[7];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[4];
+ output[endidx] = buf0[27];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[20];
+ output[endidx] = buf0[11];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[12];
+ output[endidx] = buf0[19];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[28];
+ output[endidx] = buf0[3];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[2];
+ output[endidx] = buf0[29];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[18];
+ output[endidx] = buf0[13];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[10];
+ output[endidx] = buf0[21];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[26];
+ output[endidx] = buf0[5];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[6];
+ output[endidx] = buf0[25];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[22];
+ output[endidx] = buf0[9];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[14];
+ output[endidx] = buf0[17];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[30];
+ output[endidx] = buf0[1];
+}
+static INLINE void idtx32x32_avx2(__m256i *input, __m256i *output,
+ const int8_t cos_bit,
+ const int8_t *stage_range, int instride,
+ int outstride) {
+ (void)stage_range;
+ (void)cos_bit;
+ for (int i = 0; i < 32; i += 8) {
+ output[i * outstride] = _mm256_slli_epi32(input[i * instride], 2);
+ output[(i + 1) * outstride] =
+ _mm256_slli_epi32(input[(i + 1) * instride], 2);
+ output[(i + 2) * outstride] =
+ _mm256_slli_epi32(input[(i + 2) * instride], 2);
+ output[(i + 3) * outstride] =
+ _mm256_slli_epi32(input[(i + 3) * instride], 2);
+ output[(i + 4) * outstride] =
+ _mm256_slli_epi32(input[(i + 4) * instride], 2);
+ output[(i + 5) * outstride] =
+ _mm256_slli_epi32(input[(i + 5) * instride], 2);
+ output[(i + 6) * outstride] =
+ _mm256_slli_epi32(input[(i + 6) * instride], 2);
+ output[(i + 7) * outstride] =
+ _mm256_slli_epi32(input[(i + 7) * instride], 2);
+ }
+}
+static const transform_1d_avx2 col_txfm8x32_arr[TX_TYPES] = {
+ av1_fdct32_avx2, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ idtx32x32_avx2, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+static const transform_1d_avx2 row_txfm8x32_arr[TX_TYPES] = {
+ av1_fdct32_avx2, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ idtx32x32_avx2, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+void av1_fwd_txfm2d_32x32_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m256i buf0[128], buf1[128];
+
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_32X32, &cfg);
+ TXFM_2D_FLIP_CFG *cfg1 = &cfg;
+ const int tx_size = cfg1->tx_size;
+ const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int8_t *stage_range_col = cfg1->stage_range_col;
+ const int8_t *stage_range_row = cfg1->stage_range_row;
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[cfg1->tx_size];
+ const int height = tx_size_high[cfg1->tx_size];
+ const transform_1d_avx2 col_txfm = col_txfm8x32_arr[tx_type];
+ const transform_1d_avx2 row_txfm = row_txfm8x32_arr[tx_type];
+ int r, c;
+ const int width_div16 = (width >> 4);
+ const int width_div8 = (width >> 3);
+
+ for (int i = 0; i < width_div16; i++) {
+ av1_load_buffer_16xn_avx2(input + (i << 4), &buf0[(i << 1)], stride, height,
+ width_div8);
+ av1_round_shift_32_8xn_avx2(&buf0[(i << 1)], height, shift[0], width_div8);
+ av1_round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[0],
+ width_div8);
+ col_txfm(&buf0[(i << 1)], &buf0[(i << 1)], cos_bit_col, stage_range_col,
+ width_div8, width_div8);
+ col_txfm(&buf0[(i << 1) + 1], &buf0[(i << 1) + 1], cos_bit_col,
+ stage_range_col, width_div8, width_div8);
+ av1_round_shift_32_8xn_avx2(&buf0[(i << 1)], height, shift[1], width_div8);
+ av1_round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[1],
+ width_div8);
+ }
+
+ for (r = 0; r < height; r += 8) {
+ for (c = 0; c < width_div8; c++) {
+ av1_fwd_txfm_transpose_8x8_avx2(&buf0[r * width_div8 + c],
+ &buf1[c * 8 * width_div8 + (r >> 3)],
+ width_div8, width_div8);
+ }
+ }
+
+ for (int i = 0; i < width_div16; i++) {
+ row_txfm(&buf1[(i << 1)], &buf1[(i << 1)], cos_bit_row, stage_range_row,
+ width_div8, width_div8);
+ row_txfm(&buf1[(i << 1) + 1], &buf1[(i << 1) + 1], cos_bit_row,
+ stage_range_row, width_div8, width_div8);
+ av1_round_shift_32_8xn_avx2(&buf1[(i << 1)], height, shift[2], width_div8);
+ av1_round_shift_32_8xn_avx2(&buf1[(i << 1) + 1], height, shift[2],
+ width_div8);
+ }
+
+ for (r = 0; r < height; r += 8) {
+ for (c = 0; c < width_div8; c++) {
+ av1_fwd_txfm_transpose_8x8_avx2(&buf1[r * width_div8 + c],
+ &buf0[c * 8 * width_div8 + (r >> 3)],
+ width_div8, width_div8);
+ }
+ }
+
+ av1_store_buffer_avx2(buf0, output, 8, 128);
+}
static INLINE void av1_fdct64_stage2_avx2(__m256i *x1, __m256i *x2,
__m256i *cospi_m32,
__m256i *cospi_p32,
diff --git a/test/av1_fwd_txfm2d_test.cc b/test/av1_fwd_txfm2d_test.cc
index ba75b02..285983d 100644
--- a/test/av1_fwd_txfm2d_test.cc
+++ b/test/av1_fwd_txfm2d_test.cc
@@ -571,7 +571,7 @@
Values(av1_highbd_fwd_txfm)));
#endif // HAVE_SSE4_1
#if HAVE_AVX2
-static TX_SIZE Highbd_fwd_txfm_for_avx2[] = { TX_64X64 };
+static TX_SIZE Highbd_fwd_txfm_for_avx2[] = { TX_32X32, TX_64X64 };
INSTANTIATE_TEST_CASE_P(AVX2, AV1HighbdFwdTxfm2dTest,
Combine(ValuesIn(Highbd_fwd_txfm_for_avx2),