half_btf_avx2: correct fn sig for visual studio

fixes:
formal parameter with __declspec(align('32')) won't be aligned

this is the same change that was made previously for sse4:
5bedd5dc8 idct16x16_sse4_1: correct fn sig for visual studio

Change-Id: Ib520bde439b03f81d5e84a2711ed61215debe862
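
why the signature change works, as a minimal standalone sketch (scale_ok
and the main() driver are hypothetical illustration, not part of the
patch): 32-bit msvc cannot guarantee 32-byte stack alignment for a
by-value __m256i argument, so it rejects such signatures outright with
the quoted error (believed to be error C2719). passing const pointers
instead avoids the aligned by-value copy; the vectors are dereferenced
inside the function, which is exactly what the patch below does to
half_btf_avx2. build with -mavx2 (gcc/clang) or /arch:AVX2 (msvc):

    #include <immintrin.h>
    #include <stdio.h>

    /* rejected by 32-bit visual studio:
     *   formal parameter with __declspec(align('32')) won't be aligned
    static __m256i scale_bad(__m256i v, __m256i w) {
      return _mm256_mullo_epi32(v, w);
    }
    */

    /* portable form: take the vectors through const pointers and
     * dereference inside the function */
    static __m256i scale_ok(const __m256i *v, const __m256i *w) {
      return _mm256_mullo_epi32(*v, *w); /* 32-bit lane-wise multiply */
    }

    int main(void) {
      const __m256i a = _mm256_set1_epi32(3);
      const __m256i b = _mm256_set1_epi32(7);
      __m256i r = scale_ok(&a, &b);
      int out[8];
      _mm256_storeu_si256((__m256i *)out, r);
      printf("%d\n", out[0]); /* prints 21 */
      return 0;
    }
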
diff --git a/av1/common/x86/highbd_inv_txfm_avx2.c b/av1/common/x86/highbd_inv_txfm_avx2.c
index 1ba345a..3f6612b 100644
--- a/av1/common/x86/highbd_inv_txfm_avx2.c
+++ b/av1/common/x86/highbd_inv_txfm_avx2.c
@@ -154,14 +154,15 @@
}
}
-static INLINE __m256i half_btf_avx2(__m256i w0, __m256i n0, __m256i w1,
- __m256i n1, __m256i rounding, int bit) {
+static INLINE __m256i half_btf_avx2(const __m256i *w0, const __m256i *n0,
+ const __m256i *w1, const __m256i *n1,
+ const __m256i *rounding, int bit) {
__m256i x, y;
- x = _mm256_mullo_epi32(w0, n0);
- y = _mm256_mullo_epi32(w1, n1);
+ x = _mm256_mullo_epi32(*w0, *n0);
+ y = _mm256_mullo_epi32(*w1, *n1);
x = _mm256_add_epi32(x, y);
- x = _mm256_add_epi32(x, rounding);
+ x = _mm256_add_epi32(x, *rounding);
x = _mm256_srai_epi32(x, bit);
return x;
}
@@ -275,22 +276,38 @@
bf0[13] = bf1[13];
bf0[14] = bf1[14];
bf0[15] = bf1[15];
- bf0[16] = half_btf_avx2(cospi62, bf1[16], cospim2, bf1[31], rounding, bit);
- bf0[17] = half_btf_avx2(cospi30, bf1[17], cospim34, bf1[30], rounding, bit);
- bf0[18] = half_btf_avx2(cospi46, bf1[18], cospim18, bf1[29], rounding, bit);
- bf0[19] = half_btf_avx2(cospi14, bf1[19], cospim50, bf1[28], rounding, bit);
- bf0[20] = half_btf_avx2(cospi54, bf1[20], cospim10, bf1[27], rounding, bit);
- bf0[21] = half_btf_avx2(cospi22, bf1[21], cospim42, bf1[26], rounding, bit);
- bf0[22] = half_btf_avx2(cospi38, bf1[22], cospim26, bf1[25], rounding, bit);
- bf0[23] = half_btf_avx2(cospi6, bf1[23], cospim58, bf1[24], rounding, bit);
- bf0[24] = half_btf_avx2(cospi58, bf1[23], cospi6, bf1[24], rounding, bit);
- bf0[25] = half_btf_avx2(cospi26, bf1[22], cospi38, bf1[25], rounding, bit);
- bf0[26] = half_btf_avx2(cospi42, bf1[21], cospi22, bf1[26], rounding, bit);
- bf0[27] = half_btf_avx2(cospi10, bf1[20], cospi54, bf1[27], rounding, bit);
- bf0[28] = half_btf_avx2(cospi50, bf1[19], cospi14, bf1[28], rounding, bit);
- bf0[29] = half_btf_avx2(cospi18, bf1[18], cospi46, bf1[29], rounding, bit);
- bf0[30] = half_btf_avx2(cospi34, bf1[17], cospi30, bf1[30], rounding, bit);
- bf0[31] = half_btf_avx2(cospi2, bf1[16], cospi62, bf1[31], rounding, bit);
+ bf0[16] =
+ half_btf_avx2(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit);
+ bf0[17] =
+ half_btf_avx2(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit);
+ bf0[18] =
+ half_btf_avx2(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit);
+ bf0[19] =
+ half_btf_avx2(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit);
+ bf0[20] =
+ half_btf_avx2(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit);
+ bf0[21] =
+ half_btf_avx2(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit);
+ bf0[22] =
+ half_btf_avx2(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit);
+ bf0[23] =
+ half_btf_avx2(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit);
+ bf0[24] =
+ half_btf_avx2(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit);
+ bf0[25] =
+ half_btf_avx2(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit);
+ bf0[26] =
+ half_btf_avx2(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit);
+ bf0[27] =
+ half_btf_avx2(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit);
+ bf0[28] =
+ half_btf_avx2(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit);
+ bf0[29] =
+ half_btf_avx2(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit);
+ bf0[30] =
+ half_btf_avx2(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit);
+ bf0[31] =
+ half_btf_avx2(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit);
// stage 3
bf1[0] = bf0[0];
@@ -301,14 +318,22 @@
bf1[5] = bf0[5];
bf1[6] = bf0[6];
bf1[7] = bf0[7];
- bf1[8] = half_btf_avx2(cospi60, bf0[8], cospim4, bf0[15], rounding, bit);
- bf1[9] = half_btf_avx2(cospi28, bf0[9], cospim36, bf0[14], rounding, bit);
- bf1[10] = half_btf_avx2(cospi44, bf0[10], cospim20, bf0[13], rounding, bit);
- bf1[11] = half_btf_avx2(cospi12, bf0[11], cospim52, bf0[12], rounding, bit);
- bf1[12] = half_btf_avx2(cospi52, bf0[11], cospi12, bf0[12], rounding, bit);
- bf1[13] = half_btf_avx2(cospi20, bf0[10], cospi44, bf0[13], rounding, bit);
- bf1[14] = half_btf_avx2(cospi36, bf0[9], cospi28, bf0[14], rounding, bit);
- bf1[15] = half_btf_avx2(cospi4, bf0[8], cospi60, bf0[15], rounding, bit);
+ bf1[8] =
+ half_btf_avx2(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit);
+ bf1[9] =
+ half_btf_avx2(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit);
+ bf1[10] =
+ half_btf_avx2(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit);
+ bf1[11] =
+ half_btf_avx2(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit);
+ bf1[12] =
+ half_btf_avx2(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit);
+ bf1[13] =
+ half_btf_avx2(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit);
+ bf1[14] =
+ half_btf_avx2(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit);
+ bf1[15] =
+ half_btf_avx2(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit);
bf1[16] = _mm256_add_epi32(bf0[16], bf0[17]);
bf1[17] = _mm256_sub_epi32(bf0[16], bf0[17]);
bf1[18] = _mm256_sub_epi32(bf0[19], bf0[18]);
@@ -331,10 +356,13 @@
bf0[1] = bf1[1];
bf0[2] = bf1[2];
bf0[3] = bf1[3];
- bf0[4] = half_btf_avx2(cospi56, bf1[4], cospim8, bf1[7], rounding, bit);
- bf0[5] = half_btf_avx2(cospi24, bf1[5], cospim40, bf1[6], rounding, bit);
- bf0[6] = half_btf_avx2(cospi40, bf1[5], cospi24, bf1[6], rounding, bit);
- bf0[7] = half_btf_avx2(cospi8, bf1[4], cospi56, bf1[7], rounding, bit);
+ bf0[4] =
+ half_btf_avx2(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit);
+ bf0[5] =
+ half_btf_avx2(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit);
+ bf0[6] =
+ half_btf_avx2(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit);
+ bf0[7] = half_btf_avx2(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit);
bf0[8] = _mm256_add_epi32(bf1[8], bf1[9]);
bf0[9] = _mm256_sub_epi32(bf1[8], bf1[9]);
bf0[10] = _mm256_sub_epi32(bf1[11], bf1[10]);
@@ -344,40 +372,54 @@
bf0[14] = _mm256_sub_epi32(bf1[15], bf1[14]);
bf0[15] = _mm256_add_epi32(bf1[14], bf1[15]);
bf0[16] = bf1[16];
- bf0[17] = half_btf_avx2(cospim8, bf1[17], cospi56, bf1[30], rounding, bit);
- bf0[18] = half_btf_avx2(cospim56, bf1[18], cospim8, bf1[29], rounding, bit);
+ bf0[17] =
+ half_btf_avx2(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit);
+ bf0[18] =
+ half_btf_avx2(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit);
bf0[19] = bf1[19];
bf0[20] = bf1[20];
- bf0[21] = half_btf_avx2(cospim40, bf1[21], cospi24, bf1[26], rounding, bit);
+ bf0[21] =
+ half_btf_avx2(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit);
bf0[22] =
- half_btf_avx2(cospim24, bf1[22], cospim40, bf1[25], rounding, bit);
+ half_btf_avx2(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit);
bf0[23] = bf1[23];
bf0[24] = bf1[24];
- bf0[25] = half_btf_avx2(cospim40, bf1[22], cospi24, bf1[25], rounding, bit);
- bf0[26] = half_btf_avx2(cospi24, bf1[21], cospi40, bf1[26], rounding, bit);
+ bf0[25] =
+ half_btf_avx2(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit);
+ bf0[26] =
+ half_btf_avx2(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit);
bf0[27] = bf1[27];
bf0[28] = bf1[28];
- bf0[29] = half_btf_avx2(cospim8, bf1[18], cospi56, bf1[29], rounding, bit);
- bf0[30] = half_btf_avx2(cospi56, bf1[17], cospi8, bf1[30], rounding, bit);
+ bf0[29] =
+ half_btf_avx2(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit);
+ bf0[30] =
+ half_btf_avx2(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit);
bf0[31] = bf1[31];
// stage 5
- bf1[0] = half_btf_avx2(cospi32, bf0[0], cospi32, bf0[1], rounding, bit);
- bf1[1] = half_btf_avx2(cospi32, bf0[0], cospim32, bf0[1], rounding, bit);
- bf1[2] = half_btf_avx2(cospi48, bf0[2], cospim16, bf0[3], rounding, bit);
- bf1[3] = half_btf_avx2(cospi16, bf0[2], cospi48, bf0[3], rounding, bit);
+ bf1[0] =
+ half_btf_avx2(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit);
+ bf1[1] =
+ half_btf_avx2(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit);
+ bf1[2] =
+ half_btf_avx2(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit);
+ bf1[3] =
+ half_btf_avx2(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit);
bf1[4] = _mm256_add_epi32(bf0[4], bf0[5]);
bf1[5] = _mm256_sub_epi32(bf0[4], bf0[5]);
bf1[6] = _mm256_sub_epi32(bf0[7], bf0[6]);
bf1[7] = _mm256_add_epi32(bf0[6], bf0[7]);
bf1[8] = bf0[8];
- bf1[9] = half_btf_avx2(cospim16, bf0[9], cospi48, bf0[14], rounding, bit);
+ bf1[9] =
+ half_btf_avx2(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit);
bf1[10] =
- half_btf_avx2(cospim48, bf0[10], cospim16, bf0[13], rounding, bit);
+ half_btf_avx2(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit);
bf1[11] = bf0[11];
bf1[12] = bf0[12];
- bf1[13] = half_btf_avx2(cospim16, bf0[10], cospi48, bf0[13], rounding, bit);
- bf1[14] = half_btf_avx2(cospi48, bf0[9], cospi16, bf0[14], rounding, bit);
+ bf1[13] =
+ half_btf_avx2(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit);
+ bf1[14] =
+ half_btf_avx2(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit);
bf1[15] = bf0[15];
bf1[16] = _mm256_add_epi32(bf0[16], bf0[19]);
bf1[17] = _mm256_add_epi32(bf0[17], bf0[18]);
@@ -402,8 +444,10 @@
bf0[2] = _mm256_sub_epi32(bf1[1], bf1[2]);
bf0[3] = _mm256_sub_epi32(bf1[0], bf1[3]);
bf0[4] = bf1[4];
- bf0[5] = half_btf_avx2(cospim32, bf1[5], cospi32, bf1[6], rounding, bit);
- bf0[6] = half_btf_avx2(cospi32, bf1[5], cospi32, bf1[6], rounding, bit);
+ bf0[5] =
+ half_btf_avx2(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
+ bf0[6] =
+ half_btf_avx2(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
bf0[7] = bf1[7];
bf0[8] = _mm256_add_epi32(bf1[8], bf1[11]);
bf0[9] = _mm256_add_epi32(bf1[9], bf1[10]);
@@ -415,20 +459,26 @@
bf0[15] = _mm256_add_epi32(bf1[12], bf1[15]);
bf0[16] = bf1[16];
bf0[17] = bf1[17];
- bf0[18] = half_btf_avx2(cospim16, bf1[18], cospi48, bf1[29], rounding, bit);
- bf0[19] = half_btf_avx2(cospim16, bf1[19], cospi48, bf1[28], rounding, bit);
+ bf0[18] =
+ half_btf_avx2(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit);
+ bf0[19] =
+ half_btf_avx2(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit);
bf0[20] =
- half_btf_avx2(cospim48, bf1[20], cospim16, bf1[27], rounding, bit);
+ half_btf_avx2(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit);
bf0[21] =
- half_btf_avx2(cospim48, bf1[21], cospim16, bf1[26], rounding, bit);
+ half_btf_avx2(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit);
bf0[22] = bf1[22];
bf0[23] = bf1[23];
bf0[24] = bf1[24];
bf0[25] = bf1[25];
- bf0[26] = half_btf_avx2(cospim16, bf1[21], cospi48, bf1[26], rounding, bit);
- bf0[27] = half_btf_avx2(cospim16, bf1[20], cospi48, bf1[27], rounding, bit);
- bf0[28] = half_btf_avx2(cospi48, bf1[19], cospi16, bf1[28], rounding, bit);
- bf0[29] = half_btf_avx2(cospi48, bf1[18], cospi16, bf1[29], rounding, bit);
+ bf0[26] =
+ half_btf_avx2(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit);
+ bf0[27] =
+ half_btf_avx2(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit);
+ bf0[28] =
+ half_btf_avx2(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit);
+ bf0[29] =
+ half_btf_avx2(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit);
bf0[30] = bf1[30];
bf0[31] = bf1[31];
@@ -443,10 +493,14 @@
bf1[7] = _mm256_sub_epi32(bf0[0], bf0[7]);
bf1[8] = bf0[8];
bf1[9] = bf0[9];
- bf1[10] = half_btf_avx2(cospim32, bf0[10], cospi32, bf0[13], rounding, bit);
- bf1[11] = half_btf_avx2(cospim32, bf0[11], cospi32, bf0[12], rounding, bit);
- bf1[12] = half_btf_avx2(cospi32, bf0[11], cospi32, bf0[12], rounding, bit);
- bf1[13] = half_btf_avx2(cospi32, bf0[10], cospi32, bf0[13], rounding, bit);
+ bf1[10] =
+ half_btf_avx2(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
+ bf1[11] =
+ half_btf_avx2(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
+ bf1[12] =
+ half_btf_avx2(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
+ bf1[13] =
+ half_btf_avx2(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
bf1[14] = bf0[14];
bf1[15] = bf0[15];
bf1[16] = _mm256_add_epi32(bf0[16], bf0[23]);
@@ -487,14 +541,22 @@
bf0[17] = bf1[17];
bf0[18] = bf1[18];
bf0[19] = bf1[19];
- bf0[20] = half_btf_avx2(cospim32, bf1[20], cospi32, bf1[27], rounding, bit);
- bf0[21] = half_btf_avx2(cospim32, bf1[21], cospi32, bf1[26], rounding, bit);
- bf0[22] = half_btf_avx2(cospim32, bf1[22], cospi32, bf1[25], rounding, bit);
- bf0[23] = half_btf_avx2(cospim32, bf1[23], cospi32, bf1[24], rounding, bit);
- bf0[24] = half_btf_avx2(cospi32, bf1[23], cospi32, bf1[24], rounding, bit);
- bf0[25] = half_btf_avx2(cospi32, bf1[22], cospi32, bf1[25], rounding, bit);
- bf0[26] = half_btf_avx2(cospi32, bf1[21], cospi32, bf1[26], rounding, bit);
- bf0[27] = half_btf_avx2(cospi32, bf1[20], cospi32, bf1[27], rounding, bit);
+ bf0[20] =
+ half_btf_avx2(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
+ bf0[21] =
+ half_btf_avx2(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
+ bf0[22] =
+ half_btf_avx2(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
+ bf0[23] =
+ half_btf_avx2(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
+ bf0[24] =
+ half_btf_avx2(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
+ bf0[25] =
+ half_btf_avx2(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
+ bf0[26] =
+ half_btf_avx2(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
+ bf0[27] =
+ half_btf_avx2(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
bf0[28] = bf1[28];
bf0[29] = bf1[29];
bf0[30] = bf1[30];