Simplify highbd txfm modules
Restructure the code to avoid the multiplications
involved in index calculation. Running indices are now updated
by addition/subtraction instead of being recomputed by multiplication.
For example, if a = 31 * stride, then 30 * stride = a - stride.
Change-Id: I0b173d7b1d8f07e90d81802faa9732da7fd2c506
diff --git a/av1/common/x86/highbd_inv_txfm_sse4.c b/av1/common/x86/highbd_inv_txfm_sse4.c
index 41afae0..12c6350 100644
--- a/av1/common/x86/highbd_inv_txfm_sse4.c
+++ b/av1/common/x86/highbd_inv_txfm_sse4.c
@@ -3182,69 +3182,69 @@
}
out[0] = x;
- out[63] = x;
out[1] = x;
- out[62] = x;
out[2] = x;
- out[61] = x;
out[3] = x;
- out[60] = x;
out[4] = x;
- out[59] = x;
out[5] = x;
- out[58] = x;
out[6] = x;
- out[57] = x;
out[7] = x;
- out[56] = x;
out[8] = x;
- out[55] = x;
out[9] = x;
- out[54] = x;
out[10] = x;
- out[53] = x;
out[11] = x;
- out[52] = x;
out[12] = x;
- out[51] = x;
out[13] = x;
- out[50] = x;
out[14] = x;
- out[49] = x;
out[15] = x;
- out[48] = x;
out[16] = x;
- out[47] = x;
out[17] = x;
- out[46] = x;
out[18] = x;
- out[45] = x;
out[19] = x;
- out[44] = x;
out[20] = x;
- out[43] = x;
out[21] = x;
- out[42] = x;
out[22] = x;
- out[41] = x;
out[23] = x;
- out[40] = x;
out[24] = x;
- out[39] = x;
out[25] = x;
- out[38] = x;
out[26] = x;
- out[37] = x;
out[27] = x;
- out[36] = x;
out[28] = x;
- out[35] = x;
out[29] = x;
- out[34] = x;
out[30] = x;
- out[33] = x;
out[31] = x;
out[32] = x;
+ out[33] = x;
+ out[34] = x;
+ out[35] = x;
+ out[36] = x;
+ out[37] = x;
+ out[38] = x;
+ out[39] = x;
+ out[40] = x;
+ out[41] = x;
+ out[42] = x;
+ out[43] = x;
+ out[44] = x;
+ out[45] = x;
+ out[46] = x;
+ out[47] = x;
+ out[48] = x;
+ out[49] = x;
+ out[50] = x;
+ out[51] = x;
+ out[52] = x;
+ out[53] = x;
+ out[54] = x;
+ out[55] = x;
+ out[56] = x;
+ out[57] = x;
+ out[58] = x;
+ out[59] = x;
+ out[60] = x;
+ out[61] = x;
+ out[62] = x;
+ out[63] = x;
}
}
diff --git a/av1/encoder/x86/av1_fwd_txfm1d_sse4.c b/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
index faa19b7..5da723f 100644
--- a/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
+++ b/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
@@ -16,40 +16,73 @@
__m128i buf0[32];
__m128i buf1[32];
const int32_t *cospi;
+
+ int startidx = 0 * stride;
+ int endidx = 31 * stride;
// stage 0
// stage 1
- buf1[0] = _mm_add_epi32(input[0 * stride], input[31 * stride]);
- buf1[31] = _mm_sub_epi32(input[0 * stride], input[31 * stride]);
- buf1[1] = _mm_add_epi32(input[1 * stride], input[30 * stride]);
- buf1[30] = _mm_sub_epi32(input[1 * stride], input[30 * stride]);
- buf1[2] = _mm_add_epi32(input[2 * stride], input[29 * stride]);
- buf1[29] = _mm_sub_epi32(input[2 * stride], input[29 * stride]);
- buf1[3] = _mm_add_epi32(input[3 * stride], input[28 * stride]);
- buf1[28] = _mm_sub_epi32(input[3 * stride], input[28 * stride]);
- buf1[4] = _mm_add_epi32(input[4 * stride], input[27 * stride]);
- buf1[27] = _mm_sub_epi32(input[4 * stride], input[27 * stride]);
- buf1[5] = _mm_add_epi32(input[5 * stride], input[26 * stride]);
- buf1[26] = _mm_sub_epi32(input[5 * stride], input[26 * stride]);
- buf1[6] = _mm_add_epi32(input[6 * stride], input[25 * stride]);
- buf1[25] = _mm_sub_epi32(input[6 * stride], input[25 * stride]);
- buf1[7] = _mm_add_epi32(input[7 * stride], input[24 * stride]);
- buf1[24] = _mm_sub_epi32(input[7 * stride], input[24 * stride]);
- buf1[8] = _mm_add_epi32(input[8 * stride], input[23 * stride]);
- buf1[23] = _mm_sub_epi32(input[8 * stride], input[23 * stride]);
- buf1[9] = _mm_add_epi32(input[9 * stride], input[22 * stride]);
- buf1[22] = _mm_sub_epi32(input[9 * stride], input[22 * stride]);
- buf1[10] = _mm_add_epi32(input[10 * stride], input[21 * stride]);
- buf1[21] = _mm_sub_epi32(input[10 * stride], input[21 * stride]);
- buf1[11] = _mm_add_epi32(input[11 * stride], input[20 * stride]);
- buf1[20] = _mm_sub_epi32(input[11 * stride], input[20 * stride]);
- buf1[12] = _mm_add_epi32(input[12 * stride], input[19 * stride]);
- buf1[19] = _mm_sub_epi32(input[12 * stride], input[19 * stride]);
- buf1[13] = _mm_add_epi32(input[13 * stride], input[18 * stride]);
- buf1[18] = _mm_sub_epi32(input[13 * stride], input[18 * stride]);
- buf1[14] = _mm_add_epi32(input[14 * stride], input[17 * stride]);
- buf1[17] = _mm_sub_epi32(input[14 * stride], input[17 * stride]);
- buf1[15] = _mm_add_epi32(input[15 * stride], input[16 * stride]);
- buf1[16] = _mm_sub_epi32(input[15 * stride], input[16 * stride]);
+ buf1[0] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[31] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[1] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[30] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[2] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[29] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[3] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[28] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[4] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[27] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[5] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[26] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[6] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[25] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[7] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[24] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[8] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[23] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[9] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[22] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[10] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[21] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[11] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[20] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[12] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[19] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[13] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[18] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[14] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[17] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[15] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[16] = _mm_sub_epi32(input[startidx], input[endidx]);
// stage 2
cospi = cospi_arr(cos_bit);
@@ -296,39 +329,71 @@
btf_32_sse4_1_type1(cospi[6], cospi[58], buf1[23], buf1[24], buf0[23],
buf0[24], cos_bit);
+ startidx = 0 * stride;
+ endidx = 31 * stride;
// stage 9
- output[0 * stride] = buf0[0];
- output[1 * stride] = buf0[16];
- output[2 * stride] = buf0[8];
- output[3 * stride] = buf0[24];
- output[4 * stride] = buf0[4];
- output[5 * stride] = buf0[20];
- output[6 * stride] = buf0[12];
- output[7 * stride] = buf0[28];
- output[8 * stride] = buf0[2];
- output[9 * stride] = buf0[18];
- output[10 * stride] = buf0[10];
- output[11 * stride] = buf0[26];
- output[12 * stride] = buf0[6];
- output[13 * stride] = buf0[22];
- output[14 * stride] = buf0[14];
- output[15 * stride] = buf0[30];
- output[16 * stride] = buf0[1];
- output[17 * stride] = buf0[17];
- output[18 * stride] = buf0[9];
- output[19 * stride] = buf0[25];
- output[20 * stride] = buf0[5];
- output[21 * stride] = buf0[21];
- output[22 * stride] = buf0[13];
- output[23 * stride] = buf0[29];
- output[24 * stride] = buf0[3];
- output[25 * stride] = buf0[19];
- output[26 * stride] = buf0[11];
- output[27 * stride] = buf0[27];
- output[28 * stride] = buf0[7];
- output[29 * stride] = buf0[23];
- output[30 * stride] = buf0[15];
- output[31 * stride] = buf0[31];
+ output[startidx] = buf0[0];
+ output[endidx] = buf0[31];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[16];
+ output[endidx] = buf0[15];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[8];
+ output[endidx] = buf0[23];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[24];
+ output[endidx] = buf0[7];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[4];
+ output[endidx] = buf0[27];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[20];
+ output[endidx] = buf0[11];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[12];
+ output[endidx] = buf0[19];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[28];
+ output[endidx] = buf0[3];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[2];
+ output[endidx] = buf0[29];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[18];
+ output[endidx] = buf0[13];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[10];
+ output[endidx] = buf0[21];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[26];
+ output[endidx] = buf0[5];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[6];
+ output[endidx] = buf0[25];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[22];
+ output[endidx] = buf0[9];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[14];
+ output[endidx] = buf0[17];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[30];
+ output[endidx] = buf0[1];
}
void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output,
@@ -478,72 +543,136 @@
__m128i cospi_p03 = _mm_set1_epi32(cospi[3]);
__m128i cospi_p61 = _mm_set1_epi32(cospi[61]);
+ int startidx = 0 * instride;
+ int endidx = 63 * instride;
// stage 1
__m128i x1[64];
- x1[0] = _mm_add_epi32(input[0 * instride], input[63 * instride]);
- x1[63] = _mm_sub_epi32(input[0 * instride], input[63 * instride]);
- x1[1] = _mm_add_epi32(input[1 * instride], input[62 * instride]);
- x1[62] = _mm_sub_epi32(input[1 * instride], input[62 * instride]);
- x1[2] = _mm_add_epi32(input[2 * instride], input[61 * instride]);
- x1[61] = _mm_sub_epi32(input[2 * instride], input[61 * instride]);
- x1[3] = _mm_add_epi32(input[3 * instride], input[60 * instride]);
- x1[60] = _mm_sub_epi32(input[3 * instride], input[60 * instride]);
- x1[4] = _mm_add_epi32(input[4 * instride], input[59 * instride]);
- x1[59] = _mm_sub_epi32(input[4 * instride], input[59 * instride]);
- x1[5] = _mm_add_epi32(input[5 * instride], input[58 * instride]);
- x1[58] = _mm_sub_epi32(input[5 * instride], input[58 * instride]);
- x1[6] = _mm_add_epi32(input[6 * instride], input[57 * instride]);
- x1[57] = _mm_sub_epi32(input[6 * instride], input[57 * instride]);
- x1[7] = _mm_add_epi32(input[7 * instride], input[56 * instride]);
- x1[56] = _mm_sub_epi32(input[7 * instride], input[56 * instride]);
- x1[8] = _mm_add_epi32(input[8 * instride], input[55 * instride]);
- x1[55] = _mm_sub_epi32(input[8 * instride], input[55 * instride]);
- x1[9] = _mm_add_epi32(input[9 * instride], input[54 * instride]);
- x1[54] = _mm_sub_epi32(input[9 * instride], input[54 * instride]);
- x1[10] = _mm_add_epi32(input[10 * instride], input[53 * instride]);
- x1[53] = _mm_sub_epi32(input[10 * instride], input[53 * instride]);
- x1[11] = _mm_add_epi32(input[11 * instride], input[52 * instride]);
- x1[52] = _mm_sub_epi32(input[11 * instride], input[52 * instride]);
- x1[12] = _mm_add_epi32(input[12 * instride], input[51 * instride]);
- x1[51] = _mm_sub_epi32(input[12 * instride], input[51 * instride]);
- x1[13] = _mm_add_epi32(input[13 * instride], input[50 * instride]);
- x1[50] = _mm_sub_epi32(input[13 * instride], input[50 * instride]);
- x1[14] = _mm_add_epi32(input[14 * instride], input[49 * instride]);
- x1[49] = _mm_sub_epi32(input[14 * instride], input[49 * instride]);
- x1[15] = _mm_add_epi32(input[15 * instride], input[48 * instride]);
- x1[48] = _mm_sub_epi32(input[15 * instride], input[48 * instride]);
- x1[16] = _mm_add_epi32(input[16 * instride], input[47 * instride]);
- x1[47] = _mm_sub_epi32(input[16 * instride], input[47 * instride]);
- x1[17] = _mm_add_epi32(input[17 * instride], input[46 * instride]);
- x1[46] = _mm_sub_epi32(input[17 * instride], input[46 * instride]);
- x1[18] = _mm_add_epi32(input[18 * instride], input[45 * instride]);
- x1[45] = _mm_sub_epi32(input[18 * instride], input[45 * instride]);
- x1[19] = _mm_add_epi32(input[19 * instride], input[44 * instride]);
- x1[44] = _mm_sub_epi32(input[19 * instride], input[44 * instride]);
- x1[20] = _mm_add_epi32(input[20 * instride], input[43 * instride]);
- x1[43] = _mm_sub_epi32(input[20 * instride], input[43 * instride]);
- x1[21] = _mm_add_epi32(input[21 * instride], input[42 * instride]);
- x1[42] = _mm_sub_epi32(input[21 * instride], input[42 * instride]);
- x1[22] = _mm_add_epi32(input[22 * instride], input[41 * instride]);
- x1[41] = _mm_sub_epi32(input[22 * instride], input[41 * instride]);
- x1[23] = _mm_add_epi32(input[23 * instride], input[40 * instride]);
- x1[40] = _mm_sub_epi32(input[23 * instride], input[40 * instride]);
- x1[24] = _mm_add_epi32(input[24 * instride], input[39 * instride]);
- x1[39] = _mm_sub_epi32(input[24 * instride], input[39 * instride]);
- x1[25] = _mm_add_epi32(input[25 * instride], input[38 * instride]);
- x1[38] = _mm_sub_epi32(input[25 * instride], input[38 * instride]);
- x1[26] = _mm_add_epi32(input[26 * instride], input[37 * instride]);
- x1[37] = _mm_sub_epi32(input[26 * instride], input[37 * instride]);
- x1[27] = _mm_add_epi32(input[27 * instride], input[36 * instride]);
- x1[36] = _mm_sub_epi32(input[27 * instride], input[36 * instride]);
- x1[28] = _mm_add_epi32(input[28 * instride], input[35 * instride]);
- x1[35] = _mm_sub_epi32(input[28 * instride], input[35 * instride]);
- x1[29] = _mm_add_epi32(input[29 * instride], input[34 * instride]);
- x1[34] = _mm_sub_epi32(input[29 * instride], input[34 * instride]);
- x1[30] = _mm_add_epi32(input[30 * instride], input[33 * instride]);
- x1[33] = _mm_sub_epi32(input[30 * instride], input[33 * instride]);
- x1[31] = _mm_add_epi32(input[31 * instride], input[32 * instride]);
- x1[32] = _mm_sub_epi32(input[31 * instride], input[32 * instride]);
+ x1[0] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[63] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[1] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[62] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[2] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[61] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[3] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[60] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[4] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[59] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[5] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[58] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[6] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[57] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[7] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[56] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[8] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[55] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[9] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[54] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[10] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[53] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[11] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[52] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[12] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[51] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[13] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[50] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[14] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[49] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[15] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[48] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[16] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[47] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[17] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[46] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[18] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[45] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[19] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[44] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[20] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[43] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[21] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[42] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[22] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[41] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[23] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[40] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[24] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[39] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[25] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[38] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[26] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[37] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[27] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[36] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[28] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[35] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[29] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[34] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[30] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[33] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[31] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[32] = _mm_sub_epi32(input[startidx], input[endidx]);
// stage 2
__m128i x2[64];
@@ -1148,69 +1277,133 @@
btf_32_type1_sse4_1_new(cospi_p03, cospi_p61, x9[47], x9[48], x10[47],
x10[48], __rounding, cos_bit);
+ startidx = 0 * outstride;
+ endidx = 63 * outstride;
// stage 11
- output[0 * outstride] = x10[0];
- output[1 * outstride] = x10[32];
- output[2 * outstride] = x10[16];
- output[3 * outstride] = x10[48];
- output[4 * outstride] = x10[8];
- output[5 * outstride] = x10[40];
- output[6 * outstride] = x10[24];
- output[7 * outstride] = x10[56];
- output[8 * outstride] = x10[4];
- output[9 * outstride] = x10[36];
- output[10 * outstride] = x10[20];
- output[11 * outstride] = x10[52];
- output[12 * outstride] = x10[12];
- output[13 * outstride] = x10[44];
- output[14 * outstride] = x10[28];
- output[15 * outstride] = x10[60];
- output[16 * outstride] = x10[2];
- output[17 * outstride] = x10[34];
- output[18 * outstride] = x10[18];
- output[19 * outstride] = x10[50];
- output[20 * outstride] = x10[10];
- output[21 * outstride] = x10[42];
- output[22 * outstride] = x10[26];
- output[23 * outstride] = x10[58];
- output[24 * outstride] = x10[6];
- output[25 * outstride] = x10[38];
- output[26 * outstride] = x10[22];
- output[27 * outstride] = x10[54];
- output[28 * outstride] = x10[14];
- output[29 * outstride] = x10[46];
- output[30 * outstride] = x10[30];
- output[31 * outstride] = x10[62];
- output[32 * outstride] = x10[1];
- output[33 * outstride] = x10[33];
- output[34 * outstride] = x10[17];
- output[35 * outstride] = x10[49];
- output[36 * outstride] = x10[9];
- output[37 * outstride] = x10[41];
- output[38 * outstride] = x10[25];
- output[39 * outstride] = x10[57];
- output[40 * outstride] = x10[5];
- output[41 * outstride] = x10[37];
- output[42 * outstride] = x10[21];
- output[43 * outstride] = x10[53];
- output[44 * outstride] = x10[13];
- output[45 * outstride] = x10[45];
- output[46 * outstride] = x10[29];
- output[47 * outstride] = x10[61];
- output[48 * outstride] = x10[3];
- output[49 * outstride] = x10[35];
- output[50 * outstride] = x10[19];
- output[51 * outstride] = x10[51];
- output[52 * outstride] = x10[11];
- output[53 * outstride] = x10[43];
- output[54 * outstride] = x10[27];
- output[55 * outstride] = x10[59];
- output[56 * outstride] = x10[7];
- output[57 * outstride] = x10[39];
- output[58 * outstride] = x10[23];
- output[59 * outstride] = x10[55];
- output[60 * outstride] = x10[15];
- output[61 * outstride] = x10[47];
- output[62 * outstride] = x10[31];
- output[63 * outstride] = x10[63];
+ output[startidx] = x10[0];
+ output[endidx] = x10[63];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[32];
+ output[endidx] = x10[31];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[16];
+ output[endidx] = x10[47];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[48];
+ output[endidx] = x10[15];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[8];
+ output[endidx] = x10[55];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[40];
+ output[endidx] = x10[23];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[24];
+ output[endidx] = x10[39];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[56];
+ output[endidx] = x10[7];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[4];
+ output[endidx] = x10[59];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[36];
+ output[endidx] = x10[27];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[20];
+ output[endidx] = x10[43];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[52];
+ output[endidx] = x10[11];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[12];
+ output[endidx] = x10[51];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[44];
+ output[endidx] = x10[19];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[28];
+ output[endidx] = x10[35];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[60];
+ output[endidx] = x10[3];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[2];
+ output[endidx] = x10[61];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[34];
+ output[endidx] = x10[29];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[18];
+ output[endidx] = x10[45];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[50];
+ output[endidx] = x10[13];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[10];
+ output[endidx] = x10[53];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[42];
+ output[endidx] = x10[21];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[26];
+ output[endidx] = x10[37];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[58];
+ output[endidx] = x10[5];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[6];
+ output[endidx] = x10[57];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[38];
+ output[endidx] = x10[25];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[22];
+ output[endidx] = x10[41];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[54];
+ output[endidx] = x10[9];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[14];
+ output[endidx] = x10[49];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[46];
+ output[endidx] = x10[17];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[30];
+ output[endidx] = x10[33];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[62];
+ output[endidx] = x10[1];
}
diff --git a/av1/encoder/x86/highbd_fwd_txfm_sse4.c b/av1/encoder/x86/highbd_fwd_txfm_sse4.c
index 9f5df90..a9516ca 100644
--- a/av1/encoder/x86/highbd_fwd_txfm_sse4.c
+++ b/av1/encoder/x86/highbd_fwd_txfm_sse4.c
@@ -70,10 +70,12 @@
__m128i u0, u1, u2, u3;
__m128i v0, v1, v2, v3;
- s0 = _mm_add_epi32(in[0 * num_col], in[3 * num_col]);
- s1 = _mm_add_epi32(in[1 * num_col], in[2 * num_col]);
- s2 = _mm_sub_epi32(in[1 * num_col], in[2 * num_col]);
- s3 = _mm_sub_epi32(in[0 * num_col], in[3 * num_col]);
+ int endidx = 3 * num_col;
+ s0 = _mm_add_epi32(in[0], in[endidx]);
+ s3 = _mm_sub_epi32(in[0], in[endidx]);
+ endidx -= num_col;
+ s1 = _mm_add_epi32(in[num_col], in[endidx]);
+ s2 = _mm_sub_epi32(in[num_col], in[endidx]);
// btf_32_sse4_1_type0(cospi32, cospi32, s[01], u[02], bit);
u0 = _mm_mullo_epi32(s0, cospi32);
@@ -137,15 +139,19 @@
__m128i u0, u1, u2, u3;
__m128i v0, v1, v2, v3;
- s0 = _mm_mullo_epi32(in[0 * num_col], sinpi1);
- s1 = _mm_mullo_epi32(in[0 * num_col], sinpi4);
- s2 = _mm_mullo_epi32(in[1 * num_col], sinpi2);
- s3 = _mm_mullo_epi32(in[1 * num_col], sinpi1);
- s4 = _mm_mullo_epi32(in[2 * num_col], sinpi3);
- s5 = _mm_mullo_epi32(in[3 * num_col], sinpi4);
- s6 = _mm_mullo_epi32(in[3 * num_col], sinpi2);
- t = _mm_add_epi32(in[0 * num_col], in[1 * num_col]);
- s7 = _mm_sub_epi32(t, in[3 * num_col]);
+ int idx = 0 * num_col;
+ s0 = _mm_mullo_epi32(in[idx], sinpi1);
+ s1 = _mm_mullo_epi32(in[idx], sinpi4);
+ t = _mm_add_epi32(in[idx], in[idx + num_col]);
+ idx += num_col;
+ s2 = _mm_mullo_epi32(in[idx], sinpi2);
+ s3 = _mm_mullo_epi32(in[idx], sinpi1);
+ idx += num_col;
+ s4 = _mm_mullo_epi32(in[idx], sinpi3);
+ idx += num_col;
+ s5 = _mm_mullo_epi32(in[idx], sinpi4);
+ s6 = _mm_mullo_epi32(in[idx], sinpi2);
+ s7 = _mm_sub_epi32(t, in[idx]);
t = _mm_add_epi32(s0, s2);
x0 = _mm_add_epi32(t, s5);