av1_fwd_txfm1d_sse4: Use for loops when possible.
BUG=aomedia:442
Change-Id: I80706666b50058c9d0ecd6f568fe0e7a07d00185
diff --git a/av1/common/x86/av1_fwd_txfm1d_sse4.c b/av1/common/x86/av1_fwd_txfm1d_sse4.c
index 847fdfc..d04b667 100644
--- a/av1/common/x86/av1_fwd_txfm1d_sse4.c
+++ b/av1/common/x86/av1_fwd_txfm1d_sse4.c
@@ -14,38 +14,10 @@
for (col = 0; col < col_num; col++) {
// stage 0;
int32_t stage_idx = 0;
- buf0[0] = input[0 * col_num + col];
- buf0[1] = input[1 * col_num + col];
- buf0[2] = input[2 * col_num + col];
- buf0[3] = input[3 * col_num + col];
- buf0[4] = input[4 * col_num + col];
- buf0[5] = input[5 * col_num + col];
- buf0[6] = input[6 * col_num + col];
- buf0[7] = input[7 * col_num + col];
- buf0[8] = input[8 * col_num + col];
- buf0[9] = input[9 * col_num + col];
- buf0[10] = input[10 * col_num + col];
- buf0[11] = input[11 * col_num + col];
- buf0[12] = input[12 * col_num + col];
- buf0[13] = input[13 * col_num + col];
- buf0[14] = input[14 * col_num + col];
- buf0[15] = input[15 * col_num + col];
- buf0[16] = input[16 * col_num + col];
- buf0[17] = input[17 * col_num + col];
- buf0[18] = input[18 * col_num + col];
- buf0[19] = input[19 * col_num + col];
- buf0[20] = input[20 * col_num + col];
- buf0[21] = input[21 * col_num + col];
- buf0[22] = input[22 * col_num + col];
- buf0[23] = input[23 * col_num + col];
- buf0[24] = input[24 * col_num + col];
- buf0[25] = input[25 * col_num + col];
- buf0[26] = input[26 * col_num + col];
- buf0[27] = input[27 * col_num + col];
- buf0[28] = input[28 * col_num + col];
- buf0[29] = input[29 * col_num + col];
- buf0[30] = input[30 * col_num + col];
- buf0[31] = input[31 * col_num + col];
+ int j;
+ for (j = 0; j < 32; ++j) {
+ buf0[j] = input[j * col_num + col];
+ }
// stage 1
stage_idx++;
@@ -376,38 +348,9 @@
buf1[30] = buf0[15];
buf1[31] = buf0[31];
- output[0 * col_num + col] = buf1[0];
- output[1 * col_num + col] = buf1[1];
- output[2 * col_num + col] = buf1[2];
- output[3 * col_num + col] = buf1[3];
- output[4 * col_num + col] = buf1[4];
- output[5 * col_num + col] = buf1[5];
- output[6 * col_num + col] = buf1[6];
- output[7 * col_num + col] = buf1[7];
- output[8 * col_num + col] = buf1[8];
- output[9 * col_num + col] = buf1[9];
- output[10 * col_num + col] = buf1[10];
- output[11 * col_num + col] = buf1[11];
- output[12 * col_num + col] = buf1[12];
- output[13 * col_num + col] = buf1[13];
- output[14 * col_num + col] = buf1[14];
- output[15 * col_num + col] = buf1[15];
- output[16 * col_num + col] = buf1[16];
- output[17 * col_num + col] = buf1[17];
- output[18 * col_num + col] = buf1[18];
- output[19 * col_num + col] = buf1[19];
- output[20 * col_num + col] = buf1[20];
- output[21 * col_num + col] = buf1[21];
- output[22 * col_num + col] = buf1[22];
- output[23 * col_num + col] = buf1[23];
- output[24 * col_num + col] = buf1[24];
- output[25 * col_num + col] = buf1[25];
- output[26 * col_num + col] = buf1[26];
- output[27 * col_num + col] = buf1[27];
- output[28 * col_num + col] = buf1[28];
- output[29 * col_num + col] = buf1[29];
- output[30 * col_num + col] = buf1[30];
- output[31 * col_num + col] = buf1[31];
+ for (j = 0; j < 32; ++j) {
+ output[j * col_num + col] = buf1[j];
+ }
}
}
@@ -425,10 +368,10 @@
for (col = 0; col < col_num; col++) {
// stage 0;
int32_t stage_idx = 0;
- buf0[0] = input[0 * col_num + col];
- buf0[1] = input[1 * col_num + col];
- buf0[2] = input[2 * col_num + col];
- buf0[3] = input[3 * col_num + col];
+ int j;
+ for (j = 0; j < 4; ++j) {
+ buf0[j] = input[j * col_num + col];
+ }
// stage 1
stage_idx++;
@@ -469,10 +412,9 @@
buf1[2] = buf0[3];
buf1[3] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[1]);
- output[0 * col_num + col] = buf1[0];
- output[1 * col_num + col] = buf1[1];
- output[2 * col_num + col] = buf1[2];
- output[3 * col_num + col] = buf1[3];
+ for (j = 0; j < 4; ++j) {
+ output[j * col_num + col] = buf1[j];
+ }
}
}
@@ -490,38 +432,10 @@
for (col = 0; col < col_num; col++) {
// stage 0;
int32_t stage_idx = 0;
- buf0[0] = input[0 * col_num + col];
- buf0[1] = input[1 * col_num + col];
- buf0[2] = input[2 * col_num + col];
- buf0[3] = input[3 * col_num + col];
- buf0[4] = input[4 * col_num + col];
- buf0[5] = input[5 * col_num + col];
- buf0[6] = input[6 * col_num + col];
- buf0[7] = input[7 * col_num + col];
- buf0[8] = input[8 * col_num + col];
- buf0[9] = input[9 * col_num + col];
- buf0[10] = input[10 * col_num + col];
- buf0[11] = input[11 * col_num + col];
- buf0[12] = input[12 * col_num + col];
- buf0[13] = input[13 * col_num + col];
- buf0[14] = input[14 * col_num + col];
- buf0[15] = input[15 * col_num + col];
- buf0[16] = input[16 * col_num + col];
- buf0[17] = input[17 * col_num + col];
- buf0[18] = input[18 * col_num + col];
- buf0[19] = input[19 * col_num + col];
- buf0[20] = input[20 * col_num + col];
- buf0[21] = input[21 * col_num + col];
- buf0[22] = input[22 * col_num + col];
- buf0[23] = input[23 * col_num + col];
- buf0[24] = input[24 * col_num + col];
- buf0[25] = input[25 * col_num + col];
- buf0[26] = input[26 * col_num + col];
- buf0[27] = input[27 * col_num + col];
- buf0[28] = input[28 * col_num + col];
- buf0[29] = input[29 * col_num + col];
- buf0[30] = input[30 * col_num + col];
- buf0[31] = input[31 * col_num + col];
+ int j;
+ for (j = 0; j < 32; ++j) {
+ buf0[j] = input[j * col_num + col];
+ }
// stage 1
stage_idx++;
@@ -918,37 +832,8 @@
buf1[30] = buf0[17];
buf1[31] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[1]);
- output[0 * col_num + col] = buf1[0];
- output[1 * col_num + col] = buf1[1];
- output[2 * col_num + col] = buf1[2];
- output[3 * col_num + col] = buf1[3];
- output[4 * col_num + col] = buf1[4];
- output[5 * col_num + col] = buf1[5];
- output[6 * col_num + col] = buf1[6];
- output[7 * col_num + col] = buf1[7];
- output[8 * col_num + col] = buf1[8];
- output[9 * col_num + col] = buf1[9];
- output[10 * col_num + col] = buf1[10];
- output[11 * col_num + col] = buf1[11];
- output[12 * col_num + col] = buf1[12];
- output[13 * col_num + col] = buf1[13];
- output[14 * col_num + col] = buf1[14];
- output[15 * col_num + col] = buf1[15];
- output[16 * col_num + col] = buf1[16];
- output[17 * col_num + col] = buf1[17];
- output[18 * col_num + col] = buf1[18];
- output[19 * col_num + col] = buf1[19];
- output[20 * col_num + col] = buf1[20];
- output[21 * col_num + col] = buf1[21];
- output[22 * col_num + col] = buf1[22];
- output[23 * col_num + col] = buf1[23];
- output[24 * col_num + col] = buf1[24];
- output[25 * col_num + col] = buf1[25];
- output[26 * col_num + col] = buf1[26];
- output[27 * col_num + col] = buf1[27];
- output[28 * col_num + col] = buf1[28];
- output[29 * col_num + col] = buf1[29];
- output[30 * col_num + col] = buf1[30];
- output[31 * col_num + col] = buf1[31];
+ for (j = 0; j < 32; ++j) {
+ output[j * col_num + col] = buf1[j];
+ }
}
}