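av1_fwd_txfm1d_sse4: Use for loops when possible.

Replace the manually unrolled per-column loads from input into buf0 and
stores from buf1 into output (4- and 32-element cases) with equivalent
for loops. No functional change intended.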

BUG=aomedia:442

Change-Id: I80706666b50058c9d0ecd6f568fe0e7a07d00185
diff --git a/av1/common/x86/av1_fwd_txfm1d_sse4.c b/av1/common/x86/av1_fwd_txfm1d_sse4.c
index 847fdfc..d04b667 100644
--- a/av1/common/x86/av1_fwd_txfm1d_sse4.c
+++ b/av1/common/x86/av1_fwd_txfm1d_sse4.c
@@ -14,38 +14,10 @@
   for (col = 0; col < col_num; col++) {
     // stage 0;
     int32_t stage_idx = 0;
-    buf0[0] = input[0 * col_num + col];
-    buf0[1] = input[1 * col_num + col];
-    buf0[2] = input[2 * col_num + col];
-    buf0[3] = input[3 * col_num + col];
-    buf0[4] = input[4 * col_num + col];
-    buf0[5] = input[5 * col_num + col];
-    buf0[6] = input[6 * col_num + col];
-    buf0[7] = input[7 * col_num + col];
-    buf0[8] = input[8 * col_num + col];
-    buf0[9] = input[9 * col_num + col];
-    buf0[10] = input[10 * col_num + col];
-    buf0[11] = input[11 * col_num + col];
-    buf0[12] = input[12 * col_num + col];
-    buf0[13] = input[13 * col_num + col];
-    buf0[14] = input[14 * col_num + col];
-    buf0[15] = input[15 * col_num + col];
-    buf0[16] = input[16 * col_num + col];
-    buf0[17] = input[17 * col_num + col];
-    buf0[18] = input[18 * col_num + col];
-    buf0[19] = input[19 * col_num + col];
-    buf0[20] = input[20 * col_num + col];
-    buf0[21] = input[21 * col_num + col];
-    buf0[22] = input[22 * col_num + col];
-    buf0[23] = input[23 * col_num + col];
-    buf0[24] = input[24 * col_num + col];
-    buf0[25] = input[25 * col_num + col];
-    buf0[26] = input[26 * col_num + col];
-    buf0[27] = input[27 * col_num + col];
-    buf0[28] = input[28 * col_num + col];
-    buf0[29] = input[29 * col_num + col];
-    buf0[30] = input[30 * col_num + col];
-    buf0[31] = input[31 * col_num + col];
+    int j;
+    for (j = 0; j < 32; ++j) {
+      buf0[j] = input[j * col_num + col];
+    }
 
     // stage 1
     stage_idx++;
@@ -376,38 +348,9 @@
     buf1[30] = buf0[15];
     buf1[31] = buf0[31];
 
-    output[0 * col_num + col] = buf1[0];
-    output[1 * col_num + col] = buf1[1];
-    output[2 * col_num + col] = buf1[2];
-    output[3 * col_num + col] = buf1[3];
-    output[4 * col_num + col] = buf1[4];
-    output[5 * col_num + col] = buf1[5];
-    output[6 * col_num + col] = buf1[6];
-    output[7 * col_num + col] = buf1[7];
-    output[8 * col_num + col] = buf1[8];
-    output[9 * col_num + col] = buf1[9];
-    output[10 * col_num + col] = buf1[10];
-    output[11 * col_num + col] = buf1[11];
-    output[12 * col_num + col] = buf1[12];
-    output[13 * col_num + col] = buf1[13];
-    output[14 * col_num + col] = buf1[14];
-    output[15 * col_num + col] = buf1[15];
-    output[16 * col_num + col] = buf1[16];
-    output[17 * col_num + col] = buf1[17];
-    output[18 * col_num + col] = buf1[18];
-    output[19 * col_num + col] = buf1[19];
-    output[20 * col_num + col] = buf1[20];
-    output[21 * col_num + col] = buf1[21];
-    output[22 * col_num + col] = buf1[22];
-    output[23 * col_num + col] = buf1[23];
-    output[24 * col_num + col] = buf1[24];
-    output[25 * col_num + col] = buf1[25];
-    output[26 * col_num + col] = buf1[26];
-    output[27 * col_num + col] = buf1[27];
-    output[28 * col_num + col] = buf1[28];
-    output[29 * col_num + col] = buf1[29];
-    output[30 * col_num + col] = buf1[30];
-    output[31 * col_num + col] = buf1[31];
+    for (j = 0; j < 32; ++j) {
+      output[j * col_num + col] = buf1[j];
+    }
   }
 }
 
@@ -425,10 +368,10 @@
   for (col = 0; col < col_num; col++) {
     // stage 0;
     int32_t stage_idx = 0;
-    buf0[0] = input[0 * col_num + col];
-    buf0[1] = input[1 * col_num + col];
-    buf0[2] = input[2 * col_num + col];
-    buf0[3] = input[3 * col_num + col];
+    int j;
+    for (j = 0; j < 4; ++j) {
+      buf0[j] = input[j * col_num + col];
+    }
 
     // stage 1
     stage_idx++;
@@ -469,10 +412,9 @@
     buf1[2] = buf0[3];
     buf1[3] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[1]);
 
-    output[0 * col_num + col] = buf1[0];
-    output[1 * col_num + col] = buf1[1];
-    output[2 * col_num + col] = buf1[2];
-    output[3 * col_num + col] = buf1[3];
+    for (j = 0; j < 4; ++j) {
+      output[j * col_num + col] = buf1[j];
+    }
   }
 }
 
@@ -490,38 +432,10 @@
   for (col = 0; col < col_num; col++) {
     // stage 0;
     int32_t stage_idx = 0;
-    buf0[0] = input[0 * col_num + col];
-    buf0[1] = input[1 * col_num + col];
-    buf0[2] = input[2 * col_num + col];
-    buf0[3] = input[3 * col_num + col];
-    buf0[4] = input[4 * col_num + col];
-    buf0[5] = input[5 * col_num + col];
-    buf0[6] = input[6 * col_num + col];
-    buf0[7] = input[7 * col_num + col];
-    buf0[8] = input[8 * col_num + col];
-    buf0[9] = input[9 * col_num + col];
-    buf0[10] = input[10 * col_num + col];
-    buf0[11] = input[11 * col_num + col];
-    buf0[12] = input[12 * col_num + col];
-    buf0[13] = input[13 * col_num + col];
-    buf0[14] = input[14 * col_num + col];
-    buf0[15] = input[15 * col_num + col];
-    buf0[16] = input[16 * col_num + col];
-    buf0[17] = input[17 * col_num + col];
-    buf0[18] = input[18 * col_num + col];
-    buf0[19] = input[19 * col_num + col];
-    buf0[20] = input[20 * col_num + col];
-    buf0[21] = input[21 * col_num + col];
-    buf0[22] = input[22 * col_num + col];
-    buf0[23] = input[23 * col_num + col];
-    buf0[24] = input[24 * col_num + col];
-    buf0[25] = input[25 * col_num + col];
-    buf0[26] = input[26 * col_num + col];
-    buf0[27] = input[27 * col_num + col];
-    buf0[28] = input[28 * col_num + col];
-    buf0[29] = input[29 * col_num + col];
-    buf0[30] = input[30 * col_num + col];
-    buf0[31] = input[31 * col_num + col];
+    int j;
+    for (j = 0; j < 32; ++j) {
+      buf0[j] = input[j * col_num + col];
+    }
 
     // stage 1
     stage_idx++;
@@ -918,37 +832,8 @@
     buf1[30] = buf0[17];
     buf1[31] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[1]);
 
-    output[0 * col_num + col] = buf1[0];
-    output[1 * col_num + col] = buf1[1];
-    output[2 * col_num + col] = buf1[2];
-    output[3 * col_num + col] = buf1[3];
-    output[4 * col_num + col] = buf1[4];
-    output[5 * col_num + col] = buf1[5];
-    output[6 * col_num + col] = buf1[6];
-    output[7 * col_num + col] = buf1[7];
-    output[8 * col_num + col] = buf1[8];
-    output[9 * col_num + col] = buf1[9];
-    output[10 * col_num + col] = buf1[10];
-    output[11 * col_num + col] = buf1[11];
-    output[12 * col_num + col] = buf1[12];
-    output[13 * col_num + col] = buf1[13];
-    output[14 * col_num + col] = buf1[14];
-    output[15 * col_num + col] = buf1[15];
-    output[16 * col_num + col] = buf1[16];
-    output[17 * col_num + col] = buf1[17];
-    output[18 * col_num + col] = buf1[18];
-    output[19 * col_num + col] = buf1[19];
-    output[20 * col_num + col] = buf1[20];
-    output[21 * col_num + col] = buf1[21];
-    output[22 * col_num + col] = buf1[22];
-    output[23 * col_num + col] = buf1[23];
-    output[24 * col_num + col] = buf1[24];
-    output[25 * col_num + col] = buf1[25];
-    output[26 * col_num + col] = buf1[26];
-    output[27 * col_num + col] = buf1[27];
-    output[28 * col_num + col] = buf1[28];
-    output[29 * col_num + col] = buf1[29];
-    output[30 * col_num + col] = buf1[30];
-    output[31 * col_num + col] = buf1[31];
+    for (j = 0; j < 32; ++j) {
+      output[j * col_num + col] = buf1[j];
+    }
   }
 }