Simplify highbd txfm modules

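Restructure the code to avoid the multiplications involved in index
calculation. Each index is now derived from the previous one by adding
or subtracting the stride instead of being recomputed as k * stride.
For example, given a = 31 * stride, 30 * stride is obtained as
a - stride.

Also reorder the out[0..63] stores in highbd_inv_txfm_sse4.c so that
they are written in ascending index order.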

Change-Id: I0b173d7b1d8f07e90d81802faa9732da7fd2c506
diff --git a/av1/common/x86/highbd_inv_txfm_sse4.c b/av1/common/x86/highbd_inv_txfm_sse4.c
index 41afae0..12c6350 100644
--- a/av1/common/x86/highbd_inv_txfm_sse4.c
+++ b/av1/common/x86/highbd_inv_txfm_sse4.c
@@ -3182,69 +3182,69 @@
     }
 
     out[0] = x;
-    out[63] = x;
     out[1] = x;
-    out[62] = x;
     out[2] = x;
-    out[61] = x;
     out[3] = x;
-    out[60] = x;
     out[4] = x;
-    out[59] = x;
     out[5] = x;
-    out[58] = x;
     out[6] = x;
-    out[57] = x;
     out[7] = x;
-    out[56] = x;
     out[8] = x;
-    out[55] = x;
     out[9] = x;
-    out[54] = x;
     out[10] = x;
-    out[53] = x;
     out[11] = x;
-    out[52] = x;
     out[12] = x;
-    out[51] = x;
     out[13] = x;
-    out[50] = x;
     out[14] = x;
-    out[49] = x;
     out[15] = x;
-    out[48] = x;
     out[16] = x;
-    out[47] = x;
     out[17] = x;
-    out[46] = x;
     out[18] = x;
-    out[45] = x;
     out[19] = x;
-    out[44] = x;
     out[20] = x;
-    out[43] = x;
     out[21] = x;
-    out[42] = x;
     out[22] = x;
-    out[41] = x;
     out[23] = x;
-    out[40] = x;
     out[24] = x;
-    out[39] = x;
     out[25] = x;
-    out[38] = x;
     out[26] = x;
-    out[37] = x;
     out[27] = x;
-    out[36] = x;
     out[28] = x;
-    out[35] = x;
     out[29] = x;
-    out[34] = x;
     out[30] = x;
-    out[33] = x;
     out[31] = x;
     out[32] = x;
+    out[33] = x;
+    out[34] = x;
+    out[35] = x;
+    out[36] = x;
+    out[37] = x;
+    out[38] = x;
+    out[39] = x;
+    out[40] = x;
+    out[41] = x;
+    out[42] = x;
+    out[43] = x;
+    out[44] = x;
+    out[45] = x;
+    out[46] = x;
+    out[47] = x;
+    out[48] = x;
+    out[49] = x;
+    out[50] = x;
+    out[51] = x;
+    out[52] = x;
+    out[53] = x;
+    out[54] = x;
+    out[55] = x;
+    out[56] = x;
+    out[57] = x;
+    out[58] = x;
+    out[59] = x;
+    out[60] = x;
+    out[61] = x;
+    out[62] = x;
+    out[63] = x;
   }
 }
 
diff --git a/av1/encoder/x86/av1_fwd_txfm1d_sse4.c b/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
index faa19b7..5da723f 100644
--- a/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
+++ b/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
@@ -16,40 +16,73 @@
   __m128i buf0[32];
   __m128i buf1[32];
   const int32_t *cospi;
+
+  int startidx = 0 * stride;
+  int endidx = 31 * stride;
   // stage 0
   // stage 1
-  buf1[0] = _mm_add_epi32(input[0 * stride], input[31 * stride]);
-  buf1[31] = _mm_sub_epi32(input[0 * stride], input[31 * stride]);
-  buf1[1] = _mm_add_epi32(input[1 * stride], input[30 * stride]);
-  buf1[30] = _mm_sub_epi32(input[1 * stride], input[30 * stride]);
-  buf1[2] = _mm_add_epi32(input[2 * stride], input[29 * stride]);
-  buf1[29] = _mm_sub_epi32(input[2 * stride], input[29 * stride]);
-  buf1[3] = _mm_add_epi32(input[3 * stride], input[28 * stride]);
-  buf1[28] = _mm_sub_epi32(input[3 * stride], input[28 * stride]);
-  buf1[4] = _mm_add_epi32(input[4 * stride], input[27 * stride]);
-  buf1[27] = _mm_sub_epi32(input[4 * stride], input[27 * stride]);
-  buf1[5] = _mm_add_epi32(input[5 * stride], input[26 * stride]);
-  buf1[26] = _mm_sub_epi32(input[5 * stride], input[26 * stride]);
-  buf1[6] = _mm_add_epi32(input[6 * stride], input[25 * stride]);
-  buf1[25] = _mm_sub_epi32(input[6 * stride], input[25 * stride]);
-  buf1[7] = _mm_add_epi32(input[7 * stride], input[24 * stride]);
-  buf1[24] = _mm_sub_epi32(input[7 * stride], input[24 * stride]);
-  buf1[8] = _mm_add_epi32(input[8 * stride], input[23 * stride]);
-  buf1[23] = _mm_sub_epi32(input[8 * stride], input[23 * stride]);
-  buf1[9] = _mm_add_epi32(input[9 * stride], input[22 * stride]);
-  buf1[22] = _mm_sub_epi32(input[9 * stride], input[22 * stride]);
-  buf1[10] = _mm_add_epi32(input[10 * stride], input[21 * stride]);
-  buf1[21] = _mm_sub_epi32(input[10 * stride], input[21 * stride]);
-  buf1[11] = _mm_add_epi32(input[11 * stride], input[20 * stride]);
-  buf1[20] = _mm_sub_epi32(input[11 * stride], input[20 * stride]);
-  buf1[12] = _mm_add_epi32(input[12 * stride], input[19 * stride]);
-  buf1[19] = _mm_sub_epi32(input[12 * stride], input[19 * stride]);
-  buf1[13] = _mm_add_epi32(input[13 * stride], input[18 * stride]);
-  buf1[18] = _mm_sub_epi32(input[13 * stride], input[18 * stride]);
-  buf1[14] = _mm_add_epi32(input[14 * stride], input[17 * stride]);
-  buf1[17] = _mm_sub_epi32(input[14 * stride], input[17 * stride]);
-  buf1[15] = _mm_add_epi32(input[15 * stride], input[16 * stride]);
-  buf1[16] = _mm_sub_epi32(input[15 * stride], input[16 * stride]);
+  buf1[0] = _mm_add_epi32(input[startidx], input[endidx]);
+  buf1[31] = _mm_sub_epi32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[1] = _mm_add_epi32(input[startidx], input[endidx]);
+  buf1[30] = _mm_sub_epi32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[2] = _mm_add_epi32(input[startidx], input[endidx]);
+  buf1[29] = _mm_sub_epi32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[3] = _mm_add_epi32(input[startidx], input[endidx]);
+  buf1[28] = _mm_sub_epi32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[4] = _mm_add_epi32(input[startidx], input[endidx]);
+  buf1[27] = _mm_sub_epi32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[5] = _mm_add_epi32(input[startidx], input[endidx]);
+  buf1[26] = _mm_sub_epi32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[6] = _mm_add_epi32(input[startidx], input[endidx]);
+  buf1[25] = _mm_sub_epi32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[7] = _mm_add_epi32(input[startidx], input[endidx]);
+  buf1[24] = _mm_sub_epi32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[8] = _mm_add_epi32(input[startidx], input[endidx]);
+  buf1[23] = _mm_sub_epi32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[9] = _mm_add_epi32(input[startidx], input[endidx]);
+  buf1[22] = _mm_sub_epi32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[10] = _mm_add_epi32(input[startidx], input[endidx]);
+  buf1[21] = _mm_sub_epi32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[11] = _mm_add_epi32(input[startidx], input[endidx]);
+  buf1[20] = _mm_sub_epi32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[12] = _mm_add_epi32(input[startidx], input[endidx]);
+  buf1[19] = _mm_sub_epi32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[13] = _mm_add_epi32(input[startidx], input[endidx]);
+  buf1[18] = _mm_sub_epi32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[14] = _mm_add_epi32(input[startidx], input[endidx]);
+  buf1[17] = _mm_sub_epi32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[15] = _mm_add_epi32(input[startidx], input[endidx]);
+  buf1[16] = _mm_sub_epi32(input[startidx], input[endidx]);
 
   // stage 2
   cospi = cospi_arr(cos_bit);
@@ -296,39 +329,71 @@
   btf_32_sse4_1_type1(cospi[6], cospi[58], buf1[23], buf1[24], buf0[23],
                       buf0[24], cos_bit);
 
+  startidx = 0 * stride;
+  endidx = 31 * stride;
   // stage 9
-  output[0 * stride] = buf0[0];
-  output[1 * stride] = buf0[16];
-  output[2 * stride] = buf0[8];
-  output[3 * stride] = buf0[24];
-  output[4 * stride] = buf0[4];
-  output[5 * stride] = buf0[20];
-  output[6 * stride] = buf0[12];
-  output[7 * stride] = buf0[28];
-  output[8 * stride] = buf0[2];
-  output[9 * stride] = buf0[18];
-  output[10 * stride] = buf0[10];
-  output[11 * stride] = buf0[26];
-  output[12 * stride] = buf0[6];
-  output[13 * stride] = buf0[22];
-  output[14 * stride] = buf0[14];
-  output[15 * stride] = buf0[30];
-  output[16 * stride] = buf0[1];
-  output[17 * stride] = buf0[17];
-  output[18 * stride] = buf0[9];
-  output[19 * stride] = buf0[25];
-  output[20 * stride] = buf0[5];
-  output[21 * stride] = buf0[21];
-  output[22 * stride] = buf0[13];
-  output[23 * stride] = buf0[29];
-  output[24 * stride] = buf0[3];
-  output[25 * stride] = buf0[19];
-  output[26 * stride] = buf0[11];
-  output[27 * stride] = buf0[27];
-  output[28 * stride] = buf0[7];
-  output[29 * stride] = buf0[23];
-  output[30 * stride] = buf0[15];
-  output[31 * stride] = buf0[31];
+  output[startidx] = buf0[0];
+  output[endidx] = buf0[31];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[16];
+  output[endidx] = buf0[15];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[8];
+  output[endidx] = buf0[23];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[24];
+  output[endidx] = buf0[7];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[4];
+  output[endidx] = buf0[27];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[20];
+  output[endidx] = buf0[11];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[12];
+  output[endidx] = buf0[19];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[28];
+  output[endidx] = buf0[3];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[2];
+  output[endidx] = buf0[29];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[18];
+  output[endidx] = buf0[13];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[10];
+  output[endidx] = buf0[21];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[26];
+  output[endidx] = buf0[5];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[6];
+  output[endidx] = buf0[25];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[22];
+  output[endidx] = buf0[9];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[14];
+  output[endidx] = buf0[17];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[30];
+  output[endidx] = buf0[1];
 }
 
 void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output,
@@ -478,72 +543,136 @@
   __m128i cospi_p03 = _mm_set1_epi32(cospi[3]);
   __m128i cospi_p61 = _mm_set1_epi32(cospi[61]);
 
+  int startidx = 0 * instride;
+  int endidx = 63 * instride;
   // stage 1
   __m128i x1[64];
-  x1[0] = _mm_add_epi32(input[0 * instride], input[63 * instride]);
-  x1[63] = _mm_sub_epi32(input[0 * instride], input[63 * instride]);
-  x1[1] = _mm_add_epi32(input[1 * instride], input[62 * instride]);
-  x1[62] = _mm_sub_epi32(input[1 * instride], input[62 * instride]);
-  x1[2] = _mm_add_epi32(input[2 * instride], input[61 * instride]);
-  x1[61] = _mm_sub_epi32(input[2 * instride], input[61 * instride]);
-  x1[3] = _mm_add_epi32(input[3 * instride], input[60 * instride]);
-  x1[60] = _mm_sub_epi32(input[3 * instride], input[60 * instride]);
-  x1[4] = _mm_add_epi32(input[4 * instride], input[59 * instride]);
-  x1[59] = _mm_sub_epi32(input[4 * instride], input[59 * instride]);
-  x1[5] = _mm_add_epi32(input[5 * instride], input[58 * instride]);
-  x1[58] = _mm_sub_epi32(input[5 * instride], input[58 * instride]);
-  x1[6] = _mm_add_epi32(input[6 * instride], input[57 * instride]);
-  x1[57] = _mm_sub_epi32(input[6 * instride], input[57 * instride]);
-  x1[7] = _mm_add_epi32(input[7 * instride], input[56 * instride]);
-  x1[56] = _mm_sub_epi32(input[7 * instride], input[56 * instride]);
-  x1[8] = _mm_add_epi32(input[8 * instride], input[55 * instride]);
-  x1[55] = _mm_sub_epi32(input[8 * instride], input[55 * instride]);
-  x1[9] = _mm_add_epi32(input[9 * instride], input[54 * instride]);
-  x1[54] = _mm_sub_epi32(input[9 * instride], input[54 * instride]);
-  x1[10] = _mm_add_epi32(input[10 * instride], input[53 * instride]);
-  x1[53] = _mm_sub_epi32(input[10 * instride], input[53 * instride]);
-  x1[11] = _mm_add_epi32(input[11 * instride], input[52 * instride]);
-  x1[52] = _mm_sub_epi32(input[11 * instride], input[52 * instride]);
-  x1[12] = _mm_add_epi32(input[12 * instride], input[51 * instride]);
-  x1[51] = _mm_sub_epi32(input[12 * instride], input[51 * instride]);
-  x1[13] = _mm_add_epi32(input[13 * instride], input[50 * instride]);
-  x1[50] = _mm_sub_epi32(input[13 * instride], input[50 * instride]);
-  x1[14] = _mm_add_epi32(input[14 * instride], input[49 * instride]);
-  x1[49] = _mm_sub_epi32(input[14 * instride], input[49 * instride]);
-  x1[15] = _mm_add_epi32(input[15 * instride], input[48 * instride]);
-  x1[48] = _mm_sub_epi32(input[15 * instride], input[48 * instride]);
-  x1[16] = _mm_add_epi32(input[16 * instride], input[47 * instride]);
-  x1[47] = _mm_sub_epi32(input[16 * instride], input[47 * instride]);
-  x1[17] = _mm_add_epi32(input[17 * instride], input[46 * instride]);
-  x1[46] = _mm_sub_epi32(input[17 * instride], input[46 * instride]);
-  x1[18] = _mm_add_epi32(input[18 * instride], input[45 * instride]);
-  x1[45] = _mm_sub_epi32(input[18 * instride], input[45 * instride]);
-  x1[19] = _mm_add_epi32(input[19 * instride], input[44 * instride]);
-  x1[44] = _mm_sub_epi32(input[19 * instride], input[44 * instride]);
-  x1[20] = _mm_add_epi32(input[20 * instride], input[43 * instride]);
-  x1[43] = _mm_sub_epi32(input[20 * instride], input[43 * instride]);
-  x1[21] = _mm_add_epi32(input[21 * instride], input[42 * instride]);
-  x1[42] = _mm_sub_epi32(input[21 * instride], input[42 * instride]);
-  x1[22] = _mm_add_epi32(input[22 * instride], input[41 * instride]);
-  x1[41] = _mm_sub_epi32(input[22 * instride], input[41 * instride]);
-  x1[23] = _mm_add_epi32(input[23 * instride], input[40 * instride]);
-  x1[40] = _mm_sub_epi32(input[23 * instride], input[40 * instride]);
-  x1[24] = _mm_add_epi32(input[24 * instride], input[39 * instride]);
-  x1[39] = _mm_sub_epi32(input[24 * instride], input[39 * instride]);
-  x1[25] = _mm_add_epi32(input[25 * instride], input[38 * instride]);
-  x1[38] = _mm_sub_epi32(input[25 * instride], input[38 * instride]);
-  x1[26] = _mm_add_epi32(input[26 * instride], input[37 * instride]);
-  x1[37] = _mm_sub_epi32(input[26 * instride], input[37 * instride]);
-  x1[27] = _mm_add_epi32(input[27 * instride], input[36 * instride]);
-  x1[36] = _mm_sub_epi32(input[27 * instride], input[36 * instride]);
-  x1[28] = _mm_add_epi32(input[28 * instride], input[35 * instride]);
-  x1[35] = _mm_sub_epi32(input[28 * instride], input[35 * instride]);
-  x1[29] = _mm_add_epi32(input[29 * instride], input[34 * instride]);
-  x1[34] = _mm_sub_epi32(input[29 * instride], input[34 * instride]);
-  x1[30] = _mm_add_epi32(input[30 * instride], input[33 * instride]);
-  x1[33] = _mm_sub_epi32(input[30 * instride], input[33 * instride]);
-  x1[31] = _mm_add_epi32(input[31 * instride], input[32 * instride]);
-  x1[32] = _mm_sub_epi32(input[31 * instride], input[32 * instride]);
+  x1[0] = _mm_add_epi32(input[startidx], input[endidx]);
+  x1[63] = _mm_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[1] = _mm_add_epi32(input[startidx], input[endidx]);
+  x1[62] = _mm_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[2] = _mm_add_epi32(input[startidx], input[endidx]);
+  x1[61] = _mm_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[3] = _mm_add_epi32(input[startidx], input[endidx]);
+  x1[60] = _mm_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[4] = _mm_add_epi32(input[startidx], input[endidx]);
+  x1[59] = _mm_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[5] = _mm_add_epi32(input[startidx], input[endidx]);
+  x1[58] = _mm_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[6] = _mm_add_epi32(input[startidx], input[endidx]);
+  x1[57] = _mm_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[7] = _mm_add_epi32(input[startidx], input[endidx]);
+  x1[56] = _mm_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[8] = _mm_add_epi32(input[startidx], input[endidx]);
+  x1[55] = _mm_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[9] = _mm_add_epi32(input[startidx], input[endidx]);
+  x1[54] = _mm_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[10] = _mm_add_epi32(input[startidx], input[endidx]);
+  x1[53] = _mm_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[11] = _mm_add_epi32(input[startidx], input[endidx]);
+  x1[52] = _mm_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[12] = _mm_add_epi32(input[startidx], input[endidx]);
+  x1[51] = _mm_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[13] = _mm_add_epi32(input[startidx], input[endidx]);
+  x1[50] = _mm_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[14] = _mm_add_epi32(input[startidx], input[endidx]);
+  x1[49] = _mm_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[15] = _mm_add_epi32(input[startidx], input[endidx]);
+  x1[48] = _mm_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[16] = _mm_add_epi32(input[startidx], input[endidx]);
+  x1[47] = _mm_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[17] = _mm_add_epi32(input[startidx], input[endidx]);
+  x1[46] = _mm_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[18] = _mm_add_epi32(input[startidx], input[endidx]);
+  x1[45] = _mm_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[19] = _mm_add_epi32(input[startidx], input[endidx]);
+  x1[44] = _mm_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[20] = _mm_add_epi32(input[startidx], input[endidx]);
+  x1[43] = _mm_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[21] = _mm_add_epi32(input[startidx], input[endidx]);
+  x1[42] = _mm_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[22] = _mm_add_epi32(input[startidx], input[endidx]);
+  x1[41] = _mm_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[23] = _mm_add_epi32(input[startidx], input[endidx]);
+  x1[40] = _mm_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[24] = _mm_add_epi32(input[startidx], input[endidx]);
+  x1[39] = _mm_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[25] = _mm_add_epi32(input[startidx], input[endidx]);
+  x1[38] = _mm_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[26] = _mm_add_epi32(input[startidx], input[endidx]);
+  x1[37] = _mm_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[27] = _mm_add_epi32(input[startidx], input[endidx]);
+  x1[36] = _mm_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[28] = _mm_add_epi32(input[startidx], input[endidx]);
+  x1[35] = _mm_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[29] = _mm_add_epi32(input[startidx], input[endidx]);
+  x1[34] = _mm_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[30] = _mm_add_epi32(input[startidx], input[endidx]);
+  x1[33] = _mm_sub_epi32(input[startidx], input[endidx]);
+  startidx += instride;
+  endidx -= instride;
+  x1[31] = _mm_add_epi32(input[startidx], input[endidx]);
+  x1[32] = _mm_sub_epi32(input[startidx], input[endidx]);
 
   // stage 2
   __m128i x2[64];
@@ -1148,69 +1277,133 @@
   btf_32_type1_sse4_1_new(cospi_p03, cospi_p61, x9[47], x9[48], x10[47],
                           x10[48], __rounding, cos_bit);
 
+  startidx = 0 * outstride;
+  endidx = 63 * outstride;
   // stage 11
-  output[0 * outstride] = x10[0];
-  output[1 * outstride] = x10[32];
-  output[2 * outstride] = x10[16];
-  output[3 * outstride] = x10[48];
-  output[4 * outstride] = x10[8];
-  output[5 * outstride] = x10[40];
-  output[6 * outstride] = x10[24];
-  output[7 * outstride] = x10[56];
-  output[8 * outstride] = x10[4];
-  output[9 * outstride] = x10[36];
-  output[10 * outstride] = x10[20];
-  output[11 * outstride] = x10[52];
-  output[12 * outstride] = x10[12];
-  output[13 * outstride] = x10[44];
-  output[14 * outstride] = x10[28];
-  output[15 * outstride] = x10[60];
-  output[16 * outstride] = x10[2];
-  output[17 * outstride] = x10[34];
-  output[18 * outstride] = x10[18];
-  output[19 * outstride] = x10[50];
-  output[20 * outstride] = x10[10];
-  output[21 * outstride] = x10[42];
-  output[22 * outstride] = x10[26];
-  output[23 * outstride] = x10[58];
-  output[24 * outstride] = x10[6];
-  output[25 * outstride] = x10[38];
-  output[26 * outstride] = x10[22];
-  output[27 * outstride] = x10[54];
-  output[28 * outstride] = x10[14];
-  output[29 * outstride] = x10[46];
-  output[30 * outstride] = x10[30];
-  output[31 * outstride] = x10[62];
-  output[32 * outstride] = x10[1];
-  output[33 * outstride] = x10[33];
-  output[34 * outstride] = x10[17];
-  output[35 * outstride] = x10[49];
-  output[36 * outstride] = x10[9];
-  output[37 * outstride] = x10[41];
-  output[38 * outstride] = x10[25];
-  output[39 * outstride] = x10[57];
-  output[40 * outstride] = x10[5];
-  output[41 * outstride] = x10[37];
-  output[42 * outstride] = x10[21];
-  output[43 * outstride] = x10[53];
-  output[44 * outstride] = x10[13];
-  output[45 * outstride] = x10[45];
-  output[46 * outstride] = x10[29];
-  output[47 * outstride] = x10[61];
-  output[48 * outstride] = x10[3];
-  output[49 * outstride] = x10[35];
-  output[50 * outstride] = x10[19];
-  output[51 * outstride] = x10[51];
-  output[52 * outstride] = x10[11];
-  output[53 * outstride] = x10[43];
-  output[54 * outstride] = x10[27];
-  output[55 * outstride] = x10[59];
-  output[56 * outstride] = x10[7];
-  output[57 * outstride] = x10[39];
-  output[58 * outstride] = x10[23];
-  output[59 * outstride] = x10[55];
-  output[60 * outstride] = x10[15];
-  output[61 * outstride] = x10[47];
-  output[62 * outstride] = x10[31];
-  output[63 * outstride] = x10[63];
+  output[startidx] = x10[0];
+  output[endidx] = x10[63];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[32];
+  output[endidx] = x10[31];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[16];
+  output[endidx] = x10[47];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[48];
+  output[endidx] = x10[15];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[8];
+  output[endidx] = x10[55];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[40];
+  output[endidx] = x10[23];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[24];
+  output[endidx] = x10[39];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[56];
+  output[endidx] = x10[7];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[4];
+  output[endidx] = x10[59];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[36];
+  output[endidx] = x10[27];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[20];
+  output[endidx] = x10[43];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[52];
+  output[endidx] = x10[11];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[12];
+  output[endidx] = x10[51];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[44];
+  output[endidx] = x10[19];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[28];
+  output[endidx] = x10[35];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[60];
+  output[endidx] = x10[3];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[2];
+  output[endidx] = x10[61];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[34];
+  output[endidx] = x10[29];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[18];
+  output[endidx] = x10[45];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[50];
+  output[endidx] = x10[13];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[10];
+  output[endidx] = x10[53];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[42];
+  output[endidx] = x10[21];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[26];
+  output[endidx] = x10[37];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[58];
+  output[endidx] = x10[5];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[6];
+  output[endidx] = x10[57];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[38];
+  output[endidx] = x10[25];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[22];
+  output[endidx] = x10[41];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[54];
+  output[endidx] = x10[9];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[14];
+  output[endidx] = x10[49];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[46];
+  output[endidx] = x10[17];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[30];
+  output[endidx] = x10[33];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[62];
+  output[endidx] = x10[1];
 }
diff --git a/av1/encoder/x86/highbd_fwd_txfm_sse4.c b/av1/encoder/x86/highbd_fwd_txfm_sse4.c
index 9f5df90..a9516ca 100644
--- a/av1/encoder/x86/highbd_fwd_txfm_sse4.c
+++ b/av1/encoder/x86/highbd_fwd_txfm_sse4.c
@@ -70,10 +70,12 @@
   __m128i u0, u1, u2, u3;
   __m128i v0, v1, v2, v3;
 
-  s0 = _mm_add_epi32(in[0 * num_col], in[3 * num_col]);
-  s1 = _mm_add_epi32(in[1 * num_col], in[2 * num_col]);
-  s2 = _mm_sub_epi32(in[1 * num_col], in[2 * num_col]);
-  s3 = _mm_sub_epi32(in[0 * num_col], in[3 * num_col]);
+  int endidx = 3 * num_col;
+  s0 = _mm_add_epi32(in[0], in[endidx]);
+  s3 = _mm_sub_epi32(in[0], in[endidx]);
+  endidx -= num_col;
+  s1 = _mm_add_epi32(in[num_col], in[endidx]);
+  s2 = _mm_sub_epi32(in[num_col], in[endidx]);
 
   // btf_32_sse4_1_type0(cospi32, cospi32, s[01], u[02], bit);
   u0 = _mm_mullo_epi32(s0, cospi32);
@@ -137,15 +139,19 @@
   __m128i u0, u1, u2, u3;
   __m128i v0, v1, v2, v3;
 
-  s0 = _mm_mullo_epi32(in[0 * num_col], sinpi1);
-  s1 = _mm_mullo_epi32(in[0 * num_col], sinpi4);
-  s2 = _mm_mullo_epi32(in[1 * num_col], sinpi2);
-  s3 = _mm_mullo_epi32(in[1 * num_col], sinpi1);
-  s4 = _mm_mullo_epi32(in[2 * num_col], sinpi3);
-  s5 = _mm_mullo_epi32(in[3 * num_col], sinpi4);
-  s6 = _mm_mullo_epi32(in[3 * num_col], sinpi2);
-  t = _mm_add_epi32(in[0 * num_col], in[1 * num_col]);
-  s7 = _mm_sub_epi32(t, in[3 * num_col]);
+  int idx = 0 * num_col;
+  s0 = _mm_mullo_epi32(in[idx], sinpi1);
+  s1 = _mm_mullo_epi32(in[idx], sinpi4);
+  t = _mm_add_epi32(in[idx], in[idx + num_col]);
+  idx += num_col;
+  s2 = _mm_mullo_epi32(in[idx], sinpi2);
+  s3 = _mm_mullo_epi32(in[idx], sinpi1);
+  idx += num_col;
+  s4 = _mm_mullo_epi32(in[idx], sinpi3);
+  idx += num_col;
+  s5 = _mm_mullo_epi32(in[idx], sinpi4);
+  s6 = _mm_mullo_epi32(in[idx], sinpi2);
+  s7 = _mm_sub_epi32(t, in[idx]);
 
   t = _mm_add_epi32(s0, s2);
   x0 = _mm_add_epi32(t, s5);