Optimize highbd 64x64 fwd_txfm

Add sse4_1 variant for highbd 64x64 fwd_txfm.

When tested for 20 frames of crowd_run_360p_10 at 750 kbps,
observed ~3.7% reduction in encoder time for speed=1 preset.

Achieved module-level gains of 4x w.r.t. the C function.

Change-Id: Id9da2231a7a5c0eebe81f5062f8c2d5a7feb3227
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index a61f1d7..c605906 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -217,6 +217,7 @@
   specialize qw/av1_fwd_txfm2d_32x32 sse4_1/;
 
   add_proto qw/void av1_fwd_txfm2d_64x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+  specialize qw/av1_fwd_txfm2d_64x64 sse4_1/;
   add_proto qw/void av1_fwd_txfm2d_32x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
   add_proto qw/void av1_fwd_txfm2d_64x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
   add_proto qw/void av1_fwd_txfm2d_16x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
diff --git a/av1/common/x86/highbd_txfm_utility_sse4.h b/av1/common/x86/highbd_txfm_utility_sse4.h
index 0636555..6f24e59 100644
--- a/av1/common/x86/highbd_txfm_utility_sse4.h
+++ b/av1/common/x86/highbd_txfm_utility_sse4.h
@@ -75,6 +75,17 @@
                 out[63]);
 }
 
+static INLINE void transpose_32x32(const __m128i *input, __m128i *output) {
+  for (int j = 0; j < 8; j++) {
+    for (int i = 0; i < 8; i++) {
+      TRANSPOSE_4X4(input[i * 32 + j + 0], input[i * 32 + j + 8],
+                    input[i * 32 + j + 16], input[i * 32 + j + 24],
+                    output[j * 32 + i + 0], output[j * 32 + i + 8],
+                    output[j * 32 + i + 16], output[j * 32 + i + 24]);
+    }
+  }
+}
+
 // Note:
 //  rounding = 1 << (bit - 1)
 static INLINE __m128i half_btf_sse4_1(const __m128i *w0, const __m128i *n0,
diff --git a/av1/encoder/x86/av1_fwd_txfm1d_sse4.c b/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
index c71f2e7..0761554 100644
--- a/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
+++ b/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
@@ -395,7 +395,8 @@
 }
 
 void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output,
-                           int8_t cos_bit) {
+                           int8_t cos_bit, const int instride,
+                           const int outstride) {
   const int32_t *cospi = cospi_arr(cos_bit);
   const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
 
@@ -480,70 +481,70 @@
 
   // stage 1
   __m128i x1[64];
-  x1[0] = _mm_add_epi32(input[0], input[63]);
-  x1[63] = _mm_sub_epi32(input[0], input[63]);
-  x1[1] = _mm_add_epi32(input[1], input[62]);
-  x1[62] = _mm_sub_epi32(input[1], input[62]);
-  x1[2] = _mm_add_epi32(input[2], input[61]);
-  x1[61] = _mm_sub_epi32(input[2], input[61]);
-  x1[3] = _mm_add_epi32(input[3], input[60]);
-  x1[60] = _mm_sub_epi32(input[3], input[60]);
-  x1[4] = _mm_add_epi32(input[4], input[59]);
-  x1[59] = _mm_sub_epi32(input[4], input[59]);
-  x1[5] = _mm_add_epi32(input[5], input[58]);
-  x1[58] = _mm_sub_epi32(input[5], input[58]);
-  x1[6] = _mm_add_epi32(input[6], input[57]);
-  x1[57] = _mm_sub_epi32(input[6], input[57]);
-  x1[7] = _mm_add_epi32(input[7], input[56]);
-  x1[56] = _mm_sub_epi32(input[7], input[56]);
-  x1[8] = _mm_add_epi32(input[8], input[55]);
-  x1[55] = _mm_sub_epi32(input[8], input[55]);
-  x1[9] = _mm_add_epi32(input[9], input[54]);
-  x1[54] = _mm_sub_epi32(input[9], input[54]);
-  x1[10] = _mm_add_epi32(input[10], input[53]);
-  x1[53] = _mm_sub_epi32(input[10], input[53]);
-  x1[11] = _mm_add_epi32(input[11], input[52]);
-  x1[52] = _mm_sub_epi32(input[11], input[52]);
-  x1[12] = _mm_add_epi32(input[12], input[51]);
-  x1[51] = _mm_sub_epi32(input[12], input[51]);
-  x1[13] = _mm_add_epi32(input[13], input[50]);
-  x1[50] = _mm_sub_epi32(input[13], input[50]);
-  x1[14] = _mm_add_epi32(input[14], input[49]);
-  x1[49] = _mm_sub_epi32(input[14], input[49]);
-  x1[15] = _mm_add_epi32(input[15], input[48]);
-  x1[48] = _mm_sub_epi32(input[15], input[48]);
-  x1[16] = _mm_add_epi32(input[16], input[47]);
-  x1[47] = _mm_sub_epi32(input[16], input[47]);
-  x1[17] = _mm_add_epi32(input[17], input[46]);
-  x1[46] = _mm_sub_epi32(input[17], input[46]);
-  x1[18] = _mm_add_epi32(input[18], input[45]);
-  x1[45] = _mm_sub_epi32(input[18], input[45]);
-  x1[19] = _mm_add_epi32(input[19], input[44]);
-  x1[44] = _mm_sub_epi32(input[19], input[44]);
-  x1[20] = _mm_add_epi32(input[20], input[43]);
-  x1[43] = _mm_sub_epi32(input[20], input[43]);
-  x1[21] = _mm_add_epi32(input[21], input[42]);
-  x1[42] = _mm_sub_epi32(input[21], input[42]);
-  x1[22] = _mm_add_epi32(input[22], input[41]);
-  x1[41] = _mm_sub_epi32(input[22], input[41]);
-  x1[23] = _mm_add_epi32(input[23], input[40]);
-  x1[40] = _mm_sub_epi32(input[23], input[40]);
-  x1[24] = _mm_add_epi32(input[24], input[39]);
-  x1[39] = _mm_sub_epi32(input[24], input[39]);
-  x1[25] = _mm_add_epi32(input[25], input[38]);
-  x1[38] = _mm_sub_epi32(input[25], input[38]);
-  x1[26] = _mm_add_epi32(input[26], input[37]);
-  x1[37] = _mm_sub_epi32(input[26], input[37]);
-  x1[27] = _mm_add_epi32(input[27], input[36]);
-  x1[36] = _mm_sub_epi32(input[27], input[36]);
-  x1[28] = _mm_add_epi32(input[28], input[35]);
-  x1[35] = _mm_sub_epi32(input[28], input[35]);
-  x1[29] = _mm_add_epi32(input[29], input[34]);
-  x1[34] = _mm_sub_epi32(input[29], input[34]);
-  x1[30] = _mm_add_epi32(input[30], input[33]);
-  x1[33] = _mm_sub_epi32(input[30], input[33]);
-  x1[31] = _mm_add_epi32(input[31], input[32]);
-  x1[32] = _mm_sub_epi32(input[31], input[32]);
+  x1[0] = _mm_add_epi32(input[0 * instride], input[63 * instride]);
+  x1[63] = _mm_sub_epi32(input[0 * instride], input[63 * instride]);
+  x1[1] = _mm_add_epi32(input[1 * instride], input[62 * instride]);
+  x1[62] = _mm_sub_epi32(input[1 * instride], input[62 * instride]);
+  x1[2] = _mm_add_epi32(input[2 * instride], input[61 * instride]);
+  x1[61] = _mm_sub_epi32(input[2 * instride], input[61 * instride]);
+  x1[3] = _mm_add_epi32(input[3 * instride], input[60 * instride]);
+  x1[60] = _mm_sub_epi32(input[3 * instride], input[60 * instride]);
+  x1[4] = _mm_add_epi32(input[4 * instride], input[59 * instride]);
+  x1[59] = _mm_sub_epi32(input[4 * instride], input[59 * instride]);
+  x1[5] = _mm_add_epi32(input[5 * instride], input[58 * instride]);
+  x1[58] = _mm_sub_epi32(input[5 * instride], input[58 * instride]);
+  x1[6] = _mm_add_epi32(input[6 * instride], input[57 * instride]);
+  x1[57] = _mm_sub_epi32(input[6 * instride], input[57 * instride]);
+  x1[7] = _mm_add_epi32(input[7 * instride], input[56 * instride]);
+  x1[56] = _mm_sub_epi32(input[7 * instride], input[56 * instride]);
+  x1[8] = _mm_add_epi32(input[8 * instride], input[55 * instride]);
+  x1[55] = _mm_sub_epi32(input[8 * instride], input[55 * instride]);
+  x1[9] = _mm_add_epi32(input[9 * instride], input[54 * instride]);
+  x1[54] = _mm_sub_epi32(input[9 * instride], input[54 * instride]);
+  x1[10] = _mm_add_epi32(input[10 * instride], input[53 * instride]);
+  x1[53] = _mm_sub_epi32(input[10 * instride], input[53 * instride]);
+  x1[11] = _mm_add_epi32(input[11 * instride], input[52 * instride]);
+  x1[52] = _mm_sub_epi32(input[11 * instride], input[52 * instride]);
+  x1[12] = _mm_add_epi32(input[12 * instride], input[51 * instride]);
+  x1[51] = _mm_sub_epi32(input[12 * instride], input[51 * instride]);
+  x1[13] = _mm_add_epi32(input[13 * instride], input[50 * instride]);
+  x1[50] = _mm_sub_epi32(input[13 * instride], input[50 * instride]);
+  x1[14] = _mm_add_epi32(input[14 * instride], input[49 * instride]);
+  x1[49] = _mm_sub_epi32(input[14 * instride], input[49 * instride]);
+  x1[15] = _mm_add_epi32(input[15 * instride], input[48 * instride]);
+  x1[48] = _mm_sub_epi32(input[15 * instride], input[48 * instride]);
+  x1[16] = _mm_add_epi32(input[16 * instride], input[47 * instride]);
+  x1[47] = _mm_sub_epi32(input[16 * instride], input[47 * instride]);
+  x1[17] = _mm_add_epi32(input[17 * instride], input[46 * instride]);
+  x1[46] = _mm_sub_epi32(input[17 * instride], input[46 * instride]);
+  x1[18] = _mm_add_epi32(input[18 * instride], input[45 * instride]);
+  x1[45] = _mm_sub_epi32(input[18 * instride], input[45 * instride]);
+  x1[19] = _mm_add_epi32(input[19 * instride], input[44 * instride]);
+  x1[44] = _mm_sub_epi32(input[19 * instride], input[44 * instride]);
+  x1[20] = _mm_add_epi32(input[20 * instride], input[43 * instride]);
+  x1[43] = _mm_sub_epi32(input[20 * instride], input[43 * instride]);
+  x1[21] = _mm_add_epi32(input[21 * instride], input[42 * instride]);
+  x1[42] = _mm_sub_epi32(input[21 * instride], input[42 * instride]);
+  x1[22] = _mm_add_epi32(input[22 * instride], input[41 * instride]);
+  x1[41] = _mm_sub_epi32(input[22 * instride], input[41 * instride]);
+  x1[23] = _mm_add_epi32(input[23 * instride], input[40 * instride]);
+  x1[40] = _mm_sub_epi32(input[23 * instride], input[40 * instride]);
+  x1[24] = _mm_add_epi32(input[24 * instride], input[39 * instride]);
+  x1[39] = _mm_sub_epi32(input[24 * instride], input[39 * instride]);
+  x1[25] = _mm_add_epi32(input[25 * instride], input[38 * instride]);
+  x1[38] = _mm_sub_epi32(input[25 * instride], input[38 * instride]);
+  x1[26] = _mm_add_epi32(input[26 * instride], input[37 * instride]);
+  x1[37] = _mm_sub_epi32(input[26 * instride], input[37 * instride]);
+  x1[27] = _mm_add_epi32(input[27 * instride], input[36 * instride]);
+  x1[36] = _mm_sub_epi32(input[27 * instride], input[36 * instride]);
+  x1[28] = _mm_add_epi32(input[28 * instride], input[35 * instride]);
+  x1[35] = _mm_sub_epi32(input[28 * instride], input[35 * instride]);
+  x1[29] = _mm_add_epi32(input[29 * instride], input[34 * instride]);
+  x1[34] = _mm_sub_epi32(input[29 * instride], input[34 * instride]);
+  x1[30] = _mm_add_epi32(input[30 * instride], input[33 * instride]);
+  x1[33] = _mm_sub_epi32(input[30 * instride], input[33 * instride]);
+  x1[31] = _mm_add_epi32(input[31 * instride], input[32 * instride]);
+  x1[32] = _mm_sub_epi32(input[31 * instride], input[32 * instride]);
 
   // stage 2
   __m128i x2[64];
@@ -1149,68 +1150,68 @@
                           x10[48], __rounding, cos_bit);
 
   // stage 11
-  output[0] = x10[0];
-  output[1] = x10[32];
-  output[2] = x10[16];
-  output[3] = x10[48];
-  output[4] = x10[8];
-  output[5] = x10[40];
-  output[6] = x10[24];
-  output[7] = x10[56];
-  output[8] = x10[4];
-  output[9] = x10[36];
-  output[10] = x10[20];
-  output[11] = x10[52];
-  output[12] = x10[12];
-  output[13] = x10[44];
-  output[14] = x10[28];
-  output[15] = x10[60];
-  output[16] = x10[2];
-  output[17] = x10[34];
-  output[18] = x10[18];
-  output[19] = x10[50];
-  output[20] = x10[10];
-  output[21] = x10[42];
-  output[22] = x10[26];
-  output[23] = x10[58];
-  output[24] = x10[6];
-  output[25] = x10[38];
-  output[26] = x10[22];
-  output[27] = x10[54];
-  output[28] = x10[14];
-  output[29] = x10[46];
-  output[30] = x10[30];
-  output[31] = x10[62];
-  output[32] = x10[1];
-  output[33] = x10[33];
-  output[34] = x10[17];
-  output[35] = x10[49];
-  output[36] = x10[9];
-  output[37] = x10[41];
-  output[38] = x10[25];
-  output[39] = x10[57];
-  output[40] = x10[5];
-  output[41] = x10[37];
-  output[42] = x10[21];
-  output[43] = x10[53];
-  output[44] = x10[13];
-  output[45] = x10[45];
-  output[46] = x10[29];
-  output[47] = x10[61];
-  output[48] = x10[3];
-  output[49] = x10[35];
-  output[50] = x10[19];
-  output[51] = x10[51];
-  output[52] = x10[11];
-  output[53] = x10[43];
-  output[54] = x10[27];
-  output[55] = x10[59];
-  output[56] = x10[7];
-  output[57] = x10[39];
-  output[58] = x10[23];
-  output[59] = x10[55];
-  output[60] = x10[15];
-  output[61] = x10[47];
-  output[62] = x10[31];
-  output[63] = x10[63];
+  output[0 * outstride] = x10[0];
+  output[1 * outstride] = x10[32];
+  output[2 * outstride] = x10[16];
+  output[3 * outstride] = x10[48];
+  output[4 * outstride] = x10[8];
+  output[5 * outstride] = x10[40];
+  output[6 * outstride] = x10[24];
+  output[7 * outstride] = x10[56];
+  output[8 * outstride] = x10[4];
+  output[9 * outstride] = x10[36];
+  output[10 * outstride] = x10[20];
+  output[11 * outstride] = x10[52];
+  output[12 * outstride] = x10[12];
+  output[13 * outstride] = x10[44];
+  output[14 * outstride] = x10[28];
+  output[15 * outstride] = x10[60];
+  output[16 * outstride] = x10[2];
+  output[17 * outstride] = x10[34];
+  output[18 * outstride] = x10[18];
+  output[19 * outstride] = x10[50];
+  output[20 * outstride] = x10[10];
+  output[21 * outstride] = x10[42];
+  output[22 * outstride] = x10[26];
+  output[23 * outstride] = x10[58];
+  output[24 * outstride] = x10[6];
+  output[25 * outstride] = x10[38];
+  output[26 * outstride] = x10[22];
+  output[27 * outstride] = x10[54];
+  output[28 * outstride] = x10[14];
+  output[29 * outstride] = x10[46];
+  output[30 * outstride] = x10[30];
+  output[31 * outstride] = x10[62];
+  output[32 * outstride] = x10[1];
+  output[33 * outstride] = x10[33];
+  output[34 * outstride] = x10[17];
+  output[35 * outstride] = x10[49];
+  output[36 * outstride] = x10[9];
+  output[37 * outstride] = x10[41];
+  output[38 * outstride] = x10[25];
+  output[39 * outstride] = x10[57];
+  output[40 * outstride] = x10[5];
+  output[41 * outstride] = x10[37];
+  output[42 * outstride] = x10[21];
+  output[43 * outstride] = x10[53];
+  output[44 * outstride] = x10[13];
+  output[45 * outstride] = x10[45];
+  output[46 * outstride] = x10[29];
+  output[47 * outstride] = x10[61];
+  output[48 * outstride] = x10[3];
+  output[49 * outstride] = x10[35];
+  output[50 * outstride] = x10[19];
+  output[51 * outstride] = x10[51];
+  output[52 * outstride] = x10[11];
+  output[53 * outstride] = x10[43];
+  output[54 * outstride] = x10[27];
+  output[55 * outstride] = x10[59];
+  output[56 * outstride] = x10[7];
+  output[57 * outstride] = x10[39];
+  output[58 * outstride] = x10[23];
+  output[59 * outstride] = x10[55];
+  output[60 * outstride] = x10[15];
+  output[61 * outstride] = x10[47];
+  output[62 * outstride] = x10[31];
+  output[63 * outstride] = x10[63];
 }
diff --git a/av1/encoder/x86/av1_fwd_txfm2d_sse4.c b/av1/encoder/x86/av1_fwd_txfm2d_sse4.c
index 968f900..8ec0256 100644
--- a/av1/encoder/x86/av1_fwd_txfm2d_sse4.c
+++ b/av1/encoder/x86/av1_fwd_txfm2d_sse4.c
@@ -14,6 +14,7 @@
 #include "av1/common/enums.h"
 #include "av1/common/av1_txfm.h"
 #include "av1/common/x86/av1_txfm_sse2.h"
+#include "av1/common/x86/highbd_txfm_utility_sse4.h"
 #include "av1/encoder/av1_fwd_txfm1d_cfg.h"
 #include "av1/encoder/x86/av1_txfm1d_sse4.h"
 #include "av1/encoder/x86/av1_fwd_txfm_sse2.h"
@@ -52,9 +53,22 @@
   }
 }
 
+static void fdct64_new_sse4_1(const __m128i *input, __m128i *output,
+                              const int8_t cos_bit, const int8_t *stage_range) {
+  const int txfm_size = 64;
+  const int num_per_128 = 4;
+  int col_num = txfm_size / num_per_128;
+  (void)stage_range;
+  for (int col = 0; col < col_num; col++) {
+    av1_fdct64_new_sse4_1((input + col), (output + col), cos_bit, col_num,
+                          col_num);
+  }
+}
+
 static INLINE TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {
   switch (txfm_type) {
     case TXFM_TYPE_DCT32: return fdct32_new_sse4_1; break;
+    case TXFM_TYPE_DCT64: return fdct64_new_sse4_1; break;
     default: assert(0);
   }
   return NULL;
@@ -95,6 +109,42 @@
   transpose_32(txfm_size, buf_128, out_128);
 }
 
+static INLINE void fwd_txfm2d_64x64_sse4_1(const int16_t *input,
+                                           int32_t *output, const int stride,
+                                           const TXFM_2D_FLIP_CFG *cfg,
+                                           int32_t *txfm_buf) {
+  assert(cfg->tx_size < TX_SIZES);
+  const int txfm_size = tx_size_wide[cfg->tx_size];
+  const int8_t *shift = cfg->shift;
+  const int8_t *stage_range_col = cfg->stage_range_col;
+  const int8_t cos_bit_col = cfg->cos_bit_col;
+  const int8_t cos_bit_row = cfg->cos_bit_row;
+  const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col);
+  __m128i *buf_128 = (__m128i *)txfm_buf;
+  __m128i *out_128 = (__m128i *)output;
+
+  const int num_per_128 = 4;
+  int txfm2d_size_128 = txfm_size * txfm_size / num_per_128;
+  int col_num = txfm_size / num_per_128;
+
+  int16_array_with_stride_to_int32_array_without_stride(input, stride, output,
+                                                        txfm_size);
+  /*col wise transform*/
+  txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col);
+  av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]);
+  transpose_32(txfm_size, out_128, buf_128);
+
+  /*row wise transform*/
+  for (int col = 0; col < (col_num >> 1); col++) {
+    av1_fdct64_new_sse4_1((buf_128 + col), (out_128 + col), cos_bit_row,
+                          col_num, (col_num >> 1));
+  }
+
+  txfm2d_size_128 = (col_num >> 1) * (txfm_size >> 1);
+  av1_round_shift_array_32_sse4_1(out_128, buf_128, txfm2d_size_128, -shift[2]);
+  transpose_32x32(buf_128, out_128);
+}
+
 void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output,
                                  int stride, TX_TYPE tx_type, int bd) {
   DECLARE_ALIGNED(16, int32_t, txfm_buf[1024]);
@@ -104,6 +154,15 @@
   fwd_txfm2d_sse4_1(input, output, stride, &cfg, txfm_buf);
 }
 
+void av1_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output,
+                                 int stride, TX_TYPE tx_type, int bd) {
+  DECLARE_ALIGNED(16, int32_t, txfm_buf[4096]);
+  TXFM_2D_FLIP_CFG cfg;
+  av1_get_fwd_txfm_cfg(tx_type, TX_64X64, &cfg);
+  (void)bd;
+  fwd_txfm2d_64x64_sse4_1(input, output, stride, &cfg, txfm_buf);
+}
+
 static INLINE void transpose_32_4x4x2(int stride, const __m128i *inputA,
                                       const __m128i *inputB, __m128i *output) {
   __m128i temp0 = _mm_unpacklo_epi32(inputA[0], inputA[2]);
@@ -162,8 +221,8 @@
       bufA[j] = _mm_cvtepi16_epi32(buf[j]);
       bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j]));
     }
-    av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row);
-    av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row);
+    av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row, 1, 1);
+    av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row, 1, 1);
     av1_round_shift_array_32_sse4_1(bufA, bufA, 32, -shift[2]);
     av1_round_shift_array_32_sse4_1(bufB, bufB, 32, -shift[2]);
 
@@ -209,8 +268,8 @@
       bufA[j] = _mm_cvtepi16_epi32(buf[j]);
       bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j]));
     }
-    av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row);
-    av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row);
+    av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row, 1, 1);
+    av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row, 1, 1);
     av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2], NewSqrt2);
     av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2], NewSqrt2);
 
diff --git a/av1/encoder/x86/av1_txfm1d_sse4.h b/av1/encoder/x86/av1_txfm1d_sse4.h
index 17ca8fc..6df2a8b 100644
--- a/av1/encoder/x86/av1_txfm1d_sse4.h
+++ b/av1/encoder/x86/av1_txfm1d_sse4.h
@@ -29,7 +29,8 @@
 void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output,
                            int8_t cos_bit);
 void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output,
-                           int8_t cos_bit);
+                           int8_t cos_bit, const int instride,
+                           const int outstride);
 
 void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output,
                            const int8_t cos_bit, const int8_t *stage_range);