Common identity 16x8 transform: forward and inverse.

The code for the forward and inverse transforms was identical, so it has
been moved to a common header.
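
The shared kernel applies a scaled identity: each coefficient becomes
ROUND_POWER_OF_TWO(2 * coeff * Sqrt2, DCT_CONST_BITS). A minimal scalar
sketch of the mapping (hypothetical reference code, not part of this
change; it omits the int16 saturation the SSE2 version gets from
_mm_packs_epi32):

  static void idtx16_1d(int16_t *row) {
    int i;
    for (i = 0; i < 16; ++i)
      row[i] = (int16_t)((2 * (int32_t)row[i] * Sqrt2 +
                          DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
  }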

BUG=aomedia:442

Change-Id: Ic0d82b6213127c757c81e7a84045a30e28036161
diff --git a/aom_dsp/x86/txfm_common_sse2.h b/aom_dsp/x86/txfm_common_sse2.h
index ae57361..87f1565 100644
--- a/aom_dsp/x86/txfm_common_sse2.h
+++ b/aom_dsp/x86/txfm_common_sse2.h
@@ -34,4 +34,205 @@
   return _mm_shuffle_epi32(b, 0x4e);
 }
 
+#if CONFIG_EXT_TX
+// Identity transform applied to 16 rows of 8 columns each, shared by the
+// forward (fidtx16_sse2) and inverse (iidtx16_sse2) paths.
+static INLINE void idtx16_8col(__m128i *in) {
+  const __m128i k__zero_epi16 = _mm_set1_epi16((int16_t)0);
+  const __m128i k__sqrt2_epi16 = _mm_set1_epi16((int16_t)Sqrt2);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+  __m128i y0, y1, y2, y3, y4, y5, y6, y7;
+
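+  // Multiply every coefficient by 2.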
+  in[0] = _mm_slli_epi16(in[0], 1);
+  in[1] = _mm_slli_epi16(in[1], 1);
+  in[2] = _mm_slli_epi16(in[2], 1);
+  in[3] = _mm_slli_epi16(in[3], 1);
+  in[4] = _mm_slli_epi16(in[4], 1);
+  in[5] = _mm_slli_epi16(in[5], 1);
+  in[6] = _mm_slli_epi16(in[6], 1);
+  in[7] = _mm_slli_epi16(in[7], 1);
+  in[8] = _mm_slli_epi16(in[8], 1);
+  in[9] = _mm_slli_epi16(in[9], 1);
+  in[10] = _mm_slli_epi16(in[10], 1);
+  in[11] = _mm_slli_epi16(in[11], 1);
+  in[12] = _mm_slli_epi16(in[12], 1);
+  in[13] = _mm_slli_epi16(in[13], 1);
+  in[14] = _mm_slli_epi16(in[14], 1);
+  in[15] = _mm_slli_epi16(in[15], 1);
+
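+  // Interleave each 16-bit coefficient with zero so that _mm_madd_epi16
+  // below yields the full signed 32-bit product coeff * Sqrt2.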
+  v0 = _mm_unpacklo_epi16(in[0], k__zero_epi16);
+  v1 = _mm_unpacklo_epi16(in[1], k__zero_epi16);
+  v2 = _mm_unpacklo_epi16(in[2], k__zero_epi16);
+  v3 = _mm_unpacklo_epi16(in[3], k__zero_epi16);
+  v4 = _mm_unpacklo_epi16(in[4], k__zero_epi16);
+  v5 = _mm_unpacklo_epi16(in[5], k__zero_epi16);
+  v6 = _mm_unpacklo_epi16(in[6], k__zero_epi16);
+  v7 = _mm_unpacklo_epi16(in[7], k__zero_epi16);
+
+  u0 = _mm_unpacklo_epi16(in[8], k__zero_epi16);
+  u1 = _mm_unpacklo_epi16(in[9], k__zero_epi16);
+  u2 = _mm_unpacklo_epi16(in[10], k__zero_epi16);
+  u3 = _mm_unpacklo_epi16(in[11], k__zero_epi16);
+  u4 = _mm_unpacklo_epi16(in[12], k__zero_epi16);
+  u5 = _mm_unpacklo_epi16(in[13], k__zero_epi16);
+  u6 = _mm_unpacklo_epi16(in[14], k__zero_epi16);
+  u7 = _mm_unpacklo_epi16(in[15], k__zero_epi16);
+
+  x0 = _mm_unpackhi_epi16(in[0], k__zero_epi16);
+  x1 = _mm_unpackhi_epi16(in[1], k__zero_epi16);
+  x2 = _mm_unpackhi_epi16(in[2], k__zero_epi16);
+  x3 = _mm_unpackhi_epi16(in[3], k__zero_epi16);
+  x4 = _mm_unpackhi_epi16(in[4], k__zero_epi16);
+  x5 = _mm_unpackhi_epi16(in[5], k__zero_epi16);
+  x6 = _mm_unpackhi_epi16(in[6], k__zero_epi16);
+  x7 = _mm_unpackhi_epi16(in[7], k__zero_epi16);
+
+  y0 = _mm_unpackhi_epi16(in[8], k__zero_epi16);
+  y1 = _mm_unpackhi_epi16(in[9], k__zero_epi16);
+  y2 = _mm_unpackhi_epi16(in[10], k__zero_epi16);
+  y3 = _mm_unpackhi_epi16(in[11], k__zero_epi16);
+  y4 = _mm_unpackhi_epi16(in[12], k__zero_epi16);
+  y5 = _mm_unpackhi_epi16(in[13], k__zero_epi16);
+  y6 = _mm_unpackhi_epi16(in[14], k__zero_epi16);
+  y7 = _mm_unpackhi_epi16(in[15], k__zero_epi16);
+
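+  // Multiply by Sqrt2 (32-bit products).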
+  v0 = _mm_madd_epi16(v0, k__sqrt2_epi16);
+  v1 = _mm_madd_epi16(v1, k__sqrt2_epi16);
+  v2 = _mm_madd_epi16(v2, k__sqrt2_epi16);
+  v3 = _mm_madd_epi16(v3, k__sqrt2_epi16);
+  v4 = _mm_madd_epi16(v4, k__sqrt2_epi16);
+  v5 = _mm_madd_epi16(v5, k__sqrt2_epi16);
+  v6 = _mm_madd_epi16(v6, k__sqrt2_epi16);
+  v7 = _mm_madd_epi16(v7, k__sqrt2_epi16);
+
+  x0 = _mm_madd_epi16(x0, k__sqrt2_epi16);
+  x1 = _mm_madd_epi16(x1, k__sqrt2_epi16);
+  x2 = _mm_madd_epi16(x2, k__sqrt2_epi16);
+  x3 = _mm_madd_epi16(x3, k__sqrt2_epi16);
+  x4 = _mm_madd_epi16(x4, k__sqrt2_epi16);
+  x5 = _mm_madd_epi16(x5, k__sqrt2_epi16);
+  x6 = _mm_madd_epi16(x6, k__sqrt2_epi16);
+  x7 = _mm_madd_epi16(x7, k__sqrt2_epi16);
+
+  u0 = _mm_madd_epi16(u0, k__sqrt2_epi16);
+  u1 = _mm_madd_epi16(u1, k__sqrt2_epi16);
+  u2 = _mm_madd_epi16(u2, k__sqrt2_epi16);
+  u3 = _mm_madd_epi16(u3, k__sqrt2_epi16);
+  u4 = _mm_madd_epi16(u4, k__sqrt2_epi16);
+  u5 = _mm_madd_epi16(u5, k__sqrt2_epi16);
+  u6 = _mm_madd_epi16(u6, k__sqrt2_epi16);
+  u7 = _mm_madd_epi16(u7, k__sqrt2_epi16);
+
+  y0 = _mm_madd_epi16(y0, k__sqrt2_epi16);
+  y1 = _mm_madd_epi16(y1, k__sqrt2_epi16);
+  y2 = _mm_madd_epi16(y2, k__sqrt2_epi16);
+  y3 = _mm_madd_epi16(y3, k__sqrt2_epi16);
+  y4 = _mm_madd_epi16(y4, k__sqrt2_epi16);
+  y5 = _mm_madd_epi16(y5, k__sqrt2_epi16);
+  y6 = _mm_madd_epi16(y6, k__sqrt2_epi16);
+  y7 = _mm_madd_epi16(y7, k__sqrt2_epi16);
+
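+  // Add the rounding constant before the shift below.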
+  v0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+  v1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+  v2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+  v3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+  v4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
+  v5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
+  v6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
+  v7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
+
+  x0 = _mm_add_epi32(x0, k__DCT_CONST_ROUNDING);
+  x1 = _mm_add_epi32(x1, k__DCT_CONST_ROUNDING);
+  x2 = _mm_add_epi32(x2, k__DCT_CONST_ROUNDING);
+  x3 = _mm_add_epi32(x3, k__DCT_CONST_ROUNDING);
+  x4 = _mm_add_epi32(x4, k__DCT_CONST_ROUNDING);
+  x5 = _mm_add_epi32(x5, k__DCT_CONST_ROUNDING);
+  x6 = _mm_add_epi32(x6, k__DCT_CONST_ROUNDING);
+  x7 = _mm_add_epi32(x7, k__DCT_CONST_ROUNDING);
+
+  u0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+  u1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+  u2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+  u3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+  u4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+  u5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+  u6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+  u7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+
+  y0 = _mm_add_epi32(y0, k__DCT_CONST_ROUNDING);
+  y1 = _mm_add_epi32(y1, k__DCT_CONST_ROUNDING);
+  y2 = _mm_add_epi32(y2, k__DCT_CONST_ROUNDING);
+  y3 = _mm_add_epi32(y3, k__DCT_CONST_ROUNDING);
+  y4 = _mm_add_epi32(y4, k__DCT_CONST_ROUNDING);
+  y5 = _mm_add_epi32(y5, k__DCT_CONST_ROUNDING);
+  y6 = _mm_add_epi32(y6, k__DCT_CONST_ROUNDING);
+  y7 = _mm_add_epi32(y7, k__DCT_CONST_ROUNDING);
+
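+  // Shift right by DCT_CONST_BITS to complete the rounding shift.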
+  v0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+  v1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+  v2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+  v3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+  v4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+  v5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+  v6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+  v7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+
+  x0 = _mm_srai_epi32(x0, DCT_CONST_BITS);
+  x1 = _mm_srai_epi32(x1, DCT_CONST_BITS);
+  x2 = _mm_srai_epi32(x2, DCT_CONST_BITS);
+  x3 = _mm_srai_epi32(x3, DCT_CONST_BITS);
+  x4 = _mm_srai_epi32(x4, DCT_CONST_BITS);
+  x5 = _mm_srai_epi32(x5, DCT_CONST_BITS);
+  x6 = _mm_srai_epi32(x6, DCT_CONST_BITS);
+  x7 = _mm_srai_epi32(x7, DCT_CONST_BITS);
+
+  u0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+  u1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+  u2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+  u3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+  u4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
+  u5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
+  u6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
+  u7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
+
+  y0 = _mm_srai_epi32(y0, DCT_CONST_BITS);
+  y1 = _mm_srai_epi32(y1, DCT_CONST_BITS);
+  y2 = _mm_srai_epi32(y2, DCT_CONST_BITS);
+  y3 = _mm_srai_epi32(y3, DCT_CONST_BITS);
+  y4 = _mm_srai_epi32(y4, DCT_CONST_BITS);
+  y5 = _mm_srai_epi32(y5, DCT_CONST_BITS);
+  y6 = _mm_srai_epi32(y6, DCT_CONST_BITS);
+  y7 = _mm_srai_epi32(y7, DCT_CONST_BITS);
+
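+  // Pack back to int16 with signed saturation.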
+  in[0] = _mm_packs_epi32(v0, x0);
+  in[1] = _mm_packs_epi32(v1, x1);
+  in[2] = _mm_packs_epi32(v2, x2);
+  in[3] = _mm_packs_epi32(v3, x3);
+  in[4] = _mm_packs_epi32(v4, x4);
+  in[5] = _mm_packs_epi32(v5, x5);
+  in[6] = _mm_packs_epi32(v6, x6);
+  in[7] = _mm_packs_epi32(v7, x7);
+
+  in[8] = _mm_packs_epi32(u0, y0);
+  in[9] = _mm_packs_epi32(u1, y1);
+  in[10] = _mm_packs_epi32(u2, y2);
+  in[11] = _mm_packs_epi32(u3, y3);
+  in[12] = _mm_packs_epi32(u4, y4);
+  in[13] = _mm_packs_epi32(u5, y5);
+  in[14] = _mm_packs_epi32(u6, y6);
+  in[15] = _mm_packs_epi32(u7, y7);
+}
+#endif  // CONFIG_EXT_TX
+
 #endif  // AOM_DSP_X86_TXFM_COMMON_SSE2_H_
diff --git a/av1/common/x86/idct_intrin_sse2.c b/av1/common/x86/idct_intrin_sse2.c
index 293fedd..1a66dd5 100644
--- a/av1/common/x86/idct_intrin_sse2.c
+++ b/av1/common/x86/idct_intrin_sse2.c
@@ -243,202 +243,12 @@
 }
 
 #if CONFIG_EXT_TX
-static void iidtx16_8col(__m128i *in) {
-  const __m128i k__zero_epi16 = _mm_set1_epi16((int16_t)0);
-  const __m128i k__sqrt2_epi16 = _mm_set1_epi16((int16_t)Sqrt2);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
-  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
-  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
-  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
-  __m128i y0, y1, y2, y3, y4, y5, y6, y7;
-
-  in[0] = _mm_slli_epi16(in[0], 1);
-  in[1] = _mm_slli_epi16(in[1], 1);
-  in[2] = _mm_slli_epi16(in[2], 1);
-  in[3] = _mm_slli_epi16(in[3], 1);
-  in[4] = _mm_slli_epi16(in[4], 1);
-  in[5] = _mm_slli_epi16(in[5], 1);
-  in[6] = _mm_slli_epi16(in[6], 1);
-  in[7] = _mm_slli_epi16(in[7], 1);
-  in[8] = _mm_slli_epi16(in[8], 1);
-  in[9] = _mm_slli_epi16(in[9], 1);
-  in[10] = _mm_slli_epi16(in[10], 1);
-  in[11] = _mm_slli_epi16(in[11], 1);
-  in[12] = _mm_slli_epi16(in[12], 1);
-  in[13] = _mm_slli_epi16(in[13], 1);
-  in[14] = _mm_slli_epi16(in[14], 1);
-  in[15] = _mm_slli_epi16(in[15], 1);
-
-  v0 = _mm_unpacklo_epi16(in[0], k__zero_epi16);
-  v1 = _mm_unpacklo_epi16(in[1], k__zero_epi16);
-  v2 = _mm_unpacklo_epi16(in[2], k__zero_epi16);
-  v3 = _mm_unpacklo_epi16(in[3], k__zero_epi16);
-  v4 = _mm_unpacklo_epi16(in[4], k__zero_epi16);
-  v5 = _mm_unpacklo_epi16(in[5], k__zero_epi16);
-  v6 = _mm_unpacklo_epi16(in[6], k__zero_epi16);
-  v7 = _mm_unpacklo_epi16(in[7], k__zero_epi16);
-
-  u0 = _mm_unpacklo_epi16(in[8], k__zero_epi16);
-  u1 = _mm_unpacklo_epi16(in[9], k__zero_epi16);
-  u2 = _mm_unpacklo_epi16(in[10], k__zero_epi16);
-  u3 = _mm_unpacklo_epi16(in[11], k__zero_epi16);
-  u4 = _mm_unpacklo_epi16(in[12], k__zero_epi16);
-  u5 = _mm_unpacklo_epi16(in[13], k__zero_epi16);
-  u6 = _mm_unpacklo_epi16(in[14], k__zero_epi16);
-  u7 = _mm_unpacklo_epi16(in[15], k__zero_epi16);
-
-  x0 = _mm_unpackhi_epi16(in[0], k__zero_epi16);
-  x1 = _mm_unpackhi_epi16(in[1], k__zero_epi16);
-  x2 = _mm_unpackhi_epi16(in[2], k__zero_epi16);
-  x3 = _mm_unpackhi_epi16(in[3], k__zero_epi16);
-  x4 = _mm_unpackhi_epi16(in[4], k__zero_epi16);
-  x5 = _mm_unpackhi_epi16(in[5], k__zero_epi16);
-  x6 = _mm_unpackhi_epi16(in[6], k__zero_epi16);
-  x7 = _mm_unpackhi_epi16(in[7], k__zero_epi16);
-
-  y0 = _mm_unpackhi_epi16(in[8], k__zero_epi16);
-  y1 = _mm_unpackhi_epi16(in[9], k__zero_epi16);
-  y2 = _mm_unpackhi_epi16(in[10], k__zero_epi16);
-  y3 = _mm_unpackhi_epi16(in[11], k__zero_epi16);
-  y4 = _mm_unpackhi_epi16(in[12], k__zero_epi16);
-  y5 = _mm_unpackhi_epi16(in[13], k__zero_epi16);
-  y6 = _mm_unpackhi_epi16(in[14], k__zero_epi16);
-  y7 = _mm_unpackhi_epi16(in[15], k__zero_epi16);
-
-  v0 = _mm_madd_epi16(v0, k__sqrt2_epi16);
-  v1 = _mm_madd_epi16(v1, k__sqrt2_epi16);
-  v2 = _mm_madd_epi16(v2, k__sqrt2_epi16);
-  v3 = _mm_madd_epi16(v3, k__sqrt2_epi16);
-  v4 = _mm_madd_epi16(v4, k__sqrt2_epi16);
-  v5 = _mm_madd_epi16(v5, k__sqrt2_epi16);
-  v6 = _mm_madd_epi16(v6, k__sqrt2_epi16);
-  v7 = _mm_madd_epi16(v7, k__sqrt2_epi16);
-
-  x0 = _mm_madd_epi16(x0, k__sqrt2_epi16);
-  x1 = _mm_madd_epi16(x1, k__sqrt2_epi16);
-  x2 = _mm_madd_epi16(x2, k__sqrt2_epi16);
-  x3 = _mm_madd_epi16(x3, k__sqrt2_epi16);
-  x4 = _mm_madd_epi16(x4, k__sqrt2_epi16);
-  x5 = _mm_madd_epi16(x5, k__sqrt2_epi16);
-  x6 = _mm_madd_epi16(x6, k__sqrt2_epi16);
-  x7 = _mm_madd_epi16(x7, k__sqrt2_epi16);
-
-  u0 = _mm_madd_epi16(u0, k__sqrt2_epi16);
-  u1 = _mm_madd_epi16(u1, k__sqrt2_epi16);
-  u2 = _mm_madd_epi16(u2, k__sqrt2_epi16);
-  u3 = _mm_madd_epi16(u3, k__sqrt2_epi16);
-  u4 = _mm_madd_epi16(u4, k__sqrt2_epi16);
-  u5 = _mm_madd_epi16(u5, k__sqrt2_epi16);
-  u6 = _mm_madd_epi16(u6, k__sqrt2_epi16);
-  u7 = _mm_madd_epi16(u7, k__sqrt2_epi16);
-
-  y0 = _mm_madd_epi16(y0, k__sqrt2_epi16);
-  y1 = _mm_madd_epi16(y1, k__sqrt2_epi16);
-  y2 = _mm_madd_epi16(y2, k__sqrt2_epi16);
-  y3 = _mm_madd_epi16(y3, k__sqrt2_epi16);
-  y4 = _mm_madd_epi16(y4, k__sqrt2_epi16);
-  y5 = _mm_madd_epi16(y5, k__sqrt2_epi16);
-  y6 = _mm_madd_epi16(y6, k__sqrt2_epi16);
-  y7 = _mm_madd_epi16(y7, k__sqrt2_epi16);
-
-  v0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
-  v1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
-  v2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
-  v3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
-  v4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
-  v5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
-  v6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
-  v7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
-
-  x0 = _mm_add_epi32(x0, k__DCT_CONST_ROUNDING);
-  x1 = _mm_add_epi32(x1, k__DCT_CONST_ROUNDING);
-  x2 = _mm_add_epi32(x2, k__DCT_CONST_ROUNDING);
-  x3 = _mm_add_epi32(x3, k__DCT_CONST_ROUNDING);
-  x4 = _mm_add_epi32(x4, k__DCT_CONST_ROUNDING);
-  x5 = _mm_add_epi32(x5, k__DCT_CONST_ROUNDING);
-  x6 = _mm_add_epi32(x6, k__DCT_CONST_ROUNDING);
-  x7 = _mm_add_epi32(x7, k__DCT_CONST_ROUNDING);
-
-  u0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-  u1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-  u2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-  u3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-  u4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
-  u5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
-  u6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
-  u7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
-
-  y0 = _mm_add_epi32(y0, k__DCT_CONST_ROUNDING);
-  y1 = _mm_add_epi32(y1, k__DCT_CONST_ROUNDING);
-  y2 = _mm_add_epi32(y2, k__DCT_CONST_ROUNDING);
-  y3 = _mm_add_epi32(y3, k__DCT_CONST_ROUNDING);
-  y4 = _mm_add_epi32(y4, k__DCT_CONST_ROUNDING);
-  y5 = _mm_add_epi32(y5, k__DCT_CONST_ROUNDING);
-  y6 = _mm_add_epi32(y6, k__DCT_CONST_ROUNDING);
-  y7 = _mm_add_epi32(y7, k__DCT_CONST_ROUNDING);
-
-  v0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-  v1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-  v2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-  v3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-  v4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-  v5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-  v6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-  v7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-
-  x0 = _mm_srai_epi32(x0, DCT_CONST_BITS);
-  x1 = _mm_srai_epi32(x1, DCT_CONST_BITS);
-  x2 = _mm_srai_epi32(x2, DCT_CONST_BITS);
-  x3 = _mm_srai_epi32(x3, DCT_CONST_BITS);
-  x4 = _mm_srai_epi32(x4, DCT_CONST_BITS);
-  x5 = _mm_srai_epi32(x5, DCT_CONST_BITS);
-  x6 = _mm_srai_epi32(x6, DCT_CONST_BITS);
-  x7 = _mm_srai_epi32(x7, DCT_CONST_BITS);
-
-  u0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
-  u1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
-  u2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
-  u3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
-  u4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
-  u5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
-  u6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
-  u7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
-
-  y0 = _mm_srai_epi32(y0, DCT_CONST_BITS);
-  y1 = _mm_srai_epi32(y1, DCT_CONST_BITS);
-  y2 = _mm_srai_epi32(y2, DCT_CONST_BITS);
-  y3 = _mm_srai_epi32(y3, DCT_CONST_BITS);
-  y4 = _mm_srai_epi32(y4, DCT_CONST_BITS);
-  y5 = _mm_srai_epi32(y5, DCT_CONST_BITS);
-  y6 = _mm_srai_epi32(y6, DCT_CONST_BITS);
-  y7 = _mm_srai_epi32(y7, DCT_CONST_BITS);
-
-  in[0] = _mm_packs_epi32(v0, x0);
-  in[1] = _mm_packs_epi32(v1, x1);
-  in[2] = _mm_packs_epi32(v2, x2);
-  in[3] = _mm_packs_epi32(v3, x3);
-  in[4] = _mm_packs_epi32(v4, x4);
-  in[5] = _mm_packs_epi32(v5, x5);
-  in[6] = _mm_packs_epi32(v6, x6);
-  in[7] = _mm_packs_epi32(v7, x7);
-
-  in[8] = _mm_packs_epi32(u0, y0);
-  in[9] = _mm_packs_epi32(u1, y1);
-  in[10] = _mm_packs_epi32(u2, y2);
-  in[11] = _mm_packs_epi32(u3, y3);
-  in[12] = _mm_packs_epi32(u4, y4);
-  in[13] = _mm_packs_epi32(u5, y5);
-  in[14] = _mm_packs_epi32(u6, y6);
-  in[15] = _mm_packs_epi32(u7, y7);
-}
-
 static void iidtx16_sse2(__m128i *in0, __m128i *in1) {
   array_transpose_16x16(in0, in1);
-  iidtx16_8col(in0);
-  iidtx16_8col(in1);
+  idtx16_8col(in0);
+  idtx16_8col(in1);
 }
-#endif
+#endif  // CONFIG_EXT_TX
 
 void av1_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
                                int stride, int tx_type) {
@@ -762,7 +572,7 @@
     case H_DCT:
     case H_ADST:
     case H_FLIPADST:
-    case IDTX: iidtx16_8col(in); break;
+    case IDTX: idtx16_8col(in); break;
 #endif
     default: assert(0); break;
   }
@@ -888,7 +698,7 @@
     case V_FLIPADST:
     case V_ADST:
     case V_DCT:
-    case IDTX: iidtx16_8col(in); break;
+    case IDTX: idtx16_8col(in); break;
 #endif
     default: assert(0); break;
   }
diff --git a/av1/encoder/x86/dct_intrin_sse2.c b/av1/encoder/x86/dct_intrin_sse2.c
index f613dc0..44a11754 100644
--- a/av1/encoder/x86/dct_intrin_sse2.c
+++ b/av1/encoder/x86/dct_intrin_sse2.c
@@ -2326,199 +2326,9 @@
 }
 
 #if CONFIG_EXT_TX
-static void fidtx16_8col(__m128i *in) {
-  const __m128i k__zero_epi16 = _mm_set1_epi16((int16_t)0);
-  const __m128i k__sqrt2_epi16 = _mm_set1_epi16((int16_t)Sqrt2);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
-  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
-  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
-  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
-  __m128i y0, y1, y2, y3, y4, y5, y6, y7;
-
-  in[0] = _mm_slli_epi16(in[0], 1);
-  in[1] = _mm_slli_epi16(in[1], 1);
-  in[2] = _mm_slli_epi16(in[2], 1);
-  in[3] = _mm_slli_epi16(in[3], 1);
-  in[4] = _mm_slli_epi16(in[4], 1);
-  in[5] = _mm_slli_epi16(in[5], 1);
-  in[6] = _mm_slli_epi16(in[6], 1);
-  in[7] = _mm_slli_epi16(in[7], 1);
-  in[8] = _mm_slli_epi16(in[8], 1);
-  in[9] = _mm_slli_epi16(in[9], 1);
-  in[10] = _mm_slli_epi16(in[10], 1);
-  in[11] = _mm_slli_epi16(in[11], 1);
-  in[12] = _mm_slli_epi16(in[12], 1);
-  in[13] = _mm_slli_epi16(in[13], 1);
-  in[14] = _mm_slli_epi16(in[14], 1);
-  in[15] = _mm_slli_epi16(in[15], 1);
-
-  v0 = _mm_unpacklo_epi16(in[0], k__zero_epi16);
-  v1 = _mm_unpacklo_epi16(in[1], k__zero_epi16);
-  v2 = _mm_unpacklo_epi16(in[2], k__zero_epi16);
-  v3 = _mm_unpacklo_epi16(in[3], k__zero_epi16);
-  v4 = _mm_unpacklo_epi16(in[4], k__zero_epi16);
-  v5 = _mm_unpacklo_epi16(in[5], k__zero_epi16);
-  v6 = _mm_unpacklo_epi16(in[6], k__zero_epi16);
-  v7 = _mm_unpacklo_epi16(in[7], k__zero_epi16);
-
-  u0 = _mm_unpacklo_epi16(in[8], k__zero_epi16);
-  u1 = _mm_unpacklo_epi16(in[9], k__zero_epi16);
-  u2 = _mm_unpacklo_epi16(in[10], k__zero_epi16);
-  u3 = _mm_unpacklo_epi16(in[11], k__zero_epi16);
-  u4 = _mm_unpacklo_epi16(in[12], k__zero_epi16);
-  u5 = _mm_unpacklo_epi16(in[13], k__zero_epi16);
-  u6 = _mm_unpacklo_epi16(in[14], k__zero_epi16);
-  u7 = _mm_unpacklo_epi16(in[15], k__zero_epi16);
-
-  x0 = _mm_unpackhi_epi16(in[0], k__zero_epi16);
-  x1 = _mm_unpackhi_epi16(in[1], k__zero_epi16);
-  x2 = _mm_unpackhi_epi16(in[2], k__zero_epi16);
-  x3 = _mm_unpackhi_epi16(in[3], k__zero_epi16);
-  x4 = _mm_unpackhi_epi16(in[4], k__zero_epi16);
-  x5 = _mm_unpackhi_epi16(in[5], k__zero_epi16);
-  x6 = _mm_unpackhi_epi16(in[6], k__zero_epi16);
-  x7 = _mm_unpackhi_epi16(in[7], k__zero_epi16);
-
-  y0 = _mm_unpackhi_epi16(in[8], k__zero_epi16);
-  y1 = _mm_unpackhi_epi16(in[9], k__zero_epi16);
-  y2 = _mm_unpackhi_epi16(in[10], k__zero_epi16);
-  y3 = _mm_unpackhi_epi16(in[11], k__zero_epi16);
-  y4 = _mm_unpackhi_epi16(in[12], k__zero_epi16);
-  y5 = _mm_unpackhi_epi16(in[13], k__zero_epi16);
-  y6 = _mm_unpackhi_epi16(in[14], k__zero_epi16);
-  y7 = _mm_unpackhi_epi16(in[15], k__zero_epi16);
-
-  v0 = _mm_madd_epi16(v0, k__sqrt2_epi16);
-  v1 = _mm_madd_epi16(v1, k__sqrt2_epi16);
-  v2 = _mm_madd_epi16(v2, k__sqrt2_epi16);
-  v3 = _mm_madd_epi16(v3, k__sqrt2_epi16);
-  v4 = _mm_madd_epi16(v4, k__sqrt2_epi16);
-  v5 = _mm_madd_epi16(v5, k__sqrt2_epi16);
-  v6 = _mm_madd_epi16(v6, k__sqrt2_epi16);
-  v7 = _mm_madd_epi16(v7, k__sqrt2_epi16);
-
-  x0 = _mm_madd_epi16(x0, k__sqrt2_epi16);
-  x1 = _mm_madd_epi16(x1, k__sqrt2_epi16);
-  x2 = _mm_madd_epi16(x2, k__sqrt2_epi16);
-  x3 = _mm_madd_epi16(x3, k__sqrt2_epi16);
-  x4 = _mm_madd_epi16(x4, k__sqrt2_epi16);
-  x5 = _mm_madd_epi16(x5, k__sqrt2_epi16);
-  x6 = _mm_madd_epi16(x6, k__sqrt2_epi16);
-  x7 = _mm_madd_epi16(x7, k__sqrt2_epi16);
-
-  u0 = _mm_madd_epi16(u0, k__sqrt2_epi16);
-  u1 = _mm_madd_epi16(u1, k__sqrt2_epi16);
-  u2 = _mm_madd_epi16(u2, k__sqrt2_epi16);
-  u3 = _mm_madd_epi16(u3, k__sqrt2_epi16);
-  u4 = _mm_madd_epi16(u4, k__sqrt2_epi16);
-  u5 = _mm_madd_epi16(u5, k__sqrt2_epi16);
-  u6 = _mm_madd_epi16(u6, k__sqrt2_epi16);
-  u7 = _mm_madd_epi16(u7, k__sqrt2_epi16);
-
-  y0 = _mm_madd_epi16(y0, k__sqrt2_epi16);
-  y1 = _mm_madd_epi16(y1, k__sqrt2_epi16);
-  y2 = _mm_madd_epi16(y2, k__sqrt2_epi16);
-  y3 = _mm_madd_epi16(y3, k__sqrt2_epi16);
-  y4 = _mm_madd_epi16(y4, k__sqrt2_epi16);
-  y5 = _mm_madd_epi16(y5, k__sqrt2_epi16);
-  y6 = _mm_madd_epi16(y6, k__sqrt2_epi16);
-  y7 = _mm_madd_epi16(y7, k__sqrt2_epi16);
-
-  v0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
-  v1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
-  v2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
-  v3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
-  v4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
-  v5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
-  v6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
-  v7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
-
-  x0 = _mm_add_epi32(x0, k__DCT_CONST_ROUNDING);
-  x1 = _mm_add_epi32(x1, k__DCT_CONST_ROUNDING);
-  x2 = _mm_add_epi32(x2, k__DCT_CONST_ROUNDING);
-  x3 = _mm_add_epi32(x3, k__DCT_CONST_ROUNDING);
-  x4 = _mm_add_epi32(x4, k__DCT_CONST_ROUNDING);
-  x5 = _mm_add_epi32(x5, k__DCT_CONST_ROUNDING);
-  x6 = _mm_add_epi32(x6, k__DCT_CONST_ROUNDING);
-  x7 = _mm_add_epi32(x7, k__DCT_CONST_ROUNDING);
-
-  u0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-  u1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-  u2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-  u3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-  u4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
-  u5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
-  u6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
-  u7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
-
-  y0 = _mm_add_epi32(y0, k__DCT_CONST_ROUNDING);
-  y1 = _mm_add_epi32(y1, k__DCT_CONST_ROUNDING);
-  y2 = _mm_add_epi32(y2, k__DCT_CONST_ROUNDING);
-  y3 = _mm_add_epi32(y3, k__DCT_CONST_ROUNDING);
-  y4 = _mm_add_epi32(y4, k__DCT_CONST_ROUNDING);
-  y5 = _mm_add_epi32(y5, k__DCT_CONST_ROUNDING);
-  y6 = _mm_add_epi32(y6, k__DCT_CONST_ROUNDING);
-  y7 = _mm_add_epi32(y7, k__DCT_CONST_ROUNDING);
-
-  v0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-  v1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-  v2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-  v3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-  v4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-  v5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-  v6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-  v7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-
-  x0 = _mm_srai_epi32(x0, DCT_CONST_BITS);
-  x1 = _mm_srai_epi32(x1, DCT_CONST_BITS);
-  x2 = _mm_srai_epi32(x2, DCT_CONST_BITS);
-  x3 = _mm_srai_epi32(x3, DCT_CONST_BITS);
-  x4 = _mm_srai_epi32(x4, DCT_CONST_BITS);
-  x5 = _mm_srai_epi32(x5, DCT_CONST_BITS);
-  x6 = _mm_srai_epi32(x6, DCT_CONST_BITS);
-  x7 = _mm_srai_epi32(x7, DCT_CONST_BITS);
-
-  u0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
-  u1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
-  u2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
-  u3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
-  u4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
-  u5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
-  u6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
-  u7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
-
-  y0 = _mm_srai_epi32(y0, DCT_CONST_BITS);
-  y1 = _mm_srai_epi32(y1, DCT_CONST_BITS);
-  y2 = _mm_srai_epi32(y2, DCT_CONST_BITS);
-  y3 = _mm_srai_epi32(y3, DCT_CONST_BITS);
-  y4 = _mm_srai_epi32(y4, DCT_CONST_BITS);
-  y5 = _mm_srai_epi32(y5, DCT_CONST_BITS);
-  y6 = _mm_srai_epi32(y6, DCT_CONST_BITS);
-  y7 = _mm_srai_epi32(y7, DCT_CONST_BITS);
-
-  in[0] = _mm_packs_epi32(v0, x0);
-  in[1] = _mm_packs_epi32(v1, x1);
-  in[2] = _mm_packs_epi32(v2, x2);
-  in[3] = _mm_packs_epi32(v3, x3);
-  in[4] = _mm_packs_epi32(v4, x4);
-  in[5] = _mm_packs_epi32(v5, x5);
-  in[6] = _mm_packs_epi32(v6, x6);
-  in[7] = _mm_packs_epi32(v7, x7);
-
-  in[8] = _mm_packs_epi32(u0, y0);
-  in[9] = _mm_packs_epi32(u1, y1);
-  in[10] = _mm_packs_epi32(u2, y2);
-  in[11] = _mm_packs_epi32(u3, y3);
-  in[12] = _mm_packs_epi32(u4, y4);
-  in[13] = _mm_packs_epi32(u5, y5);
-  in[14] = _mm_packs_epi32(u6, y6);
-  in[15] = _mm_packs_epi32(u7, y7);
-}
-
 static void fidtx16_sse2(__m128i *in0, __m128i *in1) {
-  fidtx16_8col(in0);
-  fidtx16_8col(in1);
+  idtx16_8col(in0);
+  idtx16_8col(in1);
   array_transpose_16x16(in0, in1);
 }
 #endif  // CONFIG_EXT_TX
@@ -3248,7 +3058,7 @@
       fidtx8_sse2(t);
       fidtx8_sse2(b);
       row_8x16_rounding(in, 2);
-      fidtx16_8col(in);
+      idtx16_8col(in);
       break;
     case V_DCT:
       load_buffer_8x16(input, in, stride, 0, 0);
@@ -3266,7 +3076,7 @@
       fdct8_sse2(t);
       fdct8_sse2(b);
       row_8x16_rounding(in, 2);
-      fidtx16_8col(in);
+      idtx16_8col(in);
       break;
     case V_ADST:
       load_buffer_8x16(input, in, stride, 0, 0);
@@ -3284,7 +3094,7 @@
       fadst8_sse2(t);
       fadst8_sse2(b);
       row_8x16_rounding(in, 2);
-      fidtx16_8col(in);
+      idtx16_8col(in);
       break;
     case V_FLIPADST:
       load_buffer_8x16(input, in, stride, 1, 0);
@@ -3302,7 +3112,7 @@
       fadst8_sse2(t);
       fadst8_sse2(b);
       row_8x16_rounding(in, 2);
-      fidtx16_8col(in);
+      idtx16_8col(in);
       break;
 #endif
     default: assert(0); break;
@@ -3410,14 +3220,14 @@
       fidtx8_sse2(l);
       fidtx8_sse2(r);
       col_16x8_rounding(in, 2);
-      fidtx16_8col(in);
+      idtx16_8col(in);
       break;
     case V_DCT:
       load_buffer_16x8(input, in, stride, 0, 0);
       fdct8_sse2(l);
       fdct8_sse2(r);
       col_16x8_rounding(in, 2);
-      fidtx16_8col(in);
+      idtx16_8col(in);
       break;
     case H_DCT:
       load_buffer_16x8(input, in, stride, 0, 0);
@@ -3431,7 +3241,7 @@
       fadst8_sse2(l);
       fadst8_sse2(r);
       col_16x8_rounding(in, 2);
-      fidtx16_8col(in);
+      idtx16_8col(in);
       break;
     case H_ADST:
       load_buffer_16x8(input, in, stride, 0, 0);
@@ -3445,7 +3255,7 @@
       fadst8_sse2(l);
       fadst8_sse2(r);
       col_16x8_rounding(in, 2);
-      fidtx16_8col(in);
+      idtx16_8col(in);
       break;
     case H_FLIPADST:
       load_buffer_16x8(input, in, stride, 0, 1);