8x8/16x16 HT types V_DCT to H_FLIPADST SSE2 optimization

- Added functions fidtx8_sse2() and fidtx16_sse2().
- Turned on vp10_fht8x8_sse2()/vp10_fht16x16_sse2() for new types.
- Updated 8x8/16x16 unit tests for accuracy/speed.
- Over 20K random input blocks, cycling through the tx types from
  V_DCT to H_FLIPADST, the SSE2 speed improvement is:
  8x8: ~131%
  16x16: ~66%

Change-Id: Ibbb707e932a08fec3b1f423a7dab280a1d696c9a
diff --git a/test/vp10_fht16x16_test.cc b/test/vp10_fht16x16_test.cc
index 3967149..7994d7f 100644
--- a/test/vp10_fht16x16_test.cc
+++ b/test/vp10_fht16x16_test.cc
@@ -70,6 +70,61 @@
   RunCoeffCheck();
 }
 
+#if CONFIG_EXT_TX && !CONFIG_VP9_HIGHBITDEPTH
+// Timing-only loop: exercises the C 16x16 transform, asserts nothing.
+TEST(VP10Trans16x16HTSpeedTest, C_version) {
+  const int kNumBlocks = 20000;
+  const int kNumCoeffs = 256;
+  const int kStride = 16;
+  const int kBitDepth = 8;
+  const int kPixelMask = (1 << kBitDepth) - 1;
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  int16_t *const input = new int16_t[kNumCoeffs];
+  tran_low_t *const output = new tran_low_t[kNumCoeffs];
+
+  for (int block = 0; block < kNumBlocks; ++block) {
+    for (int k = 0; k < kNumCoeffs; ++k) {
+      input[k] = (rnd.Rand8() & kPixelMask) - (rnd.Rand8() & kPixelMask);
+    }
+    for (int tx_type = V_DCT; tx_type <= H_FLIPADST; ++tx_type) {
+      vp10_fht16x16_c(input, output, kStride, tx_type);
+    }
+  }
+
+  delete[] input;
+  delete[] output;
+}
+#endif  // CONFIG_EXT_TX && !CONFIG_VP9_HIGHBITDEPTH
+
+#if HAVE_SSE2 && CONFIG_EXT_TX && !CONFIG_VP9_HIGHBITDEPTH
+// Timing-only loop: exercises the SSE2 16x16 transform, asserts nothing.
+TEST(VP10Trans16x16HTSpeedTest, SSE2_version) {
+  const int kNumBlocks = 20000;
+  const int kNumCoeffs = 256;
+  const int kStride = 16;
+  const int kBitDepth = 8;
+  const int kPixelMask = (1 << kBitDepth) - 1;
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  // Both buffers are requested with 16-byte alignment.
+  int16_t *const input = reinterpret_cast<int16_t *>(
+      vpx_memalign(16, sizeof(int16_t) * kNumCoeffs));
+  tran_low_t *const output = reinterpret_cast<tran_low_t *>(
+      vpx_memalign(16, sizeof(tran_low_t) * kNumCoeffs));
+
+  for (int block = 0; block < kNumBlocks; ++block) {
+    for (int k = 0; k < kNumCoeffs; ++k) {
+      input[k] = (rnd.Rand8() & kPixelMask) - (rnd.Rand8() & kPixelMask);
+    }
+    for (int tx_type = V_DCT; tx_type <= H_FLIPADST; ++tx_type) {
+      vp10_fht16x16_sse2(input, output, kStride, tx_type);
+    }
+  }
+
+  vpx_free(input);
+  vpx_free(output);
+}
+#endif  // HAVE_SSE2 && CONFIG_EXT_TX && !CONFIG_VP9_HIGHBITDEPTH
+
 using std::tr1::make_tuple;
 
 #if HAVE_SSE2
@@ -103,6 +158,18 @@
       make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 7,
                  VPX_BITS_8, 256),
       make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 8,
+                 VPX_BITS_8, 256),
+      make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 10,
+                 VPX_BITS_8, 256),
+      make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 11,
+                 VPX_BITS_8, 256),
+      make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 12,
+                 VPX_BITS_8, 256),
+      make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 13,
+                 VPX_BITS_8, 256),
+      make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 14,
+                 VPX_BITS_8, 256),
+      make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 15,
                  VPX_BITS_8, 256)));
 #endif  // !CONFIG_EXT_TX
 #endif  // HAVE_SSE2
diff --git a/test/vp10_fht8x8_test.cc b/test/vp10_fht8x8_test.cc
index 96f5632..07ab61d 100644
--- a/test/vp10_fht8x8_test.cc
+++ b/test/vp10_fht8x8_test.cc
@@ -69,6 +69,61 @@
   RunCoeffCheck();
 }
 
+#if CONFIG_EXT_TX && !CONFIG_VP9_HIGHBITDEPTH
+// Timing-only loop: exercises the C 8x8 transform, asserts nothing.
+TEST(VP10Trans8x8HTSpeedTest, C_version) {
+  const int kNumBlocks = 20000;
+  const int kNumCoeffs = 64;
+  const int kStride = 8;
+  const int kBitDepth = 8;
+  const int kPixelMask = (1 << kBitDepth) - 1;
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  int16_t *const input = new int16_t[kNumCoeffs];
+  tran_low_t *const output = new tran_low_t[kNumCoeffs];
+
+  for (int block = 0; block < kNumBlocks; ++block) {
+    for (int k = 0; k < kNumCoeffs; ++k) {
+      input[k] = (rnd.Rand8() & kPixelMask) - (rnd.Rand8() & kPixelMask);
+    }
+    for (int tx_type = V_DCT; tx_type <= H_FLIPADST; ++tx_type) {
+      vp10_fht8x8_c(input, output, kStride, tx_type);
+    }
+  }
+
+  delete[] input;
+  delete[] output;
+}
+#endif  // CONFIG_EXT_TX && !CONFIG_VP9_HIGHBITDEPTH
+
+#if HAVE_SSE2 && CONFIG_EXT_TX && !CONFIG_VP9_HIGHBITDEPTH
+// Timing-only loop: exercises the SSE2 8x8 transform, asserts nothing.
+TEST(VP10Trans8x8HTSpeedTest, SSE2_version) {
+  const int kNumBlocks = 20000;
+  const int kNumCoeffs = 64;
+  const int kStride = 8;
+  const int kBitDepth = 8;
+  const int kPixelMask = (1 << kBitDepth) - 1;
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  // Both buffers are requested with 16-byte alignment.
+  int16_t *const input = reinterpret_cast<int16_t *>(
+      vpx_memalign(16, sizeof(int16_t) * kNumCoeffs));
+  tran_low_t *const output = reinterpret_cast<tran_low_t *>(
+      vpx_memalign(16, sizeof(tran_low_t) * kNumCoeffs));
+
+  for (int block = 0; block < kNumBlocks; ++block) {
+    for (int k = 0; k < kNumCoeffs; ++k) {
+      input[k] = (rnd.Rand8() & kPixelMask) - (rnd.Rand8() & kPixelMask);
+    }
+    for (int tx_type = V_DCT; tx_type <= H_FLIPADST; ++tx_type) {
+      vp10_fht8x8_sse2(input, output, kStride, tx_type);
+    }
+  }
+
+  vpx_free(input);
+  vpx_free(output);
+}
+#endif  // HAVE_SSE2 && CONFIG_EXT_TX && !CONFIG_VP9_HIGHBITDEPTH
+
 using std::tr1::make_tuple;
 
 #if HAVE_SSE2
@@ -102,6 +157,18 @@
       make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 7,
                  VPX_BITS_8, 64),
       make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 8,
+                 VPX_BITS_8, 64),
+      make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 10,
+                 VPX_BITS_8, 64),
+      make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 11,
+                 VPX_BITS_8, 64),
+      make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 12,
+                 VPX_BITS_8, 64),
+      make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 13,
+                 VPX_BITS_8, 64),
+      make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 14,
+                 VPX_BITS_8, 64),
+      make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 15,
                  VPX_BITS_8, 64)));
 #endif  // !CONFIG_EXT_TX
 #endif  // HAVE_SSE2
diff --git a/vp10/encoder/hybrid_fwd_txfm.c b/vp10/encoder/hybrid_fwd_txfm.c
index fd1cff2..2018960 100644
--- a/vp10/encoder/hybrid_fwd_txfm.c
+++ b/vp10/encoder/hybrid_fwd_txfm.c
@@ -54,8 +54,6 @@
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
-      vp10_fht4x4(src_diff, coeff, diff_stride, tx_type);
-      break;
     case V_DCT:
     case H_DCT:
     case V_ADST:
@@ -70,7 +68,6 @@
 #endif  // CONFIG_EXT_TX
     default:
       assert(0);
-      break;
   }
 }
 
@@ -93,15 +90,13 @@
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
-      vp10_fht8x8(src_diff, coeff, diff_stride, tx_type);
-      break;
     case V_DCT:
     case H_DCT:
     case V_ADST:
     case H_ADST:
     case V_FLIPADST:
     case H_FLIPADST:
-      vp10_fht8x8_c(src_diff, coeff, diff_stride, tx_type);
+      vp10_fht8x8(src_diff, coeff, diff_stride, tx_type);
       break;
     case IDTX:
       vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 8, tx_type);
@@ -109,7 +104,6 @@
 #endif  // CONFIG_EXT_TX
     default:
       assert(0);
-      break;
   }
 }
 
@@ -132,15 +126,13 @@
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
-      vp10_fht16x16(src_diff, coeff, diff_stride, tx_type);
-      break;
     case V_DCT:
     case H_DCT:
     case V_ADST:
     case H_ADST:
     case V_FLIPADST:
     case H_FLIPADST:
-      vp10_fht16x16_c(src_diff, coeff, diff_stride, tx_type);
+      vp10_fht16x16(src_diff, coeff, diff_stride, tx_type);
       break;
     case IDTX:
       vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 16, tx_type);
@@ -148,7 +140,6 @@
 #endif  // CONFIG_EXT_TX
     default:
       assert(0);
-      break;
   }
 }
 
diff --git a/vp10/encoder/x86/dct_sse2.c b/vp10/encoder/x86/dct_sse2.c
index 0e568dd..47422ad 100644
--- a/vp10/encoder/x86/dct_sse2.c
+++ b/vp10/encoder/x86/dct_sse2.c
@@ -1280,6 +1280,21 @@
   array_transpose_8x8(in, in);
 }
 
+#if CONFIG_EXT_TX
+// Identity pass over eight __m128i vectors: every 16-bit lane is doubled
+// with a left shift by one, then array_transpose_8x8() is applied with
+// `in` as both source and destination.
+static void fidtx8_sse2(__m128i *in) {
+  int i;
+
+  for (i = 0; i < 8; ++i) {
+    in[i] = _mm_slli_epi16(in[i], 1);
+  }
+
+  array_transpose_8x8(in, in);
+}
+#endif  // CONFIG_EXT_TX
+
 void vp10_fht8x8_sse2(const int16_t *input, tran_low_t *output,
                      int stride, int tx_type) {
   __m128i in[8];
@@ -1345,10 +1360,51 @@
       right_shift_8x8(in, 1);
       write_buffer_8x8(output, in, 8);
       break;
+    case V_DCT:
+      load_buffer_8x8(input, in, stride, 0, 0);
+      fdct8_sse2(in);
+      fidtx8_sse2(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
+      break;
+    case H_DCT:
+      load_buffer_8x8(input, in, stride, 0, 0);
+      fidtx8_sse2(in);
+      fdct8_sse2(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
+      break;
+    case V_ADST:
+      load_buffer_8x8(input, in, stride, 0, 0);
+      fadst8_sse2(in);
+      fidtx8_sse2(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
+      break;
+    case H_ADST:
+      load_buffer_8x8(input, in, stride, 0, 0);
+      fidtx8_sse2(in);
+      fadst8_sse2(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
+      break;
+    case V_FLIPADST:
+      load_buffer_8x8(input, in, stride, 1, 0);
+      fadst8_sse2(in);
+      fidtx8_sse2(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
+      break;
+    case H_FLIPADST:
+      load_buffer_8x8(input, in, stride, 0, 1);
+      fidtx8_sse2(in);
+      fadst8_sse2(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
+      break;
 #endif  // CONFIG_EXT_TX
     default:
       assert(0);
-      break;
   }
 }
 
@@ -2226,6 +2282,204 @@
   array_transpose_16x16(in0, in1);
 }
 
+#if CONFIG_EXT_TX
+static void fidtx16_8col(__m128i *in) {
+  const __m128i k__zero_epi16 = _mm_set1_epi16((int16_t)0);
+  const __m128i k__sqrt2_epi16 = _mm_set1_epi16((int16_t)Sqrt2);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  // Per 16-bit lane x: (2 * x * Sqrt2 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS.
+  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+  __m128i y0, y1, y2, y3, y4, y5, y6, y7;
+  // Double every 16-bit lane of in[0..15] first (the shift wraps at 16 bits).
+  in[0] = _mm_slli_epi16(in[0], 1);
+  in[1] = _mm_slli_epi16(in[1], 1);
+  in[2] = _mm_slli_epi16(in[2], 1);
+  in[3] = _mm_slli_epi16(in[3], 1);
+  in[4] = _mm_slli_epi16(in[4], 1);
+  in[5] = _mm_slli_epi16(in[5], 1);
+  in[6] = _mm_slli_epi16(in[6], 1);
+  in[7] = _mm_slli_epi16(in[7], 1);
+  in[8] = _mm_slli_epi16(in[8], 1);
+  in[9] = _mm_slli_epi16(in[9], 1);
+  in[10] = _mm_slli_epi16(in[10], 1);
+  in[11] = _mm_slli_epi16(in[11], 1);
+  in[12] = _mm_slli_epi16(in[12], 1);
+  in[13] = _mm_slli_epi16(in[13], 1);
+  in[14] = _mm_slli_epi16(in[14], 1);
+  in[15] = _mm_slli_epi16(in[15], 1);
+  // Low 4 lanes paired with zero lanes: v* from in[0..7], u* from in[8..15].
+  v0 = _mm_unpacklo_epi16(in[0], k__zero_epi16);
+  v1 = _mm_unpacklo_epi16(in[1], k__zero_epi16);
+  v2 = _mm_unpacklo_epi16(in[2], k__zero_epi16);
+  v3 = _mm_unpacklo_epi16(in[3], k__zero_epi16);
+  v4 = _mm_unpacklo_epi16(in[4], k__zero_epi16);
+  v5 = _mm_unpacklo_epi16(in[5], k__zero_epi16);
+  v6 = _mm_unpacklo_epi16(in[6], k__zero_epi16);
+  v7 = _mm_unpacklo_epi16(in[7], k__zero_epi16);
+
+  u0 = _mm_unpacklo_epi16(in[8], k__zero_epi16);
+  u1 = _mm_unpacklo_epi16(in[9], k__zero_epi16);
+  u2 = _mm_unpacklo_epi16(in[10], k__zero_epi16);
+  u3 = _mm_unpacklo_epi16(in[11], k__zero_epi16);
+  u4 = _mm_unpacklo_epi16(in[12], k__zero_epi16);
+  u5 = _mm_unpacklo_epi16(in[13], k__zero_epi16);
+  u6 = _mm_unpacklo_epi16(in[14], k__zero_epi16);
+  u7 = _mm_unpacklo_epi16(in[15], k__zero_epi16);
+  // High 4 lanes paired with zero lanes: x* from in[0..7], y* from in[8..15].
+  x0 = _mm_unpackhi_epi16(in[0], k__zero_epi16);
+  x1 = _mm_unpackhi_epi16(in[1], k__zero_epi16);
+  x2 = _mm_unpackhi_epi16(in[2], k__zero_epi16);
+  x3 = _mm_unpackhi_epi16(in[3], k__zero_epi16);
+  x4 = _mm_unpackhi_epi16(in[4], k__zero_epi16);
+  x5 = _mm_unpackhi_epi16(in[5], k__zero_epi16);
+  x6 = _mm_unpackhi_epi16(in[6], k__zero_epi16);
+  x7 = _mm_unpackhi_epi16(in[7], k__zero_epi16);
+
+  y0 = _mm_unpackhi_epi16(in[8], k__zero_epi16);
+  y1 = _mm_unpackhi_epi16(in[9], k__zero_epi16);
+  y2 = _mm_unpackhi_epi16(in[10], k__zero_epi16);
+  y3 = _mm_unpackhi_epi16(in[11], k__zero_epi16);
+  y4 = _mm_unpackhi_epi16(in[12], k__zero_epi16);
+  y5 = _mm_unpackhi_epi16(in[13], k__zero_epi16);
+  y6 = _mm_unpackhi_epi16(in[14], k__zero_epi16);
+  y7 = _mm_unpackhi_epi16(in[15], k__zero_epi16);
+  // madd on (lane, 0) pairs leaves lane * Sqrt2 as one signed 32-bit value.
+  v0 = _mm_madd_epi16(v0, k__sqrt2_epi16);
+  v1 = _mm_madd_epi16(v1, k__sqrt2_epi16);
+  v2 = _mm_madd_epi16(v2, k__sqrt2_epi16);
+  v3 = _mm_madd_epi16(v3, k__sqrt2_epi16);
+  v4 = _mm_madd_epi16(v4, k__sqrt2_epi16);
+  v5 = _mm_madd_epi16(v5, k__sqrt2_epi16);
+  v6 = _mm_madd_epi16(v6, k__sqrt2_epi16);
+  v7 = _mm_madd_epi16(v7, k__sqrt2_epi16);
+
+  x0 = _mm_madd_epi16(x0, k__sqrt2_epi16);
+  x1 = _mm_madd_epi16(x1, k__sqrt2_epi16);
+  x2 = _mm_madd_epi16(x2, k__sqrt2_epi16);
+  x3 = _mm_madd_epi16(x3, k__sqrt2_epi16);
+  x4 = _mm_madd_epi16(x4, k__sqrt2_epi16);
+  x5 = _mm_madd_epi16(x5, k__sqrt2_epi16);
+  x6 = _mm_madd_epi16(x6, k__sqrt2_epi16);
+  x7 = _mm_madd_epi16(x7, k__sqrt2_epi16);
+
+  u0 = _mm_madd_epi16(u0, k__sqrt2_epi16);
+  u1 = _mm_madd_epi16(u1, k__sqrt2_epi16);
+  u2 = _mm_madd_epi16(u2, k__sqrt2_epi16);
+  u3 = _mm_madd_epi16(u3, k__sqrt2_epi16);
+  u4 = _mm_madd_epi16(u4, k__sqrt2_epi16);
+  u5 = _mm_madd_epi16(u5, k__sqrt2_epi16);
+  u6 = _mm_madd_epi16(u6, k__sqrt2_epi16);
+  u7 = _mm_madd_epi16(u7, k__sqrt2_epi16);
+
+  y0 = _mm_madd_epi16(y0, k__sqrt2_epi16);
+  y1 = _mm_madd_epi16(y1, k__sqrt2_epi16);
+  y2 = _mm_madd_epi16(y2, k__sqrt2_epi16);
+  y3 = _mm_madd_epi16(y3, k__sqrt2_epi16);
+  y4 = _mm_madd_epi16(y4, k__sqrt2_epi16);
+  y5 = _mm_madd_epi16(y5, k__sqrt2_epi16);
+  y6 = _mm_madd_epi16(y6, k__sqrt2_epi16);
+  y7 = _mm_madd_epi16(y7, k__sqrt2_epi16);
+  // Add the rounding term to every 32-bit product ahead of the right shift.
+  v0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+  v1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+  v2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+  v3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+  v4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
+  v5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
+  v6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
+  v7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
+
+  x0 = _mm_add_epi32(x0, k__DCT_CONST_ROUNDING);
+  x1 = _mm_add_epi32(x1, k__DCT_CONST_ROUNDING);
+  x2 = _mm_add_epi32(x2, k__DCT_CONST_ROUNDING);
+  x3 = _mm_add_epi32(x3, k__DCT_CONST_ROUNDING);
+  x4 = _mm_add_epi32(x4, k__DCT_CONST_ROUNDING);
+  x5 = _mm_add_epi32(x5, k__DCT_CONST_ROUNDING);
+  x6 = _mm_add_epi32(x6, k__DCT_CONST_ROUNDING);
+  x7 = _mm_add_epi32(x7, k__DCT_CONST_ROUNDING);
+
+  u0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+  u1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+  u2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+  u3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+  u4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+  u5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+  u6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+  u7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+
+  y0 = _mm_add_epi32(y0, k__DCT_CONST_ROUNDING);
+  y1 = _mm_add_epi32(y1, k__DCT_CONST_ROUNDING);
+  y2 = _mm_add_epi32(y2, k__DCT_CONST_ROUNDING);
+  y3 = _mm_add_epi32(y3, k__DCT_CONST_ROUNDING);
+  y4 = _mm_add_epi32(y4, k__DCT_CONST_ROUNDING);
+  y5 = _mm_add_epi32(y5, k__DCT_CONST_ROUNDING);
+  y6 = _mm_add_epi32(y6, k__DCT_CONST_ROUNDING);
+  y7 = _mm_add_epi32(y7, k__DCT_CONST_ROUNDING);
+  // Arithmetic right shift of every 32-bit sum by DCT_CONST_BITS.
+  v0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+  v1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+  v2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+  v3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+  v4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+  v5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+  v6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+  v7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+
+  x0 = _mm_srai_epi32(x0, DCT_CONST_BITS);
+  x1 = _mm_srai_epi32(x1, DCT_CONST_BITS);
+  x2 = _mm_srai_epi32(x2, DCT_CONST_BITS);
+  x3 = _mm_srai_epi32(x3, DCT_CONST_BITS);
+  x4 = _mm_srai_epi32(x4, DCT_CONST_BITS);
+  x5 = _mm_srai_epi32(x5, DCT_CONST_BITS);
+  x6 = _mm_srai_epi32(x6, DCT_CONST_BITS);
+  x7 = _mm_srai_epi32(x7, DCT_CONST_BITS);
+
+  u0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+  u1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+  u2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+  u3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+  u4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
+  u5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
+  u6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
+  u7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
+
+  y0 = _mm_srai_epi32(y0, DCT_CONST_BITS);
+  y1 = _mm_srai_epi32(y1, DCT_CONST_BITS);
+  y2 = _mm_srai_epi32(y2, DCT_CONST_BITS);
+  y3 = _mm_srai_epi32(y3, DCT_CONST_BITS);
+  y4 = _mm_srai_epi32(y4, DCT_CONST_BITS);
+  y5 = _mm_srai_epi32(y5, DCT_CONST_BITS);
+  y6 = _mm_srai_epi32(y6, DCT_CONST_BITS);
+  y7 = _mm_srai_epi32(y7, DCT_CONST_BITS);
+  // Pack back to 16 bits (signed saturation); v*/u* fill the low four lanes.
+  in[0] = _mm_packs_epi32(v0, x0);
+  in[1] = _mm_packs_epi32(v1, x1);
+  in[2] = _mm_packs_epi32(v2, x2);
+  in[3] = _mm_packs_epi32(v3, x3);
+  in[4] = _mm_packs_epi32(v4, x4);
+  in[5] = _mm_packs_epi32(v5, x5);
+  in[6] = _mm_packs_epi32(v6, x6);
+  in[7] = _mm_packs_epi32(v7, x7);
+
+  in[8] = _mm_packs_epi32(u0, y0);
+  in[9] = _mm_packs_epi32(u1, y1);
+  in[10] = _mm_packs_epi32(u2, y2);
+  in[11] = _mm_packs_epi32(u3, y3);
+  in[12] = _mm_packs_epi32(u4, y4);
+  in[13] = _mm_packs_epi32(u5, y5);
+  in[14] = _mm_packs_epi32(u6, y6);
+  in[15] = _mm_packs_epi32(u7, y7);
+}
+
+static void fidtx16_sse2(__m128i *in0, __m128i *in1) {
+  fidtx16_8col(in0);  // scale the 16 vectors held in in0
+  fidtx16_8col(in1);  // scale the 16 vectors held in in1
+  array_transpose_16x16(in0, in1);  // then transpose across both halves
+}
+#endif  // CONFIG_EXT_TX
+
 void vp10_fht16x16_sse2(const int16_t *input, tran_low_t *output,
                        int stride, int tx_type) {
   __m128i in0[16], in1[16];
@@ -2291,6 +2545,48 @@
       fadst16_sse2(in0, in1);
       write_buffer_16x16(output, in0, in1, 16);
       break;
+    case V_DCT:
+      load_buffer_16x16(input, in0, in1, stride, 0, 0);
+      fdct16_sse2(in0, in1);
+      right_shift_16x16(in0, in1);
+      fidtx16_sse2(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
+    case H_DCT:
+      load_buffer_16x16(input, in0, in1, stride, 0, 0);
+      fidtx16_sse2(in0, in1);
+      right_shift_16x16(in0, in1);
+      fdct16_sse2(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
+    case V_ADST:
+      load_buffer_16x16(input, in0, in1, stride, 0, 0);
+      fadst16_sse2(in0, in1);
+      right_shift_16x16(in0, in1);
+      fidtx16_sse2(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
+    case H_ADST:
+      load_buffer_16x16(input, in0, in1, stride, 0, 0);
+      fidtx16_sse2(in0, in1);
+      right_shift_16x16(in0, in1);
+      fadst16_sse2(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
+    case V_FLIPADST:
+      load_buffer_16x16(input, in0, in1, stride, 1, 0);
+      fadst16_sse2(in0, in1);
+      right_shift_16x16(in0, in1);
+      fidtx16_sse2(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
+    case H_FLIPADST:
+      load_buffer_16x16(input, in0, in1, stride, 0, 1);
+      fidtx16_sse2(in0, in1);
+      right_shift_16x16(in0, in1);
+      fadst16_sse2(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
 #endif  // CONFIG_EXT_TX
     default:
       assert(0);