Optimize lowbd fwd_txfm modules

Added AVX2 variants for the 8x16 and 16x8 txfm block sizes.

Testing across multiple test cases showed an average encoder-time
reduction of 0.2% for the speed=1 preset.

Module-level gains improved by a factor of ~1.3 on average
w.r.t. the SSE2 modules.
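
The new kernels can be timed with the speed test added in this
change (disabled by default), e.g.:
  ./test_libaom --gtest_also_run_disabled_tests \
      --gtest_filter='*AV1FwdTxfm2dTest.DISABLED_Speed*'
(the test binary name and instantiation prefix depend on the local
build setup)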

Change-Id: Iac8c74352d8892a17830fd761d03bde58f589f21
diff --git a/aom_dsp/x86/txfm_common_avx2.h b/aom_dsp/x86/txfm_common_avx2.h
index 8a40508..cc729ba 100644
--- a/aom_dsp/x86/txfm_common_avx2.h
+++ b/aom_dsp/x86/txfm_common_avx2.h
@@ -167,7 +167,35 @@
   out[6 + 8] = _mm256_permute2x128_si256(c[4 + 2], c[5 + 2], 0x31);
   out[7 + 8] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x31);
 }
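+// Transpose a 16(wide)x8(high) block of 16-bit values held in eight __m256i
+// rows. Output register k packs transposed columns k and k + 8 in its low
+// and high 128-bit lanes, respectively.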
+static INLINE void transpose_16bit_16x8_avx2(const __m256i *const in,
+                                             __m256i *const out) {
+  const __m256i a0 = _mm256_unpacklo_epi16(in[0], in[1]);
+  const __m256i a1 = _mm256_unpacklo_epi16(in[2], in[3]);
+  const __m256i a2 = _mm256_unpacklo_epi16(in[4], in[5]);
+  const __m256i a3 = _mm256_unpacklo_epi16(in[6], in[7]);
+  const __m256i a4 = _mm256_unpackhi_epi16(in[0], in[1]);
+  const __m256i a5 = _mm256_unpackhi_epi16(in[2], in[3]);
+  const __m256i a6 = _mm256_unpackhi_epi16(in[4], in[5]);
+  const __m256i a7 = _mm256_unpackhi_epi16(in[6], in[7]);
 
+  const __m256i b0 = _mm256_unpacklo_epi32(a0, a1);
+  const __m256i b1 = _mm256_unpacklo_epi32(a2, a3);
+  const __m256i b2 = _mm256_unpacklo_epi32(a4, a5);
+  const __m256i b3 = _mm256_unpacklo_epi32(a6, a7);
+  const __m256i b4 = _mm256_unpackhi_epi32(a0, a1);
+  const __m256i b5 = _mm256_unpackhi_epi32(a2, a3);
+  const __m256i b6 = _mm256_unpackhi_epi32(a4, a5);
+  const __m256i b7 = _mm256_unpackhi_epi32(a6, a7);
+
+  out[0] = _mm256_unpacklo_epi64(b0, b1);
+  out[1] = _mm256_unpackhi_epi64(b0, b1);
+  out[2] = _mm256_unpacklo_epi64(b4, b5);
+  out[3] = _mm256_unpackhi_epi64(b4, b5);
+  out[4] = _mm256_unpacklo_epi64(b2, b3);
+  out[5] = _mm256_unpackhi_epi64(b2, b3);
+  out[6] = _mm256_unpacklo_epi64(b6, b7);
+  out[7] = _mm256_unpackhi_epi64(b6, b7);
+}
 static INLINE void flip_buf_avx2(__m256i *in, __m256i *out, int size) {
   for (int i = 0; i < size; ++i) {
     out[size - i - 1] = in[i];
@@ -235,7 +263,61 @@
     }
   }
 }
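+// The input holds (value, 1) 16-bit pairs; _mm256_madd_epi16 against
+// (scale, 1 << (NewSqrt2Bits - 1)) gives value * scale plus rounding,
+// which is then shifted down by NewSqrt2Bits.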
+static INLINE __m256i scale_round_avx2(const __m256i a, const int scale) {
+  const __m256i scale_rounding =
+      pair_set_w16_epi16(scale, 1 << (NewSqrt2Bits - 1));
+  const __m256i b = _mm256_madd_epi16(a, scale_rounding);
+  return _mm256_srai_epi32(b, NewSqrt2Bits);
+}
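+// Scale 16 packed 16-bit coefficients by NewSqrt2 (rectangular transform
+// compensation), widen to 32 bits and store: the first eight results go to
+// b[0..7], the remaining eight to b[64..71].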
+static INLINE void store_rect_16bit_to_32bit_w8_avx2(const __m256i a,
+                                                     int32_t *const b) {
+  const __m256i one = _mm256_set1_epi16(1);
+  const __m256i a_lo = _mm256_unpacklo_epi16(a, one);
+  const __m256i a_hi = _mm256_unpackhi_epi16(a, one);
+  const __m256i b_lo = scale_round_avx2(a_lo, NewSqrt2);
+  const __m256i b_hi = scale_round_avx2(a_hi, NewSqrt2);
+  const __m256i temp = _mm256_permute2f128_si256(b_lo, b_hi, 0x31);
+  _mm_store_si128((__m128i *)b, _mm256_castsi256_si128(b_lo));
+  _mm_store_si128((__m128i *)(b + 4), _mm256_castsi256_si128(b_hi));
+  _mm256_store_si256((__m256i *)(b + 64), temp);
+}
+static INLINE void store_rect_buffer_16bit_to_32bit_w8_avx2(
+    const __m256i *const in, int32_t *const out, const int stride,
+    const int out_size) {
+  for (int i = 0; i < out_size; ++i) {
+    store_rect_16bit_to_32bit_w8_avx2(in[i], out + i * stride);
+  }
+}
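+// Pack two sets of eight __m128i into eight __m256i: in1[i] occupies the
+// low 128-bit lane of out[i] and in2[i] the high lane.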
+static INLINE void pack_reg(const __m128i *in1, const __m128i *in2,
+                            __m256i *out) {
+  out[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[0]), in2[0], 0x1);
+  out[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[1]), in2[1], 0x1);
+  out[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[2]), in2[2], 0x1);
+  out[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[3]), in2[3], 0x1);
+  out[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[4]), in2[4], 0x1);
+  out[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[5]), in2[5], 0x1);
+  out[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[6]), in2[6], 0x1);
+  out[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[7]), in2[7], 0x1);
+}
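+// Split eight __m256i into sixteen __m128i: low lanes go to out1[0..7],
+// high lanes to out1[8..15].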
+static INLINE void extract_reg(const __m256i *in, __m128i *out1) {
+  out1[0] = _mm256_castsi256_si128(in[0]);
+  out1[1] = _mm256_castsi256_si128(in[1]);
+  out1[2] = _mm256_castsi256_si128(in[2]);
+  out1[3] = _mm256_castsi256_si128(in[3]);
+  out1[4] = _mm256_castsi256_si128(in[4]);
+  out1[5] = _mm256_castsi256_si128(in[5]);
+  out1[6] = _mm256_castsi256_si128(in[6]);
+  out1[7] = _mm256_castsi256_si128(in[7]);
 
+  out1[8] = _mm256_extractf128_si256(in[0], 0x01);
+  out1[9] = _mm256_extractf128_si256(in[1], 0x01);
+  out1[10] = _mm256_extractf128_si256(in[2], 0x01);
+  out1[11] = _mm256_extractf128_si256(in[3], 0x01);
+  out1[12] = _mm256_extractf128_si256(in[4], 0x01);
+  out1[13] = _mm256_extractf128_si256(in[5], 0x01);
+  out1[14] = _mm256_extractf128_si256(in[6], 0x01);
+  out1[15] = _mm256_extractf128_si256(in[7], 0x01);
+}
 #ifdef __cplusplus
 }
 #endif
diff --git a/av1/encoder/x86/av1_fwd_txfm2d_avx2.c b/av1/encoder/x86/av1_fwd_txfm2d_avx2.c
index 13982cc..3f0f2c3 100644
--- a/av1/encoder/x86/av1_fwd_txfm2d_avx2.c
+++ b/av1/encoder/x86/av1_fwd_txfm2d_avx2.c
@@ -1407,13 +1407,6 @@
   output[14] = x1[15];
   output[15] = x1[0];
 }
-
-static INLINE __m256i scale_round_avx2(const __m256i a, const int scale) {
-  const __m256i scale__r = pair_set_w16_epi16(scale, 1 << (NewSqrt2Bits - 1));
-  const __m256i b = _mm256_madd_epi16(a, scale__r);
-  return _mm256_srai_epi32(b, NewSqrt2Bits);
-}
-
 static INLINE void fidentity16x16_new_avx2(const __m256i *input,
                                            __m256i *output, int8_t cos_bit) {
   (void)cos_bit;
@@ -1997,6 +1990,781 @@
   }
 }
 
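+// Butterfly for packed 16-bit inputs: interleave in0/in1, multiply-accumulate
+// against the cospi pairs w0 and w1, round and shift by cos_bit, then return
+// the low/high 128-bit lanes of the two packed results (out0/out2 from w0,
+// out1/out3 from w1).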
+void btf_16_avx2(__m256i w0, __m256i w1, __m256i in0, __m256i in1,
+                 __m128i *out0, __m128i *out1, __m128i *out2, __m128i *out3,
+                 __m256i __rounding, int8_t cos_bit) {
+  __m256i t0 = _mm256_unpacklo_epi16(in0, in1);
+  __m256i t1 = _mm256_unpackhi_epi16(in0, in1);
+  __m256i u0 = _mm256_madd_epi16(t0, w0);
+  __m256i u1 = _mm256_madd_epi16(t1, w0);
+  __m256i v0 = _mm256_madd_epi16(t0, w1);
+  __m256i v1 = _mm256_madd_epi16(t1, w1);
+
+  __m256i a0 = _mm256_add_epi32(u0, __rounding);
+  __m256i a1 = _mm256_add_epi32(u1, __rounding);
+  __m256i b0 = _mm256_add_epi32(v0, __rounding);
+  __m256i b1 = _mm256_add_epi32(v1, __rounding);
+
+  __m256i c0 = _mm256_srai_epi32(a0, cos_bit);
+  __m256i c1 = _mm256_srai_epi32(a1, cos_bit);
+  __m256i d0 = _mm256_srai_epi32(b0, cos_bit);
+  __m256i d1 = _mm256_srai_epi32(b1, cos_bit);
+
+  __m256i temp0 = _mm256_packs_epi32(c0, c1);
+  __m256i temp1 = _mm256_packs_epi32(d0, d1);
+
+  *out0 = _mm256_castsi256_si128(temp0);
+  *out1 = _mm256_castsi256_si128(temp1);
+  *out2 = _mm256_extractf128_si256(temp0, 0x01);
+  *out3 = _mm256_extractf128_si256(temp1, 0x01);
+}
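+// 8-point forward DCT applied across the eight input registers; each 16-bit
+// lane is an independent column, so 16 columns are transformed at once.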
+static INLINE void fdct8x8_new_avx2(const __m256i *input, __m256i *output,
+                                    int8_t cos_bit) {
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1));
+
+  __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+  __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+  __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+  __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+  __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+  __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
+  __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+
+  // stage 1
+  __m256i x1[8];
+  x1[0] = _mm256_adds_epi16(input[0], input[7]);
+  x1[7] = _mm256_subs_epi16(input[0], input[7]);
+  x1[1] = _mm256_adds_epi16(input[1], input[6]);
+  x1[6] = _mm256_subs_epi16(input[1], input[6]);
+  x1[2] = _mm256_adds_epi16(input[2], input[5]);
+  x1[5] = _mm256_subs_epi16(input[2], input[5]);
+  x1[3] = _mm256_adds_epi16(input[3], input[4]);
+  x1[4] = _mm256_subs_epi16(input[3], input[4]);
+
+  // stage 2
+  __m256i x2[8];
+  x2[0] = _mm256_adds_epi16(x1[0], x1[3]);
+  x2[3] = _mm256_subs_epi16(x1[0], x1[3]);
+  x2[1] = _mm256_adds_epi16(x1[1], x1[2]);
+  x2[2] = _mm256_subs_epi16(x1[1], x1[2]);
+  x2[4] = x1[4];
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], __rounding,
+                  cos_bit);
+  x2[5] = x1[5];
+  x2[6] = x1[6];
+  x2[7] = x1[7];
+
+  // stage 3
+  __m256i x3[8];
+  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x2[0], &x2[1], __rounding,
+                  cos_bit);
+  x3[0] = x2[0];
+  x3[1] = x2[1];
+  btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x2[2], &x2[3], __rounding,
+                  cos_bit);
+  x3[2] = x2[2];
+  x3[3] = x2[3];
+  x3[4] = _mm256_adds_epi16(x2[4], x2[5]);
+  x3[5] = _mm256_subs_epi16(x2[4], x2[5]);
+  x3[6] = _mm256_subs_epi16(x2[7], x2[6]);
+  x3[7] = _mm256_adds_epi16(x2[7], x2[6]);
+
+  // stage 4
+  __m256i x4[8];
+  x4[0] = x3[0];
+  x4[1] = x3[1];
+  x4[2] = x3[2];
+  x4[3] = x3[3];
+  btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x3[4], &x3[7], __rounding,
+                  cos_bit);
+  x4[4] = x3[4];
+  x4[7] = x3[7];
+  btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x3[5], &x3[6], __rounding,
+                  cos_bit);
+  x4[5] = x3[5];
+  x4[6] = x3[6];
+  // stage 5
+  output[0] = x4[0];
+  output[1] = x4[4];
+  output[2] = x4[2];
+  output[3] = x4[6];
+  output[4] = x4[1];
+  output[5] = x4[5];
+  output[6] = x4[3];
+  output[7] = x4[7];
+}
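+// 8-point forward ADST with the same 16-lane-wide layout as
+// fdct8x8_new_avx2.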
+static INLINE void fadst8x8_new_avx2(const __m256i *input, __m256i *output,
+                                     int8_t cos_bit) {
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const __m256i __zero = _mm256_setzero_si256();
+  const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1));
+
+  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+  __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
+  __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
+  __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]);
+  __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
+  __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
+  __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]);
+  __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]);
+  __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]);
+  __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]);
+  __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]);
+  __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]);
+
+  // stage 1
+  __m256i x1[8];
+  x1[0] = input[0];
+  x1[1] = _mm256_subs_epi16(__zero, input[7]);
+  x1[2] = _mm256_subs_epi16(__zero, input[3]);
+  x1[3] = input[4];
+  x1[4] = _mm256_subs_epi16(__zero, input[1]);
+  x1[5] = input[6];
+  x1[6] = input[2];
+  x1[7] = _mm256_subs_epi16(__zero, input[5]);
+
+  // stage 2
+  __m256i x2[8];
+  x2[0] = x1[0];
+  x2[1] = x1[1];
+  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], __rounding,
+                  cos_bit);
+  x2[2] = x1[2];
+  x2[3] = x1[3];
+  x2[4] = x1[4];
+  x2[5] = x1[5];
+  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], __rounding,
+                  cos_bit);
+  x2[6] = x1[6];
+  x2[7] = x1[7];
+
+  // stage 3
+  __m256i x3[8];
+  x3[0] = _mm256_adds_epi16(x2[0], x2[2]);
+  x3[2] = _mm256_subs_epi16(x2[0], x2[2]);
+  x3[1] = _mm256_adds_epi16(x2[1], x2[3]);
+  x3[3] = _mm256_subs_epi16(x2[1], x2[3]);
+  x3[4] = _mm256_adds_epi16(x2[4], x2[6]);
+  x3[6] = _mm256_subs_epi16(x2[4], x2[6]);
+  x3[5] = _mm256_adds_epi16(x2[5], x2[7]);
+  x3[7] = _mm256_subs_epi16(x2[5], x2[7]);
+
+  // stage 4
+  __m256i x4[8];
+  x4[0] = x3[0];
+  x4[1] = x3[1];
+  x4[2] = x3[2];
+  x4[3] = x3[3];
+  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x3[4], &x3[5], __rounding,
+                  cos_bit);
+  x4[4] = x3[4];
+  x4[5] = x3[5];
+  btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x3[6], &x3[7], __rounding,
+                  cos_bit);
+  x4[6] = x3[6];
+  x4[7] = x3[7];
+
+  // stage 5
+  __m256i x5[8];
+  x5[0] = _mm256_adds_epi16(x4[0], x4[4]);
+  x5[4] = _mm256_subs_epi16(x4[0], x4[4]);
+  x5[1] = _mm256_adds_epi16(x4[1], x4[5]);
+  x5[5] = _mm256_subs_epi16(x4[1], x4[5]);
+  x5[2] = _mm256_adds_epi16(x4[2], x4[6]);
+  x5[6] = _mm256_subs_epi16(x4[2], x4[6]);
+  x5[3] = _mm256_adds_epi16(x4[3], x4[7]);
+  x5[7] = _mm256_subs_epi16(x4[3], x4[7]);
+
+  // stage 6
+  __m256i x6[8];
+  btf_16_w16_avx2(cospi_p04_p60, cospi_p60_m04, &x5[0], &x5[1], __rounding,
+                  cos_bit);
+  x6[0] = x5[0];
+  x6[1] = x5[1];
+  btf_16_w16_avx2(cospi_p20_p44, cospi_p44_m20, &x5[2], &x5[3], __rounding,
+                  cos_bit);
+  x6[2] = x5[2];
+  x6[3] = x5[3];
+  btf_16_w16_avx2(cospi_p36_p28, cospi_p28_m36, &x5[4], &x5[5], __rounding,
+                  cos_bit);
+  x6[4] = x5[4];
+  x6[5] = x5[5];
+  btf_16_w16_avx2(cospi_p52_p12, cospi_p12_m52, &x5[6], &x5[7], __rounding,
+                  cos_bit);
+  x6[6] = x5[6];
+  x6[7] = x5[7];
+
+  // stage 7
+  output[0] = x6[1];
+  output[1] = x6[6];
+  output[2] = x6[3];
+  output[3] = x6[4];
+  output[4] = x6[5];
+  output[5] = x6[2];
+  output[6] = x6[7];
+  output[7] = x6[0];
+}
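+// 8-point identity transform: saturating doubling of each coefficient.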
+static INLINE void fidentity8x8_new_avx2(const __m256i *input, __m256i *output,
+                                         int8_t cos_bit) {
+  (void)cos_bit;
+
+  output[0] = _mm256_adds_epi16(input[0], input[0]);
+  output[1] = _mm256_adds_epi16(input[1], input[1]);
+  output[2] = _mm256_adds_epi16(input[2], input[2]);
+  output[3] = _mm256_adds_epi16(input[3], input[3]);
+  output[4] = _mm256_adds_epi16(input[4], input[4]);
+  output[5] = _mm256_adds_epi16(input[5], input[5]);
+  output[6] = _mm256_adds_epi16(input[6], input[6]);
+  output[7] = _mm256_adds_epi16(input[7], input[7]);
+}
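+// 16-point forward DCT for 8-wide columns. The sixteen __m128i input rows
+// are packed in pairs into __m256i registers so that two butterflies are
+// evaluated per AVX2 instruction.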
+static INLINE void fdct8x16_new_avx2(const __m128i *input, __m128i *output,
+                                     int8_t cos_bit) {
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const __m256i __rounding_256 = _mm256_set1_epi32(1 << (cos_bit - 1));
+  const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+  __m128i temp0, temp1, temp2, temp3;
+  __m256i in0, in1;
+  __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+  __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+  __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+  __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+  __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+  __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+  __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+  __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+  __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+  __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
+  __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
+  __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
+  __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
+  __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
+  __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
+  __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
+  __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
+
+  __m256i cospi_arr[12];
+
+  cospi_arr[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m32_p32),
+                                         cospi_m32_p32, 0x1);
+  cospi_arr[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
+                                         cospi_p32_p32, 0x1);
+  cospi_arr[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
+                                         cospi_p48_p16, 0x1);
+  cospi_arr[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32),
+                                         cospi_m16_p48, 0x1);
+  cospi_arr[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m16_p48),
+                                         cospi_m48_m16, 0x1);
+  cospi_arr[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_p16),
+                                         cospi_m16_p48, 0x1);
+  cospi_arr[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p56_p08),
+                                         cospi_p24_p40, 0x1);
+  cospi_arr[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m08_p56),
+                                         cospi_m40_p24, 0x1);
+  cospi_arr[8] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p60_p04),
+                                         cospi_p28_p36, 0x1);
+  cospi_arr[9] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m04_p60),
+                                         cospi_m36_p28, 0x1);
+  cospi_arr[10] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p44_p20),
+                                          cospi_p12_p52, 0x1);
+  cospi_arr[11] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m20_p44),
+                                          cospi_m52_p12, 0x1);
+
+  __m256i x[8];
+  x[0] =
+      _mm256_insertf128_si256(_mm256_castsi128_si256(input[0]), input[1], 0x1);
+  x[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[15]), input[14],
+                                 0x1);
+  x[2] =
+      _mm256_insertf128_si256(_mm256_castsi128_si256(input[2]), input[3], 0x1);
+  x[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[13]), input[12],
+                                 0x1);
+  x[4] =
+      _mm256_insertf128_si256(_mm256_castsi128_si256(input[5]), input[4], 0x1);
+  x[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[10]), input[11],
+                                 0x1);
+  x[6] =
+      _mm256_insertf128_si256(_mm256_castsi128_si256(input[7]), input[6], 0x1);
+  x[7] =
+      _mm256_insertf128_si256(_mm256_castsi128_si256(input[8]), input[9], 0x1);
+
+  // stage 1
+  __m256i x1[16];
+  x1[0] = _mm256_adds_epi16(x[0], x[1]);
+  x1[7] = _mm256_subs_epi16(x[0], x[1]);
+  x1[1] = _mm256_adds_epi16(x[2], x[3]);
+  x1[6] = _mm256_subs_epi16(x[2], x[3]);
+  x1[2] = _mm256_adds_epi16(x[4], x[5]);
+  x1[5] = _mm256_subs_epi16(x[4], x[5]);
+  x1[3] = _mm256_adds_epi16(x[6], x[7]);
+  x1[4] = _mm256_subs_epi16(x[6], x[7]);
+
+  // stage 2
+  __m256i x2[8];
+  x2[0] = _mm256_adds_epi16(x1[0], x1[3]);
+  x2[7] = _mm256_subs_epi16(x1[0], x1[3]);
+  x2[1] = _mm256_adds_epi16(x1[1], x1[2]);
+  x2[6] = _mm256_subs_epi16(x1[1], x1[2]);
+  x2[2] = x1[4];
+  x2[3] = x1[7];
+  btf_16_avx2(cospi_arr[0], cospi_arr[1], x1[5], x1[6], &temp0, &temp1, &temp2,
+              &temp3, __rounding_256, cos_bit);
+  x2[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp0, 0x1);
+  x2[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp3), temp1, 0x1);
+
+  // stage 3
+  __m256i x3[8];
+  x2[1] = _mm256_permute4x64_epi64(x2[1], 0x4e);
+  x3[0] = _mm256_adds_epi16(x2[0], x2[1]);
+  x3[1] = _mm256_subs_epi16(x2[0], x2[1]);
+  x3[2] = _mm256_blend_epi32(x2[7], x2[6], 0xf0);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, _mm256_castsi256_si128(x2[6]),
+              _mm256_extractf128_si256(x2[7], 0x01), temp0, temp1);
+  x3[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp1), temp0, 0x1);
+  x3[3] = _mm256_adds_epi16(x2[2], x2[4]);
+  x3[4] = _mm256_subs_epi16(x2[2], x2[4]);
+  x3[5] = _mm256_adds_epi16(x2[3], x2[5]);
+  x3[6] = _mm256_subs_epi16(x2[3], x2[5]);
+
+  // stage 4
+  __m256i x4[8];
+  in0 = _mm256_blend_epi32(x3[0], x3[1], 0xf0);
+  in1 = _mm256_permute2f128_si256(x3[0], x3[1], 0x21);
+  btf_16_avx2(cospi_arr[2], cospi_arr[3], in0, in1, &output[0], &output[8],
+              &output[4], &output[12], __rounding_256, cos_bit);
+  x4[2] = _mm256_adds_epi16(x3[2], x3[7]);
+  x4[3] = _mm256_subs_epi16(x3[2], x3[7]);
+  x4[4] = _mm256_permute2f128_si256(x3[3], x3[4], 0x20);
+  x4[5] = _mm256_permute2f128_si256(x3[6], x3[5], 0x20);
+  in0 = _mm256_permute2f128_si256(x3[3], x3[4], 0x31);
+  in1 = _mm256_permute2f128_si256(x3[5], x3[6], 0x31);
+  btf_16_avx2(cospi_arr[4], cospi_arr[5], in0, in1, &temp0, &temp1, &temp2,
+              &temp3, __rounding_256, cos_bit);
+  x4[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp2, 0x1);
+  x4[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp3), temp1, 0x1);
+
+  // stage 5
+  __m256i x5[8];
+  in0 = _mm256_permute2f128_si256(x4[2], x4[3], 0x31);
+  in1 = _mm256_permute2f128_si256(x4[2], x4[3], 0x20);
+  btf_16_avx2(cospi_arr[6], cospi_arr[7], in0, in1, &output[2], &output[14],
+              &output[10], &output[6], __rounding_256, cos_bit);
+  x5[4] = _mm256_adds_epi16(x4[4], x4[6]);
+  x5[5] = _mm256_subs_epi16(x4[4], x4[6]);
+  x5[6] = _mm256_adds_epi16(x4[5], x4[7]);
+  x5[7] = _mm256_subs_epi16(x4[5], x4[7]);
+
+  // stage 6
+  in0 = _mm256_permute2f128_si256(x5[4], x5[5], 0x20);
+  in1 = _mm256_permute2f128_si256(x5[6], x5[7], 0x31);
+  btf_16_avx2(cospi_arr[8], cospi_arr[9], in0, in1, &output[1], &output[15],
+              &output[9], &output[7], __rounding_256, cos_bit);
+  in0 = _mm256_permute2f128_si256(x5[5], x5[4], 0x31);
+  in1 = _mm256_permute2f128_si256(x5[7], x5[6], 0x20);
+  btf_16_avx2(cospi_arr[10], cospi_arr[11], in0, in1, &output[5], &output[11],
+              &output[13], &output[3], __rounding_256, cos_bit);
+}
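+// 16-point forward ADST counterpart of fdct8x16_new_avx2, using the same
+// packed pair-of-rows layout.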
+static INLINE void fadst8x16_new_avx2(const __m128i *input, __m128i *output,
+                                      int8_t cos_bit) {
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const __m256i __zero = _mm256_setzero_si256();
+  const __m256i __rounding_256 = _mm256_set1_epi32(1 << (cos_bit - 1));
+  __m256i in0, in1;
+  __m128i temp0, temp1, temp2, temp3;
+
+  __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+  __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+  __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+  __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+  __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+  __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+  __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+  __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+  __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
+  __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
+  __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
+  __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
+  __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
+  __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
+  __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
+  __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
+  __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
+  __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
+  __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
+  __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
+  __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
+  __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
+  __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
+  __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
+  __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
+  __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
+
+  __m256i cospi_arr[20];
+
+  cospi_arr[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
+                                         cospi_p32_p32, 0x1);
+  cospi_arr[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32),
+                                         cospi_p32_m32, 0x1);
+  cospi_arr[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
+                                         cospi_p32_p32, 0x1);
+  cospi_arr[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32),
+                                         cospi_p32_m32, 0x1);
+  cospi_arr[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p16_p48),
+                                         cospi_m48_p16, 0x1);
+  cospi_arr[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_m16),
+                                         cospi_p16_p48, 0x1);
+  cospi_arr[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p16_p48),
+                                         cospi_m48_p16, 0x1);
+  cospi_arr[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_m16),
+                                         cospi_p16_p48, 0x1);
+  cospi_arr[8] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p08_p56),
+                                         cospi_p40_p24, 0x1);
+  cospi_arr[9] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p56_m08),
+                                         cospi_p24_m40, 0x1);
+  cospi_arr[10] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m56_p08),
+                                          cospi_m24_p40, 0x1);
+  cospi_arr[11] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p08_p56),
+                                          cospi_p40_p24, 0x1);
+  cospi_arr[12] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p02_p62),
+                                          cospi_p10_p54, 0x1);
+  cospi_arr[13] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p62_m02),
+                                          cospi_p54_m10, 0x1);
+  cospi_arr[14] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p18_p46),
+                                          cospi_p26_p38, 0x1);
+  cospi_arr[15] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p46_m18),
+                                          cospi_p38_m26, 0x1);
+  cospi_arr[16] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p34_p30),
+                                          cospi_p42_p22, 0x1);
+  cospi_arr[17] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p30_m34),
+                                          cospi_p22_m42, 0x1);
+  cospi_arr[18] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p50_p14),
+                                          cospi_p58_p06, 0x1);
+  cospi_arr[19] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p14_m50),
+                                          cospi_p06_m58, 0x1);
+
+  __m256i x[8];
+  x[0] =
+      _mm256_insertf128_si256(_mm256_castsi128_si256(input[0]), input[4], 0x1);
+  x[1] =
+      _mm256_insertf128_si256(_mm256_castsi128_si256(input[2]), input[6], 0x1);
+  x[2] =
+      _mm256_insertf128_si256(_mm256_castsi128_si256(input[8]), input[12], 0x1);
+  x[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[10]), input[14],
+                                 0x1);
+  x[4] =
+      _mm256_insertf128_si256(_mm256_castsi128_si256(input[1]), input[9], 0x1);
+  x[5] =
+      _mm256_insertf128_si256(_mm256_castsi128_si256(input[3]), input[11], 0x1);
+  x[6] =
+      _mm256_insertf128_si256(_mm256_castsi128_si256(input[5]), input[13], 0x1);
+  x[7] =
+      _mm256_insertf128_si256(_mm256_castsi128_si256(input[7]), input[15], 0x1);
+
+  // stage 1
+  __m256i x1[8];
+  x1[0] = x[0];
+  x1[1] = _mm256_subs_epi16(__zero, x[7]);
+  x1[2] = x[2];
+  x1[3] = _mm256_subs_epi16(__zero, x[5]);
+  x1[4] = _mm256_subs_epi16(__zero, x[4]);
+  x1[5] = x[3];
+  x1[6] = _mm256_subs_epi16(__zero, x[6]);
+  x1[7] = x[1];
+
+  // stage 2
+  __m256i x2[8];
+  x2[0] = _mm256_blend_epi32(x1[0], x1[1], 0xf0);
+  x2[3] = _mm256_blend_epi32(x1[3], x1[2], 0xf0);
+  x2[4] = _mm256_blend_epi32(x1[4], x1[5], 0xf0);
+  x2[7] = _mm256_blend_epi32(x1[7], x1[6], 0xf0);
+  in0 = _mm256_blend_epi32(x1[1], x1[0], 0xf0);
+  in1 = _mm256_blend_epi32(x1[2], x1[3], 0xf0);
+  btf_16_avx2(cospi_arr[0], cospi_arr[1], in0, in1, &temp0, &temp1, &temp2,
+              &temp3, __rounding_256, cos_bit);
+  x2[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
+  x2[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
+  in0 = _mm256_permute2f128_si256(x1[7], x1[6], 0x21);
+  in1 = _mm256_permute2f128_si256(x1[4], x1[5], 0x21);
+  btf_16_avx2(cospi_arr[2], cospi_arr[3], in0, in1, &temp0, &temp1, &temp2,
+              &temp3, __rounding_256, cos_bit);
+  x2[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
+  x2[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
+
+  // stage 3
+  __m256i x3[8];
+  x3[0] = _mm256_adds_epi16(x2[0], x2[1]);
+  x3[1] = _mm256_subs_epi16(x2[0], x2[1]);
+  x3[2] = _mm256_adds_epi16(x2[3], x2[2]);
+  x3[3] = _mm256_subs_epi16(x2[3], x2[2]);
+  x3[4] = _mm256_adds_epi16(x2[4], x2[5]);
+  x3[5] = _mm256_subs_epi16(x2[4], x2[5]);
+  x3[6] = _mm256_adds_epi16(x2[7], x2[6]);
+  x3[7] = _mm256_subs_epi16(x2[7], x2[6]);
+
+  // stage 4
+  __m256i x4[8];
+  x4[0] = x3[0];
+  x4[1] = x3[1];
+  x4[4] = x3[4];
+  x4[5] = x3[5];
+  in0 = _mm256_permute2f128_si256(x3[2], x3[3], 0x20);
+  in1 = _mm256_permute2f128_si256(x3[2], x3[3], 0x31);
+  btf_16_avx2(cospi_arr[4], cospi_arr[5], in0, in1, &temp0, &temp1, &temp2,
+              &temp3, __rounding_256, cos_bit);
+  x4[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
+  x4[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
+  in0 = _mm256_permute2f128_si256(x3[6], x3[7], 0x20);
+  in1 = _mm256_permute2f128_si256(x3[6], x3[7], 0x31);
+  btf_16_avx2(cospi_arr[6], cospi_arr[7], in0, in1, &temp0, &temp1, &temp2,
+              &temp3, __rounding_256, cos_bit);
+  x4[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
+  x4[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
+
+  // stage 5
+  __m256i x5[8];
+  x5[0] = _mm256_adds_epi16(x4[0], x4[2]);
+  x5[1] = _mm256_subs_epi16(x4[0], x4[2]);
+  x5[2] = _mm256_adds_epi16(x4[1], x4[3]);
+  x5[3] = _mm256_subs_epi16(x4[1], x4[3]);
+  x5[4] = _mm256_adds_epi16(x4[4], x4[6]);
+  x5[5] = _mm256_subs_epi16(x4[4], x4[6]);
+  x5[6] = _mm256_adds_epi16(x4[5], x4[7]);
+  x5[7] = _mm256_subs_epi16(x4[5], x4[7]);
+
+  // stage 6
+  __m256i x6[8];
+  x6[0] = x5[0];
+  x6[1] = x5[2];
+  x6[2] = x5[1];
+  x6[3] = x5[3];
+  in0 = _mm256_permute2f128_si256(x5[4], x5[6], 0x20);
+  in1 = _mm256_permute2f128_si256(x5[4], x5[6], 0x31);
+  btf_16_avx2(cospi_arr[8], cospi_arr[9], in0, in1, &temp0, &temp1, &temp2,
+              &temp3, __rounding_256, cos_bit);
+  x6[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
+  x6[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
+  in0 = _mm256_permute2f128_si256(x5[5], x5[7], 0x20);
+  in1 = _mm256_permute2f128_si256(x5[5], x5[7], 0x31);
+  btf_16_avx2(cospi_arr[10], cospi_arr[11], in0, in1, &temp0, &temp1, &temp2,
+              &temp3, __rounding_256, cos_bit);
+  x6[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
+  x6[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
+
+  // stage 7
+  __m256i x7[8];
+  x7[0] = _mm256_adds_epi16(x6[0], x6[4]);
+  x7[1] = _mm256_subs_epi16(x6[0], x6[4]);
+  x7[2] = _mm256_adds_epi16(x6[1], x6[5]);
+  x7[3] = _mm256_subs_epi16(x6[1], x6[5]);
+  x7[4] = _mm256_adds_epi16(x6[2], x6[6]);
+  x7[5] = _mm256_subs_epi16(x6[2], x6[6]);
+  x7[6] = _mm256_adds_epi16(x6[3], x6[7]);
+  x7[7] = _mm256_subs_epi16(x6[3], x6[7]);
+
+  // stage 8
+  in0 = _mm256_permute2f128_si256(x7[0], x7[2], 0x20);
+  in1 = _mm256_permute2f128_si256(x7[0], x7[2], 0x31);
+  btf_16_avx2(cospi_arr[12], cospi_arr[13], in0, in1, &output[15], &output[0],
+              &output[13], &output[2], __rounding_256, cos_bit);
+  in0 = _mm256_permute2f128_si256(x7[4], x7[6], 0x20);
+  in1 = _mm256_permute2f128_si256(x7[4], x7[6], 0x31);
+  btf_16_avx2(cospi_arr[14], cospi_arr[15], in0, in1, &output[11], &output[4],
+              &output[9], &output[6], __rounding_256, cos_bit);
+  in0 = _mm256_permute2f128_si256(x7[1], x7[3], 0x20);
+  in1 = _mm256_permute2f128_si256(x7[1], x7[3], 0x31);
+  btf_16_avx2(cospi_arr[16], cospi_arr[17], in0, in1, &output[7], &output[8],
+              &output[5], &output[10], __rounding_256, cos_bit);
+  in0 = _mm256_permute2f128_si256(x7[5], x7[7], 0x20);
+  in1 = _mm256_permute2f128_si256(x7[5], x7[7], 0x31);
+  btf_16_avx2(cospi_arr[18], cospi_arr[19], in0, in1, &output[3], &output[12],
+              &output[1], &output[14], __rounding_256, cos_bit);
+}
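+// 16-point identity transform: each coefficient is scaled by 2 * NewSqrt2
+// with rounding via scale_round_avx2.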
+static INLINE void fidentity8x16_new_avx2(const __m128i *input, __m128i *output,
+                                          int8_t cos_bit) {
+  (void)cos_bit;
+  const __m256i one = _mm256_set1_epi16(1);
+  __m256i temp;
+  for (int i = 0; i < 16; i += 2) {
+    temp = _mm256_insertf128_si256(_mm256_castsi128_si256(input[i]),
+                                   input[i + 1], 0x1);
+    const __m256i a_lo = _mm256_unpacklo_epi16(temp, one);
+    const __m256i a_hi = _mm256_unpackhi_epi16(temp, one);
+    const __m256i b_lo = scale_round_avx2(a_lo, 2 * NewSqrt2);
+    const __m256i b_hi = scale_round_avx2(a_hi, 2 * NewSqrt2);
+    temp = _mm256_packs_epi32(b_lo, b_hi);
+    output[i] = _mm256_castsi256_si128(temp);
+    output[i + 1] = _mm256_extractf128_si256(temp, 0x1);
+  }
+}
+static const transform_1d_avx2 row_txfm8x16_arr[TX_TYPES] = {
+  fdct8x8_new_avx2,       // DCT_DCT
+  fdct8x8_new_avx2,       // ADST_DCT
+  fadst8x8_new_avx2,      // DCT_ADST
+  fadst8x8_new_avx2,      // ADST_ADST
+  fdct8x8_new_avx2,       // FLIPADST_DCT
+  fadst8x8_new_avx2,      // DCT_FLIPADST
+  fadst8x8_new_avx2,      // FLIPADST_FLIPADST
+  fadst8x8_new_avx2,      // ADST_FLIPADST
+  fadst8x8_new_avx2,      // FLIPADST_ADST
+  fidentity8x8_new_avx2,  // IDTX
+  fidentity8x8_new_avx2,  // V_DCT
+  fdct8x8_new_avx2,       // H_DCT
+  fidentity8x8_new_avx2,  // V_ADST
+  fadst8x8_new_avx2,      // H_ADST
+  fidentity8x8_new_avx2,  // V_FLIPADST
+  fadst8x8_new_avx2       // H_FLIPADST
+};
+
+static const transform_1d_sse2 col_txfm8x16_arr[TX_TYPES] = {
+  fdct8x16_new_avx2,       // DCT_DCT
+  fadst8x16_new_avx2,      // ADST_DCT
+  fdct8x16_new_avx2,       // DCT_ADST
+  fadst8x16_new_avx2,      // ADST_ADST
+  fadst8x16_new_avx2,      // FLIPADST_DCT
+  fdct8x16_new_avx2,       // DCT_FLIPADST
+  fadst8x16_new_avx2,      // FLIPADST_FLIPADST
+  fadst8x16_new_avx2,      // ADST_FLIPADST
+  fadst8x16_new_avx2,      // FLIPADST_ADST
+  fidentity8x16_new_avx2,  // IDTX
+  fdct8x16_new_avx2,       // V_DCT
+  fidentity8x16_new_avx2,  // H_DCT
+  fadst8x16_new_avx2,      // V_ADST
+  fidentity8x16_new_avx2,  // H_ADST
+  fadst8x16_new_avx2,      // V_FLIPADST
+  fidentity8x16_new_avx2   // H_FLIPADST
+};
+static const transform_1d_avx2 col_txfm16x8_arr[TX_TYPES] = {
+  fdct8x8_new_avx2,       // DCT_DCT
+  fadst8x8_new_avx2,      // ADST_DCT
+  fdct8x8_new_avx2,       // DCT_ADST
+  fadst8x8_new_avx2,      // ADST_ADST
+  fadst8x8_new_avx2,      // FLIPADST_DCT
+  fdct8x8_new_avx2,       // DCT_FLIPADST
+  fadst8x8_new_avx2,      // FLIPADST_FLIPADST
+  fadst8x8_new_avx2,      // ADST_FLIPADST
+  fadst8x8_new_avx2,      // FLIPADST_ADST
+  fidentity8x8_new_avx2,  // IDTX
+  fdct8x8_new_avx2,       // V_DCT
+  fidentity8x8_new_avx2,  // H_DCT
+  fadst8x8_new_avx2,      // V_ADST
+  fidentity8x8_new_avx2,  // H_ADST
+  fadst8x8_new_avx2,      // V_FLIPADST
+  fidentity8x8_new_avx2,  // H_FLIPADST
+};
+
+static const transform_1d_sse2 row_txfm16x8_arr[TX_TYPES] = {
+  fdct8x16_new_avx2,       // DCT_DCT
+  fdct8x16_new_avx2,       // ADST_DCT
+  fadst8x16_new_avx2,      // DCT_ADST
+  fadst8x16_new_avx2,      // ADST_ADST
+  fdct8x16_new_avx2,       // FLIPADST_DCT
+  fadst8x16_new_avx2,      // DCT_FLIPADST
+  fadst8x16_new_avx2,      // FLIPADST_FLIPADST
+  fadst8x16_new_avx2,      // ADST_FLIPADST
+  fadst8x16_new_avx2,      // FLIPADST_ADST
+  fidentity8x16_new_avx2,  // IDTX
+  fidentity8x16_new_avx2,  // V_DCT
+  fdct8x16_new_avx2,       // H_DCT
+  fidentity8x16_new_avx2,  // V_ADST
+  fadst8x16_new_avx2,      // H_ADST
+  fidentity8x16_new_avx2,  // V_FLIPADST
+  fadst8x16_new_avx2       // H_FLIPADST
+};
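+// 8x16 forward transform: length-16 column transform on __m128i rows, then
+// the transposed halves are packed into __m256i registers so the length-8
+// row transform runs on all 16 rows at once.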
+void lowbd_fwd_txfm2d_8x16_avx2(const int16_t *input, int32_t *output,
+                                int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  __m128i buf0[16], buf1[16];
+  __m256i buf2[8];
+  const int8_t *shift = fwd_txfm_shift_ls[TX_8X16];
+  const int txw_idx = get_txw_idx(TX_8X16);
+  const int txh_idx = get_txh_idx(TX_8X16);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 8;
+  const int height = 16;
+  const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type];
+  const transform_1d_avx2 row_txfm = row_txfm8x16_arr[tx_type];
+  int ud_flip, lr_flip;
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  if (ud_flip) {
+    load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
+  } else {
+    load_buffer_16bit_to_16bit(input, stride, buf0, height);
+  }
+  round_shift_16bit(buf0, height, shift[0]);
+  col_txfm(buf0, buf0, cos_bit_col);
+  round_shift_16bit(buf0, height, shift[1]);
+  transpose_16bit_8x8(buf0, buf1);
+  transpose_16bit_8x8(buf0 + 8, buf1 + 8);
+
+  __m128i *bufl, *bufu;
+  if (lr_flip) {
+    bufl = buf0;
+    bufu = buf0 + 8;
+    flip_buf_sse2(buf1 + width * 0, bufl, width);
+    flip_buf_sse2(buf1 + width * 1, bufu, width);
+  } else {
+    bufl = buf1 + width * 0;
+    bufu = buf1 + width * 1;
+  }
+  pack_reg(bufl, bufu, buf2);
+  row_txfm(buf2, buf2, cos_bit_row);
+  round_shift_16bit_w16_avx2(buf2, width, shift[2]);
+  transpose_16bit_16x8_avx2(buf2, buf2);
+  store_rect_buffer_16bit_to_32bit_w8_avx2(buf2, output, width, 8);
+}
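+// 16x8 forward transform: the two 8-wide column halves are packed into
+// __m256i registers for the length-8 AVX2 column transform, then unpacked
+// back to __m128i for the length-16 row transform.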
+void lowbd_fwd_txfm2d_16x8_avx2(const int16_t *input, int32_t *output,
+                                int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  __m128i buf0[16], buf1[16];
+  __m256i buf2[8];
+  const int8_t *shift = fwd_txfm_shift_ls[TX_16X8];
+  const int txw_idx = get_txw_idx(TX_16X8);
+  const int txh_idx = get_txh_idx(TX_16X8);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 16;
+  const int height = 8;
+  const transform_1d_avx2 col_txfm = col_txfm16x8_arr[tx_type];
+  const transform_1d_sse2 row_txfm = row_txfm16x8_arr[tx_type];
+  __m128i *buf;
+  int ud_flip, lr_flip;
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  if (ud_flip) {
+    load_buffer_16bit_to_16bit_flip(input + 8 * 0, stride, buf0, height);
+    load_buffer_16bit_to_16bit_flip(input + 8 * 1, stride, &buf0[8], height);
+  } else {
+    load_buffer_16bit_to_16bit(input + 8 * 0, stride, buf0, height);
+    load_buffer_16bit_to_16bit(input + 8 * 1, stride, &buf0[8], height);
+  }
+  pack_reg(buf0, &buf0[8], buf2);
+  round_shift_16bit_w16_avx2(buf2, height, shift[0]);
+  col_txfm(buf2, buf2, cos_bit_col);
+  round_shift_16bit_w16_avx2(buf2, height, shift[1]);
+  transpose_16bit_16x8_avx2(buf2, buf2);
+  extract_reg(buf2, buf1);
+
+  if (lr_flip) {
+    buf = buf0;
+    flip_buf_sse2(buf1, buf, width);
+  } else {
+    buf = buf1;
+  }
+  row_txfm(buf, buf, cos_bit_row);
+  round_shift_16bit(buf, width, shift[2]);
+  transpose_16bit_8x8(buf, buf);
+  store_rect_buffer_16bit_to_32bit_w8(buf, output, width, height);
+  transpose_16bit_8x8(buf + 8, buf + 8);
+  store_rect_buffer_16bit_to_32bit_w8(buf + 8, output + 8, width, height);
+}
 static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = {
   av1_lowbd_fwd_txfm2d_4x4_sse2,   // 4x4 transform
   av1_lowbd_fwd_txfm2d_8x8_sse2,   // 8x8 transform
@@ -2005,8 +2773,8 @@
   lowbd_fwd_txfm2d_64x64_avx2,     // 64x64 transform
   av1_lowbd_fwd_txfm2d_4x8_sse2,   // 4x8 transform
   av1_lowbd_fwd_txfm2d_8x4_sse2,   // 8x4 transform
-  av1_lowbd_fwd_txfm2d_8x16_sse2,  // 8x16 transform
-  av1_lowbd_fwd_txfm2d_16x8_sse2,  // 16x8 transform
+  lowbd_fwd_txfm2d_8x16_avx2,      // 8x16 transform
+  lowbd_fwd_txfm2d_16x8_avx2,      // 16x8 transform
   lowbd_fwd_txfm2d_16x32_avx2,     // 16x32 transform
   lowbd_fwd_txfm2d_32x16_avx2,     // 32x16 transform
   lowbd_fwd_txfm2d_32x64_avx2,     // 32x64 transform
diff --git a/test/av1_fwd_txfm2d_test.cc b/test/av1_fwd_txfm2d_test.cc
index c1b97f7..c70eba7 100644
--- a/test/av1_fwd_txfm2d_test.cc
+++ b/test/av1_fwd_txfm2d_test.cc
@@ -287,6 +287,67 @@
     }
   }
 }
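+// Times the C reference transform against the SIMD target function for every
+// valid tx_type of the given tx_size and prints the speedup.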
+void AV1FwdTxfm2dSpeedTest(TX_SIZE tx_size, lowbd_fwd_txfm_func target_func) {
+  TxfmParam param;
+  memset(&param, 0, sizeof(param));
+  const int rows = tx_size_high[tx_size];
+  const int cols = tx_size_wide[tx_size];
+  const int num_loops = 1000000 / (rows * cols);
+
+  for (int i = 0; i < 2; ++i) {
+    const int bd = 8;
+    for (int tx_type = 0; tx_type < TX_TYPES; ++tx_type) {
+      if (libaom_test::IsTxSizeTypeValid(
+              tx_size, static_cast<TX_TYPE>(tx_type)) == false) {
+        continue;
+      }
+
+      FwdTxfm2dFunc ref_func = libaom_test::fwd_txfm_func_ls[tx_size];
+      if (ref_func != NULL) {
+        DECLARE_ALIGNED(32, int16_t, input[64 * 64]) = { 0 };
+        DECLARE_ALIGNED(32, int32_t, output[64 * 64]);
+        DECLARE_ALIGNED(32, int32_t, ref_output[64 * 64]);
+        int input_stride = 64;
+        ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+        for (int r = 0; r < rows; ++r) {
+          for (int c = 0; c < cols; ++c) {
+            input[r * input_stride + c] = rnd.Rand16() % (1 << bd);
+          }
+        }
+
+        param.tx_type = (TX_TYPE)tx_type;
+        param.tx_size = (TX_SIZE)tx_size;
+        param.tx_set_type = EXT_TX_SET_ALL16;
+        param.bd = bd;
+
+        aom_usec_timer ref_timer, test_timer;
+
+        aom_usec_timer_start(&ref_timer);
+        for (int i = 0; i < num_loops; ++i) {
+          ref_func(input, ref_output, input_stride, (TX_TYPE)tx_type, bd);
+        }
+        aom_usec_timer_mark(&ref_timer);
+        const int elapsed_time_c =
+            static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
+
+        aom_usec_timer_start(&test_timer);
+        for (int i = 0; i < num_loops; ++i) {
+          target_func(input, output, input_stride, &param);
+        }
+        aom_usec_timer_mark(&test_timer);
+        const int elapsed_time_simd =
+            static_cast<int>(aom_usec_timer_elapsed(&test_timer));
+
+        printf(
+            "txfm_size[%d] \t txfm_type[%d] \t c_time=%d \t simd_time=%d \t "
+            "gain=%d \n",
+            tx_size, tx_type, elapsed_time_c, elapsed_time_simd,
+            (elapsed_time_c / elapsed_time_simd));
+      }
+    }
+  }
+}
 
 typedef ::testing::tuple<TX_SIZE, lowbd_fwd_txfm_func> LbdFwdTxfm2dParam;
 
@@ -295,7 +356,9 @@
 TEST_P(AV1FwdTxfm2dTest, match) {
   AV1FwdTxfm2dMatchTest(GET_PARAM(0), GET_PARAM(1));
 }
-
+TEST_P(AV1FwdTxfm2dTest, DISABLED_Speed) {
+  AV1FwdTxfm2dSpeedTest(GET_PARAM(0), GET_PARAM(1));
+}
 using ::testing::Combine;
 using ::testing::Values;
 using ::testing::ValuesIn;