CWG-D009: ADST types replacement

Replace the ADST forward and inverse transforms:
- 4-point: DST-4, implemented with butterfly operations.
- 8-point: Graph Fourier Transform with a self-loop weight of 1.5,
           implemented as a matrix multiplication.
- 16-point: DST-7, implemented as a matrix multiplication.

The inverse matrix multiplication output is rounded and right-shifted by
7 bits (INV_ADST_BIT); the forward output by 12 bits (FWD_ADST_BIT).
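
For reference, a minimal scalar sketch of the matrix-multiply path follows.
It mirrors the av2_txfm_matrix_mult() routine added by this patch, minus the
per-sum range checks and the optional clamping; txfm_matrix_mult is a
hypothetical standalone name, not part of the patch.

    #include <stdint.h>

    /* Scaling shifts used by the matrix-multiply path (see av1_txfm.h). */
    #define FWD_ADST_BIT 12
    #define INV_ADST_BIT 7

    /* out[i] = round(sum_j kernel[n*i + j] * in[j] / 2^bit) */
    static void txfm_matrix_mult(const int32_t *in, int32_t *out,
                                 const int32_t *kernel, int n, int bit) {
      for (int i = 0; i < n; ++i) {
        int64_t acc = 0;
        for (int j = 0; j < n; ++j) acc += (int64_t)kernel[n * i + j] * in[j];
        out[i] = (int32_t)((acc + ((int64_t)1 << (bit - 1))) >> bit);
      }
    }

The forward kernel entries fit in int16_t, which is what lets the SSE2/AVX2
forward paths below broadcast coefficients with set1_epi16 and accumulate
with 16-bit madd instructions; the inverse SIMD paths use 32-bit multiplies.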
diff --git a/aom_dsp/x86/txfm_common_avx2.h b/aom_dsp/x86/txfm_common_avx2.h
index 57059d7..f4d2386 100644
--- a/aom_dsp/x86/txfm_common_avx2.h
+++ b/aom_dsp/x86/txfm_common_avx2.h
@@ -26,6 +26,37 @@
       (int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)));
 }
 
+#if CONFIG_ADST8_TUNED || CONFIG_ADST16_TUNED
+void iadst_matrix_mult_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+                            int bd, int out_shift, const int32_t* kernel,
+                            int kernel_size, int num_cols);
+
+static INLINE void round_shift_avx2(__m256i *in, const __m256i _r, const int bit) {
+    __m256i a0 = _mm256_add_epi32(*in, _r);
+    *in = _mm256_srai_epi32(a0, bit);
+}
+
+static INLINE __m256i interleave_coefs_avx2(const int32_t a, const int32_t b) {
+    const __m128i coef1 = _mm_set1_epi16(a);
+    const __m128i coef2 = _mm_set1_epi16(b);
+    const __m256i coef = _mm256_insertf128_si256(_mm256_castsi128_si256(coef1), coef2, 0x1);
+    return coef;
+}
+
+static INLINE void matrix_coef_mult_avx2(const __m256i w0, const __m256i w1,
+                                         const __m256i in0, const __m256i in1,
+                                         __m256i *out0, __m256i *out1) {
+    __m256i t0 = _mm256_unpacklo_epi16(in0, in1);
+    __m256i t1 = _mm256_unpackhi_epi16(in0, in1);
+
+    __m256i v0 = _mm256_unpacklo_epi16(w0, w1);
+    __m256i v1 = _mm256_unpackhi_epi16(w0, w1);
+
+    *out0 = _mm256_madd_epi16(t0, v0);
+    *out1 = _mm256_madd_epi16(t1, v1);
+}
+#endif
+
 static INLINE void btf_16_w16_avx2(const __m256i w0, const __m256i w1,
                                    __m256i *in0, __m256i *in1, const __m256i _r,
                                    const int32_t cos_bit) {
diff --git a/av1/common/av1_inv_txfm1d.c b/av1/common/av1_inv_txfm1d.c
index 1f37ad6..c715888 100644
--- a/av1/common/av1_inv_txfm1d.c
+++ b/av1/common/av1_inv_txfm1d.c
@@ -654,6 +654,65 @@
   bf1[31] = clamp_value(bf0[0] - bf0[31], stage_range[stage]);
 }
 
+#if CONFIG_ADST4_TUNED
+void av2_iadst4(const int32_t *input, int32_t *output,
+                int8_t cos_bit, const int8_t *stage_range) {
+    const int32_t size = 4;
+    const int32_t *cospi;
+
+    int32_t *bf0, *bf1;
+    int32_t step[4];
+
+    // stage 0;
+    av1_range_check_buf(0, input, input, size, stage_range[0]);
+
+    // stage 1;
+    bf1 = output;
+    bf1[0] = input[0];
+    bf1[1] = -input[3];
+    bf1[2] = -input[1];
+    bf1[3] = input[2];
+    av1_range_check_buf(1, input, bf1, size, stage_range[1]);
+
+    // stage 2
+    cospi = cospi_arr(cos_bit);
+    bf0 = output;
+    bf1 = step;
+    bf1[0] = bf0[0];
+    bf1[1] = bf0[1];
+    bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);
+    bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit);
+    av1_range_check_buf(2, input, bf1, size, stage_range[2]);
+
+    // stage 3
+    bf0 = step;
+    bf1 = output;
+    bf1[0] = clamp_value(bf0[0] + bf0[2], stage_range[3]);
+    bf1[1] = clamp_value(bf0[1] + bf0[3], stage_range[3]);
+    bf1[2] = clamp_value(bf0[0] - bf0[2], stage_range[3]);
+    bf1[3] = clamp_value(bf0[1] - bf0[3], stage_range[3]);
+    av1_range_check_buf(3, input, bf1, size, stage_range[3]);
+
+    // stage 4
+    bf0 = output;
+    bf1 = step;
+    bf1[0] = half_btf(cospi[8], bf0[0], cospi[56], bf0[1], cos_bit);
+    bf1[1] = half_btf(cospi[56], bf0[0], -cospi[8], bf0[1], cos_bit);
+    bf1[2] = half_btf(cospi[40], bf0[2], cospi[24], bf0[3], cos_bit);
+    bf1[3] = half_btf(cospi[24], bf0[2], -cospi[40], bf0[3], cos_bit);
+    av1_range_check_buf(4, input, bf1, size, stage_range[4]);
+
+    // stage 5
+    bf0 = step;
+    bf1 = output;
+    bf1[0] = bf0[1];
+    bf1[1] = bf0[2];
+    bf1[2] = bf0[3];
+    bf1[3] = bf0[0];
+    av1_range_check_buf(5, input, bf1, size, stage_range[5]);
+}
+#endif
+
 void av1_iadst4(const int32_t *input, int32_t *output, int8_t cos_bit,
                 const int8_t *stage_range) {
   int bit = cos_bit;
@@ -711,6 +770,16 @@
   output[3] = round_shift(x3, bit);
 }
 
+#if CONFIG_ADST8_TUNED
+void av2_iadst8(const int32_t *input, int32_t *output, int8_t cos_bit,
+                const int8_t *stage_range) {
+    (void)cos_bit;
+    av2_txfm_matrix_mult(input, output, av2_adst_kernel8[INV_TXFM],
+                         TXFM_SIZE8, INV_ADST_BIT, stage_range[0]);
+}
+#endif
+
 void av1_iadst8(const int32_t *input, int32_t *output, int8_t cos_bit,
                 const int8_t *stage_range) {
   assert(output != input);
@@ -819,6 +888,16 @@
   bf1[7] = -bf0[1];
 }
 
+#if CONFIG_ADST16_TUNED
+void av2_iadst16(const int32_t *input, int32_t *output, int8_t cos_bit,
+                 const int8_t *stage_range) {
+    (void)cos_bit;
+    av2_txfm_matrix_mult(input, output, av2_adst_kernel16[INV_TXFM],
+                         TXFM_SIZE16, INV_ADST_BIT, stage_range[0]);
+}
+#endif
+
 void av1_iadst16(const int32_t *input, int32_t *output, int8_t cos_bit,
                  const int8_t *stage_range) {
   assert(output != input);
diff --git a/av1/common/av1_inv_txfm1d.h b/av1/common/av1_inv_txfm1d.h
index 9d1bd6d..8190438 100644
--- a/av1/common/av1_inv_txfm1d.h
+++ b/av1/common/av1_inv_txfm1d.h
@@ -54,6 +54,18 @@
                        const int8_t *stage_range);
 void av1_iidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
                        const int8_t *stage_range);
+#if CONFIG_ADST4_TUNED
+void av2_iadst4(const int32_t *input, int32_t *output, int8_t cos_bit,
+                const int8_t *stage_range);
+#endif
+#if CONFIG_ADST8_TUNED
+void av2_iadst8(const int32_t *input, int32_t *output, int8_t cos_bit,
+                const int8_t *stage_range);
+#endif
+#if CONFIG_ADST16_TUNED
+void av2_iadst16(const int32_t *input, int32_t *output, int8_t cos_bit,
+                 const int8_t *stage_range);
+#endif
 
 #ifdef __cplusplus
 }
diff --git a/av1/common/av1_inv_txfm2d.c b/av1/common/av1_inv_txfm2d.c
index d98f054..63bd304 100644
--- a/av1/common/av1_inv_txfm2d.c
+++ b/av1/common/av1_inv_txfm2d.c
@@ -117,9 +117,21 @@
     case TXFM_TYPE_DCT16: return av1_idct16;
     case TXFM_TYPE_DCT32: return av1_idct32;
     case TXFM_TYPE_DCT64: return av1_idct64;
+#if CONFIG_ADST4_TUNED
+    case TXFM_TYPE_ADST4: return av2_iadst4;
+#else
     case TXFM_TYPE_ADST4: return av1_iadst4;
+#endif
+#if CONFIG_ADST8_TUNED
+    case TXFM_TYPE_ADST8: return av2_iadst8;
+#else
     case TXFM_TYPE_ADST8: return av1_iadst8;
+#endif
+#if CONFIG_ADST16_TUNED
+    case TXFM_TYPE_ADST16: return av2_iadst16;
+#else
     case TXFM_TYPE_ADST16: return av1_iadst16;
+#endif
     case TXFM_TYPE_IDENTITY4: return av1_iidentity4_c;
     case TXFM_TYPE_IDENTITY8: return av1_iidentity8_c;
     case TXFM_TYPE_IDENTITY16: return av1_iidentity16_c;
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 482354a..b57beb8 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -137,8 +137,16 @@
 add_proto qw/void inv_stxfm/ , "tran_low_t *src, tran_low_t *dst, const PREDICTION_MODE mode, const uint8_t stx_idx, const int size";
 specialize qw/inv_stxfm sse4_1/;
 
-add_proto qw/void av1_highbd_inv_txfm_add/, "const tran_low_t *input, uint16_t *dest, int stride, const TxfmParam *txfm_param";
-specialize qw/av1_highbd_inv_txfm_add sse4_1 avx2 neon/;
+add_proto qw/void av1_highbd_inv_txfm_add/, "const tran_low_t *input, uint16_t *dest, int stride, const TxfmParam *txfm_param";
+if ( aom_config("CONFIG_ADST4_TUNED") eq "yes"
+  || aom_config("CONFIG_ADST8_TUNED") eq "yes"
+  || aom_config("CONFIG_ADST16_TUNED") eq "yes"
+) {
+    specialize qw/av1_highbd_inv_txfm_add sse4_1 avx2/;
+} else {
+    specialize qw/av1_highbd_inv_txfm_add sse4_1 avx2 neon/;
+}
 
 add_proto qw/void av1_highbd_inv_txfm_add_4x4/,  "const tran_low_t *input, uint16_t *dest, int stride, const TxfmParam *txfm_param";
 specialize qw/av1_highbd_inv_txfm_add_4x4 sse4_1 neon/;
@@ -271,34 +279,74 @@
-  add_proto qw/void av1_lowbd_fwd_txfm/, "const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param";
-  specialize qw/av1_lowbd_fwd_txfm sse2 sse4_1 avx2 neon/;
 
-  add_proto qw/void av1_fwd_txfm2d_4x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_4x8 sse4_1 neon/;
-  add_proto qw/void av1_fwd_txfm2d_8x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_8x4 sse4_1 neon/;
-  add_proto qw/void av1_fwd_txfm2d_8x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_8x16 sse4_1 avx2 neon/;
-  add_proto qw/void av1_fwd_txfm2d_16x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_16x8 sse4_1 avx2 neon/;
-  add_proto qw/void av1_fwd_txfm2d_16x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_16x32 sse4_1 neon/;
-  add_proto qw/void av1_fwd_txfm2d_32x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_32x16 sse4_1 neon/;
-  add_proto qw/void av1_fwd_txfm2d_4x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_4x16 sse4_1 neon/;
-  add_proto qw/void av1_fwd_txfm2d_16x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_16x4 sse4_1 neon/;
-  add_proto qw/void av1_fwd_txfm2d_8x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_8x32 sse4_1 neon/;
-  add_proto qw/void av1_fwd_txfm2d_32x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_32x8 sse4_1 neon/;
-  add_proto qw/void av1_fwd_txfm2d_4x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_4x4 sse4_1 neon/;
-  add_proto qw/void av1_fwd_txfm2d_8x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_8x8 sse4_1 avx2 neon/;
-  add_proto qw/void av1_fwd_txfm2d_16x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_16x16 sse4_1 avx2 neon/;
-  add_proto qw/void av1_fwd_txfm2d_32x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_32x32 sse4_1 avx2 neon/;
+  if ( aom_config("CONFIG_ADST4_TUNED") eq "yes"
+    || aom_config("CONFIG_ADST8_TUNED") eq "yes"
+    || aom_config("CONFIG_ADST16_TUNED") eq "yes"
+  ) {
+      add_proto qw/void av1_lowbd_fwd_txfm/, "const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param";
+      specialize qw/av1_lowbd_fwd_txfm sse2 sse4_1 avx2/;
+
+      add_proto qw/void av1_fwd_txfm2d_4x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+      specialize qw/av1_fwd_txfm2d_4x8 sse4_1/;
+      add_proto qw/void av1_fwd_txfm2d_8x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+      specialize qw/av1_fwd_txfm2d_8x4 sse4_1/;
+      add_proto qw/void av1_fwd_txfm2d_8x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+      specialize qw/av1_fwd_txfm2d_8x16 sse4_1 avx2/;
+      add_proto qw/void av1_fwd_txfm2d_16x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+      specialize qw/av1_fwd_txfm2d_16x8 sse4_1 avx2/;
+      add_proto qw/void av1_fwd_txfm2d_16x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+      specialize qw/av1_fwd_txfm2d_16x32 sse4_1 neon/;
+      add_proto qw/void av1_fwd_txfm2d_32x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+      specialize qw/av1_fwd_txfm2d_32x16 sse4_1 neon/;
+      add_proto qw/void av1_fwd_txfm2d_4x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+      specialize qw/av1_fwd_txfm2d_4x16 sse4_1/;
+      add_proto qw/void av1_fwd_txfm2d_16x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+      specialize qw/av1_fwd_txfm2d_16x4 sse4_1/;
+      add_proto qw/void av1_fwd_txfm2d_8x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+      specialize qw/av1_fwd_txfm2d_8x32 sse4_1 neon/;
+      add_proto qw/void av1_fwd_txfm2d_32x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+      specialize qw/av1_fwd_txfm2d_32x8 sse4_1 neon/;
+      add_proto qw/void av1_fwd_txfm2d_4x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+      specialize qw/av1_fwd_txfm2d_4x4 sse4_1/;
+      add_proto qw/void av1_fwd_txfm2d_8x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+      specialize qw/av1_fwd_txfm2d_8x8 sse4_1 avx2/;
+      add_proto qw/void av1_fwd_txfm2d_16x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+      specialize qw/av1_fwd_txfm2d_16x16 sse4_1 avx2/;
+      add_proto qw/void av1_fwd_txfm2d_32x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+      specialize qw/av1_fwd_txfm2d_32x32 sse4_1 avx2 neon/;
+  } else {
+      add_proto qw/void av1_lowbd_fwd_txfm/, "const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param";
+      specialize qw/av1_lowbd_fwd_txfm sse2 sse4_1 avx2 neon/;
+
+      add_proto qw/void av1_fwd_txfm2d_4x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+      specialize qw/av1_fwd_txfm2d_4x8 sse4_1 neon/;
+      add_proto qw/void av1_fwd_txfm2d_8x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+      specialize qw/av1_fwd_txfm2d_8x4 sse4_1 neon/;
+      add_proto qw/void av1_fwd_txfm2d_8x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+      specialize qw/av1_fwd_txfm2d_8x16 sse4_1 avx2 neon/;
+      add_proto qw/void av1_fwd_txfm2d_16x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+      specialize qw/av1_fwd_txfm2d_16x8 sse4_1 avx2 neon/;
+      add_proto qw/void av1_fwd_txfm2d_16x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+      specialize qw/av1_fwd_txfm2d_16x32 sse4_1 neon/;
+      add_proto qw/void av1_fwd_txfm2d_32x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+      specialize qw/av1_fwd_txfm2d_32x16 sse4_1 neon/;
+      add_proto qw/void av1_fwd_txfm2d_4x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+      specialize qw/av1_fwd_txfm2d_4x16 sse4_1 neon/;
+      add_proto qw/void av1_fwd_txfm2d_16x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+      specialize qw/av1_fwd_txfm2d_16x4 sse4_1 neon/;
+      add_proto qw/void av1_fwd_txfm2d_8x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+      specialize qw/av1_fwd_txfm2d_8x32 sse4_1 neon/;
+      add_proto qw/void av1_fwd_txfm2d_32x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+      specialize qw/av1_fwd_txfm2d_32x8 sse4_1 neon/;
+      add_proto qw/void av1_fwd_txfm2d_4x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+      specialize qw/av1_fwd_txfm2d_4x4 sse4_1 neon/;
+      add_proto qw/void av1_fwd_txfm2d_8x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+      specialize qw/av1_fwd_txfm2d_8x8 sse4_1 avx2 neon/;
+      add_proto qw/void av1_fwd_txfm2d_16x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+      specialize qw/av1_fwd_txfm2d_16x16 sse4_1 avx2 neon/;
+      add_proto qw/void av1_fwd_txfm2d_32x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+      specialize qw/av1_fwd_txfm2d_32x32 sse4_1 avx2 neon/;
+  }
 
   add_proto qw/void av1_fwd_txfm2d_64x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
   specialize qw/av1_fwd_txfm2d_64x64 sse4_1 avx2 neon/;
diff --git a/av1/common/av1_txfm.c b/av1/common/av1_txfm.c
index bd208ee..4a1d101 100644
--- a/av1/common/av1_txfm.c
+++ b/av1/common/av1_txfm.c
@@ -14,6 +14,9 @@
 #include "config/av1_rtcd.h"
 
 #include "av1/common/av1_txfm.h"
+#if CONFIG_ADST8_TUNED || CONFIG_ADST16_TUNED
+#include "av1/common/av1_inv_txfm1d.h"
+#endif
 
 // av1_cospi_arr[i][j] = (int)round(cos(PI*j/128) * (1<<(cos_bit_min+i)));
 const int32_t av1_cospi_arr_data[7][64] = {
@@ -83,6 +86,98 @@
   { 0, 21133, 39716, 53510, 60849 }
 };
 
+#if CONFIG_ADST8_TUNED
+const int32_t av2_adst_kernel8[TXFM_DIRECTIONS][TXFM_KERNEL_SIZE8] =
+{
+  {
+    519, 1278, 1989, 2628, 3169, 3594, 3886, 4035,
+    1529, 3327, 4049, 3461, 1754, -521, -2627, -3884,
+    2454, 4041, 2179, -1542, -3947, -2984, 526, 3587,
+    3232, 3081, -1835, -3913, 61, 3941, 1726, -3158,
+    3781, 759, -4008, 440, 3877, -1599, -3398, 2616,
+    3974, -1987, -1987, 3974, -1987, -1987, 3974, -1987,
+    3581, -3764, 2258, 262, -2665, 3871, -3339, 1309,
+    2264, -3145, 3679, -3805, 3511, -2828, 1832, -634,
+  },
+  {
+    16, 48, 77, 101, 118, 124, 112, 71,
+    40, 104, 126, 96, 24, -62, -118, -98,
+    62, 127, 68, -57, -125, -62, 71, 115,
+    82, 108, -48, -122, 14, 124, 8, -119,
+    99, 55, -123, 2, 121, -62, -83, 110,
+    112, -16, -93, 123, -50, -62, 121, -88,
+    121, -82, 16, 54, -106, 124, -104, 57,
+    126, -121, 112, -99, 82, -62, 41, -20,
+  }
+};
+#endif
+#if CONFIG_ADST16_TUNED
+const int32_t av2_adst_kernel16[TXFM_DIRECTIONS][TXFM_KERNEL_SIZE16] =
+{
+  {
+    383, 763, 1136, 1499, 1848, 2181, 2493, 2783, 3048, 3286, 3493, 3669, 3812, 3920, 3992, 4029,
+    1136, 2181, 3048, 3669, 3992, 3992, 3669, 3048, 2181, 1136, 0, -1136, -2181, -3048, -3669, -3992,
+    1848, 3286, 3992, 3812, 2783, 1136, -763, -2493, -3669, -4029, -3493, -2181, -383, 1499, 3048, 3920,
+    2493, 3920, 3669, 1848, -763, -3048, -4029, -3286, -1136, 1499, 3493, 3992, 2783, 383, -2181, -3812,
+    3048, 3992, 2181, -1136, -3669, -3669, -1136, 2181, 3992, 3048, 0, -3048, -3992, -2181, 1136, 3669,
+    3493, 3493, 0, -3493, -3493, 0, 3493, 3493, 0, -3493, -3493, 0, 3493, 3493, 0, -3493,
+    3812, 2493, -2181, -3920, -383, 3669, 2783, -1848, -3992, -763, 3493, 3048, -1499, -4029, -1136, 3286,
+    3992, 1136, -3669, -2181, 3048, 3048, -2181, -3669, 1136, 3992, 0, -3992, -1136, 3669, 2181, -3048,
+    4029, -383, -3992, 763, 3920, -1136, -3812, 1499, 3669, -1848, -3493, 2181, 3286, -2493, -3048, 2783,
+    3920, -1848, -3048, 3286, 1499, -3992, 383, 3812, -2181, -2783, 3493, 1136, -4029, 763, 3669, -2493,
+    3669, -3048, -1136, 3992, -2181, -2181, 3992, -1136, -3048, 3669, 0, -3669, 3048, 1136, -3992, 2181,
+    3286, -3812, 1136, 2493, -4029, 2181, 1499, -3920, 3048, 383, -3493, 3669, -763, -2783, 3992, -1848,
+    2783, -4029, 3048, -383, -2493, 3992, -3286, 763, 2181, -3920, 3493, -1136, -1848, 3812, -3669, 1499,
+    2181, -3669, 3992, -3048, 1136, 1136, -3048, 3992, -3669, 2181, 0, -2181, 3669, -3992, 3048, -1136,
+    1499, -2783, 3669, -4029, 3812, -3048, 1848, -383, -1136, 2493, -3493, 3992, -3920, 3286, -2181, 763,
+    763, -1499, 2181, -2783, 3286, -3669, 3920, -4029, 3992, -3812, 3493, -3048, 2493, -1848, 1136, -383,
+  },
+  {
+    12, 36, 58, 78, 95, 109, 119, 125, 126, 122, 115, 103, 87, 68, 47, 24,
+    24, 68, 103, 122, 125, 109, 78, 36, -12, -58, -95, -119, -126, -115, -87, -47,
+    36, 95, 125, 115, 68, 0, -68, -115, -125, -95, -36, 36, 95, 125, 115, 68,
+    47, 115, 119, 58, -36, -109, -122, -68, 24, 103, 125, 78, -12, -95, -126, -87,
+    58, 125, 87, -24, -115, -109, -12, 95, 122, 47, -68, -126, -78, 36, 119, 103,
+    68, 125, 36, -95, -115, 0, 115, 95, -36, -125, -68, 68, 125, 36, -95, -115,
+    78, 115, -24, -126, -36, 109, 87, -68, -119, 12, 125, 47, -103, -95, 58, 122,
+    87, 95, -78, -103, 68, 109, -58, -115, 47, 119, -36, -122, 24, 125, -12, -126,
+    95, 68, -115, -36, 125, 0, -125, 36, 115, -68, -95, 95, 68, -115, -36, 125,
+    103, 36, -126, 47, 95, -109, -24, 125, -58, -87, 115, 12, -122, 68, 78, -119,
+    109, 0, -109, 109, 0, -109, 109, 0, -109, 109, 0, -109, 109, 0, -109, 109,
+    115, -36, -68, 125, -95, 0, 95, -125, 68, 36, -115, 115, -36, -68, 125, -95,
+    119, -68, -12, 87, -125, 109, -47, -36, 103, -126, 95, -24, -58, 115, -122, 78,
+    122, -95, 47, 12, -68, 109, -126, 115, -78, 24, 36, -87, 119, -125, 103, -58,
+    125, -115, 95, -68, 36, 0, -36, 68, -95, 115, -125, 125, -115, 95, -68, 36,
+    126, -125, 122, -119, 115, -109, 103, -95, 87, -78, 68, -58, 47, -36, 24, -12,
+  }
+};
+#endif
+
+#if CONFIG_ADST8_TUNED || CONFIG_ADST16_TUNED
+void av2_txfm_matrix_mult(const int32_t *input, int32_t *output,
+                          const int32_t *kernel, int kernel_size,
+                          int8_t bit, int8_t clamp) {
+    assert(kernel_size <= TXFM_SIZE_MAX);
+
+    int32_t result[TXFM_SIZE_MAX];
+    memset(result, 0, sizeof(result[0]) * kernel_size);
+
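+    // Accumulate the dot product of kernel row i with the input; every
+    // partial sum is range-checked against STAGE_RANGE_MAX bits.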
+    for(int i = 0; i < kernel_size; ++i) {
+        for(int j = 0; j < kernel_size; ++j) {
+            int32_t prod = kernel[kernel_size * i + j] * input[j];
+            result[i] = range_check_value(prod + result[i], STAGE_RANGE_MAX);
+        }
+    }
+    for(int i = 0; i < kernel_size; ++i)
+        output[i] = round_shift(result[i], bit);
+
+    if (clamp) {
+      for(int i = 0; i < kernel_size; ++i)
+        output[i] = clamp_value(output[i], clamp);
+    }
+}
+#endif
+
 void av1_round_shift_array_c(int32_t *arr, int size, int bit) {
   int i;
   if (bit == 0) {
diff --git a/av1/common/av1_txfm.h b/av1/common/av1_txfm.h
index 4c983a1..a73a279 100644
--- a/av1/common/av1_txfm.h
+++ b/av1/common/av1_txfm.h
@@ -34,6 +34,12 @@
 
 extern const int32_t av1_cospi_arr_data[7][64];
 extern const int32_t av1_sinpi_arr_data[7][5];
+#if CONFIG_ADST8_TUNED
+extern const int32_t av2_adst_kernel8[TXFM_DIRECTIONS][TXFM_KERNEL_SIZE8];
+#endif
+#if CONFIG_ADST16_TUNED
+extern const int32_t av2_adst_kernel16[TXFM_DIRECTIONS][TXFM_KERNEL_SIZE16];
+#endif
 
 #if CONFIG_CROSS_CHROMA_TX
 #define CCTX_PREC_BITS 8
@@ -45,6 +51,11 @@
 static const int cos_bit_min = 10;
 static const int cos_bit_max = 16;
 
+#if CONFIG_ADST8_TUNED || CONFIG_ADST16_TUNED
+#define FWD_ADST_BIT 12
+#define INV_ADST_BIT 7
+#endif
+
 #define NewSqrt2Bits ((int32_t)12)
 // 2^12 * sqrt(2)
 static const int32_t NewSqrt2 = 5793;
@@ -221,6 +232,11 @@
                           TXFM_2D_FLIP_CFG *cfg);
 void av1_get_inv_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size,
                           TXFM_2D_FLIP_CFG *cfg);
+#if CONFIG_ADST8_TUNED || CONFIG_ADST16_TUNED
+void av2_txfm_matrix_mult(const int32_t *input, int32_t *output,
+                          const int32_t *kernel, int kernel_size,
+                          int8_t bit, int8_t clamp);
+#endif
 extern const TXFM_TYPE av1_txfm_type_ls[5][TX_TYPES_1D];
 extern const int8_t av1_txfm_stage_num_list[TXFM_TYPES];
 static INLINE int get_txw_idx(TX_SIZE tx_size) {
diff --git a/av1/common/enums.h b/av1/common/enums.h
index d194d6f..fe01c86 100644
--- a/av1/common/enums.h
+++ b/av1/common/enums.h
@@ -177,6 +177,24 @@
 #define IST_8x8_WIDTH 64
 #define IST_8x8_HEIGHT 32
 
+#if CONFIG_ATC_NEWTXSETS
+// TX sizes used for mode dependent TX sets
+#define MODE_DEPTX_TXSIZES 19
+#endif  // CONFIG_ATC_NEWTXSETS
+
+#if CONFIG_ADST8_TUNED || CONFIG_ADST16_TUNED
+#define TXFM_SIZE_MAX 16
+#define STAGE_RANGE_MAX 30
+#endif
+#if CONFIG_ADST8_TUNED
+#define TXFM_SIZE8 8
+#define TXFM_KERNEL_SIZE8 64
+#endif
+#if CONFIG_ADST16_TUNED
+#define TXFM_SIZE16 16
+#define TXFM_KERNEL_SIZE16 256
+#endif
+
 #define FSC_MODES 2
 #if CONFIG_ATC_DCTX_ALIGNED
 #define FSC_MAXWIDTH 32
@@ -537,6 +555,14 @@
 } UENUM1BYTE(CctxType);
 #endif  // CONFIG_CROSS_CHROMA_TX
 
+#if CONFIG_ADST8_TUNED || CONFIG_ADST16_TUNED
+enum {
+  FWD_TXFM,
+  INV_TXFM,
+  TXFM_DIRECTIONS
+} UENUM1BYTE(TXFM_DIRECTION);
+#endif
+
 enum {
   REG_REG,
   REG_SMOOTH,
diff --git a/av1/common/x86/av1_txfm_sse2.h b/av1/common/x86/av1_txfm_sse2.h
index c09e643..e622a73 100644
--- a/av1/common/x86/av1_txfm_sse2.h
+++ b/av1/common/x86/av1_txfm_sse2.h
@@ -257,6 +257,21 @@
   }
 }
 
+#if CONFIG_ADST8_TUNED || CONFIG_ADST16_TUNED
+static INLINE void matrix_coef_mult_sse2(const __m128i w0, const __m128i w1,
+                                         const __m128i in0, const __m128i in1,
+                                         __m128i *out0, __m128i *out1) {
+    __m128i t0 = _mm_unpacklo_epi16(in0, in1);
+    __m128i t1 = _mm_unpackhi_epi16(in0, in1);
+
+    __m128i v0 = _mm_unpacklo_epi16(w0, w1);
+    __m128i v1 = _mm_unpackhi_epi16(w0, w1);
+
+    *out0 = _mm_madd_epi16(t0, v0);
+    *out1 = _mm_madd_epi16(t1, v1);
+}
+#endif
+
 void av1_lowbd_fwd_txfm2d_4x4_sse2(const int16_t *input, int32_t *output,
                                    int stride, TX_TYPE tx_type, int bd);
 
diff --git a/av1/common/x86/av1_txfm_sse4.h b/av1/common/x86/av1_txfm_sse4.h
index 1b10e51..8cebab8 100644
--- a/av1/common/x86/av1_txfm_sse4.h
+++ b/av1/common/x86/av1_txfm_sse4.h
@@ -19,6 +19,12 @@
 extern "C" {
 #endif
 
+#if CONFIG_ADST8_TUNED || CONFIG_ADST16_TUNED
+void iadst_matrix_mult_sse4(__m128i *in, __m128i *out, int bit, int do_cols,
+                            int bd, int out_shift, const int32_t* kernel,
+                            int kernel_size, int num_cols);
+#endif
+
 static INLINE __m128i av1_round_shift_32_sse4_1(__m128i vec, int bit) {
   __m128i tmp, round;
   round = _mm_set1_epi32(1 << (bit - 1));
diff --git a/av1/common/x86/highbd_inv_txfm_avx2.c b/av1/common/x86/highbd_inv_txfm_avx2.c
index 8af0bb9..5139db0 100644
--- a/av1/common/x86/highbd_inv_txfm_avx2.c
+++ b/av1/common/x86/highbd_inv_txfm_avx2.c
@@ -1534,6 +1534,14 @@
   }
 }
 
+#if CONFIG_ADST16_TUNED
+static void iadst16_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+                              int bd, int out_shift) {
+    (void)bit;
+    iadst_matrix_mult_avx2(in, out, INV_ADST_BIT, do_cols, bd, out_shift,
+                           av2_adst_kernel16[INV_TXFM], TXFM_SIZE16, 1);
+}
+#else
 static void iadst16_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
                               int bd, int out_shift) {
   const int32_t *cospi = cospi_arr(bit);
@@ -1711,7 +1719,16 @@
     }
   }
 }
+#endif
 
+#if CONFIG_ADST16_TUNED
+static void iadst16_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+                              int bd, int out_shift) {
+    (void)bit;
+    iadst_matrix_mult_avx2(in, out, INV_ADST_BIT, do_cols, bd, out_shift,
+                           av2_adst_kernel16[INV_TXFM], TXFM_SIZE16, 8);
+}
+#else
 static void iadst16_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
                               int bd, int out_shift) {
   const int32_t *cospi = cospi_arr(bit);
@@ -2032,7 +2049,15 @@
     }
   }
 }
-
+#endif
+#if CONFIG_ADST16_TUNED
+static void iadst16_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+                         int bd, int out_shift) {
+    (void)bit;
+    iadst_matrix_mult_avx2(in, out, INV_ADST_BIT, do_cols, bd, out_shift,
+                           av2_adst_kernel16[INV_TXFM], TXFM_SIZE16, TXFM_SIZE16);
+}
+#else
 static void iadst16_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
                          int bd, int out_shift) {
   const int32_t *cospi = cospi_arr(bit);
@@ -2412,6 +2437,7 @@
     }
   }
 }
+#endif
 static void idct8x8_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
                               int bd, int out_shift) {
   const int32_t *cospi = cospi_arr(bit);
@@ -2562,6 +2588,61 @@
     highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 8);
   }
 }
+
+#if CONFIG_ADST8_TUNED || CONFIG_ADST16_TUNED
+void iadst_matrix_mult_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+                            int bd, int out_shift, const int32_t* kernel,
+                            int kernel_size, int num_cols) {
+    const __m256i zero = _mm256_setzero_si256();
+    const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+    int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+    __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+    __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+    __m256i x[16];
+
+    for(int i = 0; i < kernel_size; ++i) {
+        int row_idx = i*kernel_size;
+        __m256i sum = zero;
+        __m256i t;
+        for(int j = 0; j < num_cols; ++j) {
+            const __m256i coef = _mm256_set1_epi32(kernel[row_idx + j]);
+            t = _mm256_mullo_epi32(in[j], coef);
+            sum = _mm256_add_epi32(sum, t);
+        }
+        sum = _mm256_add_epi32(sum, rnding);
+        sum = _mm256_srai_epi32(sum, bit);
+        sum = _mm256_max_epi32(sum, clamp_lo);
+        x[i] = _mm256_min_epi32(sum, clamp_hi);
+    }
+
+    if (!do_cols) {
+        log_range = AOMMAX(16, bd + 6);
+        clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+        clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+        if (out_shift != 0) {
+            __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
+            for(int i = 0; i < kernel_size; ++i) {
+                x[i] = _mm256_add_epi32(x[i], offset);
+                x[i] = _mm256_sra_epi32(x[i], _mm_cvtsi32_si128(out_shift));
+            }
+        }
+    }
+
+    for(int i = 0; i < kernel_size; ++i) {
+        x[i] = _mm256_max_epi32(x[i], clamp_lo);
+        out[i] = _mm256_min_epi32(x[i], clamp_hi);
+    }
+}
+#endif
+
+#if CONFIG_ADST8_TUNED
+static void iadst8x8_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+                               int bd, int out_shift) {
+    (void)bit;
+    iadst_matrix_mult_avx2(in, out, INV_ADST_BIT, do_cols, bd, out_shift,
+                           av2_adst_kernel8[INV_TXFM], TXFM_SIZE8, 1);
+}
+#else
 static void iadst8x8_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
                                int bd, int out_shift) {
   const int32_t *cospi = cospi_arr(bit);
@@ -2651,7 +2732,15 @@
                    out_shift);
   }
 }
-
+#endif
+#if CONFIG_ADST8_TUNED
+static void iadst8x8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+                          int bd, int out_shift) {
+    (void)bit;
+    iadst_matrix_mult_avx2(in, out, INV_ADST_BIT, do_cols, bd, out_shift,
+                           av2_adst_kernel8[INV_TXFM], TXFM_SIZE8, TXFM_SIZE8);
+}
+#else
 static void iadst8x8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
                           int bd, int out_shift) {
   const int32_t *cospi = cospi_arr(bit);
@@ -2820,6 +2909,7 @@
                    out_shift);
   }
 }
+#endif
 static INLINE void idct64_stage8_avx2(
     __m256i *u, const __m256i *cospim32, const __m256i *cospi32,
     const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16,
diff --git a/av1/common/x86/highbd_inv_txfm_sse4.c b/av1/common/x86/highbd_inv_txfm_sse4.c
index 2096832..adecff3 100644
--- a/av1/common/x86/highbd_inv_txfm_sse4.c
+++ b/av1/common/x86/highbd_inv_txfm_sse4.c
@@ -447,6 +447,79 @@
   }
 }
 
+#if CONFIG_ADST4_TUNED
+static void iadst4x4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+                            int bd, int out_shift) {
+    const int32_t *cospi = cospi_arr(bit);
+    const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+    const __m128i cospi8 =  _mm_set1_epi32((int)cospi[8]);
+    const __m128i cospi24 = _mm_set1_epi32((int)cospi[24]);
+    const __m128i cospi32 = _mm_set1_epi32((int)cospi[32]);
+    const __m128i cospi40 = _mm_set1_epi32((int)cospi[40]);
+    const __m128i cospi56 = _mm_set1_epi32((int)cospi[56]);
+    int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+    __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+    __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+
+    __m128i s0, s1, s2, s3;
+    __m128i x0, x1, x2, x3;
+    __m128i u0, u1, u2, u3;
+    __m128i v0, v1, v2, v3;
+
+    // stage 0 transpose
+    v0 = _mm_unpacklo_epi32(in[0], in[1]);
+    v1 = _mm_unpackhi_epi32(in[0], in[1]);
+    v2 = _mm_unpacklo_epi32(in[2], in[3]);
+    v3 = _mm_unpackhi_epi32(in[2], in[3]);
+
+    u0 = _mm_unpacklo_epi64(v0, v2);
+    u1 = _mm_unpackhi_epi64(v0, v2);
+    u2 = _mm_unpacklo_epi64(v1, v3);
+    u3 = _mm_unpackhi_epi64(v1, v3);
+
+    // stage 1
+    x0 = u0;
+    x2 = _mm_sub_epi32(_mm_setzero_si128(), u1);
+    x3 = u2;
+    x1 = _mm_sub_epi32(_mm_setzero_si128(), u3);
+
+    // stage 2
+    s0 = x0;
+    s1 = x1;
+    s2 = half_btf_sse4_1(&cospi32, &x2, &cospi32, &x3, &rnding, bit);
+    s3 = half_btf_neg_sse4_1(&cospi32, &x2, &cospi32, &x3, &rnding, bit);
+
+    // stage 3
+    addsub_sse4_1(s0, s2, &x0, &x2, &clamp_lo, &clamp_hi);
+    addsub_sse4_1(s1, s3, &x1, &x3, &clamp_lo, &clamp_hi);
+
+    // stage 4
+    s0 = half_btf_sse4_1(&cospi8, &x0, &cospi56, &x1, &rnding, bit);
+    s1 = half_btf_neg_sse4_1(&cospi56, &x0, &cospi8, &x1, &rnding, bit);
+    s2 = half_btf_sse4_1(&cospi40, &x2, &cospi24, &x3, &rnding, bit);
+    s3 = half_btf_neg_sse4_1(&cospi24, &x2, &cospi40, &x3, &rnding, bit);
+
+    // stage 5
+    out[0] = s1;
+    out[1] = s2;
+    out[2] = s3;
+    out[3] = s0;
+
+    if (!do_cols) {
+        log_range = AOMMAX(16, bd + 6);
+        clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+        clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+        round_shift_4x4(out, out_shift);
+        highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 4);
+    }
+}
+#else
 static void iadst4x4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                             int bd, int out_shift) {
   const int32_t *sinpi = sinpi_arr(bit);
@@ -572,6 +645,7 @@
     highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 4);
   }
 }
+#endif
 
 static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride,
                              int fliplr, int flipud, int shift, int bd) {
@@ -931,6 +1005,14 @@
   }
 }
 
+#if CONFIG_ADST8_TUNED
+static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+                            int bd, int out_shift) {
+  (void)bit;
+  iadst_matrix_mult_sse4(in, out, INV_ADST_BIT, do_cols, bd, out_shift,
+                         av2_adst_kernel8[INV_TXFM], TXFM_SIZE8, TXFM_SIZE8);
+}
+#else
 static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                             int bd, int out_shift) {
   const int32_t *cospi = cospi_arr(bit);
@@ -1251,6 +1333,7 @@
                      &clamp_hi_out, out_shift);
   }
 }
+#endif
 
 static void iidentity8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                               int bd, int out_shift) {
@@ -1590,6 +1673,14 @@
   }
 }
 
+#if CONFIG_ADST8_TUNED
+static void iadst8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit,
+                                 int do_cols, int bd, int out_shift) {
+    (void)bit;
+    iadst_matrix_mult_sse4(in, out, INV_ADST_BIT, do_cols, bd, out_shift,
+                           av2_adst_kernel8[INV_TXFM], TXFM_SIZE8, 1);
+}
+#else
 static void iadst8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit,
                                  int do_cols, int bd, int out_shift) {
   const int32_t *cospi = cospi_arr(bit);
@@ -1678,7 +1769,15 @@
                      out_shift);
   }
 }
-
+#endif
+#if CONFIG_ADST8_TUNED
+static void iadst8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+                                int bd, int out_shift) {
+    (void)bit;
+    iadst_matrix_mult_sse4(in, out, INV_ADST_BIT, do_cols, bd, out_shift,
+                           av2_adst_kernel8[INV_TXFM], TXFM_SIZE8, TXFM_SIZE8);
+}
+#else
 static void iadst8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                                 int bd, int out_shift) {
   const int32_t *cospi = cospi_arr(bit);
@@ -1849,6 +1948,7 @@
                      out_shift);
   }
 }
+#endif
 
 static void idct16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit,
                                   int do_cols, int bd, int out_shift) {
@@ -2043,6 +2143,60 @@
   }
 }
 
+#if CONFIG_ADST8_TUNED || CONFIG_ADST16_TUNED
+void iadst_matrix_mult_sse4(__m128i *in, __m128i *out, int bit, int do_cols,
+                            int bd, int out_shift, const int32_t* kernel,
+                            int kernel_size, int num_cols) {
+    const __m128i zero = _mm_setzero_si128();
+    const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+    int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+    __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+    __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+    __m128i x[16];
+
+    for (int i = 0; i < kernel_size; ++i) {
+        int row_idx = i*kernel_size;
+        __m128i sum = zero;
+        __m128i t;
+        for (int j = 0; j < num_cols; ++j) {
+            const __m128i coef = _mm_set1_epi32(kernel[row_idx + j]);
+            t = _mm_mullo_epi32(in[j], coef);
+            sum = _mm_add_epi32(sum, t);
+        }
+        sum = _mm_add_epi32(sum, rnding);
+        sum = _mm_srai_epi32(sum, bit);
+        sum = _mm_max_epi32(sum, clamp_lo);
+        x[i] = _mm_min_epi32(sum, clamp_hi);
+    }
+
+    if (!do_cols) {
+        log_range = AOMMAX(16, bd + 6);
+        clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+        clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+        if (out_shift != 0) {
+            __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
+            for(int i = 0; i < kernel_size; ++i) {
+                x[i] = _mm_add_epi32(x[i], offset);
+                x[i] = _mm_sra_epi32(x[i], _mm_cvtsi32_si128(out_shift));
+            }
+        }
+    }
+
+    for(int i = 0; i < kernel_size; ++i) {
+        x[i] = _mm_max_epi32(x[i], clamp_lo);
+        out[i] = _mm_min_epi32(x[i], clamp_hi);
+    }
+}
+#endif
+
+#if CONFIG_ADST16_TUNED
+static void iadst16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit,
+                                   int do_cols, int bd, int out_shift) {
+    (void)bit;
+    iadst_matrix_mult_sse4(in, out, INV_ADST_BIT, do_cols, bd, out_shift,
+                           av2_adst_kernel16[INV_TXFM], TXFM_SIZE16, 1);
+}
+#else
 static void iadst16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit,
                                    int do_cols, int bd, int out_shift) {
   const int32_t *cospi = cospi_arr(bit);
@@ -2214,7 +2368,16 @@
                      &clamp_hi_out, out_shift);
   }
 }
+#endif
 
+#if CONFIG_ADST16_TUNED
+static void iadst16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit,
+                                   int do_cols, int bd, int out_shift) {
+    (void)bit;
+    iadst_matrix_mult_sse4(in, out, INV_ADST_BIT, do_cols, bd, out_shift,
+                           av2_adst_kernel16[INV_TXFM], TXFM_SIZE16, 8);
+}
+#else
 static void iadst16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit,
                                    int do_cols, int bd, int out_shift) {
   const int32_t *cospi = cospi_arr(bit);
@@ -2531,6 +2694,7 @@
                      &clamp_hi_out, out_shift);
   }
 }
+#endif
 
 static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                              int bd, int out_shift) {
@@ -2714,6 +2878,14 @@
   }
 }
 
+#if CONFIG_ADST16_TUNED
+static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+                              int bd, int out_shift) {
+    (void)bit;
+    iadst_matrix_mult_sse4(in, out, INV_ADST_BIT, do_cols, bd, out_shift,
+                           av2_adst_kernel16[INV_TXFM], TXFM_SIZE16, TXFM_SIZE16);
+}
+#else
 static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                               int bd, int out_shift) {
   const int32_t *cospi = cospi_arr(bit);
@@ -3090,6 +3262,7 @@
                      &clamp_hi_out, out_shift);
   }
 }
+#endif
 static void iidentity16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                                int bd, int out_shift) {
   (void)bit;
diff --git a/av1/common/x86/highbd_txfm_utility_sse4.h b/av1/common/x86/highbd_txfm_utility_sse4.h
index 3a2ded9..f049287 100644
--- a/av1/common/x86/highbd_txfm_utility_sse4.h
+++ b/av1/common/x86/highbd_txfm_utility_sse4.h
@@ -108,7 +108,20 @@
   x = _mm_srai_epi32(x, bit);
   return x;
 }
+#if CONFIG_ADST4_TUNED
+static INLINE __m128i half_btf_neg_sse4_1(const __m128i *w0, const __m128i *n0,
+                                          const __m128i *w1, const __m128i *n1,
+                                          const __m128i *rounding, int bit) {
+  __m128i x, y;
 
+  x = _mm_mullo_epi32(*w0, *n0);
+  y = _mm_mullo_epi32(*w1, *n1);
+  x = _mm_sub_epi32(x, y);
+  x = _mm_add_epi32(x, *rounding);
+  x = _mm_srai_epi32(x, bit);
+  return x;
+}
+#endif
 static INLINE __m128i half_btf_0_sse4_1(const __m128i *w0, const __m128i *n0,
                                         const __m128i *rounding, int bit) {
   __m128i x;
diff --git a/av1/encoder/av1_fwd_txfm1d.c b/av1/encoder/av1_fwd_txfm1d.c
index 2d87d13..91564c3 100644
--- a/av1/encoder/av1_fwd_txfm1d.c
+++ b/av1/encoder/av1_fwd_txfm1d.c
@@ -674,6 +674,65 @@
   av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 }
 
+#if CONFIG_ADST4_TUNED
+void av2_fadst4(const int32_t *input, int32_t *output,
+                int8_t cos_bit, const int8_t *stage_range) {
+    const int32_t size = 4;
+    const int32_t *cospi;
+
+    int32_t *bf0, *bf1;
+    int32_t step[4];
+
+    // stage 0;
+    av1_range_check_buf(0, input, input, size, stage_range[0]);
+
+    // stage 1;
+    bf1 = output;
+    bf1[0] = input[3];
+    bf1[1] = input[0];
+    bf1[2] = input[1];
+    bf1[3] = input[2];
+    av1_range_check_buf(1, input, bf1, size, stage_range[1]);
+
+    // stage 2
+    cospi = cospi_arr(cos_bit);
+    bf0 = output;
+    bf1 = step;
+    bf1[0] = half_btf(cospi[8], bf0[0], cospi[56], bf0[1], cos_bit);
+    bf1[1] = half_btf(-cospi[8], bf0[1], cospi[56], bf0[0], cos_bit);
+    bf1[2] = half_btf(cospi[40], bf0[2], cospi[24], bf0[3], cos_bit);
+    bf1[3] = half_btf(-cospi[40], bf0[3], cospi[24], bf0[2], cos_bit);
+    av1_range_check_buf(2, input, bf1, size, stage_range[2]);
+
+    // stage 3
+    bf0 = step;
+    bf1 = output;
+    bf1[0] = bf0[0] + bf0[2];
+    bf1[1] = bf0[1] + bf0[3];
+    bf1[2] = -bf0[2] + bf0[0];
+    bf1[3] = -bf0[3] + bf0[1];
+    av1_range_check_buf(3, input, bf1, size, stage_range[3]);
+
+    // stage 4
+    bf0 = output;
+    bf1 = step;
+    bf1[0] = bf0[0];
+    bf1[1] = bf0[1];
+    bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);
+    bf1[3] = half_btf(-cospi[32], bf0[3], cospi[32], bf0[2], cos_bit);
+    av1_range_check_buf(4, input, bf1, size, stage_range[4]);
+
+    // stage 5
+    bf0 = step;
+    bf1 = output;
+    bf1[0] = bf0[0];
+    bf1[1] = -bf0[2];
+    bf1[2] = bf0[3];
+    bf1[3] = -bf0[1];
+    av1_range_check_buf(5, input, bf1, size, stage_range[5]);
+}
+#endif
+
 void av1_fadst4(const int32_t *input, int32_t *output, int8_t cos_bit,
                 const int8_t *stage_range) {
   int bit = cos_bit;
@@ -733,6 +792,16 @@
   av1_range_check_buf(6, input, output, 4, stage_range[6]);
 }
 
+#if CONFIG_ADST8_TUNED
+void av2_fadst8(const int32_t *input, int32_t *output, int8_t cos_bit,
+                const int8_t *stage_range) {
+    (void)stage_range;
+    (void)cos_bit;
+    av2_txfm_matrix_mult(input, output, av2_adst_kernel8[FWD_TXFM],
+                         TXFM_SIZE8, FWD_ADST_BIT, 0);
+}
+#endif
+
 void av1_fadst8(const int32_t *input, int32_t *output, int8_t cos_bit,
                 const int8_t *stage_range) {
   const int32_t size = 8;
@@ -847,6 +916,16 @@
   av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 }
 
+#if CONFIG_ADST16_TUNED
+void av2_fadst16(const int32_t *input, int32_t *output, int8_t cos_bit,
+                 const int8_t *stage_range) {
+    (void)stage_range;
+    (void)cos_bit;
+    av2_txfm_matrix_mult(input, output, av2_adst_kernel16[FWD_TXFM],
+                         TXFM_SIZE16, FWD_ADST_BIT, 0);
+}
+#endif
+
 void av1_fadst16(const int32_t *input, int32_t *output, int8_t cos_bit,
                  const int8_t *stage_range) {
   const int32_t size = 16;
diff --git a/av1/encoder/av1_fwd_txfm1d.h b/av1/encoder/av1_fwd_txfm1d.h
index c125944..d0d4d04 100644
--- a/av1/encoder/av1_fwd_txfm1d.h
+++ b/av1/encoder/av1_fwd_txfm1d.h
@@ -43,6 +43,18 @@
                        const int8_t *stage_range);
 void av1_fidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
                        const int8_t *stage_range);
+#if CONFIG_ADST4_TUNED
+void av2_fadst4(const int32_t *input, int32_t *output, int8_t cos_bit,
+                const int8_t *stage_range);
+#endif
+#if CONFIG_ADST8_TUNED
+void av2_fadst8(const int32_t *input, int32_t *output, int8_t cos_bit,
+                const int8_t *stage_range);
+#endif
+#if CONFIG_ADST16_TUNED
+void av2_fadst16(const int32_t *input, int32_t *output, int8_t cos_bit,
+                 const int8_t *stage_range);
+#endif
 #ifdef __cplusplus
 }
 #endif
diff --git a/av1/encoder/av1_fwd_txfm2d.c b/av1/encoder/av1_fwd_txfm2d.c
index bbbdc9a..f360823 100644
--- a/av1/encoder/av1_fwd_txfm2d.c
+++ b/av1/encoder/av1_fwd_txfm2d.c
@@ -28,9 +28,21 @@
     case TXFM_TYPE_DCT16: return av1_fdct16;
     case TXFM_TYPE_DCT32: return av1_fdct32;
     case TXFM_TYPE_DCT64: return av1_fdct64;
+#if CONFIG_ADST4_TUNED
+    case TXFM_TYPE_ADST4: return av2_fadst4;
+#else
     case TXFM_TYPE_ADST4: return av1_fadst4;
+#endif
+#if CONFIG_ADST8_TUNED
+    case TXFM_TYPE_ADST8: return av2_fadst8;
+#else
     case TXFM_TYPE_ADST8: return av1_fadst8;
+#endif
+#if CONFIG_ADST16_TUNED
+    case TXFM_TYPE_ADST16: return av2_fadst16;
+#else
     case TXFM_TYPE_ADST16: return av1_fadst16;
+#endif
     case TXFM_TYPE_IDENTITY4: return av1_fidentity4_c;
     case TXFM_TYPE_IDENTITY8: return av1_fidentity8_c;
     case TXFM_TYPE_IDENTITY16: return av1_fidentity16_c;
diff --git a/av1/encoder/x86/av1_fwd_txfm2d_avx2.c b/av1/encoder/x86/av1_fwd_txfm2d_avx2.c
index f6d46cd..297fcad 100644
--- a/av1/encoder/x86/av1_fwd_txfm2d_avx2.c
+++ b/av1/encoder/x86/av1_fwd_txfm2d_avx2.c
@@ -1279,6 +1279,35 @@
   output[63] = x1[63];
 }
 
+#if CONFIG_ADST16_TUNED
+static INLINE void fadst16x16_new_avx2(const __m256i *input, __m256i *output,
+                                       int8_t cos_bit) {
+    (void)cos_bit;
+    const int32_t* kernel = av2_adst_kernel16[FWD_TXFM];
+    const int size = TXFM_SIZE16;
+
+    const __m256i zero = _mm256_setzero_si256();
+    const __m256i rnding = _mm256_set1_epi32(1 << (FWD_ADST_BIT - 1));
+    __m256i x[16];
+    for (int i = 0; i < 16; ++i) {
+        int row_idx = i*size;
+        __m256i sum1 = zero;
+        __m256i sum2 = zero;
+        __m256i t1, t2;
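+        // Kernel values fit in int16_t, so coefficient pairs are applied with
+        // one 16-bit madd per unpacked half of the input rows.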
+        for (int j = 0; j < 16; j+=2) {
+            const __m256i coef1 = _mm256_set1_epi16(kernel[row_idx + j]);
+            const __m256i coef2 = _mm256_set1_epi16(kernel[row_idx + j + 1]);
+            matrix_coef_mult_avx2(coef1, coef2, input[j], input[j+1], &t1, &t2);
+            sum1 = _mm256_add_epi32(sum1, t1);
+            sum2 = _mm256_add_epi32(sum2, t2);
+        }
+        round_shift_avx2(&sum1, rnding, FWD_ADST_BIT);
+        round_shift_avx2(&sum2, rnding, FWD_ADST_BIT);
+        x[i] = _mm256_packs_epi32(sum1, sum2);
+    }
+    for (int i = 0; i < 16; ++i) output[i] = x[i];
+}
+#else
 static INLINE void fadst16x16_new_avx2(const __m256i *input, __m256i *output,
                                        int8_t cos_bit) {
   const int32_t *cospi = cospi_arr(cos_bit);
@@ -1408,6 +1437,7 @@
   output[14] = x1[15];
   output[15] = x1[0];
 }
+#endif
 
 static INLINE void fidentity16x16_new_avx2(const __m256i *input,
                                            __m256i *output, int8_t cos_bit) {
@@ -2101,6 +2131,35 @@
   output[7] = x4[7];
 }
 
+#if CONFIG_ADST8_TUNED
+static INLINE void fadst8x8_new_avx2(const __m256i *input, __m256i *output,
+                                     int8_t cos_bit) {
+    (void)cos_bit;
+
+    const int32_t* kernel = av2_adst_kernel8[FWD_TXFM];
+    const int size = TXFM_SIZE8;
+    const __m256i zero = _mm256_setzero_si256();
+    const __m256i rnding = _mm256_set1_epi32(1 << (FWD_ADST_BIT - 1));
+    __m256i x[8];
+    for (int i = 0; i < 8; ++i) {
+        int row_idx = i*size;
+        __m256i sum1 = zero;
+        __m256i sum2 = zero;
+        __m256i t1, t2;
+        for (int j = 0; j < 8; j+=2) {
+            const __m256i coef1 = _mm256_set1_epi16(kernel[row_idx + j]);
+            const __m256i coef2 = _mm256_set1_epi16(kernel[row_idx + j + 1]);
+            matrix_coef_mult_avx2(coef1, coef2, input[j], input[j+1], &t1, &t2);
+            sum1 = _mm256_add_epi32(sum1, t1);
+            sum2 = _mm256_add_epi32(sum2, t2);
+        }
+        round_shift_avx2(&sum1, rnding, FWD_ADST_BIT);
+        round_shift_avx2(&sum2, rnding, FWD_ADST_BIT);
+        x[i] = _mm256_packs_epi32(sum1, sum2);
+    }
+    for (int i = 0; i < 8; ++i) output[i] = x[i];
+}
+#else
 static INLINE void fadst8x8_new_avx2(const __m256i *input, __m256i *output,
                                      int8_t cos_bit) {
   const int32_t *cospi = cospi_arr(cos_bit);
@@ -2213,6 +2272,7 @@
   output[6] = x6[7];
   output[7] = x6[0];
 }
+#endif
 
 static INLINE void fidentity8x8_new_avx2(const __m256i *input, __m256i *output,
                                          int8_t cos_bit) {
@@ -2377,6 +2437,47 @@
               &output[11], &output[13], &output[3], &__rounding_256, &cos_bit);
 }
 
+#if CONFIG_ADST16_TUNED
+static INLINE void fadst8x16_new_avx2(const __m128i *input, __m128i *output,
+                                      int8_t cos_bit) {
+    (void)cos_bit;
+    const int32_t* kernel = av2_adst_kernel16[FWD_TXFM];
+    const int size = TXFM_SIZE16;
+
+    const __m256i zero = _mm256_setzero_si256();
+    const __m256i rnding = _mm256_set1_epi32(1 << (FWD_ADST_BIT - 1));
+    __m256i x[16], s[8];
+
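+    // Duplicate each 8-column input row into both 128-bit lanes; the next
+    // loop then computes two output rows (2*i and 2*i+1) per iteration.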
+    for(int i = 0; i < size; ++i) {
+        x[i] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[i]), input[i], 0x1);
+    }
+
+    for (int i = 0; i < 8; ++i) {
+        int row_idx = (2*i)*size;
+        __m256i sum1 = zero;
+        __m256i sum2 = zero;
+        __m256i t1, t2;
+        for (int j = 0; j < 8; ++j) {
+            int col_idx = 2*j;
+            const __m256i coef1 = interleave_coefs_avx2(kernel[row_idx + col_idx],
+                                                        kernel[row_idx + size + col_idx]);
+            const __m256i coef2 = interleave_coefs_avx2(kernel[row_idx + col_idx + 1],
+                                                        kernel[row_idx + size + col_idx + 1]);
+            matrix_coef_mult_avx2(coef1, coef2, x[col_idx], x[col_idx + 1], &t1, &t2);
+            sum1 = _mm256_add_epi32(sum1, t1);
+            sum2 = _mm256_add_epi32(sum2, t2);
+        }
+        round_shift_avx2(&sum1, rnding, FWD_ADST_BIT);
+        round_shift_avx2(&sum2, rnding, FWD_ADST_BIT);
+        s[i] = _mm256_packs_epi32(sum1, sum2);
+    }
+
+    for (int i = 0; i < 8; ++i) {
+        output[2*i] = _mm256_extracti128_si256(s[i], 0x00);
+        output[2*i+1] = _mm256_extracti128_si256(s[i], 0x01);
+    }
+}
+#else
 static INLINE void fadst8x16_new_avx2(const __m128i *input, __m128i *output,
                                       int8_t cos_bit) {
   const int32_t *cospi = cospi_arr(cos_bit);
@@ -2593,6 +2694,7 @@
   btf_16_avx2(&cospi_arr[18], &cospi_arr[19], &in0, &in1, &output[3],
               &output[12], &output[1], &output[14], &__rounding_256, &cos_bit);
 }
+#endif
 
 static INLINE void fidentity8x16_new_avx2(const __m128i *input, __m128i *output,
                                           int8_t cos_bit) {
diff --git a/av1/encoder/x86/av1_fwd_txfm_sse2.c b/av1/encoder/x86/av1_fwd_txfm_sse2.c
index 5fe1179..c185da3 100644
--- a/av1/encoder/x86/av1_fwd_txfm_sse2.c
+++ b/av1/encoder/x86/av1_fwd_txfm_sse2.c
@@ -1418,6 +1418,70 @@
   output[63] = x10[63];
 }
 
+#if CONFIG_ADST4_TUNED
+static void fadst4x4_new_sse2(const __m128i *input, __m128i *output,
+                              int8_t cos_bit) {
+    const int32_t *cospi = cospi_arr(cos_bit);
+    const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+    const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+    const __m128i cospi_p8_p56 = pair_set_epi16(cospi[8], cospi[56]);
+    const __m128i cospi_p56_m8 = pair_set_epi16(cospi[56], -cospi[8]);
+    const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+    const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+    const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+    __m128i u[4], v[4];
+
+    // stage 1
+    u[0] = _mm_unpacklo_epi16(input[3], input[0]);
+    u[1] = _mm_unpacklo_epi16(input[1], input[2]);
+
+    // stage 2
+    v[0] = _mm_madd_epi16(u[0], cospi_p8_p56);
+    v[1] = _mm_madd_epi16(u[0], cospi_p56_m8);
+    v[2] = _mm_madd_epi16(u[1], cospi_p40_p24);
+    v[3] = _mm_madd_epi16(u[1], cospi_p24_m40);
+
+    u[0] = _mm_add_epi32(v[0], __rounding);
+    u[1] = _mm_add_epi32(v[1], __rounding);
+    u[2] = _mm_add_epi32(v[2], __rounding);
+    u[3] = _mm_add_epi32(v[3], __rounding);
+
+    v[0] = _mm_srai_epi32(u[0], cos_bit);
+    v[1] = _mm_srai_epi32(u[1], cos_bit);
+    v[2] = _mm_srai_epi32(u[2], cos_bit);
+    v[3] = _mm_srai_epi32(u[3], cos_bit);
+
+    u[0] = _mm_packs_epi32(v[0], v[1]);
+    u[1] = _mm_packs_epi32(v[2], v[3]);
+
+    // stage 3
+    v[0] = _mm_adds_epi16(u[0], u[1]);
+    v[1] = _mm_subs_epi16(u[0], u[1]);
+
+    // stage 4
+    u[0] = v[0];
+
+    __m128i t1, t2;
+    t1 = _mm_srli_si128(v[1], 8);
+    t2 = _mm_unpacklo_epi16(v[1], t1);
+    u[2] = _mm_madd_epi16(t2, cospi_p32_p32);
+    u[3] = _mm_madd_epi16(t2, cospi_p32_m32);
+    v[2] = _mm_add_epi32(u[2], __rounding);
+    v[3] = _mm_add_epi32(u[3], __rounding);
+    u[2] = _mm_srai_epi32(v[2], cos_bit);
+    u[3] = _mm_srai_epi32(v[3], cos_bit);
+    u[1] = _mm_packs_epi32(u[2], u[3]);
+
+    // stage 5
+    u[2] = _mm_subs_epi16(_mm_setzero_si128(), u[0]);
+    u[3] = _mm_subs_epi16(_mm_setzero_si128(), u[1]);
+
+    output[0] = u[0];
+    output[1] = u[3];
+    output[2] = _mm_srli_si128(u[1], 8);
+    output[3] = _mm_srli_si128(u[2], 8);
+}
+#else
 static void fadst4x4_new_sse2(const __m128i *input, __m128i *output,
                               int8_t cos_bit) {
   const int32_t *sinpi = sinpi_arr(cos_bit);
@@ -1468,7 +1532,17 @@
   output[2] = _mm_srli_si128(output[0], 8);
   output[3] = _mm_srli_si128(output[1], 8);
 }
+#endif
 
+#if CONFIG_ADST8_TUNED
+static void fadst8x8_new_sse2(const __m128i *input, __m128i *output,
+                              int8_t cos_bit);
+
+static void fadst4x8_new_sse2(const __m128i *input, __m128i *output,
+                              int8_t cos_bit) {
+    fadst8x8_new_sse2(input, output, cos_bit);
+}
+#else
 static void fadst4x8_new_sse2(const __m128i *input, __m128i *output,
                               int8_t cos_bit) {
   const int32_t *cospi = cospi_arr(cos_bit);
@@ -1565,7 +1639,46 @@
   output[6] = x6[7];
   output[7] = x6[0];
 }
+#endif
 
+#if CONFIG_ADST4_TUNED
+static void fadst8x4_new_sse2(const __m128i *input, __m128i *output,
+                              int8_t cos_bit) {
+    const int32_t *cospi = cospi_arr(cos_bit);
+    const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+    const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+    const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+    const __m128i cospi_p8_p56 = pair_set_epi16(cospi[8], cospi[56]);
+    const __m128i cospi_p56_m8 = pair_set_epi16(cospi[56], -cospi[8]);
+    const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+    const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+
+    // stage 2
+    __m128i x1[4];
+    btf_16_sse2(cospi_p8_p56, cospi_p56_m8, input[3], input[0], x1[0], x1[1]);
+    btf_16_sse2(cospi_p40_p24, cospi_p24_m40, input[1], input[2], x1[2], x1[3]);
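+    // x1 = {s0, s1, s2, s3}: the two DST-4 butterflies applied to the input
+    // pairs (in3, in0) and (in1, in2), mirroring fadst4x4_new_sse2 above.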
+
+    // stage 3
+    __m128i x2[4];
+    x2[0] = _mm_adds_epi16(x1[0], x1[2]);
+    x2[2] = _mm_subs_epi16(x1[0], x1[2]);
+    x2[1] = _mm_adds_epi16(x1[1], x1[3]);
+    x2[3] = _mm_subs_epi16(x1[1], x1[3]);
+
+    // stage 4
+    __m128i x3[4];
+    x3[0] = x2[0];
+    x3[1] = x2[1];
+    btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x2[2], x2[3], x3[2], x3[3]);
+
+    // stage 5
+    output[0] = x3[0];
+    output[1] = _mm_subs_epi16(_mm_setzero_si128(), x3[2]);
+    output[2] = x3[3];
+    output[3] = _mm_subs_epi16(_mm_setzero_si128(), x3[1]);
+}
+#else
 static void fadst8x4_new_sse2(const __m128i *input, __m128i *output,
                               int8_t cos_bit) {
   const int32_t *sinpi = sinpi_arr(cos_bit);
@@ -1643,7 +1756,41 @@
   output[2] = _mm_packs_epi32(u_lo[2], u_hi[2]);
   output[3] = _mm_packs_epi32(u_lo[3], u_hi[3]);
 }
+#endif
 
+#if CONFIG_ADST8_TUNED
+static void fadst8x8_new_sse2(const __m128i *input, __m128i *output,
+                              int8_t cos_bit) {
+    (void)cos_bit;
+
+    const int32_t* kernel = av2_adst_kernel8[FWD_TXFM];
+    const int size = TXFM_SIZE8;
+
+    const __m128i zero = _mm_setzero_si128();
+    const __m128i rnding = _mm_set1_epi32(1 << (FWD_ADST_BIT - 1));
+    __m128i x[8];
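+    // Row-by-row matrix multiply: output row i is the dot product of kernel
+    // row i with the eight input vectors. matrix_coef_mult_sse2 folds the
+    // 16-bit coefficients two terms at a time, and the sum is rounded by
+    // FWD_ADST_BIT instead of cos_bit.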
+    for (int i = 0; i < 8; ++i) {
+        int row_idx = i * size;
+        __m128i sum1 = zero;
+        __m128i sum2 = zero;
+        __m128i t1, t2;
+        for (int j = 0; j < 8; j += 2) {
+            const __m128i coef1 = _mm_set1_epi16(kernel[row_idx + j]);
+            const __m128i coef2 = _mm_set1_epi16(kernel[row_idx + j + 1]);
+            matrix_coef_mult_sse2(coef1, coef2, input[j], input[j + 1], &t1, &t2);
+            sum1 = _mm_add_epi32(sum1, t1);
+            sum2 = _mm_add_epi32(sum2, t2);
+        }
+        __m128i a0 = _mm_add_epi32(sum1, rnding);
+        __m128i a1 = _mm_add_epi32(sum2, rnding);
+        __m128i c0 = _mm_srai_epi32(a0, FWD_ADST_BIT);
+        __m128i c1 = _mm_srai_epi32(a1, FWD_ADST_BIT);
+
+        x[i] = _mm_packs_epi32(c0, c1);
+    }
+    for (int i = 0; i < 8; ++i) output[i] = x[i];
+}
+#else
 static void fadst8x8_new_sse2(const __m128i *input, __m128i *output,
                               int8_t cos_bit) {
   const int32_t *cospi = cospi_arr(cos_bit);
@@ -1732,7 +1879,42 @@
   output[6] = x6[7];
   output[7] = x6[0];
 }
+#endif
 
+#if CONFIG_ADST16_TUNED
+static void fadst8x16_new_sse2(const __m128i *input, __m128i *output,
+                               int8_t cos_bit) {
+    const int32_t* kernel = av2_adst_kernel16[FWD_TXFM];
+    const int size = TXFM_SIZE16;
+
+    (void)cos_bit;
+
+    const __m128i zero = _mm_setzero_si128();
+    const __m128i rnding = _mm_set1_epi32(1 << (FWD_ADST_BIT - 1));
+    __m128i x[16];
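+    // Same row-by-row matrix multiply as fadst8x8_new_sse2, using the
+    // 16-point DST-7 kernel.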
+    for (int i = 0; i < 16; ++i) {
+        int row_idx = i * size;
+        __m128i sum1 = zero;
+        __m128i sum2 = zero;
+        __m128i t1, t2;
+        for (int j = 0; j < 16; j += 2) {
+            const __m128i coef1 = _mm_set1_epi16(kernel[row_idx + j]);
+            const __m128i coef2 = _mm_set1_epi16(kernel[row_idx + j + 1]);
+            matrix_coef_mult_sse2(coef1, coef2, input[j], input[j + 1], &t1, &t2);
+            sum1 = _mm_add_epi32(sum1, t1);
+            sum2 = _mm_add_epi32(sum2, t2);
+        }
+        __m128i a0 = _mm_add_epi32(sum1, rnding);
+        __m128i a1 = _mm_add_epi32(sum2, rnding);
+
+        __m128i c0 = _mm_srai_epi32(a0, FWD_ADST_BIT);
+        __m128i c1 = _mm_srai_epi32(a1, FWD_ADST_BIT);
+
+        x[i] = _mm_packs_epi32(c0, c1);
+    }
+    for (int i = 0; i < 16; ++i) output[i] = x[i];
+}
+#else
 static void fadst8x16_new_sse2(const __m128i *input, __m128i *output,
                                int8_t cos_bit) {
   const int32_t *cospi = cospi_arr(cos_bit);
@@ -1917,6 +2099,7 @@
   output[14] = x8[15];
   output[15] = x8[0];
 }
+#endif
 
 static const transform_1d_sse2 col_txfm4x4_arr[TX_TYPES] = {
   fdct4x4_new_sse2,       // DCT_DCT
diff --git a/av1/encoder/x86/highbd_fwd_txfm_avx2.c b/av1/encoder/x86/highbd_fwd_txfm_avx2.c
index 5fdf3f2..3b4e418 100644
--- a/av1/encoder/x86/highbd_fwd_txfm_avx2.c
+++ b/av1/encoder/x86/highbd_fwd_txfm_avx2.c
@@ -354,6 +354,34 @@
     out[6 * outstride + col] = u[3];  // buf0[3]
   }
 }
+#if CONFIG_ADST8_TUNED
+static void fadst8_avx2(__m256i *in, __m256i *out, const int8_t bit,
+                        const int col_num, const int outstride) {
+    (void)bit;
+    const int32_t* kernel = av2_adst_kernel8[FWD_TXFM];
+    const int size = TXFM_SIZE8;
+    const __m256i zero = _mm256_setzero_si256();
+    const __m256i rnding = _mm256_set1_epi32(1 << (FWD_ADST_BIT - 1));
+    __m256i x[8];
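+    // Highbd path: the lanes are 32-bit, so each kernel coefficient is
+    // applied with _mm256_mullo_epi32 and accumulated directly.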
+    for (int col = 0; col < col_num; ++col) {
+        for (int i = 0; i < 8; ++i) {
+            int row_idx = i * size;
+            __m256i sum = zero;
+            __m256i t;
+            for (int j = 0; j < 8; ++j) {
+                const __m256i coef = _mm256_set1_epi32(kernel[row_idx + j]);
+                t = _mm256_mullo_epi32(in[j * col_num + col], coef);
+                sum = _mm256_add_epi32(sum, t);
+            }
+            sum = _mm256_add_epi32(sum, rnding);
+            x[i] = _mm256_srai_epi32(sum, FWD_ADST_BIT);
+        }
+        for (int i = 0; i < 8; ++i)
+            out[i * outstride + col] = x[i];
+    }
+}
+#else
 static void fadst8_avx2(__m256i *in, __m256i *out, const int8_t bit,
                         const int col_num, const int outstirde) {
   (void)col_num;
@@ -527,6 +555,7 @@
     out[7 * outstirde + col] = v0;
   }
 }
+#endif
 static void idtx8_avx2(__m256i *in, __m256i *out, const int8_t bit, int col_num,
                        int outstride) {
   (void)bit;
@@ -1012,6 +1041,35 @@
     out[15 * outstride + col] = v[15];
   }
 }
+
+#if CONFIG_ADST16_TUNED
+static void fadst16_avx2(__m256i *in, __m256i *out, const int8_t bit,
+                         const int num_cols, const int outstride) {
+    (void)bit;
+    const int32_t* kernel = av2_adst_kernel16[FWD_TXFM];
+    const int size = TXFM_SIZE16;
+    const __m256i zero = _mm256_setzero_si256();
+    const __m256i rnding = _mm256_set1_epi32(1 << (FWD_ADST_BIT - 1));
+    __m256i x[16];
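+    // Same 32-bit matrix multiply as fadst8_avx2 above, with the 16-point
+    // kernel.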
+    for (int col = 0; col < num_cols; ++col) {
+        for (int i = 0; i < 16; ++i) {
+            int row_idx = i * size;
+            __m256i sum = zero;
+            __m256i t;
+            for (int j = 0; j < 16; ++j) {
+                const __m256i coef = _mm256_set1_epi32(kernel[row_idx + j]);
+                t = _mm256_mullo_epi32(in[j * num_cols + col], coef);
+                sum = _mm256_add_epi32(sum, t);
+            }
+            sum = _mm256_add_epi32(sum, rnding);
+            x[i] = _mm256_srai_epi32(sum, FWD_ADST_BIT);
+        }
+        for (int i = 0; i < 16; ++i)
+            out[i * outstride + col] = x[i];
+    }
+}
+#else
 static void fadst16_avx2(__m256i *in, __m256i *out, const int8_t bit,
                          const int num_cols, const int outstride) {
   const int32_t *cospi = cospi_arr(bit);
@@ -1264,6 +1322,7 @@
     out[15 * outstride + col] = v[0];
   }
 }
+#endif
 static void idtx16_avx2(__m256i *in, __m256i *out, const int8_t bit,
                         int col_num, const int outstride) {
   (void)bit;
diff --git a/av1/encoder/x86/highbd_fwd_txfm_sse4.c b/av1/encoder/x86/highbd_fwd_txfm_sse4.c
index 703ddee..d6bad48 100644
--- a/av1/encoder/x86/highbd_fwd_txfm_sse4.c
+++ b/av1/encoder/x86/highbd_fwd_txfm_sse4.c
@@ -126,6 +126,69 @@
   _mm_store_si128((__m128i *)(output + 3 * 4), res[3]);
 }
 
+#if CONFIG_ADST4_TUNED
+static void fadst4x4_sse4_1(__m128i *in, __m128i *out, int bit,
+                            const int num_col) {
+    const int32_t *cospi = cospi_arr(bit);
+    const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+    const __m128i cospi8 = _mm_set1_epi32((int)cospi[8]);
+    const __m128i cospi24 = _mm_set1_epi32((int)cospi[24]);
+    const __m128i cospi32 = _mm_set1_epi32((int)cospi[32]);
+    const __m128i cospi40 = _mm_set1_epi32((int)cospi[40]);
+    const __m128i cospi56 = _mm_set1_epi32((int)cospi[56]);
+
+    __m128i s0, s1, s2, s3;
+    __m128i x0, x1, x2, x3;
+    __m128i u0, u1, u2, u3;
+    __m128i v0, v1, v2, v3;
+
+    // stage 0
+    // stage 1
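+    // Load with the DST-4 input permutation: the stage-2 butterflies pair
+    // (x0, x1) = (in3, in0) and (x2, x3) = (in1, in2).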
+    int idx = 0 * num_col;
+    x1 = in[idx];
+    idx += num_col;
+    x2 = in[idx];
+    idx += num_col;
+    x3 = in[idx];
+    idx += num_col;
+    x0 = in[idx];
+
+    // stage 2
+    s0 = half_btf_sse4_1(&cospi8, &x0, &cospi56, &x1, &rnding, bit);
+    s1 = half_btf_neg_sse4_1(&cospi56, &x0, &cospi8, &x1, &rnding, bit);
+    s2 = half_btf_sse4_1(&cospi40, &x2, &cospi24, &x3, &rnding, bit);
+    s3 = half_btf_neg_sse4_1(&cospi24, &x2, &cospi40, &x3, &rnding, bit);
+
+    // stage 3
+    x0 = _mm_add_epi32(s0, s2);
+    x1 = _mm_add_epi32(s1, s3);
+    x2 = _mm_sub_epi32(s0, s2);
+    x3 = _mm_sub_epi32(s1, s3);
+
+    // stage 4
+    s0 = x0;
+    s1 = x1;
+    s2 = half_btf_sse4_1(&cospi32, &x2, &cospi32, &x3, &rnding, bit);
+    s3 = half_btf_neg_sse4_1(&cospi32, &x2, &cospi32, &x3, &rnding, bit);
+
+    // stage 5
+    u0 = s0;
+    u1 = _mm_sub_epi32(_mm_setzero_si128(), s2);
+    u2 = s3;
+    u3 = _mm_sub_epi32(_mm_setzero_si128(), s1);
+
+    // Transpose 4x4 32-bit
+    v0 = _mm_unpacklo_epi32(u0, u1);
+    v1 = _mm_unpackhi_epi32(u0, u1);
+    v2 = _mm_unpacklo_epi32(u2, u3);
+    v3 = _mm_unpackhi_epi32(u2, u3);
+
+    out[0] = _mm_unpacklo_epi64(v0, v2);
+    out[1] = _mm_unpackhi_epi64(v0, v2);
+    out[2] = _mm_unpacklo_epi64(v1, v3);
+    out[3] = _mm_unpackhi_epi64(v1, v3);
+}
+#else
 static void fadst4x4_sse4_1(__m128i *in, __m128i *out, int bit,
                             const int num_col) {
   const int32_t *sinpi = sinpi_arr(bit);
@@ -189,6 +252,7 @@
   out[2] = _mm_unpacklo_epi64(v1, v3);
   out[3] = _mm_unpackhi_epi64(v1, v3);
 }
+#endif
 static void idtx4x4_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) {
   (void)bit;
   __m128i fact = _mm_set1_epi32(NewSqrt2);
@@ -633,6 +697,34 @@
   fdct4x8_sse4_1(in + 1, out + 1, bit, col_num);
 }
 
+#if CONFIG_ADST8_TUNED
+static void fadst8x8_sse4_1(__m128i *in, __m128i *out, int bit,
+                            const int col_num) {
+    (void)bit;
+    const int32_t* kernel = av2_adst_kernel8[FWD_TXFM];
+    const int size = TXFM_SIZE8;
+    const __m128i zero = _mm_setzero_si128();
+    const __m128i rnding = _mm_set1_epi32(1 << (FWD_ADST_BIT - 1));
+    __m128i x[8];
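+    // 32-bit matrix multiply: output row i of each column block is kernel
+    // row i dotted with the eight input vectors, rounded by FWD_ADST_BIT.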
+    for (int col = 0; col < col_num; ++col) {
+        for (int i = 0; i < 8; ++i) {
+            int row_idx = i * size;
+            __m128i sum = zero;
+            __m128i t;
+            for (int j = 0; j < 8; ++j) {
+                const __m128i coef = _mm_set1_epi32(kernel[row_idx + j]);
+                t = _mm_mullo_epi32(in[j * col_num + col], coef);
+                sum = _mm_add_epi32(sum, t);
+            }
+            sum = _mm_add_epi32(sum, rnding);
+            x[i] = _mm_srai_epi32(sum, FWD_ADST_BIT);
+        }
+        for (int i = 0; i < 8; ++i)
+            out[i * col_num + col] = x[i];
+    }
+}
+#else
 static void fadst8x8_sse4_1(__m128i *in, __m128i *out, int bit,
                             const int col_num) {
   const int32_t *cospi = cospi_arr(bit);
@@ -814,6 +906,7 @@
     out[col_num * 7 + col] = v0;
   }
 }
+#endif
 static void idtx8x8_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) {
   (void)bit;
 
@@ -1463,6 +1556,35 @@
   }
 }
 
+#if CONFIG_ADST16_TUNED
+static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit,
+                              const int num_cols) {
+    (void)bit;
+    const int32_t* kernel = av2_adst_kernel16[FWD_TXFM];
+    const int size = TXFM_SIZE16;
+
+    const __m128i zero = _mm_setzero_si128();
+    const __m128i rnding = _mm_set1_epi32(1 << (FWD_ADST_BIT - 1));
+    __m128i x[16];
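+    // As fadst8x8_sse4_1 above, with the 16-point kernel.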
+    for (int col = 0; col < num_cols; ++col) {
+        for (int i = 0; i < 16; ++i) {
+            int row_idx = i * size;
+            __m128i sum = zero;
+            __m128i t;
+            for (int j = 0; j < 16; ++j) {
+                const __m128i coef = _mm_set1_epi32(kernel[row_idx + j]);
+                t = _mm_mullo_epi32(in[j * num_cols + col], coef);
+                sum = _mm_add_epi32(sum, t);
+            }
+            sum = _mm_add_epi32(sum, rnding);
+            x[i] = _mm_srai_epi32(sum, FWD_ADST_BIT);
+        }
+        for (int i = 0; i < 16; ++i)
+            out[i * num_cols + col] = x[i];
+    }
+}
+#else
 static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit,
                               const int num_cols) {
   const int32_t *cospi = cospi_arr(bit);
@@ -1709,6 +1831,7 @@
     out[15 * num_cols + col] = v[0];
   }
 }
+#endif
 
 static void col_txfm_16x16_rounding(__m128i *in, int shift) {
   // Note:
diff --git a/build/cmake/aom_config_defaults.cmake b/build/cmake/aom_config_defaults.cmake
index 667da53..9d6884c 100644
--- a/build/cmake/aom_config_defaults.cmake
+++ b/build/cmake/aom_config_defaults.cmake
@@ -244,6 +244,12 @@
 set_aom_config_var(CONFIG_SKIP_TXFM_OPT 1
                    "Enable to optimize the signaling of skip_txfm")
 set_aom_config_var(CONFIG_CWP 1 "Enables compound weighted prediction.")
+set_aom_config_var(CONFIG_ADST4_TUNED 1
+                   "AV2 experiment to replace the ADST4 basis with a DST-4.")
+set_aom_config_var(CONFIG_ADST8_TUNED 1
+                   "AV2 experiment to replace the ADST8 basis with a matrix-multiplied GFT.")
+set_aom_config_var(CONFIG_ADST16_TUNED 1
+                   "AV2 experiment to replace the ADST16 basis with a matrix-multiplied DST-7.")
 
 # This is an encode-only change.
 set_aom_config_var(CONFIG_MV_SEARCH_RANGE 1