DST7 16p & 32p experiment (No SIMD)

Change-Id: Ia8cd6da9bf885da9ae8f8722f42862b0eb52eebc
diff --git a/av1/common/av1_inv_txfm1d.c b/av1/common/av1_inv_txfm1d.c
index 8d69efc..f081fbb 100644
--- a/av1/common/av1_inv_txfm1d.c
+++ b/av1/common/av1_inv_txfm1d.c
@@ -818,6 +818,21 @@
   bf1[7] = -bf0[1];
 }
 
+#if CONFIG_DST7_16X16
+// Inverse 16-point ADST under CONFIG_DST7_16X16: a direct matrix multiply,
+// output[k] = round(sum_n input[n] * dst7_16x16[n][k]); not in-place safe.
+void av1_iadst16(const int32_t *input, int32_t *output, int8_t cos_bit,
+                 const int8_t *stage_range) {
+  assert(output != input);
+  (void)stage_range;  // unused by this implementation
+  (void)cos_bit;      // unused; precision is fixed by DST_16X16_PREC_BITS
+  for (int32_t k = 0; k < 16; ++k) {
+    int32_t acc = 0;
+    for (int32_t n = 0; n < 16; ++n) acc += dst7_16x16[n][k] * input[n];
+    output[k] = ROUND_POWER_OF_TWO_SIGNED(acc, DST_16X16_PREC_BITS);
+  }
+}
+#else
 void av1_iadst16(const int32_t *input, int32_t *output, int8_t cos_bit,
                  const int8_t *stage_range) {
   assert(output != input);
@@ -1025,6 +1040,22 @@
   bf1[14] = bf0[9];
   bf1[15] = -bf0[1];
 }
+#endif  // CONFIG_DST7_16X16
+
+#if CONFIG_DST_32X32
+// Inverse 32-point ADST: output[i] = round(sum_j input[j] * dst7_32x32[j][i]).
+void av1_iadst32(const int32_t *input, int32_t *output, int8_t cos_bit,
+                 const int8_t *stage_range) {
+  assert(output != input);  // every output reads all inputs: no in-place use
+  (void)cos_bit;            // unused; precision is DST_32X32_PREC_BITS
+  (void)stage_range;        // unused by this implementation
+  for (int32_t i = 0; i < 32; i++) {
+    int32_t sum = 0;
+    for (int32_t j = 0; j < 32; j++) sum += input[j] * dst7_32x32[j][i];
+    output[i] = ROUND_POWER_OF_TWO_SIGNED(sum, DST_32X32_PREC_BITS);
+  }
+}
+#endif  // CONFIG_DST_32X32
 
 void av1_iidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
                       const int8_t *stage_range) {
diff --git a/av1/common/av1_inv_txfm1d.h b/av1/common/av1_inv_txfm1d.h
index e1d5d98..b90643e 100644
--- a/av1/common/av1_inv_txfm1d.h
+++ b/av1/common/av1_inv_txfm1d.h
@@ -45,6 +45,10 @@
                 const int8_t *stage_range);
 void av1_iadst16(const int32_t *input, int32_t *output, int8_t cos_bit,
                  const int8_t *stage_range);
+#if CONFIG_DST_32X32
+void av1_iadst32(const int32_t *input, int32_t *output, int8_t cos_bit,
+                 const int8_t *stage_range);
+#endif
 void av1_iidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
                       const int8_t *stage_range);
 void av1_iidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit,
diff --git a/av1/common/av1_inv_txfm2d.c b/av1/common/av1_inv_txfm2d.c
index 559d121..3b5ae91 100644
--- a/av1/common/av1_inv_txfm2d.c
+++ b/av1/common/av1_inv_txfm2d.c
@@ -121,6 +121,9 @@
     case TXFM_TYPE_ADST4: return av1_iadst4;
     case TXFM_TYPE_ADST8: return av1_iadst8;
     case TXFM_TYPE_ADST16: return av1_iadst16;
+#if CONFIG_DST_32X32
+    case TXFM_TYPE_ADST32: return av1_iadst32;
+#endif  // CONFIG_DST_32X32
     case TXFM_TYPE_IDENTITY4: return av1_iidentity4_c;
     case TXFM_TYPE_IDENTITY8: return av1_iidentity8_c;
     case TXFM_TYPE_IDENTITY16: return av1_iidentity16_c;
diff --git a/av1/common/av1_txfm.c b/av1/common/av1_txfm.c
index ac43402..498600d 100644
--- a/av1/common/av1_txfm.c
+++ b/av1/common/av1_txfm.c
@@ -59,6 +59,142 @@
     14359, 12785, 11204, 9616,  8022,  6424,  4821,  3216,  1608 }
 };
 
+#if CONFIG_DST7_16X16
+const int16_t dst7_16x16[16][16] = {  // fwd: [out][in]; inv: [in][out]
+  { 12, 24, 36, 47, 57, 69, 78, 87, 94, 103, 109, 115, 118, 123, 125, 126 },
+  { 36, 69, 94, 115, 125, 125, 115, 94, 69, 36, 0, -36, -69, -94, -115, -125 },
+  { 57, 103, 125, 118, 87, 36, -24, -78, -115, -126, -109, -69, -12, 47, 94,
+    123 },
+  { 78, 123, 115, 57, -24, -94, -126, -103, -36, 47, 109, 125, 87, 12, -69,
+    -118 },
+  { 94, 125, 69, -36, -115, -115, -36, 69, 125, 94, 0, -94, -125, -69, 36,
+    115 },
+  { 109, 109, 0, -109, -109, 0, 109, 109, 0, -109, -109, 0, 109, 109, 0, -109 },
+  { 118, 78, -69, -123, -12, 115, 87, -57, -125, -24, 109, 94, -47, -126, -36,
+    103 },
+  { 125, 36, -115, -69, 94, 94, -69, -115, 36, 125, 0, -125, -36, 115, 69,
+    -94 },
+  { 126, -12, -125, 24, 123, -36, -118, 47, 115, -57, -109, 69, 103, -78, -94,
+    87 },
+  { 123, -57, -94, 103, 47, -125, 12, 118, -69, -87, 109, 36, -126, 24, 115,
+    -78 },
+  { 115, -94, -36, 125, -69, -69, 125, -36, -94, 115, 0, -115, 94, 36, -125,
+    69 },
+  { 103, -118, 36, 78, -126, 69, 47, -123, 94, 12, -109, 115, -24, -87, 125,
+    -57 },
+  { 87, -126, 94, -12, -78, 125, -103, 24, 69, -123, 109, -36, -57, 118, -115,
+    47 },
+  { 69, -115, 125, -94, 36, 36, -94, 125, -115, 69, 0, -69, 115, -125, 94,
+    -36 },
+  { 47, -87, 115, -126, 118, -94, 57, -12, -36, 78, -109, 125, -123, 103, -69,
+    24 },
+  { 24, -47, 69, -87, 103, -115, 123, -126, 125, -118, 109, -94, 78, -57, 36,
+    -12 },
+};
+#endif  // CONFIG_DST7_16X16
+
+#if CONFIG_DST_32X32
+const int16_t
+    dst7_32x32[32][32] = {  // fwd reads [out][in]; inverse reads [in][out]
+      { 6,   12,  18,  24,  30,  36,  42,  48,  54,  59,  64,
+        70,  75,  80,  84,  89,  93,  97,  101, 105, 108, 111,
+        114, 116, 119, 121, 123, 124, 125, 126, 127, 127 },
+      { 18,  36,  54,  70,  84,  97,   108,  116,  123,  126, 127,
+        125, 121, 114, 105, 93,  80,   64,   48,   30,   12,  -6,
+        -24, -42, -59, -75, -89, -101, -111, -119, -124, -127 },
+      { 30,  59,  84,  105, 119, 126,  126,  119,  105,  84,   59,
+        30,  0,   -30, -59, -84, -105, -119, -126, -126, -119, -105,
+        -84, -59, -30, 0,   30,  59,   84,   105,  119,  126 },
+      { 42,   80,   108,  124,  126, 114, 89,  54,  12,   -30, -70,
+        -101, -121, -127, -119, -97, -64, -24, 18,  59,   93,  116,
+        127,  123,  105,  75,   36,  -6,  -48, -84, -111, -125 },
+      { 54,   97,   123,  125,  105, 64,  12,  -42, -89, -119, -127,
+        -111, -75,  -24,  30,   80,  114, 127, 116, 84,  36,   -18,
+        -70,  -108, -126, -121, -93, -48, 6,   59,  101, 124 },
+      { 64,  111, 127, 108, 59,  -6, -70, -114, -127, -105, -54,
+        12,  75,  116, 126, 101, 48, -18, -80,  -119, -125, -97,
+        -42, 24,  84,  121, 124, 93, 36,  -30,  -89,  -123 },
+      { 75,  121, 121, 75,  0,    -75,  -121, -121, -75, 0,  75,
+        121, 121, 75,  0,   -75,  -121, -121, -75,  0,   75, 121,
+        121, 75,  0,   -75, -121, -121, -75,  0,    75,  121 },
+      { 84,   126,  105, 30,   -59,  -119, -119, -59, 30,  105, 126,
+        84,   0,    -84, -126, -105, -30,  59,   119, 119, 59,  -30,
+        -105, -126, -84, 0,    84,   126,  105,  30,  -59, -119 },
+      { 93,  127,  80,   -18, -105, -124, -64,  36,  114, 119,  48,
+        -54, -121, -111, -30, 70,   125,  101,  12,  -84, -127, -89,
+        6,   97,   126,  75,  -24,  -108, -123, -59, 42,  116 },
+      { 101,  123, 48,   -64,  -126, -89, 18,  111,  116, 30,  -80,
+        -127, -75, 36,   119,  108,  12,  -93, -125, -59, 54,  124,
+        97,   -6,  -105, -121, -42,  70,  127, 84,   -24, -114 },
+      { 108,  114, 12,  -101, -119, -24,  93,   123,  36,  -84, -125,
+        -48,  75,  127, 59,   -64,  -127, -70,  54,   126, 80,  -42,
+        -124, -89, 30,  121,  97,   -18,  -116, -105, 6,   111 },
+      { 114, 101, -24, -123, -84,  48,  127, 64,  -70, -126, -42,
+        89,  121, 18,  -105, -111, 6,   116, 97,  -30, -124, -80,
+        54,  127, 59,  -75,  -125, -36, 93,  119, 12,  -108 },
+      { 119, 84,  -59,  -126, -30, 105, 105, -30,  -126, -59, 84,
+        119, 0,   -119, -84,  59,  126, 30,  -105, -105, 30,  126,
+        59,  -84, -119, 0,    119, 84,  -59, -126, -30,  105 },
+      { 123,  64,   -89, -111, 30,  127,  36,   -108, -93, 59,  124,
+        6,    -121, -70, 84,   114, -24,  -127, -42,  105, 97,  -54,
+        -125, -12,  119, 75,   -80, -116, 18,   126,  48,  -101 },
+      { 125,  42,  -111, -80,  84,  108,  -48, -124, 6,   126,  36,
+        -114, -75, 89,   105,  -54, -123, 12,  127,  30,  -116, -70,
+        93,   101, -59,  -121, 18,  127,  24,  -119, -64, 97 },
+      { 127, 18,   -124, -36, 119,  54,   -111, -70, 101,  84, -89,
+        -97, 75,   108,  -59, -116, 42,   123,  -24, -126, 6,  127,
+        12,  -125, -30,  121, 48,   -114, -64,  105, 80,   -93 },
+      { 127,  -6,  -127, 12,   126,  -18, -125, 24,   124, -30, -123,
+        36,   121, -42,  -119, 48,   116, -54,  -114, 59,  111, -64,
+        -108, 70,  105,  -75,  -101, 80,  97,   -84,  -93, 89 },
+      { 126, -30, -119, 59, 105, -84, -84,  105, 59,  -119, -30,
+        126, 0,   -126, 30, 119, -59, -105, 84,  84,  -105, -59,
+        119, 30,  -126, 0,  126, -30, -119, 59,  105, -84 },
+      { 124, -54,  -101, 97,  59,   -123, -6,  125, -48,  -105, 93,
+        64,  -121, -12,  126, -42,  -108, 89,  70,  -119, -18,  127,
+        -36, -111, 84,   75,  -116, -24,  127, -30, -114, 80 },
+      { 121, -75, -75, 121,  0,    -121, 75,   75,   -121, 0,   121,
+        -75, -75, 121, 0,    -121, 75,   75,   -121, 0,    121, -75,
+        -75, 121, 0,   -121, 75,   75,   -121, 0,    121,  -75 },
+      { 116,  -93, -42, 127,  -59, -80,  123,  -18, -108, 105, 24,
+        -124, 75,  64,  -126, 36,  97,   -114, -6,  119,  -89, -48,
+        127,  -54, -84, 121,  -12, -111, 101,  30,  -125, 70 },
+      { 111, -108, -6,  114, -105, -12, 116, -101, -18, 119, -97,
+        -24, 121,  -93, -30, 123,  -89, -36, 124,  -84, -42, 125,
+        -80, -48,  126, -75, -54,  127, -70, -59,  127, -64 },
+      { 105, -119, 30,   84,  -126, 59,   59,  -126, 84,   30,  -119,
+        105, 0,    -105, 119, -30,  -84,  126, -59,  -59,  126, -84,
+        -30, 119,  -105, 0,   105,  -119, 30,  84,   -126, 59 },
+      { 97,  -125, 64, 42, -119, 111, -24, -80,  127, -84, -18,
+        108, -121, 48, 59, -124, 101, -6,  -93,  126, -70, -36,
+        116, -114, 30, 75, -127, 89,  12,  -105, 123, -54 },
+      { 89,   -127, 93,  -6,   -84, 127, -97,  12,  80,   -126, 101,
+        -18,  -75,  125, -105, 24,  70,  -124, 108, -30,  -64,  123,
+        -111, 36,   59,  -121, 114, -42, -54,  119, -116, 48 },
+      { 80,   -124, 114,  -54, -30, 101,  -127, 97,   -24,  -59, 116,
+        -123, 75,   6,    -84, 125, -111, 48,   36,   -105, 127, -93,
+        18,   64,   -119, 121, -70, -12,  89,   -126, 108,  -42 },
+      { 70,  -116, 125,  -93, 30,  42,  -101, 127,  -111, 59,  12,
+        -80, 121,  -123, 84,  -18, -54, 108,  -127, 105,  -48, -24,
+        89,  -124, 119,  -75, 6,   64,  -114, 126,  -97,  36 },
+      { 59,   -105, 126, -119, 84,   -30,  -30, 84,   -119, 126, -105,
+        59,   0,    -59, 105,  -126, 119,  -84, 30,   30,   -84, 119,
+        -126, 105,  -59, 0,    59,   -105, 126, -119, 84,   -30 },
+      { 48,  -89,  116, -127, 119,  -93, 54,   -6,  -42,  84,  -114,
+        127, -121, 97,  -59,  12,   36,  -80,  111, -126, 123, -101,
+        64,  -18,  -30, 75,   -108, 125, -124, 105, -70,  24 },
+      { 36, -70, 97,  -116, 126, -125, 114, -93, 64, -30, -6,
+        42, -75, 101, -119, 127, -124, 111, -89, 59, -24, -12,
+        48, -80, 105, -121, 127, -123, 108, -84, 54, -18 },
+      { 24,   -48, 70,   -89, 105,  -116, 124, -127, 125, -119, 108,
+        -93,  75,  -54,  30,  -6,   -18,  42,  -64,  84,  -101, 114,
+        -123, 127, -126, 121, -111, 97,   -80, 59,   -36, 12 },
+      { 12,   -24, 36,   -48, 59,   -70, 80,   -89, 97,   -105, 111,
+        -116, 121, -124, 126, -127, 127, -125, 123, -119, 114,  -108,
+        101,  -93, 84,   -75, 64,   -54, 42,   -30, 18,   -6 }
+    };
+#endif  // CONFIG_DST_32X32
+
 // av1_sinpi_arr_data[i][j] = (int)round((sqrt(2) * sin(j*Pi/9) * 2 / 3) * (1
 // << (cos_bit_min + i))) modified so that elements j=1,2 sum to element j=4.
 const int32_t av1_sinpi_arr_data[7][5] = {
@@ -90,8 +226,12 @@
   { TXFM_TYPE_DCT4, TXFM_TYPE_ADST4, TXFM_TYPE_ADST4, TXFM_TYPE_IDENTITY4 },
   { TXFM_TYPE_DCT8, TXFM_TYPE_ADST8, TXFM_TYPE_ADST8, TXFM_TYPE_IDENTITY8 },
   { TXFM_TYPE_DCT16, TXFM_TYPE_ADST16, TXFM_TYPE_ADST16, TXFM_TYPE_IDENTITY16 },
+#if CONFIG_DST_32X32
+  { TXFM_TYPE_DCT32, TXFM_TYPE_ADST32, TXFM_TYPE_ADST32, TXFM_TYPE_IDENTITY32 },
+#else
   { TXFM_TYPE_DCT32, TXFM_TYPE_INVALID, TXFM_TYPE_INVALID,
     TXFM_TYPE_IDENTITY32 },
+#endif  // CONFIG_DST_32X32
   { TXFM_TYPE_DCT64, TXFM_TYPE_INVALID, TXFM_TYPE_INVALID, TXFM_TYPE_INVALID }
 };
 
@@ -108,6 +248,9 @@
   1,   // TXFM_TYPE_IDENTITY8
   1,   // TXFM_TYPE_IDENTITY16
   1,   // TXFM_TYPE_IDENTITY32
+#if CONFIG_DST_32X32
+  1,  // TXFM_TYPE_ADST32
+#endif
 };
 
 void av1_range_check_buf(int32_t stage, const int32_t *input,
diff --git a/av1/common/av1_txfm.h b/av1/common/av1_txfm.h
index 20049b6..77bb19e 100644
--- a/av1/common/av1_txfm.h
+++ b/av1/common/av1_txfm.h
@@ -34,6 +34,16 @@
 extern const int32_t av1_cospi_arr_data[7][64];
 extern const int32_t av1_sinpi_arr_data[7][5];
 
+#if CONFIG_DST7_16X16
+extern const int16_t dst7_16x16[16][16];
+#define DST_16X16_PREC_BITS 7
+#endif
+
+#if CONFIG_DST_32X32
+extern const int16_t dst7_32x32[32][32];
+#define DST_32X32_PREC_BITS 7
+#endif  // CONFIG_DST_32X32
+
 #define MAX_TXFM_STAGE_NUM 12
 
 static const int cos_bit_min = 10;
@@ -125,6 +135,9 @@
   TXFM_TYPE_IDENTITY8,
   TXFM_TYPE_IDENTITY16,
   TXFM_TYPE_IDENTITY32,
+#if CONFIG_DST_32X32
+  TXFM_TYPE_ADST32,
+#endif
   TXFM_TYPES,
   TXFM_TYPE_INVALID,
 } UENUM1BYTE(TXFM_TYPE);
diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index d5f3ff9..c535ca8 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -1026,6 +1026,18 @@
   { EXT_TX_SET_ALL16, EXT_TX_SET_DTT9_IDTX_1DDCT },
 };
 
+#if CONFIG_DST_32X32
+static INLINE TxSetType av1_get_ext_tx_set_type(TX_SIZE tx_size, int is_inter,
+                                                int use_reduced_set) {
+  const TX_SIZE sqr_up = txsize_sqr_up_map[tx_size];
+  if (sqr_up == TX_64X64) return EXT_TX_SET_DCTONLY;
+  if (sqr_up == TX_32X32) return EXT_TX_SET_DTT4_IDTX;  // intra and inter
+  if (use_reduced_set)
+    return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DTT4_IDTX;
+  const int is_sqr_16x16 = (txsize_sqr_map[tx_size] == TX_16X16);
+  return av1_ext_tx_set_lookup[is_inter][is_sqr_16x16];
+}
+#else
 static INLINE TxSetType av1_get_ext_tx_set_type(TX_SIZE tx_size, int is_inter,
                                                 int use_reduced_set) {
   const TX_SIZE tx_size_sqr_up = txsize_sqr_up_map[tx_size];
@@ -1037,6 +1049,7 @@
   const TX_SIZE tx_size_sqr = txsize_sqr_map[tx_size];
   return av1_ext_tx_set_lookup[is_inter][tx_size_sqr == TX_16X16];
 }
+#endif  // CONFIG_DST_32X32
 
 // Maps tx set types to the indices.
 static const int ext_tx_set_index[2][EXT_TX_SET_TYPES] = {
diff --git a/av1/common/idct.c b/av1/common/idct.c
index bff438f..0e9c727 100644
--- a/av1/common/idct.c
+++ b/av1/common/idct.c
@@ -290,9 +290,32 @@
       tmp[r * tmp_stride + c] = dst[r * stride + c];
     }
   }
-
+#if CONFIG_DST7_16X16 && CONFIG_DST_32X32
+  if (tx_size_wide[tx_size] == 32 || tx_size_high[tx_size] == 32 ||
+      tx_size_wide[tx_size] == 16 || tx_size_high[tx_size] == 16)
+    av1_highbd_inv_txfm_add_c(dqcoeff, CONVERT_TO_BYTEPTR(tmp), tmp_stride,
+                              txfm_param);
+  else
+    av1_highbd_inv_txfm_add(dqcoeff, CONVERT_TO_BYTEPTR(tmp), tmp_stride,
+                            txfm_param);
+#elif CONFIG_DST7_16X16
+  if (tx_size_wide[tx_size] == 16 || tx_size_high[tx_size] == 16)
+    av1_highbd_inv_txfm_add_c(dqcoeff, CONVERT_TO_BYTEPTR(tmp), tmp_stride,
+                              txfm_param);
+  else
+    av1_highbd_inv_txfm_add(dqcoeff, CONVERT_TO_BYTEPTR(tmp), tmp_stride,
+                            txfm_param);
+#elif CONFIG_DST_32X32
+  if (tx_size_wide[tx_size] == 32 || tx_size_high[tx_size] == 32)
+    av1_highbd_inv_txfm_add_c(dqcoeff, CONVERT_TO_BYTEPTR(tmp), tmp_stride,
+                              txfm_param);
+  else
+    av1_highbd_inv_txfm_add(dqcoeff, CONVERT_TO_BYTEPTR(tmp), tmp_stride,
+                            txfm_param);
+#else
   av1_highbd_inv_txfm_add(dqcoeff, CONVERT_TO_BYTEPTR(tmp), tmp_stride,
                           txfm_param);
+#endif
 
   for (int r = 0; r < h; ++r) {
     for (int c = 0; c < w; ++c) {
@@ -314,9 +337,56 @@
                   &txfm_param);
   assert(av1_ext_tx_used[txfm_param.tx_set_type][txfm_param.tx_type]);
 
+#if CONFIG_DST7_16X16 || CONFIG_DST_32X32
+  uint16_t allowed_tx_mask = 0xF1FE;
+  allowed_tx_mask &= (1 << tx_type);
+#endif
+
   if (txfm_param.is_hbd) {
+#if CONFIG_DST7_16X16 && CONFIG_DST_32X32
+    if ((tx_size_wide[tx_size] == 16 || tx_size_high[tx_size] == 16 ||
+         tx_size_wide[tx_size] == 32 || tx_size_high[tx_size] == 32) &&
+        allowed_tx_mask)
+      av1_highbd_inv_txfm_add_c(dqcoeff, dst, stride, &txfm_param);
+    else
+      av1_highbd_inv_txfm_add(dqcoeff, dst, stride, &txfm_param);
+#elif CONFIG_DST7_16X16
+    if ((tx_size_wide[tx_size] == 16 || tx_size_high[tx_size] == 16) &&
+        allowed_tx_mask)
+      av1_highbd_inv_txfm_add_c(dqcoeff, dst, stride, &txfm_param);
+    else
+      av1_highbd_inv_txfm_add(dqcoeff, dst, stride, &txfm_param);
+#elif CONFIG_DST_32X32
+    if ((tx_size_wide[tx_size] == 32 || tx_size_high[tx_size] == 32) &&
+        allowed_tx_mask)
+      av1_highbd_inv_txfm_add_c(dqcoeff, dst, stride, &txfm_param);
+    else
+      av1_highbd_inv_txfm_add(dqcoeff, dst, stride, &txfm_param);
+#else
     av1_highbd_inv_txfm_add(dqcoeff, dst, stride, &txfm_param);
+#endif  // CONFIG_DST7_16X16 && CONFIG_DST_32X32
   } else {
+#if CONFIG_DST7_16X16 && CONFIG_DST_32X32
+    if ((tx_size_wide[tx_size] == 16 || tx_size_high[tx_size] == 16 ||
+         tx_size_wide[tx_size] == 32 || tx_size_high[tx_size] == 32) &&
+        allowed_tx_mask)
+      av1_inv_txfm_add_c(dqcoeff, dst, stride, &txfm_param);
+    else
+      av1_inv_txfm_add(dqcoeff, dst, stride, &txfm_param);
+#elif CONFIG_DST7_16X16
+    if ((tx_size_wide[tx_size] == 16 || tx_size_high[tx_size] == 16) &&
+        allowed_tx_mask)
+      av1_inv_txfm_add_c(dqcoeff, dst, stride, &txfm_param);
+    else
+      av1_inv_txfm_add(dqcoeff, dst, stride, &txfm_param);
+#elif CONFIG_DST_32X32
+    if ((tx_size_wide[tx_size] == 32 || tx_size_high[tx_size] == 32) &&
+        allowed_tx_mask)
+      av1_inv_txfm_add_c(dqcoeff, dst, stride, &txfm_param);
+    else
+      av1_inv_txfm_add(dqcoeff, dst, stride, &txfm_param);
+#else
     av1_inv_txfm_add(dqcoeff, dst, stride, &txfm_param);
+#endif  // CONFIG_DST7_16X16 && CONFIG_DST_32X32
   }
 }
diff --git a/av1/encoder/av1_fwd_txfm1d.c b/av1/encoder/av1_fwd_txfm1d.c
index 6601c19..854acfa 100644
--- a/av1/encoder/av1_fwd_txfm1d.c
+++ b/av1/encoder/av1_fwd_txfm1d.c
@@ -846,6 +846,20 @@
   av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 }
 
+#if CONFIG_DST7_16X16
+// Forward 16-point ADST under CONFIG_DST7_16X16: output[k] is row k of
+// dst7_16x16 dotted with input, rounded. input and output must not alias.
+void av1_fadst16(const int32_t *input, int32_t *output, int8_t cos_bit,
+                 const int8_t *stage_range) {
+  (void)stage_range;  // unused by this implementation
+  (void)cos_bit;      // unused; precision is fixed by DST_16X16_PREC_BITS
+  for (int32_t k = 0; k < 16; ++k) {
+    int32_t acc = 0;
+    for (int32_t n = 0; n < 16; ++n) acc += dst7_16x16[k][n] * input[n];
+    output[k] = ROUND_POWER_OF_TWO_SIGNED(acc, DST_16X16_PREC_BITS);
+  }
+}
+#else
 void av1_fadst16(const int32_t *input, int32_t *output, int8_t cos_bit,
                  const int8_t *stage_range) {
   const int32_t size = 16;
@@ -1060,6 +1074,22 @@
   bf1[15] = bf0[0];
   av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 }
+#endif  // CONFIG_DST7_16X16
+
+#if CONFIG_DST_32X32
+// Forward 32-point ADST: output[k] is row k of dst7_32x32 dotted with input,
+// rounded by DST_32X32_PREC_BITS. input and output must not alias.
+void av1_fadst32(const int32_t *input, int32_t *output, int8_t cos_bit,
+                 const int8_t *stage_range) {
+  (void)stage_range;  // unused by this implementation
+  (void)cos_bit;      // unused; precision is fixed by DST_32X32_PREC_BITS
+  for (int32_t k = 0; k < 32; ++k) {
+    int32_t acc = 0;
+    for (int32_t n = 0; n < 32; ++n) acc += dst7_32x32[k][n] * input[n];
+    output[k] = ROUND_POWER_OF_TWO_SIGNED(acc, DST_32X32_PREC_BITS);
+  }
+}
+#endif  // CONFIG_DST_32X32
 
 void av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
                       const int8_t *stage_range) {
diff --git a/av1/encoder/av1_fwd_txfm1d.h b/av1/encoder/av1_fwd_txfm1d.h
index 9ef54fe..877b40b 100644
--- a/av1/encoder/av1_fwd_txfm1d.h
+++ b/av1/encoder/av1_fwd_txfm1d.h
@@ -34,6 +34,10 @@
                 const int8_t *stage_range);
 void av1_fadst16(const int32_t *input, int32_t *output, int8_t cos_bit,
                  const int8_t *stage_range);
+#if CONFIG_DST_32X32
+void av1_fadst32(const int32_t *input, int32_t *output, int8_t cos_bit,
+                 const int8_t *stage_range);
+#endif
 void av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
                       const int8_t *stage_range);
 void av1_fidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit,
diff --git a/av1/encoder/av1_fwd_txfm2d.c b/av1/encoder/av1_fwd_txfm2d.c
index bcb829d..98d2887 100644
--- a/av1/encoder/av1_fwd_txfm2d.c
+++ b/av1/encoder/av1_fwd_txfm2d.c
@@ -30,6 +30,9 @@
     case TXFM_TYPE_ADST4: return av1_fadst4;
     case TXFM_TYPE_ADST8: return av1_fadst8;
     case TXFM_TYPE_ADST16: return av1_fadst16;
+#if CONFIG_DST_32X32
+    case TXFM_TYPE_ADST32: return av1_fadst32;
+#endif  // CONFIG_DST_32X32
     case TXFM_TYPE_IDENTITY4: return av1_fidentity4_c;
     case TXFM_TYPE_IDENTITY8: return av1_fidentity8_c;
     case TXFM_TYPE_IDENTITY16: return av1_fidentity16_c;
@@ -364,6 +367,10 @@
 static const int8_t fadst8_range_mult2[8] = { 0, 0, 1, 3, 3, 5, 5, 5 };
 static const int8_t fadst16_range_mult2[10] = { 0, 0, 1, 3, 3, 5, 5, 7, 7, 7 };
 
+#if CONFIG_DST_32X32
+static const int8_t fadst32_range_mult2[1] = { 9 };
+#endif
+
 static const int8_t fidtx4_range_mult2[1] = { 1 };
 static const int8_t fidtx8_range_mult2[1] = { 2 };
 static const int8_t fidtx16_range_mult2[1] = { 3 };
@@ -380,10 +387,22 @@
 #endif
 
 static const int8_t *fwd_txfm_range_mult2_list[TXFM_TYPES] = {
-  fdct4_range_mult2,  fdct8_range_mult2,   fdct16_range_mult2,
-  fdct32_range_mult2, fdct64_range_mult2,  fadst4_range_mult2,
-  fadst8_range_mult2, fadst16_range_mult2, fidtx4_range_mult2,
-  fidtx8_range_mult2, fidtx16_range_mult2, fidtx32_range_mult2
+  fdct4_range_mult2,
+  fdct8_range_mult2,
+  fdct16_range_mult2,
+  fdct32_range_mult2,
+  fdct64_range_mult2,
+  fadst4_range_mult2,
+  fadst8_range_mult2,
+  fadst16_range_mult2,
+  fidtx4_range_mult2,
+  fidtx8_range_mult2,
+  fidtx16_range_mult2,
+  fidtx32_range_mult2,
+#if CONFIG_DST_32X32
+  // Last slot: TXFM_TYPE_ADST32 sits just before TXFM_TYPES in the enum.
+  fadst32_range_mult2,
+#endif  // CONFIG_DST_32X32
 };
 
 static INLINE void set_fwd_txfm_non_scale_range(TXFM_2D_FLIP_CFG *cfg) {
diff --git a/av1/encoder/hybrid_fwd_txfm.c b/av1/encoder/hybrid_fwd_txfm.c
index 0699085..bd09770 100644
--- a/av1/encoder/hybrid_fwd_txfm.c
+++ b/av1/encoder/hybrid_fwd_txfm.c
@@ -109,7 +109,16 @@
   int32_t *dst_coeff = (int32_t *)coeff;
   const TX_TYPE tx_type = txfm_param->tx_type;
   const int bd = txfm_param->bd;
+#if CONFIG_DST7_16X16
+  uint16_t allowed_tx_mask = 0xF1FE;
+  allowed_tx_mask &= (1 << tx_type);
+  if (allowed_tx_mask)
+    av1_fwd_txfm2d_8x16_c(src_diff, dst_coeff, diff_stride, tx_type, bd);
+  else
+    av1_fwd_txfm2d_8x16(src_diff, dst_coeff, diff_stride, tx_type, bd);
+#else
   av1_fwd_txfm2d_8x16(src_diff, dst_coeff, diff_stride, tx_type, bd);
+#endif  // CONFIG_DST7_16X16
 }
 
 static void highbd_fwd_txfm_16x8(const int16_t *src_diff, tran_low_t *coeff,
@@ -117,49 +126,132 @@
   int32_t *dst_coeff = (int32_t *)coeff;
   const TX_TYPE tx_type = txfm_param->tx_type;
   const int bd = txfm_param->bd;
+#if CONFIG_DST7_16X16
+  uint16_t allowed_tx_mask = 0xF1FE;
+  allowed_tx_mask &= (1 << tx_type);
+  if (allowed_tx_mask)
+    av1_fwd_txfm2d_16x8_c(src_diff, dst_coeff, diff_stride, tx_type, bd);
+  else
+    av1_fwd_txfm2d_16x8(src_diff, dst_coeff, diff_stride, tx_type, bd);
+#else
   av1_fwd_txfm2d_16x8(src_diff, dst_coeff, diff_stride, tx_type, bd);
+#endif  // CONFIG_DST7_16X16
 }
 
 static void highbd_fwd_txfm_16x32(const int16_t *src_diff, tran_low_t *coeff,
                                   int diff_stride, TxfmParam *txfm_param) {
   int32_t *dst_coeff = (int32_t *)coeff;
+#if CONFIG_DST_32X32
+  // tx_types whose bit is set in 0xF1FE (values 1-8 and 12-15) are sent to
+  // the _c kernel; every other tx_type keeps calling av1_fwd_txfm2d_16x32.
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int bd = txfm_param->bd;
+  if ((0xF1FE >> tx_type) & 1) {
+    av1_fwd_txfm2d_16x32_c(src_diff, dst_coeff, diff_stride, tx_type, bd);
+  } else {
+    av1_fwd_txfm2d_16x32(src_diff, dst_coeff, diff_stride, tx_type, bd);
+  }
+#else
+  assert(txfm_param->tx_type == DCT_DCT || txfm_param->tx_type == IDTX);
   av1_fwd_txfm2d_16x32(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
                        txfm_param->bd);
+#endif  // CONFIG_DST_32X32
 }
 
 static void highbd_fwd_txfm_32x16(const int16_t *src_diff, tran_low_t *coeff,
                                   int diff_stride, TxfmParam *txfm_param) {
   int32_t *dst_coeff = (int32_t *)coeff;
+#if CONFIG_DST_32X32
+  // tx_types whose bit is set in 0xF1FE (values 1-8 and 12-15) are sent to
+  // the _c kernel; every other tx_type keeps calling av1_fwd_txfm2d_32x16.
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int bd = txfm_param->bd;
+  if ((0xF1FE >> tx_type) & 1) {
+    av1_fwd_txfm2d_32x16_c(src_diff, dst_coeff, diff_stride, tx_type, bd);
+  } else {
+    av1_fwd_txfm2d_32x16(src_diff, dst_coeff, diff_stride, tx_type, bd);
+  }
+#else
+  assert(txfm_param->tx_type == DCT_DCT || txfm_param->tx_type == IDTX);
   av1_fwd_txfm2d_32x16(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
                        txfm_param->bd);
+#endif  // CONFIG_DST_32X32
 }
 
 static void highbd_fwd_txfm_16x4(const int16_t *src_diff, tran_low_t *coeff,
                                  int diff_stride, TxfmParam *txfm_param) {
   int32_t *dst_coeff = (int32_t *)coeff;
+#if CONFIG_DST7_16X16
+  // tx_types whose bit is set in 0xF1FE (values 1-8 and 12-15) are sent to
+  // the _c kernel; every other tx_type keeps calling av1_fwd_txfm2d_16x4.
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int bd = txfm_param->bd;
+  if ((0xF1FE >> tx_type) & 1) {
+    av1_fwd_txfm2d_16x4_c(src_diff, dst_coeff, diff_stride, tx_type, bd);
+  } else {
+    av1_fwd_txfm2d_16x4(src_diff, dst_coeff, diff_stride, tx_type, bd);
+  }
+#else
   av1_fwd_txfm2d_16x4(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
                       txfm_param->bd);
+#endif  // CONFIG_DST7_16X16
 }
 
 static void highbd_fwd_txfm_4x16(const int16_t *src_diff, tran_low_t *coeff,
                                  int diff_stride, TxfmParam *txfm_param) {
   int32_t *dst_coeff = (int32_t *)coeff;
+#if CONFIG_DST7_16X16
+  // tx_types whose bit is set in 0xF1FE (values 1-8 and 12-15) are sent to
+  // the _c kernel; every other tx_type keeps calling av1_fwd_txfm2d_4x16.
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int bd = txfm_param->bd;
+  if ((0xF1FE >> tx_type) & 1) {
+    av1_fwd_txfm2d_4x16_c(src_diff, dst_coeff, diff_stride, tx_type, bd);
+  } else {
+    av1_fwd_txfm2d_4x16(src_diff, dst_coeff, diff_stride, tx_type, bd);
+  }
+#else
   av1_fwd_txfm2d_4x16(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
                       txfm_param->bd);
+#endif  // CONFIG_DST7_16X16
 }
 
 static void highbd_fwd_txfm_32x8(const int16_t *src_diff, tran_low_t *coeff,
                                  int diff_stride, TxfmParam *txfm_param) {
   int32_t *dst_coeff = (int32_t *)coeff;
+#if CONFIG_DST_32X32
+  // tx_types whose bit is set in 0xF1FE (values 1-8 and 12-15) are sent to
+  // the _c kernel; every other tx_type keeps calling av1_fwd_txfm2d_32x8.
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int bd = txfm_param->bd;
+  if ((0xF1FE >> tx_type) & 1) {
+    av1_fwd_txfm2d_32x8_c(src_diff, dst_coeff, diff_stride, tx_type, bd);
+  } else {
+    av1_fwd_txfm2d_32x8(src_diff, dst_coeff, diff_stride, tx_type, bd);
+  }
+#else
   av1_fwd_txfm2d_32x8(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
                       txfm_param->bd);
+#endif  // CONFIG_DST_32X32
 }
 
 static void highbd_fwd_txfm_8x32(const int16_t *src_diff, tran_low_t *coeff,
                                  int diff_stride, TxfmParam *txfm_param) {
   int32_t *dst_coeff = (int32_t *)coeff;
+#if CONFIG_DST_32X32
+  // tx_types whose bit is set in 0xF1FE (values 1-8 and 12-15) are sent to
+  // the _c kernel; every other tx_type keeps calling av1_fwd_txfm2d_8x32.
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int bd = txfm_param->bd;
+  if ((0xF1FE >> tx_type) & 1) {
+    av1_fwd_txfm2d_8x32_c(src_diff, dst_coeff, diff_stride, tx_type, bd);
+  } else {
+    av1_fwd_txfm2d_8x32(src_diff, dst_coeff, diff_stride, tx_type, bd);
+  }
+#else
   av1_fwd_txfm2d_8x32(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
                       txfm_param->bd);
+#endif  // CONFIG_DST_32X32
 }
 
 static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
@@ -175,7 +267,16 @@
   int32_t *dst_coeff = (int32_t *)coeff;
   const TX_TYPE tx_type = txfm_param->tx_type;
   const int bd = txfm_param->bd;
+#if CONFIG_DST7_16X16
+  uint16_t allowed_tx_mask = 0xF1FE;
+  allowed_tx_mask &= (1 << tx_type);
+  if (allowed_tx_mask)
+    av1_fwd_txfm2d_16x16_c(src_diff, dst_coeff, diff_stride, tx_type, bd);
+  else
+    av1_fwd_txfm2d_16x16(src_diff, dst_coeff, diff_stride, tx_type, bd);
+#else
   av1_fwd_txfm2d_16x16(src_diff, dst_coeff, diff_stride, tx_type, bd);
+#endif  // CONFIG_DST7_16X16
 }
 
 static void highbd_fwd_txfm_32x32(const int16_t *src_diff, tran_low_t *coeff,
@@ -183,7 +284,16 @@
   int32_t *dst_coeff = (int32_t *)coeff;
   const TX_TYPE tx_type = txfm_param->tx_type;
   const int bd = txfm_param->bd;
+#if CONFIG_DST_32X32
+  uint16_t allowed_tx_mask = 0xF1FE;
+  allowed_tx_mask &= (1 << tx_type);
+  if (allowed_tx_mask)
+    av1_fwd_txfm2d_32x32_c(src_diff, dst_coeff, diff_stride, tx_type, bd);
+  else
+    av1_fwd_txfm2d_32x32(src_diff, dst_coeff, diff_stride, tx_type, bd);
+#else
   av1_fwd_txfm2d_32x32(src_diff, dst_coeff, diff_stride, tx_type, bd);
+#endif  // CONFIG_DST_32X32
 }
 
 static void highbd_fwd_txfm_32x64(const int16_t *src_diff, tran_low_t *coeff,
@@ -230,10 +340,41 @@
 
 void av1_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride,
                   TxfmParam *txfm_param) {
-  if (txfm_param->bd == 8)
+  if (txfm_param->bd == 8) {
+#if CONFIG_DST7_16X16 || CONFIG_DST_32X32
+    const TX_TYPE tx_type = txfm_param->tx_type;
+    uint16_t allowed_tx_mask = 0xF1FE;  // bits 1-8 and 12-15 set
+    allowed_tx_mask &= (1 << tx_type);  // nonzero iff tx_type's bit is set
+#endif  // CONFIG_DST7_16X16 || CONFIG_DST_32X32
+#if CONFIG_DST7_16X16 && CONFIG_DST_32X32
+    if ((tx_size_wide[txfm_param->tx_size] == 16 ||
+         tx_size_high[txfm_param->tx_size] == 16 ||
+         tx_size_wide[txfm_param->tx_size] == 32 ||
+         tx_size_high[txfm_param->tx_size] == 32) &&
+        allowed_tx_mask)
+      av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param);
+    else
+      av1_lowbd_fwd_txfm(src_diff, coeff, diff_stride, txfm_param);
+#elif CONFIG_DST7_16X16
+    if ((tx_size_wide[txfm_param->tx_size] == 16 ||
+         tx_size_high[txfm_param->tx_size] == 16) &&
+        allowed_tx_mask)
+      av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param);
+    else
+      av1_lowbd_fwd_txfm(src_diff, coeff, diff_stride, txfm_param);
+#elif CONFIG_DST_32X32
+    if ((tx_size_wide[txfm_param->tx_size] == 32 ||
+         tx_size_high[txfm_param->tx_size] == 32) &&
+        allowed_tx_mask)
+      av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param);
+    else
+      av1_lowbd_fwd_txfm(src_diff, coeff, diff_stride, txfm_param);
+#else
     av1_lowbd_fwd_txfm(src_diff, coeff, diff_stride, txfm_param);
-  else
+#endif  // CONFIG_DST7_16X16 && CONFIG_DST_32X32
+  } else {
     av1_highbd_fwd_txfm(src_diff, coeff, diff_stride, txfm_param);
+  }
 }
 
 void av1_lowbd_fwd_txfm_c(const int16_t *src_diff, tran_low_t *coeff,
diff --git a/av1/encoder/tx_search.c b/av1/encoder/tx_search.c
index 81646ed..f3a69cf 100644
--- a/av1/encoder/tx_search.c
+++ b/av1/encoder/tx_search.c
@@ -1995,6 +1995,11 @@
   if (cpi->oxcf.txfm_cfg.enable_flip_idtx == 0)
     ext_tx_used_flag &= DCT_ADST_TX_MASK;
 
+#if CONFIG_DST_32X32
+  if (!is_inter && (txsize_sqr_up_map[tx_size] == TX_32X32))
+    ext_tx_used_flag &= DCT_ADST_TX_MASK;
+#endif
+
   uint16_t allowed_tx_mask = 0;  // 1: allow; 0: skip.
   if (txk_allowed < TX_TYPES) {
     allowed_tx_mask = 1 << txk_allowed;
diff --git a/build/cmake/aom_config_defaults.cmake b/build/cmake/aom_config_defaults.cmake
index fa53112..b4e03d3 100644
--- a/build/cmake/aom_config_defaults.cmake
+++ b/build/cmake/aom_config_defaults.cmake
@@ -139,7 +139,9 @@
                    "AV2 experiment flag to remove dual filter.")
 # Partitioning
 set_aom_config_var(CONFIG_SDP 0 NUMBER "AV2 Semi-Decoupled Partitioning.")
-
+# Primary Transforms
+set_aom_config_var(CONFIG_DST7_16X16 0 NUMBER "AV2 DST7 16x16 experiment flag.")
+set_aom_config_var(CONFIG_DST_32X32 0 NUMBER "AV2 DST7 32x32 experiment flag.")
 #
 # Variables in this section control optional features of the build system.
 #