Add CONFIG_DAALA_DCT32 experiment.

This experiment replaces the 32-point Type-II DCT and 32-point
Type-IV DST scaling vp9 transforms with the 32-point orthonormal
Daala transforms.

subset-1:

    monty-square-baseline-s1-F3@2017-08-02T11:50:51.375Z ->
      monty-square-dct32-s1-F3@2017-08-02T11:50:18.859Z

      PSNR | PSNR Cb | PSNR Cr | PSNR HVS |    SSIM | MS SSIM | CIEDE 2000
    0.0000 |  0.0115 | -0.1044 |  -0.0185 | -0.0069 | -0.0603 |     0.0555

objective-1-fast (4 frames):

    monty-square-baseline-o1f-F3-l4-fine@2017-08-12T02:18:05.560Z ->
      monty-square-dct32-o1f-F3-l4-fine@2017-08-12T02:19:44.461Z

      PSNR | PSNR Cb | PSNR Cr | PSNR HVS |    SSIM | MS SSIM | CIEDE 2000
   -0.0269 | -0.0715 |     N/A |  -0.0547 | -0.0268 | -0.0590 |        N/A

Change-Id: Ib1bad991d82eb67956e94a6216298a84e908b169
diff --git a/aom_dsp/inv_txfm.c b/aom_dsp/inv_txfm.c
index d0b6fef..fbf09db 100644
--- a/aom_dsp/inv_txfm.c
+++ b/aom_dsp/inv_txfm.c
@@ -14,7 +14,8 @@
 
 #include "./aom_dsp_rtcd.h"
 #include "aom_dsp/inv_txfm.h"
-#if CONFIG_DAALA_DCT4 || CONFIG_DAALA_DCT8 || CONFIG_DAALA_DCT16
+#if CONFIG_DAALA_DCT4 || CONFIG_DAALA_DCT8 || CONFIG_DAALA_DCT16 || \
+    CONFIG_DAALA_DCT32
 #include "av1/common/daala_tx.h"
 #endif
 
@@ -881,6 +882,18 @@
   }
 }
 
+#if CONFIG_DAALA_DCT32
+void aom_idct32_c(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  od_coeff x[32];
+  od_coeff y[32];
+  for (i = 0; i < 32; i++) y[i] = (od_coeff)input[i];
+  od_bin_idct32(x, 1, y);
+  for (i = 0; i < 32; i++) output[i] = (tran_low_t)x[i];
+}
+
+#else
+
 void aom_idct32_c(const tran_low_t *input, tran_low_t *output) {
   tran_low_t step1[32], step2[32];
   tran_high_t temp1, temp2;
@@ -1247,6 +1260,7 @@
   output[30] = WRAPLOW(step1[1] - step1[30]);
   output[31] = WRAPLOW(step1[0] - step1[31]);
 }
+#endif
 
 #if CONFIG_MRC_TX
 void aom_imrc32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
diff --git a/av1/common/daala_tx.c b/av1/common/daala_tx.c
index d9f674c..82c8af5 100644
--- a/av1/common/daala_tx.c
+++ b/av1/common/daala_tx.c
@@ -795,6 +795,64 @@
   } \
   while (0)
 
+#define OD_FDCT_16_ASYM(t0, t8, t8h, t4, tc, tch, t2, ta, tah, t6, te, teh, \
+  t1, t9, t9h, t5, td, tdh, t3, tb, tbh, t7, tf, tfh) \
+  /* Embedded 16-point asymmetric Type-II fDCT. */ \
+  do { \
+    t0 += tfh; \
+    tf = t0 - tf; \
+    t1 -= teh; \
+    te += t1; \
+    t2 += tdh; \
+    td = t2 - td; \
+    t3 -= tch; \
+    tc += t3; \
+    t4 += tbh; \
+    tb = t4 - tb; \
+    t5 -= tah; \
+    ta += t5; \
+    t6 += t9h; \
+    t9 = t6 - t9; \
+    t7 -= t8h; \
+    t8 += t7; \
+    OD_FDCT_8(t0, t8, t4, tc, t2, ta, t6, te); \
+    OD_FDST_8(tf, t7, tb, t3, td, t5, t9, t1); \
+  } \
+  while (0)
+
+#define OD_IDCT_16_ASYM(t0, t8, t4, tc, t2, ta, t6, te, \
+  t1, t1h, t9, t9h, t5, t5h, td, tdh, t3, t3h, tb, tbh, t7, t7h, tf, tfh) \
+  /* Embedded 16-point asymmetric Type-II iDCT. */ \
+  do { \
+    OD_IDST_8(tf, tb, td, t9, te, ta, tc, t8); \
+    OD_IDCT_8(t0, t4, t2, t6, t1, t5, t3, t7); \
+    t1 -= te; \
+    t1h = OD_DCT_RSHIFT(t1, 1); \
+    te += t1h; \
+    t9 = t6 - t9; \
+    t9h = OD_DCT_RSHIFT(t9, 1); \
+    t6 -= t9h; \
+    t5 -= ta; \
+    t5h = OD_DCT_RSHIFT(t5, 1); \
+    ta += t5h; \
+    td = t2 - td; \
+    tdh = OD_DCT_RSHIFT(td, 1); \
+    t2 -= tdh; \
+    t3 -= tc; \
+    t3h = OD_DCT_RSHIFT(t3, 1); \
+    tc += t3h; \
+    tb = t4 - tb; \
+    tbh = OD_DCT_RSHIFT(tb, 1); \
+    t4 -= tbh; \
+    t7 -= t8; \
+    t7h = OD_DCT_RSHIFT(t7, 1); \
+    t8 += t7h; \
+    tf = t0 - tf; \
+    tfh = OD_DCT_RSHIFT(tf, 1); \
+    t0 -= tfh; \
+  } \
+  while (0)
+
 #define OD_FDST_16(s0, s8, s4, sc, s2, sa, s6, se, \
   s1, s9, s5, sd, s3, sb, s7, sf) \
   /* Embedded 16-point orthonormal Type-IV fDST. */ \
@@ -1176,6 +1234,560 @@
   } \
   while (0)
 
+/* TODO: rewrite this to match OD_FDST_16. */
+#define OD_FDST_16_ASYM(t0, t0h, t8, t4, t4h, tc, t2, ta, t6, te, \
+  t1, t9, t5, td, t3, tb, t7, t7h, tf) \
+  /* Embedded 16-point asymmetric Type-IV fDST. */ \
+  do { \
+    int t2h; \
+    int t3h; \
+    int t6h; \
+    int t8h; \
+    int t9h; \
+    int tch; \
+    int tdh; \
+    /* TODO: Can we move these into another operation */ \
+    t8 = -t8; \
+    t9 = -t9; \
+    ta = -ta; \
+    tb = -tb; \
+    td = -td; \
+    /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
+    OD_DCT_OVERFLOW_CHECK(te, 13573, 8192, 136); \
+    t1 -= (te*13573 + 8192) >> 14; \
+    /* 11585/32768 ~= Sin[Pi/4]/2 ~= 0.353553390593274 */ \
+    OD_DCT_OVERFLOW_CHECK(t1, 11585, 16384, 137); \
+    te += (t1*11585 + 16384) >> 15; \
+    /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
+    OD_DCT_OVERFLOW_CHECK(te, 13573, 8192, 138); \
+    t1 -= (te*13573 + 8192) >> 14; \
+    /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
+    OD_DCT_OVERFLOW_CHECK(td, 4161, 8192, 139); \
+    t2 += (td*4161 + 8192) >> 14; \
+    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
+    OD_DCT_OVERFLOW_CHECK(t2, 15137, 8192, 140); \
+    td -= (t2*15137 + 8192) >> 14; \
+    /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
+    OD_DCT_OVERFLOW_CHECK(td, 14341, 8192, 141); \
+    t2 += (td*14341 + 8192) >> 14; \
+    /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
+    OD_DCT_OVERFLOW_CHECK(t3, 14341, 8192, 142); \
+    tc -= (t3*14341 + 8192) >> 14; \
+    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
+    OD_DCT_OVERFLOW_CHECK(tc, 15137, 8192, 143); \
+    t3 += (tc*15137 + 8192) >> 14; \
+    /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
+    OD_DCT_OVERFLOW_CHECK(t3, 4161, 8192, 144); \
+    tc -= (t3*4161 + 8192) >> 14; \
+    te = t0h - te; \
+    t0 -= te; \
+    tf = OD_DCT_RSHIFT(t1, 1) - tf; \
+    t1 -= tf; \
+    /* TODO: Can we move this into another operation */ \
+    tc = -tc; \
+    t2 = OD_DCT_RSHIFT(tc, 1) - t2; \
+    tc -= t2; \
+    t3 = OD_DCT_RSHIFT(td, 1) - t3; \
+    td = t3 - td; \
+    /* 7489/8192 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
+    OD_DCT_OVERFLOW_CHECK(t6, 7489, 4096, 145); \
+    t9 -= (t6*7489 + 4096) >> 13; \
+    /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
+    OD_DCT_OVERFLOW_CHECK(t9, 11585, 8192, 146); \
+    t6 += (t9*11585 + 8192) >> 14; \
+    /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
+    OD_DCT_OVERFLOW_CHECK(t6, 19195, 16384, 147); \
+    t9 += (t6*19195 + 16384) >> 15; \
+    t8 += OD_DCT_RSHIFT(t9, 1); \
+    t9 -= t8; \
+    t6 = t7h - t6; \
+    t7 -= t6; \
+    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \
+    OD_DCT_OVERFLOW_CHECK(t7, 6723, 4096, 148); \
+    t8 += (t7*6723 + 4096) >> 13; \
+    /* 16069/16384 ~= Sin[7*Pi/16] ~= 0.980785280403230 */ \
+    OD_DCT_OVERFLOW_CHECK(t8, 16069, 8192, 149); \
+    t7 -= (t8*16069 + 8192) >> 14; \
+    /* 6723/8192 ~= Tan[7*Pi/32]) ~= 0.820678790828660 */ \
+    OD_DCT_OVERFLOW_CHECK(t7, 6723, 4096, 150); \
+    t8 += (t7*6723 + 4096) >> 13; \
+    /* 17515/32768 ~= Tan[5*Pi/32]) ~= 0.534511135950792 */ \
+    OD_DCT_OVERFLOW_CHECK(t6, 17515, 16384, 151); \
+    t9 += (t6*17515 + 16384) >> 15; \
+    /* 13623/16384 ~= Sin[5*Pi/16] ~= 0.831469612302545 */ \
+    OD_DCT_OVERFLOW_CHECK(t9, 13623, 8192, 152); \
+    t6 -= (t9*13623 + 8192) >> 14; \
+    /* 17515/32768 ~= Tan[5*Pi/32] ~= 0.534511135950792 */ \
+    OD_DCT_OVERFLOW_CHECK(t6, 17515, 16384, 153); \
+    t9 += (t6*17515 + 16384) >> 15; \
+    /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
+    OD_DCT_OVERFLOW_CHECK(ta, 13573, 8192, 154); \
+    t5 += (ta*13573 + 8192) >> 14; \
+    /* 11585/32768 ~= Sin[Pi/4]/2 ~= 0.353553390593274 */ \
+    OD_DCT_OVERFLOW_CHECK(t5, 11585, 16384, 155); \
+    ta -= (t5*11585 + 16384) >> 15; \
+    /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
+    OD_DCT_OVERFLOW_CHECK(ta, 13573, 8192, 156); \
+    t5 += (ta*13573 + 8192) >> 14; \
+    tb += OD_DCT_RSHIFT(t5, 1); \
+    t5 = tb - t5; \
+    ta += t4h; \
+    t4 -= ta; \
+    /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \
+    OD_DCT_OVERFLOW_CHECK(t5, 2485, 4096, 157); \
+    ta += (t5*2485 + 4096) >> 13; \
+    /* 18205/32768 ~= Sin[3*Pi/16] ~= 0.555570233019602 */ \
+    OD_DCT_OVERFLOW_CHECK(ta, 18205, 16384, 158); \
+    t5 -= (ta*18205 + 16384) >> 15; \
+    /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \
+    OD_DCT_OVERFLOW_CHECK(t5, 2485, 4096, 159); \
+    ta += (t5*2485 + 4096) >> 13; \
+    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \
+    OD_DCT_OVERFLOW_CHECK(t4, 6723, 4096, 160); \
+    tb -= (t4*6723 + 4096) >> 13; \
+    /* 16069/16384 ~= Sin[7*Pi/16] ~= 0.980785280403230 */ \
+    OD_DCT_OVERFLOW_CHECK(tb, 16069, 8192, 161); \
+    t4 += (tb*16069 + 8192) >> 14; \
+    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \
+    OD_DCT_OVERFLOW_CHECK(t4, 6723, 4096, 162); \
+    tb -= (t4*6723 + 4096) >> 13; \
+    /* TODO: Can we move this into another operation */ \
+    t5 = -t5; \
+    tc -= tf; \
+    tch = OD_DCT_RSHIFT(tc, 1); \
+    tf += tch; \
+    t3 += t0; \
+    t3h = OD_DCT_RSHIFT(t3, 1); \
+    t0 -= t3h; \
+    td -= t1; \
+    tdh = OD_DCT_RSHIFT(td, 1); \
+    t1 += tdh; \
+    t2 += te; \
+    t2h = OD_DCT_RSHIFT(t2, 1); \
+    te -= t2h; \
+    t8 += t4; \
+    t8h = OD_DCT_RSHIFT(t8, 1); \
+    t4 = t8h - t4; \
+    t7 = tb - t7; \
+    t7h = OD_DCT_RSHIFT(t7, 1); \
+    tb = t7h - tb; \
+    t6 -= ta; \
+    t6h = OD_DCT_RSHIFT(t6, 1); \
+    ta += t6h; \
+    t9 = t5 - t9; \
+    t9h = OD_DCT_RSHIFT(t9, 1); \
+    t5 -= t9h; \
+    t0 -= t7h; \
+    t7 += t0; \
+    tf += t8h; \
+    t8 -= tf; \
+    te -= t6h; \
+    t6 += te; \
+    t1 += t9h; \
+    t9 -= t1; \
+    tb -= tch; \
+    tc += tb; \
+    t4 += t3h; \
+    t3 -= t4; \
+    ta -= tdh; \
+    td += ta; \
+    t5 = t2h - t5; \
+    t2 -= t5; \
+    /* TODO: Can we move these into another operation */ \
+    t8 = -t8; \
+    t9 = -t9; \
+    ta = -ta; \
+    tb = -tb; \
+    tc = -tc; \
+    td = -td; \
+    tf = -tf; \
+    /* 7799/8192 ~= Tan[31*Pi/128] ~= 0.952079146700925 */ \
+    OD_DCT_OVERFLOW_CHECK(tf, 7799, 4096, 163); \
+    t0 -= (tf*7799 + 4096) >> 13; \
+    /* 4091/4096 ~= Sin[31*Pi/64] ~= 0.998795456205172 */ \
+    OD_DCT_OVERFLOW_CHECK(t0, 4091, 2048, 164); \
+    tf += (t0*4091 + 2048) >> 12; \
+    /* 7799/8192 ~= Tan[31*Pi/128] ~= 0.952079146700925 */ \
+    OD_DCT_OVERFLOW_CHECK(tf, 7799, 4096, 165); \
+    t0 -= (tf*7799 + 4096) >> 13; \
+    /* 2417/32768 ~= Tan[3*Pi/128] ~= 0.0737644315224493 */ \
+    OD_DCT_OVERFLOW_CHECK(te, 2417, 16384, 166); \
+    t1 += (te*2417 + 16384) >> 15; \
+    /* 601/4096 ~= Sin[3*Pi/64] ~= 0.146730474455362 */ \
+    OD_DCT_OVERFLOW_CHECK(t1, 601, 2048, 167); \
+    te -= (t1*601 + 2048) >> 12; \
+    /* 2417/32768 ~= Tan[3*Pi/128] ~= 0.0737644315224493 */ \
+    OD_DCT_OVERFLOW_CHECK(te, 2417, 16384, 168); \
+    t1 += (te*2417 + 16384) >> 15; \
+    /* 14525/32768 ~= Tan[17*Pi/128] ~= 0.443269513890864 */ \
+    OD_DCT_OVERFLOW_CHECK(t8, 14525, 16384, 169); \
+    t7 -= (t8*14525 + 16384) >> 15; \
+    /* 3035/4096 ~= Sin[17*Pi/64] ~= 0.740951125354959 */ \
+    OD_DCT_OVERFLOW_CHECK(t7, 3035, 2048, 170); \
+    t8 += (t7*3035 + 2048) >> 12; \
+    /* 7263/16384 ~= Tan[17*Pi/128] ~= 0.443269513890864 */ \
+    OD_DCT_OVERFLOW_CHECK(t8, 7263, 8192, 171); \
+    t7 -= (t8*7263 + 8192) >> 14; \
+    /* 6393/8192 ~= Tan[27*Pi/128] ~= 0.780407659653944 */ \
+    OD_DCT_OVERFLOW_CHECK(td, 6393, 4096, 172); \
+    t2 -= (td*6393 + 4096) >> 13; \
+    /* 3973/4096 ~= Sin[27*Pi/64] ~= 0.970031253194544 */ \
+    OD_DCT_OVERFLOW_CHECK(t2, 3973, 2048, 173); \
+    td += (t2*3973 + 2048) >> 12; \
+    /* 6393/8192 ~= Tan[27*Pi/128] ~= 0.780407659653944 */ \
+    OD_DCT_OVERFLOW_CHECK(td, 6393, 4096, 174); \
+    t2 -= (td*6393 + 4096) >> 13; \
+    /* 9281/16384 ~= Tan[21*Pi/128] ~= 0.566493002730344 */ \
+    OD_DCT_OVERFLOW_CHECK(ta, 9281, 8192, 175); \
+    t5 -= (ta*9281 + 8192) >> 14; \
+    /* 7027/8192 ~= Sin[21*Pi/64] ~= 0.857728610000272 */ \
+    OD_DCT_OVERFLOW_CHECK(t5, 7027, 4096, 176); \
+    ta += (t5*7027 + 4096) >> 13; \
+    /* 9281/16384 ~= Tan[21*Pi/128] ~= 0.566493002730344 */ \
+    OD_DCT_OVERFLOW_CHECK(ta, 9281, 8192, 177); \
+    t5 -= (ta*9281 + 8192) >> 14; \
+    /* 11539/16384 ~= Tan[25*Pi/128] ~= 0.704279460865044 */ \
+    OD_DCT_OVERFLOW_CHECK(tc, 11539, 8192, 178); \
+    t3 -= (tc*11539 + 8192) >> 14; \
+    /* 7713/8192 ~= Sin[25*Pi/64] ~= 0.941544065183021 */ \
+    OD_DCT_OVERFLOW_CHECK(t3, 7713, 4096, 179); \
+    tc += (t3*7713 + 4096) >> 13; \
+    /* 11539/16384 ~= Tan[25*Pi/128] ~= 0.704279460865044 */ \
+    OD_DCT_OVERFLOW_CHECK(tc, 11539, 8192, 180); \
+    t3 -= (tc*11539 + 8192) >> 14; \
+    /* 10375/16384 ~= Tan[23*Pi/128] ~= 0.633243016177569 */ \
+    OD_DCT_OVERFLOW_CHECK(tb, 10375, 8192, 181); \
+    t4 -= (tb*10375 + 8192) >> 14; \
+    /* 7405/8192 ~= Sin[23*Pi/64] ~= 0.903989293123443 */ \
+    OD_DCT_OVERFLOW_CHECK(t4, 7405, 4096, 182); \
+    tb += (t4*7405 + 4096) >> 13; \
+    /* 10375/16384 ~= Tan[23*Pi/128] ~= 0.633243016177569 */ \
+    OD_DCT_OVERFLOW_CHECK(tb, 10375, 8192, 183); \
+    t4 -= (tb*10375 + 8192) >> 14; \
+    /* 8247/16384 ~= Tan[19*Pi/128] ~= 0.503357699799294 */ \
+    OD_DCT_OVERFLOW_CHECK(t9, 8247, 8192, 184); \
+    t6 -= (t9*8247 + 8192) >> 14; \
+    /* 1645/2048 ~= Sin[19*Pi/64] ~= 0.803207531480645 */ \
+    OD_DCT_OVERFLOW_CHECK(t6, 1645, 1024, 185); \
+    t9 += (t6*1645 + 1024) >> 11; \
+    /* 8247/16384 ~= Tan[19*Pi/128] ~= 0.503357699799294 */ \
+    OD_DCT_OVERFLOW_CHECK(t9, 8247, 8192, 186); \
+    t6 -= (t9*8247 + 8192) >> 14; \
+  } \
+  while (0)
+
+#define OD_IDST_16_ASYM(t0, t0h, t8, t4, tc, t2, t2h, ta, t6, te, teh, \
+  t1, t9, t5, td, t3, tb, t7, tf) \
+  /* Embedded 16-point asymmetric Type-IV iDST. */ \
+  do { \
+    int t1h_; \
+    int t3h_; \
+    int t4h; \
+    int t6h; \
+    int t9h_; \
+    int tbh_; \
+    int tch; \
+    /* 8247/16384 ~= Tan[19*Pi/128] ~= 0.503357699799294 */ \
+    t6 += (t9*8247 + 8192) >> 14; \
+    /* 1645/2048 ~= Sin[19*Pi/64] ~= 0.803207531480645 */ \
+    t9 -= (t6*1645 + 1024) >> 11; \
+    /* 8247/16384 ~= Tan[19*Pi/128] ~= 0.503357699799294 */ \
+    t6 += (t9*8247 + 8192) >> 14; \
+    /* 10375/16384 ~= Tan[23*Pi/128] ~= 0.633243016177569 */ \
+    t2 += (td*10375 + 8192) >> 14; \
+    /* 7405/8192 ~= Sin[23*Pi/64] ~= 0.903989293123443 */ \
+    td -= (t2*7405 + 4096) >> 13; \
+    /* 10375/16384 ~= Tan[23*Pi/128] ~= 0.633243016177569 */ \
+    t2 += (td*10375 + 8192) >> 14; \
+    /* 11539/16384 ~= Tan[25*Pi/128] ~= 0.704279460865044 */ \
+    tc += (t3*11539 + 8192) >> 14; \
+    /* 7713/8192 ~= Sin[25*Pi/64] ~= 0.941544065183021 */ \
+    t3 -= (tc*7713 + 4096) >> 13; \
+    /* 11539/16384 ~= Tan[25*Pi/128] ~= 0.704279460865044 */ \
+    tc += (t3*11539 + 8192) >> 14; \
+    /* 9281/16384 ~= Tan[21*Pi/128] ~= 0.566493002730344 */ \
+    ta += (t5*9281 + 8192) >> 14; \
+    /* 7027/8192 ~= Sin[21*Pi/64] ~= 0.857728610000272 */ \
+    t5 -= (ta*7027 + 4096) >> 13; \
+    /* 9281/16384 ~= Tan[21*Pi/128] ~= 0.566493002730344 */ \
+    ta += (t5*9281 + 8192) >> 14; \
+    /* 6393/8192 ~= Tan[27*Pi/128] ~= 0.780407659653944 */ \
+    t4 += (tb*6393 + 4096) >> 13; \
+    /* 3973/4096 ~= Sin[27*Pi/64] ~= 0.970031253194544 */ \
+    tb -= (t4*3973 + 2048) >> 12; \
+    /* 6393/8192 ~= Tan[27*Pi/128] ~= 0.780407659653944 */ \
+    t4 += (tb*6393 + 4096) >> 13; \
+    /* 7263/16384 ~= Tan[17*Pi/128] ~= 0.443269513890864 */ \
+    te += (t1*7263 + 8192) >> 14; \
+    /* 3035/4096 ~= Sin[17*Pi/64] ~= 0.740951125354959 */ \
+    t1 -= (te*3035 + 2048) >> 12; \
+    /* 14525/32768 ~= Tan[17*Pi/128] ~= 0.443269513890864 */ \
+    te += (t1*14525 + 16384) >> 15; \
+    /* 2417/32768 ~= Tan[3*Pi/128] ~= 0.0737644315224493 */ \
+    t8 -= (t7*2417 + 16384) >> 15; \
+    /* 601/4096 ~= Sin[3*Pi/64] ~= 0.146730474455362 */ \
+    t7 += (t8*601 + 2048) >> 12; \
+    /* 2417/32768 ~= Tan[3*Pi/128] ~= 0.0737644315224493 */ \
+    t8 -= (t7*2417 + 16384) >> 15; \
+    /* 7799/8192 ~= Tan[31*Pi/128] ~= 0.952079146700925 */ \
+    t0 += (tf*7799 + 4096) >> 13; \
+    /* 4091/4096 ~= Sin[31*Pi/64] ~= 0.998795456205172 */ \
+    tf -= (t0*4091 + 2048) >> 12; \
+    /* 7799/8192 ~= Tan[31*Pi/128] ~= 0.952079146700925 */ \
+    t0 += (tf*7799 + 4096) >> 13; \
+    /* TODO: Can we move these into another operation */ \
+    t1 = -t1; \
+    t3 = -t3; \
+    t5 = -t5; \
+    t9 = -t9; \
+    tb = -tb; \
+    td = -td; \
+    tf = -tf; \
+    t4 += ta; \
+    t4h = OD_DCT_RSHIFT(t4, 1); \
+    ta = t4h - ta; \
+    tb -= t5; \
+    tbh_ = OD_DCT_RSHIFT(tb, 1); \
+    t5 += tbh_; \
+    tc += t2; \
+    tch = OD_DCT_RSHIFT(tc, 1); \
+    t2 -= tch; \
+    t3 -= td; \
+    t3h_ = OD_DCT_RSHIFT(t3, 1); \
+    td += t3h_; \
+    t9 += t8; \
+    t9h_ = OD_DCT_RSHIFT(t9, 1); \
+    t8 -= t9h_; \
+    t6 -= t7; \
+    t6h = OD_DCT_RSHIFT(t6, 1); \
+    t7 += t6h; \
+    t1 += tf; \
+    t1h_ = OD_DCT_RSHIFT(t1, 1); \
+    tf -= t1h_; \
+    te -= t0; \
+    teh = OD_DCT_RSHIFT(te, 1); \
+    t0 += teh; \
+    ta += t9h_; \
+    t9 = ta - t9; \
+    t5 -= t6h; \
+    t6 += t5; \
+    td = teh - td; \
+    te = td - te; \
+    t2 = t1h_ - t2; \
+    t1 -= t2; \
+    t7 += t4h; \
+    t4 -= t7; \
+    t8 -= tbh_; \
+    tb += t8; \
+    t0 += tch; \
+    tc -= t0; \
+    tf -= t3h_; \
+    t3 += tf; \
+    /* TODO: Can we move this into another operation */ \
+    ta = -ta; \
+    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \
+    td += (t2*6723 + 4096) >> 13; \
+    /* 16069/16384 ~= Sin[7*Pi/16] ~= 0.980785280403230 */ \
+    t2 -= (td*16069 + 8192) >> 14; \
+    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \
+    td += (t2*6723 + 4096) >> 13; \
+    /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \
+    t5 -= (ta*2485 + 4096) >> 13; \
+    /* 18205/32768 ~= Sin[3*Pi/16] ~= 0.555570233019602 */ \
+    ta += (t5*18205 + 16384) >> 15; \
+    /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \
+    t5 -= (ta*2485 + 4096) >> 13; \
+    t2 += t5; \
+    t2h = OD_DCT_RSHIFT(t2, 1); \
+    t5 -= t2h; \
+    ta = td - ta; \
+    td -= OD_DCT_RSHIFT(ta, 1); \
+    /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
+    ta -= (t5*13573 + 8192) >> 14; \
+    /* 11585/32768 ~= Sin[Pi/4]/2 ~= 0.353553390593274 */ \
+    t5 += (ta*11585 + 16384) >> 15; \
+    /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
+    ta -= (t5*13573 + 8192) >> 14; \
+    /* 17515/32768 ~= Tan[5*Pi/32] ~= 0.534511135950792 */ \
+    t9 -= (t6*17515 + 16384) >> 15; \
+    /* 13623/16384 ~= Sin[5*Pi/16] ~= 0.831469612302545 */ \
+    t6 += (t9*13623 + 8192) >> 14; \
+    /* 17515/32768 ~= Tan[5*Pi/32]) ~= 0.534511135950792 */ \
+    t9 -= (t6*17515 + 16384) >> 15; \
+    /* 6723/8192 ~= Tan[7*Pi/32]) ~= 0.820678790828660 */ \
+    t1 -= (te*6723 + 4096) >> 13; \
+    /* 16069/16384 ~= Sin[7*Pi/16] ~= 0.980785280403230 */ \
+    te += (t1*16069 + 8192) >> 14; \
+    /* 6723/8192 ~= Tan[7*Pi/32]) ~= 0.820678790828660 */ \
+    t1 -= (te*6723 + 4096) >> 13; \
+    te += t6; \
+    teh = OD_DCT_RSHIFT(te, 1); \
+    t6 = teh - t6; \
+    t9 += t1; \
+    t1 -= OD_DCT_RSHIFT(t9, 1); \
+    /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
+    t9 -= (t6*19195 + 16384) >> 15; \
+    /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
+    t6 -= (t9*11585 + 8192) >> 14; \
+    /* 7489/8192 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
+    t9 += (t6*7489 + 4096) >> 13; \
+    tb = tc - tb; \
+    tc = OD_DCT_RSHIFT(tb, 1) - tc; \
+    t3 += t4; \
+    t4 = OD_DCT_RSHIFT(t3, 1) - t4; \
+    /* TODO: Can we move this into another operation */ \
+    t3 = -t3; \
+    t8 += tf; \
+    tf = OD_DCT_RSHIFT(t8, 1) - tf; \
+    t0 += t7; \
+    t0h = OD_DCT_RSHIFT(t0, 1); \
+    t7 = t0h - t7; \
+    /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
+    t3 += (tc*4161 + 8192) >> 14; \
+    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
+    tc -= (t3*15137 + 8192) >> 14; \
+    /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
+    t3 += (tc*14341 + 8192) >> 14; \
+    /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
+    t4 -= (tb*14341 + 8192) >> 14; \
+    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
+    tb += (t4*15137 + 8192) >> 14; \
+    /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
+    t4 -= (tb*4161 + 8192) >> 14; \
+    /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
+    t8 += (t7*13573 + 8192) >> 14; \
+    /* 11585/32768 ~= Sin[Pi/4]/2 ~= 0.353553390593274 */ \
+    t7 -= (t8*11585 + 16384) >> 15; \
+    /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
+    t8 += (t7*13573 + 8192) >> 14; \
+    /* TODO: Can we move these into another operation */ \
+    t1 = -t1; \
+    t5 = -t5; \
+    t9 = -t9; \
+    tb = -tb; \
+    td = -td; \
+  } \
+  while (0)
+
+#define OD_FDCT_32(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, t6, tm, \
+  te, tu, t1, th, t9, tp, t5, tl, td, tt, t3, tj, tb, tr, t7, tn, tf, tv) \
+  /* Embedded 32-point orthonormal Type-II fDCT. */ \
+  do { \
+    int tgh; \
+    int thh; \
+    int tih; \
+    int tkh; \
+    int tmh; \
+    int tnh; \
+    int toh; \
+    int tqh; \
+    int tsh; \
+    int tuh; \
+    int tvh; \
+    tv = t0 - tv; \
+    tvh = OD_DCT_RSHIFT(tv, 1); \
+    t0 -= tvh; \
+    tu += t1; \
+    tuh = OD_DCT_RSHIFT(tu, 1); \
+    t1 = tuh - t1; \
+    tt = t2 - tt; \
+    t2 -= OD_DCT_RSHIFT(tt, 1); \
+    ts += t3; \
+    tsh = OD_DCT_RSHIFT(ts, 1); \
+    t3 = tsh - t3; \
+    tr = t4 - tr; \
+    t4 -= OD_DCT_RSHIFT(tr, 1); \
+    tq += t5; \
+    tqh = OD_DCT_RSHIFT(tq, 1); \
+    t5 = tqh - t5; \
+    tp = t6 - tp; \
+    t6 -= OD_DCT_RSHIFT(tp, 1); \
+    to += t7; \
+    toh = OD_DCT_RSHIFT(to, 1); \
+    t7 = toh - t7; \
+    tn = t8 - tn; \
+    tnh = OD_DCT_RSHIFT(tn, 1); \
+    t8 -= tnh; \
+    tm += t9; \
+    tmh = OD_DCT_RSHIFT(tm, 1); \
+    t9 = tmh - t9; \
+    tl = ta - tl; \
+    ta -= OD_DCT_RSHIFT(tl, 1); \
+    tk += tb; \
+    tkh = OD_DCT_RSHIFT(tk, 1); \
+    tb = tkh - tb; \
+    tj = tc - tj; \
+    tc -= OD_DCT_RSHIFT(tj, 1); \
+    ti += td; \
+    tih = OD_DCT_RSHIFT(ti, 1); \
+    td = tih - td; \
+    th = te - th; \
+    thh = OD_DCT_RSHIFT(th, 1); \
+    te -= thh; \
+    tg += tf; \
+    tgh = OD_DCT_RSHIFT(tg, 1); \
+    tf = tgh - tf; \
+    OD_FDCT_16_ASYM(t0, tg, tgh, t8, to, toh, t4, tk, tkh, tc, ts, tsh, \
+     t2, ti, tih, ta, tq, tqh, t6, tm, tmh, te, tu, tuh); \
+    OD_FDST_16_ASYM(tv, tvh, tf, tn, tnh, t7, tr, tb, tj, t3, \
+     tt, td, tl, t5, tp, t9, th, thh, t1); \
+  } \
+  while (0)
+
+#define OD_IDCT_32(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, t6, tm, \
+  te, tu, t1, th, t9, tp, t5, tl, td, tt, t3, tj, tb, tr, t7, tn, tf, tv) \
+  /* Embedded 32-point orthonormal Type-II iDCT. */ \
+  do { \
+    int t1h; \
+    int t3h; \
+    int t5h; \
+    int t7h; \
+    int t9h; \
+    int tbh; \
+    int tdh; \
+    int tfh; \
+    int thh; \
+    int tth; \
+    int tvh; \
+    OD_IDST_16_ASYM(tv, tvh, tn, tr, tj, tt, tth, tl, tp, th, thh, \
+     tu, tm, tq, ti, ts, tk, to, tg); \
+    OD_IDCT_16_ASYM(t0, t8, t4, tc, t2, ta, t6, te, \
+     t1, t1h, t9, t9h, t5, t5h, td, tdh, t3, t3h, tb, tbh, t7, t7h, tf, tfh); \
+    tu = t1h - tu; \
+    t1 -= tu; \
+    te += thh; \
+    th = te - th; \
+    tm = t9h - tm; \
+    t9 -= tm; \
+    t6 += OD_DCT_RSHIFT(tp, 1); \
+    tp = t6 - tp; \
+    tq = t5h - tq; \
+    t5 -= tq; \
+    ta += OD_DCT_RSHIFT(tl, 1); \
+    tl = ta - tl; \
+    ti = tdh - ti; \
+    td -= ti; \
+    t2 += tth; \
+    tt = t2 - tt; \
+    ts = t3h - ts; \
+    t3 -= ts; \
+    tc += OD_DCT_RSHIFT(tj, 1); \
+    tj = tc - tj; \
+    tk = tbh - tk; \
+    tb -= tk; \
+    t4 += OD_DCT_RSHIFT(tr, 1); \
+    tr = t4 - tr; \
+    to = t7h - to; \
+    t7 -= to; \
+    t8 += OD_DCT_RSHIFT(tn, 1); \
+    tn = t8 - tn; \
+    tg = tfh - tg; \
+    tf -= tg; \
+    t0 += tvh; \
+    tv = t0 - tv; \
+  } \
+  while (0)
+
 void od_bin_fdct4(od_coeff y[4], const od_coeff *x, int xstride) {
   int q0;
   int q1;
@@ -1527,3 +2139,206 @@
   x[14*xstride] = (od_coeff)s1;
   x[15*xstride] = (od_coeff)s0;
 }
+
+void od_bin_fdct32(od_coeff y[32], const od_coeff *x, int xstride) {
+  /*215 adds, 38 shifts, 87 "muls".*/
+  int t0;
+  int t1;
+  int t2;
+  int t3;
+  int t4;
+  int t5;
+  int t6;
+  int t7;
+  int t8;
+  int t9;
+  int ta;
+  int tb;
+  int tc;
+  int td;
+  int te;
+  int tf;
+  int tg;
+  int th;
+  int ti;
+  int tj;
+  int tk;
+  int tl;
+  int tm;
+  int tn;
+  int to;
+  int tp;
+  int tq;
+  int tr;
+  int ts;
+  int tt;
+  int tu;
+  int tv;
+  t0 = x[0*xstride];
+  tg = x[1*xstride];
+  t8 = x[2*xstride];
+  to = x[3*xstride];
+  t4 = x[4*xstride];
+  tk = x[5*xstride];
+  tc = x[6*xstride];
+  ts = x[7*xstride];
+  t2 = x[8*xstride];
+  ti = x[9*xstride];
+  ta = x[10*xstride];
+  tq = x[11*xstride];
+  t6 = x[12*xstride];
+  tm = x[13*xstride];
+  te = x[14*xstride];
+  tu = x[15*xstride];
+  t1 = x[16*xstride];
+  th = x[17*xstride];
+  t9 = x[18*xstride];
+  tp = x[19*xstride];
+  t5 = x[20*xstride];
+  tl = x[21*xstride];
+  td = x[22*xstride];
+  tt = x[23*xstride];
+  t3 = x[24*xstride];
+  tj = x[25*xstride];
+  tb = x[26*xstride];
+  tr = x[27*xstride];
+  t7 = x[28*xstride];
+  tn = x[29*xstride];
+  tf = x[30*xstride];
+  tv = x[31*xstride];
+  OD_FDCT_32(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, t6, tm, te, tu,
+    t1, th, t9, tp, t5, tl, td, tt, t3, tj, tb, tr, t7, tn, tf, tv);
+  y[0] = (od_coeff)t0;
+  y[1] = (od_coeff)t1;
+  y[2] = (od_coeff)t2;
+  y[3] = (od_coeff)t3;
+  y[4] = (od_coeff)t4;
+  y[5] = (od_coeff)t5;
+  y[6] = (od_coeff)t6;
+  y[7] = (od_coeff)t7;
+  y[8] = (od_coeff)t8;
+  y[9] = (od_coeff)t9;
+  y[10] = (od_coeff)ta;
+  y[11] = (od_coeff)tb;
+  y[12] = (od_coeff)tc;
+  y[13] = (od_coeff)td;
+  y[14] = (od_coeff)te;
+  y[15] = (od_coeff)tf;
+  y[16] = (od_coeff)tg;
+  y[17] = (od_coeff)th;
+  y[18] = (od_coeff)ti;
+  y[19] = (od_coeff)tj;
+  y[20] = (od_coeff)tk;
+  y[21] = (od_coeff)tl;
+  y[22] = (od_coeff)tm;
+  y[23] = (od_coeff)tn;
+  y[24] = (od_coeff)to;
+  y[25] = (od_coeff)tp;
+  y[26] = (od_coeff)tq;
+  y[27] = (od_coeff)tr;
+  y[28] = (od_coeff)ts;
+  y[29] = (od_coeff)tt;
+  y[30] = (od_coeff)tu;
+  y[31] = (od_coeff)tv;
+}
+
+void od_bin_idct32(od_coeff *x, int xstride, const od_coeff y[32]) {
+  int t0;
+  int t1;
+  int t2;
+  int t3;
+  int t4;
+  int t5;
+  int t6;
+  int t7;
+  int t8;
+  int t9;
+  int ta;
+  int tb;
+  int tc;
+  int td;
+  int te;
+  int tf;
+  int tg;
+  int th;
+  int ti;
+  int tj;
+  int tk;
+  int tl;
+  int tm;
+  int tn;
+  int to;
+  int tp;
+  int tq;
+  int tr;
+  int ts;
+  int tt;
+  int tu;
+  int tv;
+  t0 = y[0];
+  tg = y[1];
+  t8 = y[2];
+  to = y[3];
+  t4 = y[4];
+  tk = y[5];
+  tc = y[6];
+  ts = y[7];
+  t2 = y[8];
+  ti = y[9];
+  ta = y[10];
+  tq = y[11];
+  t6 = y[12];
+  tm = y[13];
+  te = y[14];
+  tu = y[15];
+  t1 = y[16];
+  th = y[17];
+  t9 = y[18];
+  tp = y[19];
+  t5 = y[20];
+  tl = y[21];
+  td = y[22];
+  tt = y[23];
+  t3 = y[24];
+  tj = y[25];
+  tb = y[26];
+  tr = y[27];
+  t7 = y[28];
+  tn = y[29];
+  tf = y[30];
+  tv = y[31];
+  OD_IDCT_32(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, t6, tm, te, tu,
+    t1, th, t9, tp, t5, tl, td, tt, t3, tj, tb, tr, t7, tn, tf, tv);
+  x[0*xstride] = (od_coeff)t0;
+  x[1*xstride] = (od_coeff)t1;
+  x[2*xstride] = (od_coeff)t2;
+  x[3*xstride] = (od_coeff)t3;
+  x[4*xstride] = (od_coeff)t4;
+  x[5*xstride] = (od_coeff)t5;
+  x[6*xstride] = (od_coeff)t6;
+  x[7*xstride] = (od_coeff)t7;
+  x[8*xstride] = (od_coeff)t8;
+  x[9*xstride] = (od_coeff)t9;
+  x[10*xstride] = (od_coeff)ta;
+  x[11*xstride] = (od_coeff)tb;
+  x[12*xstride] = (od_coeff)tc;
+  x[13*xstride] = (od_coeff)td;
+  x[14*xstride] = (od_coeff)te;
+  x[15*xstride] = (od_coeff)tf;
+  x[16*xstride] = (od_coeff)tg;
+  x[17*xstride] = (od_coeff)th;
+  x[18*xstride] = (od_coeff)ti;
+  x[19*xstride] = (od_coeff)tj;
+  x[20*xstride] = (od_coeff)tk;
+  x[21*xstride] = (od_coeff)tl;
+  x[22*xstride] = (od_coeff)tm;
+  x[23*xstride] = (od_coeff)tn;
+  x[24*xstride] = (od_coeff)to;
+  x[25*xstride] = (od_coeff)tp;
+  x[26*xstride] = (od_coeff)tq;
+  x[27*xstride] = (od_coeff)tr;
+  x[28*xstride] = (od_coeff)ts;
+  x[29*xstride] = (od_coeff)tt;
+  x[30*xstride] = (od_coeff)tu;
+  x[31*xstride] = (od_coeff)tv;
+}
diff --git a/av1/common/daala_tx.h b/av1/common/daala_tx.h
index b38c9b1..b0f24a1 100644
--- a/av1/common/daala_tx.h
+++ b/av1/common/daala_tx.h
@@ -13,5 +13,7 @@
 void od_bin_idct16(od_coeff *x, int xstride, const od_coeff y[16]);
 void od_bin_fdst16(od_coeff y[16], const od_coeff *x, int xstride);
 void od_bin_idst16(od_coeff *x, int xstride, const od_coeff y[16]);
+void od_bin_fdct32(od_coeff y[32], const od_coeff *x, int xstride);
+void od_bin_idct32(od_coeff *x, int xstride, const od_coeff y[32]);
 
 #endif
diff --git a/av1/common/idct.c b/av1/common/idct.c
index 2f4df99..f5f6593 100644
--- a/av1/common/idct.c
+++ b/av1/common/idct.c
@@ -69,7 +69,13 @@
 
 static void iidtx32_c(const tran_low_t *input, tran_low_t *output) {
   int i;
-  for (i = 0; i < 32; ++i) output[i] = input[i] * 4;
+  for (i = 0; i < 32; ++i) {
+#if CONFIG_DAALA_DCT32
+    output[i] = input[i];
+#else
+    output[i] = input[i] * 4;
+#endif
+  }
 }
 
 #if CONFIG_TX64X64
@@ -82,6 +88,20 @@
 #endif  // CONFIG_EXT_TX
 
 // For use in lieu of ADST
+#if CONFIG_DAALA_DCT32
+static void ihalfright32_c(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  tran_low_t inputhalf[16];
+  // No scaling within; Daala transforms are all orthonormal
+  for (i = 0; i < 16; ++i) {
+    inputhalf[i] = input[i];
+  }
+  for (i = 0; i < 16; ++i) {
+    output[i] = input[16 + i];
+  }
+  aom_idct16_c(inputhalf, output + 16);
+}
+#else
 static void ihalfright32_c(const tran_low_t *input, tran_low_t *output) {
   int i;
   tran_low_t inputhalf[16];
@@ -95,6 +115,7 @@
   aom_idct16_c(inputhalf, output + 16);
   // Note overall scaling factor is 4 times orthogonal
 }
+#endif
 
 #if CONFIG_TX64X64
 static void idct64_col_c(const tran_low_t *input, tran_low_t *output) {
@@ -1279,7 +1300,7 @@
   }
 }
 
-#if CONFIG_EXT_TX
+#if CONFIG_EXT_TX || CONFIG_DAALA_DCT32
 void av1_iht32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                              const TxfmParam *txfm_param) {
   int tx_type = txfm_param->tx_type;
@@ -1287,7 +1308,8 @@
   assert(tx_type == DCT_DCT);
 #endif
   static const transform_2d IHT_32[] = {
-    { aom_idct32_c, aom_idct32_c },      // DCT_DCT
+    { aom_idct32_c, aom_idct32_c },  // DCT_DCT
+#if CONFIG_EXT_TX
     { ihalfright32_c, aom_idct32_c },    // ADST_DCT
     { aom_idct32_c, ihalfright32_c },    // DCT_ADST
     { ihalfright32_c, ihalfright32_c },  // ADST_ADST
@@ -1303,6 +1325,7 @@
     { iidtx32_c, ihalfright32_c },       // H_ADST
     { ihalfright32_c, iidtx32_c },       // V_FLIPADST
     { iidtx32_c, ihalfright32_c },       // H_FLIPADST
+#endif
   };
 
   int i, j;
@@ -1313,14 +1336,24 @@
 
   // inverse transform row vectors
   for (i = 0; i < 32; ++i) {
+#if CONFIG_DAALA_DCT32
+    tran_low_t temp_in[32];
+    for (j = 0; j < 32; j++) temp_in[j] = input[j] * 2;
+    IHT_32[tx_type].rows(temp_in, out[i]);
+#else
     IHT_32[tx_type].rows(input, out[i]);
+#endif
     input += 32;
   }
 
   // transpose
   for (i = 0; i < 32; i++) {
     for (j = 0; j < 32; j++) {
+#if CONFIG_DAALA_DCT32
+      tmp[j][i] = out[i][j] * 4;
+#else
       tmp[j][i] = out[i][j];
+#endif
     }
   }
 
@@ -1334,11 +1367,15 @@
     for (j = 0; j < 32; ++j) {
       int d = i * stride + j;
       int s = j * outstride + i;
+#if CONFIG_DAALA_DCT32
+      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
+#else
       dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
+#endif
     }
   }
 }
-#endif  // CONFIG_EXT_TX
+#endif  // CONFIG_EXT_TX || CONFIG_DAALA_DCT32
 
 #if CONFIG_TX64X64
 void av1_iht64x64_4096_add_c(const tran_low_t *input, uint8_t *dest, int stride,
@@ -1513,6 +1550,7 @@
 }
 #endif  // CONFIG_MRC_TX
 
+#if !CONFIG_DAALA_DCT32
 static void idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
                           const TxfmParam *txfm_param) {
 #if CONFIG_ADAPT_SCAN
@@ -1535,6 +1573,7 @@
   else
     aom_idct32x32_1024_add(input, dest, stride);
 }
+#endif
 
 #if CONFIG_TX64X64
 static void idct64x64_add(const tran_low_t *input, uint8_t *dest, int stride,
@@ -1798,7 +1837,13 @@
                                int stride, const TxfmParam *txfm_param) {
   const TX_TYPE tx_type = txfm_param->tx_type;
   switch (tx_type) {
+#if !CONFIG_DAALA_DCT32
     case DCT_DCT: idct32x32_add(input, dest, stride, txfm_param); break;
+#else
+    case DCT_DCT:
+      av1_iht32x32_1024_add_c(input, dest, stride, txfm_param);
+      break;
+#endif
 #if CONFIG_EXT_TX
     case ADST_DCT:
     case DCT_ADST:
diff --git a/av1/encoder/dct.c b/av1/encoder/dct.c
index af9e582..bc5d894 100644
--- a/av1/encoder/dct.c
+++ b/av1/encoder/dct.c
@@ -21,7 +21,8 @@
 #include "av1/common/av1_fwd_txfm1d.h"
 #include "av1/common/av1_fwd_txfm1d_cfg.h"
 #include "av1/common/idct.h"
-#if CONFIG_DAALA_DCT4 || CONFIG_DAALA_DCT8 || CONFIG_DAALA_DCT16
+#if CONFIG_DAALA_DCT4 || CONFIG_DAALA_DCT8 || CONFIG_DAALA_DCT16 || \
+    CONFIG_DAALA_DCT32
 #include "av1/common/daala_tx.h"
 #endif
 
@@ -369,6 +370,18 @@
 }
 #endif
 
+#if CONFIG_DAALA_DCT32
+static void fdct32(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  od_coeff x[32];
+  od_coeff y[32];
+  for (i = 0; i < 32; i++) x[i] = (od_coeff)input[i];
+  od_bin_fdct32(y, x, 1);
+  for (i = 0; i < 32; i++) output[i] = (tran_low_t)y[i];
+}
+
+#else
+
 static void fdct32(const tran_low_t *input, tran_low_t *output) {
   tran_high_t temp;
   tran_low_t step[32];
@@ -766,6 +779,7 @@
 
   range_check(output, 32, 18);
 }
+#endif
 
 #ifndef AV1_DCT_GTEST
 
@@ -1075,6 +1089,20 @@
 #endif
 
 // For use in lieu of ADST
+#if CONFIG_DAALA_DCT32
+static void fhalfright32(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  tran_low_t inputhalf[16];
+  // No scaling within; Daala transforms are all orthonormal
+  for (i = 0; i < 16; ++i) {
+    output[16 + i] = input[i];
+  }
+  for (i = 0; i < 16; ++i) {
+    inputhalf[i] = input[i + 16];
+  }
+  fdct16(inputhalf, output);
+}
+#else
 static void fhalfright32(const tran_low_t *input, tran_low_t *output) {
   int i;
   tran_low_t inputhalf[16];
@@ -1088,6 +1116,7 @@
   fdct16(inputhalf, output);
   // Note overall scaling factor is 4 times orthogonal
 }
+#endif
 
 #if CONFIG_MRC_TX
 static void get_masked_residual32(const int16_t **input, int *input_stride,
@@ -1214,7 +1243,13 @@
 
 static void fidtx32(const tran_low_t *input, tran_low_t *output) {
   int i;
-  for (i = 0; i < 32; ++i) output[i] = input[i] * 4;
+  for (i = 0; i < 32; ++i) {
+#if CONFIG_DAALA_DCT32
+    output[i] = input[i];
+#else
+    output[i] = input[i] * 4;
+#endif
+  }
 }
 
 static void copy_block(const int16_t *src, int src_stride, int l, int w,
@@ -2467,17 +2502,30 @@
 
   // Columns
   for (i = 0; i < 32; ++i) {
-    for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
+    for (j = 0; j < 32; ++j) {
+#if CONFIG_DAALA_DCT32
+      temp_in[j] = input[j * stride + i] * 16;
+#else
+      temp_in[j] = input[j * stride + i] * 4;
+#endif
+    }
     ht.cols(temp_in, temp_out);
-    for (j = 0; j < 32; ++j)
+    for (j = 0; j < 32; ++j) {
+#if CONFIG_DAALA_DCT32
+      out[j * 32 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
+#else
       out[j * 32 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4);
+#endif
+    }
   }
 
   // Rows
   for (i = 0; i < 32; ++i) {
     for (j = 0; j < 32; ++j) temp_in[j] = out[j + i * 32];
     ht.rows(temp_in, temp_out);
-    for (j = 0; j < 32; ++j) output[j + i * 32] = temp_out[j];
+    for (j = 0; j < 32; ++j) {
+      output[j + i * 32] = temp_out[j];
+    }
   }
 }
 
diff --git a/build/cmake/aom_config_defaults.cmake b/build/cmake/aom_config_defaults.cmake
index 4d45a57..c11e51d 100644
--- a/build/cmake/aom_config_defaults.cmake
+++ b/build/cmake/aom_config_defaults.cmake
@@ -126,6 +126,7 @@
 set(CONFIG_CONVOLVE_ROUND 1 CACHE NUMBER "AV1 experiment flag.")
 set(CONFIG_CDEF 0 CACHE NUMBER "AV1 experiment flag.")
 set(CONFIG_DAALA_DCT16 0 CACHE NUMBER "AV1 experiment flag.")
+set(CONFIG_DAALA_DCT32 0 CACHE NUMBER "AV1 experiment flag.")
 set(CONFIG_DAALA_DCT4 0 CACHE NUMBER "AV1 experiment flag.")
 set(CONFIG_DAALA_DCT8 0 CACHE NUMBER "AV1 experiment flag.")
 set(CONFIG_DAALA_DIST 0 CACHE NUMBER "AV1 experiment flag.")
diff --git a/build/cmake/aom_configure.cmake b/build/cmake/aom_configure.cmake
index f7c18d0..8c9993b 100644
--- a/build/cmake/aom_configure.cmake
+++ b/build/cmake/aom_configure.cmake
@@ -247,7 +247,8 @@
   change_config_and_warn(CONFIG_DCT_ONLY 1 CONFIG_DAALA_DCT4)
 endif()
 
-if (CONFIG_DAALA_DCT4 OR CONFIG_DAALA_DCT8 OR CONFIG_DAALA_DCT16)
+if (CONFIG_DAALA_DCT4 OR CONFIG_DAALA_DCT8 OR CONFIG_DAALA_DCT16 OR
+    CONFIG_DAALA_DCT32)
   if (HAVE_MMX)
     change_config_and_warn(HAVE_MMX 0 CONFIG_DAALA_DCTx)
   endif()
diff --git a/configure b/configure
index 2f2f130..1e98a56 100755
--- a/configure
+++ b/configure
@@ -296,6 +296,7 @@
     daala_dct4
     daala_dct8
     daala_dct16
+    daala_dct32
     cb4x4
     chroma_2x2
     chroma_sub8x8
@@ -579,7 +580,10 @@
     if enabled daala_dct4; then
       enable_feature dct_only
     fi
-    if enabled daala_dct4 || enabled daala_dct8 || enabled daala_dct16; then
+    if enabled daala_dct4 ||
+	    enabled daala_dct8 ||
+	    enabled daala_dct16 ||
+	    enabled daala_dct32; then
       disable_feature mmx
       disable_feature rect_tx
       disable_feature var_tx
diff --git a/test/av1_dct_test.cc b/test/av1_dct_test.cc
index 39ffb0f..fdaf9ab 100644
--- a/test/av1_dct_test.cc
+++ b/test/av1_dct_test.cc
@@ -23,7 +23,8 @@
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 1
 #define AV1_DCT_GTEST
 #include "av1/encoder/dct.c"
-#if CONFIG_DAALA_DCT4 || CONFIG_DAALA_DCT8 || CONFIG_DAALA_DCT16
+#if CONFIG_DAALA_DCT4 || CONFIG_DAALA_DCT8 || CONFIG_DAALA_DCT16 || \
+    CONFIG_DAALA_DCT32
 #include "av1/common/daala_tx.c"
 #endif