Add CONFIG_DAALA_DCT64 experiment.

This experiment replaces the 64-point Type-II DCT and related
scaling vp9 transforms with the 64-point orthonormal
Daala transforms.

subset-1:

    monty-square-baseline-s1-F2@2017-07-28T03:35:45.962Z ->
      monty-square-dct64-s1-F2@2017-07-29T04:50:58.412Z

       PSNR | PSNR Cb | PSNR Cr | PSNR HVS |    SSIM | MS SSIM | CIEDE 2000
    -0.1930 | -0.2037 | -0.0643 |  -0.1917 | -0.2331 | -0.3510 |    -0.1810

objective-1-fast:

    monty-square-baseline-o1f-F2@2017-07-28T03:35:35.533Z ->
      monty-square-dct64-o1f-F2@2017-07-29T04:50:28.542Z

       PSNR | PSNR Cb | PSNR Cr | PSNR HVS |    SSIM | MS SSIM | CIEDE 2000
    -0.2557 | -0.1743 | -0.4900 |  -0.3028 | -0.4147 | -0.5764 |    -0.2864

Change-Id: I1f944df29e44d2e350c42555af274f2d75a62a92
diff --git a/aom_dsp/inv_txfm.c b/aom_dsp/inv_txfm.c
index fbf09db..0aa4672 100644
--- a/aom_dsp/inv_txfm.c
+++ b/aom_dsp/inv_txfm.c
@@ -15,7 +15,7 @@
 #include "./aom_dsp_rtcd.h"
 #include "aom_dsp/inv_txfm.h"
 #if CONFIG_DAALA_DCT4 || CONFIG_DAALA_DCT8 || CONFIG_DAALA_DCT16 || \
-    CONFIG_DAALA_DCT32
+    CONFIG_DAALA_DCT32 || CONFIG_DAALA_DCT64
 #include "av1/common/daala_tx.h"
 #endif
 
@@ -1469,6 +1469,17 @@
   }
 }
 
+#if CONFIG_TX64X64 && CONFIG_DAALA_DCT64
+void aom_idct64_c(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  od_coeff x[64];
+  od_coeff y[64];
+  for (i = 0; i < 64; i++) y[i] = (od_coeff)input[i];
+  od_bin_idct64(x, 1, y);
+  for (i = 0; i < 64; i++) output[i] = (tran_low_t)x[i];
+}
+#endif
+
 void aom_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
                                  int stride, int bd) {
   /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
diff --git a/aom_dsp/inv_txfm.h b/aom_dsp/inv_txfm.h
index a9c485e..dee7599 100644
--- a/aom_dsp/inv_txfm.h
+++ b/aom_dsp/inv_txfm.h
@@ -68,6 +68,9 @@
 void aom_idct8_c(const tran_low_t *input, tran_low_t *output);
 void aom_idct16_c(const tran_low_t *input, tran_low_t *output);
 void aom_idct32_c(const tran_low_t *input, tran_low_t *output);
+#if CONFIG_TX64X64 && CONFIG_DAALA_DCT64
+void aom_idct64_c(const tran_low_t *input, tran_low_t *output);
+#endif
 void aom_iadst4_c(const tran_low_t *input, tran_low_t *output);
 void aom_iadst8_c(const tran_low_t *input, tran_low_t *output);
 void aom_iadst16_c(const tran_low_t *input, tran_low_t *output);
diff --git a/av1/common/daala_tx.c b/av1/common/daala_tx.c
index 82c8af5..72e9ebc 100644
--- a/av1/common/daala_tx.c
+++ b/av1/common/daala_tx.c
@@ -1788,6 +1788,1336 @@
   } \
   while (0)
 
+#if CONFIG_TX64X64
+#define OD_FDCT_32_ASYM(t0, tg, tgh, t8, to, toh, t4, tk, tkh, tc, ts, tsh, \
+  t2, ti, tih, ta, tq, tqh, t6, tm, tmh, te, tu, tuh, t1, th, thh, \
+  t9, tp, tph, t5, tl, tlh, td, tt, tth, t3, tj, tjh, tb, tr, trh, \
+  t7, tn, tnh, tf, tv, tvh) \
+  /* Embedded 32-point asymmetric Type-II fDCT. */ \
+  do { \
+    t0 += tvh; \
+    tv = t0 - tv; \
+    t1 = tuh - t1; \
+    tu -= t1; \
+    t2 += tth; \
+    tt = t2 - tt; \
+    t3 = tsh - t3; \
+    ts -= t3; \
+    t4 += trh; \
+    tr = t4 - tr; \
+    t5 = tqh - t5; \
+    tq -= t5; \
+    t6 += tph; \
+    tp = t6 - tp; \
+    t7 = toh - t7; \
+    to -= t7; \
+    t8 += tnh; \
+    tn = t8 - tn; \
+    t9 = tmh - t9; \
+    tm -= t9; \
+    ta += tlh; \
+    tl = ta - tl; \
+    tb = tkh - tb; \
+    tk -= tb; \
+    tc += tjh; \
+    tj = tc - tj; \
+    td = tih - td; \
+    ti -= td; \
+    te += thh; \
+    th = te - th; \
+    tf = tgh - tf; \
+    tg -= tf; \
+    OD_FDCT_16(t0, tg, t8, to, t4, tk, tc, ts, \
+     t2, ti, ta, tq, t6, tm, te, tu); \
+    OD_FDST_16(tv, tf, tn, t7, tr, tb, tj, t3, \
+     tt, td, tl, t5, tp, t9, th, t1); \
+  } \
+  while (0)
+
+#define OD_IDCT_32_ASYM(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, \
+  t6, tm, te, tu, t1, t1h, th, thh, t9, t9h, tp, tph, t5, t5h, tl, tlh, \
+  td, tdh, tt, tth, t3, t3h, tj, tjh, tb, tbh, tr, trh, t7, t7h, tn, tnh, \
+  tf, tfh, tv, tvh) \
+  /* Embedded 32-point asymmetric Type-II iDCT. */ \
+  do { \
+    OD_IDST_16(tv, tn, tr, tj, tt, tl, tp, th, \
+     tu, tm, tq, ti, ts, tk, to, tg); \
+    OD_IDCT_16(t0, t8, t4, tc, t2, ta, t6, te, \
+     t1, t9, t5, td, t3, tb, t7, tf); \
+    tv = t0 - tv; \
+    tvh = OD_DCT_RSHIFT(tv, 1); \
+    t0 -= tvh; \
+    t1 += tu; \
+    t1h = OD_DCT_RSHIFT(t1, 1); \
+    tu = t1h - tu; \
+    tt = t2 - tt; \
+    tth = OD_DCT_RSHIFT(tt, 1); \
+    t2 -= tth; \
+    t3 += ts; \
+    t3h = OD_DCT_RSHIFT(t3, 1); \
+    ts = t3h - ts; \
+    tr = t4 - tr; \
+    trh = OD_DCT_RSHIFT(tr, 1); \
+    t4 -= trh; \
+    t5 += tq; \
+    t5h = OD_DCT_RSHIFT(t5, 1); \
+    tq = t5h - tq; \
+    tp = t6 - tp; \
+    tph = OD_DCT_RSHIFT(tp, 1); \
+    t6 -= tph; \
+    t7 += to; \
+    t7h = OD_DCT_RSHIFT(t7, 1); \
+    to = t7h - to; \
+    tn = t8 - tn; \
+    tnh = OD_DCT_RSHIFT(tn, 1); \
+    t8 -= tnh; \
+    t9 += tm; \
+    t9h = OD_DCT_RSHIFT(t9, 1); \
+    tm = t9h - tm; \
+    tl = ta - tl; \
+    tlh = OD_DCT_RSHIFT(tl, 1); \
+    ta -= tlh; \
+    tb += tk; \
+    tbh = OD_DCT_RSHIFT(tb, 1); \
+    tk = tbh - tk; \
+    tj = tc - tj; \
+    tjh = OD_DCT_RSHIFT(tj, 1); \
+    tc -= tjh; \
+    td += ti; \
+    tdh = OD_DCT_RSHIFT(td, 1); \
+    ti = tdh - ti; \
+    th = te - th; \
+    thh = OD_DCT_RSHIFT(th, 1); \
+    te -= thh; \
+    tf += tg; \
+    tfh = OD_DCT_RSHIFT(tf, 1); \
+    tg = tfh - tg; \
+  } \
+  while (0)
+
+#define OD_FDST_32_ASYM(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, t6, \
+  tm, te, tu, t1, th, t9, tp, t5, tl, td, tt, t3, tj, tb, tr, t7, tn, tf, tv) \
+  /* Embedded 32-point asymmetric Type-IV fDST. */ \
+  do { \
+    int t0h; \
+    int t1h; \
+    int t4h; \
+    int t5h; \
+    int tqh; \
+    int trh; \
+    int tuh; \
+    int tvh; \
+    \
+    tu = -tu; \
+    \
+    /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
+    OD_DCT_OVERFLOW_CHECK(tq, 13573, 8192, 271); \
+    t5 -= (tq*13573 + 8192) >> 14; \
+    /* 11585/32768 ~= Sin[Pi/4]/2 ~= 0.353553390593274 */ \
+    OD_DCT_OVERFLOW_CHECK(t5, 11585, 16384, 272); \
+    tq += (t5*11585 + 16384) >> 15; \
+    /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
+    OD_DCT_OVERFLOW_CHECK(tq, 13573, 8192, 273); \
+    t5 -= (tq*13573 + 8192) >> 14; \
+    /* 29957/32768 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
+    OD_DCT_OVERFLOW_CHECK(t6, 29957, 16384, 274); \
+    tp += (t6*29957 + 16384) >> 15; \
+    /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
+    OD_DCT_OVERFLOW_CHECK(tp, 11585, 8192, 275); \
+    t6 -= (tp*11585 + 8192) >> 14; \
+    /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
+    OD_DCT_OVERFLOW_CHECK(t6, 19195, 16384, 276); \
+    tp -= (t6*19195 + 16384) >> 15; \
+    /* 29957/32768 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
+    OD_DCT_OVERFLOW_CHECK(t1, 29957, 16384, 277); \
+    tu += (t1*29957 + 16384) >> 15; \
+    /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
+    OD_DCT_OVERFLOW_CHECK(tu, 11585, 8192, 278); \
+    t1 -= (tu*11585 + 8192) >> 14; \
+    /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
+    OD_DCT_OVERFLOW_CHECK(t1, 19195, 16384, 279); \
+    tu -= (t1*19195 + 16384) >> 15; \
+    /* 28681/32768 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
+    OD_DCT_OVERFLOW_CHECK(t2, 28681, 16384, 280); \
+    tt += (t2*28681 + 16384) >> 15; \
+    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
+    OD_DCT_OVERFLOW_CHECK(tt, 15137, 8192, 281); \
+    t2 -= (tt*15137 + 8192) >> 14; \
+    /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
+    OD_DCT_OVERFLOW_CHECK(t2, 4161, 8192, 282); \
+    tt += (t2*4161 + 8192) >> 14; \
+    /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
+    OD_DCT_OVERFLOW_CHECK(ts, 4161, 8192, 283); \
+    t3 += (ts*4161 + 8192) >> 14; \
+    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
+    OD_DCT_OVERFLOW_CHECK(t3, 15137, 8192, 284); \
+    ts -= (t3*15137 + 8192) >> 14; \
+    /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
+    OD_DCT_OVERFLOW_CHECK(ts, 14341, 8192, 285); \
+    t3 += (ts*14341 + 8192) >> 14; \
+    /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
+    OD_DCT_OVERFLOW_CHECK(tm, 19195, 16384, 286); \
+    t9 -= (tm*19195 + 16384) >> 15; \
+    /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
+    OD_DCT_OVERFLOW_CHECK(t9, 11585, 8192, 287); \
+    tm -= (t9*11585 + 8192) >> 14; \
+    /* 7489/8192 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
+    OD_DCT_OVERFLOW_CHECK(tm, 7489, 4096, 288); \
+    t9 += (tm*7489 + 4096) >> 13; \
+    /* 3259/8192 ~= 2*Tan[Pi/16] ~= 0.397824734759316 */ \
+    OD_DCT_OVERFLOW_CHECK(tl, 3259, 4096, 289); \
+    ta += (tl*3259 + 4096) >> 13; \
+    /* 3135/16384 ~= Sin[Pi/8]/2 ~= 0.1913417161825449 */ \
+    OD_DCT_OVERFLOW_CHECK(ta, 3135, 8192, 290); \
+    tl -= (ta*3135 + 8192) >> 14; \
+    /* 3259/8192 ~= 2*Tan[Pi/16] ~= 0.397824734759316 */ \
+    OD_DCT_OVERFLOW_CHECK(tl, 3259, 4096, 291); \
+    ta += (tl*3259 + 4096) >> 13; \
+    /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
+    OD_DCT_OVERFLOW_CHECK(tk, 4161, 8192, 292); \
+    tb += (tk*4161 + 8192) >> 14; \
+    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
+    OD_DCT_OVERFLOW_CHECK(tb, 15137, 8192, 293); \
+    tk -= (tb*15137 + 8192) >> 14; \
+    /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
+    OD_DCT_OVERFLOW_CHECK(tk, 14341, 8192, 294); \
+    tb += (tk*14341 + 8192) >> 14; \
+    /* 29957/32768 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
+    OD_DCT_OVERFLOW_CHECK(te, 29957, 16384, 295); \
+    th += (te*29957 + 16384) >> 15; \
+    /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
+    OD_DCT_OVERFLOW_CHECK(th, 11585, 8192, 296); \
+    te -= (th*11585 + 8192) >> 14; \
+    /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
+    OD_DCT_OVERFLOW_CHECK(te, 19195, 16384, 297); \
+    th -= (te*19195 + 16384) >> 15; \
+    /* 28681/32768 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
+    OD_DCT_OVERFLOW_CHECK(tc, 28681, 16384, 298); \
+    tj += (tc*28681 + 16384) >> 15; \
+    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
+    OD_DCT_OVERFLOW_CHECK(tj, 15137, 8192, 299); \
+    tc -= (tj*15137 + 8192) >> 14; \
+    /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
+    OD_DCT_OVERFLOW_CHECK(tc, 4161, 8192, 300); \
+    tj += (tc*4161 + 8192) >> 14; \
+    /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
+    OD_DCT_OVERFLOW_CHECK(ti, 4161, 8192, 301); \
+    td += (ti*4161 + 8192) >> 14; \
+    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
+    OD_DCT_OVERFLOW_CHECK(td, 15137, 8192, 302); \
+    ti -= (td*15137 + 8192) >> 14; \
+    /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
+    OD_DCT_OVERFLOW_CHECK(ti, 14341, 8192, 303); \
+    td += (ti*14341 + 8192) >> 14; \
+    \
+    t1 = -t1; \
+    t2 = -t2; \
+    t3 = -t3; \
+    td = -td; \
+    tg = -tg; \
+    to = -to; \
+    ts = -ts; \
+    \
+    tr -= OD_DCT_RSHIFT(t5, 1); \
+    t5 += tr; \
+    tq -= OD_DCT_RSHIFT(t4, 1); /* pass */ \
+    t4 += tq; \
+    t6 -= OD_DCT_RSHIFT(t7, 1); \
+    t7 += t6; \
+    to -= OD_DCT_RSHIFT(tp, 1); /* pass */ \
+    tp += to; \
+    t1 += OD_DCT_RSHIFT(t0, 1); /* pass */ \
+    t0 -= t1; \
+    tv -= OD_DCT_RSHIFT(tu, 1); \
+    tu += tv; \
+    t3 -= OD_DCT_RSHIFT(tt, 1); \
+    tt += t3; \
+    t2 += OD_DCT_RSHIFT(ts, 1); \
+    ts -= t2; \
+    t9 -= OD_DCT_RSHIFT(t8, 1); /* pass */ \
+    t8 += t9; \
+    tn += OD_DCT_RSHIFT(tm, 1); \
+    tm -= tn; \
+    tb += OD_DCT_RSHIFT(ta, 1); \
+    ta -= tb; \
+    tl -= OD_DCT_RSHIFT(tk, 1); \
+    tk += tl; \
+    te -= OD_DCT_RSHIFT(tf, 1); /* pass */ \
+    tf += te; \
+    tg -= OD_DCT_RSHIFT(th, 1); \
+    th += tg; \
+    tc -= OD_DCT_RSHIFT(ti, 1); \
+    ti += tc; \
+    td += OD_DCT_RSHIFT(tj, 1); \
+    tj -= td; \
+    \
+    t4 = -t4; \
+    \
+    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.8206787908286602 */ \
+    OD_DCT_OVERFLOW_CHECK(tr, 6723, 4096, 304); \
+    t4 += (tr*6723 + 4096) >> 13; \
+    /* 16069/16384 ~= Sin[7*Pi/16] ~= 0.9807852804032304 */ \
+    OD_DCT_OVERFLOW_CHECK(t4, 16069, 8192, 305); \
+    tr -= (t4*16069 + 8192) >> 14; \
+    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.8206787908286602 */ \
+    OD_DCT_OVERFLOW_CHECK(tr, 6723, 4096, 306); \
+    t4 += (tr*6723 + 4096) >> 13; \
+    /* 17515/32768 ~= Tan[5*Pi/32] ~= 0.5345111359507916 */ \
+    OD_DCT_OVERFLOW_CHECK(tq, 17515, 16384, 307); \
+    t5 += (tq*17515 + 16384) >> 15; \
+    /* 13623/16384 ~= Sin[5*Pi/16] ~= 0.8314696123025452 */ \
+    OD_DCT_OVERFLOW_CHECK(t5, 13623, 8192, 308); \
+    tq -= (t5*13623 + 8192) >> 14; \
+    /* 17515/32768 ~= Tan[5*Pi/32] ~= 0.5345111359507916 */ \
+    OD_DCT_OVERFLOW_CHECK(tq, 17515, 16384, 309); \
+    t5 += (tq*17515 + 16384) >> 15; \
+    /* 3227/32768 ~= Tan[Pi/32] ~= 0.09849140335716425 */ \
+    OD_DCT_OVERFLOW_CHECK(to, 3227, 16384, 310); \
+    t7 += (to*3227 + 16384) >> 15; \
+    /* 6393/32768 ~= Sin[Pi/16] ~= 0.19509032201612825 */ \
+    OD_DCT_OVERFLOW_CHECK(t7, 6393, 16384, 311); \
+    to -= (t7*6393 + 16384) >> 15; \
+    /* 3227/32768 ~= Tan[Pi/32] ~= 0.09849140335716425 */ \
+    OD_DCT_OVERFLOW_CHECK(to, 3227, 16384, 312); \
+    t7 += (to*3227 + 16384) >> 15; \
+    /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \
+    OD_DCT_OVERFLOW_CHECK(tp, 2485, 4096, 313); \
+    t6 += (tp*2485 + 4096) >> 13; \
+    /* 18205/32768 ~= Sin[3*Pi/16] ~= 0.555570233019602 */ \
+    OD_DCT_OVERFLOW_CHECK(t6, 18205, 16384, 314); \
+    tp -= (t6*18205 + 16384) >> 15; \
+    /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \
+    OD_DCT_OVERFLOW_CHECK(tp, 2485, 4096, 315); \
+    t6 += (tp*2485 + 4096) >> 13; \
+    \
+    t5 = -t5; \
+    \
+    tr += to; \
+    trh = OD_DCT_RSHIFT(tr, 1); \
+    to -= trh; \
+    t4 += t7; \
+    t4h = OD_DCT_RSHIFT(t4, 1); \
+    t7 -= t4h; \
+    t5 += tp; \
+    t5h = OD_DCT_RSHIFT(t5, 1); \
+    tp -= t5h; \
+    tq += t6; \
+    tqh = OD_DCT_RSHIFT(tq, 1); \
+    t6 -= tqh; \
+    t0 -= t3; \
+    t0h = OD_DCT_RSHIFT(t0, 1); \
+    t3 += t0h; \
+    tv -= ts; \
+    tvh = OD_DCT_RSHIFT(tv, 1); \
+    ts += tvh; \
+    tu += tt; \
+    tuh = OD_DCT_RSHIFT(tu, 1); \
+    tt -= tuh; \
+    t1 -= t2; \
+    t1h = OD_DCT_RSHIFT(t1, 1); \
+    t2 += t1h; \
+    t8 += tb; \
+    tb -= OD_DCT_RSHIFT(t8, 1); \
+    tn += tk; \
+    tk -= OD_DCT_RSHIFT(tn, 1); \
+    t9 += tl; \
+    tl -= OD_DCT_RSHIFT(t9, 1); \
+    tm -= ta; \
+    ta += OD_DCT_RSHIFT(tm, 1); \
+    tc -= tf; \
+    tf += OD_DCT_RSHIFT(tc, 1); \
+    tj += tg; \
+    tg -= OD_DCT_RSHIFT(tj, 1); \
+    td -= te; \
+    te += OD_DCT_RSHIFT(td, 1); \
+    ti += th; \
+    th -= OD_DCT_RSHIFT(ti, 1); \
+    \
+    t9 = -t9; \
+    tl = -tl; \
+    \
+    /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \
+    OD_DCT_OVERFLOW_CHECK(tn, 805, 8192, 316); \
+    t8 += (tn*805 + 8192) >> 14; \
+    /* 803/8192 ~= Sin[Pi/32] ~= 0.0980171403295606 */ \
+    OD_DCT_OVERFLOW_CHECK(t8, 803, 4096, 317); \
+    tn -= (t8*803 + 4096) >> 13; \
+    /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \
+    OD_DCT_OVERFLOW_CHECK(tn, 805, 8192, 318); \
+    t8 += (tn*805 + 8192) >> 14; \
+    /* 11725/32768 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \
+    OD_DCT_OVERFLOW_CHECK(tb, 11725, 16384, 319); \
+    tk += (tb*11725 + 16384) >> 15; \
+    /* 5197/8192 ~= Sin[7*Pi/32] ~= 0.6343932841636455 */ \
+    OD_DCT_OVERFLOW_CHECK(tk, 5197, 4096, 320); \
+    tb -= (tk*5197 + 4096) >> 13; \
+    /* 11725/32768 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \
+    OD_DCT_OVERFLOW_CHECK(tb, 11725, 16384, 321); \
+    tk += (tb*11725 + 16384) >> 15; \
+    /* 2455/4096 ~= Tan[11*Pi/64] ~= 0.5993769336819237 */ \
+    OD_DCT_OVERFLOW_CHECK(tl, 2455, 2048, 322); \
+    ta += (tl*2455 + 2048) >> 12; \
+    /* 14449/16384 ~= Sin[11*Pi/32] ~= 0.881921264348355 */ \
+    OD_DCT_OVERFLOW_CHECK(ta, 14449, 8192, 323); \
+    tl -= (ta*14449 + 8192) >> 14; \
+    /* 2455/4096 ~= Tan[11*Pi/64] ~= 0.5993769336819237 */ \
+    OD_DCT_OVERFLOW_CHECK(tl, 2455, 2048, 324); \
+    ta += (tl*2455 + 2048) >> 12; \
+    /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \
+    OD_DCT_OVERFLOW_CHECK(tm, 4861, 16384, 325); \
+    t9 += (tm*4861 + 16384) >> 15; \
+    /* 1189/4096 ~= Sin[3*Pi/32] ~= 0.29028467725446233 */ \
+    OD_DCT_OVERFLOW_CHECK(t9, 1189, 2048, 326); \
+    tm -= (t9*1189 + 2048) >> 12; \
+    /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \
+    OD_DCT_OVERFLOW_CHECK(tm, 4861, 16384, 327); \
+    t9 += (tm*4861 + 16384) >> 15; \
+    /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \
+    OD_DCT_OVERFLOW_CHECK(tg, 805, 8192, 328); \
+    tf += (tg*805 + 8192) >> 14; \
+    /* 803/8192 ~= Sin[Pi/32] ~= 0.0980171403295606 */ \
+    OD_DCT_OVERFLOW_CHECK(tf, 803, 4096, 329); \
+    tg -= (tf*803 + 4096) >> 13; \
+    /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \
+    OD_DCT_OVERFLOW_CHECK(tg, 805, 8192, 330); \
+    tf += (tg*805 + 8192) >> 14; \
+    /* 2931/8192 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \
+    OD_DCT_OVERFLOW_CHECK(tj, 2931, 4096, 331); \
+    tc += (tj*2931 + 4096) >> 13; \
+    /* 5197/8192 ~= Sin[7*Pi/32] ~= 0.6343932841636455 */ \
+    OD_DCT_OVERFLOW_CHECK(tc, 5197, 4096, 332); \
+    tj -= (tc*5197 + 4096) >> 13; \
+    /* 2931/8192 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \
+    OD_DCT_OVERFLOW_CHECK(tj, 2931, 4096, 333); \
+    tc += (tj*2931 + 4096) >> 13; \
+    /* 513/2048 ~= Tan[5*Pi/64] ~= 0.25048696019130545 */ \
+    OD_DCT_OVERFLOW_CHECK(ti, 513, 1024, 334); \
+    td += (ti*513 + 1024) >> 11; \
+    /* 7723/16384 ~= Sin[5*Pi/32] ~= 0.47139673682599764 */ \
+    OD_DCT_OVERFLOW_CHECK(td, 7723, 8192, 335); \
+    ti -= (td*7723 + 8192) >> 14; \
+    /* 513/2048 ~= Tan[5*Pi/64] ~= 0.25048696019130545 */ \
+    OD_DCT_OVERFLOW_CHECK(ti, 513, 1024, 336); \
+    td += (ti*513 + 1024) >> 11; \
+    /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \
+    OD_DCT_OVERFLOW_CHECK(th, 4861, 16384, 337); \
+    te += (th*4861 + 16384) >> 15; \
+    /* 1189/4096 ~= Sin[3*Pi/32] ~= 0.29028467725446233 */ \
+    OD_DCT_OVERFLOW_CHECK(te, 1189, 2048, 338); \
+    th -= (te*1189 + 2048) >> 12; \
+    /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \
+    OD_DCT_OVERFLOW_CHECK(th, 4861, 16384, 339); \
+    te += (th*4861 + 16384) >> 15; \
+    \
+    ta = -ta; \
+    tb = -tb; \
+    \
+    tt += t5h; \
+    t5 -= tt; \
+    t2 -= tqh; \
+    tq += t2; \
+    tp += t1h; \
+    t1 -= tp; \
+    t6 -= tuh; \
+    tu += t6; \
+    t7 += tvh; \
+    tv -= t7; \
+    to += t0h; \
+    t0 -= to; \
+    t3 -= t4h; \
+    t4 += t3; \
+    ts += trh; \
+    tr -= ts; \
+    tf -= OD_DCT_RSHIFT(tn, 1); \
+    tn += tf; \
+    tg -= OD_DCT_RSHIFT(t8, 1); \
+    t8 += tg; \
+    tk += OD_DCT_RSHIFT(tc, 1); \
+    tc -= tk; \
+    tb += OD_DCT_RSHIFT(tj, 1); \
+    tj -= tb; \
+    ta += OD_DCT_RSHIFT(ti, 1); \
+    ti -= ta; \
+    tl += OD_DCT_RSHIFT(td, 1); \
+    td -= tl; \
+    te -= OD_DCT_RSHIFT(tm, 1); \
+    tm += te; \
+    th -= OD_DCT_RSHIFT(t9, 1); \
+    t9 += th; \
+    ta -= t5; \
+    t5 += OD_DCT_RSHIFT(ta, 1); \
+    tq -= tl; \
+    tl += OD_DCT_RSHIFT(tq, 1); \
+    t2 -= ti; \
+    ti += OD_DCT_RSHIFT(t2, 1); \
+    td -= tt; \
+    tt += OD_DCT_RSHIFT(td, 1); \
+    tm += tp; \
+    tp -= OD_DCT_RSHIFT(tm, 1); \
+    t6 += t9; \
+    t9 -= OD_DCT_RSHIFT(t6, 1); \
+    te -= tu; \
+    tu += OD_DCT_RSHIFT(te, 1); \
+    t1 -= th; \
+    th += OD_DCT_RSHIFT(t1, 1); \
+    t0 -= tg; \
+    tg += OD_DCT_RSHIFT(t0, 1); \
+    tf += tv; \
+    tv -= OD_DCT_RSHIFT(tf, 1); \
+    t8 -= t7; \
+    t7 += OD_DCT_RSHIFT(t8, 1); \
+    to -= tn; \
+    tn += OD_DCT_RSHIFT(to, 1); \
+    t4 -= tk; \
+    tk += OD_DCT_RSHIFT(t4, 1); \
+    tb -= tr; \
+    tr += OD_DCT_RSHIFT(tb, 1); \
+    t3 -= tj; \
+    tj += OD_DCT_RSHIFT(t3, 1); \
+    tc -= ts; \
+    ts += OD_DCT_RSHIFT(tc, 1); \
+    \
+    tr = -tr; \
+    ts = -ts; \
+    tt = -tt; \
+    tu = -tu; \
+    \
+    /* 2847/4096 ~= (1/Sqrt[2] - Cos[63*Pi/128]/2)/Sin[63*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(t0, 2847, 2048, 340); \
+    tv += (t0*2847 + 2048) >> 12; \
+    /* 5791/4096 ~= Sqrt[2]*Sin[63*Pi/128] */  \
+    OD_DCT_OVERFLOW_CHECK(tv, 5791, 2048, 341); \
+    t0 -= (tv*5791 + 2048) >> 12; \
+    /* 5593/8192 ~= (1/Sqrt[2] - Cos[63*Pi/128])/Sin[63*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(t0, 5593, 4096, 342); \
+    tv += (t0*5593 + 4096) >> 13; \
+    /* 4099/8192 ~= (1/Sqrt[2] - Cos[31*Pi/128]/2)/Sin[31*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(tf, 4099, 4096, 343); \
+    tg -= (tf*4099 + 4096) >> 13; \
+    /* 1997/2048 ~= Sqrt[2]*Sin[31*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(tg, 1997, 1024, 344); \
+    tf += (tg*1997 + 1024) >> 11; \
+    /* -815/32768 ~= (1/Sqrt[2] - Cos[31*Pi/128])/Sin[31*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(tf, 815, 16384, 345); \
+    tg += (tf*815 + 16384) >> 15; \
+    /* 2527/4096 ~= (1/Sqrt[2] - Cos[17*Pi/128]/2)/Sin[17*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(t8, 2527, 2048, 346); \
+    tn -= (t8*2527 + 2048) >> 12; \
+    /* 4695/8192 ~= Sqrt[2]*Sin[17*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(tn, 4695, 4096, 347); \
+    t8 += (tn*4695 + 4096) >> 13; \
+    /* -4187/8192 ~= (1/Sqrt[2] - Cos[17*Pi/128])/Sin[17*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(t8, 4187, 4096, 348); \
+    tn += (t8*4187 + 4096) >> 13; \
+    /* 5477/8192 ~= (1/Sqrt[2] - Cos[15*Pi/128]/2)/Sin[15*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(to, 5477, 4096, 349); \
+    t7 += (to*5477 + 4096) >> 13; \
+    /* 4169/8192 ~= Sqrt[2]*Sin[15*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(t7, 4169, 4096, 350); \
+    to -= (t7*4169 + 4096) >> 13; \
+    /* -2571/4096 ~= (1/Sqrt[2] - Cos[15*Pi/128])/Sin[15*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(to, 2571, 2048, 351); \
+    t7 -= (to*2571 + 2048) >> 12; \
+    /* 5331/8192 ~= (1/Sqrt[2] - Cos[59*Pi/128]/2)/Sin[59*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(t2, 5331, 4096, 352); \
+    tt += (t2*5331 + 4096) >> 13; \
+    /* 5749/4096 ~= Sqrt[2]*Sin[59*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(tt, 5749, 2048, 353); \
+    t2 -= (tt*5749 + 2048) >> 12; \
+    /* 2413/4096 ~= (1/Sqrt[2] - Cos[59*Pi/128])/Sin[59*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(t2, 2413, 2048, 354); \
+    tt += (t2*2413 + 2048) >> 12; \
+    /* 4167/8192 ~= (1/Sqrt[2] - Cos[27*Pi/128]/2)/Sin[27*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(td, 4167, 4096, 355); \
+    ti -= (td*4167 + 4096) >> 13; \
+    /* 891/1024 ~= Sqrt[2]*Sin[27*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(ti, 891, 512, 356); \
+    td += (ti*891 + 512) >> 10; \
+    /* -4327/32768 ~= (1/Sqrt[2] - Cos[27*Pi/128])/Sin[27*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(td, 4327, 16384, 357); \
+    ti += (td*4327 + 16384) >> 15; \
+    /* 2261/4096 ~= (1/Sqrt[2] - Cos[21*Pi/128]/2)/Sin[21*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(ta, 2261, 2048, 358); \
+    tl -= (ta*2261 + 2048) >> 12; \
+    /* 2855/4096 ~= Sqrt[2]*Sin[21*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(tl, 2855, 2048, 359); \
+    ta += (tl*2855 + 2048) >> 12; \
+    /* -5417/16384 ~= (1/Sqrt[2] - Cos[21*Pi/128])/Sin[21*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(ta, 5417, 8192, 360); \
+    tl += (ta*5417 + 8192) >> 14; \
+    /* 3459/4096 ~= (1/Sqrt[2] - Cos[11*Pi/128]/2)/Sin[11*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(tq, 3459, 2048, 361); \
+    t5 += (tq*3459 + 2048) >> 12; \
+    /* 1545/4096 ~= Sqrt[2]*Sin[11*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(t5, 1545, 2048, 362); \
+    tq -= (t5*1545 + 2048) >> 12; \
+    /* -1971/2048 ~= (1/Sqrt[2] - Cos[11*Pi/128])/Sin[11*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(tq, 1971, 1024, 363); \
+    t5 -= (tq*1971 + 1024) >> 11; \
+    /* 323/512 ~= (1/Sqrt[2] - Cos[57*Pi/128]/2)/Sin[57*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(t3, 323, 256, 364); \
+    ts += (t3*323 + 256) >> 9; \
+    /* 5707/4096 ~= Sqrt[2]*Sin[57*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(ts, 5707, 2048, 365); \
+    t3 -= (ts*5707 + 2048) >> 12; \
+    /* 2229/4096 ~= (1/Sqrt[2] - Cos[57*Pi/128])/Sin[57*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(t3, 2229, 2048, 366); \
+    ts += (t3*2229 + 2048) >> 12; \
+    /* 1061/2048 ~= (1/Sqrt[2] - Cos[25*Pi/128]/2)/Sin[25*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(tc, 1061, 1024, 367); \
+    tj -= (tc*1061 + 1024) >> 11; \
+    /* 6671/8192 ~= Sqrt[2]*Sin[25*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(tj, 6671, 4096, 368); \
+    tc += (tj*6671 + 4096) >> 13; \
+    /* -6287/32768 ~= (1/Sqrt[2] - Cos[25*Pi/128])/Sin[25*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(tc, 6287, 16384, 369); \
+    tj += (tc*6287 + 16384) >> 15; \
+    /* 4359/8192 ~= (1/Sqrt[2] - Cos[23*Pi/128]/2)/Sin[23*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(tb, 4359, 4096, 370); \
+    tk -= (tb*4359 + 4096) >> 13; \
+    /* 3099/4096 ~= Sqrt[2]*Sin[23*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(tk, 3099, 2048, 371); \
+    tb += (tk*3099 + 2048) >> 12; \
+    /* -2109/8192 ~= (1/Sqrt[2] - Cos[23*Pi/128])/Sin[23*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(tb, 2109, 4096, 372); \
+    tk += (tb*2109 + 4096) >> 13; \
+    /* 5017/8192 ~= (1/Sqrt[2] - Cos[55*Pi/128]/2)/Sin[55*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(t4, 5017, 4096, 373); \
+    tr += (t4*5017 + 4096) >> 13; \
+    /* 1413/1024 ~= Sqrt[2]*Sin[55*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(tr, 1413, 512, 374); \
+    t4 -= (tr*1413 + 512) >> 10; \
+    /* 8195/16384 ~= (1/Sqrt[2] - Cos[55*Pi/128])/Sin[55*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(t4, 8195, 8192, 375); \
+    tr += (t4*8195 + 8192) >> 14; \
+    /* 2373/4096 ~= (1/Sqrt[2] - Cos[19*Pi/128]/2)/Sin[19*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(tm, 2373, 2048, 376); \
+    t9 += (tm*2373 + 2048) >> 12; \
+    /* 5209/8192 ~= Sqrt[2]*Sin[19*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(t9, 5209, 4096, 377); \
+    tm -= (t9*5209 + 4096) >> 13; \
+    /* -3391/8192 ~= (1/Sqrt[2] - Cos[19*Pi/128])/Sin[19*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(tm, 3391, 4096, 378); \
+    t9 -= (tm*3391 + 4096) >> 13; \
+    /* 1517/2048 ~= (1/Sqrt[2] - Cos[13*Pi/128]/2)/Sin[13*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(t6, 1517, 1024, 379); \
+    tp -= (t6*1517 + 1024) >> 11; \
+    /* 1817/4096 ~= Sqrt[2]*Sin[13*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(tp, 1817, 2048, 380); \
+    t6 += (tp*1817 + 2048) >> 12; \
+    /* -6331/8192 ~= (1/Sqrt[2] - Cos[13*Pi/128])/Sin[13*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(t6, 6331, 4096, 381); \
+    tp += (t6*6331 + 4096) >> 13; \
+    /* 515/1024 ~= (1/Sqrt[2] - Cos[29*Pi/128]/2)/Sin[29*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(te, 515, 512, 382); \
+    th -= (te*515 + 512) >> 10; \
+    /* 7567/8192 ~= Sqrt[2]*Sin[29*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(th, 7567, 4096, 383); \
+    te += (th*7567 + 4096) >> 13; \
+    /* -2513/32768 ~= (1/Sqrt[2] - Cos[29*Pi/128])/Sin[29*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(te, 2513, 16384, 384); \
+    th += (te*2513 + 16384) >> 15; \
+    /* 2753/4096 ~= (1/Sqrt[2] - Cos[61*Pi/128]/2)/Sin[61*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(t1, 2753, 2048, 385); \
+    tu += (t1*2753 + 2048) >> 12; \
+    /* 5777/4096 ~= Sqrt[2]*Sin[61*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(tu, 5777, 2048, 386); \
+    t1 -= (tu*5777 + 2048) >> 12; \
+    /* 1301/2048 ~= (1/Sqrt[2] - Cos[61*Pi/128])/Sin[61*Pi/128] */ \
+    OD_DCT_OVERFLOW_CHECK(t1, 1301, 1024, 387); \
+    tu += (t1*1301 + 1024) >> 11; \
+  } \
+  while (0)
+
+#define OD_IDST_32_ASYM(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, t6, \
+  tm, te, tu, t1, th, t9, tp, t5, tl, td, tt, t3, tj, tb, tr, t7, tn, tf, tv) \
+  /* Embedded 32-point asymmetric Type-IV iDST. */ \
+  do { \
+    int t0h; \
+    int t4h; \
+    int tbh; \
+    int tfh; \
+    int tgh; \
+    int tkh; \
+    int trh; \
+    int tvh; \
+    /* 1301/2048 ~= (1/Sqrt[2] - Cos[61*Pi/128])/Sin[61*Pi/128] */ \
+    tf -= (tg*1301 + 1024) >> 11; \
+    /* 5777/4096 ~= Sqrt[2]*Sin[61*Pi/128] */ \
+    tg += (tf*5777 + 2048) >> 12; \
+    /* 2753/4096 ~= (1/Sqrt[2] - Cos[61*Pi/128]/2)/Sin[61*Pi/128] */ \
+    tf -= (tg*2753 + 2048) >> 12; \
+    /* -2513/32768 ~= (1/Sqrt[2] - Cos[29*Pi/128])/Sin[29*Pi/128] */ \
+    th -= (te*2513 + 16384) >> 15; \
+    /* 7567/8192 ~= Sqrt[2]*Sin[29*Pi/128] */ \
+    te -= (th*7567 + 4096) >> 13; \
+    /* 515/1024 ~= (1/Sqrt[2] - Cos[29*Pi/128]/2)/Sin[29*Pi/128] */ \
+    th += (te*515 + 512) >> 10; \
+    /* -6331/8192 ~= (1/Sqrt[2] - Cos[13*Pi/128])/Sin[13*Pi/128] */ \
+    tj -= (tc*6331 + 4096) >> 13; \
+    /* 1817/4096 ~= Sqrt[2]*Sin[13*Pi/128] */ \
+    tc -= (tj*1817 + 2048) >> 12; \
+    /* 1517/2048 ~= (1/Sqrt[2] - Cos[13*Pi/128]/2)/Sin[13*Pi/128] */ \
+    tj += (tc*1517 + 1024) >> 11; \
+    /* -3391/8192 ~= (1/Sqrt[2] - Cos[19*Pi/128])/Sin[19*Pi/128] */ \
+    ti += (td*3391 + 4096) >> 13; \
+    /* 5209/8192 ~= Sqrt[2]*Sin[19*Pi/128] */ \
+    td += (ti*5209 + 4096) >> 13; \
+    /* 2373/4096 ~= (1/Sqrt[2] - Cos[19*Pi/128]/2)/Sin[19*Pi/128] */ \
+    ti -= (td*2373 + 2048) >> 12; \
+    /* 8195/16384 ~= (1/Sqrt[2] - Cos[55*Pi/128])/Sin[55*Pi/128] */ \
+    tr -= (t4*8195 + 8192) >> 14; \
+    /* 1413/1024 ~= Sqrt[2]*Sin[55*Pi/128] */ \
+    t4 += (tr*1413 + 512) >> 10; \
+    /* 5017/8192 ~= (1/Sqrt[2] - Cos[55*Pi/128]/2)/Sin[55*Pi/128] */ \
+    tr -= (t4*5017 + 4096) >> 13; \
+    /* -2109/8192 ~= (1/Sqrt[2] - Cos[23*Pi/128])/Sin[23*Pi/128] */ \
+    t5 -= (tq*2109 + 4096) >> 13; \
+    /* 3099/4096 ~= Sqrt[2]*Sin[23*Pi/128] */ \
+    tq -= (t5*3099 + 2048) >> 12; \
+    /* 4359/8192 ~= (1/Sqrt[2] - Cos[23*Pi/128]/2)/Sin[23*Pi/128] */ \
+    t5 += (tq*4359 + 4096) >> 13; \
+    /* -6287/32768 ~= (1/Sqrt[2] - Cos[25*Pi/128])/Sin[25*Pi/128] */ \
+    tp -= (t6*6287 + 16384) >> 15; \
+    /* 6671/8192 ~= Sqrt[2]*Sin[25*Pi/128] */ \
+    t6 -= (tp*6671 + 4096) >> 13; \
+    /* 1061/2048 ~= (1/Sqrt[2] - Cos[25*Pi/128]/2)/Sin[25*Pi/128] */ \
+    tp += (t6*1061 + 1024) >> 11; \
+    /* 2229/4096 ~= (1/Sqrt[2] - Cos[57*Pi/128])/Sin[57*Pi/128] */ \
+    t7 -= (to*2229 + 2048) >> 12; \
+    /* 5707/4096 ~= Sqrt[2]*Sin[57*Pi/128] */ \
+    to += (t7*5707 + 2048) >> 12; \
+    /* 323/512 ~= (1/Sqrt[2] - Cos[57*Pi/128]/2)/Sin[57*Pi/128] */ \
+    t7 -= (to*323 + 256) >> 9; \
+    /* -1971/2048 ~= (1/Sqrt[2] - Cos[11*Pi/128])/Sin[11*Pi/128] */ \
+    tk += (tb*1971 + 1024) >> 11; \
+    /* 1545/4096 ~= Sqrt[2]*Sin[11*Pi/128] */ \
+    tb += (tk*1545 + 2048) >> 12; \
+    /* 3459/4096 ~= (1/Sqrt[2] - Cos[11*Pi/128]/2)/Sin[11*Pi/128] */ \
+    tk -= (tb*3459 + 2048) >> 12; \
+    /* -5417/16384 ~= (1/Sqrt[2] - Cos[21*Pi/128])/Sin[21*Pi/128] */ \
+    tl -= (ta*5417 + 8192) >> 14; \
+    /* 2855/4096 ~= Sqrt[2]*Sin[21*Pi/128] */ \
+    ta -= (tl*2855 + 2048) >> 12; \
+    /* 2261/4096 ~= (1/Sqrt[2] - Cos[21*Pi/128]/2)/Sin[21*Pi/128] */ \
+    tl += (ta*2261 + 2048) >> 12; \
+    /* -4327/32768 ~= (1/Sqrt[2] - Cos[27*Pi/128])/Sin[27*Pi/128] */ \
+    t9 -= (tm*4327 + 16384) >> 15; \
+    /* 891/1024 ~= Sqrt[2]*Sin[27*Pi/128] */ \
+    tm -= (t9*891 + 512) >> 10; \
+    /* 4167/8192 ~= (1/Sqrt[2] - Cos[27*Pi/128]/2)/Sin[27*Pi/128] */ \
+    t9 += (tm*4167 + 4096) >> 13; \
+    /* 2413/4096 ~= (1/Sqrt[2] - Cos[59*Pi/128])/Sin[59*Pi/128] */ \
+    tn -= (t8*2413 + 2048) >> 12; \
+    /* 5749/4096 ~= Sqrt[2]*Sin[59*Pi/128] */ \
+    t8 += (tn*5749 + 2048) >> 12; \
+    /* 5331/8192 ~= (1/Sqrt[2] - Cos[59*Pi/128]/2)/Sin[59*Pi/128] */ \
+    tn -= (t8*5331 + 4096) >> 13; \
+    /* -2571/4096 ~= (1/Sqrt[2] - Cos[15*Pi/128])/Sin[15*Pi/128] */ \
+    ts += (t3*2571 + 2048) >> 12; \
+    /* 4169/8192 ~= Sqrt[2]*Sin[15*Pi/128] */ \
+    t3 += (ts*4169 + 4096) >> 13; \
+    /* 5477/8192 ~= (1/Sqrt[2] - Cos[15*Pi/128]/2)/Sin[15*Pi/128] */ \
+    ts -= (t3*5477 + 4096) >> 13; \
+    /* -4187/8192 ~= (1/Sqrt[2] - Cos[17*Pi/128])/Sin[17*Pi/128] */ \
+    tt -= (t2*4187 + 4096) >> 13; \
+    /* 4695/8192 ~= Sqrt[2]*Sin[17*Pi/128] */ \
+    t2 -= (tt*4695 + 4096) >> 13; \
+    /* 2527/4096 ~= (1/Sqrt[2] - Cos[17*Pi/128]/2)/Sin[17*Pi/128] */ \
+    tt += (t2*2527 + 2048) >> 12; \
+    /* -815/32768 ~= (1/Sqrt[2] - Cos[31*Pi/128])/Sin[31*Pi/128] */ \
+    t1 -= (tu*815 + 16384) >> 15; \
+    /* 1997/2048 ~= Sqrt[2]*Sin[31*Pi/128] */ \
+    tu -= (t1*1997 + 1024) >> 11; \
+    /* 4099/8192 ~= (1/Sqrt[2] - Cos[31*Pi/128]/2)/Sin[31*Pi/128] */ \
+    t1 += (tu*4099 + 4096) >> 13; \
+    /* 5593/8192 ~= (1/Sqrt[2] - Cos[63*Pi/128])/Sin[63*Pi/128] */ \
+    tv -= (t0*5593 + 4096) >> 13; \
+    /* 5791/4096 ~= Sqrt[2]*Sin[63*Pi/128] */ \
+    t0 += (tv*5791 + 2048) >> 12; \
+    /* 2847/4096 ~= (1/Sqrt[2] - Cos[63*Pi/128]/2)/Sin[63*Pi/128] */ \
+    tv -= (t0*2847 + 2048) >> 12; \
+    \
+    t7 = -t7; \
+    tf = -tf; \
+    tn = -tn; \
+    tr = -tr; \
+    \
+    t7 -= OD_DCT_RSHIFT(t6, 1); \
+    t6 += t7; \
+    tp -= OD_DCT_RSHIFT(to, 1); \
+    to += tp; \
+    tr -= OD_DCT_RSHIFT(tq, 1); \
+    tq += tr; \
+    t5 -= OD_DCT_RSHIFT(t4, 1); \
+    t4 += t5; \
+    tt -= OD_DCT_RSHIFT(t3, 1); \
+    t3 += tt; \
+    ts -= OD_DCT_RSHIFT(t2, 1); \
+    t2 += ts; \
+    tv += OD_DCT_RSHIFT(tu, 1); \
+    tu -= tv; \
+    t1 -= OD_DCT_RSHIFT(t0, 1); \
+    t0 += t1; \
+    th -= OD_DCT_RSHIFT(tg, 1); \
+    tg += th; \
+    tf -= OD_DCT_RSHIFT(te, 1); \
+    te += tf; \
+    ti += OD_DCT_RSHIFT(tc, 1); \
+    tc -= ti; \
+    tj += OD_DCT_RSHIFT(td, 1); \
+    td -= tj; \
+    tn -= OD_DCT_RSHIFT(tm, 1); \
+    tm += tn; \
+    t9 -= OD_DCT_RSHIFT(t8, 1); \
+    t8 += t9; \
+    tl -= OD_DCT_RSHIFT(tb, 1); \
+    tb += tl; \
+    tk -= OD_DCT_RSHIFT(ta, 1); \
+    ta += tk; \
+    \
+    ti -= th; \
+    th += OD_DCT_RSHIFT(ti, 1); \
+    td -= te; \
+    te += OD_DCT_RSHIFT(td, 1); \
+    tm += tl; \
+    tl -= OD_DCT_RSHIFT(tm, 1); \
+    t9 += ta; \
+    ta -= OD_DCT_RSHIFT(t9, 1); \
+    tp += tq; \
+    tq -= OD_DCT_RSHIFT(tp, 1); \
+    t6 += t5; \
+    t5 -= OD_DCT_RSHIFT(t6, 1); \
+    t2 -= t1; \
+    t1 += OD_DCT_RSHIFT(t2, 1); \
+    tt -= tu; \
+    tu += OD_DCT_RSHIFT(tt, 1); \
+    tr += t7; \
+    trh = OD_DCT_RSHIFT(tr, 1); \
+    t7 -= trh; \
+    t4 -= to; \
+    t4h = OD_DCT_RSHIFT(t4, 1); \
+    to += t4h; \
+    t0 += t3; \
+    t0h = OD_DCT_RSHIFT(t0, 1); \
+    t3 -= t0h; \
+    tv += ts; \
+    tvh = OD_DCT_RSHIFT(tv, 1); \
+    ts -= tvh; \
+    tf -= tc; \
+    tfh = OD_DCT_RSHIFT(tf, 1); \
+    tc += tfh; \
+    tg += tj; \
+    tgh = OD_DCT_RSHIFT(tg, 1); \
+    tj -= tgh; \
+    tb -= t8; \
+    tbh = OD_DCT_RSHIFT(tb, 1); \
+    t8 += tbh; \
+    tk += tn; \
+    tkh = OD_DCT_RSHIFT(tk, 1); \
+    tn -= tkh; \
+    \
+    ta = -ta; \
+    tq = -tq; \
+    \
+    /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \
+    te -= (th*4861 + 16384) >> 15; \
+    /* 1189/4096 ~= Sin[3*Pi/32] ~= 0.29028467725446233 */ \
+    th += (te*1189 + 2048) >> 12; \
+    /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \
+    te -= (th*4861 + 16384) >> 15; \
+    /* 513/2048 ~= Tan[5*Pi/64] ~= 0.25048696019130545 */ \
+    tm -= (t9*513 + 1024) >> 11; \
+    /* 7723/16384 ~= Sin[5*Pi/32] ~= 0.47139673682599764 */ \
+    t9 += (tm*7723 + 8192) >> 14; \
+    /* 513/2048 ~= Tan[5*Pi/64] ~= 0.25048696019130545 */ \
+    tm -= (t9*513 + 1024) >> 11; \
+    /* 2931/8192 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \
+    t6 -= (tp*2931 + 4096) >> 13; \
+    /* 5197/8192 ~= Sin[7*Pi/32] ~= 0.6343932841636455 */ \
+    tp += (t6*5197 + 4096) >> 13; \
+    /* 2931/8192 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \
+    t6 -= (tp*2931 + 4096) >> 13; \
+    /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \
+    tu -= (t1*805 + 8192) >> 14; \
+    /* 803/8192 ~= Sin[Pi/32] ~= 0.0980171403295606 */ \
+    t1 += (tu*803 + 4096) >> 13; \
+    /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \
+    tu -= (t1*805 + 8192) >> 14; \
+    /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \
+    ti -= (td*4861 + 16384) >> 15; \
+    /* 1189/4096 ~= Sin[3*Pi/32] ~= 0.29028467725446233 */ \
+    td += (ti*1189 + 2048) >> 12; \
+    /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \
+    ti -= (td*4861 + 16384) >> 15; \
+    /* 2455/4096 ~= Tan[11*Pi/64] ~= 0.5993769336819237 */ \
+    ta -= (tl*2455 + 2048) >> 12; \
+    /* 14449/16384 ~= Sin[11*Pi/32] ~= 0.881921264348355 */ \
+    tl += (ta*14449 + 8192) >> 14; \
+    /* 2455/4096 ~= Tan[11*Pi/64] ~= 0.5993769336819237 */ \
+    ta -= (tl*2455 + 2048) >> 12; \
+    /* 11725/32768 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \
+    t5 -= (tq*11725 + 16384) >> 15; \
+    /* 5197/8192 ~= Sin[7*Pi/32] ~= 0.6343932841636455 */ \
+    tq += (t5*5197 + 4096) >> 13; \
+    /* 11725/32768 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \
+    t5 -= (tq*11725 + 16384) >> 15; \
+    /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \
+    t2 -= (tt*805 + 8192) >> 14; \
+    /* 803/8192 ~= Sin[Pi/32] ~= 0.0980171403295606 */ \
+    tt += (t2*803 + 4096) >> 13; \
+    /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \
+    t2 -= (tt*805 + 8192) >> 14; \
+    \
+    tl = -tl; \
+    ti = -ti; \
+    \
+    th += OD_DCT_RSHIFT(t9, 1); \
+    t9 -= th; \
+    te -= OD_DCT_RSHIFT(tm, 1); \
+    tm += te; \
+    t1 += OD_DCT_RSHIFT(tp, 1); \
+    tp -= t1; \
+    tu -= OD_DCT_RSHIFT(t6, 1); \
+    t6 += tu; \
+    ta -= OD_DCT_RSHIFT(td, 1); \
+    td += ta; \
+    tl += OD_DCT_RSHIFT(ti, 1); \
+    ti -= tl; \
+    t5 += OD_DCT_RSHIFT(tt, 1); \
+    tt -= t5; \
+    tq += OD_DCT_RSHIFT(t2, 1); \
+    t2 -= tq; \
+    \
+    t8 -= tgh; \
+    tg += t8; \
+    tn += tfh; \
+    tf -= tn; \
+    t7 -= tvh; \
+    tv += t7; \
+    to -= t0h; \
+    t0 += to; \
+    tc += tbh; \
+    tb -= tc; \
+    tj += tkh; \
+    tk -= tj; \
+    ts += t4h; \
+    t4 -= ts; \
+    t3 += trh; \
+    tr -= t3; \
+    \
+    tk = -tk; \
+    \
+    /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \
+    tc -= (tj*2485 + 4096) >> 13; \
+    /* 18205/32768 ~= Sin[3*Pi/16] ~= 0.555570233019602 */ \
+    tj += (tc*18205 + 16384) >> 15; \
+    /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \
+    tc -= (tj*2485 + 4096) >> 13; \
+    /* 3227/32768 ~= Tan[Pi/32] ~= 0.09849140335716425 */ \
+    ts -= (t3*3227 + 16384) >> 15; \
+    /* 6393/32768 ~= Sin[Pi/16] ~= 0.19509032201612825 */ \
+    t3 += (ts*6393 + 16384) >> 15; \
+    /* 3227/32768 ~= Tan[Pi/32] ~= 0.09849140335716425 */ \
+    ts -= (t3*3227 + 16384) >> 15; \
+    /* 17515/32768 ~= Tan[5*Pi/32] ~= 0.5345111359507916 */ \
+    tk -= (tb*17515 + 16384) >> 15; \
+    /* 13623/16384 ~= Sin[5*Pi/16] ~= 0.8314696123025452 */ \
+    tb += (tk*13623 + 8192) >> 14; \
+    /* 17515/32768 ~= Tan[5*Pi/32] ~= 0.5345111359507916 */ \
+    tk -= (tb*17515 + 16384) >> 15; \
+    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.8206787908286602 */ \
+    t4 -= (tr*6723 + 4096) >> 13; \
+    /* 16069/16384 ~= Sin[7*Pi/16] ~= 0.9807852804032304 */ \
+    tr += (t4*16069 + 8192) >> 14; \
+    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.8206787908286602 */ \
+    t4 -= (tr*6723 + 4096) >> 13; \
+    \
+    t4 = -t4; \
+    \
+    tp += tm; \
+    tm -= OD_DCT_RSHIFT(tp, 1); \
+    t9 -= t6; \
+    t6 += OD_DCT_RSHIFT(t9, 1); \
+    th -= t1; \
+    t1 += OD_DCT_RSHIFT(th, 1); \
+    tu -= te; \
+    te += OD_DCT_RSHIFT(tu, 1); /* pass */ \
+    t5 -= tl; \
+    tl += OD_DCT_RSHIFT(t5, 1); \
+    ta += tq; \
+    tq -= OD_DCT_RSHIFT(ta, 1); \
+    td += tt; \
+    tt -= OD_DCT_RSHIFT(td, 1); \
+    t2 -= ti; \
+    ti += OD_DCT_RSHIFT(t2, 1); /* pass */ \
+    t7 += t8; \
+    t8 -= OD_DCT_RSHIFT(t7, 1); \
+    tn -= to; \
+    to += OD_DCT_RSHIFT(tn, 1); \
+    tf -= tv; \
+    tv += OD_DCT_RSHIFT(tf, 1); \
+    t0 += tg; \
+    tg -= OD_DCT_RSHIFT(t0, 1); /* pass */ \
+    tj -= t3; \
+    t3 += OD_DCT_RSHIFT(tj, 1); /* pass */ \
+    ts -= tc; \
+    tc += OD_DCT_RSHIFT(ts, 1); \
+    t4 -= tb; \
+    tb += OD_DCT_RSHIFT(t4, 1); /* pass */ \
+    tk -= tr; \
+    tr += OD_DCT_RSHIFT(tk, 1); \
+    \
+    t1 = -t1; \
+    t3 = -t3; \
+    t7 = -t7; \
+    t8 = -t8; \
+    tg = -tg; \
+    tm = -tm; \
+    to = -to; \
+    \
+    /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
+    tm -= (t9*14341 + 8192) >> 14; \
+    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
+    t9 += (tm*15137 + 8192) >> 14; \
+    /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
+    tm -= (t9*4161 + 8192) >> 14; \
+    /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
+    tp -= (t6*4161 + 8192) >> 14; \
+    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
+    t6 += (tp*15137 + 8192) >> 14; \
+    /* 28681/32768 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
+    tp -= (t6*28681 + 16384) >> 15; \
+    /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
+    th += (te*19195 + 16384) >> 15; \
+    /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
+    te += (th*11585 + 8192) >> 14; \
+    /* 29957/32768 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
+    th -= (te*29957 + 16384) >> 15; \
+    /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
+    tq -= (t5*14341 + 8192) >> 14; \
+    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
+    t5 += (tq*15137 + 8192) >> 14; \
+    /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
+    tq -= (t5*4161 + 8192) >> 14; \
+    /* 3259/8192 ~= 2*Tan[Pi/16] ~= 0.397824734759316 */ \
+    ta -= (tl*3259 + 4096) >> 13; \
+    /* 3135/16384 ~= Sin[Pi/8]/2 ~= 0.1913417161825449 */ \
+    tl += (ta*3135 + 8192) >> 14; \
+    /* 3259/8192 ~= 2*Tan[Pi/16] ~= 0.397824734759316 */ \
+    ta -= (tl*3259 + 4096) >> 13; \
+    /* 7489/8192 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
+    ti -= (td*7489 + 4096) >> 13; \
+    /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
+    td += (ti*11585 + 8192) >> 14; \
+    /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
+    ti += (td*19195 + 16384) >> 15; \
+    /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
+    to -= (t7*14341 + 8192) >> 14; \
+    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
+    t7 += (to*15137 + 8192) >> 14; \
+    /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
+    to -= (t7*4161 + 8192) >> 14; \
+    /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
+    tn -= (t8*4161 + 8192) >> 14; \
+    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
+    t8 += (tn*15137 + 8192) >> 14; \
+    /* 28681/32768 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
+    tn -= (t8*28681 + 16384) >> 15; \
+    /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
+    tf += (tg*19195 + 16384) >> 15; \
+    /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
+    tg += (tf*11585 + 8192) >> 14; \
+    /* 29957/32768 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
+    tf -= (tg*29957 + 16384) >> 15; \
+    /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
+    tj += (tc*19195 + 16384) >> 15; \
+    /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
+    tc += (tj*11585 + 8192) >> 14; \
+    /* 29957/32768 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
+    tj -= (tc*29957 + 16384) >> 15; \
+    /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
+    tk += (tb*13573 + 8192) >> 14; \
+    /* 11585/32768 ~= Sin[Pi/4]/2 ~= 0.353553390593274 */ \
+    tb -= (tk*11585 + 16384) >> 15; \
+    /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
+    tk += (tb*13573 + 8192) >> 14; \
+    \
+    tf = -tf; \
+    \
+  } \
+  while (0)
+
+#define OD_FDCT_64(u0, uw, ug, uM, u8, uE, uo, uU, u4, uA, uk, uQ, uc, uI, \
+  us, uY, u2, uy, ui, uO, ua, uG, uq, uW, u6, uC, um, uS, ue, uK, uu, u_, u1, \
+  ux, uh, uN, u9, uF, up, uV, u5, uB, ul, uR, ud, uJ, ut, uZ, u3, uz, uj, uP, \
+  ub, uH, ur, uX, u7, uD, un, uT, uf, uL, uv, u) \
+  /* Embedded 64-point orthonormal Type-II fDCT. */ \
+  do { \
+    int uwh; \
+    int uxh; \
+    int uyh; \
+    int uzh; \
+    int uAh; \
+    int uBh; \
+    int uCh; \
+    int uDh; \
+    int uEh; \
+    int uFh; \
+    int uGh; \
+    int uHh; \
+    int uIh; \
+    int uJh; \
+    int uKh; \
+    int uLh; \
+    int uMh; \
+    int uNh; \
+    int uOh; \
+    int uPh; \
+    int uQh; \
+    int uRh; \
+    int uSh; \
+    int uTh; \
+    int uUh; \
+    int uVh; \
+    int uWh; \
+    int uXh; \
+    int uYh; \
+    int uZh; \
+    int u_h; \
+    int uh_; \
+    u = u0 - u; \
+    uh_ = OD_DCT_RSHIFT(u, 1); \
+    u0 -= uh_; \
+    u_ += u1; \
+    u_h = OD_DCT_RSHIFT(u_, 1); \
+    u1 = u_h - u1; \
+    uZ = u2 - uZ; \
+    uZh = OD_DCT_RSHIFT(uZ, 1); \
+    u2 -= uZh; \
+    uY += u3; \
+    uYh = OD_DCT_RSHIFT(uY, 1); \
+    u3 = uYh - u3; \
+    uX = u4 - uX; \
+    uXh = OD_DCT_RSHIFT(uX, 1); \
+    u4 -= uXh; \
+    uW += u5; \
+    uWh = OD_DCT_RSHIFT(uW, 1); \
+    u5 = uWh - u5; \
+    uV = u6 - uV; \
+    uVh = OD_DCT_RSHIFT(uV, 1); \
+    u6 -= uVh; \
+    uU += u7; \
+    uUh = OD_DCT_RSHIFT(uU, 1); \
+    u7 = uUh - u7; \
+    uT = u8 - uT; \
+    uTh = OD_DCT_RSHIFT(uT, 1); \
+    u8 -= uTh; \
+    uS += u9; \
+    uSh = OD_DCT_RSHIFT(uS, 1); \
+    u9 = uSh - u9; \
+    uR = ua - uR; \
+    uRh = OD_DCT_RSHIFT(uR, 1); \
+    ua -= uRh; \
+    uQ += ub; \
+    uQh = OD_DCT_RSHIFT(uQ, 1); \
+    ub = uQh - ub; \
+    uP = uc - uP; \
+    uPh = OD_DCT_RSHIFT(uP, 1); \
+    uc -= uPh; \
+    uO += ud; \
+    uOh = OD_DCT_RSHIFT(uO, 1); \
+    ud = uOh - ud; \
+    uN = ue - uN; \
+    uNh = OD_DCT_RSHIFT(uN, 1); \
+    ue -= uNh; \
+    uM += uf; \
+    uMh = OD_DCT_RSHIFT(uM, 1); \
+    uf = uMh - uf; \
+    uL = ug - uL; \
+    uLh = OD_DCT_RSHIFT(uL, 1); \
+    ug -= uLh; \
+    uK += uh; \
+    uKh = OD_DCT_RSHIFT(uK, 1); \
+    uh = uKh - uh; \
+    uJ = ui - uJ; \
+    uJh = OD_DCT_RSHIFT(uJ, 1); \
+    ui -= uJh; \
+    uI += uj; \
+    uIh = OD_DCT_RSHIFT(uI, 1); \
+    uj = uIh - uj; \
+    uH = uk - uH; \
+    uHh = OD_DCT_RSHIFT(uH, 1); \
+    uk -= uHh; \
+    uG += ul; \
+    uGh = OD_DCT_RSHIFT(uG, 1); \
+    ul = uGh - ul; \
+    uF = um - uF; \
+    uFh = OD_DCT_RSHIFT(uF, 1); \
+    um -= uFh; \
+    uE += un; \
+    uEh = OD_DCT_RSHIFT(uE, 1); \
+    un = uEh - un; \
+    uD = uo - uD; \
+    uDh = OD_DCT_RSHIFT(uD, 1); \
+    uo -= uDh; \
+    uC += up; \
+    uCh = OD_DCT_RSHIFT(uC, 1); \
+    up = uCh - up; \
+    uB = uq - uB; \
+    uBh = OD_DCT_RSHIFT(uB, 1); \
+    uq -= uBh; \
+    uA += ur; \
+    uAh = OD_DCT_RSHIFT(uA, 1); \
+    ur = uAh - ur; \
+    uz = us - uz; \
+    uzh = OD_DCT_RSHIFT(uz, 1); \
+    us -= uzh; \
+    uy += ut; \
+    uyh = OD_DCT_RSHIFT(uy, 1); \
+    ut = uyh - ut; \
+    ux = uu - ux; \
+    uxh = OD_DCT_RSHIFT(ux, 1); \
+    uu -= uxh; \
+    uw += uv; \
+    uwh = OD_DCT_RSHIFT(uw, 1); \
+    uv = uwh - uv; \
+    OD_FDCT_32_ASYM(u0, uw, uwh, ug, uM, uMh, u8, uE, uEh, uo, uU, uUh, \
+      u4, uA, uAh, uk, uQ, uQh, uc, uI, uIh, us, uY, uYh, u2, uy, uyh, \
+      ui, uO, uOh, ua, uG, uGh, uq, uW, uWh, u6, uC, uCh, um, uS, uSh, \
+      ue, uK, uKh, uu, u_, u_h); \
+    OD_FDST_32_ASYM(u, uv, uL, uf, uT, un, uD, u7, uX, ur, uH, ub, uP, uj, \
+      uz, u3, uZ, ut, uJ, ud, uR, ul, uB, u5, uV, up, uF, u9, uN, uh, ux, u1); \
+  } \
+  while (0)
+
+#define OD_IDCT_64(u0, uw, ug, uM, u8, uE, uo, uU, u4, uA, uk, uQ, uc, uI, \
+  us, uY, u2, uy, ui, uO, ua, uG, uq, uW, u6, uC, um, uS, ue, uK, uu, u_, u1, \
+  ux, uh, uN, u9, uF, up, uV, u5, uB, ul, uR, ud, uJ, ut, uZ, u3, uz, uj, uP, \
+  ub, uH, ur, uX, u7, uD, un, uT, uf, uL, uv, u) \
+  /* Embedded 64-point orthonormal Type-II fDCT. */ \
+  do { \
+    int u1h; \
+    int u3h; \
+    int u5h; \
+    int u7h; \
+    int u9h; \
+    int ubh; \
+    int udh; \
+    int ufh; \
+    int uhh; \
+    int ujh; \
+    int ulh; \
+    int unh; \
+    int uph; \
+    int urh; \
+    int uth; \
+    int uvh; \
+    int uxh; \
+    int uzh; \
+    int uBh; \
+    int uDh; \
+    int uFh; \
+    int uHh; \
+    int uJh; \
+    int uLh; \
+    int uNh; \
+    int uPh; \
+    int uRh; \
+    int uTh; \
+    int uVh; \
+    int uXh; \
+    int uZh; \
+    int uh_; \
+    OD_IDST_32_ASYM(u, uL, uT, uD, uX, uH, uP, uz, uZ, uJ, uR, uB, uV, uF, \
+      uN, ux, u_, uK, uS, uC, uW, uG, uO, uy, uY, uI, uQ, uA, uU, uE, uM, uw); \
+    OD_IDCT_32_ASYM(u0, ug, u8, uo, u4, uk, uc, us, u2, ui, ua, uq, u6, um, \
+      ue, uu, u1, u1h, uh, uhh, u9, u9h, up, uph, u5, u5h, ul, ulh, ud, udh, \
+      ut, uth, u3, u3h, uj, ujh, ub, ubh, ur, urh, u7, u7h, un, unh, uf, ufh, \
+      uv, uvh); \
+    uh_ = OD_DCT_RSHIFT(u, 1); \
+    u0 += uh_; \
+    u = u0 - u; \
+    u_ = u1h - u_; \
+    u1 -= u_; \
+    uZh = OD_DCT_RSHIFT(uZ, 1); \
+    u2 += uZh; \
+    uZ = u2 - uZ; \
+    uY = u3h - uY; \
+    u3 -= uY; \
+    uXh = OD_DCT_RSHIFT(uX, 1); \
+    u4 += uXh; \
+    uX = u4 - uX; \
+    uW = u5h - uW; \
+    u5 -= uW; \
+    uVh = OD_DCT_RSHIFT(uV, 1); \
+    u6 += uVh; \
+    uV = u6 - uV; \
+    uU = u7h - uU; \
+    u7 -= uU; \
+    uTh = OD_DCT_RSHIFT(uT, 1); \
+    u8 += uTh; \
+    uT = u8 - uT; \
+    uS = u9h - uS; \
+    u9 -= uS; \
+    uRh = OD_DCT_RSHIFT(uR, 1); \
+    ua += uRh; \
+    uR = ua - uR; \
+    uQ = ubh - uQ; \
+    ub -= uQ; \
+    uPh = OD_DCT_RSHIFT(uP, 1); \
+    uc += uPh; \
+    uP = uc - uP; \
+    uO = udh - uO; \
+    ud -= uO; \
+    uNh = OD_DCT_RSHIFT(uN, 1); \
+    ue += uNh; \
+    uN = ue - uN; \
+    uM = ufh - uM; \
+    uf -= uM; \
+    uLh = OD_DCT_RSHIFT(uL, 1); \
+    ug += uLh; \
+    uL = ug - uL; \
+    uK = uhh - uK; \
+    uh -= uK; \
+    uJh = OD_DCT_RSHIFT(uJ, 1); \
+    ui += uJh; \
+    uJ = ui - uJ; \
+    uI = ujh - uI; \
+    uj -= uI; \
+    uHh = OD_DCT_RSHIFT(uH, 1); \
+    uk += uHh; \
+    uH = uk - uH; \
+    uG = ulh - uG; \
+    ul -= uG; \
+    uFh = OD_DCT_RSHIFT(uF, 1); \
+    um += uFh; \
+    uF = um - uF; \
+    uE = unh - uE; \
+    un -= uE; \
+    uDh = OD_DCT_RSHIFT(uD, 1); \
+    uo += uDh; \
+    uD = uo - uD; \
+    uC = uph - uC; \
+    up -= uC; \
+    uBh = OD_DCT_RSHIFT(uB, 1); \
+    uq += uBh; \
+    uB = uq - uB; \
+    uA = urh - uA; \
+    ur -= uA; \
+    uzh = OD_DCT_RSHIFT(uz, 1); \
+    us += uzh; \
+    uz = us - uz; \
+    uy = uth - uy; \
+    ut -= uy; \
+    uxh = OD_DCT_RSHIFT(ux, 1); \
+    uu += uxh; \
+    ux = uu - ux; \
+    uw = uvh - uw; \
+    uv -= uw; \
+  } while (0)
+#endif
+
 void od_bin_fdct4(od_coeff y[4], const od_coeff *x, int xstride) {
   int q0;
   int q1;
@@ -2342,3 +3672,405 @@
   x[30*xstride] = (od_coeff)tu;
   x[31*xstride] = (od_coeff)tv;
 }
+
+#if CONFIG_TX64X64
+void od_bin_fdct64(od_coeff y[64], const od_coeff *x, int xstride) {
+  int t0;
+  int t1;
+  int t2;
+  int t3;
+  int t4;
+  int t5;
+  int t6;
+  int t7;
+  int t8;
+  int t9;
+  int ta;
+  int tb;
+  int tc;
+  int td;
+  int te;
+  int tf;
+  int tg;
+  int th;
+  int ti;
+  int tj;
+  int tk;
+  int tl;
+  int tm;
+  int tn;
+  int to;
+  int tp;
+  int tq;
+  int tr;
+  int ts;
+  int tt;
+  int tu;
+  int tv;
+  int tw;
+  int tx;
+  int ty;
+  int tz;
+  int tA;
+  int tB;
+  int tC;
+  int tD;
+  int tE;
+  int tF;
+  int tG;
+  int tH;
+  int tI;
+  int tJ;
+  int tK;
+  int tL;
+  int tM;
+  int tN;
+  int tO;
+  int tP;
+  int tQ;
+  int tR;
+  int tS;
+  int tT;
+  int tU;
+  int tV;
+  int tW;
+  int tX;
+  int tY;
+  int tZ;
+  int t_;
+  int t;
+  t0 = x[0*xstride];
+  tw = x[1*xstride];
+  tg = x[2*xstride];
+  tM = x[3*xstride];
+  t8 = x[4*xstride];
+  tE = x[5*xstride];
+  to = x[6*xstride];
+  tU = x[7*xstride];
+  t4 = x[8*xstride];
+  tA = x[9*xstride];
+  tk = x[10*xstride];
+  tQ = x[11*xstride];
+  tc = x[12*xstride];
+  tI = x[13*xstride];
+  ts = x[14*xstride];
+  tY = x[15*xstride];
+  t2 = x[16*xstride];
+  ty = x[17*xstride];
+  ti = x[18*xstride];
+  tO = x[19*xstride];
+  ta = x[20*xstride];
+  tG = x[21*xstride];
+  tq = x[22*xstride];
+  tW = x[23*xstride];
+  t6 = x[24*xstride];
+  tC = x[25*xstride];
+  tm = x[26*xstride];
+  tS = x[27*xstride];
+  te = x[28*xstride];
+  tK = x[29*xstride];
+  tu = x[30*xstride];
+  t_ = x[31*xstride];
+  t1 = x[32*xstride];
+  tx = x[33*xstride];
+  th = x[34*xstride];
+  tN = x[35*xstride];
+  t9 = x[36*xstride];
+  tF = x[37*xstride];
+  tp = x[38*xstride];
+  tV = x[39*xstride];
+  t5 = x[40*xstride];
+  tB = x[41*xstride];
+  tl = x[42*xstride];
+  tR = x[43*xstride];
+  td = x[44*xstride];
+  tJ = x[45*xstride];
+  tt = x[46*xstride];
+  tZ = x[47*xstride];
+  t3 = x[48*xstride];
+  tz = x[49*xstride];
+  tj = x[50*xstride];
+  tP = x[51*xstride];
+  tb = x[52*xstride];
+  tH = x[53*xstride];
+  tr = x[54*xstride];
+  tX = x[55*xstride];
+  t7 = x[56*xstride];
+  tD = x[57*xstride];
+  tn = x[58*xstride];
+  tT = x[59*xstride];
+  tf = x[60*xstride];
+  tL = x[61*xstride];
+  tv = x[62*xstride];
+  t = x[63*xstride];
+  OD_FDCT_64(t0, tw, tg, tM, t8, tE, to, tU, t4, tA, tk, tQ, tc, tI, ts, tY,
+    t2, ty, ti, tO, ta, tG, tq, tW, t6, tC, tm, tS, te, tK, tu, t_, t1, tx,
+    th, tN, t9, tF, tp, tV, t5, tB, tl, tR, td, tJ, tt, tZ, t3, tz, tj, tP,
+    tb, tH, tr, tX, t7, tD, tn, tT, tf, tL, tv, t);
+  y[0] = (od_coeff)t0;
+  y[1] = (od_coeff)t1;
+  y[2] = (od_coeff)t2;
+  y[3] = (od_coeff)t3;
+  y[4] = (od_coeff)t4;
+  y[5] = (od_coeff)t5;
+  y[6] = (od_coeff)t6;
+  y[7] = (od_coeff)t7;
+  y[8] = (od_coeff)t8;
+  y[9] = (od_coeff)t9;
+  y[10] = (od_coeff)ta;
+  y[11] = (od_coeff)tb;
+  y[12] = (od_coeff)tc;
+  y[13] = (od_coeff)td;
+  y[14] = (od_coeff)te;
+  y[15] = (od_coeff)tf;
+  y[16] = (od_coeff)tg;
+  y[17] = (od_coeff)th;
+  y[18] = (od_coeff)ti;
+  y[19] = (od_coeff)tj;
+  y[20] = (od_coeff)tk;
+  y[21] = (od_coeff)tl;
+  y[22] = (od_coeff)tm;
+  y[23] = (od_coeff)tn;
+  y[24] = (od_coeff)to;
+  y[25] = (od_coeff)tp;
+  y[26] = (od_coeff)tq;
+  y[27] = (od_coeff)tr;
+  y[28] = (od_coeff)ts;
+  y[29] = (od_coeff)tt;
+  y[30] = (od_coeff)tu;
+  y[31] = (od_coeff)tv;
+  y[32] = (od_coeff)tw;
+  y[33] = (od_coeff)tx;
+  y[34] = (od_coeff)ty;
+  y[35] = (od_coeff)tz;
+  y[36] = (od_coeff)tA;
+  y[37] = (od_coeff)tB;
+  y[38] = (od_coeff)tC;
+  y[39] = (od_coeff)tD;
+  y[40] = (od_coeff)tE;
+  y[41] = (od_coeff)tF;
+  y[41] = (od_coeff)tF;
+  y[42] = (od_coeff)tG;
+  y[43] = (od_coeff)tH;
+  y[44] = (od_coeff)tI;
+  y[45] = (od_coeff)tJ;
+  y[46] = (od_coeff)tK;
+  y[47] = (od_coeff)tL;
+  y[48] = (od_coeff)tM;
+  y[49] = (od_coeff)tN;
+  y[50] = (od_coeff)tO;
+  y[51] = (od_coeff)tP;
+  y[52] = (od_coeff)tQ;
+  y[53] = (od_coeff)tR;
+  y[54] = (od_coeff)tS;
+  y[55] = (od_coeff)tT;
+  y[56] = (od_coeff)tU;
+  y[57] = (od_coeff)tV;
+  y[58] = (od_coeff)tW;
+  y[59] = (od_coeff)tX;
+  y[60] = (od_coeff)tY;
+  y[61] = (od_coeff)tZ;
+  y[62] = (od_coeff)t_;
+  y[63] = (od_coeff)t;
+}
+
+void od_bin_idct64(od_coeff *x, int xstride, const od_coeff y[64]) {
+  int t0;
+  int t1;
+  int t2;
+  int t3;
+  int t4;
+  int t5;
+  int t6;
+  int t7;
+  int t8;
+  int t9;
+  int ta;
+  int tb;
+  int tc;
+  int td;
+  int te;
+  int tf;
+  int tg;
+  int th;
+  int ti;
+  int tj;
+  int tk;
+  int tl;
+  int tm;
+  int tn;
+  int to;
+  int tp;
+  int tq;
+  int tr;
+  int ts;
+  int tt;
+  int tu;
+  int tv;
+  int tw;
+  int tx;
+  int ty;
+  int tz;
+  int tA;
+  int tB;
+  int tC;
+  int tD;
+  int tE;
+  int tF;
+  int tG;
+  int tH;
+  int tI;
+  int tJ;
+  int tK;
+  int tL;
+  int tM;
+  int tN;
+  int tO;
+  int tP;
+  int tQ;
+  int tR;
+  int tS;
+  int tT;
+  int tU;
+  int tV;
+  int tW;
+  int tX;
+  int tY;
+  int tZ;
+  int t_;
+  int t;
+  t0 = y[0];
+  tw = y[1];
+  tg = y[2];
+  tM = y[3];
+  t8 = y[4];
+  tE = y[5];
+  to = y[6];
+  tU = y[7];
+  t4 = y[8];
+  tA = y[9];
+  tk = y[10];
+  tQ = y[11];
+  tc = y[12];
+  tI = y[13];
+  ts = y[14];
+  tY = y[15];
+  t2 = y[16];
+  ty = y[17];
+  ti = y[18];
+  tO = y[19];
+  ta = y[20];
+  tG = y[21];
+  tq = y[22];
+  tW = y[23];
+  t6 = y[24];
+  tC = y[25];
+  tm = y[26];
+  tS = y[27];
+  te = y[28];
+  tK = y[29];
+  tu = y[30];
+  t_ = y[31];
+  t1 = y[32];
+  tx = y[33];
+  th = y[34];
+  tN = y[35];
+  t9 = y[36];
+  tF = y[37];
+  tp = y[38];
+  tV = y[39];
+  t5 = y[40];
+  tB = y[41];
+  tl = y[42];
+  tR = y[43];
+  td = y[44];
+  tJ = y[45];
+  tt = y[46];
+  tZ = y[47];
+  t3 = y[48];
+  tz = y[49];
+  tj = y[50];
+  tP = y[51];
+  tb = y[52];
+  tH = y[53];
+  tr = y[54];
+  tX = y[55];
+  t7 = y[56];
+  tD = y[57];
+  tn = y[58];
+  tT = y[59];
+  tf = y[60];
+  tL = y[61];
+  tv = y[62];
+  t = y[63];
+  OD_IDCT_64(t0, tw, tg, tM, t8, tE, to, tU, t4, tA, tk, tQ, tc, tI, ts, tY,
+    t2, ty, ti, tO, ta, tG, tq, tW, t6, tC, tm, tS, te, tK, tu, t_, t1, tx,
+    th, tN, t9, tF, tp, tV, t5, tB, tl, tR, td, tJ, tt, tZ, t3, tz, tj, tP,
+    tb, tH, tr, tX, t7, tD, tn, tT, tf, tL, tv, t);
+  x[0*xstride] = (od_coeff)t0;
+  x[1*xstride] = (od_coeff)t1;
+  x[2*xstride] = (od_coeff)t2;
+  x[3*xstride] = (od_coeff)t3;
+  x[4*xstride] = (od_coeff)t4;
+  x[5*xstride] = (od_coeff)t5;
+  x[6*xstride] = (od_coeff)t6;
+  x[7*xstride] = (od_coeff)t7;
+  x[8*xstride] = (od_coeff)t8;
+  x[9*xstride] = (od_coeff)t9;
+  x[10*xstride] = (od_coeff)ta;
+  x[11*xstride] = (od_coeff)tb;
+  x[12*xstride] = (od_coeff)tc;
+  x[13*xstride] = (od_coeff)td;
+  x[14*xstride] = (od_coeff)te;
+  x[15*xstride] = (od_coeff)tf;
+  x[16*xstride] = (od_coeff)tg;
+  x[17*xstride] = (od_coeff)th;
+  x[18*xstride] = (od_coeff)ti;
+  x[19*xstride] = (od_coeff)tj;
+  x[20*xstride] = (od_coeff)tk;
+  x[21*xstride] = (od_coeff)tl;
+  x[22*xstride] = (od_coeff)tm;
+  x[23*xstride] = (od_coeff)tn;
+  x[24*xstride] = (od_coeff)to;
+  x[25*xstride] = (od_coeff)tp;
+  x[26*xstride] = (od_coeff)tq;
+  x[27*xstride] = (od_coeff)tr;
+  x[28*xstride] = (od_coeff)ts;
+  x[29*xstride] = (od_coeff)tt;
+  x[30*xstride] = (od_coeff)tu;
+  x[31*xstride] = (od_coeff)tv;
+  x[32*xstride] = (od_coeff)tw;
+  x[33*xstride] = (od_coeff)tx;
+  x[34*xstride] = (od_coeff)ty;
+  x[35*xstride] = (od_coeff)tz;
+  x[36*xstride] = (od_coeff)tA;
+  x[37*xstride] = (od_coeff)tB;
+  x[38*xstride] = (od_coeff)tC;
+  x[39*xstride] = (od_coeff)tD;
+  x[40*xstride] = (od_coeff)tE;
+  x[41*xstride] = (od_coeff)tF;
+  x[41*xstride] = (od_coeff)tF;
+  x[42*xstride] = (od_coeff)tG;
+  x[43*xstride] = (od_coeff)tH;
+  x[44*xstride] = (od_coeff)tI;
+  x[45*xstride] = (od_coeff)tJ;
+  x[46*xstride] = (od_coeff)tK;
+  x[47*xstride] = (od_coeff)tL;
+  x[48*xstride] = (od_coeff)tM;
+  x[49*xstride] = (od_coeff)tN;
+  x[50*xstride] = (od_coeff)tO;
+  x[51*xstride] = (od_coeff)tP;
+  x[52*xstride] = (od_coeff)tQ;
+  x[53*xstride] = (od_coeff)tR;
+  x[54*xstride] = (od_coeff)tS;
+  x[55*xstride] = (od_coeff)tT;
+  x[56*xstride] = (od_coeff)tU;
+  x[57*xstride] = (od_coeff)tV;
+  x[58*xstride] = (od_coeff)tW;
+  x[59*xstride] = (od_coeff)tX;
+  x[60*xstride] = (od_coeff)tY;
+  x[61*xstride] = (od_coeff)tZ;
+  x[62*xstride] = (od_coeff)t_;
+  x[63*xstride] = (od_coeff)t;
+}
+#endif
diff --git a/av1/common/daala_tx.h b/av1/common/daala_tx.h
index b0f24a1..cef35c9 100644
--- a/av1/common/daala_tx.h
+++ b/av1/common/daala_tx.h
@@ -15,5 +15,8 @@
 void od_bin_idst16(od_coeff *x, int xstride, const od_coeff y[16]);
 void od_bin_fdct32(od_coeff y[32], const od_coeff *x, int xstride);
 void od_bin_idct32(od_coeff *x, int xstride, const od_coeff y[32]);
-
+#if CONFIG_TX64X64
+void od_bin_fdct64(od_coeff y[64], const od_coeff *x, int xstride);
+void od_bin_idct64(od_coeff *x, int xstride, const od_coeff y[64]);
+#endif
 #endif
diff --git a/av1/common/idct.c b/av1/common/idct.c
index f5f6593..13596f2 100644
--- a/av1/common/idct.c
+++ b/av1/common/idct.c
@@ -81,8 +81,13 @@
 #if CONFIG_TX64X64
 static void iidtx64_c(const tran_low_t *input, tran_low_t *output) {
   int i;
-  for (i = 0; i < 64; ++i)
+  for (i = 0; i < 64; ++i) {
+#if CONFIG_DAALA_DCT64
+    output[i] = input[i];
+#else
     output[i] = (tran_low_t)dct_const_round_shift(input[i] * 4 * Sqrt2);
+#endif
+  }
 }
 #endif  // CONFIG_TX64X64
 #endif  // CONFIG_EXT_TX
@@ -118,6 +123,29 @@
 #endif
 
 #if CONFIG_TX64X64
+#if CONFIG_DAALA_DCT64
+static void idct64_col_c(const tran_low_t *input, tran_low_t *output) {
+  aom_idct64_c(input, output);
+}
+
+static void idct64_row_c(const tran_low_t *input, tran_low_t *output) {
+  aom_idct64_c(input, output);
+}
+
+static void ihalfright64_c(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  tran_low_t inputhalf[32];
+  // No scaling within; Daala transforms are all orthonormal
+  for (i = 0; i < 32; ++i) {
+    inputhalf[i] = input[i];
+  }
+  for (i = 0; i < 32; ++i) {
+    output[i] = input[32 + i];
+  }
+  aom_idct32_c(inputhalf, output + 32);
+}
+
+#else
 static void idct64_col_c(const tran_low_t *input, tran_low_t *output) {
   int32_t in[64], out[64];
   int i;
@@ -148,6 +176,7 @@
   aom_idct32_c(inputhalf, output + 32);
   // Note overall scaling factor is 4 * sqrt(2)  times orthogonal
 }
+#endif  // CONFIG_DAALA_DCT64
 #endif  // CONFIG_TX64X64
 
 // Inverse identity transform and add.
@@ -1416,8 +1445,15 @@
 
   // inverse transform row vectors
   for (i = 0; i < 64; ++i) {
+#if CONFIG_DAALA_DCT64
+    tran_low_t temp_in[64];
+    for (j = 0; j < 64; j++) temp_in[j] = input[j] * 2;
+    IHT_64[tx_type].rows(temp_in, out[i]);
+// Do not rescale intermediate for Daala
+#else
     IHT_64[tx_type].rows(input, out[i]);
     for (j = 0; j < 64; ++j) out[i][j] = ROUND_POWER_OF_TWO(out[i][j], 1);
+#endif
     input += 64;
   }
 
@@ -1440,7 +1476,11 @@
     for (j = 0; j < 64; ++j) {
       int d = i * stride + j;
       int s = j * outstride + i;
+#if CONFIG_DAALA_DCT64
+      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 2));
+#else
       dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
+#endif
     }
   }
 }
@@ -1575,13 +1615,13 @@
 }
 #endif
 
-#if CONFIG_TX64X64
+#if CONFIG_TX64X64 && !CONFIG_DAALA_DCT64
 static void idct64x64_add(const tran_low_t *input, uint8_t *dest, int stride,
                           const TxfmParam *txfm_param) {
   (void)txfm_param;
   av1_iht64x64_4096_add(input, dest, stride, txfm_param);
 }
-#endif  // CONFIG_TX64X64
+#endif  // CONFIG_TX64X64 && !CONFIG_DAALA_DCT64
 
 #if CONFIG_CHROMA_2X2
 static void inv_txfm_add_2x2(const tran_low_t *input, uint8_t *dest, int stride,
@@ -1875,7 +1915,11 @@
                                int stride, const TxfmParam *txfm_param) {
   const TX_TYPE tx_type = txfm_param->tx_type;
   switch (tx_type) {
+#if !CONFIG_DAALA_DCT64
     case DCT_DCT: idct64x64_add(input, dest, stride, txfm_param); break;
+#else
+    case DCT_DCT:
+#endif
 #if CONFIG_EXT_TX
     case ADST_DCT:
     case DCT_ADST:
diff --git a/av1/encoder/dct.c b/av1/encoder/dct.c
index bc5d894..0bb4798 100644
--- a/av1/encoder/dct.c
+++ b/av1/encoder/dct.c
@@ -22,7 +22,7 @@
 #include "av1/common/av1_fwd_txfm1d_cfg.h"
 #include "av1/common/idct.h"
 #if CONFIG_DAALA_DCT4 || CONFIG_DAALA_DCT8 || CONFIG_DAALA_DCT16 || \
-    CONFIG_DAALA_DCT32
+    CONFIG_DAALA_DCT32 || CONFIG_DAALA_DCT64
 #include "av1/common/daala_tx.h"
 #endif
 
@@ -782,6 +782,16 @@
 #endif
 
 #ifndef AV1_DCT_GTEST
+#if CONFIG_TX64X64 && CONFIG_DAALA_DCT64
+static void fdct64(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  od_coeff x[64];
+  od_coeff y[64];
+  for (i = 0; i < 64; i++) x[i] = (od_coeff)input[i];
+  od_bin_fdct64(y, x, 1);
+  for (i = 0; i < 64; i++) output[i] = (tran_low_t)y[i];
+}
+#endif
 
 static void fadst4(const tran_low_t *input, tran_low_t *output) {
   tran_high_t x0, x1, x2, x3;
@@ -2530,6 +2540,37 @@
 }
 
 #if CONFIG_TX64X64
+#if CONFIG_DAALA_DCT64
+#if CONFIG_EXT_TX
+static void fidtx64(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  for (i = 0; i < 64; ++i) output[i] = input[i];
+}
+
+// For use in lieu of ADST
+static void fhalfright64(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  tran_low_t inputhalf[32];
+  // No scaling within; Daala transforms are all orthonormal
+  for (i = 0; i < 32; ++i) {
+    output[32 + i] = input[i];
+  }
+  for (i = 0; i < 32; ++i) {
+    inputhalf[i] = input[i + 32];
+  }
+  fdct32(inputhalf, output);
+  // Note overall scaling factor is 2 times unitary
+}
+#endif  // CONFIG_EXT_TX
+
+static void fdct64_col(const tran_low_t *input, tran_low_t *output) {
+  fdct64(input, output);
+}
+
+static void fdct64_row(const tran_low_t *input, tran_low_t *output) {
+  fdct64(input, output);
+}
+#else
 #if CONFIG_EXT_TX
 static void fidtx64(const tran_low_t *input, tran_low_t *output) {
   int i;
@@ -2568,6 +2609,7 @@
   av1_fdct64_new(in, out, fwd_cos_bit_row_dct_64, fwd_stage_range_row_dct_64);
   for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
 }
+#endif
 
 void av1_fht64x64_c(const int16_t *input, tran_low_t *output, int stride,
                     TxfmParam *txfm_param) {
@@ -2609,10 +2651,18 @@
 
   // Columns
   for (i = 0; i < 64; ++i) {
+#if CONFIG_DAALA_DCT64
+    for (j = 0; j < 64; ++j) temp_in[j] = input[j * stride + i] * 16;
+    ht.cols(temp_in, temp_out);
+    for (j = 0; j < 64; ++j)
+      out[j * 64 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 3;
+
+#else
     for (j = 0; j < 64; ++j) temp_in[j] = input[j * stride + i];
     ht.cols(temp_in, temp_out);
     for (j = 0; j < 64; ++j)
       out[j * 64 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+#endif
   }
 
   // Rows
@@ -2620,8 +2670,12 @@
     for (j = 0; j < 64; ++j) temp_in[j] = out[j + i * 64];
     ht.rows(temp_in, temp_out);
     for (j = 0; j < 64; ++j)
+#if CONFIG_DAALA_DCT64
+      output[j + i * 64] = temp_out[j];
+#else
       output[j + i * 64] =
           (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
+#endif
   }
 }
 #endif  // CONFIG_TX64X64
diff --git a/build/cmake/aom_config_defaults.cmake b/build/cmake/aom_config_defaults.cmake
index 5f6873f..66a5918 100644
--- a/build/cmake/aom_config_defaults.cmake
+++ b/build/cmake/aom_config_defaults.cmake
@@ -127,6 +127,7 @@
 set(CONFIG_DAALA_DCT16 0 CACHE NUMBER "AV1 experiment flag.")
 set(CONFIG_DAALA_DCT32 0 CACHE NUMBER "AV1 experiment flag.")
 set(CONFIG_DAALA_DCT4 0 CACHE NUMBER "AV1 experiment flag.")
+set(CONFIG_DAALA_DCT64 0 CACHE NUMBER "AV1 experiment flag.")
 set(CONFIG_DAALA_DCT8 0 CACHE NUMBER "AV1 experiment flag.")
 set(CONFIG_DAALA_DIST 0 CACHE NUMBER "AV1 experiment flag.")
 set(CONFIG_DCT_ONLY 0 CACHE NUMBER "AV1 experiment flag.")
diff --git a/build/cmake/aom_configure.cmake b/build/cmake/aom_configure.cmake
index 8c9993b..df4252f 100644
--- a/build/cmake/aom_configure.cmake
+++ b/build/cmake/aom_configure.cmake
@@ -247,8 +247,16 @@
   change_config_and_warn(CONFIG_DCT_ONLY 1 CONFIG_DAALA_DCT4)
 endif()
 
+if (CONFIG_DAALA_DCT64)
+  if (NOT CONFIG_TX64X64)
+     message(WARNING
+       "--- Enabled CONFIG_TX64X64, needed for CONFIG_DAALA_DCT64.")
+     set(CONFIG_TX64X64 1)
+   endif()
+endif()
+
 if (CONFIG_DAALA_DCT4 OR CONFIG_DAALA_DCT8 OR CONFIG_DAALA_DCT16 OR
-    CONFIG_DAALA_DCT32)
+    CONFIG_DAALA_DCT32 OR CONFIG_DAALA_DCT64)
   if (HAVE_MMX)
     change_config_and_warn(HAVE_MMX 0 CONFIG_DAALA_DCTx)
   endif()
diff --git a/configure b/configure
index 3dbd9d5..0093007 100755
--- a/configure
+++ b/configure
@@ -295,6 +295,7 @@
     daala_dct8
     daala_dct16
     daala_dct32
+    daala_dct64
     cb4x4
     chroma_2x2
     chroma_sub8x8
@@ -576,10 +577,14 @@
     if enabled daala_dct4; then
       enable_feature dct_only
     fi
+    if enabled daala_dct64; then
+      enable_feature tx64x64
+    fi
     if enabled daala_dct4 ||
 	    enabled daala_dct8 ||
 	    enabled daala_dct16 ||
-	    enabled daala_dct32; then
+	    enabled daala_dct32 ||
+	    enabled daala_dct64; then
       disable_feature mmx
       disable_feature rect_tx
       disable_feature var_tx