Add new 32-point Type-IV DST to daala_tx.

Change-Id: I75ce79db34789151cc4cc9183a550f59e1357bfe
diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index a43c103..8d81604 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -948,6 +948,16 @@
       return EXT_TX_SET_MRC_DCT;
   }
 #endif  // CONFIG_MRC_TX
+#if CONFIG_DAALA_TX32
+  if (tx_size_sqr_up > TX_32X32)
+    return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DCTONLY;
+  if (is_inter)
+    return (tx_size_sqr >= TX_16X16 ? EXT_TX_SET_DTT9_IDTX_1DDCT
+                                    : EXT_TX_SET_ALL16);
+  else
+    return (tx_size_sqr >= TX_16X16 ? EXT_TX_SET_DTT4_IDTX
+                                    : EXT_TX_SET_DTT4_IDTX_1DDCT);
+#endif
   if (tx_size_sqr_up == TX_32X32)
     return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DCTONLY;
   if (is_inter)
diff --git a/av1/common/daala_tx.c b/av1/common/daala_tx.c
index c46a16f..eb99a90 100644
--- a/av1/common/daala_tx.c
+++ b/av1/common/daala_tx.c
@@ -1788,6 +1788,640 @@
   } \
   while (0)
 
+/* Embedded 32-point orthonormal Type-IV fDST; updates its 32 args in place. */
+#define OD_FDST_32(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, ta, tb, tc, td, \
+  te, tf, tg, th, ti, tj, tk, tl, tm, tn, to, tp, tq, tr, ts, tt, tu, tv) \
+  /* 117 "muls", 117 + 128 = 245 adds, 36 shifts */ \
+  do { \
+    od_coeff t0h; /* Each "xh" caches OD_DCT_RSHIFT(x, 1) between stages. */ \
+    od_coeff t1h; \
+    od_coeff t2h; \
+    od_coeff t3h; \
+    od_coeff t4h; \
+    od_coeff t6h; \
+    od_coeff t8h; \
+    od_coeff t9h; \
+    od_coeff tah; \
+    od_coeff tbh; \
+    od_coeff tch; \
+    od_coeff tdh; \
+    od_coeff teh; \
+    od_coeff tfh; \
+    od_coeff tgh; \
+    od_coeff thh; \
+    od_coeff tih; \
+    od_coeff tjh; \
+    od_coeff tkh; \
+    od_coeff tlh; \
+    od_coeff tmh; \
+    od_coeff tnh; \
+    od_coeff tph; \
+    od_coeff trh; \
+    od_coeff tsh; \
+    od_coeff tth; \
+    od_coeff tuh; \
+    od_coeff tvh; \
+    /* Stage 0: 16 three-step lifting sequences, one per input pair. */ \
+    tp += (t6*659 + 2048) >> 12; \
+    t6 -= (tp*10279 + 16384) >> 15; \
+    tp += (t6*659 + 2048) >> 12; \
+    th += (te*3045 + 4096) >> 13; \
+    te -= (th*21403 + 16384) >> 15; \
+    th += (te*3045 + 4096) >> 13; \
+    t9 += (tm*20191 + 16384) >> 15; \
+    tm -= (t9*29269 + 16384) >> 15; \
+    t9 += (tm*20191 + 16384) >> 15; \
+    tu += (t1*1207 + 16384) >> 15; \
+    t1 -= (tu*2411 + 16384) >> 15; \
+    tu += (t1*1207 + 16384) >> 15; \
+    t4 += (tr*13113 + 8192) >> 14; \
+    tr -= (t4*7993 + 4096) >> 13; \
+    t4 += (tr*13113 + 8192) >> 14; \
+    tj += (tc*10381 + 16384) >> 15; \
+    tc -= (tj*4717 + 4096) >> 13; \
+    tj += (tc*10381 + 16384) >> 15; \
+    tb += (tk*18035 + 16384) >> 15; \
+    tk -= (tb*6921 + 4096) >> 13; \
+    tb += (tk*18035 + 16384) >> 15; \
+    ts += (t3*1411 + 8192) >> 14; \
+    t3 -= (ts*2801 + 8192) >> 14; \
+    ts += (t3*1411 + 8192) >> 14; \
+    tq += (t5*2225 + 8192) >> 14; \
+    t5 -= (tq*2185 + 4096) >> 13; \
+    tq += (t5*2225 + 8192) >> 14; \
+    ti += (td*11273 + 16384) >> 15; \
+    td -= (ti*315 + 256) >> 9; \
+    ti += (td*11273 + 16384) >> 15; \
+    tl += (ta*8637 + 16384) >> 15; \
+    ta -= (tl*16151 + 16384) >> 15; \
+    tl += (ta*8637 + 16384) >> 15; \
+    tt += (t2*2013 + 16384) >> 15; \
+    t2 -= (tt*4011 + 16384) >> 15; \
+    tt += (t2*2013 + 16384) >> 15; \
+    to += (t7*6101 + 16384) >> 15; \
+    t7 -= (to*11793 + 16384) >> 15; \
+    to += (t7*6101 + 16384) >> 15; \
+    t8 += (tn*10659 + 8192) >> 14; \
+    tn -= (t8*29957 + 16384) >> 15; \
+    t8 += (tn*10659 + 8192) >> 14; \
+    tg += (tf*819 + 1024) >> 11; \
+    tf -= (tg*22595 + 16384) >> 15; \
+    tg += (tf*819 + 1024) >> 11; \
+    t0 += (tv*31973 + 16384) >> 15; \
+    tv -= (t0*16379 + 8192) >> 14; \
+    t0 += (tv*31973 + 16384) >> 15; \
+    /* Stage 1: 16 add/subtract butterflies; each caches a half for Stage 2. */ \
+    tj -= ts; \
+    tjh = OD_DCT_RSHIFT(tj, 1); \
+    ts += tjh; \
+    tr = tk - tr; \
+    trh = OD_DCT_RSHIFT(tr, 1); \
+    tk = trh - tk; \
+    tc += t3; \
+    tch = OD_DCT_RSHIFT(tc, 1); \
+    t3 -= tch; \
+    t4 += tb; \
+    t4h = OD_DCT_RSHIFT(t4, 1); \
+    tb -= t4h; \
+    tv += tf; \
+    tvh = OD_DCT_RSHIFT(tv, 1); \
+    tf -= tvh; \
+    t8 -= to; \
+    t8h = OD_DCT_RSHIFT(t8, 1); \
+    to += t8h; \
+    t0 += tg; \
+    t0h = OD_DCT_RSHIFT(t0, 1); \
+    tg -= t0h; \
+    tn = t7 - tn; \
+    tnh = OD_DCT_RSHIFT(tn, 1); \
+    t7 -= tnh; \
+    th -= tu; \
+    thh = OD_DCT_RSHIFT(th, 1); \
+    tu += thh; \
+    t6 += tm; \
+    t6h = OD_DCT_RSHIFT(t6, 1); \
+    tm = t6h - tm; \
+    te += t1; \
+    teh = OD_DCT_RSHIFT(te, 1); \
+    t1 -= teh; \
+    tp += t9; \
+    tph = OD_DCT_RSHIFT(tp, 1); \
+    t9 -= tph; \
+    t2 -= td; \
+    t2h = OD_DCT_RSHIFT(t2, 1); \
+    td += t2h; \
+    tl = tq - tl; \
+    tlh = OD_DCT_RSHIFT(tl, 1); \
+    tq -= tlh; \
+    tt += ti; \
+    tth = OD_DCT_RSHIFT(tt, 1); \
+    ti -= tth; \
+    ta += t5; \
+    tah = OD_DCT_RSHIFT(ta, 1); \
+    t5 -= tah; \
+    /* Stage 2: 16 butterflies that consume the halves cached in Stage 1. */ \
+    tm -= thh; \
+    th += tm; \
+    t9 = teh - t9; \
+    te -= t9; \
+    td = tlh - td; \
+    tl -= td; \
+    ti += tah; \
+    ta -= ti; \
+    tk = tjh - tk; \
+    tj -= tk; \
+    tb -= tch; \
+    tc += tb; \
+    tg += tnh; \
+    tn = tg - tn; \
+    tf += t8h; \
+    t8 = tf - t8; \
+    t3 -= trh; \
+    tr += t3; \
+    ts += t4h; \
+    t4 -= ts; \
+    to -= t0h; \
+    t0 += to; \
+    t7 = tvh - t7; \
+    tv = t7 - tv; \
+    t1 -= t6h; \
+    t6 += t1; \
+    tu += tph; \
+    tp -= tu; \
+    tq -= tth; \
+    tt += tq; \
+    t5 += t2h; \
+    t2 -= t5; \
+    /* Stage 3: 8 three-step lifting sequences over t8..tn. */ \
+    tj += (tc*11725 + 16384) >> 15; \
+    tc -= (tj*5197 + 4096) >> 13; \
+    tj += (tc*11725 + 16384) >> 15; \
+    td += (ti*513 + 1024) >> 11; \
+    ti -= (td*15447 + 16384) >> 15; \
+    td += (ti*513 + 1024) >> 11; \
+    th += (te*4861 + 16384) >> 15; \
+    te -= (th*1189 + 2048) >> 12; \
+    th += (te*4861 + 16384) >> 15; \
+    tg += (tf*805 + 8192) >> 14; \
+    tf -= (tg*803 + 4096) >> 13; \
+    tg += (tf*805 + 8192) >> 14; \
+    tb += (tk*7749 + 8192) >> 14; \
+    tk -= (tb*12665 + 8192) >> 14; \
+    tb += (tk*7749 + 8192) >> 14; \
+    tl += (ta*2455 + 2048) >> 12; \
+    ta -= (tl*28899 + 16384) >> 15; \
+    tl += (ta*2455 + 2048) >> 12; \
+    t9 += (tm*12151 + 8192) >> 14; \
+    tm -= (t9*31357 + 16384) >> 15; \
+    t9 += (tm*12151 + 8192) >> 14; \
+    tn += (t8*29699 + 16384) >> 15; \
+    t8 -= (tn*16305 + 8192) >> 14; \
+    tn += (t8*29699 + 16384) >> 15; \
+    /* Stage 4: 16 butterflies; the first 12 cache a half for Stage 6. */ \
+    tf -= tc; \
+    tfh = OD_DCT_RSHIFT(tf, 1); \
+    tc += tfh; \
+    ti = th - ti; \
+    tih = OD_DCT_RSHIFT(ti, 1); \
+    th -= tih; \
+    tg += tj; \
+    tgh = OD_DCT_RSHIFT(tg, 1); \
+    tj = tgh - tj; \
+    td -= te; \
+    tdh = OD_DCT_RSHIFT(td, 1); \
+    te += tdh; \
+    tm = ta - tm; \
+    tmh = OD_DCT_RSHIFT(tm, 1); \
+    ta = tmh - ta; \
+    t9 += tl; \
+    t9h = OD_DCT_RSHIFT(t9, 1); \
+    tl -= t9h; \
+    tb += t8; \
+    tbh = OD_DCT_RSHIFT(tb, 1); \
+    t8 -= tbh; \
+    tk += tn; \
+    tkh = OD_DCT_RSHIFT(tk, 1); \
+    tn -= tkh; \
+    t1 -= t2; \
+    t1h = OD_DCT_RSHIFT(t1, 1); \
+    t2 += t1h; \
+    t3 += tv; \
+    t3h = OD_DCT_RSHIFT(t3, 1); \
+    tv -= t3h; \
+    tu += tt; \
+    tuh = OD_DCT_RSHIFT(tu, 1); \
+    tt -= tuh; \
+    ts -= t0; \
+    tsh = OD_DCT_RSHIFT(ts, 1); \
+    t0 += tsh; \
+    tq = t6 - tq; \
+    t6 -= OD_DCT_RSHIFT(tq, 1); \
+    to += tr; \
+    tr = OD_DCT_RSHIFT(to, 1) - tr; \
+    t7 = t4 - t7; \
+    t4 -= OD_DCT_RSHIFT(t7, 1); \
+    t5 -= tp; \
+    tp += OD_DCT_RSHIFT(t5, 1); \
+    /* Stage 5: 4 three-step lifting sequences over t4..t7 and to..tr. */ \
+    tp += (t6*2485 + 4096) >> 13; \
+    t6 -= (tp*18205 + 16384) >> 15; \
+    tp += (t6*2485 + 4096) >> 13; \
+    to += (t7*3227 + 16384) >> 15; \
+    t7 -= (to*6393 + 16384) >> 15; \
+    to += (t7*3227 + 16384) >> 15; \
+    tq += (t5*17515 + 16384) >> 15; \
+    t5 -= (tq*13623 + 8192) >> 14; \
+    tq += (t5*17515 + 16384) >> 15; \
+    t4 += (tr*6723 + 4096) >> 13; \
+    tr -= (t4*16069 + 8192) >> 14; \
+    t4 += (tr*6723 + 4096) >> 13; \
+    /* Stage 6: 16 butterflies; the first 12 reuse the halves from Stage 4. */ \
+    tj += tdh; \
+    td -= tj; \
+    tc -= tih; \
+    ti += tc; \
+    th = tgh - th; \
+    tg -= th; \
+    te += tfh; \
+    tf -= te; \
+    tl = tkh - tl; \
+    tk -= tl; \
+    ta += tbh; \
+    tb -= ta; \
+    tn -= tmh; \
+    tm += tn; \
+    t8 += t9h; \
+    t9 = t8 - t9; \
+    tt = t3h - tt; \
+    t3 -= tt; \
+    t2 -= tsh; \
+    ts += t2; \
+    tv -= t1h; \
+    t1 += tv; \
+    t0 += tuh; \
+    tu -= t0; \
+    tp = OD_DCT_RSHIFT(to, 1) - tp; \
+    to -= tp; \
+    t6 += OD_DCT_RSHIFT(t7, 1); \
+    t7 -= t6; \
+    t4 = OD_DCT_RSHIFT(tq, 1) - t4; \
+    tq -= t4; \
+    tr += OD_DCT_RSHIFT(t5, 1); \
+    t5 = tr - t5; \
+    /* Stage 7: 11 three-step lifting sequences. */ \
+    td += (ti*21894 + 16384) >> 15; \
+    ti -= (td*15137 + 8192) >> 14; \
+    td += (ti*21895 + 16384) >> 15; \
+    tj += (tc*21894 + 16384) >> 15; \
+    tc -= (tj*15137 + 8192) >> 14; \
+    tj += (tc*21895 + 16384) >> 15; \
+    th += (te*13573 + 16384) >> 15; \
+    te -= (th*11585 + 8192) >> 14; \
+    th += (te*13573 + 16384) >> 15; \
+    tb += (tk*21894 + 16384) >> 15; \
+    tk -= (tb*15137 + 8192) >> 14; \
+    tb += (tk*21895 + 16384) >> 15; \
+    ta += (tl*3259 + 8192) >> 14; \
+    tl -= (ta*3135 + 4096) >> 13; \
+    ta += (tl*3259 + 8192) >> 14; \
+    t9 += (tm*13573 + 16384) >> 15; \
+    tm -= (t9*11585 + 8192) >> 14; \
+    t9 += (tm*13573 + 16384) >> 15; \
+    ts += (t3*3259 + 8192) >> 14; \
+    t3 -= (ts*3135 + 4096) >> 13; \
+    ts += (t3*3259 + 8192) >> 14; \
+    t2 += (tt*3259 + 8192) >> 14; \
+    tt -= (t2*3135 + 4096) >> 13; \
+    t2 += (tt*3259 + 8192) >> 14; \
+    tu += (t1*13573 + 16384) >> 15; \
+    t1 -= (tu*11585 + 8192) >> 14; \
+    tu += (t1*13573 + 16384) >> 15; \
+    tp += (t6*13573 + 16384) >> 15; \
+    t6 -= (tp*11585 + 8192) >> 14; \
+    tp += (t6*13573 + 16384) >> 15; \
+    tq += (t5*13573 + 16384) >> 15; \
+    t5 -= (tq*11585 + 8192) >> 14; \
+    tq += (t5*13573 + 16384) >> 15; \
+  } \
+  while (0)
+
+/* Embedded 32-point orthonormal Type-IV iDST; runs OD_FDST_32 in reverse. */
+#define OD_IDST_32(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, t6, tm, \
+  te, tu, t1, th, t9, tp, t5, tl, td, tt, t3, tj, tb, tr, t7, tn, tf, tv) \
+  /* 117 "muls", 117 + 128 = 245 adds, 36 shifts */ \
+  do { \
+    od_coeff t0h; /* Each "xh" caches OD_DCT_RSHIFT(x, 1) between stages. */ \
+    od_coeff t1h; \
+    od_coeff t2h; \
+    od_coeff t3h; \
+    od_coeff t4h; \
+    od_coeff t6h; \
+    od_coeff t8h; \
+    od_coeff t9h; \
+    od_coeff tah; \
+    od_coeff tbh; \
+    od_coeff tch; \
+    od_coeff tdh; \
+    od_coeff teh; \
+    od_coeff tfh; \
+    od_coeff tgh; \
+    od_coeff thh; \
+    od_coeff tih; \
+    od_coeff tjh; \
+    od_coeff tkh; \
+    od_coeff tlh; \
+    od_coeff tmh; \
+    od_coeff tnh; \
+    od_coeff tph; \
+    od_coeff trh; \
+    od_coeff tsh; \
+    od_coeff tth; \
+    od_coeff tuh; \
+    od_coeff tvh; \
+    /* Stage 0: undoes OD_FDST_32 Stage 7 (steps reversed, signs flipped). */ \
+    tq -= (t5*13573 + 16384) >> 15; \
+    t5 += (tq*11585 + 8192) >> 14; \
+    tq -= (t5*13573 + 16384) >> 15; \
+    tp -= (t6*13573 + 16384) >> 15; \
+    t6 += (tp*11585 + 8192) >> 14; \
+    tp -= (t6*13573 + 16384) >> 15; \
+    tu -= (t1*13573 + 16384) >> 15; \
+    t1 += (tu*11585 + 8192) >> 14; \
+    tu -= (t1*13573 + 16384) >> 15; \
+    t2 -= (tt*3259 + 8192) >> 14; \
+    tt += (t2*3135 + 4096) >> 13; \
+    t2 -= (tt*3259 + 8192) >> 14; \
+    ts -= (t3*3259 + 8192) >> 14; \
+    t3 += (ts*3135 + 4096) >> 13; \
+    ts -= (t3*3259 + 8192) >> 14; \
+    t9 -= (tm*13573 + 16384) >> 15; \
+    tm += (t9*11585 + 8192) >> 14; \
+    t9 -= (tm*13573 + 16384) >> 15; \
+    ta -= (tl*3259 + 8192) >> 14; \
+    tl += (ta*3135 + 4096) >> 13; \
+    ta -= (tl*3259 + 8192) >> 14; \
+    tb -= (tk*21895 + 16384) >> 15; \
+    tk += (tb*15137 + 8192) >> 14; \
+    tb -= (tk*21894 + 16384) >> 15; \
+    th -= (te*13573 + 16384) >> 15; \
+    te += (th*11585 + 8192) >> 14; \
+    th -= (te*13573 + 16384) >> 15; \
+    tj -= (tc*21895 + 16384) >> 15; \
+    tc += (tj*15137 + 8192) >> 14; \
+    tj -= (tc*21894 + 16384) >> 15; \
+    td -= (ti*21895 + 16384) >> 15; \
+    ti += (td*15137 + 8192) >> 14; \
+    td -= (ti*21894 + 16384) >> 15; \
+    /* Stage 1: undoes OD_FDST_32 Stage 6; re-derives the halves it used. */ \
+    t5 = tr - t5; \
+    tr -= OD_DCT_RSHIFT(t5, 1); \
+    tq += t4; \
+    t4 = OD_DCT_RSHIFT(tq, 1) - t4; \
+    t7 += t6; \
+    t6 -= OD_DCT_RSHIFT(t7, 1); \
+    to += tp; \
+    tp = OD_DCT_RSHIFT(to, 1) - tp; \
+    tu += t0; \
+    tuh = OD_DCT_RSHIFT(tu, 1); \
+    t0 -= tuh; \
+    t1 -= tv; \
+    t1h = OD_DCT_RSHIFT(t1, 1); \
+    tv += t1h; \
+    ts -= t2; \
+    tsh = OD_DCT_RSHIFT(ts, 1); \
+    t2 += tsh; \
+    t3 += tt; \
+    t3h = OD_DCT_RSHIFT(t3, 1); \
+    tt = t3h - tt; \
+    t9 = t8 - t9; \
+    t9h = OD_DCT_RSHIFT(t9, 1); \
+    t8 -= t9h; \
+    tm -= tn; \
+    tmh = OD_DCT_RSHIFT(tm, 1); \
+    tn += tmh; \
+    tb += ta; \
+    tbh = OD_DCT_RSHIFT(tb, 1); \
+    ta -= tbh; \
+    tk += tl; \
+    tkh = OD_DCT_RSHIFT(tk, 1); \
+    tl = tkh - tl; \
+    tf += te; \
+    tfh = OD_DCT_RSHIFT(tf, 1); \
+    te -= tfh; \
+    tg += th; \
+    tgh = OD_DCT_RSHIFT(tg, 1); \
+    th = tgh - th; \
+    ti -= tc; \
+    tih = OD_DCT_RSHIFT(ti, 1); \
+    tc += tih; \
+    td += tj; \
+    tdh = OD_DCT_RSHIFT(td, 1); \
+    tj -= tdh; \
+    /* Stage 2: undoes OD_FDST_32 Stage 5 (4 lifting sequences). */ \
+    t4 -= (tr*6723 + 4096) >> 13; \
+    tr += (t4*16069 + 8192) >> 14; \
+    t4 -= (tr*6723 + 4096) >> 13; \
+    tq -= (t5*17515 + 16384) >> 15; \
+    t5 += (tq*13623 + 8192) >> 14; \
+    tq -= (t5*17515 + 16384) >> 15; \
+    to -= (t7*3227 + 16384) >> 15; \
+    t7 += (to*6393 + 16384) >> 15; \
+    to -= (t7*3227 + 16384) >> 15; \
+    tp -= (t6*2485 + 4096) >> 13; \
+    t6 += (tp*18205 + 16384) >> 15; \
+    tp -= (t6*2485 + 4096) >> 13; \
+    /* Stage 3: undoes OD_FDST_32 Stage 4 using the halves from Stage 1. */ \
+    tp -= OD_DCT_RSHIFT(t5, 1); \
+    t5 += tp; \
+    t4 += OD_DCT_RSHIFT(t7, 1); \
+    t7 = t4 - t7; \
+    tr = OD_DCT_RSHIFT(to, 1) - tr; \
+    to -= tr; \
+    t6 += OD_DCT_RSHIFT(tq, 1); \
+    tq = t6 - tq; \
+    t0 -= tsh; \
+    ts += t0; \
+    tt += tuh; \
+    tu -= tt; \
+    tv += t3h; \
+    t3 -= tv; \
+    t2 -= t1h; \
+    t1 += t2; \
+    tn += tkh; \
+    tk -= tn; \
+    t8 += tbh; \
+    tb -= t8; \
+    tl += t9h; \
+    t9 -= tl; \
+    ta = tmh - ta; \
+    tm = ta - tm; \
+    te -= tdh; \
+    td += te; \
+    tj = tgh - tj; \
+    tg -= tj; \
+    th += tih; \
+    ti = th - ti; \
+    tc -= tfh; \
+    tf += tc; \
+    /* Stage 4: undoes OD_FDST_32 Stage 3 (8 lifting sequences). */ \
+    tn -= (t8*29699 + 16384) >> 15; \
+    t8 += (tn*16305 + 8192) >> 14; \
+    tn -= (t8*29699 + 16384) >> 15; \
+    t9 -= (tm*12151 + 8192) >> 14; \
+    tm += (t9*31357 + 16384) >> 15; \
+    t9 -= (tm*12151 + 8192) >> 14; \
+    tl -= (ta*2455 + 2048) >> 12; \
+    ta += (tl*28899 + 16384) >> 15; \
+    tl -= (ta*2455 + 2048) >> 12; \
+    tb -= (tk*7749 + 8192) >> 14; \
+    tk += (tb*12665 + 8192) >> 14; \
+    tb -= (tk*7749 + 8192) >> 14; \
+    tg -= (tf*805 + 8192) >> 14; \
+    tf += (tg*803 + 4096) >> 13; \
+    tg -= (tf*805 + 8192) >> 14; \
+    th -= (te*4861 + 16384) >> 15; \
+    te += (th*1189 + 2048) >> 12; \
+    th -= (te*4861 + 16384) >> 15; \
+    td -= (ti*513 + 1024) >> 11; \
+    ti += (td*15447 + 16384) >> 15; \
+    td -= (ti*513 + 1024) >> 11; \
+    tj -= (tc*11725 + 16384) >> 15; \
+    tc += (tj*5197 + 4096) >> 13; \
+    tj -= (tc*11725 + 16384) >> 15; \
+    /* Stage 5: undoes OD_FDST_32 Stage 2; re-derives the halves it used. */ \
+    t2 += t5; \
+    t2h = OD_DCT_RSHIFT(t2, 1); \
+    t5 -= t2h; \
+    tt -= tq; \
+    tth = OD_DCT_RSHIFT(tt, 1); \
+    tq += tth; \
+    tp += tu; \
+    tph = OD_DCT_RSHIFT(tp, 1); \
+    tu -= tph; \
+    t6 -= t1; \
+    t6h = OD_DCT_RSHIFT(t6, 1); \
+    t1 += t6h; \
+    tv = t7 - tv; \
+    tvh = OD_DCT_RSHIFT(tv, 1); \
+    t7 = tvh - t7; \
+    t0 -= to; \
+    t0h = OD_DCT_RSHIFT(t0, 1); \
+    to += t0h; \
+    t4 += ts; \
+    t4h = OD_DCT_RSHIFT(t4, 1); \
+    ts -= t4h; \
+    tr -= t3; \
+    trh = OD_DCT_RSHIFT(tr, 1); \
+    t3 += trh; \
+    t8 = tf - t8; \
+    t8h = OD_DCT_RSHIFT(t8, 1); \
+    tf -= t8h; \
+    tn = tg - tn; \
+    tnh = OD_DCT_RSHIFT(tn, 1); \
+    tg -= tnh; \
+    tc -= tb; \
+    tch = OD_DCT_RSHIFT(tc, 1); \
+    tb += tch; \
+    tj += tk; \
+    tjh = OD_DCT_RSHIFT(tj, 1); \
+    tk = tjh - tk; \
+    ta += ti; \
+    tah = OD_DCT_RSHIFT(ta, 1); \
+    ti -= tah; \
+    tl += td; \
+    tlh = OD_DCT_RSHIFT(tl, 1); \
+    td = tlh - td; \
+    te += t9; \
+    teh = OD_DCT_RSHIFT(te, 1); \
+    t9 = teh - t9; \
+    th -= tm; \
+    thh = OD_DCT_RSHIFT(th, 1); \
+    tm += thh; \
+    /* Stage 6: undoes OD_FDST_32 Stage 1 using the halves from Stage 5. */ \
+    t5 += tah; \
+    ta -= t5; \
+    ti += tth; \
+    tt -= ti; \
+    tq += tlh; \
+    tl = tq - tl; \
+    td -= t2h; \
+    t2 += td; \
+    t9 += tph; \
+    tp -= t9; \
+    t1 += teh; \
+    te -= t1; \
+    tm = t6h - tm; \
+    t6 -= tm; \
+    tu -= thh; \
+    th += tu; \
+    t7 += tnh; \
+    tn = t7 - tn; \
+    tg += t0h; \
+    t0 -= tg; \
+    to -= t8h; \
+    t8 += to; \
+    tf += tvh; \
+    tv -= tf; \
+    tb += t4h; \
+    t4 -= tb; \
+    t3 += tch; \
+    tc -= t3; \
+    tk = trh - tk; \
+    tr = tk - tr; \
+    ts -= tjh; \
+    tj += ts; \
+    /* Stage 7: undoes OD_FDST_32 Stage 0 (16 lifting sequences). */ \
+    t0 -= (tv*31973 + 16384) >> 15; \
+    tv += (t0*16379 + 8192) >> 14; \
+    t0 -= (tv*31973 + 16384) >> 15; \
+    tg -= (tf*819 + 1024) >> 11; \
+    tf += (tg*22595 + 16384) >> 15; \
+    tg -= (tf*819 + 1024) >> 11; \
+    t8 -= (tn*10659 + 8192) >> 14; \
+    tn += (t8*29957 + 16384) >> 15; \
+    t8 -= (tn*10659 + 8192) >> 14; \
+    to -= (t7*6101 + 16384) >> 15; \
+    t7 += (to*11793 + 16384) >> 15; \
+    to -= (t7*6101 + 16384) >> 15; \
+    tt -= (t2*2013 + 16384) >> 15; \
+    t2 += (tt*4011 + 16384) >> 15; \
+    tt -= (t2*2013 + 16384) >> 15; \
+    tl -= (ta*8637 + 16384) >> 15; \
+    ta += (tl*16151 + 16384) >> 15; \
+    tl -= (ta*8637 + 16384) >> 15; \
+    ti -= (td*11273 + 16384) >> 15; \
+    td += (ti*315 + 256) >> 9; \
+    ti -= (td*11273 + 16384) >> 15; \
+    tq -= (t5*2225 + 8192) >> 14; \
+    t5 += (tq*2185 + 4096) >> 13; \
+    tq -= (t5*2225 + 8192) >> 14; \
+    ts -= (t3*1411 + 8192) >> 14; \
+    t3 += (ts*2801 + 8192) >> 14; \
+    ts -= (t3*1411 + 8192) >> 14; \
+    tb -= (tk*18035 + 16384) >> 15; \
+    tk += (tb*6921 + 4096) >> 13; \
+    tb -= (tk*18035 + 16384) >> 15; \
+    tj -= (tc*10381 + 16384) >> 15; \
+    tc += (tj*4717 + 4096) >> 13; \
+    tj -= (tc*10381 + 16384) >> 15; \
+    t4 -= (tr*13113 + 8192) >> 14; \
+    tr += (t4*7993 + 4096) >> 13; \
+    t4 -= (tr*13113 + 8192) >> 14; \
+    tu -= (t1*1207 + 16384) >> 15; \
+    t1 += (tu*2411 + 16384) >> 15; \
+    tu -= (t1*1207 + 16384) >> 15; \
+    t9 -= (tm*20191 + 16384) >> 15; \
+    tm += (t9*29269 + 16384) >> 15; \
+    t9 -= (tm*20191 + 16384) >> 15; \
+    th -= (te*3045 + 4096) >> 13; \
+    te += (th*21403 + 16384) >> 15; \
+    th -= (te*3045 + 4096) >> 13; \
+    tp -= (t6*659 + 2048) >> 12; \
+    t6 += (tp*10279 + 16384) >> 15; \
+    tp -= (t6*659 + 2048) >> 12; \
+  } \
+  while (0)
+
 #if CONFIG_TX64X64
 #define OD_FDCT_32_ASYM(t0, tg, tgh, t8, to, toh, t4, tk, tkh, tc, ts, tsh, \
   t2, ti, tih, ta, tq, tqh, t6, tm, tmh, te, tu, tuh, t1, th, thh, \
@@ -3898,6 +4532,208 @@
   x[31*xstride] = (od_coeff)tv;
 }
 
+void od_bin_fdst32(od_coeff y[32], const od_coeff *x, int xstride) {
+  od_coeff t0;  /* t0..tv: 32 working values (suffix 0-9, then a-v). */
+  od_coeff t1;
+  od_coeff t2;
+  od_coeff t3;
+  od_coeff t4;
+  od_coeff t5;
+  od_coeff t6;
+  od_coeff t7;
+  od_coeff t8;
+  od_coeff t9;
+  od_coeff ta;
+  od_coeff tb;
+  od_coeff tc;
+  od_coeff td;
+  od_coeff te;
+  od_coeff tf;
+  od_coeff tg;
+  od_coeff th;
+  od_coeff ti;
+  od_coeff tj;
+  od_coeff tk;
+  od_coeff tl;
+  od_coeff tm;
+  od_coeff tn;
+  od_coeff to;
+  od_coeff tp;
+  od_coeff tq;
+  od_coeff tr;
+  od_coeff ts;
+  od_coeff tt;
+  od_coeff tu;
+  od_coeff tv;
+  t0 = x[0*xstride];  /* Gather 32 inputs spaced xstride elements apart. */
+  t1 = x[1*xstride];
+  t2 = x[2*xstride];
+  t3 = x[3*xstride];
+  t4 = x[4*xstride];
+  t5 = x[5*xstride];
+  t6 = x[6*xstride];
+  t7 = x[7*xstride];
+  t8 = x[8*xstride];
+  t9 = x[9*xstride];
+  ta = x[10*xstride];
+  tb = x[11*xstride];
+  tc = x[12*xstride];
+  td = x[13*xstride];
+  te = x[14*xstride];
+  tf = x[15*xstride];
+  tg = x[16*xstride];
+  th = x[17*xstride];
+  ti = x[18*xstride];
+  tj = x[19*xstride];
+  tk = x[20*xstride];
+  tl = x[21*xstride];
+  tm = x[22*xstride];
+  tn = x[23*xstride];
+  to = x[24*xstride];
+  tp = x[25*xstride];
+  tq = x[26*xstride];
+  tr = x[27*xstride];
+  ts = x[28*xstride];
+  tt = x[29*xstride];
+  tu = x[30*xstride];
+  tv = x[31*xstride];
+  OD_FDST_32(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, ta, tb, tc, td, te, tf,
+    tg, th, ti, tj, tk, tl, tm, tn, to, tp, tq, tr, ts, tt, tu, tv);
+  y[0] = t0;  /* Store contiguously: y[k] takes the bit-reversed index. */
+  y[1] = tg;
+  y[2] = t8;
+  y[3] = to;
+  y[4] = t4;
+  y[5] = tk;
+  y[6] = tc;
+  y[7] = ts;
+  y[8] = t2;
+  y[9] = ti;
+  y[10] = ta;
+  y[11] = tq;
+  y[12] = t6;
+  y[13] = tm;
+  y[14] = te;
+  y[15] = tu;
+  y[16] = t1;
+  y[17] = th;
+  y[18] = t9;
+  y[19] = tp;
+  y[20] = t5;
+  y[21] = tl;
+  y[22] = td;
+  y[23] = tt;
+  y[24] = t3;
+  y[25] = tj;
+  y[26] = tb;
+  y[27] = tr;
+  y[28] = t7;
+  y[29] = tn;
+  y[30] = tf;
+  y[31] = tv;
+}
+
+void od_bin_idst32(od_coeff *x, int xstride, const od_coeff y[32]) {
+  od_coeff t0;  /* t0..tv: 32 working values (suffix 0-9, then a-v). */
+  od_coeff t1;
+  od_coeff t2;
+  od_coeff t3;
+  od_coeff t4;
+  od_coeff t5;
+  od_coeff t6;
+  od_coeff t7;
+  od_coeff t8;
+  od_coeff t9;
+  od_coeff ta;
+  od_coeff tb;
+  od_coeff tc;
+  od_coeff td;
+  od_coeff te;
+  od_coeff tf;
+  od_coeff tg;
+  od_coeff th;
+  od_coeff ti;
+  od_coeff tj;
+  od_coeff tk;
+  od_coeff tl;
+  od_coeff tm;
+  od_coeff tn;
+  od_coeff to;
+  od_coeff tp;
+  od_coeff tq;
+  od_coeff tr;
+  od_coeff ts;
+  od_coeff tt;
+  od_coeff tu;
+  od_coeff tv;
+  t0 = y[0];  /* Load contiguously: y[k] feeds the bit-reversed index. */
+  tg = y[1];
+  t8 = y[2];
+  to = y[3];
+  t4 = y[4];
+  tk = y[5];
+  tc = y[6];
+  ts = y[7];
+  t2 = y[8];
+  ti = y[9];
+  ta = y[10];
+  tq = y[11];
+  t6 = y[12];
+  tm = y[13];
+  te = y[14];
+  tu = y[15];
+  t1 = y[16];
+  th = y[17];
+  t9 = y[18];
+  tp = y[19];
+  t5 = y[20];
+  tl = y[21];
+  td = y[22];
+  tt = y[23];
+  t3 = y[24];
+  tj = y[25];
+  tb = y[26];
+  tr = y[27];
+  t7 = y[28];
+  tn = y[29];
+  tf = y[30];
+  tv = y[31];
+  OD_IDST_32(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, t6, tm, te, tu,
+    t1, th, t9, tp, t5, tl, td, tt, t3, tj, tb, tr, t7, tn, tf, tv);
+  x[0*xstride] = t0;  /* Scatter 32 outputs spaced xstride elements apart. */
+  x[1*xstride] = t1;
+  x[2*xstride] = t2;
+  x[3*xstride] = t3;
+  x[4*xstride] = t4;
+  x[5*xstride] = t5;
+  x[6*xstride] = t6;
+  x[7*xstride] = t7;
+  x[8*xstride] = t8;
+  x[9*xstride] = t9;
+  x[10*xstride] = ta;
+  x[11*xstride] = tb;
+  x[12*xstride] = tc;
+  x[13*xstride] = td;
+  x[14*xstride] = te;
+  x[15*xstride] = tf;
+  x[16*xstride] = tg;
+  x[17*xstride] = th;
+  x[18*xstride] = ti;
+  x[19*xstride] = tj;
+  x[20*xstride] = tk;
+  x[21*xstride] = tl;
+  x[22*xstride] = tm;
+  x[23*xstride] = tn;
+  x[24*xstride] = to;
+  x[25*xstride] = tp;
+  x[26*xstride] = tq;
+  x[27*xstride] = tr;
+  x[28*xstride] = ts;
+  x[29*xstride] = tt;
+  x[30*xstride] = tu;
+  x[31*xstride] = tv;
+}
+
 #if CONFIG_TX64X64
 void od_bin_fdct64(od_coeff y[64], const od_coeff *x, int xstride) {
   int t0;
@@ -4441,30 +5277,22 @@
   for (i = 0; i < 32; i++) output[i] = (tran_low_t)x[i];
 }
 
-/* Preserve the "half-right" transform behavior. */
 void daala_fdst32(const tran_low_t *input, tran_low_t *output) {
   int i;
-  tran_low_t inputhalf[16];
-  for (i = 0; i < 16; ++i) {
-    output[16 + i] = input[i];
-  }
-  for (i = 0; i < 16; ++i) {
-    inputhalf[i] = input[i + 16];
-  }
-  daala_fdct16(inputhalf, output);
+  od_coeff x[32];  /* The 32 input samples, converted to od_coeff. */
+  od_coeff y[32];  /* Coefficients written by od_bin_fdst32(). */
+  for (i = 0; i < 32; i++) x[i] = (od_coeff)input[i];
+  od_bin_fdst32(y, x, 1);  /* x[] is contiguous, so xstride is 1. */
+  for (i = 0; i < 32; i++) output[i] = (tran_low_t)y[i];
 }
 
-/* Preserve the "half-right" transform behavior. */
 void daala_idst32(const tran_low_t *input, tran_low_t *output) {
   int i;
-  tran_low_t inputhalf[16];
-  for (i = 0; i < 16; ++i) {
-    inputhalf[i] = input[i];
-  }
-  for (i = 0; i < 16; ++i) {
-    output[i] = input[16 + i];
-  }
-  daala_idct16(inputhalf, output + 16);
+  od_coeff x[32];  /* Samples written by od_bin_idst32(). */
+  od_coeff y[32];  /* The 32 input coefficients, stored as od_coeff. */
+  for (i = 0; i < 32; i++) y[i] = input[i];
+  od_bin_idst32(x, 1, y);  /* x[] is contiguous, so xstride is 1. */
+  for (i = 0; i < 32; i++) output[i] = (tran_low_t)x[i];
 }
 
 void daala_idtx32(const tran_low_t *input, tran_low_t *output) {