daala_tx: New flattened 16-point Type-IV DST.

Change-Id: Ic741f269d0bd5e5e295b55f95bfef05050bc31e5
diff --git a/av1/common/daala_tx.c b/av1/common/daala_tx.c
index 45b7057..4c4f435 100644
--- a/av1/common/daala_tx.c
+++ b/av1/common/daala_tx.c
@@ -2119,6 +2119,471 @@
   } \
   while (0)
 
+#define OD_FDST_16_FLAT(s0, s8, s4, sc, s2, sa, s6, se, \
+  s1, s9, s5, sd, s3, sb, s7, sf) \
+  /* Embedded 16-point orthonormal Type-IV fDST with flattened rotations. */ \
+  do { /* All sixteen arguments are read and then overwritten in place. */ \
+    int t_; /* Scratch: scaled sum/difference term of the current rotation. */ \
+    int u_; /* Scratch: scaled single input of the current rotation. */ \
+    int s0h; /* OD_RSHIFT1(s0), reused by two later updates. */ \
+    int s4h; /* OD_RSHIFT1(s4), reused by two later updates. */ \
+    int sbh; /* OD_RSHIFT1(sb), reused by two later updates. */ \
+    int sfh; /* OD_RSHIFT1(sf), reused by two later updates. */ \
+    t_ = s1 + se; \
+    /* 32729/32768 ~= (Sin[17*Pi/64] + Cos[17*Pi/64])/Sqrt[2] ~=
+        0.9987954562051723 */ \
+    u_ = (se*32729 + 16384) >> 15; \
+    /* 201/2048 ~= (Sin[17*Pi/64] - Cos[17*Pi/64])*Sqrt[2] ~=
+        0.09813534865483615 */ \
+    se = (s1*201 + 1024) >> 11; \
+    /* 31121/32768 ~= Cos[17*Pi/64]*Sqrt[2] ~= 0.9497277818777543 */ \
+    t_ = (t_*31121 + 16384) >> 15; \
+    se += t_; \
+    s1 = u_ - OD_RSHIFT1(t_); \
+    t_ = s6 - s9; \
+    /* 32413/32768 ~= (Sin[19*Pi/64] + Cos[19*Pi/64])/Sqrt[2] ~=
+        0.9891765099647809 */ \
+    u_ = (s9*32413 + 16384) >> 15; \
+    /* 601/2048 ~= (Sin[19*Pi/64] - Cos[19*Pi/64])*Sqrt[2]
+        ~= 0.29346094891072355 */ \
+    s9 = (s6*601 + 1024) >> 11; \
+    /* 27605/32768 ~= Cos[19*Pi/64]*Sqrt[2] ~= 0.8424460355094193 */ \
+    t_ = (t_*27605 + 16384) >> 15; \
+    s9 += t_; \
+    s6 = u_ + OD_RSHIFT1(t_); \
+    t_ = s5 + sa; \
+    /* 15893/16384 ~= (Sin[21*Pi/64] + Cos[21*Pi/64])/Sqrt[2] ~=
+        0.970031253194544 */ \
+    u_ = (sa*15893 + 8192) >> 14; \
+    /* 3981/8192 ~= (Sin[21*Pi/64] - Cos[21*Pi/64])*Sqrt[2] ~=
+        0.48596035980652796 */ \
+    sa = (s5*3981 + 4096) >> 13; \
+    /* 1489/2048 ~= Cos[21*Pi/64]*Sqrt[2] ~= 0.72705107329128 */ \
+    t_ = (t_*1489 + 1024) >> 11; \
+    sa += t_; \
+    s5 = OD_RSHIFT1(t_) - u_; \
+    t_ = sd - s2; \
+    /* 30853/32768 ~= (Sin[23*Pi/64] + Cos[23*Pi/64])/Sqrt[2] ~=
+        0.9415440651830208 */ \
+    u_ = (sd*30853 + 16384) >> 15; \
+    /* 11039/16384 ~= (Sin[23*Pi/64] - Cos[23*Pi/64])*Sqrt[2] ~=
+        0.6737797067844402 */ \
+    sd = (s2*11039 + 8192) >> 14; \
+    /* 19813/32768 ~= Cos[23*Pi/64]*Sqrt[2] ~= 0.6046542117908008 */ \
+    t_ = (t_*19813 + 16384) >> 15; \
+    sd -= t_; \
+    s2 = OD_RSHIFT1(t_) - u_; \
+    t_ = s3 + sc; \
+    /* 14811/16384 ~= (Sin[25*Pi/64] + Cos[25*Pi/64])/Sqrt[2] ~=
+        0.9039892931234433 */ \
+    u_ = (sc*14811 + 8192) >> 14; \
+    /* 7005/8192 ~= (Sin[25*Pi/64] - Cos[25*Pi/64])*Sqrt[2] ~=
+        0.8551101868605642 */ \
+    sc = (s3*7005 + 4096) >> 13; \
+    /* 3903/8192 ~= Cos[25*Pi/64]*Sqrt[2] ~= 0.47643419969316125 */ \
+    t_ = (t_*3903 + 4096) >> 13; \
+    sc += t_; \
+    s3 = u_ - OD_RSHIFT1(t_); \
+    t_ = sb - s4; \
+    /* 14053/16384 ~= (Sin[27*Pi/64] + Cos[27*Pi/64])/Sqrt[2] ~=
+        0.857728610000272 */ \
+    u_ = (sb*14053 + 8192) >> 14; \
+    /* 8423/8192 ~= (Sin[27*Pi/64] - Cos[27*Pi/64])*Sqrt[2] ~=
+        1.0282054883864435 */ \
+    sb = (s4*8423 + 4096) >> 13; \
+    /* 2815/8192 ~= Cos[27*Pi/64]*Sqrt[2] ~= 0.34362586580705035 */ \
+    t_ = (t_*2815 + 4096) >> 13; \
+    sb -= t_; \
+    s4 = OD_RSHIFT1(t_) - u_; \
+    t_ = s7 + s8; \
+    /* 1645/2048 ~= (Sin[29*Pi/64] + Cos[29*Pi/64])/Sqrt[2] ~=
+        0.8032075314806449 */ \
+    u_ = (s8*1645 + 1024) >> 11; \
+    /* 305/256 ~= (Sin[29*Pi/64] - Cos[29*Pi/64])*Sqrt[2] ~=
+        1.1913986089848667 */ \
+    s8 = (s7*305 + 128) >> 8; \
+    /* 425/2048 ~= Cos[29*Pi/64]*Sqrt[2] ~= 0.20750822698821159 */ \
+    t_ = (t_*425 + 1024) >> 11; \
+    s8 += t_; \
+    s7 = u_ - OD_RSHIFT1(t_); \
+    t_ = s0 - sf; \
+    /* 24279/32768 ~= (Sin[31*Pi/64] + Cos[31*Pi/64])/Sqrt[2] ~=
+        0.7409511253549591 */ \
+    u_ = (sf*24279 + 16384) >> 15; \
+    /* 44011/32768 ~= (Sin[31*Pi/64] - Cos[31*Pi/64])*Sqrt[2] ~=
+        1.3431179096940369 */ \
+    sf = (s0*44011 + 16384) >> 15; \
+    /* 1137/16384 ~= Cos[31*Pi/64]*Sqrt[2] ~= 0.0693921705079406 */ \
+    t_ = (t_*1137 + 8192) >> 14; \
+    s0 = u_ + OD_RSHIFT1(t_); \
+    sf += t_; \
+    s3 -= OD_RSHIFT1(sd); \
+    sd += s3; \
+    s2 += OD_RSHIFT1(sc); \
+    sc -= s2; \
+    s5 -= OD_RSHIFT1(sb); \
+    sb += s5; \
+    s4 -= OD_RSHIFT1(sa); \
+    sa += s4; \
+    s1 += OD_RSHIFT1(sf); \
+    sf -= s1; \
+    s7 -= OD_RSHIFT1(s9); \
+    s9 += s7; \
+    s6 -= OD_RSHIFT1(s8); \
+    s8 += s6; \
+    s0 += OD_RSHIFT1(se); \
+    se -= s0; \
+    sa -= s9; \
+    s9 += OD_RSHIFT1(sa); \
+    s5 += s6; \
+    s6 -= OD_RSHIFT1(s5); \
+    s1 -= s2; \
+    s2 += OD_RSHIFT1(s1); \
+    se += sd; \
+    sd -= OD_RSHIFT1(se); \
+    s0 += sc; \
+    s0h = OD_RSHIFT1(s0); \
+    sc -= s0h; \
+    sf -= s3; \
+    sfh = OD_RSHIFT1(sf); \
+    s3 += sfh; \
+    sb += s7; \
+    sbh = OD_RSHIFT1(sb); \
+    s7 -= sbh; \
+    s4 += s8; \
+    s4h = OD_RSHIFT1(s4); \
+    s8 -= s4h; \
+    t_ = OD_PAVG(s1, se); \
+    /* 9633/8192 ~= Sin[7*Pi/16] + Cos[7*Pi/16] ~= 1.1758756024193586 */ \
+    u_ = (s1*9633 + 4096) >> 13; \
+    /* 12873/16384 ~= Sin[7*Pi/16] - Cos[7*Pi/16] ~= 0.7856949583871022 */ \
+    s1 = (se*12873 + 8192) >> 14; \
+    /* 12785/32768 ~= 2*Cos[7*Pi/16] ~= 0.3901806440322565 */ \
+    t_ = (t_*12785 + 16384) >> 15; \
+    s1 += t_; \
+    se = u_ - t_; \
+    t_ = s6 + s9; \
+    /* 45451/32768 ~= Sin[5*Pi/16] + Cos[5*Pi/16] ~= 1.3870398453221475 */ \
+    u_ = (s9*45451 + 16384) >> 15; \
+    /* 9041/32768 ~= Sin[5*Pi/16] - Cos[5*Pi/16] ~= 0.27589937928294306 */ \
+    s9 = (s6*9041 + 16384) >> 15; \
+    /* 18205/32768 ~= Cos[5*Pi/16] ~= 0.5555702330196022 */ \
+    t_ = (t_*18205 + 16384) >> 15; \
+    s9 += t_; \
+    s6 = u_ - t_; \
+    t_ = OD_PAVG(s5, sa); \
+    /* 22725/16384 ~= Sin[5*Pi/16] + Cos[5*Pi/16] ~= 1.3870398453221475 */ \
+    u_ = (sa*22725 + 8192) >> 14; \
+    /* 9041/32768 ~= Sin[5*Pi/16] - Cos[5*Pi/16] ~= 0.27589937928294306 */ \
+    sa = (s5*9041 + 16384) >> 15; \
+    /* 18205/16384 ~= 2*Cos[5*Pi/16] ~= 1.1111404660392044 */ \
+    t_ = (t_*18205 + 8192) >> 14; \
+    sa += t_; \
+    s5 = t_ - u_; \
+    t_ = s2 + sd; \
+    /* 38531/32768 ~= Sin[7*Pi/16] + Cos[7*Pi/16] ~= 1.1758756024193586 */ \
+    u_ = (s2*38531 + 16384) >> 15; \
+    /* 12873/16384 ~= Sin[7*Pi/16] - Cos[7*Pi/16] ~= 0.7856949583871022 */ \
+    s2 = (sd*12873 + 8192) >> 14; \
+    /* 6393/32768 ~= Cos[7*Pi/16] ~= 0.19509032201612825 */ \
+    t_ = (t_*6393 + 16384) >> 15; \
+    s2 += t_; \
+    sd = u_ - t_; \
+    s3 -= s4h; \
+    s4 += s3; \
+    s8 -= s0h; \
+    s0 += s8; \
+    s7 += sfh; \
+    sf -= s7; \
+    sc += sbh; \
+    sb -= sc; \
+    s6 += OD_RSHIFT1(se); \
+    se -= s6; \
+    s9 -= OD_RSHIFT1(s1); \
+    s1 += s9; \
+    sd -= OD_RSHIFT1(s5); \
+    s5 += sd; \
+    s2 -= OD_RSHIFT1(sa); \
+    sa += s2; \
+    t_ = OD_PAVG(s3, sc); \
+    /* 21407/16384 ~= Sin[3*Pi/8] + Cos[3*Pi/8] ~= 1.3065629648763766 */ \
+    u_ = (s3*21407 + 8192) >> 14; \
+    /* 8867/16384 ~= Sin[3*Pi/8] - Cos[3*Pi/8] ~= 0.5411961001461969 */ \
+    s3 = (sc*8867 + 8192) >> 14; \
+    /* 3135/4096 ~= 2*Cos[3*Pi/8] ~= 0.7653668647301796 */ \
+    t_ = (t_*3135 + 2048) >> 12; \
+    s3 += t_; \
+    sc = u_ - t_; \
+    t_ = OD_PAVG(s4, sb); \
+    /* 21407/16384 ~= Sin[3*Pi/8] + Cos[3*Pi/8] ~= 1.3065629648763766 */ \
+    u_ = (s4*21407 + 8192) >> 14; \
+    /* 8867/16384 ~= Sin[3*Pi/8] - Cos[3*Pi/8] ~= 0.5411961001461969 */ \
+    s4 = (sb*8867 + 8192) >> 14; \
+    /* 3135/4096 ~= 2*Cos[3*Pi/8] ~= 0.7653668647301796 */ \
+    t_ = (t_*3135 + 2048) >> 12; \
+    s4 += t_; \
+    sb = u_ - t_; \
+    t_ = OD_PAVG(s5, sa); \
+    /* 11585/8192 ~= Sin[Pi/4] + Cos[Pi/4] ~= 1.4142135623730951 */ \
+    u_ = (sa*11585 + 4096) >> 13; \
+    /* 11585/8192 ~= 2*Cos[Pi/4] ~= 1.4142135623730951 */ \
+    sa = (t_*11585 + 4096) >> 13; \
+    s5 = sa - u_; \
+    t_ = OD_PAVG(s6, -s9); \
+    /* 11585/8192 ~= Sin[Pi/4] + Cos[Pi/4] ~= 1.4142135623730951 */ \
+    s6 = (s9*11585 + 4096) >> 13; \
+    /* 11585/8192 ~= 2*Cos[Pi/4] ~= 1.4142135623730951 */ \
+    s9 = (t_*11585 + 4096) >> 13; \
+    s6 += s9; \
+    t_ = OD_PAVG(s7, -s8); \
+    /* 11585/8192 ~= Sin[Pi/4] + Cos[Pi/4] ~= 1.4142135623730951 */ \
+    s7 = (s8*11585 + 4096) >> 13; \
+    /* 11585/8192 ~= 2*Cos[Pi/4] ~= 1.4142135623730951 */ \
+    s8 = (t_*11585 + 4096) >> 13; \
+    s7 += s8; \
+  } \
+  while (0)
+
+#define OD_IDST_16_FLAT(s0, s1, s2, s3, s4, s5, s6, s7, \
+  s8, s9, sa, sb, sc, sd, se, sf) \
+  /* Embedded 16-point orthonormal Type-IV iDST with flattened rotations. */ \
+  do { /* All sixteen arguments are read and then overwritten in place. */ \
+    int t_; /* Scratch: scaled sum/difference term of the current rotation. */ \
+    int u_; /* Scratch: scaled single input of the current rotation. */ \
+    int s0h; /* OD_RSHIFT1(s0); assigned twice, reused after each. */ \
+    int s1h; /* OD_RSHIFT1(s1), reused by the final s1/se rotation. */ \
+    int s2h; /* OD_RSHIFT1(s2), reused by the final s2/sd rotation. */ \
+    int s3h; /* OD_RSHIFT1(s3), reused by the final s3/sc rotation. */ \
+    int s4h; /* OD_RSHIFT1(s4); assigned twice, reused after each. */ \
+    int s5h; /* OD_RSHIFT1(s5), reused by the final s5/sa rotation. */ \
+    int s6h; /* OD_RSHIFT1(s6), reused by the final s6/s9 rotation. */ \
+    int s7h; /* OD_RSHIFT1(s7), reused by the final s7/s8 rotation. */ \
+    int sbh; /* OD_RSHIFT1(sb), reused by two later updates. */ \
+    int sfh; /* OD_RSHIFT1(sf), reused by two later updates. */ \
+    t_ = OD_PAVG(s6, s9); \
+    /* 11585/8192 ~= Sin[Pi/4] + Cos[Pi/4] ~= 1.4142135623730951 */ \
+    s9 = (s6*11585 + 4096) >> 13; \
+    /* 11585/8192 ~= 2*Cos[Pi/4] ~= 1.4142135623730951 */ \
+    s6 = (t_*11585 + 4096) >> 13; \
+    s9 -= s6; \
+    t_ = OD_PAVG(s5, sa); \
+    /* 11585/8192 ~= Sin[Pi/4] + Cos[Pi/4] ~= 1.4142135623730951 */ \
+    sa = (s5*11585 + 4096) >> 13; \
+    /* 11585/8192 ~= 2*Cos[Pi/4] ~= 1.4142135623730951 */ \
+    s5 = (t_*11585 + 4096) >> 13; \
+    sa -= s5; \
+    t_ = OD_PAVG(s7, s8); \
+    /* 11585/8192 ~= Sin[Pi/4] + Cos[Pi/4] ~= 1.4142135623730951 */ \
+    s8 = (s7*11585 + 4096) >> 13; \
+    /* 11585/8192 ~= 2*Cos[Pi/4] ~= 1.4142135623730951 */ \
+    s7 = (t_*11585 + 4096) >> 13; \
+    s8 -= s7; \
+    t_ = OD_PAVG(s3, sc); \
+    /* 21407/16384 ~= Sin[3*Pi/8] + Cos[3*Pi/8] ~= 1.3065629648763766 */ \
+    u_ = (s3*21407 + 8192) >> 14; \
+    /* 8867/16384 ~= Sin[3*Pi/8] - Cos[3*Pi/8] ~= 0.5411961001461969 */ \
+    s3 = (sc*8867 + 8192) >> 14; \
+    /* 3135/4096 ~= 2*Cos[3*Pi/8] ~= 0.7653668647301796 */ \
+    t_ = (t_*3135 + 2048) >> 12; \
+    s3 += t_; \
+    sc = u_ - t_; \
+    t_ = OD_PAVG(sb, -s4); \
+    /* 21407/16384 ~= Sin[3*Pi/8] + Cos[3*Pi/8] ~= 1.3065629648763766 */ \
+    u_ = (sb*21407 + 8192) >> 14; \
+    /* 8867/16384 ~= Sin[3*Pi/8] - Cos[3*Pi/8] ~= 0.5411961001461969 */ \
+    sb = (s4*8867 + 8192) >> 14; \
+    /* 3135/4096 ~= 2*Cos[3*Pi/8] ~= 0.7653668647301796 */ \
+    t_ = (t_*3135 + 2048) >> 12; \
+    sb -= t_; \
+    s4 = t_ - u_; \
+    sa += s2; \
+    s2 -= OD_RSHIFT1(sa); \
+    s5 -= sd; \
+    sd += OD_RSHIFT1(s5); \
+    s1 -= s9; \
+    s9 += OD_RSHIFT1(s1); \
+    se += s6; \
+    s6 -= OD_RSHIFT1(se); \
+    sb += sc; \
+    sbh = OD_RSHIFT1(sb); \
+    sc -= sbh; \
+    sf += s7; \
+    sfh = OD_RSHIFT1(sf); \
+    s7 -= sfh; \
+    s0 -= s8; \
+    s0h = OD_RSHIFT1(s0); \
+    s8 += s0h; \
+    s4 += s3; \
+    s4h = OD_RSHIFT1(s4); \
+    s3 -= s4h; \
+    t_ = sd - s2; \
+    /* 38531/32768 ~= Sin[7*Pi/16] + Cos[7*Pi/16] ~= 1.1758756024193586 */ \
+    u_ = (sd*38531 + 16384) >> 15; \
+    /* 12873/16384 ~= Sin[7*Pi/16] - Cos[7*Pi/16] ~= 0.7856949583871022 */ \
+    sd = (s2*12873 + 8192) >> 14; \
+    /* 6393/32768 ~= Cos[7*Pi/16] ~= 0.19509032201612825 */ \
+    t_ = (t_*6393 + 16384) >> 15; \
+    sd -= t_; \
+    s2 = t_ - u_; \
+    t_ = OD_PAVG(s5, -sa); \
+    /* 22725/16384 ~= Sin[5*Pi/16] + Cos[5*Pi/16] ~= 1.3870398453221475 */ \
+    u_ = (s5*22725 + 8192) >> 14; \
+    /* 9041/32768 ~= Sin[5*Pi/16] - Cos[5*Pi/16] ~= 0.27589937928294306 */ \
+    s5 = (sa*9041 + 16384) >> 15; \
+    /* 18205/16384 ~= 2*Cos[5*Pi/16] ~= 1.1111404660392044 */ \
+    t_ = (t_*18205 + 8192) >> 14; \
+    s5 -= t_; \
+    sa = t_ - u_; \
+    t_ = s6 + s9; \
+    /* 45451/32768 ~= Sin[5*Pi/16] + Cos[5*Pi/16] ~= 1.3870398453221475 */ \
+    u_ = (s9*45451 + 16384) >> 15; \
+    /* 9041/32768 ~= Sin[5*Pi/16] - Cos[5*Pi/16] ~= 0.27589937928294306 */ \
+    s9 = (s6*9041 + 16384) >> 15; \
+    /* 18205/32768 ~= Cos[5*Pi/16] ~= 0.5555702330196022 */ \
+    t_ = (t_*18205 + 16384) >> 15; \
+    s9 += t_; \
+    s6 = u_ - t_; \
+    t_ = OD_PAVG(s1, se); \
+    /* 9633/8192 ~= Sin[7*Pi/16] + Cos[7*Pi/16] ~= 1.1758756024193586 */ \
+    u_ = (s1*9633 + 4096) >> 13; \
+    /* 12873/16384 ~= Sin[7*Pi/16] - Cos[7*Pi/16] ~= 0.7856949583871022 */ \
+    s1 = (se*12873 + 8192) >> 14; \
+    /* 12785/32768 ~= 2*Cos[7*Pi/16] ~= 0.3901806440322565 */ \
+    t_ = (t_*12785 + 16384) >> 15; \
+    s1 += t_; \
+    se = u_ - t_; \
+    s8 -= s4h; \
+    s4 += s8; \
+    s7 += sbh; \
+    sb -= s7; \
+    s3 -= sfh; \
+    sf += s3; \
+    sc += s0h; \
+    s0 -= sc; \
+    sd += OD_RSHIFT1(se); \
+    se -= sd; \
+    s2 += OD_RSHIFT1(s1); \
+    s1 -= s2; \
+    s6 -= OD_RSHIFT1(s5); \
+    s5 += s6; \
+    s9 -= OD_RSHIFT1(sa); \
+    sa += s9; \
+    s0 -= se; \
+    s0h = OD_RSHIFT1(s0); /* Recomputed from the updated s0. */ \
+    se += s0h; \
+    s1 -= sf; \
+    s1h = OD_RSHIFT1(s1); \
+    sf += s1h; \
+    s2 += sc; \
+    s2h = OD_RSHIFT1(s2); \
+    sc -= s2h; \
+    s3 += sd; \
+    s3h = OD_RSHIFT1(s3); \
+    sd -= s3h; \
+    s4 -= sa; \
+    s4h = OD_RSHIFT1(s4); /* Recomputed from the updated s4. */ \
+    sa += s4h; \
+    s5 -= sb; \
+    s5h = OD_RSHIFT1(s5); \
+    sb += s5h; \
+    s6 += s8; \
+    s6h = OD_RSHIFT1(s6); \
+    s8 -= s6h; \
+    s7 += s9; \
+    s7h = OD_RSHIFT1(s7); \
+    s9 -= s7h; \
+    t_ = se - s1h; \
+    /* 32729/32768 ~= (Sin[17*Pi/64] + Cos[17*Pi/64])/Sqrt[2] ~=
+        0.9987954562051723 */ \
+    u_ = (s1*32729 + 16384) >> 15; \
+    /* 201/2048 ~= (Sin[17*Pi/64] - Cos[17*Pi/64])*Sqrt[2] ~=
+        0.09813534865483615 */ \
+    s1 = (se*201 + 1024) >> 11; \
+    /* 31121/32768 ~= Cos[17*Pi/64]*Sqrt[2] ~=
+        0.9497277818777543 */ \
+    t_ = (t_*31121 + 16384) >> 15; \
+    s1 += t_; \
+    se = u_ + t_; \
+    t_ = s6h + s9; \
+    /* 32413/32768 ~= (Sin[19*Pi/64] + Cos[19*Pi/64])/Sqrt[2] ~=
+        0.9891765099647809 */ \
+    u_ = (s6*32413 + 16384) >> 15; \
+    /* 601/2048 ~= (Sin[19*Pi/64] - Cos[19*Pi/64])*Sqrt[2] ~=
+        0.29346094891072355 */ \
+    s6 = (s9*601 + 1024) >> 11; \
+    /* 27605/32768 ~= Cos[19*Pi/64]*Sqrt[2] ~= 0.8424460355094193 */ \
+    t_ = (t_*27605 + 16384) >> 15; \
+    s6 += t_; \
+    s9 = u_ - t_; \
+    t_ = sa - s5h; \
+    /* 15893/16384 ~= (Sin[21*Pi/64] + Cos[21*Pi/64])/Sqrt[2] ~=
+        0.970031253194544 */ \
+    u_ = (s5*15893 + 8192) >> 14; \
+    /* 3981/8192 ~= (Sin[21*Pi/64] - Cos[21*Pi/64])*Sqrt[2] ~=
+        0.48596035980652796 */ \
+    s5 = (sa*3981 + 4096) >> 13; \
+    /* 1489/2048 ~= Cos[21*Pi/64]*Sqrt[2] ~= 0.72705107329128 */ \
+    t_ = (t_*1489 + 1024) >> 11; \
+    s5 += t_; \
+    sa = u_ + t_; \
+    t_ = s2h + sd; \
+    /* 30853/32768 ~= (Sin[23*Pi/64] + Cos[23*Pi/64])/Sqrt[2] ~=
+        0.9415440651830208 */ \
+    u_ = (s2*30853 + 16384) >> 15; \
+    /* 11039/16384 ~= (Sin[23*Pi/64] - Cos[23*Pi/64])*Sqrt[2] ~=
+        0.6737797067844402 */ \
+    s2 = (sd*11039 + 8192) >> 14; \
+    /* 19813/32768 ~= Cos[23*Pi/64]*Sqrt[2] ~= 0.6046542117908008 */ \
+    t_ = (t_*19813 + 16384) >> 15; \
+    s2 += t_; \
+    sd = u_ - t_; \
+    t_ = sc - s3h; \
+    /* 14811/16384 ~= (Sin[25*Pi/64] + Cos[25*Pi/64])/Sqrt[2] ~=
+        0.9039892931234433 */ \
+    u_ = (s3*14811 + 8192) >> 14; \
+    /* 7005/8192 ~= (Sin[25*Pi/64] - Cos[25*Pi/64])*Sqrt[2] ~=
+        0.8551101868605642 */ \
+    s3 = (sc*7005 + 4096) >> 13; \
+    /* 3903/8192 ~= Cos[25*Pi/64]*Sqrt[2] ~= 0.47643419969316125 */ \
+    t_ = (t_*3903 + 4096) >> 13; \
+    s3 += t_; \
+    sc = u_ + t_; \
+    t_ = s4h + sb; \
+    /* 14053/16384 ~= (Sin[27*Pi/64] + Cos[27*Pi/64])/Sqrt[2] ~=
+        0.857728610000272 */ \
+    u_ = (s4*14053 + 8192) >> 14; \
+    /* 8423/8192 ~= (Sin[27*Pi/64] - Cos[27*Pi/64])*Sqrt[2] ~=
+        1.0282054883864435 */ \
+    s4 = (sb*8423 + 4096) >> 13; \
+    /* 2815/8192 ~= Cos[27*Pi/64]*Sqrt[2] ~= 0.34362586580705035 */ \
+    t_ = (t_*2815 + 4096) >> 13; \
+    s4 += t_; \
+    sb = u_ - t_; \
+    t_ = s8 - s7h; \
+    /* 1645/2048 ~= (Sin[29*Pi/64] + Cos[29*Pi/64])/Sqrt[2] ~=
+        0.8032075314806449 */ \
+    u_ = (s7*1645 + 1024) >> 11; \
+    /* 305/256 ~= (Sin[29*Pi/64] - Cos[29*Pi/64])*Sqrt[2] ~=
+        1.1913986089848667 */ \
+    s7 = (s8*305 + 128) >> 8; \
+    /* 425/2048 ~= Cos[29*Pi/64]*Sqrt[2] ~= 0.20750822698821159 */ \
+    t_ = (t_*425 + 1024) >> 11; \
+    s7 += t_; \
+    s8 = u_ + t_; \
+    t_ = s0h + sf; \
+    /* 24279/32768 ~= (Sin[31*Pi/64] + Cos[31*Pi/64])/Sqrt[2] ~=
+        0.7409511253549591 */ \
+    u_ = (s0*24279 + 16384) >> 15; \
+    /* 44011/32768 ~= (Sin[31*Pi/64] - Cos[31*Pi/64])*Sqrt[2] ~=
+        1.3431179096940369 */ \
+    s0 = (sf*44011 + 16384) >> 15; \
+    /* 1137/16384 ~= Cos[31*Pi/64]*Sqrt[2] ~= 0.06939217050794069 */ \
+    t_ = (t_*1137 + 8192) >> 14; \
+    s0 += t_; \
+    sf = u_ - t_; \
+  } \
+  while (0)
+
 /* TODO: rewrite this to match OD_FDST_16. */
 #define OD_FDST_16_ASYM_PR(t0, t0h, t8, t4, t4h, tc, t2, ta, t6, te, \
   t1, t9, t5, td, t3, tb, t7, t7h, tf) \
@@ -5772,39 +6237,40 @@
   int sd;
   int se;
   int sf;
-  s0 = x[15*xstride];
-  s8 = x[14*xstride];
-  s4 = x[13*xstride];
-  sc = x[12*xstride];
-  s2 = x[11*xstride];
-  sa = x[10*xstride];
-  s6 = x[9*xstride];
-  se = x[8*xstride];
-  s1 = x[7*xstride];
-  s9 = x[6*xstride];
-  s5 = x[5*xstride];
-  sd = x[4*xstride];
-  s3 = x[3*xstride];
-  sb = x[2*xstride];
-  s7 = x[1*xstride];
-  sf = x[0*xstride];
-  OD_FDST_16_PR(s0, s8, s4, sc, s2, sa, s6, se, s1, s9, s5, sd, s3, sb, s7, sf);
-  y[0] = (od_coeff)sf;
-  y[1] = (od_coeff)-se;
-  y[2] = (od_coeff)sd;
-  y[3] = (od_coeff)-sc;
-  y[4] = (od_coeff)sb;
-  y[5] = (od_coeff)-sa;
-  y[6] = (od_coeff)s9;
-  y[7] = (od_coeff)-s8;
-  y[8] = (od_coeff)s7;
-  y[9] = (od_coeff)-s6;
-  y[10] = (od_coeff)s5;
-  y[11] = (od_coeff)-s4;
-  y[12] = (od_coeff)s3;
-  y[13] = (od_coeff)-s2;
-  y[14] = (od_coeff)s1;
-  y[15] = (od_coeff)-s0;
+  s0 = x[0*xstride];
+  s8 = x[1*xstride];
+  s4 = x[2*xstride];
+  sc = x[3*xstride];
+  s2 = x[4*xstride];
+  sa = x[5*xstride];
+  s6 = x[6*xstride];
+  se = x[7*xstride];
+  s1 = x[8*xstride];
+  s9 = x[9*xstride];
+  s5 = x[10*xstride];
+  sd = x[11*xstride];
+  s3 = x[12*xstride];
+  sb = x[13*xstride];
+  s7 = x[14*xstride];
+  sf = x[15*xstride];
+  OD_FDST_16_FLAT(s0, s8, s4, sc, s2, sa, s6, se,
+   s1, s9, s5, sd, s3, sb, s7, sf);
+  y[0] = (od_coeff)s0;
+  y[1] = (od_coeff)s1;
+  y[2] = (od_coeff)s2;
+  y[3] = (od_coeff)s3;
+  y[4] = (od_coeff)s4;
+  y[5] = (od_coeff)s5;
+  y[6] = (od_coeff)s6;
+  y[7] = (od_coeff)s7;
+  y[8] = (od_coeff)s8;
+  y[9] = (od_coeff)s9;
+  y[10] = (od_coeff)sa;
+  y[11] = (od_coeff)sb;
+  y[12] = (od_coeff)sc;
+  y[13] = (od_coeff)sd;
+  y[14] = (od_coeff)se;
+  y[15] = (od_coeff)sf;
 }
 
 void od_bin_idst16(od_coeff *x, int xstride, const od_coeff y[16]) {
@@ -5824,39 +6290,40 @@
   int sd;
   int se;
   int sf;
-  s0 = -y[15];
-  s8 = y[14];
-  s4 = -y[13];
-  sc = y[12];
-  s2 = -y[11];
-  sa = y[10];
-  s6 = -y[9];
-  se = y[8];
-  s1 = -y[7];
-  s9 = y[6];
-  s5 = -y[5];
-  sd = y[4];
-  s3 = -y[3];
-  sb = y[2];
-  s7 = -y[1];
-  sf = y[0];
-  OD_IDST_16_PR(s0, s8, s4, sc, s2, sa, s6, se, s1, s9, s5, sd, s3, sb, s7, sf);
-  x[0*xstride] = (od_coeff)sf;
-  x[1*xstride] = (od_coeff)se;
-  x[2*xstride] = (od_coeff)sd;
-  x[3*xstride] = (od_coeff)sc;
-  x[4*xstride] = (od_coeff)sb;
-  x[5*xstride] = (od_coeff)sa;
-  x[6*xstride] = (od_coeff)s9;
-  x[7*xstride] = (od_coeff)s8;
-  x[8*xstride] = (od_coeff)s7;
-  x[9*xstride] = (od_coeff)s6;
-  x[10*xstride] = (od_coeff)s5;
-  x[11*xstride] = (od_coeff)s4;
-  x[12*xstride] = (od_coeff)s3;
-  x[13*xstride] = (od_coeff)s2;
-  x[14*xstride] = (od_coeff)s1;
-  x[15*xstride] = (od_coeff)s0;
+  s0 = y[0];
+  s8 = y[1];
+  s4 = y[2];
+  sc = y[3];
+  s2 = y[4];
+  sa = y[5];
+  s6 = y[6];
+  se = y[7];
+  s1 = y[8];
+  s9 = y[9];
+  s5 = y[10];
+  sd = y[11];
+  s3 = y[12];
+  sb = y[13];
+  s7 = y[14];
+  sf = y[15];
+  OD_IDST_16_FLAT(s0, s8, s4, sc, s2, sa, s6, se,
+   s1, s9, s5, sd, s3, sb, s7, sf);
+  x[0*xstride] = (od_coeff)s0;
+  x[1*xstride] = (od_coeff)s1;
+  x[2*xstride] = (od_coeff)s2;
+  x[3*xstride] = (od_coeff)s3;
+  x[4*xstride] = (od_coeff)s4;
+  x[5*xstride] = (od_coeff)s5;
+  x[6*xstride] = (od_coeff)s6;
+  x[7*xstride] = (od_coeff)s7;
+  x[8*xstride] = (od_coeff)s8;
+  x[9*xstride] = (od_coeff)s9;
+  x[10*xstride] = (od_coeff)sa;
+  x[11*xstride] = (od_coeff)sb;
+  x[12*xstride] = (od_coeff)sc;
+  x[13*xstride] = (od_coeff)sd;
+  x[14*xstride] = (od_coeff)se;
+  x[15*xstride] = (od_coeff)sf;
 }
 
 void od_bin_fdct32(od_coeff y[32], const od_coeff *x, int xstride) {