daala_tx: Unify the asym and ortho DST designs.

This patch refactors the DST transforms so that the orthonormal and
 asymmetric transforms are now nearly identical (up to multiplication
 constants and an extra set of shifts).
This means that the DST designs are now embeddable for every level
 and should address hardware concerns about gate area.

In addition, minor changes were made to improve transform accuracy:

 - all of the transforms now have perfect reconstruction for those
    computations outside the rotations, i.e., all +/- butterfly steps
    are exactly invertible
 - two multiplication constants were reduced below 1.0 (better for
    SIMD and gives slightly improved accuracy)
 - the averaging bias is removed which saves an extra addition for each
    of the averaging steps

Additional averaging steps can be removed from the 8-point Type-IV DST,
 giving a 68% reduction in MSE for the 32-point DCT, but this has not
 been done in case we use it in place of the 8-point Type-VII DST.

subset-1:

master-daala_tx@2017-12-10T22:38:19.651Z ->
 new-daala_tx@2017-12-10T22:37:50.844Z

  PSNR | PSNR Cb | PSNR Cr | PSNR HVS |    SSIM | MS SSIM | CIEDE 2000
0.0057 | -0.0210 | -0.1821 |   0.0085 | -0.0002 |  0.0147 |    -0.0674

Change-Id: Ib124eebf6f2e4b3c51c078d4e8f229fc5ec26171
diff --git a/av1/av1.cmake b/av1/av1.cmake
index df73ad8..e32405b 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -31,6 +31,7 @@
     "${AOM_ROOT}/av1/common/convolve.h"
     "${AOM_ROOT}/av1/common/daala_tx.c"
     "${AOM_ROOT}/av1/common/daala_tx.h"
+    "${AOM_ROOT}/av1/common/daala_tx_kernels.h"
     "${AOM_ROOT}/av1/common/debugmodes.c"
     "${AOM_ROOT}/av1/common/entropy.c"
     "${AOM_ROOT}/av1/common/entropy.h"
diff --git a/av1/av1_common.mk b/av1/av1_common.mk
index c9ddf39..2c76f18 100644
--- a/av1/av1_common.mk
+++ b/av1/av1_common.mk
@@ -26,6 +26,7 @@
 AV1_COMMON_SRCS-yes += common/common.h
 AV1_COMMON_SRCS-yes += common/daala_tx.c
 AV1_COMMON_SRCS-yes += common/daala_tx.h
+AV1_COMMON_SRCS-yes += common/daala_tx_kernels.h
 AV1_COMMON_SRCS-yes += common/daala_inv_txfm.c
 AV1_COMMON_SRCS-yes += common/daala_inv_txfm.h
 AV1_COMMON_SRCS-$(HAVE_AVX2) += common/x86/daala_tx_kernels.h
diff --git a/av1/common/daala_tx.c b/av1/common/daala_tx.c
index 73f4596..854011b 100644
--- a/av1/common/daala_tx.c
+++ b/av1/common/daala_tx.c
@@ -1,5 +1,6 @@
 #include "av1/common/daala_tx.h"
 #include "av1/common/odintrin.h"
+#include "av1/common/daala_tx_kernels.h"
 
 /* clang-format off */
 
@@ -39,32 +40,6 @@
   } \
   while (0)
 
-#define OD_FDCT_2_FLAT(p0, p1) \
-  /* Embedded 2-point orthonormal Type-II fDCT with flattened rotations. */ \
-  do { \
-    int t_; \
-    t_ = (p0 - p1 + 1) >> 1; \
-    /* 46341/32768 ~= 2*Sin[Pi/4] = 1.4142135623730951 */ \
-    p0 = (p1*46341 + 16384) >> 15; \
-    /* 46341/32768 ~= 2*Cos[Pi/4] = 1.4142135623730951 */ \
-    p1 = (t_*46341 + 16384) >> 15; \
-    p0 += p1; \
-  } \
-  while (0)
-
-#define OD_IDCT_2_FLAT(p0, p1) \
-  /* Embedded 2-point orthonormal Type-II iDCT with flattened rotations. */ \
-  do { \
-    int t_; \
-    t_ = p0 + p1; \
-    /* 11585/8192 ~= 2*Sin[Pi/4] = 1.4142135623730951 */ \
-    p1 = (p0*11585 + 4096) >> 13; \
-    /* 11585/16384 ~= Cos[Pi/4] = 0.7071067811865475 */ \
-    p0 = (t_*11585 + 8192) >> 14; \
-    p1 -= p0; \
-  } \
-  while (0)
-
 #define OD_FDCT_2_ASYM_PR(p0, p1, p1h) \
   /* Embedded 2-point asymmetric Type-II fDCT. */ \
   do { \
@@ -82,9 +57,6 @@
   } \
   while (0)
 
-#define OD_FDCT_2_ASYM_FLAT OD_FDCT_2_ASYM_PR
-#define OD_IDCT_2_ASYM_FLAT OD_IDCT_2_ASYM_PR
-
 #define OD_FDST_2_PR(p0, p1) \
   /* Embedded 2-point orthonormal Type-IV fDST. */ \
   do { \
@@ -112,24 +84,6 @@
   } \
   while (0)
 
-#define OD_FDST_2_FLAT(p0, p1) \
-  do { \
-    int t_; \
-    int u_; \
-    t_ = (p0 + p1 + 1) >> 1; \
-    /* 21407/16384 ~= Sin[3*Pi/8] + Cos[3*Pi/8] ~= 1.3065629648763766 */ \
-    u_ = (p0*21407 + 8192) >> 14; \
-    /* 8867/16384 ~= Sin[3*Pi/8] - Cos[3*Pi/8] ~= 0.541196100146197 */ \
-    p0 = (p1*8867 + 8192) >> 14; \
-    /* 3135/4096 ~= 2*Cos[3*Pi/8] ~= 0.7653668647301796 */ \
-    t_ = (t_*3135 + 2048) >> 12; \
-    p0 += t_; \
-    p1 = u_ - t_; \
-  } \
-  while (0)
-
-#define OD_IDST_2_FLAT OD_FDST_2_FLAT
-
 #define OD_FDST_2_ASYM_PR(p0, p1) \
   /* Embedded 2-point asymmetric Type-IV fDST. */ \
   do { \
@@ -157,40 +111,6 @@
   } \
   while (0)
 
-#define OD_FDST_2_ASYM_FLAT(p0, p0h, p1) \
-  /* Embedded 2-point asymmetric Type-IV fDST with flattened rotations. */ \
-  do { \
-    int t_; \
-    int u_; \
-    t_ = p0h + p1; \
-    /* 15137/16384 ~= (Cos[3*Pi/8] + Sin[3*Pi/8])/Sqrt[2] = 0.9238795325112867 */ \
-    u_ = (p0*15137 + 8192) >> 14; \
-    /* 3135/4096 ~= (Cos[3*Pi/8] - Sin[3*Pi/8])*Sqrt[2] = 0.7653668647301795 */ \
-    p0 = (p1*3135 + 2048) >> 12; \
-    /* 8867/16384 ~= Cos[3*Pi/8]*Sqrt[2] = 0.5411961001461971 */ \
-    t_ = (t_*8867 + 8192) >> 14; \
-    p0 += t_; \
-    p1 = u_ - t_; \
-  } \
-  while (0)
-
-#define OD_IDST_2_ASYM_FLAT(p0, p1) \
-  /* Embedded 2-point asymmetric Type-IV iDST with flattened rotations. */ \
-  do { \
-    int t_; \
-    int u_; \
-    t_ = (p0 + p1 + 1) >> 1; \
-    /* 3135/4096 ~= (Cos[Pi/8] - Sin[Pi/8])*Sqrt[2] = 0.7653668647301795 */ \
-    u_ = (p1*3135 + 2048) >> 12; \
-    /* 15137/16384 ~= (Cos[Pi/8] + Sin[Pi/8])/Sqrt[2] = 0.9238795325112867 */ \
-    p1 = (p0*15137 + 8192) >> 14; \
-    /* 8867/8192 ~= 2*Cos[3*Pi/8]*Sqrt[2] = 1.082392200292394 */ \
-    t_ = (t_*8867 + 4096) >> 13; \
-    p0 = u_ + t_; \
-    p1 -= OD_RSHIFT1(t_); \
-  } \
-  while (0)
-
 #define OD_FDCT_4_PR(q0, q2, q1, q3) \
   /* Embedded 4-point orthonormal Type-II fDCT. */ \
   do { \
@@ -248,61 +168,6 @@
   } \
   while (0)
 
-#define OD_FDCT_4_FLAT(q0, q1, q2, q3) \
-  /* Embedded 4-point orthonormal Type-II fDCT with flattened rotations. */ \
-  do { \
-    int q1h; \
-    int q3h; \
-    q3 = q0 - q3; \
-    q3h = OD_RSHIFT1(q3); \
-    q0 -= q3h; \
-    q1 += q2; \
-    q1h = OD_RSHIFT1(q1); \
-    q2 -= q1h; \
-    OD_FDCT_2_ASYM_FLAT(q0, q1, q1h); \
-    OD_FDST_2_ASYM_FLAT(q3, q3h, q2); \
-  } \
-  while (0)
-
-#define OD_IDCT_4_FLAT(q0, q2, q1, q3) \
-  /* Embedded 4-point orthonormal Type-II iDCT with flattened rotations. */ \
-  do { \
-    int q1h; \
-    OD_IDST_2_ASYM_FLAT(q3, q2); \
-    OD_IDCT_2_ASYM_FLAT(q0, q1, q1h); \
-    q2 += q1h; \
-    q1 -= q2; \
-    q0 += OD_RSHIFT1(q3); \
-    q3 = q0 - q3; \
-  } \
-  while (0)
-
-#define OD_FDCT_4_ASYM_FLAT(q0, q1, q1h, q2, q3, q3h) \
-  /* Embedded 4-point asymmetric Type-II fDCT with flattened rotations. */ \
-  do { \
-    q0 += q3h; \
-    q3 = q0 - q3; \
-    q2 -= q1h; \
-    q1 += q2; \
-    OD_FDCT_2_FLAT(q0, q1); \
-    OD_FDST_2_FLAT(q3, q2); \
-  } \
-  while (0)
-
-#define OD_IDCT_4_ASYM_FLAT(q0, q2, q1, q1h, q3, q3h) \
-  /* Embedded 4-point asymmetric Type-II iDCT with flattened rotations. */ \
-  do { \
-    OD_IDST_2_FLAT(q3, q2); \
-    OD_IDCT_2_FLAT(q0, q1); \
-    q1 -= q2; \
-    q1h = OD_RSHIFT1(q1); \
-    q2 += q1h; \
-    q3 = q0 - q3; \
-    q3h = OD_RSHIFT1(q3); \
-    q0 -= q3h; \
-  } \
-  while (0)
-
 #define OD_FDST_4_PR(q0, q2, q1, q3) \
   /* Embedded 4-point orthonormal Type-IV fDST. */ \
   do { \
@@ -384,90 +249,6 @@
   } \
   while (0)
 
-#define OD_FDST_4_FLAT(q0, q1, q2, q3) \
-  /* Embedded 4-point orthonormal Type-IV fDST with flattened rotations. */ \
-  do { \
-    int t_; \
-    int u_; \
-    t_ = q0 - q3; \
-    /* 13623/16384 ~= (Sin[7*Pi/16] + Cos[7*Pi/16])/Sqrt[2] ~=
-        0.8314696123025451 */ \
-    u_ = (13623*q3 + 8192) >> 14; \
-    /* 18205/16384 ~= (Sin[7*Pi/16] - Cos[7*Pi/16])*Sqrt[2] ~=
-        1.1111404660392046 */ \
-    q3 = (18205*q0 + 8192) >> 14; \
-    /* 9041/32768 ~= Cos[7*Pi/16]*Sqrt[2] ~= 0.275899379282943 */ \
-    t_ = (9041*t_ + 16384) >> 15; \
-    q0 = u_ + OD_RSHIFT1(t_); \
-    q3 += t_; \
-    t_ = q1 + q2; \
-    /* 16069/16384 ~= (Sin[5*Pi/16] + Cos[5*Pi/16])/Sqrt[2] ~=
-        0.9807852804032304 */ \
-    u_ = (16069*q1 + 8192) >> 14; \
-    /* 12785/32768 ~= (Sin[5*Pi/16] - Cos[5*Pi/16])*Sqrt[2] ~=
-        0.3901806440322566 */ \
-    q1 = (12785*q2 + 16384) >> 15; \
-    /* 12873/16384 ~= Cos[5*Pi/16]*Sqrt[2] ~= 0.7856949583871021 */ \
-    t_ = (12873*t_ + 8192) >> 14; \
-    q2 = u_ - OD_RSHIFT1(t_); \
-    q1 += t_; \
-    q2 += OD_RSHIFT1(q3); \
-    q3 -= q2; \
-    q0 += OD_RSHIFT1(q1); \
-    q1 -= q0; \
-    t_ = (q1 + q2 + 1) >> 1; \
-    /* 11585/8192 ~= 2*Sin[Pi/4] ~= 1.4142135623730951 */ \
-    q1 = (11585*q2 + 4096) >> 13; \
-    /* 11585/8192 ~= 2*Cos[Pi/4] ~= 1.4142135623730951 */ \
-    q2 = (11585*t_ + 4096) >> 13; \
-    q1 -= q2; \
-  } \
-  while (0)
-
-#define OD_IDST_4_FLAT(q0, q1, q2, q3) \
-  /* Embedded 4-point orthonormal Type-IV fDST with flattened rotations. */ \
-  do { \
-    int t_; \
-    int u_; \
-    int q2h; \
-    int q3h; \
-    t_ = (q1 + q2 + 1) >> 1; \
-    /* 11585/8192 ~= 2*Sin[Pi/4] ~= 1.4142135623730951 */ \
-    q2 = (11585*q1 + 4096) >> 13; \
-    /* 11585/8192 ~= 2*Cos[Pi/4] ~= 1.4142135623730951 */ \
-    q1 = (11585*t_ + 4096) >> 13; \
-    q2 -= q1; \
-    q2 += q0; \
-    q2h = OD_RSHIFT1(q2); \
-    q0 -= q2h; \
-    q3 += q1; \
-    q3h = OD_RSHIFT1(q3); \
-    q1 -= q3h; \
-    t_ = q1 + q2h; \
-    /* 16069/16384 ~= (Sin[5*Pi/16] + Cos[5*Pi/16])/Sqrt[2] ~=
-        0.9807852804032304 */ \
-    u_ = (16069*q2 + 8192) >> 14; \
-    /* 12785/32768 ~= (Sin[5*Pi/16] - Cos[5*Pi/16])*Sqrt[2] ~=
-        0.3901806440322566 */ \
-    q2 = (12785*q1 + 16384) >> 15; \
-    /* 12873/16384 ~= Cos[5*Pi/16]*Sqrt[2] ~= 0.7856949583871021 */ \
-    t_ = (12873*t_ + 8192) >> 14; \
-    q1 = u_ - t_; \
-    q2 += t_; \
-    t_ = q0 - q3h; \
-    /* 13623/16384 ~= (Sin[7*Pi/16] + Cos[7*Pi/16])/Sqrt[2] ~=
-        0.8314696123025451 */ \
-    u_ = (13623*q3 + 8192) >> 14; \
-    /* 18205/16384 ~= (Sin[7*Pi/16] - Cos[7*Pi/16])*Sqrt[2] ~=
-        1.1111404660392046 */ \
-    q3 = (18205*q0 + 8192) >> 14; \
-    /* 9041/32768 ~= Cos[7*Pi/16]*Sqrt[2] = 0.275899379282943 */ \
-    t_ = (9041*t_ + 16384) >> 15; \
-    q0 = u_ + t_; \
-    q3 += t_; \
-  } \
-  while (0)
-
 #define OD_FDST_4_ASYM_PR(t0, t0h, t2, t1, t3) \
   /* Embedded 4-point asymmetric Type-IV fDST. */ \
   do { \
@@ -534,81 +315,6 @@
   } \
   while (0)
 
-#define OD_FDST_4_ASYM_FLAT(q0, q0h, q1, q2, q2h, q3) \
-  /* Embedded 4-point asymmetric Type-IV fDST with flattened rotations. */ \
-  do { \
-    int t_; \
-    int u_; \
-    t_ = q0h - q3; \
-    /* 38531/32768 ~= Sin[7*Pi/16] + Cos[7*Pi/16] ~= 1.1758756024193586 */ \
-    u_ = (q3*38531 + 16384) >> 15; \
-    /* 12873/16384 ~= Sin[7*Pi/16] - Cos[7*Pi/16] ~= 0.7856949583871022 */ \
-    q3 = (q0*12873 + 8192) >> 14; \
-    /* 12785/32768 ~= 2*Cos[7*Pi/16] ~= 0.3901806440322565 */ \
-    t_ = (t_*12785 + 16384) >> 15; \
-    q0 = u_ + OD_RSHIFT1(t_); \
-    q3 += t_; \
-    t_ = q1 + q2h; \
-    /* 45451/32768 ~= Sin[5*Pi/16] + Cos[5*Pi/16] ~= 1.3870398453221475 */ \
-    u_ = (q1*45451 + 16384) >> 15; \
-    /* 9041/32768 ~= Sin[5*Pi/16] - Cos[5*Pi/16] ~= 0.27589937928294306 */ \
-    q1 = (q2*9041 + 16384) >> 15; \
-    /* 18205/16384 ~= 2*Cos[5*Pi/16] ~= 1.1111404660392044 */ \
-    t_ = (t_*18205 + 8192) >> 14; \
-    q1 += t_; \
-    q2 = u_ - OD_RSHIFT1(t_); \
-    q2 += OD_RSHIFT1(q3); \
-    q3 -= q2; \
-    q0 += OD_RSHIFT1(q1); \
-    q1 -= q0; \
-    t_ = (q1 + q2 + 1) >> 1; \
-    /* 11585/8192 ~= 2*Sin[Pi/4] ~= 1.4142135623730951 */ \
-    q1 = (q2*11585 + 4096) >> 13; \
-    /* 11585/8192 ~= 2*Cos[Pi/4] ~= 1.4142135623730951 */ \
-    q2 = (t_*11585 + 4096) >> 13; \
-    q1 -= q2; \
-  } \
-  while (0)
-
-#define OD_IDST_4_ASYM_FLAT(q0, q2, q1, q3) \
-  do { \
-    int t_; \
-    int u_; \
-    int q1h; \
-    int q3h; \
-    t_ = (q1 + q2 + 1) >> 1; \
-    /* 11585/8192 ~= 2*Sin[Pi/4] ~= 1.4142135623730951 */ \
-    q1 = (q2*11585 + 4096) >> 13; \
-    /* 11585/8192 ~= 2*Cos[Pi/4] ~= 1.4142135623730951 */ \
-    q2 = (t_*11585 + 4096) >> 13; \
-    q1 -= q2; \
-    q1 += q0; \
-    q1h = OD_RSHIFT1(q1); \
-    q0 -= q1h; \
-    q3 += q2; \
-    q3h = OD_RSHIFT1(q3); \
-    q2 -= q3h; \
-    t_ = q1h + q2; \
-    /* 45451/32768 ~= Sin[5*Pi/16] + Cos[5*Pi/16] ~= 1.3870398453221475 */ \
-    u_ = (q1*45451 + 16384) >> 15; \
-    /* 9041/32768 ~= Sin[5*Pi/16] - Cos[5*Pi/16] ~= 0.27589937928294306 */ \
-    q1 = (q2*9041 + 16384) >> 15; \
-    /* 18205/16384 ~= 2*Cos[5*Pi/16] ~= 1.1111404660392044 */ \
-    t_ = (t_*18205 + 8192) >> 14; \
-    q1 += OD_RSHIFT1(t_); \
-    q2 = u_ - t_; \
-    t_ = q0 - q3h; \
-    /* 38531/32768 ~= Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586 */ \
-    u_ = (q3*38531 + 16384) >> 15; \
-    /* 12873/16384 ~= Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 */ \
-    q3 = (q0*12873 + 8192) >> 14; \
-    /* 12785/32768 ~= 2*Cos[7*Pi/16] = 0.3901806440322565 */ \
-    t_ = (t_*12785 + 16384) >> 15; \
-    q3 += OD_RSHIFT1(t_); \
-    q0 = u_ + t_; \
-  } \
-  while (0)
-
 #define OD_FDCT_8_PR(r0, r4, r2, r6, r1, r5, r3, r7) \
   /* Embedded 8-point orthonormal Type-II fDCT. */ \
   do { \
@@ -654,48 +360,6 @@
   } \
   while (0)
 
-#define OD_FDCT_8_FLAT(r0, r1, r2, r3, r4, r5, r6, r7) \
-  /* Embedded 8-point orthonormal Type-II fDCT with flattened rotations. */ \
-  do { \
-    int r1h; \
-    int r3h; \
-    int r5h; \
-    int r7h; \
-    r7 = r0 - r7; \
-    r7h = OD_RSHIFT1(r7); \
-    r0 -= r7h; \
-    r1 += r6; \
-    r1h = OD_RSHIFT1(r1); \
-    r6 -= r1h; \
-    r5 = r2 - r5; \
-    r5h = OD_RSHIFT1(r5); \
-    r2 -= r5h; \
-    r3 += r4; \
-    r3h = OD_RSHIFT1(r3); \
-    r4 -= r3h; \
-    OD_FDCT_4_ASYM_FLAT(r0, r1, r1h, r2, r3, r3h); \
-    OD_FDST_4_ASYM_FLAT(r7, r7h, r6, r5, r5h, r4); \
-  } \
-  while (0)
-
-#define OD_IDCT_8_FLAT(r0, r4, r2, r6, r1, r5, r3, r7) \
-  /* Embedded 8-point orthonormal Type-II iDCT with flattened rotations. */ \
-  do { \
-    int r1h; \
-    int r3h; \
-    OD_IDST_4_ASYM_FLAT(r7, r5, r6, r4); \
-    OD_IDCT_4_ASYM_FLAT(r0, r2, r1, r1h, r3, r3h); \
-    r4 += r3h; \
-    r3 -= r4; \
-    r2 += OD_RSHIFT1(r5); \
-    r5 = r2 - r5; \
-    r6 += r1h; \
-    r1 -= r6; \
-    r0 += OD_RSHIFT1(r7); \
-    r7 = r0 - r7; \
-  } \
-  while (0)
-
 #define OD_FDCT_8_ASYM_PR(r0, r4, r4h, r2, r6, r6h, r1, r5, r5h, r3, r7, r7h) \
   /* Embedded 8-point asymmetric Type-II fDCT. */ \
   do { \
@@ -732,42 +396,6 @@
   } \
   while (0)
 
-#define OD_FDCT_8_ASYM_FLAT(r0, r1, r1h, r2, r3, r3h, r4, r5, r5h, r6, r7, r7h) \
-  /* Embedded 8-point asymmetric Type-II fDCT. */ \
-  do { \
-    r0 += r7h; \
-    r7 = r0 - r7; \
-    r4 -= r3h; \
-    r3 += r4; \
-    r2 += r5h; \
-    r5 = r2 - r5; \
-    r6 -= r1h; \
-    r1 += r6; \
-    OD_FDCT_4_FLAT(r0, r1, r2, r3); \
-    OD_FDST_4_FLAT(r7, r6, r5, r4); \
-  } \
-  while (0)
-
-#define OD_IDCT_8_ASYM_FLAT(r0, r4, r2, r6, r1, r1h, r5, r5h, r3, r3h, r7, r7h) \
-  /* Embedded 8-point asymmetric Type-II iDCT with flattened rotations. */ \
-  do { \
-    OD_IDST_4_FLAT(r7, r5, r6, r4); \
-    OD_IDCT_4_FLAT(r0, r2, r1, r3); \
-    r7 = r0 - r7; \
-    r7h = OD_RSHIFT1(r7); \
-    r0 -= r7h; \
-    r1 -= r6; \
-    r1h = OD_RSHIFT1(r1); \
-    r6 += r1h; \
-    r5 = r2 - r5; \
-    r5h = OD_RSHIFT1(r5); \
-    r2 -= r5h; \
-    r3 -= r4; \
-    r3h = OD_RSHIFT1(r3); \
-    r4 += r3h; \
-  } \
-  while (0)
-
 #define OD_FDST_8_PR(t0, t4, t2, t6, t1, t5, t3, t7)  \
   /* Embedded 8-point orthonormal Type-IV fDST. */ \
   do { \
@@ -937,190 +565,6 @@
   } \
   while (0)
 
-#define OD_FDST_8_FLAT(r0, r1, r2, r3, r4, r5, r6, r7) \
-  /* Embedded 8-point Type-IV fDST with flattened rotations. */ \
-  do { \
-    int t_; \
-    int u_; \
-    int r0h; \
-    int r2h; \
-    int r5h; \
-    int r7h; \
-    t_ = r3 - r4; \
-    /* 23059/16384 ~= Sin[9*Pi/32] + Cos[9*Pi/32] ~= 1.4074037375263826 */ \
-    u_ = (23059*r4 + 8192) >> 14; \
-    /* 2271/16384 ~= Sin[9*Pi/32] - Cos[9*Pi/32] ~= 0.1386171691990915 */ \
-    r4 = (2271*r3 + 8192) >> 14; \
-    /* 5197/8192 ~= Cos[9*Pi/32] ~= 0.6343932841636455 */ \
-    t_ = (5197*t_ + 4096) >> 13; \
-    r3 = u_ + t_; \
-    r4 += t_; \
-    t_ = r2 + r5; \
-    /* 22173/16384 ~= Sin[11*Pi/32] + Cos[11*Pi/32] ~= 1.3533180011743526 */ \
-    u_ = (22173*r2 + 8192) >> 14; \
-    /* 3363/8192 ~= Sin[11*Pi/32] - Cos[11*Pi/32] ~= 0.4105245275223574 */ \
-    r2 = (3363*r5 + 4096) >> 13; \
-    /* 15447/32768 ~= Cos[11*Pi/32] ~= 0.47139673682599764 */ \
-    t_ = (15447*t_ + 16384) >> 15; \
-    r2 += t_; \
-    r5 = u_ - t_; \
-    t_ = r1 - r6; \
-    /* 40869/32768 ~= Sin[13*Pi/32] + Cos[13*Pi/32] ~= 1.247225012986671 */ \
-    u_ = (40869*r6 + 16384) >> 15; \
-    /* 21845/32768 ~= Sin[13*Pi/32] - Cos[13*Pi/32] ~= 0.6666556584777465 */ \
-    r6 = (21845*r1 + 16384) >> 15; \
-    /* 1189/4096 ~= Cos[13*Pi/32] ~= 0.29028467725446233 */ \
-    t_ = (1189*t_ + 2048) >> 12; \
-    r1 = u_ + t_; \
-    r6 += t_; \
-    t_ = r0 + r7; \
-    /* 17911/16384 ~= Sin[15*Pi/32] + Cos[15*Pi/32] ~= 1.0932018670017576 */ \
-    u_ = (17911*r0 + 8192) >> 14; \
-    /* 14699/16384 ~= Sin[15*Pi/32] - Cos[15*Pi/32] ~= 0.8971675863426363 */ \
-    r0 = (14699*r7 + 8192) >> 14; \
-    /* 803/8192 ~= Cos[15*Pi/32] ~= 0.0980171403295606 */ \
-    t_ = (803*t_ + 4096) >> 13; \
-    r0 += t_; \
-    r7 = u_ - t_; \
-    r2 -= r1; \
-    r2h = OD_RSHIFT1(r2); \
-    r1 += r2h; \
-    r5 += r6; \
-    r5h = OD_RSHIFT1(r5); \
-    r6 -= r5h; \
-    r0 += r3; \
-    r0h = OD_RSHIFT1(r0); \
-    r3 -= r0h; \
-    r7 -= r4; \
-    r7h = OD_RSHIFT1(r7); \
-    r4 += r7h; \
-    r3 += r5h; \
-    r5 -= r3; \
-    r1 -= r0h; \
-    r0 += r1; \
-    r4 += r2h; \
-    r2 -= r4; \
-    r6 += r7h; \
-    r7 -= r6; \
-    t_ = (r4 - r3 + 1) >> 1; \
-    /* 21407/16384 ~= Sin[3*Pi/8] + Cos[3*Pi/8] ~= 1.3065629648763766 */ \
-    u_ = (21407*r3 + 8192) >> 14; \
-    /* 8867/16384 ~= Sin[3*Pi/8] - Cos[3*Pi/8] ~= 0.5411961001461969 */ \
-    r3 = (8867*r4 + 8192) >> 14; \
-    /* 3135/4096 ~= 2*Cos[3*Pi/8] ~= 0.7653668647301796 */ \
-    t_ = (3135*t_ + 2048) >> 12; \
-    r3 += t_; \
-    r4 = u_ + t_; \
-    t_ = (r2 - r5 + 1) >> 1; \
-    /* 21407/16384 ~= Sin[3*Pi/8] + Cos[3*Pi/8] ~= 1.3065629648763766 */ \
-    u_ = (21407*r2 + 8192) >> 14; \
-    /* 8867/16384 ~= Sin[3*Pi/8] - Cos[3*Pi/8] ~= 0.5411961001461969 */ \
-    r2 = (8867*r5 + 8192) >> 14; \
-    /* 3135/4096 ~= 2*Cos[3*Pi/8] ~= 0.7653668647301796 */ \
-    t_ = (3135*t_ + 2048) >> 12; \
-    r5 = t_ - u_; \
-    r2 -= t_; \
-    t_ = (r6 - r1 + 1) >> 1; \
-    /* 11585/8192 ~= 2*Sin[Pi/4] ~= 1.4142135623730951 */ \
-    r6 = (11585*r1 + 4096) >> 13; \
-    /* 11585/8192 ~= 2*Cos[Pi/4] ~= 1.4142135623730951 */ \
-    r1 = (11585*t_ + 4096) >> 13; \
-    r6 += r1; \
-  } \
-  while (0)
-
-#define OD_IDST_8_FLAT(r0, r4, r2, r6, r1, r5, r3, r7) \
-  /* Embedded 8-point Type-IV iDST with flattened rotations. */ \
-  do { \
-    int t_; \
-    int u_; \
-    int r0h; \
-    int r2h; \
-    int r5h; \
-    int r7h; \
-    t_ = (r1 + r6 + 1) >> 1; \
-    /* 11585/8192 ~= 2*Sin[Pi/4] ~= 1.4142135623730951 */ \
-    r1 = (11585*r6 + 4096) >> 13; \
-    /* 11585/8192 ~= 2*Cos[Pi/4] ~= 1.4142135623730951 */ \
-    r6 = (11585*t_ + 4096) >> 13; \
-    r1 -= r6; \
-    t_ = (r5 - r2 + 1) >> 1; \
-    /* 21407/16384 ~= Sin[3*Pi/8] + Cos[3*Pi/8] ~= 1.3065629648763766 */ \
-    u_ = (21407*r5 + 8192) >> 14; \
-    /* 8867/16384 ~= Sin[3*Pi/8] - Cos[3*Pi/8] ~= 0.5411961001461969 */ \
-    r5 = (8867*r2 + 8192) >> 14; \
-    /* 3135/4096 ~= 2*Cos[3*Pi/8] ~= 0.7653668647301796 */ \
-    t_ = (3135*t_ + 2048) >> 12; \
-    r5 -= t_; \
-    r2 = t_ - u_; \
-    t_ = (r3 + r4 + 1) >> 1; \
-    /* 21407/16384 ~= Sin[3*Pi/8] + Cos[3*Pi/8] ~= 1.3065629648763766 */ \
-    u_ = (21407*r4 + 8192) >> 14; \
-    /* 8867/16384 ~= Sin[3*Pi/8] - Cos[3*Pi/8] ~= 0.5411961001461969 */ \
-    r4 = (8867*r3 + 8192) >> 14; \
-    /* 3135/4096 ~= 2*Cos[3*Pi/8] ~= 0.7653668647301796 */ \
-    t_ = (3135*t_ + 2048) >> 12; \
-    r3 = u_ - t_; \
-    r4 += t_; \
-    r7 += r6; \
-    r7h = OD_RSHIFT1(r7); \
-    r6 -= r7h; \
-    r2 += r4; \
-    r2h = OD_RSHIFT1(r2); \
-    r4 -= r2h; \
-    r0 -= r1; \
-    r0h = OD_RSHIFT1(r0); \
-    r1 += r0h; \
-    r5 += r3; \
-    r5h = OD_RSHIFT1(r5); \
-    r3 -= r5h; \
-    r4 -= r7h; \
-    r7 += r4; \
-    r6 += r5h; \
-    r5 -= r6; \
-    r3 += r0h; \
-    r0 -= r3; \
-    r1 -= r2h; \
-    r2 += r1; \
-    t_ = r0 + r7; \
-    /* 17911/16384 ~= Sin[15*Pi/32] + Cos[15*Pi/32] ~= 1.0932018670017576 */ \
-    u_ = (17911*r0 + 8192) >> 14; \
-    /* 14699/16384 ~= Sin[15*Pi/32] - Cos[15*Pi/32] ~= 0.8971675863426363 */ \
-    r0 = (14699*r7 + 8192) >> 14; \
-    /* 803/8192 ~= Cos[15*Pi/32] ~= 0.0980171403295606 */ \
-    t_ = (803*t_ + 4096) >> 13; \
-    r7 = u_ - t_; \
-    r0 += t_; \
-    t_ = r1 - r6; \
-    /* 40869/32768 ~= Sin[13*Pi/32] + Cos[13*Pi/32] ~= 1.247225012986671 */ \
-    u_ = (40869*r6 + 16384) >> 15; \
-    /* 21845/32768 ~= Sin[13*Pi/32] - Cos[13*Pi/32] ~= 0.6666556584777465 */ \
-    r6 = (21845*r1 + 16384) >> 15; \
-    /* 1189/4096 ~= Cos[13*Pi/32] ~= 0.29028467725446233 */ \
-    t_ = (1189*t_ + 2048) >> 12; \
-    r1 = u_ + t_; \
-    r6 += t_; \
-    t_ = r2 + r5; \
-    /* 22173/16384 ~= Sin[11*Pi/32] + Cos[11*Pi/32] ~= 1.3533180011743526 */ \
-    u_ = (22173*r2 + 8192) >> 14; \
-    /* 3363/8192 ~= Sin[11*Pi/32] - Cos[11*Pi/32] ~= 0.4105245275223574 */ \
-    r2 = (3363*r5 + 4096) >> 13; \
-    /* 15447/32768 ~= Cos[11*Pi/32] ~= 0.47139673682599764 */ \
-    t_ = (15447*t_ + 16384) >> 15; \
-    r5 = u_ - t_; \
-    r2 += t_; \
-    t_ = r3 - r4; \
-    /* 23059/16384 ~= Sin[9*Pi/32] + Cos[9*Pi/32] ~= 1.4074037375263826 */ \
-    u_ = (23059*r4 + 8192) >> 14; \
-    /* 2271/16384 ~= Sin[9*Pi/32] - Cos[9*Pi/32] ~= 0.1386171691990915 */ \
-    r4 = (2271*r3 + 8192) >> 14; \
-    /* 5197/8192 ~= Cos[9*Pi/32] ~= 0.6343932841636455 */ \
-    t_ = (5197*t_ + 4096) >> 13; \
-    r3 = u_ + t_; \
-    r4 += t_; \
-  } \
-  while (0)
-
 /* Rewrite this so that t0h can be passed in. */
 #define OD_FDST_8_ASYM_PR(t0, t4, t2, t6, t1, t5, t3, t7) \
   /* Embedded 8-point asymmetric Type-IV fDST. */ \
@@ -1287,205 +731,6 @@
   } \
   while (0)
 
-#define OD_FDST_8_ASYM_FLAT(r0, r0h, r1, r2, r2h, r3, \
-  r4, r4h, r5, r6, r6h, r7) \
-  /* Embedded 8-point asymmetric Type-IV fDST with flattened rotations. */ \
-  do { \
-    int t_; \
-    int u_; \
-    int r5h; \
-    int r7h; \
-    t_ = r3 - r4h; \
-    /* 16305/16384 ~= (Sin[9*Pi/32] + Cos[9*Pi/32])/Sqrt[2] ~=
-        0.9951847266721969 */ \
-    u_ = (16305*r4 + 8192) >> 14; \
-    /* 803/4096 ~= (Sin[9*Pi/32] - Cos[9*Pi/32])*Sqrt[2] ~=
-        0.1960342806591213 */ \
-    r4 = (803*r3 + 2048) >> 12; \
-    /* 14699/16384 ~= Cos[9*Pi/32]*Sqrt[2] ~= 0.8971675863426364 */ \
-    t_ = (14699*t_ + 8192) >> 14; \
-    r3 = u_ + t_; \
-    r4 += t_; \
-    t_ = r2h + r5; \
-    /* 31357/32768 ~= (Sin[11*Pi/32] + Cos[11*Pi/32])/Sqrt[2]
-        ~= 0.9569403357322087 */ \
-    u_ = (31357*r2 + 16384) >> 15; \
-    /* 1189/2048 ~= (Sin[11*Pi/32] - Cos[11*Pi/32])*Sqrt[2] ~=
-        0.5805693545089248 */ \
-    r2 = (1189*r5 + 1024) >> 11; \
-    /* 21845/32768 ~= Cos[11*Pi/32] ~= 0.6666556584777465 */ \
-    t_ = (21845*t_ + 16384) >> 15; \
-    r2 += t_; \
-    r5 = u_ - t_; \
-    t_ = r1 - r6h; \
-    /* 28899/32768 ~= (Sin[13*Pi/32] + Cos[13*Pi/32])/Sqrt[2] ~=
-        0.8819212643483548 */ \
-    u_ = (28899*r6 + 16384) >> 15; \
-    /* 30893/32768 ~= (Sin[13*Pi/32] - Cos[13*Pi/32])*Sqrt[2] ~=
-        0.942793473651995297112775 */ \
-    r6 = (30893*r1 + 16384) >> 15; \
-    /* 3363/8192 ~= Cos[13*Pi/32]*Sqrt[2] ~= 0.41052452752235735 */ \
-    t_ = (3363*t_ + 4096) >> 13; \
-    r1 = u_ + t_; \
-    r6 += t_; \
-    t_ = r0h + r7; \
-    /* 12665/16384 ~= (Sin[15*Pi/32] + Cos[15*Pi/32])/Sqrt[2] ~=
-        0.773010453362737 */ \
-    u_ = (12665*r0 + 8192) >> 14; \
-    /* 5197/4096 ~= (Sin[15*Pi/32] - Cos[15*Pi/32])*Sqrt[2] ~=
-        1.268786568327291 */ \
-    r0 = (5197*r7 + 2048) >> 12; \
-    /* 2271/16384 ~= Cos[15*Pi/32]*Sqrt[2] ~= 0.13861716919909148 */ \
-    t_ = (2271*t_ + 8192) >> 14; \
-    r0 += t_; \
-    r7 = u_ - t_; \
-    r2 -= r1; \
-    r2h = OD_RSHIFT1(r2); \
-    r1 += r2h; \
-    r5 += r6; \
-    r5h = OD_RSHIFT1(r5); \
-    r6 -= r5h; \
-    r0 += r3; \
-    r0h = OD_RSHIFT1(r0); \
-    r3 -= r0h; \
-    r7 -= r4; \
-    r7h = OD_RSHIFT1(r7); \
-    r4 += r7h; \
-    r3 += r5h; \
-    r5 -= r3; \
-    r1 -= r0h; \
-    r0 += r1; \
-    r4 += r2h; \
-    r2 -= r4; \
-    r6 += r7h; \
-    r7 -= r6; \
-    t_ = (r4 - r3 + 1) >> 1; \
-    /* 21407/16384 ~= Sin[3*Pi/8] + Cos[3*Pi/8] ~= 1.3065629648763766 */ \
-    u_ = (21407*r3 + 8192) >> 14; \
-    /* 8867/16384 ~= Sin[3*Pi/8] - Cos[3*Pi/8] ~= 0.5411961001461969 */ \
-    r3 = (8867*r4 + 8192) >> 14; \
-    /* 3135/4096 ~= 2*Cos[3*Pi/8] ~= 0.7653668647301796 */ \
-    t_ = (3135*t_ + 2048) >> 12; \
-    r3 += t_; \
-    r4 = u_ + t_; \
-    t_ = (r2 - r5 + 1) >> 1; \
-    /* 21407/16384 ~= Sin[3*Pi/8] + Cos[3*Pi/8] ~= 1.3065629648763766 */ \
-    u_ = (21407*r2 + 8192) >> 14; \
-    /* 8867/16384 ~= Sin[3*Pi/8] - Cos[3*Pi/8] ~= 0.5411961001461969 */ \
-    r2 = (8867*r5 + 8192) >> 14; \
-    /* 3135/4096 ~= 2*Cos[3*Pi/8] ~= 0.7653668647301796 */ \
-    t_ = (3135*t_ + 2048) >> 12; \
-    r5 = t_ - u_; \
-    r2 -= t_; \
-    t_ = (r6 - r1 + 1) >> 1; \
-    /* 11585/8192 ~= 2*Sin[Pi/4] ~= 1.4142135623730951 */ \
-    r6 = (11585*r1 + 4096) >> 13; \
-    /* 11585/8192 ~= 2*Cos[Pi/4] ~= 1.4142135623730951 */ \
-    r1 = (11585*t_ + 4096) >> 13; \
-    r6 += r1; \
-  } \
-  while (0)
-
-#define OD_IDST_8_ASYM_FLAT(r0, r4, r2, r6, r1, r5, r3, r7) \
-  /* Embedded 8-point asymmetric Type-IV iDST with flattened rotations. */ \
-  do { \
-    int t_; \
-    int u_; \
-    int r0h; \
-    int r2h; \
-    int r5h; \
-    int r7h; \
-    t_ = (r1 + r6 + 1) >> 1; \
-    /* 11585/8192 ~= 2*Sin[Pi/4] ~= 1.4142135623730951 */ \
-    r1 = (11585*r6 + 4096) >> 13; \
-    /* 11585/8192 ~= 2*Cos[Pi/4] ~= 1.4142135623730951 */ \
-    r6 = (11585*t_ + 4096) >> 13; \
-    r1 -= r6; \
-    t_ = (r5 - r2 + 1) >> 1; \
-    /* 21407/16384 ~= Sin[3*Pi/8] + Cos[3*Pi/8] ~= 1.3065629648763766 */ \
-    u_ = (21407*r5 + 8192) >> 14; \
-    /* 8867/16384 ~= Sin[3*Pi/8] - Cos[3*Pi/8] ~= 0.5411961001461969 */ \
-    r5 = (8867*r2 + 8192) >> 14; \
-    /* 3135/4096 ~= 2*Cos[3*Pi/8] ~= 0.7653668647301796 */ \
-    t_ = (3135*t_ + 2048) >> 12; \
-    r5 -= t_; \
-    r2 = t_ - u_; \
-    t_ = (r3 + r4 + 1) >> 1; \
-    /* 21407/16384 ~= Sin[3*Pi/8] + Cos[3*Pi/8] ~= 1.3065629648763766 */ \
-    u_ = (21407*r4 + 8192) >> 14; \
-    /* 8867/16384 ~= Sin[3*Pi/8] - Cos[3*Pi/8] ~= 0.5411961001461969 */ \
-    r4 = (8867*r3 + 8192) >> 14; \
-    /* 3135/4096 ~= 2*Cos[3*Pi/8] ~= 0.7653668647301796 */ \
-    t_ = (3135*t_ + 2048) >> 12; \
-    r3 = u_ - t_; \
-    r4 += t_; \
-    r7 += r6; \
-    r7h = OD_RSHIFT1(r7); \
-    r6 -= r7h; \
-    r2 += r4; \
-    r2h = OD_RSHIFT1(r2); \
-    r4 -= r2h; \
-    r0 -= r1; \
-    r0h = OD_RSHIFT1(r0); \
-    r1 += r0h; \
-    r5 += r3; \
-    r5h = OD_RSHIFT1(r5); \
-    r3 -= r5h; \
-    r4 -= r7h; \
-    r7 += r4; \
-    r6 += r5h; \
-    r5 -= r6; \
-    r3 += r0h; \
-    r0 -= r3; \
-    r1 -= r2h; \
-    r2 += r1; \
-    t_ = r0 + r7; \
-    /* 12665/16384 ~= (Sin[15*Pi/32] + Cos[15*Pi/32])/Sqrt[2] ~=
-        0.773010453362737 */ \
-    u_ = (12665*r0 + 8192) >> 14; \
-    /* 5197/4096 ~= (Sin[15*Pi/32] - Cos[15*Pi/32])*Sqrt[2] ~=
-        1.268786568327291 */ \
-    r0 = (5197*r7 + 2048) >> 12; \
-    /* 2271/16384 ~= Cos[15*Pi/32]*Sqrt[2] ~= 0.13861716919909148 */ \
-    t_ = (2271*t_ + 8192) >> 14; \
-    r7 = u_ - OD_RSHIFT1(t_); \
-    r0 += t_; \
-    t_ = r1 - r6; \
-    /* 28899/32768 ~= (Sin[13*Pi/32] + Cos[13*Pi/32])/Sqrt[2] ~=
-        0.8819212643483548 */ \
-    u_ = (28899*r6 + 16384) >> 15; \
-    /* 30893/32768 ~= (Sin[13*Pi/32] - Cos[13*Pi/32])*Sqrt[2] ~=
-        0.942793473651995297112775 */ \
-    r6 = (30893*r1 + 16384) >> 15; \
-    /* 3363/8192 ~= Cos[13*Pi/32]*Sqrt[2] ~= 0.41052452752235735 */ \
-    t_ = (3363*t_ + 4096) >> 13; \
-    r1 = u_ + OD_RSHIFT1(t_); \
-    r6 += t_; \
-    t_ = r2 + r5; \
-    /* 31357/32768 ~= (Sin[11*Pi/32] + Cos[11*Pi/32])/Sqrt[2]
-        ~= 0.9569403357322087 */ \
-    u_ = (31357*r2 + 16384) >> 15; \
-    /* 1189/2048 ~= (Sin[11*Pi/32] - Cos[11*Pi/32])*Sqrt[2] ~=
-        0.5805693545089248 */ \
-    r2 = (1189*r5 + 1024) >> 11; \
-    /* 21845/32768 ~= Cos[11*Pi/32] ~= 0.6666556584777465 */ \
-    t_ = (21845*t_ + 16384) >> 15; \
-    r5 = u_ - OD_RSHIFT1(t_); \
-    r2 += t_; \
-    t_ = r3 - r4; \
-    /* 16305/16384 ~= (Sin[9*Pi/32] + Cos[9*Pi/32])/Sqrt[2] ~=
-        0.9951847266721969 */ \
-    u_ = (16305*r4 + 8192) >> 14; \
-    /* 803/4096 ~= (Sin[9*Pi/32] - Cos[9*Pi/32])*Sqrt[2] ~=
-        0.1960342806591213 */ \
-    r4 = (803*r3 + 2048) >> 12; \
-    /* 14699/16384 ~= Cos[9*Pi/32]*Sqrt[2] ~= 0.8971675863426364 */ \
-    t_ = (14699*t_ + 8192) >> 14; \
-    r3 = u_ + OD_RSHIFT1(t_); \
-    r4 += t_; \
-  } \
-  while (0)
-
 #define OD_FDCT_16_PR(s0, s8, s4, sc, s2, sa, s6, se, \
   s1, s9, s5, sd, s3, sb, s7, sf) \
   /* Embedded 16-point orthonormal Type-II fDCT. */ \
@@ -1552,76 +797,6 @@
   } \
   while (0)
 
-#define OD_FDCT_16_FLAT(s0, s1, s2, s3, s4, s5, s6, s7, \
- s8, s9, sa, sb, sc, sd, se, sf) \
-  /* Embedded 16-point orthonormal Type-II fDCT with flattened rotations. */ \
-  do { \
-    int s1h; \
-    int s3h; \
-    int s5h; \
-    int s7h; \
-    int s9h; \
-    int sbh; \
-    int sdh; \
-    int sfh; \
-    sf = s0 - sf; \
-    sfh = OD_RSHIFT1(sf); \
-    s0 -= sfh; \
-    s1 += se; \
-    s1h = OD_RSHIFT1(s1); \
-    se -= s1h; \
-    sd = s2 - sd; \
-    sdh = OD_RSHIFT1(sd); \
-    s2 -= sdh; \
-    s3 += sc; \
-    s3h = OD_RSHIFT1(s3); \
-    sc -= s3h; \
-    sb = s4 - sb; \
-    sbh = OD_RSHIFT1(sb); \
-    s4 -= sbh; \
-    s5 += sa; \
-    s5h = OD_RSHIFT1(s5); \
-    sa -= s5h; \
-    s9 = s6 - s9; \
-    s9h = OD_RSHIFT1(s9); \
-    s6 -= s9h; \
-    s7 += s8; \
-    s7h = OD_RSHIFT1(s7); \
-    s8 -= s7h; \
-    OD_FDCT_8_ASYM_FLAT(s0, s1, s1h, s2, s3, s3h, s4, s5, s5h, s6, s7, s7h); \
-    OD_FDST_8_ASYM_FLAT(sf, sfh, se, sd, sdh, sc, sb, sbh, sa, s9, s9h, s8); \
-  } \
-  while (0)
-
-#define OD_IDCT_16_FLAT(s0, s8, s4, sc, s2, sa, s6, se, \
- s1, s9, s5, sd, s3, sb, s7, sf) \
-  /* Embedded 16-point orthonormal Type-II iDCT with flattened rotations. */ \
-  do { \
-    int s1h; \
-    int s3h; \
-    int s5h; \
-    int s7h; \
-    OD_IDST_8_ASYM_FLAT(sf, sb, sd, s9, se, sa, sc, s8); \
-    OD_IDCT_8_ASYM_FLAT(s0, s4, s2, s6, s1, s1h, s5, s5h, s3, s3h, s7, s7h); \
-    s8 += s7h; \
-    s7 -= s8; \
-    s6 += OD_RSHIFT1(s9); \
-    s9 = s6 - s9; \
-    sa += s5h; \
-    s5 -= sa; \
-    s4 += OD_RSHIFT1(sb); \
-    sb = s4 - sb; \
-    sc += s3h; \
-    s3 -= sc; \
-    s2 += OD_RSHIFT1(sd); \
-    sd = s2 - sd; \
-    se += s1h; \
-    s1 -= se; \
-    s0 += OD_RSHIFT1(sf); \
-    sf = s0 - sf; \
-  } \
-  while (0)
-
 #define OD_FDCT_16_ASYM_PR(t0, t8, t8h, t4, tc, tch, t2, ta, tah, t6, te, teh, \
   t1, t9, t9h, t5, td, tdh, t3, tb, tbh, t7, tf, tfh) \
   /* Embedded 16-point asymmetric Type-II fDCT. */ \
@@ -1680,64 +855,6 @@
   } \
   while (0)
 
-#define OD_FDCT_16_ASYM_FLAT(t0, t8, t8h, t4, tc, tch, t2, ta, tah, t6, \
-  te, teh, t1, t9, t9h, t5, td, tdh, t3, tb, tbh, t7, tf, tfh) \
-  /* Embedded 16-point asymmetric Type-II fDCT with flattened rotations. */ \
-  do { \
-    t0 += tfh; \
-    tf = t0 - tf; \
-    t1 -= teh; \
-    te += t1; \
-    t2 += tdh; \
-    td = t2 - td; \
-    t3 -= tch; \
-    tc += t3; \
-    t4 += tbh; \
-    tb = t4 - tb; \
-    t5 -= tah; \
-    ta += t5; \
-    t6 += t9h; \
-    t9 = t6 - t9; \
-    t7 -= t8h; \
-    t8 += t7; \
-    OD_FDCT_8_FLAT(t0, t8, t4, tc, t2, ta, t6, te); \
-    OD_FDST_8_FLAT(tf, t7, tb, t3, td, t5, t9, t1); \
-  } \
-  while (0)
-
-#define OD_IDCT_16_ASYM_FLAT(t0, t8, t4, tc, t2, ta, t6, te, \
-  t1, t1h, t9, t9h, t5, t5h, td, tdh, t3, t3h, tb, tbh, t7, t7h, tf, tfh) \
-  /* Embedded 16-point asymmetric Type-II iDCT with flattened rotations. */ \
-  do { \
-    OD_IDST_8_FLAT(tf, tb, td, t9, te, ta, tc, t8); \
-    OD_IDCT_8_FLAT(t0, t4, t2, t6, t1, t5, t3, t7); \
-    t1 -= te; \
-    t1h = OD_RSHIFT1(t1); \
-    te += t1h; \
-    t9 = t6 - t9; \
-    t9h = OD_RSHIFT1(t9); \
-    t6 -= t9h; \
-    t5 -= ta; \
-    t5h = OD_RSHIFT1(t5); \
-    ta += t5h; \
-    td = t2 - td; \
-    tdh = OD_RSHIFT1(td); \
-    t2 -= tdh; \
-    t3 -= tc; \
-    t3h = OD_RSHIFT1(t3); \
-    tc += t3h; \
-    tb = t4 - tb; \
-    tbh = OD_RSHIFT1(tb); \
-    t4 -= tbh; \
-    t7 -= t8; \
-    t7h = OD_RSHIFT1(t7); \
-    t8 += t7h; \
-    tf = t0 - tf; \
-    tfh = OD_RSHIFT1(tf); \
-    t0 -= tfh; \
-  } \
-  while (0)
-
 #define OD_FDST_16_PR(s0, s8, s4, sc, s2, sa, s6, se, \
   s1, s9, s5, sd, s3, sb, s7, sf) \
   /* Embedded 16-point orthonormal Type-IV fDST. */ \
@@ -2119,471 +1236,6 @@
   } \
   while (0)
 
-#define OD_FDST_16_FLAT(s0, s8, s4, sc, s2, sa, s6, se, \
-  s1, s9, s5, sd, s3, sb, s7, sf) \
-  /* Embedded 16-point orthonormal Type-IV fDST with flattened rotations. */ \
-  do { \
-    int t_; \
-    int u_; \
-    int s0h; \
-    int s4h; \
-    int sbh; \
-    int sfh; \
-    t_ = s1 + se; \
-    /* 32729/32768 ~= (Sin[17*Pi/64] + Cos[17*Pi/64])/Sqrt[2] ~=
-        0.9987954562051723 */ \
-    u_ = (se*32729 + 16384) >> 15; \
-    /* 201/2048 ~= (Sin[17*Pi/64] - Cos[17*Pi/64])*Sqrt[2] ~=
-        0.09813534865483615 */ \
-    se = (s1*201 + 1024) >> 11; \
-    /* 31121/32768 ~= Cos[17*Pi/64]*Sqrt[2] = 0.9497277818777543 */ \
-    t_ = (t_*31121 + 16384) >> 15; \
-    se += t_; \
-    s1 = u_ - OD_RSHIFT1(t_); \
-    t_ = s6 - s9; \
-    /* 32413/32768 ~= (Sin[19*Pi/64] + Cos[19*Pi/64])/Sqrt[2] ~=
-        0.9891765099647809 */ \
-    u_ = (s9*32413 + 16384) >> 15; \
-    /* 601/2048 ~= (Sin[19*Pi/64] - Cos[19*Pi/64])*Sqrt[2]
-        ~= 0.29346094891072355 */ \
-    s9 = (s6*601 + 1024) >> 11; \
-    /* 27605/32768 ~= Cos[19*Pi/64]*Sqrt[2] = 0.8424460355094193 */ \
-    t_ = (t_*27605 + 16384) >> 15; \
-    s9 += t_; \
-    s6 = u_ + OD_RSHIFT1(t_); \
-    t_ = s5 + sa; \
-    /* 15893/16384 ~= (Sin[21*Pi/64] + Cos[21*Pi/64])/Sqrt[2] ~=
-        0.970031253194544 */ \
-    u_ = (sa*15893 + 8192) >> 14; \
-    /* 3981/8192 ~= (Sin[21*Pi/64] - Cos[21*Pi/64])*Sqrt[2] ~=
-        0.48596035980652796 */ \
-    sa = (s5*3981 + 4096) >> 13; \
-    /* 1489/2048 ~= Cos[21*Pi/64]*Sqrt[2] ~= 0.72705107329128 */ \
-    t_ = (t_*1489 + 1024) >> 11; \
-    sa += t_; \
-    s5 = OD_RSHIFT1(t_) - u_; \
-    t_ = sd - s2; \
-    /* 30853/32768 ~= (Sin[23*Pi/64] + Cos[23*Pi/64])/Sqrt[2] ~=
-        0.9415440651830208 */ \
-    u_ = (sd*30853 + 16384) >> 15; \
-    /* 11039/16384 ~= (Sin[23*Pi/64] - Cos[23*Pi/64])*Sqrt[2] ~=
-        0.6737797067844402 */ \
-    sd = (s2*11039 + 8192) >> 14; \
-    /* 19813/32768 ~= Cos[23*Pi/64]*Sqrt[2] ~= 0.6046542117908008 */ \
-    t_ = (t_*19813 + 16384) >> 15; \
-    sd -= t_; \
-    s2 = OD_RSHIFT1(t_) - u_; \
-    t_ = s3 + sc; \
-    /* 14811/16384 ~= (Sin[25*Pi/64] + Cos[25*Pi/64])/Sqrt[2] ~=
-        0.9039892931234433 */ \
-    u_ = (sc*14811 + 8192) >> 14; \
-    /* 7005/8192 ~= (Sin[25*Pi/64] - Cos[25*Pi/64])*Sqrt[2] ~=
-        0.8551101868605642 */ \
-    sc = (s3*7005 + 4096) >> 13; \
-    /* 3903/8192 ~= Cos[25*Pi/64]*Sqrt[2] ~= 0.47643419969316125 */ \
-    t_ = (t_*3903 + 4096) >> 13; \
-    sc += t_; \
-    s3 = u_ - OD_RSHIFT1(t_); \
-    t_ = sb - s4; \
-    /* 14053/16384 ~= (Sin[27*Pi/64] + Cos[27*Pi/64])/Sqrt[2] ~=
-        0.857728610000272 */ \
-    u_ = (sb*14053 + 8192) >> 14; \
-    /* 8423/8192 ~= (Sin[27*Pi/64] - Cos[27*Pi/64])*Sqrt[2] ~=
-        1.0282054883864435 */ \
-    sb = (s4*8423 + 4096) >> 13; \
-    /* 2815/8192 ~= Cos[27*Pi/64]*Sqrt[2] = 0.34362586580705035 */ \
-    t_ = (t_*2815 + 4096) >> 13; \
-    sb -= t_; \
-    s4 = OD_RSHIFT1(t_) - u_; \
-    t_ = s7 + s8; \
-    /* 1645/2048 ~= (Sin[29*Pi/64] + Cos[29*Pi/64])/Sqrt[2] ~=
-        0.8032075314806449 */ \
-    u_ = (s8*1645 + 1024) >> 11; \
-    /* 305/256 ~= (Sin[29*Pi/64] - Cos[29*Pi/64])*Sqrt[2] ~=
-        1.1913986089848667 */ \
-    s8 = (s7*305 + 128) >> 8; \
-    /* 425/2048 ~= Cos[29*Pi/64]*Sqrt[2] ~= 0.20750822698821159 */ \
-    t_ = (t_*425 + 1024) >> 11; \
-    s8 += t_; \
-    s7 = u_ - OD_RSHIFT1(t_); \
-    t_ = s0 - sf; \
-    /* 24279/32768 ~= (Sin[31*Pi/64] + Cos[31*Pi/64])/Sqrt[2] ~=
-        0.7409511253549591 */ \
-    u_ = (sf*24279 + 16384) >> 15; \
-    /* 44011/32768 ~= (Sin[31*Pi/64] - Cos[31*Pi/64])*Sqrt[2] ~=
-        1.3431179096940369 */ \
-    sf = (s0*44011 + 16384) >> 15; \
-    /* 1137/16384 ~= Cos[31*Pi/64]*Sqrt[2] ~= 0.0693921705079406 */ \
-    t_ = (t_*1137 + 8192) >> 14; \
-    s0 = u_ + OD_RSHIFT1(t_); \
-    sf += t_; \
-    s3 -= OD_RSHIFT1(sd); \
-    sd += s3; \
-    s2 += OD_RSHIFT1(sc); \
-    sc -= s2; \
-    s5 -= OD_RSHIFT1(sb); \
-    sb += s5; \
-    s4 -= OD_RSHIFT1(sa); \
-    sa += s4; \
-    s1 += OD_RSHIFT1(sf); \
-    sf -= s1; \
-    s7 -= OD_RSHIFT1(s9); \
-    s9 += s7; \
-    s6 -= OD_RSHIFT1(s8); \
-    s8 += s6; \
-    s0 += OD_RSHIFT1(se); \
-    se -= s0; \
-    sa -= s9; \
-    s9 += OD_RSHIFT1(sa); \
-    s5 += s6; \
-    s6 -= OD_RSHIFT1(s5); \
-    s1 -= s2; \
-    s2 += OD_RSHIFT1(s1); \
-    se += sd; \
-    sd -= OD_RSHIFT1(se); \
-    s0 += sc; \
-    s0h = OD_RSHIFT1(s0); \
-    sc -= s0h; \
-    sf -= s3; \
-    sfh = OD_RSHIFT1(sf); \
-    s3 += sfh; \
-    sb += s7; \
-    sbh = OD_RSHIFT1(sb); \
-    s7 -= sbh; \
-    s4 += s8; \
-    s4h = OD_RSHIFT1(s4); \
-    s8 -= s4h; \
-    t_ = OD_PAVG(s1, se); \
-    /* 9633/8192 ~= Sin[7*Pi/16] + Cos[7*Pi/16] ~= 1.1758756024193586 */ \
-    u_ = (s1*9633 + 4096) >> 13; \
-    /* 12873/16384 ~= Sin[7*Pi/16] - Cos[7*Pi/16] ~= 0.7856949583871022 */ \
-    s1 = (se*12873 + 8192) >> 14; \
-    /* 12785/32768 ~= 2*Cos[7*Pi/16] ~= 0.3901806440322565 */ \
-    t_ = (t_*12785 + 16384) >> 15; \
-    s1 += t_; \
-    se = u_ - t_; \
-    t_ = s6 + s9; \
-    /* 45451/32768 ~= Sin[5*Pi/16] + Cos[5*Pi/16] ~= 1.3870398453221475 */ \
-    u_ = (s9*45451 + 16384) >> 15; \
-    /* 9041/32768 ~= Sin[5*Pi/16] - Cos[5*Pi/16] ~= 0.27589937928294306 */ \
-    s9 = (s6*9041 + 16384) >> 15; \
-    /* 18205/32768 ~= Cos[5*Pi/16] ~= 0.5555702330196022 */ \
-    t_ = (t_*18205 + 16384) >> 15; \
-    s9 += t_; \
-    s6 = u_ - t_; \
-    t_ = OD_PAVG(s5, sa); \
-    /* 22725/16384 ~= Sin[5*Pi/16] + Cos[5*Pi/16] ~= 1.3870398453221475 */ \
-    u_ = (sa*22725 + 8192) >> 14; \
-    /* 9041/32768 ~= Sin[5*Pi/16] - Cos[5*Pi/16] ~= 0.27589937928294306 */ \
-    sa = (s5*9041 + 16384) >> 15; \
-    /* 18205/16384 ~= 2*Cos[5*Pi/16] ~= 1.1111404660392044 */ \
-    t_ = (t_*18205 + 8192) >> 14; \
-    sa += t_; \
-    s5 = t_ - u_; \
-    t_ = s2 + sd; \
-    /* 38531/32768 ~= Sin[7*Pi/16] + Cos[7*Pi/16] ~= 1.1758756024193586 */ \
-    u_ = (s2*38531 + 16384) >> 15; \
-    /* 12873/16384 ~= Sin[7*Pi/16] - Cos[7*Pi/16] ~= 0.7856949583871022 */ \
-    s2 = (sd*12873 + 8192) >> 14; \
-    /* 6393/32768 ~= Cos[7*Pi/16] ~= 0.19509032201612825 */ \
-    t_ = (t_*6393 + 16384) >> 15; \
-    s2 += t_; \
-    sd = u_ - t_; \
-    s3 -= s4h; \
-    s4 += s3; \
-    s8 -= s0h; \
-    s0 += s8; \
-    s7 += sfh; \
-    sf -= s7; \
-    sc += sbh; \
-    sb -= sc; \
-    s6 += OD_RSHIFT1(se) ;\
-    se -= s6; \
-    s9 -= OD_RSHIFT1(s1); \
-    s1 += s9; \
-    sd -= OD_RSHIFT1(s5); \
-    s5 += sd; \
-    s2 -= OD_RSHIFT1(sa); \
-    sa += s2; \
-    t_ = OD_PAVG(s3, sc); \
-    /* 21407/16384 ~= Sin[3*Pi/8] + Cos[3*Pi/8] ~= 1.3065629648763766 */ \
-    u_ = (s3*21407 + 8192) >> 14; \
-    /* 8867/16384 ~= Sin[3*Pi/8] - Cos[3*Pi/8] ~= 0.5411961001461969 */ \
-    s3 = (sc*8867 + 8192) >> 14; \
-    /* 3135/4096 ~= 2*Cos[3*Pi/8] ~= 0.7653668647301796 */ \
-    t_ = (t_*3135 + 2048) >> 12; \
-    s3 += t_; \
-    sc = u_ - t_; \
-    t_ = OD_PAVG(s4, sb); \
-    /* 21407/16384 ~= Sin[3*Pi/8] + Cos[3*Pi/8] ~= 1.3065629648763766 */ \
-    u_ = (s4*21407 + 8192) >> 14; \
-    /* 8867/16384 ~= Sin[3*Pi/8] - Cos[3*Pi/8] ~= 0.5411961001461969 */ \
-    s4 = (sb*8867 + 8192) >> 14; \
-    /* 3135/4096 ~= 2*Cos[3*Pi/8] ~= 0.7653668647301796 */ \
-    t_ = (t_*3134 + 2048) >> 12; \
-    s4 += t_; \
-    sb = u_ - t_; \
-    t_ = OD_PAVG(s5, sa); \
-    /* 11585/8192 ~= Sin[Pi/4] + Cos[Pi/4] ~= 1.4142135623730951 */ \
-    u_ = (sa*11585 + 4096) >> 13; \
-    /* 11585/8192 ~= 2*Cos[Pi/4] ~= 1.4142135623730951 */ \
-    sa = (t_*11585 + 4096) >> 13; \
-    s5 = sa - u_; \
-    t_ = OD_PAVG(s6, -s9); \
-    /* 11585/8192 ~= Sin[Pi/4] + Cos[Pi/4] ~= 1.4142135623730951 */ \
-    s6 = (s9*11585 + 4096) >> 13; \
-    /* 11585/8192 ~= 2*Cos[Pi/4] ~= 1.4142135623730951 */ \
-    s9 = (t_*11585 + 4096) >> 13; \
-    s6 += s9; \
-    t_ = OD_PAVG(s7, -s8); \
-    /* 11585/8192 ~= Sin[Pi/4] + Cos[Pi/4] ~= 1.4142135623730951 */ \
-    s7 = (s8*11585 + 4096) >> 13; \
-    /* 11585/8192 ~= 2*Cos[Pi/4] ~= 1.4142135623730951 */ \
-    s8 = (t_*11585 + 4096) >> 13; \
-    s7 += s8; \
-  } \
-  while (0)
-
-#define OD_IDST_16_FLAT(s0, s1, s2, s3, s4, s5, s6, s7, \
-  s8, s9, sa, sb, sc, sd, se, sf) \
-  /* Embedded 16-point orthonormal Type-IV iDST with flattened rotations. */ \
-  do { \
-    int t_; \
-    int u_; \
-    int s0h; \
-    int s1h; \
-    int s2h; \
-    int s3h; \
-    int s4h; \
-    int s5h; \
-    int s6h; \
-    int s7h; \
-    int sbh; \
-    int sfh; \
-    t_ = OD_PAVG(s6, s9); \
-    /* 11585/8192 ~= Sin[Pi/4] + Cos[Pi/4] ~= 1.4142135623730951 */ \
-    s9 = (s6*11585 + 4096) >> 13; \
-    /* 11585/8192 ~= 2*Cos[Pi/4] ~= 1.4142135623730951 */ \
-    s6 = (t_*11585 + 4096) >> 13; \
-    s9 -= s6; \
-    t_ = OD_PAVG(s5, sa); \
-    /* 11585/8192 ~= Sin[Pi/4] + Cos[Pi/4] ~= 1.4142135623730951 */ \
-    sa = (s5*11585 + 4096) >> 13; \
-    /* 11585/8192 ~= 2*Cos[Pi/4] ~= 1.4142135623730951 */ \
-    s5 = (t_*11585 + 4096) >> 13; \
-    sa -= s5; \
-    t_ = OD_PAVG(s7, s8); \
-    /* 11585/8192 ~= Sin[Pi/4] + Cos[Pi/4] ~= 1.4142135623730951 */ \
-    s8 = (s7*11585 + 4096) >> 13; \
-    /* 11585/8192 ~= 2*Cos[Pi/4] ~= 1.4142135623730951 */ \
-    s7 = (t_*11585 + 4096) >> 13; \
-    s8 -= s7; \
-    t_ = OD_PAVG(s3, sc); \
-    /* 21407/16384 ~= Sin[3*Pi/8] + Cos[3*Pi/8] ~= 1.3065629648763766 */ \
-    u_ = (s3*21407 + 8192) >> 14; \
-    /* 8867/16384 ~= Sin[3*Pi/8] - Cos[3*Pi/8] ~= 0.5411961001461969 */ \
-    s3 = (sc*8867 + 8192) >> 14; \
-    /* 3135/4096 ~= 2*Cos[3*Pi/8] ~= 0.7653668647301796 */ \
-    t_ = (t_*3135 + 2048) >> 12; \
-    s3 += t_; \
-    sc = u_ - t_; \
-    t_ = OD_PAVG(sb, -s4); \
-    /* 21407/16384 ~= Sin[3*Pi/8] + Cos[3*Pi/8] ~= 1.3065629648763766 */ \
-    u_ = (sb*21407 + 8192) >> 14; \
-    /* 8867/16384 ~= Sin[3*Pi/8] - Cos[3*Pi/8] ~= 0.5411961001461969 */ \
-    sb = (s4*8867 + 8192) >> 14; \
-    /* 3135/4096 ~= 2*Cos[3*Pi/8] ~= 0.7653668647301796 */ \
-    t_ = (t_*3135 + 2048) >> 12; \
-    sb -= t_; \
-    s4 = t_ - u_; \
-    sa += s2; \
-    s2 -= OD_RSHIFT1(sa); \
-    s5 -= sd; \
-    sd += OD_RSHIFT1(s5); \
-    s1 -= s9; \
-    s9 += OD_RSHIFT1(s1); \
-    se += s6; \
-    s6 -= OD_RSHIFT1(se); \
-    sb += sc; \
-    sbh = OD_RSHIFT1(sb); \
-    sc -= sbh; \
-    sf += s7; \
-    sfh = OD_RSHIFT1(sf); \
-    s7 -= sfh; \
-    s0 -= s8; \
-    s0h = OD_RSHIFT1(s0); \
-    s8 += s0h; \
-    s4 += s3; \
-    s4h = OD_RSHIFT1(s4); \
-    s3 -= s4h; \
-    t_ = sd - s2; \
-    /* 38531/32768 ~= Sin[7*Pi/16] + Cos[7*Pi/16] ~= 1.1758756024193586 */ \
-    u_ = (sd*38531 + 16384) >> 15; \
-    /* 12873/16384 ~= Sin[7*Pi/16] - Cos[7*Pi/16] ~= 0.7856949583871022 */ \
-    sd = (s2*12873 + 8192) >> 14; \
-    /* 6393/32768 ~= Cos[7*Pi/16] ~= 0.19509032201612825 */ \
-    t_ = (t_*6393 + 16384) >> 15; \
-    sd -= t_; \
-    s2 = t_ - u_; \
-    t_ = OD_PAVG(s5, -sa); \
-    /* 22725/16384 ~= Sin[5*Pi/16] + Cos[5*Pi/16] ~= 1.3870398453221475 */ \
-    u_ = (s5*22725 + 8192) >> 14; \
-    /* 9041/32768 ~= Sin[5*Pi/16] - Cos[5*Pi/16] ~= 0.27589937928294306 */ \
-    s5 = (sa*9041 + 16384) >> 15; \
-    /* 18205/16384 ~= 2*Cos[5*Pi/16] ~= 1.1111404660392044 */ \
-    t_ = (t_*18205 + 8192) >> 14; \
-    s5 -= t_; \
-    sa = t_ - u_; \
-    t_ = s6 + s9; \
-    /* 45451/32768 ~= Sin[5*Pi/16] + Cos[5*Pi/16] ~= 1.3870398453221475 */ \
-    u_ = (s9*45451 + 16384) >> 15; \
-    /* 9041/32768 ~= Sin[5*Pi/16] - Cos[5*Pi/16] ~= 0.27589937928294306 */ \
-    s9 = (s6*9041 + 16384) >> 15; \
-    /* 18205/32768 ~= Cos[5*Pi/16] ~= 0.5555702330196022 */ \
-    t_ = (t_*18205 + 16384) >> 15; \
-    s9 += t_; \
-    s6 = u_ - t_; \
-    t_ = OD_PAVG(s1, se); \
-    /* 9633/8192 ~= Sin[7*Pi/16] + Cos[7*Pi/16] ~= 1.1758756024193586 */ \
-    u_ = (s1*9633 + 4096) >> 13; \
-    /* 12873/16384 ~= Sin[7*Pi/16] - Cos[7*Pi/16] ~= 0.7856949583871022 */ \
-    s1 = (se*12873 + 8192) >> 14; \
-    /* 12785/32768 ~= 2*Cos[7*Pi/16] ~= 0.3901806440322565 */ \
-    t_ = (t_*12785 + 16384) >> 15; \
-    s1 += t_; \
-    se = u_ - t_; \
-    s8 -= s4h; \
-    s4 += s8; \
-    s7 += sbh; \
-    sb -= s7; \
-    s3 -= sfh; \
-    sf += s3; \
-    sc += s0h; \
-    s0 -= sc; \
-    sd += OD_RSHIFT1(se); \
-    se -= sd; \
-    s2 += OD_RSHIFT1(s1); \
-    s1 -= s2; \
-    s6 -= OD_RSHIFT1(s5); \
-    s5 += s6; \
-    s9 -= OD_RSHIFT1(sa); \
-    sa += s9; \
-    s0 -= se; \
-    s0h = OD_RSHIFT1(s0); \
-    se += s0h; \
-    s1 -= sf; \
-    s1h = OD_RSHIFT1(s1); \
-    sf += s1h; \
-    s2 += sc; \
-    s2h = OD_RSHIFT1(s2); \
-    sc -= s2h; \
-    s3 += sd; \
-    s3h = OD_RSHIFT1(s3); \
-    sd -= s3h; \
-    s4 -= sa; \
-    s4h = OD_RSHIFT1(s4); \
-    sa += s4h; \
-    s5 -= sb; \
-    s5h = OD_RSHIFT1(s5); \
-    sb += s5h; \
-    s6 += s8; \
-    s6h = OD_RSHIFT1(s6); \
-    s8 -= s6h; \
-    s7 += s9; \
-    s7h = OD_RSHIFT1(s7); \
-    s9 -= s7h; \
-    t_ = se - s1h; \
-    /* 32729/32768 ~= (Sin[17*Pi/64] + Cos[17*Pi/64])/Sqrt[2] ~=
-        0.9987954562051723 */ \
-    u_ = (s1*32729 + 16384) >> 15; \
-    /* 201/2048 ~= (Sin[17*Pi/64] - Cos[17*Pi/64])*Sqrt[2] ~=
-        0.09813534865483615 */ \
-    s1 = (se*201 + 1024) >> 11; \
-    /* 31121/32768 ~= Cos[17*Pi/64]*Sqrt[2] ~=
-        0.9497277818777543 */ \
-    t_ = (t_*31121 + 16384) >> 15; \
-    s1 += t_; \
-    se = u_ + t_; \
-    t_ = s6h + s9; \
-    /* 32413/32768 ~= (Sin[19*Pi/64] + Cos[19*Pi/64])/Sqrt[2] ~=
-        0.9891765099647809 */ \
-    u_ = (s6*32413 + 16384) >> 15; \
-    /* 601/2048 ~= (Sin[19*Pi/64] - Cos[19*Pi/64])*Sqrt[2] ~=
-        0.29346094891072355 */ \
-    s6 = (s9*601 + 1024) >> 11; \
-    /* 27605/32768 ~= Cos[19*Pi/64]*Sqrt[2] ~= 0.8424460355094193 */ \
-    t_ = (t_*27605 + 16384) >> 15; \
-    s6 += t_; \
-    s9 = u_ - t_; \
-    t_ = sa - s5h; \
-    /* 15893/16384 ~= (Sin[21*Pi/64] + Cos[21*Pi/64])/Sqrt[2] ~=
-        0.970031253194544 */ \
-    u_ = (s5*15893 + 8192) >> 14; \
-    /* 3981/8192 ~= (Sin[21*Pi/64] - Cos[21*Pi/64])*Sqrt[2] ~=
-        0.48596035980652796 */ \
-    s5 = (sa*3981 + 4096) >> 13; \
-    /* 1489/2048 ~= Cos[21*Pi/64]*Sqrt[2] ~= 0.72705107329128 */ \
-    t_ = (t_*1489 + 1024) >> 11; \
-    s5 += t_; \
-    sa = u_ + t_; \
-    t_ = s2h + sd; \
-    /* 30853/32768 ~= (Sin[23*Pi/64] + Cos[23*Pi/64])/Sqrt[2] ~=
-        0.9415440651830208 */ \
-    u_ = (s2*30853 + 16384) >> 15; \
-    /* 11039/16384 ~= (Sin[23*Pi/64] - Cos[23*Pi/64])*Sqrt[2] ~=
-        0.6737797067844402 */ \
-    s2 = (sd*11039 + 8192) >> 14; \
-    /* 19813/32768 ~= Cos[23*Pi/64]*Sqrt[2] ~= 0.6046542117908008 */ \
-    t_ = (t_*19813 + 16384) >> 15; \
-    s2 += t_; \
-    sd = u_ - t_; \
-    t_ = sc - s3h; \
-    /* 14811/16384 ~= (Sin[25*Pi/64] + Cos[25*Pi/64])/Sqrt[2] ~=
-        0.9039892931234433 */ \
-    u_ = (s3*14811 + 8192) >> 14; \
-    /* 7005/8192 ~= (Sin[25*Pi/64] - Cos[25*Pi/64])*Sqrt[2] ~=
-        0.8551101868605642 */ \
-    s3 = (sc*7005 + 4096) >> 13; \
-    /* 3903/8192 ~= Cos[25*Pi/64]*Sqrt[2] ~= 0.47643419969316125 */ \
-    t_ = (t_*3903 + 4096) >> 13; \
-    s3 += t_; \
-    sc = u_ + t_; \
-    t_ = s4h + sb; \
-    /* 14053/16384 ~= (Sin[27*Pi/64] + Cos[27*Pi/64])/Sqrt[2] ~=
-        0.857728610000272 */ \
-    u_ = (s4*14053 + 8192) >> 14; \
-    /* 8423/8192 ~= (Sin[27*Pi/64] - Cos[27*Pi/64])*Sqrt[2] ~=
-        1.0282054883864435 */ \
-    s4 = (sb*8423 + 4096) >> 13; \
-    /* 2815/8192 ~= Cos[27*Pi/64]*Sqrt[2] ~= 0.34362586580705035 */ \
-    t_ = (t_*2815 + 4096) >> 13; \
-    s4 += t_; \
-    sb = u_ - t_; \
-    t_ = s8 - s7h; \
-    /* 1645/2048 ~= (Sin[29*Pi/64] + Cos[29*Pi/64])/Sqrt[2] ~=
-        0.8032075314806449 */ \
-    u_ = (s7*1645 + 1024) >> 11; \
-    /* 305/256 ~= (Sin[29*Pi/64] - Cos[29*Pi/64])*Sqrt[2] ~=
-        1.1913986089848667 */ \
-    s7 = (s8*305 + 128) >> 8; \
-    /* 425/2048 ~= Cos[29*Pi/64]*Sqrt[2] ~= 0.20750822698821159 */ \
-    t_ = (t_*425 + 1024) >> 11; \
-    s7 += t_; \
-    s8 = u_ + t_; \
-    t_ = s0h + sf; \
-    /* 24279/32768 ~= (Sin[31*Pi/64] + Cos[31*Pi/64])/Sqrt[2] ~=
-        0.7409511253549591 */ \
-    u_ = (s0*24279 + 16384) >> 15; \
-    /* 44011/32768 ~= (Sin[31*Pi/64] - Cos[31*Pi/64])*Sqrt[2] ~=
-        1.3431179096940369 */ \
-    s0 = (sf*44011 + 16384) >> 15; \
-    /* 1137/16384 ~= Cos[31*Pi/64]*Sqrt[2] ~= 0.06939217050794069 */ \
-    t_ = (t_*1137 + 8192) >> 14; \
-    s0 += t_; \
-    sf = u_ - t_; \
-  } \
-  while (0)
-
 /* TODO: rewrite this to match OD_FDST_16. */
 #define OD_FDST_16_ASYM_PR(t0, t0h, t8, t4, t4h, tc, t2, ta, t6, te, \
   t1, t9, t5, td, t3, tb, t7, t7h, tf) \
@@ -3019,444 +1671,6 @@
   } \
   while (0)
 
-#define OD_FDST_16_ASYM_FLAT(s0, s0h, s8, s4, s4h, sc, s2, s2h, sa, s6, s6h, \
-  se, s1, s1h, s9, s5, s5h, sd, s3, s3h, sb, s7, s7h, sf) \
-  /* Embedded 16-point asymmetric Type-IV fDST with flattened rotations. */ \
-  do { \
-    int t_; \
-    int u_; \
-    int sbh; \
-    int sfh; \
-    t_ = s1h + se; \
-    /* 46285/32768 ~= Sin[17*Pi/64] + Cos[17*Pi/64] ~= 1.4125100802019774 */ \
-    u_ = (se*46285 + 16384) >> 15; \
-    /* 1137/16384 ~= Sin[17*Pi/64] - Cos[17*Pi/64] ~= 0.06939217050794078 */ \
-    se = (s1*1137 + 8192) >> 14; \
-    /* 44011/32768 ~= Cos[17*Pi/64]*2 ~= 1.3431179096940367 */ \
-    t_ = (t_*44011 + 16384) >> 15; \
-    se += t_; \
-    s1 = u_ - OD_RSHIFT1(t_); \
-    t_ = s6h - s9; \
-    /* 45839/32768 ~= Sin[19*Pi/64] + Cos[19*Pi/64] ~= 1.3989068359730783 */ \
-    u_ = (s9*45839 + 16384) >> 15; \
-    /* 425/2048 ~= Sin[19*Pi/64] - Cos[19*Pi/64] ~= 0.20750822698821159 */ \
-    s9 = (425*s6 + 1024) >> 11; \
-    /* 305/256 ~= Cos[19*Pi/64]*2 ~= 1.1913986089848667 */ \
-    t_ = (305*t_ + 128) >> 8; \
-    s9 += t_; \
-    s6 = u_ + OD_RSHIFT1(t_); \
-    t_ = s5h + sa; \
-    /* 5619/4096 ~= Sin[21*Pi/64] + Cos[21*Pi/64] ~= 1.371831354193494 */ \
-    u_ = (sa*5619 + 2048) >> 12; \
-    /* 2815/8192 ~= Sin[21*Pi/64] - Cos[21*Pi/64] ~= 0.34362586580705046 */ \
-    sa = (s5*2815 + 4096) >> 13; \
-    /* 8423/8192 ~= Cos[21*Pi/64]*2 ~= 1.0282054883864433 */ \
-    t_ = (t_*8423 + 4096) >> 13; \
-    sa += t_; \
-    s5 = OD_RSHIFT1(t_) - u_; \
-    t_ = sd - s2h; \
-    /* 2727/2048 ~= Sin[23*Pi/64] + Cos[23*Pi/64] ~= 1.3315443865537255 */ \
-    u_ = (sd*2727 + 1024) >> 11; \
-    /* 3903/8192 ~= Sin[23*Pi/64] - Cos[23*Pi/64] ~= 0.47643419969316125 */ \
-    sd = (s2*3903 + 4096) >> 13; \
-    /* 7005/8192 ~= Cos[23*Pi/64]*2 ~= 0.8551101868605642 */ \
-    t_ = (t_*7005 + 4096) >> 13; \
-    sd -= t_; \
-    s2 = OD_RSHIFT1(t_) - u_; \
-    t_ = s3h + sc; \
-    /* 10473/8192 ~= Sin[25*Pi/64] + Cos[25*Pi/64] ~= 1.278433918575241 */ \
-    u_ = (sc*10473 + 4096) >> 13; \
-    /* 19813/32768 ~= Sin[25*Pi/64] - Cos[25*Pi/64] ~= 0.6046542117908007 */ \
-    sc = (s3*19813 + 16384) >> 15; \
-    /* 11039/16384 ~= Cos[25*Pi/64]*2 ~= 0.6737797067844401 */ \
-    t_ = (t_*11039 + 8192) >> 14; \
-    sc += t_; \
-    s3 = u_ - OD_RSHIFT1(t_); \
-    t_ = sb - s4h; \
-    /* 9937/8192 ~= Sin[27*Pi/64] + Cos[27*Pi/64] ~= 1.213011433097808 */ \
-    u_ = (sb*9937 + 4096) >> 13; \
-    /* 1489/2048 ~= Sin[27*Pi/64] - Cos[27*Pi/64] ~= 0.72705107329128 */ \
-    sb = (s4*1489 + 1024) >> 11; \
-    /* 3981/8192 ~= Cos[27*Pi/64]*2 ~= 0.48596035980652774 */ \
-    t_ = (t_*3981 + 4096) >> 13; \
-    sb -= t_; \
-    s4 = OD_RSHIFT1(t_) - u_; \
-    t_ = s7h + s8; \
-    /* 37221/32768 ~= Sin[29*Pi/64] + Cos[29*Pi/64] ~= 1.1359069844201428 */ \
-    u_ = (s8*37221 + 16384) >> 15; \
-    /* 27605/32768 ~= Sin[29*Pi/64] - Cos[29*Pi/64] ~= 0.8424460355094192 */ \
-    s8 = (s7*27605 + 16384) >> 15; \
-    /* 601/2048 ~= Cos[29*Pi/64]*2 ~= 0.2934609489107235 */ \
-    t_ = (t_*601 + 1024) >> 11; \
-    s8 += t_; \
-    s7 = u_ - OD_RSHIFT1(t_); \
-    t_ = s0h - sf; \
-    /* 1073/1024 ~= Sin[31*Pi/64] + Cos[31*Pi/64] ~= 1.0478631305325905 */ \
-    u_ = (sf*1073 + 512) >> 10; \
-    /* 31121/32768 ~= Sin[31*Pi/64] - Cos[31*Pi/64] ~= 0.9497277818777544 */ \
-    sf = (s0*31121 + 16384) >> 15; \
-    /* 201/2048 ~= Cos[31*Pi/64]*2 ~= 0.09813534865483603 */ \
-    t_ = (t_*201 + 1024) >> 11; \
-    s0 = u_ + OD_RSHIFT1(t_); \
-    sf += t_; \
-    s3 -= OD_RSHIFT1(sd); \
-    sd += s3; \
-    s2 += OD_RSHIFT1(sc); \
-    sc -= s2; \
-    s5 -= OD_RSHIFT1(sb); \
-    sb += s5; \
-    s4 -= OD_RSHIFT1(sa); \
-    sa += s4; \
-    s1 += OD_RSHIFT1(sf); \
-    sf -= s1; \
-    s7 -= OD_RSHIFT1(s9); \
-    s9 += s7; \
-    s6 -= OD_RSHIFT1(s8); \
-    s8 += s6; \
-    s0 += OD_RSHIFT1(se); \
-    se -= s0; \
-    sa -= s9; \
-    s9 += OD_RSHIFT1(sa); \
-    s5 += s6; \
-    s6 -= OD_RSHIFT1(s5); \
-    s1 -= s2; \
-    s2 += OD_RSHIFT1(s1); \
-    se += sd; \
-    sd -= OD_RSHIFT1(se); \
-    s0 += sc; \
-    s0h = OD_RSHIFT1(s0); \
-    sc -= s0h; \
-    sf -= s3; \
-    sfh = OD_RSHIFT1(sf); \
-    s3 += sfh; \
-    sb += s7; \
-    sbh = OD_RSHIFT1(sb); \
-    s7 -= sbh; \
-    s4 += s8; \
-    s4h = OD_RSHIFT1(s4); \
-    s8 -= s4h; \
-    t_ = s1 + se; \
-    /* 9633/8192 ~= Sin[7*Pi/16] + Cos[7*Pi/16] ~= 1.1758756024193586 */ \
-    u_ = (s1*9633 + 4096) >> 13; \
-    /* 12873/16384 ~= Sin[7*Pi/16] - Cos[7*Pi/16] ~= 0.7856949583871022 */ \
-    s1 = (se*12873 + 8192) >> 14; \
-    /* 6393/32768 ~= Cos[7*Pi/16] ~= 0.19509032201612825 */ \
-    t_ = (t_*6393 + 16384) >> 15; \
-    s1 += t_; \
-    se = u_ - t_; \
-    t_ = s6 + s9; \
-    /* 22725/16384 ~= Sin[5*Pi/16] + Cos[5*Pi/16] ~= 1.3870398453221475 */ \
-    u_ = (s9*22725 + 8192) >> 14; \
-    /* 9041/32768 ~= Sin[5*Pi/16] - Cos[5*Pi/16] ~= 0.27589937928294306 */ \
-    s9 = (s6*9041 + 16384) >> 15; \
-    /* 18205/32768 ~= Cos[5*Pi/16] ~= 0.5555702330196022 */ \
-    t_ = (t_*18205 + 16384) >> 15; \
-    s9 += t_; \
-    s6 = u_ - t_; \
-    t_ = s5 + sa; \
-    /* 11363/8192 ~= Sin[5*Pi/16] + Cos[5*Pi/16] ~= 1.3870398453221475 */ \
-    u_ = (sa*11363 + 4096) >> 13; \
-    /* 9041/32768 ~= Sin[5*Pi/16] - Cos[5*Pi/16] ~= 0.27589937928294306 */ \
-    sa = (s5*9041 + 16384) >> 15; \
-    /* 4551/8192 ~= Cos[5*Pi/16] ~= 0.5555702330196022 */ \
-    t_ = (t_*4551 + 4096) >> 13; \
-    sa += t_; \
-    s5 = t_ - u_; \
-    t_ = s2 + sd; \
-    /* 9633/8192 ~= Sin[7*Pi/16] + Cos[7*Pi/16] ~= 1.1758756024193586 */ \
-    u_ = (s2*9633 + 4096) >> 13; \
-    /* 12873/16384 ~= Sin[7*Pi/16] - Cos[7*Pi/16] ~= 0.7856949583871022 */ \
-    s2 = (sd*12873 + 8192) >> 14; \
-    /* 6393/32768 ~= Cos[7*Pi/16] ~= 0.19509032201612825 */ \
-    t_ = (t_*6393 + 16384) >> 15; \
-    s2 += t_; \
-    sd = u_ - t_; \
-    s3 -= s4h; \
-    s4 += s3; \
-    s8 -= s0h; \
-    s0 += s8; \
-    s7 += sfh; \
-    sf -= s7; \
-    sc += sbh; \
-    sb -= sc; \
-    s6 += OD_RSHIFT1(se) ;\
-    se -= s6; \
-    s9 -= OD_RSHIFT1(s1); \
-    s1 += s9; \
-    sd -= OD_RSHIFT1(s5); \
-    s5 += sd; \
-    s2 -= OD_RSHIFT1(sa); \
-    sa += s2; \
-    t_ = s3 + sc; \
-    /* 10703/8192 ~= Sin[3*Pi/8] + Cos[3*Pi/8] ~= 1.3065629648763766 */ \
-    u_ = (s3*10703 + 4096) >> 13; \
-    /* 8867/16348 ~= Sin[3*Pi/8] - Cos[3*Pi/8] ~= 0.5411961001461969 */ \
-    s3 = (sc*8867 + 8192) >> 14; \
-    /* 3135/8192 ~= Cos[3*Pi/8] ~= 0.3826834323650898 */ \
-    t_ = (t_*3135 + 4096) >> 13; \
-    s3 += t_; \
-    sc = u_ - t_; \
-    t_ = s4 + sb; \
-    /* 10703/8192 ~= Sin[3*Pi/8] + Cos[3*Pi/8] ~= 1.3065629648763766 */ \
-    u_ = (s4*10703 + 4096) >> 13; \
-    /* 8867/16384 ~= Sin[3*Pi/8] - Cos[3*Pi/8] ~= 0.5411961001461969 */ \
-    s4 = (sb*8867 + 8192) >> 14; \
-    /* 3135/8192 ~= Cos[3*Pi/8] ~= 0.3826834323650898 */ \
-    t_ = (t_*3135 + 4096) >> 13; \
-    s4 += t_; \
-    sb = u_ - t_; \
-    t_ = s5 + sa; \
-    /* 11585/8192 ~= Sin[Pi/4] + Cos[Pi/4] ~= 1.4142135623730951 */ \
-    u_ = (sa*11585 + 4096) >> 13; \
-    /* 11585/16384 ~= Cos[Pi/4] ~= 0.7071067811865475 */ \
-    sa = (t_*11585 + 8192) >> 14; \
-    s5 = sa - u_; \
-    t_ = s6 - s9; \
-    /* 11585/8192 ~= Sin[Pi/4] + Cos[Pi/4] ~= 1.4142135623730951 */ \
-    s6 = (s9*11585 + 4096) >> 13; \
-    /* 11585/16384 ~= Cos[Pi/4] ~= 0.7071067811865475 */ \
-    s9 = (t_*11585 + 8192) >> 14; \
-    s6 += s9; \
-    t_ = s7 - s8; \
-    /* 11585/8192 ~= Sin[Pi/4] + Cos[Pi/4] ~= 1.4142135623730951 */ \
-    s7 = (s8*11585 + 4096) >> 13; \
-    /* 11585/16384 ~= Cos[Pi/4] ~= 0.7071067811865475 */ \
-    s8 = (t_*11585 + 8192) >> 14; \
-    s7 += s8; \
-  } \
-  while (0)
-
-#define OD_IDST_16_ASYM_FLAT(s0, s1, s2, s3, s4, s5, s6, s7, \
-  s8, s9, sa, sb, sc, sd, se, sf) \
-  /* Embedded 16-point asymmetric Type-IV iDST with flattened rotations. */ \
-  do { \
-    int t_; \
-    int u_; \
-    int s0h; \
-    int s1h; \
-    int s2h; \
-    int s3h; \
-    int s4h; \
-    int s5h; \
-    int s6h; \
-    int s7h; \
-    int sbh; \
-    int sfh; \
-    t_ = s6 + s9; \
-    /* 11585/8192 ~= Sin[Pi/4] + Cos[Pi/4] ~= 1.4142135623730951 */ \
-    s9 = (s6*11585 + 4096) >> 13; \
-    /* 11585/16384 ~= Cos[Pi/4] ~= 0.7071067811865475 */ \
-    s6 = (t_*11585 + 8192) >> 14; \
-    s9 -= s6; \
-    t_ = s5 + sa; \
-    /* 11585/8192 ~= Sin[Pi/4] + Cos[Pi/4] ~= 1.4142135623730951 */ \
-    sa = (s5*11585 + 4096) >> 13; \
-    /* 11585/16384 ~= Cos[Pi/4] ~= 0.7071067811865475 */ \
-    s5 = (t_*11585 + 8192) >> 14; \
-    sa -= s5; \
-    t_ = s7 + s8; \
-    /* 11585/8192 ~= Sin[Pi/4] + Cos[Pi/4] ~= 1.4142135623730951 */ \
-    s8 = (s7*11585 + 4096) >> 13; \
-    /* 11585/16384 ~= Cos[Pi/4] ~= 0.7071067811865475 */ \
-    s7 = (t_*11585 + 8192) >> 14; \
-    s8 -= s7; \
-    t_ = s3 - sc; \
-    /* 10703/8192 ~= Sin[3*Pi/8] + Cos[3*Pi/8] ~= 1.3065629648763766 */ \
-    u_ = (sc*10703 + 4096) >> 13; \
-    /* 8867/16384 ~= Sin[3*Pi/8] - Cos[3*Pi/8] ~= 0.5411961001461969 */ \
-    sc = (s3*8867 + 8192) >> 14; \
-    /* 3135/8192 ~= Cos[3*Pi/8] ~= 0.3826834323650898 */ \
-    t_ = (t_*3135 + 4096) >> 13; \
-    sc += t_; \
-    s3 = u_ + t_; \
-    t_ = sb - s4; \
-    /* 10703/8192 ~= Sin[3*Pi/8] + Cos[3*Pi/8] ~= 1.3065629648763766 */ \
-    u_ = (sb*10703 + 4096) >> 13; \
-    /* 8867/16384 ~= Sin[3*Pi/8] - Cos[3*Pi/8] ~= 0.5411961001461969 */ \
-    sb = (s4*8867 + 8192) >> 14; \
-    /* 3135/8192 ~= Cos[3*Pi/8] ~= 0.3826834323650898 */ \
-    t_ = (t_*3135 + 4096) >> 13; \
-    sb -= t_; \
-    s4 = t_ - u_; \
-    sa += s2; \
-    s2 -= OD_RSHIFT1(sa); \
-    s5 -= sd; \
-    sd += OD_RSHIFT1(s5); \
-    s1 -= s9; \
-    s9 += OD_RSHIFT1(s1); \
-    se += s6; \
-    s6 -= OD_RSHIFT1(se); \
-    sb += sc; \
-    sbh = OD_RSHIFT1(sb); \
-    sc -= sbh; \
-    sf += s7; \
-    sfh = OD_RSHIFT1(sf); \
-    s7 -= sfh; \
-    s0 -= s8; \
-    s0h = OD_RSHIFT1(s0); \
-    s8 += s0h; \
-    s4 += s3; \
-    s4h = OD_RSHIFT1(s4); \
-    s3 -= s4h; \
-    t_ = sd - s2; \
-    /* 9633/8192 ~= Sin[7*Pi/16] + Cos[7*Pi/16] ~= 1.1758756024193586 */ \
-    u_ = (sd*9633 + 4096) >> 13; \
-    /* 12873/16384 ~= Sin[7*Pi/16] - Cos[7*Pi/16] ~= 0.7856949583871022 */ \
-    sd = (s2*12873 + 8192) >> 14; \
-    /* 6393/32768 ~= Cos[7*Pi/16] ~= 0.19509032201612825 */ \
-    t_ = (t_*6393 + 16384) >> 15; \
-    sd -= t_; \
-    s2 = t_ - u_; \
-    t_ = s5 - sa; \
-    /* 11363/8192 ~= Sin[5*Pi/16] + Cos[5*Pi/16] ~= 1.3870398453221475 */ \
-    u_ = (s5*11363 + 4096) >> 13; \
-    /* 9041/32768 ~= Sin[5*Pi/16] - Cos[5*Pi/16] ~= 0.27589937928294306 */ \
-    s5 = (sa*9041 + 16384) >> 15; \
-    /* 4551/8192 ~= Cos[5*Pi/16] ~= 0.5555702330196022 */ \
-    t_ = (t_*4551 + 4096) >> 13; \
-    s5 -= t_; \
-    sa = t_ - u_; \
-    t_ = s6 + s9; \
-    /* 22725/16384 ~= Sin[5*Pi/16] + Cos[5*Pi/16] ~= 1.3870398453221475 */ \
-    u_ = (s9*22725 + 8192) >> 14; \
-    /* 9041/32768 ~= Sin[5*Pi/16] - Cos[5*Pi/16] ~= 0.27589937928294306 */ \
-    s9 = (s6*9041 + 16384) >> 15; \
-    /* 18205/32768 ~= Cos[5*Pi/16] ~= 0.5555702330196022 */ \
-    t_ = (t_*18205 + 16384) >> 15; \
-    s9 += t_; \
-    s6 = u_ - t_; \
-    t_ = s1 + se; \
-    /* 9633/8192 ~= Sin[7*Pi/16] + Cos[7*Pi/16] ~= 1.1758756024193586 */ \
-    u_ = (s1*9633 + 4096) >> 13; \
-    /* 12873/16384 ~= Sin[7*Pi/16] - Cos[7*Pi/16] ~= 0.7856949583871022 */ \
-    s1 = (se*12873 + 8192) >> 14; \
-    /* 6393/32768 ~= Cos[7*Pi/16] ~= 0.19509032201612825 */ \
-    t_ = (t_*6393 + 16384) >> 15; \
-    s1 += t_; \
-    se = u_ - t_; \
-    s8 -= s4h; \
-    s4 += s8; \
-    s7 += sbh; \
-    sb -= s7; \
-    s3 -= sfh; \
-    sf += s3; \
-    sc += s0h; \
-    s0 -= sc; \
-    sd += OD_RSHIFT1(se); \
-    se -= sd; \
-    s2 += OD_RSHIFT1(s1); \
-    s1 -= s2; \
-    s6 -= OD_RSHIFT1(s5); \
-    s5 += s6; \
-    s9 -= OD_RSHIFT1(sa); \
-    sa += s9; \
-    s0 -= se; \
-    s0h = OD_RSHIFT1(s0); \
-    se += s0h; \
-    s1 -= sf; \
-    s1h = OD_RSHIFT1(s1); \
-    sf += s1h; \
-    s2 += sc; \
-    s2h = OD_RSHIFT1(s2); \
-    sc -= s2h; \
-    s3 += sd; \
-    s3h = OD_RSHIFT1(s3); \
-    sd -= s3h; \
-    s4 -= sa; \
-    s4h = OD_RSHIFT1(s4); \
-    sa += s4h; \
-    s5 -= sb; \
-    s5h = OD_RSHIFT1(s5); \
-    sb += s5h; \
-    s6 += s8; \
-    s6h = OD_RSHIFT1(s6); \
-    s8 -= s6h; \
-    s7 += s9; \
-    s7h = OD_RSHIFT1(s7); \
-    s9 -= s7h; \
-    t_ = se - s1h; \
-    /* 23143/32768 ~= (Sin[17*Pi/64] + Cos[17*Pi/64])/2 ~=
-        0.7062550401009887 */ \
-    u_ = (s1*23143 + 16384) >> 15; \
-    /* 1137/8192 ~= (Sin[17*Pi/64] - Cos[17*Pi/64])*2 ~=
-        0.13878434101588155 */ \
-    s1 = (se*1137 + 4096) >> 13; \
-    /* 44011/32768 ~= Cos[17*Pi/64]*2 ~= 1.3431179096940367 */ \
-    t_ = (t_*44011 + 16384) >> 15; \
-    s1 += t_; \
-    se = u_ + OD_RSHIFT1(t_); \
-    t_ = s6h + s9; \
-    /* 2865/4096 ~= (Sin[19*Pi/64] + Cos[19*Pi/64])/2 ~= 0.6994534179865391 */ \
-    u_ = (s6*2865 + 2048) >> 12; \
-    /* 13599/32768 ~= (Sin[19*Pi/64] - Cos[19*Pi/64])*2 ~=
-        0.41501645397642317 */ \
-    s6 = (s9*13599 + 16384) >> 15; \
-    /* 305/256 ~= Cos[19*Pi/64]*2 ~= 1.1913986089848667 */ \
-    t_ = (t_*305 + 128) >> 8; \
-    s6 += t_; \
-    s9 = u_ - OD_RSHIFT1(t_); \
-    t_ = sa - s5h; \
-    /* 5619/8192 ~= (Sin[21*Pi/64] + Cos[21*Pi/64])/2 ~= 0.685915677096747 */ \
-    u_ = (s5*5619 + 4096) >> 13; \
-    /* 2815/4096 ~= (Sin[21*Pi/64] - Cos[21*Pi/64])*2 ~= 0.6872517316141009 */ \
-    s5 = (sa*2815 + 2048) >> 12; \
-    /* 8423/8192 ~= Cos[21*Pi/64]*2 ~= 1.0282054883864433 */ \
-    t_ = (t_*8423 + 4096) >> 13; \
-    s5 += t_; \
-    sa = u_ + OD_RSHIFT1(t_); \
-    t_ = s2h + sd; \
-    /* 2727/4096 ~= (Sin[23*Pi/64] + Cos[23*Pi/64])/2 ~= 0.6657721932768628 */ \
-    u_ = (s2*2727 + 2048) >> 12; \
-    /* 3903/4096 ~= (Sin[23*Pi/64] - Cos[23*Pi/64])*2 ~= 0.9528683993863225 */ \
-    s2 = (sd*3903 + 2048) >> 12; \
-    /* 7005/8192 ~= Cos[23*Pi/64]*2 ~= 0.8551101868605642 */ \
-    t_ = (t_*7005 + 4096) >> 13; \
-    s2 += t_; \
-    sd = u_ - OD_RSHIFT1(t_); \
-    t_ = sc - s3h; \
-    /* 10473/16384 ~= (Sin[25*Pi/64] + Cos[25*Pi/64])/2 ~=
-        0.6392169592876205 */ \
-    u_ = (s3*10473 + 8192) >> 14; \
-    /* 39627/32768 ~= (Sin[25*Pi/64] - Cos[25*Pi/64])*2 ~=
-        1.2093084235816014 */ \
-    s3 = (sc*39627 + 16384) >> 15; \
-    /* 11039/16384 ~= Cos[25*Pi/64]*2 ~= 0.6737797067844401 */ \
-    t_ = (t_*11039 + 8192) >> 14; \
-    s3 += t_; \
-    sc = u_ + OD_RSHIFT1(t_); \
-    t_ = s4h + sb; \
-    /* 9937/16384 ~= (Sin[27*Pi/64] + Cos[27*Pi/64])/2 ~= 0.606505716548904 */ \
-    u_ = (s4*9937 + 8192) >> 14; \
-    /* 1489/1024 ~= (Sin[27*Pi/64] - Cos[27*Pi/64])*2 ~= 1.45410214658256 */ \
-    s4 = (sb*1489 + 512) >> 10; \
-    /* 3981/8192 ~= Cos[27*Pi/64]*2 ~= 0.48596035980652774 */ \
-    t_ = (t_*3981 + 4096) >> 13; \
-    s4 += t_; \
-    sb = u_ - OD_RSHIFT1(t_); \
-    t_ = s8 - s7h; \
-    /* 18611/32768 ~= (Sin[29*Pi/64] + Cos[29*Pi/64])/2 ~=
-        0.5679534922100714 */ \
-    u_ = (s7*18611 + 16384) >> 15; \
-    /* 55211/32768 ~= (Sin[29*Pi/64] - Cos[29*Pi/64])*2 ~=
-        1.6848920710188384 */ \
-    s7 = (s8*55211 + 16384) >> 15; \
-    /* 601/2048 ~= Cos[29*Pi/64]*2 ~= 0.2934609489107235 */ \
-    t_ = (t_*601 + 1024) >> 11; \
-    s7 += t_; \
-    s8 = u_ + OD_RSHIFT1(t_); \
-    t_ = s0h + sf; \
-    /* 1073/2048 ~= (Sin[31*Pi/64] + Cos[31*Pi/64])/2 ~= 0.5239315652662953 */ \
-    u_ = (s0*1073 + 1024) >> 11; \
-    /* 62241/32768 ~= (Sin[31*Pi/64] - Cos[31*Pi/64])*2 ~=
-        1.8994555637555088 */ \
-    s0 = (sf*62241 + 16384) >> 15; \
-    /* 201/2048 ~= Cos[31*Pi/64]*2 ~= 0.09813534865483603 */ \
-    t_ = (t_*201 + 1024) >> 11; \
-    s0 += t_; \
-    sf = u_ - OD_RSHIFT1(t_); \
-  } \
-  while (0)
-
 #define OD_FDCT_32_PR(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, t6, tm, \
   te, tu, t1, th, t9, tp, t5, tl, td, tt, t3, tj, tb, tr, t7, tn, tf, tv) \
   /* Embedded 32-point orthonormal Type-II fDCT. */ \
@@ -3576,138 +1790,6 @@
   } \
   while (0)
 
-#define OD_FDCT_32_FLAT(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, t6, \
-  tm, te, tu, t1, th, t9, tp, t5, tl, td, tt, t3, tj, tb, tr, t7, tn, tf, tv) \
-  /* Embedded 32-point orthonormal Type-II fDCT with flattened rotations. */ \
-  do { \
-    od_coeff tgh; \
-    od_coeff thh; \
-    od_coeff tih; \
-    od_coeff tjh; \
-    od_coeff tkh; \
-    od_coeff tlh; \
-    od_coeff tmh; \
-    od_coeff tnh; \
-    od_coeff toh; \
-    od_coeff tph; \
-    od_coeff tqh; \
-    od_coeff trh; \
-    od_coeff tsh; \
-    od_coeff tth; \
-    od_coeff tuh; \
-    od_coeff tvh; \
-    tv = t0 - tv; \
-    tvh = OD_RSHIFT1(tv); \
-    t0 -= tvh; \
-    tu += t1; \
-    tuh = OD_RSHIFT1(tu); \
-    t1 -= tuh; \
-    tt = t2 - tt; \
-    tth = OD_RSHIFT1(tt); \
-    t2 -= tth; \
-    ts += t3; \
-    tsh = OD_RSHIFT1(ts); \
-    t3 -= tsh; \
-    tr = t4 - tr; \
-    trh = OD_RSHIFT1(tr); \
-    t4 -= trh; \
-    tq += t5; \
-    tqh = OD_RSHIFT1(tq); \
-    t5 -= tqh; \
-    tp = t6 - tp; \
-    tph = OD_RSHIFT1(tp); \
-    t6 -= tph; \
-    to += t7; \
-    toh = OD_RSHIFT1(to); \
-    t7 -= toh; \
-    tn = t8 - tn; \
-    tnh = OD_RSHIFT1(tn); \
-    t8 -= tnh; \
-    tm += t9; \
-    tmh = OD_RSHIFT1(tm); \
-    t9 -= tmh; \
-    tl = ta - tl; \
-    tlh = OD_RSHIFT1(tl); \
-    ta -= tlh; \
-    tk += tb; \
-    tkh = OD_RSHIFT1(tk); \
-    tb -= tkh; \
-    tj = tc - tj; \
-    tjh = OD_RSHIFT1(tj); \
-    tc -= tjh; \
-    ti += td; \
-    tih = OD_RSHIFT1(ti); \
-    td -= tih; \
-    th = te - th; \
-    thh = OD_RSHIFT1(th); \
-    te -= thh; \
-    tg += tf; \
-    tgh = OD_RSHIFT1(tg); \
-    tf -= tgh; \
-    OD_FDCT_16_ASYM_FLAT(t0, tg, tgh, t8, to, toh, t4, tk, tkh, tc, ts, tsh, \
-     t2, ti, tih, ta, tq, tqh, t6, tm, tmh, te, tu, tuh); \
-    OD_FDST_16_ASYM_FLAT(tv, tvh, tf, tn, tnh, t7, tr, trh, tb, tj, tjh, t3, \
-     tt, tth, td, tl, tlh, t5, tp, tph, t9, th, thh, t1); \
-  } \
-  while (0)
-
-#define OD_IDCT_32_FLAT(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, t6, \
-  tm, te, tu, t1, th, t9, tp, t5, tl, td, tt, t3, tj, tb, tr, t7, tn, tf, tv) \
-  /* Embedded 32-point orthonormal Type-II iDCT with flattened rotations. */ \
-  do { \
-    od_coeff t1h; \
-    od_coeff t3h; \
-    od_coeff t5h; \
-    od_coeff t7h; \
-    od_coeff t9h; \
-    od_coeff tbh; \
-    od_coeff tdh; \
-    od_coeff tfh; \
-    od_coeff thh; \
-    od_coeff tth; \
-    od_coeff tvh; \
-    OD_IDST_16_ASYM_FLAT(tv, tn, tr, tj, tt, tl, tp, th, \
-     tu, tm, tq, ti, ts, tk, to, tg); \
-    OD_IDCT_16_ASYM_FLAT(t0, t8, t4, tc, t2, ta, t6, te, \
-     t1, t1h, t9, t9h, t5, t5h, td, tdh, t3, t3h, tb, tbh, t7, t7h, tf, tfh); \
-    tu += t1h; \
-    t1 -= tu; \
-    thh = OD_RSHIFT1(th); \
-    te += thh; \
-    th = te - th; \
-    tm += t9h; \
-    t9 -= tm; \
-    t6 += OD_RSHIFT1(tp); \
-    tp = t6 - tp; \
-    tq += t5h; \
-    t5 -= tq; \
-    ta += OD_RSHIFT1(tl); \
-    tl = ta - tl; \
-    ti += tdh; \
-    td -= ti; \
-    tth = OD_RSHIFT1(tt); \
-    t2 += tth; \
-    tt = t2 - tt; \
-    ts += t3h; \
-    t3 -= ts; \
-    tc += OD_RSHIFT1(tj); \
-    tj = tc - tj; \
-    tk += tbh; \
-    tb -= tk; \
-    t4 += OD_RSHIFT1(tr); \
-    tr = t4 - tr; \
-    to += t7h; \
-    t7 -= to; \
-    t8 += OD_RSHIFT1(tn); \
-    tn = t8 - tn; \
-    tg += tfh; \
-    tf -= tg; \
-    tvh = OD_RSHIFT1(tv); \
-    t0 += tvh; \
-    tv = t0 - tv; \
-  } \
-  while (0)
-
 /* Embedded 32-point orthonormal Type-IV fDST. */
 #define OD_FDST_32_PR(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, ta, tb, tc, td, \
   te, tf, tg, th, ti, tj, tk, tl, tm, tn, to, tp, tq, tr, ts, tt, tu, tv) \
@@ -5682,7 +3764,7 @@
   q1 = x[1*xstride];
   q2 = x[2*xstride];
   q3 = x[3*xstride];
-  OD_FDCT_4_FLAT(q0, q1, q2, q3);
+  od_fdct_4(&q0, &q1, &q2, &q3);
   y[0] = (od_coeff)q0;
   y[1] = (od_coeff)q2;
   y[2] = (od_coeff)q1;
@@ -5699,7 +3781,7 @@
   q2 = y[1];
   q1 = y[2];
   q3 = y[3];
-  OD_IDCT_4_FLAT(q0, q2, q1, q3);
+  od_idct_4(&q0, &q2, &q1, &q3);
   x[0*xstride] = (od_coeff)q0;
   x[1*xstride] = (od_coeff)q1;
   x[2*xstride] = (od_coeff)q2;
@@ -5815,7 +3897,7 @@
   r5 = x[5*xstride];
   r6 = x[6*xstride];
   r7 = x[7*xstride];
-  OD_FDCT_8_FLAT(r0, r1, r2, r3, r4, r5, r6, r7);
+  od_fdct_8(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
   y[0] = (od_coeff)r0;
   y[1] = (od_coeff)r4;
   y[2] = (od_coeff)r2;
@@ -5843,7 +3925,7 @@
   r5 = y[5];
   r3 = y[6];
   r7 = y[7];
-  OD_IDCT_8_FLAT(r0, r4, r2, r6, r1, r5, r3, r7);
+  od_idct_8(&r0, &r4, &r2, &r6, &r1, &r5, &r3, &r7);
   x[0*xstride] = (od_coeff)r0;
   x[1*xstride] = (od_coeff)r1;
   x[2*xstride] = (od_coeff)r2;
@@ -5872,7 +3954,7 @@
   r5 = x[5*xstride];
   r6 = x[6*xstride];
   r7 = x[7*xstride];
-  OD_FDST_8_FLAT(r0, r1, r2, r3, r4, r5, r6, r7);
+  od_fdst_8(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
   y[0] = (od_coeff)r0;
   y[1] = (od_coeff)r4;
   y[2] = (od_coeff)r2;
@@ -5900,7 +3982,7 @@
   r5 = y[5];
   r3 = y[6];
   r7 = y[7];
-  OD_IDST_8_FLAT(r0, r4, r2, r6, r1, r5, r3, r7);
+  od_idst_8(&r0, &r4, &r2, &r6, &r1, &r5, &r3, &r7);
   x[0*xstride] = (od_coeff)r0;
   x[1*xstride] = (od_coeff)r1;
   x[2*xstride] = (od_coeff)r2;
@@ -6167,8 +4249,8 @@
   sd = x[13*xstride];
   se = x[14*xstride];
   sf = x[15*xstride];
-  OD_FDCT_16_FLAT(s0, s1, s2, s3, s4, s5, s6, s7,
-    s8, s9, sa, sb, sc, sd, se, sf);
+  od_fdct_16(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7,
+    &s8, &s9, &sa, &sb, &sc, &sd, &se, &sf);
   y[0] = (od_coeff)s0;
   y[1] = (od_coeff)s8;
   y[2] = (od_coeff)s4;
@@ -6220,8 +4302,8 @@
   sb = y[13];
   s7 = y[14];
   sf = y[15];
-  OD_IDCT_16_FLAT(s0, s8, s4, sc, s2, sa, s6, se,
-    s1, s9, s5, sd, s3, sb, s7, sf);
+  od_idct_16(&s0, &s8, &s4, &sc, &s2, &sa, &s6, &se,
+    &s1, &s9, &s5, &sd, &s3, &sb, &s7, &sf);
   x[0*xstride] = (od_coeff)s0;
   x[1*xstride] = (od_coeff)s1;
   x[2*xstride] = (od_coeff)s2;
@@ -6258,38 +4340,38 @@
   int se;
   int sf;
   s0 = x[0*xstride];
-  s8 = x[1*xstride];
-  s4 = x[2*xstride];
-  sc = x[3*xstride];
-  s2 = x[4*xstride];
-  sa = x[5*xstride];
+  s1 = x[1*xstride];
+  s2 = x[2*xstride];
+  s3 = x[3*xstride];
+  s4 = x[4*xstride];
+  s5 = x[5*xstride];
   s6 = x[6*xstride];
-  se = x[7*xstride];
-  s1 = x[8*xstride];
+  s7 = x[7*xstride];
+  s8 = x[8*xstride];
   s9 = x[9*xstride];
-  s5 = x[10*xstride];
-  sd = x[11*xstride];
-  s3 = x[12*xstride];
-  sb = x[13*xstride];
-  s7 = x[14*xstride];
+  sa = x[10*xstride];
+  sb = x[11*xstride];
+  sc = x[12*xstride];
+  sd = x[13*xstride];
+  se = x[14*xstride];
   sf = x[15*xstride];
-  OD_FDST_16_FLAT(s0, s8, s4, sc, s2, sa, s6, se,
-   s1, s9, s5, sd, s3, sb, s7, sf);
+  od_fdst_16(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7,
+    &s8, &s9, &sa, &sb, &sc, &sd, &se, &sf);
   y[0] = (od_coeff)s0;
-  y[1] = (od_coeff)s1;
-  y[2] = (od_coeff)s2;
-  y[3] = (od_coeff)s3;
-  y[4] = (od_coeff)s4;
-  y[5] = (od_coeff)s5;
+  y[1] = (od_coeff)s8;
+  y[2] = (od_coeff)s4;
+  y[3] = (od_coeff)sc;
+  y[4] = (od_coeff)s2;
+  y[5] = (od_coeff)sa;
   y[6] = (od_coeff)s6;
-  y[7] = (od_coeff)s7;
-  y[8] = (od_coeff)s8;
+  y[7] = (od_coeff)se;
+  y[8] = (od_coeff)s1;
   y[9] = (od_coeff)s9;
-  y[10] = (od_coeff)sa;
-  y[11] = (od_coeff)sb;
-  y[12] = (od_coeff)sc;
-  y[13] = (od_coeff)sd;
-  y[14] = (od_coeff)se;
+  y[10] = (od_coeff)s5;
+  y[11] = (od_coeff)sd;
+  y[12] = (od_coeff)s3;
+  y[13] = (od_coeff)sb;
+  y[14] = (od_coeff)s7;
   y[15] = (od_coeff)sf;
 }
 
@@ -6326,8 +4408,8 @@
   sb = y[13];
   s7 = y[14];
   sf = y[15];
-  OD_IDST_16_FLAT(s0, s8, s4, sc, s2, sa, s6, se,
-   s1, s9, s5, sd, s3, sb, s7, sf);
+  od_idst_16(&s0, &s8, &s4, &sc, &s2, &sa, &s6, &se,
+    &s1, &s9, &s5, &sd, &s3, &sb, &s7, &sf);
   x[0*xstride] = (od_coeff)s0;
   x[1*xstride] = (od_coeff)s1;
   x[2*xstride] = (od_coeff)s2;
@@ -6412,9 +4494,10 @@
   tn = x[29*xstride];
   tf = x[30*xstride];
   tv = x[31*xstride];
-  OD_FDCT_32_FLAT(
-    t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, t6, tm, te, tu,
-    t1, th, t9, tp, t5, tl, td, tt, t3, tj, tb, tr, t7, tn, tf, tv);
+  od_fdct_32(
+    &t0, &tg, &t8, &to, &t4, &tk, &tc, &ts, &t2, &ti, &ta, &tq, &t6, &tm, &te,
+    &tu, &t1, &th, &t9, &tp, &t5, &tl, &td, &tt, &t3, &tj, &tb, &tr, &t7, &tn,
+    &tf, &tv);
   y[0] = (od_coeff)t0;
   y[1] = (od_coeff)t1;
   y[2] = (od_coeff)t2;
@@ -6514,9 +4597,10 @@
   tn = y[29];
   tf = y[30];
   tv = y[31];
-  OD_IDCT_32_FLAT(
-    t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, t6, tm, te, tu,
-    t1, th, t9, tp, t5, tl, td, tt, t3, tj, tb, tr, t7, tn, tf, tv);
+  od_idct_32(
+    &t0, &tg, &t8, &to, &t4, &tk, &tc, &ts, &t2, &ti, &ta, &tq, &t6, &tm, &te,
+    &tu, &t1, &th, &t9, &tp, &t5, &tl, &td, &tt, &t3, &tj, &tb, &tr, &t7, &tn,
+    &tf, &tv);
   x[0*xstride] = (od_coeff)t0;
   x[1*xstride] = (od_coeff)t1;
   x[2*xstride] = (od_coeff)t2;
diff --git a/av1/common/daala_tx_kernels.h b/av1/common/daala_tx_kernels.h
new file mode 100644
index 0000000..d6542c0
--- /dev/null
+++ b/av1/common/daala_tx_kernels.h
@@ -0,0 +1,1603 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/* clang-format off */
+
+#ifndef AOM_DSP_DAALA_TX_KERNELS_H_
+#define AOM_DSP_DAALA_TX_KERNELS_H_
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/common/odintrin.h"
+
+#define AVG_BIAS (0)
+
+static INLINE od_coeff od_rshift1(od_coeff v) {
+  return (v + (v < 0)) >> 1;
+}
+
+static INLINE od_coeff od_add(od_coeff p0, od_coeff p1) {
+  return p0 + p1;
+}
+
+static INLINE od_coeff od_sub(od_coeff p0, od_coeff p1) {
+  return p0 - p1;
+}
+
+static INLINE od_coeff od_avg_add(od_coeff p0, od_coeff p1) {
+  return (od_add(p0, p1) + AVG_BIAS) >> 1;
+}
+
+static INLINE od_coeff od_avg_sub(od_coeff p0, od_coeff p1) {
+  return (od_sub(p0, p1) + AVG_BIAS) >> 1;
+}
+
+/* Fixed point multiply. */
+static INLINE od_coeff od_mul(od_coeff n, int c, int q) {
+  return (n*c + ((1 << q) >> 1)) >> q;
+}
+
+/* Two multiply rotation primitive (used when rotating by Pi/4). */
+static INLINE void od_rot2(od_coeff *p0, od_coeff *p1, od_coeff t, int c0,
+ int q0, int c1, int q1) {
+  *p1 = od_mul(*p0, c0, q0);
+  *p0 = od_mul(t, c1, q1);
+}
+
+/* Rotate by Pi/4 and add. */
+static INLINE void od_rotate_pi4_add(od_coeff *p0, od_coeff *p1, od_coeff t,
+ int c0, int q0, int c1, int q1) {
+  od_rot2(p0, p1, t, c0, q0, c1, q1);
+  *p1 = od_add(*p1, *p0);
+}
+
+/* Rotate by Pi/4 and subtract. */
+static INLINE void od_rotate_pi4_sub(od_coeff *p0, od_coeff *p1, od_coeff t,
+ int c0, int q0, int c1, int q1) {
+  od_rot2(p0, p1, t, c0, q0, c1, q1);
+  *p1 = od_sub(*p1, *p0);
+}
+
+/* Three multiply rotation primitive. */
+static INLINE void od_rot3(od_coeff *p0, od_coeff *p1, od_coeff *t, od_coeff *u,
+ int c0, int q0, int c1, int q1, int c2, int q2) {
+  *u = od_mul(*p0, c0, q0);
+  *p0 = od_mul(*p1, c1, q1);
+  *t = od_mul(*t, c2, q2);
+}
+
+/* Rotate and add. */
+static INLINE void od_rotate_add(od_coeff *p0, od_coeff *p1, od_coeff t,
+ int c0, int q0, int c1, int q1, int c2, int q2, int shift) {
+  od_coeff u;
+  od_rot3(p0, p1, &t, &u, c0, q0, c1, q1, c2, q2);
+  *p0 = od_add(*p0, t);
+  if (shift) t = od_rshift1(t);
+  *p1 = od_add(u, t);
+}
+
+/* Rotate and subtract. */
+static INLINE void od_rotate_sub(od_coeff *p0, od_coeff *p1, od_coeff t,
+ int c0, int q0, int c1, int q1, int c2, int q2, int shift) {
+  od_coeff u;
+  od_rot3(p0, p1, &t, &u, c0, q0, c1, q1, c2, q2);
+  *p0 = od_add(*p0, t);
+  if (shift) t = od_rshift1(t);
+  *p1 = od_sub(u, t);
+}
+
+/* Rotate and subtract with negation. */
+static INLINE void od_rotate_neg(od_coeff *p0, od_coeff *p1, od_coeff t,
+ int c0, int q0, int c1, int q1, int c2, int q2) {
+  od_coeff u;
+  od_rot3(p0, p1, &t, &u, c0, q0, c1, q1, c2, q2);
+  *p0 = od_sub(*p0, t);
+  *p1 = od_sub(t, u);
+}
+
+/* Computes the +/- addition butterfly (asymmetric output).
+   The inverse to this function is od_butterfly_add_asym().
+
+    p0 = p0 + p1;
+    p1 = p1 - p0/2; */
+static INLINE void od_butterfly_add(od_coeff *p0, od_coeff *p0h, od_coeff *p1) {
+  od_coeff p0h_;
+  *p0 = od_add(*p0, *p1);
+  p0h_ = od_rshift1(*p0);
+  *p1 = od_sub(*p1, p0h_);
+  if (p0h != NULL) *p0h = p0h_;
+}
+
+/* Computes the asymmetric +/- addition butterfly (unscaled output).
+   The inverse to this function is od_butterfly_add().
+
+    p1 = p1 + p0/2;
+    p0 = p0 - p1; */
+static INLINE void od_butterfly_add_asym(od_coeff *p0, od_coeff p0h,
+ od_coeff *p1) {
+  *p1 = od_add(*p1, p0h);
+  *p0 = od_sub(*p0, *p1);
+}
+
+/* Computes the +/- subtraction butterfly (asymmetric output).
+   The inverse to this function is od_butterfly_sub_asym().
+
+    p0 = p0 - p1;
+    p1 = p1 + p0/2; */
+static INLINE void od_butterfly_sub(od_coeff *p0, od_coeff *p0h, od_coeff *p1) {
+  od_coeff p0h_;
+  *p0 = od_sub(*p0, *p1);
+  p0h_ = od_rshift1(*p0);
+  *p1 = od_add(*p1, p0h_);
+  if (p0h != NULL) *p0h = p0h_;
+}
+
+/* Computes the asymmetric +/- subtraction butterfly (unscaled output).
+   The inverse to this function is od_butterfly_sub().
+
+    p1 = p1 - p0/2;
+    p0 = p0 + p1; */
+static INLINE void od_butterfly_sub_asym(od_coeff *p0, od_coeff p0h,
+ od_coeff *p1) {
+  *p1 = od_sub(*p1, p0h);
+  *p0 = od_add(*p0, *p1);
+}
+
+/* Computes the +/- subtract and negate butterfly (asymmetric output).
+   The inverse to this function is od_butterfly_neg_asym().
+
+    p1 = p1 - p0;
+    p0 = p0 + p1/2;
+    p1 = -p1; */
+static INLINE void od_butterfly_neg(od_coeff *p0, od_coeff *p1, od_coeff *p1h) {
+  *p1 = od_sub(*p0, *p1);
+  *p1h = od_rshift1(*p1);
+  *p0 = od_sub(*p0, *p1h);
+}
+
+/* Computes the asymmetric +/- negate and subtract butterfly (unscaled output).
+   The inverse to this function is od_butterfly_neg().
+
+    p1 = -p1;
+    p0 = p0 - p1/2;
+    p1 = p1 + p0; */
+static INLINE void od_butterfly_neg_asym(od_coeff *p0, od_coeff *p1,
+ od_coeff p1h) {
+  *p0 = od_add(*p0, p1h);
+  *p1 = od_sub(*p0, *p1);
+}
+
+/* --- 2-point Transforms --- */
+
+/**
+ * 2-point orthonormal Type-II fDCT
+ */
+static INLINE void od_fdct_2(od_coeff *p0, od_coeff *p1) {
+  /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4]  = 1.4142135623730951 */
+  /* 11585/8192 = 2*Cos[Pi/4]            = 1.4142135623730951 */
+  od_rotate_pi4_add(p1, p0, od_avg_sub(*p0, *p1), 11585, 13, 11585, 13);
+}
+
+/**
+ * 2-point orthonormal Type-II iDCT
+ */
+static INLINE void od_idct_2(od_coeff *p0, od_coeff *p1) {
+  /*  11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
+  /* 11585/16384 = Cos[Pi/4]             = 0.7071067811865475 */
+  od_rotate_pi4_sub(p0, p1, od_add(*p1, *p0), 11585, 13, 11585, 14);
+}
+
+/**
+ * 2-point asymmetric Type-II fDCT
+ */
+static INLINE void od_fdct_2_asym(od_coeff *p0, od_coeff *p1,
+ od_coeff p1h) {
+  od_butterfly_neg_asym(p0, p1, p1h);
+}
+
+/**
+ * 2-point asymmetric Type-II iDCT
+ */
+static INLINE void od_idct_2_asym(od_coeff *p0, od_coeff *p1, od_coeff *p1h) {
+  od_butterfly_neg(p0, p1, p1h);
+}
+
+/**
+ * 2-point orthonormal Type-IV fDST
+ */
+static INLINE void od_fdst_2(od_coeff *p0, od_coeff *p1) {
+
+  /* Stage 0 */
+
+  /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8]  = 1.3065629648763766 */
+  /*  8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8]  = 0.5411961001461971 */
+  /*   3135/4096 = 2*Cos[3*Pi/8]              = 0.7653668647301796 */
+  od_rotate_sub(p0, p1, od_avg_add(*p0, *p1), 21407, 14, 8867, 14, 3135, 12, 0);
+}
+
+/**
+ * 2-point orthonormal Type-IV iDST
+ */
+static INLINE void od_idst_2(od_coeff *p0, od_coeff *p1) {
+  od_fdst_2(p0, p1);
+}
+
+/**
+ * 2-point asymmetric Type-IV fDST
+ */
+static INLINE void od_fdst_2_asym(od_coeff *p0, od_coeff p0h,
+ od_coeff *p1) {
+
+  /* Stage 0 */
+
+  /* 15137/16384 = (Sin[3*Pi/8] + Cos[3*Pi/8])/Sqrt[2] = 0.9238795325112867 */
+  /*   3135/4096 = (Sin[3*Pi/8] - Cos[3*Pi/8])*Sqrt[2] = 0.7653668647301795 */
+  /*  8867/16384 = Cos[3*Pi/8]*Sqrt[2]                 = 0.5411961001461971 */
+  od_rotate_sub(p0, p1, od_add(p0h, *p1), 15137, 14, 3135, 12, 8867, 14, 0);
+}
+
+/**
+ * 2-point asymmetric Type-IV iDST
+ */
+static INLINE void od_idst_2_asym(od_coeff *p0, od_coeff *p1) {
+
+  /* Stage 0 */
+
+  /* 15137/16384 = (Sin[3*Pi/8] + Cos[3*Pi/8])/Sqrt[2] = 0.9238795325112867 */
+  /*   3135/4096 = (Sin[3*Pi/8] - Cos[3*Pi/8])*Sqrt[2] = 0.7653668647301795 */
+  /*   8867/8192 = 2*Cos[3*Pi/8]*Sqrt[2]               = 1.0823922002923940 */
+  od_rotate_sub(p0, p1, od_avg_add(*p1, *p0), 15137, 14, 3135, 12, 8867, 13, 1);
+}
+
+/* --- 4-point Transforms --- */
+
+/**
+ * 4-point orthonormal Type-II fDCT
+ */
+static INLINE void od_fdct_4(od_coeff *q0, od_coeff *q1, od_coeff *q2,
+ od_coeff *q3) {
+  od_coeff q1h;
+  od_coeff q3h;
+
+  /* +/- Butterflies with asymmetric output. */
+  od_butterfly_neg(q0, q3, &q3h);
+  od_butterfly_add(q1, &q1h, q2);
+
+  /* Embedded 2-point transforms with asymmetric input. */
+  od_fdct_2_asym(q0, q1, q1h);
+  od_fdst_2_asym(q3, q3h, q2);
+}
+
+/**
+ * 4-point orthonormal Type-II iDCT
+ */
+static INLINE void od_idct_4(od_coeff *q0, od_coeff *q2,
+                             od_coeff *q1, od_coeff *q3)  {
+  od_coeff q1h;
+
+  /* Embedded 2-point transforms with asymmetric output. */
+  od_idst_2_asym(q3, q2);
+  od_idct_2_asym(q0, q1, &q1h);
+
+  /* +/- Butterflies with asymmetric input. */
+  od_butterfly_add_asym(q1, q1h, q2);
+  od_butterfly_neg_asym(q0, q3, od_rshift1(*q3));
+}
+
+/**
+ * 4-point asymmetric Type-II fDCT
+ */
+static INLINE void od_fdct_4_asym(od_coeff *q0, od_coeff *q1, od_coeff q1h,
+                                  od_coeff *q2, od_coeff *q3, od_coeff q3h) {
+
+  /* +/- Butterflies with asymmetric input. */
+  od_butterfly_neg_asym(q0, q3, q3h);
+  od_butterfly_sub_asym(q1, q1h, q2);
+
+  /* Embedded 2-point orthonormal transforms. */
+  od_fdct_2(q0, q1);
+  od_fdst_2(q3, q2);
+}
+
+/**
+ * 4-point asymmetric Type-II iDCT
+ */
+static INLINE void od_idct_4_asym(od_coeff *q0, od_coeff *q2,
+                                  od_coeff *q1, od_coeff *q1h,
+                                  od_coeff *q3, od_coeff *q3h)  {
+
+  /* Embedded 2-point orthonormal transforms. */
+  od_idst_2(q3, q2);
+  od_idct_2(q0, q1);
+
+  /* +/- Butterflies with asymmetric output. */
+  od_butterfly_sub(q1, q1h, q2);
+  od_butterfly_neg(q0, q3, q3h);
+}
+
+/**
+ * 4-point orthonormal Type-IV fDST
+ */
+static INLINE void od_fdst_4(od_coeff *q0, od_coeff *q1,
+                             od_coeff *q2, od_coeff *q3) {
+
+  /* Stage 0 */
+
+  /* 13623/16384 = (Sin[7*Pi/16] + Cos[7*Pi/16])/Sqrt[2] = 0.831469612302545 */
+  /* 18205/16384 = (Sin[7*Pi/16] - Cos[7*Pi/16])*Sqrt[2] = 1.111140466039204 */
+  /*  9041/32768 = Cos[7*Pi/16]*Sqrt[2]                  = 0.275899379282943 */
+  od_rotate_sub(q0, q3, od_add(*q3, *q0), 13623, 14, 18205, 14, 9041, 15, 1);
+
+  /* 16069/16384 = (Sin[5*Pi/16] + Cos[5*Pi/16])/Sqrt[2] = 0.9807852804032304 */
+  /* 12785/32768 = (Sin[5*Pi/16] - Cos[5*Pi/16])*Sqrt[2] = 0.3901806440322566 */
+  /* 12873/16384 = Cos[5*Pi/16]*Sqrt[2]                  = 0.7856949583871021 */
+  od_rotate_add(q2, q1, od_sub(*q1, *q2), 16069, 14, 12785, 15, 12873, 14, 1);
+
+  /* Stage 1 */
+
+  od_butterfly_sub_asym(q0, od_rshift1(*q0), q1);
+  od_butterfly_sub_asym(q2, od_rshift1(*q2), q3);
+
+  /* Stage 2 */
+
+  /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
+  /* 11585/8192 = 2*Cos[Pi/4]           = 1.4142135623730951 */
+  od_rotate_pi4_sub(q2, q1, od_avg_add(*q1, *q2), 11585, 13, 11585, 13);
+}
+
+/**
+ * 4-point orthonormal Type-IV iDST
+ */
+static INLINE void od_idst_4(od_coeff *q0, od_coeff *q2,
+                             od_coeff *q1, od_coeff *q3) {
+  od_coeff q0h;
+  od_coeff q2h;
+
+  /* Stage 0 */
+
+  /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
+  /* 11585/8192 = 2*Cos[Pi/4]           = 1.4142135623730951 */
+  od_rotate_pi4_sub(q2, q1, od_avg_add(*q1, *q2), 11585, 13, 11585, 13);
+
+  /* Stage 1 */
+
+  od_butterfly_sub(q2, &q2h, q3);
+  od_butterfly_sub(q0, &q0h, q1);
+
+  /* Stage 2 */
+
+  /* 16069/16384 = (Sin[5*Pi/16] + Cos[5*Pi/16])/Sqrt[2] = 0.9807852804032304 */
+  /* 12785/32768 = (Sin[5*Pi/16] - Cos[5*Pi/16])*Sqrt[2] = 0.3901806440322566 */
+  /* 12873/16384 = Cos[5*Pi/16]*Sqrt[2]                  = 0.7856949583871021 */
+  od_rotate_add(q2, q1, od_sub(*q1, q2h), 16069, 14, 12785, 15, 12873, 14, 0);
+
+  /* 13623/16384 = (Sin[7*Pi/16] + Cos[7*Pi/16])/Sqrt[2] = 0.831469612302545 */
+  /* 18205/16384 = (Sin[7*Pi/16] - Cos[7*Pi/16])*Sqrt[2] = 1.111140466039204 */
+  /*  9041/32768 = Cos[7*Pi/16]*Sqrt[2]                  = 0.275899379282943 */
+  od_rotate_sub(q0, q3, od_add(q0h, *q3), 13623, 14, 18205, 14, 9041, 15, 0);
+}
+
+/**
+ * 4-point asymmetric Type-IV fDST
+ */
+static INLINE void od_fdst_4_asym(od_coeff *q0, od_coeff q0h, od_coeff *q1,
+                                  od_coeff *q2, od_coeff q2h, od_coeff *q3) {
+
+  /* Stage 0 */
+
+  /*  9633/16384 = (Sin[7*Pi/16] + Cos[7*Pi/16])/2 = 0.5879378012096793 */
+  /*  12873/8192 = (Sin[7*Pi/16] - Cos[7*Pi/16])*2 = 1.5713899167742045 */
+  /* 12785/32768 = Cos[7*Pi/16]*2                  = 0.3901806440322565 */
+  od_rotate_sub(q0, q3, od_add(q0h, *q3), 9633, 14, 12873, 13, 12785, 15, 1);
+
+  /* 22725/32768 = (Sin[5*Pi/16] + Cos[5*Pi/16])/2 = 0.6935199226610738 */
+  /* 18081/32768 = (Sin[5*Pi/16] - Cos[5*Pi/16])*2 = 0.5517987585658861 */
+  /* 18205/16384 = Cos[5*Pi/16]*2                  = 1.1111404660392044 */
+  od_rotate_add(q2, q1, od_sub(*q1, q2h), 22725, 15, 18081, 15, 18205, 14, 1);
+
+  /* Stage 1 */
+
+  od_butterfly_sub_asym(q0, od_rshift1(*q0), q1);
+  od_butterfly_sub_asym(q2, od_rshift1(*q2), q3);
+
+  /* Stage 2 */
+
+  /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
+  /* 11585/8192 = 2*Cos[Pi/4]           = 1.4142135623730951 */
+  od_rotate_pi4_sub(q2, q1, od_avg_add(*q1, *q2), 11585, 13, 11585, 13);
+}
+
+/**
+ * 4-point asymmetric Type-IV iDST
+ */
+static INLINE void od_idst_4_asym(od_coeff *q0, od_coeff *q2,
+                                  od_coeff *q1, od_coeff *q3) {
+  od_coeff q0h;
+  od_coeff q2h;
+
+  /* Stage 0 */
+
+  /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
+  /* 11585/8192 = 2*Cos[Pi/4]           = 1.4142135623730951 */
+  od_rotate_pi4_sub(q2, q1, od_avg_add(*q1, *q2), 11585, 13, 11585, 13);
+
+  /* Stage 1 */
+
+  od_butterfly_sub(q2, &q2h, q3);
+  od_butterfly_sub(q0, &q0h, q1);
+
+  /* Stage 2 */
+
+  /* 22725/32768 = (Sin[5*Pi/16] + Cos[5*Pi/16])/2 = 0.6935199226610738 */
+  /* 18081/32768 = (Sin[5*Pi/16] - Cos[5*Pi/16])*2 = 0.5517987585658861 */
+  /* 18205/16384 = Cos[5*Pi/16]*2                  = 1.1111404660392044 */
+  od_rotate_add(q2, q1, od_sub(*q1, q2h), 22725, 15, 18081, 15, 18205, 14, 1);
+
+  /*  9633/16384 = (Sin[7*Pi/16] + Cos[7*Pi/16])/2 = 0.5879378012096793 */
+  /*  12873/8192 = (Sin[7*Pi/16] - Cos[7*Pi/16])*2 = 1.5713899167742045 */
+  /* 12785/32768 = Cos[7*Pi/16]*2                  = 0.3901806440322565 */
+  od_rotate_sub(q0, q3, od_add(q0h, *q3), 9633, 14, 12873, 13, 12785, 15, 1);
+}
+
+/* --- 8-point Transforms --- */
+
+/**
+ * 8-point orthonormal Type-II fDCT
+ */
+static INLINE void od_fdct_8(od_coeff *r0, od_coeff *r1,
+                             od_coeff *r2, od_coeff *r3,
+                             od_coeff *r4, od_coeff *r5,
+                             od_coeff *r6, od_coeff *r7) {
+  od_coeff r1h;
+  od_coeff r3h;
+  od_coeff r5h;
+  od_coeff r7h;
+
+  /* +/- Butterflies with asymmetric output. */
+  od_butterfly_neg(r0, r7, &r7h);
+  od_butterfly_add(r1, &r1h, r6);
+  od_butterfly_neg(r2, r5, &r5h);
+  od_butterfly_add(r3, &r3h, r4);
+
+  /* Embedded 4-point forward transforms with asymmetric input. */
+  od_fdct_4_asym(r0, r1, r1h, r2, r3, r3h);
+  od_fdst_4_asym(r7, r7h, r6, r5, r5h, r4);
+}
+
+/**
+ * 8-point orthonormal Type-II iDCT
+ */
+static INLINE void od_idct_8(od_coeff *r0, od_coeff *r4,
+                             od_coeff *r2, od_coeff *r6,
+                             od_coeff *r1, od_coeff *r5,
+                             od_coeff *r3, od_coeff *r7) {
+  od_coeff r1h;
+  od_coeff r3h;
+
+  /* Embedded 4-point inverse transforms with asymmetric output. */
+  od_idst_4_asym(r7, r5, r6, r4);
+  od_idct_4_asym(r0, r2, r1, &r1h, r3, &r3h);
+
+  /* +/- Butterflies with asymmetric input. */
+  od_butterfly_add_asym(r3, r3h, r4);
+  od_butterfly_neg_asym(r2, r5, od_rshift1(*r5));
+  od_butterfly_add_asym(r1, r1h, r6);
+  od_butterfly_neg_asym(r0, r7, od_rshift1(*r7));
+}
+
+/**
+ * 8-point asymmetric Type-II fDCT
+ */
+static INLINE void od_fdct_8_asym(od_coeff *r0, od_coeff *r1, od_coeff r1h,
+                                  od_coeff *r2, od_coeff *r3, od_coeff r3h,
+                                  od_coeff *r4, od_coeff *r5, od_coeff r5h,
+                                  od_coeff *r6, od_coeff *r7, od_coeff r7h) {
+
+  /* +/- Butterflies with asymmetric input. */
+  od_butterfly_neg_asym(r0, r7, r7h);
+  od_butterfly_sub_asym(r1, r1h, r6);
+  od_butterfly_neg_asym(r2, r5, r5h);
+  od_butterfly_sub_asym(r3, r3h, r4);
+
+  /* Embedded 4-point orthonormal transforms. */
+  od_fdct_4(r0, r1, r2, r3);
+  od_fdst_4(r7, r6, r5, r4);
+}
+
+/**
+ * 8-point asymmetric Type-II iDCT
+ */
+static INLINE void od_idct_8_asym(od_coeff *r0, od_coeff *r4,
+                                  od_coeff *r2, od_coeff *r6,
+                                  od_coeff *r1, od_coeff *r1h,
+                                  od_coeff *r5, od_coeff *r5h,
+                                  od_coeff *r3, od_coeff *r3h,
+                                  od_coeff *r7, od_coeff *r7h)  {
+
+  /* Embedded 4-point inverse orthonormal transforms. */
+  od_idst_4(r7, r5, r6, r4);
+  od_idct_4(r0, r2, r1, r3);
+
+  /* +/- Butterflies with asymmetric output. */
+  od_butterfly_sub(r3, r3h, r4);
+  od_butterfly_neg(r2, r5, r5h);
+  od_butterfly_sub(r1, r1h, r6);
+  od_butterfly_neg(r0, r7, r7h);
+}
+
+/**
+ * 8-point orthonormal Type-IV fDST
+ */
+static INLINE void od_fdst_8(od_coeff *r0, od_coeff *r1,
+                             od_coeff *r2, od_coeff *r3,
+                             od_coeff *r4, od_coeff *r5,
+                             od_coeff *r6, od_coeff *r7) {
+  od_coeff r0h;
+  od_coeff r2h;
+  od_coeff r5h;
+  od_coeff r7h;
+
+  /* Stage 0 */
+
+  /* 17911/16384 = Sin[15*Pi/32] + Cos[15*Pi/32] = 1.0932018670017576 */
+  /* 14699/16384 = Sin[15*Pi/32] - Cos[15*Pi/32] = 0.8971675863426363 */
+  /*    803/8192 = Cos[15*Pi/32]                 = 0.0980171403295606 */
+  od_rotate_sub(r0, r7, od_add(*r7, *r0), 17911, 14, 14699, 14, 803, 13, 0);
+
+  /* 40869/32768 = Sin[13*Pi/32] + Cos[13*Pi/32] = 1.24722501298667123 */
+  /* 21845/32768 = Sin[13*Pi/32] - Cos[13*Pi/32] = 0.66665565847774650 */
+  /*   1189/4096 = Cos[13*Pi/32]                 = 0.29028467725446233 */
+  od_rotate_add(r6, r1, od_sub(*r1, *r6), 40869, 15, 21845, 15, 1189, 12, 0);
+
+  /* 22173/16384 = Sin[11*Pi/32] + Cos[11*Pi/32] = 1.3533180011743526 */
+  /*   3363/8192 = Sin[11*Pi/32] - Cos[11*Pi/32] = 0.4105245275223574 */
+  /* 15447/32768 = Cos[11*Pi/32]                 = 0.47139673682599764 */
+  od_rotate_sub(r2, r5, od_add(*r5, *r2), 22173, 14, 3363, 13, 15447, 15, 0);
+
+  /* 23059/16384 = Sin[9*Pi/32] + Cos[9*Pi/32] = 1.4074037375263826 */
+  /*  2271/16384 = Sin[9*Pi/32] - Cos[9*Pi/32] = 0.1386171691990915 */
+  /*   5197/8192 = Cos[9*Pi/32]                = 0.6343932841636455 */
+  od_rotate_add(r4, r3, od_sub(*r3, *r4), 23059, 14, 2271, 14, 5197, 13, 0);
+
+  /* Stage 1 */
+
+  od_butterfly_add(r0, &r0h, r3);
+  od_butterfly_sub(r2, &r2h, r1);
+  od_butterfly_add(r5, &r5h, r6);
+  od_butterfly_sub(r7, &r7h, r4);
+
+  /* Stage 2 */
+
+  od_butterfly_add_asym(r7, r7h, r6);
+  od_butterfly_add_asym(r5, r5h, r3);
+  od_butterfly_add_asym(r2, r2h, r4);
+  od_butterfly_sub_asym(r0, r0h, r1);
+
+  /* Stage 3 */
+
+  /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
+  /*  8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
+  /*   3135/4096 = 2*Cos[3*Pi/8]             = 0.7653668647301796 */
+  od_rotate_add(r3, r4, od_avg_sub(*r4, *r3), 21407, 14, 8867, 14, 3135, 12, 0);
+
+  /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
+  /*  8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
+  /*   3135/4096 = 2*Cos[3*Pi/8]             = 0.7653668647301796 */
+  od_rotate_neg(r2, r5, od_avg_sub(*r2, *r5), 21407, 14, 8867, 14, 3135, 12);
+
+  /* 46341/32768 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
+  /* 46341/32768 = 2*Cos[Pi/4]           = 1.4142135623730951 */
+  od_rotate_pi4_add(r1, r6, od_avg_sub(*r6, *r1), 46341, 15, 46341, 15);
+}
+
+/**
+ * 8-point orthonormal Type-IV iDST
+ */
+static INLINE void od_idst_8(od_coeff *r0, od_coeff *r4,
+                             od_coeff *r2, od_coeff *r6,
+                             od_coeff *r1, od_coeff *r5,
+                             od_coeff *r3, od_coeff *r7) {
+  od_coeff r0h;
+  od_coeff r2h;
+  od_coeff r5h;
+  od_coeff r7h;
+
+  /* Stage 3 */
+
+  /*  11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
+  /* 46341/32768 = 2*Cos[Pi/4]           = 1.4142135623730951 */
+  od_rotate_pi4_sub(r6, r1, od_avg_add(*r1, *r6), 11585, 13, 46341, 15);
+
+  /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
+  /*  8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
+  /*   3135/4096 = 2*Cos[3*Pi/8]             = 0.7653668647301796 */
+  od_rotate_neg(r5, r2, od_avg_sub(*r5, *r2), 21407, 14, 8867, 14, 3135, 12);
+
+  /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
+  /*  8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
+  /*   3135/4096 = 2*Cos[3*Pi/8]             = 0.7653668647301796 */
+  od_rotate_sub(r4, r3, od_avg_add(*r3, *r4), 21407, 14, 8867, 14, 3135, 12, 0);
+
+  /* Stage 2 */
+
+  od_butterfly_sub(r0, &r0h, r1);
+  od_butterfly_add(r2, &r2h, r4);
+  od_butterfly_add(r5, &r5h, r3);
+  od_butterfly_add(r7, &r7h, r6);
+
+  /* Stage 1 */
+
+  od_butterfly_sub_asym(r7, r7h, r4);
+  od_butterfly_add_asym(r5, r5h, r6);
+  od_butterfly_sub_asym(r2, r2h, r1);
+  od_butterfly_add_asym(r0, r0h, r3);
+
+  /* Stage 0 */
+
+  /* 23059/16384 = Sin[9*Pi/32] + Cos[9*Pi/32] = 1.4074037375263826 */
+  /*  2271/16384 = Sin[9*Pi/32] - Cos[9*Pi/32] = 0.1386171691990915 */
+  /*   5197/8192 = Cos[9*Pi/32]                = 0.6343932841636455 */
+  od_rotate_add(r4, r3, od_sub(*r3, *r4), 23059, 14, 2271, 14, 5197, 13, 0);
+
+  /* 22173/16384 = Sin[11*Pi/32] + Cos[11*Pi/32] = 1.3533180011743526 */
+  /*   3363/8192 = Sin[11*Pi/32] - Cos[11*Pi/32] = 0.4105245275223574 */
+  /* 15447/32768 = Cos[11*Pi/32]                 = 0.47139673682599764 */
+  od_rotate_sub(r2, r5, od_add(*r5, *r2), 22173, 14, 3363, 13, 15447, 15, 0);
+
+  /* 40869/32768 = Sin[13*Pi/32] + Cos[13*Pi/32] = 1.24722501298667123 */
+  /* 21845/32768 = Sin[13*Pi/32] - Cos[13*Pi/32] = 0.66665565847774650 */
+  /*   1189/4096 = Cos[13*Pi/32]                 = 0.29028467725446233 */
+  od_rotate_add(r6, r1, od_sub(*r1, *r6), 40869, 15, 21845, 15, 1189, 12, 0);
+
+  /* 17911/16384 = Sin[15*Pi/32] + Cos[15*Pi/32] = 1.0932018670017576 */
+  /* 14699/16384 = Sin[15*Pi/32] - Cos[15*Pi/32] = 0.8971675863426363 */
+  /*    803/8192 = Cos[15*Pi/32]                 = 0.0980171403295606 */
+  od_rotate_sub(r0, r7, od_add(*r7, *r0), 17911, 14, 14699, 14, 803, 13, 0);
+}
+
+/**
+ * 8-point asymmetric Type-IV fDST
+ *
+ * r0h, r2h, r4h and r6h are passed by value and are used instead of *r0, *r2,
+ *  *r4 and *r6 when forming the Stage 0 rotation sums/differences.
+ * r0h and r2h are then passed by address to the Stage 1 butterflies, next to
+ *  the locals r5h and r7h, and all four are consumed again in Stage 2.
+ * NOTE(review): presumably each rNh carries half of *rN and the helpers update
+ *  their pointer arguments in place -- the od_rotate_* and od_butterfly_*
+ *  helpers are defined outside this excerpt, so confirm there.
+ */
+static INLINE void od_fdst_8_asym(od_coeff *r0, od_coeff r0h, od_coeff *r1,
+                                  od_coeff *r2, od_coeff r2h, od_coeff *r3,
+                                  od_coeff *r4, od_coeff r4h, od_coeff *r5,
+                                  od_coeff *r6, od_coeff r6h, od_coeff *r7) {
+  od_coeff r5h;
+  od_coeff r7h;
+
+  /* Stage 0 */
+
+  /* 12665/16384 = (Sin[15*Pi/32] + Cos[15*Pi/32])/Sqrt[2] = 0.77301045336274 */
+  /*   5197/4096 = (Sin[15*Pi/32] - Cos[15*Pi/32])*Sqrt[2] = 1.26878656832729 */
+  /*  2271/16384 = Cos[15*Pi/32]*Sqrt[2]                   = 0.13861716919909 */
+  od_rotate_sub(r0, r7, od_add(*r7, r0h), 12665, 14, 5197, 12, 2271, 14, 0);
+
+  /* 28899/32768 = (Sin[13*Pi/32] + Cos[13*Pi/32])/Sqrt[2] = 0.881921264348355 */
+  /* 30893/32768 = (Sin[13*Pi/32] - Cos[13*Pi/32])*Sqrt[2] = 0.942793473651995 */
+  /*   3363/8192 = Cos[13*Pi/32]*Sqrt[2]                   = 0.410524527522357 */
+  od_rotate_add(r6, r1, od_sub(*r1, r6h), 28899, 15, 30893, 15, 3363, 13, 0);
+
+  /* 31357/32768 = (Sin[11*Pi/32] + Cos[11*Pi/32])/Sqrt[2] = 0.956940335732209 */
+  /*   1189/2048 = (Sin[11*Pi/32] - Cos[11*Pi/32])*Sqrt[2] = 0.580569354508925 */
+  /* 21845/32768 = Cos[11*Pi/32]*Sqrt[2]                   = 0.666655658477747 */
+  od_rotate_sub(r2, r5, od_add(*r5, r2h), 31357, 15, 1189, 11, 21845, 15, 0);
+
+  /* 16305/16384 = (Sin[9*Pi/32] + Cos[9*Pi/32])/Sqrt[2] = 0.9951847266721969 */
+  /*    803/4096 = (Sin[9*Pi/32] - Cos[9*Pi/32])*Sqrt[2] = 0.1960342806591213 */
+  /* 14699/16384 = Cos[9*Pi/32]*Sqrt[2]                  = 0.8971675863426364 */
+  od_rotate_add(r4, r3, od_sub(*r3, r4h), 16305, 14, 803, 12, 14699, 14, 0);
+
+  /* Stage 1 */
+
+  od_butterfly_add(r0, &r0h, r3);
+  od_butterfly_sub(r2, &r2h, r1);
+  od_butterfly_add(r5, &r5h, r6);
+  od_butterfly_sub(r7, &r7h, r4);
+
+  /* Stage 2 */
+
+  od_butterfly_add_asym(r7, r7h, r6);
+  od_butterfly_add_asym(r5, r5h, r3);
+  od_butterfly_add_asym(r2, r2h, r4);
+  od_butterfly_sub_asym(r0, r0h, r1);
+
+  /* Stage 3 */
+
+  /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
+  /*  8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
+  /*   3135/4096 = 2*Cos[3*Pi/8]             = 0.7653668647301796 */
+  od_rotate_add(r3, r4, od_avg_sub(*r4, *r3), 21407, 14, 8867, 14, 3135, 12, 0);
+
+  /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
+  /*  8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
+  /*   3135/4096 = 2*Cos[3*Pi/8]             = 0.7653668647301796 */
+  od_rotate_neg(r2, r5, od_avg_sub(*r2, *r5), 21407, 14, 8867, 14, 3135, 12);
+
+  /* 46341/32768 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
+  /* 46341/32768 = 2*Cos[Pi/4]           = 1.4142135623730951
+     NOTE(review): od_idst_8_asym passes 11585, 13 for the matching pi/4
+     rotation; confirm the differing precision is intentional. */
+  od_rotate_pi4_add(r1, r6, od_avg_sub(*r6, *r1), 46341, 15, 46341, 15);
+}
+
+/**
+ * 8-point asymmetric Type-IV iDST
+ *
+ * Parameters are declared in the order r0, r4, r2, r6, r1, r5, r3, r7.
+ * Runs the stages in the order 3, 2, 1, 0. The locals r0h, r2h, r5h and r7h
+ *  are passed by address to the Stage 2 butterflies and then handed to the
+ *  Stage 1 asymmetric butterflies.
+ * NOTE(review): the od_rotate_* and od_butterfly_* helpers are defined outside
+ *  this excerpt; presumably they update their pointer arguments in place.
+ */
+static INLINE void od_idst_8_asym(od_coeff *r0, od_coeff *r4,
+                                  od_coeff *r2, od_coeff *r6,
+                                  od_coeff *r1, od_coeff *r5,
+                                  od_coeff *r3, od_coeff *r7) {
+  od_coeff r0h;
+  od_coeff r2h;
+  od_coeff r5h;
+  od_coeff r7h;
+
+  /* Stage 3 */
+
+  /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
+  /* 11585/8192 = 2*Cos[Pi/4]           = 1.4142135623730951
+     NOTE(review): od_fdst_8_asym passes 46341, 15 for the matching pi/4
+     rotation; confirm the differing precision is intentional. */
+  od_rotate_pi4_sub(r6, r1, od_avg_add(*r1, *r6), 11585, 13, 11585, 13);
+
+  /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
+  /*  8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
+  /*   3135/4096 = 2*Cos[3*Pi/8]             = 0.7653668647301796 */
+  od_rotate_neg(r5, r2, od_avg_sub(*r5, *r2), 21407, 14, 8867, 14, 3135, 12);
+
+  /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
+  /*  8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
+  /*   3135/4096 = 2*Cos[3*Pi/8]             = 0.7653668647301796 */
+  od_rotate_sub(r4, r3, od_avg_add(*r3, *r4), 21407, 14, 8867, 14, 3135, 12, 0);
+
+  /* Stage 2 */
+
+  od_butterfly_sub(r0, &r0h, r1);
+  od_butterfly_add(r2, &r2h, r4);
+  od_butterfly_add(r5, &r5h, r3);
+  od_butterfly_add(r7, &r7h, r6);
+
+  /* Stage 1 */
+
+  od_butterfly_sub_asym(r7, r7h, r4);
+  od_butterfly_add_asym(r5, r5h, r6);
+  od_butterfly_sub_asym(r2, r2h, r1);
+  od_butterfly_add_asym(r0, r0h, r3);
+
+  /* Stage 0 */
+
+  /* 16305/16384 = (Sin[9*Pi/32] + Cos[9*Pi/32])/Sqrt[2] = 0.9951847266721969 */
+  /*    803/4096 = (Sin[9*Pi/32] - Cos[9*Pi/32])*Sqrt[2] = 0.1960342806591213 */
+  /* 14699/16384 = Cos[9*Pi/32]*Sqrt[2]                  = 0.8971675863426364 */
+  od_rotate_add(r4, r3, od_sub(*r3, *r4), 16305, 14, 803, 12, 14699, 14, 1);
+
+  /* 31357/32768 = (Sin[11*Pi/32] + Cos[11*Pi/32])/Sqrt[2] = 0.956940335732209 */
+  /*   1189/2048 = (Sin[11*Pi/32] - Cos[11*Pi/32])*Sqrt[2] = 0.580569354508925 */
+  /* 21845/32768 = Cos[11*Pi/32]*Sqrt[2]                   = 0.666655658477747 */
+  od_rotate_sub(r2, r5, od_add(*r5, *r2), 31357, 15, 1189, 11, 21845, 15, 1);
+
+  /* 28899/32768 = (Sin[13*Pi/32] + Cos[13*Pi/32])/Sqrt[2] = 0.881921264348355 */
+  /* 30893/32768 = (Sin[13*Pi/32] - Cos[13*Pi/32])*Sqrt[2] = 0.942793473651995 */
+  /*   3363/8192 = Cos[13*Pi/32]*Sqrt[2]                   = 0.410524527522357 */
+  od_rotate_add(r6, r1, od_sub(*r1, *r6), 28899, 15, 30893, 15, 3363, 13, 1);
+
+  /* 12665/16384 = (Sin[15*Pi/32] + Cos[15*Pi/32])/Sqrt[2] = 0.77301045336274 */
+  /*   5197/4096 = (Sin[15*Pi/32] - Cos[15*Pi/32])*Sqrt[2] = 1.26878656832729 */
+  /*  2271/16384 = Cos[15*Pi/32]*Sqrt[2]                   = 0.13861716919909 */
+  od_rotate_sub(r0, r7, od_add(*r7, *r0), 12665, 14, 5197, 12, 2271, 14, 1);
+}
+
+/* --- 16-point Transforms --- */
+
+/**
+ * 16-point orthonormal Type-II fDCT
+ *
+ * First applies eight +/- butterflies that pair s0..s7 with sf..s8, passing
+ *  the locals s1h, s3h, s5h, s7h and s9h, sbh, sdh, sfh by address.
+ * Then runs od_fdct_8_asym on s0..s7 (with s1h, s3h, s5h, s7h) and
+ *  od_fdst_8_asym on sf..s8 in descending order (with sfh, sdh, sbh, s9h).
+ * NOTE(review): the helpers are defined outside this excerpt; presumably they
+ *  update their pointer arguments in place -- confirm there.
+ */
+static INLINE void od_fdct_16(od_coeff *s0, od_coeff *s1,
+                              od_coeff *s2, od_coeff *s3,
+                              od_coeff *s4, od_coeff *s5,
+                              od_coeff *s6, od_coeff *s7,
+                              od_coeff *s8, od_coeff *s9,
+                              od_coeff *sa, od_coeff *sb,
+                              od_coeff *sc, od_coeff *sd,
+                              od_coeff *se, od_coeff *sf) {
+  od_coeff s1h;
+  od_coeff s3h;
+  od_coeff s5h;
+  od_coeff s7h;
+  od_coeff s9h;
+  od_coeff sbh;
+  od_coeff sdh;
+  od_coeff sfh;
+
+  /* +/- Butterflies with asymmetric output. */
+  od_butterfly_neg(s0, sf, &sfh);
+  od_butterfly_add(s1, &s1h, se);
+  od_butterfly_neg(s2, sd, &sdh);
+  od_butterfly_add(s3, &s3h, sc);
+  od_butterfly_neg(s4, sb, &sbh);
+  od_butterfly_add(s5, &s5h, sa);
+  od_butterfly_neg(s6, s9, &s9h);
+  od_butterfly_add(s7, &s7h, s8);
+
+  /* Embedded 8-point transforms with asymmetric input. */
+  od_fdct_8_asym(s0, s1, s1h, s2, s3, s3h, s4, s5, s5h, s6, s7, s7h);
+  od_fdst_8_asym(sf, sfh, se, sd, sdh, sc, sb, sbh, sa, s9, s9h, s8);
+}
+
+/**
+ * 16-point orthonormal Type-II iDCT
+ *
+ * Parameters are declared in the order s0, s8, s4, sc, s2, sa, s6, se, s1, s9,
+ *  s5, sd, s3, sb, s7, sf.
+ * Runs od_idst_8_asym on (sf, sb, sd, s9, se, sa, sc, s8) and od_idct_8_asym
+ *  on (s0, s4, s2, s6, s1, s5, s3, s7), the latter also receiving &s1h, &s5h,
+ *  &s3h and &s7h. Then applies eight +/- butterflies pairing s7..s0 with
+ *  s8..sf; each od_butterfly_neg_asym call is given od_rshift1() of its second
+ *  operand as the third argument.
+ * NOTE(review): the helpers are defined outside this excerpt; presumably they
+ *  update their pointer arguments in place -- confirm there.
+ */
+static INLINE void od_idct_16(od_coeff *s0, od_coeff *s8,
+                              od_coeff *s4, od_coeff *sc,
+                              od_coeff *s2, od_coeff *sa,
+                              od_coeff *s6, od_coeff *se,
+                              od_coeff *s1, od_coeff *s9,
+                              od_coeff *s5, od_coeff *sd,
+                              od_coeff *s3, od_coeff *sb,
+                              od_coeff *s7, od_coeff *sf) {
+  od_coeff s1h;
+  od_coeff s3h;
+  od_coeff s5h;
+  od_coeff s7h;
+
+  /* Embedded 8-point transforms with asymmetric output. */
+  od_idst_8_asym(sf, sb, sd, s9, se, sa, sc, s8);
+  od_idct_8_asym(s0, s4, s2, s6, s1, &s1h, s5, &s5h, s3, &s3h, s7, &s7h);
+
+  /* +/- Butterflies with asymmetric input. */
+  od_butterfly_add_asym(s7, s7h, s8);
+  od_butterfly_neg_asym(s6, s9, od_rshift1(*s9));
+  od_butterfly_add_asym(s5, s5h, sa);
+  od_butterfly_neg_asym(s4, sb, od_rshift1(*sb));
+  od_butterfly_add_asym(s3, s3h, sc);
+  od_butterfly_neg_asym(s2, sd, od_rshift1(*sd));
+  od_butterfly_add_asym(s1, s1h, se);
+  od_butterfly_neg_asym(s0, sf, od_rshift1(*sf));
+}
+
+/**
+ * 16-point asymmetric Type-II fDCT
+ *
+ * Takes each odd-indexed coefficient s1, s3, ..., sf together with a by-value
+ *  companion s1h, s3h, ..., sfh that is forwarded to the asymmetric
+ *  butterflies pairing s0..s7 with sf..s8.
+ * Afterwards runs od_fdct_8 on s0..s7 and od_fdst_8 on sf..s8 in descending
+ *  order.
+ * NOTE(review): presumably each sNh carries half of *sN; the helpers are
+ *  defined outside this excerpt -- confirm against them and the callers.
+ */
+static INLINE void od_fdct_16_asym(od_coeff *s0, od_coeff *s1, od_coeff s1h,
+                                   od_coeff *s2, od_coeff *s3, od_coeff s3h,
+                                   od_coeff *s4, od_coeff *s5, od_coeff s5h,
+                                   od_coeff *s6, od_coeff *s7, od_coeff s7h,
+                                   od_coeff *s8, od_coeff *s9, od_coeff s9h,
+                                   od_coeff *sa, od_coeff *sb, od_coeff sbh,
+                                   od_coeff *sc, od_coeff *sd, od_coeff sdh,
+                                   od_coeff *se, od_coeff *sf, od_coeff sfh) {
+
+  /* +/- Butterflies with asymmetric input. */
+  od_butterfly_neg_asym(s0, sf, sfh);
+  od_butterfly_sub_asym(s1, s1h, se);
+  od_butterfly_neg_asym(s2, sd, sdh);
+  od_butterfly_sub_asym(s3, s3h, sc);
+  od_butterfly_neg_asym(s4, sb, sbh);
+  od_butterfly_sub_asym(s5, s5h, sa);
+  od_butterfly_neg_asym(s6, s9, s9h);
+  od_butterfly_sub_asym(s7, s7h, s8);
+
+  /* Embedded 8-point orthonormal transforms. */
+  od_fdct_8(s0, s1, s2, s3, s4, s5, s6, s7);
+  od_fdst_8(sf, se, sd, sc, sb, sa, s9, s8);
+}
+
+/**
+ * 16-point asymmetric Type-II iDCT
+ *
+ * Parameters are declared in the order s0, s8, s4, sc, s2, sa, s6, se, followed
+ *  by (s1, s1h), (s9, s9h), (s5, s5h), (sd, sdh), (s3, s3h), (sb, sbh),
+ *  (s7, s7h), (sf, sfh); every sNh here is a pointer supplied by the caller.
+ * Runs od_idst_8 on (sf, sb, sd, s9, se, sa, sc, s8) and od_idct_8 on
+ *  (s0, s4, s2, s6, s1, s5, s3, s7), then applies eight +/- butterflies that
+ *  are handed the sNh pointers directly.
+ * NOTE(review): presumably the butterflies store a half-value through each
+ *  sNh pointer; the helpers are defined outside this excerpt -- confirm there.
+ */
+static INLINE void od_idct_16_asym(od_coeff *s0, od_coeff *s8,
+                                   od_coeff *s4, od_coeff *sc,
+                                   od_coeff *s2, od_coeff *sa,
+                                   od_coeff *s6, od_coeff *se,
+                                   od_coeff *s1, od_coeff *s1h,
+                                   od_coeff *s9, od_coeff *s9h,
+                                   od_coeff *s5, od_coeff *s5h,
+                                   od_coeff *sd, od_coeff *sdh,
+                                   od_coeff *s3, od_coeff *s3h,
+                                   od_coeff *sb, od_coeff *sbh,
+                                   od_coeff *s7, od_coeff *s7h,
+                                   od_coeff *sf, od_coeff *sfh) {
+
+  /* Embedded 8-point orthonormal transforms. */
+  od_idst_8(sf, sb, sd, s9, se, sa, sc, s8);
+  od_idct_8(s0, s4, s2, s6, s1, s5, s3, s7);
+
+  /* +/- Butterflies with asymmetric output. */
+  od_butterfly_sub(s7, s7h, s8);
+  od_butterfly_neg(s6, s9, s9h);
+  od_butterfly_sub(s5, s5h, sa);
+  od_butterfly_neg(s4, sb, sbh);
+  od_butterfly_sub(s3, s3h, sc);
+  od_butterfly_neg(s2, sd, sdh);
+  od_butterfly_sub(s1, s1h, se);
+  od_butterfly_neg(s0, sf, sfh);
+}
+
+/**
+ * 16-point orthonormal Type-IV fDST
+ *
+ * Runs six stages (0 through 5) over s0..sf: eight rotations in Stage 0,
+ *  asymmetric butterflies fed with od_rshift1() of their first operand in
+ *  Stage 1, plain butterflies in Stage 2 (four of them passing NULL, four
+ *  passing the locals s0h, sdh, s2h, sfh by address), four rotations in
+ *  Stage 3, asymmetric butterflies in Stage 4 and five rotations in Stage 5.
+ * NOTE(review): the od_rotate_* and od_butterfly_* helpers are defined outside
+ *  this excerpt; presumably they update their pointer arguments in place and
+ *  a NULL second argument means the half-value is not wanted -- confirm there.
+ */
+static INLINE void od_fdst_16(od_coeff *s0, od_coeff *s1,
+                              od_coeff *s2, od_coeff *s3,
+                              od_coeff *s4, od_coeff *s5,
+                              od_coeff *s6, od_coeff *s7,
+                              od_coeff *s8, od_coeff *s9,
+                              od_coeff *sa, od_coeff *sb,
+                              od_coeff *sc, od_coeff *sd,
+                              od_coeff *se, od_coeff *sf) {
+  od_coeff s0h;
+  od_coeff s2h;
+  od_coeff sdh;
+  od_coeff sfh;
+
+  /* Stage 0 */
+
+  /* 24279/32768 = (Sin[31*Pi/64] + Cos[31*Pi/64])/Sqrt[2] = 0.74095112535496 */
+  /* 44011/32768 = (Sin[31*Pi/64] - Cos[31*Pi/64])*Sqrt[2] = 1.34311790969404 */
+  /*  1137/16384 = Cos[31*Pi/64]*Sqrt[2]                   = 0.06939217050794 */
+  od_rotate_sub(s0, sf, od_add(*sf, *s0), 24279, 15, 44011, 15, 1137, 14, 1);
+
+  /* 1645/2048 = (Sin[29*Pi/64] + Cos[29*Pi/64])/Sqrt[2] = 0.8032075314806449 */
+  /*   305/256 = (Sin[29*Pi/64] - Cos[29*Pi/64])*Sqrt[2] = 1.1913986089848667 */
+  /*  425/2048 = Cos[29*Pi/64]*Sqrt[2]                   = 0.2075082269882116 */
+  od_rotate_add(se, s1, od_sub(*s1, *se), 1645, 11, 305, 8, 425, 11, 1);
+
+  /* 14053/16384 = (Sin[27*Pi/64] + Cos[27*Pi/64])/Sqrt[2] = 0.85772861000027 */
+  /*   8423/8192 = (Sin[27*Pi/64] - Cos[27*Pi/64])*Sqrt[2] = 1.02820548838644 */
+  /*   2815/8192 = Cos[27*Pi/64]*Sqrt[2]                   = 0.34362586580705 */
+  od_rotate_sub(s2, sd, od_add(*sd, *s2), 14053, 14, 8423, 13, 2815, 13, 1);
+
+  /* 14811/16384 = (Sin[25*Pi/64] + Cos[25*Pi/64])/Sqrt[2] = 0.90398929312344 */
+  /*   7005/8192 = (Sin[25*Pi/64] - Cos[25*Pi/64])*Sqrt[2] = 0.85511018686056 */
+  /*   3903/8192 = Cos[25*Pi/64]*Sqrt[2]                   = 0.47643419969316 */
+  od_rotate_add(sc, s3, od_sub(*s3, *sc), 14811, 14, 7005, 13, 3903, 13, 1);
+
+  /* 30853/32768 = (Sin[23*Pi/64] + Cos[23*Pi/64])/Sqrt[2] = 0.94154406518302 */
+  /* 11039/16384 = (Sin[23*Pi/64] - Cos[23*Pi/64])*Sqrt[2] = 0.67377970678444 */
+  /* 19813/32768 = Cos[23*Pi/64]*Sqrt[2]                   = 0.60465421179080 */
+  od_rotate_sub(s4, sb, od_add(*sb, *s4), 30853, 15, 11039, 14, 19813, 15, 1);
+
+  /* 15893/16384 = (Sin[21*Pi/64] + Cos[21*Pi/64])/Sqrt[2] = 0.97003125319454 */
+  /*   3981/8192 = (Sin[21*Pi/64] - Cos[21*Pi/64])*Sqrt[2] = 0.48596035980653 */
+  /*   1489/2048 = Cos[21*Pi/64]*Sqrt[2]                   = 0.72705107329128 */
+  od_rotate_add(sa, s5, od_sub(*s5, *sa), 15893, 14, 3981, 13, 1489, 11, 1);
+
+  /* 32413/32768 = (Sin[19*Pi/64] + Cos[19*Pi/64])/Sqrt[2] = 0.98917650996478 */
+  /*    601/2048 = (Sin[19*Pi/64] - Cos[19*Pi/64])*Sqrt[2] = 0.29346094891072 */
+  /* 27605/32768 = Cos[19*Pi/64]*Sqrt[2]                   = 0.84244603550942 */
+  od_rotate_sub(s6, s9, od_add(*s9, *s6), 32413, 15, 601, 11, 27605, 15, 1);
+
+  /* 32729/32768 = (Sin[17*Pi/64] + Cos[17*Pi/64])/Sqrt[2] = 0.99879545620517 */
+  /*    201/2048 = (Sin[17*Pi/64] - Cos[17*Pi/64])*Sqrt[2] = 0.09813534865484 */
+  /* 31121/32768 = Cos[17*Pi/64]*Sqrt[2]                   = 0.94972778187775 */
+  od_rotate_add(s8, s7, od_sub(*s7, *s8), 32729, 15, 201, 11, 31121, 15, 1);
+
+  /* Stage 1 */
+
+  od_butterfly_sub_asym(s0, od_rshift1(*s0), s7);
+  od_butterfly_sub_asym(s8, od_rshift1(*s8), sf);
+  od_butterfly_add_asym(s4, od_rshift1(*s4), s3);
+  od_butterfly_add_asym(sc, od_rshift1(*sc), sb);
+  od_butterfly_sub_asym(s2, od_rshift1(*s2), s5);
+  od_butterfly_sub_asym(sa, od_rshift1(*sa), sd);
+  od_butterfly_add_asym(s6, od_rshift1(*s6), s1);
+  od_butterfly_add_asym(se, od_rshift1(*se), s9);
+
+  /* Stage 2 */
+
+  od_butterfly_add(s8, NULL, s4);
+  od_butterfly_add(s7, NULL, sb);
+  od_butterfly_sub(sa, NULL, s6);
+  od_butterfly_sub(s5, NULL, s9);
+  od_butterfly_add(s0, &s0h, s3);
+  od_butterfly_add(sd, &sdh, se);
+  od_butterfly_sub(s2, &s2h, s1);
+  od_butterfly_sub(sf, &sfh, sc);
+
+  /* Stage 3 */
+
+  /*   9633/8192 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586 */
+  /* 12873/16384 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 */
+  /* 12785/32768 = 2*Cos[7*Pi/16]              = 0.3901806440322565 */
+  od_rotate_sub(s8, s7, od_avg_add(*s7, *s8), 9633, 13, 12873, 14, 12785, 15,
+   0);
+
+  /* 45451/32768 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475 */
+  /*  9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431 */
+  /* 18205/32768 = Cos[5*Pi/16]                = 0.5555702330196022 */
+  od_rotate_sub(s9, s6, od_add(*s6, *s9), 45451, 15, 9041, 15, 18205, 15, 0);
+
+  /* 22725/16384 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475 */
+  /*  9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431 */
+  /* 18205/16384 = 2*Cos[5*Pi/16]              = 1.1111404660392044 */
+  od_rotate_neg(s5, sa, od_avg_sub(*s5, *sa), 22725, 14, 9041, 15, 18205, 14);
+
+  /* 38531/32768 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586 */
+  /* 12873/16384 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 */
+  /*  6393/32768 = Cos[7*Pi/16]                = 0.1950903220161283 */
+  od_rotate_neg(s4, sb, od_sub(*s4, *sb), 38531, 15, 12873, 14, 6393, 15);
+
+  /* Stage 4 */
+
+  od_butterfly_add_asym(s2, s2h, sc);
+  od_butterfly_sub_asym(s0, s0h, s1);
+  od_butterfly_add_asym(sf, sfh, se);
+  od_butterfly_add_asym(sd, sdh, s3);
+  od_butterfly_add_asym(s7, od_rshift1(*s7), s6);
+  od_butterfly_sub_asym(s8, od_rshift1(*s8), s9);
+  od_butterfly_sub_asym(sa, od_rshift1(*sa), sb);
+  od_butterfly_add_asym(s5, od_rshift1(*s5), s4);
+
+  /* Stage 5 */
+
+  /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
+  /*  8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
+  /*   3135/4096 = 2*Cos[3*Pi/8]             = 0.7653668647301796 */
+  od_rotate_sub(sc, s3, od_avg_add(*s3, *sc), 21407, 14, 8867, 14, 3135, 12, 0);
+
+  /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
+  /*  8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
+  /*   3135/4096 = 2*Cos[3*Pi/8]             = 0.7653668647301796 */
+  od_rotate_neg(s2, sd, od_avg_sub(*s2, *sd), 21407, 14, 8867, 14, 3135, 12);
+
+  /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
+  /* 11585/8192 = 2*Cos[Pi/4]           = 1.4142135623730951 */
+  od_rotate_pi4_sub(sa, s5, od_avg_add(*s5, *sa), 11585, 13, 11585, 13);
+
+  /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
+  /* 11585/8192 = 2*Cos[Pi/4]           = 1.4142135623730951 */
+  od_rotate_pi4_sub(s6, s9, od_avg_add(*s9, *s6), 11585, 13, 11585, 13);
+
+  /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
+  /* 11585/8192 = 2*Cos[Pi/4]           = 1.4142135623730951 */
+  od_rotate_pi4_sub(se, s1, od_avg_add(*s1, *se), 11585, 13, 11585, 13);
+}
+
+/**
+ * 16-point orthonormal Type-IV iDST
+ *
+ * Parameters are declared in the order s0, s8, s4, sc, s2, sa, s6, se, s1, s9,
+ *  s5, sd, s3, sb, s7, sf.
+ * Runs the stages in the order 5, 4, 3, 2, 1, 0. The Stage 4 and Stage 1
+ *  butterflies pass the sNh locals by address; Stage 2 consumes s0h, sdh, s2h
+ *  and sfh, and Stage 0 uses s8h, s6h, sah, s4h, sch, s2h, seh and s0h in the
+ *  rotation sums/differences in place of the corresponding *sN.
+ * NOTE(review): the od_rotate_* and od_butterfly_* helpers are defined outside
+ *  this excerpt; presumably they update their pointer arguments in place and
+ *  a NULL second argument means the half-value is not wanted -- confirm there.
+ */
+static INLINE void od_idst_16(od_coeff *s0, od_coeff *s8,
+                              od_coeff *s4, od_coeff *sc,
+                              od_coeff *s2, od_coeff *sa,
+                              od_coeff *s6, od_coeff *se,
+                              od_coeff *s1, od_coeff *s9,
+                              od_coeff *s5, od_coeff *sd,
+                              od_coeff *s3, od_coeff *sb,
+                              od_coeff *s7, od_coeff *sf) {
+  od_coeff s0h;
+  od_coeff s2h;
+  od_coeff s4h;
+  od_coeff s6h;
+  od_coeff s8h;
+  od_coeff sah;
+  od_coeff sch;
+  od_coeff sdh;
+  od_coeff seh;
+  od_coeff sfh;
+
+  /* Stage 5 */
+
+  /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
+  /* 11585/8192 = 2*Cos[Pi/4]           = 1.4142135623730951 */
+  od_rotate_pi4_sub(s6, s9, od_avg_add(*s9, *s6), 11585, 13, 11585, 13);
+
+  /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
+  /* 11585/8192 = 2*Cos[Pi/4]           = 1.4142135623730951 */
+  od_rotate_pi4_sub(sa, s5, od_avg_add(*s5, *sa), 11585, 13, 11585, 13);
+
+  /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
+  /* 11585/8192 = 2*Cos[Pi/4]           = 1.4142135623730951 */
+  od_rotate_pi4_sub(se, s1, od_avg_add(*s1, *se), 11585, 13, 11585, 13);
+
+  /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
+  /*  8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
+  /*   3135/4096 = 2*Cos[3*Pi/8]             = 0.7653668647301796 */
+  od_rotate_sub(sc, s3, od_avg_add(*s3, *sc), 21407, 14, 8867, 14, 3135, 12, 0);
+
+  /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
+  /*  8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
+  /*   3135/4096 = 2*Cos[3*Pi/8]             = 0.7653668647301796 */
+  od_rotate_neg(sd, s2, od_avg_sub(*sd, *s2), 21407, 14, 8867, 14, 3135, 12);
+
+  /* Stage 4 */
+
+  od_butterfly_add(s5, NULL, s4);
+  od_butterfly_sub(sa, NULL, sb);
+  od_butterfly_sub(s8, NULL, s9);
+  od_butterfly_add(s7, NULL, s6);
+  od_butterfly_add(sd, &sdh, s3);
+  od_butterfly_add(sf, &sfh, se);
+  od_butterfly_sub(s0, &s0h, s1);
+  od_butterfly_add(s2, &s2h, sc);
+
+  /* Stage 3 */
+
+  /*   9633/8192 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586 */
+  /* 12873/16384 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 */
+  /* 12785/32768 = 2*Cos[7*Pi/16]              = 0.3901806440322565 */
+  od_rotate_sub(s8, s7, od_avg_add(*s7, *s8), 9633, 13, 12873, 14, 12785, 15,
+   0);
+
+  /* 45451/32768 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475 */
+  /*  9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431 */
+  /* 18205/32768 = Cos[5*Pi/16]                = 0.5555702330196022 */
+  od_rotate_sub(s9, s6, od_add(*s6, *s9), 45451, 15, 9041, 15, 18205, 15, 0);
+
+  /* 22725/16384 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475 */
+  /*  9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431 */
+  /* 18205/16384 = 2*Cos[5*Pi/16]              = 1.1111404660392044 */
+  od_rotate_neg(sa, s5, od_avg_sub(*sa, *s5), 22725, 14, 9041, 15, 18205, 14);
+
+  /* 38531/32768 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586 */
+  /* 12873/16384 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 */
+  /*  6393/32768 = Cos[7*Pi/16]                = 0.1950903220161283 */
+  od_rotate_neg(sb, s4, od_sub(*sb, *s4), 38531, 15, 12873, 14, 6393, 15);
+
+  /* Stage 2 */
+
+  od_butterfly_add_asym(s8, od_rshift1(*s8), s4);
+  od_butterfly_add_asym(s7, od_rshift1(*s7), sb);
+  od_butterfly_sub_asym(sa, od_rshift1(*sa), s6);
+  od_butterfly_sub_asym(s5, od_rshift1(*s5), s9);
+  od_butterfly_add_asym(s0, s0h, s3);
+  od_butterfly_add_asym(sd, sdh, se);
+  od_butterfly_sub_asym(s2, s2h, s1);
+  od_butterfly_sub_asym(sf, sfh, sc);
+
+  /* Stage 1 */
+
+  od_butterfly_sub(s0, &s0h, s7);
+  od_butterfly_sub(s8, &s8h, sf);
+  od_butterfly_add(s4, &s4h, s3);
+  od_butterfly_add(sc, &sch, sb);
+  od_butterfly_sub(s2, &s2h, s5);
+  od_butterfly_sub(sa, &sah, sd);
+  od_butterfly_add(s6, &s6h, s1);
+  od_butterfly_add(se, &seh, s9);
+
+  /* Stage 0 */
+
+  /* 32729/32768 = (Sin[17*Pi/64] + Cos[17*Pi/64])/Sqrt[2] = 0.99879545620517 */
+  /*    201/2048 = (Sin[17*Pi/64] - Cos[17*Pi/64])*Sqrt[2] = 0.09813534865484 */
+  /* 31121/32768 = Cos[17*Pi/64]*Sqrt[2]                   = 0.94972778187775 */
+  od_rotate_add(s8, s7, od_sub(*s7, s8h), 32729, 15, 201, 11, 31121, 15, 0);
+
+  /* 32413/32768 = (Sin[19*Pi/64] + Cos[19*Pi/64])/Sqrt[2] = 0.98917650996478 */
+  /*    601/2048 = (Sin[19*Pi/64] - Cos[19*Pi/64])*Sqrt[2] = 0.29346094891072 */
+  /* 27605/32768 = Cos[19*Pi/64]*Sqrt[2]                   = 0.84244603550942 */
+  od_rotate_sub(s6, s9, od_add(*s9, s6h), 32413, 15, 601, 11, 27605, 15, 0);
+
+  /* 15893/16384 = (Sin[21*Pi/64] + Cos[21*Pi/64])/Sqrt[2] = 0.97003125319454 */
+  /*   3981/8192 = (Sin[21*Pi/64] - Cos[21*Pi/64])*Sqrt[2] = 0.48596035980653 */
+  /*   1489/2048 = Cos[21*Pi/64]*Sqrt[2]                   = 0.72705107329128 */
+  od_rotate_add(sa, s5, od_sub(*s5, sah), 15893, 14, 3981, 13, 1489, 11, 0);
+
+  /* 30853/32768 = (Sin[23*Pi/64] + Cos[23*Pi/64])/Sqrt[2] = 0.94154406518302 */
+  /* 11039/16384 = (Sin[23*Pi/64] - Cos[23*Pi/64])*Sqrt[2] = 0.67377970678444 */
+  /* 19813/32768 = Cos[23*Pi/64]*Sqrt[2]                   = 0.60465421179080 */
+  od_rotate_sub(s4, sb, od_add(*sb, s4h), 30853, 15, 11039, 14, 19813, 15, 0);
+
+  /* 14811/16384 = (Sin[25*Pi/64] + Cos[25*Pi/64])/Sqrt[2] = 0.90398929312344 */
+  /*   7005/8192 = (Sin[25*Pi/64] - Cos[25*Pi/64])*Sqrt[2] = 0.85511018686056 */
+  /*   3903/8192 = Cos[25*Pi/64]*Sqrt[2]                   = 0.47643419969316 */
+  od_rotate_add(sc, s3, od_sub(*s3, sch), 14811, 14, 7005, 13, 3903, 13, 0);
+
+  /* 14053/16384 = (Sin[27*Pi/64] + Cos[27*Pi/64])/Sqrt[2] = 0.85772861000027 */
+  /*   8423/8192 = (Sin[27*Pi/64] - Cos[27*Pi/64])*Sqrt[2] = 1.02820548838644 */
+  /*   2815/8192 = Cos[27*Pi/64]*Sqrt[2]                   = 0.34362586580705 */
+  od_rotate_sub(s2, sd, od_add(*sd, s2h), 14053, 14, 8423, 13, 2815, 13, 0);
+
+  /* 1645/2048 = (Sin[29*Pi/64] + Cos[29*Pi/64])/Sqrt[2] = 0.8032075314806449 */
+  /*   305/256 = (Sin[29*Pi/64] - Cos[29*Pi/64])*Sqrt[2] = 1.1913986089848667 */
+  /*  425/2048 = Cos[29*Pi/64]*Sqrt[2]                   = 0.2075082269882116 */
+  od_rotate_add(se, s1, od_sub(*s1, seh), 1645, 11, 305, 8, 425, 11, 0);
+
+  /* 24279/32768 = (Sin[31*Pi/64] + Cos[31*Pi/64])/Sqrt[2] = 0.74095112535496 */
+  /* 44011/32768 = (Sin[31*Pi/64] - Cos[31*Pi/64])*Sqrt[2] = 1.34311790969404 */
+  /*  1137/16384 = Cos[31*Pi/64]*Sqrt[2]                   = 0.06939217050794 */
+  od_rotate_sub(s0, sf, od_add(*sf, s0h), 24279, 15, 44011, 15, 1137, 14, 0);
+}
+
+/**
+ * 16-point asymmetric Type-IV fDST
+ *
+ * Each even-indexed coefficient s0, s2, ..., se comes with a by-value
+ *  companion s0h, s2h, ..., seh that replaces *sN in the Stage 0 rotation
+ *  sums/differences. s0h and s2h are then passed by address to the Stage 2
+ *  butterflies together with the locals sdh and sfh, and all four are
+ *  consumed in Stage 4.
+ * Stages 3 and 5 form their rotation inputs with od_add/od_sub here.
+ * NOTE(review): presumably each sNh carries half of *sN and the helpers update
+ *  their pointer arguments in place -- the od_rotate_* and od_butterfly_*
+ *  helpers are defined outside this excerpt, so confirm there.
+ */
+static INLINE void od_fdst_16_asym(od_coeff *s0, od_coeff s0h, od_coeff *s1,
+                                   od_coeff *s2, od_coeff s2h, od_coeff *s3,
+                                   od_coeff *s4, od_coeff s4h, od_coeff *s5,
+                                   od_coeff *s6, od_coeff s6h, od_coeff *s7,
+                                   od_coeff *s8, od_coeff s8h, od_coeff *s9,
+                                   od_coeff *sa, od_coeff sah, od_coeff *sb,
+                                   od_coeff *sc, od_coeff sch, od_coeff *sd,
+                                   od_coeff *se, od_coeff seh, od_coeff *sf) {
+  od_coeff sdh;
+  od_coeff sfh;
+
+  /* Stage 0 */
+
+  /*   1073/2048 = (Sin[31*Pi/64] + Cos[31*Pi/64])/2 = 0.5239315652662953 */
+  /* 62241/32768 = (Sin[31*Pi/64] - Cos[31*Pi/64])*2 = 1.8994555637555088 */
+  /*    201/2048 = Cos[31*Pi/64]*2                   = 0.0981353486548360 */
+  od_rotate_sub(s0, sf, od_add(*sf, s0h), 1073, 11, 62241, 15, 201, 11, 1);
+
+  /* 18611/32768 = (Sin[29*Pi/64] + Cos[29*Pi/64])/2 = 0.5679534922100714 */
+  /* 55211/32768 = (Sin[29*Pi/64] - Cos[29*Pi/64])*2 = 1.6848920710188384 */
+  /*    601/2048 = Cos[29*Pi/64]*2                   = 0.2934609489107235 */
+  od_rotate_add(se, s1, od_sub(*s1, seh), 18611, 15, 55211, 15, 601, 11, 1);
+
+  /*  9937/16384 = (Sin[27*Pi/64] + Cos[27*Pi/64])/2 = 0.6065057165489039 */
+  /*   1489/1024 = (Sin[27*Pi/64] - Cos[27*Pi/64])*2 = 1.4541021465825602 */
+  /*   3981/8192 = Cos[27*Pi/64]*2                   = 0.4859603598065277 */
+  od_rotate_sub(s2, sd, od_add(*sd, s2h), 9937, 14, 1489, 10, 3981, 13, 1);
+
+  /* 10473/16384 = (Sin[25*Pi/64] + Cos[25*Pi/64])/2 = 0.6392169592876205 */
+  /* 39627/32768 = (Sin[25*Pi/64] - Cos[25*Pi/64])*2 = 1.2093084235816014 */
+  /* 11039/16384 = Cos[25*Pi/64]*2                   = 0.6737797067844401 */
+  od_rotate_add(sc, s3, od_sub(*s3, sch), 10473, 14, 39627, 15, 11039, 14, 1);
+
+  /* 2727/4096 = (Sin[23*Pi/64] + Cos[23*Pi/64])/2 = 0.6657721932768628 */
+  /* 3903/4096 = (Sin[23*Pi/64] - Cos[23*Pi/64])*2 = 0.9528683993863225 */
+  /* 7005/8192 = Cos[23*Pi/64]*2                   = 0.8551101868605642 */
+  od_rotate_sub(s4, sb, od_add(*sb, s4h), 2727, 12, 3903, 12, 7005, 13, 1);
+
+  /* 5619/8192 = (Sin[21*Pi/64] + Cos[21*Pi/64])/2 = 0.6859156770967569 */
+  /* 2815/4096 = (Sin[21*Pi/64] - Cos[21*Pi/64])*2 = 0.6872517316141069 */
+  /* 8423/8192 = Cos[21*Pi/64]*2                   = 1.0282054883864433 */
+  od_rotate_add(sa, s5, od_sub(*s5, sah), 5619, 13, 2815, 12, 8423, 13, 1);
+
+  /*   2865/4096 = (Sin[19*Pi/64] + Cos[19*Pi/64])/2 = 0.6994534179865391 */
+  /* 13599/32768 = (Sin[19*Pi/64] - Cos[19*Pi/64])*2 = 0.4150164539764232 */
+  /*     305/256 = Cos[19*Pi/64]*2                   = 1.1913986089848667 */
+  od_rotate_sub(s6, s9, od_add(*s9, s6h), 2865, 12, 13599, 15, 305, 8, 1);
+
+  /* 23143/32768 = (Sin[17*Pi/64] + Cos[17*Pi/64])/2 = 0.7062550401009887 */
+  /*   1137/8192 = (Sin[17*Pi/64] - Cos[17*Pi/64])*2 = 0.1387843410158816 */
+  /* 44011/32768 = Cos[17*Pi/64]*2                   = 1.3431179096940367 */
+  od_rotate_add(s8, s7, od_sub(*s7, s8h), 23143, 15, 1137, 13, 44011, 15, 1);
+
+  /* Stage 1 */
+
+  od_butterfly_sub_asym(s0, od_rshift1(*s0), s7);
+  od_butterfly_sub_asym(s8, od_rshift1(*s8), sf);
+  od_butterfly_add_asym(s4, od_rshift1(*s4), s3);
+  od_butterfly_add_asym(sc, od_rshift1(*sc), sb);
+  od_butterfly_sub_asym(s2, od_rshift1(*s2), s5);
+  od_butterfly_sub_asym(sa, od_rshift1(*sa), sd);
+  od_butterfly_add_asym(s6, od_rshift1(*s6), s1);
+  od_butterfly_add_asym(se, od_rshift1(*se), s9);
+
+  /* Stage 2 */
+
+  od_butterfly_add(s8, NULL, s4);
+  od_butterfly_add(s7, NULL, sb);
+  od_butterfly_sub(sa, NULL, s6);
+  od_butterfly_sub(s5, NULL, s9);
+  od_butterfly_add(s0, &s0h, s3);
+  od_butterfly_add(sd, &sdh, se);
+  od_butterfly_sub(s2, &s2h, s1);
+  od_butterfly_sub(sf, &sfh, sc);
+
+  /* Stage 3 */
+
+  /*   9633/8192 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586 */
+  /* 12873/16384 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 */
+  /*  6393/32768 = Cos[7*Pi/16]                = 0.1950903220161283 */
+  od_rotate_sub(s8, s7, od_add(*s7, *s8), 9633, 13, 12873, 14, 6393, 15, 0);
+
+  /* 45451/32768 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475 */
+  /*  9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431 */
+  /* 18205/32768 = Cos[5*Pi/16]                = 0.5555702330196022 */
+  od_rotate_sub(s9, s6, od_add(*s6, *s9), 45451, 15, 9041, 15, 18205, 15, 0);
+
+  /*  11363/8192 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475 */
+  /*  9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431 */
+  /*   4551/8192 = Cos[5*Pi/16]                = 0.5555702330196022 */
+  od_rotate_neg(s5, sa, od_sub(*s5, *sa), 11363, 13, 9041, 15, 4551, 13);
+
+  /*   9633/8192 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586 */
+  /* 12873/16384 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 */
+  /*  6393/32768 = Cos[7*Pi/16]                = 0.1950903220161283 */
+  od_rotate_neg(s4, sb, od_sub(*s4, *sb), 9633, 13, 12873, 14, 6393, 15);
+
+  /* Stage 4 */
+
+  od_butterfly_add_asym(s2, s2h, sc);
+  od_butterfly_sub_asym(s0, s0h, s1);
+  od_butterfly_add_asym(sf, sfh, se);
+  od_butterfly_add_asym(sd, sdh, s3);
+  od_butterfly_add_asym(s7, od_rshift1(*s7), s6);
+  od_butterfly_sub_asym(s8, od_rshift1(*s8), s9);
+  od_butterfly_sub_asym(sa, od_rshift1(*sa), sb);
+  od_butterfly_add_asym(s5, od_rshift1(*s5), s4);
+
+  /* Stage 5 */
+
+  /*  10703/8192 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
+  /*  8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
+  /*   3135/8192 = Cos[3*Pi/8]               = 0.3826834323650898 */
+  od_rotate_sub(sc, s3, od_add(*s3, *sc), 10703, 13, 8867, 14, 3135, 13, 0);
+
+  /*  10703/8192 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
+  /*  8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
+  /*   3135/8192 = Cos[3*Pi/8]               = 0.3826834323650898 */
+  od_rotate_neg(s2, sd, od_sub(*s2, *sd), 10703, 13, 8867, 14, 3135, 13);
+
+  /*  11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
+  /* 11585/16384 = Cos[Pi/4]             = 0.7071067811865475 */
+  od_rotate_pi4_sub(sa, s5, od_add(*s5, *sa), 11585, 13, 11585, 14);
+
+  /*  11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
+  /* 11585/16384 = Cos[Pi/4]             = 0.7071067811865475 */
+  od_rotate_pi4_sub(s6, s9, od_add(*s9, *s6), 11585, 13, 11585, 14);
+
+  /*  11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
+  /* 11585/16384 = Cos[Pi/4]             = 0.7071067811865475 */
+  od_rotate_pi4_sub(se, s1, od_add(*s1, *se), 11585, 13, 11585, 14);
+}
+
+/**
+ * 16-point asymmetric Type-IV iDST
+ */
+static INLINE void od_idst_16_asym(od_coeff *s0, od_coeff *s8,
+                                   od_coeff *s4, od_coeff *sc,
+                                   od_coeff *s2, od_coeff *sa,
+                                   od_coeff *s6, od_coeff *se,
+                                   od_coeff *s1, od_coeff *s9,
+                                   od_coeff *s5, od_coeff *sd,
+                                   od_coeff *s3, od_coeff *sb,
+                                   od_coeff *s7, od_coeff *sf) {
+  od_coeff s0h;
+  od_coeff s2h;
+  od_coeff s4h;
+  od_coeff s6h;
+  od_coeff s8h;
+  od_coeff sah;
+  od_coeff sch;
+  od_coeff sdh;
+  od_coeff seh;
+  od_coeff sfh;
+
+  /* Stage 5 */
+
+  /*  11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
+  /* 11585/16384 = Cos[Pi/4]             = 0.7071067811865475 */
+  od_rotate_pi4_sub(s6, s9, od_add(*s9, *s6), 11585, 13, 11585, 14);
+
+  /*  11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
+  /* 11585/16384 = Cos[Pi/4]             = 0.7071067811865475 */
+  od_rotate_pi4_sub(sa, s5, od_add(*s5, *sa), 11585, 13, 11585, 14);
+
+  /*  11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
+  /* 11585/16384 = Cos[Pi/4]             = 0.7071067811865475 */
+  od_rotate_pi4_sub(se, s1, od_add(*s1, *se), 11585, 13, 11585, 14);
+
+  /*  10703/8192 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
+  /*  8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
+  /*   3135/8192 = Cos[3*Pi/8]               = 0.3826834323650898 */
+  od_rotate_sub(sc, s3, od_add(*s3, *sc), 10703, 13, 8867, 14, 3135, 13, 0);
+
+  /*  10703/8192 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
+  /*  8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
+  /*   3135/8192 = Cos[3*Pi/8]               = 0.3826834323650898 */
+  od_rotate_neg(sd, s2, od_sub(*sd, *s2), 10703, 13, 8867, 14, 3135, 13);
+
+  /* Stage 4 */
+
+  od_butterfly_add(s5, NULL, s4);
+  od_butterfly_sub(sa, NULL, sb);
+  od_butterfly_sub(s8, NULL, s9);
+  od_butterfly_add(s7, NULL, s6);
+  od_butterfly_add(sd, &sdh, s3);
+  od_butterfly_add(sf, &sfh, se);
+  od_butterfly_sub(s0, &s0h, s1);
+  od_butterfly_add(s2, &s2h, sc);
+
+  /* Stage 3 */
+
+  /*   9633/8192 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586 */
+  /* 12873/16384 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 */
+  /*  6393/32768 = Cos[7*Pi/16]                = 0.1950903220161283 */
+  od_rotate_neg(sb, s4, od_sub(*sb, *s4), 9633, 13, 12873, 14, 6393, 15);
+
+  /*  11363/8192 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475 */
+  /*  9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431 */
+  /*   4551/8192 = Cos[5*Pi/16]                = 0.5555702330196022 */
+  od_rotate_neg(sa, s5, od_sub(*sa, *s5), 11363, 13, 9041, 15, 4551, 13);
+
+  /* 22725/16384 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475 */
+  /*  9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431 */
+  /* 18205/32768 = Cos[5*Pi/16]                = 0.5555702330196022 */
+  od_rotate_sub(s9, s6, od_add(*s6, *s9), 22725, 14, 9041, 15, 18205, 15, 0);
+
+  /*   9633/8192 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586 */
+  /* 12873/16384 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 */
+  /*  6393/32768 = Cos[7*Pi/16]                = 0.1950903220161283 */
+  od_rotate_sub(s8, s7, od_add(*s7, *s8), 9633, 13, 12873, 14, 6393, 15, 0);
+
+  /* Stage 2 */
+
+  od_butterfly_add_asym(s8, od_rshift1(*s8), s4);
+  od_butterfly_add_asym(s7, od_rshift1(*s7), sb);
+  od_butterfly_sub_asym(sa, od_rshift1(*sa), s6);
+  od_butterfly_sub_asym(s5, od_rshift1(*s5), s9);
+  od_butterfly_add_asym(s0, s0h, s3);
+  od_butterfly_add_asym(sd, sdh, se);
+  od_butterfly_sub_asym(s2, s2h, s1);
+  od_butterfly_sub_asym(sf, sfh, sc);
+
+  /* Stage 1 */
+
+  od_butterfly_sub(s0, &s0h, s7);
+  od_butterfly_sub(s8, &s8h, sf);
+  od_butterfly_add(s4, &s4h, s3);
+  od_butterfly_add(sc, &sch, sb);
+  od_butterfly_sub(s2, &s2h, s5);
+  od_butterfly_sub(sa, &sah, sd);
+  od_butterfly_add(s6, &s6h, s1);
+  od_butterfly_add(se, &seh, s9);
+
+  /* Stage 0 */
+
+  /* 23143/32768 = (Sin[17*Pi/64] + Cos[17*Pi/64])/2 = 0.7062550401009887 */
+  /*   1137/8192 = (Sin[17*Pi/64] - Cos[17*Pi/64])*2 = 0.1387843410158816 */
+  /* 44011/32768 = Cos[17*Pi/64]*2                   = 1.3431179096940367 */
+  od_rotate_add(s8, s7, od_sub(*s7, s8h), 23143, 15, 1137, 13, 44011, 15, 1);
+
+  /*   2865/4096 = (Sin[19*Pi/64] + Cos[19*Pi/64])/2 = 0.6994534179865391 */
+  /* 13599/32768 = (Sin[19*Pi/64] - Cos[19*Pi/64])*2 = 0.4150164539764232 */
+  /*     305/256 = Cos[19*Pi/64]*2                   = 1.1913986089848667 */
+  od_rotate_sub(s6, s9, od_add(*s9, s6h), 2865, 12, 13599, 15, 305, 8, 1);
+
+  /* 5619/8192 = (Sin[21*Pi/64] + Cos[21*Pi/64])/2 = 0.6859156770967569 */
+  /* 2815/4096 = (Sin[21*Pi/64] - Cos[21*Pi/64])*2 = 0.6872517316141069 */
+  /* 8423/8192 = Cos[21*Pi/64]*2                   = 1.0282054883864433 */
+  od_rotate_add(sa, s5, od_sub(*s5, sah), 5619, 13, 2815, 12, 8423, 13, 1);
+
+  /* 2727/4096 = (Sin[23*Pi/64] + Cos[23*Pi/64])/2 = 0.6657721932768628 */
+  /* 3903/4096 = (Sin[23*Pi/64] - Cos[23*Pi/64])*2 = 0.9528683993863225 */
+  /* 7005/8192 = Cos[23*Pi/64]*2                   = 0.8551101868605642 */
+  od_rotate_sub(s4, sb, od_add(*sb, s4h), 2727, 12, 3903, 12, 7005, 13, 1);
+
+  /* 10473/16384 = (Sin[25*Pi/64] + Cos[25*Pi/64])/2 = 0.6392169592876205 */
+  /* 39627/32768 = (Sin[25*Pi/64] - Cos[25*Pi/64])*2 = 1.2093084235816014 */
+  /* 11039/16384 = Cos[25*Pi/64]*2                   = 0.6737797067844401 */
+  od_rotate_add(sc, s3, od_sub(*s3, sch), 10473, 14, 39627, 15, 11039, 14, 1);
+
+  /*  9937/16384 = (Sin[27*Pi/64] + Cos[27*Pi/64])/2 = 0.6065057165489039 */
+  /*   1489/1024 = (Sin[27*Pi/64] - Cos[27*Pi/64])*2 = 1.4541021465825602 */
+  /*   3981/8192 = Cos[27*Pi/64]*2                   = 0.4859603598065277 */
+  od_rotate_sub(s2, sd, od_add(*sd, s2h), 9937, 14, 1489, 10, 3981, 13, 1);
+
+  /* 18611/32768 = (Sin[29*Pi/64] + Cos[29*Pi/64])/2 = 0.5679534922100714 */
+  /* 55211/32768 = (Sin[29*Pi/64] - Cos[29*Pi/64])*2 = 1.6848920710188384 */
+  /*    601/2048 = Cos[29*Pi/64]*2                   = 0.2934609489107235 */
+  od_rotate_add(se, s1, od_sub(*s1, seh), 18611, 15, 55211, 15, 601, 11, 1);
+
+  /*   1073/2048 = (Sin[31*Pi/64] + Cos[31*Pi/64])/2 = 0.5239315652662953 */
+  /* 62241/32768 = (Sin[31*Pi/64] - Cos[31*Pi/64])*2 = 1.8994555637555088 */
+  /*    201/2048 = Cos[31*Pi/64]*2                   = 0.0981353486548360 */
+  od_rotate_sub(s0, sf, od_add(*sf, s0h), 1073, 11, 62241, 15, 201, 11, 1);
+}
+
+/* --- 32-point Transforms --- */
+
+/**
+ * 32-point orthonormal Type-II fDCT (16 butterflies + 16-point asym fDCT/fDST)
+ */
+static INLINE void od_fdct_32(od_coeff *t0, od_coeff *t1,
+                              od_coeff *t2, od_coeff *t3,
+                              od_coeff *t4, od_coeff *t5,
+                              od_coeff *t6, od_coeff *t7,
+                              od_coeff *t8, od_coeff *t9,
+                              od_coeff *ta, od_coeff *tb,
+                              od_coeff *tc, od_coeff *td,
+                              od_coeff *te, od_coeff *tf,
+                              od_coeff *tg, od_coeff *th,
+                              od_coeff *ti, od_coeff *tj,
+                              od_coeff *tk, od_coeff *tl,
+                              od_coeff *tm, od_coeff *tn,
+                              od_coeff *to, od_coeff *tp,
+                              od_coeff *tq, od_coeff *tr,
+                              od_coeff *ts, od_coeff *tt,
+                              od_coeff *tu, od_coeff *tv) {
+  od_coeff t1h;
+  od_coeff t3h;
+  od_coeff t5h;
+  od_coeff t7h;
+  od_coeff t9h;
+  od_coeff tbh;
+  od_coeff tdh;
+  od_coeff tfh;
+  od_coeff thh;
+  od_coeff tjh;
+  od_coeff tlh;
+  od_coeff tnh;
+  od_coeff tph;
+  od_coeff trh;
+  od_coeff tth;
+  od_coeff tvh;
+
+  /* +/- Butterflies with asymmetric output; pairs t0/tv, t1/tu, ..., tf/tg. */
+  od_butterfly_neg(t0, tv, &tvh);
+  od_butterfly_add(t1, &t1h, tu);
+  od_butterfly_neg(t2, tt, &tth);
+  od_butterfly_add(t3, &t3h, ts);
+  od_butterfly_neg(t4, tr, &trh);
+  od_butterfly_add(t5, &t5h, tq);
+  od_butterfly_neg(t6, tp, &tph);
+  od_butterfly_add(t7, &t7h, to);
+  od_butterfly_neg(t8, tn, &tnh);
+  od_butterfly_add(t9, &t9h, tm);
+  od_butterfly_neg(ta, tl, &tlh);
+  od_butterfly_add(tb, &tbh, tk);
+  od_butterfly_neg(tc, tj, &tjh);
+  od_butterfly_add(td, &tdh, ti);
+  od_butterfly_neg(te, th, &thh);
+  od_butterfly_add(tf, &tfh, tg);
+
+  /* Embedded 16-point transforms with asymmetric input (t0..tf, then tv..tg). */
+  od_fdct_16_asym(
+   t0, t1, t1h, t2, t3, t3h, t4, t5, t5h, t6, t7, t7h,
+   t8, t9, t9h, ta, tb, tbh, tc, td, tdh, te, tf, tfh);
+  od_fdst_16_asym(
+   tv, tvh, tu, tt, tth, ts, tr, trh, tq, tp, tph, to,
+   tn, tnh, tm, tl, tlh, tk, tj, tjh, ti, th, thh, tg);
+}
+
+/**
+ * 32-point orthonormal Type-II iDCT (16-point asym iDST/iDCT + 16 butterflies)
+ */
+static INLINE void od_idct_32(od_coeff *t0, od_coeff *tg,
+                              od_coeff *t8, od_coeff *to,
+                              od_coeff *t4, od_coeff *tk,
+                              od_coeff *tc, od_coeff *ts,
+                              od_coeff *t2, od_coeff *ti,
+                              od_coeff *ta, od_coeff *tq,
+                              od_coeff *t6, od_coeff *tm,
+                              od_coeff *te, od_coeff *tu,
+                              od_coeff *t1, od_coeff *th,
+                              od_coeff *t9, od_coeff *tp,
+                              od_coeff *t5, od_coeff *tl,
+                              od_coeff *td, od_coeff *tt,
+                              od_coeff *t3, od_coeff *tj,
+                              od_coeff *tb, od_coeff *tr,
+                              od_coeff *t7, od_coeff *tn,
+                              od_coeff *tf, od_coeff *tv) {
+  od_coeff t1h;
+  od_coeff t3h;
+  od_coeff t5h;
+  od_coeff t7h;
+  od_coeff t9h;
+  od_coeff tbh;
+  od_coeff tdh;
+  od_coeff tfh;
+
+  /* Embedded 16-point transforms with asymmetric output. */
+  od_idst_16_asym(
+   tv, tn, tr, tj, tt, tl, tp, th, tu, tm, tq, ti, ts, tk, to, tg);
+  od_idct_16_asym(
+   t0, t8, t4, tc, t2, ta, t6, te,
+   t1, &t1h, t9, &t9h, t5, &t5h, td, &tdh,
+   t3, &t3h, tb, &tbh, t7, &t7h, tf, &tfh);
+
+  /* +/- Butterflies with asymmetric input, in reverse order of od_fdct_32. */
+  od_butterfly_add_asym(tf, tfh, tg);
+  od_butterfly_neg_asym(te, th, od_rshift1(*th));
+  od_butterfly_add_asym(td, tdh, ti);
+  od_butterfly_neg_asym(tc, tj, od_rshift1(*tj));
+  od_butterfly_add_asym(tb, tbh, tk);
+  od_butterfly_neg_asym(ta, tl, od_rshift1(*tl));
+  od_butterfly_add_asym(t9, t9h, tm);
+  od_butterfly_neg_asym(t8, tn, od_rshift1(*tn));
+  od_butterfly_add_asym(t7, t7h, to);
+  od_butterfly_neg_asym(t6, tp, od_rshift1(*tp));
+  od_butterfly_add_asym(t5, t5h, tq);
+  od_butterfly_neg_asym(t4, tr, od_rshift1(*tr));
+  od_butterfly_add_asym(t3, t3h, ts);
+  od_butterfly_neg_asym(t2, tt, od_rshift1(*tt));
+  od_butterfly_add_asym(t1, t1h, tu);
+  od_butterfly_neg_asym(t0, tv, od_rshift1(*tv));
+}
+
+#endif
diff --git a/av1/common/x86/daala_inv_txfm_avx2.c b/av1/common/x86/daala_inv_txfm_avx2.c
index 73f8029..f060bfe 100644
--- a/av1/common/x86/daala_inv_txfm_avx2.c
+++ b/av1/common/x86/daala_inv_txfm_avx2.c
@@ -982,6 +982,7 @@
   }
 }
 
+#if 0
 static void od_row_idct4_avx2(int16_t *out, int rows, const tran_low_t *in) {
   od_row_tx4_avx2(out, rows, in, od_idct4_kernel8_epi16);
 }
@@ -992,6 +993,7 @@
   od_col_tx4_add_hbd_avx2(output_pixels, output_stride, cols, in, bd,
                           od_idct4_kernel8_epi16);
 }
+#endif
 
 static void od_row_idst4_avx2(int16_t *out, int rows, const tran_low_t *in) {
   od_row_tx4_avx2(out, rows, in, od_idst_vii4_kernel8_epi16);
@@ -1034,6 +1036,7 @@
                                     __m256i *r6, __m256i *r1, __m256i *r5,
                                     __m256i *r3, __m256i *r7);
 
+#if 0
 static void od_row_tx8_avx2(int16_t *out, int rows, const tran_low_t *in,
                             od_tx8_kernel8_epi16 kernel8_epi16,
                             od_tx8_mm256_kernel kernel8_epi32) {
@@ -1176,6 +1179,7 @@
                           od_flip_idst8_kernel8_epi16,
                           od_flip_idst8_kernel16_epi16);
 }
+#endif
 
 static void od_row_iidtx8_avx2(int16_t *out, int rows, const tran_low_t *in) {
   od_row_iidtx_avx2(out, rows * 8, in);
@@ -1201,6 +1205,7 @@
                                      __m256i *sc, __m256i *sd, __m256i *se,
                                      __m256i *sf);
 
+#if 0
 static void od_row_tx16_avx2(int16_t *out, int rows, const tran_low_t *in,
 #if CONFIG_RECT_TX_EXT
                              od_tx16_kernel8_epi16 kernel8_epi16,
@@ -1422,6 +1427,7 @@
                            od_flip_idst16_kernel8_epi16,
                            od_flip_idst16_kernel16_epi16);
 }
+#endif
 
 static void od_row_iidtx16_avx2(int16_t *out, int rows, const tran_low_t *in) {
   od_row_iidtx_avx2(out, rows * 16, in);
@@ -1440,19 +1446,11 @@
 
 static const daala_row_itx TX_ROW_MAP[TX_SIZES][TX_TYPES] = {
   // 4-point transforms
-  { od_row_idct4_avx2, od_row_idst4_avx2, od_row_flip_idst4_avx2,
-    od_row_iidtx4_avx2 },
+  { NULL, od_row_idst4_avx2, od_row_flip_idst4_avx2, od_row_iidtx4_avx2 },
   // 8-point transforms
-  { od_row_idct8_avx2,
-#if CONFIG_DAALA_TX_DST8
-    NULL, NULL,
-#else
-    od_row_idst8_avx2, od_row_flip_idst8_avx2,
-#endif
-    od_row_iidtx8_avx2 },
+  { NULL, NULL, NULL, od_row_iidtx8_avx2 },
   // 16-point transforms
-  { od_row_idct16_avx2, od_row_idst16_avx2, od_row_flip_idst16_avx2,
-    od_row_iidtx16_avx2 },
+  { NULL, NULL, NULL, od_row_iidtx16_avx2 },
   // 32-point transforms
   { NULL, NULL, NULL, NULL },
 #if CONFIG_TX64X64
@@ -1480,19 +1478,12 @@
   // High bit depth output
   {
       // 4-point transforms
-      { od_col_idct4_add_hbd_avx2, od_col_idst4_add_hbd_avx2,
-        od_col_flip_idst4_add_hbd_avx2, od_col_iidtx4_add_hbd_avx2 },
+      { NULL, od_col_idst4_add_hbd_avx2, od_col_flip_idst4_add_hbd_avx2,
+        od_col_iidtx4_add_hbd_avx2 },
       // 8-point transforms
-      { od_col_idct8_add_hbd_avx2,
-#if CONFIG_DAALA_TX_DST8
-        NULL, NULL,
-#else
-        od_col_idst8_add_hbd_avx2, od_col_flip_idst8_add_hbd_avx2,
-#endif
-        od_col_iidtx8_add_hbd_avx2 },
+      { NULL, NULL, NULL, od_col_iidtx8_add_hbd_avx2 },
       // 16-point transforms
-      { od_col_idct16_add_hbd_avx2, od_col_idst16_add_hbd_avx2,
-        od_col_flip_idst16_add_hbd_avx2, od_col_iidtx16_add_hbd_avx2 },
+      { NULL, NULL, NULL, od_col_iidtx16_add_hbd_avx2 },
       // 32-point transforms
       { NULL, NULL, NULL, NULL },
 #if CONFIG_TX64X64