daala_tx: Unify the asym and ortho DST designs.
This patch refactors the DST transforms so that the orthonormal and
asymmetric transforms are now nearly identical (up to multiplication
constants and an extra set of shifts).
This means that the DST designs are now embeddable for every level
and should address hardware concerns about gate area.
In addition, minor changes were made to improve transform accuracy:
- all of the transforms now have perfect reconstruction for those
computations outside the rotations, i.e., all +/- butterfly steps
are exactly invertible
- two multiplication constants were reduced below 1.0 (better for
SIMD and gives slightly improved accuracy)
- the averaging bias is removed, which saves an extra addition for each
of the averaging steps
Additional averaging steps can be removed from the 8-point Type-IV DST,
giving a 68% reduction in MSE for the 32-point DCT, but this has not been
done in case we use it in place of the 8-point Type-VII DST.
subset-1:
master-daala_tx@2017-12-10T22:38:19.651Z ->
new-daala_tx@2017-12-10T22:37:50.844Z
PSNR | PSNR Cb | PSNR Cr | PSNR HVS | SSIM | MS SSIM | CIEDE 2000
0.0057 | -0.0210 | -0.1821 | 0.0085 | -0.0002 | 0.0147 | -0.0674
Change-Id: Ib124eebf6f2e4b3c51c078d4e8f229fc5ec26171
diff --git a/av1/av1.cmake b/av1/av1.cmake
index df73ad8..e32405b 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -31,6 +31,7 @@
"${AOM_ROOT}/av1/common/convolve.h"
"${AOM_ROOT}/av1/common/daala_tx.c"
"${AOM_ROOT}/av1/common/daala_tx.h"
+ "${AOM_ROOT}/av1/common/daala_tx_kernels.h"
"${AOM_ROOT}/av1/common/debugmodes.c"
"${AOM_ROOT}/av1/common/entropy.c"
"${AOM_ROOT}/av1/common/entropy.h"
diff --git a/av1/av1_common.mk b/av1/av1_common.mk
index c9ddf39..2c76f18 100644
--- a/av1/av1_common.mk
+++ b/av1/av1_common.mk
@@ -26,6 +26,7 @@
AV1_COMMON_SRCS-yes += common/common.h
AV1_COMMON_SRCS-yes += common/daala_tx.c
AV1_COMMON_SRCS-yes += common/daala_tx.h
+AV1_COMMON_SRCS-yes += common/daala_tx_kernels.h
AV1_COMMON_SRCS-yes += common/daala_inv_txfm.c
AV1_COMMON_SRCS-yes += common/daala_inv_txfm.h
AV1_COMMON_SRCS-$(HAVE_AVX2) += common/x86/daala_tx_kernels.h
diff --git a/av1/common/daala_tx.c b/av1/common/daala_tx.c
index 73f4596..854011b 100644
--- a/av1/common/daala_tx.c
+++ b/av1/common/daala_tx.c
@@ -1,5 +1,6 @@
#include "av1/common/daala_tx.h"
#include "av1/common/odintrin.h"
+#include "av1/common/daala_tx_kernels.h"
/* clang-format off */
@@ -39,32 +40,6 @@
} \
while (0)
-#define OD_FDCT_2_FLAT(p0, p1) \
- /* Embedded 2-point orthonormal Type-II fDCT with flattened rotations. */ \
- do { \
- int t_; \
- t_ = (p0 - p1 + 1) >> 1; \
- /* 46341/32768 ~= 2*Sin[Pi/4] = 1.4142135623730951 */ \
- p0 = (p1*46341 + 16384) >> 15; \
- /* 46341/32768 ~= 2*Cos[Pi/4] = 1.4142135623730951 */ \
- p1 = (t_*46341 + 16384) >> 15; \
- p0 += p1; \
- } \
- while (0)
-
-#define OD_IDCT_2_FLAT(p0, p1) \
- /* Embedded 2-point orthonormal Type-II iDCT with flattened rotations. */ \
- do { \
- int t_; \
- t_ = p0 + p1; \
- /* 11585/8192 ~= 2*Sin[Pi/4] = 1.4142135623730951 */ \
- p1 = (p0*11585 + 4096) >> 13; \
- /* 11585/16384 ~= Cos[Pi/4] = 0.7071067811865475 */ \
- p0 = (t_*11585 + 8192) >> 14; \
- p1 -= p0; \
- } \
- while (0)
-
#define OD_FDCT_2_ASYM_PR(p0, p1, p1h) \
/* Embedded 2-point asymmetric Type-II fDCT. */ \
do { \
@@ -82,9 +57,6 @@
} \
while (0)
-#define OD_FDCT_2_ASYM_FLAT OD_FDCT_2_ASYM_PR
-#define OD_IDCT_2_ASYM_FLAT OD_IDCT_2_ASYM_PR
-
#define OD_FDST_2_PR(p0, p1) \
/* Embedded 2-point orthonormal Type-IV fDST. */ \
do { \
@@ -112,24 +84,6 @@
} \
while (0)
-#define OD_FDST_2_FLAT(p0, p1) \
- do { \
- int t_; \
- int u_; \
- t_ = (p0 + p1 + 1) >> 1; \
- /* 21407/16384 ~= Sin[3*Pi/8] + Cos[3*Pi/8] ~= 1.3065629648763766 */ \
- u_ = (p0*21407 + 8192) >> 14; \
- /* 8867/16384 ~= Sin[3*Pi/8] - Cos[3*Pi/8] ~= 0.541196100146197 */ \
- p0 = (p1*8867 + 8192) >> 14; \
- /* 3135/4096 ~= 2*Cos[3*Pi/8] ~= 0.7653668647301796 */ \
- t_ = (t_*3135 + 2048) >> 12; \
- p0 += t_; \
- p1 = u_ - t_; \
- } \
- while (0)
-
-#define OD_IDST_2_FLAT OD_FDST_2_FLAT
-
#define OD_FDST_2_ASYM_PR(p0, p1) \
/* Embedded 2-point asymmetric Type-IV fDST. */ \
do { \
@@ -157,40 +111,6 @@
} \
while (0)
-#define OD_FDST_2_ASYM_FLAT(p0, p0h, p1) \
- /* Embedded 2-point asymmetric Type-IV fDST with flattened rotations. */ \
- do { \
- int t_; \
- int u_; \
- t_ = p0h + p1; \
- /* 15137/16384 ~= (Cos[3*Pi/8] + Sin[3*Pi/8])/Sqrt[2] = 0.9238795325112867 */ \
- u_ = (p0*15137 + 8192) >> 14; \
- /* 3135/4096 ~= (Cos[3*Pi/8] - Sin[3*Pi/8])*Sqrt[2] = 0.7653668647301795 */ \
- p0 = (p1*3135 + 2048) >> 12; \
- /* 8867/16384 ~= Cos[3*Pi/8]*Sqrt[2] = 0.5411961001461971 */ \
- t_ = (t_*8867 + 8192) >> 14; \
- p0 += t_; \
- p1 = u_ - t_; \
- } \
- while (0)
-
-#define OD_IDST_2_ASYM_FLAT(p0, p1) \
- /* Embedded 2-point asymmetric Type-IV iDST with flattened rotations. */ \
- do { \
- int t_; \
- int u_; \
- t_ = (p0 + p1 + 1) >> 1; \
- /* 3135/4096 ~= (Cos[Pi/8] - Sin[Pi/8])*Sqrt[2] = 0.7653668647301795 */ \
- u_ = (p1*3135 + 2048) >> 12; \
- /* 15137/16384 ~= (Cos[Pi/8] + Sin[Pi/8])/Sqrt[2] = 0.9238795325112867 */ \
- p1 = (p0*15137 + 8192) >> 14; \
- /* 8867/8192 ~= 2*Cos[3*Pi/8]*Sqrt[2] = 1.082392200292394 */ \
- t_ = (t_*8867 + 4096) >> 13; \
- p0 = u_ + t_; \
- p1 -= OD_RSHIFT1(t_); \
- } \
- while (0)
-
#define OD_FDCT_4_PR(q0, q2, q1, q3) \
/* Embedded 4-point orthonormal Type-II fDCT. */ \
do { \
@@ -248,61 +168,6 @@
} \
while (0)
-#define OD_FDCT_4_FLAT(q0, q1, q2, q3) \
- /* Embedded 4-point orthonormal Type-II fDCT with flattened rotations. */ \
- do { \
- int q1h; \
- int q3h; \
- q3 = q0 - q3; \
- q3h = OD_RSHIFT1(q3); \
- q0 -= q3h; \
- q1 += q2; \
- q1h = OD_RSHIFT1(q1); \
- q2 -= q1h; \
- OD_FDCT_2_ASYM_FLAT(q0, q1, q1h); \
- OD_FDST_2_ASYM_FLAT(q3, q3h, q2); \
- } \
- while (0)
-
-#define OD_IDCT_4_FLAT(q0, q2, q1, q3) \
- /* Embedded 4-point orthonormal Type-II iDCT with flattened rotations. */ \
- do { \
- int q1h; \
- OD_IDST_2_ASYM_FLAT(q3, q2); \
- OD_IDCT_2_ASYM_FLAT(q0, q1, q1h); \
- q2 += q1h; \
- q1 -= q2; \
- q0 += OD_RSHIFT1(q3); \
- q3 = q0 - q3; \
- } \
- while (0)
-
-#define OD_FDCT_4_ASYM_FLAT(q0, q1, q1h, q2, q3, q3h) \
- /* Embedded 4-point asymmetric Type-II fDCT with flattened rotations. */ \
- do { \
- q0 += q3h; \
- q3 = q0 - q3; \
- q2 -= q1h; \
- q1 += q2; \
- OD_FDCT_2_FLAT(q0, q1); \
- OD_FDST_2_FLAT(q3, q2); \
- } \
- while (0)
-
-#define OD_IDCT_4_ASYM_FLAT(q0, q2, q1, q1h, q3, q3h) \
- /* Embedded 4-point asymmetric Type-II iDCT with flattened rotations. */ \
- do { \
- OD_IDST_2_FLAT(q3, q2); \
- OD_IDCT_2_FLAT(q0, q1); \
- q1 -= q2; \
- q1h = OD_RSHIFT1(q1); \
- q2 += q1h; \
- q3 = q0 - q3; \
- q3h = OD_RSHIFT1(q3); \
- q0 -= q3h; \
- } \
- while (0)
-
#define OD_FDST_4_PR(q0, q2, q1, q3) \
/* Embedded 4-point orthonormal Type-IV fDST. */ \
do { \
@@ -384,90 +249,6 @@
} \
while (0)
-#define OD_FDST_4_FLAT(q0, q1, q2, q3) \
- /* Embedded 4-point orthonormal Type-IV fDST with flattened rotations. */ \
- do { \
- int t_; \
- int u_; \
- t_ = q0 - q3; \
- /* 13623/16384 ~= (Sin[7*Pi/16] + Cos[7*Pi/16])/Sqrt[2] ~=
- 0.8314696123025451 */ \
- u_ = (13623*q3 + 8192) >> 14; \
- /* 18205/16384 ~= (Sin[7*Pi/16] - Cos[7*Pi/16])*Sqrt[2] ~=
- 1.1111404660392046 */ \
- q3 = (18205*q0 + 8192) >> 14; \
- /* 9041/32768 ~= Cos[7*Pi/16]*Sqrt[2] ~= 0.275899379282943 */ \
- t_ = (9041*t_ + 16384) >> 15; \
- q0 = u_ + OD_RSHIFT1(t_); \
- q3 += t_; \
- t_ = q1 + q2; \
- /* 16069/16384 ~= (Sin[5*Pi/16] + Cos[5*Pi/16])/Sqrt[2] ~=
- 0.9807852804032304 */ \
- u_ = (16069*q1 + 8192) >> 14; \
- /* 12785/32768 ~= (Sin[5*Pi/16] - Cos[5*Pi/16])*Sqrt[2] ~=
- 0.3901806440322566 */ \
- q1 = (12785*q2 + 16384) >> 15; \
- /* 12873/16384 ~= Cos[5*Pi/16]*Sqrt[2] ~= 0.7856949583871021 */ \
- t_ = (12873*t_ + 8192) >> 14; \
- q2 = u_ - OD_RSHIFT1(t_); \
- q1 += t_; \
- q2 += OD_RSHIFT1(q3); \
- q3 -= q2; \
- q0 += OD_RSHIFT1(q1); \
- q1 -= q0; \
- t_ = (q1 + q2 + 1) >> 1; \
- /* 11585/8192 ~= 2*Sin[Pi/4] ~= 1.4142135623730951 */ \
- q1 = (11585*q2 + 4096) >> 13; \
- /* 11585/8192 ~= 2*Cos[Pi/4] ~= 1.4142135623730951 */ \
- q2 = (11585*t_ + 4096) >> 13; \
- q1 -= q2; \
- } \
- while (0)
-
-#define OD_IDST_4_FLAT(q0, q1, q2, q3) \
- /* Embedded 4-point orthonormal Type-IV fDST with flattened rotations. */ \
- do { \
- int t_; \
- int u_; \
- int q2h; \
- int q3h; \
- t_ = (q1 + q2 + 1) >> 1; \
- /* 11585/8192 ~= 2*Sin[Pi/4] ~= 1.4142135623730951 */ \
- q2 = (11585*q1 + 4096) >> 13; \
- /* 11585/8192 ~= 2*Cos[Pi/4] ~= 1.4142135623730951 */ \
- q1 = (11585*t_ + 4096) >> 13; \
- q2 -= q1; \
- q2 += q0; \
- q2h = OD_RSHIFT1(q2); \
- q0 -= q2h; \
- q3 += q1; \
- q3h = OD_RSHIFT1(q3); \
- q1 -= q3h; \
- t_ = q1 + q2h; \
- /* 16069/16384 ~= (Sin[5*Pi/16] + Cos[5*Pi/16])/Sqrt[2] ~=
- 0.9807852804032304 */ \
- u_ = (16069*q2 + 8192) >> 14; \
- /* 12785/32768 ~= (Sin[5*Pi/16] - Cos[5*Pi/16])*Sqrt[2] ~=
- 0.3901806440322566 */ \
- q2 = (12785*q1 + 16384) >> 15; \
- /* 12873/16384 ~= Cos[5*Pi/16]*Sqrt[2] ~= 0.7856949583871021 */ \
- t_ = (12873*t_ + 8192) >> 14; \
- q1 = u_ - t_; \
- q2 += t_; \
- t_ = q0 - q3h; \
- /* 13623/16384 ~= (Sin[7*Pi/16] + Cos[7*Pi/16])/Sqrt[2] ~=
- 0.8314696123025451 */ \
- u_ = (13623*q3 + 8192) >> 14; \
- /* 18205/16384 ~= (Sin[7*Pi/16] - Cos[7*Pi/16])*Sqrt[2] ~=
- 1.1111404660392046 */ \
- q3 = (18205*q0 + 8192) >> 14; \
- /* 9041/32768 ~= Cos[7*Pi/16]*Sqrt[2] = 0.275899379282943 */ \
- t_ = (9041*t_ + 16384) >> 15; \
- q0 = u_ + t_; \
- q3 += t_; \
- } \
- while (0)
-
#define OD_FDST_4_ASYM_PR(t0, t0h, t2, t1, t3) \
/* Embedded 4-point asymmetric Type-IV fDST. */ \
do { \
@@ -534,81 +315,6 @@
} \
while (0)
-#define OD_FDST_4_ASYM_FLAT(q0, q0h, q1, q2, q2h, q3) \
- /* Embedded 4-point asymmetric Type-IV fDST with flattened rotations. */ \
- do { \
- int t_; \
- int u_; \
- t_ = q0h - q3; \
- /* 38531/32768 ~= Sin[7*Pi/16] + Cos[7*Pi/16] ~= 1.1758756024193586 */ \
- u_ = (q3*38531 + 16384) >> 15; \
- /* 12873/16384 ~= Sin[7*Pi/16] - Cos[7*Pi/16] ~= 0.7856949583871022 */ \
- q3 = (q0*12873 + 8192) >> 14; \
- /* 12785/32768 ~= 2*Cos[7*Pi/16] ~= 0.3901806440322565 */ \
- t_ = (t_*12785 + 16384) >> 15; \
- q0 = u_ + OD_RSHIFT1(t_); \
- q3 += t_; \
- t_ = q1 + q2h; \
- /* 45451/32768 ~= Sin[5*Pi/16] + Cos[5*Pi/16] ~= 1.3870398453221475 */ \
- u_ = (q1*45451 + 16384) >> 15; \
- /* 9041/32768 ~= Sin[5*Pi/16] - Cos[5*Pi/16] ~= 0.27589937928294306 */ \
- q1 = (q2*9041 + 16384) >> 15; \
- /* 18205/16384 ~= 2*Cos[5*Pi/16] ~= 1.1111404660392044 */ \
- t_ = (t_*18205 + 8192) >> 14; \
- q1 += t_; \
- q2 = u_ - OD_RSHIFT1(t_); \
- q2 += OD_RSHIFT1(q3); \
- q3 -= q2; \
- q0 += OD_RSHIFT1(q1); \
- q1 -= q0; \
- t_ = (q1 + q2 + 1) >> 1; \
- /* 11585/8192 ~= 2*Sin[Pi/4] ~= 1.4142135623730951 */ \
- q1 = (q2*11585 + 4096) >> 13; \
- /* 11585/8192 ~= 2*Cos[Pi/4] ~= 1.4142135623730951 */ \
- q2 = (t_*11585 + 4096) >> 13; \
- q1 -= q2; \
- } \
- while (0)
-
-#define OD_IDST_4_ASYM_FLAT(q0, q2, q1, q3) \
- do { \
- int t_; \
- int u_; \
- int q1h; \
- int q3h; \
- t_ = (q1 + q2 + 1) >> 1; \
- /* 11585/8192 ~= 2*Sin[Pi/4] ~= 1.4142135623730951 */ \
- q1 = (q2*11585 + 4096) >> 13; \
- /* 11585/8192 ~= 2*Cos[Pi/4] ~= 1.4142135623730951 */ \
- q2 = (t_*11585 + 4096) >> 13; \
- q1 -= q2; \
- q1 += q0; \
- q1h = OD_RSHIFT1(q1); \
- q0 -= q1h; \
- q3 += q2; \
- q3h = OD_RSHIFT1(q3); \
- q2 -= q3h; \
- t_ = q1h + q2; \
- /* 45451/32768 ~= Sin[5*Pi/16] + Cos[5*Pi/16] ~= 1.3870398453221475 */ \
- u_ = (q1*45451 + 16384) >> 15; \
- /* 9041/32768 ~= Sin[5*Pi/16] - Cos[5*Pi/16] ~= 0.27589937928294306 */ \
- q1 = (q2*9041 + 16384) >> 15; \
- /* 18205/16384 ~= 2*Cos[5*Pi/16] ~= 1.1111404660392044 */ \
- t_ = (t_*18205 + 8192) >> 14; \
- q1 += OD_RSHIFT1(t_); \
- q2 = u_ - t_; \
- t_ = q0 - q3h; \
- /* 38531/32768 ~= Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586 */ \
- u_ = (q3*38531 + 16384) >> 15; \
- /* 12873/16384 ~= Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 */ \
- q3 = (q0*12873 + 8192) >> 14; \
- /* 12785/32768 ~= 2*Cos[7*Pi/16] = 0.3901806440322565 */ \
- t_ = (t_*12785 + 16384) >> 15; \
- q3 += OD_RSHIFT1(t_); \
- q0 = u_ + t_; \
- } \
- while (0)
-
#define OD_FDCT_8_PR(r0, r4, r2, r6, r1, r5, r3, r7) \
/* Embedded 8-point orthonormal Type-II fDCT. */ \
do { \
@@ -654,48 +360,6 @@
} \
while (0)
-#define OD_FDCT_8_FLAT(r0, r1, r2, r3, r4, r5, r6, r7) \
- /* Embedded 8-point orthonormal Type-II fDCT with flattened rotations. */ \
- do { \
- int r1h; \
- int r3h; \
- int r5h; \
- int r7h; \
- r7 = r0 - r7; \
- r7h = OD_RSHIFT1(r7); \
- r0 -= r7h; \
- r1 += r6; \
- r1h = OD_RSHIFT1(r1); \
- r6 -= r1h; \
- r5 = r2 - r5; \
- r5h = OD_RSHIFT1(r5); \
- r2 -= r5h; \
- r3 += r4; \
- r3h = OD_RSHIFT1(r3); \
- r4 -= r3h; \
- OD_FDCT_4_ASYM_FLAT(r0, r1, r1h, r2, r3, r3h); \
- OD_FDST_4_ASYM_FLAT(r7, r7h, r6, r5, r5h, r4); \
- } \
- while (0)
-
-#define OD_IDCT_8_FLAT(r0, r4, r2, r6, r1, r5, r3, r7) \
- /* Embedded 8-point orthonormal Type-II iDCT with flattened rotations. */ \
- do { \
- int r1h; \
- int r3h; \
- OD_IDST_4_ASYM_FLAT(r7, r5, r6, r4); \
- OD_IDCT_4_ASYM_FLAT(r0, r2, r1, r1h, r3, r3h); \
- r4 += r3h; \
- r3 -= r4; \
- r2 += OD_RSHIFT1(r5); \
- r5 = r2 - r5; \
- r6 += r1h; \
- r1 -= r6; \
- r0 += OD_RSHIFT1(r7); \
- r7 = r0 - r7; \
- } \
- while (0)
-
#define OD_FDCT_8_ASYM_PR(r0, r4, r4h, r2, r6, r6h, r1, r5, r5h, r3, r7, r7h) \
/* Embedded 8-point asymmetric Type-II fDCT. */ \
do { \
@@ -732,42 +396,6 @@
} \
while (0)
-#define OD_FDCT_8_ASYM_FLAT(r0, r1, r1h, r2, r3, r3h, r4, r5, r5h, r6, r7, r7h) \
- /* Embedded 8-point asymmetric Type-II fDCT. */ \
- do { \
- r0 += r7h; \
- r7 = r0 - r7; \
- r4 -= r3h; \
- r3 += r4; \
- r2 += r5h; \
- r5 = r2 - r5; \
- r6 -= r1h; \
- r1 += r6; \
- OD_FDCT_4_FLAT(r0, r1, r2, r3); \
- OD_FDST_4_FLAT(r7, r6, r5, r4); \
- } \
- while (0)
-
-#define OD_IDCT_8_ASYM_FLAT(r0, r4, r2, r6, r1, r1h, r5, r5h, r3, r3h, r7, r7h) \
- /* Embedded 8-point asymmetric Type-II iDCT with flattened rotations. */ \
- do { \
- OD_IDST_4_FLAT(r7, r5, r6, r4); \
- OD_IDCT_4_FLAT(r0, r2, r1, r3); \
- r7 = r0 - r7; \
- r7h = OD_RSHIFT1(r7); \
- r0 -= r7h; \
- r1 -= r6; \
- r1h = OD_RSHIFT1(r1); \
- r6 += r1h; \
- r5 = r2 - r5; \
- r5h = OD_RSHIFT1(r5); \
- r2 -= r5h; \
- r3 -= r4; \
- r3h = OD_RSHIFT1(r3); \
- r4 += r3h; \
- } \
- while (0)
-
#define OD_FDST_8_PR(t0, t4, t2, t6, t1, t5, t3, t7) \
/* Embedded 8-point orthonormal Type-IV fDST. */ \
do { \
@@ -937,190 +565,6 @@
} \
while (0)
-#define OD_FDST_8_FLAT(r0, r1, r2, r3, r4, r5, r6, r7) \
- /* Embedded 8-point Type-IV fDST with flattened rotations. */ \
- do { \
- int t_; \
- int u_; \
- int r0h; \
- int r2h; \
- int r5h; \
- int r7h; \
- t_ = r3 - r4; \
- /* 23059/16384 ~= Sin[9*Pi/32] + Cos[9*Pi/32] ~= 1.4074037375263826 */ \
- u_ = (23059*r4 + 8192) >> 14; \
- /* 2271/16384 ~= Sin[9*Pi/32] - Cos[9*Pi/32] ~= 0.1386171691990915 */ \
- r4 = (2271*r3 + 8192) >> 14; \
- /* 5197/8192 ~= Cos[9*Pi/32] ~= 0.6343932841636455 */ \
- t_ = (5197*t_ + 4096) >> 13; \
- r3 = u_ + t_; \
- r4 += t_; \
- t_ = r2 + r5; \
- /* 22173/16384 ~= Sin[11*Pi/32] + Cos[11*Pi/32] ~= 1.3533180011743526 */ \
- u_ = (22173*r2 + 8192) >> 14; \
- /* 3363/8192 ~= Sin[11*Pi/32] - Cos[11*Pi/32] ~= 0.4105245275223574 */ \
- r2 = (3363*r5 + 4096) >> 13; \
- /* 15447/32768 ~= Cos[11*Pi/32] ~= 0.47139673682599764 */ \
- t_ = (15447*t_ + 16384) >> 15; \
- r2 += t_; \
- r5 = u_ - t_; \
- t_ = r1 - r6; \
- /* 40869/32768 ~= Sin[13*Pi/32] + Cos[13*Pi/32] ~= 1.247225012986671 */ \
- u_ = (40869*r6 + 16384) >> 15; \
- /* 21845/32768 ~= Sin[13*Pi/32] - Cos[13*Pi/32] ~= 0.6666556584777465 */ \
- r6 = (21845*r1 + 16384) >> 15; \
- /* 1189/4096 ~= Cos[13*Pi/32] ~= 0.29028467725446233 */ \
- t_ = (1189*t_ + 2048) >> 12; \
- r1 = u_ + t_; \
- r6 += t_; \
- t_ = r0 + r7; \
- /* 17911/16384 ~= Sin[15*Pi/32] + Cos[15*Pi/32] ~= 1.0932018670017576 */ \
- u_ = (17911*r0 + 8192) >> 14; \
- /* 14699/16384 ~= Sin[15*Pi/32] - Cos[15*Pi/32] ~= 0.8971675863426363 */ \
- r0 = (14699*r7 + 8192) >> 14; \
- /* 803/8192 ~= Cos[15*Pi/32] ~= 0.0980171403295606 */ \
- t_ = (803*t_ + 4096) >> 13; \
- r0 += t_; \
- r7 = u_ - t_; \
- r2 -= r1; \
- r2h = OD_RSHIFT1(r2); \
- r1 += r2h; \
- r5 += r6; \
- r5h = OD_RSHIFT1(r5); \
- r6 -= r5h; \
- r0 += r3; \
- r0h = OD_RSHIFT1(r0); \
- r3 -= r0h; \
- r7 -= r4; \
- r7h = OD_RSHIFT1(r7); \
- r4 += r7h; \
- r3 += r5h; \
- r5 -= r3; \
- r1 -= r0h; \
- r0 += r1; \
- r4 += r2h; \
- r2 -= r4; \
- r6 += r7h; \
- r7 -= r6; \
- t_ = (r4 - r3 + 1) >> 1; \
- /* 21407/16384 ~= Sin[3*Pi/8] + Cos[3*Pi/8] ~= 1.3065629648763766 */ \
- u_ = (21407*r3 + 8192) >> 14; \
- /* 8867/16384 ~= Sin[3*Pi/8] - Cos[3*Pi/8] ~= 0.5411961001461969 */ \
- r3 = (8867*r4 + 8192) >> 14; \
- /* 3135/4096 ~= 2*Cos[3*Pi/8] ~= 0.7653668647301796 */ \
- t_ = (3135*t_ + 2048) >> 12; \
- r3 += t_; \
- r4 = u_ + t_; \
- t_ = (r2 - r5 + 1) >> 1; \
- /* 21407/16384 ~= Sin[3*Pi/8] + Cos[3*Pi/8] ~= 1.3065629648763766 */ \
- u_ = (21407*r2 + 8192) >> 14; \
- /* 8867/16384 ~= Sin[3*Pi/8] - Cos[3*Pi/8] ~= 0.5411961001461969 */ \
- r2 = (8867*r5 + 8192) >> 14; \
- /* 3135/4096 ~= 2*Cos[3*Pi/8] ~= 0.7653668647301796 */ \
- t_ = (3135*t_ + 2048) >> 12; \
- r5 = t_ - u_; \
- r2 -= t_; \
- t_ = (r6 - r1 + 1) >> 1; \
- /* 11585/8192 ~= 2*Sin[Pi/4] ~= 1.4142135623730951 */ \
- r6 = (11585*r1 + 4096) >> 13; \
- /* 11585/8192 ~= 2*Cos[Pi/4] ~= 1.4142135623730951 */ \
- r1 = (11585*t_ + 4096) >> 13; \
- r6 += r1; \
- } \
- while (0)
-
-#define OD_IDST_8_FLAT(r0, r4, r2, r6, r1, r5, r3, r7) \
- /* Embedded 8-point Type-IV iDST with flattened rotations. */ \
- do { \
- int t_; \
- int u_; \
- int r0h; \
- int r2h; \
- int r5h; \
- int r7h; \
- t_ = (r1 + r6 + 1) >> 1; \
- /* 11585/8192 ~= 2*Sin[Pi/4] ~= 1.4142135623730951 */ \
- r1 = (11585*r6 + 4096) >> 13; \
- /* 11585/8192 ~= 2*Cos[Pi/4] ~= 1.4142135623730951 */ \
- r6 = (11585*t_ + 4096) >> 13; \
- r1 -= r6; \
- t_ = (r5 - r2 + 1) >> 1; \
- /* 21407/16384 ~= Sin[3*Pi/8] + Cos[3*Pi/8] ~= 1.3065629648763766 */ \
- u_ = (21407*r5 + 8192) >> 14; \
- /* 8867/16384 ~= Sin[3*Pi/8] - Cos[3*Pi/8] ~= 0.5411961001461969 */ \
- r5 = (8867*r2 + 8192) >> 14; \
- /* 3135/4096 ~= 2*Cos[3*Pi/8] ~= 0.7653668647301796 */ \
- t_ = (3135*t_ + 2048) >> 12; \
- r5 -= t_; \
- r2 = t_ - u_; \
- t_ = (r3 + r4 + 1) >> 1; \
- /* 21407/16384 ~= Sin[3*Pi/8] + Cos[3*Pi/8] ~= 1.3065629648763766 */ \
- u_ = (21407*r4 + 8192) >> 14; \
- /* 8867/16384 ~= Sin[3*Pi/8] - Cos[3*Pi/8] ~= 0.5411961001461969 */ \
- r4 = (8867*r3 + 8192) >> 14; \
- /* 3135/4096 ~= 2*Cos[3*Pi/8] ~= 0.7653668647301796 */ \
- t_ = (3135*t_ + 2048) >> 12; \
- r3 = u_ - t_; \
- r4 += t_; \
- r7 += r6; \
- r7h = OD_RSHIFT1(r7); \
- r6 -= r7h; \
- r2 += r4; \
- r2h = OD_RSHIFT1(r2); \
- r4 -= r2h; \
- r0 -= r1; \
- r0h = OD_RSHIFT1(r0); \
- r1 += r0h; \
- r5 += r3; \
- r5h = OD_RSHIFT1(r5); \
- r3 -= r5h; \
- r4 -= r7h; \
- r7 += r4; \
- r6 += r5h; \
- r5 -= r6; \
- r3 += r0h; \
- r0 -= r3; \
- r1 -= r2h; \
- r2 += r1; \
- t_ = r0 + r7; \
- /* 17911/16384 ~= Sin[15*Pi/32] + Cos[15*Pi/32] ~= 1.0932018670017576 */ \
- u_ = (17911*r0 + 8192) >> 14; \
- /* 14699/16384 ~= Sin[15*Pi/32] - Cos[15*Pi/32] ~= 0.8971675863426363 */ \
- r0 = (14699*r7 + 8192) >> 14; \
- /* 803/8192 ~= Cos[15*Pi/32] ~= 0.0980171403295606 */ \
- t_ = (803*t_ + 4096) >> 13; \
- r7 = u_ - t_; \
- r0 += t_; \
- t_ = r1 - r6; \
- /* 40869/32768 ~= Sin[13*Pi/32] + Cos[13*Pi/32] ~= 1.247225012986671 */ \
- u_ = (40869*r6 + 16384) >> 15; \
- /* 21845/32768 ~= Sin[13*Pi/32] - Cos[13*Pi/32] ~= 0.6666556584777465 */ \
- r6 = (21845*r1 + 16384) >> 15; \
- /* 1189/4096 ~= Cos[13*Pi/32] ~= 0.29028467725446233 */ \
- t_ = (1189*t_ + 2048) >> 12; \
- r1 = u_ + t_; \
- r6 += t_; \
- t_ = r2 + r5; \
- /* 22173/16384 ~= Sin[11*Pi/32] + Cos[11*Pi/32] ~= 1.3533180011743526 */ \
- u_ = (22173*r2 + 8192) >> 14; \
- /* 3363/8192 ~= Sin[11*Pi/32] - Cos[11*Pi/32] ~= 0.4105245275223574 */ \
- r2 = (3363*r5 + 4096) >> 13; \
- /* 15447/32768 ~= Cos[11*Pi/32] ~= 0.47139673682599764 */ \
- t_ = (15447*t_ + 16384) >> 15; \
- r5 = u_ - t_; \
- r2 += t_; \
- t_ = r3 - r4; \
- /* 23059/16384 ~= Sin[9*Pi/32] + Cos[9*Pi/32] ~= 1.4074037375263826 */ \
- u_ = (23059*r4 + 8192) >> 14; \
- /* 2271/16384 ~= Sin[9*Pi/32] - Cos[9*Pi/32] ~= 0.1386171691990915 */ \
- r4 = (2271*r3 + 8192) >> 14; \
- /* 5197/8192 ~= Cos[9*Pi/32] ~= 0.6343932841636455 */ \
- t_ = (5197*t_ + 4096) >> 13; \
- r3 = u_ + t_; \
- r4 += t_; \
- } \
- while (0)
-
/* Rewrite this so that t0h can be passed in. */
#define OD_FDST_8_ASYM_PR(t0, t4, t2, t6, t1, t5, t3, t7) \
/* Embedded 8-point asymmetric Type-IV fDST. */ \
@@ -1287,205 +731,6 @@
} \
while (0)
-#define OD_FDST_8_ASYM_FLAT(r0, r0h, r1, r2, r2h, r3, \
- r4, r4h, r5, r6, r6h, r7) \
- /* Embedded 8-point asymmetric Type-IV fDST with flattened rotations. */ \
- do { \
- int t_; \
- int u_; \
- int r5h; \
- int r7h; \
- t_ = r3 - r4h; \
- /* 16305/16384 ~= (Sin[9*Pi/32] + Cos[9*Pi/32])/Sqrt[2] ~=
- 0.9951847266721969 */ \
- u_ = (16305*r4 + 8192) >> 14; \
- /* 803/4096 ~= (Sin[9*Pi/32] - Cos[9*Pi/32])*Sqrt[2] ~=
- 0.1960342806591213 */ \
- r4 = (803*r3 + 2048) >> 12; \
- /* 14699/16384 ~= Cos[9*Pi/32]*Sqrt[2] ~= 0.8971675863426364 */ \
- t_ = (14699*t_ + 8192) >> 14; \
- r3 = u_ + t_; \
- r4 += t_; \
- t_ = r2h + r5; \
- /* 31357/32768 ~= (Sin[11*Pi/32] + Cos[11*Pi/32])/Sqrt[2]
- ~= 0.9569403357322087 */ \
- u_ = (31357*r2 + 16384) >> 15; \
- /* 1189/2048 ~= (Sin[11*Pi/32] - Cos[11*Pi/32])*Sqrt[2] ~=
- 0.5805693545089248 */ \
- r2 = (1189*r5 + 1024) >> 11; \
- /* 21845/32768 ~= Cos[11*Pi/32] ~= 0.6666556584777465 */ \
- t_ = (21845*t_ + 16384) >> 15; \
- r2 += t_; \
- r5 = u_ - t_; \
- t_ = r1 - r6h; \
- /* 28899/32768 ~= (Sin[13*Pi/32] + Cos[13*Pi/32])/Sqrt[2] ~=
- 0.8819212643483548 */ \
- u_ = (28899*r6 + 16384) >> 15; \
- /* 30893/32768 ~= (Sin[13*Pi/32] - Cos[13*Pi/32])*Sqrt[2] ~=
- 0.942793473651995297112775 */ \
- r6 = (30893*r1 + 16384) >> 15; \
- /* 3363/8192 ~= Cos[13*Pi/32]*Sqrt[2] ~= 0.41052452752235735 */ \
- t_ = (3363*t_ + 4096) >> 13; \
- r1 = u_ + t_; \
- r6 += t_; \
- t_ = r0h + r7; \
- /* 12665/16384 ~= (Sin[15*Pi/32] + Cos[15*Pi/32])/Sqrt[2] ~=
- 0.773010453362737 */ \
- u_ = (12665*r0 + 8192) >> 14; \
- /* 5197/4096 ~= (Sin[15*Pi/32] - Cos[15*Pi/32])*Sqrt[2] ~=
- 1.268786568327291 */ \
- r0 = (5197*r7 + 2048) >> 12; \
- /* 2271/16384 ~= Cos[15*Pi/32]*Sqrt[2] ~= 0.13861716919909148 */ \
- t_ = (2271*t_ + 8192) >> 14; \
- r0 += t_; \
- r7 = u_ - t_; \
- r2 -= r1; \
- r2h = OD_RSHIFT1(r2); \
- r1 += r2h; \
- r5 += r6; \
- r5h = OD_RSHIFT1(r5); \
- r6 -= r5h; \
- r0 += r3; \
- r0h = OD_RSHIFT1(r0); \
- r3 -= r0h; \
- r7 -= r4; \
- r7h = OD_RSHIFT1(r7); \
- r4 += r7h; \
- r3 += r5h; \
- r5 -= r3; \
- r1 -= r0h; \
- r0 += r1; \
- r4 += r2h; \
- r2 -= r4; \
- r6 += r7h; \
- r7 -= r6; \
- t_ = (r4 - r3 + 1) >> 1; \
- /* 21407/16384 ~= Sin[3*Pi/8] + Cos[3*Pi/8] ~= 1.3065629648763766 */ \
- u_ = (21407*r3 + 8192) >> 14; \
- /* 8867/16384 ~= Sin[3*Pi/8] - Cos[3*Pi/8] ~= 0.5411961001461969 */ \
- r3 = (8867*r4 + 8192) >> 14; \
- /* 3135/4096 ~= 2*Cos[3*Pi/8] ~= 0.7653668647301796 */ \
- t_ = (3135*t_ + 2048) >> 12; \
- r3 += t_; \
- r4 = u_ + t_; \
- t_ = (r2 - r5 + 1) >> 1; \
- /* 21407/16384 ~= Sin[3*Pi/8] + Cos[3*Pi/8] ~= 1.3065629648763766 */ \
- u_ = (21407*r2 + 8192) >> 14; \
- /* 8867/16384 ~= Sin[3*Pi/8] - Cos[3*Pi/8] ~= 0.5411961001461969 */ \
- r2 = (8867*r5 + 8192) >> 14; \
- /* 3135/4096 ~= 2*Cos[3*Pi/8] ~= 0.7653668647301796 */ \
- t_ = (3135*t_ + 2048) >> 12; \
- r5 = t_ - u_; \
- r2 -= t_; \
- t_ = (r6 - r1 + 1) >> 1; \
- /* 11585/8192 ~= 2*Sin[Pi/4] ~= 1.4142135623730951 */ \
- r6 = (11585*r1 + 4096) >> 13; \
- /* 11585/8192 ~= 2*Cos[Pi/4] ~= 1.4142135623730951 */ \
- r1 = (11585*t_ + 4096) >> 13; \
- r6 += r1; \
- } \
- while (0)
-
-#define OD_IDST_8_ASYM_FLAT(r0, r4, r2, r6, r1, r5, r3, r7) \
- /* Embedded 8-point asymmetric Type-IV iDST with flattened rotations. */ \
- do { \
- int t_; \
- int u_; \
- int r0h; \
- int r2h; \
- int r5h; \
- int r7h; \
- t_ = (r1 + r6 + 1) >> 1; \
- /* 11585/8192 ~= 2*Sin[Pi/4] ~= 1.4142135623730951 */ \
- r1 = (11585*r6 + 4096) >> 13; \
- /* 11585/8192 ~= 2*Cos[Pi/4] ~= 1.4142135623730951 */ \
- r6 = (11585*t_ + 4096) >> 13; \
- r1 -= r6; \
- t_ = (r5 - r2 + 1) >> 1; \
- /* 21407/16384 ~= Sin[3*Pi/8] + Cos[3*Pi/8] ~= 1.3065629648763766 */ \
- u_ = (21407*r5 + 8192) >> 14; \
- /* 8867/16384 ~= Sin[3*Pi/8] - Cos[3*Pi/8] ~= 0.5411961001461969 */ \
- r5 = (8867*r2 + 8192) >> 14; \
- /* 3135/4096 ~= 2*Cos[3*Pi/8] ~= 0.7653668647301796 */ \
- t_ = (3135*t_ + 2048) >> 12; \
- r5 -= t_; \
- r2 = t_ - u_; \
- t_ = (r3 + r4 + 1) >> 1; \
- /* 21407/16384 ~= Sin[3*Pi/8] + Cos[3*Pi/8] ~= 1.3065629648763766 */ \
- u_ = (21407*r4 + 8192) >> 14; \
- /* 8867/16384 ~= Sin[3*Pi/8] - Cos[3*Pi/8] ~= 0.5411961001461969 */ \
- r4 = (8867*r3 + 8192) >> 14; \
- /* 3135/4096 ~= 2*Cos[3*Pi/8] ~= 0.7653668647301796 */ \
- t_ = (3135*t_ + 2048) >> 12; \
- r3 = u_ - t_; \
- r4 += t_; \
- r7 += r6; \
- r7h = OD_RSHIFT1(r7); \
- r6 -= r7h; \
- r2 += r4; \
- r2h = OD_RSHIFT1(r2); \
- r4 -= r2h; \
- r0 -= r1; \
- r0h = OD_RSHIFT1(r0); \
- r1 += r0h; \
- r5 += r3; \
- r5h = OD_RSHIFT1(r5); \
- r3 -= r5h; \
- r4 -= r7h; \
- r7 += r4; \
- r6 += r5h; \
- r5 -= r6; \
- r3 += r0h; \
- r0 -= r3; \
- r1 -= r2h; \
- r2 += r1; \
- t_ = r0 + r7; \
- /* 12665/16384 ~= (Sin[15*Pi/32] + Cos[15*Pi/32])/Sqrt[2] ~=
- 0.773010453362737 */ \
- u_ = (12665*r0 + 8192) >> 14; \
- /* 5197/4096 ~= (Sin[15*Pi/32] - Cos[15*Pi/32])*Sqrt[2] ~=
- 1.268786568327291 */ \
- r0 = (5197*r7 + 2048) >> 12; \
- /* 2271/16384 ~= Cos[15*Pi/32]*Sqrt[2] ~= 0.13861716919909148 */ \
- t_ = (2271*t_ + 8192) >> 14; \
- r7 = u_ - OD_RSHIFT1(t_); \
- r0 += t_; \
- t_ = r1 - r6; \
- /* 28899/32768 ~= (Sin[13*Pi/32] + Cos[13*Pi/32])/Sqrt[2] ~=
- 0.8819212643483548 */ \
- u_ = (28899*r6 + 16384) >> 15; \
- /* 30893/32768 ~= (Sin[13*Pi/32] - Cos[13*Pi/32])*Sqrt[2] ~=
- 0.942793473651995297112775 */ \
- r6 = (30893*r1 + 16384) >> 15; \
- /* 3363/8192 ~= Cos[13*Pi/32]*Sqrt[2] ~= 0.41052452752235735 */ \
- t_ = (3363*t_ + 4096) >> 13; \
- r1 = u_ + OD_RSHIFT1(t_); \
- r6 += t_; \
- t_ = r2 + r5; \
- /* 31357/32768 ~= (Sin[11*Pi/32] + Cos[11*Pi/32])/Sqrt[2]
- ~= 0.9569403357322087 */ \
- u_ = (31357*r2 + 16384) >> 15; \
- /* 1189/2048 ~= (Sin[11*Pi/32] - Cos[11*Pi/32])*Sqrt[2] ~=
- 0.5805693545089248 */ \
- r2 = (1189*r5 + 1024) >> 11; \
- /* 21845/32768 ~= Cos[11*Pi/32] ~= 0.6666556584777465 */ \
- t_ = (21845*t_ + 16384) >> 15; \
- r5 = u_ - OD_RSHIFT1(t_); \
- r2 += t_; \
- t_ = r3 - r4; \
- /* 16305/16384 ~= (Sin[9*Pi/32] + Cos[9*Pi/32])/Sqrt[2] ~=
- 0.9951847266721969 */ \
- u_ = (16305*r4 + 8192) >> 14; \
- /* 803/4096 ~= (Sin[9*Pi/32] - Cos[9*Pi/32])*Sqrt[2] ~=
- 0.1960342806591213 */ \
- r4 = (803*r3 + 2048) >> 12; \
- /* 14699/16384 ~= Cos[9*Pi/32]*Sqrt[2] ~= 0.8971675863426364 */ \
- t_ = (14699*t_ + 8192) >> 14; \
- r3 = u_ + OD_RSHIFT1(t_); \
- r4 += t_; \
- } \
- while (0)
-
#define OD_FDCT_16_PR(s0, s8, s4, sc, s2, sa, s6, se, \
s1, s9, s5, sd, s3, sb, s7, sf) \
/* Embedded 16-point orthonormal Type-II fDCT. */ \
@@ -1552,76 +797,6 @@
} \
while (0)
-#define OD_FDCT_16_FLAT(s0, s1, s2, s3, s4, s5, s6, s7, \
- s8, s9, sa, sb, sc, sd, se, sf) \
- /* Embedded 16-point orthonormal Type-II fDCT with flattened rotations. */ \
- do { \
- int s1h; \
- int s3h; \
- int s5h; \
- int s7h; \
- int s9h; \
- int sbh; \
- int sdh; \
- int sfh; \
- sf = s0 - sf; \
- sfh = OD_RSHIFT1(sf); \
- s0 -= sfh; \
- s1 += se; \
- s1h = OD_RSHIFT1(s1); \
- se -= s1h; \
- sd = s2 - sd; \
- sdh = OD_RSHIFT1(sd); \
- s2 -= sdh; \
- s3 += sc; \
- s3h = OD_RSHIFT1(s3); \
- sc -= s3h; \
- sb = s4 - sb; \
- sbh = OD_RSHIFT1(sb); \
- s4 -= sbh; \
- s5 += sa; \
- s5h = OD_RSHIFT1(s5); \
- sa -= s5h; \
- s9 = s6 - s9; \
- s9h = OD_RSHIFT1(s9); \
- s6 -= s9h; \
- s7 += s8; \
- s7h = OD_RSHIFT1(s7); \
- s8 -= s7h; \
- OD_FDCT_8_ASYM_FLAT(s0, s1, s1h, s2, s3, s3h, s4, s5, s5h, s6, s7, s7h); \
- OD_FDST_8_ASYM_FLAT(sf, sfh, se, sd, sdh, sc, sb, sbh, sa, s9, s9h, s8); \
- } \
- while (0)
-
-#define OD_IDCT_16_FLAT(s0, s8, s4, sc, s2, sa, s6, se, \
- s1, s9, s5, sd, s3, sb, s7, sf) \
- /* Embedded 16-point orthonormal Type-II iDCT with flattened rotations. */ \
- do { \
- int s1h; \
- int s3h; \
- int s5h; \
- int s7h; \
- OD_IDST_8_ASYM_FLAT(sf, sb, sd, s9, se, sa, sc, s8); \
- OD_IDCT_8_ASYM_FLAT(s0, s4, s2, s6, s1, s1h, s5, s5h, s3, s3h, s7, s7h); \
- s8 += s7h; \
- s7 -= s8; \
- s6 += OD_RSHIFT1(s9); \
- s9 = s6 - s9; \
- sa += s5h; \
- s5 -= sa; \
- s4 += OD_RSHIFT1(sb); \
- sb = s4 - sb; \
- sc += s3h; \
- s3 -= sc; \
- s2 += OD_RSHIFT1(sd); \
- sd = s2 - sd; \
- se += s1h; \
- s1 -= se; \
- s0 += OD_RSHIFT1(sf); \
- sf = s0 - sf; \
- } \
- while (0)
-
#define OD_FDCT_16_ASYM_PR(t0, t8, t8h, t4, tc, tch, t2, ta, tah, t6, te, teh, \
t1, t9, t9h, t5, td, tdh, t3, tb, tbh, t7, tf, tfh) \
/* Embedded 16-point asymmetric Type-II fDCT. */ \
@@ -1680,64 +855,6 @@
} \
while (0)
-#define OD_FDCT_16_ASYM_FLAT(t0, t8, t8h, t4, tc, tch, t2, ta, tah, t6, \
- te, teh, t1, t9, t9h, t5, td, tdh, t3, tb, tbh, t7, tf, tfh) \
- /* Embedded 16-point asymmetric Type-II fDCT with flattened rotations. */ \
- do { \
- t0 += tfh; \
- tf = t0 - tf; \
- t1 -= teh; \
- te += t1; \
- t2 += tdh; \
- td = t2 - td; \
- t3 -= tch; \
- tc += t3; \
- t4 += tbh; \
- tb = t4 - tb; \
- t5 -= tah; \
- ta += t5; \
- t6 += t9h; \
- t9 = t6 - t9; \
- t7 -= t8h; \
- t8 += t7; \
- OD_FDCT_8_FLAT(t0, t8, t4, tc, t2, ta, t6, te); \
- OD_FDST_8_FLAT(tf, t7, tb, t3, td, t5, t9, t1); \
- } \
- while (0)
-
-#define OD_IDCT_16_ASYM_FLAT(t0, t8, t4, tc, t2, ta, t6, te, \
- t1, t1h, t9, t9h, t5, t5h, td, tdh, t3, t3h, tb, tbh, t7, t7h, tf, tfh) \
- /* Embedded 16-point asymmetric Type-II iDCT with flattened rotations. */ \
- do { \
- OD_IDST_8_FLAT(tf, tb, td, t9, te, ta, tc, t8); \
- OD_IDCT_8_FLAT(t0, t4, t2, t6, t1, t5, t3, t7); \
- t1 -= te; \
- t1h = OD_RSHIFT1(t1); \
- te += t1h; \
- t9 = t6 - t9; \
- t9h = OD_RSHIFT1(t9); \
- t6 -= t9h; \
- t5 -= ta; \
- t5h = OD_RSHIFT1(t5); \
- ta += t5h; \
- td = t2 - td; \
- tdh = OD_RSHIFT1(td); \
- t2 -= tdh; \
- t3 -= tc; \
- t3h = OD_RSHIFT1(t3); \
- tc += t3h; \
- tb = t4 - tb; \
- tbh = OD_RSHIFT1(tb); \
- t4 -= tbh; \
- t7 -= t8; \
- t7h = OD_RSHIFT1(t7); \
- t8 += t7h; \
- tf = t0 - tf; \
- tfh = OD_RSHIFT1(tf); \
- t0 -= tfh; \
- } \
- while (0)
-
#define OD_FDST_16_PR(s0, s8, s4, sc, s2, sa, s6, se, \
s1, s9, s5, sd, s3, sb, s7, sf) \
/* Embedded 16-point orthonormal Type-IV fDST. */ \
@@ -2119,471 +1236,6 @@
} \
while (0)
-#define OD_FDST_16_FLAT(s0, s8, s4, sc, s2, sa, s6, se, \
- s1, s9, s5, sd, s3, sb, s7, sf) \
- /* Embedded 16-point orthonormal Type-IV fDST with flattened rotations. */ \
- do { \
- int t_; \
- int u_; \
- int s0h; \
- int s4h; \
- int sbh; \
- int sfh; \
- t_ = s1 + se; \
- /* 32729/32768 ~= (Sin[17*Pi/64] + Cos[17*Pi/64])/Sqrt[2] ~=
- 0.9987954562051723 */ \
- u_ = (se*32729 + 16384) >> 15; \
- /* 201/2048 ~= (Sin[17*Pi/64] - Cos[17*Pi/64])*Sqrt[2] ~=
- 0.09813534865483615 */ \
- se = (s1*201 + 1024) >> 11; \
- /* 31121/32768 ~= Cos[17*Pi/64]*Sqrt[2] = 0.9497277818777543 */ \
- t_ = (t_*31121 + 16384) >> 15; \
- se += t_; \
- s1 = u_ - OD_RSHIFT1(t_); \
- t_ = s6 - s9; \
- /* 32413/32768 ~= (Sin[19*Pi/64] + Cos[19*Pi/64])/Sqrt[2] ~=
- 0.9891765099647809 */ \
- u_ = (s9*32413 + 16384) >> 15; \
- /* 601/2048 ~= (Sin[19*Pi/64] - Cos[19*Pi/64])*Sqrt[2]
- ~= 0.29346094891072355 */ \
- s9 = (s6*601 + 1024) >> 11; \
- /* 27605/32768 ~= Cos[19*Pi/64]*Sqrt[2] = 0.8424460355094193 */ \
- t_ = (t_*27605 + 16384) >> 15; \
- s9 += t_; \
- s6 = u_ + OD_RSHIFT1(t_); \
- t_ = s5 + sa; \
- /* 15893/16384 ~= (Sin[21*Pi/64] + Cos[21*Pi/64])/Sqrt[2] ~=
- 0.970031253194544 */ \
- u_ = (sa*15893 + 8192) >> 14; \
- /* 3981/8192 ~= (Sin[21*Pi/64] - Cos[21*Pi/64])*Sqrt[2] ~=
- 0.48596035980652796 */ \
- sa = (s5*3981 + 4096) >> 13; \
- /* 1489/2048 ~= Cos[21*Pi/64]*Sqrt[2] ~= 0.72705107329128 */ \
- t_ = (t_*1489 + 1024) >> 11; \
- sa += t_; \
- s5 = OD_RSHIFT1(t_) - u_; \
- t_ = sd - s2; \
- /* 30853/32768 ~= (Sin[23*Pi/64] + Cos[23*Pi/64])/Sqrt[2] ~=
- 0.9415440651830208 */ \
- u_ = (sd*30853 + 16384) >> 15; \
- /* 11039/16384 ~= (Sin[23*Pi/64] - Cos[23*Pi/64])*Sqrt[2] ~=
- 0.6737797067844402 */ \
- sd = (s2*11039 + 8192) >> 14; \
- /* 19813/32768 ~= Cos[23*Pi/64]*Sqrt[2] ~= 0.6046542117908008 */ \
- t_ = (t_*19813 + 16384) >> 15; \
- sd -= t_; \
- s2 = OD_RSHIFT1(t_) - u_; \
- t_ = s3 + sc; \
- /* 14811/16384 ~= (Sin[25*Pi/64] + Cos[25*Pi/64])/Sqrt[2] ~=
- 0.9039892931234433 */ \
- u_ = (sc*14811 + 8192) >> 14; \
- /* 7005/8192 ~= (Sin[25*Pi/64] - Cos[25*Pi/64])*Sqrt[2] ~=
- 0.8551101868605642 */ \
- sc = (s3*7005 + 4096) >> 13; \
- /* 3903/8192 ~= Cos[25*Pi/64]*Sqrt[2] ~= 0.47643419969316125 */ \
- t_ = (t_*3903 + 4096) >> 13; \
- sc += t_; \
- s3 = u_ - OD_RSHIFT1(t_); \
- t_ = sb - s4; \
- /* 14053/16384 ~= (Sin[27*Pi/64] + Cos[27*Pi/64])/Sqrt[2] ~=
- 0.857728610000272 */ \
- u_ = (sb*14053 + 8192) >> 14; \
- /* 8423/8192 ~= (Sin[27*Pi/64] - Cos[27*Pi/64])*Sqrt[2] ~=
- 1.0282054883864435 */ \
- sb = (s4*8423 + 4096) >> 13; \
- /* 2815/8192 ~= Cos[27*Pi/64]*Sqrt[2] = 0.34362586580705035 */ \
- t_ = (t_*2815 + 4096) >> 13; \
- sb -= t_; \
- s4 = OD_RSHIFT1(t_) - u_; \
- t_ = s7 + s8; \
- /* 1645/2048 ~= (Sin[29*Pi/64] + Cos[29*Pi/64])/Sqrt[2] ~=
- 0.8032075314806449 */ \
- u_ = (s8*1645 + 1024) >> 11; \
- /* 305/256 ~= (Sin[29*Pi/64] - Cos[29*Pi/64])*Sqrt[2] ~=
- 1.1913986089848667 */ \
- s8 = (s7*305 + 128) >> 8; \
- /* 425/2048 ~= Cos[29*Pi/64]*Sqrt[2] ~= 0.20750822698821159 */ \
- t_ = (t_*425 + 1024) >> 11; \
- s8 += t_; \
- s7 = u_ - OD_RSHIFT1(t_); \
- t_ = s0 - sf; \
- /* 24279/32768 ~= (Sin[31*Pi/64] + Cos[31*Pi/64])/Sqrt[2] ~=
- 0.7409511253549591 */ \
- u_ = (sf*24279 + 16384) >> 15; \
- /* 44011/32768 ~= (Sin[31*Pi/64] - Cos[31*Pi/64])*Sqrt[2] ~=
- 1.3431179096940369 */ \
- sf = (s0*44011 + 16384) >> 15; \
- /* 1137/16384 ~= Cos[31*Pi/64]*Sqrt[2] ~= 0.0693921705079406 */ \
- t_ = (t_*1137 + 8192) >> 14; \
- s0 = u_ + OD_RSHIFT1(t_); \
- sf += t_; \
- s3 -= OD_RSHIFT1(sd); \
- sd += s3; \
- s2 += OD_RSHIFT1(sc); \
- sc -= s2; \
- s5 -= OD_RSHIFT1(sb); \
- sb += s5; \
- s4 -= OD_RSHIFT1(sa); \
- sa += s4; \
- s1 += OD_RSHIFT1(sf); \
- sf -= s1; \
- s7 -= OD_RSHIFT1(s9); \
- s9 += s7; \
- s6 -= OD_RSHIFT1(s8); \
- s8 += s6; \
- s0 += OD_RSHIFT1(se); \
- se -= s0; \
- sa -= s9; \
- s9 += OD_RSHIFT1(sa); \
- s5 += s6; \
- s6 -= OD_RSHIFT1(s5); \
- s1 -= s2; \
- s2 += OD_RSHIFT1(s1); \
- se += sd; \
- sd -= OD_RSHIFT1(se); \
- s0 += sc; \
- s0h = OD_RSHIFT1(s0); \
- sc -= s0h; \
- sf -= s3; \
- sfh = OD_RSHIFT1(sf); \
- s3 += sfh; \
- sb += s7; \
- sbh = OD_RSHIFT1(sb); \
- s7 -= sbh; \
- s4 += s8; \
- s4h = OD_RSHIFT1(s4); \
- s8 -= s4h; \
- t_ = OD_PAVG(s1, se); \
- /* 9633/8192 ~= Sin[7*Pi/16] + Cos[7*Pi/16] ~= 1.1758756024193586 */ \
- u_ = (s1*9633 + 4096) >> 13; \
- /* 12873/16384 ~= Sin[7*Pi/16] - Cos[7*Pi/16] ~= 0.7856949583871022 */ \
- s1 = (se*12873 + 8192) >> 14; \
- /* 12785/32768 ~= 2*Cos[7*Pi/16] ~= 0.3901806440322565 */ \
- t_ = (t_*12785 + 16384) >> 15; \
- s1 += t_; \
- se = u_ - t_; \
- t_ = s6 + s9; \
- /* 45451/32768 ~= Sin[5*Pi/16] + Cos[5*Pi/16] ~= 1.3870398453221475 */ \
- u_ = (s9*45451 + 16384) >> 15; \
- /* 9041/32768 ~= Sin[5*Pi/16] - Cos[5*Pi/16] ~= 0.27589937928294306 */ \
- s9 = (s6*9041 + 16384) >> 15; \
- /* 18205/32768 ~= Cos[5*Pi/16] ~= 0.5555702330196022 */ \
- t_ = (t_*18205 + 16384) >> 15; \
- s9 += t_; \
- s6 = u_ - t_; \
- t_ = OD_PAVG(s5, sa); \
- /* 22725/16384 ~= Sin[5*Pi/16] + Cos[5*Pi/16] ~= 1.3870398453221475 */ \
- u_ = (sa*22725 + 8192) >> 14; \
- /* 9041/32768 ~= Sin[5*Pi/16] - Cos[5*Pi/16] ~= 0.27589937928294306 */ \
- sa = (s5*9041 + 16384) >> 15; \
- /* 18205/16384 ~= 2*Cos[5*Pi/16] ~= 1.1111404660392044 */ \
- t_ = (t_*18205 + 8192) >> 14; \
- sa += t_; \
- s5 = t_ - u_; \
- t_ = s2 + sd; \
- /* 38531/32768 ~= Sin[7*Pi/16] + Cos[7*Pi/16] ~= 1.1758756024193586 */ \
- u_ = (s2*38531 + 16384) >> 15; \
- /* 12873/16384 ~= Sin[7*Pi/16] - Cos[7*Pi/16] ~= 0.7856949583871022 */ \
- s2 = (sd*12873 + 8192) >> 14; \
- /* 6393/32768 ~= Cos[7*Pi/16] ~= 0.19509032201612825 */ \
- t_ = (t_*6393 + 16384) >> 15; \
- s2 += t_; \
- sd = u_ - t_; \
- s3 -= s4h; \
- s4 += s3; \
- s8 -= s0h; \
- s0 += s8; \
- s7 += sfh; \
- sf -= s7; \
- sc += sbh; \
- sb -= sc; \
- s6 += OD_RSHIFT1(se) ;\
- se -= s6; \
- s9 -= OD_RSHIFT1(s1); \
- s1 += s9; \
- sd -= OD_RSHIFT1(s5); \
- s5 += sd; \
- s2 -= OD_RSHIFT1(sa); \
- sa += s2; \
- t_ = OD_PAVG(s3, sc); \
- /* 21407/16384 ~= Sin[3*Pi/8] + Cos[3*Pi/8] ~= 1.3065629648763766 */ \
- u_ = (s3*21407 + 8192) >> 14; \
- /* 8867/16384 ~= Sin[3*Pi/8] - Cos[3*Pi/8] ~= 0.5411961001461969 */ \
- s3 = (sc*8867 + 8192) >> 14; \
- /* 3135/4096 ~= 2*Cos[3*Pi/8] ~= 0.7653668647301796 */ \
- t_ = (t_*3135 + 2048) >> 12; \
- s3 += t_; \
- sc = u_ - t_; \
- t_ = OD_PAVG(s4, sb); \
- /* 21407/16384 ~= Sin[3*Pi/8] + Cos[3*Pi/8] ~= 1.3065629648763766 */ \
- u_ = (s4*21407 + 8192) >> 14; \
- /* 8867/16384 ~= Sin[3*Pi/8] - Cos[3*Pi/8] ~= 0.5411961001461969 */ \
- s4 = (sb*8867 + 8192) >> 14; \
- /* 3135/4096 ~= 2*Cos[3*Pi/8] ~= 0.7653668647301796 */ \
- t_ = (t_*3134 + 2048) >> 12; \
- s4 += t_; \
- sb = u_ - t_; \
- t_ = OD_PAVG(s5, sa); \
- /* 11585/8192 ~= Sin[Pi/4] + Cos[Pi/4] ~= 1.4142135623730951 */ \
- u_ = (sa*11585 + 4096) >> 13; \
- /* 11585/8192 ~= 2*Cos[Pi/4] ~= 1.4142135623730951 */ \
- sa = (t_*11585 + 4096) >> 13; \
- s5 = sa - u_; \
- t_ = OD_PAVG(s6, -s9); \
- /* 11585/8192 ~= Sin[Pi/4] + Cos[Pi/4] ~= 1.4142135623730951 */ \
- s6 = (s9*11585 + 4096) >> 13; \
- /* 11585/8192 ~= 2*Cos[Pi/4] ~= 1.4142135623730951 */ \
- s9 = (t_*11585 + 4096) >> 13; \
- s6 += s9; \
- t_ = OD_PAVG(s7, -s8); \
- /* 11585/8192 ~= Sin[Pi/4] + Cos[Pi/4] ~= 1.4142135623730951 */ \
- s7 = (s8*11585 + 4096) >> 13; \
- /* 11585/8192 ~= 2*Cos[Pi/4] ~= 1.4142135623730951 */ \
- s8 = (t_*11585 + 4096) >> 13; \
- s7 += s8; \
- } \
- while (0)
-
-#define OD_IDST_16_FLAT(s0, s1, s2, s3, s4, s5, s6, s7, \
- s8, s9, sa, sb, sc, sd, se, sf) \
- /* Embedded 16-point orthonormal Type-IV iDST with flattened rotations. */ \
- do { \
- int t_; \
- int u_; \
- int s0h; \
- int s1h; \
- int s2h; \
- int s3h; \
- int s4h; \
- int s5h; \
- int s6h; \
- int s7h; \
- int sbh; \
- int sfh; \
- t_ = OD_PAVG(s6, s9); \
- /* 11585/8192 ~= Sin[Pi/4] + Cos[Pi/4] ~= 1.4142135623730951 */ \
- s9 = (s6*11585 + 4096) >> 13; \
- /* 11585/8192 ~= 2*Cos[Pi/4] ~= 1.4142135623730951 */ \
- s6 = (t_*11585 + 4096) >> 13; \
- s9 -= s6; \
- t_ = OD_PAVG(s5, sa); \
- /* 11585/8192 ~= Sin[Pi/4] + Cos[Pi/4] ~= 1.4142135623730951 */ \
- sa = (s5*11585 + 4096) >> 13; \
- /* 11585/8192 ~= 2*Cos[Pi/4] ~= 1.4142135623730951 */ \
- s5 = (t_*11585 + 4096) >> 13; \
- sa -= s5; \
- t_ = OD_PAVG(s7, s8); \
- /* 11585/8192 ~= Sin[Pi/4] + Cos[Pi/4] ~= 1.4142135623730951 */ \
- s8 = (s7*11585 + 4096) >> 13; \
- /* 11585/8192 ~= 2*Cos[Pi/4] ~= 1.4142135623730951 */ \
- s7 = (t_*11585 + 4096) >> 13; \
- s8 -= s7; \
- t_ = OD_PAVG(s3, sc); \
- /* 21407/16384 ~= Sin[3*Pi/8] + Cos[3*Pi/8] ~= 1.3065629648763766 */ \
- u_ = (s3*21407 + 8192) >> 14; \
- /* 8867/16384 ~= Sin[3*Pi/8] - Cos[3*Pi/8] ~= 0.5411961001461969 */ \
- s3 = (sc*8867 + 8192) >> 14; \
- /* 3135/4096 ~= 2*Cos[3*Pi/8] ~= 0.7653668647301796 */ \
- t_ = (t_*3135 + 2048) >> 12; \
- s3 += t_; \
- sc = u_ - t_; \
- t_ = OD_PAVG(sb, -s4); \
- /* 21407/16384 ~= Sin[3*Pi/8] + Cos[3*Pi/8] ~= 1.3065629648763766 */ \
- u_ = (sb*21407 + 8192) >> 14; \
- /* 8867/16384 ~= Sin[3*Pi/8] - Cos[3*Pi/8] ~= 0.5411961001461969 */ \
- sb = (s4*8867 + 8192) >> 14; \
- /* 3135/4096 ~= 2*Cos[3*Pi/8] ~= 0.7653668647301796 */ \
- t_ = (t_*3135 + 2048) >> 12; \
- sb -= t_; \
- s4 = t_ - u_; \
- sa += s2; \
- s2 -= OD_RSHIFT1(sa); \
- s5 -= sd; \
- sd += OD_RSHIFT1(s5); \
- s1 -= s9; \
- s9 += OD_RSHIFT1(s1); \
- se += s6; \
- s6 -= OD_RSHIFT1(se); \
- sb += sc; \
- sbh = OD_RSHIFT1(sb); \
- sc -= sbh; \
- sf += s7; \
- sfh = OD_RSHIFT1(sf); \
- s7 -= sfh; \
- s0 -= s8; \
- s0h = OD_RSHIFT1(s0); \
- s8 += s0h; \
- s4 += s3; \
- s4h = OD_RSHIFT1(s4); \
- s3 -= s4h; \
- t_ = sd - s2; \
- /* 38531/32768 ~= Sin[7*Pi/16] + Cos[7*Pi/16] ~= 1.1758756024193586 */ \
- u_ = (sd*38531 + 16384) >> 15; \
- /* 12873/16384 ~= Sin[7*Pi/16] - Cos[7*Pi/16] ~= 0.7856949583871022 */ \
- sd = (s2*12873 + 8192) >> 14; \
- /* 6393/32768 ~= Cos[7*Pi/16] ~= 0.19509032201612825 */ \
- t_ = (t_*6393 + 16384) >> 15; \
- sd -= t_; \
- s2 = t_ - u_; \
- t_ = OD_PAVG(s5, -sa); \
- /* 22725/16384 ~= Sin[5*Pi/16] + Cos[5*Pi/16] ~= 1.3870398453221475 */ \
- u_ = (s5*22725 + 8192) >> 14; \
- /* 9041/32768 ~= Sin[5*Pi/16] - Cos[5*Pi/16] ~= 0.27589937928294306 */ \
- s5 = (sa*9041 + 16384) >> 15; \
- /* 18205/16384 ~= 2*Cos[5*Pi/16] ~= 1.1111404660392044 */ \
- t_ = (t_*18205 + 8192) >> 14; \
- s5 -= t_; \
- sa = t_ - u_; \
- t_ = s6 + s9; \
- /* 45451/32768 ~= Sin[5*Pi/16] + Cos[5*Pi/16] ~= 1.3870398453221475 */ \
- u_ = (s9*45451 + 16384) >> 15; \
- /* 9041/32768 ~= Sin[5*Pi/16] - Cos[5*Pi/16] ~= 0.27589937928294306 */ \
- s9 = (s6*9041 + 16384) >> 15; \
- /* 18205/32768 ~= Cos[5*Pi/16] ~= 0.5555702330196022 */ \
- t_ = (t_*18205 + 16384) >> 15; \
- s9 += t_; \
- s6 = u_ - t_; \
- t_ = OD_PAVG(s1, se); \
- /* 9633/8192 ~= Sin[7*Pi/16] + Cos[7*Pi/16] ~= 1.1758756024193586 */ \
- u_ = (s1*9633 + 4096) >> 13; \
- /* 12873/16384 ~= Sin[7*Pi/16] - Cos[7*Pi/16] ~= 0.7856949583871022 */ \
- s1 = (se*12873 + 8192) >> 14; \
- /* 12785/32768 ~= 2*Cos[7*Pi/16] ~= 0.3901806440322565 */ \
- t_ = (t_*12785 + 16384) >> 15; \
- s1 += t_; \
- se = u_ - t_; \
- s8 -= s4h; \
- s4 += s8; \
- s7 += sbh; \
- sb -= s7; \
- s3 -= sfh; \
- sf += s3; \
- sc += s0h; \
- s0 -= sc; \
- sd += OD_RSHIFT1(se); \
- se -= sd; \
- s2 += OD_RSHIFT1(s1); \
- s1 -= s2; \
- s6 -= OD_RSHIFT1(s5); \
- s5 += s6; \
- s9 -= OD_RSHIFT1(sa); \
- sa += s9; \
- s0 -= se; \
- s0h = OD_RSHIFT1(s0); \
- se += s0h; \
- s1 -= sf; \
- s1h = OD_RSHIFT1(s1); \
- sf += s1h; \
- s2 += sc; \
- s2h = OD_RSHIFT1(s2); \
- sc -= s2h; \
- s3 += sd; \
- s3h = OD_RSHIFT1(s3); \
- sd -= s3h; \
- s4 -= sa; \
- s4h = OD_RSHIFT1(s4); \
- sa += s4h; \
- s5 -= sb; \
- s5h = OD_RSHIFT1(s5); \
- sb += s5h; \
- s6 += s8; \
- s6h = OD_RSHIFT1(s6); \
- s8 -= s6h; \
- s7 += s9; \
- s7h = OD_RSHIFT1(s7); \
- s9 -= s7h; \
- t_ = se - s1h; \
- /* 32729/32768 ~= (Sin[17*Pi/64] + Cos[17*Pi/64])/Sqrt[2] ~=
- 0.9987954562051723 */ \
- u_ = (s1*32729 + 16384) >> 15; \
- /* 201/2048 ~= (Sin[17*Pi/64] - Cos[17*Pi/64])*Sqrt[2] ~=
- 0.09813534865483615 */ \
- s1 = (se*201 + 1024) >> 11; \
- /* 31121/32768 ~= Cos[17*Pi/64]*Sqrt[2] ~=
- 0.9497277818777543 */ \
- t_ = (t_*31121 + 16384) >> 15; \
- s1 += t_; \
- se = u_ + t_; \
- t_ = s6h + s9; \
- /* 32413/32768 ~= (Sin[19*Pi/64] + Cos[19*Pi/64])/Sqrt[2] ~=
- 0.9891765099647809 */ \
- u_ = (s6*32413 + 16384) >> 15; \
- /* 601/2048 ~= (Sin[19*Pi/64] - Cos[19*Pi/64])*Sqrt[2] ~=
- 0.29346094891072355 */ \
- s6 = (s9*601 + 1024) >> 11; \
- /* 27605/32768 ~= Cos[19*Pi/64]*Sqrt[2] ~= 0.8424460355094193 */ \
- t_ = (t_*27605 + 16384) >> 15; \
- s6 += t_; \
- s9 = u_ - t_; \
- t_ = sa - s5h; \
- /* 15893/16384 ~= (Sin[21*Pi/64] + Cos[21*Pi/64])/Sqrt[2] ~=
- 0.970031253194544 */ \
- u_ = (s5*15893 + 8192) >> 14; \
- /* 3981/8192 ~= (Sin[21*Pi/64] - Cos[21*Pi/64])*Sqrt[2] ~=
- 0.48596035980652796 */ \
- s5 = (sa*3981 + 4096) >> 13; \
- /* 1489/2048 ~= Cos[21*Pi/64]*Sqrt[2] ~= 0.72705107329128 */ \
- t_ = (t_*1489 + 1024) >> 11; \
- s5 += t_; \
- sa = u_ + t_; \
- t_ = s2h + sd; \
- /* 30853/32768 ~= (Sin[23*Pi/64] + Cos[23*Pi/64])/Sqrt[2] ~=
- 0.9415440651830208 */ \
- u_ = (s2*30853 + 16384) >> 15; \
- /* 11039/16384 ~= (Sin[23*Pi/64] - Cos[23*Pi/64])*Sqrt[2] ~=
- 0.6737797067844402 */ \
- s2 = (sd*11039 + 8192) >> 14; \
- /* 19813/32768 ~= Cos[23*Pi/64]*Sqrt[2] ~= 0.6046542117908008 */ \
- t_ = (t_*19813 + 16384) >> 15; \
- s2 += t_; \
- sd = u_ - t_; \
- t_ = sc - s3h; \
- /* 14811/16384 ~= (Sin[25*Pi/64] + Cos[25*Pi/64])/Sqrt[2] ~=
- 0.9039892931234433 */ \
- u_ = (s3*14811 + 8192) >> 14; \
- /* 7005/8192 ~= (Sin[25*Pi/64] - Cos[25*Pi/64])*Sqrt[2] ~=
- 0.8551101868605642 */ \
- s3 = (sc*7005 + 4096) >> 13; \
- /* 3903/8192 ~= Cos[25*Pi/64]*Sqrt[2] ~= 0.47643419969316125 */ \
- t_ = (t_*3903 + 4096) >> 13; \
- s3 += t_; \
- sc = u_ + t_; \
- t_ = s4h + sb; \
- /* 14053/16384 ~= (Sin[27*Pi/64] + Cos[27*Pi/64])/Sqrt[2] ~=
- 0.857728610000272 */ \
- u_ = (s4*14053 + 8192) >> 14; \
- /* 8423/8192 ~= (Sin[27*Pi/64] - Cos[27*Pi/64])*Sqrt[2] ~=
- 1.0282054883864435 */ \
- s4 = (sb*8423 + 4096) >> 13; \
- /* 2815/8192 ~= Cos[27*Pi/64]*Sqrt[2] ~= 0.34362586580705035 */ \
- t_ = (t_*2815 + 4096) >> 13; \
- s4 += t_; \
- sb = u_ - t_; \
- t_ = s8 - s7h; \
- /* 1645/2048 ~= (Sin[29*Pi/64] + Cos[29*Pi/64])/Sqrt[2] ~=
- 0.8032075314806449 */ \
- u_ = (s7*1645 + 1024) >> 11; \
- /* 305/256 ~= (Sin[29*Pi/64] - Cos[29*Pi/64])*Sqrt[2] ~=
- 1.1913986089848667 */ \
- s7 = (s8*305 + 128) >> 8; \
- /* 425/2048 ~= Cos[29*Pi/64]*Sqrt[2] ~= 0.20750822698821159 */ \
- t_ = (t_*425 + 1024) >> 11; \
- s7 += t_; \
- s8 = u_ + t_; \
- t_ = s0h + sf; \
- /* 24279/32768 ~= (Sin[31*Pi/64] + Cos[31*Pi/64])/Sqrt[2] ~=
- 0.7409511253549591 */ \
- u_ = (s0*24279 + 16384) >> 15; \
- /* 44011/32768 ~= (Sin[31*Pi/64] - Cos[31*Pi/64])*Sqrt[2] ~=
- 1.3431179096940369 */ \
- s0 = (sf*44011 + 16384) >> 15; \
- /* 1137/16384 ~= Cos[31*Pi/64]*Sqrt[2] ~= 0.06939217050794069 */ \
- t_ = (t_*1137 + 8192) >> 14; \
- s0 += t_; \
- sf = u_ - t_; \
- } \
- while (0)
-
/* TODO: rewrite this to match OD_FDST_16. */
#define OD_FDST_16_ASYM_PR(t0, t0h, t8, t4, t4h, tc, t2, ta, t6, te, \
t1, t9, t5, td, t3, tb, t7, t7h, tf) \
@@ -3019,444 +1671,6 @@
} \
while (0)
-#define OD_FDST_16_ASYM_FLAT(s0, s0h, s8, s4, s4h, sc, s2, s2h, sa, s6, s6h, \
- se, s1, s1h, s9, s5, s5h, sd, s3, s3h, sb, s7, s7h, sf) \
- /* Embedded 16-point asymmetric Type-IV fDST with flattened rotations. */ \
- do { \
- int t_; \
- int u_; \
- int sbh; \
- int sfh; \
- t_ = s1h + se; \
- /* 46285/32768 ~= Sin[17*Pi/64] + Cos[17*Pi/64] ~= 1.4125100802019774 */ \
- u_ = (se*46285 + 16384) >> 15; \
- /* 1137/16384 ~= Sin[17*Pi/64] - Cos[17*Pi/64] ~= 0.06939217050794078 */ \
- se = (s1*1137 + 8192) >> 14; \
- /* 44011/32768 ~= Cos[17*Pi/64]*2 ~= 1.3431179096940367 */ \
- t_ = (t_*44011 + 16384) >> 15; \
- se += t_; \
- s1 = u_ - OD_RSHIFT1(t_); \
- t_ = s6h - s9; \
- /* 45839/32768 ~= Sin[19*Pi/64] + Cos[19*Pi/64] ~= 1.3989068359730783 */ \
- u_ = (s9*45839 + 16384) >> 15; \
- /* 425/2048 ~= Sin[19*Pi/64] - Cos[19*Pi/64] ~= 0.20750822698821159 */ \
- s9 = (425*s6 + 1024) >> 11; \
- /* 305/256 ~= Cos[19*Pi/64]*2 ~= 1.1913986089848667 */ \
- t_ = (305*t_ + 128) >> 8; \
- s9 += t_; \
- s6 = u_ + OD_RSHIFT1(t_); \
- t_ = s5h + sa; \
- /* 5619/4096 ~= Sin[21*Pi/64] + Cos[21*Pi/64] ~= 1.371831354193494 */ \
- u_ = (sa*5619 + 2048) >> 12; \
- /* 2815/8192 ~= Sin[21*Pi/64] - Cos[21*Pi/64] ~= 0.34362586580705046 */ \
- sa = (s5*2815 + 4096) >> 13; \
- /* 8423/8192 ~= Cos[21*Pi/64]*2 ~= 1.0282054883864433 */ \
- t_ = (t_*8423 + 4096) >> 13; \
- sa += t_; \
- s5 = OD_RSHIFT1(t_) - u_; \
- t_ = sd - s2h; \
- /* 2727/2048 ~= Sin[23*Pi/64] + Cos[23*Pi/64] ~= 1.3315443865537255 */ \
- u_ = (sd*2727 + 1024) >> 11; \
- /* 3903/8192 ~= Sin[23*Pi/64] - Cos[23*Pi/64] ~= 0.47643419969316125 */ \
- sd = (s2*3903 + 4096) >> 13; \
- /* 7005/8192 ~= Cos[23*Pi/64]*2 ~= 0.8551101868605642 */ \
- t_ = (t_*7005 + 4096) >> 13; \
- sd -= t_; \
- s2 = OD_RSHIFT1(t_) - u_; \
- t_ = s3h + sc; \
- /* 10473/8192 ~= Sin[25*Pi/64] + Cos[25*Pi/64] ~= 1.278433918575241 */ \
- u_ = (sc*10473 + 4096) >> 13; \
- /* 19813/32768 ~= Sin[25*Pi/64] - Cos[25*Pi/64] ~= 0.6046542117908007 */ \
- sc = (s3*19813 + 16384) >> 15; \
- /* 11039/16384 ~= Cos[25*Pi/64]*2 ~= 0.6737797067844401 */ \
- t_ = (t_*11039 + 8192) >> 14; \
- sc += t_; \
- s3 = u_ - OD_RSHIFT1(t_); \
- t_ = sb - s4h; \
- /* 9937/8192 ~= Sin[27*Pi/64] + Cos[27*Pi/64] ~= 1.213011433097808 */ \
- u_ = (sb*9937 + 4096) >> 13; \
- /* 1489/2048 ~= Sin[27*Pi/64] - Cos[27*Pi/64] ~= 0.72705107329128 */ \
- sb = (s4*1489 + 1024) >> 11; \
- /* 3981/8192 ~= Cos[27*Pi/64]*2 ~= 0.48596035980652774 */ \
- t_ = (t_*3981 + 4096) >> 13; \
- sb -= t_; \
- s4 = OD_RSHIFT1(t_) - u_; \
- t_ = s7h + s8; \
- /* 37221/32768 ~= Sin[29*Pi/64] + Cos[29*Pi/64] ~= 1.1359069844201428 */ \
- u_ = (s8*37221 + 16384) >> 15; \
- /* 27605/32768 ~= Sin[29*Pi/64] - Cos[29*Pi/64] ~= 0.8424460355094192 */ \
- s8 = (s7*27605 + 16384) >> 15; \
- /* 601/2048 ~= Cos[29*Pi/64]*2 ~= 0.2934609489107235 */ \
- t_ = (t_*601 + 1024) >> 11; \
- s8 += t_; \
- s7 = u_ - OD_RSHIFT1(t_); \
- t_ = s0h - sf; \
- /* 1073/1024 ~= Sin[31*Pi/64] + Cos[31*Pi/64] ~= 1.0478631305325905 */ \
- u_ = (sf*1073 + 512) >> 10; \
- /* 31121/32768 ~= Sin[31*Pi/64] - Cos[31*Pi/64] ~= 0.9497277818777544 */ \
- sf = (s0*31121 + 16384) >> 15; \
- /* 201/2048 ~= Cos[31*Pi/64]*2 ~= 0.09813534865483603 */ \
- t_ = (t_*201 + 1024) >> 11; \
- s0 = u_ + OD_RSHIFT1(t_); \
- sf += t_; \
- s3 -= OD_RSHIFT1(sd); \
- sd += s3; \
- s2 += OD_RSHIFT1(sc); \
- sc -= s2; \
- s5 -= OD_RSHIFT1(sb); \
- sb += s5; \
- s4 -= OD_RSHIFT1(sa); \
- sa += s4; \
- s1 += OD_RSHIFT1(sf); \
- sf -= s1; \
- s7 -= OD_RSHIFT1(s9); \
- s9 += s7; \
- s6 -= OD_RSHIFT1(s8); \
- s8 += s6; \
- s0 += OD_RSHIFT1(se); \
- se -= s0; \
- sa -= s9; \
- s9 += OD_RSHIFT1(sa); \
- s5 += s6; \
- s6 -= OD_RSHIFT1(s5); \
- s1 -= s2; \
- s2 += OD_RSHIFT1(s1); \
- se += sd; \
- sd -= OD_RSHIFT1(se); \
- s0 += sc; \
- s0h = OD_RSHIFT1(s0); \
- sc -= s0h; \
- sf -= s3; \
- sfh = OD_RSHIFT1(sf); \
- s3 += sfh; \
- sb += s7; \
- sbh = OD_RSHIFT1(sb); \
- s7 -= sbh; \
- s4 += s8; \
- s4h = OD_RSHIFT1(s4); \
- s8 -= s4h; \
- t_ = s1 + se; \
- /* 9633/8192 ~= Sin[7*Pi/16] + Cos[7*Pi/16] ~= 1.1758756024193586 */ \
- u_ = (s1*9633 + 4096) >> 13; \
- /* 12873/16384 ~= Sin[7*Pi/16] - Cos[7*Pi/16] ~= 0.7856949583871022 */ \
- s1 = (se*12873 + 8192) >> 14; \
- /* 6393/32768 ~= Cos[7*Pi/16] ~= 0.19509032201612825 */ \
- t_ = (t_*6393 + 16384) >> 15; \
- s1 += t_; \
- se = u_ - t_; \
- t_ = s6 + s9; \
- /* 22725/16384 ~= Sin[5*Pi/16] + Cos[5*Pi/16] ~= 1.3870398453221475 */ \
- u_ = (s9*22725 + 8192) >> 14; \
- /* 9041/32768 ~= Sin[5*Pi/16] - Cos[5*Pi/16] ~= 0.27589937928294306 */ \
- s9 = (s6*9041 + 16384) >> 15; \
- /* 18205/32768 ~= Cos[5*Pi/16] ~= 0.5555702330196022 */ \
- t_ = (t_*18205 + 16384) >> 15; \
- s9 += t_; \
- s6 = u_ - t_; \
- t_ = s5 + sa; \
- /* 11363/8192 ~= Sin[5*Pi/16] + Cos[5*Pi/16] ~= 1.3870398453221475 */ \
- u_ = (sa*11363 + 4096) >> 13; \
- /* 9041/32768 ~= Sin[5*Pi/16] - Cos[5*Pi/16] ~= 0.27589937928294306 */ \
- sa = (s5*9041 + 16384) >> 15; \
- /* 4551/8192 ~= Cos[5*Pi/16] ~= 0.5555702330196022 */ \
- t_ = (t_*4551 + 4096) >> 13; \
- sa += t_; \
- s5 = t_ - u_; \
- t_ = s2 + sd; \
- /* 9633/8192 ~= Sin[7*Pi/16] + Cos[7*Pi/16] ~= 1.1758756024193586 */ \
- u_ = (s2*9633 + 4096) >> 13; \
- /* 12873/16384 ~= Sin[7*Pi/16] - Cos[7*Pi/16] ~= 0.7856949583871022 */ \
- s2 = (sd*12873 + 8192) >> 14; \
- /* 6393/32768 ~= Cos[7*Pi/16] ~= 0.19509032201612825 */ \
- t_ = (t_*6393 + 16384) >> 15; \
- s2 += t_; \
- sd = u_ - t_; \
- s3 -= s4h; \
- s4 += s3; \
- s8 -= s0h; \
- s0 += s8; \
- s7 += sfh; \
- sf -= s7; \
- sc += sbh; \
- sb -= sc; \
- s6 += OD_RSHIFT1(se) ;\
- se -= s6; \
- s9 -= OD_RSHIFT1(s1); \
- s1 += s9; \
- sd -= OD_RSHIFT1(s5); \
- s5 += sd; \
- s2 -= OD_RSHIFT1(sa); \
- sa += s2; \
- t_ = s3 + sc; \
- /* 10703/8192 ~= Sin[3*Pi/8] + Cos[3*Pi/8] ~= 1.3065629648763766 */ \
- u_ = (s3*10703 + 4096) >> 13; \
- /* 8867/16348 ~= Sin[3*Pi/8] - Cos[3*Pi/8] ~= 0.5411961001461969 */ \
- s3 = (sc*8867 + 8192) >> 14; \
- /* 3135/8192 ~= Cos[3*Pi/8] ~= 0.3826834323650898 */ \
- t_ = (t_*3135 + 4096) >> 13; \
- s3 += t_; \
- sc = u_ - t_; \
- t_ = s4 + sb; \
- /* 10703/8192 ~= Sin[3*Pi/8] + Cos[3*Pi/8] ~= 1.3065629648763766 */ \
- u_ = (s4*10703 + 4096) >> 13; \
- /* 8867/16384 ~= Sin[3*Pi/8] - Cos[3*Pi/8] ~= 0.5411961001461969 */ \
- s4 = (sb*8867 + 8192) >> 14; \
- /* 3135/8192 ~= Cos[3*Pi/8] ~= 0.3826834323650898 */ \
- t_ = (t_*3135 + 4096) >> 13; \
- s4 += t_; \
- sb = u_ - t_; \
- t_ = s5 + sa; \
- /* 11585/8192 ~= Sin[Pi/4] + Cos[Pi/4] ~= 1.4142135623730951 */ \
- u_ = (sa*11585 + 4096) >> 13; \
- /* 11585/16384 ~= Cos[Pi/4] ~= 0.7071067811865475 */ \
- sa = (t_*11585 + 8192) >> 14; \
- s5 = sa - u_; \
- t_ = s6 - s9; \
- /* 11585/8192 ~= Sin[Pi/4] + Cos[Pi/4] ~= 1.4142135623730951 */ \
- s6 = (s9*11585 + 4096) >> 13; \
- /* 11585/16384 ~= Cos[Pi/4] ~= 0.7071067811865475 */ \
- s9 = (t_*11585 + 8192) >> 14; \
- s6 += s9; \
- t_ = s7 - s8; \
- /* 11585/8192 ~= Sin[Pi/4] + Cos[Pi/4] ~= 1.4142135623730951 */ \
- s7 = (s8*11585 + 4096) >> 13; \
- /* 11585/16384 ~= Cos[Pi/4] ~= 0.7071067811865475 */ \
- s8 = (t_*11585 + 8192) >> 14; \
- s7 += s8; \
- } \
- while (0)
-
-#define OD_IDST_16_ASYM_FLAT(s0, s1, s2, s3, s4, s5, s6, s7, \
- s8, s9, sa, sb, sc, sd, se, sf) \
- /* Embedded 16-point asymmetric Type-IV iDST with flattened rotations. */ \
- do { \
- int t_; \
- int u_; \
- int s0h; \
- int s1h; \
- int s2h; \
- int s3h; \
- int s4h; \
- int s5h; \
- int s6h; \
- int s7h; \
- int sbh; \
- int sfh; \
- t_ = s6 + s9; \
- /* 11585/8192 ~= Sin[Pi/4] + Cos[Pi/4] ~= 1.4142135623730951 */ \
- s9 = (s6*11585 + 4096) >> 13; \
- /* 11585/16384 ~= Cos[Pi/4] ~= 0.7071067811865475 */ \
- s6 = (t_*11585 + 8192) >> 14; \
- s9 -= s6; \
- t_ = s5 + sa; \
- /* 11585/8192 ~= Sin[Pi/4] + Cos[Pi/4] ~= 1.4142135623730951 */ \
- sa = (s5*11585 + 4096) >> 13; \
- /* 11585/16384 ~= Cos[Pi/4] ~= 0.7071067811865475 */ \
- s5 = (t_*11585 + 8192) >> 14; \
- sa -= s5; \
- t_ = s7 + s8; \
- /* 11585/8192 ~= Sin[Pi/4] + Cos[Pi/4] ~= 1.4142135623730951 */ \
- s8 = (s7*11585 + 4096) >> 13; \
- /* 11585/16384 ~= Cos[Pi/4] ~= 0.7071067811865475 */ \
- s7 = (t_*11585 + 8192) >> 14; \
- s8 -= s7; \
- t_ = s3 - sc; \
- /* 10703/8192 ~= Sin[3*Pi/8] + Cos[3*Pi/8] ~= 1.3065629648763766 */ \
- u_ = (sc*10703 + 4096) >> 13; \
- /* 8867/16384 ~= Sin[3*Pi/8] - Cos[3*Pi/8] ~= 0.5411961001461969 */ \
- sc = (s3*8867 + 8192) >> 14; \
- /* 3135/8192 ~= Cos[3*Pi/8] ~= 0.3826834323650898 */ \
- t_ = (t_*3135 + 4096) >> 13; \
- sc += t_; \
- s3 = u_ + t_; \
- t_ = sb - s4; \
- /* 10703/8192 ~= Sin[3*Pi/8] + Cos[3*Pi/8] ~= 1.3065629648763766 */ \
- u_ = (sb*10703 + 4096) >> 13; \
- /* 8867/16384 ~= Sin[3*Pi/8] - Cos[3*Pi/8] ~= 0.5411961001461969 */ \
- sb = (s4*8867 + 8192) >> 14; \
- /* 3135/8192 ~= Cos[3*Pi/8] ~= 0.3826834323650898 */ \
- t_ = (t_*3135 + 4096) >> 13; \
- sb -= t_; \
- s4 = t_ - u_; \
- sa += s2; \
- s2 -= OD_RSHIFT1(sa); \
- s5 -= sd; \
- sd += OD_RSHIFT1(s5); \
- s1 -= s9; \
- s9 += OD_RSHIFT1(s1); \
- se += s6; \
- s6 -= OD_RSHIFT1(se); \
- sb += sc; \
- sbh = OD_RSHIFT1(sb); \
- sc -= sbh; \
- sf += s7; \
- sfh = OD_RSHIFT1(sf); \
- s7 -= sfh; \
- s0 -= s8; \
- s0h = OD_RSHIFT1(s0); \
- s8 += s0h; \
- s4 += s3; \
- s4h = OD_RSHIFT1(s4); \
- s3 -= s4h; \
- t_ = sd - s2; \
- /* 9633/8192 ~= Sin[7*Pi/16] + Cos[7*Pi/16] ~= 1.1758756024193586 */ \
- u_ = (sd*9633 + 4096) >> 13; \
- /* 12873/16384 ~= Sin[7*Pi/16] - Cos[7*Pi/16] ~= 0.7856949583871022 */ \
- sd = (s2*12873 + 8192) >> 14; \
- /* 6393/32768 ~= Cos[7*Pi/16] ~= 0.19509032201612825 */ \
- t_ = (t_*6393 + 16384) >> 15; \
- sd -= t_; \
- s2 = t_ - u_; \
- t_ = s5 - sa; \
- /* 11363/8192 ~= Sin[5*Pi/16] + Cos[5*Pi/16] ~= 1.3870398453221475 */ \
- u_ = (s5*11363 + 4096) >> 13; \
- /* 9041/32768 ~= Sin[5*Pi/16] - Cos[5*Pi/16] ~= 0.27589937928294306 */ \
- s5 = (sa*9041 + 16384) >> 15; \
- /* 4551/8192 ~= Cos[5*Pi/16] ~= 0.5555702330196022 */ \
- t_ = (t_*4551 + 4096) >> 13; \
- s5 -= t_; \
- sa = t_ - u_; \
- t_ = s6 + s9; \
- /* 22725/16384 ~= Sin[5*Pi/16] + Cos[5*Pi/16] ~= 1.3870398453221475 */ \
- u_ = (s9*22725 + 8192) >> 14; \
- /* 9041/32768 ~= Sin[5*Pi/16] - Cos[5*Pi/16] ~= 0.27589937928294306 */ \
- s9 = (s6*9041 + 16384) >> 15; \
- /* 18205/32768 ~= Cos[5*Pi/16] ~= 0.5555702330196022 */ \
- t_ = (t_*18205 + 16384) >> 15; \
- s9 += t_; \
- s6 = u_ - t_; \
- t_ = s1 + se; \
- /* 9633/8192 ~= Sin[7*Pi/16] + Cos[7*Pi/16] ~= 1.1758756024193586 */ \
- u_ = (s1*9633 + 4096) >> 13; \
- /* 12873/16384 ~= Sin[7*Pi/16] - Cos[7*Pi/16] ~= 0.7856949583871022 */ \
- s1 = (se*12873 + 8192) >> 14; \
- /* 6393/32768 ~= Cos[7*Pi/16] ~= 0.19509032201612825 */ \
- t_ = (t_*6393 + 16384) >> 15; \
- s1 += t_; \
- se = u_ - t_; \
- s8 -= s4h; \
- s4 += s8; \
- s7 += sbh; \
- sb -= s7; \
- s3 -= sfh; \
- sf += s3; \
- sc += s0h; \
- s0 -= sc; \
- sd += OD_RSHIFT1(se); \
- se -= sd; \
- s2 += OD_RSHIFT1(s1); \
- s1 -= s2; \
- s6 -= OD_RSHIFT1(s5); \
- s5 += s6; \
- s9 -= OD_RSHIFT1(sa); \
- sa += s9; \
- s0 -= se; \
- s0h = OD_RSHIFT1(s0); \
- se += s0h; \
- s1 -= sf; \
- s1h = OD_RSHIFT1(s1); \
- sf += s1h; \
- s2 += sc; \
- s2h = OD_RSHIFT1(s2); \
- sc -= s2h; \
- s3 += sd; \
- s3h = OD_RSHIFT1(s3); \
- sd -= s3h; \
- s4 -= sa; \
- s4h = OD_RSHIFT1(s4); \
- sa += s4h; \
- s5 -= sb; \
- s5h = OD_RSHIFT1(s5); \
- sb += s5h; \
- s6 += s8; \
- s6h = OD_RSHIFT1(s6); \
- s8 -= s6h; \
- s7 += s9; \
- s7h = OD_RSHIFT1(s7); \
- s9 -= s7h; \
- t_ = se - s1h; \
- /* 23143/32768 ~= (Sin[17*Pi/64] + Cos[17*Pi/64])/2 ~=
- 0.7062550401009887 */ \
- u_ = (s1*23143 + 16384) >> 15; \
- /* 1137/8192 ~= (Sin[17*Pi/64] - Cos[17*Pi/64])*2 ~=
- 0.13878434101588155 */ \
- s1 = (se*1137 + 4096) >> 13; \
- /* 44011/32768 ~= Cos[17*Pi/64]*2 ~= 1.3431179096940367 */ \
- t_ = (t_*44011 + 16384) >> 15; \
- s1 += t_; \
- se = u_ + OD_RSHIFT1(t_); \
- t_ = s6h + s9; \
- /* 2865/4096 ~= (Sin[19*Pi/64] + Cos[19*Pi/64])/2 ~= 0.6994534179865391 */ \
- u_ = (s6*2865 + 2048) >> 12; \
- /* 13599/32768 ~= (Sin[19*Pi/64] - Cos[19*Pi/64])*2 ~=
- 0.41501645397642317 */ \
- s6 = (s9*13599 + 16384) >> 15; \
- /* 305/256 ~= Cos[19*Pi/64]*2 ~= 1.1913986089848667 */ \
- t_ = (t_*305 + 128) >> 8; \
- s6 += t_; \
- s9 = u_ - OD_RSHIFT1(t_); \
- t_ = sa - s5h; \
- /* 5619/8192 ~= (Sin[21*Pi/64] + Cos[21*Pi/64])/2 ~= 0.685915677096747 */ \
- u_ = (s5*5619 + 4096) >> 13; \
- /* 2815/4096 ~= (Sin[21*Pi/64] - Cos[21*Pi/64])*2 ~= 0.6872517316141009 */ \
- s5 = (sa*2815 + 2048) >> 12; \
- /* 8423/8192 ~= Cos[21*Pi/64]*2 ~= 1.0282054883864433 */ \
- t_ = (t_*8423 + 4096) >> 13; \
- s5 += t_; \
- sa = u_ + OD_RSHIFT1(t_); \
- t_ = s2h + sd; \
- /* 2727/4096 ~= (Sin[23*Pi/64] + Cos[23*Pi/64])/2 ~= 0.6657721932768628 */ \
- u_ = (s2*2727 + 2048) >> 12; \
- /* 3903/4096 ~= (Sin[23*Pi/64] - Cos[23*Pi/64])*2 ~= 0.9528683993863225 */ \
- s2 = (sd*3903 + 2048) >> 12; \
- /* 7005/8192 ~= Cos[23*Pi/64]*2 ~= 0.8551101868605642 */ \
- t_ = (t_*7005 + 4096) >> 13; \
- s2 += t_; \
- sd = u_ - OD_RSHIFT1(t_); \
- t_ = sc - s3h; \
- /* 10473/16384 ~= (Sin[25*Pi/64] + Cos[25*Pi/64])/2 ~=
- 0.6392169592876205 */ \
- u_ = (s3*10473 + 8192) >> 14; \
- /* 39627/32768 ~= (Sin[25*Pi/64] - Cos[25*Pi/64])*2 ~=
- 1.2093084235816014 */ \
- s3 = (sc*39627 + 16384) >> 15; \
- /* 11039/16384 ~= Cos[25*Pi/64]*2 ~= 0.6737797067844401 */ \
- t_ = (t_*11039 + 8192) >> 14; \
- s3 += t_; \
- sc = u_ + OD_RSHIFT1(t_); \
- t_ = s4h + sb; \
- /* 9937/16384 ~= (Sin[27*Pi/64] + Cos[27*Pi/64])/2 ~= 0.606505716548904 */ \
- u_ = (s4*9937 + 8192) >> 14; \
- /* 1489/1024 ~= (Sin[27*Pi/64] - Cos[27*Pi/64])*2 ~= 1.45410214658256 */ \
- s4 = (sb*1489 + 512) >> 10; \
- /* 3981/8192 ~= Cos[27*Pi/64]*2 ~= 0.48596035980652774 */ \
- t_ = (t_*3981 + 4096) >> 13; \
- s4 += t_; \
- sb = u_ - OD_RSHIFT1(t_); \
- t_ = s8 - s7h; \
- /* 18611/32768 ~= (Sin[29*Pi/64] + Cos[29*Pi/64])/2 ~=
- 0.5679534922100714 */ \
- u_ = (s7*18611 + 16384) >> 15; \
- /* 55211/32768 ~= (Sin[29*Pi/64] - Cos[29*Pi/64])*2 ~=
- 1.6848920710188384 */ \
- s7 = (s8*55211 + 16384) >> 15; \
- /* 601/2048 ~= Cos[29*Pi/64]*2 ~= 0.2934609489107235 */ \
- t_ = (t_*601 + 1024) >> 11; \
- s7 += t_; \
- s8 = u_ + OD_RSHIFT1(t_); \
- t_ = s0h + sf; \
- /* 1073/2048 ~= (Sin[31*Pi/64] + Cos[31*Pi/64])/2 ~= 0.5239315652662953 */ \
- u_ = (s0*1073 + 1024) >> 11; \
- /* 62241/32768 ~= (Sin[31*Pi/64] - Cos[31*Pi/64])*2 ~=
- 1.8994555637555088 */ \
- s0 = (sf*62241 + 16384) >> 15; \
- /* 201/2048 ~= Cos[31*Pi/64]*2 ~= 0.09813534865483603 */ \
- t_ = (t_*201 + 1024) >> 11; \
- s0 += t_; \
- sf = u_ - OD_RSHIFT1(t_); \
- } \
- while (0)
-
#define OD_FDCT_32_PR(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, t6, tm, \
te, tu, t1, th, t9, tp, t5, tl, td, tt, t3, tj, tb, tr, t7, tn, tf, tv) \
/* Embedded 32-point orthonormal Type-II fDCT. */ \
@@ -3576,138 +1790,6 @@
} \
while (0)
-#define OD_FDCT_32_FLAT(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, t6, \
- tm, te, tu, t1, th, t9, tp, t5, tl, td, tt, t3, tj, tb, tr, t7, tn, tf, tv) \
- /* Embedded 32-point orthonormal Type-II fDCT with flattened rotations. */ \
- do { \
- od_coeff tgh; \
- od_coeff thh; \
- od_coeff tih; \
- od_coeff tjh; \
- od_coeff tkh; \
- od_coeff tlh; \
- od_coeff tmh; \
- od_coeff tnh; \
- od_coeff toh; \
- od_coeff tph; \
- od_coeff tqh; \
- od_coeff trh; \
- od_coeff tsh; \
- od_coeff tth; \
- od_coeff tuh; \
- od_coeff tvh; \
- tv = t0 - tv; \
- tvh = OD_RSHIFT1(tv); \
- t0 -= tvh; \
- tu += t1; \
- tuh = OD_RSHIFT1(tu); \
- t1 -= tuh; \
- tt = t2 - tt; \
- tth = OD_RSHIFT1(tt); \
- t2 -= tth; \
- ts += t3; \
- tsh = OD_RSHIFT1(ts); \
- t3 -= tsh; \
- tr = t4 - tr; \
- trh = OD_RSHIFT1(tr); \
- t4 -= trh; \
- tq += t5; \
- tqh = OD_RSHIFT1(tq); \
- t5 -= tqh; \
- tp = t6 - tp; \
- tph = OD_RSHIFT1(tp); \
- t6 -= tph; \
- to += t7; \
- toh = OD_RSHIFT1(to); \
- t7 -= toh; \
- tn = t8 - tn; \
- tnh = OD_RSHIFT1(tn); \
- t8 -= tnh; \
- tm += t9; \
- tmh = OD_RSHIFT1(tm); \
- t9 -= tmh; \
- tl = ta - tl; \
- tlh = OD_RSHIFT1(tl); \
- ta -= tlh; \
- tk += tb; \
- tkh = OD_RSHIFT1(tk); \
- tb -= tkh; \
- tj = tc - tj; \
- tjh = OD_RSHIFT1(tj); \
- tc -= tjh; \
- ti += td; \
- tih = OD_RSHIFT1(ti); \
- td -= tih; \
- th = te - th; \
- thh = OD_RSHIFT1(th); \
- te -= thh; \
- tg += tf; \
- tgh = OD_RSHIFT1(tg); \
- tf -= tgh; \
- OD_FDCT_16_ASYM_FLAT(t0, tg, tgh, t8, to, toh, t4, tk, tkh, tc, ts, tsh, \
- t2, ti, tih, ta, tq, tqh, t6, tm, tmh, te, tu, tuh); \
- OD_FDST_16_ASYM_FLAT(tv, tvh, tf, tn, tnh, t7, tr, trh, tb, tj, tjh, t3, \
- tt, tth, td, tl, tlh, t5, tp, tph, t9, th, thh, t1); \
- } \
- while (0)
-
-#define OD_IDCT_32_FLAT(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, t6, \
- tm, te, tu, t1, th, t9, tp, t5, tl, td, tt, t3, tj, tb, tr, t7, tn, tf, tv) \
- /* Embedded 32-point orthonormal Type-II iDCT with flattened rotations. */ \
- do { \
- od_coeff t1h; \
- od_coeff t3h; \
- od_coeff t5h; \
- od_coeff t7h; \
- od_coeff t9h; \
- od_coeff tbh; \
- od_coeff tdh; \
- od_coeff tfh; \
- od_coeff thh; \
- od_coeff tth; \
- od_coeff tvh; \
- OD_IDST_16_ASYM_FLAT(tv, tn, tr, tj, tt, tl, tp, th, \
- tu, tm, tq, ti, ts, tk, to, tg); \
- OD_IDCT_16_ASYM_FLAT(t0, t8, t4, tc, t2, ta, t6, te, \
- t1, t1h, t9, t9h, t5, t5h, td, tdh, t3, t3h, tb, tbh, t7, t7h, tf, tfh); \
- tu += t1h; \
- t1 -= tu; \
- thh = OD_RSHIFT1(th); \
- te += thh; \
- th = te - th; \
- tm += t9h; \
- t9 -= tm; \
- t6 += OD_RSHIFT1(tp); \
- tp = t6 - tp; \
- tq += t5h; \
- t5 -= tq; \
- ta += OD_RSHIFT1(tl); \
- tl = ta - tl; \
- ti += tdh; \
- td -= ti; \
- tth = OD_RSHIFT1(tt); \
- t2 += tth; \
- tt = t2 - tt; \
- ts += t3h; \
- t3 -= ts; \
- tc += OD_RSHIFT1(tj); \
- tj = tc - tj; \
- tk += tbh; \
- tb -= tk; \
- t4 += OD_RSHIFT1(tr); \
- tr = t4 - tr; \
- to += t7h; \
- t7 -= to; \
- t8 += OD_RSHIFT1(tn); \
- tn = t8 - tn; \
- tg += tfh; \
- tf -= tg; \
- tvh = OD_RSHIFT1(tv); \
- t0 += tvh; \
- tv = t0 - tv; \
- } \
- while (0)
-
/* Embedded 32-point orthonormal Type-IV fDST. */
#define OD_FDST_32_PR(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, ta, tb, tc, td, \
te, tf, tg, th, ti, tj, tk, tl, tm, tn, to, tp, tq, tr, ts, tt, tu, tv) \
@@ -5682,7 +3764,7 @@
q1 = x[1*xstride];
q2 = x[2*xstride];
q3 = x[3*xstride];
- OD_FDCT_4_FLAT(q0, q1, q2, q3);
+ od_fdct_4(&q0, &q1, &q2, &q3);
y[0] = (od_coeff)q0;
y[1] = (od_coeff)q2;
y[2] = (od_coeff)q1;
@@ -5699,7 +3781,7 @@
q2 = y[1];
q1 = y[2];
q3 = y[3];
- OD_IDCT_4_FLAT(q0, q2, q1, q3);
+ od_idct_4(&q0, &q2, &q1, &q3);
x[0*xstride] = (od_coeff)q0;
x[1*xstride] = (od_coeff)q1;
x[2*xstride] = (od_coeff)q2;
@@ -5815,7 +3897,7 @@
r5 = x[5*xstride];
r6 = x[6*xstride];
r7 = x[7*xstride];
- OD_FDCT_8_FLAT(r0, r1, r2, r3, r4, r5, r6, r7);
+ od_fdct_8(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
y[0] = (od_coeff)r0;
y[1] = (od_coeff)r4;
y[2] = (od_coeff)r2;
@@ -5843,7 +3925,7 @@
r5 = y[5];
r3 = y[6];
r7 = y[7];
- OD_IDCT_8_FLAT(r0, r4, r2, r6, r1, r5, r3, r7);
+ od_idct_8(&r0, &r4, &r2, &r6, &r1, &r5, &r3, &r7);
x[0*xstride] = (od_coeff)r0;
x[1*xstride] = (od_coeff)r1;
x[2*xstride] = (od_coeff)r2;
@@ -5872,7 +3954,7 @@
r5 = x[5*xstride];
r6 = x[6*xstride];
r7 = x[7*xstride];
- OD_FDST_8_FLAT(r0, r1, r2, r3, r4, r5, r6, r7);
+ od_fdst_8(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
y[0] = (od_coeff)r0;
y[1] = (od_coeff)r4;
y[2] = (od_coeff)r2;
@@ -5900,7 +3982,7 @@
r5 = y[5];
r3 = y[6];
r7 = y[7];
- OD_IDST_8_FLAT(r0, r4, r2, r6, r1, r5, r3, r7);
+ od_idst_8(&r0, &r4, &r2, &r6, &r1, &r5, &r3, &r7);
x[0*xstride] = (od_coeff)r0;
x[1*xstride] = (od_coeff)r1;
x[2*xstride] = (od_coeff)r2;
@@ -6167,8 +4249,8 @@
sd = x[13*xstride];
se = x[14*xstride];
sf = x[15*xstride];
- OD_FDCT_16_FLAT(s0, s1, s2, s3, s4, s5, s6, s7,
- s8, s9, sa, sb, sc, sd, se, sf);
+ od_fdct_16(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7,
+ &s8, &s9, &sa, &sb, &sc, &sd, &se, &sf);
y[0] = (od_coeff)s0;
y[1] = (od_coeff)s8;
y[2] = (od_coeff)s4;
@@ -6220,8 +4302,8 @@
sb = y[13];
s7 = y[14];
sf = y[15];
- OD_IDCT_16_FLAT(s0, s8, s4, sc, s2, sa, s6, se,
- s1, s9, s5, sd, s3, sb, s7, sf);
+ od_idct_16(&s0, &s8, &s4, &sc, &s2, &sa, &s6, &se,
+ &s1, &s9, &s5, &sd, &s3, &sb, &s7, &sf);
x[0*xstride] = (od_coeff)s0;
x[1*xstride] = (od_coeff)s1;
x[2*xstride] = (od_coeff)s2;
@@ -6258,38 +4340,38 @@
int se;
int sf;
s0 = x[0*xstride];
- s8 = x[1*xstride];
- s4 = x[2*xstride];
- sc = x[3*xstride];
- s2 = x[4*xstride];
- sa = x[5*xstride];
+ s1 = x[1*xstride];
+ s2 = x[2*xstride];
+ s3 = x[3*xstride];
+ s4 = x[4*xstride];
+ s5 = x[5*xstride];
s6 = x[6*xstride];
- se = x[7*xstride];
- s1 = x[8*xstride];
+ s7 = x[7*xstride];
+ s8 = x[8*xstride];
s9 = x[9*xstride];
- s5 = x[10*xstride];
- sd = x[11*xstride];
- s3 = x[12*xstride];
- sb = x[13*xstride];
- s7 = x[14*xstride];
+ sa = x[10*xstride];
+ sb = x[11*xstride];
+ sc = x[12*xstride];
+ sd = x[13*xstride];
+ se = x[14*xstride];
sf = x[15*xstride];
- OD_FDST_16_FLAT(s0, s8, s4, sc, s2, sa, s6, se,
- s1, s9, s5, sd, s3, sb, s7, sf);
+ od_fdst_16(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7,
+ &s8, &s9, &sa, &sb, &sc, &sd, &se, &sf);
y[0] = (od_coeff)s0;
- y[1] = (od_coeff)s1;
- y[2] = (od_coeff)s2;
- y[3] = (od_coeff)s3;
- y[4] = (od_coeff)s4;
- y[5] = (od_coeff)s5;
+ y[1] = (od_coeff)s8;
+ y[2] = (od_coeff)s4;
+ y[3] = (od_coeff)sc;
+ y[4] = (od_coeff)s2;
+ y[5] = (od_coeff)sa;
y[6] = (od_coeff)s6;
- y[7] = (od_coeff)s7;
- y[8] = (od_coeff)s8;
+ y[7] = (od_coeff)se;
+ y[8] = (od_coeff)s1;
y[9] = (od_coeff)s9;
- y[10] = (od_coeff)sa;
- y[11] = (od_coeff)sb;
- y[12] = (od_coeff)sc;
- y[13] = (od_coeff)sd;
- y[14] = (od_coeff)se;
+ y[10] = (od_coeff)s5;
+ y[11] = (od_coeff)sd;
+ y[12] = (od_coeff)s3;
+ y[13] = (od_coeff)sb;
+ y[14] = (od_coeff)s7;
y[15] = (od_coeff)sf;
}
@@ -6326,8 +4408,8 @@
sb = y[13];
s7 = y[14];
sf = y[15];
- OD_IDST_16_FLAT(s0, s8, s4, sc, s2, sa, s6, se,
- s1, s9, s5, sd, s3, sb, s7, sf);
+ od_idst_16(&s0, &s8, &s4, &sc, &s2, &sa, &s6, &se,
+ &s1, &s9, &s5, &sd, &s3, &sb, &s7, &sf);
x[0*xstride] = (od_coeff)s0;
x[1*xstride] = (od_coeff)s1;
x[2*xstride] = (od_coeff)s2;
@@ -6412,9 +4494,10 @@
tn = x[29*xstride];
tf = x[30*xstride];
tv = x[31*xstride];
- OD_FDCT_32_FLAT(
- t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, t6, tm, te, tu,
- t1, th, t9, tp, t5, tl, td, tt, t3, tj, tb, tr, t7, tn, tf, tv);
+ od_fdct_32(
+ &t0, &tg, &t8, &to, &t4, &tk, &tc, &ts, &t2, &ti, &ta, &tq, &t6, &tm, &te,
+ &tu, &t1, &th, &t9, &tp, &t5, &tl, &td, &tt, &t3, &tj, &tb, &tr, &t7, &tn,
+ &tf, &tv);
y[0] = (od_coeff)t0;
y[1] = (od_coeff)t1;
y[2] = (od_coeff)t2;
@@ -6514,9 +4597,10 @@
tn = y[29];
tf = y[30];
tv = y[31];
- OD_IDCT_32_FLAT(
- t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, t6, tm, te, tu,
- t1, th, t9, tp, t5, tl, td, tt, t3, tj, tb, tr, t7, tn, tf, tv);
+ od_idct_32(
+ &t0, &tg, &t8, &to, &t4, &tk, &tc, &ts, &t2, &ti, &ta, &tq, &t6, &tm, &te,
+ &tu, &t1, &th, &t9, &tp, &t5, &tl, &td, &tt, &t3, &tj, &tb, &tr, &t7, &tn,
+ &tf, &tv);
x[0*xstride] = (od_coeff)t0;
x[1*xstride] = (od_coeff)t1;
x[2*xstride] = (od_coeff)t2;
diff --git a/av1/common/daala_tx_kernels.h b/av1/common/daala_tx_kernels.h
new file mode 100644
index 0000000..d6542c0
--- /dev/null
+++ b/av1/common/daala_tx_kernels.h
@@ -0,0 +1,1603 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/* clang-format off */
+
+#ifndef AOM_DSP_DAALA_TX_KERNELS_H_
+#define AOM_DSP_DAALA_TX_KERNELS_H_
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/common/odintrin.h"
+
+#define AVG_BIAS (0)
+
+static INLINE od_coeff od_rshift1(od_coeff v) {
+ return (v + (v < 0)) >> 1;
+}
+
+static INLINE od_coeff od_add(od_coeff p0, od_coeff p1) {
+ return p0 + p1;
+}
+
+static INLINE od_coeff od_sub(od_coeff p0, od_coeff p1) {
+ return p0 - p1;
+}
+
+static INLINE od_coeff od_avg_add(od_coeff p0, od_coeff p1) {
+ return (od_add(p0, p1) + AVG_BIAS) >> 1;
+}
+
+static INLINE od_coeff od_avg_sub(od_coeff p0, od_coeff p1) {
+ return (od_sub(p0, p1) + AVG_BIAS) >> 1;
+}
+
+/* Fixed point multiply. */
+static INLINE od_coeff od_mul(od_coeff n, int c, int q) {
+ return (n*c + ((1 << q) >> 1)) >> q;
+}
+
+/* Two multiply rotation primitive (used when rotating by Pi/4). */
+static INLINE void od_rot2(od_coeff *p0, od_coeff *p1, od_coeff t, int c0,
+ int q0, int c1, int q1) {
+ *p1 = od_mul(*p0, c0, q0);
+ *p0 = od_mul(t, c1, q1);
+}
+
+/* Rotate by Pi/4 and add. */
+static INLINE void od_rotate_pi4_add(od_coeff *p0, od_coeff *p1, od_coeff t,
+ int c0, int q0, int c1, int q1) {
+ od_rot2(p0, p1, t, c0, q0, c1, q1);
+ *p1 = od_add(*p1, *p0);
+}
+
+/* Rotate by Pi/4 and subtract. */
+static INLINE void od_rotate_pi4_sub(od_coeff *p0, od_coeff *p1, od_coeff t,
+ int c0, int q0, int c1, int q1) {
+ od_rot2(p0, p1, t, c0, q0, c1, q1);
+ *p1 = od_sub(*p1, *p0);
+}
+
+/* Three multiply rotation primitive. */
+static INLINE void od_rot3(od_coeff *p0, od_coeff *p1, od_coeff *t, od_coeff *u,
+ int c0, int q0, int c1, int q1, int c2, int q2) {
+ *u = od_mul(*p0, c0, q0);
+ *p0 = od_mul(*p1, c1, q1);
+ *t = od_mul(*t, c2, q2);
+}
+
+/* Rotate and add. */
+static INLINE void od_rotate_add(od_coeff *p0, od_coeff *p1, od_coeff t,
+ int c0, int q0, int c1, int q1, int c2, int q2, int shift) {
+ od_coeff u;
+ od_rot3(p0, p1, &t, &u, c0, q0, c1, q1, c2, q2);
+ *p0 = od_add(*p0, t);
+ if (shift) t = od_rshift1(t);
+ *p1 = od_add(u, t);
+}
+
+/* Rotate and subtract. */
+static INLINE void od_rotate_sub(od_coeff *p0, od_coeff *p1, od_coeff t,
+ int c0, int q0, int c1, int q1, int c2, int q2, int shift) {
+ od_coeff u;
+ od_rot3(p0, p1, &t, &u, c0, q0, c1, q1, c2, q2);
+ *p0 = od_add(*p0, t);
+ if (shift) t = od_rshift1(t);
+ *p1 = od_sub(u, t);
+}
+
+/* Rotate and subtract with negation. */
+static INLINE void od_rotate_neg(od_coeff *p0, od_coeff *p1, od_coeff t,
+ int c0, int q0, int c1, int q1, int c2, int q2) {
+ od_coeff u;
+ od_rot3(p0, p1, &t, &u, c0, q0, c1, q1, c2, q2);
+ *p0 = od_sub(*p0, t);
+ *p1 = od_sub(t, u);
+}
+
+/* Computes the +/- addition butterfly (asymmetric output).
+ The inverse to this function is od_butterfly_add_asym().
+
+ p0 = p0 + p1;
+ p1 = p1 - p0/2; */
+static INLINE void od_butterfly_add(od_coeff *p0, od_coeff *p0h, od_coeff *p1) {
+ od_coeff p0h_;
+ *p0 = od_add(*p0, *p1);
+ p0h_ = od_rshift1(*p0);
+ *p1 = od_sub(*p1, p0h_);
+ if (p0h != NULL) *p0h = p0h_;
+}
+
+/* Computes the asymmetric +/- addition butterfly (unscaled output).
+ The inverse to this function is od_butterfly_add().
+
+ p1 = p1 + p0/2;
+ p0 = p0 - p1; */
+static INLINE void od_butterfly_add_asym(od_coeff *p0, od_coeff p0h,
+ od_coeff *p1) {
+ *p1 = od_add(*p1, p0h);
+ *p0 = od_sub(*p0, *p1);
+}
+
+/* Computes the +/- subtraction butterfly (asymmetric output).
+ The inverse to this function is od_butterfly_sub_asym().
+
+ p0 = p0 - p1;
+ p1 = p1 + p0/2; */
+static INLINE void od_butterfly_sub(od_coeff *p0, od_coeff *p0h, od_coeff *p1) {
+ od_coeff p0h_;
+ *p0 = od_sub(*p0, *p1);
+ p0h_ = od_rshift1(*p0);
+ *p1 = od_add(*p1, p0h_);
+ if (p0h != NULL) *p0h = p0h_;
+}
+
+/* Computes the asymmetric +/- subtraction butterfly (unscaled output).
+ The inverse to this function is od_butterfly_sub().
+
+ p1 = p1 - p0/2;
+ p0 = p0 + p1; */
+static INLINE void od_butterfly_sub_asym(od_coeff *p0, od_coeff p0h,
+ od_coeff *p1) {
+ *p1 = od_sub(*p1, p0h);
+ *p0 = od_add(*p0, *p1);
+}
+
+/* Computes the +/- subtract and negate butterfly (asymmetric output).
+ The inverse to this function is od_butterfly_neg_asym().
+
+ p1 = p1 - p0;
+ p0 = p0 + p1/2;
+ p1 = -p1; */
+static INLINE void od_butterfly_neg(od_coeff *p0, od_coeff *p1, od_coeff *p1h) {
+ *p1 = od_sub(*p0, *p1);
+ *p1h = od_rshift1(*p1);
+ *p0 = od_sub(*p0, *p1h);
+}
+
+/* Computes the asymmetric +/- negate and subtract butterfly (unscaled output).
+ The inverse to this function is od_butterfly_neg().
+
+ p1 = -p1;
+ p0 = p0 - p1/2;
+ p1 = p1 + p0; */
+static INLINE void od_butterfly_neg_asym(od_coeff *p0, od_coeff *p1,
+ od_coeff p1h) {
+ *p0 = od_add(*p0, p1h);
+ *p1 = od_sub(*p0, *p1);
+}
+
+/* --- 2-point Transforms --- */
+
+/**
+ * 2-point orthonormal Type-II fDCT
+ */
+static INLINE void od_fdct_2(od_coeff *p0, od_coeff *p1) {
+ /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
+ /* 11585/8192 = 2*Cos[Pi/4] = 1.4142135623730951 */
+ od_rotate_pi4_add(p1, p0, od_avg_sub(*p0, *p1), 11585, 13, 11585, 13);
+}
+
+/**
+ * 2-point orthonormal Type-II iDCT
+ */
+static INLINE void od_idct_2(od_coeff *p0, od_coeff *p1) {
+ /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
+ /* 11585/16384 = Cos[Pi/4] = 0.7071067811865475 */
+ od_rotate_pi4_sub(p0, p1, od_add(*p1, *p0), 11585, 13, 11585, 14);
+}
+
+/**
+ * 2-point asymmetric Type-II fDCT
+ */
+static INLINE void od_fdct_2_asym(od_coeff *p0, od_coeff *p1,
+ od_coeff p1h) {
+ od_butterfly_neg_asym(p0, p1, p1h);
+}
+
+/**
+ * 2-point asymmetric Type-II iDCT
+ */
+static INLINE void od_idct_2_asym(od_coeff *p0, od_coeff *p1, od_coeff *p1h) {
+ od_butterfly_neg(p0, p1, p1h);
+}
+
+/**
+ * 2-point orthonormal Type-IV fDST
+ */
+static INLINE void od_fdst_2(od_coeff *p0, od_coeff *p1) {
+
+ /* Stage 0 */
+
+ /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
+ /* 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461971 */
+ /* 3135/4096 = 2*Cos[3*Pi/8] = 0.7653668647301796 */
+ od_rotate_sub(p0, p1, od_avg_add(*p0, *p1), 21407, 14, 8867, 14, 3135, 12, 0);
+}
+
+/**
+ * 2-point orthonormal Type-IV iDST
+ */
+static INLINE void od_idst_2(od_coeff *p0, od_coeff *p1) {
+ od_fdst_2(p0, p1);
+}
+
+/**
+ * 2-point asymmetric Type-IV fDST
+ */
+static INLINE void od_fdst_2_asym(od_coeff *p0, od_coeff p0h,
+ od_coeff *p1) {
+
+ /* Stage 0 */
+
+ /* 15137/16384 = (Sin[3*Pi/8] + Cos[3*Pi/8])/Sqrt[2] = 0.9238795325112867 */
+ /* 3135/4096 = (Sin[3*Pi/8] - Cos[3*Pi/8])*Sqrt[2] = 0.7653668647301795 */
+ /* 8867/16384 = Cos[3*Pi/8]*Sqrt[2] = 0.5411961001461971 */
+ od_rotate_sub(p0, p1, od_add(p0h, *p1), 15137, 14, 3135, 12, 8867, 14, 0);
+}
+
+/**
+ * 2-point asymmetric Type-IV iDST
+ */
+static INLINE void od_idst_2_asym(od_coeff *p0, od_coeff *p1) {
+
+ /* Stage 0 */
+
+ /* 15137/16384 = (Sin[3*Pi/8] + Cos[3*Pi/8])/Sqrt[2] = 0.9238795325112867 */
+ /* 3135/4096 = (Sin[3*Pi/8] - Cos[3*Pi/8])*Sqrt[2] = 0.7653668647301795 */
+ /* 8867/8192 = 2*Cos[3*Pi/8]*Sqrt[2] = 1.0823922002923940 */
+ od_rotate_sub(p0, p1, od_avg_add(*p1, *p0), 15137, 14, 3135, 12, 8867, 13, 1);
+}
+
+/* --- 4-point Transforms --- */
+
+/**
+ * 4-point orthonormal Type-II fDCT
+ */
+static INLINE void od_fdct_4(od_coeff *q0, od_coeff *q1, od_coeff *q2,
+ od_coeff *q3) {
+ od_coeff q1h;
+ od_coeff q3h;
+
+ /* +/- Butterflies with asymmetric output. */
+ od_butterfly_neg(q0, q3, &q3h);
+ od_butterfly_add(q1, &q1h, q2);
+
+ /* Embedded 2-point transforms with asymmetric input. */
+ od_fdct_2_asym(q0, q1, q1h);
+ od_fdst_2_asym(q3, q3h, q2);
+}
+
+/**
+ * 4-point orthonormal Type-II iDCT
+ */
+static INLINE void od_idct_4(od_coeff *q0, od_coeff *q2,
+ od_coeff *q1, od_coeff *q3) {
+ od_coeff q1h;
+
+ /* Embedded 2-point transforms with asymmetric output. */
+ od_idst_2_asym(q3, q2);
+ od_idct_2_asym(q0, q1, &q1h);
+
+ /* +/- Butterflies with asymmetric input. */
+ od_butterfly_add_asym(q1, q1h, q2);
+ od_butterfly_neg_asym(q0, q3, od_rshift1(*q3));
+}
+
+/**
+ * 4-point asymmetric Type-II fDCT
+ */
+static INLINE void od_fdct_4_asym(od_coeff *q0, od_coeff *q1, od_coeff q1h,
+ od_coeff *q2, od_coeff *q3, od_coeff q3h) {
+
+ /* +/- Butterflies with asymmetric input. */
+ od_butterfly_neg_asym(q0, q3, q3h);
+ od_butterfly_sub_asym(q1, q1h, q2);
+
+ /* Embedded 2-point orthonormal transforms. */
+ od_fdct_2(q0, q1);
+ od_fdst_2(q3, q2);
+}
+
+/**
+ * 4-point asymmetric Type-II iDCT
+ */
+static INLINE void od_idct_4_asym(od_coeff *q0, od_coeff *q2,
+ od_coeff *q1, od_coeff *q1h,
+ od_coeff *q3, od_coeff *q3h) {
+
+ /* Embedded 2-point orthonormal transforms. */
+ od_idst_2(q3, q2);
+ od_idct_2(q0, q1);
+
+ /* +/- Butterflies with asymmetric output. */
+ od_butterfly_sub(q1, q1h, q2);
+ od_butterfly_neg(q0, q3, q3h);
+}
+
+/**
+ * 4-point orthonormal Type-IV fDST
+ */
+static INLINE void od_fdst_4(od_coeff *q0, od_coeff *q1,
+ od_coeff *q2, od_coeff *q3) {
+
+ /* Stage 0 */
+
+ /* 13623/16384 = (Sin[7*Pi/16] + Cos[7*Pi/16])/Sqrt[2] = 0.831469612302545 */
+ /* 18205/16384 = (Sin[7*Pi/16] - Cos[7*Pi/16])*Sqrt[2] = 1.111140466039204 */
+ /* 9041/32768 = Cos[7*Pi/16]*Sqrt[2] = 0.275899379282943 */
+ od_rotate_sub(q0, q3, od_add(*q3, *q0), 13623, 14, 18205, 14, 9041, 15, 1);
+
+ /* 16069/16384 = (Sin[5*Pi/16] + Cos[5*Pi/16])/Sqrt[2] = 0.9807852804032304 */
+ /* 12785/32768 = (Sin[5*Pi/16] - Cos[5*Pi/16])*Sqrt[2] = 0.3901806440322566 */
+ /* 12873/16384 = Cos[5*Pi/16]*Sqrt[2] = 0.7856949583871021 */
+ od_rotate_add(q2, q1, od_sub(*q1, *q2), 16069, 14, 12785, 15, 12873, 14, 1);
+
+ /* Stage 1 */
+
+ od_butterfly_sub_asym(q0, od_rshift1(*q0), q1);
+ od_butterfly_sub_asym(q2, od_rshift1(*q2), q3);
+
+ /* Stage 2 */
+
+ /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
+ /* 11585/8192 = 2*Cos[Pi/4] = 1.4142135623730951 */
+ od_rotate_pi4_sub(q2, q1, od_avg_add(*q1, *q2), 11585, 13, 11585, 13);
+}
+
+/**
+ * 4-point orthonormal Type-IV iDST
+ */
+static INLINE void od_idst_4(od_coeff *q0, od_coeff *q2,
+ od_coeff *q1, od_coeff *q3) {
+ od_coeff q0h;
+ od_coeff q2h;
+
+ /* Stage 0 */
+
+ /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
+ /* 11585/8192 = 2*Cos[Pi/4] = 1.4142135623730951 */
+ od_rotate_pi4_sub(q2, q1, od_avg_add(*q1, *q2), 11585, 13, 11585, 13);
+
+ /* Stage 1 */
+
+ od_butterfly_sub(q2, &q2h, q3);
+ od_butterfly_sub(q0, &q0h, q1);
+
+ /* Stage 2 */
+
+ /* 16069/16384 = (Sin[5*Pi/16] + Cos[5*Pi/16])/Sqrt[2] = 0.9807852804032304 */
+ /* 12785/32768 = (Sin[5*Pi/16] - Cos[5*Pi/16])*Sqrt[2] = 0.3901806440322566 */
+ /* 12873/16384 = Cos[5*Pi/16]*Sqrt[2] = 0.7856949583871021 */
+ od_rotate_add(q2, q1, od_sub(*q1, q2h), 16069, 14, 12785, 15, 12873, 14, 0);
+
+ /* 13623/16384 = (Sin[7*Pi/16] + Cos[7*Pi/16])/Sqrt[2] = 0.831469612302545 */
+ /* 18205/16384 = (Sin[7*Pi/16] - Cos[7*Pi/16])*Sqrt[2] = 1.111140466039204 */
+ /* 9041/32768 = Cos[7*Pi/16]*Sqrt[2] = 0.275899379282943 */
+ od_rotate_sub(q0, q3, od_add(q0h, *q3), 13623, 14, 18205, 14, 9041, 15, 0);
+}
+
+/**
+ * 4-point asymmetric Type-IV fDST
+ */
+static INLINE void od_fdst_4_asym(od_coeff *q0, od_coeff q0h, od_coeff *q1,
+ od_coeff *q2, od_coeff q2h, od_coeff *q3) {
+
+ /* Stage 0 */
+
+ /* 9633/16384 = (Sin[7*Pi/16] + Cos[7*Pi/16])/2 = 0.5879378012096793 */
+ /* 12873/8192 = (Sin[7*Pi/16] - Cos[7*Pi/16])*2 = 1.5713899167742045 */
+ /* 12785/32768 = Cos[7*Pi/16]*2 = 0.3901806440322565 */
+ od_rotate_sub(q0, q3, od_add(q0h, *q3), 9633, 14, 12873, 13, 12785, 15, 1);
+
+ /* 22725/32768 = (Sin[5*Pi/16] + Cos[5*Pi/16])/2 = 0.6935199226610738 */
+ /* 18081/32768 = (Sin[5*Pi/16] - Cos[5*Pi/16])*2 = 0.5517987585658861 */
+ /* 18205/16384 = Cos[5*Pi/16]*2 = 1.1111404660392044 */
+ od_rotate_add(q2, q1, od_sub(*q1, q2h), 22725, 15, 18081, 15, 18205, 14, 1);
+
+ /* Stage 1 */
+
+ od_butterfly_sub_asym(q0, od_rshift1(*q0), q1);
+ od_butterfly_sub_asym(q2, od_rshift1(*q2), q3);
+
+ /* Stage 2 */
+
+ /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
+ /* 11585/8192 = 2*Cos[Pi/4] = 1.4142135623730951 */
+ od_rotate_pi4_sub(q2, q1, od_avg_add(*q1, *q2), 11585, 13, 11585, 13);
+}
+
+/**
+ * 4-point asymmetric Type-IV iDST
+ */
+static INLINE void od_idst_4_asym(od_coeff *q0, od_coeff *q2,
+ od_coeff *q1, od_coeff *q3) {
+ od_coeff q0h;
+ od_coeff q2h;
+
+ /* Stage 0 */
+
+ /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
+ /* 11585/8192 = 2*Cos[Pi/4] = 1.4142135623730951 */
+ od_rotate_pi4_sub(q2, q1, od_avg_add(*q1, *q2), 11585, 13, 11585, 13);
+
+ /* Stage 1 */
+
+ od_butterfly_sub(q2, &q2h, q3);
+ od_butterfly_sub(q0, &q0h, q1);
+
+ /* Stage 2 */
+
+ /* 22725/32768 = (Sin[5*Pi/16] + Cos[5*Pi/16])/2 = 0.6935199226610738 */
+ /* 18081/32768 = (Sin[5*Pi/16] - Cos[5*Pi/16])*2 = 0.5517987585658861 */
+ /* 18205/16384 = Cos[5*Pi/16]*2 = 1.1111404660392044 */
+ od_rotate_add(q2, q1, od_sub(*q1, q2h), 22725, 15, 18081, 15, 18205, 14, 1);
+
+ /* 9633/16384 = (Sin[7*Pi/16] + Cos[7*Pi/16])/2 = 0.5879378012096793 */
+ /* 12873/8192 = (Sin[7*Pi/16] - Cos[7*Pi/16])*2 = 1.5713899167742045 */
+ /* 12785/32768 = Cos[7*Pi/16]*2 = 0.3901806440322565 */
+ od_rotate_sub(q0, q3, od_add(q0h, *q3), 9633, 14, 12873, 13, 12785, 15, 1);
+}
+
+/* --- 8-point Transforms --- */
+
+/**
+ * 8-point orthonormal Type-II fDCT
+ */
+static INLINE void od_fdct_8(od_coeff *r0, od_coeff *r1,
+ od_coeff *r2, od_coeff *r3,
+ od_coeff *r4, od_coeff *r5,
+ od_coeff *r6, od_coeff *r7) {
+ od_coeff r1h;
+ od_coeff r3h;
+ od_coeff r5h;
+ od_coeff r7h;
+
+ /* +/- Butterflies with asymmetric output. */
+ od_butterfly_neg(r0, r7, &r7h);
+ od_butterfly_add(r1, &r1h, r6);
+ od_butterfly_neg(r2, r5, &r5h);
+ od_butterfly_add(r3, &r3h, r4);
+
+ /* Embedded 4-point forward transforms with asymmetric input. */
+ od_fdct_4_asym(r0, r1, r1h, r2, r3, r3h);
+ od_fdst_4_asym(r7, r7h, r6, r5, r5h, r4);
+}
+
+/**
+ * 8-point orthonormal Type-II iDCT
+ */
+static INLINE void od_idct_8(od_coeff *r0, od_coeff *r4,
+ od_coeff *r2, od_coeff *r6,
+ od_coeff *r1, od_coeff *r5,
+ od_coeff *r3, od_coeff *r7) {
+ od_coeff r1h;
+ od_coeff r3h;
+
+ /* Embedded 4-point inverse transforms with asymmetric output. */
+ od_idst_4_asym(r7, r5, r6, r4);
+ od_idct_4_asym(r0, r2, r1, &r1h, r3, &r3h);
+
+ /* +/- Butterflies with asymmetric input. */
+ od_butterfly_add_asym(r3, r3h, r4);
+ od_butterfly_neg_asym(r2, r5, od_rshift1(*r5));
+ od_butterfly_add_asym(r1, r1h, r6);
+ od_butterfly_neg_asym(r0, r7, od_rshift1(*r7));
+}
+
+/**
+ * 8-point asymmetric Type-II fDCT
+ */
+static INLINE void od_fdct_8_asym(od_coeff *r0, od_coeff *r1, od_coeff r1h,
+ od_coeff *r2, od_coeff *r3, od_coeff r3h,
+ od_coeff *r4, od_coeff *r5, od_coeff r5h,
+ od_coeff *r6, od_coeff *r7, od_coeff r7h) {
+
+ /* +/- Butterflies with asymmetric input. */
+ od_butterfly_neg_asym(r0, r7, r7h);
+ od_butterfly_sub_asym(r1, r1h, r6);
+ od_butterfly_neg_asym(r2, r5, r5h);
+ od_butterfly_sub_asym(r3, r3h, r4);
+
+ /* Embedded 4-point orthonormal transforms. */
+ od_fdct_4(r0, r1, r2, r3);
+ od_fdst_4(r7, r6, r5, r4);
+}
+
+/**
+ * 8-point asymmetric Type-II iDCT
+ */
+static INLINE void od_idct_8_asym(od_coeff *r0, od_coeff *r4,
+ od_coeff *r2, od_coeff *r6,
+ od_coeff *r1, od_coeff *r1h,
+ od_coeff *r5, od_coeff *r5h,
+ od_coeff *r3, od_coeff *r3h,
+ od_coeff *r7, od_coeff *r7h) {
+
+ /* Embedded 4-point inverse orthonormal transforms. */
+ od_idst_4(r7, r5, r6, r4);
+ od_idct_4(r0, r2, r1, r3);
+
+ /* +/- Butterflies with asymmetric output. */
+ od_butterfly_sub(r3, r3h, r4);
+ od_butterfly_neg(r2, r5, r5h);
+ od_butterfly_sub(r1, r1h, r6);
+ od_butterfly_neg(r0, r7, r7h);
+}
+
+/**
+ * 8-point orthonormal Type-IV fDST
+ */
+static INLINE void od_fdst_8(od_coeff *r0, od_coeff *r1,
+ od_coeff *r2, od_coeff *r3,
+ od_coeff *r4, od_coeff *r5,
+ od_coeff *r6, od_coeff *r7) {
+ od_coeff r0h;
+ od_coeff r2h;
+ od_coeff r5h;
+ od_coeff r7h;
+
+ /* Stage 0 */
+
+ /* 17911/16384 = Sin[15*Pi/32] + Cos[15*Pi/32] = 1.0932018670017576 */
+ /* 14699/16384 = Sin[15*Pi/32] - Cos[15*Pi/32] = 0.8971675863426363 */
+ /* 803/8192 = Cos[15*Pi/32] = 0.0980171403295606 */
+ od_rotate_sub(r0, r7, od_add(*r7, *r0), 17911, 14, 14699, 14, 803, 13, 0);
+
+ /* 40869/32768 = Sin[13*Pi/32] + Cos[13*Pi/32] = 1.24722501298667123 */
+ /* 21845/32768 = Sin[13*Pi/32] - Cos[13*Pi/32] = 0.66665565847774650 */
+ /* 1189/4096 = Cos[13*Pi/32] = 0.29028467725446233 */
+ od_rotate_add(r6, r1, od_sub(*r1, *r6), 40869, 15, 21845, 15, 1189, 12, 0);
+
+ /* 22173/16384 = Sin[11*Pi/32] + Cos[11*Pi/32] = 1.3533180011743526 */
+ /* 3363/8192 = Sin[11*Pi/32] - Cos[11*Pi/32] = 0.4105245275223574 */
+ /* 15447/32768 = Cos[11*Pi/32] = 0.47139673682599764 */
+ od_rotate_sub(r2, r5, od_add(*r5, *r2), 22173, 14, 3363, 13, 15447, 15, 0);
+
+ /* 23059/16384 = Sin[9*Pi/32] + Cos[9*Pi/32] = 1.4074037375263826 */
+ /* 2271/16384 = Sin[9*Pi/32] - Cos[9*Pi/32] = 0.1386171691990915 */
+ /* 5197/8192 = Cos[9*Pi/32] = 0.6343932841636455 */
+ od_rotate_add(r4, r3, od_sub(*r3, *r4), 23059, 14, 2271, 14, 5197, 13, 0);
+
+ /* Stage 1 */
+
+ od_butterfly_add(r0, &r0h, r3);
+ od_butterfly_sub(r2, &r2h, r1);
+ od_butterfly_add(r5, &r5h, r6);
+ od_butterfly_sub(r7, &r7h, r4);
+
+ /* Stage 2 */
+
+ od_butterfly_add_asym(r7, r7h, r6);
+ od_butterfly_add_asym(r5, r5h, r3);
+ od_butterfly_add_asym(r2, r2h, r4);
+ od_butterfly_sub_asym(r0, r0h, r1);
+
+ /* Stage 3 */
+
+ /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
+ /* 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
+ /* 3135/4096 = 2*Cos[3*Pi/8] = 0.7653668647301796 */
+ od_rotate_add(r3, r4, od_avg_sub(*r4, *r3), 21407, 14, 8867, 14, 3135, 12, 0);
+
+ /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
+ /* 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
+ /* 3135/4096 = 2*Cos[3*Pi/8] = 0.7653668647301796 */
+ od_rotate_neg(r2, r5, od_avg_sub(*r2, *r5), 21407, 14, 8867, 14, 3135, 12);
+
+ /* 46341/32768 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
+ /* 46341/32768 = 2*Cos[Pi/4] = 1.4142135623730951 */
+ od_rotate_pi4_add(r1, r6, od_avg_sub(*r6, *r1), 46341, 15, 46341, 15);
+}
+
+/**
+ * 8-point orthonormal Type-IV iDST
+ */
+static INLINE void od_idst_8(od_coeff *r0, od_coeff *r4,
+ od_coeff *r2, od_coeff *r6,
+ od_coeff *r1, od_coeff *r5,
+ od_coeff *r3, od_coeff *r7) {
+ od_coeff r0h;
+ od_coeff r2h;
+ od_coeff r5h;
+ od_coeff r7h;
+
+ /* Stage 3 */
+
+ /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
+ /* 46341/32768 = 2*Cos[Pi/4] = 1.4142135623730951 */
+ od_rotate_pi4_sub(r6, r1, od_avg_add(*r1, *r6), 11585, 13, 46341, 15);
+
+ /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
+ /* 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
+ /* 3135/4096 = 2*Cos[3*Pi/8] = 0.7653668647301796 */
+ od_rotate_neg(r5, r2, od_avg_sub(*r5, *r2), 21407, 14, 8867, 14, 3135, 12);
+
+ /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
+ /* 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
+ /* 3135/4096 = 2*Cos[3*Pi/8] = 0.7653668647301796 */
+ od_rotate_sub(r4, r3, od_avg_add(*r3, *r4), 21407, 14, 8867, 14, 3135, 12, 0);
+
+ /* Stage 2 */
+
+ od_butterfly_sub(r0, &r0h, r1);
+ od_butterfly_add(r2, &r2h, r4);
+ od_butterfly_add(r5, &r5h, r3);
+ od_butterfly_add(r7, &r7h, r6);
+
+ /* Stage 1 */
+
+ od_butterfly_sub_asym(r7, r7h, r4);
+ od_butterfly_add_asym(r5, r5h, r6);
+ od_butterfly_sub_asym(r2, r2h, r1);
+ od_butterfly_add_asym(r0, r0h, r3);
+
+ /* Stage 0 */
+
+ /* 23059/16384 = Sin[9*Pi/32] + Cos[9*Pi/32] = 1.4074037375263826 */
+ /* 2271/16384 = Sin[9*Pi/32] - Cos[9*Pi/32] = 0.1386171691990915 */
+ /* 5197/8192 = Cos[9*Pi/32] = 0.6343932841636455 */
+ od_rotate_add(r4, r3, od_sub(*r3, *r4), 23059, 14, 2271, 14, 5197, 13, 0);
+
+ /* 22173/16384 = Sin[11*Pi/32] + Cos[11*Pi/32] = 1.3533180011743526 */
+ /* 3363/8192 = Sin[11*Pi/32] - Cos[11*Pi/32] = 0.4105245275223574 */
+ /* 15447/32768 = Cos[11*Pi/32] = 0.47139673682599764 */
+ od_rotate_sub(r2, r5, od_add(*r5, *r2), 22173, 14, 3363, 13, 15447, 15, 0);
+
+ /* 40869/32768 = Sin[13*Pi/32] + Cos[13*Pi/32] = 1.24722501298667123 */
+ /* 21845/32768 = Sin[13*Pi/32] - Cos[13*Pi/32] = 0.66665565847774650 */
+ /* 1189/4096 = Cos[13*Pi/32] = 0.29028467725446233 */
+ od_rotate_add(r6, r1, od_sub(*r1, *r6), 40869, 15, 21845, 15, 1189, 12, 0);
+
+ /* 17911/16384 = Sin[15*Pi/32] + Cos[15*Pi/32] = 1.0932018670017576 */
+ /* 14699/16384 = Sin[15*Pi/32] - Cos[15*Pi/32] = 0.8971675863426363 */
+ /* 803/8192 = Cos[15*Pi/32] = 0.0980171403295606 */
+ od_rotate_sub(r0, r7, od_add(*r7, *r0), 17911, 14, 14699, 14, 803, 13, 0);
+}
+
+/**
+ * 8-point asymmetric Type-IV fDST
+ */
+static INLINE void od_fdst_8_asym(od_coeff *r0, od_coeff r0h, od_coeff *r1,
+ od_coeff *r2, od_coeff r2h, od_coeff *r3,
+ od_coeff *r4, od_coeff r4h, od_coeff *r5,
+ od_coeff *r6, od_coeff r6h, od_coeff *r7) {
+ od_coeff r5h;
+ od_coeff r7h;
+
+ /* Stage 0 */
+
+ /* 12665/16384 = (Sin[15*Pi/32] + Cos[15*Pi/32])/Sqrt[2] = 0.77301045336274 */
+ /* 5197/4096 = (Sin[15*Pi/32] - Cos[15*Pi/32])*Sqrt[2] = 1.26878656832729 */
+ /* 2271/16384 = Cos[15*Pi/32]*Sqrt[2] = 0.13861716919909 */
+ od_rotate_sub(r0, r7, od_add(*r7, r0h), 12665, 14, 5197, 12, 2271, 14, 0);
+
+ /* 28899/32768 = (Sin[13*Pi/32] + Cos[13*Pi/32])/Sqrt[2] = 0.881921264348355 */
+ /* 30893/32768 = (Sin[13*Pi/32] - Cos[13*Pi/32])*Sqrt[2] = 0.942793473651995 */
+ /* 3363/8192 = Cos[13*Pi/32]*Sqrt[2] = 0.410524527522357 */
+ od_rotate_add(r6, r1, od_sub(*r1, r6h), 28899, 15, 30893, 15, 3363, 13, 0);
+
+ /* 31357/32768 = (Sin[11*Pi/32] + Cos[11*Pi/32])/Sqrt[2] = 0.956940335732209 */
+ /* 1189/2048 = (Sin[11*Pi/32] - Cos[11*Pi/32])*Sqrt[2] = 0.580569354508925 */
+ /* 21845/32768 = Cos[11*Pi/32]*Sqrt[2] = 0.666655658477747 */
+ od_rotate_sub(r2, r5, od_add(*r5, r2h), 31357, 15, 1189, 11, 21845, 15, 0);
+
+ /* 16305/16384 = (Sin[9*Pi/32] + Cos[9*Pi/32])/Sqrt[2] = 0.9951847266721969 */
+ /* 803/4096 = (Sin[9*Pi/32] - Cos[9*Pi/32])*Sqrt[2] = 0.1960342806591213 */
+ /* 14699/16384 = Cos[9*Pi/32]*Sqrt[2] = 0.8971675863426364 */
+ od_rotate_add(r4, r3, od_sub(*r3, r4h), 16305, 14, 803, 12, 14699, 14, 0);
+
+ /* Stage 1 */
+
+ od_butterfly_add(r0, &r0h, r3);
+ od_butterfly_sub(r2, &r2h, r1);
+ od_butterfly_add(r5, &r5h, r6);
+ od_butterfly_sub(r7, &r7h, r4);
+
+ /* Stage 2 */
+
+ od_butterfly_add_asym(r7, r7h, r6);
+ od_butterfly_add_asym(r5, r5h, r3);
+ od_butterfly_add_asym(r2, r2h, r4);
+ od_butterfly_sub_asym(r0, r0h, r1);
+
+ /* Stage 3 */
+
+ /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
+ /* 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
+ /* 3135/4096 = 2*Cos[3*Pi/8] = 0.7653668647301796 */
+ od_rotate_add(r3, r4, od_avg_sub(*r4, *r3), 21407, 14, 8867, 14, 3135, 12, 0);
+
+ /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
+ /* 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
+ /* 3135/4096 = 2*Cos[3*Pi/8] = 0.7653668647301796 */
+ od_rotate_neg(r2, r5, od_avg_sub(*r2, *r5), 21407, 14, 8867, 14, 3135, 12);
+
+ /* 46341/32768 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
+ /* 46341/32768 = 2*Cos[Pi/4] = 1.4142135623730951 */
+ od_rotate_pi4_add(r1, r6, od_avg_sub(*r6, *r1), 46341, 15, 46341, 15);
+}
+
+/**
+ * 8-point asymmetric Type-IV iDST
+ */
+static INLINE void od_idst_8_asym(od_coeff *r0, od_coeff *r4,
+ od_coeff *r2, od_coeff *r6,
+ od_coeff *r1, od_coeff *r5,
+ od_coeff *r3, od_coeff *r7) {
+ od_coeff r0h;
+ od_coeff r2h;
+ od_coeff r5h;
+ od_coeff r7h;
+
+ /* Stage 3 */
+
+ /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
+ /* 11585/8192 = 2*Cos[Pi/4] = 1.4142135623730951 */
+ od_rotate_pi4_sub(r6, r1, od_avg_add(*r1, *r6), 11585, 13, 11585, 13);
+
+ /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
+ /* 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
+ /* 3135/4096 = 2*Cos[3*Pi/8] = 0.7653668647301796 */
+ od_rotate_neg(r5, r2, od_avg_sub(*r5, *r2), 21407, 14, 8867, 14, 3135, 12);
+
+ /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
+ /* 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
+ /* 3135/4096 = 2*Cos[3*Pi/8] = 0.7653668647301796 */
+ od_rotate_sub(r4, r3, od_avg_add(*r3, *r4), 21407, 14, 8867, 14, 3135, 12, 0);
+
+ /* Stage 2 */
+
+ od_butterfly_sub(r0, &r0h, r1);
+ od_butterfly_add(r2, &r2h, r4);
+ od_butterfly_add(r5, &r5h, r3);
+ od_butterfly_add(r7, &r7h, r6);
+
+ /* Stage 1 */
+
+ od_butterfly_sub_asym(r7, r7h, r4);
+ od_butterfly_add_asym(r5, r5h, r6);
+ od_butterfly_sub_asym(r2, r2h, r1);
+ od_butterfly_add_asym(r0, r0h, r3);
+
+ /* Stage 0 */
+
+ /* 16305/16384 = (Sin[9*Pi/32] + Cos[9*Pi/32])/Sqrt[2] = 0.9951847266721969 */
+ /* 803/4096 = (Sin[9*Pi/32] - Cos[9*Pi/32])*Sqrt[2] = 0.1960342806591213 */
+ /* 14699/16384 = Cos[9*Pi/32]*Sqrt[2] = 0.8971675863426364 */
+ od_rotate_add(r4, r3, od_sub(*r3, *r4), 16305, 14, 803, 12, 14699, 14, 1);
+
+ /* 31357/32768 = (Sin[11*Pi/32] + Cos[11*Pi/32])/Sqrt[2] = 0.956940335732209 */
+ /* 1189/2048 = (Sin[11*Pi/32] - Cos[11*Pi/32])*Sqrt[2] = 0.580569354508925 */
+ /* 21845/32768 = Cos[11*Pi/32]*Sqrt[2] = 0.666655658477747 */
+ od_rotate_sub(r2, r5, od_add(*r5, *r2), 31357, 15, 1189, 11, 21845, 15, 1);
+
+ /* 28899/32768 = (Sin[13*Pi/32] + Cos[13*Pi/32])/Sqrt[2] = 0.881921264348355 */
+ /* 30893/32768 = (Sin[13*Pi/32] - Cos[13*Pi/32])*Sqrt[2] = 0.942793473651995 */
+ /* 3363/8192 = Cos[13*Pi/32]*Sqrt[2] = 0.410524527522357 */
+ od_rotate_add(r6, r1, od_sub(*r1, *r6), 28899, 15, 30893, 15, 3363, 13, 1);
+
+ /* 12665/16384 = (Sin[15*Pi/32] + Cos[15*Pi/32])/Sqrt[2] = 0.77301045336274 */
+ /* 5197/4096 = (Sin[15*Pi/32] - Cos[15*Pi/32])*Sqrt[2] = 1.26878656832729 */
+ /* 2271/16384 = Cos[15*Pi/32]*Sqrt[2] = 0.13861716919909 */
+ od_rotate_sub(r0, r7, od_add(*r7, *r0), 12665, 14, 5197, 12, 2271, 14, 1);
+}
+
+/* --- 16-point Transforms --- */
+
+/**
+ * Forward 16-point Type-II DCT with orthonormal scaling.
+ */
+static INLINE void od_fdct_16(od_coeff *s0, od_coeff *s1,
+ od_coeff *s2, od_coeff *s3,
+ od_coeff *s4, od_coeff *s5,
+ od_coeff *s6, od_coeff *s7,
+ od_coeff *s8, od_coeff *s9,
+ od_coeff *sa, od_coeff *sb,
+ od_coeff *sc, od_coeff *sd,
+ od_coeff *se, od_coeff *sf) {
+ od_coeff auxf;
+ od_coeff auxd;
+ od_coeff auxb;
+ od_coeff aux9;
+ od_coeff aux7;
+ od_coeff aux5;
+ od_coeff aux3;
+ od_coeff aux1;
+
+ /* Butterfly each sN against s(15-N); the aux terms feed the kernels below. */
+ od_butterfly_add(s7, &aux7, s8);
+ od_butterfly_neg(s6, s9, &aux9);
+ od_butterfly_add(s5, &aux5, sa);
+ od_butterfly_neg(s4, sb, &auxb);
+ od_butterfly_add(s3, &aux3, sc);
+ od_butterfly_neg(s2, sd, &auxd);
+ od_butterfly_add(s1, &aux1, se);
+ od_butterfly_neg(s0, sf, &auxf);
+
+ /* 8-point asymmetric-input kernels: fDST on sf..s8, fDCT on s0..s7. */
+ od_fdst_8_asym(sf, auxf, se, sd, auxd, sc, sb, auxb, sa, s9, aux9, s8);
+ od_fdct_8_asym(s0, s1, aux1, s2, s3, aux3, s4, s5, aux5, s6, s7, aux7);
+}
+
+/**
+ * Inverse 16-point Type-II DCT with orthonormal scaling.
+ */
+static INLINE void od_idct_16(od_coeff *s0, od_coeff *s8,
+ od_coeff *s4, od_coeff *sc,
+ od_coeff *s2, od_coeff *sa,
+ od_coeff *s6, od_coeff *se,
+ od_coeff *s1, od_coeff *s9,
+ od_coeff *s5, od_coeff *sd,
+ od_coeff *s3, od_coeff *sb,
+ od_coeff *s7, od_coeff *sf) {
+ od_coeff aux7;
+ od_coeff aux5;
+ od_coeff aux3;
+ od_coeff aux1;
+
+ /* 8-point asymmetric-output kernels: iDCT on s0..s7, iDST on s8..sf. */
+ od_idct_8_asym(s0, s4, s2, s6, s1, &aux1, s5, &aux5, s3, &aux3, s7, &aux7);
+ od_idst_8_asym(sf, sb, sd, s9, se, sa, sc, s8);
+
+ /* Recombine sN with s(15-N) using the aux terms or od_rshift1() values. */
+ od_butterfly_neg_asym(s0, sf, od_rshift1(*sf));
+ od_butterfly_add_asym(s1, aux1, se);
+ od_butterfly_neg_asym(s2, sd, od_rshift1(*sd));
+ od_butterfly_add_asym(s3, aux3, sc);
+ od_butterfly_neg_asym(s4, sb, od_rshift1(*sb));
+ od_butterfly_add_asym(s5, aux5, sa);
+ od_butterfly_neg_asym(s6, s9, od_rshift1(*s9));
+ od_butterfly_add_asym(s7, aux7, s8);
+}
+
+/**
+ * Forward 16-point Type-II DCT, asymmetric variant (takes the *h inputs).
+ */
+static INLINE void od_fdct_16_asym(od_coeff *s0, od_coeff *s1, od_coeff s1h,
+ od_coeff *s2, od_coeff *s3, od_coeff s3h,
+ od_coeff *s4, od_coeff *s5, od_coeff s5h,
+ od_coeff *s6, od_coeff *s7, od_coeff s7h,
+ od_coeff *s8, od_coeff *s9, od_coeff s9h,
+ od_coeff *sa, od_coeff *sb, od_coeff sbh,
+ od_coeff *sc, od_coeff *sd, od_coeff sdh,
+ od_coeff *se, od_coeff *sf, od_coeff sfh) {
+
+ /* Butterfly each sN against s(15-N), consuming the caller-supplied h terms. */
+ od_butterfly_sub_asym(s7, s7h, s8);
+ od_butterfly_neg_asym(s6, s9, s9h);
+ od_butterfly_sub_asym(s5, s5h, sa);
+ od_butterfly_neg_asym(s4, sb, sbh);
+ od_butterfly_sub_asym(s3, s3h, sc);
+ od_butterfly_neg_asym(s2, sd, sdh);
+ od_butterfly_sub_asym(s1, s1h, se);
+ od_butterfly_neg_asym(s0, sf, sfh);
+
+ /* 8-point orthonormal kernels: fDST on sf..s8, fDCT on s0..s7. */
+ od_fdst_8(sf, se, sd, sc, sb, sa, s9, s8);
+ od_fdct_8(s0, s1, s2, s3, s4, s5, s6, s7);
+}
+
+/**
+ * Inverse 16-point Type-II DCT, asymmetric variant (fills the *h outputs).
+ */
+static INLINE void od_idct_16_asym(od_coeff *s0, od_coeff *s8,
+ od_coeff *s4, od_coeff *sc,
+ od_coeff *s2, od_coeff *sa,
+ od_coeff *s6, od_coeff *se,
+ od_coeff *s1, od_coeff *s1h,
+ od_coeff *s9, od_coeff *s9h,
+ od_coeff *s5, od_coeff *s5h,
+ od_coeff *sd, od_coeff *sdh,
+ od_coeff *s3, od_coeff *s3h,
+ od_coeff *sb, od_coeff *sbh,
+ od_coeff *s7, od_coeff *s7h,
+ od_coeff *sf, od_coeff *sfh) {
+
+ /* 8-point orthonormal kernels: iDCT on s0..s7, iDST on s8..sf. */
+ od_idct_8(s0, s4, s2, s6, s1, s5, s3, s7);
+ od_idst_8(sf, sb, sd, s9, se, sa, sc, s8);
+
+ /* Recombine sN with s(15-N); each butterfly is handed one h pointer. */
+ od_butterfly_neg(s0, sf, sfh);
+ od_butterfly_sub(s1, s1h, se);
+ od_butterfly_neg(s2, sd, sdh);
+ od_butterfly_sub(s3, s3h, sc);
+ od_butterfly_neg(s4, sb, sbh);
+ od_butterfly_sub(s5, s5h, sa);
+ od_butterfly_neg(s6, s9, s9h);
+ od_butterfly_sub(s7, s7h, s8);
+}
+
+/**
+ * 16-point orthonormal Type-IV fDST (runs Stage 0 through Stage 5 in order)
+ */
+static INLINE void od_fdst_16(od_coeff *s0, od_coeff *s1,
+ od_coeff *s2, od_coeff *s3,
+ od_coeff *s4, od_coeff *s5,
+ od_coeff *s6, od_coeff *s7,
+ od_coeff *s8, od_coeff *s9,
+ od_coeff *sa, od_coeff *sb,
+ od_coeff *sc, od_coeff *sd,
+ od_coeff *se, od_coeff *sf) {
+ od_coeff s0h;
+ od_coeff s2h;
+ od_coeff sdh;
+ od_coeff sfh;
+
+ /* Stage 0: rotations on the mirrored pairs (sN, s(15-N)) */
+
+ /* 24279/32768 = (Sin[31*Pi/64] + Cos[31*Pi/64])/Sqrt[2] = 0.74095112535496 */
+ /* 44011/32768 = (Sin[31*Pi/64] - Cos[31*Pi/64])*Sqrt[2] = 1.34311790969404 */
+ /* 1137/16384 = Cos[31*Pi/64]*Sqrt[2] = 0.06939217050794 */
+ od_rotate_sub(s0, sf, od_add(*sf, *s0), 24279, 15, 44011, 15, 1137, 14, 1);
+
+ /* 1645/2048 = (Sin[29*Pi/64] + Cos[29*Pi/64])/Sqrt[2] = 0.8032075314806449 */
+ /* 305/256 = (Sin[29*Pi/64] - Cos[29*Pi/64])*Sqrt[2] = 1.1913986089848667 */
+ /* 425/2048 = Cos[29*Pi/64]*Sqrt[2] = 0.2075082269882116 */
+ od_rotate_add(se, s1, od_sub(*s1, *se), 1645, 11, 305, 8, 425, 11, 1);
+
+ /* 14053/16384 = (Sin[27*Pi/64] + Cos[27*Pi/64])/Sqrt[2] = 0.85772861000027 */
+ /* 8423/8192 = (Sin[27*Pi/64] - Cos[27*Pi/64])*Sqrt[2] = 1.02820548838644 */
+ /* 2815/8192 = Cos[27*Pi/64]*Sqrt[2] = 0.34362586580705 */
+ od_rotate_sub(s2, sd, od_add(*sd, *s2), 14053, 14, 8423, 13, 2815, 13, 1);
+
+ /* 14811/16384 = (Sin[25*Pi/64] + Cos[25*Pi/64])/Sqrt[2] = 0.90398929312344 */
+ /* 7005/8192 = (Sin[25*Pi/64] - Cos[25*Pi/64])*Sqrt[2] = 0.85511018686056 */
+ /* 3903/8192 = Cos[25*Pi/64]*Sqrt[2] = 0.47643419969316 */
+ od_rotate_add(sc, s3, od_sub(*s3, *sc), 14811, 14, 7005, 13, 3903, 13, 1);
+
+ /* 30853/32768 = (Sin[23*Pi/64] + Cos[23*Pi/64])/Sqrt[2] = 0.94154406518302 */
+ /* 11039/16384 = (Sin[23*Pi/64] - Cos[23*Pi/64])*Sqrt[2] = 0.67377970678444 */
+ /* 19813/32768 = Cos[23*Pi/64]*Sqrt[2] = 0.60465421179080 */
+ od_rotate_sub(s4, sb, od_add(*sb, *s4), 30853, 15, 11039, 14, 19813, 15, 1);
+
+ /* 15893/16384 = (Sin[21*Pi/64] + Cos[21*Pi/64])/Sqrt[2] = 0.97003125319454 */
+ /* 3981/8192 = (Sin[21*Pi/64] - Cos[21*Pi/64])*Sqrt[2] = 0.48596035980653 */
+ /* 1489/2048 = Cos[21*Pi/64]*Sqrt[2] = 0.72705107329128 */
+ od_rotate_add(sa, s5, od_sub(*s5, *sa), 15893, 14, 3981, 13, 1489, 11, 1);
+
+ /* 32413/32768 = (Sin[19*Pi/64] + Cos[19*Pi/64])/Sqrt[2] = 0.98917650996478 */
+ /* 601/2048 = (Sin[19*Pi/64] - Cos[19*Pi/64])*Sqrt[2] = 0.29346094891072 */
+ /* 27605/32768 = Cos[19*Pi/64]*Sqrt[2] = 0.84244603550942 */
+ od_rotate_sub(s6, s9, od_add(*s9, *s6), 32413, 15, 601, 11, 27605, 15, 1);
+
+ /* 32729/32768 = (Sin[17*Pi/64] + Cos[17*Pi/64])/Sqrt[2] = 0.99879545620517 */
+ /* 201/2048 = (Sin[17*Pi/64] - Cos[17*Pi/64])*Sqrt[2] = 0.09813534865484 */
+ /* 31121/32768 = Cos[17*Pi/64]*Sqrt[2] = 0.94972778187775 */
+ od_rotate_add(s8, s7, od_sub(*s7, *s8), 32729, 15, 201, 11, 31121, 15, 1);
+
+ /* Stage 1: butterflies fed od_rshift1() of their own first operand */
+
+ od_butterfly_sub_asym(s0, od_rshift1(*s0), s7);
+ od_butterfly_sub_asym(s8, od_rshift1(*s8), sf);
+ od_butterfly_add_asym(s4, od_rshift1(*s4), s3);
+ od_butterfly_add_asym(sc, od_rshift1(*sc), sb);
+ od_butterfly_sub_asym(s2, od_rshift1(*s2), s5);
+ od_butterfly_sub_asym(sa, od_rshift1(*sa), sd);
+ od_butterfly_add_asym(s6, od_rshift1(*s6), s1);
+ od_butterfly_add_asym(se, od_rshift1(*se), s9);
+
+ /* Stage 2: butterflies; the four h outputs kept here are used in Stage 4 */
+
+ od_butterfly_add(s8, NULL, s4);
+ od_butterfly_add(s7, NULL, sb);
+ od_butterfly_sub(sa, NULL, s6);
+ od_butterfly_sub(s5, NULL, s9);
+ od_butterfly_add(s0, &s0h, s3);
+ od_butterfly_add(sd, &sdh, se);
+ od_butterfly_sub(s2, &s2h, s1);
+ od_butterfly_sub(sf, &sfh, sc);
+
+ /* Stage 3: rotations on (s8, s7), (s9, s6), (s5, sa) and (s4, sb) */
+
+ /* 9633/8192 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586 */
+ /* 12873/16384 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 */
+ /* 12785/32768 = 2*Cos[7*Pi/16] = 0.3901806440322565 */
+ od_rotate_sub(s8, s7, od_avg_add(*s7, *s8), 9633, 13, 12873, 14, 12785, 15,
+ 0);
+
+ /* 45451/32768 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475 */
+ /* 9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431 */
+ /* 18205/32768 = Cos[5*Pi/16] = 0.5555702330196022 */
+ od_rotate_sub(s9, s6, od_add(*s6, *s9), 45451, 15, 9041, 15, 18205, 15, 0);
+
+ /* 22725/16384 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475 */
+ /* 9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431 */
+ /* 18205/16384 = 2*Cos[5*Pi/16] = 1.1111404660392044 */
+ od_rotate_neg(s5, sa, od_avg_sub(*s5, *sa), 22725, 14, 9041, 15, 18205, 14);
+
+ /* 38531/32768 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586 */
+ /* 12873/16384 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 */
+ /* 6393/32768 = Cos[7*Pi/16] = 0.1950903220161283 */
+ od_rotate_neg(s4, sb, od_sub(*s4, *sb), 38531, 15, 12873, 14, 6393, 15);
+
+ /* Stage 4: butterflies using the Stage 2 h terms or od_rshift1() values */
+
+ od_butterfly_add_asym(s2, s2h, sc);
+ od_butterfly_sub_asym(s0, s0h, s1);
+ od_butterfly_add_asym(sf, sfh, se);
+ od_butterfly_add_asym(sd, sdh, s3);
+ od_butterfly_add_asym(s7, od_rshift1(*s7), s6);
+ od_butterfly_sub_asym(s8, od_rshift1(*s8), s9);
+ od_butterfly_sub_asym(sa, od_rshift1(*sa), sb);
+ od_butterfly_add_asym(s5, od_rshift1(*s5), s4);
+
+ /* Stage 5: rotations on (sc, s3), (s2, sd), (sa, s5), (s6, s9), (se, s1) */
+
+ /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
+ /* 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
+ /* 3135/4096 = 2*Cos[3*Pi/8] = 0.7653668647301796 */
+ od_rotate_sub(sc, s3, od_avg_add(*s3, *sc), 21407, 14, 8867, 14, 3135, 12, 0);
+
+ /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
+ /* 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
+ /* 3135/4096 = 2*Cos[3*Pi/8] = 0.7653668647301796 */
+ od_rotate_neg(s2, sd, od_avg_sub(*s2, *sd), 21407, 14, 8867, 14, 3135, 12);
+
+ /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
+ /* 11585/8192 = 2*Cos[Pi/4] = 1.4142135623730951 */
+ od_rotate_pi4_sub(sa, s5, od_avg_add(*s5, *sa), 11585, 13, 11585, 13);
+
+ /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
+ /* 11585/8192 = 2*Cos[Pi/4] = 1.4142135623730951 */
+ od_rotate_pi4_sub(s6, s9, od_avg_add(*s9, *s6), 11585, 13, 11585, 13);
+
+ /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
+ /* 11585/8192 = 2*Cos[Pi/4] = 1.4142135623730951 */
+ od_rotate_pi4_sub(se, s1, od_avg_add(*s1, *se), 11585, 13, 11585, 13);
+}
+
+/**
+ * 16-point orthonormal Type-IV iDST (runs Stage 5 down to Stage 0)
+ */
+static INLINE void od_idst_16(od_coeff *s0, od_coeff *s8,
+ od_coeff *s4, od_coeff *sc,
+ od_coeff *s2, od_coeff *sa,
+ od_coeff *s6, od_coeff *se,
+ od_coeff *s1, od_coeff *s9,
+ od_coeff *s5, od_coeff *sd,
+ od_coeff *s3, od_coeff *sb,
+ od_coeff *s7, od_coeff *sf) {
+ od_coeff s0h;
+ od_coeff s2h;
+ od_coeff s4h;
+ od_coeff s6h;
+ od_coeff s8h;
+ od_coeff sah;
+ od_coeff sch;
+ od_coeff sdh;
+ od_coeff seh;
+ od_coeff sfh;
+
+ /* Stage 5: rotations on (s6, s9), (sa, s5), (se, s1), (sc, s3), (sd, s2) */
+
+ /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
+ /* 11585/8192 = 2*Cos[Pi/4] = 1.4142135623730951 */
+ od_rotate_pi4_sub(s6, s9, od_avg_add(*s9, *s6), 11585, 13, 11585, 13);
+
+ /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
+ /* 11585/8192 = 2*Cos[Pi/4] = 1.4142135623730951 */
+ od_rotate_pi4_sub(sa, s5, od_avg_add(*s5, *sa), 11585, 13, 11585, 13);
+
+ /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
+ /* 11585/8192 = 2*Cos[Pi/4] = 1.4142135623730951 */
+ od_rotate_pi4_sub(se, s1, od_avg_add(*s1, *se), 11585, 13, 11585, 13);
+
+ /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
+ /* 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
+ /* 3135/4096 = 2*Cos[3*Pi/8] = 0.7653668647301796 */
+ od_rotate_sub(sc, s3, od_avg_add(*s3, *sc), 21407, 14, 8867, 14, 3135, 12, 0);
+
+ /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
+ /* 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
+ /* 3135/4096 = 2*Cos[3*Pi/8] = 0.7653668647301796 */
+ od_rotate_neg(sd, s2, od_avg_sub(*sd, *s2), 21407, 14, 8867, 14, 3135, 12);
+
+ /* Stage 4: butterflies; the four h outputs kept here are used in Stage 2 */
+
+ od_butterfly_add(s5, NULL, s4);
+ od_butterfly_sub(sa, NULL, sb);
+ od_butterfly_sub(s8, NULL, s9);
+ od_butterfly_add(s7, NULL, s6);
+ od_butterfly_add(sd, &sdh, s3);
+ od_butterfly_add(sf, &sfh, se);
+ od_butterfly_sub(s0, &s0h, s1);
+ od_butterfly_add(s2, &s2h, sc);
+
+ /* Stage 3: rotations on (s8, s7), (s9, s6), (sa, s5) and (sb, s4) */
+
+ /* 9633/8192 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586 */
+ /* 12873/16384 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 */
+ /* 12785/32768 = 2*Cos[7*Pi/16] = 0.3901806440322565 */
+ od_rotate_sub(s8, s7, od_avg_add(*s7, *s8), 9633, 13, 12873, 14, 12785, 15,
+ 0);
+
+ /* 45451/32768 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475 */
+ /* 9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431 */
+ /* 18205/32768 = Cos[5*Pi/16] = 0.5555702330196022 */
+ od_rotate_sub(s9, s6, od_add(*s6, *s9), 45451, 15, 9041, 15, 18205, 15, 0);
+
+ /* 22725/16384 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475 */
+ /* 9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431 */
+ /* 18205/16384 = 2*Cos[5*Pi/16] = 1.1111404660392044 */
+ od_rotate_neg(sa, s5, od_avg_sub(*sa, *s5), 22725, 14, 9041, 15, 18205, 14);
+
+ /* 38531/32768 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586 */
+ /* 12873/16384 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 */
+ /* 6393/32768 = Cos[7*Pi/16] = 0.1950903220161283 */
+ od_rotate_neg(sb, s4, od_sub(*sb, *s4), 38531, 15, 12873, 14, 6393, 15);
+
+ /* Stage 2: butterflies using od_rshift1() values or the Stage 4 h terms */
+
+ od_butterfly_add_asym(s8, od_rshift1(*s8), s4);
+ od_butterfly_add_asym(s7, od_rshift1(*s7), sb);
+ od_butterfly_sub_asym(sa, od_rshift1(*sa), s6);
+ od_butterfly_sub_asym(s5, od_rshift1(*s5), s9);
+ od_butterfly_add_asym(s0, s0h, s3);
+ od_butterfly_add_asym(sd, sdh, se);
+ od_butterfly_sub_asym(s2, s2h, s1);
+ od_butterfly_sub_asym(sf, sfh, sc);
+
+ /* Stage 1: butterflies; s0h and s2h are overwritten, all h feed Stage 0 */
+
+ od_butterfly_sub(s0, &s0h, s7);
+ od_butterfly_sub(s8, &s8h, sf);
+ od_butterfly_add(s4, &s4h, s3);
+ od_butterfly_add(sc, &sch, sb);
+ od_butterfly_sub(s2, &s2h, s5);
+ od_butterfly_sub(sa, &sah, sd);
+ od_butterfly_add(s6, &s6h, s1);
+ od_butterfly_add(se, &seh, s9);
+
+ /* Stage 0: rotations whose od_add/od_sub operand is a Stage 1 h term */
+
+ /* 32729/32768 = (Sin[17*Pi/64] + Cos[17*Pi/64])/Sqrt[2] = 0.99879545620517 */
+ /* 201/2048 = (Sin[17*Pi/64] - Cos[17*Pi/64])*Sqrt[2] = 0.09813534865484 */
+ /* 31121/32768 = Cos[17*Pi/64]*Sqrt[2] = 0.94972778187775 */
+ od_rotate_add(s8, s7, od_sub(*s7, s8h), 32729, 15, 201, 11, 31121, 15, 0);
+
+ /* 32413/32768 = (Sin[19*Pi/64] + Cos[19*Pi/64])/Sqrt[2] = 0.98917650996478 */
+ /* 601/2048 = (Sin[19*Pi/64] - Cos[19*Pi/64])*Sqrt[2] = 0.29346094891072 */
+ /* 27605/32768 = Cos[19*Pi/64]*Sqrt[2] = 0.84244603550942 */
+ od_rotate_sub(s6, s9, od_add(*s9, s6h), 32413, 15, 601, 11, 27605, 15, 0);
+
+ /* 15893/16384 = (Sin[21*Pi/64] + Cos[21*Pi/64])/Sqrt[2] = 0.97003125319454 */
+ /* 3981/8192 = (Sin[21*Pi/64] - Cos[21*Pi/64])*Sqrt[2] = 0.48596035980653 */
+ /* 1489/2048 = Cos[21*Pi/64]*Sqrt[2] = 0.72705107329128 */
+ od_rotate_add(sa, s5, od_sub(*s5, sah), 15893, 14, 3981, 13, 1489, 11, 0);
+
+ /* 30853/32768 = (Sin[23*Pi/64] + Cos[23*Pi/64])/Sqrt[2] = 0.94154406518302 */
+ /* 11039/16384 = (Sin[23*Pi/64] - Cos[23*Pi/64])*Sqrt[2] = 0.67377970678444 */
+ /* 19813/32768 = Cos[23*Pi/64]*Sqrt[2] = 0.60465421179080 */
+ od_rotate_sub(s4, sb, od_add(*sb, s4h), 30853, 15, 11039, 14, 19813, 15, 0);
+
+ /* 14811/16384 = (Sin[25*Pi/64] + Cos[25*Pi/64])/Sqrt[2] = 0.90398929312344 */
+ /* 7005/8192 = (Sin[25*Pi/64] - Cos[25*Pi/64])*Sqrt[2] = 0.85511018686056 */
+ /* 3903/8192 = Cos[25*Pi/64]*Sqrt[2] = 0.47643419969316 */
+ od_rotate_add(sc, s3, od_sub(*s3, sch), 14811, 14, 7005, 13, 3903, 13, 0);
+
+ /* 14053/16384 = (Sin[27*Pi/64] + Cos[27*Pi/64])/Sqrt[2] = 0.85772861000027 */
+ /* 8423/8192 = (Sin[27*Pi/64] - Cos[27*Pi/64])*Sqrt[2] = 1.02820548838644 */
+ /* 2815/8192 = Cos[27*Pi/64]*Sqrt[2] = 0.34362586580705 */
+ od_rotate_sub(s2, sd, od_add(*sd, s2h), 14053, 14, 8423, 13, 2815, 13, 0);
+
+ /* 1645/2048 = (Sin[29*Pi/64] + Cos[29*Pi/64])/Sqrt[2] = 0.8032075314806449 */
+ /* 305/256 = (Sin[29*Pi/64] - Cos[29*Pi/64])*Sqrt[2] = 1.1913986089848667 */
+ /* 425/2048 = Cos[29*Pi/64]*Sqrt[2] = 0.2075082269882116 */
+ od_rotate_add(se, s1, od_sub(*s1, seh), 1645, 11, 305, 8, 425, 11, 0);
+
+ /* 24279/32768 = (Sin[31*Pi/64] + Cos[31*Pi/64])/Sqrt[2] = 0.74095112535496 */
+ /* 44011/32768 = (Sin[31*Pi/64] - Cos[31*Pi/64])*Sqrt[2] = 1.34311790969404 */
+ /* 1137/16384 = Cos[31*Pi/64]*Sqrt[2] = 0.06939217050794 */
+ od_rotate_sub(s0, sf, od_add(*sf, s0h), 24279, 15, 44011, 15, 1137, 14, 0);
+}
+
+/**
+ * 16-point asymmetric Type-IV fDST (takes an h input for each even-indexed s)
+ */
+static INLINE void od_fdst_16_asym(od_coeff *s0, od_coeff s0h, od_coeff *s1,
+ od_coeff *s2, od_coeff s2h, od_coeff *s3,
+ od_coeff *s4, od_coeff s4h, od_coeff *s5,
+ od_coeff *s6, od_coeff s6h, od_coeff *s7,
+ od_coeff *s8, od_coeff s8h, od_coeff *s9,
+ od_coeff *sa, od_coeff sah, od_coeff *sb,
+ od_coeff *sc, od_coeff sch, od_coeff *sd,
+ od_coeff *se, od_coeff seh, od_coeff *sf) {
+ od_coeff sdh;
+ od_coeff sfh;
+
+ /* Stage 0: rotations whose od_add/od_sub operand is a caller-supplied h */
+
+ /* 1073/2048 = (Sin[31*Pi/64] + Cos[31*Pi/64])/2 = 0.5239315652662953 */
+ /* 62241/32768 = (Sin[31*Pi/64] - Cos[31*Pi/64])*2 = 1.8994555637555088 */
+ /* 201/2048 = Cos[31*Pi/64]*2 = 0.0981353486548360 */
+ od_rotate_sub(s0, sf, od_add(*sf, s0h), 1073, 11, 62241, 15, 201, 11, 1);
+
+ /* 18611/32768 = (Sin[29*Pi/64] + Cos[29*Pi/64])/2 = 0.5679534922100714 */
+ /* 55211/32768 = (Sin[29*Pi/64] - Cos[29*Pi/64])*2 = 1.6848920710188384 */
+ /* 601/2048 = Cos[29*Pi/64]*2 = 0.2934609489107235 */
+ od_rotate_add(se, s1, od_sub(*s1, seh), 18611, 15, 55211, 15, 601, 11, 1);
+
+ /* 9937/16384 = (Sin[27*Pi/64] + Cos[27*Pi/64])/2 = 0.6065057165489039 */
+ /* 1489/1024 = (Sin[27*Pi/64] - Cos[27*Pi/64])*2 = 1.4541021465825602 */
+ /* 3981/8192 = Cos[27*Pi/64]*2 = 0.4859603598065277 */
+ od_rotate_sub(s2, sd, od_add(*sd, s2h), 9937, 14, 1489, 10, 3981, 13, 1);
+
+ /* 10473/16384 = (Sin[25*Pi/64] + Cos[25*Pi/64])/2 = 0.6392169592876205 */
+ /* 39627/32768 = (Sin[25*Pi/64] - Cos[25*Pi/64])*2 = 1.2093084235816014 */
+ /* 11039/16384 = Cos[25*Pi/64]*2 = 0.6737797067844401 */
+ od_rotate_add(sc, s3, od_sub(*s3, sch), 10473, 14, 39627, 15, 11039, 14, 1);
+
+ /* 2727/4096 = (Sin[23*Pi/64] + Cos[23*Pi/64])/2 = 0.6657721932768628 */
+ /* 3903/4096 = (Sin[23*Pi/64] - Cos[23*Pi/64])*2 = 0.9528683993863225 */
+ /* 7005/8192 = Cos[23*Pi/64]*2 = 0.8551101868605642 */
+ od_rotate_sub(s4, sb, od_add(*sb, s4h), 2727, 12, 3903, 12, 7005, 13, 1);
+
+ /* 5619/8192 = (Sin[21*Pi/64] + Cos[21*Pi/64])/2 = 0.6859156770967569 */
+ /* 2815/4096 = (Sin[21*Pi/64] - Cos[21*Pi/64])*2 = 0.6872517316141069 */
+ /* 8423/8192 = Cos[21*Pi/64]*2 = 1.0282054883864433 */
+ od_rotate_add(sa, s5, od_sub(*s5, sah), 5619, 13, 2815, 12, 8423, 13, 1);
+
+ /* 2865/4096 = (Sin[19*Pi/64] + Cos[19*Pi/64])/2 = 0.6994534179865391 */
+ /* 13599/32768 = (Sin[19*Pi/64] - Cos[19*Pi/64])*2 = 0.4150164539764232 */
+ /* 305/256 = Cos[19*Pi/64]*2 = 1.1913986089848667 */
+ od_rotate_sub(s6, s9, od_add(*s9, s6h), 2865, 12, 13599, 15, 305, 8, 1);
+
+ /* 23143/32768 = (Sin[17*Pi/64] + Cos[17*Pi/64])/2 = 0.7062550401009887 */
+ /* 1137/8192 = (Sin[17*Pi/64] - Cos[17*Pi/64])*2 = 0.1387843410158816 */
+ /* 44011/32768 = Cos[17*Pi/64]*2 = 1.3431179096940367 */
+ od_rotate_add(s8, s7, od_sub(*s7, s8h), 23143, 15, 1137, 13, 44011, 15, 1);
+
+ /* Stage 1: butterflies fed od_rshift1() of their own first operand */
+
+ od_butterfly_sub_asym(s0, od_rshift1(*s0), s7);
+ od_butterfly_sub_asym(s8, od_rshift1(*s8), sf);
+ od_butterfly_add_asym(s4, od_rshift1(*s4), s3);
+ od_butterfly_add_asym(sc, od_rshift1(*sc), sb);
+ od_butterfly_sub_asym(s2, od_rshift1(*s2), s5);
+ od_butterfly_sub_asym(sa, od_rshift1(*sa), sd);
+ od_butterfly_add_asym(s6, od_rshift1(*s6), s1);
+ od_butterfly_add_asym(se, od_rshift1(*se), s9);
+
+ /* Stage 2: butterflies; by-value params s0h and s2h are reused as outputs */
+
+ od_butterfly_add(s8, NULL, s4);
+ od_butterfly_add(s7, NULL, sb);
+ od_butterfly_sub(sa, NULL, s6);
+ od_butterfly_sub(s5, NULL, s9);
+ od_butterfly_add(s0, &s0h, s3);
+ od_butterfly_add(sd, &sdh, se);
+ od_butterfly_sub(s2, &s2h, s1);
+ od_butterfly_sub(sf, &sfh, sc);
+
+ /* Stage 3: rotations on (s8, s7), (s9, s6), (s5, sa) and (s4, sb) */
+
+ /* 9633/8192 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586 */
+ /* 12873/16384 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 */
+ /* 6393/32768 = Cos[7*Pi/16] = 0.1950903220161283 */
+ od_rotate_sub(s8, s7, od_add(*s7, *s8), 9633, 13, 12873, 14, 6393, 15, 0);
+
+ /* 45451/32768 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475 */
+ /* 9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431 */
+ /* 18205/32768 = Cos[5*Pi/16] = 0.5555702330196022 */
+ od_rotate_sub(s9, s6, od_add(*s6, *s9), 45451, 15, 9041, 15, 18205, 15, 0);
+
+ /* 11363/8192 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475 */
+ /* 9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431 */
+ /* 4551/8192 = Cos[5*Pi/16] = 0.5555702330196022 */
+ od_rotate_neg(s5, sa, od_sub(*s5, *sa), 11363, 13, 9041, 15, 4551, 13);
+
+ /* 9633/8192 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586 */
+ /* 12873/16384 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 */
+ /* 6393/32768 = Cos[7*Pi/16] = 0.1950903220161283 */
+ od_rotate_neg(s4, sb, od_sub(*s4, *sb), 9633, 13, 12873, 14, 6393, 15);
+
+ /* Stage 4: butterflies using the Stage 2 h terms or od_rshift1() values */
+
+ od_butterfly_add_asym(s2, s2h, sc);
+ od_butterfly_sub_asym(s0, s0h, s1);
+ od_butterfly_add_asym(sf, sfh, se);
+ od_butterfly_add_asym(sd, sdh, s3);
+ od_butterfly_add_asym(s7, od_rshift1(*s7), s6);
+ od_butterfly_sub_asym(s8, od_rshift1(*s8), s9);
+ od_butterfly_sub_asym(sa, od_rshift1(*sa), sb);
+ od_butterfly_add_asym(s5, od_rshift1(*s5), s4);
+
+ /* Stage 5: rotations on (sc, s3), (s2, sd), (sa, s5), (s6, s9), (se, s1) */
+
+ /* 10703/8192 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
+ /* 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
+ /* 3135/8192 = Cos[3*Pi/8] = 0.3826834323650898 */
+ od_rotate_sub(sc, s3, od_add(*s3, *sc), 10703, 13, 8867, 14, 3135, 13, 0);
+
+ /* 10703/8192 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
+ /* 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
+ /* 3135/8192 = Cos[3*Pi/8] = 0.3826834323650898 */
+ od_rotate_neg(s2, sd, od_sub(*s2, *sd), 10703, 13, 8867, 14, 3135, 13);
+
+ /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
+ /* 11585/16384 = Cos[Pi/4] = 0.7071067811865475 */
+ od_rotate_pi4_sub(sa, s5, od_add(*s5, *sa), 11585, 13, 11585, 14);
+
+ /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
+ /* 11585/16384 = Cos[Pi/4] = 0.7071067811865475 */
+ od_rotate_pi4_sub(s6, s9, od_add(*s9, *s6), 11585, 13, 11585, 14);
+
+ /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
+ /* 11585/16384 = Cos[Pi/4] = 0.7071067811865475 */
+ od_rotate_pi4_sub(se, s1, od_add(*s1, *se), 11585, 13, 11585, 14);
+}
+
+/**
+ * 16-point asymmetric Type-IV iDST (runs Stage 5 down to Stage 0)
+ */
+static INLINE void od_idst_16_asym(od_coeff *s0, od_coeff *s8,
+ od_coeff *s4, od_coeff *sc,
+ od_coeff *s2, od_coeff *sa,
+ od_coeff *s6, od_coeff *se,
+ od_coeff *s1, od_coeff *s9,
+ od_coeff *s5, od_coeff *sd,
+ od_coeff *s3, od_coeff *sb,
+ od_coeff *s7, od_coeff *sf) {
+ od_coeff s0h;
+ od_coeff s2h;
+ od_coeff s4h;
+ od_coeff s6h;
+ od_coeff s8h;
+ od_coeff sah;
+ od_coeff sch;
+ od_coeff sdh;
+ od_coeff seh;
+ od_coeff sfh;
+
+ /* Stage 5: rotations on (s6, s9), (sa, s5), (se, s1), (sc, s3), (sd, s2) */
+
+ /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
+ /* 11585/16384 = Cos[Pi/4] = 0.7071067811865475 */
+ od_rotate_pi4_sub(s6, s9, od_add(*s9, *s6), 11585, 13, 11585, 14);
+
+ /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
+ /* 11585/16384 = Cos[Pi/4] = 0.7071067811865475 */
+ od_rotate_pi4_sub(sa, s5, od_add(*s5, *sa), 11585, 13, 11585, 14);
+
+ /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
+ /* 11585/16384 = Cos[Pi/4] = 0.7071067811865475 */
+ od_rotate_pi4_sub(se, s1, od_add(*s1, *se), 11585, 13, 11585, 14);
+
+ /* 10703/8192 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
+ /* 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
+ /* 3135/8192 = Cos[3*Pi/8] = 0.3826834323650898 */
+ od_rotate_sub(sc, s3, od_add(*s3, *sc), 10703, 13, 8867, 14, 3135, 13, 0);
+
+ /* 10703/8192 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
+ /* 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
+ /* 3135/8192 = Cos[3*Pi/8] = 0.3826834323650898 */
+ od_rotate_neg(sd, s2, od_sub(*sd, *s2), 10703, 13, 8867, 14, 3135, 13);
+
+ /* Stage 4: butterflies; the four h outputs kept here are used in Stage 2 */
+
+ od_butterfly_add(s5, NULL, s4);
+ od_butterfly_sub(sa, NULL, sb);
+ od_butterfly_sub(s8, NULL, s9);
+ od_butterfly_add(s7, NULL, s6);
+ od_butterfly_add(sd, &sdh, s3);
+ od_butterfly_add(sf, &sfh, se);
+ od_butterfly_sub(s0, &s0h, s1);
+ od_butterfly_add(s2, &s2h, sc);
+
+ /* Stage 3: rotations on (sb, s4), (sa, s5), (s9, s6) and (s8, s7) */
+
+ /* 9633/8192 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586 */
+ /* 12873/16384 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 */
+ /* 6393/32768 = Cos[7*Pi/16] = 0.1950903220161283 */
+ od_rotate_neg(sb, s4, od_sub(*sb, *s4), 9633, 13, 12873, 14, 6393, 15);
+
+ /* 11363/8192 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475 */
+ /* 9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431 */
+ /* 4551/8192 = Cos[5*Pi/16] = 0.5555702330196022 */
+ od_rotate_neg(sa, s5, od_sub(*sa, *s5), 11363, 13, 9041, 15, 4551, 13);
+
+ /* 22725/16384 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475 */
+ /* 9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431 */
+ /* 18205/32768 = Cos[5*Pi/16] = 0.5555702330196022 */
+ od_rotate_sub(s9, s6, od_add(*s6, *s9), 22725, 14, 9041, 15, 18205, 15, 0);
+
+ /* 9633/8192 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586 */
+ /* 12873/16384 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 */
+ /* 6393/32768 = Cos[7*Pi/16] = 0.1950903220161283 */
+ od_rotate_sub(s8, s7, od_add(*s7, *s8), 9633, 13, 12873, 14, 6393, 15, 0);
+
+ /* Stage 2: butterflies using od_rshift1() values or the Stage 4 h terms */
+
+ od_butterfly_add_asym(s8, od_rshift1(*s8), s4);
+ od_butterfly_add_asym(s7, od_rshift1(*s7), sb);
+ od_butterfly_sub_asym(sa, od_rshift1(*sa), s6);
+ od_butterfly_sub_asym(s5, od_rshift1(*s5), s9);
+ od_butterfly_add_asym(s0, s0h, s3);
+ od_butterfly_add_asym(sd, sdh, se);
+ od_butterfly_sub_asym(s2, s2h, s1);
+ od_butterfly_sub_asym(sf, sfh, sc);
+
+ /* Stage 1: butterflies; s0h and s2h are overwritten, all h feed Stage 0 */
+
+ od_butterfly_sub(s0, &s0h, s7);
+ od_butterfly_sub(s8, &s8h, sf);
+ od_butterfly_add(s4, &s4h, s3);
+ od_butterfly_add(sc, &sch, sb);
+ od_butterfly_sub(s2, &s2h, s5);
+ od_butterfly_sub(sa, &sah, sd);
+ od_butterfly_add(s6, &s6h, s1);
+ od_butterfly_add(se, &seh, s9);
+
+ /* Stage 0: rotations whose od_add/od_sub operand is a Stage 1 h term */
+
+ /* 23143/32768 = (Sin[17*Pi/64] + Cos[17*Pi/64])/2 = 0.7062550401009887 */
+ /* 1137/8192 = (Sin[17*Pi/64] - Cos[17*Pi/64])*2 = 0.1387843410158816 */
+ /* 44011/32768 = Cos[17*Pi/64]*2 = 1.3431179096940367 */
+ od_rotate_add(s8, s7, od_sub(*s7, s8h), 23143, 15, 1137, 13, 44011, 15, 1);
+
+ /* 2865/4096 = (Sin[19*Pi/64] + Cos[19*Pi/64])/2 = 0.6994534179865391 */
+ /* 13599/32768 = (Sin[19*Pi/64] - Cos[19*Pi/64])*2 = 0.4150164539764232 */
+ /* 305/256 = Cos[19*Pi/64]*2 = 1.1913986089848667 */
+ od_rotate_sub(s6, s9, od_add(*s9, s6h), 2865, 12, 13599, 15, 305, 8, 1);
+
+ /* 5619/8192 = (Sin[21*Pi/64] + Cos[21*Pi/64])/2 = 0.6859156770967569 */
+ /* 2815/4096 = (Sin[21*Pi/64] - Cos[21*Pi/64])*2 = 0.6872517316141069 */
+ /* 8423/8192 = Cos[21*Pi/64]*2 = 1.0282054883864433 */
+ od_rotate_add(sa, s5, od_sub(*s5, sah), 5619, 13, 2815, 12, 8423, 13, 1);
+
+ /* 2727/4096 = (Sin[23*Pi/64] + Cos[23*Pi/64])/2 = 0.6657721932768628 */
+ /* 3903/4096 = (Sin[23*Pi/64] - Cos[23*Pi/64])*2 = 0.9528683993863225 */
+ /* 7005/8192 = Cos[23*Pi/64]*2 = 0.8551101868605642 */
+ od_rotate_sub(s4, sb, od_add(*sb, s4h), 2727, 12, 3903, 12, 7005, 13, 1);
+
+ /* 10473/16384 = (Sin[25*Pi/64] + Cos[25*Pi/64])/2 = 0.6392169592876205 */
+ /* 39627/32768 = (Sin[25*Pi/64] - Cos[25*Pi/64])*2 = 1.2093084235816014 */
+ /* 11039/16384 = Cos[25*Pi/64]*2 = 0.6737797067844401 */
+ od_rotate_add(sc, s3, od_sub(*s3, sch), 10473, 14, 39627, 15, 11039, 14, 1);
+
+ /* 9937/16384 = (Sin[27*Pi/64] + Cos[27*Pi/64])/2 = 0.6065057165489039 */
+ /* 1489/1024 = (Sin[27*Pi/64] - Cos[27*Pi/64])*2 = 1.4541021465825602 */
+ /* 3981/8192 = Cos[27*Pi/64]*2 = 0.4859603598065277 */
+ od_rotate_sub(s2, sd, od_add(*sd, s2h), 9937, 14, 1489, 10, 3981, 13, 1);
+
+ /* 18611/32768 = (Sin[29*Pi/64] + Cos[29*Pi/64])/2 = 0.5679534922100714 */
+ /* 55211/32768 = (Sin[29*Pi/64] - Cos[29*Pi/64])*2 = 1.6848920710188384 */
+ /* 601/2048 = Cos[29*Pi/64]*2 = 0.2934609489107235 */
+ od_rotate_add(se, s1, od_sub(*s1, seh), 18611, 15, 55211, 15, 601, 11, 1);
+
+ /* 1073/2048 = (Sin[31*Pi/64] + Cos[31*Pi/64])/2 = 0.5239315652662953 */
+ /* 62241/32768 = (Sin[31*Pi/64] - Cos[31*Pi/64])*2 = 1.8994555637555088 */
+ /* 201/2048 = Cos[31*Pi/64]*2 = 0.0981353486548360 */
+ od_rotate_sub(s0, sf, od_add(*sf, s0h), 1073, 11, 62241, 15, 201, 11, 1);
+}
+
+/* --- 32-point Transforms --- */
+
+/**
+ * Forward 32-point Type-II DCT with orthonormal scaling.
+ */
+static INLINE void od_fdct_32(od_coeff *t0, od_coeff *t1,
+ od_coeff *t2, od_coeff *t3,
+ od_coeff *t4, od_coeff *t5,
+ od_coeff *t6, od_coeff *t7,
+ od_coeff *t8, od_coeff *t9,
+ od_coeff *ta, od_coeff *tb,
+ od_coeff *tc, od_coeff *td,
+ od_coeff *te, od_coeff *tf,
+ od_coeff *tg, od_coeff *th,
+ od_coeff *ti, od_coeff *tj,
+ od_coeff *tk, od_coeff *tl,
+ od_coeff *tm, od_coeff *tn,
+ od_coeff *to, od_coeff *tp,
+ od_coeff *tq, od_coeff *tr,
+ od_coeff *ts, od_coeff *tt,
+ od_coeff *tu, od_coeff *tv) {
+ od_coeff auxv;
+ od_coeff auxt;
+ od_coeff auxr;
+ od_coeff auxp;
+ od_coeff auxn;
+ od_coeff auxl;
+ od_coeff auxj;
+ od_coeff auxh;
+ od_coeff auxf;
+ od_coeff auxd;
+ od_coeff auxb;
+ od_coeff aux9;
+ od_coeff aux7;
+ od_coeff aux5;
+ od_coeff aux3;
+ od_coeff aux1;
+
+ /* Butterfly each tN against t(31-N); the aux terms feed the kernels below. */
+ od_butterfly_add(tf, &auxf, tg);
+ od_butterfly_neg(te, th, &auxh);
+ od_butterfly_add(td, &auxd, ti);
+ od_butterfly_neg(tc, tj, &auxj);
+ od_butterfly_add(tb, &auxb, tk);
+ od_butterfly_neg(ta, tl, &auxl);
+ od_butterfly_add(t9, &aux9, tm);
+ od_butterfly_neg(t8, tn, &auxn);
+ od_butterfly_add(t7, &aux7, to);
+ od_butterfly_neg(t6, tp, &auxp);
+ od_butterfly_add(t5, &aux5, tq);
+ od_butterfly_neg(t4, tr, &auxr);
+ od_butterfly_add(t3, &aux3, ts);
+ od_butterfly_neg(t2, tt, &auxt);
+ od_butterfly_add(t1, &aux1, tu);
+ od_butterfly_neg(t0, tv, &auxv);
+
+ /* 16-point asymmetric-input kernels: fDST on tv..tg, fDCT on t0..tf. */
+ od_fdst_16_asym(
+ tv, auxv, tu, tt, auxt, ts, tr, auxr, tq, tp, auxp, to,
+ tn, auxn, tm, tl, auxl, tk, tj, auxj, ti, th, auxh, tg);
+ od_fdct_16_asym(
+ t0, t1, aux1, t2, t3, aux3, t4, t5, aux5, t6, t7, aux7,
+ t8, t9, aux9, ta, tb, auxb, tc, td, auxd, te, tf, auxf);
+}
+
+/**
+ * Inverse 32-point Type-II DCT with orthonormal scaling.
+ */
+static INLINE void od_idct_32(od_coeff *t0, od_coeff *tg,
+ od_coeff *t8, od_coeff *to,
+ od_coeff *t4, od_coeff *tk,
+ od_coeff *tc, od_coeff *ts,
+ od_coeff *t2, od_coeff *ti,
+ od_coeff *ta, od_coeff *tq,
+ od_coeff *t6, od_coeff *tm,
+ od_coeff *te, od_coeff *tu,
+ od_coeff *t1, od_coeff *th,
+ od_coeff *t9, od_coeff *tp,
+ od_coeff *t5, od_coeff *tl,
+ od_coeff *td, od_coeff *tt,
+ od_coeff *t3, od_coeff *tj,
+ od_coeff *tb, od_coeff *tr,
+ od_coeff *t7, od_coeff *tn,
+ od_coeff *tf, od_coeff *tv) {
+ od_coeff auxf;
+ od_coeff auxd;
+ od_coeff auxb;
+ od_coeff aux9;
+ od_coeff aux7;
+ od_coeff aux5;
+ od_coeff aux3;
+ od_coeff aux1;
+
+ /* 16-point asymmetric-output kernels: iDCT on t0..tf, iDST on tg..tv. */
+ od_idct_16_asym(
+ t0, t8, t4, tc, t2, ta, t6, te,
+ t1, &aux1, t9, &aux9, t5, &aux5, td, &auxd,
+ t3, &aux3, tb, &auxb, t7, &aux7, tf, &auxf);
+ od_idst_16_asym(
+ tv, tn, tr, tj, tt, tl, tp, th, tu, tm, tq, ti, ts, tk, to, tg);
+
+ /* Recombine tN with t(31-N) using the aux terms or od_rshift1() values. */
+ od_butterfly_neg_asym(t0, tv, od_rshift1(*tv));
+ od_butterfly_add_asym(t1, aux1, tu);
+ od_butterfly_neg_asym(t2, tt, od_rshift1(*tt));
+ od_butterfly_add_asym(t3, aux3, ts);
+ od_butterfly_neg_asym(t4, tr, od_rshift1(*tr));
+ od_butterfly_add_asym(t5, aux5, tq);
+ od_butterfly_neg_asym(t6, tp, od_rshift1(*tp));
+ od_butterfly_add_asym(t7, aux7, to);
+ od_butterfly_neg_asym(t8, tn, od_rshift1(*tn));
+ od_butterfly_add_asym(t9, aux9, tm);
+ od_butterfly_neg_asym(ta, tl, od_rshift1(*tl));
+ od_butterfly_add_asym(tb, auxb, tk);
+ od_butterfly_neg_asym(tc, tj, od_rshift1(*tj));
+ od_butterfly_add_asym(td, auxd, ti);
+ od_butterfly_neg_asym(te, th, od_rshift1(*th));
+ od_butterfly_add_asym(tf, auxf, tg);
+}
+
+#endif
diff --git a/av1/common/x86/daala_inv_txfm_avx2.c b/av1/common/x86/daala_inv_txfm_avx2.c
index 73f8029..f060bfe 100644
--- a/av1/common/x86/daala_inv_txfm_avx2.c
+++ b/av1/common/x86/daala_inv_txfm_avx2.c
@@ -982,6 +982,7 @@
}
}
+#if 0
static void od_row_idct4_avx2(int16_t *out, int rows, const tran_low_t *in) {
od_row_tx4_avx2(out, rows, in, od_idct4_kernel8_epi16);
}
@@ -992,6 +993,7 @@
od_col_tx4_add_hbd_avx2(output_pixels, output_stride, cols, in, bd,
od_idct4_kernel8_epi16);
}
+#endif
static void od_row_idst4_avx2(int16_t *out, int rows, const tran_low_t *in) {
od_row_tx4_avx2(out, rows, in, od_idst_vii4_kernel8_epi16);
@@ -1034,6 +1036,7 @@
__m256i *r6, __m256i *r1, __m256i *r5,
__m256i *r3, __m256i *r7);
+#if 0
static void od_row_tx8_avx2(int16_t *out, int rows, const tran_low_t *in,
od_tx8_kernel8_epi16 kernel8_epi16,
od_tx8_mm256_kernel kernel8_epi32) {
@@ -1176,6 +1179,7 @@
od_flip_idst8_kernel8_epi16,
od_flip_idst8_kernel16_epi16);
}
+#endif /* 0 (opened before od_row_tx8_avx2) */
static void od_row_iidtx8_avx2(int16_t *out, int rows, const tran_low_t *in) {
od_row_iidtx_avx2(out, rows * 8, in);
@@ -1201,6 +1205,7 @@
__m256i *sc, __m256i *sd, __m256i *se,
__m256i *sf);
+#if 0 /* TODO(review): 16-pt AVX2 idct/idst off (NULL in maps); fix or drop. */
static void od_row_tx16_avx2(int16_t *out, int rows, const tran_low_t *in,
#if CONFIG_RECT_TX_EXT
od_tx16_kernel8_epi16 kernel8_epi16,
@@ -1422,6 +1427,7 @@
od_flip_idst16_kernel8_epi16,
od_flip_idst16_kernel16_epi16);
}
+#endif /* 0 (opened before od_row_tx16_avx2) */
static void od_row_iidtx16_avx2(int16_t *out, int rows, const tran_low_t *in) {
od_row_iidtx_avx2(out, rows * 16, in);
@@ -1440,19 +1446,11 @@
static const daala_row_itx TX_ROW_MAP[TX_SIZES][TX_TYPES] = {
// 4-point transforms
- { od_row_idct4_avx2, od_row_idst4_avx2, od_row_flip_idst4_avx2,
- od_row_iidtx4_avx2 },
+ { NULL, od_row_idst4_avx2, od_row_flip_idst4_avx2, od_row_iidtx4_avx2 },
// 8-point transforms
- { od_row_idct8_avx2,
-#if CONFIG_DAALA_TX_DST8
- NULL, NULL,
-#else
- od_row_idst8_avx2, od_row_flip_idst8_avx2,
-#endif
- od_row_iidtx8_avx2 },
+ { NULL, NULL, NULL, od_row_iidtx8_avx2 },
// 16-point transforms
- { od_row_idct16_avx2, od_row_idst16_avx2, od_row_flip_idst16_avx2,
- od_row_iidtx16_avx2 },
+ { NULL, NULL, NULL, od_row_iidtx16_avx2 },
// 32-point transforms
{ NULL, NULL, NULL, NULL },
#if CONFIG_TX64X64
@@ -1480,19 +1478,12 @@
// High bit depth output
{
// 4-point transforms
- { od_col_idct4_add_hbd_avx2, od_col_idst4_add_hbd_avx2,
- od_col_flip_idst4_add_hbd_avx2, od_col_iidtx4_add_hbd_avx2 },
+ { NULL, od_col_idst4_add_hbd_avx2, od_col_flip_idst4_add_hbd_avx2,
+ od_col_iidtx4_add_hbd_avx2 },
// 8-point transforms
- { od_col_idct8_add_hbd_avx2,
-#if CONFIG_DAALA_TX_DST8
- NULL, NULL,
-#else
- od_col_idst8_add_hbd_avx2, od_col_flip_idst8_add_hbd_avx2,
-#endif
- od_col_iidtx8_add_hbd_avx2 },
+ { NULL, NULL, NULL, od_col_iidtx8_add_hbd_avx2 },
// 16-point transforms
- { od_col_idct16_add_hbd_avx2, od_col_idst16_add_hbd_avx2,
- od_col_flip_idst16_add_hbd_avx2, od_col_iidtx16_add_hbd_avx2 },
+ { NULL, NULL, NULL, od_col_iidtx16_add_hbd_avx2 },
// 32-point transforms
{ NULL, NULL, NULL, NULL },
#if CONFIG_TX64X64