blob: 19f620fe7d94dff82fee1cecb6d30ee67f631bb8 [file] [log] [blame]
/*
* Copyright (c) 2017, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
/* This header does not use an include guard.
It is intentionally designed to be included multiple times.
The file that includes it should define the following macros:
OD_KERNEL A label for the width of the kernel, e.g., kernel8
OD_WORD A label for the size of the SIMD word, e.g., epi16
OD_REG The type of a SIMD register, e.g., __m128i
OD_ADD The intrinsic function for addition
OD_SUB The intrinsic function for subtraction
OD_RSHIFT1 The function that implements an unbiased right shift by 1
OD_AVG The function that implements a signed PAVG[WD]
I.e., (a + b + 1) >> 1, without overflow
OD_HRSUB The function that implements a VHRSUB.S<16|32>
I.e., (a - b + 1) >> 1, without overflow
OD_MUL The function that implements the multiplies
I.e., (a * b + ((1 << r) >> 1)) >> r, without overflow
OD_SWAP The function that swaps two SIMD registers
See daala_inv_txfm_avx2.c for examples. */
/* Token-pasting helper: builds the identifier name_kernel_word. */
#define OD_KERNEL_FUNC_IMPL(name, kernel, word) name##_##kernel##_##word
/* Extra level of indirection so that the kernel and word arguments are
   macro-expanded before OD_KERNEL_FUNC_IMPL pastes them together. */
#define OD_KERNEL_FUNC_WRAPPER(name, kernel, word) \
OD_KERNEL_FUNC_IMPL(name, kernel, word)
/* Name of a kernel function specialized for the including file's OD_KERNEL
   and OD_WORD, e.g. name_kernel8_epi16. */
#define OD_KERNEL_FUNC(name) OD_KERNEL_FUNC_WRAPPER(name, OD_KERNEL, OD_WORD)
/* Three-multiply rotation helper driven by the sum of the two inputs.
   With a = *q0 and b = *q1 on entry:
     t   = OD_MUL(avg ? OD_AVG(a, b) : OD_ADD(a, b), c2, r2)
     *q0 = OD_MUL(b, c0, r0) - (s ? OD_RSHIFT1(t) : t)
     *q1 = OD_MUL(a, c1, r1) + t
   Each (cN, rN) pair is a multiplier and its right shift, as taken by OD_MUL.
   q0 and q1 are assumed not to alias (callers in this header always pass
   distinct registers). */
static INLINE void OD_KERNEL_FUNC(od_rotate_add)(OD_REG *q0, OD_REG *q1, int c0,
                                                 int r0, int c1, int r1, int c2,
                                                 int r2, int s, int avg) {
  const OD_REG a = *q0;
  const OD_REG b = *q1;
  const OD_REG b_scaled = OD_MUL(b, c0, r0);
  const OD_REG a_scaled = OD_MUL(a, c1, r1);
  OD_REG cross;
  if (avg) {
    cross = OD_AVG(a, b);
  } else {
    cross = OD_ADD(a, b);
  }
  cross = OD_MUL(cross, c2, r2);
  if (s) {
    *q0 = OD_SUB(b_scaled, OD_RSHIFT1(cross));
  } else {
    *q0 = OD_SUB(b_scaled, cross);
  }
  *q1 = OD_ADD(a_scaled, cross);
}
/* Variant of the sum-driven rotation in which the second operand of the sum
   is supplied separately through q1h (callers in this header pass the halved
   copy of *q1 produced by a preceding butterfly).
   With a = *q0, b = *q1 and h = *q1h on entry:
     t   = OD_MUL(OD_ADD(a, h), c2, r2)
     *q0 = OD_MUL(b, c0, r0) - t
     *q1 = OD_MUL(a, c1, r1) + (s ? OD_RSHIFT1(t) : t)
   q1h is only read. q0 and q1 are assumed not to alias. */
static INLINE void OD_KERNEL_FUNC(od_rotate_addh)(OD_REG *q0, OD_REG *q1,
                                                  OD_REG *q1h, int c0, int r0,
                                                  int c1, int r1, int c2,
                                                  int r2, int s) {
  const OD_REG a = *q0;
  const OD_REG b = *q1;
  const OD_REG b_scaled = OD_MUL(b, c0, r0);
  const OD_REG a_scaled = OD_MUL(a, c1, r1);
  OD_REG cross = OD_ADD(a, *q1h);
  cross = OD_MUL(cross, c2, r2);
  *q0 = OD_SUB(b_scaled, cross);
  if (s) {
    *q1 = OD_ADD(a_scaled, OD_RSHIFT1(cross));
  } else {
    *q1 = OD_ADD(a_scaled, cross);
  }
}
/* Three-multiply rotation helper driven by the difference of the two inputs.
   With a = *q0 and b = *q1 on entry:
     t   = OD_MUL(OD_SUB(a, b), c2, r2)
     *q0 = OD_MUL(b, c0, r0) + (s ? OD_RSHIFT1(t) : t)
     *q1 = OD_MUL(a, c1, r1) + t
   q0 and q1 are assumed not to alias. */
static INLINE void OD_KERNEL_FUNC(od_rotate_sub)(OD_REG *q0, OD_REG *q1, int c0,
                                                 int r0, int c1, int r1, int c2,
                                                 int r2, int s) {
  const OD_REG a = *q0;
  const OD_REG b = *q1;
  const OD_REG b_scaled = OD_MUL(b, c0, r0);
  const OD_REG a_scaled = OD_MUL(a, c1, r1);
  OD_REG cross = OD_SUB(a, b);
  cross = OD_MUL(cross, c2, r2);
  if (s) {
    *q0 = OD_ADD(b_scaled, OD_RSHIFT1(cross));
  } else {
    *q0 = OD_ADD(b_scaled, cross);
  }
  *q1 = OD_ADD(a_scaled, cross);
}
/* Difference-driven rotation helper with negated outputs relative to
   od_rotate_sub. With a = *q0 and b = *q1 on entry:
     t   = OD_MUL(avg ? OD_HRSUB(b, a) : OD_SUB(b, a), c2, r2)
     *q0 = t - OD_MUL(b, c0, r0)
     *q1 = OD_MUL(a, c1, r1) - t
   q0 and q1 are assumed not to alias. */
static INLINE void OD_KERNEL_FUNC(od_rotate_sub2)(OD_REG *q0, OD_REG *q1,
                                                  int c0, int r0, int c1,
                                                  int r1, int c2, int r2,
                                                  int avg) {
  const OD_REG a = *q0;
  const OD_REG b = *q1;
  const OD_REG b_scaled = OD_MUL(b, c0, r0);
  const OD_REG a_scaled = OD_MUL(a, c1, r1);
  OD_REG cross;
  if (avg) {
    cross = OD_HRSUB(b, a);
  } else {
    cross = OD_SUB(b, a);
  }
  cross = OD_MUL(cross, c2, r2);
  *q0 = OD_SUB(cross, b_scaled);
  *q1 = OD_SUB(a_scaled, cross);
}
/* Variant of the difference-driven rotation in which the subtrahend is
   supplied separately through q1h (callers in this header pass the halved
   copy of *q1 produced by a preceding butterfly).
   With a = *q0, b = *q1 and h = *q1h on entry:
     t   = OD_MUL(OD_SUB(a, h), c2, r2)
     *q0 = OD_MUL(b, c0, r0) + t
     *q1 = OD_MUL(a, c1, r1) + (s ? OD_RSHIFT1(t) : t)
   q1h is only read. q0 and q1 are assumed not to alias. */
static INLINE void OD_KERNEL_FUNC(od_rotate_subh)(OD_REG *q0, OD_REG *q1,
                                                  OD_REG *q1h, int c0, int r0,
                                                  int c1, int r1, int c2,
                                                  int r2, int s) {
  const OD_REG a = *q0;
  const OD_REG b = *q1;
  const OD_REG b_scaled = OD_MUL(b, c0, r0);
  const OD_REG a_scaled = OD_MUL(a, c1, r1);
  OD_REG cross = OD_SUB(a, *q1h);
  cross = OD_MUL(cross, c2, r2);
  *q0 = OD_ADD(b_scaled, cross);
  if (s) {
    *q1 = OD_ADD(a_scaled, OD_RSHIFT1(cross));
  } else {
    *q1 = OD_ADD(a_scaled, cross);
  }
}
/* Pi/4 rotation helper built on the single constant 11585.
   With a = *p0 and b = *p1 on entry:
     m   = avg ? OD_MUL(OD_AVG(a, b), 11585, 13)
               : OD_MUL(OD_ADD(a, b), 11585, 14)
     *p1 = m
     *p0 = OD_MUL(b, 11585, 13) - m
   The averaged sum is already roughly half of the plain sum, which is why it
   is shifted by one bit less.
   p0 and p1 are assumed not to alias. */
static INLINE void OD_KERNEL_FUNC(od_rotate45)(OD_REG *p0, OD_REG *p1,
                                               int avg) {
  const OD_REG a = *p0;
  const OD_REG b = *p1;
  /* 11585/8192 ~= 2*Sin[Pi/4] ~= 1.4142135623730951 */
  const OD_REG b_scaled = OD_MUL(b, 11585, 13);
  OD_REG mix;
  /* 11585/8192 ~= 2*Cos[Pi/4] ~= 1.4142135623730951 */
  if (avg) {
    mix = OD_AVG(a, b);
    mix = OD_MUL(mix, 11585, 13);
  } else {
    mix = OD_ADD(a, b);
    mix = OD_MUL(mix, 11585, 14);
  }
  *p1 = mix;
  *p0 = OD_SUB(b_scaled, mix);
}
/* In-place butterfly. With a = *q0 and b = *q1 on entry:
     *q0 = a + OD_RSHIFT1(b)
     *q1 = *q0 - b
   q0 and q1 are assumed not to alias. */
static INLINE void OD_KERNEL_FUNC(od_butterfly_add)(OD_REG *q0, OD_REG *q1) {
  const OD_REG b = *q1;
  const OD_REG sum = OD_ADD(*q0, OD_RSHIFT1(b));
  *q0 = sum;
  *q1 = OD_SUB(sum, b);
}
/* In-place butterfly; same sum as od_butterfly_add but with the opposite sign
   on the second output. With a = *q0 and b = *q1 on entry:
     *q0 = a + OD_RSHIFT1(b)
     *q1 = b - *q0
   q0 and q1 are assumed not to alias. */
static INLINE void OD_KERNEL_FUNC(od_butterfly_add2)(OD_REG *q0, OD_REG *q1) {
  const OD_REG b = *q1;
  const OD_REG sum = OD_ADD(*q0, OD_RSHIFT1(b));
  *q0 = sum;
  *q1 = OD_SUB(b, sum);
}
/* In-place butterfly. With a = *q0 and b = *q1 on entry:
     *q0 = a - OD_RSHIFT1(b)
     *q1 = b + *q0
   q0 and q1 are assumed not to alias. */
static INLINE void OD_KERNEL_FUNC(od_butterfly_sub2)(OD_REG *q0, OD_REG *q1) {
  const OD_REG b = *q1;
  const OD_REG diff = OD_SUB(*q0, OD_RSHIFT1(b));
  *q0 = diff;
  *q1 = OD_ADD(b, diff);
}
/* In-place butterfly that takes the term added to *q0 from q1h instead of
   halving *q1 itself. With a = *q0, b = *q1 and h = *q1h on entry:
     *q0 = a + h
     *q1 = b - *q0
   q1h is only read. q0 and q1 are assumed not to alias. */
static INLINE void OD_KERNEL_FUNC(od_butterfly_addh)(OD_REG *q0, OD_REG *q1,
                                                     OD_REG *q1h) {
  const OD_REG sum = OD_ADD(*q0, *q1h);
  *q0 = sum;
  *q1 = OD_SUB(*q1, sum);
}
/* In-place butterfly that takes the term subtracted from *q0 from q1h.
   With a = *q0, b = *q1 and h = *q1h on entry:
     *q0 = a - h
     *q1 = b + *q0
   q1h is only read. q0 and q1 are assumed not to alias. */
static INLINE void OD_KERNEL_FUNC(od_butterfly_subh)(OD_REG *q0, OD_REG *q1,
                                                     OD_REG *q1h) {
  const OD_REG diff = OD_SUB(*q0, *q1h);
  *q0 = diff;
  *q1 = OD_ADD(*q1, diff);
}
/* Butterfly that also exports the halved second output.
   With a = *q0 and b = *q1 on entry:
     *q1  = a - b
     *q1h = OD_RSHIFT1(*q1)
     *q0  = a - *q1h
   q1h is output-only. The three pointers are assumed not to alias. */
static INLINE void OD_KERNEL_FUNC(od_butterfly_v1)(OD_REG *q0, OD_REG *q1,
                                                   OD_REG *q1h) {
  const OD_REG a = *q0;
  const OD_REG diff = OD_SUB(a, *q1);
  const OD_REG half = OD_RSHIFT1(diff);
  *q1 = diff;
  *q1h = half;
  *q0 = OD_SUB(a, half);
}
/* Butterfly that also exports the halved second output.
   With a = *q0 and b = *q1 on entry:
     *q1  = b - a
     *q1h = OD_RSHIFT1(*q1)
     *q0  = a + *q1h
   q1h is output-only. The three pointers are assumed not to alias. */
static INLINE void OD_KERNEL_FUNC(od_butterfly_v2)(OD_REG *q0, OD_REG *q1,
                                                   OD_REG *q1h) {
  const OD_REG a = *q0;
  const OD_REG diff = OD_SUB(*q1, a);
  const OD_REG half = OD_RSHIFT1(diff);
  *q1 = diff;
  *q1h = half;
  *q0 = OD_ADD(a, half);
}
/* Butterfly that also exports the halved second output.
   With a = *q0 and b = *q1 on entry:
     *q1  = a + b
     *q1h = OD_RSHIFT1(*q1)
     *q0  = a - *q1h
   q1h is output-only. The three pointers are assumed not to alias. */
static INLINE void OD_KERNEL_FUNC(od_butterfly_v3)(OD_REG *q0, OD_REG *q1,
                                                   OD_REG *q1h) {
  const OD_REG a = *q0;
  const OD_REG sum = OD_ADD(a, *q1);
  const OD_REG half = OD_RSHIFT1(sum);
  *q1 = sum;
  *q1h = half;
  *q0 = OD_SUB(a, half);
}
/* 2-point inverse DCT kernel: a thin wrapper that applies od_rotate45 to the
   pair with the operands swapped (p1 first, then p0) and avg = 0, i.e. the
   plain OD_ADD path without averaging. *p0 and *p1 are updated in place. */
static INLINE void OD_KERNEL_FUNC(od_idct2)(OD_REG *p0, OD_REG *p1) {
OD_KERNEL_FUNC(od_rotate45)(p1, p0, 0);
}
/* 2-point inverse DST kernel, a special case of rotation.
   With a = *p0 and b = *p1 on entry:
     neg == 0: t   = OD_MUL(OD_AVG(a, b), 3135, 12)
               *p0 = OD_MUL(b, 8867, 14) + t
               *p1 = OD_MUL(a, 21407, 14) - t
     neg != 0: t   = OD_MUL(OD_HRSUB(a, b), 3135, 12)
               *p0 = OD_MUL(b, 8867, 14) - t
               *p1 = t - OD_MUL(a, 21407, 14)
   p0 and p1 are assumed not to alias. */
static INLINE void OD_KERNEL_FUNC(od_idst2)(OD_REG *p0, OD_REG *p1, int neg) {
  const OD_REG a = *p0;
  const OD_REG b = *p1;
  /* 21407/16384 ~= Sin[3*Pi/8] + Cos[3*Pi/8] ~= 1.3065629648763766 */
  const OD_REG a_scaled = OD_MUL(a, 21407, 14);
  /* 8867/16384 ~= Sin[3*Pi/8] - Cos[3*Pi/8] ~= 0.541196100146197 */
  const OD_REG b_scaled = OD_MUL(b, 8867, 14);
  OD_REG cross;
  if (neg) {
    cross = OD_HRSUB(a, b);
    /* 3135/4096 ~= 2*Cos[3*Pi/8] ~= 0.7653668647301796 */
    cross = OD_MUL(cross, 3135, 12);
    *p0 = OD_SUB(b_scaled, cross);
    *p1 = OD_SUB(cross, a_scaled);
  } else {
    cross = OD_AVG(a, b);
    /* 3135/4096 ~= 2*Cos[3*Pi/8] ~= 0.7653668647301796 */
    cross = OD_MUL(cross, 3135, 12);
    *p0 = OD_ADD(b_scaled, cross);
    *p1 = OD_SUB(a_scaled, cross);
  }
}
/* Asymmetric 2-point inverse DCT kernel: forwards directly to
   od_butterfly_v1. On return *p1 = p0_in - p1_in, *p1h = OD_RSHIFT1(*p1) and
   *p0 = p0_in - *p1h. p1h is output-only. */
static INLINE void OD_KERNEL_FUNC(od_idct2_asym)(OD_REG *p0, OD_REG *p1,
OD_REG *p1h) {
OD_KERNEL_FUNC(od_butterfly_v1)(p0, p1, p1h);
}
/* Asymmetric 2-point inverse DST kernel, a special case of rotation.
   With a = *p0 and b = *p1 on entry:
     t   = OD_MUL(OD_AVG(a, b), 8867, 13)
     *p0 = OD_MUL(b, 3135, 12) + t
     *p1 = OD_MUL(a, 15137, 14) - OD_RSHIFT1(t)
   p0 and p1 are assumed not to alias. */
static INLINE void OD_KERNEL_FUNC(od_idst2_asym)(OD_REG *p0, OD_REG *p1) {
  const OD_REG a = *p0;
  const OD_REG b = *p1;
  /* 3135/4096 ~= (Cos[Pi/8] - Sin[Pi/8])*Sqrt[2] = 0.7653668647301795 */
  const OD_REG b_scaled = OD_MUL(b, 3135, 12);
  /* 15137/16384 ~= (Cos[Pi/8] + Sin[Pi/8])/Sqrt[2] = 0.9238795325112867 */
  const OD_REG a_scaled = OD_MUL(a, 15137, 14);
  OD_REG cross = OD_AVG(a, b);
  /* 8867/8192 ~= Cos[3*Pi/8]*2*Sqrt[2] = 1.082392200292394 */
  cross = OD_MUL(cross, 8867, 13);
  *p0 = OD_ADD(b_scaled, cross);
  *p1 = OD_SUB(a_scaled, OD_RSHIFT1(cross));
}
/* 4-point inverse DCT kernel; the four registers are transformed in place.
   Note the parameter order (q0, q2, q1, q3). */
static INLINE void OD_KERNEL_FUNC(od_idct4)(OD_REG *q0, OD_REG *q2, OD_REG *q1,
OD_REG *q3) {
/* Halved q1, written by od_idct2_asym and consumed by od_butterfly_addh. */
OD_REG q1h;
/* 2-point stages: asymmetric iDST on (q3, q2), asymmetric iDCT on (q0, q1). */
OD_KERNEL_FUNC(od_idst2_asym)(q3, q2);
OD_KERNEL_FUNC(od_idct2_asym)(q0, q1, &q1h);
/* Butterflies pairing (q2, q1) and (q0, q3). */
OD_KERNEL_FUNC(od_butterfly_addh)(q2, q1, &q1h);
OD_KERNEL_FUNC(od_butterfly_add)(q0, q3);
}
/* Asymmetric 4-point inverse DCT kernel; q0..q3 are transformed in place.
   Note the parameter order (q0, q2, q1, q1h, q3, q3h). q1h and q3h are
   output-only: on return they hold OD_RSHIFT1 of the final *q1 and *q3, as
   written by the two closing butterflies. */
static INLINE void OD_KERNEL_FUNC(od_idct4_asym)(OD_REG *q0, OD_REG *q2,
OD_REG *q1, OD_REG *q1h,
OD_REG *q3, OD_REG *q3h) {
/* 2-point stages: iDST (neg = 0) on (q3, q2), iDCT on (q0, q1). */
OD_KERNEL_FUNC(od_idst2)(q3, q2, 0);
OD_KERNEL_FUNC(od_idct2)(q0, q1);
/* Butterflies pairing (q2, q1) and (q0, q3); each also emits a half. */
OD_KERNEL_FUNC(od_butterfly_v2)(q2, q1, q1h);
OD_KERNEL_FUNC(od_butterfly_v1)(q0, q3, q3h);
}
/* 4-point inverse DST kernel (type VII, per the function name), special case.
   Reads x0..x3 from *q0..*q3, forms five scaled terms and recombines them:
     m0 = OD_MUL(x0 - x3, 467, 11)
     m1 = OD_MUL(x0 + x2, 7021, 14)
     m2 = OD_MUL(x3 + OD_HRSUB(x0 - x3, x2), 37837, 15)
     m3 = OD_MUL(x1, 37837, 15)
     m4 = OD_MUL(x2 + x3, 21513, 15)
     u  = m4 + OD_RSHIFT1(m3)
     *q0 = m0 + u
     *q2 = m1 + (m3 - u)
     *q1 = m2
     *q3 = m0 + (m1 - OD_RSHIFT1(m3))
   The four pointers are assumed not to alias. */
static INLINE void OD_KERNEL_FUNC(od_idst_vii4)(OD_REG *q0, OD_REG *q1,
                                                OD_REG *q2, OD_REG *q3) {
  const OD_REG x0 = *q0;
  const OD_REG x1 = *q1;
  const OD_REG x2 = *q2;
  const OD_REG x3 = *q3;
  const OD_REG d03 = OD_SUB(x0, x3);
  const OD_REG s02 = OD_ADD(x0, x2);
  const OD_REG mid = OD_ADD(x3, OD_HRSUB(d03, x2));
  const OD_REG s23 = OD_ADD(x2, x3);
  /* 467/2048 ~= 2*Sin[1*Pi/9]/3 ~= 0.228013428883779 */
  const OD_REG m0 = OD_MUL(d03, 467, 11);
  /* 7021/16384 ~= 2*Sin[2*Pi/9]/3 ~= 0.428525073124360 */
  const OD_REG m1 = OD_MUL(s02, 7021, 14);
  /* 37837/32768 ~= 4*Sin[3*Pi/9]/3 ~= 1.154700538379252 */
  const OD_REG m2 = OD_MUL(mid, 37837, 15);
  /* 37837/32768 ~= 4*Sin[3*Pi/9]/3 ~= 1.154700538379252 */
  const OD_REG m3 = OD_MUL(x1, 37837, 15);
  /* 21513/32768 ~= 2*Sin[4*Pi/9]/3 ~= 0.656538502008139 */
  const OD_REG m4 = OD_MUL(s23, 21513, 15);
  const OD_REG m3_half = OD_RSHIFT1(m3);
  const OD_REG acc = OD_ADD(m4, m3_half);
  *q0 = OD_ADD(m0, acc);
  /* We swap q1 and q2 to correct for the bitreverse reordering that
     od_row_tx4_avx2() does. */
  *q2 = OD_ADD(m1, OD_SUB(m3, acc));
  *q1 = m2;
  *q3 = OD_ADD(m0, OD_SUB(m1, m3_half));
}
/* Flipped variant of od_idst_vii4: runs the kernel, then exchanges the
   registers pairwise (q0 with q3, q1 with q2) through OD_SWAP. */
static INLINE void OD_KERNEL_FUNC(od_flip_idst_vii4)(OD_REG *q0, OD_REG *q1,
                                                     OD_REG *q2, OD_REG *q3) {
  OD_KERNEL_FUNC(od_idst_vii4)(q0, q1, q2, q3);
  /* The two exchanges touch disjoint registers, so their order is free. */
  OD_SWAP(q1, q2);
  OD_SWAP(q0, q3);
}
/* 4-point inverse DST kernel; the four registers are transformed in place.
   Parameters are taken in the order (q0, q1, q2, q3). */
static INLINE void OD_KERNEL_FUNC(od_idst4)(OD_REG *q0, OD_REG *q1, OD_REG *q2,
OD_REG *q3) {
/* Halved copies of q2 and q3 produced by the butterflies below and consumed
   by the two closing rotations. */
OD_REG q2h;
OD_REG q3h;
/* Pi/4 rotation (averaging form) on the middle pair. */
OD_KERNEL_FUNC(od_rotate45)(q2, q1, 1);
/* Butterflies on (q0, q2) and (q1, q3); each also emits a half. */
OD_KERNEL_FUNC(od_butterfly_v3)(q0, q2, &q2h);
OD_KERNEL_FUNC(od_butterfly_v3)(q1, q3, &q3h);
/* 16069/16384 ~= (Sin[5*Pi/16] + Cos[5*Pi/16])/Sqrt[2] ~= 0.9807852804032 */
/* 12785/32768 ~= (Sin[5*Pi/16] - Cos[5*Pi/16])*Sqrt[2] ~= 0.3901806440323 */
/* 12873/16384 ~= Cos[5*Pi/16]*Sqrt[2] ~= 0.7856949583871021 */
OD_KERNEL_FUNC(od_rotate_addh)
(q1, q2, &q2h, 16069, 14, 12785, 15, 12873, 14, 0);
/* 13623/16384 ~= (Sin[7*Pi/16] + Cos[7*Pi/16])/Sqrt[2] ~= 0.8314696123025 */
/* 18205/16384 ~= (Sin[7*Pi/16] - Cos[7*Pi/16])*Sqrt[2] ~= 1.1111404660392 */
/* 9041/32768 ~= Cos[7*Pi/16]*Sqrt[2] = 0.275899379282943 */
OD_KERNEL_FUNC(od_rotate_subh)
(q0, q3, &q3h, 13623, 14, 18205, 14, 9041, 15, 0);
}
/* Asymmetric 4-point inverse DST kernel; the four registers are transformed
   in place. Note the parameter order (q0, q2, q1, q3). Same stage structure
   as od_idst4, but the closing rotations use unnormalized constants and pass
   s = 1, so the second output of each rotation receives only half of the
   cross term. */
static INLINE void OD_KERNEL_FUNC(od_idst4_asym)(OD_REG *q0, OD_REG *q2,
OD_REG *q1, OD_REG *q3) {
/* Halved copies of q1 and q3 produced by the butterflies below and consumed
   by the two closing rotations. */
OD_REG q1h;
OD_REG q3h;
/* Pi/4 rotation (averaging form) on the middle pair. */
OD_KERNEL_FUNC(od_rotate45)(q1, q2, 1);
/* Butterflies on (q0, q1) and (q2, q3); each also emits a half. */
OD_KERNEL_FUNC(od_butterfly_v3)(q0, q1, &q1h);
OD_KERNEL_FUNC(od_butterfly_v3)(q2, q3, &q3h);
/* 45451/32768 ~= Sin[5*Pi/16] + Cos[5*Pi/16] ~= 1.3870398453221475 */
/* 9041/32768 ~= Sin[5*Pi/16] - Cos[5*Pi/16] ~= 0.27589937928294306 */
/* 18205/16384 ~= 2*Cos[5*Pi/16] ~= 1.1111404660392044 */
OD_KERNEL_FUNC(od_rotate_addh)
(q2, q1, &q1h, 45451, 15, 9041, 15, 18205, 14, 1);
/* 38531/32768 ~= Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586 */
/* 12873/16384 ~= Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 */
/* 12785/32768 ~= 2*Cos[7*Pi/16] = 0.3901806440322565 */
OD_KERNEL_FUNC(od_rotate_subh)
(q0, q3, &q3h, 38531, 15, 12873, 14, 12785, 15, 1);
}
/* 8-point inverse DCT kernel; the eight registers are transformed in place.
   Note the parameter order (r0, r4, r2, r6, r1, r5, r3, r7). */
static INLINE void OD_KERNEL_FUNC(od_idct8)(OD_REG *r0, OD_REG *r4, OD_REG *r2,
OD_REG *r6, OD_REG *r1, OD_REG *r5,
OD_REG *r3, OD_REG *r7) {
/* Halved r1 and r3, written by od_idct4_asym and consumed by the
   od_butterfly_addh calls below. */
OD_REG r1h;
OD_REG r3h;
/* 4-point stages: asymmetric iDST on (r7, r5, r6, r4), asymmetric iDCT on
   (r0, r2, r1, r3). */
OD_KERNEL_FUNC(od_idst4_asym)(r7, r5, r6, r4);
OD_KERNEL_FUNC(od_idct4_asym)(r0, r2, r1, &r1h, r3, &r3h);
/* Butterflies pairing (r4, r3), (r2, r5), (r6, r1) and (r0, r7). */
OD_KERNEL_FUNC(od_butterfly_addh)(r4, r3, &r3h);
OD_KERNEL_FUNC(od_butterfly_add)(r2, r5);
OD_KERNEL_FUNC(od_butterfly_addh)(r6, r1, &r1h);
OD_KERNEL_FUNC(od_butterfly_add)(r0, r7);
}
/* Asymmetric 8-point inverse DCT kernel; r0..r7 are transformed in place.
   r1h, r3h, r5h and r7h are output-only: on return each holds OD_RSHIFT1 of
   the corresponding final register, as written by the closing butterflies. */
static INLINE void OD_KERNEL_FUNC(od_idct8_asym)(
OD_REG *r0, OD_REG *r4, OD_REG *r2, OD_REG *r6, OD_REG *r1, OD_REG *r1h,
OD_REG *r5, OD_REG *r5h, OD_REG *r3, OD_REG *r3h, OD_REG *r7, OD_REG *r7h) {
/* 4-point stages: iDST on (r7, r5, r6, r4), iDCT on (r0, r2, r1, r3). */
OD_KERNEL_FUNC(od_idst4)(r7, r5, r6, r4);
OD_KERNEL_FUNC(od_idct4)(r0, r2, r1, r3);
/* Butterflies pairing (r0, r7), (r6, r1), (r2, r5) and (r4, r3); each also
   emits a half. */
OD_KERNEL_FUNC(od_butterfly_v1)(r0, r7, r7h);
OD_KERNEL_FUNC(od_butterfly_v2)(r6, r1, r1h);
OD_KERNEL_FUNC(od_butterfly_v1)(r2, r5, r5h);
OD_KERNEL_FUNC(od_butterfly_v2)(r4, r3, r3h);
}
/* 8-point inverse DST kernel; the eight registers are transformed in place.
   Note the parameter order (r0, r4, r2, r6, r1, r5, r3, r7). */
static INLINE void OD_KERNEL_FUNC(od_idst8)(OD_REG *r0, OD_REG *r4, OD_REG *r2,
OD_REG *r6, OD_REG *r1, OD_REG *r5,
OD_REG *r3, OD_REG *r7) {
/* Halved copies written by the first butterfly layer and consumed by the
   second (od_butterfly_subh / od_butterfly_addh). */
OD_REG r0h;
OD_REG r2h;
OD_REG r5h;
OD_REG r7h;
/* Stage 1: Pi/4 rotation on (r1, r6) and 2-point iDSTs on (r5, r2), (r4, r3). */
OD_KERNEL_FUNC(od_rotate45)(r1, r6, 1);
OD_KERNEL_FUNC(od_idst2)(r5, r2, 1);
OD_KERNEL_FUNC(od_idst2)(r4, r3, 0);
/* Stage 2: butterflies that also emit halves. */
OD_KERNEL_FUNC(od_butterfly_v3)(r6, r7, &r7h);
OD_KERNEL_FUNC(od_butterfly_v3)(r4, r2, &r2h);
OD_KERNEL_FUNC(od_butterfly_v2)(r1, r0, &r0h);
OD_KERNEL_FUNC(od_butterfly_v3)(r3, r5, &r5h);
/* Stage 3: butterflies that consume the halves from stage 2. */
OD_KERNEL_FUNC(od_butterfly_subh)(r4, r7, &r7h);
OD_KERNEL_FUNC(od_butterfly_addh)(r6, r5, &r5h);
OD_KERNEL_FUNC(od_butterfly_addh)(r3, r0, &r0h);
OD_KERNEL_FUNC(od_butterfly_subh)(r1, r2, &r2h);
/* Stage 4: four rotations, all with s = 0 (and avg = 0 for od_rotate_add). */
/* 17911/16384 ~= Sin[15*Pi/32] + Cos[15*Pi/32] ~= 1.0932018670017576 */
/* 14699/16384 ~= Sin[15*Pi/32] - Cos[15*Pi/32] ~= 0.8971675863426363 */
/* 803/8192 ~= Cos[15*Pi/32] ~= 0.0980171403295606 */
OD_KERNEL_FUNC(od_rotate_add)(r7, r0, 17911, 14, 14699, 14, 803, 13, 0, 0);
/* 40869/32768 ~= Sin[13*Pi/32] + Cos[13*Pi/32] ~= 1.247225012986671 */
/* 21845/32768 ~= Sin[13*Pi/32] - Cos[13*Pi/32] ~= 0.6666556584777465 */
/* 1189/4096 ~= Cos[13*Pi/32] ~= 0.29028467725446233 */
OD_KERNEL_FUNC(od_rotate_sub)(r1, r6, 40869, 15, 21845, 15, 1189, 12, 0);
/* 22173/16384 ~= Sin[11*Pi/32] + Cos[11*Pi/32] ~= 1.3533180011743526 */
/* 3363/8192 ~= Sin[11*Pi/32] - Cos[11*Pi/32] ~= 0.4105245275223574 */
/* 15447/32768 ~= Cos[11*Pi/32] ~= 0.47139673682599764 */
OD_KERNEL_FUNC(od_rotate_add)(r5, r2, 22173, 14, 3363, 13, 15447, 15, 0, 0);
/* 23059/16384 ~= Sin[9*Pi/32] + Cos[9*Pi/32] ~= 1.4074037375263826 */
/* 2271/16384 ~= Sin[9*Pi/32] - Cos[9*Pi/32] ~= 0.1386171691990915 */
/* 5197/8192 ~= Cos[9*Pi/32] ~= 0.6343932841636455 */
OD_KERNEL_FUNC(od_rotate_sub)(r3, r4, 23059, 14, 2271, 14, 5197, 13, 0);
}
/* Asymmetric 8-point inverse DST kernel; the eight registers are transformed
   in place. Note the parameter order (r0, r4, r2, r6, r1, r5, r3, r7).
   Stages 1-3 are identical to od_idst8; the closing rotations use constants
   rescaled by Sqrt[2] and pass s = 1, so the first output of each rotation
   receives only half of the cross term. */
static INLINE void OD_KERNEL_FUNC(od_idst8_asym)(OD_REG *r0, OD_REG *r4,
OD_REG *r2, OD_REG *r6,
OD_REG *r1, OD_REG *r5,
OD_REG *r3, OD_REG *r7) {
/* Halved copies written by the first butterfly layer and consumed by the
   second (od_butterfly_subh / od_butterfly_addh). */
OD_REG r0h;
OD_REG r2h;
OD_REG r5h;
OD_REG r7h;
/* Stage 1: Pi/4 rotation on (r1, r6) and 2-point iDSTs on (r5, r2), (r4, r3). */
OD_KERNEL_FUNC(od_rotate45)(r1, r6, 1);
OD_KERNEL_FUNC(od_idst2)(r5, r2, 1);
OD_KERNEL_FUNC(od_idst2)(r4, r3, 0);
/* Stage 2: butterflies that also emit halves. */
OD_KERNEL_FUNC(od_butterfly_v3)(r6, r7, &r7h);
OD_KERNEL_FUNC(od_butterfly_v3)(r4, r2, &r2h);
OD_KERNEL_FUNC(od_butterfly_v2)(r1, r0, &r0h);
OD_KERNEL_FUNC(od_butterfly_v3)(r3, r5, &r5h);
/* Stage 3: butterflies that consume the halves from stage 2. */
OD_KERNEL_FUNC(od_butterfly_subh)(r4, r7, &r7h);
OD_KERNEL_FUNC(od_butterfly_addh)(r6, r5, &r5h);
OD_KERNEL_FUNC(od_butterfly_addh)(r3, r0, &r0h);
OD_KERNEL_FUNC(od_butterfly_subh)(r1, r2, &r2h);
/* Stage 4: four rotations, all with s = 1 (and avg = 0 for od_rotate_add). */
/* 12665/16384 ~= (Sin[15*Pi/32] + Cos[15*Pi/32])/Sqrt[2] ~= 0.77301045336 */
/* 5197/4096 ~= (Sin[15*Pi/32] - Cos[15*Pi/32])*Sqrt[2] ~= 1.2687865683273 */
/* 2271/16384 ~= Cos[15*Pi/32]*Sqrt[2] ~= 0.13861716919909148 */
OD_KERNEL_FUNC(od_rotate_add)(r7, r0, 12665, 14, 5197, 12, 2271, 14, 1, 0);
/* 28899/32768 ~= (Sin[13*Pi/32] + Cos[13*Pi/32])/Sqrt[2] ~= 0.88192126435 */
/* 30893/32768 ~= (Sin[13*Pi/32] - Cos[13*Pi/32])*Sqrt[2] ~= 0.94279347365 */
/* 3363/8192 ~= Cos[13*Pi/32]*Sqrt[2] ~= 0.41052452752235735 */
OD_KERNEL_FUNC(od_rotate_sub)(r1, r6, 28899, 15, 30893, 15, 3363, 13, 1);
/* 31357/32768 ~= (Sin[11*Pi/32] + Cos[11*Pi/32])/Sqrt[2] ~= 0.95694033573 */
/* 1189/2048 ~= (Sin[11*Pi/32] - Cos[11*Pi/32])*Sqrt[2] ~= 0.5805693545089 */
/* 21845/32768 ~= Cos[11*Pi/32]*Sqrt[2] ~= 0.6666556584777465 */
OD_KERNEL_FUNC(od_rotate_add)(r5, r2, 31357, 15, 1189, 11, 21845, 15, 1, 0);
/* 16305/16384 ~= (Sin[9*Pi/32] + Cos[9*Pi/32])/Sqrt[2] ~= 0.9951847266722 */
/* 803/4096 ~= (Sin[9*Pi/32] - Cos[9*Pi/32])*Sqrt[2] ~= 0.1960342806591213 */
/* 14699/16384 ~= Cos[9*Pi/32]*Sqrt[2] ~= 0.8971675863426364 */
OD_KERNEL_FUNC(od_rotate_sub)(r3, r4, 16305, 14, 803, 12, 14699, 14, 1);
}
/* Flipped variant of od_idst8: runs the kernel, then exchanges the registers
   pairwise (r0/r7, r1/r6, r2/r5, r3/r4) through OD_SWAP. */
static INLINE void OD_KERNEL_FUNC(od_flip_idst8)(OD_REG *r0, OD_REG *r4,
                                                 OD_REG *r2, OD_REG *r6,
                                                 OD_REG *r1, OD_REG *r5,
                                                 OD_REG *r3, OD_REG *r7) {
  OD_KERNEL_FUNC(od_idst8)(r0, r4, r2, r6, r1, r5, r3, r7);
  /* The four exchanges touch disjoint registers, so their order is free. */
  OD_SWAP(r6, r1);
  OD_SWAP(r2, r5);
  OD_SWAP(r4, r3);
  OD_SWAP(r0, r7);
}
/* 16-point inverse DCT kernel; the sixteen registers are transformed in
   place. Note the parameter order
   (s0, s8, s4, sc, s2, sa, s6, se, s1, s9, s5, sd, s3, sb, s7, sf). */
static INLINE void OD_KERNEL_FUNC(od_idct16)(OD_REG *s0, OD_REG *s8, OD_REG *s4,
OD_REG *sc, OD_REG *s2, OD_REG *sa,
OD_REG *s6, OD_REG *se, OD_REG *s1,
OD_REG *s9, OD_REG *s5, OD_REG *sd,
OD_REG *s3, OD_REG *sb, OD_REG *s7,
OD_REG *sf) {
/* Halved s1, s3, s5 and s7, written by od_idct8_asym and consumed by the
   od_butterfly_addh calls below. */
OD_REG s1h;
OD_REG s3h;
OD_REG s5h;
OD_REG s7h;
/* 8-point stages: asymmetric iDST on the s8..sf registers, asymmetric iDCT
   on the s0..s7 registers. */
OD_KERNEL_FUNC(od_idst8_asym)(sf, sb, sd, s9, se, sa, sc, s8);
OD_KERNEL_FUNC(od_idct8_asym)
(s0, s4, s2, s6, s1, &s1h, s5, &s5h, s3, &s3h, s7, &s7h);
/* Butterflies pairing one register from each half: (s8, s7), (s6, s9),
   (sa, s5), (s4, sb), (sc, s3), (s2, sd), (se, s1) and (s0, sf). */
OD_KERNEL_FUNC(od_butterfly_addh)(s8, s7, &s7h);
OD_KERNEL_FUNC(od_butterfly_add)(s6, s9);
OD_KERNEL_FUNC(od_butterfly_addh)(sa, s5, &s5h);
OD_KERNEL_FUNC(od_butterfly_add)(s4, sb);
OD_KERNEL_FUNC(od_butterfly_addh)(sc, s3, &s3h);
OD_KERNEL_FUNC(od_butterfly_add)(s2, sd);
OD_KERNEL_FUNC(od_butterfly_addh)(se, s1, &s1h);
OD_KERNEL_FUNC(od_butterfly_add)(s0, sf);
}
/* 16-point inverse DST kernel; the sixteen registers are transformed in
   place. Unlike od_idct16, the parameters are listed in plain s0..sf order. */
static INLINE void OD_KERNEL_FUNC(od_idst16)(OD_REG *s0, OD_REG *s1, OD_REG *s2,
OD_REG *s3, OD_REG *s4, OD_REG *s5,
OD_REG *s6, OD_REG *s7, OD_REG *s8,
OD_REG *s9, OD_REG *sa, OD_REG *sb,
OD_REG *sc, OD_REG *sd, OD_REG *se,
OD_REG *sf) {
/* Halved copies written by the od_butterfly_v2/v3 calls and consumed by the
   od_butterfly_*h and od_rotate_*h calls that follow them. */
OD_REG s0h;
OD_REG s1h;
OD_REG s2h;
OD_REG s3h;
OD_REG s4h;
OD_REG s5h;
OD_REG s6h;
OD_REG s7h;
OD_REG sbh;
OD_REG sfh;
/* Scratch sink for halves that are written but never read. */
OD_REG h;
/* Stage 1: Pi/4 rotations (averaging form) and 2-point iDSTs. */
OD_KERNEL_FUNC(od_rotate45)(s9, s6, 1);
OD_KERNEL_FUNC(od_rotate45)(sa, s5, 1);
OD_KERNEL_FUNC(od_rotate45)(s8, s7, 1);
OD_KERNEL_FUNC(od_idst2)(s3, sc, 0);
OD_KERNEL_FUNC(od_idst2)(sb, s4, 1);
/* Stage 2a: butterflies whose halved output is discarded into h. */
OD_KERNEL_FUNC(od_butterfly_v3)(s2, sa, &h);
OD_KERNEL_FUNC(od_butterfly_v2)(sd, s5, &h);
OD_KERNEL_FUNC(od_butterfly_v2)(s9, s1, &h);
OD_KERNEL_FUNC(od_butterfly_v3)(s6, se, &h);
/* Stage 2b: butterflies whose halves (sbh, sfh, s0h, s4h) feed stage 4. */
OD_KERNEL_FUNC(od_butterfly_v3)(sc, sb, &sbh);
OD_KERNEL_FUNC(od_butterfly_v3)(s7, sf, &sfh);
OD_KERNEL_FUNC(od_butterfly_v2)(s8, s0, &s0h);
OD_KERNEL_FUNC(od_butterfly_v3)(s3, s4, &s4h);
/* Stage 3: rotations on the registers produced by stage 2a. */
/* 38531/32768 ~= Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586 */
/* 12873/16384 ~= Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 */
/* 6393/32768 ~= Cos[7*Pi/16] = 0.19509032201612825 */
OD_KERNEL_FUNC(od_rotate_sub2)
(s2, sd, 38531, 15, 12873, 14, 6393, 15, 0);
/* 22725/16384 ~= Sin[5*Pi/16] + Cos[5*Pi/16] ~= 1.3870398453221475 */
/* 9041/32768 ~= Sin[5*Pi/16] - Cos[5*Pi/16] ~= 0.27589937928294306 */
/* 18205/16384 ~= 2*Cos[5*Pi/16] ~= 1.1111404660392044 */
OD_KERNEL_FUNC(od_rotate_sub2)
(sa, s5, 22725, 14, 9041, 15, 18205, 14, 1);
/* 45451/32768 ~= Sin[5*Pi/16] + Cos[5*Pi/16] ~= 1.3870398453221475 */
/* 9041/32768 ~= Sin[5*Pi/16] - Cos[5*Pi/16] ~= 0.27589937928294306 */
/* 18205/32768 ~= Cos[5*Pi/16] ~= 0.5555702330196022 */
OD_KERNEL_FUNC(od_rotate_add)
(s6, s9, 45451, 15, 9041, 15, 18205, 15, 0, 0);
/* 9633/8192 ~= Sin[7*Pi/16] + Cos[7*Pi/16] ~= 1.1758756024193586 */
/* 12873/16384 ~= Sin[7*Pi/16] - Cos[7*Pi/16] ~= 0.7856949583871022 */
/* 12785/32768 ~= 2*Cos[7*Pi/16] ~= 0.3901806440322565 */
OD_KERNEL_FUNC(od_rotate_add)
(se, s1, 9633, 13, 12873, 14, 12785, 15, 0, 1);
/* Stage 4: butterflies consuming the halves from stage 2b, followed by
   plain butterflies on the rotated registers. */
OD_KERNEL_FUNC(od_butterfly_subh)(s8, s4, &s4h);
OD_KERNEL_FUNC(od_butterfly_addh)(s7, sb, &sbh);
OD_KERNEL_FUNC(od_butterfly_subh)(s3, sf, &sfh);
OD_KERNEL_FUNC(od_butterfly_addh)(sc, s0, &s0h);
OD_KERNEL_FUNC(od_butterfly_add2)(sd, se);
OD_KERNEL_FUNC(od_butterfly_add2)(s2, s1);
OD_KERNEL_FUNC(od_butterfly_sub2)(s6, s5);
OD_KERNEL_FUNC(od_butterfly_sub2)(s9, sa);
/* Stage 5: butterflies between the upper (s8..sf) and lower (s0..s7)
   registers; each emits the half of the lower register that the matching
   rotation in stage 6 consumes. */
OD_KERNEL_FUNC(od_butterfly_v2)(se, s0, &s0h);
OD_KERNEL_FUNC(od_butterfly_v2)(sf, s1, &s1h);
OD_KERNEL_FUNC(od_butterfly_v3)(sc, s2, &s2h);
OD_KERNEL_FUNC(od_butterfly_v3)(sd, s3, &s3h);
OD_KERNEL_FUNC(od_butterfly_v2)(sa, s4, &s4h);
OD_KERNEL_FUNC(od_butterfly_v2)(sb, s5, &s5h);
OD_KERNEL_FUNC(od_butterfly_v3)(s8, s6, &s6h);
OD_KERNEL_FUNC(od_butterfly_v3)(s9, s7, &s7h);
/* Stage 6: eight closing rotations, all with s = 0. */
/* 32729/32768 ~= (Sin[17*Pi/64] + Cos[17*Pi/64])/Sqrt[2] ~= 0.99879545620 */
/* 201/2048 ~= (Sin[17*Pi/64] - Cos[17*Pi/64])*Sqrt[2] ~= 0.09813534865484 */
/* 31121/32768 ~= Cos[17*Pi/64]*Sqrt[2] ~= 0.9497277818777543 */
OD_KERNEL_FUNC(od_rotate_subh)
(se, s1, &s1h, 32729, 15, 201, 11, 31121, 15, 0);
/* 32413/32768 ~= (Sin[19*Pi/64] + Cos[19*Pi/64])/Sqrt[2] ~= 0.98917650996 */
/* 601/2048 ~= (Sin[19*Pi/64] - Cos[19*Pi/64])*Sqrt[2] ~= 0.29346094891072 */
/* 27605/32768 ~= Cos[19*Pi/64]*Sqrt[2] ~= 0.8424460355094193 */
OD_KERNEL_FUNC(od_rotate_addh)
(s9, s6, &s6h, 32413, 15, 601, 11, 27605, 15, 0);
/* 15893/16384 ~= (Sin[21*Pi/64] + Cos[21*Pi/64])/Sqrt[2] ~= 0.97003125319 */
/* 3981/8192 ~= (Sin[21*Pi/64] - Cos[21*Pi/64])*Sqrt[2] ~= 0.4859603598065 */
/* 1489/2048 ~= Cos[21*Pi/64]*Sqrt[2] ~= 0.72705107329128 */
OD_KERNEL_FUNC(od_rotate_subh)
(sa, s5, &s5h, 15893, 14, 3981, 13, 1489, 11, 0);
/* 30853/32768 ~= (Sin[23*Pi/64] + Cos[23*Pi/64])/Sqrt[2] ~= 0.94154406518 */
/* 11039/16384 ~= (Sin[23*Pi/64] - Cos[23*Pi/64])*Sqrt[2] ~= 0.67377970678 */
/* 19813/32768 ~= Cos[23*Pi/64]*Sqrt[2] ~= 0.6046542117908008 */
OD_KERNEL_FUNC(od_rotate_addh)
(sd, s2, &s2h, 30853, 15, 11039, 14, 19813, 15, 0);
/* 14811/16384 ~= (Sin[25*Pi/64] + Cos[25*Pi/64])/Sqrt[2] ~= 0.90398929312 */
/* 7005/8192 ~= (Sin[25*Pi/64] - Cos[25*Pi/64])*Sqrt[2] ~= 0.8551101868606 */
/* 3903/8192 ~= Cos[25*Pi/64]*Sqrt[2] ~= 0.47643419969316125 */
OD_KERNEL_FUNC(od_rotate_subh)
(sc, s3, &s3h, 14811, 14, 7005, 13, 3903, 13, 0);
/* 14053/16384 ~= (Sin[27*Pi/64] + Cos[27*Pi/64])/Sqrt[2] ~= 0.85772861000 */
/* 8423/8192 ~= (Sin[27*Pi/64] - Cos[27*Pi/64])*Sqrt[2] ~= 1.0282054883864 */
/* 2815/8192 ~= Cos[27*Pi/64]*Sqrt[2] ~= 0.34362586580705035 */
OD_KERNEL_FUNC(od_rotate_addh)
(sb, s4, &s4h, 14053, 14, 8423, 13, 2815, 13, 0);
/* 1645/2048 ~= (Sin[29*Pi/64] + Cos[29*Pi/64])/Sqrt[2] ~= 0.8032075314806 */
/* 305/256 ~= (Sin[29*Pi/64] - Cos[29*Pi/64])*Sqrt[2] ~= 1.191398608984867 */
/* 425/2048 ~= Cos[29*Pi/64]*Sqrt[2] ~= 0.20750822698821159 */
OD_KERNEL_FUNC(od_rotate_subh)
(s8, s7, &s7h, 1645, 11, 305, 8, 425, 11, 0);
/* 24279/32768 ~= (Sin[31*Pi/64] + Cos[31*Pi/64])/Sqrt[2] ~= 0.74095112535 */
/* 44011/32768 ~= (Sin[31*Pi/64] - Cos[31*Pi/64])*Sqrt[2] ~= 1.34311790969 */
/* 1137/16384 ~= Cos[31*Pi/64]*Sqrt[2] ~= 0.06939217050794069 */
OD_KERNEL_FUNC(od_rotate_addh)
(sf, s0, &s0h, 24279, 15, 44011, 15, 1137, 14, 0);
}
/* Flipped variant of od_idst16: runs the kernel, then exchanges the registers
   pairwise (s0/sf, s1/se, ..., s7/s8) through OD_SWAP. */
static INLINE void OD_KERNEL_FUNC(od_flip_idst16)(
    OD_REG *s0, OD_REG *s1, OD_REG *s2, OD_REG *s3, OD_REG *s4, OD_REG *s5,
    OD_REG *s6, OD_REG *s7, OD_REG *s8, OD_REG *s9, OD_REG *sa, OD_REG *sb,
    OD_REG *sc, OD_REG *sd, OD_REG *se, OD_REG *sf) {
  OD_KERNEL_FUNC(od_idst16)
  (s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sa, sb, sc, sd, se, sf);
  /* The eight exchanges touch disjoint registers, so their order is free;
     walk from the innermost pair outwards. */
  OD_SWAP(s7, s8);
  OD_SWAP(s6, s9);
  OD_SWAP(s5, sa);
  OD_SWAP(s4, sb);
  OD_SWAP(s3, sc);
  OD_SWAP(s2, sd);
  OD_SWAP(s1, se);
  OD_SWAP(s0, sf);
}