Add new 8-point Type-VII DST to daala_tx. Replaces the lifting based orthonormal 8-point Type-IV DST with an orthonormal 8-point Type-VII DST that has no iterative multiplies. Change-Id: Idb95e7f3056c7c74a5444565ab5262b58ef5604a
diff --git a/av1/common/daala_tx.c b/av1/common/daala_tx.c index 1630359..ab245c7 100644 --- a/av1/common/daala_tx.c +++ b/av1/common/daala_tx.c
@@ -3238,60 +3238,207 @@ x[7*xstride] = (od_coeff)r7; } +const int OD_DST_8_PERM[8] = { 0, 7, 1, 6, 2, 5, 3, 4 }; + +/* Computes the Polynomial Product Y(z) ≡ X(z)*H(z) modulo (z^8 + 1) using + Nussbaumer's "short" algorithm [1]. + The monomial coefficients in Y(z) are exactly the values of an acyclic + convolution of the monomial coefficients of X(z) and H(z). + Since H(z) is fixed, the multiplication terms are constant and precomputed. + + [1] Nussbaumer, Henri J. "Fast Fourier Transform and Convolution Algorithms" + Springer-Verlag: Berlin, Heidelberg, New York (1981) pages 76-78. */ +static void od_poly_prod_8(od_coeff y[8], const od_coeff x[8]) { + /* 21 "muls", 75 adds, 18 shifts */ + od_coeff q0; + od_coeff q1; + od_coeff q2; + od_coeff q3; + od_coeff q4; + od_coeff q5; + od_coeff q6; + od_coeff q7; + od_coeff q8; + od_coeff q9; + od_coeff q10; + od_coeff q11; + od_coeff q12; + od_coeff q13; + od_coeff q14; + od_coeff q15; + od_coeff q16; + od_coeff q17; + od_coeff q18; + od_coeff q19; + od_coeff q20; + od_coeff t0; + od_coeff t1; + od_coeff t2; + od_coeff t3; + od_coeff t4; + od_coeff t5; + od_coeff t6; + od_coeff t7; + od_coeff u0; + od_coeff u1; + od_coeff u1h; + od_coeff u2; + od_coeff u2h; + od_coeff u3; + od_coeff u4; + od_coeff u4h; + od_coeff u5; + od_coeff u6; + od_coeff u7; + od_coeff u7h; + od_coeff u8; + od_coeff u9; + od_coeff u10; + od_coeff u11; + od_coeff u12; + od_coeff u13; + od_coeff u14; + od_coeff u15; + od_coeff u16; + od_coeff u17; + od_coeff u18; + od_coeff u19; + od_coeff u20; + od_coeff u21; + od_coeff u22; + od_coeff u23; + od_coeff u24; + od_coeff u25; + od_coeff u26; + od_coeff u27; + t0 = x[0]; + t1 = x[1]; + t2 = x[2]; + t3 = x[3]; + t4 = x[4]; + t5 = x[5]; + t6 = x[6]; + t7 = x[7]; + /* Stage 0 Butterfly */ + u7 = t0 - t7; + u7h = OD_DCT_RSHIFT(u7, 1); + u0 = t0 - u7h; + u2 = t2 - t6; + u2h = OD_DCT_RSHIFT(u2, 1); + u6 = t2 - u2h; + u4 = t4 + t5; + u4h = OD_DCT_RSHIFT(u4, 1); + u5 = t4 - u4h; + u1 = t3 - t1; + u1h = OD_DCT_RSHIFT(u1, 1); + u3 = t3 - u1h; + /* Stage 1 Butterfly */ + q0 = u0 + u2h; + q1 = q0 - u2; + q4 = u3 + u4h; + q5 = q4 - u4; + q2 = u7h + u5; + q7 = u7 - q2; + q6 = u1h + u6; + q3 = u1 - q6; + /* Stage 2 Half-Butterfly */ + /*The intermediate sums can overflow 16 bits, but all SIMD instruction sets + should be able to compute them without issue (i.e., using PAVGW or + V{R}HADD.S16).*/ + q8 = (q0 + q4 + 1) >> 1; + q9 = (q1 + q5) >> 1; + q10 = (q2 + q3 + 1) >> 1; + q11 = (q7 + q6) >> 1; + /* Stage 3 */ + q12 = t0 + t3; + q13 = t0; + q14 = t3; + q15 = t5 - t6; + q16 = t6; + q17 = t5; + q18 = ((q6 + ((t0 + t6 + 1) >> 1)) - (q4 + (t5 >> 1))) >> 1; + q19 = ((q7 + ((t5 + t6 + 1) >> 1)) - (q0 + (t3 >> 1))) >> 1; + q20 = (q18 - q19) >> 1; + /* Stage 4 */ + q0 = (-5995*q0 + 8192) >> 14; + q1 = (-1373*q1 + 4096) >> 13; + q2 = (22891*q2 + 16384) >> 15; + q3 = (-217*q3 + 512) >> 10; + q4 = (13427*q4 + 16384) >> 15; + q5 = (-11013*q5 + 8192) >> 14; + q6 = (1373*q6 + 1024) >> 11; + q7 = (-14077*q7 + 16384) >> 15; + q8 = (-1437*q8 + 16384) >> 15; + q9 = (27519*q9 + 16384) >> 15; + q10 = (-15947*q10 + 16384) >> 15; + q11 = (-7891*q11 + 16384) >> 15; + q12 = (4897*q12 + 16384) >> 15; + q13 = (-5079*q13 + 8192) >> 14; + q14 = (365*q14 + 16384) >> 15; + q15 = (3325*q15 + 8192) >> 14; + q16 = (-5225*q16 + 8192) >> 14; + q17 = (-1425*q17 + 8192) >> 14; + q18 = (3453*q18 + 16384) >> 15; + q19 = (-8421*q19 + 8192) >> 14; + q20 = (-20295*q20 + 16384) >> 15; + /* Stage 5 */ + u0 = q0 + q8; + u1 = q1 + q9; + u2 = q2 + q10; + u3 = q3 + q10; + u4 = q4 + q8; + u5 = q5 + q9; + u6 = q6 + q11; + u7 = q7 + q11; + /* Stage 6 */ + u10 = u0 + u1; + u11 = u0 - u1; + u12 = u2 + u7; + u13 = u2 - u7; + u14 = u3 + u6; + u15 = u3 - u6; + u16 = u5 + u4; + u17 = u5 - u4; + /* Stage 7 */ + u8 = q19 + q20; + u9 = q19 - q18; + u18 = q12 + u8; + u19 = u18 + q13; + u20 = u18 + q14; + u21 = u9 << 1; + u22 = q15 + u21; + u23 = q16 - u22; + u24 = u22 + q17; + u25 = u8 << 1; + u26 = u25 << 1; + u27 = u25 - u9; + /* Stage 8 */ + y[0] = u14 + u16 + u20; + y[1] = u12 - u10 - u25; + y[2] = u9 + u13 - u17; + y[3] = u9 - u10 - u12 - u19; + y[4] = u15 - u11 - u27; + y[5] = u23 - u11 - u15; + y[6] = u13 + u17 - u24 + u26; + y[7] = u16 - u14 + u21 - u25; +} + void od_bin_fdst8(od_coeff y[8], const od_coeff *x, int xstride) { - int r0; - int r1; - int r2; - int r3; - int r4; - int r5; - int r6; - int r7; - r0 = x[0*xstride]; - r4 = x[1*xstride]; - r2 = x[2*xstride]; - r6 = x[3*xstride]; - r1 = x[4*xstride]; - r5 = x[5*xstride]; - r3 = x[6*xstride]; - r7 = x[7*xstride]; - OD_FDST_8(r0, r4, r2, r6, r1, r5, r3, r7); - y[0] = (od_coeff)r0; - y[1] = (od_coeff)r1; - y[2] = (od_coeff)r2; - y[3] = (od_coeff)r3; - y[4] = (od_coeff)r4; - y[5] = (od_coeff)r5; - y[6] = (od_coeff)r6; - y[7] = (od_coeff)r7; + int i; + od_coeff xp[8]; + od_coeff yp[8]; + for (i = 0; i < 8; i++) xp[i] = x[i*xstride]; + od_poly_prod_8(yp, xp); + for (i = 0; i < 8; i++) y[OD_DST_8_PERM[i]] = yp[i]; } void od_bin_idst8(od_coeff *x, int xstride, const od_coeff y[8]) { - int r0; - int r1; - int r2; - int r3; - int r4; - int r5; - int r6; - int r7; - r0 = y[0]; - r4 = y[1]; - r2 = y[2]; - r6 = y[3]; - r1 = y[4]; - r5 = y[5]; - r3 = y[6]; - r7 = y[7]; - OD_IDST_8(r0, r4, r2, r6, r1, r5, r3, r7); - x[0*xstride] = (od_coeff)r0; - x[1*xstride] = (od_coeff)r1; - x[2*xstride] = (od_coeff)r2; - x[3*xstride] = (od_coeff)r3; - x[4*xstride] = (od_coeff)r4; - x[5*xstride] = (od_coeff)r5; - x[6*xstride] = (od_coeff)r6; - x[7*xstride] = (od_coeff)r7; + int i; + od_coeff xp[8]; + od_coeff yp[8]; + for (i = 0; i < 8; i++) yp[i] = y[OD_DST_8_PERM[i]]; + od_poly_prod_8(xp, yp); + for (i = 0; i < 8; i++) x[i*xstride] = xp[i]; } void od_bin_fdct16(od_coeff y[16], const od_coeff *x, int xstride) {