Add an orthonormal 4-point Type-VII DST.

Replaces the orthonormal Type-IV DST with an orthonormal Type-VII DST
in od_bin_fdst4() and od_bin_idst4().

Change-Id: I4ff0888e740d8cc063a2e5deaeceef7cb0d80485
diff --git a/av1/common/daala_tx.c b/av1/common/daala_tx.c
index c35cc44..285436f 100644
--- a/av1/common/daala_tx.c
+++ b/av1/common/daala_tx.c
@@ -3150,36 +3150,86 @@
   x[3*xstride] = q3;
 }
 
+/* 4-point orthonormal Type-VII forward DST: y[0..3] from x[k*xstride]. */
 void od_bin_fdst4(od_coeff y[4], const od_coeff *x, int xstride) {
+  /* 11 adds, 5 "muls" (each "mul" is a multiply, rounding add, and shift). */
   int q0;
   int q1;
   int q2;
   int q3;
-  q0 = x[3*xstride];
-  q2 = x[2*xstride];
+  int t0;
+  int t1;
+  int t2;
+  int t3;
+  int t4;
+  q0 = x[0*xstride];
   q1 = x[1*xstride];
-  q3 = x[0*xstride];
-  OD_FDST_4(q0, q2, q1, q3);
-  y[0] = (od_coeff)q3;
-  y[1] = (od_coeff)q2;
-  y[2] = (od_coeff)q1;
-  y[3] = (od_coeff)q0;
+  q2 = x[2*xstride];
+  q3 = x[3*xstride];
+  t0 = q1 + q3;
+  t1 = q0 + q1 - q3;
+  t2 = q0 - q1;
+  t3 = q2;
+  t4 = q0 + q3;
+  /* 7021/16384 ~= 2*Sin[2*Pi/9]/3 ~= 0.428525073124360 */
+  t0 = (t0*7021 + 8192) >> 14;
+  /* 18919/32768 ~= 2*Sin[3*Pi/9]/3 ~= 0.577350269189626 */
+  t1 = (t1*18919 + 16384) >> 15;
+  /* 21513/32768 ~= 2*Sin[4*Pi/9]/3 ~= 0.656538502008139 */
+  t2 = (t2*21513 + 16384) >> 15;
+  /* 18919/32768 ~= 2*Sin[3*Pi/9]/3 ~= 0.577350269189626 */
+  t3 = (t3*18919 + 16384) >> 15;
+  /* 467/2048 ~= 2*Sin[1*Pi/9]/3 ~= 0.228013428883779 */
+  t4 = (t4*467 + 1024) >> 11;
+  q0 = t0 + t3 + t4;
+  q1 = t1;
+  q2 = t0 + t2 - t3;
+  q3 = t2 + t3 - t4;
+  y[0] = (od_coeff)q0;
+  y[1] = (od_coeff)q1;
+  y[2] = (od_coeff)q2;
+  y[3] = (od_coeff)q3;
 }
 
+/* 4-point orthonormal Type-VII inverse DST: x[k*xstride] from y[0..3]. */
 void od_bin_idst4(od_coeff *x, int xstride, const od_coeff y[4]) {
+  /* 11 adds, 5 "muls" (each "mul" is a multiply, rounding add, and shift). */
   int q0;
   int q1;
   int q2;
   int q3;
-  q0 = y[3];
-  q2 = y[2];
+  int t0;
+  int t1;
+  int t2;
+  int t3;
+  int t4;
+  q0 = y[0];
   q1 = y[1];
-  q3 = y[0];
-  OD_IDST_4(q0, q2, q1, q3);
-  x[0*xstride] = q3;
-  x[1*xstride] = q2;
-  x[2*xstride] = q1;
-  x[3*xstride] = q0;
+  q2 = y[2];
+  q3 = y[3];
+  t0 = q0 - q3;
+  t1 = q0 + q2;
+  t2 = q0 - q2 + q3;
+  t3 = q1;
+  t4 = q2 + q3;
+  /* 467/2048 ~= 2*Sin[1*Pi/9]/3 ~= 0.228013428883779 */
+  t0 = (t0*467 + 1024) >> 11;
+  /* 7021/16384 ~= 2*Sin[2*Pi/9]/3 ~= 0.428525073124360 */
+  t1 = (t1*7021 + 8192) >> 14;
+  /* 18919/32768 ~= 2*Sin[3*Pi/9]/3 ~= 0.577350269189626 */
+  t2 = (t2*18919 + 16384) >> 15;
+  /* 18919/32768 ~= 2*Sin[3*Pi/9]/3 ~= 0.577350269189626 */
+  t3 = (t3*18919 + 16384) >> 15;
+  /* 21513/32768 ~= 2*Sin[4*Pi/9]/3 ~= 0.656538502008139 */
+  t4 = (t4*21513 + 16384) >> 15;
+  q0 = t0 + t3 + t4;
+  q1 = t1 + t3 - t4;
+  q2 = t2;
+  q3 = t0 + t1 - t3;
+  x[0*xstride] = q0;
+  x[1*xstride] = q1;
+  x[2*xstride] = q2;
+  x[3*xstride] = q3;
 }
 
 void od_bin_fdct8(od_coeff y[8], const od_coeff *x, int xstride) {