Add CONFIG_DAALA_DCT8 experiment.

This experiment replaces the 8-point Type-II DCT and 8-point Type-IV DST
 scaling vp9 transforms with the 8-point orthonormal Daala transforms.
These have reduced complexity and provide perfect reconstruction, at the
 cost of slightly worse coding performance.
The loss occurs because the Daala transforms expect the input to be
 shifted by 4 bits but the output scale of the vp9 transforms is only 3 bits.

subset-1:

monty-square-baseline-subset1 ->
  monty-square-dct8-subset1@2017-07-17T21:37:44.281Z

  PSNR | PSNR Cb | PSNR Cr | PSNR HVS |   SSIM | MS SSIM | CIEDE 2000
0.0019 | -0.0011 | -0.0585 |  -0.0111 | 0.0305 |  0.0317 |     0.0187

objective-1-fast:

monty-square-baseline-o1f ->
  monty-square-dct8-o1f@2017-07-17T21:37:15.735Z

  PSNR | PSNR Cb | PSNR Cr | PSNR HVS |   SSIM | MS SSIM | CIEDE 2000
0.0285 |  0.0129 | -0.5080 |   0.0529 | 0.0345 |  0.0441 |     0.0054

Change-Id: I2b775495398fb717204a295397c3c5e3ca938183
diff --git a/aom_dsp/inv_txfm.c b/aom_dsp/inv_txfm.c
index 13bebef..bf1345c 100644
--- a/aom_dsp/inv_txfm.c
+++ b/aom_dsp/inv_txfm.c
@@ -14,7 +14,7 @@
 
 #include "./aom_dsp_rtcd.h"
 #include "aom_dsp/inv_txfm.h"
-#if CONFIG_DAALA_DCT4
+#if CONFIG_DAALA_DCT4 || CONFIG_DAALA_DCT8
 #include "av1/common/daala_tx.h"
 #endif
 
@@ -172,6 +172,18 @@
   }
 }
 
+#if CONFIG_DAALA_DCT8
+void aom_idct8_c(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  od_coeff x[8];
+  od_coeff y[8];
+  for (i = 0; i < 8; i++) y[i] = (od_coeff)input[i];
+  od_bin_idct8(x, 1, y);
+  for (i = 0; i < 8; i++) output[i] = (tran_low_t)x[i];
+}
+
+#else
+
 void aom_idct8_c(const tran_low_t *input, tran_low_t *output) {
   tran_low_t step1[8], step2[8];
   tran_high_t temp1, temp2;
@@ -225,6 +237,7 @@
   output[6] = WRAPLOW(step1[1] - step1[6]);
   output[7] = WRAPLOW(step1[0] - step1[7]);
 }
+#endif
 
 void aom_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
   tran_low_t out[8 * 8];
@@ -300,6 +313,18 @@
   output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3));
 }
 
+#if CONFIG_DAALA_DCT8
+void aom_iadst8_c(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  od_coeff x[8];
+  od_coeff y[8];
+  for (i = 0; i < 8; i++) y[i] = (od_coeff)input[i];
+  od_bin_idst8(x, 1, y);
+  for (i = 0; i < 8; i++) output[i] = (tran_low_t)x[i];
+}
+
+#else
+
 void aom_iadst8_c(const tran_low_t *input, tran_low_t *output) {
   int s0, s1, s2, s3, s4, s5, s6, s7;
 
@@ -377,6 +402,8 @@
   output[7] = WRAPLOW(-x1);
 }
 
+#endif
+
 void aom_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
   tran_low_t out[8 * 8] = { 0 };
   tran_low_t *outptr = out;
diff --git a/av1/common/daala_tx.c b/av1/common/daala_tx.c
index 32ad568..31f03de 100644
--- a/av1/common/daala_tx.c
+++ b/av1/common/daala_tx.c
@@ -11,6 +11,33 @@
 #  define OD_DCT_OVERFLOW_CHECK(val, scale, offset, idx)
 # endif
 
+#define OD_FDCT_2(p0, p1) \
+  /* Embedded 2-point orthonormal Type-II fDCT. */ \
+  do { \
+    /* 13573/32768 ~= Tan[pi/8] ~= 0.414213562373095 */ \
+    OD_DCT_OVERFLOW_CHECK(p1, 13573, 16384, 100); \
+    p0 -= (p1*13573 + 16384) >> 15; \
+    /* 5793/8192 ~= Sin[pi/4] ~= 0.707106781186547 */ \
+    OD_DCT_OVERFLOW_CHECK(p0, 5793, 4096, 101); \
+    p1 += (p0*5793 + 4096) >> 13; \
+    /* 3393/8192 ~= Tan[pi/8] ~= 0.414213562373095 */ \
+    OD_DCT_OVERFLOW_CHECK(p1, 3393, 4096, 102); \
+    p0 -= (p1*3393 + 4096) >> 13; \
+  } \
+  while (0)
+
+#define OD_IDCT_2(p0, p1) \
+  /* Embedded 2-point orthonormal Type-II iDCT. */ \
+  do { \
+    /* 3393/8192 ~= Tan[pi/8] ~= 0.414213562373095 */ \
+    p0 += (p1*3393 + 4096) >> 13; \
+    /* 5793/8192 ~= Sin[pi/4] ~= 0.707106781186547 */ \
+    p1 -= (p0*5793 + 4096) >> 13; \
+    /* 13573/32768 ~= Tan[pi/8] ~= 0.414213562373095 */ \
+    p0 += (p1*13573 + 16384) >> 15; \
+  } \
+  while (0)
+
 #define OD_FDCT_2_ASYM(p0, p1, p1h) \
   /* Embedded 2-point asymmetric Type-II fDCT. */ \
   do { \
@@ -28,6 +55,33 @@
   } \
   while (0)
 
+#define OD_FDST_2(p0, p1) \
+  /* Embedded 2-point orthonormal Type-IV fDST. */ \
+  do { \
+    /* 10947/16384 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
+    OD_DCT_OVERFLOW_CHECK(p1, 10947, 8192, 103); \
+    p0 -= (p1*10947 + 8192) >> 14; \
+    /* 473/512 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
+    OD_DCT_OVERFLOW_CHECK(p0, 473, 256, 104); \
+    p1 += (p0*473 + 256) >> 9; \
+    /* 10947/16384 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
+    OD_DCT_OVERFLOW_CHECK(p1, 10947, 8192, 105); \
+    p0 -= (p1*10947 + 8192) >> 14; \
+  } \
+  while (0)
+
+#define OD_IDST_2(p0, p1) \
+  /* Embedded 2-point orthonormal Type-IV iDST. */ \
+  do { \
+    /* 10947/16384 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
+    p0 += (p1*10947 + 8192) >> 14; \
+    /* 473/512 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
+    p1 -= (p0*473 + 256) >> 9; \
+    /* 10947/16384 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
+    p0 += (p1*10947 + 8192) >> 14; \
+  } \
+  while (0)
+
 #define OD_FDST_2_ASYM(p0, p1) \
   /* Embedded 2-point asymmetric Type-IV fDST. */ \
   do { \
@@ -86,6 +140,312 @@
   } \
   while (0)
 
+#define OD_FDCT_4_ASYM(q0, q2, q2h, q1, q3, q3h) \
+  /* Embedded 4-point asymmetric Type-II fDCT. */ \
+  do { \
+    q0 += q3h; \
+    q3 = q0 - q3; \
+    q1 = q2h - q1; \
+    q2 = q1 - q2; \
+    OD_FDCT_2(q0, q2); \
+    OD_FDST_2(q3, q1); \
+  } \
+  while (0)
+
+#define OD_IDCT_4_ASYM(q0, q2, q1, q1h, q3, q3h) \
+  /* Embedded 4-point asymmetric Type-II iDCT. */ \
+  do { \
+    OD_IDST_2(q3, q2); \
+    OD_IDCT_2(q0, q1); \
+    q1 = q2 - q1; \
+    q1h = OD_DCT_RSHIFT(q1, 1); \
+    q2 = q1h - q2; \
+    q3 = q0 - q3; \
+    q3h = OD_DCT_RSHIFT(q3, 1); \
+    q0 -= q3h; \
+  } \
+  while (0)
+
+#define OD_FDST_4_ASYM(t0, t0h, t2, t1, t3) \
+  /* Embedded 4-point asymmetric Type-IV fDST. */ \
+  do { \
+    /* 7489/8192 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
+    OD_DCT_OVERFLOW_CHECK(t1, 7489, 4096, 106); \
+    t2 -= (t1*7489 + 4096) >> 13; \
+    /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
+    OD_DCT_OVERFLOW_CHECK(t2, 11585, 8192, 107); \
+    t1 += (t2*11585 + 8192) >> 14; \
+    /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
+    OD_DCT_OVERFLOW_CHECK(t1, 19195, 16384, 108); \
+    t2 += (t1*19195 + 16384) >> 15; \
+    t3 += OD_DCT_RSHIFT(t2, 1); \
+    t2 -= t3; \
+    t1 = t0h - t1; \
+    t0 -= t1; \
+    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \
+    OD_DCT_OVERFLOW_CHECK(t0, 6723, 4096, 109); \
+    t3 += (t0*6723 + 4096) >> 13; \
+    /* 8035/8192 ~= Sin[7*Pi/16] ~= 0.980785280403230 */ \
+    OD_DCT_OVERFLOW_CHECK(t3, 8035, 4096, 110); \
+    t0 -= (t3*8035 + 4096) >> 13; \
+    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \
+    OD_DCT_OVERFLOW_CHECK(t0, 6723, 4096, 111); \
+    t3 += (t0*6723 + 4096) >> 13; \
+    /* 8757/16384 ~= Tan[5*Pi/32] ~= 0.534511135950792 */ \
+    OD_DCT_OVERFLOW_CHECK(t1, 8757, 8192, 112); \
+    t2 += (t1*8757 + 8192) >> 14; \
+    /* 6811/8192 ~= Sin[5*Pi/16] ~= 0.831469612302545 */ \
+    OD_DCT_OVERFLOW_CHECK(t2, 6811, 4096, 113); \
+    t1 -= (t2*6811 + 4096) >> 13; \
+    /* 8757/16384 ~= Tan[5*Pi/32] ~= 0.534511135950792 */ \
+    OD_DCT_OVERFLOW_CHECK(t1, 8757, 8192, 114); \
+    t2 += (t1*8757 + 8192) >> 14; \
+  } \
+  while (0)
+
+#define OD_IDST_4_ASYM(t0, t0h, t2, t1, t3) \
+  /* Embedded 4-point asymmetric Type-IV iDST. */ \
+  do { \
+    /* 8757/16384 ~= Tan[5*Pi/32] ~= 0.534511135950792 */ \
+    t1 -= (t2*8757 + 8192) >> 14; \
+    /* 6811/8192 ~= Sin[5*Pi/16] ~= 0.831469612302545 */ \
+    t2 += (t1*6811 + 4096) >> 13; \
+    /* 8757/16384 ~= Tan[5*Pi/32] ~= 0.534511135950792 */ \
+    t1 -= (t2*8757 + 8192) >> 14; \
+    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \
+    t3 -= (t0*6723 + 4096) >> 13; \
+    /* 8035/8192 ~= Sin[7*Pi/16] ~= 0.980785280403230 */ \
+    t0 += (t3*8035 + 4096) >> 13; \
+    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \
+    t3 -= (t0*6723 + 4096) >> 13; \
+    t0 += t2; \
+    t0h = OD_DCT_RSHIFT(t0, 1); \
+    t2 = t0h - t2; \
+    t1 += t3; \
+    t3 -= OD_DCT_RSHIFT(t1, 1); \
+    /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
+    t1 -= (t2*19195 + 16384) >> 15; \
+    /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
+    t2 -= (t1*11585 + 8192) >> 14; \
+    /* 7489/8192 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
+    t1 += (t2*7489 + 4096) >> 13; \
+  } \
+  while (0)
+
+#define OD_FDCT_8(r0, r4, r2, r6, r1, r5, r3, r7) \
+  /* Embedded 8-point orthonormal Type-II fDCT. */ \
+  do { \
+    int r4h; \
+    int r5h; \
+    int r6h; \
+    int r7h; \
+    r7 = r0 - r7; \
+    r7h = OD_DCT_RSHIFT(r7, 1); \
+    r0 -= r7h; \
+    r6 += r1; \
+    r6h = OD_DCT_RSHIFT(r6, 1); \
+    r1 = r6h - r1; \
+    r5 = r2 - r5; \
+    r5h = OD_DCT_RSHIFT(r5, 1); \
+    r2 -= r5h; \
+    r4 += r3; \
+    r4h = OD_DCT_RSHIFT(r4, 1); \
+    r3 = r4h - r3; \
+    OD_FDCT_4_ASYM(r0, r4, r4h, r2, r6, r6h); \
+    OD_FDST_4_ASYM(r7, r7h, r3, r5, r1); \
+  } \
+  while (0)
+
+#define OD_IDCT_8(r0, r4, r2, r6, r1, r5, r3, r7) \
+  /* Embedded 8-point orthonormal Type-II iDCT. */ \
+  do { \
+    int r1h; \
+    int r3h; \
+    int r5h; \
+    int r7h; \
+    OD_IDST_4_ASYM(r7, r7h, r5, r6, r4); \
+    OD_IDCT_4_ASYM(r0, r2, r1, r1h, r3, r3h); \
+    r0 += r7h; \
+    r7 = r0 - r7; \
+    r6 = r1h - r6; \
+    r1 -= r6; \
+    r5h = OD_DCT_RSHIFT(r5, 1); \
+    r2 += r5h; \
+    r5 = r2 - r5; \
+    r4 = r3h - r4; \
+    r3 -= r4; \
+  } \
+  while (0)
+
+#define OD_FDST_8(t0, t4, t2, t6, t1, t5, t3, t7) \
+  /* Embedded 8-point orthonormal Type-IV fDST. */ \
+  do { \
+    int t0h; \
+    int t2h; \
+    int t5h; \
+    int t7h; \
+    /* 13573/32768 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
+    OD_DCT_OVERFLOW_CHECK(t1, 13573, 16384, 115); \
+    t6 -= (t1*13573 + 16384) >> 15; \
+    /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186547 */ \
+    OD_DCT_OVERFLOW_CHECK(t6, 11585, 8192, 116); \
+    t1 += (t6*11585 + 8192) >> 14; \
+    /* 13573/32768 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
+    OD_DCT_OVERFLOW_CHECK(t1, 13573, 16384, 117); \
+    t6 -= (t1*13573 + 16384) >> 15; \
+    /* 21895/32768 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
+    OD_DCT_OVERFLOW_CHECK(t2, 21895, 16384, 118); \
+    t5 -= (t2*21895 + 16384) >> 15; \
+    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
+    OD_DCT_OVERFLOW_CHECK(t5, 15137, 8192, 119); \
+    t2 += (t5*15137 + 8192) >> 14; \
+    /* 10947/16384 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
+    OD_DCT_OVERFLOW_CHECK(t2, 10947, 8192, 120); \
+    t5 -= (t2*10947 + 8192) >> 14; \
+    /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \
+    OD_DCT_OVERFLOW_CHECK(t3, 3259, 8192, 121); \
+    t4 -= (t3*3259 + 8192) >> 14; \
+    /* 3135/8192 ~= Sin[Pi/8] ~= 0.382683432365090 */ \
+    OD_DCT_OVERFLOW_CHECK(t4, 3135, 4096, 122); \
+    t3 += (t4*3135 + 4096) >> 13; \
+    /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \
+    OD_DCT_OVERFLOW_CHECK(t3, 3259, 8192, 123); \
+    t4 -= (t3*3259 + 8192) >> 14; \
+    t7 += t1; \
+    t7h = OD_DCT_RSHIFT(t7, 1); \
+    t1 -= t7h; \
+    t2 = t3 - t2; \
+    t2h = OD_DCT_RSHIFT(t2, 1); \
+    t3 -= t2h; \
+    t0 -= t6; \
+    t0h = OD_DCT_RSHIFT(t0, 1); \
+    t6 += t0h; \
+    t5 = t4 - t5; \
+    t5h = OD_DCT_RSHIFT(t5, 1); \
+    t4 -= t5h; \
+    t1 += t5h; \
+    t5 = t1 - t5; \
+    t4 += t0h; \
+    t0 -= t4; \
+    t6 -= t2h; \
+    t2 += t6; \
+    t3 -= t7h; \
+    t7 += t3; \
+    /* TODO: Can we move this into another operation? */ \
+    t7 = -t7; \
+    /* 7425/8192 ~= Tan[15*Pi/64] ~= 0.906347169019147 */ \
+    OD_DCT_OVERFLOW_CHECK(t7, 7425, 4096, 124); \
+    t0 -= (t7*7425 + 4096) >> 13; \
+    /* 8153/8192 ~= Sin[15*Pi/32] ~= 0.995184726672197 */ \
+    OD_DCT_OVERFLOW_CHECK(t0, 8153, 4096, 125); \
+    t7 += (t0*8153 + 4096) >> 13; \
+    /* 7425/8192 ~= Tan[15*Pi/64] ~= 0.906347169019147 */ \
+    OD_DCT_OVERFLOW_CHECK(t7, 7425, 4096, 126); \
+    t0 -= (t7*7425 + 4096) >> 13; \
+    /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.148335987538347 */ \
+    OD_DCT_OVERFLOW_CHECK(t1, 4861, 16384, 127); \
+    t6 -= (t1*4861 + 16384) >> 15; \
+    /* 1189/4096 ~= Sin[3*Pi/32] ~= 0.290284677254462 */ \
+    OD_DCT_OVERFLOW_CHECK(t6, 1189, 2048, 128); \
+    t1 += (t6*1189 + 2048) >> 12; \
+    /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.148335987538347 */ \
+    OD_DCT_OVERFLOW_CHECK(t1, 4861, 16384, 129); \
+    t6 -= (t1*4861 + 16384) >> 15; \
+    /* 2455/4096 ~= Tan[11*Pi/64] ~= 0.599376933681924 */ \
+    OD_DCT_OVERFLOW_CHECK(t5, 2455, 2048, 130); \
+    t2 -= (t5*2455 + 2048) >> 12; \
+    /* 7225/8192 ~= Sin[11*Pi/32] ~= 0.881921264348355 */ \
+    OD_DCT_OVERFLOW_CHECK(t2, 7225, 4096, 131); \
+    t5 += (t2*7225 + 4096) >> 13; \
+    /* 2455/4096 ~= Tan[11*Pi/64] ~= 0.599376933681924 */ \
+    OD_DCT_OVERFLOW_CHECK(t5, 2455, 2048, 132); \
+    t2 -= (t5*2455 + 2048) >> 12; \
+    /* 11725/32768 ~= Tan[7*Pi/64] ~= 0.357805721314524 */ \
+    OD_DCT_OVERFLOW_CHECK(t3, 11725, 16384, 133); \
+    t4 -= (t3*11725 + 16384) >> 15; \
+    /* 5197/8192 ~= Sin[7*Pi/32] ~= 0.634393284163645 */ \
+    OD_DCT_OVERFLOW_CHECK(t4, 5197, 4096, 134); \
+    t3 += (t4*5197 + 4096) >> 13; \
+    /* 11725/32768 ~= Tan[7*Pi/64] ~= 0.357805721314524 */ \
+    OD_DCT_OVERFLOW_CHECK(t3, 11725, 16384, 135); \
+    t4 -= (t3*11725 + 16384) >> 15; \
+  } \
+  while (0)
+
+#define OD_IDST_8(t0, t4, t2, t6, t1, t5, t3, t7) \
+  /* Embedded 8-point orthonormal Type-IV iDST. */ \
+  do { \
+    int t0h; \
+    int t2h; \
+    int t5h_; \
+    int t7h_; \
+    /* 11725/32768 ~= Tan[7*Pi/64] ~= 0.357805721314524 */ \
+    t1 += (t6*11725 + 16384) >> 15; \
+    /* 5197/8192 ~= Sin[7*Pi/32] ~= 0.634393284163645 */ \
+    t6 -= (t1*5197 + 4096) >> 13; \
+    /* 11725/32768 ~= Tan[7*Pi/64] ~= 0.357805721314524 */ \
+    t1 += (t6*11725 + 16384) >> 15; \
+    /* 2455/4096 ~= Tan[11*Pi/64] ~= 0.599376933681924 */ \
+    t2 += (t5*2455 + 2048) >> 12; \
+    /* 7225/8192 ~= Sin[11*Pi/32] ~= 0.881921264348355 */ \
+    t5 -= (t2*7225 + 4096) >> 13; \
+    /* 2455/4096 ~= Tan[11*Pi/64] ~= 0.599376933681924 */ \
+    t2 += (t5*2455 + 2048) >> 12; \
+    /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.148335987538347 */ \
+    t3 += (t4*4861 + 16384) >> 15; \
+    /* 1189/4096 ~= Sin[3*Pi/32] ~= 0.290284677254462 */ \
+    t4 -= (t3*1189 + 2048) >> 12; \
+    /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.148335987538347 */ \
+    t3 += (t4*4861 + 16384) >> 15; \
+    /* 7425/8192 ~= Tan[15*Pi/64] ~= 0.906347169019147 */ \
+    t0 += (t7*7425 + 4096) >> 13; \
+    /* 8153/8192 ~= Sin[15*Pi/32] ~= 0.995184726672197 */ \
+    t7 -= (t0*8153 + 4096) >> 13; \
+    /* 7425/8192 ~= Tan[15*Pi/64] ~= 0.906347169019147 */ \
+    t0 += (t7*7425 + 4096) >> 13; \
+    /* TODO: Can we move this into another operation? */ \
+    t7 = -t7; \
+    t7 -= t6; \
+    t7h_ = OD_DCT_RSHIFT(t7, 1); \
+    t6 += t7h_; \
+    t2 -= t3; \
+    t2h = OD_DCT_RSHIFT(t2, 1); \
+    t3 += t2h; \
+    t0 += t1; \
+    t0h = OD_DCT_RSHIFT(t0, 1); \
+    t1 -= t0h; \
+    t5 = t4 - t5; \
+    t5h_ = OD_DCT_RSHIFT(t5, 1); \
+    t4 -= t5h_; \
+    t1 += t5h_; \
+    t5 = t1 - t5; \
+    t3 -= t0h; \
+    t0 += t3; \
+    t6 += t2h; \
+    t2 = t6 - t2; \
+    t4 += t7h_; \
+    t7 -= t4; \
+    /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \
+    t1 += (t6*3259 + 8192) >> 14; \
+    /* 3135/8192 ~= Sin[Pi/8] ~= 0.382683432365090 */ \
+    t6 -= (t1*3135 + 4096) >> 13; \
+    /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \
+    t1 += (t6*3259 + 8192) >> 14; \
+    /* 10947/16384 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
+    t5 += (t2*10947 + 8192) >> 14; \
+    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
+    t2 -= (t5*15137 + 8192) >> 14; \
+    /* 21895/32768 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
+    t5 += (t2*21895 + 16384) >> 15; \
+    /* 13573/32768 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
+    t3 += (t4*13573 + 16384) >> 15; \
+    /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186547 */ \
+    t4 -= (t3*11585 + 8192) >> 14; \
+    /* 13573/32768 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
+    t3 += (t4*13573 + 16384) >> 15; \
+  } \
+  while (0)
+
 void od_bin_fdct4(od_coeff y[4], const od_coeff *x, int xstride) {
   int q0;
   int q1;
@@ -117,3 +477,115 @@
   x[2*xstride] = q2;
   x[3*xstride] = q3;
 }
+
+void od_bin_fdct8(od_coeff y[8], const od_coeff *x, int xstride) {
+  int r0;
+  int r1;
+  int r2;
+  int r3;
+  int r4;
+  int r5;
+  int r6;
+  int r7;
+  r0 = x[0*xstride];
+  r4 = x[1*xstride];
+  r2 = x[2*xstride];
+  r6 = x[3*xstride];
+  r1 = x[4*xstride];
+  r5 = x[5*xstride];
+  r3 = x[6*xstride];
+  r7 = x[7*xstride];
+  OD_FDCT_8(r0, r4, r2, r6, r1, r5, r3, r7);
+  y[0] = (od_coeff)r0;
+  y[1] = (od_coeff)r1;
+  y[2] = (od_coeff)r2;
+  y[3] = (od_coeff)r3;
+  y[4] = (od_coeff)r4;
+  y[5] = (od_coeff)r5;
+  y[6] = (od_coeff)r6;
+  y[7] = (od_coeff)r7;
+}
+
+void od_bin_idct8(od_coeff *x, int xstride, const od_coeff y[8]) {
+  int r0;
+  int r1;
+  int r2;
+  int r3;
+  int r4;
+  int r5;
+  int r6;
+  int r7;
+  r0 = y[0];
+  r4 = y[1];
+  r2 = y[2];
+  r6 = y[3];
+  r1 = y[4];
+  r5 = y[5];
+  r3 = y[6];
+  r7 = y[7];
+  OD_IDCT_8(r0, r4, r2, r6, r1, r5, r3, r7);
+  x[0*xstride] = (od_coeff)r0;
+  x[1*xstride] = (od_coeff)r1;
+  x[2*xstride] = (od_coeff)r2;
+  x[3*xstride] = (od_coeff)r3;
+  x[4*xstride] = (od_coeff)r4;
+  x[5*xstride] = (od_coeff)r5;
+  x[6*xstride] = (od_coeff)r6;
+  x[7*xstride] = (od_coeff)r7;
+}
+
+void od_bin_fdst8(od_coeff y[8], const od_coeff *x, int xstride) {
+  int r0;
+  int r1;
+  int r2;
+  int r3;
+  int r4;
+  int r5;
+  int r6;
+  int r7;
+  r0 = x[0*xstride];
+  r4 = x[1*xstride];
+  r2 = x[2*xstride];
+  r6 = x[3*xstride];
+  r1 = x[4*xstride];
+  r5 = x[5*xstride];
+  r3 = x[6*xstride];
+  r7 = x[7*xstride];
+  OD_FDST_8(r0, r4, r2, r6, r1, r5, r3, r7);
+  y[0] = (od_coeff)r0;
+  y[1] = (od_coeff)r1;
+  y[2] = (od_coeff)r2;
+  y[3] = (od_coeff)r3;
+  y[4] = (od_coeff)r4;
+  y[5] = (od_coeff)r5;
+  y[6] = (od_coeff)r6;
+  y[7] = (od_coeff)r7;
+}
+
+void od_bin_idst8(od_coeff *x, int xstride, const od_coeff y[8]) {
+  int r0;
+  int r1;
+  int r2;
+  int r3;
+  int r4;
+  int r5;
+  int r6;
+  int r7;
+  r0 = y[0];
+  r4 = y[1];
+  r2 = y[2];
+  r6 = y[3];
+  r1 = y[4];
+  r5 = y[5];
+  r3 = y[6];
+  r7 = y[7];
+  OD_IDST_8(r0, r4, r2, r6, r1, r5, r3, r7);
+  x[0*xstride] = (od_coeff)r0;
+  x[1*xstride] = (od_coeff)r1;
+  x[2*xstride] = (od_coeff)r2;
+  x[3*xstride] = (od_coeff)r3;
+  x[4*xstride] = (od_coeff)r4;
+  x[5*xstride] = (od_coeff)r5;
+  x[6*xstride] = (od_coeff)r6;
+  x[7*xstride] = (od_coeff)r7;
+}
diff --git a/av1/common/daala_tx.h b/av1/common/daala_tx.h
index 5b739c6..39a844c 100644
--- a/av1/common/daala_tx.h
+++ b/av1/common/daala_tx.h
@@ -5,5 +5,9 @@
 
 void od_bin_fdct4(od_coeff y[4], const od_coeff *x, int xstride);
 void od_bin_idct4(od_coeff *x, int xstride, const od_coeff y[4]);
+void od_bin_fdct8(od_coeff y[8], const od_coeff *x, int xstride);
+void od_bin_idct8(od_coeff *x, int xstride, const od_coeff y[8]);
+void od_bin_fdst8(od_coeff y[8], const od_coeff *x, int xstride);
+void od_bin_idst8(od_coeff *x, int xstride, const od_coeff y[8]);
 
 #endif
diff --git a/av1/common/idct.c b/av1/common/idct.c
index 9d4e4b3..63dfdb0 100644
--- a/av1/common/idct.c
+++ b/av1/common/idct.c
@@ -47,7 +47,13 @@
 
 static void iidtx8_c(const tran_low_t *input, tran_low_t *output) {
   int i;
-  for (i = 0; i < 8; ++i) output[i] = input[i] * 2;
+  for (i = 0; i < 8; ++i) {
+#if CONFIG_DAALA_DCT8
+    output[i] = input[i];
+#else
+    output[i] = input[i] * 2;
+#endif
+  }
 }
 
 static void iidtx16_c(const tran_low_t *input, tran_low_t *output) {
@@ -1142,12 +1148,18 @@
 
   // inverse transform row vectors
   for (i = 0; i < 8; ++i) {
+#if CONFIG_DAALA_DCT8
+    tran_low_t temp_in[8];
+    for (j = 0; j < 8; j++) temp_in[j] = input[j] * 2;
+    IHT_8[tx_type].rows(temp_in, out[i]);
+#else
 #if CONFIG_LGT
     if (use_lgt_row)
       ilgt8(input, out[i], lgtmtx_row[i]);
     else
 #endif
       IHT_8[tx_type].rows(input, out[i]);
+#endif
     input += 8;
   }
 
@@ -1177,7 +1189,11 @@
     for (j = 0; j < 8; ++j) {
       int d = i * stride + j;
       int s = j * outstride + i;
+#if CONFIG_DAALA_DCT8
+      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
+#else
       dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
+#endif
     }
   }
 }
@@ -1397,6 +1413,7 @@
     aom_iwht4x4_1_add(input, dest, stride);
 }
 
+#if !CONFIG_DAALA_DCT8
 static void idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
                         const TxfmParam *txfm_param) {
 // If dc is 1, then input[0] is the reconstructed value, do not need
@@ -1421,6 +1438,7 @@
   else
     aom_idct8x8_64_add(input, dest, stride);
 }
+#endif
 
 static void idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,
                           const TxfmParam *txfm_param) {
@@ -1664,7 +1682,11 @@
                              const TxfmParam *txfm_param) {
   const TX_TYPE tx_type = txfm_param->tx_type;
   switch (tx_type) {
+#if !CONFIG_DAALA_DCT8
     case DCT_DCT: idct8x8_add(input, dest, stride, txfm_param); break;
+#else
+    case DCT_DCT:
+#endif
     case ADST_DCT:
     case DCT_ADST:
     case ADST_ADST:
diff --git a/av1/encoder/dct.c b/av1/encoder/dct.c
index a0f0e51..2ffc656 100644
--- a/av1/encoder/dct.c
+++ b/av1/encoder/dct.c
@@ -21,7 +21,7 @@
 #include "av1/common/av1_fwd_txfm1d.h"
 #include "av1/common/av1_fwd_txfm1d_cfg.h"
 #include "av1/common/idct.h"
-#if CONFIG_DAALA_DCT4
+#if CONFIG_DAALA_DCT4 || CONFIG_DAALA_DCT8
 #include "av1/common/daala_tx.h"
 #endif
 
@@ -91,6 +91,18 @@
 }
 #endif
 
+#if CONFIG_DAALA_DCT8
+static void fdct8(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  od_coeff x[8];
+  od_coeff y[8];
+  for (i = 0; i < 8; i++) x[i] = (od_coeff)input[i];
+  od_bin_fdct8(y, x, 1);
+  for (i = 0; i < 8; i++) output[i] = (tran_low_t)y[i];
+}
+
+#else
+
 static void fdct8(const tran_low_t *input, tran_low_t *output) {
   tran_high_t temp;
   tran_low_t step[8];
@@ -168,6 +180,7 @@
 
   range_check(output, 8, 16);
 }
+#endif
 
 static void fdct16(const tran_low_t *input, tran_low_t *output) {
   tran_high_t temp;
@@ -783,6 +796,18 @@
   output[3] = (tran_low_t)fdct_round_shift(s3);
 }
 
+#if CONFIG_DAALA_DCT8
+static void fadst8(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  od_coeff x[8];
+  od_coeff y[8];
+  for (i = 0; i < 8; i++) x[i] = (od_coeff)input[i];
+  od_bin_fdst8(y, x, 1);
+  for (i = 0; i < 8; i++) output[i] = (tran_low_t)y[i];
+}
+
+#else
+
 static void fadst8(const tran_low_t *input, tran_low_t *output) {
   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
 
@@ -853,6 +878,7 @@
   output[6] = (tran_low_t)x5;
   output[7] = (tran_low_t)-x1;
 }
+#endif
 
 static void fadst16(const tran_low_t *input, tran_low_t *output) {
   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
@@ -1120,7 +1146,13 @@
 
 static void fidtx8(const tran_low_t *input, tran_low_t *output) {
   int i;
-  for (i = 0; i < 8; ++i) output[i] = input[i] * 2;
+  for (i = 0; i < 8; ++i) {
+#if CONFIG_DAALA_DCT8
+    output[i] = input[i];
+#else
+    output[i] = input[i] * 2;
+#endif
+  }
 }
 
 static void fidtx16(const tran_low_t *input, tran_low_t *output) {
@@ -2133,9 +2165,13 @@
 #if CONFIG_DCT_ONLY
   assert(tx_type == DCT_DCT);
 #endif
+#if !CONFIG_DAALA_DCT8
   if (tx_type == DCT_DCT) {
     aom_fdct8x8_c(input, output, stride);
-  } else {
+    return;
+  }
+#endif
+  {
     static const transform_2d FHT[] = {
       { fdct8, fdct8 },    // DCT_DCT
       { fadst8, fdct8 },   // ADST_DCT
@@ -2175,7 +2211,11 @@
 
     // Columns
     for (i = 0; i < 8; ++i) {
+#if CONFIG_DAALA_DCT8
+      for (j = 0; j < 8; ++j) temp_in[j] = input[j * stride + i] * 16;
+#else
       for (j = 0; j < 8; ++j) temp_in[j] = input[j * stride + i] * 4;
+#endif
 #if CONFIG_LGT
       if (use_lgt_col)
         flgt8(temp_in, temp_out, lgtmtx_col[i]);
@@ -2194,8 +2234,13 @@
       else
 #endif
         ht.rows(temp_in, temp_out);
+#if CONFIG_DAALA_DCT8
       for (j = 0; j < 8; ++j)
         output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
+#else
+      for (j = 0; j < 8; ++j)
+        output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
+#endif
     }
   }
 }
diff --git a/build/cmake/aom_config_defaults.cmake b/build/cmake/aom_config_defaults.cmake
index 250dc9e..abdae1d 100644
--- a/build/cmake/aom_config_defaults.cmake
+++ b/build/cmake/aom_config_defaults.cmake
@@ -170,6 +170,7 @@
 set(CONFIG_ANALYZER 0 CACHE NUMBER "Internal flag.")
 set(CONFIG_DCT_ONLY 0 CACHE NUMBER "Internal flag.")
 set(CONFIG_DAALA_DCT4 0 CACHE NUMBER "Internal flag.")
+set(CONFIG_DAALA_DCT8 0 CACHE NUMBER "Internal flag.")
 set(CONFIG_GF_GROUPS 0 CACHE NUMBER "Internal flag.")
 set(CONFIG_MRC_TX 0 CACHE NUMBER "Internal flag.")
 set(CONFIG_INTER_STATS_ONLY 0 CACHE NUMBER "Internal flag.")
diff --git a/build/cmake/aom_configure.cmake b/build/cmake/aom_configure.cmake
index 4dbaff7..4eeb7ed 100644
--- a/build/cmake/aom_configure.cmake
+++ b/build/cmake/aom_configure.cmake
@@ -243,6 +243,34 @@
    set(CONFIG_VAR_TX_NO_TX_MODE 0)
 endif()
 
+if (CONFIG_DAALA_DCT8)
+  if (HAVE_MMX)
+     message(WARNING
+       "--- Disabled HAVE_MMX, required for CONFIG_DAALA_DCT8.")
+     set(HAVE_MMX 0)
+  endif()
+  if (CONFIG_RECT_TX)
+     message(WARNING
+       "--- Disabled CONFIG_RECT_TX, required for CONFIG_DAALA_DCT8.")
+     set(CONFIG_RECT_TX 0)
+  endif()
+  if (CONFIG_VAR_TX)
+     message(WARNING
+       "--- Disabled CONFIG_VAR_TX, required for CONFIG_DAALA_DCT8.")
+     set(CONFIG_VAR_TX 0)
+  endif()
+  if (CONFIG_LGT)
+     message(WARNING
+       "--- Disabled CONFIG_LGT, required for CONFIG_DAALA_DCT8.")
+     set(CONFIG_LGT 0)
+   endif()
+  if (NOT CONFIG_LOWBITDEPTH)
+     message(WARNING
+       "--- Enabled CONFIG_LOWBITDEPTH, required for CONFIG_DAALA_DCT8.")
+     set(CONFIG_LOWBITDEPTH 1)
+   endif()
+endif()
+
 if (NOT MSVC)
   aom_push_var(CMAKE_REQUIRED_LIBRARIES "m")
   aom_check_c_compiles("fenv_check"
diff --git a/configure b/configure
index f8191b9..c776ab4 100755
--- a/configure
+++ b/configure
@@ -294,6 +294,7 @@
     xiphrc
     dct_only
     daala_dct4
+    daala_dct8
     cb4x4
     chroma_2x2
     chroma_sub8x8
@@ -573,6 +574,13 @@
       disable_feature lgt
       enable_feature lowbitdepth
     fi
+    if enabled daala_dct8; then
+      disable_feature mmx
+      disable_feature rect_tx
+      disable_feature var_tx
+      disable_feature lgt
+      enable_feature lowbitdepth
+    fi
     if enabled var_tx_no_tx_mode && ! enabled var_tx; then
       log_echo "var_tx_no_tx_mode requires var_tx, so disabling var_tx_no_tx_mode"
       disable_feature var_tx_no_tx_mode
diff --git a/test/av1_dct_test.cc b/test/av1_dct_test.cc
index 2d4f26b..8ce7a79 100644
--- a/test/av1_dct_test.cc
+++ b/test/av1_dct_test.cc
@@ -23,7 +23,7 @@
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 1
 #define AV1_DCT_GTEST
 #include "av1/encoder/dct.c"
-#if CONFIG_DAALA_DCT4
+#if CONFIG_DAALA_DCT4 || CONFIG_DAALA_DCT8
 #include "av1/common/daala_tx.c"
 #endif