Add CONFIG_DAALA_DCT4 experiment.

This experiment replaces the 4-point Type-II scaled-output vp9 DCT
 transform with the 4-point Type-II orthonormal Daala DCT transform.
Right now the CONFIG_DAALA_DCT4 experiment depends on CONFIG_DCT_ONLY
 as it does not add an orthonormal 4-point DST.

subset-1:

monty-baseline-dctonly-squaretx-subset1 ->
  monty-dct4-dctonly-squaretx-subset1-rerun

  PSNR | PSNR Cb | PSNR Cr | PSNR HVS |   SSIM | MS SSIM | CIEDE 2000
0.0055 | -0.0132 | -0.0405 |   0.0261 | 0.0005 |  0.0246 |     0.0226

objective-1-fast:

monty-baseline-dctonly-squaretx-o1f ->
  monty-dct4-dctonly-squaretx-o1f

   PSNR | PSNR Cb | PSNR Cr | PSNR HVS |    SSIM | MS SSIM | CIEDE 2000
-0.0215 | -0.1573 |     N/A |  -0.0131 | -0.0347 | -0.0390 |    -0.1121

Change-Id: Idef8f6e5525037d5bbb2d0927675c21d1922d69a
diff --git a/aom_dsp/inv_txfm.c b/aom_dsp/inv_txfm.c
index c4efae6..13bebef 100644
--- a/aom_dsp/inv_txfm.c
+++ b/aom_dsp/inv_txfm.c
@@ -14,6 +14,9 @@
 
 #include "./aom_dsp_rtcd.h"
 #include "aom_dsp/inv_txfm.h"
+#if CONFIG_DAALA_DCT4
+#include "av1/common/daala_tx.h"
+#endif
 
 void aom_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
   /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
@@ -93,6 +96,18 @@
   }
 }
 
+#if CONFIG_DAALA_DCT4
+void aom_idct4_c(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  od_coeff x[4];
+  od_coeff y[4];
+  for (i = 0; i < 4; i++) y[i] = input[i];
+  od_bin_idct4(x, 1, y);
+  for (i = 0; i < 4; i++) output[i] = (tran_low_t)x[i];
+}
+
+#else
+
 void aom_idct4_c(const tran_low_t *input, tran_low_t *output) {
   tran_low_t step[4];
   tran_high_t temp1, temp2;
@@ -112,6 +127,7 @@
   output[2] = WRAPLOW(step[1] - step[2]);
   output[3] = WRAPLOW(step[0] - step[3]);
 }
+#endif
 
 void aom_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
   tran_low_t out[4 * 4];
diff --git a/av1/av1.cmake b/av1/av1.cmake
index f269c72..c2811ca 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -33,6 +33,8 @@
     "${AOM_ROOT}/av1/common/common_data.h"
     "${AOM_ROOT}/av1/common/convolve.c"
     "${AOM_ROOT}/av1/common/convolve.h"
+    "${AOM_ROOT}/av1/common/daala_tx.c"
+    "${AOM_ROOT}/av1/common/daala_tx.h"
     "${AOM_ROOT}/av1/common/debugmodes.c"
     "${AOM_ROOT}/av1/common/entropy.c"
     "${AOM_ROOT}/av1/common/entropy.h"
diff --git a/av1/av1_common.mk b/av1/av1_common.mk
index c18f7d5..95263ab 100644
--- a/av1/av1_common.mk
+++ b/av1/av1_common.mk
@@ -24,6 +24,8 @@
 AV1_COMMON_SRCS-yes += common/alloccommon.h
 AV1_COMMON_SRCS-yes += common/blockd.h
 AV1_COMMON_SRCS-yes += common/common.h
+AV1_COMMON_SRCS-yes += common/daala_tx.c
+AV1_COMMON_SRCS-yes += common/daala_tx.h
 AV1_COMMON_SRCS-yes += common/entropy.h
 AV1_COMMON_SRCS-yes += common/entropymode.h
 AV1_COMMON_SRCS-yes += common/entropymv.h
diff --git a/av1/common/daala_tx.c b/av1/common/daala_tx.c
new file mode 100644
index 0000000..32ad568
--- /dev/null
+++ b/av1/common/daala_tx.c
@@ -0,0 +1,119 @@
+#include "av1/common/daala_tx.h"
+#include "av1/common/odintrin.h"
+
+/* clang-format off */
+
+# define OD_DCT_RSHIFT(_a, _b) OD_UNBIASED_RSHIFT32(_a, _b)
+
+/* TODO: Daala DCT overflow checks need to be ported as a later test */
+# if defined(OD_DCT_CHECK_OVERFLOW)
+# else
+#  define OD_DCT_OVERFLOW_CHECK(val, scale, offset, idx)
+# endif
+
+#define OD_FDCT_2_ASYM(p0, p1, p1h) \
+  /* Embedded 2-point asymmetric Type-II fDCT. */ \
+  do { \
+    p0 += p1h; \
+    p1 = p0 - p1; \
+  } \
+  while (0)
+
+#define OD_IDCT_2_ASYM(p0, p1, p1h) \
+  /* Embedded 2-point asymmetric Type-II iDCT. */ \
+  do { \
+    p1 = p0 - p1; \
+    p1h = OD_DCT_RSHIFT(p1, 1); \
+    p0 -= p1h; \
+  } \
+  while (0)
+
+#define OD_FDST_2_ASYM(p0, p1) \
+  /* Embedded 2-point asymmetric Type-IV fDST. */ \
+  do { \
+    /* 11507/16384 ~= 4*Sin[Pi/8] - 2*Tan[Pi/8] ~= 0.702306604714169 */ \
+    OD_DCT_OVERFLOW_CHECK(p1, 11507, 8192, 187); \
+    p0 -= (p1*11507 + 8192) >> 14; \
+    /* 669/1024 ~= Cos[Pi/8]/Sqrt[2] ~= 0.653281482438188 */ \
+    OD_DCT_OVERFLOW_CHECK(p0, 669, 512, 188); \
+    p1 += (p0*669 + 512) >> 10; \
+    /* 4573/4096 ~= 4*Sin[Pi/8] - Tan[Pi/8] ~= 1.11652016708726 */ \
+    OD_DCT_OVERFLOW_CHECK(p1, 4573, 2048, 189); \
+    p0 -= (p1*4573 + 2048) >> 12; \
+  } \
+  while (0)
+
+#define OD_IDST_2_ASYM(p0, p1) \
+  /* Embedded 2-point asymmetric Type-IV iDST. */ \
+  do { \
+    /* 4573/4096 ~= 4*Sin[Pi/8] - Tan[Pi/8] ~= 1.11652016708726 */ \
+    p0 += (p1*4573 + 2048) >> 12; \
+    /* 669/1024 ~= Cos[Pi/8]/Sqrt[2] ~= 0.653281482438188 */ \
+    p1 -= (p0*669 + 512) >> 10; \
+    /* 11507/16384 ~= 4*Sin[Pi/8] - 2*Tan[Pi/8] ~= 0.702306604714169 */ \
+    p0 += (p1*11507 + 8192) >> 14; \
+  } \
+  while (0)
+
+#define OD_FDCT_4(q0, q2, q1, q3) \
+  /* Embedded 4-point orthonormal Type-II fDCT. */ \
+  do { \
+    int q2h; \
+    int q3h; \
+    q3 = q0 - q3; \
+    q3h = OD_DCT_RSHIFT(q3, 1); \
+    q0 -= q3h; \
+    q2 += q1; \
+    q2h = OD_DCT_RSHIFT(q2, 1); \
+    q1 = q2h - q1; \
+    OD_FDCT_2_ASYM(q0, q2, q2h); \
+    OD_FDST_2_ASYM(q3, q1); \
+  } \
+  while (0)
+
+#define OD_IDCT_4(q0, q2, q1, q3) \
+  /* Embedded 4-point orthonormal Type-II iDCT. */ \
+  do { \
+    int q1h; \
+    int q3h; \
+    OD_IDST_2_ASYM(q3, q2); \
+    OD_IDCT_2_ASYM(q0, q1, q1h); \
+    q3h = OD_DCT_RSHIFT(q3, 1); \
+    q0 += q3h; \
+    q3 = q0 - q3; \
+    q2 = q1h - q2; \
+    q1 -= q2; \
+  } \
+  while (0)
+
+void od_bin_fdct4(od_coeff y[4], const od_coeff *x, int xstride) {
+  int q0;
+  int q1;
+  int q2;
+  int q3;
+  q0 = x[0*xstride];
+  q2 = x[1*xstride];
+  q1 = x[2*xstride];
+  q3 = x[3*xstride];
+  OD_FDCT_4(q0, q2, q1, q3);
+  y[0] = (od_coeff)q0;
+  y[1] = (od_coeff)q1;
+  y[2] = (od_coeff)q2;
+  y[3] = (od_coeff)q3;
+}
+
+void od_bin_idct4(od_coeff *x, int xstride, const od_coeff y[4]) {
+  int q0;
+  int q1;
+  int q2;
+  int q3;
+  q0 = y[0];
+  q2 = y[1];
+  q1 = y[2];
+  q3 = y[3];
+  OD_IDCT_4(q0, q2, q1, q3);
+  x[0*xstride] = q0;
+  x[1*xstride] = q1;
+  x[2*xstride] = q2;
+  x[3*xstride] = q3;
+}
diff --git a/av1/common/daala_tx.h b/av1/common/daala_tx.h
new file mode 100644
index 0000000..5b739c6
--- /dev/null
+++ b/av1/common/daala_tx.h
@@ -0,0 +1,9 @@
+#ifndef AOM_DSP_DAALA_TX_H_
+#define AOM_DSP_DAALA_TX_H_
+
+#include "av1/common/odintrin.h"
+
+void od_bin_fdct4(od_coeff y[4], const od_coeff *x, int xstride);
+void od_bin_idct4(od_coeff *x, int xstride, const od_coeff y[4]);
+
+#endif
diff --git a/av1/common/idct.c b/av1/common/idct.c
index 49a91fb..80a671d 100644
--- a/av1/common/idct.c
+++ b/av1/common/idct.c
@@ -36,8 +36,13 @@
 #if CONFIG_EXT_TX
 static void iidtx4_c(const tran_low_t *input, tran_low_t *output) {
   int i;
-  for (i = 0; i < 4; ++i)
+  for (i = 0; i < 4; ++i) {
+#if CONFIG_DAALA_DCT4
+    output[i] = input[i];
+#else
     output[i] = (tran_low_t)dct_const_round_shift(input[i] * Sqrt2);
+#endif
+  }
 }
 
 static void iidtx8_c(const tran_low_t *input, tran_low_t *output) {
@@ -249,10 +254,12 @@
 void av1_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                          const INV_TXFM_PARAM *param) {
   int tx_type = param->tx_type;
+#if !CONFIG_DAALA_DCT4
   if (tx_type == DCT_DCT) {
     aom_idct4x4_16_add(input, dest, stride);
     return;
   }
+#endif
   static const transform_2d IHT_4[] = {
     { aom_idct4_c, aom_idct4_c },    // DCT_DCT  = 0
     { aom_iadst4_c, aom_idct4_c },   // ADST_DCT = 1
@@ -293,12 +300,18 @@
 
   // inverse transform row vectors
   for (i = 0; i < 4; ++i) {
+#if CONFIG_DAALA_DCT4
+    tran_low_t temp_in[4];
+    for (j = 0; j < 4; j++) temp_in[j] = input[j] << 1;
+    IHT_4[tx_type].rows(temp_in, out[i]);
+#else
 #if CONFIG_LGT
     if (use_lgt_row)
       ilgt4(input, out[i], lgtmtx_row[i]);
     else
 #endif
       IHT_4[tx_type].rows(input, out[i]);
+#endif
     input += 4;
   }
 
@@ -328,7 +341,11 @@
     for (j = 0; j < 4; ++j) {
       int d = i * stride + j;
       int s = j * outstride + i;
+#if CONFIG_DAALA_DCT4
       dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
+#else
+      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
+#endif
     }
   }
 }
@@ -1440,7 +1457,11 @@
   }
 
   switch (tx_type) {
+#if !CONFIG_DAALA_DCT4
     case DCT_DCT: av1_idct4x4_add(input, dest, stride, param); break;
+#else
+    case DCT_DCT:
+#endif
     case ADST_DCT:
     case DCT_ADST:
     case ADST_ADST:
diff --git a/av1/common/odintrin.h b/av1/common/odintrin.h
index 3a778ab..a50c456 100644
--- a/av1/common/odintrin.h
+++ b/av1/common/odintrin.h
@@ -89,6 +89,12 @@
 
 typedef int od_coeff;
 
+/*This is the strength reduced version of ((_a)/(1 << (_b))).
+  This will not work for _b == 0, however currently this is only used for
+   b == 1 anyway.*/
+# define OD_UNBIASED_RSHIFT32(_a, _b) \
+  (((int32_t)(((uint32_t)(_a) >> (32 - (_b))) + (_a))) >> (_b))
+
 #define OD_DIVU_DMAX (1024)
 
 extern uint32_t OD_DIVU_SMALL_CONSTS[OD_DIVU_DMAX][2];
diff --git a/av1/encoder/dct.c b/av1/encoder/dct.c
index 198c76d..e9d166c 100644
--- a/av1/encoder/dct.c
+++ b/av1/encoder/dct.c
@@ -21,6 +21,9 @@
 #include "av1/common/av1_fwd_txfm1d.h"
 #include "av1/common/av1_fwd_txfm1d_cfg.h"
 #include "av1/common/idct.h"
+#if CONFIG_DAALA_DCT4
+#include "av1/common/daala_tx.h"
+#endif
 
 static INLINE void range_check(const tran_low_t *input, const int size,
                                const int bit) {
@@ -39,6 +42,18 @@
 #endif
 }
 
+#if CONFIG_DAALA_DCT4
+static void fdct4(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  od_coeff x[4];
+  od_coeff y[4];
+  for (i = 0; i < 4; i++) x[i] = (od_coeff)input[i];
+  od_bin_fdct4(y, x, 1);
+  for (i = 0; i < 4; i++) output[i] = (tran_low_t)y[i];
+}
+
+#else
+
 static void fdct4(const tran_low_t *input, tran_low_t *output) {
   tran_high_t temp;
   tran_low_t step[4];
@@ -74,6 +89,7 @@
 
   range_check(output, 4, 16);
 }
+#endif
 
 static void fdct8(const tran_low_t *input, tran_low_t *output) {
   tran_high_t temp;
@@ -1078,8 +1094,13 @@
 // being used for square transforms.
 static void fidtx4(const tran_low_t *input, tran_low_t *output) {
   int i;
-  for (i = 0; i < 4; ++i)
+  for (i = 0; i < 4; ++i) {
+#if CONFIG_DAALA_DCT4
+    output[i] = input[i];
+#else
     output[i] = (tran_low_t)fdct_round_shift(input[i] * Sqrt2);
+#endif
+  }
 }
 
 static void fidtx8(const tran_low_t *input, tran_low_t *output) {
@@ -1199,9 +1220,13 @@
 #if CONFIG_DCT_ONLY
   assert(tx_type == DCT_DCT);
 #endif
+#if !CONFIG_DAALA_DCT4
   if (tx_type == DCT_DCT) {
     aom_fdct4x4_c(input, output, stride);
-  } else {
+    return;
+  }
+#endif
+  {
     static const transform_2d FHT[] = {
       { fdct4, fdct4 },    // DCT_DCT
       { fadst4, fdct4 },   // ADST_DCT
@@ -1243,8 +1268,11 @@
 
     // Columns
     for (i = 0; i < 4; ++i) {
+      /* A C99-safe upshift by 4 for both Daala and VPx TX. */
       for (j = 0; j < 4; ++j) temp_in[j] = input[j * stride + i] * 16;
+#if !CONFIG_DAALA_DCT4
       if (i == 0 && temp_in[0]) temp_in[0] += 1;
+#endif
 #if CONFIG_LGT
       if (use_lgt_col)
         flgt4(temp_in, temp_out, lgtmtx_col[i]);
@@ -1263,7 +1291,13 @@
       else
 #endif
         ht.rows(temp_in, temp_out);
+#if CONFIG_DAALA_DCT4
+      /* Daala TX has orthonormal scaling; shift down by only 1 to achieve
+         the usual VPx coefficient left-shift of 3. */
+      for (j = 0; j < 4; ++j) output[j + i * 4] = temp_out[j] >> 1;
+#else
       for (j = 0; j < 4; ++j) output[j + i * 4] = (temp_out[j] + 1) >> 2;
+#endif
     }
   }
 }
diff --git a/configure b/configure
index b4f75fc..96faac0 100755
--- a/configure
+++ b/configure
@@ -291,6 +291,7 @@
     cfl
     xiphrc
     dct_only
+    daala_dct4
     cb4x4
     chroma_2x2
     chroma_sub8x8
@@ -559,6 +560,14 @@
       log_echo "ec_smallmul requires not ans, so disabling ec_smallmul"
       disable_feature ec_smallmul
     fi
+    if enabled daala_dct4; then
+      enable_feature dct_only
+      disable_feature mmx
+      disable_feature rect_tx
+      disable_feature var_tx
+      disable_feature lgt
+      enable_feature lowbitdepth
+    fi
     if enabled ext_tile; then
       log_echo "ext_tile not compatible with reference_buffer, so"
       log_echo "disabling reference_buffer"
diff --git a/test/av1_dct_test.cc b/test/av1_dct_test.cc
index 691cc8b..2d4f26b 100644
--- a/test/av1_dct_test.cc
+++ b/test/av1_dct_test.cc
@@ -23,6 +23,9 @@
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 1
 #define AV1_DCT_GTEST
 #include "av1/encoder/dct.c"
+#if CONFIG_DAALA_DCT4
+#include "av1/common/daala_tx.c"
+#endif
 
 using libaom_test::ACMRandom;