Add 4-point DST to DAALA_DCT4 experiment

CONFIG_DAALA_DCT4 currently force-enables CONFIG_DCT_ONLY because the
experiment has no 4-point DST.  The DST had been left out because it
caused a significant coding performance loss; that loss turned out to
be caused by a bug that has since been corrected.

This patch adds a 4-point type IV DST to the DAALA_DCT4 experiment and
removes the CONFIG_DCT_ONLY force-enable from both configure and the
CMake build.  There is a small coding performance loss in using the
type IV DST over AV1's current type VII.

subset-1:
   monty-newdst4test-baseline-s1-F@2017-07-29T04:58:43.976Z ->
      monty-newdst4test-daala-s1-F@2017-07-29T04:59:56.094Z

   PSNR | PSNR Cb | PSNR Cr | PSNR HVS |    SSIM | MS SSIM | CIEDE 2000
-0.0336 |  0.1393 |  0.0491 |   0.4118 | -0.0439 |  0.2084 |     0.0476

objective-1-fast:
   monty-newdst4test-baseline-o1f-F@2017-07-29T04:58:10.439Z ->
      monty-newdst4test-daala-o1f-F@2017-07-29T04:59:04.678Z

  PSNR | PSNR Cb | PSNR Cr | PSNR HVS |    SSIM | MS SSIM | CIEDE 2000
0.0064 |  0.1071 | -0.0108 |   0.1133 | -0.0035 |  0.0765 |     0.0502

Change-Id: Ie29835edbe0e41bc86f4b09457e88d924cc9bf7e
diff --git a/aom_dsp/inv_txfm.c b/aom_dsp/inv_txfm.c
index 0aa4672..bb5aec5 100644
--- a/aom_dsp/inv_txfm.c
+++ b/aom_dsp/inv_txfm.c
@@ -277,6 +277,16 @@
   }
 }
 
+#if CONFIG_DAALA_DCT4
+void aom_iadst4_c(const tran_low_t *input, tran_low_t *output) {
+  int i;  /* CONFIG_DAALA_DCT4 build: 4-point inverse DST via od_bin_idst4 */
+  od_coeff x[4];  /* result written by od_bin_idst4() */
+  od_coeff y[4];  /* copy of the four input coefficients */
+  for (i = 0; i < 4; i++) y[i] = input[i];
+  od_bin_idst4(x, 1, y);  /* stride 1: x is filled contiguously */
+  for (i = 0; i < 4; i++) output[i] = (tran_low_t)x[i];
+}
+#else
 void aom_iadst4_c(const tran_low_t *input, tran_low_t *output) {
   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
 
@@ -313,6 +323,7 @@
   output[2] = WRAPLOW(dct_const_round_shift(s2));
   output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3));
 }
+#endif
 
 #if CONFIG_DAALA_DCT8
 void aom_iadst8_c(const tran_low_t *input, tran_low_t *output) {
diff --git a/av1/common/daala_tx.c b/av1/common/daala_tx.c
index 72e9ebc..c35cc44 100644
--- a/av1/common/daala_tx.c
+++ b/av1/common/daala_tx.c
@@ -3150,6 +3150,38 @@
   x[3*xstride] = q3;
 }
 
+void od_bin_fdst4(od_coeff y[4], const od_coeff *x, int xstride) {
+  int q0;  /* q0..q3: working values for the 4-point forward DST */
+  int q1;
+  int q2;
+  int q3;
+  q0 = x[3*xstride];  /* inputs are loaded in reverse: q0 gets the last */
+  q2 = x[2*xstride];
+  q1 = x[1*xstride];
+  q3 = x[0*xstride];  /* ... and q3 gets the first */
+  OD_FDST_4(q0, q2, q1, q3);  /* presumably updates q0..q3 in place */
+  y[0] = (od_coeff)q3;  /* outputs are stored in reverse as well */
+  y[1] = (od_coeff)q2;
+  y[2] = (od_coeff)q1;
+  y[3] = (od_coeff)q0;
+}
+
+void od_bin_idst4(od_coeff *x, int xstride, const od_coeff y[4]) {
+  int q0;  /* q0..q3: working values for the 4-point inverse DST */
+  int q1;
+  int q2;
+  int q3;
+  q0 = y[3];  /* coefficients are loaded in reverse: q0 gets the last */
+  q2 = y[2];
+  q1 = y[1];
+  q3 = y[0];  /* ... and q3 gets the first */
+  OD_IDST_4(q0, q2, q1, q3);  /* presumably updates q0..q3 in place */
+  x[0*xstride] = q3;  /* outputs are stored in reverse, xstride apart */
+  x[1*xstride] = q2;
+  x[2*xstride] = q1;
+  x[3*xstride] = q0;
+}
+
 void od_bin_fdct8(od_coeff y[8], const od_coeff *x, int xstride) {
   int r0;
   int r1;
diff --git a/av1/common/daala_tx.h b/av1/common/daala_tx.h
index cef35c9..16ab4c5 100644
--- a/av1/common/daala_tx.h
+++ b/av1/common/daala_tx.h
@@ -5,6 +5,8 @@
 
 void od_bin_fdct4(od_coeff y[4], const od_coeff *x, int xstride);
 void od_bin_idct4(od_coeff *x, int xstride, const od_coeff y[4]);
+void od_bin_fdst4(od_coeff y[4], const od_coeff *x, int xstride);
+void od_bin_idst4(od_coeff *x, int xstride, const od_coeff y[4]);
 void od_bin_fdct8(od_coeff y[8], const od_coeff *x, int xstride);
 void od_bin_idct8(od_coeff *x, int xstride, const od_coeff y[8]);
 void od_bin_fdst8(od_coeff y[8], const od_coeff *x, int xstride);
diff --git a/av1/encoder/dct.c b/av1/encoder/dct.c
index 0bb4798..1bddf0c 100644
--- a/av1/encoder/dct.c
+++ b/av1/encoder/dct.c
@@ -793,6 +793,18 @@
 }
 #endif
 
+#if CONFIG_DAALA_DCT4
+static void fadst4(const tran_low_t *input, tran_low_t *output) {
+  int i;  /* CONFIG_DAALA_DCT4 build: 4-point forward DST via od_bin_fdst4 */
+  od_coeff x[4];  /* the four input values converted to od_coeff */
+  od_coeff y[4];  /* coefficients written by od_bin_fdst4() */
+  for (i = 0; i < 4; i++) x[i] = (od_coeff)input[i];
+  od_bin_fdst4(y, x, 1);  /* stride 1: x is read contiguously */
+  for (i = 0; i < 4; i++) output[i] = (tran_low_t)y[i];
+}
+
+#else
+
 static void fadst4(const tran_low_t *input, tran_low_t *output) {
   tran_high_t x0, x1, x2, x3;
   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
@@ -832,6 +844,7 @@
   output[2] = (tran_low_t)fdct_round_shift(s2);
   output[3] = (tran_low_t)fdct_round_shift(s3);
 }
+#endif
 
 #if CONFIG_DAALA_DCT8
 static void fadst8(const tran_low_t *input, tran_low_t *output) {
diff --git a/build/cmake/aom_configure.cmake b/build/cmake/aom_configure.cmake
index df4252f..74011ad 100644
--- a/build/cmake/aom_configure.cmake
+++ b/build/cmake/aom_configure.cmake
@@ -243,10 +243,6 @@
    change_config_and_warn(CONFIG_VAR_TX 1 CONFIG_VAR_TX_NO_TX_MODE)
 endif()
 
-if (CONFIG_DAALA_DCT4 AND NOT CONFIG_DCT_ONLY)
-  change_config_and_warn(CONFIG_DCT_ONLY 1 CONFIG_DAALA_DCT4)
-endif()
-
 if (CONFIG_DAALA_DCT64)
   if (NOT CONFIG_TX64X64)
      message(WARNING
diff --git a/configure b/configure
index f4c509a..b5f7d73 100755
--- a/configure
+++ b/configure
@@ -575,9 +575,6 @@
       log_echo "ec_smallmul requires not ans, so disabling ec_smallmul"
       disable_feature ec_smallmul
     fi
-    if enabled daala_dct4; then
-      enable_feature dct_only
-    fi
     if enabled daala_dct64; then
       enable_feature tx64x64
     fi