Simplify Daala inverse TX toplevel for constant shift

Rather than backing out all the LGT-related shifting matrices
throughout the existing TX code, separate out and simplify Daala
inverse TX into a single dedicated entry point.  When DAALA_TX is
enabled, CONFIG_HIGHBITDEPTH is also forced, and all of Daala TX
(lowbd and highbd) uses this single TX dispatch.

This patch makes no functional changes.

subset 1:
monty-TXtesting-fwd-s1@2017-11-12T05:25:09.557Z ->
 monty-TXtesting-inv-s1@2017-11-12T05:25:43.878Z

  PSNR | PSNR Cb | PSNR Cr | PSNR HVS |   SSIM | MS SSIM | CIEDE 2000
0.0000 |  0.0000 |  0.0000 |   0.0000 | 0.0000 |  0.0000 |     0.0000

objective-1-fast:
monty-TXtesting-fwd-o1f@2017-11-12T05:25:29.386Z ->
 monty-TXtesting-inv-o1f@2017-11-12T05:25:58.897Z

  PSNR | PSNR Cb | PSNR Cr | PSNR HVS |   SSIM | MS SSIM | CIEDE 2000
0.0000 |  0.0000 |  0.0000 |   0.0000 | 0.0000 |  0.0000 |     0.0000

Change-Id: I790e8d7ac08eb214eb712f5441d6e5f76ebddf17
diff --git a/av1/av1.cmake b/av1/av1.cmake
index b0a6943..f0b8541 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -35,6 +35,8 @@
     "${AOM_ROOT}/av1/common/convolve.h"
     "${AOM_ROOT}/av1/common/daala_tx.c"
     "${AOM_ROOT}/av1/common/daala_tx.h"
+    "${AOM_ROOT}/av1/common/daala_inv_txfm.c"
+    "${AOM_ROOT}/av1/common/daala_inv_txfm.h"
     "${AOM_ROOT}/av1/common/debugmodes.c"
     "${AOM_ROOT}/av1/common/entropy.c"
     "${AOM_ROOT}/av1/common/entropy.h"
diff --git a/av1/av1_common.mk b/av1/av1_common.mk
index 6c3e25c..f0c65c2 100644
--- a/av1/av1_common.mk
+++ b/av1/av1_common.mk
@@ -26,6 +26,8 @@
 AV1_COMMON_SRCS-yes += common/common.h
 AV1_COMMON_SRCS-yes += common/daala_tx.c
 AV1_COMMON_SRCS-yes += common/daala_tx.h
+AV1_COMMON_SRCS-yes += common/daala_inv_txfm.c
+AV1_COMMON_SRCS-yes += common/daala_inv_txfm.h
 AV1_COMMON_SRCS-yes += common/entropy.h
 AV1_COMMON_SRCS-yes += common/entropymode.h
 AV1_COMMON_SRCS-yes += common/entropymv.h
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index efacab2..39a7f29 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -78,62 +78,64 @@
   }
 }
 
-add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
-specialize qw/av1_iht4x8_32_add sse2/;
+if (aom_config("CONFIG_DAALA_TX") ne "yes") {
+  add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
+  specialize qw/av1_iht4x8_32_add sse2/;
 
-add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
-specialize qw/av1_iht8x4_32_add sse2/;
+  add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
+  specialize qw/av1_iht8x4_32_add sse2/;
 
-add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
-specialize qw/av1_iht8x16_128_add sse2/;
+  add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
+  specialize qw/av1_iht8x16_128_add sse2/;
 
-add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
-specialize qw/av1_iht16x8_128_add sse2/;
+  add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
+  specialize qw/av1_iht16x8_128_add sse2/;
 
-add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
-specialize qw/av1_iht16x32_512_add sse2/;
+  add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
+  specialize qw/av1_iht16x32_512_add sse2/;
 
-add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
-specialize qw/av1_iht32x16_512_add sse2/;
+  add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
+  specialize qw/av1_iht32x16_512_add sse2/;
 
-add_proto qw/void av1_iht4x16_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
+  add_proto qw/void av1_iht4x16_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
 
-add_proto qw/void av1_iht16x4_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
+  add_proto qw/void av1_iht16x4_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
 
-add_proto qw/void av1_iht8x32_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
+  add_proto qw/void av1_iht8x32_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
 
-add_proto qw/void av1_iht32x8_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
+  add_proto qw/void av1_iht32x8_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
 
-add_proto qw/void av1_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
-if (aom_config("CONFIG_DAALA_TX8") ne "yes") {
-  if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-    specialize qw/av1_iht8x8_64_add sse2/;
-  } else {
-    specialize qw/av1_iht8x8_64_add sse2 neon/;
+  add_proto qw/void av1_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
+  if (aom_config("CONFIG_DAALA_TX8") ne "yes") {
+    if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
+      specialize qw/av1_iht8x8_64_add sse2/;
+    } else {
+      specialize qw/av1_iht8x8_64_add sse2 neon/;
+    }
   }
-}
 
-add_proto qw/void av1_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, const struct txfm_param *param";
+  add_proto qw/void av1_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, const struct txfm_param *param";
 
-if (aom_config("CONFIG_DAALA_TX16") ne "yes") {
-  if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-    specialize qw/av1_iht16x16_256_add sse2 avx2/;
-  } else {
-    specialize qw/av1_iht16x16_256_add sse2 avx2/;
+  if (aom_config("CONFIG_DAALA_TX16") ne "yes") {
+    if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
+      specialize qw/av1_iht16x16_256_add sse2 avx2/;
+    } else {
+      specialize qw/av1_iht16x16_256_add sse2 avx2/;
+    }
   }
-}
 
-add_proto qw/void av1_iht32x32_1024_add/, "const tran_low_t *input, uint8_t *output, int pitch, const struct txfm_param *param";
+  add_proto qw/void av1_iht32x32_1024_add/, "const tran_low_t *input, uint8_t *output, int pitch, const struct txfm_param *param";
 
-if (aom_config("CONFIG_HIGHBITDEPTH") ne "yes") {
-}
+  if (aom_config("CONFIG_HIGHBITDEPTH") ne "yes") {
+  }
 
-add_proto qw/void av1_iht32x32_1024_add/, "const tran_low_t *input, uint8_t *output, int pitch, const struct txfm_param *param";
+  add_proto qw/void av1_iht32x32_1024_add/, "const tran_low_t *input, uint8_t *output, int pitch, const struct txfm_param *param";
 
-if (aom_config("CONFIG_TX64X64") eq "yes") {
-  add_proto qw/void av1_iht64x64_4096_add/, "const tran_low_t *input, uint8_t *output, int pitch, const struct txfm_param *param";
-  add_proto qw/void av1_iht32x64_2048_add/, "const tran_low_t *input, uint8_t *output, int pitch, const struct txfm_param *param";
-  add_proto qw/void av1_iht64x32_2048_add/, "const tran_low_t *input, uint8_t *output, int pitch, const struct txfm_param *param";
+  if (aom_config("CONFIG_TX64X64") eq "yes") {
+    add_proto qw/void av1_iht64x64_4096_add/, "const tran_low_t *input, uint8_t *output, int pitch, const struct txfm_param *param";
+    add_proto qw/void av1_iht32x64_2048_add/, "const tran_low_t *input, uint8_t *output, int pitch, const struct txfm_param *param";
+    add_proto qw/void av1_iht64x32_2048_add/, "const tran_low_t *input, uint8_t *output, int pitch, const struct txfm_param *param";
+  }
 }
 
 if (aom_config("CONFIG_NEW_QUANT") eq "yes") {
diff --git a/av1/common/daala_inv_txfm.c b/av1/common/daala_inv_txfm.c
new file mode 100644
index 0000000..031cf14
--- /dev/null
+++ b/av1/common/daala_inv_txfm.c
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "./av1_rtcd.h"
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+#include "av1/common/daala_tx.h"
+#include "av1/common/daala_inv_txfm.h"
+
+#if CONFIG_DAALA_TX
+
+// Temporary while we still need av1_get_tx_scale() for testing
+#include "av1/common/idct.h"
+
+typedef void (*daala_itx)(od_coeff *, int, const od_coeff[]);
+// Complete Daala TX map, sans lossless which is special cased.  Indexed as
+// tx_map[1-D size][1-D type]; a NULL entry marks an unavailable transform.
+static const daala_itx tx_map[TX_SIZES][TX_TYPES_1D] = {
+  //  4-point transforms
+  { od_bin_idct4, od_bin_idst4, od_bin_idst4, od_bin_iidtx4 },
+
+  //  8-point transforms
+  { od_bin_idct8, od_bin_idst8, od_bin_idst8, od_bin_iidtx8 },
+
+  //  16-point transforms
+  { od_bin_idct16, od_bin_idst16, od_bin_idst16, od_bin_iidtx16 },
+
+  //  32-point transforms
+  { od_bin_idct32, od_bin_idst32, od_bin_idst32, od_bin_iidtx32 },
+
+#if CONFIG_TX64X64
+  //  64-point transforms
+  { od_bin_idct64, NULL, NULL, od_bin_iidtx64 },
+#endif
+};
+
+static int tx_flip(TX_TYPE_1D t) { return (FLIPADST_1D == t) ? 1 : 0; }
+
+// Daala TX toplevel inverse entry point: inverse-transforms input_coeffs
+// and adds the clipped result into output_pixels.  Shared by low and high
+// bitdepth; tran_low_t must be 32 bits (matching od_coeff).  The pixel buffer
+// is bytes (is_hbd=0) or shorts reached via CONVERT_TO_SHORTPTR (is_hbd=1).
+void daala_inv_txfm_add(const tran_low_t *input_coeffs, void *output_pixels,
+                        int output_stride, TxfmParam *txfm_param) {
+  const TX_SIZE tx_size = txfm_param->tx_size;
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int px_depth = txfm_param->bd;
+  assert(tx_size < TX_SIZES_ALL);
+  assert(tx_type < TX_TYPES);
+
+  if (txfm_param->lossless) {
+    // Transform function special-cased for lossless
+    assert(tx_type == DCT_DCT);
+    assert(tx_size == TX_4X4);
+    if (txfm_param->is_hbd)
+      // Note that the output pointer in the prototype is uint8, but the
+      // function converts to short internally
+      av1_highbd_iwht4x4_add(input_coeffs, output_pixels, output_stride,
+                             txfm_param->eob, px_depth);
+    else
+      av1_iwht4x4_add(input_coeffs, output_pixels, output_stride, txfm_param);
+  } else {
+    // General TX case
+    // Q3 coeff Q4 TX compatibility mode, with av1_get_tx_scale
+    const int downshift = 4;
+    assert(sizeof(tran_low_t) == sizeof(od_coeff));
+    assert(sizeof(tran_low_t) >= 4);
+
+    // Hook into existing map translation infrastructure to select
+    // appropriate TX functions
+    const int cols = tx_size_wide[tx_size];
+    const int rows = tx_size_high[tx_size];
+    const TX_SIZE col_idx = txsize_vert_map[tx_size];
+    const TX_SIZE row_idx = txsize_horz_map[tx_size];
+    assert(col_idx < TX_SIZES);
+    assert(row_idx < TX_SIZES);
+    assert(vtx_tab[tx_type] < (int)TX_TYPES_1D);
+    assert(htx_tab[tx_type] < (int)TX_TYPES_1D);
+    daala_itx col_tx = tx_map[col_idx][vtx_tab[tx_type]];
+    daala_itx row_tx = tx_map[row_idx][htx_tab[tx_type]];
+    int col_flip = tx_flip(vtx_tab[tx_type]);
+    int row_flip = tx_flip(htx_tab[tx_type]);
+    od_coeff tmpsq[MAX_TX_SQUARE];
+    int r;
+    int c;
+
+    assert(col_tx);
+    assert(row_tx);
+
+    // This is temporary while we're testing against existing
+    // behavior (preshift up one plus av1_get_tx_scale).  Scale by multiply:
+    // left-shifting a negative coefficient is undefined.  Remove before flight
+    od_coeff tmp[MAX_TX_SQUARE];
+    int upshift = 1 + av1_get_tx_scale(tx_size);
+    for (r = 0; r < rows; ++r)
+      for (c = 0; c < cols; ++c)
+        tmp[r * cols + c] = input_coeffs[r * cols + c] * (1 << upshift);
+    input_coeffs = tmp;
+
+    // Inverse-transform rows
+    for (r = 0; r < rows; ++r) {
+      // The output addressing transposes
+      if (row_flip)
+        row_tx(tmpsq + r + (rows * cols) - rows, -rows,
+               input_coeffs + r * cols);
+      else
+        row_tx(tmpsq + r, rows, input_coeffs + r * cols);
+    }
+
+    // Inverse-transform columns
+    for (c = 0; c < cols; ++c) {
+      // Above transposed, so our cols are now rows
+      if (col_flip)
+        col_tx(tmpsq + c * rows + rows - 1, -1, tmpsq + c * rows);
+      else
+        col_tx(tmpsq + c * rows, 1, tmpsq + c * rows);
+    }
+
+    // Sum with destination according to bit depth
+    // The tmpsq array is currently transposed relative to output
+    if (txfm_param->is_hbd) {
+      // Destination array is shorts
+      uint16_t *out16 = CONVERT_TO_SHORTPTR(output_pixels);
+      for (r = 0; r < rows; ++r)
+        for (c = 0; c < cols; ++c)
+          out16[r * output_stride + c] = highbd_clip_pixel_add(
+              out16[r * output_stride + c],
+              (tmpsq[c * rows + r] + (1 << downshift >> 1)) >> downshift,
+              px_depth);
+    } else {
+      // Destination array is bytes
+      uint8_t *out8 = (uint8_t *)output_pixels;
+      for (r = 0; r < rows; ++r)
+        for (c = 0; c < cols; ++c)
+          out8[r * output_stride + c] = clip_pixel_add(
+              out8[r * output_stride + c],
+              (tmpsq[c * rows + r] + (1 << downshift >> 1)) >> downshift);
+    }
+  }
+}
+
+#endif
diff --git a/av1/common/daala_inv_txfm.h b/av1/common/daala_inv_txfm.h
new file mode 100644
index 0000000..aa79a14
--- /dev/null
+++ b/av1/common/daala_inv_txfm.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_COMMON_DAALA_INV_TXFM_H_
+#define AV1_COMMON_DAALA_INV_TXFM_H_
+
+#include "./aom_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void daala_inv_txfm_add(const tran_low_t *input_coeffs, void *output_pixels,
+                        int output_stride, TxfmParam *txfm_param);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AV1_COMMON_DAALA_INV_TXFM_H_
diff --git a/av1/common/daala_tx.c b/av1/common/daala_tx.c
index ed2095b..e5c3c60 100644
--- a/av1/common/daala_tx.c
+++ b/av1/common/daala_tx.c
@@ -5218,6 +5218,38 @@
 }
 #endif
 
+void od_bin_iidtx4(od_coeff *x, int xstride, const od_coeff y[4]) {
+  // Inverse identity transform: copy y[] out to x[] with stride xstride.
+  int k;
+  for (k = 0; k < 4; ++k) x[k * xstride] = y[k];
+}
+
+void od_bin_iidtx8(od_coeff *x, int xstride, const od_coeff y[8]) {
+  // Inverse identity transform: copy y[] out to x[] with stride xstride.
+  int k;
+  for (k = 0; k < 8; ++k) x[k * xstride] = y[k];
+}
+
+void od_bin_iidtx16(od_coeff *x, int xstride, const od_coeff y[16]) {
+  // Inverse identity transform: copy y[] out to x[] with stride xstride.
+  int k;
+  for (k = 0; k < 16; ++k) x[k * xstride] = y[k];
+}
+
+void od_bin_iidtx32(od_coeff *x, int xstride, const od_coeff y[32]) {
+  // Inverse identity transform: copy y[] out to x[] with stride xstride.
+  int k;
+  for (k = 0; k < 32; ++k) x[k * xstride] = y[k];
+}
+
+#if CONFIG_TX64X64
+void od_bin_iidtx64(od_coeff *x, int xstride, const od_coeff y[64]) {
+  // Inverse identity transform: copy y[] out to x[] with stride xstride.
+  int k;
+  for (k = 0; k < 64; ++k) x[k * xstride] = y[k];
+}
+#endif
+
 // Below are intermediate wrappers that handle the case when
 // tran_low_t is a smaller type than od_coeff
 void daala_fdct4(const tran_low_t *input, tran_low_t *output) {
diff --git a/av1/common/daala_tx.h b/av1/common/daala_tx.h
index e482ed1..2943802 100644
--- a/av1/common/daala_tx.h
+++ b/av1/common/daala_tx.h
@@ -35,26 +35,31 @@
 void od_bin_fdct4(od_coeff y[4], const od_coeff *x, int xstride);
 void od_bin_idct4(od_coeff *x, int xstride, const od_coeff y[4]);
 void od_bin_fdst4(od_coeff y[4], const od_coeff *x, int xstride);
-void od_bin_fidtx4(od_coeff y[4], const od_coeff *x, int xstride);
 void od_bin_idst4(od_coeff *x, int xstride, const od_coeff y[4]);
+void od_bin_fidtx4(od_coeff y[4], const od_coeff *x, int xstride);
+void od_bin_iidtx4(od_coeff *x, int xstride, const od_coeff y[4]);
 void od_bin_fdct8(od_coeff y[8], const od_coeff *x, int xstride);
 void od_bin_idct8(od_coeff *x, int xstride, const od_coeff y[8]);
 void od_bin_fdst8(od_coeff y[8], const od_coeff *x, int xstride);
-void od_bin_fidtx8(od_coeff y[8], const od_coeff *x, int xstride);
 void od_bin_idst8(od_coeff *x, int xstride, const od_coeff y[8]);
+void od_bin_fidtx8(od_coeff y[8], const od_coeff *x, int xstride);
+void od_bin_iidtx8(od_coeff *x, int xstride, const od_coeff y[8]);
 void od_bin_fdct16(od_coeff y[16], const od_coeff *x, int xstride);
 void od_bin_idct16(od_coeff *x, int xstride, const od_coeff y[16]);
 void od_bin_fdst16(od_coeff y[16], const od_coeff *x, int xstride);
-void od_bin_fidtx16(od_coeff y[16], const od_coeff *x, int xstride);
 void od_bin_idst16(od_coeff *x, int xstride, const od_coeff y[16]);
+void od_bin_fidtx16(od_coeff y[16], const od_coeff *x, int xstride);
+void od_bin_iidtx16(od_coeff *x, int xstride, const od_coeff y[16]);
 void od_bin_fdct32(od_coeff y[32], const od_coeff *x, int xstride);
 void od_bin_idct32(od_coeff *x, int xstride, const od_coeff y[32]);
 void od_bin_fdst32(od_coeff y[32], const od_coeff *x, int xstride);
+void od_bin_idst32(od_coeff *x, int xstride, const od_coeff y[32]);
 void od_bin_fidtx32(od_coeff y[32], const od_coeff *x, int xstride);
+void od_bin_iidtx32(od_coeff *x, int xstride, const od_coeff y[32]);
 #if CONFIG_TX64X64
 void od_bin_fdct64(od_coeff y[64], const od_coeff *x, int xstride);
 void od_bin_idct64(od_coeff *x, int xstride, const od_coeff y[64]);
 void od_bin_fidtx64(od_coeff y[64], const od_coeff *x, int xstride);
-
+void od_bin_iidtx64(od_coeff *x, int xstride, const od_coeff y[64]);
 #endif
 #endif
diff --git a/av1/common/idct.c b/av1/common/idct.c
index f5e4b7a..7279f5e 100644
--- a/av1/common/idct.c
+++ b/av1/common/idct.c
@@ -22,6 +22,9 @@
 #if CONFIG_DAALA_TX4 || CONFIG_DAALA_TX8 || CONFIG_DAALA_TX16 || \
     CONFIG_DAALA_TX32 || CONFIG_DAALA_TX64
 #include "av1/common/daala_tx.h"
+#if CONFIG_DAALA_TX
+#include "av1/common/daala_inv_txfm.h"
+#endif
 #endif
 
 int av1_get_tx_scale(const TX_SIZE tx_size) {
@@ -29,6 +32,8 @@
   return (pels > 256) + (pels > 1024) + (pels > 4096);
 }
 
+#if !CONFIG_DAALA_TX
+
 // NOTE: The implementation of all inverses need to be aware of the fact
 // that input and output could be the same buffer.
 
@@ -1673,6 +1678,7 @@
   else
     aom_idct4x4_1_add(input, dest, stride);
 }
+#endif  // !CONFIG_DAALA_TX
 
 void av1_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
                      const TxfmParam *txfm_param) {
@@ -1684,6 +1690,7 @@
     aom_iwht4x4_1_add(input, dest, stride);
 }
 
+#if !CONFIG_DAALA_TX
 #if !CONFIG_DAALA_TX8
 static void idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
                         const TxfmParam *txfm_param) {
@@ -2080,6 +2087,7 @@
 #endif
 }
 #endif  // CONFIG_TX64X64
+#endif  // !CONFIG_DAALA_TX
 
 void av1_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
                             int eob, int bd) {
@@ -2089,6 +2097,7 @@
     aom_highbd_iwht4x4_1_add(input, dest, stride, bd);
 }
 
+#if !CONFIG_DAALA_TX
 static const int32_t *cast_to_int32(const tran_low_t *input) {
   assert(sizeof(int32_t) == sizeof(tran_low_t));
   return (const int32_t *)input;
@@ -2377,10 +2386,15 @@
   }
 }
 #endif  // CONFIG_TX64X64
+#endif  // !CONFIG_DAALA_TX
 
 void av1_inv_txfm_add(const tran_low_t *input, uint8_t *dest, int stride,
                       TxfmParam *txfm_param) {
   assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+#if CONFIG_DAALA_TX
+  assert(!txfm_param->is_hbd);
+  daala_inv_txfm_add(input, dest, stride, txfm_param);
+#else
   const TX_SIZE tx_size = txfm_param->tx_size;
   switch (tx_size) {
 #if CONFIG_TX64X64
@@ -2413,6 +2427,7 @@
 #endif
     default: assert(0 && "Invalid transform size"); break;
   }
+#endif
 }
 
 #if CONFIG_TXMG
@@ -2524,8 +2539,11 @@
 
 void av1_highbd_inv_txfm_add(const tran_low_t *input, uint8_t *dest, int stride,
                              TxfmParam *txfm_param) {
-  const TX_SIZE tx_size = txfm_param->tx_size;
   assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+#if CONFIG_DAALA_TX
+  daala_inv_txfm_add(input, dest, stride, txfm_param);
+#else
+  const TX_SIZE tx_size = txfm_param->tx_size;
   switch (tx_size) {
 #if CONFIG_TX64X64
     case TX_64X64:
@@ -2589,4 +2607,5 @@
 #endif
     default: assert(0 && "Invalid transform size"); break;
   }
+#endif
 }
diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c
index a95fa6f..de7010c 100644
--- a/av1/encoder/encodemb.c
+++ b/av1/encoder/encodemb.c
@@ -29,6 +29,9 @@
 #include "av1/encoder/encodetxb.h"
 #endif
 #include "av1/encoder/hybrid_fwd_txfm.h"
+#if CONFIG_DAALA_TX
+#include "av1/common/daala_inv_txfm.h"
+#endif
 #include "av1/encoder/rd.h"
 #include "av1/encoder/tokenize.h"
 
@@ -724,6 +727,9 @@
     txfm_param.tx_set_type = get_ext_tx_set_type(
         txfm_param.tx_size, plane_bsize, is_inter_block(&xd->mi[0]->mbmi),
         cm->reduced_tx_set_used);
+#if CONFIG_DAALA_TX
+    daala_inv_txfm_add(dqcoeff, dst, pd->dst.stride, &txfm_param);
+#else
 #if CONFIG_HIGHBITDEPTH
     if (txfm_param.is_hbd) {
       av1_highbd_inv_txfm_add_4x4(dqcoeff, dst, pd->dst.stride, &txfm_param);
@@ -735,6 +741,7 @@
     } else {
       av1_idct4x4_add(dqcoeff, dst, pd->dst.stride, &txfm_param);
     }
+#endif
   }
 }
 
diff --git a/test/av1_fht16x16_test.cc b/test/av1_fht16x16_test.cc
index fefdab9..d428332 100644
--- a/test/av1_fht16x16_test.cc
+++ b/test/av1_fht16x16_test.cc
@@ -23,6 +23,7 @@
 
 using libaom_test::ACMRandom;
 
+#if !CONFIG_DAALA_TX
 namespace {
 typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                         const TxfmParam *txfm_param);
@@ -268,3 +269,4 @@
 #endif  // HAVE_SSE4_1 && CONFIG_HIGHBITDEPTH && !CONFIG_DAALA_TX16
 
 }  // namespace
+#endif  // !CONFIG_DAALA_TX
diff --git a/test/av1_fht16x32_test.cc b/test/av1_fht16x32_test.cc
index ae37e2d..e6b960a 100644
--- a/test/av1_fht16x32_test.cc
+++ b/test/av1_fht16x32_test.cc
@@ -23,6 +23,8 @@
 
 using libaom_test::ACMRandom;
 
+#if !CONFIG_DAALA_TX
+
 namespace {
 typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                         const TxfmParam *txfm_param);
@@ -151,3 +153,5 @@
 #endif  // HAVE_SSE2
 
 }  // namespace
+
+#endif  // !CONFIG_DAALA_TX
diff --git a/test/av1_fht16x8_test.cc b/test/av1_fht16x8_test.cc
index 8c49993..91cb69c 100644
--- a/test/av1_fht16x8_test.cc
+++ b/test/av1_fht16x8_test.cc
@@ -23,6 +23,7 @@
 
 using libaom_test::ACMRandom;
 
+#if !CONFIG_DAALA_TX
 namespace {
 typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                         const TxfmParam *txfm_param);
@@ -149,3 +150,5 @@
 #endif  // HAVE_SSE2
 
 }  // namespace
+
+#endif  // !CONFIG_DAALA_TX
diff --git a/test/av1_fht32x16_test.cc b/test/av1_fht32x16_test.cc
index 97f564d..603f711 100644
--- a/test/av1_fht32x16_test.cc
+++ b/test/av1_fht32x16_test.cc
@@ -23,6 +23,8 @@
 
 using libaom_test::ACMRandom;
 
+#if !CONFIG_DAALA_TX
+
 namespace {
 typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                         const TxfmParam *txfm_param);
@@ -151,3 +153,4 @@
 #endif  // HAVE_SSE2
 
 }  // namespace
+#endif  // !CONFIG_DAALA_TX
diff --git a/test/av1_fht4x4_test.cc b/test/av1_fht4x4_test.cc
index df7b03c..f11bfe5 100644
--- a/test/av1_fht4x4_test.cc
+++ b/test/av1_fht4x4_test.cc
@@ -23,6 +23,7 @@
 
 using libaom_test::ACMRandom;
 
+#if !CONFIG_DAALA_TX
 namespace {
 typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                         const TxfmParam *txfm_param);
@@ -229,3 +230,4 @@
 #endif  // HAVE_SSE4_1 && CONFIG_HIGHBITDEPTH && !CONFIG_DAALA_TX4
 
 }  // namespace
+#endif  // !CONFIG_DAALA_TX
diff --git a/test/av1_fht4x8_test.cc b/test/av1_fht4x8_test.cc
index 2b13fcd..6adada5 100644
--- a/test/av1_fht4x8_test.cc
+++ b/test/av1_fht4x8_test.cc
@@ -23,6 +23,7 @@
 
 using libaom_test::ACMRandom;
 
+#if !CONFIG_DAALA_TX
 namespace {
 typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                         const TxfmParam *txfm_param);
@@ -139,3 +140,4 @@
 #endif  // HAVE_SSE2
 
 }  // namespace
+#endif  // !CONFIG_DAALA_TX
diff --git a/test/av1_fht8x16_test.cc b/test/av1_fht8x16_test.cc
index 9490a32..c7e6160 100644
--- a/test/av1_fht8x16_test.cc
+++ b/test/av1_fht8x16_test.cc
@@ -22,6 +22,7 @@
 
 using libaom_test::ACMRandom;
 
+#if !CONFIG_DAALA_TX
 namespace {
 typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                         const TxfmParam *txfm_param);
@@ -148,3 +149,4 @@
 #endif  // HAVE_SSE2
 
 }  // namespace
+#endif  // !CONFIG_DAALA_TX
diff --git a/test/av1_fht8x4_test.cc b/test/av1_fht8x4_test.cc
index b891031..b221e61 100644
--- a/test/av1_fht8x4_test.cc
+++ b/test/av1_fht8x4_test.cc
@@ -22,6 +22,7 @@
 
 using libaom_test::ACMRandom;
 
+#if !CONFIG_DAALA_TX
 namespace {
 typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                         const TxfmParam *txfm_param);
@@ -138,3 +139,4 @@
 #endif  // HAVE_SSE2
 
 }  // namespace
+#endif  // !CONFIG_DAALA_TX
diff --git a/test/av1_fht8x8_test.cc b/test/av1_fht8x8_test.cc
index 1bffe4e..ec8baac 100644
--- a/test/av1_fht8x8_test.cc
+++ b/test/av1_fht8x8_test.cc
@@ -23,6 +23,7 @@
 
 using libaom_test::ACMRandom;
 
+#if !CONFIG_DAALA_TX
 namespace {
 typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                         const TxfmParam *txfm_param);
@@ -227,3 +228,4 @@
 #endif  // HAVE_SSE4_1 && CONFIG_HIGHBITDEPTH && !CONFIG_DAALA_TX8
 
 }  // namespace
+#endif  // !CONFIG_DAALA_TX
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index 5adc296..2962ffb 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -30,6 +30,7 @@
 
 using libaom_test::ACMRandom;
 
+#if !CONFIG_DAALA_TX
 namespace {
 
 const int kNumCoeffs = 256;
@@ -873,3 +874,4 @@
                                                      DCT_DCT, AOM_BITS_8)));
 #endif  // HAVE_MSA && !CONFIG_HIGHBITDEPTH
 }  // namespace
+#endif  // !CONFIG_DAALA_TX
diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index 5ce263c..bcb2fbe 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -29,6 +29,7 @@
 
 using libaom_test::ACMRandom;
 
+#if !CONFIG_DAALA_TX
 namespace {
 typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride);
 typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride);
@@ -336,3 +337,4 @@
                                                      DCT_DCT, AOM_BITS_8, 16)));
 #endif  // HAVE_MSA && !CONFIG_HIGHBITDEPTH
 }  // namespace
+#endif  // !CONFIG_DAALA_TX
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index 2b8f524..2ad1afc 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -29,6 +29,7 @@
 
 using libaom_test::ACMRandom;
 
+#if !CONFIG_DAALA_TX
 namespace {
 
 const int kNumCoeffs = 64;
@@ -724,3 +725,4 @@
                                                      DCT_DCT, AOM_BITS_8)));
 #endif  // HAVE_MSA && !CONFIG_HIGHBITDEPTH
 }  // namespace
+#endif  // !CONFIG_DAALA_TX