Remove DAALA_TX experiment

This experiment has been abandoned for AV1.

Change-Id: Ief8ed6a51a5e7bac17838ebb7a88d88bbf90a96f
diff --git a/aom_dsp/inv_txfm.h b/aom_dsp/inv_txfm.h
index dcd3fa1..14cc989 100644
--- a/aom_dsp/inv_txfm.h
+++ b/aom_dsp/inv_txfm.h
@@ -28,13 +28,6 @@
 }
 
 static INLINE tran_high_t check_range(tran_high_t input, int bd) {
-#if CONFIG_DAALA_TX
-  // Daala TX coeffs cover a different range from AV1 TX
-  // all depths: 19 bit integer
-  const int32_t int_max = (1 << (TX_COEFF_DEPTH + 6)) - 1;
-  const int32_t int_min = -int_max - 1;
-  (void)bd;
-#else
   // AV1 TX case
   // - 8 bit: signed 16 bit integer
   // - 10 bit: signed 18 bit integer
@@ -42,7 +35,6 @@
   // - max quantization error = 1828 << (bd - 8)
   const int32_t int_max = (1 << (7 + bd)) - 1 + (914 << (bd - 7));
   const int32_t int_min = -int_max - 1;
-#endif
 #if CONFIG_COEFFICIENT_RANGE_CHECKING
   assert(int_min <= input);
   assert(input <= int_max);
@@ -57,9 +49,6 @@
 void aom_idct8_c(const tran_low_t *input, tran_low_t *output);
 void aom_idct16_c(const tran_low_t *input, tran_low_t *output);
 void aom_idct32_c(const tran_low_t *input, tran_low_t *output);
-#if CONFIG_TX64X64 && CONFIG_DAALA_TX64
-void aom_idct64_c(const tran_low_t *input, tran_low_t *output);
-#endif
 void aom_iadst4_c(const tran_low_t *input, tran_low_t *output);
 void aom_iadst8_c(const tran_low_t *input, tran_low_t *output);
 void aom_iadst16_c(const tran_low_t *input, tran_low_t *output);
diff --git a/av1/av1.cmake b/av1/av1.cmake
index c26aa3e..74122c9 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -70,16 +70,6 @@
     "${AOM_ROOT}/av1/common/tile_common.c"
     "${AOM_ROOT}/av1/common/tile_common.h")
 
-if (CONFIG_DAALA_TX)
-  set(AOM_AV1_COMMON_SOURCES
-      ${AOM_AV1_COMMON_SOURCES}
-      "${AOM_ROOT}/av1/common/daala_tx.c"
-      "${AOM_ROOT}/av1/common/daala_tx.h"
-      "${AOM_ROOT}/av1/common/daala_tx_kernels.h"
-      "${AOM_ROOT}/av1/common/daala_inv_txfm.c"
-      "${AOM_ROOT}/av1/common/daala_inv_txfm.h")
-endif ()
-
 set(AOM_AV1_DECODER_SOURCES
     "${AOM_ROOT}/av1/av1_dx_iface.c"
     "${AOM_ROOT}/av1/decoder/decodeframe.c"
@@ -165,13 +155,6 @@
     "${AOM_ROOT}/av1/encoder/tokenize.c"
     "${AOM_ROOT}/av1/encoder/tokenize.h")
 
-if (CONFIG_DAALA_TX)
-  set(AOM_AV1_ENCODER_SOURCES
-      ${AOM_AV1_ENCODER_SOURCES}
-      "${AOM_ROOT}/av1/encoder/daala_fwd_txfm.c"
-      "${AOM_ROOT}/av1/encoder/daala_fwd_txfm.h")
-endif ()
-
 set(AOM_AV1_COMMON_INTRIN_SSE2
     "${AOM_ROOT}/av1/common/x86/idct_intrin_sse2.c")
 
@@ -186,12 +169,6 @@
 set(AOM_AV1_COMMON_INTRIN_AVX2
     "${AOM_ROOT}/av1/common/x86/highbd_inv_txfm_avx2.c"
     "${AOM_ROOT}/av1/common/x86/hybrid_inv_txfm_avx2.c")
-if (CONFIG_DAALA_TX)
-  set(AOM_AV1_COMMON_INTRIN_AVX2
-      ${AOM_AV1_COMMON_INTRIN_AVX2}
-      "${AOM_ROOT}/av1/common/x86/daala_tx_kernels.h"
-      "${AOM_ROOT}/av1/common/x86/daala_inv_txfm_avx2.c")
-endif ()
 
 set(AOM_AV1_COMMON_INTRIN_MSA
     "${AOM_ROOT}/av1/common/mips/msa/av1_idct16x16_msa.c"
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index ee41933..9f0dbc4 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -28,10 +28,6 @@
 #include "av1/common/restoration.h"
 #endif
 
-#if CONFIG_DAALA_TX
-#include "av1/common/daala_inv_txfm.h"
-#endif
-
 #if CONFIG_CFL
 #include "av1/common/cfl.h"
 #endif
@@ -95,11 +91,11 @@
 # Inverse dct
 #
 add_proto qw/void av1_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
-if (aom_config("CONFIG_DAALA_TX4") ne "yes") {
+{
     specialize qw/av1_iht4x4_16_add sse2/;
 }
 
-if (aom_config("CONFIG_DAALA_TX") ne "yes") {
+{
   add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
   specialize qw/av1_iht4x8_32_add sse2/;
 
@@ -127,13 +123,13 @@
   add_proto qw/void av1_iht32x8_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
 
   add_proto qw/void av1_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
-  if (aom_config("CONFIG_DAALA_TX8") ne "yes") {
+  {
       specialize qw/av1_iht8x8_64_add sse2/;
   }
 
   add_proto qw/void av1_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, const struct txfm_param *param";
 
-  if (aom_config("CONFIG_DAALA_TX16") ne "yes") {
+  {
       specialize qw/av1_iht16x16_256_add sse2 avx2/;
   }
 
@@ -247,19 +243,19 @@
 add_proto qw/void av1_inv_txfm2d_add_16x32/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
 add_proto qw/void av1_inv_txfm2d_add_32x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
 add_proto qw/void av1_inv_txfm2d_add_4x4/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
-if (aom_config("CONFIG_DAALA_TX4") ne "yes") {
+{
   specialize qw/av1_inv_txfm2d_add_4x4 sse4_1/;
 }
 add_proto qw/void av1_inv_txfm2d_add_8x8/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
-if (aom_config("CONFIG_DAALA_TX8") ne "yes") {
+{
   specialize qw/av1_inv_txfm2d_add_8x8 sse4_1/;
 }
 add_proto qw/void av1_inv_txfm2d_add_16x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
-if (aom_config("CONFIG_DAALA_TX16") ne "yes") {
+{
   specialize qw/av1_inv_txfm2d_add_16x16 sse4_1/;
 }
 add_proto qw/void av1_inv_txfm2d_add_32x32/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
-if (aom_config("CONFIG_DAALA_TX32") ne "yes") {
+{
   specialize qw/av1_inv_txfm2d_add_32x32 avx2/;
 }
 if (aom_config("CONFIG_TX64X64") eq "yes") {
@@ -300,24 +296,24 @@
   # fdct functions
 
   add_proto qw/void av1_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param";
-  if (aom_config("CONFIG_DAALA_TX4") ne "yes") {
+  {
     specialize qw/av1_fht4x4 sse2/;
   }
 
   add_proto qw/void av1_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
 
   add_proto qw/void av1_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param";
-  if (aom_config("CONFIG_DAALA_TX8") ne "yes") {
+  {
     specialize qw/av1_fht8x8 sse2/;
   }
 
   add_proto qw/void av1_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param";
-  if (aom_config("CONFIG_DAALA_TX16") ne "yes") {
+  {
     specialize qw/av1_fht16x16 sse2 avx2/;
   }
 
   add_proto qw/void av1_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param";
-  if (aom_config("CONFIG_DAALA_TX32") ne "yes") {
+  {
     specialize qw/av1_fht32x32 sse2 avx2/;
   }
 
@@ -370,19 +366,19 @@
   add_proto qw/void av1_fwd_txfm2d_8x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
   add_proto qw/void av1_fwd_txfm2d_32x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
   add_proto qw/void av1_fwd_txfm2d_4x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  if (aom_config("CONFIG_DAALA_TX4") ne "yes") {
+  {
     specialize qw/av1_fwd_txfm2d_4x4 sse4_1/;
   }
   add_proto qw/void av1_fwd_txfm2d_8x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  if (aom_config("CONFIG_DAALA_TX8") ne "yes") {
+  {
     specialize qw/av1_fwd_txfm2d_8x8 sse4_1/;
   }
   add_proto qw/void av1_fwd_txfm2d_16x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  if (aom_config("CONFIG_DAALA_TX16") ne "yes") {
+  {
     specialize qw/av1_fwd_txfm2d_16x16 sse4_1/;
   }
   add_proto qw/void av1_fwd_txfm2d_32x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  if (aom_config("CONFIG_DAALA_TX32") ne "yes") {
+  {
     specialize qw/av1_fwd_txfm2d_32x32 sse4_1/;
   }
 
@@ -433,7 +429,7 @@
     }
 
     add_proto qw/int64_t av1_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
-    if (aom_config("CONFIG_DAALA_TX") ne "yes") {
+    {
       specialize qw/av1_highbd_block_error sse2/;
     }
 
@@ -579,12 +575,6 @@
 
 }
 
-# DAALA_TX functions
-if (aom_config("CONFIG_DAALA_TX") eq "yes") {
-  add_proto qw/void daala_inv_txfm_add/, "const tran_low_t *input_coeffs, void *output_pixels, int output_stride, TxfmParam *txfm_param";
-  specialize qw/daala_inv_txfm_add avx2/;
-}
-
 # CFL
 if (aom_config("CONFIG_CFL") eq "yes") {
   add_proto qw/void av1_cfl_subtract/, "int16_t *pred_buf_q3, int width, int height, int16_t avg_q3";
diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index 2a82927..0cd4ec4 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -765,16 +765,6 @@
 #endif  // USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4
   if (use_reduced_set)
     return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DTT4_IDTX;
-#if CONFIG_DAALA_TX_DST32
-  if (tx_size_sqr_up > TX_32X32)
-    return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DCTONLY;
-  if (is_inter)
-    return (tx_size_sqr >= TX_16X16 ? EXT_TX_SET_DTT9_IDTX_1DDCT
-                                    : EXT_TX_SET_ALL16);
-  else
-    return (tx_size_sqr >= TX_16X16 ? EXT_TX_SET_DTT4_IDTX
-                                    : EXT_TX_SET_DTT4_IDTX_1DDCT);
-#endif
   if (tx_size_sqr_up == TX_32X32)
     return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DCTONLY;
   if (is_inter)
@@ -970,12 +960,8 @@
   if (is_inter_block(mbmi) && !av1_ext_tx_used[tx_set_type][mbmi->tx_type])
     return DCT_DCT;
 
-#if CONFIG_DAALA_TX_DST32
-  if (xd->lossless[mbmi->segment_id] || txsize_sqr_map[tx_size] > TX_32X32)
-#else
   if (xd->lossless[mbmi->segment_id] || txsize_sqr_map[tx_size] > TX_32X32 ||
       (txsize_sqr_map[tx_size] >= TX_32X32 && !is_inter_block(mbmi)))
-#endif
     return DCT_DCT;
   if (plane_type == PLANE_TYPE_Y) {
     return mbmi->tx_type;
@@ -1345,7 +1331,7 @@
   if (tx_size == TX_16X64 || tx_size == TX_64X16) {
     return 512;
   }
-#endif  // CONFIG_TX64X64 && !CONFIG_DAALA_TX
+#endif  // CONFIG_TX64X64
   return tx_size_2d[tx_size];
 }
 
diff --git a/av1/common/daala_inv_txfm.c b/av1/common/daala_inv_txfm.c
deleted file mode 100644
index 04092e0..0000000
--- a/av1/common/daala_inv_txfm.c
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "./av1_rtcd.h"
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
-#include "av1/common/daala_tx.h"
-#include "av1/common/daala_inv_txfm.h"
-#include "av1/common/idct.h"
-
-#if CONFIG_DAALA_TX
-
-// Complete Daala TX map, sans lossless which is special cased
-typedef void (*daala_itx)(od_coeff *, int, const od_coeff[]);
-
-static daala_itx tx_map[TX_SIZES][TX_TYPES] = {
-  //  4-point transforms
-  { od_bin_idct4, od_bin_idst4, od_bin_idst4, od_bin_iidtx4 },
-
-  //  8-point transforms
-  { od_bin_idct8, od_bin_idst8, od_bin_idst8, od_bin_iidtx8 },
-
-  //  16-point transforms
-  { od_bin_idct16, od_bin_idst16, od_bin_idst16, od_bin_iidtx16 },
-
-  //  32-point transforms
-  { od_bin_idct32, od_bin_idst32, od_bin_idst32, od_bin_iidtx32 },
-
-#if CONFIG_TX64X64
-  //  64-point transforms
-  { od_bin_idct64, NULL, NULL, od_bin_iidtx64 },
-#endif
-};
-
-static int tx_flip(TX_TYPE_1D t) { return t == FLIPADST_1D; }
-
-// Daala TX toplevel inverse entry point.  This same function is
-// intended for both low and high bitdepth cases with a tran_low_t of
-// 32 bits (matching od_coeff), and a passed-in pixel buffer of either
-// bytes (hbd=0) or shorts (hbd=1).
-void daala_inv_txfm_add_c(const tran_low_t *input_coeffs, void *output_pixels,
-                          int output_stride, TxfmParam *txfm_param) {
-  const TX_SIZE tx_size = txfm_param->tx_size;
-  const TX_TYPE tx_type = txfm_param->tx_type;
-  const int px_depth = txfm_param->bd;
-  assert(tx_size <= TX_SIZES_ALL);
-  assert(tx_type <= TX_TYPES);
-
-  if (txfm_param->lossless) {
-    // Transform function special-cased for lossless
-    assert(tx_type == DCT_DCT);
-    assert(tx_size == TX_4X4);
-    if (txfm_param->is_hbd)
-      // Note that the output pointer in the prototype is uint8, but the
-      // function converts to short internally
-      av1_highbd_iwht4x4_add(input_coeffs, output_pixels, output_stride,
-                             txfm_param->eob, px_depth);
-    else
-      av1_iwht4x4_add(input_coeffs, output_pixels, output_stride, txfm_param);
-  } else {
-    // General TX case
-    const int downshift = TX_COEFF_DEPTH - px_depth;
-    assert(downshift >= 0);
-    assert(sizeof(tran_low_t) == sizeof(od_coeff));
-    assert(sizeof(tran_low_t) >= 4);
-
-    // Hook into existing map translation infrastructure to select
-    // appropriate TX functions
-    const int cols = tx_size_wide[tx_size];
-    const int rows = tx_size_high[tx_size];
-    const TX_SIZE col_idx = txsize_vert_map[tx_size];
-    const TX_SIZE row_idx = txsize_horz_map[tx_size];
-    assert(col_idx <= TX_SIZES);
-    assert(row_idx <= TX_SIZES);
-    assert(vtx_tab[tx_type] <= (int)TX_TYPES_1D);
-    assert(htx_tab[tx_type] <= (int)TX_TYPES_1D);
-    daala_itx col_tx = tx_map[col_idx][vtx_tab[tx_type]];
-    daala_itx row_tx = tx_map[row_idx][htx_tab[tx_type]];
-    int col_flip = tx_flip(vtx_tab[tx_type]);
-    int row_flip = tx_flip(htx_tab[tx_type]);
-    od_coeff tmpsq[MAX_TX_SQUARE];
-#if CONFIG_TX64X64
-    tran_low_t pad_input[MAX_TX_SQUARE];
-#endif
-    int r;
-    int c;
-
-    assert(col_tx);
-    assert(row_tx);
-
-#if CONFIG_TX64X64
-    if (rows > 32 || cols > 32) {
-      int avail_rows;
-      int avail_cols;
-      // TODO(urvang): Can the same array be reused, instead of using a new
-      // array?
-      // Remap 32x32 input into a modified input by:
-      // - Copying over these values in top-left 32x32 locations.
-      // - Setting the rest of the locations to 0.
-      avail_rows = AOMMIN(rows, 32);
-      avail_cols = AOMMIN(cols, 32);
-      for (r = 0; r < avail_rows; r++) {
-        memcpy(pad_input + r * cols, input_coeffs + r * avail_cols,
-               avail_cols * sizeof(*pad_input));
-        if (cols > avail_cols) {
-          memset(pad_input + r * cols + avail_cols, 0,
-                 (cols - avail_cols) * sizeof(*pad_input));
-        }
-      }
-      if (rows > avail_rows) {
-        memset(pad_input + avail_rows * cols, 0,
-               (rows - avail_rows) * cols * sizeof(*pad_input));
-      }
-      input_coeffs = pad_input;
-    }
-#endif
-
-    // Inverse-transform rows
-    for (r = 0; r < rows; ++r) {
-      // The output addressing transposes
-      if (row_flip)
-        row_tx(tmpsq + r + (rows * cols) - rows, -rows,
-               input_coeffs + r * cols);
-      else
-        row_tx(tmpsq + r, rows, input_coeffs + r * cols);
-    }
-
-    // Inverse-transform columns
-    for (c = 0; c < cols; ++c) {
-      // Above transposed, so our cols are now rows
-      if (col_flip)
-        col_tx(tmpsq + c * rows + rows - 1, -1, tmpsq + c * rows);
-      else
-        col_tx(tmpsq + c * rows, 1, tmpsq + c * rows);
-    }
-
-    // Sum with destination according to bit depth
-    // The tmpsq array is currently transposed relative to output
-    if (txfm_param->is_hbd) {
-      // Destination array is shorts
-      uint16_t *out16 = CONVERT_TO_SHORTPTR(output_pixels);
-      for (r = 0; r < rows; ++r)
-        for (c = 0; c < cols; ++c)
-          out16[r * output_stride + c] = highbd_clip_pixel_add(
-              out16[r * output_stride + c],
-              (tmpsq[c * rows + r] + (1 << downshift >> 1)) >> downshift,
-              px_depth);
-    } else {
-      // Destination array is bytes
-      uint8_t *out8 = (uint8_t *)output_pixels;
-      for (r = 0; r < rows; ++r)
-        for (c = 0; c < cols; ++c)
-          out8[r * output_stride + c] = clip_pixel_add(
-              out8[r * output_stride + c],
-              (tmpsq[c * rows + r] + (1 << downshift >> 1)) >> downshift);
-    }
-  }
-}
-
-#endif
diff --git a/av1/common/daala_inv_txfm.h b/av1/common/daala_inv_txfm.h
deleted file mode 100644
index 3e0df30..0000000
--- a/av1/common/daala_inv_txfm.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AV1_ENCODER_DAALA_INV_TXFM_H_
-#define AV1_ENCODER_DAALA_INV_TXFM_H_
-
-#include "./aom_config.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void daala_inv_txfm_add_c(const tran_low_t *input_coeffs, void *output_pixels,
-                          int output_stride, TxfmParam *txfm_param);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // AV1_ENCODER_DAALA_INV_TXFM_H_
diff --git a/av1/common/daala_tx.c b/av1/common/daala_tx.c
deleted file mode 100644
index 854011b..0000000
--- a/av1/common/daala_tx.c
+++ /dev/null
@@ -1,5527 +0,0 @@
-#include "av1/common/daala_tx.h"
-#include "av1/common/odintrin.h"
-#include "av1/common/daala_tx_kernels.h"
-
-/* clang-format off */
-
-#define OD_RSHIFT1(_a) (((_a) + ((_a) < 0)) >> 1)
-#define OD_PAVG(_a, _b) (((_a) + (_b) + 1) >> 1)
-
-/* TODO: Daala DCT overflow checks need to be ported as a later test */
-# if defined(OD_DCT_CHECK_OVERFLOW)
-# else
-#  define OD_DCT_OVERFLOW_CHECK(val, scale, offset, idx)
-# endif
-
-#define OD_FDCT_2_PR(p0, p1) \
-  /* Embedded 2-point orthonormal Type-II fDCT. */ \
-  do { \
-    /* 13573/32768 ~= Tan[pi/8] ~= 0.414213562373095 */ \
-    OD_DCT_OVERFLOW_CHECK(p1, 13573, 16384, 100); \
-    p0 -= (p1*13573 + 16384) >> 15; \
-    /* 5793/8192 ~= Sin[pi/4] ~= 0.707106781186547 */ \
-    OD_DCT_OVERFLOW_CHECK(p0, 5793, 4096, 101); \
-    p1 += (p0*5793 + 4096) >> 13; \
-    /* 3393/8192 ~= Tan[pi/8] ~= 0.414213562373095 */ \
-    OD_DCT_OVERFLOW_CHECK(p1, 3393, 4096, 102); \
-    p0 -= (p1*3393 + 4096) >> 13; \
-  } \
-  while (0)
-
-#define OD_IDCT_2_PR(p0, p1) \
-  /* Embedded 2-point orthonormal Type-II iDCT. */ \
-  do { \
-    /* 3393/8192 ~= Tan[pi/8] ~= 0.414213562373095 */ \
-    p0 += (p1*3393 + 4096) >> 13; \
-    /* 5793/8192 ~= Sin[pi/4] ~= 0.707106781186547 */ \
-    p1 -= (p0*5793 + 4096) >> 13; \
-    /* 13573/32768 ~= Tan[pi/8] ~= 0.414213562373095 */ \
-    p0 += (p1*13573 + 16384) >> 15; \
-  } \
-  while (0)
-
-#define OD_FDCT_2_ASYM_PR(p0, p1, p1h) \
-  /* Embedded 2-point asymmetric Type-II fDCT. */ \
-  do { \
-    p0 += p1h; \
-    p1 = p0 - p1; \
-  } \
-  while (0)
-
-#define OD_IDCT_2_ASYM_PR(p0, p1, p1h) \
-  /* Embedded 2-point asymmetric Type-II iDCT. */ \
-  do { \
-    p1 = p0 - p1; \
-    p1h = OD_RSHIFT1(p1); \
-    p0 -= p1h; \
-  } \
-  while (0)
-
-#define OD_FDST_2_PR(p0, p1) \
-  /* Embedded 2-point orthonormal Type-IV fDST. */ \
-  do { \
-    /* 10947/16384 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
-    OD_DCT_OVERFLOW_CHECK(p1, 10947, 8192, 103); \
-    p0 -= (p1*10947 + 8192) >> 14; \
-    /* 473/512 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
-    OD_DCT_OVERFLOW_CHECK(p0, 473, 256, 104); \
-    p1 += (p0*473 + 256) >> 9; \
-    /* 10947/16384 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
-    OD_DCT_OVERFLOW_CHECK(p1, 10947, 8192, 105); \
-    p0 -= (p1*10947 + 8192) >> 14; \
-  } \
-  while (0)
-
-#define OD_IDST_2_PR(p0, p1) \
-  /* Embedded 2-point orthonormal Type-IV iDST. */ \
-  do { \
-    /* 10947/16384 ~= Tan[3*Pi/16]) ~= 0.668178637919299 */ \
-    p0 += (p1*10947 + 8192) >> 14; \
-    /* 473/512 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
-    p1 -= (p0*473 + 256) >> 9; \
-    /* 10947/16384 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
-    p0 += (p1*10947 + 8192) >> 14; \
-  } \
-  while (0)
-
-#define OD_FDST_2_ASYM_PR(p0, p1) \
-  /* Embedded 2-point asymmetric Type-IV fDST. */ \
-  do { \
-    /* 11507/16384 ~= 4*Sin[Pi/8] - 2*Tan[Pi/8] ~= 0.702306604714169 */ \
-    OD_DCT_OVERFLOW_CHECK(p1, 11507, 8192, 187); \
-    p0 -= (p1*11507 + 8192) >> 14; \
-    /* 669/1024 ~= Cos[Pi/8]/Sqrt[2] ~= 0.653281482438188 */ \
-    OD_DCT_OVERFLOW_CHECK(p0, 669, 512, 188); \
-    p1 += (p0*669 + 512) >> 10; \
-    /* 4573/4096 ~= 4*Sin[Pi/8] - Tan[Pi/8] ~= 1.11652016708726 */ \
-    OD_DCT_OVERFLOW_CHECK(p1, 4573, 2048, 189); \
-    p0 -= (p1*4573 + 2048) >> 12; \
-  } \
-  while (0)
-
-#define OD_IDST_2_ASYM_PR(p0, p1) \
-  /* Embedded 2-point asymmetric Type-IV iDST. */ \
-  do { \
-    /* 4573/4096 ~= 4*Sin[Pi/8] - Tan[Pi/8] ~= 1.11652016708726 */ \
-    p0 += (p1*4573 + 2048) >> 12; \
-    /* 669/1024 ~= Cos[Pi/8]/Sqrt[2] ~= 0.653281482438188 */ \
-    p1 -= (p0*669 + 512) >> 10; \
-    /* 11507/16384 ~= 4*Sin[Pi/8] - 2*Tan[Pi/8] ~= 0.702306604714169 */ \
-    p0 += (p1*11507 + 8192) >> 14; \
-  } \
-  while (0)
-
-#define OD_FDCT_4_PR(q0, q2, q1, q3) \
-  /* Embedded 4-point orthonormal Type-II fDCT. */ \
-  do { \
-    int q2h; \
-    int q3h; \
-    q3 = q0 - q3; \
-    q3h = OD_RSHIFT1(q3); \
-    q0 -= q3h; \
-    q2 += q1; \
-    q2h = OD_RSHIFT1(q2); \
-    q1 = q2h - q1; \
-    OD_FDCT_2_ASYM_PR(q0, q2, q2h); \
-    OD_FDST_2_ASYM_PR(q3, q1); \
-  } \
-  while (0)
-
-#define OD_IDCT_4_PR(q0, q2, q1, q3) \
-  /* Embedded 4-point orthonormal Type-II iDCT. */ \
-  do { \
-    int q1h; \
-    int q3h; \
-    OD_IDST_2_ASYM_PR(q3, q2); \
-    OD_IDCT_2_ASYM_PR(q0, q1, q1h); \
-    q3h = OD_RSHIFT1(q3); \
-    q0 += q3h; \
-    q3 = q0 - q3; \
-    q2 = q1h - q2; \
-    q1 -= q2; \
-  } \
-  while (0)
-
-#define OD_FDCT_4_ASYM_PR(q0, q2, q2h, q1, q3, q3h) \
-  /* Embedded 4-point asymmetric Type-II fDCT. */ \
-  do { \
-    q0 += q3h; \
-    q3 = q0 - q3; \
-    q1 = q2h - q1; \
-    q2 = q1 - q2; \
-    OD_FDCT_2_PR(q0, q2); \
-    OD_FDST_2_PR(q3, q1); \
-  } \
-  while (0)
-
-#define OD_IDCT_4_ASYM_PR(q0, q2, q1, q1h, q3, q3h) \
-  /* Embedded 4-point asymmetric Type-II iDCT. */ \
-  do { \
-    OD_IDST_2_PR(q3, q2); \
-    OD_IDCT_2_PR(q0, q1); \
-    q1 = q2 - q1; \
-    q1h = OD_RSHIFT1(q1); \
-    q2 = q1h - q2; \
-    q3 = q0 - q3; \
-    q3h = OD_RSHIFT1(q3); \
-    q0 -= q3h; \
-  } \
-  while (0)
-
-#define OD_FDST_4_PR(q0, q2, q1, q3) \
-  /* Embedded 4-point orthonormal Type-IV fDST. */ \
-  do { \
-    int q0h; \
-    int q1h; \
-    /* 13573/32768 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
-    OD_DCT_OVERFLOW_CHECK(q1, 13573, 16384, 190); \
-    q2 += (q1*13573 + 16384) >> 15; \
-    /* 5793/8192 ~= Sin[Pi/4] ~= 0.707106781186547 */ \
-    OD_DCT_OVERFLOW_CHECK(q2, 5793, 4096, 191); \
-    q1 -= (q2*5793 + 4096) >> 13; \
-    /* 3393/8192 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
-    OD_DCT_OVERFLOW_CHECK(q1, 3393, 4096, 192); \
-    q2 += (q1*3393 + 4096) >> 13; \
-    q0 += q2; \
-    q0h = OD_RSHIFT1(q0); \
-    q2 = q0h - q2; \
-    q1 += q3; \
-    q1h = OD_RSHIFT1(q1); \
-    q3 -= q1h; \
-    /* 537/1024 ~= (1/Sqrt[2] - Cos[3*Pi/16]/2)/Sin[3*Pi/16] ~=
-        0.524455699240090 */ \
-    OD_DCT_OVERFLOW_CHECK(q1, 537, 512, 193); \
-    q2 -= (q1*537 + 512) >> 10; \
-    /* 1609/2048 ~= Sqrt[2]*Sin[3*Pi/16] ~= 0.785694958387102 */ \
-    OD_DCT_OVERFLOW_CHECK(q2, 1609, 1024, 194); \
-    q1 += (q2*1609 + 1024) >> 11; \
-    /* 7335/32768 ~= (1/Sqrt[2] - Cos[3*Pi/16])/Sin[3*Pi/16] ~=
-        0.223847182092655 */ \
-    OD_DCT_OVERFLOW_CHECK(q1, 7335, 16384, 195); \
-    q2 += (q1*7335 + 16384) >> 15; \
-    /* 5091/8192 ~= (1/Sqrt[2] - Cos[7*Pi/16]/2)/Sin[7*Pi/16] ~=
-        0.6215036383171189 */ \
-    OD_DCT_OVERFLOW_CHECK(q0, 5091, 4096, 196); \
-    q3 += (q0*5091 + 4096) >> 13; \
-    /* 5681/4096 ~= Sqrt[2]*Sin[7*Pi/16] ~= 1.38703984532215 */ \
-    OD_DCT_OVERFLOW_CHECK(q3, 5681, 2048, 197); \
-    q0 -= (q3*5681 + 2048) >> 12; \
-    /* 4277/8192 ~= (1/Sqrt[2] - Cos[7*Pi/16])/Sin[7*Pi/16] ~=
-        0.52204745462729 */ \
-    OD_DCT_OVERFLOW_CHECK(q0, 4277, 4096, 198); \
-    q3 += (q0*4277 + 4096) >> 13; \
-  } \
-  while (0)
-
-#define OD_IDST_4_PR(q0, q2, q1, q3) \
-  /* Embedded 4-point orthonormal Type-IV iDST. */ \
-  do { \
-    int q0h; \
-    int q2h; \
-    /* 4277/8192 ~= (1/Sqrt[2] - Cos[7*Pi/16])/Sin[7*Pi/16] ~=
-        0.52204745462729 */ \
-    q3 -= (q0*4277 + 4096) >> 13; \
-    /* 5681/4096 ~= Sqrt[2]*Sin[7*Pi/16] ~= 1.38703984532215 */ \
-    q0 += (q3*5681 + 2048) >> 12; \
-    /* 5091/8192 ~= (1/Sqrt[2] - Cos[7*Pi/16]/2)/Sin[7*Pi/16] ~=
-        0.6215036383171189 */ \
-    q3 -= (q0*5091 + 4096) >> 13; \
-    /* 7335/32768 ~= (1/Sqrt[2] - Cos[3*Pi/16])/Sin[3*Pi/16] ~=
-        0.223847182092655 */ \
-    q1 -= (q2*7335 + 16384) >> 15; \
-    /* 1609/2048 ~= Sqrt[2]*Sin[3*Pi/16] ~= 0.785694958387102 */ \
-    q2 -= (q1*1609 + 1024) >> 11; \
-    /* 537/1024 ~= (1/Sqrt[2] - Cos[3*Pi/16]/2)/Sin[3*Pi/16] ~=
-        0.524455699240090 */ \
-    q1 += (q2*537 + 512) >> 10; \
-    q2h = OD_RSHIFT1(q2); \
-    q3 += q2h; \
-    q2 -= q3; \
-    q0h = OD_RSHIFT1(q0); \
-    q1 = q0h - q1; \
-    q0 -= q1; \
-    /* 3393/8192 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
-    q1 -= (q2*3393 + 4096) >> 13; \
-    /* 5793/8192 ~= Sin[Pi/4] ~= 0.707106781186547 */ \
-    q2 += (q1*5793 + 4096) >> 13; \
-    /* 13573/32768 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
-    q1 -= (q2*13573 + 16384) >> 15; \
-  } \
-  while (0)
-
-#define OD_FDST_4_ASYM_PR(t0, t0h, t2, t1, t3) \
-  /* Embedded 4-point asymmetric Type-IV fDST. */ \
-  do { \
-    /* 7489/8192 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
-    OD_DCT_OVERFLOW_CHECK(t1, 7489, 4096, 106); \
-    t2 -= (t1*7489 + 4096) >> 13; \
-    /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
-    OD_DCT_OVERFLOW_CHECK(t1, 11585, 8192, 107); \
-    t1 += (t2*11585 + 8192) >> 14; \
-    /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
-    OD_DCT_OVERFLOW_CHECK(t1, 19195, 16384, 108); \
-    t2 += (t1*19195 + 16384) >> 15; \
-    t3 += OD_RSHIFT1(t2); \
-    t2 -= t3; \
-    t1 = t0h - t1; \
-    t0 -= t1; \
-    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \
-    OD_DCT_OVERFLOW_CHECK(t0, 6723, 4096, 109); \
-    t3 += (t0*6723 + 4096) >> 13; \
-    /* 8035/8192 ~= Sin[7*Pi/16] ~= 0.980785280403230 */ \
-    OD_DCT_OVERFLOW_CHECK(t3, 8035, 4096, 110); \
-    t0 -= (t3*8035 + 4096) >> 13; \
-    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \
-    OD_DCT_OVERFLOW_CHECK(t0, 6723, 4096, 111); \
-    t3 += (t0*6723 + 4096) >> 13; \
-    /* 8757/16384 ~= Tan[5*Pi/32] ~= 0.534511135950792 */ \
-    OD_DCT_OVERFLOW_CHECK(t1, 8757, 8192, 112); \
-    t2 += (t1*8757 + 8192) >> 14; \
-    /* 6811/8192 ~= Sin[5*Pi/16] ~= 0.831469612302545 */ \
-    OD_DCT_OVERFLOW_CHECK(t2, 6811, 4096, 113); \
-    t1 -= (t2*6811 + 4096) >> 13; \
-    /* 8757/16384 ~= Tan[5*Pi/32] ~= 0.534511135950792 */ \
-    OD_DCT_OVERFLOW_CHECK(t1, 8757, 8192, 114); \
-    t2 += (t1*8757 + 8192) >> 14; \
-  } \
-  while (0)
-
-#define OD_IDST_4_ASYM_PR(t0, t0h, t2, t1, t3) \
-  /* Embedded 4-point asymmetric Type-IV iDST. */ \
-  do { \
-    /* 8757/16384 ~= Tan[5*Pi/32] ~= 0.534511135950792 */ \
-    t1 -= (t2*8757 + 8192) >> 14; \
-    /* 6811/8192 ~= Sin[5*Pi/16] ~= 0.831469612302545 */ \
-    t2 += (t1*6811 + 4096) >> 13; \
-    /* 8757/16384 ~= Tan[5*Pi/32] ~= 0.534511135950792 */ \
-    t1 -= (t2*8757 + 8192) >> 14; \
-    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \
-    t3 -= (t0*6723 + 4096) >> 13; \
-    /* 8035/8192 ~= Sin[7*Pi/16] ~= 0.980785280403230 */ \
-    t0 += (t3*8035 + 4096) >> 13; \
-    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \
-    t3 -= (t0*6723 + 4096) >> 13; \
-    t0 += t2; \
-    t0h = OD_RSHIFT1(t0); \
-    t2 = t0h - t2; \
-    t1 += t3; \
-    t3 -= OD_RSHIFT1(t1); \
-    /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
-    t1 -= (t2*19195 + 16384) >> 15; \
-    /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
-    t2 -= (t1*11585 + 8192) >> 14; \
-    /* 7489/8192 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
-    t1 += (t2*7489 + 4096) >> 13; \
-  } \
-  while (0)
-
-#define OD_FDCT_8_PR(r0, r4, r2, r6, r1, r5, r3, r7) \
-  /* Embedded 8-point orthonormal Type-II fDCT. */ \
-  do { \
-    int r4h; \
-    int r5h; \
-    int r6h; \
-    int r7h; \
-    r7 = r0 - r7; \
-    r7h = OD_RSHIFT1(r7); \
-    r0 -= r7h; \
-    r6 += r1; \
-    r6h = OD_RSHIFT1(r6); \
-    r1 = r6h - r1; \
-    r5 = r2 - r5; \
-    r5h = OD_RSHIFT1(r5); \
-    r2 -= r5h; \
-    r4 += r3; \
-    r4h = OD_RSHIFT1(r4); \
-    r3 = r4h - r3; \
-    OD_FDCT_4_ASYM_PR(r0, r4, r4h, r2, r6, r6h); \
-    OD_FDST_4_ASYM_PR(r7, r7h, r3, r5, r1); \
-  } \
-  while (0)
-
-#define OD_IDCT_8_PR(r0, r4, r2, r6, r1, r5, r3, r7) \
-  /* Embedded 8-point orthonormal Type-II iDCT. */ \
-  do { \
-    int r1h; \
-    int r3h; \
-    int r5h; \
-    int r7h; \
-    OD_IDST_4_ASYM_PR(r7, r7h, r5, r6, r4); \
-    OD_IDCT_4_ASYM_PR(r0, r2, r1, r1h, r3, r3h); \
-    r0 += r7h; \
-    r7 = r0 - r7; \
-    r6 = r1h - r6; \
-    r1 -= r6; \
-    r5h = OD_RSHIFT1(r5); \
-    r2 += r5h; \
-    r5 = r2 - r5; \
-    r4 = r3h - r4; \
-    r3 -= r4; \
-  } \
-  while (0)
-
-#define OD_FDCT_8_ASYM_PR(r0, r4, r4h, r2, r6, r6h, r1, r5, r5h, r3, r7, r7h) \
-  /* Embedded 8-point asymmetric Type-II fDCT. */ \
-  do { \
-    r0 += r7h; \
-    r7 = r0 - r7; \
-    r1 = r6h - r1; \
-    r6 -= r1; \
-    r2 += r5h; \
-    r5 = r2 - r5; \
-    r3 = r4h - r3; \
-    r4 -= r3; \
-    OD_FDCT_4_PR(r0, r4, r2, r6); \
-    OD_FDST_4_PR(r7, r3, r5, r1); \
-  } \
-  while (0)
-
-#define OD_IDCT_8_ASYM_PR(r0, r4, r2, r6, r1, r1h, r5, r5h, r3, r3h, r7, r7h) \
-  /* Embedded 8-point asymmetric Type-II iDCT. */ \
-  do { \
-    OD_IDST_4_PR(r7, r5, r6, r4); \
-    OD_IDCT_4_PR(r0, r2, r1, r3); \
-    r7 = r0 - r7; \
-    r7h = OD_RSHIFT1(r7); \
-    r0 -= r7h; \
-    r1 += r6; \
-    r1h = OD_RSHIFT1(r1); \
-    r6 = r1h - r6; \
-    r5 = r2 - r5; \
-    r5h = OD_RSHIFT1(r5); \
-    r2 -= r5h; \
-    r3 += r4; \
-    r3h = OD_RSHIFT1(r3); \
-    r4 = r3h - r4; \
-  } \
-  while (0)
-
-#define OD_FDST_8_PR(t0, t4, t2, t6, t1, t5, t3, t7)  \
-  /* Embedded 8-point orthonormal Type-IV fDST. */ \
-  do { \
-    int t0h; \
-    int t2h; \
-    int t5h; \
-    int t7h; \
-    /* 13573/32768 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
-    OD_DCT_OVERFLOW_CHECK(t1, 13573, 16384, 115); \
-    t6 -= (t1*13573 + 16384) >> 15; \
-    /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186547 */ \
-    OD_DCT_OVERFLOW_CHECK(t6, 11585, 8192, 116); \
-    t1 += (t6*11585 + 8192) >> 14; \
-    /* 13573/32768 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
-    OD_DCT_OVERFLOW_CHECK(t1, 13573, 16384, 117); \
-    t6 -= (t1*13573 + 16384) >> 15; \
-    /* 21895/32768 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
-    OD_DCT_OVERFLOW_CHECK(t2, 21895, 16384, 118); \
-    t5 -= (t2*21895 + 16384) >> 15; \
-    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
-    OD_DCT_OVERFLOW_CHECK(t5, 15137, 8192, 119); \
-    t2 += (t5*15137 + 8192) >> 14; \
-    /* 10947/16384 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
-    OD_DCT_OVERFLOW_CHECK(t2, 10947, 8192, 120); \
-    t5 -= (t2*10947 + 8192) >> 14; \
-    /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \
-    OD_DCT_OVERFLOW_CHECK(t3, 3259, 8192, 121); \
-    t4 -= (t3*3259 + 8192) >> 14; \
-    /* 3135/8192 ~= Sin[Pi/8] ~= 0.382683432365090 */ \
-    OD_DCT_OVERFLOW_CHECK(t4, 3135, 4096, 122); \
-    t3 += (t4*3135 + 4096) >> 13; \
-    /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \
-    OD_DCT_OVERFLOW_CHECK(t3, 3259, 8192, 123); \
-    t4 -= (t3*3259 + 8192) >> 14; \
-    t7 += t1; \
-    t7h = OD_RSHIFT1(t7); \
-    t1 -= t7h; \
-    t2 = t3 - t2; \
-    t2h = OD_RSHIFT1(t2); \
-    t3 -= t2h; \
-    t0 -= t6; \
-    t0h = OD_RSHIFT1(t0); \
-    t6 += t0h; \
-    t5 = t4 - t5; \
-    t5h = OD_RSHIFT1(t5); \
-    t4 -= t5h; \
-    t1 += t5h; \
-    t5 = t1 - t5; \
-    t4 += t0h; \
-    t0 -= t4; \
-    t6 -= t2h; \
-    t2 += t6; \
-    t3 -= t7h; \
-    t7 += t3; \
-    /* TODO: Can we move this into another operation */ \
-    t7 = -t7; \
-    /* 7425/8192 ~= Tan[15*Pi/64] ~= 0.906347169019147 */ \
-    OD_DCT_OVERFLOW_CHECK(t7, 7425, 4096, 124); \
-    t0 -= (t7*7425 + 4096) >> 13; \
-    /* 8153/8192 ~= Sin[15*Pi/32] ~= 0.995184726672197 */ \
-    OD_DCT_OVERFLOW_CHECK(t0, 8153, 4096, 125); \
-    t7 += (t0*8153 + 4096) >> 13; \
-    /* 7425/8192 ~= Tan[15*Pi/64] ~= 0.906347169019147 */ \
-    OD_DCT_OVERFLOW_CHECK(t7, 7425, 4096, 126); \
-    t0 -= (t7*7425 + 4096) >> 13; \
-    /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.148335987538347 */ \
-    OD_DCT_OVERFLOW_CHECK(t1, 4861, 16384, 127); \
-    t6 -= (t1*4861 + 16384) >> 15; \
-    /* 1189/4096 ~= Sin[3*Pi/32] ~= 0.290284677254462 */ \
-    OD_DCT_OVERFLOW_CHECK(t6, 1189, 2048, 128); \
-    t1 += (t6*1189 + 2048) >> 12; \
-    /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.148335987538347 */ \
-    OD_DCT_OVERFLOW_CHECK(t1, 4861, 16384, 129); \
-    t6 -= (t1*4861 + 16384) >> 15; \
-    /* 2455/4096 ~= Tan[11*Pi/64] ~= 0.599376933681924 */ \
-    OD_DCT_OVERFLOW_CHECK(t5, 2455, 2048, 130); \
-    t2 -= (t5*2455 + 2048) >> 12; \
-    /* 7225/8192 ~= Sin[11*Pi/32] ~= 0.881921264348355 */ \
-    OD_DCT_OVERFLOW_CHECK(t2, 7225, 4096, 131); \
-    t5 += (t2*7225 + 4096) >> 13; \
-    /* 2455/4096 ~= Tan[11*Pi/64] ~= 0.599376933681924 */ \
-    OD_DCT_OVERFLOW_CHECK(t5, 2455, 2048, 132); \
-    t2 -= (t5*2455 + 2048) >> 12; \
-    /* 11725/32768 ~= Tan[7*Pi/64] ~= 0.357805721314524 */ \
-    OD_DCT_OVERFLOW_CHECK(t3, 11725, 16384, 133); \
-    t4 -= (t3*11725 + 16384) >> 15; \
-    /* 5197/8192 ~= Sin[7*Pi/32] ~= 0.634393284163645 */ \
-    OD_DCT_OVERFLOW_CHECK(t4, 5197, 4096, 134); \
-    t3 += (t4*5197 + 4096) >> 13; \
-    /* 11725/32768 ~= Tan[7*Pi/64] ~= 0.357805721314524 */ \
-    OD_DCT_OVERFLOW_CHECK(t3, 11725, 16384, 135); \
-    t4 -= (t3*11725 + 16384) >> 15; \
-  } \
-  while (0)
-
-#define OD_IDST_8_PR(t0, t4, t2, t6, t1, t5, t3, t7) \
-  /* Embedded 8-point orthonormal Type-IV iDST. */ \
-  do { \
-    int t0h; \
-    int t2h; \
-    int t5h_; \
-    int t7h_; \
-    /* 11725/32768 ~= Tan[7*Pi/64] ~= 0.357805721314524 */ \
-    t1 += (t6*11725 + 16384) >> 15; \
-    /* 5197/8192 ~= Sin[7*Pi/32] ~= 0.634393284163645 */ \
-    t6 -= (t1*5197 + 4096) >> 13; \
-    /* 11725/32768 ~= Tan[7*Pi/64] ~= 0.357805721314524 */ \
-    t1 += (t6*11725 + 16384) >> 15; \
-    /* 2455/4096 ~= Tan[11*Pi/64] ~= 0.599376933681924 */ \
-    t2 += (t5*2455 + 2048) >> 12; \
-    /* 7225/8192 ~= Sin[11*Pi/32] ~= 0.881921264348355 */ \
-    t5 -= (t2*7225 + 4096) >> 13; \
-    /* 2455/4096 ~= Tan[11*Pi/64] ~= 0.599376933681924 */ \
-    t2 += (t5*2455 + 2048) >> 12; \
-    /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.148335987538347 */ \
-    t3 += (t4*4861 + 16384) >> 15; \
-    /* 1189/4096 ~= Sin[3*Pi/32] ~= 0.290284677254462 */ \
-    t4 -= (t3*1189 + 2048) >> 12; \
-    /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.148335987538347 */ \
-    t3 += (t4*4861 + 16384) >> 15; \
-    /* 7425/8192 ~= Tan[15*Pi/64] ~= 0.906347169019147 */ \
-    t0 += (t7*7425 + 4096) >> 13; \
-    /* 8153/8192 ~= Sin[15*Pi/32] ~= 0.995184726672197 */ \
-    t7 -= (t0*8153 + 4096) >> 13; \
-    /* 7425/8192 ~= Tan[15*Pi/64] ~= 0.906347169019147 */ \
-    t0 += (t7*7425 + 4096) >> 13; \
-    /* TODO: Can we move this into another operation */ \
-    t7 = -t7; \
-    t7 -= t6; \
-    t7h_ = OD_RSHIFT1(t7); \
-    t6 += t7h_; \
-    t2 -= t3; \
-    t2h = OD_RSHIFT1(t2); \
-    t3 += t2h; \
-    t0 += t1; \
-    t0h = OD_RSHIFT1(t0); \
-    t1 -= t0h; \
-    t5 = t4 - t5; \
-    t5h_ = OD_RSHIFT1(t5); \
-    t4 -= t5h_; \
-    t1 += t5h_; \
-    t5 = t1 - t5; \
-    t3 -= t0h; \
-    t0 += t3; \
-    t6 += t2h; \
-    t2 = t6 - t2; \
-    t4 += t7h_; \
-    t7 -= t4; \
-    /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \
-    t1 += (t6*3259 + 8192) >> 14; \
-    /* 3135/8192 ~= Sin[Pi/8] ~= 0.382683432365090 */ \
-    t6 -= (t1*3135 + 4096) >> 13; \
-    /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \
-    t1 += (t6*3259 + 8192) >> 14; \
-    /* 10947/16384 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
-    t5 += (t2*10947 + 8192) >> 14; \
-    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
-    t2 -= (t5*15137 + 8192) >> 14; \
-    /* 21895/32768 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
-    t5 += (t2*21895 + 16384) >> 15; \
-    /* 13573/32768 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
-    t3 += (t4*13573 + 16384) >> 15; \
-    /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186547 */ \
-    t4 -= (t3*11585 + 8192) >> 14; \
-    /* 13573/32768 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
-    t3 += (t4*13573 + 16384) >> 15; \
-  } \
-  while (0)
-
-/* Rewrite this so that t0h can be passed in. */
-#define OD_FDST_8_ASYM_PR(t0, t4, t2, t6, t1, t5, t3, t7) \
-  /* Embedded 8-point asymmetric Type-IV fDST. */ \
-  do { \
-    int t0h; \
-    int t2h; \
-    int t5h; \
-    int t7h; \
-    /* 1035/2048 ~= (Sqrt[2] - Cos[7*Pi/32])/(2*Sin[7*Pi/32]) */ \
-    OD_DCT_OVERFLOW_CHECK(t1, 1035, 1024, 199); \
-    t6 += (t1*1035 + 1024) >> 11; \
-    /* 3675/4096 ~= Sqrt[2]*Sin[7*Pi/32] */ \
-    OD_DCT_OVERFLOW_CHECK(t6, 3675, 2048, 200); \
-    t1 -= (t6*3675 + 2048) >> 12; \
-    /* 851/8192 ~= (Cos[7*Pi/32] - 1/Sqrt[2])/Sin[7*Pi/32] */ \
-    OD_DCT_OVERFLOW_CHECK(t1, 851, 4096, 201); \
-    t6 -= (t1*851 + 4096) >> 13; \
-    /* 4379/8192 ~= (Sqrt[2] - Sin[5*Pi/32])/(2*Cos[5*Pi/32]) */ \
-    OD_DCT_OVERFLOW_CHECK(t2, 4379, 4096, 202); \
-    t5 += (t2*4379 + 4096) >> 13; \
-    /* 10217/8192 ~= Sqrt[2]*Cos[5*Pi/32] */ \
-    OD_DCT_OVERFLOW_CHECK(t5, 10217, 4096, 203); \
-    t2 -= (t5*10217 + 4096) >> 13; \
-    /* 4379/16384 ~= (1/Sqrt[2] - Sin[5*Pi/32])/Cos[5*Pi/32] */ \
-    OD_DCT_OVERFLOW_CHECK(t2, 4379, 8192, 204); \
-    t5 += (t2*4379 + 8192) >> 14; \
-    /* 12905/16384 ~= (Sqrt[2] - Cos[3*Pi/32])/(2*Sin[3*Pi/32]) */ \
-    OD_DCT_OVERFLOW_CHECK(t3, 12905, 8192, 205); \
-    t4 += (t3*12905 + 8192) >> 14; \
-    /* 3363/8192 ~= Sqrt[2]*Sin[3*Pi/32] */ \
-    OD_DCT_OVERFLOW_CHECK(t4, 3363, 4096, 206); \
-    t3 -= (t4*3363 + 4096) >> 13; \
-    /* 3525/4096 ~= (Cos[3*Pi/32] - 1/Sqrt[2])/Sin[3*Pi/32] */ \
-    OD_DCT_OVERFLOW_CHECK(t3, 3525, 2048, 207); \
-    t4 -= (t3*3525 + 2048) >> 12; \
-    /* 5417/8192 ~= (Sqrt[2] - Sin[Pi/32])/(2*Cos[Pi/32]) */ \
-    OD_DCT_OVERFLOW_CHECK(t0, 5417, 4096, 208); \
-    t7 += (t0*5417 + 4096) >> 13; \
-    /* 5765/4096 ~= Sqrt[2]*Cos[Pi/32] */ \
-    OD_DCT_OVERFLOW_CHECK(t7, 5765, 2048, 209); \
-    t0 -= (t7*5765 + 2048) >> 12; \
-    /* 2507/4096 ~= (1/Sqrt[2] - Sin[Pi/32])/Cos[Pi/32] */ \
-    OD_DCT_OVERFLOW_CHECK(t0, 2507, 2048, 210); \
-    t7 += (t0*2507 + 2048) >> 12; \
-    t0 += t1; \
-    t0h = OD_RSHIFT1(t0); \
-    t1 -= t0h; \
-    t2 -= t3; \
-    t2h = OD_RSHIFT1(t2); \
-    t3 += t2h; \
-    t5 -= t4; \
-    t5h = OD_RSHIFT1(t5); \
-    t4 += t5h; \
-    t7 += t6; \
-    t7h = OD_RSHIFT1(t7); \
-    t6 = t7h - t6; \
-    t4 = t7h - t4; \
-    t7 -= t4; \
-    t1 += t5h; \
-    t5 = t1 - t5; \
-    t6 += t2h; \
-    t2 = t6 - t2; \
-    t3 -= t0h; \
-    t0 += t3; \
-    /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \
-    OD_DCT_OVERFLOW_CHECK(t6, 3259, 8192, 211); \
-    t1 += (t6*3259 + 8192) >> 14; \
-    /* 3135/8192 ~= Sin[Pi/8] ~= 0.382683432365090 */ \
-    OD_DCT_OVERFLOW_CHECK(t1, 3135, 4096, 212); \
-    t6 -= (t1*3135 + 4096) >> 13; \
-    /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \
-    OD_DCT_OVERFLOW_CHECK(t6, 3259, 8192, 213); \
-    t1 += (t6*3259 + 8192) >> 14; \
-    /* 2737/4096 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
-    OD_DCT_OVERFLOW_CHECK(t2, 2737, 2048, 214); \
-    t5 += (t2*2737 + 2048) >> 12; \
-    /* 473/512 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
-    OD_DCT_OVERFLOW_CHECK(t5, 473, 256, 215); \
-    t2 -= (t5*473 + 256) >> 9; \
-    /* 2737/4096 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
-    OD_DCT_OVERFLOW_CHECK(t2, 2737, 2048, 216); \
-    t5 += (t2*2737 + 2048) >> 12; \
-    /* 3393/8192 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
-    OD_DCT_OVERFLOW_CHECK(t4, 3393, 4096, 217); \
-    t3 += (t4*3393 + 4096) >> 13; \
-    /* 5793/8192 ~= Sin[Pi/4] ~= 0.707106781186547 */ \
-    OD_DCT_OVERFLOW_CHECK(t3, 5793, 4096, 218); \
-    t4 -= (t3*5793 + 4096) >> 13; \
-    /* 3393/8192 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
-    OD_DCT_OVERFLOW_CHECK(t4, 3393, 4096, 219); \
-    t3 += (t4*3393 + 4096) >> 13; \
-  } \
-  while (0)
-
-#define OD_IDST_8_ASYM_PR(t0, t4, t2, t6, t1, t5, t3, t7) \
-  /* Embedded 8-point asymmetric Type-IV iDST. */ \
-  do { \
-    int t0h; \
-    int t2h; \
-    int t5h__; \
-    int t7h__; \
-    /* 3393/8192 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
-    t6 -= (t1*3393 + 4096) >> 13; \
-    /* 5793/8192 ~= Sin[Pi/4] ~= 0.707106781186547 */ \
-    t1 += (t6*5793 + 4096) >> 13; \
-    /* 3393/8192 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
-    t6 -= (t1*3393 + 4096) >> 13; \
-    /* 2737/4096 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
-    t5 -= (t2*2737 + 2048) >> 12; \
-    /* 473/512 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
-    t2 += (t5*473 + 256) >> 9; \
-    /* 2737/4096 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
-    t5 -= (t2*2737 + 2048) >> 12; \
-    /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \
-    t4 -= (t3*3259 + 8192) >> 14; \
-    /* 3135/8192 ~= Sin[Pi/8] ~= 0.382683432365090 */ \
-    t3 += (t4*3135 + 4096) >> 13; \
-    /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \
-    t4 -= (t3*3259 + 8192) >> 14; \
-    t0 -= t6; \
-    t0h = OD_RSHIFT1(t0); \
-    t6 += t0h; \
-    t2 = t3 - t2; \
-    t2h = OD_RSHIFT1(t2); \
-    t3 -= t2h; \
-    t5 = t4 - t5; \
-    t5h__ = OD_RSHIFT1(t5); \
-    t4 -= t5h__; \
-    t7 += t1; \
-    t7h__ = OD_RSHIFT1(t7); \
-    t1 = t7h__ - t1; \
-    t3 = t7h__ - t3; \
-    t7 -= t3; \
-    t1 -= t5h__; \
-    t5 += t1; \
-    t6 -= t2h; \
-    t2 += t6; \
-    t4 += t0h; \
-    t0 -= t4; \
-    /* 2507/4096 ~= (1/Sqrt[2] - Sin[Pi/32])/Cos[Pi/32] */ \
-    t7 -= (t0*2507 + 2048) >> 12; \
-    /* 5765/4096 ~= Sqrt[2]*Cos[Pi/32] */ \
-    t0 += (t7*5765 + 2048) >> 12; \
-    /* 5417/8192 ~= (Sqrt[2] - Sin[Pi/32])/(2*Cos[Pi/32]) */ \
-    t7 -= (t0*5417 + 4096) >> 13; \
-    /* 3525/4096 ~= (Cos[3*Pi/32] - 1/Sqrt[2])/Sin[3*Pi/32] */ \
-    t1 += (t6*3525 + 2048) >> 12; \
-    /* 3363/8192 ~= Sqrt[2]*Sin[3*Pi/32] */ \
-    t6 += (t1*3363 + 4096) >> 13; \
-    /* 12905/16384 ~= (1/Sqrt[2] - Cos[3*Pi/32]/1)/Sin[3*Pi/32] */ \
-    t1 -= (t6*12905 + 8192) >> 14; \
-    /* 4379/16384 ~= (1/Sqrt[2] - Sin[5*Pi/32])/Cos[5*Pi/32] */ \
-    t5 -= (t2*4379 + 8192) >> 14; \
-    /* 10217/8192 ~= Sqrt[2]*Cos[5*Pi/32] */ \
-    t2 += (t5*10217 + 4096) >> 13; \
-    /* 4379/8192 ~= (Sqrt[2] - Sin[5*Pi/32])/(2*Cos[5*Pi/32]) */ \
-    t5 -= (t2*4379 + 4096) >> 13; \
-    /* 851/8192 ~= (Cos[7*Pi/32] - 1/Sqrt[2])/Sin[7*Pi/32] */ \
-    t3 += (t4*851 + 4096) >> 13; \
-    /* 3675/4096 ~= Sqrt[2]*Sin[7*Pi/32] */ \
-    t4 += (t3*3675 + 2048) >> 12; \
-    /* 1035/2048 ~= (Sqrt[2] - Cos[7*Pi/32])/(2*Sin[7*Pi/32]) */ \
-    t3 -= (t4*1035 + 1024) >> 11; \
-  } \
-  while (0)
-
-#define OD_FDCT_16_PR(s0, s8, s4, sc, s2, sa, s6, se, \
-  s1, s9, s5, sd, s3, sb, s7, sf) \
-  /* Embedded 16-point orthonormal Type-II fDCT. */ \
-  do { \
-    int s8h; \
-    int sah; \
-    int sch; \
-    int seh; \
-    int sfh; \
-    sf = s0 - sf; \
-    sfh = OD_RSHIFT1(sf); \
-    s0 -= sfh; \
-    se += s1; \
-    seh = OD_RSHIFT1(se); \
-    s1 = seh - s1; \
-    sd = s2 - sd; \
-    s2 -= OD_RSHIFT1(sd); \
-    sc += s3; \
-    sch = OD_RSHIFT1(sc); \
-    s3 = sch - s3; \
-    sb = s4 - sb; \
-    s4 -= OD_RSHIFT1(sb); \
-    sa += s5; \
-    sah = OD_RSHIFT1(sa); \
-    s5 = sah - s5; \
-    s9 = s6 - s9; \
-    s6 -= OD_RSHIFT1(s9); \
-    s8 += s7; \
-    s8h = OD_RSHIFT1(s8); \
-    s7 = s8h - s7; \
-    OD_FDCT_8_ASYM_PR(s0, s8, s8h, s4, sc, sch, s2, sa, sah, s6, se, seh); \
-    OD_FDST_8_ASYM_PR(sf, s7, sb, s3, sd, s5, s9, s1); \
-  } \
-  while (0)
-
-#define OD_IDCT_16_PR(s0, s8, s4, sc, s2, sa, s6, se, \
-  s1, s9, s5, sd, s3, sb, s7, sf) \
-  /* Embedded 16-point orthonormal Type-II iDCT. */ \
-  do { \
-    int s1h; \
-    int s3h; \
-    int s5h; \
-    int s7h; \
-    int sfh; \
-    OD_IDST_8_ASYM_PR(sf, sb, sd, s9, se, sa, sc, s8); \
-    OD_IDCT_8_ASYM_PR(s0, s4, s2, s6, s1, s1h, s5, s5h, s3, s3h, s7, s7h); \
-    sfh = OD_RSHIFT1(sf); \
-    s0 += sfh; \
-    sf = s0 - sf; \
-    se = s1h - se; \
-    s1 -= se; \
-    s2 += OD_RSHIFT1(sd); \
-    sd = s2 - sd; \
-    sc = s3h - sc; \
-    s3 -= sc; \
-    s4 += OD_RSHIFT1(sb); \
-    sb = s4 - sb; \
-    sa = s5h - sa; \
-    s5 -= sa; \
-    s6 += OD_RSHIFT1(s9); \
-    s9 = s6 - s9; \
-    s8 = s7h - s8; \
-    s7 -= s8; \
-  } \
-  while (0)
-
-#define OD_FDCT_16_ASYM_PR(t0, t8, t8h, t4, tc, tch, t2, ta, tah, t6, te, teh, \
-  t1, t9, t9h, t5, td, tdh, t3, tb, tbh, t7, tf, tfh) \
-  /* Embedded 16-point asymmetric Type-II fDCT. */ \
-  do { \
-    t0 += tfh; \
-    tf = t0 - tf; \
-    t1 -= teh; \
-    te += t1; \
-    t2 += tdh; \
-    td = t2 - td; \
-    t3 -= tch; \
-    tc += t3; \
-    t4 += tbh; \
-    tb = t4 - tb; \
-    t5 -= tah; \
-    ta += t5; \
-    t6 += t9h; \
-    t9 = t6 - t9; \
-    t7 -= t8h; \
-    t8 += t7; \
-    OD_FDCT_8_PR(t0, t8, t4, tc, t2, ta, t6, te); \
-    OD_FDST_8_PR(tf, t7, tb, t3, td, t5, t9, t1); \
-  } \
-  while (0)
-
-#define OD_IDCT_16_ASYM_PR(t0, t8, t4, tc, t2, ta, t6, te, \
-  t1, t1h, t9, t9h, t5, t5h, td, tdh, t3, t3h, tb, tbh, t7, t7h, tf, tfh) \
-  /* Embedded 16-point asymmetric Type-II iDCT. */ \
-  do { \
-    OD_IDST_8_PR(tf, tb, td, t9, te, ta, tc, t8); \
-    OD_IDCT_8_PR(t0, t4, t2, t6, t1, t5, t3, t7); \
-    t1 -= te; \
-    t1h = OD_RSHIFT1(t1); \
-    te += t1h; \
-    t9 = t6 - t9; \
-    t9h = OD_RSHIFT1(t9); \
-    t6 -= t9h; \
-    t5 -= ta; \
-    t5h = OD_RSHIFT1(t5); \
-    ta += t5h; \
-    td = t2 - td; \
-    tdh = OD_RSHIFT1(td); \
-    t2 -= tdh; \
-    t3 -= tc; \
-    t3h = OD_RSHIFT1(t3); \
-    tc += t3h; \
-    tb = t4 - tb; \
-    tbh = OD_RSHIFT1(tb); \
-    t4 -= tbh; \
-    t7 -= t8; \
-    t7h = OD_RSHIFT1(t7); \
-    t8 += t7h; \
-    tf = t0 - tf; \
-    tfh = OD_RSHIFT1(tf); \
-    t0 -= tfh; \
-  } \
-  while (0)
-
-#define OD_FDST_16_PR(s0, s8, s4, sc, s2, sa, s6, se, \
-  s1, s9, s5, sd, s3, sb, s7, sf) \
-  /* Embedded 16-point orthonormal Type-IV fDST. */ \
-  do { \
-    int s0h; \
-    int s2h; \
-    int sdh; \
-    int sfh; \
-    /* 13573/32768 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
-    OD_DCT_OVERFLOW_CHECK(s3, 13573, 16384, 220); \
-    s1 += (se*13573 + 16384) >> 15; \
-    /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186547 */ \
-    OD_DCT_OVERFLOW_CHECK(s1, 11585, 8192, 221); \
-    se -= (s1*11585 + 8192) >> 14; \
-    /* 13573/32768 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
-    OD_DCT_OVERFLOW_CHECK(s3, 13573, 16384, 222); \
-    s1 += (se*13573 + 16384) >> 15; \
-    /* 21895/32768 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
-    OD_DCT_OVERFLOW_CHECK(s2, 21895, 16384, 223); \
-    sd += (s2*21895 + 16384) >> 15; \
-    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
-    OD_DCT_OVERFLOW_CHECK(sd, 15137, 16384, 224); \
-    s2 -= (sd*15137 + 8192) >> 14; \
-    /* 21895/32768 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
-    OD_DCT_OVERFLOW_CHECK(s2, 21895, 16384, 225); \
-    sd += (s2*21895 + 16384) >> 15; \
-    /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \
-    OD_DCT_OVERFLOW_CHECK(s3, 3259, 8192, 226); \
-    sc += (s3*3259 + 8192) >> 14; \
-    /* 3135/8192 ~= Sin[Pi/8] ~= 0.382683432365090 */ \
-    OD_DCT_OVERFLOW_CHECK(sc, 3135, 4096, 227); \
-    s3 -= (sc*3135 + 4096) >> 13; \
-    /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \
-    OD_DCT_OVERFLOW_CHECK(s3, 3259, 8192, 228); \
-    sc += (s3*3259 + 8192) >> 14; \
-    /* 13573/32768 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
-    OD_DCT_OVERFLOW_CHECK(s5, 13573, 16384, 229); \
-    sa += (s5*13573 + 16384) >> 15; \
-    /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186547 */ \
-    OD_DCT_OVERFLOW_CHECK(sa, 11585, 8192, 230); \
-    s5 -= (sa*11585 + 8192) >> 14; \
-    /* 13573/32768 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
-    OD_DCT_OVERFLOW_CHECK(s5, 13573, 16384, 231); \
-    sa += (s5*13573 + 16384) >> 15; \
-    /* 13573/32768 ~= Tan[pi/8] ~= 0.414213562373095 */ \
-    OD_DCT_OVERFLOW_CHECK(s9, 13573, 16384, 232); \
-    s6 += (s9*13573 + 16384) >> 15; \
-    /* 11585/16384 ~= Sin[pi/4] ~= 0.707106781186547 */ \
-    OD_DCT_OVERFLOW_CHECK(s6, 11585, 8192, 233); \
-    s9 -= (s6*11585 + 8192) >> 14; \
-    /* 13573/32768 ~= Tan[pi/8] ~= 0.414213562373095 */ \
-    OD_DCT_OVERFLOW_CHECK(s9, 13573, 16384, 234); \
-    s6 += (s9*13573 + 16384) >> 15; \
-    sf += se; \
-    sfh = OD_RSHIFT1(sf); \
-    se = sfh - se; \
-    s0 += s1; \
-    s0h = OD_RSHIFT1(s0); \
-    s1 = s0h - s1; \
-    s2 = s3 - s2; \
-    s2h = OD_RSHIFT1(s2); \
-    s3 -= s2h; \
-    sd -= sc; \
-    sdh = OD_RSHIFT1(sd); \
-    sc += sdh; \
-    sa = s4 - sa; \
-    s4 -= OD_RSHIFT1(sa); \
-    s5 += sb; \
-    sb = OD_RSHIFT1(s5) - sb; \
-    s8 += s6; \
-    s6 -= OD_RSHIFT1(s8); \
-    s7 = s9 - s7; \
-    s9 -= OD_RSHIFT1(s7); \
-    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \
-    OD_DCT_OVERFLOW_CHECK(sb, 6723, 4096, 235); \
-    s4 += (sb*6723 + 4096) >> 13; \
-    /* 16069/16384 ~= Sin[7*Pi/16] ~= 0.980785280403230 */ \
-    OD_DCT_OVERFLOW_CHECK(s4, 16069, 8192, 236); \
-    sb -= (s4*16069 + 8192) >> 14; \
-    /* 6723/8192 ~= Tan[7*Pi/32]) ~= 0.820678790828660 */ \
-    OD_DCT_OVERFLOW_CHECK(sb, 6723, 4096, 237); \
-    s4 += (sb*6723 + 4096) >> 13; \
-    /* 8757/16384 ~= Tan[5*Pi/32]) ~= 0.534511135950792 */ \
-    OD_DCT_OVERFLOW_CHECK(s5, 8757, 8192, 238); \
-    sa += (s5*8757 + 8192) >> 14; \
-    /* 6811/8192 ~= Sin[5*Pi/16] ~= 0.831469612302545 */ \
-    OD_DCT_OVERFLOW_CHECK(sa, 6811, 4096, 239); \
-    s5 -= (sa*6811 + 4096) >> 13; \
-    /* 8757/16384 ~= Tan[5*Pi/32] ~= 0.534511135950792 */ \
-    OD_DCT_OVERFLOW_CHECK(s5, 8757, 8192, 240); \
-    sa += (s5*8757 + 8192) >> 14; \
-    /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \
-    OD_DCT_OVERFLOW_CHECK(s9, 2485, 4096, 241); \
-    s6 += (s9*2485 + 4096) >> 13; \
-    /* 4551/8192 ~= Sin[3*Pi/16] ~= 0.555570233019602 */ \
-    OD_DCT_OVERFLOW_CHECK(s6, 4551, 4096, 242); \
-    s9 -= (s6*4551 + 4096) >> 13; \
-    /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \
-    OD_DCT_OVERFLOW_CHECK(s9, 2485, 4096, 243); \
-    s6 += (s9*2485 + 4096) >> 13; \
-    /* 3227/32768 ~= Tan[Pi/32] ~= 0.09849140335716425 */ \
-    OD_DCT_OVERFLOW_CHECK(s8, 3227, 16384, 244); \
-    s7 += (s8*3227 + 16384) >> 15; \
-    /* 6393/32768 ~= Sin[Pi/16] ~= 0.19509032201612825 */ \
-    OD_DCT_OVERFLOW_CHECK(s7, 6393, 16384, 245); \
-    s8 -= (s7*6393 + 16384) >> 15; \
-    /* 3227/32768 ~= Tan[Pi/32] ~= 0.09849140335716425 */ \
-    OD_DCT_OVERFLOW_CHECK(s8, 3227, 16384, 246); \
-    s7 += (s8*3227 + 16384) >> 15; \
-    s1 -= s2h; \
-    s2 += s1; \
-    se += sdh; \
-    sd = se - sd; \
-    s3 += sfh; \
-    sf -= s3; \
-    sc = s0h - sc; \
-    s0 -= sc; \
-    sb += OD_RSHIFT1(s8); \
-    s8 = sb - s8; \
-    s4 += OD_RSHIFT1(s7); \
-    s7 -= s4; \
-    s6 += OD_RSHIFT1(s5); \
-    s5 = s6 - s5; \
-    s9 -= OD_RSHIFT1(sa); \
-    sa += s9; \
-    s8 += s0; \
-    s0 -= OD_RSHIFT1(s8); \
-    sf += s7; \
-    s7 = OD_RSHIFT1(sf) - s7; \
-    s1 -= s6; \
-    s6 += OD_RSHIFT1(s1); \
-    s9 += se; \
-    se = OD_RSHIFT1(s9) - se; \
-    s2 += sa; \
-    sa = OD_RSHIFT1(s2) - sa; \
-    s5 += sd; \
-    sd -= OD_RSHIFT1(s5); \
-    s4 = sc - s4; \
-    sc -= OD_RSHIFT1(s4); \
-    s3 -= sb; \
-    sb += OD_RSHIFT1(s3); \
-    /* 2799/4096 ~= (1/Sqrt[2] - Cos[31*Pi/64]/2)/Sin[31*Pi/64] */ \
-    OD_DCT_OVERFLOW_CHECK(sf, 2799, 2048, 247); \
-    s0 -= (sf*2799 + 2048) >> 12; \
-    /* 2893/2048 ~= Sqrt[2]*Sin[31*Pi/64] */ \
-    OD_DCT_OVERFLOW_CHECK(s0, 2893, 1024, 248); \
-    sf += (s0*2893 + 1024) >> 11; \
-    /* 5397/8192 ~= (Cos[Pi/4] - Cos[31*Pi/64])/Sin[31*Pi/64] */ \
-    OD_DCT_OVERFLOW_CHECK(sf, 5397, 4096, 249); \
-    s0 -= (sf*5397 + 4096) >> 13; \
-    /* 41/64 ~= (1/Sqrt[2] - Cos[29*Pi/64]/2)/Sin[29*Pi/64] */ \
-    OD_DCT_OVERFLOW_CHECK(s1, 41, 32, 250); \
-    se += (s1*41 + 32) >> 6; \
-    /* 2865/2048 ~= Sqrt[2]*Sin[29*Pi/64] */ \
-    OD_DCT_OVERFLOW_CHECK(se, 2865, 1024, 251); \
-    s1 -= (se*2865 + 1024) >> 11; \
-    /* 4641/8192 ~= (1/Sqrt[2] - Cos[29*Pi/64])/Sin[29*Pi/64] */ \
-    OD_DCT_OVERFLOW_CHECK(s1, 4641, 4096, 252); \
-    se += (s1*4641 + 4096) >> 13; \
-    /* 2473/4096 ~= (1/Sqrt[2] - Cos[27*Pi/64]/2)/Sin[27*Pi/64] */ \
-    OD_DCT_OVERFLOW_CHECK(s2, 2473, 2048, 253); \
-    sd += (s2*2473 + 2048) >> 12; \
-    /* 5619/4096 ~= Sqrt[2]*Sin[27*Pi/64] */ \
-    OD_DCT_OVERFLOW_CHECK(sd, 5619, 2048, 254); \
-    s2 -= (sd*5619 + 2048) >> 12; \
-    /* 7839/16384 ~= (1/Sqrt[2] - Cos[27*Pi/64])/Sin[27*Pi/64] */ \
-    OD_DCT_OVERFLOW_CHECK(s2, 7839, 8192, 255); \
-    sd += (s2*7839 + 8192) >> 14; \
-    /* 5747/8192 ~= (1/Sqrt[2] - Cos[7*Pi/64]/2)/Sin[7*Pi/64] */ \
-    OD_DCT_OVERFLOW_CHECK(s3, 5747, 4096, 256); \
-    sc -= (s3*5747 + 4096) >> 13; \
-    /* 3903/8192 ~= Sqrt[2]*Sin[7*Pi/64] ~= */ \
-    OD_DCT_OVERFLOW_CHECK(sc, 3903, 4096, 257); \
-    s3 += (sc*3903 + 4096) >> 13; \
-    /* 5701/8192 ~= (1/Sqrt[2] - Cos[7*Pi/64])/Sin[7*Pi/64] */ \
-    OD_DCT_OVERFLOW_CHECK(s3, 5701, 4096, 258); \
-    sc += (s3*5701 + 4096) >> 13; \
-    /* 4471/8192 ~= (1/Sqrt[2] - Cos[23*Pi/64]/2)/Sin[23*Pi/64] */ \
-    OD_DCT_OVERFLOW_CHECK(s4, 4471, 4096, 259); \
-    sb += (s4*4471 + 4096) >> 13; \
-    /* 1309/1024 ~= Sqrt[2]*Sin[23*Pi/64] */ \
-    OD_DCT_OVERFLOW_CHECK(sb, 1309, 512, 260); \
-    s4 -= (sb*1309 + 512) >> 10; \
-    /* 5067/16384 ~= (1/Sqrt[2] - Cos[23*Pi/64])/Sin[23*Pi/64] */ \
-    OD_DCT_OVERFLOW_CHECK(s4, 5067, 8192, 261); \
-    sb += (s4*5067 + 8192) >> 14; \
-    /* 2217/4096 ~= (1/Sqrt[2] - Cos[11*Pi/64]/2)/Sin[11*Pi/64] */ \
-    OD_DCT_OVERFLOW_CHECK(s5, 2217, 2048, 262); \
-    sa -= (s5*2217 + 2048) >> 12; \
-    /* 1489/2048 ~= Sqrt[2]*Sin[11*Pi/64] ~= 0.72705107329128 */ \
-    OD_DCT_OVERFLOW_CHECK(sa, 1489, 1024, 263); \
-    s5 += (sa*1489 + 1024) >> 11; \
-    /* 75/256 ~= (1/Sqrt[2] - Cos[11*Pi/64])/Sin[11*Pi/64] */ \
-    OD_DCT_OVERFLOW_CHECK(s5, 75, 128, 264); \
-    sa += (s5*75 + 128) >> 8; \
-    /* 2087/4096 ~= (1/Sqrt[2] - Cos[19*Pi/64]/2)/Sin[19*Pi/64] */ \
-    OD_DCT_OVERFLOW_CHECK(s9, 2087, 2048, 265); \
-    s6 -= (s9*2087 + 2048) >> 12; \
-    /* 4653/4096 ~= Sqrt[2]*Sin[19*Pi/64] */ \
-    OD_DCT_OVERFLOW_CHECK(s6, 4653, 2048, 266); \
-    s9 += (s6*4653 + 2048) >> 12; \
-    /* 4545/32768 ~= (1/Sqrt[2] - Cos[19*Pi/64])/Sin[19*Pi/64] */ \
-    OD_DCT_OVERFLOW_CHECK(s9, 4545, 16384, 267); \
-    s6 -= (s9*4545 + 16384) >> 15; \
-    /* 2053/4096 ~= (1/Sqrt[2] - Cos[15*Pi/64]/2)/Sin[15*Pi/64] */ \
-    OD_DCT_OVERFLOW_CHECK(s8, 2053, 2048, 268); \
-    s7 += (s8*2053 + 2048) >> 12; \
-    /* 1945/2048 ~= Sqrt[2]*Sin[15*Pi/64] */ \
-    OD_DCT_OVERFLOW_CHECK(s7, 1945, 1024, 269); \
-    s8 -= (s7*1945 + 1024) >> 11; \
-    /* 1651/32768 ~= (1/Sqrt[2] - Cos[15*Pi/64])/Sin[15*Pi/64] */ \
-    OD_DCT_OVERFLOW_CHECK(s8, 1651, 16384, 270); \
-    s7 -= (s8*1651 + 16384) >> 15; \
-  } \
-  while (0)
-
-#define OD_IDST_16_PR(s0, s8, s4, sc, s2, sa, s6, se, \
-  s1, s9, s5, sd, s3, sb, s7, sf) \
-  /* Embedded 16-point orthonormal Type-IV iDST. */ \
-  do { \
-    int s0h; \
-    int s4h; \
-    int sbh; \
-    int sfh; \
-    /* 1651/32768 ~= (1/Sqrt[2] - Cos[15*Pi/64])/Sin[15*Pi/64] */ \
-    se += (s1*1651 + 16384) >> 15; \
-    /* 1945/2048 ~= Sqrt[2]*Sin[15*Pi/64] */ \
-    s1 += (se*1945 + 1024) >> 11; \
-    /* 2053/4096 ~= (1/Sqrt[2] - Cos[15*Pi/64]/2)/Sin[15*Pi/64] */ \
-    se -= (s1*2053 + 2048) >> 12; \
-    /* 4545/32768 ~= (1/Sqrt[2] - Cos[19*Pi/64])/Sin[19*Pi/64] */ \
-    s6 += (s9*4545 + 16384) >> 15; \
-    /* 4653/32768 ~= Sqrt[2]*Sin[19*Pi/64] */ \
-    s9 -= (s6*4653 + 2048) >> 12; \
-    /* 2087/4096 ~= (1/Sqrt[2] - Cos[19*Pi/64]/2)/Sin[19*Pi/64] */ \
-    s6 += (s9*2087 + 2048) >> 12; \
-    /* 75/256 ~= (1/Sqrt[2] - Cos[11*Pi/64])/Sin[11*Pi/64] */ \
-    s5 -= (sa*75 + 128) >> 8; \
-    /* 1489/2048 ~= Sqrt[2]*Sin[11*Pi/64] */ \
-    sa -= (s5*1489 + 1024) >> 11; \
-    /* 2217/4096 ~= (1/Sqrt[2] - Cos[11*Pi/64]/2)/Sin[11*Pi/64] */ \
-    s5 += (sa*2217 + 2048) >> 12; \
-    /* 5067/16384 ~= (1/Sqrt[2] - Cos[23*Pi/64])/Sin[23*Pi/64] */ \
-    sd -= (s2*5067 + 8192) >> 14; \
-    /* 1309/1024 ~= Sqrt[2]*Sin[23*Pi/64] */ \
-    s2 += (sd*1309 + 512) >> 10; \
-    /* 4471/8192 ~= (1/Sqrt[2] - Cos[23*Pi/64]/2)/Sin[23*Pi/64] */ \
-    sd -= (s2*4471 + 4096) >> 13; \
-    /* 5701/8192 ~= (1/Sqrt[2] - Cos[7*Pi/64])/Sin[7*Pi/64] */  \
-    s3 -= (sc*5701 + 4096) >> 13; \
-    /* 3903/8192 ~= Sqrt[2]*Sin[7*Pi/64] */ \
-    sc -= (s3*3903 + 4096) >> 13; \
-    /* 5747/8192 ~= (1/Sqrt[2] - Cos[7*Pi/64]/2)/Sin[7*Pi/64] */ \
-    s3 += (sc*5747 + 4096) >> 13; \
-    /* 7839/16384 ~= (1/Sqrt[2] - Cos[27*Pi/64])/Sin[27*Pi/64] */ \
-    sb -= (s4*7839 + 8192) >> 14; \
-    /* 5619/4096 ~= Sqrt[2]*Sin[27*Pi/64] */ \
-    s4 += (sb*5619 + 2048) >> 12; \
-    /* 2473/4096 ~= (1/Sqrt[2] - Cos[27*Pi/64]/2)/Sin[27*Pi/64] */ \
-    sb -= (s4*2473 + 2048) >> 12; \
-    /* 4641/8192 ~= (1/Sqrt[2] - Cos[29*Pi/64])/Sin[29*Pi/64] */ \
-    s7 -= (s8*4641 + 4096) >> 13; \
-    /* 2865/2048 ~= Sqrt[2]*Sin[29*Pi/64] */ \
-    s8 += (s7*2865 + 1024) >> 11; \
-    /* 41/64 ~= (1/Sqrt[2] - Cos[29*Pi/64]/2)/Sin[29*Pi/64] */ \
-    s7 -= (s8*41 + 32) >> 6; \
-    /* 5397/8192 ~= (Cos[Pi/4] - Cos[31*Pi/64])/Sin[31*Pi/64] */ \
-    s0 += (sf*5397 + 4096) >> 13; \
-    /* 2893/2048 ~= Sqrt[2]*Sin[31*Pi/64] */ \
-    sf -= (s0*2893 + 1024) >> 11; \
-    /* 2799/4096 ~= (1/Sqrt[2] - Cos[31*Pi/64]/2)/Sin[31*Pi/64] */ \
-    s0 += (sf*2799 + 2048) >> 12; \
-    sd -= OD_RSHIFT1(sc); \
-    sc += sd; \
-    s3 += OD_RSHIFT1(s2); \
-    s2 = s3 - s2; \
-    sb += OD_RSHIFT1(sa); \
-    sa -= sb; \
-    s5 = OD_RSHIFT1(s4) - s5; \
-    s4 -= s5; \
-    s7 = OD_RSHIFT1(s9) - s7; \
-    s9 -= s7; \
-    s6 -= OD_RSHIFT1(s8); \
-    s8 += s6; \
-    se = OD_RSHIFT1(sf) - se; \
-    sf -= se; \
-    s0 += OD_RSHIFT1(s1); \
-    s1 -= s0; \
-    s5 -= s9; \
-    s9 += OD_RSHIFT1(s5); \
-    sa = s6 - sa; \
-    s6 -= OD_RSHIFT1(sa); \
-    se += s2; \
-    s2 -= OD_RSHIFT1(se); \
-    s1 = sd - s1; \
-    sd -= OD_RSHIFT1(s1); \
-    s0 += s3; \
-    s0h = OD_RSHIFT1(s0); \
-    s3 = s0h - s3; \
-    sf += sc; \
-    sfh = OD_RSHIFT1(sf); \
-    sc -= sfh; \
-    sb = s7 - sb; \
-    sbh = OD_RSHIFT1(sb); \
-    s7 -= sbh; \
-    s4 -= s8; \
-    s4h = OD_RSHIFT1(s4); \
-    s8 += s4h; \
-    /* 3227/32768 ~= Tan[Pi/32] ~= 0.09849140335716425 */ \
-    se -= (s1*3227 + 16384) >> 15; \
-    /* 6393/32768 ~= Sin[Pi/16] ~= 0.19509032201612825 */ \
-    s1 += (se*6393 + 16384) >> 15; \
-    /* 3227/32768 ~= Tan[Pi/32] ~= 0.09849140335716425 */ \
-    se -= (s1*3227 + 16384) >> 15; \
-    /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \
-    s6 -= (s9*2485 + 4096) >> 13; \
-    /* 4551/8192 ~= Sin[3*Pi/16] ~= 0.555570233019602 */ \
-    s9 += (s6*4551 + 4096) >> 13; \
-    /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \
-    s6 -= (s9*2485 + 4096) >> 13; \
-    /* 8757/16384 ~= Tan[5*Pi/32] ~= 0.534511135950792 */ \
-    s5 -= (sa*8757 + 8192) >> 14; \
-    /* 6811/8192 ~= Sin[5*Pi/16] ~= 0.831469612302545 */ \
-    sa += (s5*6811 + 4096) >> 13; \
-    /* 8757/16384 ~= Tan[5*Pi/32] ~= 0.534511135950792 */ \
-    s5 -= (sa*8757 + 8192) >> 14; \
-    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \
-    s2 -= (sd*6723 + 4096) >> 13; \
-    /* 16069/16384 ~= Sin[7*Pi/16] ~= 0.980785280403230 */ \
-    sd += (s2*16069 + 8192) >> 14; \
-    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \
-    s2 -= (sd*6723 + 4096) >> 13; \
-    s9 += OD_RSHIFT1(se); \
-    se = s9 - se; \
-    s6 += OD_RSHIFT1(s1); \
-    s1 -= s6; \
-    sd = OD_RSHIFT1(sa) - sd; \
-    sa -= sd; \
-    s2 += OD_RSHIFT1(s5); \
-    s5 = s2 - s5; \
-    s3 -= sbh; \
-    sb += s3; \
-    sc += s4h; \
-    s4 = sc - s4; \
-    s8 = s0h - s8; \
-    s0 -= s8; \
-    s7 = sfh - s7; \
-    sf -= s7; \
-    /* 13573/32768 ~= Tan[pi/8] ~= 0.414213562373095 */ \
-    s6 -= (s9*13573 + 16384) >> 15; \
-    /* 11585/16384 ~= Sin[pi/4] ~= 0.707106781186547 */ \
-    s9 += (s6*11585 + 8192) >> 14; \
-    /* 13573/32768 ~= Tan[pi/8] ~= 0.414213562373095 */ \
-    s6 -= (s9*13573 + 16384) >> 15; \
-    /* 13573/32768 ~= Tan[pi/8] ~= 0.414213562373095 */ \
-    s5 -= (sa*13573 + 16384) >> 15; \
-    /* 11585/16384 ~= Sin[pi/4] ~= 0.707106781186547 */ \
-    sa += (s5*11585 + 8192) >> 14; \
-    /* 13573/32768 ~= Tan[pi/8] ~= 0.414213562373095 */ \
-    s5 -= (sa*13573 + 16384) >> 15; \
-    /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \
-    s3 -= (sc*3259 + 8192) >> 14; \
-    /* 3135/8192 ~= Sin[Pi/8] ~= 0.382683432365090 */ \
-    sc += (s3*3135 + 4096) >> 13; \
-    /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \
-    s3 -= (sc*3259 + 8192) >> 14; \
-    /* 21895/32768 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
-    sb -= (s4*21895 + 16384) >> 15; \
-    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
-    s4 += (sb*15137 + 8192) >> 14; \
-    /* 21895/32768 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
-    sb -= (s4*21895 + 16384) >> 15; \
-    /* 13573/32768 ~= Tan[pi/8] ~= 0.414213562373095 */ \
-    s8 -= (s7*13573 + 16384) >> 15; \
-    /* 11585/16384 ~= Sin[pi/4] ~= 0.707106781186547 */ \
-    s7 += (s8*11585 + 8192) >> 14; \
-    /* 13573/32768 ~= Tan[pi/8] ~= 0.414213562373095 */ \
-    s8 -= (s7*13573 + 16384) >> 15; \
-  } \
-  while (0)
-
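[Editor's sketch, not code from the removed file: every rotation in the kernels above is factored into three integer lifting steps, and each inverse macro replays the forward steps in reverse order with opposite signs, which is why these integer transforms invert exactly. The helper names below are illustrative; the constants (13573/32768 ~= Tan[Pi/8], 11585/16384 ~= Sin[Pi/4]) are the ones used repeatedly above.]

    #include <stdint.h>

    /* One 3-step lifting rotation in the style of the removed kernels. */
    static void fwd_lift_rot(int32_t *x, int32_t *y) {
      *x += ((*y) * 13573 + 16384) >> 15;
      *y -= ((*x) * 11585 + 8192) >> 14;
      *x += ((*y) * 13573 + 16384) >> 15;
    }

    /* Inverse: the same steps in reverse order with opposite signs. Each
       step updates one value from the other's unchanged value, so every
       truncated shift is reproduced exactly and the original integer
       inputs are recovered bit-exactly. */
    static void inv_lift_rot(int32_t *x, int32_t *y) {
      *x -= ((*y) * 13573 + 16384) >> 15;
      *y += ((*x) * 11585 + 8192) >> 14;
      *x -= ((*y) * 13573 + 16384) >> 15;
    }
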
-/* TODO: rewrite this to match OD_FDST_16. */
-#define OD_FDST_16_ASYM_PR(t0, t0h, t8, t4, t4h, tc, t2, ta, t6, te, \
-  t1, t9, t5, td, t3, tb, t7, t7h, tf) \
-  /* Embedded 16-point asymmetric Type-IV fDST. */ \
-  do { \
-    int t2h; \
-    int t3h; \
-    int t6h; \
-    int t8h; \
-    int t9h; \
-    int tch; \
-    int tdh; \
-    /* TODO: Can we move these into another operation */ \
-    t8 = -t8; \
-    t9 = -t9; \
-    ta = -ta; \
-    tb = -tb; \
-    td = -td; \
-    /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
-    OD_DCT_OVERFLOW_CHECK(te, 13573, 8192, 136); \
-    t1 -= (te*13573 + 8192) >> 14; \
-    /* 11585/32768 ~= Sin[Pi/4]/2 ~= 0.353553390593274 */ \
-    OD_DCT_OVERFLOW_CHECK(t1, 11585, 16384, 137); \
-    te += (t1*11585 + 16384) >> 15; \
-    /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
-    OD_DCT_OVERFLOW_CHECK(te, 13573, 8192, 138); \
-    t1 -= (te*13573 + 8192) >> 14; \
-    /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
-    OD_DCT_OVERFLOW_CHECK(td, 4161, 8192, 139); \
-    t2 += (td*4161 + 8192) >> 14; \
-    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
-    OD_DCT_OVERFLOW_CHECK(t2, 15137, 8192, 140); \
-    td -= (t2*15137 + 8192) >> 14; \
-    /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
-    OD_DCT_OVERFLOW_CHECK(td, 14341, 8192, 141); \
-    t2 += (td*14341 + 8192) >> 14; \
-    /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
-    OD_DCT_OVERFLOW_CHECK(t3, 14341, 8192, 142); \
-    tc -= (t3*14341 + 8192) >> 14; \
-    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
-    OD_DCT_OVERFLOW_CHECK(tc, 15137, 8192, 143); \
-    t3 += (tc*15137 + 8192) >> 14; \
-    /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
-    OD_DCT_OVERFLOW_CHECK(t3, 4161, 8192, 144); \
-    tc -= (t3*4161 + 8192) >> 14; \
-    te = t0h - te; \
-    t0 -= te; \
-    tf = OD_RSHIFT1(t1) - tf; \
-    t1 -= tf; \
-    /* TODO: Can we move this into another operation */ \
-    tc = -tc; \
-    t2 = OD_RSHIFT1(tc) - t2; \
-    tc -= t2; \
-    t3 = OD_RSHIFT1(td) - t3; \
-    td = t3 - td; \
-    /* 7489/8192 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
-    OD_DCT_OVERFLOW_CHECK(t6, 7489, 4096, 145); \
-    t9 -= (t6*7489 + 4096) >> 13; \
-    /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
-    OD_DCT_OVERFLOW_CHECK(t9, 11585, 8192, 146); \
-    t6 += (t9*11585 + 8192) >> 14; \
-    /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
-    OD_DCT_OVERFLOW_CHECK(t6, 19195, 16384, 147); \
-    t9 += (t6*19195 + 16384) >> 15; \
-    t8 += OD_RSHIFT1(t9); \
-    t9 -= t8; \
-    t6 = t7h - t6; \
-    t7 -= t6; \
-    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \
-    OD_DCT_OVERFLOW_CHECK(t7, 6723, 4096, 148); \
-    t8 += (t7*6723 + 4096) >> 13; \
-    /* 16069/16384 ~= Sin[7*Pi/16] ~= 0.980785280403230 */ \
-    OD_DCT_OVERFLOW_CHECK(t8, 16069, 8192, 149); \
-    t7 -= (t8*16069 + 8192) >> 14; \
-    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \
-    OD_DCT_OVERFLOW_CHECK(t7, 6723, 4096, 150); \
-    t8 += (t7*6723 + 4096) >> 13; \
-    /* 17515/32768 ~= Tan[5*Pi/32] ~= 0.534511135950792 */ \
-    OD_DCT_OVERFLOW_CHECK(t6, 17515, 16384, 151); \
-    t9 += (t6*17515 + 16384) >> 15; \
-    /* 13623/16384 ~= Sin[5*Pi/16] ~= 0.831469612302545 */ \
-    OD_DCT_OVERFLOW_CHECK(t9, 13623, 8192, 152); \
-    t6 -= (t9*13623 + 8192) >> 14; \
-    /* 17515/32768 ~= Tan[5*Pi/32] ~= 0.534511135950792 */ \
-    OD_DCT_OVERFLOW_CHECK(t6, 17515, 16384, 153); \
-    t9 += (t6*17515 + 16384) >> 15; \
-    /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
-    OD_DCT_OVERFLOW_CHECK(ta, 13573, 8192, 154); \
-    t5 += (ta*13573 + 8192) >> 14; \
-    /* 11585/32768 ~= Sin[Pi/4]/2 ~= 0.353553390593274 */ \
-    OD_DCT_OVERFLOW_CHECK(t5, 11585, 16384, 155); \
-    ta -= (t5*11585 + 16384) >> 15; \
-    /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
-    OD_DCT_OVERFLOW_CHECK(ta, 13573, 8192, 156); \
-    t5 += (ta*13573 + 8192) >> 14; \
-    tb += OD_RSHIFT1(t5); \
-    t5 = tb - t5; \
-    ta += t4h; \
-    t4 -= ta; \
-    /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \
-    OD_DCT_OVERFLOW_CHECK(t5, 2485, 4096, 157); \
-    ta += (t5*2485 + 4096) >> 13; \
-    /* 18205/32768 ~= Sin[3*Pi/16] ~= 0.555570233019602 */ \
-    OD_DCT_OVERFLOW_CHECK(ta, 18205, 16384, 158); \
-    t5 -= (ta*18205 + 16384) >> 15; \
-    /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \
-    OD_DCT_OVERFLOW_CHECK(t5, 2485, 4096, 159); \
-    ta += (t5*2485 + 4096) >> 13; \
-    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \
-    OD_DCT_OVERFLOW_CHECK(t4, 6723, 4096, 160); \
-    tb -= (t4*6723 + 4096) >> 13; \
-    /* 16069/16384 ~= Sin[7*Pi/16] ~= 0.980785280403230 */ \
-    OD_DCT_OVERFLOW_CHECK(tb, 16069, 8192, 161); \
-    t4 += (tb*16069 + 8192) >> 14; \
-    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \
-    OD_DCT_OVERFLOW_CHECK(t4, 6723, 4096, 162); \
-    tb -= (t4*6723 + 4096) >> 13; \
-    /* TODO: Can we move this into another operation */ \
-    t5 = -t5; \
-    tc -= tf; \
-    tch = OD_RSHIFT1(tc); \
-    tf += tch; \
-    t3 += t0; \
-    t3h = OD_RSHIFT1(t3); \
-    t0 -= t3h; \
-    td -= t1; \
-    tdh = OD_RSHIFT1(td); \
-    t1 += tdh; \
-    t2 += te; \
-    t2h = OD_RSHIFT1(t2); \
-    te -= t2h; \
-    t8 += t4; \
-    t8h = OD_RSHIFT1(t8); \
-    t4 = t8h - t4; \
-    t7 = tb - t7; \
-    t7h = OD_RSHIFT1(t7); \
-    tb = t7h - tb; \
-    t6 -= ta; \
-    t6h = OD_RSHIFT1(t6); \
-    ta += t6h; \
-    t9 = t5 - t9; \
-    t9h = OD_RSHIFT1(t9); \
-    t5 -= t9h; \
-    t0 -= t7h; \
-    t7 += t0; \
-    tf += t8h; \
-    t8 -= tf; \
-    te -= t6h; \
-    t6 += te; \
-    t1 += t9h; \
-    t9 -= t1; \
-    tb -= tch; \
-    tc += tb; \
-    t4 += t3h; \
-    t3 -= t4; \
-    ta -= tdh; \
-    td += ta; \
-    t5 = t2h - t5; \
-    t2 -= t5; \
-    /* TODO: Can we move these into another operation */ \
-    t8 = -t8; \
-    t9 = -t9; \
-    ta = -ta; \
-    tb = -tb; \
-    tc = -tc; \
-    td = -td; \
-    tf = -tf; \
-    /* 7799/8192 ~= Tan[31*Pi/128] ~= 0.952079146700925 */ \
-    OD_DCT_OVERFLOW_CHECK(tf, 7799, 4096, 163); \
-    t0 -= (tf*7799 + 4096) >> 13; \
-    /* 4091/4096 ~= Sin[31*Pi/64] ~= 0.998795456205172 */ \
-    OD_DCT_OVERFLOW_CHECK(t0, 4091, 2048, 164); \
-    tf += (t0*4091 + 2048) >> 12; \
-    /* 7799/8192 ~= Tan[31*Pi/128] ~= 0.952079146700925 */ \
-    OD_DCT_OVERFLOW_CHECK(tf, 7799, 4096, 165); \
-    t0 -= (tf*7799 + 4096) >> 13; \
-    /* 2417/32768 ~= Tan[3*Pi/128] ~= 0.0737644315224493 */ \
-    OD_DCT_OVERFLOW_CHECK(te, 2417, 16384, 166); \
-    t1 += (te*2417 + 16384) >> 15; \
-    /* 601/4096 ~= Sin[3*Pi/64] ~= 0.146730474455362 */ \
-    OD_DCT_OVERFLOW_CHECK(t1, 601, 2048, 167); \
-    te -= (t1*601 + 2048) >> 12; \
-    /* 2417/32768 ~= Tan[3*Pi/128] ~= 0.0737644315224493 */ \
-    OD_DCT_OVERFLOW_CHECK(te, 2417, 16384, 168); \
-    t1 += (te*2417 + 16384) >> 15; \
-    /* 14525/32768 ~= Tan[17*Pi/128] ~= 0.443269513890864 */ \
-    OD_DCT_OVERFLOW_CHECK(t8, 14525, 16384, 169); \
-    t7 -= (t8*14525 + 16384) >> 15; \
-    /* 3035/4096 ~= Sin[17*Pi/64] ~= 0.740951125354959 */ \
-    OD_DCT_OVERFLOW_CHECK(t7, 3035, 2048, 170); \
-    t8 += (t7*3035 + 2048) >> 12; \
-    /* 7263/16384 ~= Tan[17*Pi/128] ~= 0.443269513890864 */ \
-    OD_DCT_OVERFLOW_CHECK(t8, 7263, 8192, 171); \
-    t7 -= (t8*7263 + 8192) >> 14; \
-    /* 6393/8192 ~= Tan[27*Pi/128] ~= 0.780407659653944 */ \
-    OD_DCT_OVERFLOW_CHECK(td, 6393, 4096, 172); \
-    t2 -= (td*6393 + 4096) >> 13; \
-    /* 3973/4096 ~= Sin[27*Pi/64] ~= 0.970031253194544 */ \
-    OD_DCT_OVERFLOW_CHECK(t2, 3973, 2048, 173); \
-    td += (t2*3973 + 2048) >> 12; \
-    /* 6393/8192 ~= Tan[27*Pi/128] ~= 0.780407659653944 */ \
-    OD_DCT_OVERFLOW_CHECK(td, 6393, 4096, 174); \
-    t2 -= (td*6393 + 4096) >> 13; \
-    /* 9281/16384 ~= Tan[21*Pi/128] ~= 0.566493002730344 */ \
-    OD_DCT_OVERFLOW_CHECK(ta, 9281, 8192, 175); \
-    t5 -= (ta*9281 + 8192) >> 14; \
-    /* 7027/8192 ~= Sin[21*Pi/64] ~= 0.857728610000272 */ \
-    OD_DCT_OVERFLOW_CHECK(t5, 7027, 4096, 176); \
-    ta += (t5*7027 + 4096) >> 13; \
-    /* 9281/16384 ~= Tan[21*Pi/128] ~= 0.566493002730344 */ \
-    OD_DCT_OVERFLOW_CHECK(ta, 9281, 8192, 177); \
-    t5 -= (ta*9281 + 8192) >> 14; \
-    /* 11539/16384 ~= Tan[25*Pi/128] ~= 0.704279460865044 */ \
-    OD_DCT_OVERFLOW_CHECK(tc, 11539, 8192, 178); \
-    t3 -= (tc*11539 + 8192) >> 14; \
-    /* 7713/8192 ~= Sin[25*Pi/64] ~= 0.941544065183021 */ \
-    OD_DCT_OVERFLOW_CHECK(t3, 7713, 4096, 179); \
-    tc += (t3*7713 + 4096) >> 13; \
-    /* 11539/16384 ~= Tan[25*Pi/128] ~= 0.704279460865044 */ \
-    OD_DCT_OVERFLOW_CHECK(tc, 11539, 8192, 180); \
-    t3 -= (tc*11539 + 8192) >> 14; \
-    /* 10375/16384 ~= Tan[23*Pi/128] ~= 0.633243016177569 */ \
-    OD_DCT_OVERFLOW_CHECK(tb, 10375, 8192, 181); \
-    t4 -= (tb*10375 + 8192) >> 14; \
-    /* 7405/8192 ~= Sin[23*Pi/64] ~= 0.903989293123443 */ \
-    OD_DCT_OVERFLOW_CHECK(t4, 7405, 4096, 182); \
-    tb += (t4*7405 + 4096) >> 13; \
-    /* 10375/16384 ~= Tan[23*Pi/128] ~= 0.633243016177569 */ \
-    OD_DCT_OVERFLOW_CHECK(tb, 10375, 8192, 183); \
-    t4 -= (tb*10375 + 8192) >> 14; \
-    /* 8247/16384 ~= Tan[19*Pi/128] ~= 0.503357699799294 */ \
-    OD_DCT_OVERFLOW_CHECK(t9, 8247, 8192, 184); \
-    t6 -= (t9*8247 + 8192) >> 14; \
-    /* 1645/2048 ~= Sin[19*Pi/64] ~= 0.803207531480645 */ \
-    OD_DCT_OVERFLOW_CHECK(t6, 1645, 1024, 185); \
-    t9 += (t6*1645 + 1024) >> 11; \
-    /* 8247/16384 ~= Tan[19*Pi/128] ~= 0.503357699799294 */ \
-    OD_DCT_OVERFLOW_CHECK(t9, 8247, 8192, 186); \
-    t6 -= (t9*8247 + 8192) >> 14; \
-  } \
-  while (0)
-
-#define OD_IDST_16_ASYM_PR(t0, t0h, t8, t4, tc, t2, t2h, ta, t6, te, teh, \
-  t1, t9, t5, td, t3, tb, t7, tf) \
-  /* Embedded 16-point asymmetric Type-IV iDST. */ \
-  do { \
-    int t1h_; \
-    int t3h_; \
-    int t4h; \
-    int t6h; \
-    int t9h_; \
-    int tbh_; \
-    int tch; \
-    /* 8247/16384 ~= Tan[19*Pi/128] ~= 0.503357699799294 */ \
-    t6 += (t9*8247 + 8192) >> 14; \
-    /* 1645/2048 ~= Sin[19*Pi/64] ~= 0.803207531480645 */ \
-    t9 -= (t6*1645 + 1024) >> 11; \
-    /* 8247/16384 ~= Tan[19*Pi/128] ~= 0.503357699799294 */ \
-    t6 += (t9*8247 + 8192) >> 14; \
-    /* 10375/16384 ~= Tan[23*Pi/128] ~= 0.633243016177569 */ \
-    t2 += (td*10375 + 8192) >> 14; \
-    /* 7405/8192 ~= Sin[23*Pi/64] ~= 0.903989293123443 */ \
-    td -= (t2*7405 + 4096) >> 13; \
-    /* 10375/16384 ~= Tan[23*Pi/128] ~= 0.633243016177569 */ \
-    t2 += (td*10375 + 8192) >> 14; \
-    /* 11539/16384 ~= Tan[25*Pi/128] ~= 0.704279460865044 */ \
-    tc += (t3*11539 + 8192) >> 14; \
-    /* 7713/8192 ~= Sin[25*Pi/64] ~= 0.941544065183021 */ \
-    t3 -= (tc*7713 + 4096) >> 13; \
-    /* 11539/16384 ~= Tan[25*Pi/128] ~= 0.704279460865044 */ \
-    tc += (t3*11539 + 8192) >> 14; \
-    /* 9281/16384 ~= Tan[21*Pi/128] ~= 0.566493002730344 */ \
-    ta += (t5*9281 + 8192) >> 14; \
-    /* 7027/8192 ~= Sin[21*Pi/64] ~= 0.857728610000272 */ \
-    t5 -= (ta*7027 + 4096) >> 13; \
-    /* 9281/16384 ~= Tan[21*Pi/128] ~= 0.566493002730344 */ \
-    ta += (t5*9281 + 8192) >> 14; \
-    /* 6393/8192 ~= Tan[27*Pi/128] ~= 0.780407659653944 */ \
-    t4 += (tb*6393 + 4096) >> 13; \
-    /* 3973/4096 ~= Sin[27*Pi/64] ~= 0.970031253194544 */ \
-    tb -= (t4*3973 + 2048) >> 12; \
-    /* 6393/8192 ~= Tan[27*Pi/128] ~= 0.780407659653944 */ \
-    t4 += (tb*6393 + 4096) >> 13; \
-    /* 7263/16384 ~= Tan[17*Pi/128] ~= 0.443269513890864 */ \
-    te += (t1*7263 + 8192) >> 14; \
-    /* 3035/4096 ~= Sin[17*Pi/64] ~= 0.740951125354959 */ \
-    t1 -= (te*3035 + 2048) >> 12; \
-    /* 14525/32768 ~= Tan[17*Pi/128] ~= 0.443269513890864 */ \
-    te += (t1*14525 + 16384) >> 15; \
-    /* 2417/32768 ~= Tan[3*Pi/128] ~= 0.0737644315224493 */ \
-    t8 -= (t7*2417 + 16384) >> 15; \
-    /* 601/4096 ~= Sin[3*Pi/64] ~= 0.146730474455362 */ \
-    t7 += (t8*601 + 2048) >> 12; \
-    /* 2417/32768 ~= Tan[3*Pi/128] ~= 0.0737644315224493 */ \
-    t8 -= (t7*2417 + 16384) >> 15; \
-    /* 7799/8192 ~= Tan[31*Pi/128] ~= 0.952079146700925 */ \
-    t0 += (tf*7799 + 4096) >> 13; \
-    /* 4091/4096 ~= Sin[31*Pi/64] ~= 0.998795456205172 */ \
-    tf -= (t0*4091 + 2048) >> 12; \
-    /* 7799/8192 ~= Tan[31*Pi/128] ~= 0.952079146700925 */ \
-    t0 += (tf*7799 + 4096) >> 13; \
-    /* TODO: Can we move these into another operation */ \
-    t1 = -t1; \
-    t3 = -t3; \
-    t5 = -t5; \
-    t9 = -t9; \
-    tb = -tb; \
-    td = -td; \
-    tf = -tf; \
-    t4 += ta; \
-    t4h = OD_RSHIFT1(t4); \
-    ta = t4h - ta; \
-    tb -= t5; \
-    tbh_ = OD_RSHIFT1(tb); \
-    t5 += tbh_; \
-    tc += t2; \
-    tch = OD_RSHIFT1(tc); \
-    t2 -= tch; \
-    t3 -= td; \
-    t3h_ = OD_RSHIFT1(t3); \
-    td += t3h_; \
-    t9 += t8; \
-    t9h_ = OD_RSHIFT1(t9); \
-    t8 -= t9h_; \
-    t6 -= t7; \
-    t6h = OD_RSHIFT1(t6); \
-    t7 += t6h; \
-    t1 += tf; \
-    t1h_ = OD_RSHIFT1(t1); \
-    tf -= t1h_; \
-    te -= t0; \
-    teh = OD_RSHIFT1(te); \
-    t0 += teh; \
-    ta += t9h_; \
-    t9 = ta - t9; \
-    t5 -= t6h; \
-    t6 += t5; \
-    td = teh - td; \
-    te = td - te; \
-    t2 = t1h_ - t2; \
-    t1 -= t2; \
-    t7 += t4h; \
-    t4 -= t7; \
-    t8 -= tbh_; \
-    tb += t8; \
-    t0 += tch; \
-    tc -= t0; \
-    tf -= t3h_; \
-    t3 += tf; \
-    /* TODO: Can we move this into another operation */ \
-    ta = -ta; \
-    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \
-    td += (t2*6723 + 4096) >> 13; \
-    /* 16069/16384 ~= Sin[7*Pi/16] ~= 0.980785280403230 */ \
-    t2 -= (td*16069 + 8192) >> 14; \
-    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \
-    td += (t2*6723 + 4096) >> 13; \
-    /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \
-    t5 -= (ta*2485 + 4096) >> 13; \
-    /* 18205/32768 ~= Sin[3*Pi/16] ~= 0.555570233019602 */ \
-    ta += (t5*18205 + 16384) >> 15; \
-    /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \
-    t5 -= (ta*2485 + 4096) >> 13; \
-    t2 += t5; \
-    t2h = OD_RSHIFT1(t2); \
-    t5 -= t2h; \
-    ta = td - ta; \
-    td -= OD_RSHIFT1(ta); \
-    /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
-    ta -= (t5*13573 + 8192) >> 14; \
-    /* 11585/32768 ~= Sin[Pi/4]/2 ~= 0.353553390593274 */ \
-    t5 += (ta*11585 + 16384) >> 15; \
-    /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
-    ta -= (t5*13573 + 8192) >> 14; \
-    /* 17515/32768 ~= Tan[5*Pi/32] ~= 0.534511135950792 */ \
-    t9 -= (t6*17515 + 16384) >> 15; \
-    /* 13623/16384 ~= Sin[5*Pi/16] ~= 0.831469612302545 */ \
-    t6 += (t9*13623 + 8192) >> 14; \
-    /* 17515/32768 ~= Tan[5*Pi/32] ~= 0.534511135950792 */ \
-    t9 -= (t6*17515 + 16384) >> 15; \
-    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \
-    t1 -= (te*6723 + 4096) >> 13; \
-    /* 16069/16384 ~= Sin[7*Pi/16] ~= 0.980785280403230 */ \
-    te += (t1*16069 + 8192) >> 14; \
-    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \
-    t1 -= (te*6723 + 4096) >> 13; \
-    te += t6; \
-    teh = OD_RSHIFT1(te); \
-    t6 = teh - t6; \
-    t9 += t1; \
-    t1 -= OD_RSHIFT1(t9); \
-    /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
-    t9 -= (t6*19195 + 16384) >> 15; \
-    /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
-    t6 -= (t9*11585 + 8192) >> 14; \
-    /* 7489/8192 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
-    t9 += (t6*7489 + 4096) >> 13; \
-    tb = tc - tb; \
-    tc = OD_RSHIFT1(tb) - tc; \
-    t3 += t4; \
-    t4 = OD_RSHIFT1(t3) - t4; \
-    /* TODO: Can we move this into another operation */ \
-    t3 = -t3; \
-    t8 += tf; \
-    tf = OD_RSHIFT1(t8) - tf; \
-    t0 += t7; \
-    t0h = OD_RSHIFT1(t0); \
-    t7 = t0h - t7; \
-    /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
-    t3 += (tc*4161 + 8192) >> 14; \
-    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
-    tc -= (t3*15137 + 8192) >> 14; \
-    /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
-    t3 += (tc*14341 + 8192) >> 14; \
-    /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
-    t4 -= (tb*14341 + 8192) >> 14; \
-    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
-    tb += (t4*15137 + 8192) >> 14; \
-    /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
-    t4 -= (tb*4161 + 8192) >> 14; \
-    /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
-    t8 += (t7*13573 + 8192) >> 14; \
-    /* 11585/32768 ~= Sin[Pi/4]/2 ~= 0.353553390593274 */ \
-    t7 -= (t8*11585 + 16384) >> 15; \
-    /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
-    t8 += (t7*13573 + 8192) >> 14; \
-    /* TODO: Can we move these into another operation */ \
-    t1 = -t1; \
-    t5 = -t5; \
-    t9 = -t9; \
-    tb = -tb; \
-    td = -td; \
-  } \
-  while (0)
-
-#define OD_FDCT_32_PR(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, t6, tm, \
-  te, tu, t1, th, t9, tp, t5, tl, td, tt, t3, tj, tb, tr, t7, tn, tf, tv) \
-  /* Embedded 32-point orthonormal Type-II fDCT. */ \
-  do { \
-    int tgh; \
-    int thh; \
-    int tih; \
-    int tkh; \
-    int tmh; \
-    int tnh; \
-    int toh; \
-    int tqh; \
-    int tsh; \
-    int tuh; \
-    int tvh; \
-    tv = t0 - tv; \
-    tvh = OD_RSHIFT1(tv); \
-    t0 -= tvh; \
-    tu += t1; \
-    tuh = OD_RSHIFT1(tu); \
-    t1 = tuh - t1; \
-    tt = t2 - tt; \
-    t2 -= OD_RSHIFT1(tt); \
-    ts += t3; \
-    tsh = OD_RSHIFT1(ts); \
-    t3 = tsh - t3; \
-    tr = t4 - tr; \
-    t4 -= OD_RSHIFT1(tr); \
-    tq += t5; \
-    tqh = OD_RSHIFT1(tq); \
-    t5 = tqh - t5; \
-    tp = t6 - tp; \
-    t6 -= OD_RSHIFT1(tp); \
-    to += t7; \
-    toh = OD_RSHIFT1(to); \
-    t7 = toh - t7; \
-    tn = t8 - tn; \
-    tnh = OD_RSHIFT1(tn); \
-    t8 -= tnh; \
-    tm += t9; \
-    tmh = OD_RSHIFT1(tm); \
-    t9 = tmh - t9; \
-    tl = ta - tl; \
-    ta -= OD_RSHIFT1(tl); \
-    tk += tb; \
-    tkh = OD_RSHIFT1(tk); \
-    tb = tkh - tb; \
-    tj = tc - tj; \
-    tc -= OD_RSHIFT1(tj); \
-    ti += td; \
-    tih = OD_RSHIFT1(ti); \
-    td = tih - td; \
-    th = te - th; \
-    thh = OD_RSHIFT1(th); \
-    te -= thh; \
-    tg += tf; \
-    tgh = OD_RSHIFT1(tg); \
-    tf = tgh - tf; \
-    OD_FDCT_16_ASYM_PR(t0, tg, tgh, t8, to, toh, t4, tk, tkh, tc, ts, tsh, \
-     t2, ti, tih, ta, tq, tqh, t6, tm, tmh, te, tu, tuh); \
-    OD_FDST_16_ASYM_PR(tv, tvh, tf, tn, tnh, t7, tr, tb, tj, t3, \
-     tt, td, tl, t5, tp, t9, th, thh, t1); \
-  } \
-  while (0)
-
-#define OD_IDCT_32_PR(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, t6, tm, \
-  te, tu, t1, th, t9, tp, t5, tl, td, tt, t3, tj, tb, tr, t7, tn, tf, tv) \
-  /* Embedded 32-point orthonormal Type-II iDCT. */ \
-  do { \
-    int t1h; \
-    int t3h; \
-    int t5h; \
-    int t7h; \
-    int t9h; \
-    int tbh; \
-    int tdh; \
-    int tfh; \
-    int thh; \
-    int tth; \
-    int tvh; \
-    OD_IDST_16_ASYM_PR(tv, tvh, tn, tr, tj, tt, tth, tl, tp, th, thh, \
-     tu, tm, tq, ti, ts, tk, to, tg); \
-    OD_IDCT_16_ASYM_PR(t0, t8, t4, tc, t2, ta, t6, te, \
-     t1, t1h, t9, t9h, t5, t5h, td, tdh, t3, t3h, tb, tbh, t7, t7h, tf, tfh); \
-    tu = t1h - tu; \
-    t1 -= tu; \
-    te += thh; \
-    th = te - th; \
-    tm = t9h - tm; \
-    t9 -= tm; \
-    t6 += OD_RSHIFT1(tp); \
-    tp = t6 - tp; \
-    tq = t5h - tq; \
-    t5 -= tq; \
-    ta += OD_RSHIFT1(tl); \
-    tl = ta - tl; \
-    ti = tdh - ti; \
-    td -= ti; \
-    t2 += tth; \
-    tt = t2 - tt; \
-    ts = t3h - ts; \
-    t3 -= ts; \
-    tc += OD_RSHIFT1(tj); \
-    tj = tc - tj; \
-    tk = tbh - tk; \
-    tb -= tk; \
-    t4 += OD_RSHIFT1(tr); \
-    tr = t4 - tr; \
-    to = t7h - to; \
-    t7 -= to; \
-    t8 += OD_RSHIFT1(tn); \
-    tn = t8 - tn; \
-    tg = tfh - tg; \
-    tf -= tg; \
-    t0 += tvh; \
-    tv = t0 - tv; \
-  } \
-  while (0)
-
-/* Embedded 32-point orthonormal Type-IV fDST. */
-#define OD_FDST_32_PR(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, ta, tb, tc, td, \
-  te, tf, tg, th, ti, tj, tk, tl, tm, tn, to, tp, tq, tr, ts, tt, tu, tv) \
-  /* 117 "muls", 117 + 128 = 245 adds, 36 shifts */ \
-  do { \
-    od_coeff t0h; \
-    od_coeff t1h; \
-    od_coeff t2h; \
-    od_coeff t3h; \
-    od_coeff t4h; \
-    od_coeff t6h; \
-    od_coeff t8h; \
-    od_coeff t9h; \
-    od_coeff tah; \
-    od_coeff tbh; \
-    od_coeff tch; \
-    od_coeff tdh; \
-    od_coeff teh; \
-    od_coeff tfh; \
-    od_coeff tgh; \
-    od_coeff thh; \
-    od_coeff tih; \
-    od_coeff tjh; \
-    od_coeff tkh; \
-    od_coeff tlh; \
-    od_coeff tmh; \
-    od_coeff tnh; \
-    od_coeff tph; \
-    od_coeff trh; \
-    od_coeff tsh; \
-    od_coeff tth; \
-    od_coeff tuh; \
-    od_coeff tvh; \
-    /* Stage 0 */ \
-    tp += (t6*659 + 2048) >> 12; \
-    t6 -= (tp*10279 + 16384) >> 15; \
-    tp += (t6*659 + 2048) >> 12; \
-    th += (te*3045 + 4096) >> 13; \
-    te -= (th*21403 + 16384) >> 15; \
-    th += (te*3045 + 4096) >> 13; \
-    t9 += (tm*20191 + 16384) >> 15; \
-    tm -= (t9*29269 + 16384) >> 15; \
-    t9 += (tm*20191 + 16384) >> 15; \
-    tu += (t1*1207 + 16384) >> 15; \
-    t1 -= (tu*2411 + 16384) >> 15; \
-    tu += (t1*1207 + 16384) >> 15; \
-    t4 += (tr*13113 + 8192) >> 14; \
-    tr -= (t4*7993 + 4096) >> 13; \
-    t4 += (tr*13113 + 8192) >> 14; \
-    tj += (tc*10381 + 16384) >> 15; \
-    tc -= (tj*4717 + 4096) >> 13; \
-    tj += (tc*10381 + 16384) >> 15; \
-    tb += (tk*18035 + 16384) >> 15; \
-    tk -= (tb*6921 + 4096) >> 13; \
-    tb += (tk*18035 + 16384) >> 15; \
-    ts += (t3*1411 + 8192) >> 14; \
-    t3 -= (ts*2801 + 8192) >> 14; \
-    ts += (t3*1411 + 8192) >> 14; \
-    tq += (t5*2225 + 8192) >> 14; \
-    t5 -= (tq*2185 + 4096) >> 13; \
-    tq += (t5*2225 + 8192) >> 14; \
-    ti += (td*11273 + 16384) >> 15; \
-    td -= (ti*315 + 256) >> 9; \
-    ti += (td*11273 + 16384) >> 15; \
-    tl += (ta*8637 + 16384) >> 15; \
-    ta -= (tl*16151 + 16384) >> 15; \
-    tl += (ta*8637 + 16384) >> 15; \
-    tt += (t2*2013 + 16384) >> 15; \
-    t2 -= (tt*4011 + 16384) >> 15; \
-    tt += (t2*2013 + 16384) >> 15; \
-    to += (t7*6101 + 16384) >> 15; \
-    t7 -= (to*11793 + 16384) >> 15; \
-    to += (t7*6101 + 16384) >> 15; \
-    t8 += (tn*10659 + 8192) >> 14; \
-    tn -= (t8*29957 + 16384) >> 15; \
-    t8 += (tn*10659 + 8192) >> 14; \
-    tg += (tf*819 + 1024) >> 11; \
-    tf -= (tg*22595 + 16384) >> 15; \
-    tg += (tf*819 + 1024) >> 11; \
-    t0 += (tv*31973 + 16384) >> 15; \
-    tv -= (t0*16379 + 8192) >> 14; \
-    t0 += (tv*31973 + 16384) >> 15; \
-    /* Stage 1 */ \
-    tj -= ts; \
-    tjh = OD_RSHIFT1(tj); \
-    ts += tjh; \
-    tr = tk - tr; \
-    trh = OD_RSHIFT1(tr); \
-    tk = trh - tk; \
-    tc += t3; \
-    tch = OD_RSHIFT1(tc); \
-    t3 -= tch; \
-    t4 += tb; \
-    t4h = OD_RSHIFT1(t4); \
-    tb -= t4h; \
-    tv += tf; \
-    tvh = OD_RSHIFT1(tv); \
-    tf -= tvh; \
-    t8 -= to; \
-    t8h = OD_RSHIFT1(t8); \
-    to += t8h; \
-    t0 += tg; \
-    t0h = OD_RSHIFT1(t0); \
-    tg -= t0h; \
-    tn = t7 - tn; \
-    tnh = OD_RSHIFT1(tn); \
-    t7 -= tnh; \
-    th -= tu; \
-    thh = OD_RSHIFT1(th); \
-    tu += thh; \
-    t6 += tm; \
-    t6h = OD_RSHIFT1(t6); \
-    tm = t6h - tm; \
-    te += t1; \
-    teh = OD_RSHIFT1(te); \
-    t1 -= teh; \
-    tp += t9; \
-    tph = OD_RSHIFT1(tp); \
-    t9 -= tph; \
-    t2 -= td; \
-    t2h = OD_RSHIFT1(t2); \
-    td += t2h; \
-    tl = tq - tl; \
-    tlh = OD_RSHIFT1(tl); \
-    tq -= tlh; \
-    tt += ti; \
-    tth = OD_RSHIFT1(tt); \
-    ti -= tth; \
-    ta += t5; \
-    tah = OD_RSHIFT1(ta); \
-    t5 -= tah; \
-    /* Stage 2 */ \
-    tm -= thh; \
-    th += tm; \
-    t9 = teh - t9; \
-    te -= t9; \
-    td = tlh - td; \
-    tl -= td; \
-    ti += tah; \
-    ta -= ti; \
-    tk = tjh - tk; \
-    tj -= tk; \
-    tb -= tch; \
-    tc += tb; \
-    tg += tnh; \
-    tn = tg - tn; \
-    tf += t8h; \
-    t8 = tf - t8; \
-    t3 -= trh; \
-    tr += t3; \
-    ts += t4h; \
-    t4 -= ts; \
-    to -= t0h; \
-    t0 += to; \
-    t7 = tvh - t7; \
-    tv = t7 - tv; \
-    t1 -= t6h; \
-    t6 += t1; \
-    tu += tph; \
-    tp -= tu; \
-    tq -= tth; \
-    tt += tq; \
-    t5 += t2h; \
-    t2 -= t5; \
-    /* Stage 3 */ \
-    tj += (tc*11725 + 16384) >> 15; \
-    tc -= (tj*5197 + 4096) >> 13; \
-    tj += (tc*11725 + 16384) >> 15; \
-    td += (ti*513 + 1024) >> 11; \
-    ti -= (td*15447 + 16384) >> 15; \
-    td += (ti*513 + 1024) >> 11; \
-    th += (te*4861 + 16384) >> 15; \
-    te -= (th*1189 + 2048) >> 12; \
-    th += (te*4861 + 16384) >> 15; \
-    tg += (tf*805 + 8192) >> 14; \
-    tf -= (tg*803 + 4096) >> 13; \
-    tg += (tf*805 + 8192) >> 14; \
-    tb += (tk*7749 + 8192) >> 14; \
-    tk -= (tb*12665 + 8192) >> 14; \
-    tb += (tk*7749 + 8192) >> 14; \
-    tl += (ta*2455 + 2048) >> 12; \
-    ta -= (tl*28899 + 16384) >> 15; \
-    tl += (ta*2455 + 2048) >> 12; \
-    t9 += (tm*12151 + 8192) >> 14; \
-    tm -= (t9*31357 + 16384) >> 15; \
-    t9 += (tm*12151 + 8192) >> 14; \
-    tn += (t8*29699 + 16384) >> 15; \
-    t8 -= (tn*16305 + 8192) >> 14; \
-    tn += (t8*29699 + 16384) >> 15; \
-    /* Stage 4 */ \
-    tf -= tc; \
-    tfh = OD_RSHIFT1(tf); \
-    tc += tfh; \
-    ti = th - ti; \
-    tih = OD_RSHIFT1(ti); \
-    th -= tih; \
-    tg += tj; \
-    tgh = OD_RSHIFT1(tg); \
-    tj = tgh - tj; \
-    td -= te; \
-    tdh = OD_RSHIFT1(td); \
-    te += tdh; \
-    tm = ta - tm; \
-    tmh = OD_RSHIFT1(tm); \
-    ta = tmh - ta; \
-    t9 += tl; \
-    t9h = OD_RSHIFT1(t9); \
-    tl -= t9h; \
-    tb += t8; \
-    tbh = OD_RSHIFT1(tb); \
-    t8 -= tbh; \
-    tk += tn; \
-    tkh = OD_RSHIFT1(tk); \
-    tn -= tkh; \
-    t1 -= t2; \
-    t1h = OD_RSHIFT1(t1); \
-    t2 += t1h; \
-    t3 += tv; \
-    t3h = OD_RSHIFT1(t3); \
-    tv -= t3h; \
-    tu += tt; \
-    tuh = OD_RSHIFT1(tu); \
-    tt -= tuh; \
-    ts -= t0; \
-    tsh = OD_RSHIFT1(ts); \
-    t0 += tsh; \
-    tq = t6 - tq; \
-    t6 -= OD_RSHIFT1(tq); \
-    to += tr; \
-    tr = OD_RSHIFT1(to) - tr; \
-    t7 = t4 - t7; \
-    t4 -= OD_RSHIFT1(t7); \
-    t5 -= tp; \
-    tp += OD_RSHIFT1(t5); \
-    /* Stage 5 */ \
-    tp += (t6*2485 + 4096) >> 13; \
-    t6 -= (tp*18205 + 16384) >> 15; \
-    tp += (t6*2485 + 4096) >> 13; \
-    to += (t7*3227 + 16384) >> 15; \
-    t7 -= (to*6393 + 16384) >> 15; \
-    to += (t7*3227 + 16384) >> 15; \
-    tq += (t5*17515 + 16384) >> 15; \
-    t5 -= (tq*13623 + 8192) >> 14; \
-    tq += (t5*17515 + 16384) >> 15; \
-    t4 += (tr*6723 + 4096) >> 13; \
-    tr -= (t4*16069 + 8192) >> 14; \
-    t4 += (tr*6723 + 4096) >> 13; \
-    /* Stage 6 */ \
-    tj += tdh; \
-    td -= tj; \
-    tc -= tih; \
-    ti += tc; \
-    th = tgh - th; \
-    tg -= th; \
-    te += tfh; \
-    tf -= te; \
-    tl = tkh - tl; \
-    tk -= tl; \
-    ta += tbh; \
-    tb -= ta; \
-    tn -= tmh; \
-    tm += tn; \
-    t8 += t9h; \
-    t9 = t8 - t9; \
-    tt = t3h - tt; \
-    t3 -= tt; \
-    t2 -= tsh; \
-    ts += t2; \
-    tv -= t1h; \
-    t1 += tv; \
-    t0 += tuh; \
-    tu -= t0; \
-    tp = OD_RSHIFT1(to) - tp; \
-    to -= tp; \
-    t6 += OD_RSHIFT1(t7); \
-    t7 -= t6; \
-    t4 = OD_RSHIFT1(tq) - t4; \
-    tq -= t4; \
-    tr += OD_RSHIFT1(t5); \
-    t5 = tr - t5; \
-    /* Stage 7 */ \
-    td += (ti*21895 + 16384) >> 15; \
-    ti -= (td*15137 + 8192) >> 14; \
-    td += (ti*21895 + 16384) >> 15; \
-    tj += (tc*21895 + 16384) >> 15; \
-    tc -= (tj*15137 + 8192) >> 14; \
-    tj += (tc*21895 + 16384) >> 15; \
-    th += (te*13573 + 16384) >> 15; \
-    te -= (th*11585 + 8192) >> 14; \
-    th += (te*13573 + 16384) >> 15; \
-    tb += (tk*21895 + 16384) >> 15; \
-    tk -= (tb*15137 + 8192) >> 14; \
-    tb += (tk*21895 + 16384) >> 15; \
-    ta += (tl*3259 + 8192) >> 14; \
-    tl -= (ta*3135 + 4096) >> 13; \
-    ta += (tl*3259 + 8192) >> 14; \
-    t9 += (tm*13573 + 16384) >> 15; \
-    tm -= (t9*11585 + 8192) >> 14; \
-    t9 += (tm*13573 + 16384) >> 15; \
-    ts += (t3*3259 + 8192) >> 14; \
-    t3 -= (ts*3135 + 4096) >> 13; \
-    ts += (t3*3259 + 8192) >> 14; \
-    t2 += (tt*3259 + 8192) >> 14; \
-    tt -= (t2*3135 + 4096) >> 13; \
-    t2 += (tt*3259 + 8192) >> 14; \
-    tu += (t1*13573 + 16384) >> 15; \
-    t1 -= (tu*11585 + 8192) >> 14; \
-    tu += (t1*13573 + 16384) >> 15; \
-    tp += (t6*13573 + 16384) >> 15; \
-    t6 -= (tp*11585 + 8192) >> 14; \
-    tp += (t6*13573 + 16384) >> 15; \
-    tq += (t5*13573 + 16384) >> 15; \
-    t5 -= (tq*11585 + 8192) >> 14; \
-    tq += (t5*13573 + 16384) >> 15; \
-  } \
-  while (0)
-
-/* Embedded 32-point orthonormal Type-IV iDST. */
-#define OD_IDST_32_PR(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, t6, tm, \
-  te, tu, t1, th, t9, tp, t5, tl, td, tt, t3, tj, tb, tr, t7, tn, tf, tv) \
-  /* 117 "muls", 117 + 128 = 245 adds, 36 shifts */ \
-  do { \
-    od_coeff t0h; \
-    od_coeff t1h; \
-    od_coeff t2h; \
-    od_coeff t3h; \
-    od_coeff t4h; \
-    od_coeff t6h; \
-    od_coeff t8h; \
-    od_coeff t9h; \
-    od_coeff tah; \
-    od_coeff tbh; \
-    od_coeff tch; \
-    od_coeff tdh; \
-    od_coeff teh; \
-    od_coeff tfh; \
-    od_coeff tgh; \
-    od_coeff thh; \
-    od_coeff tih; \
-    od_coeff tjh; \
-    od_coeff tkh; \
-    od_coeff tlh; \
-    od_coeff tmh; \
-    od_coeff tnh; \
-    od_coeff tph; \
-    od_coeff trh; \
-    od_coeff tsh; \
-    od_coeff tth; \
-    od_coeff tuh; \
-    od_coeff tvh; \
-    /* Stage 0 */ \
-    tq -= (t5*13573 + 16384) >> 15; \
-    t5 += (tq*11585 + 8192) >> 14; \
-    tq -= (t5*13573 + 16384) >> 15; \
-    tp -= (t6*13573 + 16384) >> 15; \
-    t6 += (tp*11585 + 8192) >> 14; \
-    tp -= (t6*13573 + 16384) >> 15; \
-    tu -= (t1*13573 + 16384) >> 15; \
-    t1 += (tu*11585 + 8192) >> 14; \
-    tu -= (t1*13573 + 16384) >> 15; \
-    t2 -= (tt*3259 + 8192) >> 14; \
-    tt += (t2*3135 + 4096) >> 13; \
-    t2 -= (tt*3259 + 8192) >> 14; \
-    ts -= (t3*3259 + 8192) >> 14; \
-    t3 += (ts*3135 + 4096) >> 13; \
-    ts -= (t3*3259 + 8192) >> 14; \
-    t9 -= (tm*13573 + 16384) >> 15; \
-    tm += (t9*11585 + 8192) >> 14; \
-    t9 -= (tm*13573 + 16384) >> 15; \
-    ta -= (tl*3259 + 8192) >> 14; \
-    tl += (ta*3135 + 4096) >> 13; \
-    ta -= (tl*3259 + 8192) >> 14; \
-    tb -= (tk*21895 + 16384) >> 15; \
-    tk += (tb*15137 + 8192) >> 14; \
-    tb -= (tk*21895 + 16384) >> 15; \
-    th -= (te*13573 + 16384) >> 15; \
-    te += (th*11585 + 8192) >> 14; \
-    th -= (te*13573 + 16384) >> 15; \
-    tj -= (tc*21895 + 16384) >> 15; \
-    tc += (tj*15137 + 8192) >> 14; \
-    tj -= (tc*21895 + 16384) >> 15; \
-    td -= (ti*21895 + 16384) >> 15; \
-    ti += (td*15137 + 8192) >> 14; \
-    td -= (ti*21895 + 16384) >> 15; \
-    /* Stage 1 */ \
-    t5 = tr - t5; \
-    tr -= OD_RSHIFT1(t5); \
-    tq += t4; \
-    t4 = OD_RSHIFT1(tq) - t4; \
-    t7 += t6; \
-    t6 -= OD_RSHIFT1(t7); \
-    to += tp; \
-    tp = OD_RSHIFT1(to) - tp; \
-    tu += t0; \
-    tuh = OD_RSHIFT1(tu); \
-    t0 -= tuh; \
-    t1 -= tv; \
-    t1h = OD_RSHIFT1(t1); \
-    tv += t1h; \
-    ts -= t2; \
-    tsh = OD_RSHIFT1(ts); \
-    t2 += tsh; \
-    t3 += tt; \
-    t3h = OD_RSHIFT1(t3); \
-    tt = t3h - tt; \
-    t9 = t8 - t9; \
-    t9h = OD_RSHIFT1(t9); \
-    t8 -= t9h; \
-    tm -= tn; \
-    tmh = OD_RSHIFT1(tm); \
-    tn += tmh; \
-    tb += ta; \
-    tbh = OD_RSHIFT1(tb); \
-    ta -= tbh; \
-    tk += tl; \
-    tkh = OD_RSHIFT1(tk); \
-    tl = tkh - tl; \
-    tf += te; \
-    tfh = OD_RSHIFT1(tf); \
-    te -= tfh; \
-    tg += th; \
-    tgh = OD_RSHIFT1(tg); \
-    th = tgh - th; \
-    ti -= tc; \
-    tih = OD_RSHIFT1(ti); \
-    tc += tih; \
-    td += tj; \
-    tdh = OD_RSHIFT1(td); \
-    tj -= tdh; \
-    /* Stage 2 */ \
-    t4 -= (tr*6723 + 4096) >> 13; \
-    tr += (t4*16069 + 8192) >> 14; \
-    t4 -= (tr*6723 + 4096) >> 13; \
-    tq -= (t5*17515 + 16384) >> 15; \
-    t5 += (tq*13623 + 8192) >> 14; \
-    tq -= (t5*17515 + 16384) >> 15; \
-    to -= (t7*3227 + 16384) >> 15; \
-    t7 += (to*6393 + 16384) >> 15; \
-    to -= (t7*3227 + 16384) >> 15; \
-    tp -= (t6*2485 + 4096) >> 13; \
-    t6 += (tp*18205 + 16384) >> 15; \
-    tp -= (t6*2485 + 4096) >> 13; \
-    /* Stage 3 */ \
-    tp -= OD_RSHIFT1(t5); \
-    t5 += tp; \
-    t4 += OD_RSHIFT1(t7); \
-    t7 = t4 - t7; \
-    tr = OD_RSHIFT1(to) - tr; \
-    to -= tr; \
-    t6 += OD_RSHIFT1(tq); \
-    tq = t6 - tq; \
-    t0 -= tsh; \
-    ts += t0; \
-    tt += tuh; \
-    tu -= tt; \
-    tv += t3h; \
-    t3 -= tv; \
-    t2 -= t1h; \
-    t1 += t2; \
-    tn += tkh; \
-    tk -= tn; \
-    t8 += tbh; \
-    tb -= t8; \
-    tl += t9h; \
-    t9 -= tl; \
-    ta = tmh - ta; \
-    tm = ta - tm; \
-    te -= tdh; \
-    td += te; \
-    tj = tgh - tj; \
-    tg -= tj; \
-    th += tih; \
-    ti = th - ti; \
-    tc -= tfh; \
-    tf += tc; \
-    /* Stage 4 */ \
-    tn -= (t8*29699 + 16384) >> 15; \
-    t8 += (tn*16305 + 8192) >> 14; \
-    tn -= (t8*29699 + 16384) >> 15; \
-    t9 -= (tm*12151 + 8192) >> 14; \
-    tm += (t9*31357 + 16384) >> 15; \
-    t9 -= (tm*12151 + 8192) >> 14; \
-    tl -= (ta*2455 + 2048) >> 12; \
-    ta += (tl*28899 + 16384) >> 15; \
-    tl -= (ta*2455 + 2048) >> 12; \
-    tb -= (tk*7749 + 8192) >> 14; \
-    tk += (tb*12665 + 8192) >> 14; \
-    tb -= (tk*7749 + 8192) >> 14; \
-    tg -= (tf*805 + 8192) >> 14; \
-    tf += (tg*803 + 4096) >> 13; \
-    tg -= (tf*805 + 8192) >> 14; \
-    th -= (te*4861 + 16384) >> 15; \
-    te += (th*1189 + 2048) >> 12; \
-    th -= (te*4861 + 16384) >> 15; \
-    td -= (ti*513 + 1024) >> 11; \
-    ti += (td*15447 + 16384) >> 15; \
-    td -= (ti*513 + 1024) >> 11; \
-    tj -= (tc*11725 + 16384) >> 15; \
-    tc += (tj*5197 + 4096) >> 13; \
-    tj -= (tc*11725 + 16384) >> 15; \
-    /* Stage 5 */ \
-    t2 += t5; \
-    t2h = OD_RSHIFT1(t2); \
-    t5 -= t2h; \
-    tt -= tq; \
-    tth = OD_RSHIFT1(tt); \
-    tq += tth; \
-    tp += tu; \
-    tph = OD_RSHIFT1(tp); \
-    tu -= tph; \
-    t6 -= t1; \
-    t6h = OD_RSHIFT1(t6); \
-    t1 += t6h; \
-    tv = t7 - tv; \
-    tvh = OD_RSHIFT1(tv); \
-    t7 = tvh - t7; \
-    t0 -= to; \
-    t0h = OD_RSHIFT1(t0); \
-    to += t0h; \
-    t4 += ts; \
-    t4h = OD_RSHIFT1(t4); \
-    ts -= t4h; \
-    tr -= t3; \
-    trh = OD_RSHIFT1(tr); \
-    t3 += trh; \
-    t8 = tf - t8; \
-    t8h = OD_RSHIFT1(t8); \
-    tf -= t8h; \
-    tn = tg - tn; \
-    tnh = OD_RSHIFT1(tn); \
-    tg -= tnh; \
-    tc -= tb; \
-    tch = OD_RSHIFT1(tc); \
-    tb += tch; \
-    tj += tk; \
-    tjh = OD_RSHIFT1(tj); \
-    tk = tjh - tk; \
-    ta += ti; \
-    tah = OD_RSHIFT1(ta); \
-    ti -= tah; \
-    tl += td; \
-    tlh = OD_RSHIFT1(tl); \
-    td = tlh - td; \
-    te += t9; \
-    teh = OD_RSHIFT1(te); \
-    t9 = teh - t9; \
-    th -= tm; \
-    thh = OD_RSHIFT1(th); \
-    tm += thh; \
-    /* Stage 6 */ \
-    t5 += tah; \
-    ta -= t5; \
-    ti += tth; \
-    tt -= ti; \
-    tq += tlh; \
-    tl = tq - tl; \
-    td -= t2h; \
-    t2 += td; \
-    t9 += tph; \
-    tp -= t9; \
-    t1 += teh; \
-    te -= t1; \
-    tm = t6h - tm; \
-    t6 -= tm; \
-    tu -= thh; \
-    th += tu; \
-    t7 += tnh; \
-    tn = t7 - tn; \
-    tg += t0h; \
-    t0 -= tg; \
-    to -= t8h; \
-    t8 += to; \
-    tf += tvh; \
-    tv -= tf; \
-    tb += t4h; \
-    t4 -= tb; \
-    t3 += tch; \
-    tc -= t3; \
-    tk = trh - tk; \
-    tr = tk - tr; \
-    ts -= tjh; \
-    tj += ts; \
-    /* Stage 7 */ \
-    t0 -= (tv*31973 + 16384) >> 15; \
-    tv += (t0*16379 + 8192) >> 14; \
-    t0 -= (tv*31973 + 16384) >> 15; \
-    tg -= (tf*819 + 1024) >> 11; \
-    tf += (tg*22595 + 16384) >> 15; \
-    tg -= (tf*819 + 1024) >> 11; \
-    t8 -= (tn*10659 + 8192) >> 14; \
-    tn += (t8*29957 + 16384) >> 15; \
-    t8 -= (tn*10659 + 8192) >> 14; \
-    to -= (t7*6101 + 16384) >> 15; \
-    t7 += (to*11793 + 16384) >> 15; \
-    to -= (t7*6101 + 16384) >> 15; \
-    tt -= (t2*2013 + 16384) >> 15; \
-    t2 += (tt*4011 + 16384) >> 15; \
-    tt -= (t2*2013 + 16384) >> 15; \
-    tl -= (ta*8637 + 16384) >> 15; \
-    ta += (tl*16151 + 16384) >> 15; \
-    tl -= (ta*8637 + 16384) >> 15; \
-    ti -= (td*11273 + 16384) >> 15; \
-    td += (ti*315 + 256) >> 9; \
-    ti -= (td*11273 + 16384) >> 15; \
-    tq -= (t5*2225 + 8192) >> 14; \
-    t5 += (tq*2185 + 4096) >> 13; \
-    tq -= (t5*2225 + 8192) >> 14; \
-    ts -= (t3*1411 + 8192) >> 14; \
-    t3 += (ts*2801 + 8192) >> 14; \
-    ts -= (t3*1411 + 8192) >> 14; \
-    tb -= (tk*18035 + 16384) >> 15; \
-    tk += (tb*6921 + 4096) >> 13; \
-    tb -= (tk*18035 + 16384) >> 15; \
-    tj -= (tc*10381 + 16384) >> 15; \
-    tc += (tj*4717 + 4096) >> 13; \
-    tj -= (tc*10381 + 16384) >> 15; \
-    t4 -= (tr*13113 + 8192) >> 14; \
-    tr += (t4*7993 + 4096) >> 13; \
-    t4 -= (tr*13113 + 8192) >> 14; \
-    tu -= (t1*1207 + 16384) >> 15; \
-    t1 += (tu*2411 + 16384) >> 15; \
-    tu -= (t1*1207 + 16384) >> 15; \
-    t9 -= (tm*20191 + 16384) >> 15; \
-    tm += (t9*29269 + 16384) >> 15; \
-    t9 -= (tm*20191 + 16384) >> 15; \
-    th -= (te*3045 + 4096) >> 13; \
-    te += (th*21403 + 16384) >> 15; \
-    th -= (te*3045 + 4096) >> 13; \
-    tp -= (t6*659 + 2048) >> 12; \
-    t6 += (tp*10279 + 16384) >> 15; \
-    tp -= (t6*659 + 2048) >> 12; \
-  } \
-  while (0)
-
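[Editor's sketch, not code from the removed file: the 32-point kernels above are built recursively; OD_FDCT_32_PR splits the input with half-shift butterflies and hands one half to the 16-point asymmetric DCT and the other to the 16-point asymmetric DST, while OD_IDCT_32_PR runs the 16-point inverses first and then undoes the butterflies. The helper names below are illustrative; rshift1() stands in for OD_RSHIFT1(), and any deterministic halving works because the inverse adds back the identical rounded value.]

    #include <stdint.h>

    static int32_t rshift1(int32_t a) { return a >> 1; }

    /* Forward half-shift butterfly, as in OD_FDCT_32_PR above. */
    static void fwd_butterfly(int32_t *a, int32_t *b) {
      *b = *a - *b;          /* difference */
      *a -= rshift1(*b);     /* roughly the average, with no extra scaling */
    }

    /* Inverse, as in OD_IDCT_32_PR above: add back the same rounded half,
       then recompute the second input from the difference. */
    static void inv_butterfly(int32_t *a, int32_t *b) {
      *a += rshift1(*b);
      *b = *a - *b;
    }
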
-#if CONFIG_TX64X64
-#define OD_FDCT_32_ASYM_PR(t0, tg, tgh, t8, to, toh, t4, tk, tkh, tc, ts, tsh, \
-  t2, ti, tih, ta, tq, tqh, t6, tm, tmh, te, tu, tuh, t1, th, thh, \
-  t9, tp, tph, t5, tl, tlh, td, tt, tth, t3, tj, tjh, tb, tr, trh, \
-  t7, tn, tnh, tf, tv, tvh) \
-  /* Embedded 32-point asymmetric Type-II fDCT. */ \
-  do { \
-    t0 += tvh; \
-    tv = t0 - tv; \
-    t1 = tuh - t1; \
-    tu -= t1; \
-    t2 += tth; \
-    tt = t2 - tt; \
-    t3 = tsh - t3; \
-    ts -= t3; \
-    t4 += trh; \
-    tr = t4 - tr; \
-    t5 = tqh - t5; \
-    tq -= t5; \
-    t6 += tph; \
-    tp = t6 - tp; \
-    t7 = toh - t7; \
-    to -= t7; \
-    t8 += tnh; \
-    tn = t8 - tn; \
-    t9 = tmh - t9; \
-    tm -= t9; \
-    ta += tlh; \
-    tl = ta - tl; \
-    tb = tkh - tb; \
-    tk -= tb; \
-    tc += tjh; \
-    tj = tc - tj; \
-    td = tih - td; \
-    ti -= td; \
-    te += thh; \
-    th = te - th; \
-    tf = tgh - tf; \
-    tg -= tf; \
-    OD_FDCT_16_PR(t0, tg, t8, to, t4, tk, tc, ts, \
-     t2, ti, ta, tq, t6, tm, te, tu); \
-    OD_FDST_16_PR(tv, tf, tn, t7, tr, tb, tj, t3, \
-     tt, td, tl, t5, tp, t9, th, t1); \
-  } \
-  while (0)
-
-#define OD_IDCT_32_ASYM_PR(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, \
-  t6, tm, te, tu, t1, t1h, th, thh, t9, t9h, tp, tph, t5, t5h, tl, tlh, \
-  td, tdh, tt, tth, t3, t3h, tj, tjh, tb, tbh, tr, trh, t7, t7h, tn, tnh, \
-  tf, tfh, tv, tvh) \
-  /* Embedded 32-point asymmetric Type-II iDCT. */ \
-  do { \
-    OD_IDST_16_PR(tv, tn, tr, tj, tt, tl, tp, th, \
-     tu, tm, tq, ti, ts, tk, to, tg); \
-    OD_IDCT_16_PR(t0, t8, t4, tc, t2, ta, t6, te, \
-     t1, t9, t5, td, t3, tb, t7, tf); \
-    tv = t0 - tv; \
-    tvh = OD_RSHIFT1(tv); \
-    t0 -= tvh; \
-    t1 += tu; \
-    t1h = OD_RSHIFT1(t1); \
-    tu = t1h - tu; \
-    tt = t2 - tt; \
-    tth = OD_RSHIFT1(tt); \
-    t2 -= tth; \
-    t3 += ts; \
-    t3h = OD_RSHIFT1(t3); \
-    ts = t3h - ts; \
-    tr = t4 - tr; \
-    trh = OD_RSHIFT1(tr); \
-    t4 -= trh; \
-    t5 += tq; \
-    t5h = OD_RSHIFT1(t5); \
-    tq = t5h - tq; \
-    tp = t6 - tp; \
-    tph = OD_RSHIFT1(tp); \
-    t6 -= tph; \
-    t7 += to; \
-    t7h = OD_RSHIFT1(t7); \
-    to = t7h - to; \
-    tn = t8 - tn; \
-    tnh = OD_RSHIFT1(tn); \
-    t8 -= tnh; \
-    t9 += tm; \
-    t9h = OD_RSHIFT1(t9); \
-    tm = t9h - tm; \
-    tl = ta - tl; \
-    tlh = OD_RSHIFT1(tl); \
-    ta -= tlh; \
-    tb += tk; \
-    tbh = OD_RSHIFT1(tb); \
-    tk = tbh - tk; \
-    tj = tc - tj; \
-    tjh = OD_RSHIFT1(tj); \
-    tc -= tjh; \
-    td += ti; \
-    tdh = OD_RSHIFT1(td); \
-    ti = tdh - ti; \
-    th = te - th; \
-    thh = OD_RSHIFT1(th); \
-    te -= thh; \
-    tf += tg; \
-    tfh = OD_RSHIFT1(tf); \
-    tg = tfh - tg; \
-  } \
-  while (0)
-
-#define OD_FDST_32_ASYM_PR(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, t6, \
-  tm, te, tu, t1, th, t9, tp, t5, tl, td, tt, t3, tj, tb, tr, t7, tn, tf, tv) \
-  /* Embedded 32-point asymmetric Type-IV fDST. */ \
-  do { \
-    int t0h; \
-    int t1h; \
-    int t4h; \
-    int t5h; \
-    int tqh; \
-    int trh; \
-    int tuh; \
-    int tvh; \
-    \
-    tu = -tu; \
-    \
-    /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
-    OD_DCT_OVERFLOW_CHECK(tq, 13573, 8192, 271); \
-    t5 -= (tq*13573 + 8192) >> 14; \
-    /* 11585/32768 ~= Sin[Pi/4]/2 ~= 0.353553390593274 */ \
-    OD_DCT_OVERFLOW_CHECK(t5, 11585, 16384, 272); \
-    tq += (t5*11585 + 16384) >> 15; \
-    /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
-    OD_DCT_OVERFLOW_CHECK(tq, 13573, 8192, 273); \
-    t5 -= (tq*13573 + 8192) >> 14; \
-    /* 29957/32768 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
-    OD_DCT_OVERFLOW_CHECK(t6, 29957, 16384, 274); \
-    tp += (t6*29957 + 16384) >> 15; \
-    /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
-    OD_DCT_OVERFLOW_CHECK(tp, 11585, 8192, 275); \
-    t6 -= (tp*11585 + 8192) >> 14; \
-    /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
-    OD_DCT_OVERFLOW_CHECK(t6, 19195, 16384, 276); \
-    tp -= (t6*19195 + 16384) >> 15; \
-    /* 29957/32768 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
-    OD_DCT_OVERFLOW_CHECK(t1, 29957, 16384, 277); \
-    tu += (t1*29957 + 16384) >> 15; \
-    /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
-    OD_DCT_OVERFLOW_CHECK(tu, 11585, 8192, 278); \
-    t1 -= (tu*11585 + 8192) >> 14; \
-    /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
-    OD_DCT_OVERFLOW_CHECK(t1, 19195, 16384, 279); \
-    tu -= (t1*19195 + 16384) >> 15; \
-    /* 28681/32768 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
-    OD_DCT_OVERFLOW_CHECK(t2, 28681, 16384, 280); \
-    tt += (t2*28681 + 16384) >> 15; \
-    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
-    OD_DCT_OVERFLOW_CHECK(tt, 15137, 8192, 281); \
-    t2 -= (tt*15137 + 8192) >> 14; \
-    /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
-    OD_DCT_OVERFLOW_CHECK(t2, 4161, 8192, 282); \
-    tt += (t2*4161 + 8192) >> 14; \
-    /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
-    OD_DCT_OVERFLOW_CHECK(ts, 4161, 8192, 283); \
-    t3 += (ts*4161 + 8192) >> 14; \
-    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
-    OD_DCT_OVERFLOW_CHECK(t3, 15137, 8192, 284); \
-    ts -= (t3*15137 + 8192) >> 14; \
-    /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
-    OD_DCT_OVERFLOW_CHECK(ts, 14341, 8192, 285); \
-    t3 += (ts*14341 + 8192) >> 14; \
-    /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
-    OD_DCT_OVERFLOW_CHECK(tm, 19195, 16384, 286); \
-    t9 -= (tm*19195 + 16384) >> 15; \
-    /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
-    OD_DCT_OVERFLOW_CHECK(t9, 11585, 8192, 287); \
-    tm -= (t9*11585 + 8192) >> 14; \
-    /* 7489/8192 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
-    OD_DCT_OVERFLOW_CHECK(tm, 7489, 4096, 288); \
-    t9 += (tm*7489 + 4096) >> 13; \
-    /* 3259/8192 ~= 2*Tan[Pi/16] ~= 0.397824734759316 */ \
-    OD_DCT_OVERFLOW_CHECK(tl, 3259, 4096, 289); \
-    ta += (tl*3259 + 4096) >> 13; \
-    /* 3135/16384 ~= Sin[Pi/8]/2 ~= 0.1913417161825449 */ \
-    OD_DCT_OVERFLOW_CHECK(ta, 3135, 8192, 290); \
-    tl -= (ta*3135 + 8192) >> 14; \
-    /* 3259/8192 ~= 2*Tan[Pi/16] ~= 0.397824734759316 */ \
-    OD_DCT_OVERFLOW_CHECK(tl, 3259, 4096, 291); \
-    ta += (tl*3259 + 4096) >> 13; \
-    /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
-    OD_DCT_OVERFLOW_CHECK(tk, 4161, 8192, 292); \
-    tb += (tk*4161 + 8192) >> 14; \
-    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
-    OD_DCT_OVERFLOW_CHECK(tb, 15137, 8192, 293); \
-    tk -= (tb*15137 + 8192) >> 14; \
-    /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
-    OD_DCT_OVERFLOW_CHECK(tk, 14341, 8192, 294); \
-    tb += (tk*14341 + 8192) >> 14; \
-    /* 29957/32768 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
-    OD_DCT_OVERFLOW_CHECK(te, 29957, 16384, 295); \
-    th += (te*29957 + 16384) >> 15; \
-    /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
-    OD_DCT_OVERFLOW_CHECK(th, 11585, 8192, 296); \
-    te -= (th*11585 + 8192) >> 14; \
-    /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
-    OD_DCT_OVERFLOW_CHECK(te, 19195, 16384, 297); \
-    th -= (te*19195 + 16384) >> 15; \
-    /* 28681/32768 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
-    OD_DCT_OVERFLOW_CHECK(tc, 28681, 16384, 298); \
-    tj += (tc*28681 + 16384) >> 15; \
-    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
-    OD_DCT_OVERFLOW_CHECK(tj, 15137, 8192, 299); \
-    tc -= (tj*15137 + 8192) >> 14; \
-    /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
-    OD_DCT_OVERFLOW_CHECK(tc, 4161, 8192, 300); \
-    tj += (tc*4161 + 8192) >> 14; \
-    /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
-    OD_DCT_OVERFLOW_CHECK(ti, 4161, 8192, 301); \
-    td += (ti*4161 + 8192) >> 14; \
-    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
-    OD_DCT_OVERFLOW_CHECK(td, 15137, 8192, 302); \
-    ti -= (td*15137 + 8192) >> 14; \
-    /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
-    OD_DCT_OVERFLOW_CHECK(ti, 14341, 8192, 303); \
-    td += (ti*14341 + 8192) >> 14; \
-    \
-    t1 = -t1; \
-    t2 = -t2; \
-    t3 = -t3; \
-    td = -td; \
-    tg = -tg; \
-    to = -to; \
-    ts = -ts; \
-    \
-    tr -= OD_RSHIFT1(t5); \
-    t5 += tr; \
-    tq -= OD_RSHIFT1(t4); /* pass */ \
-    t4 += tq; \
-    t6 -= OD_RSHIFT1(t7); \
-    t7 += t6; \
-    to -= OD_RSHIFT1(tp); /* pass */ \
-    tp += to; \
-    t1 += OD_RSHIFT1(t0); /* pass */ \
-    t0 -= t1; \
-    tv -= OD_RSHIFT1(tu); \
-    tu += tv; \
-    t3 -= OD_RSHIFT1(tt); \
-    tt += t3; \
-    t2 += OD_RSHIFT1(ts); \
-    ts -= t2; \
-    t9 -= OD_RSHIFT1(t8); /* pass */ \
-    t8 += t9; \
-    tn += OD_RSHIFT1(tm); \
-    tm -= tn; \
-    tb += OD_RSHIFT1(ta); \
-    ta -= tb; \
-    tl -= OD_RSHIFT1(tk); \
-    tk += tl; \
-    te -= OD_RSHIFT1(tf); /* pass */ \
-    tf += te; \
-    tg -= OD_RSHIFT1(th); \
-    th += tg; \
-    tc -= OD_RSHIFT1(ti); \
-    ti += tc; \
-    td += OD_RSHIFT1(tj); \
-    tj -= td; \
-    \
-    t4 = -t4; \
-    \
-    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.8206787908286602 */ \
-    OD_DCT_OVERFLOW_CHECK(tr, 6723, 4096, 304); \
-    t4 += (tr*6723 + 4096) >> 13; \
-    /* 16069/16384 ~= Sin[7*Pi/16] ~= 0.9807852804032304 */ \
-    OD_DCT_OVERFLOW_CHECK(t4, 16069, 8192, 305); \
-    tr -= (t4*16069 + 8192) >> 14; \
-    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.8206787908286602 */ \
-    OD_DCT_OVERFLOW_CHECK(tr, 6723, 4096, 306); \
-    t4 += (tr*6723 + 4096) >> 13; \
-    /* 17515/32768 ~= Tan[5*Pi/32] ~= 0.5345111359507916 */ \
-    OD_DCT_OVERFLOW_CHECK(tq, 17515, 16384, 307); \
-    t5 += (tq*17515 + 16384) >> 15; \
-    /* 13623/16384 ~= Sin[5*Pi/16] ~= 0.8314696123025452 */ \
-    OD_DCT_OVERFLOW_CHECK(t5, 13623, 8192, 308); \
-    tq -= (t5*13623 + 8192) >> 14; \
-    /* 17515/32768 ~= Tan[5*Pi/32] ~= 0.5345111359507916 */ \
-    OD_DCT_OVERFLOW_CHECK(tq, 17515, 16384, 309); \
-    t5 += (tq*17515 + 16384) >> 15; \
-    /* 3227/32768 ~= Tan[Pi/32] ~= 0.09849140335716425 */ \
-    OD_DCT_OVERFLOW_CHECK(to, 3227, 16384, 310); \
-    t7 += (to*3227 + 16384) >> 15; \
-    /* 6393/32768 ~= Sin[Pi/16] ~= 0.19509032201612825 */ \
-    OD_DCT_OVERFLOW_CHECK(t7, 6393, 16384, 311); \
-    to -= (t7*6393 + 16384) >> 15; \
-    /* 3227/32768 ~= Tan[Pi/32] ~= 0.09849140335716425 */ \
-    OD_DCT_OVERFLOW_CHECK(to, 3227, 16384, 312); \
-    t7 += (to*3227 + 16384) >> 15; \
-    /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \
-    OD_DCT_OVERFLOW_CHECK(tp, 2485, 4096, 313); \
-    t6 += (tp*2485 + 4096) >> 13; \
-    /* 18205/32768 ~= Sin[3*Pi/16] ~= 0.555570233019602 */ \
-    OD_DCT_OVERFLOW_CHECK(t6, 18205, 16384, 314); \
-    tp -= (t6*18205 + 16384) >> 15; \
-    /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \
-    OD_DCT_OVERFLOW_CHECK(tp, 2485, 4096, 315); \
-    t6 += (tp*2485 + 4096) >> 13; \
-    \
-    t5 = -t5; \
-    \
-    tr += to; \
-    trh = OD_RSHIFT1(tr); \
-    to -= trh; \
-    t4 += t7; \
-    t4h = OD_RSHIFT1(t4); \
-    t7 -= t4h; \
-    t5 += tp; \
-    t5h = OD_RSHIFT1(t5); \
-    tp -= t5h; \
-    tq += t6; \
-    tqh = OD_RSHIFT1(tq); \
-    t6 -= tqh; \
-    t0 -= t3; \
-    t0h = OD_RSHIFT1(t0); \
-    t3 += t0h; \
-    tv -= ts; \
-    tvh = OD_RSHIFT1(tv); \
-    ts += tvh; \
-    tu += tt; \
-    tuh = OD_RSHIFT1(tu); \
-    tt -= tuh; \
-    t1 -= t2; \
-    t1h = OD_RSHIFT1(t1); \
-    t2 += t1h; \
-    t8 += tb; \
-    tb -= OD_RSHIFT1(t8); \
-    tn += tk; \
-    tk -= OD_RSHIFT1(tn); \
-    t9 += tl; \
-    tl -= OD_RSHIFT1(t9); \
-    tm -= ta; \
-    ta += OD_RSHIFT1(tm); \
-    tc -= tf; \
-    tf += OD_RSHIFT1(tc); \
-    tj += tg; \
-    tg -= OD_RSHIFT1(tj); \
-    td -= te; \
-    te += OD_RSHIFT1(td); \
-    ti += th; \
-    th -= OD_RSHIFT1(ti); \
-    \
-    t9 = -t9; \
-    tl = -tl; \
-    \
-    /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \
-    OD_DCT_OVERFLOW_CHECK(tn, 805, 8192, 316); \
-    t8 += (tn*805 + 8192) >> 14; \
-    /* 803/8192 ~= Sin[Pi/32] ~= 0.0980171403295606 */ \
-    OD_DCT_OVERFLOW_CHECK(t8, 803, 4096, 317); \
-    tn -= (t8*803 + 4096) >> 13; \
-    /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \
-    OD_DCT_OVERFLOW_CHECK(tn, 805, 8192, 318); \
-    t8 += (tn*805 + 8192) >> 14; \
-    /* 11725/32768 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \
-    OD_DCT_OVERFLOW_CHECK(tb, 11725, 16384, 319); \
-    tk += (tb*11725 + 16384) >> 15; \
-    /* 5197/8192 ~= Sin[7*Pi/32] ~= 0.6343932841636455 */ \
-    OD_DCT_OVERFLOW_CHECK(tk, 5197, 4096, 320); \
-    tb -= (tk*5197 + 4096) >> 13; \
-    /* 11725/32768 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \
-    OD_DCT_OVERFLOW_CHECK(tb, 11725, 16384, 321); \
-    tk += (tb*11725 + 16384) >> 15; \
-    /* 2455/4096 ~= Tan[11*Pi/64] ~= 0.5993769336819237 */ \
-    OD_DCT_OVERFLOW_CHECK(tl, 2455, 2048, 322); \
-    ta += (tl*2455 + 2048) >> 12; \
-    /* 14449/16384 ~= Sin[11*Pi/32] ~= 0.881921264348355 */ \
-    OD_DCT_OVERFLOW_CHECK(ta, 14449, 8192, 323); \
-    tl -= (ta*14449 + 8192) >> 14; \
-    /* 2455/4096 ~= Tan[11*Pi/64] ~= 0.5993769336819237 */ \
-    OD_DCT_OVERFLOW_CHECK(tl, 2455, 2048, 324); \
-    ta += (tl*2455 + 2048) >> 12; \
-    /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \
-    OD_DCT_OVERFLOW_CHECK(tm, 4861, 16384, 325); \
-    t9 += (tm*4861 + 16384) >> 15; \
-    /* 1189/4096 ~= Sin[3*Pi/32] ~= 0.29028467725446233 */ \
-    OD_DCT_OVERFLOW_CHECK(t9, 1189, 2048, 326); \
-    tm -= (t9*1189 + 2048) >> 12; \
-    /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \
-    OD_DCT_OVERFLOW_CHECK(tm, 4861, 16384, 327); \
-    t9 += (tm*4861 + 16384) >> 15; \
-    /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \
-    OD_DCT_OVERFLOW_CHECK(tg, 805, 8192, 328); \
-    tf += (tg*805 + 8192) >> 14; \
-    /* 803/8192 ~= Sin[Pi/32] ~= 0.0980171403295606 */ \
-    OD_DCT_OVERFLOW_CHECK(tf, 803, 4096, 329); \
-    tg -= (tf*803 + 4096) >> 13; \
-    /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \
-    OD_DCT_OVERFLOW_CHECK(tg, 805, 8192, 330); \
-    tf += (tg*805 + 8192) >> 14; \
-    /* 2931/8192 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \
-    OD_DCT_OVERFLOW_CHECK(tj, 2931, 4096, 331); \
-    tc += (tj*2931 + 4096) >> 13; \
-    /* 5197/8192 ~= Sin[7*Pi/32] ~= 0.6343932841636455 */ \
-    OD_DCT_OVERFLOW_CHECK(tc, 5197, 4096, 332); \
-    tj -= (tc*5197 + 4096) >> 13; \
-    /* 2931/8192 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \
-    OD_DCT_OVERFLOW_CHECK(tj, 2931, 4096, 333); \
-    tc += (tj*2931 + 4096) >> 13; \
-    /* 513/2048 ~= Tan[5*Pi/64] ~= 0.25048696019130545 */ \
-    OD_DCT_OVERFLOW_CHECK(ti, 513, 1024, 334); \
-    td += (ti*513 + 1024) >> 11; \
-    /* 7723/16384 ~= Sin[5*Pi/32] ~= 0.47139673682599764 */ \
-    OD_DCT_OVERFLOW_CHECK(td, 7723, 8192, 335); \
-    ti -= (td*7723 + 8192) >> 14; \
-    /* 513/2048 ~= Tan[5*Pi/64] ~= 0.25048696019130545 */ \
-    OD_DCT_OVERFLOW_CHECK(ti, 513, 1024, 336); \
-    td += (ti*513 + 1024) >> 11; \
-    /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \
-    OD_DCT_OVERFLOW_CHECK(th, 4861, 16384, 337); \
-    te += (th*4861 + 16384) >> 15; \
-    /* 1189/4096 ~= Sin[3*Pi/32] ~= 0.29028467725446233 */ \
-    OD_DCT_OVERFLOW_CHECK(te, 1189, 2048, 338); \
-    th -= (te*1189 + 2048) >> 12; \
-    /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \
-    OD_DCT_OVERFLOW_CHECK(th, 4861, 16384, 339); \
-    te += (th*4861 + 16384) >> 15; \
-    \
-    ta = -ta; \
-    tb = -tb; \
-    \
-    tt += t5h; \
-    t5 -= tt; \
-    t2 -= tqh; \
-    tq += t2; \
-    tp += t1h; \
-    t1 -= tp; \
-    t6 -= tuh; \
-    tu += t6; \
-    t7 += tvh; \
-    tv -= t7; \
-    to += t0h; \
-    t0 -= to; \
-    t3 -= t4h; \
-    t4 += t3; \
-    ts += trh; \
-    tr -= ts; \
-    tf -= OD_RSHIFT1(tn); \
-    tn += tf; \
-    tg -= OD_RSHIFT1(t8); \
-    t8 += tg; \
-    tk += OD_RSHIFT1(tc); \
-    tc -= tk; \
-    tb += OD_RSHIFT1(tj); \
-    tj -= tb; \
-    ta += OD_RSHIFT1(ti); \
-    ti -= ta; \
-    tl += OD_RSHIFT1(td); \
-    td -= tl; \
-    te -= OD_RSHIFT1(tm); \
-    tm += te; \
-    th -= OD_RSHIFT1(t9); \
-    t9 += th; \
-    ta -= t5; \
-    t5 += OD_RSHIFT1(ta); \
-    tq -= tl; \
-    tl += OD_RSHIFT1(tq); \
-    t2 -= ti; \
-    ti += OD_RSHIFT1(t2); \
-    td -= tt; \
-    tt += OD_RSHIFT1(td); \
-    tm += tp; \
-    tp -= OD_RSHIFT1(tm); \
-    t6 += t9; \
-    t9 -= OD_RSHIFT1(t6); \
-    te -= tu; \
-    tu += OD_RSHIFT1(te); \
-    t1 -= th; \
-    th += OD_RSHIFT1(t1); \
-    t0 -= tg; \
-    tg += OD_RSHIFT1(t0); \
-    tf += tv; \
-    tv -= OD_RSHIFT1(tf); \
-    t8 -= t7; \
-    t7 += OD_RSHIFT1(t8); \
-    to -= tn; \
-    tn += OD_RSHIFT1(to); \
-    t4 -= tk; \
-    tk += OD_RSHIFT1(t4); \
-    tb -= tr; \
-    tr += OD_RSHIFT1(tb); \
-    t3 -= tj; \
-    tj += OD_RSHIFT1(t3); \
-    tc -= ts; \
-    ts += OD_RSHIFT1(tc); \
-    \
-    tr = -tr; \
-    ts = -ts; \
-    tt = -tt; \
-    tu = -tu; \
-    \
-    /* 2847/4096 ~= (1/Sqrt[2] - Cos[63*Pi/128]/2)/Sin[63*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(t0, 2847, 2048, 340); \
-    tv += (t0*2847 + 2048) >> 12; \
-    /* 5791/4096 ~= Sqrt[2]*Sin[63*Pi/128] */  \
-    OD_DCT_OVERFLOW_CHECK(tv, 5791, 2048, 341); \
-    t0 -= (tv*5791 + 2048) >> 12; \
-    /* 5593/8192 ~= (1/Sqrt[2] - Cos[63*Pi/128])/Sin[63*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(t0, 5593, 4096, 342); \
-    tv += (t0*5593 + 4096) >> 13; \
-    /* 4099/8192 ~= (1/Sqrt[2] - Cos[31*Pi/128]/2)/Sin[31*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(tf, 4099, 4096, 343); \
-    tg -= (tf*4099 + 4096) >> 13; \
-    /* 1997/2048 ~= Sqrt[2]*Sin[31*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(tg, 1997, 1024, 344); \
-    tf += (tg*1997 + 1024) >> 11; \
-    /* -815/32768 ~= (1/Sqrt[2] - Cos[31*Pi/128])/Sin[31*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(tf, 815, 16384, 345); \
-    tg += (tf*815 + 16384) >> 15; \
-    /* 2527/4096 ~= (1/Sqrt[2] - Cos[17*Pi/128]/2)/Sin[17*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(t8, 2527, 2048, 346); \
-    tn -= (t8*2527 + 2048) >> 12; \
-    /* 4695/8192 ~= Sqrt[2]*Sin[17*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(tn, 4695, 4096, 347); \
-    t8 += (tn*4695 + 4096) >> 13; \
-    /* -4187/8192 ~= (1/Sqrt[2] - Cos[17*Pi/128])/Sin[17*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(t8, 4187, 4096, 348); \
-    tn += (t8*4187 + 4096) >> 13; \
-    /* 5477/8192 ~= (1/Sqrt[2] - Cos[15*Pi/128]/2)/Sin[15*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(to, 5477, 4096, 349); \
-    t7 += (to*5477 + 4096) >> 13; \
-    /* 4169/8192 ~= Sqrt[2]*Sin[15*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(t7, 4169, 4096, 350); \
-    to -= (t7*4169 + 4096) >> 13; \
-    /* -2571/4096 ~= (1/Sqrt[2] - Cos[15*Pi/128])/Sin[15*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(to, 2571, 2048, 351); \
-    t7 -= (to*2571 + 2048) >> 12; \
-    /* 5331/8192 ~= (1/Sqrt[2] - Cos[59*Pi/128]/2)/Sin[59*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(t2, 5331, 4096, 352); \
-    tt += (t2*5331 + 4096) >> 13; \
-    /* 5749/4096 ~= Sqrt[2]*Sin[59*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(tt, 5749, 2048, 353); \
-    t2 -= (tt*5749 + 2048) >> 12; \
-    /* 2413/4096 ~= (1/Sqrt[2] - Cos[59*Pi/128])/Sin[59*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(t2, 2413, 2048, 354); \
-    tt += (t2*2413 + 2048) >> 12; \
-    /* 4167/8192 ~= (1/Sqrt[2] - Cos[27*Pi/128]/2)/Sin[27*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(td, 4167, 4096, 355); \
-    ti -= (td*4167 + 4096) >> 13; \
-    /* 891/1024 ~= Sqrt[2]*Sin[27*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(ti, 891, 512, 356); \
-    td += (ti*891 + 512) >> 10; \
-    /* -4327/32768 ~= (1/Sqrt[2] - Cos[27*Pi/128])/Sin[27*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(td, 4327, 16384, 357); \
-    ti += (td*4327 + 16384) >> 15; \
-    /* 2261/4096 ~= (1/Sqrt[2] - Cos[21*Pi/128]/2)/Sin[21*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(ta, 2261, 2048, 358); \
-    tl -= (ta*2261 + 2048) >> 12; \
-    /* 2855/4096 ~= Sqrt[2]*Sin[21*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(tl, 2855, 2048, 359); \
-    ta += (tl*2855 + 2048) >> 12; \
-    /* -5417/16384 ~= (1/Sqrt[2] - Cos[21*Pi/128])/Sin[21*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(ta, 5417, 8192, 360); \
-    tl += (ta*5417 + 8192) >> 14; \
-    /* 3459/4096 ~= (1/Sqrt[2] - Cos[11*Pi/128]/2)/Sin[11*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(tq, 3459, 2048, 361); \
-    t5 += (tq*3459 + 2048) >> 12; \
-    /* 1545/4096 ~= Sqrt[2]*Sin[11*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(t5, 1545, 2048, 362); \
-    tq -= (t5*1545 + 2048) >> 12; \
-    /* -1971/2048 ~= (1/Sqrt[2] - Cos[11*Pi/128])/Sin[11*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(tq, 1971, 1024, 363); \
-    t5 -= (tq*1971 + 1024) >> 11; \
-    /* 323/512 ~= (1/Sqrt[2] - Cos[57*Pi/128]/2)/Sin[57*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(t3, 323, 256, 364); \
-    ts += (t3*323 + 256) >> 9; \
-    /* 5707/4096 ~= Sqrt[2]*Sin[57*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(ts, 5707, 2048, 365); \
-    t3 -= (ts*5707 + 2048) >> 12; \
-    /* 2229/4096 ~= (1/Sqrt[2] - Cos[57*Pi/128])/Sin[57*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(t3, 2229, 2048, 366); \
-    ts += (t3*2229 + 2048) >> 12; \
-    /* 1061/2048 ~= (1/Sqrt[2] - Cos[25*Pi/128]/2)/Sin[25*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(tc, 1061, 1024, 367); \
-    tj -= (tc*1061 + 1024) >> 11; \
-    /* 6671/8192 ~= Sqrt[2]*Sin[25*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(tj, 6671, 4096, 368); \
-    tc += (tj*6671 + 4096) >> 13; \
-    /* -6287/32768 ~= (1/Sqrt[2] - Cos[25*Pi/128])/Sin[25*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(tc, 6287, 16384, 369); \
-    tj += (tc*6287 + 16384) >> 15; \
-    /* 4359/8192 ~= (1/Sqrt[2] - Cos[23*Pi/128]/2)/Sin[23*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(tb, 4359, 4096, 370); \
-    tk -= (tb*4359 + 4096) >> 13; \
-    /* 3099/4096 ~= Sqrt[2]*Sin[23*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(tk, 3099, 2048, 371); \
-    tb += (tk*3099 + 2048) >> 12; \
-    /* -2109/8192 ~= (1/Sqrt[2] - Cos[23*Pi/128])/Sin[23*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(tb, 2109, 4096, 372); \
-    tk += (tb*2109 + 4096) >> 13; \
-    /* 5017/8192 ~= (1/Sqrt[2] - Cos[55*Pi/128]/2)/Sin[55*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(t4, 5017, 4096, 373); \
-    tr += (t4*5017 + 4096) >> 13; \
-    /* 1413/1024 ~= Sqrt[2]*Sin[55*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(tr, 1413, 512, 374); \
-    t4 -= (tr*1413 + 512) >> 10; \
-    /* 8195/16384 ~= (1/Sqrt[2] - Cos[55*Pi/128])/Sin[55*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(t4, 8195, 8192, 375); \
-    tr += (t4*8195 + 8192) >> 14; \
-    /* 2373/4096 ~= (1/Sqrt[2] - Cos[19*Pi/128]/2)/Sin[19*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(tm, 2373, 2048, 376); \
-    t9 += (tm*2373 + 2048) >> 12; \
-    /* 5209/8192 ~= Sqrt[2]*Sin[19*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(t9, 5209, 4096, 377); \
-    tm -= (t9*5209 + 4096) >> 13; \
-    /* -3391/8192 ~= (1/Sqrt[2] - Cos[19*Pi/128])/Sin[19*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(tm, 3391, 4096, 378); \
-    t9 -= (tm*3391 + 4096) >> 13; \
-    /* 1517/2048 ~= (1/Sqrt[2] - Cos[13*Pi/128]/2)/Sin[13*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(t6, 1517, 1024, 379); \
-    tp -= (t6*1517 + 1024) >> 11; \
-    /* 1817/4096 ~= Sqrt[2]*Sin[13*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(tp, 1817, 2048, 380); \
-    t6 += (tp*1817 + 2048) >> 12; \
-    /* -6331/8192 ~= (1/Sqrt[2] - Cos[13*Pi/128])/Sin[13*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(t6, 6331, 4096, 381); \
-    tp += (t6*6331 + 4096) >> 13; \
-    /* 515/1024 ~= (1/Sqrt[2] - Cos[29*Pi/128]/2)/Sin[29*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(te, 515, 512, 382); \
-    th -= (te*515 + 512) >> 10; \
-    /* 7567/8192 ~= Sqrt[2]*Sin[29*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(th, 7567, 4096, 383); \
-    te += (th*7567 + 4096) >> 13; \
-    /* -2513/32768 ~= (1/Sqrt[2] - Cos[29*Pi/128])/Sin[29*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(te, 2513, 16384, 384); \
-    th += (te*2513 + 16384) >> 15; \
-    /* 2753/4096 ~= (1/Sqrt[2] - Cos[61*Pi/128]/2)/Sin[61*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(t1, 2753, 2048, 385); \
-    tu += (t1*2753 + 2048) >> 12; \
-    /* 5777/4096 ~= Sqrt[2]*Sin[61*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(tu, 5777, 2048, 386); \
-    t1 -= (tu*5777 + 2048) >> 12; \
-    /* 1301/2048 ~= (1/Sqrt[2] - Cos[61*Pi/128])/Sin[61*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(t1, 1301, 1024, 387); \
-    tu += (t1*1301 + 1024) >> 11; \
-  } \
-  while (0)
-
-#define OD_IDST_32_ASYM_PR(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, t6, \
-  tm, te, tu, t1, th, t9, tp, t5, tl, td, tt, t3, tj, tb, tr, t7, tn, tf, tv) \
-  /* Embedded 32-point asymmetric Type-IV iDST. */ \
-  do { \
-    int t0h; \
-    int t4h; \
-    int tbh; \
-    int tfh; \
-    int tgh; \
-    int tkh; \
-    int trh; \
-    int tvh; \
-    /* 1301/2048 ~= (1/Sqrt[2] - Cos[61*Pi/128])/Sin[61*Pi/128] */ \
-    tf -= (tg*1301 + 1024) >> 11; \
-    /* 5777/4096 ~= Sqrt[2]*Sin[61*Pi/128] */ \
-    tg += (tf*5777 + 2048) >> 12; \
-    /* 2753/4096 ~= (1/Sqrt[2] - Cos[61*Pi/128]/2)/Sin[61*Pi/128] */ \
-    tf -= (tg*2753 + 2048) >> 12; \
-    /* -2513/32768 ~= (1/Sqrt[2] - Cos[29*Pi/128])/Sin[29*Pi/128] */ \
-    th -= (te*2513 + 16384) >> 15; \
-    /* 7567/8192 ~= Sqrt[2]*Sin[29*Pi/128] */ \
-    te -= (th*7567 + 4096) >> 13; \
-    /* 515/1024 ~= (1/Sqrt[2] - Cos[29*Pi/128]/2)/Sin[29*Pi/128] */ \
-    th += (te*515 + 512) >> 10; \
-    /* -6331/8192 ~= (1/Sqrt[2] - Cos[13*Pi/128])/Sin[13*Pi/128] */ \
-    tj -= (tc*6331 + 4096) >> 13; \
-    /* 1817/4096 ~= Sqrt[2]*Sin[13*Pi/128] */ \
-    tc -= (tj*1817 + 2048) >> 12; \
-    /* 1517/2048 ~= (1/Sqrt[2] - Cos[13*Pi/128]/2)/Sin[13*Pi/128] */ \
-    tj += (tc*1517 + 1024) >> 11; \
-    /* -3391/8192 ~= (1/Sqrt[2] - Cos[19*Pi/128])/Sin[19*Pi/128] */ \
-    ti += (td*3391 + 4096) >> 13; \
-    /* 5209/8192 ~= Sqrt[2]*Sin[19*Pi/128] */ \
-    td += (ti*5209 + 4096) >> 13; \
-    /* 2373/4096 ~= (1/Sqrt[2] - Cos[19*Pi/128]/2)/Sin[19*Pi/128] */ \
-    ti -= (td*2373 + 2048) >> 12; \
-    /* 8195/16384 ~= (1/Sqrt[2] - Cos[55*Pi/128])/Sin[55*Pi/128] */ \
-    tr -= (t4*8195 + 8192) >> 14; \
-    /* 1413/1024 ~= Sqrt[2]*Sin[55*Pi/128] */ \
-    t4 += (tr*1413 + 512) >> 10; \
-    /* 5017/8192 ~= (1/Sqrt[2] - Cos[55*Pi/128]/2)/Sin[55*Pi/128] */ \
-    tr -= (t4*5017 + 4096) >> 13; \
-    /* -2109/8192 ~= (1/Sqrt[2] - Cos[23*Pi/128])/Sin[23*Pi/128] */ \
-    t5 -= (tq*2109 + 4096) >> 13; \
-    /* 3099/4096 ~= Sqrt[2]*Sin[23*Pi/128] */ \
-    tq -= (t5*3099 + 2048) >> 12; \
-    /* 4359/8192 ~= (1/Sqrt[2] - Cos[23*Pi/128]/2)/Sin[23*Pi/128] */ \
-    t5 += (tq*4359 + 4096) >> 13; \
-    /* -6287/32768 ~= (1/Sqrt[2] - Cos[25*Pi/128])/Sin[25*Pi/128] */ \
-    tp -= (t6*6287 + 16384) >> 15; \
-    /* 6671/8192 ~= Sqrt[2]*Sin[25*Pi/128] */ \
-    t6 -= (tp*6671 + 4096) >> 13; \
-    /* 1061/2048 ~= (1/Sqrt[2] - Cos[25*Pi/128]/2)/Sin[25*Pi/128] */ \
-    tp += (t6*1061 + 1024) >> 11; \
-    /* 2229/4096 ~= (1/Sqrt[2] - Cos[57*Pi/128])/Sin[57*Pi/128] */ \
-    t7 -= (to*2229 + 2048) >> 12; \
-    /* 5707/4096 ~= Sqrt[2]*Sin[57*Pi/128] */ \
-    to += (t7*5707 + 2048) >> 12; \
-    /* 323/512 ~= (1/Sqrt[2] - Cos[57*Pi/128]/2)/Sin[57*Pi/128] */ \
-    t7 -= (to*323 + 256) >> 9; \
-    /* -1971/2048 ~= (1/Sqrt[2] - Cos[11*Pi/128])/Sin[11*Pi/128] */ \
-    tk += (tb*1971 + 1024) >> 11; \
-    /* 1545/4096 ~= Sqrt[2]*Sin[11*Pi/128] */ \
-    tb += (tk*1545 + 2048) >> 12; \
-    /* 3459/4096 ~= (1/Sqrt[2] - Cos[11*Pi/128]/2)/Sin[11*Pi/128] */ \
-    tk -= (tb*3459 + 2048) >> 12; \
-    /* -5417/16384 ~= (1/Sqrt[2] - Cos[21*Pi/128])/Sin[21*Pi/128] */ \
-    tl -= (ta*5417 + 8192) >> 14; \
-    /* 2855/4096 ~= Sqrt[2]*Sin[21*Pi/128] */ \
-    ta -= (tl*2855 + 2048) >> 12; \
-    /* 2261/4096 ~= (1/Sqrt[2] - Cos[21*Pi/128]/2)/Sin[21*Pi/128] */ \
-    tl += (ta*2261 + 2048) >> 12; \
-    /* -4327/32768 ~= (1/Sqrt[2] - Cos[27*Pi/128])/Sin[27*Pi/128] */ \
-    t9 -= (tm*4327 + 16384) >> 15; \
-    /* 891/1024 ~= Sqrt[2]*Sin[27*Pi/128] */ \
-    tm -= (t9*891 + 512) >> 10; \
-    /* 4167/8192 ~= (1/Sqrt[2] - Cos[27*Pi/128]/2)/Sin[27*Pi/128] */ \
-    t9 += (tm*4167 + 4096) >> 13; \
-    /* 2413/4096 ~= (1/Sqrt[2] - Cos[59*Pi/128])/Sin[59*Pi/128] */ \
-    tn -= (t8*2413 + 2048) >> 12; \
-    /* 5749/4096 ~= Sqrt[2]*Sin[59*Pi/128] */ \
-    t8 += (tn*5749 + 2048) >> 12; \
-    /* 5331/8192 ~= (1/Sqrt[2] - Cos[59*Pi/128]/2)/Sin[59*Pi/128] */ \
-    tn -= (t8*5331 + 4096) >> 13; \
-    /* -2571/4096 ~= (1/Sqrt[2] - Cos[15*Pi/128])/Sin[15*Pi/128] */ \
-    ts += (t3*2571 + 2048) >> 12; \
-    /* 4169/8192 ~= Sqrt[2]*Sin[15*Pi/128] */ \
-    t3 += (ts*4169 + 4096) >> 13; \
-    /* 5477/8192 ~= (1/Sqrt[2] - Cos[15*Pi/128]/2)/Sin[15*Pi/128] */ \
-    ts -= (t3*5477 + 4096) >> 13; \
-    /* -4187/8192 ~= (1/Sqrt[2] - Cos[17*Pi/128])/Sin[17*Pi/128] */ \
-    tt -= (t2*4187 + 4096) >> 13; \
-    /* 4695/8192 ~= Sqrt[2]*Sin[17*Pi/128] */ \
-    t2 -= (tt*4695 + 4096) >> 13; \
-    /* 2527/4096 ~= (1/Sqrt[2] - Cos[17*Pi/128]/2)/Sin[17*Pi/128] */ \
-    tt += (t2*2527 + 2048) >> 12; \
-    /* -815/32768 ~= (1/Sqrt[2] - Cos[31*Pi/128])/Sin[31*Pi/128] */ \
-    t1 -= (tu*815 + 16384) >> 15; \
-    /* 1997/2048 ~= Sqrt[2]*Sin[31*Pi/128] */ \
-    tu -= (t1*1997 + 1024) >> 11; \
-    /* 4099/8192 ~= (1/Sqrt[2] - Cos[31*Pi/128]/2)/Sin[31*Pi/128] */ \
-    t1 += (tu*4099 + 4096) >> 13; \
-    /* 5593/8192 ~= (1/Sqrt[2] - Cos[63*Pi/128])/Sin[63*Pi/128] */ \
-    tv -= (t0*5593 + 4096) >> 13; \
-    /* 5791/4096 ~= Sqrt[2]*Sin[63*Pi/128] */ \
-    t0 += (tv*5791 + 2048) >> 12; \
-    /* 2847/4096 ~= (1/Sqrt[2] - Cos[63*Pi/128]/2)/Sin[63*Pi/128] */ \
-    tv -= (t0*2847 + 2048) >> 12; \
-    \
-    t7 = -t7; \
-    tf = -tf; \
-    tn = -tn; \
-    tr = -tr; \
-    \
-    t7 -= OD_RSHIFT1(t6); \
-    t6 += t7; \
-    tp -= OD_RSHIFT1(to); \
-    to += tp; \
-    tr -= OD_RSHIFT1(tq); \
-    tq += tr; \
-    t5 -= OD_RSHIFT1(t4); \
-    t4 += t5; \
-    tt -= OD_RSHIFT1(t3); \
-    t3 += tt; \
-    ts -= OD_RSHIFT1(t2); \
-    t2 += ts; \
-    tv += OD_RSHIFT1(tu); \
-    tu -= tv; \
-    t1 -= OD_RSHIFT1(t0); \
-    t0 += t1; \
-    th -= OD_RSHIFT1(tg); \
-    tg += th; \
-    tf -= OD_RSHIFT1(te); \
-    te += tf; \
-    ti += OD_RSHIFT1(tc); \
-    tc -= ti; \
-    tj += OD_RSHIFT1(td); \
-    td -= tj; \
-    tn -= OD_RSHIFT1(tm); \
-    tm += tn; \
-    t9 -= OD_RSHIFT1(t8); \
-    t8 += t9; \
-    tl -= OD_RSHIFT1(tb); \
-    tb += tl; \
-    tk -= OD_RSHIFT1(ta); \
-    ta += tk; \
-    \
-    ti -= th; \
-    th += OD_RSHIFT1(ti); \
-    td -= te; \
-    te += OD_RSHIFT1(td); \
-    tm += tl; \
-    tl -= OD_RSHIFT1(tm); \
-    t9 += ta; \
-    ta -= OD_RSHIFT1(t9); \
-    tp += tq; \
-    tq -= OD_RSHIFT1(tp); \
-    t6 += t5; \
-    t5 -= OD_RSHIFT1(t6); \
-    t2 -= t1; \
-    t1 += OD_RSHIFT1(t2); \
-    tt -= tu; \
-    tu += OD_RSHIFT1(tt); \
-    tr += t7; \
-    trh = OD_RSHIFT1(tr); \
-    t7 -= trh; \
-    t4 -= to; \
-    t4h = OD_RSHIFT1(t4); \
-    to += t4h; \
-    t0 += t3; \
-    t0h = OD_RSHIFT1(t0); \
-    t3 -= t0h; \
-    tv += ts; \
-    tvh = OD_RSHIFT1(tv); \
-    ts -= tvh; \
-    tf -= tc; \
-    tfh = OD_RSHIFT1(tf); \
-    tc += tfh; \
-    tg += tj; \
-    tgh = OD_RSHIFT1(tg); \
-    tj -= tgh; \
-    tb -= t8; \
-    tbh = OD_RSHIFT1(tb); \
-    t8 += tbh; \
-    tk += tn; \
-    tkh = OD_RSHIFT1(tk); \
-    tn -= tkh; \
-    \
-    ta = -ta; \
-    tq = -tq; \
-    \
-    /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \
-    te -= (th*4861 + 16384) >> 15; \
-    /* 1189/4096 ~= Sin[3*Pi/32] ~= 0.29028467725446233 */ \
-    th += (te*1189 + 2048) >> 12; \
-    /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \
-    te -= (th*4861 + 16384) >> 15; \
-    /* 513/2048 ~= Tan[5*Pi/64] ~= 0.25048696019130545 */ \
-    tm -= (t9*513 + 1024) >> 11; \
-    /* 7723/16384 ~= Sin[5*Pi/32] ~= 0.47139673682599764 */ \
-    t9 += (tm*7723 + 8192) >> 14; \
-    /* 513/2048 ~= Tan[5*Pi/64] ~= 0.25048696019130545 */ \
-    tm -= (t9*513 + 1024) >> 11; \
-    /* 2931/8192 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \
-    t6 -= (tp*2931 + 4096) >> 13; \
-    /* 5197/8192 ~= Sin[7*Pi/32] ~= 0.6343932841636455 */ \
-    tp += (t6*5197 + 4096) >> 13; \
-    /* 2931/8192 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \
-    t6 -= (tp*2931 + 4096) >> 13; \
-    /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \
-    tu -= (t1*805 + 8192) >> 14; \
-    /* 803/8192 ~= Sin[Pi/32] ~= 0.0980171403295606 */ \
-    t1 += (tu*803 + 4096) >> 13; \
-    /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \
-    tu -= (t1*805 + 8192) >> 14; \
-    /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \
-    ti -= (td*4861 + 16384) >> 15; \
-    /* 1189/4096 ~= Sin[3*Pi/32] ~= 0.29028467725446233 */ \
-    td += (ti*1189 + 2048) >> 12; \
-    /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \
-    ti -= (td*4861 + 16384) >> 15; \
-    /* 2455/4096 ~= Tan[11*Pi/64] ~= 0.5993769336819237 */ \
-    ta -= (tl*2455 + 2048) >> 12; \
-    /* 14449/16384 ~= Sin[11*Pi/32] ~= 0.881921264348355 */ \
-    tl += (ta*14449 + 8192) >> 14; \
-    /* 2455/4096 ~= Tan[11*Pi/64] ~= 0.5993769336819237 */ \
-    ta -= (tl*2455 + 2048) >> 12; \
-    /* 11725/32768 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \
-    t5 -= (tq*11725 + 16384) >> 15; \
-    /* 5197/8192 ~= Sin[7*Pi/32] ~= 0.6343932841636455 */ \
-    tq += (t5*5197 + 4096) >> 13; \
-    /* 11725/32768 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \
-    t5 -= (tq*11725 + 16384) >> 15; \
-    /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \
-    t2 -= (tt*805 + 8192) >> 14; \
-    /* 803/8192 ~= Sin[Pi/32] ~= 0.0980171403295606 */ \
-    tt += (t2*803 + 4096) >> 13; \
-    /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \
-    t2 -= (tt*805 + 8192) >> 14; \
-    \
-    tl = -tl; \
-    ti = -ti; \
-    \
-    th += OD_RSHIFT1(t9); \
-    t9 -= th; \
-    te -= OD_RSHIFT1(tm); \
-    tm += te; \
-    t1 += OD_RSHIFT1(tp); \
-    tp -= t1; \
-    tu -= OD_RSHIFT1(t6); \
-    t6 += tu; \
-    ta -= OD_RSHIFT1(td); \
-    td += ta; \
-    tl += OD_RSHIFT1(ti); \
-    ti -= tl; \
-    t5 += OD_RSHIFT1(tt); \
-    tt -= t5; \
-    tq += OD_RSHIFT1(t2); \
-    t2 -= tq; \
-    \
-    t8 -= tgh; \
-    tg += t8; \
-    tn += tfh; \
-    tf -= tn; \
-    t7 -= tvh; \
-    tv += t7; \
-    to -= t0h; \
-    t0 += to; \
-    tc += tbh; \
-    tb -= tc; \
-    tj += tkh; \
-    tk -= tj; \
-    ts += t4h; \
-    t4 -= ts; \
-    t3 += trh; \
-    tr -= t3; \
-    \
-    tk = -tk; \
-    \
-    /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \
-    tc -= (tj*2485 + 4096) >> 13; \
-    /* 18205/32768 ~= Sin[3*Pi/16] ~= 0.555570233019602 */ \
-    tj += (tc*18205 + 16384) >> 15; \
-    /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \
-    tc -= (tj*2485 + 4096) >> 13; \
-    /* 3227/32768 ~= Tan[Pi/32] ~= 0.09849140335716425 */ \
-    ts -= (t3*3227 + 16384) >> 15; \
-    /* 6393/32768 ~= Sin[Pi/16] ~= 0.19509032201612825 */ \
-    t3 += (ts*6393 + 16384) >> 15; \
-    /* 3227/32768 ~= Tan[Pi/32] ~= 0.09849140335716425 */ \
-    ts -= (t3*3227 + 16384) >> 15; \
-    /* 17515/32768 ~= Tan[5*Pi/32] ~= 0.5345111359507916 */ \
-    tk -= (tb*17515 + 16384) >> 15; \
-    /* 13623/16384 ~= Sin[5*Pi/16] ~= 0.8314696123025452 */ \
-    tb += (tk*13623 + 8192) >> 14; \
-    /* 17515/32768 ~= Tan[5*Pi/32] ~= 0.5345111359507916 */ \
-    tk -= (tb*17515 + 16384) >> 15; \
-    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.8206787908286602 */ \
-    t4 -= (tr*6723 + 4096) >> 13; \
-    /* 16069/16384 ~= Sin[7*Pi/16] ~= 0.9807852804032304 */ \
-    tr += (t4*16069 + 8192) >> 14; \
-    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.8206787908286602 */ \
-    t4 -= (tr*6723 + 4096) >> 13; \
-    \
-    t4 = -t4; \
-    \
-    tp += tm; \
-    tm -= OD_RSHIFT1(tp); \
-    t9 -= t6; \
-    t6 += OD_RSHIFT1(t9); \
-    th -= t1; \
-    t1 += OD_RSHIFT1(th); \
-    tu -= te; \
-    te += OD_RSHIFT1(tu); /* pass */ \
-    t5 -= tl; \
-    tl += OD_RSHIFT1(t5); \
-    ta += tq; \
-    tq -= OD_RSHIFT1(ta); \
-    td += tt; \
-    tt -= OD_RSHIFT1(td); \
-    t2 -= ti; \
-    ti += OD_RSHIFT1(t2); /* pass */ \
-    t7 += t8; \
-    t8 -= OD_RSHIFT1(t7); \
-    tn -= to; \
-    to += OD_RSHIFT1(tn); \
-    tf -= tv; \
-    tv += OD_RSHIFT1(tf); \
-    t0 += tg; \
-    tg -= OD_RSHIFT1(t0); /* pass */ \
-    tj -= t3; \
-    t3 += OD_RSHIFT1(tj); /* pass */ \
-    ts -= tc; \
-    tc += OD_RSHIFT1(ts); \
-    t4 -= tb; \
-    tb += OD_RSHIFT1(t4); /* pass */ \
-    tk -= tr; \
-    tr += OD_RSHIFT1(tk); \
-    \
-    t1 = -t1; \
-    t3 = -t3; \
-    t7 = -t7; \
-    t8 = -t8; \
-    tg = -tg; \
-    tm = -tm; \
-    to = -to; \
-    \
-    /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
-    tm -= (t9*14341 + 8192) >> 14; \
-    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
-    t9 += (tm*15137 + 8192) >> 14; \
-    /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
-    tm -= (t9*4161 + 8192) >> 14; \
-    /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
-    tp -= (t6*4161 + 8192) >> 14; \
-    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
-    t6 += (tp*15137 + 8192) >> 14; \
-    /* 28681/32768 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
-    tp -= (t6*28681 + 16384) >> 15; \
-    /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
-    th += (te*19195 + 16384) >> 15; \
-    /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
-    te += (th*11585 + 8192) >> 14; \
-    /* 29957/32768 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
-    th -= (te*29957 + 16384) >> 15; \
-    /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
-    tq -= (t5*14341 + 8192) >> 14; \
-    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
-    t5 += (tq*15137 + 8192) >> 14; \
-    /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
-    tq -= (t5*4161 + 8192) >> 14; \
-    /* 3259/8192 ~= 2*Tan[Pi/16] ~= 0.397824734759316 */ \
-    ta -= (tl*3259 + 4096) >> 13; \
-    /* 3135/16384 ~= Sin[Pi/8]/2 ~= 0.1913417161825449 */ \
-    tl += (ta*3135 + 8192) >> 14; \
-    /* 3259/8192 ~= 2*Tan[Pi/16] ~= 0.397824734759316 */ \
-    ta -= (tl*3259 + 4096) >> 13; \
-    /* 7489/8192 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
-    ti -= (td*7489 + 4096) >> 13; \
-    /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
-    td += (ti*11585 + 8192) >> 14; \
-    /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
-    ti += (td*19195 + 16384) >> 15; \
-    /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
-    to -= (t7*14341 + 8192) >> 14; \
-    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
-    t7 += (to*15137 + 8192) >> 14; \
-    /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
-    to -= (t7*4161 + 8192) >> 14; \
-    /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
-    tn -= (t8*4161 + 8192) >> 14; \
-    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
-    t8 += (tn*15137 + 8192) >> 14; \
-    /* 28681/32768 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
-    tn -= (t8*28681 + 16384) >> 15; \
-    /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
-    tf += (tg*19195 + 16384) >> 15; \
-    /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
-    tg += (tf*11585 + 8192) >> 14; \
-    /* 29957/32768 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
-    tf -= (tg*29957 + 16384) >> 15; \
-    /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
-    tj += (tc*19195 + 16384) >> 15; \
-    /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
-    tc += (tj*11585 + 8192) >> 14; \
-    /* 29957/32768 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
-    tj -= (tc*29957 + 16384) >> 15; \
-    /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
-    tk += (tb*13573 + 8192) >> 14; \
-    /* 11585/32768 ~= Sin[Pi/4]/2 ~= 0.353553390593274 */ \
-    tb -= (tk*11585 + 16384) >> 15; \
-    /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
-    tk += (tb*13573 + 8192) >> 14; \
-    \
-    tf = -tf; \
-    \
-  } \
-  while (0)
-
-#define OD_FDCT_64_PR(u0, uw, ug, uM, u8, uE, uo, uU, u4, uA, uk, uQ, uc, uI, \
-  us, uY, u2, uy, ui, uO, ua, uG, uq, uW, u6, uC, um, uS, ue, uK, uu, u_, u1, \
-  ux, uh, uN, u9, uF, up, uV, u5, uB, ul, uR, ud, uJ, ut, uZ, u3, uz, uj, uP, \
-  ub, uH, ur, uX, u7, uD, un, uT, uf, uL, uv, u) \
-  /* Embedded 64-point orthonormal Type-II fDCT. */ \
-  do { \
-    int uwh; \
-    int uxh; \
-    int uyh; \
-    int uzh; \
-    int uAh; \
-    int uBh; \
-    int uCh; \
-    int uDh; \
-    int uEh; \
-    int uFh; \
-    int uGh; \
-    int uHh; \
-    int uIh; \
-    int uJh; \
-    int uKh; \
-    int uLh; \
-    int uMh; \
-    int uNh; \
-    int uOh; \
-    int uPh; \
-    int uQh; \
-    int uRh; \
-    int uSh; \
-    int uTh; \
-    int uUh; \
-    int uVh; \
-    int uWh; \
-    int uXh; \
-    int uYh; \
-    int uZh; \
-    int u_h; \
-    int uh_; \
-    u = u0 - u; \
-    uh_ = OD_RSHIFT1(u); \
-    u0 -= uh_; \
-    u_ += u1; \
-    u_h = OD_RSHIFT1(u_); \
-    u1 = u_h - u1; \
-    uZ = u2 - uZ; \
-    uZh = OD_RSHIFT1(uZ); \
-    u2 -= uZh; \
-    uY += u3; \
-    uYh = OD_RSHIFT1(uY); \
-    u3 = uYh - u3; \
-    uX = u4 - uX; \
-    uXh = OD_RSHIFT1(uX); \
-    u4 -= uXh; \
-    uW += u5; \
-    uWh = OD_RSHIFT1(uW); \
-    u5 = uWh - u5; \
-    uV = u6 - uV; \
-    uVh = OD_RSHIFT1(uV); \
-    u6 -= uVh; \
-    uU += u7; \
-    uUh = OD_RSHIFT1(uU); \
-    u7 = uUh - u7; \
-    uT = u8 - uT; \
-    uTh = OD_RSHIFT1(uT); \
-    u8 -= uTh; \
-    uS += u9; \
-    uSh = OD_RSHIFT1(uS); \
-    u9 = uSh - u9; \
-    uR = ua - uR; \
-    uRh = OD_RSHIFT1(uR); \
-    ua -= uRh; \
-    uQ += ub; \
-    uQh = OD_RSHIFT1(uQ); \
-    ub = uQh - ub; \
-    uP = uc - uP; \
-    uPh = OD_RSHIFT1(uP); \
-    uc -= uPh; \
-    uO += ud; \
-    uOh = OD_RSHIFT1(uO); \
-    ud = uOh - ud; \
-    uN = ue - uN; \
-    uNh = OD_RSHIFT1(uN); \
-    ue -= uNh; \
-    uM += uf; \
-    uMh = OD_RSHIFT1(uM); \
-    uf = uMh - uf; \
-    uL = ug - uL; \
-    uLh = OD_RSHIFT1(uL); \
-    ug -= uLh; \
-    uK += uh; \
-    uKh = OD_RSHIFT1(uK); \
-    uh = uKh - uh; \
-    uJ = ui - uJ; \
-    uJh = OD_RSHIFT1(uJ); \
-    ui -= uJh; \
-    uI += uj; \
-    uIh = OD_RSHIFT1(uI); \
-    uj = uIh - uj; \
-    uH = uk - uH; \
-    uHh = OD_RSHIFT1(uH); \
-    uk -= uHh; \
-    uG += ul; \
-    uGh = OD_RSHIFT1(uG); \
-    ul = uGh - ul; \
-    uF = um - uF; \
-    uFh = OD_RSHIFT1(uF); \
-    um -= uFh; \
-    uE += un; \
-    uEh = OD_RSHIFT1(uE); \
-    un = uEh - un; \
-    uD = uo - uD; \
-    uDh = OD_RSHIFT1(uD); \
-    uo -= uDh; \
-    uC += up; \
-    uCh = OD_RSHIFT1(uC); \
-    up = uCh - up; \
-    uB = uq - uB; \
-    uBh = OD_RSHIFT1(uB); \
-    uq -= uBh; \
-    uA += ur; \
-    uAh = OD_RSHIFT1(uA); \
-    ur = uAh - ur; \
-    uz = us - uz; \
-    uzh = OD_RSHIFT1(uz); \
-    us -= uzh; \
-    uy += ut; \
-    uyh = OD_RSHIFT1(uy); \
-    ut = uyh - ut; \
-    ux = uu - ux; \
-    uxh = OD_RSHIFT1(ux); \
-    uu -= uxh; \
-    uw += uv; \
-    uwh = OD_RSHIFT1(uw); \
-    uv = uwh - uv; \
-    OD_FDCT_32_ASYM_PR(u0, uw, uwh, ug, uM, uMh, u8, uE, uEh, uo, uU, uUh, \
-      u4, uA, uAh, uk, uQ, uQh, uc, uI, uIh, us, uY, uYh, u2, uy, uyh, \
-      ui, uO, uOh, ua, uG, uGh, uq, uW, uWh, u6, uC, uCh, um, uS, uSh, \
-      ue, uK, uKh, uu, u_, u_h); \
-    OD_FDST_32_ASYM_PR(u, uv, uL, uf, uT, un, uD, u7, uX, ur, uH, ub, uP, uj, \
-      uz, u3, uZ, ut, uJ, ud, uR, ul, uB, u5, uV, up, uF, u9, uN, uh, ux, u1); \
-  } \
-  while (0)
-
-#define OD_IDCT_64_PR(u0, uw, ug, uM, u8, uE, uo, uU, u4, uA, uk, uQ, uc, uI, \
-  us, uY, u2, uy, ui, uO, ua, uG, uq, uW, u6, uC, um, uS, ue, uK, uu, u_, u1, \
-  ux, uh, uN, u9, uF, up, uV, u5, uB, ul, uR, ud, uJ, ut, uZ, u3, uz, uj, uP, \
-  ub, uH, ur, uX, u7, uD, un, uT, uf, uL, uv, u) \
-  /* Embedded 64-point orthonormal Type-II iDCT. */ \
-  do { \
-    int u1h; \
-    int u3h; \
-    int u5h; \
-    int u7h; \
-    int u9h; \
-    int ubh; \
-    int udh; \
-    int ufh; \
-    int uhh; \
-    int ujh; \
-    int ulh; \
-    int unh; \
-    int uph; \
-    int urh; \
-    int uth; \
-    int uvh; \
-    int uxh; \
-    int uzh; \
-    int uBh; \
-    int uDh; \
-    int uFh; \
-    int uHh; \
-    int uJh; \
-    int uLh; \
-    int uNh; \
-    int uPh; \
-    int uRh; \
-    int uTh; \
-    int uVh; \
-    int uXh; \
-    int uZh; \
-    int uh_; \
-    OD_IDST_32_ASYM_PR(u, uL, uT, uD, uX, uH, uP, uz, uZ, uJ, uR, uB, uV, uF, \
-      uN, ux, u_, uK, uS, uC, uW, uG, uO, uy, uY, uI, uQ, uA, uU, uE, uM, uw); \
-    OD_IDCT_32_ASYM_PR(u0, ug, u8, uo, u4, uk, uc, us, u2, ui, ua, uq, u6, um, \
-      ue, uu, u1, u1h, uh, uhh, u9, u9h, up, uph, u5, u5h, ul, ulh, ud, udh, \
-      ut, uth, u3, u3h, uj, ujh, ub, ubh, ur, urh, u7, u7h, un, unh, uf, ufh, \
-      uv, uvh); \
-    uh_ = OD_RSHIFT1(u); \
-    u0 += uh_; \
-    u = u0 - u; \
-    u_ = u1h - u_; \
-    u1 -= u_; \
-    uZh = OD_RSHIFT1(uZ); \
-    u2 += uZh; \
-    uZ = u2 - uZ; \
-    uY = u3h - uY; \
-    u3 -= uY; \
-    uXh = OD_RSHIFT1(uX); \
-    u4 += uXh; \
-    uX = u4 - uX; \
-    uW = u5h - uW; \
-    u5 -= uW; \
-    uVh = OD_RSHIFT1(uV); \
-    u6 += uVh; \
-    uV = u6 - uV; \
-    uU = u7h - uU; \
-    u7 -= uU; \
-    uTh = OD_RSHIFT1(uT); \
-    u8 += uTh; \
-    uT = u8 - uT; \
-    uS = u9h - uS; \
-    u9 -= uS; \
-    uRh = OD_RSHIFT1(uR); \
-    ua += uRh; \
-    uR = ua - uR; \
-    uQ = ubh - uQ; \
-    ub -= uQ; \
-    uPh = OD_RSHIFT1(uP); \
-    uc += uPh; \
-    uP = uc - uP; \
-    uO = udh - uO; \
-    ud -= uO; \
-    uNh = OD_RSHIFT1(uN); \
-    ue += uNh; \
-    uN = ue - uN; \
-    uM = ufh - uM; \
-    uf -= uM; \
-    uLh = OD_RSHIFT1(uL); \
-    ug += uLh; \
-    uL = ug - uL; \
-    uK = uhh - uK; \
-    uh -= uK; \
-    uJh = OD_RSHIFT1(uJ); \
-    ui += uJh; \
-    uJ = ui - uJ; \
-    uI = ujh - uI; \
-    uj -= uI; \
-    uHh = OD_RSHIFT1(uH); \
-    uk += uHh; \
-    uH = uk - uH; \
-    uG = ulh - uG; \
-    ul -= uG; \
-    uFh = OD_RSHIFT1(uF); \
-    um += uFh; \
-    uF = um - uF; \
-    uE = unh - uE; \
-    un -= uE; \
-    uDh = OD_RSHIFT1(uD); \
-    uo += uDh; \
-    uD = uo - uD; \
-    uC = uph - uC; \
-    up -= uC; \
-    uBh = OD_RSHIFT1(uB); \
-    uq += uBh; \
-    uB = uq - uB; \
-    uA = urh - uA; \
-    ur -= uA; \
-    uzh = OD_RSHIFT1(uz); \
-    us += uzh; \
-    uz = us - uz; \
-    uy = uth - uy; \
-    ut -= uy; \
-    uxh = OD_RSHIFT1(ux); \
-    uu += uxh; \
-    ux = uu - ux; \
-    uw = uvh - uw; \
-    uv -= uw; \
-  } while (0)
-#endif
-
-/* 4-point orthonormal Type-II fDCT. */
-void od_bin_fdct4(od_coeff y[4], const od_coeff *x, int xstride) {
-  int q0;
-  int q1;
-  int q2;
-  int q3;
-  q0 = x[0*xstride];
-  q1 = x[1*xstride];
-  q2 = x[2*xstride];
-  q3 = x[3*xstride];
-  od_fdct_4(&q0, &q1, &q2, &q3);
-  y[0] = (od_coeff)q0;
-  y[1] = (od_coeff)q2;
-  y[2] = (od_coeff)q1;
-  y[3] = (od_coeff)q3;
-}
-
-/* 4-point orthonormal Type-II iDCT. */
-void od_bin_idct4(od_coeff *x, int xstride, const od_coeff y[4]) {
-  int q0;
-  int q1;
-  int q2;
-  int q3;
-  q0 = y[0];
-  q2 = y[1];
-  q1 = y[2];
-  q3 = y[3];
-  od_idct_4(&q0, &q2, &q1, &q3);
-  x[0*xstride] = (od_coeff)q0;
-  x[1*xstride] = (od_coeff)q1;
-  x[2*xstride] = (od_coeff)q2;
-  x[3*xstride] = (od_coeff)q3;
-}
-
-/* 4-point orthonormal Type-VII fDST. */
-void od_bin_fdst4(od_coeff y[4], const od_coeff *x, int xstride) {
-  /* 11 adds, 5 "muls", 2 shifts.*/
-  int q0;
-  int q1;
-  int q2;
-  int q3;
-  int t0;
-  int t1;
-  int t2;
-  int t3;
-  int t3h;
-  int t4;
-  int u4;
-  q0 = x[0*xstride];
-  q1 = x[1*xstride];
-  q2 = x[2*xstride];
-  q3 = x[3*xstride];
-  t0 = q1 + q3;
-  /*When used in a 4x16 transform, the following line could overflow 16 bits
-    in SIMD unless implemented using PAVGW or VRHSUB.S16.*/
-  t1 = q1 + OD_PAVG(q0, -t0);
-  t2 = q0 - q1;
-  t3 = q2;
-  t4 = q0 + q3;
-  /* 7021/16384 ~= 2*Sin[2*Pi/9]/3 ~= 0.428525073124360 */
-  t0 = (t0*7021 + 8192) >> 14;
-  /* 37837/32768 ~= 4*Sin[3*Pi/9]/3 ~= 1.154700538379252 */
-  t1 = (t1*37837 + 16384) >> 15;
-  /* 21513/32768 ~= 2*Sin[4*Pi/9]/3 ~= 0.656538502008139 */
-  t2 = (t2*21513 + 16384) >> 15;
-  /* 37837/32768 ~= 4*Sin[3*Pi/9]/3 ~= 1.154700538379252 */
-  t3 = (t3*37837 + 16384) >> 15;
-  /* 467/2048 ~= 2*Sin[1*Pi/9]/3 ~= 0.228013428883779 */
-  t4 = (t4*467 + 1024) >> 11;
-  t3h = OD_RSHIFT1(t3);
-  u4 = t4 + t3h;
-  q0 = t0 + u4;
-  q1 = t1;
-  q2 = t0 + t2 - t3h;
-  q3 = t2 + t3 - u4;
-  y[0] = (od_coeff)q0;
-  y[1] = (od_coeff)q1;
-  y[2] = (od_coeff)q2;
-  y[3] = (od_coeff)q3;
-}
-
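
The 16-bit overflow note inside od_bin_fdst4() above deserves a concrete illustration. The sketch below is editorial and not part of this patch; it assumes OD_PAVG(a, b) has rounded-average semantics, (a + b + 1) >> 1, with an arithmetic right shift for negative values, and shows the standard way the rounding-average instructions named in that comment avoid the 17-bit intermediate sum that would wrap in a 16-bit SIMD lane.

#include <stdint.h>

/* Naive rounded average: correct in scalar C because the sum is computed as a
   full-width int, but a 16-bit SIMD lane would wrap on a + b when both
   operands are near +/-2^15 (e.g. when used in a 4x16 transform). */
static int16_t od_pavg_naive(int16_t a, int16_t b) {
  return (int16_t)((a + b + 1) >> 1);
}

/* Overflow-free formulation: never forms the full sum, which is what a
   rounding-average/halving instruction does in hardware.  Assumes an
   arithmetic (sign-preserving) right shift, as this file already does. */
static int16_t od_pavg_halving(int16_t a, int16_t b) {
  return (int16_t)((a >> 1) + (b >> 1) + ((a | b) & 1));
}

Both functions return identical results for all int16_t inputs; only the second stays within 16 bits at every intermediate step.
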
-/* 4-point orthonormal Type-VII iDST. */
-void od_bin_idst4(od_coeff *x, int xstride, const od_coeff y[4]) {
-  /* 11 adds, 5 "muls".*/
-  int q0;
-  int q1;
-  int q2;
-  int q3;
-  int t0;
-  int t1;
-  int t2;
-  int t3;
-  int t3h;
-  int t4;
-  int u4;
-  q0 = y[0];
-  q1 = y[1];
-  q2 = y[2];
-  q3 = y[3];
-  t0 = q0 - q3;
-  t1 = q0 + q2;
-  t2 = q3 + OD_PAVG(t0, -q2);
-  t3 = q1;
-  t4 = q2 + q3;
-  /* 467/2048 ~= 2*Sin[1*Pi/9]/3 ~= 0.228013428883779 */
-  t0 = (t0*467 + 1024) >> 11;
-  /* 7021/16384 ~= 2*Sin[2*Pi/9]/3 ~= 0.428525073124360 */
-  t1 = (t1*7021 + 8192) >> 14;
-  /* 37837/32768 ~= 4*Sin[3*Pi/9]/3 ~= 1.154700538379252 */
-  t2 = (t2*37837 + 16384) >> 15;
-  /* 37837/32768 ~= 4*Sin[3*Pi/9]/3 ~= 1.154700538379252 */
-  t3 = (t3*37837 + 16384) >> 15;
-  /* 21513/32768 ~= 2*Sin[4*Pi/9]/3 ~= 0.656538502008139 */
-  t4 = (t4*21513 + 16384) >> 15;
-  t3h = OD_RSHIFT1(t3);
-  u4 = t4 + t3h;
-  q0 = t0 + u4;
-  q1 = t1 + t3 - u4;
-  q2 = t2;
-  q3 = t0 + t1 - t3h;
-  x[0*xstride] = q0;
-  x[1*xstride] = q1;
-  x[2*xstride] = q2;
-  x[3*xstride] = q3;
-}
-
-void od_bin_fdct8(od_coeff y[8], const od_coeff *x, int xstride) {
-  int r0;
-  int r1;
-  int r2;
-  int r3;
-  int r4;
-  int r5;
-  int r6;
-  int r7;
-  r0 = x[0*xstride];
-  r1 = x[1*xstride];
-  r2 = x[2*xstride];
-  r3 = x[3*xstride];
-  r4 = x[4*xstride];
-  r5 = x[5*xstride];
-  r6 = x[6*xstride];
-  r7 = x[7*xstride];
-  od_fdct_8(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
-  y[0] = (od_coeff)r0;
-  y[1] = (od_coeff)r4;
-  y[2] = (od_coeff)r2;
-  y[3] = (od_coeff)r6;
-  y[4] = (od_coeff)r1;
-  y[5] = (od_coeff)r5;
-  y[6] = (od_coeff)r3;
-  y[7] = (od_coeff)r7;
-}
-
-void od_bin_idct8(od_coeff *x, int xstride, const od_coeff y[8]) {
-  int r0;
-  int r1;
-  int r2;
-  int r3;
-  int r4;
-  int r5;
-  int r6;
-  int r7;
-  r0 = y[0];
-  r4 = y[1];
-  r2 = y[2];
-  r6 = y[3];
-  r1 = y[4];
-  r5 = y[5];
-  r3 = y[6];
-  r7 = y[7];
-  od_idct_8(&r0, &r4, &r2, &r6, &r1, &r5, &r3, &r7);
-  x[0*xstride] = (od_coeff)r0;
-  x[1*xstride] = (od_coeff)r1;
-  x[2*xstride] = (od_coeff)r2;
-  x[3*xstride] = (od_coeff)r3;
-  x[4*xstride] = (od_coeff)r4;
-  x[5*xstride] = (od_coeff)r5;
-  x[6*xstride] = (od_coeff)r6;
-  x[7*xstride] = (od_coeff)r7;
-}
-
-#if !CONFIG_DAALA_TX_DST8
-void od_bin_fdst8(od_coeff y[8], const od_coeff *x, int xstride) {
-  int r0;
-  int r1;
-  int r2;
-  int r3;
-  int r4;
-  int r5;
-  int r6;
-  int r7;
-  r0 = x[0*xstride];
-  r1 = x[1*xstride];
-  r2 = x[2*xstride];
-  r3 = x[3*xstride];
-  r4 = x[4*xstride];
-  r5 = x[5*xstride];
-  r6 = x[6*xstride];
-  r7 = x[7*xstride];
-  od_fdst_8(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
-  y[0] = (od_coeff)r0;
-  y[1] = (od_coeff)r4;
-  y[2] = (od_coeff)r2;
-  y[3] = (od_coeff)r6;
-  y[4] = (od_coeff)r1;
-  y[5] = (od_coeff)r5;
-  y[6] = (od_coeff)r3;
-  y[7] = (od_coeff)r7;
-}
-
-void od_bin_idst8(od_coeff *x, int xstride, const od_coeff y[8]) {
-  int r0;
-  int r1;
-  int r2;
-  int r3;
-  int r4;
-  int r5;
-  int r6;
-  int r7;
-  r0 = y[0];
-  r4 = y[1];
-  r2 = y[2];
-  r6 = y[3];
-  r1 = y[4];
-  r5 = y[5];
-  r3 = y[6];
-  r7 = y[7];
-  od_idst_8(&r0, &r4, &r2, &r6, &r1, &r5, &r3, &r7);
-  x[0*xstride] = (od_coeff)r0;
-  x[1*xstride] = (od_coeff)r1;
-  x[2*xstride] = (od_coeff)r2;
-  x[3*xstride] = (od_coeff)r3;
-  x[4*xstride] = (od_coeff)r4;
-  x[5*xstride] = (od_coeff)r5;
-  x[6*xstride] = (od_coeff)r6;
-  x[7*xstride] = (od_coeff)r7;
-}
-#else
-const int OD_DST_8_PERM[8] = { 0, 7, 1, 6, 2, 5, 3, 4 };
-
-/* Computes the Polynomial Product Y(z) ≡ X(z)*H(z) modulo (z^8 + 1) using
-    Nussbaumer's "short" algorithm [1].
-   The monomial coefficients in Y(z) are exactly the values of an acyclic
-    convolution of the monomial coefficients of X(z) and H(z).
-   Since H(z) is fixed, the multiplication terms are constant and precomputed.
-
-   [1] Nussbaumer, Henri J. "Fast Fourier Transform and Convolution Algorithms"
-        Springer-Verlag: Berlin, Heidelberg, New York (1981) pages 76-78. */
-static void od_poly_prod_8(od_coeff y[8], const od_coeff x[8]) {
-  /* 21 "muls", 76 adds, 21 shifts */
-  od_coeff q0;
-  od_coeff q1;
-  od_coeff q2;
-  od_coeff q3;
-  od_coeff q4;
-  od_coeff q5;
-  od_coeff q6;
-  od_coeff q7;
-  od_coeff q8;
-  od_coeff q9;
-  od_coeff q10;
-  od_coeff q11;
-  od_coeff q12;
-  od_coeff q13;
-  od_coeff q14;
-  od_coeff q15;
-  od_coeff q16;
-  od_coeff q17;
-  od_coeff q18;
-  od_coeff q19;
-  od_coeff q20;
-  od_coeff r0;
-  od_coeff r1;
-  od_coeff r2;
-  od_coeff r3;
-  od_coeff r4;
-  od_coeff r5;
-  od_coeff r6;
-  od_coeff r7;
-  od_coeff t0;
-  od_coeff t1;
-  od_coeff t2;
-  od_coeff t3;
-  od_coeff t4;
-  od_coeff t5;
-  od_coeff t6;
-  od_coeff t7;
-  od_coeff u0;
-  od_coeff u1;
-  od_coeff u1h;
-  od_coeff u2;
-  od_coeff u2h;
-  od_coeff u3;
-  od_coeff u4;
-  od_coeff u4h;
-  od_coeff u5;
-  od_coeff u6;
-  od_coeff u7;
-  od_coeff u7h;
-  od_coeff u8;
-  od_coeff u9;
-  od_coeff u10;
-  od_coeff u11;
-  od_coeff u12;
-  od_coeff u13;
-  od_coeff u14;
-  od_coeff u15;
-  od_coeff u16;
-  od_coeff u17;
-  od_coeff u18;
-  od_coeff u19;
-  od_coeff u20;
-  od_coeff u21;
-  od_coeff u22;
-  od_coeff u23;
-  od_coeff u24;
-  od_coeff u25;
-  od_coeff u26;
-  od_coeff u27;
-  t0 = x[0];
-  t1 = x[1];
-  t2 = x[2];
-  t3 = x[3];
-  t4 = x[4];
-  t5 = x[5];
-  t6 = x[6];
-  t7 = x[7];
-  /* Stage 0 Butterfly */
-  u7 = t0 - t7;
-  u7h = OD_RSHIFT1(u7);
-  u0 = t0 - u7h;
-  u2 = t2 - t6;
-  u2h = OD_RSHIFT1(u2);
-  u6 = t2 - u2h;
-  u4 = t4 + t5;
-  u4h = OD_RSHIFT1(u4);
-  u5 = t4 - u4h;
-  u1 = t3 - t1;
-  u1h = OD_RSHIFT1(u1);
-  u3 = t3 - u1h;
-  /* Stage 1 Butterfly */
-  q0 = u0 + u2h;
-  q1 = q0 - u2;
-  q4 = u3 + u4h;
-  q5 = q4 - u4;
-  q2 = u7h + u5;
-  q7 = u7 - q2;
-  q6 = u1h + u6;
-  q3 = u1 - q6;
-  /* Stage 2 Half-Butterfly */
-  /*The intermediate sums can overflow 16 bits, but all SIMD instruction sets
-     should be able to compute them without issue (i.e., using PAVGW or
-     V{R}HADD.S16).*/
-  q8 = (q0 + q4 + 1) >> 1;
-  q9 = (q1 + q5) >> 1;
-  q10 = (q2 + q3 + 1) >> 1;
-  q11 = (q7 + q6) >> 1;
-  /* Stage 3 */
-  q12 = t0 + t3;
-  q13 = t0;
-  q14 = t3;
-  q15 = t5 - t6;
-  q16 = t6;
-  q17 = t5;
-  r0 = t2 + t4;
-  r1 = t2 - OD_RSHIFT1(r0);
-  r2 = (r1 - q15 + 1) >> 1;
-  r3 = OD_RSHIFT1(t0);
-  r4 = (r3 - t1 + 1) >> 1;
-  /* q18 = (q6 - q4)/2 + (t0 - q15)/4
-         = (t0 + t2 - t4)/4 - (t1 + t5 - t6)/2 */
-  q18 = r2 + r4;
-  r5 = t5 - (q15 >> 1);
-  r6 = (r0 + t3 + 1) >> 1;
-  r7 = (t7 + r6 + 1) >> 1;
-  /* q19 = (q7 - q0)/2 + (t5 + t6 - t3)/4
-         = (t5 + t6 - t7)/2 - (t2 + t3 + t4)/4 */
-  q19 = r5 - r7;
-  q20 = (q18 - q19) >> 1;
-  /* Stage 4 */
-  q0 = (-5995*q0 + 8192) >> 14;
-  q1 = (-1373*q1 + 4096) >> 13;
-  q2 = (22891*q2 + 16384) >> 15;
-  q3 = (-217*q3 + 512) >> 10;
-  q4 = (13427*q4 + 16384) >> 15;
-  q5 = (-11013*q5 + 8192) >> 14;
-  q6 = (1373*q6 + 1024) >> 11;
-  q7 = (-14077*q7 + 16384) >> 15;
-  q8 = (-1437*q8 + 16384) >> 15;
-  q9 = (27519*q9 + 16384) >> 15;
-  q10 = (-15947*q10 + 16384) >> 15;
-  q11 = (-7891*q11 + 16384) >> 15;
-  q12 = (4897*q12 + 16384) >> 15;
-  q13 = (-5079*q13 + 8192) >> 14;
-  q14 = (365*q14 + 16384) >> 15;
-  q15 = (3325*q15 + 8192) >> 14;
-  q16 = (-5225*q16 + 8192) >> 14;
-  q17 = (-1425*q17 + 8192) >> 14;
-  q18 = (3453*q18 + 16384) >> 15;
-  q19 = (-8421*q19 + 8192) >> 14;
-  q20 = (-20295*q20 + 16384) >> 15;
-  /* Stage 5 */
-  u0 = q0 + q8;
-  u1 = q1 + q9;
-  u2 = q2 + q10;
-  u3 = q3 + q10;
-  u4 = q4 + q8;
-  u5 = q5 + q9;
-  u6 = q6 + q11;
-  u7 = q7 + q11;
-  /* Stage 6 */
-  u10 = u0 + u1;
-  u11 = u0 - u1;
-  u12 = u2 + u7;
-  u13 = u2 - u7;
-  u14 = u3 + u6;
-  u15 = u3 - u6;
-  u16 = u5 + u4;
-  u17 = u5 - u4;
-  /* Stage 7 */
-  u8 = q19 + q20;
-  u9 = q19 - q18;
-  u18 = q12 + u8;
-  u19 = u18 + q13;
-  u20 = u18 + q14;
-  u21 = 2*u9;
-  u22 = q15 + u21;
-  u23 = q16 - u22;
-  u24 = u22 + q17;
-  u25 = 2*u8;
-  u26 = 2*u25;
-  u27 = u25 - u9;
-  /* Stage 8 */
-  y[0] = u14 + u16 + u20;
-  y[1] = u12 - u10 - u25;
-  y[2] = u9 + u13 - u17;
-  y[3] = u9 - u10 - u12 - u19;
-  y[4] = u15 - u11 - u27;
-  y[5] = u23 - u11 - u15;
-  y[6] = u13 + u17 - u24 + u26;
-  y[7] = u16 - u14 + u21 - u25;
-}
-
-void od_bin_fdst8(od_coeff y[8], const od_coeff *x, int xstride) {
-  int i;
-  od_coeff xp[8];
-  od_coeff yp[8];
-  for (i = 0; i < 8; i++) xp[i] = x[i*xstride];
-  od_poly_prod_8(yp, xp);
-  for (i = 0; i < 8; i++) y[OD_DST_8_PERM[i]] = yp[i];
-}
-
-void od_bin_idst8(od_coeff *x, int xstride, const od_coeff y[8]) {
-  int i;
-  od_coeff xp[8];
-  od_coeff yp[8];
-  for (i = 0; i < 8; i++) yp[i] = y[OD_DST_8_PERM[i]];
-  od_poly_prod_8(xp, yp);
-  for (i = 0; i < 8; i++) x[i*xstride] = xp[i];
-}
-#endif
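
Since the Nussbaumer routine above folds the fixed DST kernel into its precomputed Stage 4 constants, it is easy to lose sight of what it actually computes. The reference sketch below is editorial only (od_poly_prod_8_ref() and its explicit h[] argument are hypothetical): a direct polynomial product modulo z^8 + 1, i.e. a negacyclic convolution, which is the operation the 21-multiply version above implements.

/* Editorial reference sketch: naive product Y(z) = X(z)*H(z) mod (z^8 + 1).
   The real od_poly_prod_8() bakes the fixed H(z) into its Stage 4 constants;
   here h[] is passed explicitly for clarity. */
static void od_poly_prod_8_ref(od_coeff y[8], const od_coeff x[8],
                               const od_coeff h[8]) {
  int i;
  int j;
  for (i = 0; i < 8; i++) y[i] = 0;
  for (i = 0; i < 8; i++) {
    for (j = 0; j < 8; j++) {
      /* z^(i+j) reduces to -z^(i+j-8) once the exponent reaches 8. */
      if (i + j < 8) y[i + j] += x[i]*h[j];
      else y[i + j - 8] -= x[i]*h[j];
    }
  }
}

The naive form costs 64 multiplies per call; the factored routine above produces the same coefficients, up to its fixed-point rounding, with 21.
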
-void od_bin_fdct16(od_coeff y[16], const od_coeff *x, int xstride) {
-  int s0;
-  int s1;
-  int s2;
-  int s3;
-  int s4;
-  int s5;
-  int s6;
-  int s7;
-  int s8;
-  int s9;
-  int sa;
-  int sb;
-  int sc;
-  int sd;
-  int se;
-  int sf;
-  s0 = x[0*xstride];
-  s1 = x[1*xstride];
-  s2 = x[2*xstride];
-  s3 = x[3*xstride];
-  s4 = x[4*xstride];
-  s5 = x[5*xstride];
-  s6 = x[6*xstride];
-  s7 = x[7*xstride];
-  s8 = x[8*xstride];
-  s9 = x[9*xstride];
-  sa = x[10*xstride];
-  sb = x[11*xstride];
-  sc = x[12*xstride];
-  sd = x[13*xstride];
-  se = x[14*xstride];
-  sf = x[15*xstride];
-  od_fdct_16(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7,
-    &s8, &s9, &sa, &sb, &sc, &sd, &se, &sf);
-  y[0] = (od_coeff)s0;
-  y[1] = (od_coeff)s8;
-  y[2] = (od_coeff)s4;
-  y[3] = (od_coeff)sc;
-  y[4] = (od_coeff)s2;
-  y[5] = (od_coeff)sa;
-  y[6] = (od_coeff)s6;
-  y[7] = (od_coeff)se;
-  y[8] = (od_coeff)s1;
-  y[9] = (od_coeff)s9;
-  y[10] = (od_coeff)s5;
-  y[11] = (od_coeff)sd;
-  y[12] = (od_coeff)s3;
-  y[13] = (od_coeff)sb;
-  y[14] = (od_coeff)s7;
-  y[15] = (od_coeff)sf;
-}
-
-void od_bin_idct16(od_coeff *x, int xstride, const od_coeff y[16]) {
-  int s0;
-  int s1;
-  int s2;
-  int s3;
-  int s4;
-  int s5;
-  int s6;
-  int s7;
-  int s8;
-  int s9;
-  int sa;
-  int sb;
-  int sc;
-  int sd;
-  int se;
-  int sf;
-  s0 = y[0];
-  s8 = y[1];
-  s4 = y[2];
-  sc = y[3];
-  s2 = y[4];
-  sa = y[5];
-  s6 = y[6];
-  se = y[7];
-  s1 = y[8];
-  s9 = y[9];
-  s5 = y[10];
-  sd = y[11];
-  s3 = y[12];
-  sb = y[13];
-  s7 = y[14];
-  sf = y[15];
-  od_idct_16(&s0, &s8, &s4, &sc, &s2, &sa, &s6, &se,
-    &s1, &s9, &s5, &sd, &s3, &sb, &s7, &sf);
-  x[0*xstride] = (od_coeff)s0;
-  x[1*xstride] = (od_coeff)s1;
-  x[2*xstride] = (od_coeff)s2;
-  x[3*xstride] = (od_coeff)s3;
-  x[4*xstride] = (od_coeff)s4;
-  x[5*xstride] = (od_coeff)s5;
-  x[6*xstride] = (od_coeff)s6;
-  x[7*xstride] = (od_coeff)s7;
-  x[8*xstride] = (od_coeff)s8;
-  x[9*xstride] = (od_coeff)s9;
-  x[10*xstride] = (od_coeff)sa;
-  x[11*xstride] = (od_coeff)sb;
-  x[12*xstride] = (od_coeff)sc;
-  x[13*xstride] = (od_coeff)sd;
-  x[14*xstride] = (od_coeff)se;
-  x[15*xstride] = (od_coeff)sf;
-}
-
-void od_bin_fdst16(od_coeff y[16], const od_coeff *x, int xstride) {
-  int s0;
-  int s1;
-  int s2;
-  int s3;
-  int s4;
-  int s5;
-  int s6;
-  int s7;
-  int s8;
-  int s9;
-  int sa;
-  int sb;
-  int sc;
-  int sd;
-  int se;
-  int sf;
-  s0 = x[0*xstride];
-  s1 = x[1*xstride];
-  s2 = x[2*xstride];
-  s3 = x[3*xstride];
-  s4 = x[4*xstride];
-  s5 = x[5*xstride];
-  s6 = x[6*xstride];
-  s7 = x[7*xstride];
-  s8 = x[8*xstride];
-  s9 = x[9*xstride];
-  sa = x[10*xstride];
-  sb = x[11*xstride];
-  sc = x[12*xstride];
-  sd = x[13*xstride];
-  se = x[14*xstride];
-  sf = x[15*xstride];
-  od_fdst_16(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7,
-    &s8, &s9, &sa, &sb, &sc, &sd, &se, &sf);
-  y[0] = (od_coeff)s0;
-  y[1] = (od_coeff)s8;
-  y[2] = (od_coeff)s4;
-  y[3] = (od_coeff)sc;
-  y[4] = (od_coeff)s2;
-  y[5] = (od_coeff)sa;
-  y[6] = (od_coeff)s6;
-  y[7] = (od_coeff)se;
-  y[8] = (od_coeff)s1;
-  y[9] = (od_coeff)s9;
-  y[10] = (od_coeff)s5;
-  y[11] = (od_coeff)sd;
-  y[12] = (od_coeff)s3;
-  y[13] = (od_coeff)sb;
-  y[14] = (od_coeff)s7;
-  y[15] = (od_coeff)sf;
-}
-
-void od_bin_idst16(od_coeff *x, int xstride, const od_coeff y[16]) {
-  int s0;
-  int s1;
-  int s2;
-  int s3;
-  int s4;
-  int s5;
-  int s6;
-  int s7;
-  int s8;
-  int s9;
-  int sa;
-  int sb;
-  int sc;
-  int sd;
-  int se;
-  int sf;
-  s0 = y[0];
-  s8 = y[1];
-  s4 = y[2];
-  sc = y[3];
-  s2 = y[4];
-  sa = y[5];
-  s6 = y[6];
-  se = y[7];
-  s1 = y[8];
-  s9 = y[9];
-  s5 = y[10];
-  sd = y[11];
-  s3 = y[12];
-  sb = y[13];
-  s7 = y[14];
-  sf = y[15];
-  od_idst_16(&s0, &s8, &s4, &sc, &s2, &sa, &s6, &se,
-    &s1, &s9, &s5, &sd, &s3, &sb, &s7, &sf);
-  x[0*xstride] = (od_coeff)s0;
-  x[1*xstride] = (od_coeff)s1;
-  x[2*xstride] = (od_coeff)s2;
-  x[3*xstride] = (od_coeff)s3;
-  x[4*xstride] = (od_coeff)s4;
-  x[5*xstride] = (od_coeff)s5;
-  x[6*xstride] = (od_coeff)s6;
-  x[7*xstride] = (od_coeff)s7;
-  x[8*xstride] = (od_coeff)s8;
-  x[9*xstride] = (od_coeff)s9;
-  x[10*xstride] = (od_coeff)sa;
-  x[11*xstride] = (od_coeff)sb;
-  x[12*xstride] = (od_coeff)sc;
-  x[13*xstride] = (od_coeff)sd;
-  x[14*xstride] = (od_coeff)se;
-  x[15*xstride] = (od_coeff)sf;
-}
-
-void od_bin_fdct32(od_coeff y[32], const od_coeff *x, int xstride) {
-  /*215 adds, 38 shifts, 87 "muls".*/
-  int t0;
-  int t1;
-  int t2;
-  int t3;
-  int t4;
-  int t5;
-  int t6;
-  int t7;
-  int t8;
-  int t9;
-  int ta;
-  int tb;
-  int tc;
-  int td;
-  int te;
-  int tf;
-  int tg;
-  int th;
-  int ti;
-  int tj;
-  int tk;
-  int tl;
-  int tm;
-  int tn;
-  int to;
-  int tp;
-  int tq;
-  int tr;
-  int ts;
-  int tt;
-  int tu;
-  int tv;
-  t0 = x[0*xstride];
-  tg = x[1*xstride];
-  t8 = x[2*xstride];
-  to = x[3*xstride];
-  t4 = x[4*xstride];
-  tk = x[5*xstride];
-  tc = x[6*xstride];
-  ts = x[7*xstride];
-  t2 = x[8*xstride];
-  ti = x[9*xstride];
-  ta = x[10*xstride];
-  tq = x[11*xstride];
-  t6 = x[12*xstride];
-  tm = x[13*xstride];
-  te = x[14*xstride];
-  tu = x[15*xstride];
-  t1 = x[16*xstride];
-  th = x[17*xstride];
-  t9 = x[18*xstride];
-  tp = x[19*xstride];
-  t5 = x[20*xstride];
-  tl = x[21*xstride];
-  td = x[22*xstride];
-  tt = x[23*xstride];
-  t3 = x[24*xstride];
-  tj = x[25*xstride];
-  tb = x[26*xstride];
-  tr = x[27*xstride];
-  t7 = x[28*xstride];
-  tn = x[29*xstride];
-  tf = x[30*xstride];
-  tv = x[31*xstride];
-  od_fdct_32(
-    &t0, &tg, &t8, &to, &t4, &tk, &tc, &ts, &t2, &ti, &ta, &tq, &t6, &tm, &te,
-    &tu, &t1, &th, &t9, &tp, &t5, &tl, &td, &tt, &t3, &tj, &tb, &tr, &t7, &tn,
-    &tf, &tv);
-  y[0] = (od_coeff)t0;
-  y[1] = (od_coeff)t1;
-  y[2] = (od_coeff)t2;
-  y[3] = (od_coeff)t3;
-  y[4] = (od_coeff)t4;
-  y[5] = (od_coeff)t5;
-  y[6] = (od_coeff)t6;
-  y[7] = (od_coeff)t7;
-  y[8] = (od_coeff)t8;
-  y[9] = (od_coeff)t9;
-  y[10] = (od_coeff)ta;
-  y[11] = (od_coeff)tb;
-  y[12] = (od_coeff)tc;
-  y[13] = (od_coeff)td;
-  y[14] = (od_coeff)te;
-  y[15] = (od_coeff)tf;
-  y[16] = (od_coeff)tg;
-  y[17] = (od_coeff)th;
-  y[18] = (od_coeff)ti;
-  y[19] = (od_coeff)tj;
-  y[20] = (od_coeff)tk;
-  y[21] = (od_coeff)tl;
-  y[22] = (od_coeff)tm;
-  y[23] = (od_coeff)tn;
-  y[24] = (od_coeff)to;
-  y[25] = (od_coeff)tp;
-  y[26] = (od_coeff)tq;
-  y[27] = (od_coeff)tr;
-  y[28] = (od_coeff)ts;
-  y[29] = (od_coeff)tt;
-  y[30] = (od_coeff)tu;
-  y[31] = (od_coeff)tv;
-}
-
-void od_bin_idct32(od_coeff *x, int xstride, const od_coeff y[32]) {
-  int t0;
-  int t1;
-  int t2;
-  int t3;
-  int t4;
-  int t5;
-  int t6;
-  int t7;
-  int t8;
-  int t9;
-  int ta;
-  int tb;
-  int tc;
-  int td;
-  int te;
-  int tf;
-  int tg;
-  int th;
-  int ti;
-  int tj;
-  int tk;
-  int tl;
-  int tm;
-  int tn;
-  int to;
-  int tp;
-  int tq;
-  int tr;
-  int ts;
-  int tt;
-  int tu;
-  int tv;
-  t0 = y[0];
-  tg = y[1];
-  t8 = y[2];
-  to = y[3];
-  t4 = y[4];
-  tk = y[5];
-  tc = y[6];
-  ts = y[7];
-  t2 = y[8];
-  ti = y[9];
-  ta = y[10];
-  tq = y[11];
-  t6 = y[12];
-  tm = y[13];
-  te = y[14];
-  tu = y[15];
-  t1 = y[16];
-  th = y[17];
-  t9 = y[18];
-  tp = y[19];
-  t5 = y[20];
-  tl = y[21];
-  td = y[22];
-  tt = y[23];
-  t3 = y[24];
-  tj = y[25];
-  tb = y[26];
-  tr = y[27];
-  t7 = y[28];
-  tn = y[29];
-  tf = y[30];
-  tv = y[31];
-  od_idct_32(
-    &t0, &tg, &t8, &to, &t4, &tk, &tc, &ts, &t2, &ti, &ta, &tq, &t6, &tm, &te,
-    &tu, &t1, &th, &t9, &tp, &t5, &tl, &td, &tt, &t3, &tj, &tb, &tr, &t7, &tn,
-    &tf, &tv);
-  x[0*xstride] = (od_coeff)t0;
-  x[1*xstride] = (od_coeff)t1;
-  x[2*xstride] = (od_coeff)t2;
-  x[3*xstride] = (od_coeff)t3;
-  x[4*xstride] = (od_coeff)t4;
-  x[5*xstride] = (od_coeff)t5;
-  x[6*xstride] = (od_coeff)t6;
-  x[7*xstride] = (od_coeff)t7;
-  x[8*xstride] = (od_coeff)t8;
-  x[9*xstride] = (od_coeff)t9;
-  x[10*xstride] = (od_coeff)ta;
-  x[11*xstride] = (od_coeff)tb;
-  x[12*xstride] = (od_coeff)tc;
-  x[13*xstride] = (od_coeff)td;
-  x[14*xstride] = (od_coeff)te;
-  x[15*xstride] = (od_coeff)tf;
-  x[16*xstride] = (od_coeff)tg;
-  x[17*xstride] = (od_coeff)th;
-  x[18*xstride] = (od_coeff)ti;
-  x[19*xstride] = (od_coeff)tj;
-  x[20*xstride] = (od_coeff)tk;
-  x[21*xstride] = (od_coeff)tl;
-  x[22*xstride] = (od_coeff)tm;
-  x[23*xstride] = (od_coeff)tn;
-  x[24*xstride] = (od_coeff)to;
-  x[25*xstride] = (od_coeff)tp;
-  x[26*xstride] = (od_coeff)tq;
-  x[27*xstride] = (od_coeff)tr;
-  x[28*xstride] = (od_coeff)ts;
-  x[29*xstride] = (od_coeff)tt;
-  x[30*xstride] = (od_coeff)tu;
-  x[31*xstride] = (od_coeff)tv;
-}
-
-void od_bin_fdst32(od_coeff y[32], const od_coeff *x, int xstride) {
-  od_coeff t0;
-  od_coeff t1;
-  od_coeff t2;
-  od_coeff t3;
-  od_coeff t4;
-  od_coeff t5;
-  od_coeff t6;
-  od_coeff t7;
-  od_coeff t8;
-  od_coeff t9;
-  od_coeff ta;
-  od_coeff tb;
-  od_coeff tc;
-  od_coeff td;
-  od_coeff te;
-  od_coeff tf;
-  od_coeff tg;
-  od_coeff th;
-  od_coeff ti;
-  od_coeff tj;
-  od_coeff tk;
-  od_coeff tl;
-  od_coeff tm;
-  od_coeff tn;
-  od_coeff to;
-  od_coeff tp;
-  od_coeff tq;
-  od_coeff tr;
-  od_coeff ts;
-  od_coeff tt;
-  od_coeff tu;
-  od_coeff tv;
-  #if !CONFIG_DAALA_TX_DST32
-    assert(0 && "od_bin_fdst32() called when !CONFIG_DAALA_TX_DST32");
-  #endif
-  t0 = x[0*xstride];
-  t1 = x[1*xstride];
-  t2 = x[2*xstride];
-  t3 = x[3*xstride];
-  t4 = x[4*xstride];
-  t5 = x[5*xstride];
-  t6 = x[6*xstride];
-  t7 = x[7*xstride];
-  t8 = x[8*xstride];
-  t9 = x[9*xstride];
-  ta = x[10*xstride];
-  tb = x[11*xstride];
-  tc = x[12*xstride];
-  td = x[13*xstride];
-  te = x[14*xstride];
-  tf = x[15*xstride];
-  tg = x[16*xstride];
-  th = x[17*xstride];
-  ti = x[18*xstride];
-  tj = x[19*xstride];
-  tk = x[20*xstride];
-  tl = x[21*xstride];
-  tm = x[22*xstride];
-  tn = x[23*xstride];
-  to = x[24*xstride];
-  tp = x[25*xstride];
-  tq = x[26*xstride];
-  tr = x[27*xstride];
-  ts = x[28*xstride];
-  tt = x[29*xstride];
-  tu = x[30*xstride];
-  tv = x[31*xstride];
-  OD_FDST_32_PR(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, ta, tb, tc, td, te, tf,
-    tg, th, ti, tj, tk, tl, tm, tn, to, tp, tq, tr, ts, tt, tu, tv);
-  y[0] = t0;
-  y[1] = tg;
-  y[2] = t8;
-  y[3] = to;
-  y[4] = t4;
-  y[5] = tk;
-  y[6] = tc;
-  y[7] = ts;
-  y[8] = t2;
-  y[9] = ti;
-  y[10] = ta;
-  y[11] = tq;
-  y[12] = t6;
-  y[13] = tm;
-  y[14] = te;
-  y[15] = tu;
-  y[16] = t1;
-  y[17] = th;
-  y[18] = t9;
-  y[19] = tp;
-  y[20] = t5;
-  y[21] = tl;
-  y[22] = td;
-  y[23] = tt;
-  y[24] = t3;
-  y[25] = tj;
-  y[26] = tb;
-  y[27] = tr;
-  y[28] = t7;
-  y[29] = tn;
-  y[30] = tf;
-  y[31] = tv;
-}
-
-void od_bin_idst32(od_coeff *x, int xstride, const od_coeff y[32]) {
-  od_coeff t0;
-  od_coeff t1;
-  od_coeff t2;
-  od_coeff t3;
-  od_coeff t4;
-  od_coeff t5;
-  od_coeff t6;
-  od_coeff t7;
-  od_coeff t8;
-  od_coeff t9;
-  od_coeff ta;
-  od_coeff tb;
-  od_coeff tc;
-  od_coeff td;
-  od_coeff te;
-  od_coeff tf;
-  od_coeff tg;
-  od_coeff th;
-  od_coeff ti;
-  od_coeff tj;
-  od_coeff tk;
-  od_coeff tl;
-  od_coeff tm;
-  od_coeff tn;
-  od_coeff to;
-  od_coeff tp;
-  od_coeff tq;
-  od_coeff tr;
-  od_coeff ts;
-  od_coeff tt;
-  od_coeff tu;
-  od_coeff tv;
-  #if !CONFIG_DAALA_TX_DST32
-    assert(0 && "od_bin_idst32() called when !CONFIG_DAALA_TX_DST32");
-  #endif
-  t0 = y[0];
-  tg = y[1];
-  t8 = y[2];
-  to = y[3];
-  t4 = y[4];
-  tk = y[5];
-  tc = y[6];
-  ts = y[7];
-  t2 = y[8];
-  ti = y[9];
-  ta = y[10];
-  tq = y[11];
-  t6 = y[12];
-  tm = y[13];
-  te = y[14];
-  tu = y[15];
-  t1 = y[16];
-  th = y[17];
-  t9 = y[18];
-  tp = y[19];
-  t5 = y[20];
-  tl = y[21];
-  td = y[22];
-  tt = y[23];
-  t3 = y[24];
-  tj = y[25];
-  tb = y[26];
-  tr = y[27];
-  t7 = y[28];
-  tn = y[29];
-  tf = y[30];
-  tv = y[31];
-  OD_IDST_32_PR(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, t6, tm, te, tu,
-    t1, th, t9, tp, t5, tl, td, tt, t3, tj, tb, tr, t7, tn, tf, tv);
-  x[0*xstride] = t0;
-  x[1*xstride] = t1;
-  x[2*xstride] = t2;
-  x[3*xstride] = t3;
-  x[4*xstride] = t4;
-  x[5*xstride] = t5;
-  x[6*xstride] = t6;
-  x[7*xstride] = t7;
-  x[8*xstride] = t8;
-  x[9*xstride] = t9;
-  x[10*xstride] = ta;
-  x[11*xstride] = tb;
-  x[12*xstride] = tc;
-  x[13*xstride] = td;
-  x[14*xstride] = te;
-  x[15*xstride] = tf;
-  x[16*xstride] = tg;
-  x[17*xstride] = th;
-  x[18*xstride] = ti;
-  x[19*xstride] = tj;
-  x[20*xstride] = tk;
-  x[21*xstride] = tl;
-  x[22*xstride] = tm;
-  x[23*xstride] = tn;
-  x[24*xstride] = to;
-  x[25*xstride] = tp;
-  x[26*xstride] = tq;
-  x[27*xstride] = tr;
-  x[28*xstride] = ts;
-  x[29*xstride] = tt;
-  x[30*xstride] = tu;
-  x[31*xstride] = tv;
-}
-
-#if CONFIG_TX64X64
-void od_bin_fdct64(od_coeff y[64], const od_coeff *x, int xstride) {
-  int t0;
-  int t1;
-  int t2;
-  int t3;
-  int t4;
-  int t5;
-  int t6;
-  int t7;
-  int t8;
-  int t9;
-  int ta;
-  int tb;
-  int tc;
-  int td;
-  int te;
-  int tf;
-  int tg;
-  int th;
-  int ti;
-  int tj;
-  int tk;
-  int tl;
-  int tm;
-  int tn;
-  int to;
-  int tp;
-  int tq;
-  int tr;
-  int ts;
-  int tt;
-  int tu;
-  int tv;
-  int tw;
-  int tx;
-  int ty;
-  int tz;
-  int tA;
-  int tB;
-  int tC;
-  int tD;
-  int tE;
-  int tF;
-  int tG;
-  int tH;
-  int tI;
-  int tJ;
-  int tK;
-  int tL;
-  int tM;
-  int tN;
-  int tO;
-  int tP;
-  int tQ;
-  int tR;
-  int tS;
-  int tT;
-  int tU;
-  int tV;
-  int tW;
-  int tX;
-  int tY;
-  int tZ;
-  int t_;
-  int t;
-  t0 = x[0*xstride];
-  tw = x[1*xstride];
-  tg = x[2*xstride];
-  tM = x[3*xstride];
-  t8 = x[4*xstride];
-  tE = x[5*xstride];
-  to = x[6*xstride];
-  tU = x[7*xstride];
-  t4 = x[8*xstride];
-  tA = x[9*xstride];
-  tk = x[10*xstride];
-  tQ = x[11*xstride];
-  tc = x[12*xstride];
-  tI = x[13*xstride];
-  ts = x[14*xstride];
-  tY = x[15*xstride];
-  t2 = x[16*xstride];
-  ty = x[17*xstride];
-  ti = x[18*xstride];
-  tO = x[19*xstride];
-  ta = x[20*xstride];
-  tG = x[21*xstride];
-  tq = x[22*xstride];
-  tW = x[23*xstride];
-  t6 = x[24*xstride];
-  tC = x[25*xstride];
-  tm = x[26*xstride];
-  tS = x[27*xstride];
-  te = x[28*xstride];
-  tK = x[29*xstride];
-  tu = x[30*xstride];
-  t_ = x[31*xstride];
-  t1 = x[32*xstride];
-  tx = x[33*xstride];
-  th = x[34*xstride];
-  tN = x[35*xstride];
-  t9 = x[36*xstride];
-  tF = x[37*xstride];
-  tp = x[38*xstride];
-  tV = x[39*xstride];
-  t5 = x[40*xstride];
-  tB = x[41*xstride];
-  tl = x[42*xstride];
-  tR = x[43*xstride];
-  td = x[44*xstride];
-  tJ = x[45*xstride];
-  tt = x[46*xstride];
-  tZ = x[47*xstride];
-  t3 = x[48*xstride];
-  tz = x[49*xstride];
-  tj = x[50*xstride];
-  tP = x[51*xstride];
-  tb = x[52*xstride];
-  tH = x[53*xstride];
-  tr = x[54*xstride];
-  tX = x[55*xstride];
-  t7 = x[56*xstride];
-  tD = x[57*xstride];
-  tn = x[58*xstride];
-  tT = x[59*xstride];
-  tf = x[60*xstride];
-  tL = x[61*xstride];
-  tv = x[62*xstride];
-  t = x[63*xstride];
-  OD_FDCT_64_PR(t0, tw, tg, tM, t8, tE, to, tU, t4, tA, tk, tQ, tc, tI, ts, tY,
-    t2, ty, ti, tO, ta, tG, tq, tW, t6, tC, tm, tS, te, tK, tu, t_, t1, tx, th,
-    tN, t9, tF, tp, tV, t5, tB, tl, tR, td, tJ, tt, tZ, t3, tz, tj, tP, tb, tH,
-    tr, tX, t7, tD, tn, tT, tf, tL, tv, t);
-  y[0] = (od_coeff)t0;
-  y[1] = (od_coeff)t1;
-  y[2] = (od_coeff)t2;
-  y[3] = (od_coeff)t3;
-  y[4] = (od_coeff)t4;
-  y[5] = (od_coeff)t5;
-  y[6] = (od_coeff)t6;
-  y[7] = (od_coeff)t7;
-  y[8] = (od_coeff)t8;
-  y[9] = (od_coeff)t9;
-  y[10] = (od_coeff)ta;
-  y[11] = (od_coeff)tb;
-  y[12] = (od_coeff)tc;
-  y[13] = (od_coeff)td;
-  y[14] = (od_coeff)te;
-  y[15] = (od_coeff)tf;
-  y[16] = (od_coeff)tg;
-  y[17] = (od_coeff)th;
-  y[18] = (od_coeff)ti;
-  y[19] = (od_coeff)tj;
-  y[20] = (od_coeff)tk;
-  y[21] = (od_coeff)tl;
-  y[22] = (od_coeff)tm;
-  y[23] = (od_coeff)tn;
-  y[24] = (od_coeff)to;
-  y[25] = (od_coeff)tp;
-  y[26] = (od_coeff)tq;
-  y[27] = (od_coeff)tr;
-  y[28] = (od_coeff)ts;
-  y[29] = (od_coeff)tt;
-  y[30] = (od_coeff)tu;
-  y[31] = (od_coeff)tv;
-  y[32] = (od_coeff)tw;
-  y[33] = (od_coeff)tx;
-  y[34] = (od_coeff)ty;
-  y[35] = (od_coeff)tz;
-  y[36] = (od_coeff)tA;
-  y[37] = (od_coeff)tB;
-  y[38] = (od_coeff)tC;
-  y[39] = (od_coeff)tD;
-  y[40] = (od_coeff)tE;
-  y[41] = (od_coeff)tF;
-  y[42] = (od_coeff)tG;
-  y[43] = (od_coeff)tH;
-  y[44] = (od_coeff)tI;
-  y[45] = (od_coeff)tJ;
-  y[46] = (od_coeff)tK;
-  y[47] = (od_coeff)tL;
-  y[48] = (od_coeff)tM;
-  y[49] = (od_coeff)tN;
-  y[50] = (od_coeff)tO;
-  y[51] = (od_coeff)tP;
-  y[52] = (od_coeff)tQ;
-  y[53] = (od_coeff)tR;
-  y[54] = (od_coeff)tS;
-  y[55] = (od_coeff)tT;
-  y[56] = (od_coeff)tU;
-  y[57] = (od_coeff)tV;
-  y[58] = (od_coeff)tW;
-  y[59] = (od_coeff)tX;
-  y[60] = (od_coeff)tY;
-  y[61] = (od_coeff)tZ;
-  y[62] = (od_coeff)t_;
-  y[63] = (od_coeff)t;
-}
-
-void od_bin_idct64(od_coeff *x, int xstride, const od_coeff y[64]) {
-  int t0;
-  int t1;
-  int t2;
-  int t3;
-  int t4;
-  int t5;
-  int t6;
-  int t7;
-  int t8;
-  int t9;
-  int ta;
-  int tb;
-  int tc;
-  int td;
-  int te;
-  int tf;
-  int tg;
-  int th;
-  int ti;
-  int tj;
-  int tk;
-  int tl;
-  int tm;
-  int tn;
-  int to;
-  int tp;
-  int tq;
-  int tr;
-  int ts;
-  int tt;
-  int tu;
-  int tv;
-  int tw;
-  int tx;
-  int ty;
-  int tz;
-  int tA;
-  int tB;
-  int tC;
-  int tD;
-  int tE;
-  int tF;
-  int tG;
-  int tH;
-  int tI;
-  int tJ;
-  int tK;
-  int tL;
-  int tM;
-  int tN;
-  int tO;
-  int tP;
-  int tQ;
-  int tR;
-  int tS;
-  int tT;
-  int tU;
-  int tV;
-  int tW;
-  int tX;
-  int tY;
-  int tZ;
-  int t_;
-  int t;
-  t0 = y[0];
-  tw = y[1];
-  tg = y[2];
-  tM = y[3];
-  t8 = y[4];
-  tE = y[5];
-  to = y[6];
-  tU = y[7];
-  t4 = y[8];
-  tA = y[9];
-  tk = y[10];
-  tQ = y[11];
-  tc = y[12];
-  tI = y[13];
-  ts = y[14];
-  tY = y[15];
-  t2 = y[16];
-  ty = y[17];
-  ti = y[18];
-  tO = y[19];
-  ta = y[20];
-  tG = y[21];
-  tq = y[22];
-  tW = y[23];
-  t6 = y[24];
-  tC = y[25];
-  tm = y[26];
-  tS = y[27];
-  te = y[28];
-  tK = y[29];
-  tu = y[30];
-  t_ = y[31];
-  t1 = y[32];
-  tx = y[33];
-  th = y[34];
-  tN = y[35];
-  t9 = y[36];
-  tF = y[37];
-  tp = y[38];
-  tV = y[39];
-  t5 = y[40];
-  tB = y[41];
-  tl = y[42];
-  tR = y[43];
-  td = y[44];
-  tJ = y[45];
-  tt = y[46];
-  tZ = y[47];
-  t3 = y[48];
-  tz = y[49];
-  tj = y[50];
-  tP = y[51];
-  tb = y[52];
-  tH = y[53];
-  tr = y[54];
-  tX = y[55];
-  t7 = y[56];
-  tD = y[57];
-  tn = y[58];
-  tT = y[59];
-  tf = y[60];
-  tL = y[61];
-  tv = y[62];
-  t = y[63];
-  OD_IDCT_64_PR(t0, tw, tg, tM, t8, tE, to, tU, t4, tA, tk, tQ, tc, tI, ts, tY,
-    t2, ty, ti, tO, ta, tG, tq, tW, t6, tC, tm, tS, te, tK, tu, t_, t1, tx, th,
-    tN, t9, tF, tp, tV, t5, tB, tl, tR, td, tJ, tt, tZ, t3, tz, tj, tP, tb, tH,
-    tr, tX, t7, tD, tn, tT, tf, tL, tv, t);
-  x[0*xstride] = (od_coeff)t0;
-  x[1*xstride] = (od_coeff)t1;
-  x[2*xstride] = (od_coeff)t2;
-  x[3*xstride] = (od_coeff)t3;
-  x[4*xstride] = (od_coeff)t4;
-  x[5*xstride] = (od_coeff)t5;
-  x[6*xstride] = (od_coeff)t6;
-  x[7*xstride] = (od_coeff)t7;
-  x[8*xstride] = (od_coeff)t8;
-  x[9*xstride] = (od_coeff)t9;
-  x[10*xstride] = (od_coeff)ta;
-  x[11*xstride] = (od_coeff)tb;
-  x[12*xstride] = (od_coeff)tc;
-  x[13*xstride] = (od_coeff)td;
-  x[14*xstride] = (od_coeff)te;
-  x[15*xstride] = (od_coeff)tf;
-  x[16*xstride] = (od_coeff)tg;
-  x[17*xstride] = (od_coeff)th;
-  x[18*xstride] = (od_coeff)ti;
-  x[19*xstride] = (od_coeff)tj;
-  x[20*xstride] = (od_coeff)tk;
-  x[21*xstride] = (od_coeff)tl;
-  x[22*xstride] = (od_coeff)tm;
-  x[23*xstride] = (od_coeff)tn;
-  x[24*xstride] = (od_coeff)to;
-  x[25*xstride] = (od_coeff)tp;
-  x[26*xstride] = (od_coeff)tq;
-  x[27*xstride] = (od_coeff)tr;
-  x[28*xstride] = (od_coeff)ts;
-  x[29*xstride] = (od_coeff)tt;
-  x[30*xstride] = (od_coeff)tu;
-  x[31*xstride] = (od_coeff)tv;
-  x[32*xstride] = (od_coeff)tw;
-  x[33*xstride] = (od_coeff)tx;
-  x[34*xstride] = (od_coeff)ty;
-  x[35*xstride] = (od_coeff)tz;
-  x[36*xstride] = (od_coeff)tA;
-  x[37*xstride] = (od_coeff)tB;
-  x[38*xstride] = (od_coeff)tC;
-  x[39*xstride] = (od_coeff)tD;
-  x[40*xstride] = (od_coeff)tE;
-  x[41*xstride] = (od_coeff)tF;
-  x[42*xstride] = (od_coeff)tG;
-  x[43*xstride] = (od_coeff)tH;
-  x[44*xstride] = (od_coeff)tI;
-  x[45*xstride] = (od_coeff)tJ;
-  x[46*xstride] = (od_coeff)tK;
-  x[47*xstride] = (od_coeff)tL;
-  x[48*xstride] = (od_coeff)tM;
-  x[49*xstride] = (od_coeff)tN;
-  x[50*xstride] = (od_coeff)tO;
-  x[51*xstride] = (od_coeff)tP;
-  x[52*xstride] = (od_coeff)tQ;
-  x[53*xstride] = (od_coeff)tR;
-  x[54*xstride] = (od_coeff)tS;
-  x[55*xstride] = (od_coeff)tT;
-  x[56*xstride] = (od_coeff)tU;
-  x[57*xstride] = (od_coeff)tV;
-  x[58*xstride] = (od_coeff)tW;
-  x[59*xstride] = (od_coeff)tX;
-  x[60*xstride] = (od_coeff)tY;
-  x[61*xstride] = (od_coeff)tZ;
-  x[62*xstride] = (od_coeff)t_;
-  x[63*xstride] = (od_coeff)t;
-}
-#endif
-
-void od_bin_fidtx4(od_coeff y[4], const od_coeff *x, int xstride) {
-  int i;
-  for (i = 0; i < 4; i++)
-    y[i] = x[i*xstride];
-}
-
-void od_bin_fidtx8(od_coeff y[8], const od_coeff *x, int xstride) {
-  int i;
-  for (i = 0; i < 8; i++)
-    y[i] = x[i*xstride];
-}
-
-void od_bin_fidtx16(od_coeff y[16], const od_coeff *x, int xstride) {
-  int i;
-  for (i = 0; i < 16; i++)
-    y[i] = x[i*xstride];
-}
-
-void od_bin_fidtx32(od_coeff y[32], const od_coeff *x, int xstride) {
-  int i;
-  for (i = 0; i < 32; i++)
-    y[i] = x[i*xstride];
-}
-
-#if CONFIG_TX64X64
-void od_bin_fidtx64(od_coeff y[64], const od_coeff *x, int xstride) {
-  int i;
-  for (i = 0; i < 64; i++)
-    y[i] = x[i*xstride];
-}
-#endif
-
-void od_bin_iidtx4(od_coeff *x, int xstride, const od_coeff y[4]) {
-  int i;
-  for (i = 0; i < 4; i++)
-    x[i*xstride] = y[i];
-}
-
-void od_bin_iidtx8(od_coeff *x, int xstride, const od_coeff y[8]) {
-  int i;
-  for (i = 0; i < 8; i++)
-    x[i*xstride] = y[i];
-}
-
-void od_bin_iidtx16(od_coeff *x, int xstride, const od_coeff y[16]) {
-  int i;
-  for (i = 0; i < 16; i++)
-    x[i*xstride] = y[i];
-}
-
-void od_bin_iidtx32(od_coeff *x, int xstride, const od_coeff y[32]) {
-  int i;
-  for (i = 0; i < 32; i++)
-    x[i*xstride] = y[i];
-}
-
-#if CONFIG_TX64X64
-void od_bin_iidtx64(od_coeff *x, int xstride, const od_coeff y[64]) {
-  int i;
-  for (i = 0; i < 64; i++)
-    x[i*xstride] = y[i];
-}
-#endif
-
-// Below are intermediate wrappers that handle the case when
-// tran_low_t is a smaller type than od_coeff
-void daala_fdct4(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  od_coeff x[4];
-  od_coeff y[4];
-  for (i = 0; i < 4; i++) x[i] = (od_coeff)input[i];
-  od_bin_fdct4(y, x, 1);
-  for (i = 0; i < 4; i++) output[i] = (tran_low_t)y[i];
-}
-
-void daala_idct4(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  od_coeff x[4];
-  od_coeff y[4];
-  for (i = 0; i < 4; i++) y[i] = input[i];
-  od_bin_idct4(x, 1, y);
-  for (i = 0; i < 4; i++) output[i] = (tran_low_t)x[i];
-}
-
-void daala_fdst4(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  od_coeff x[4];
-  od_coeff y[4];
-  for (i = 0; i < 4; i++) x[i] = (od_coeff)input[i];
-  od_bin_fdst4(y, x, 1);
-  for (i = 0; i < 4; i++) output[i] = (tran_low_t)y[i];
-}
-
-void daala_idst4(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  od_coeff x[4];
-  od_coeff y[4];
-  for (i = 0; i < 4; i++) y[i] = input[i];
-  od_bin_idst4(x, 1, y);
-  for (i = 0; i < 4; i++) output[i] = (tran_low_t)x[i];
-}
-
-void daala_idtx4(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  for (i = 0; i < 4; i++) output[i] = input[i];
-}
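
A minimal usage sketch of the 4-point wrappers above (illustrative only; the helper name and the sample values are not from the tree):

    /* Drive the 4-point wrappers on a short column of residuals.  The
       lifting-based kernels are designed so that the inverse undoes the
       forward step, so rec[] matches in[] (at worst to within the rounding
       of the fixed-point constants). */
    static void daala_tx4_roundtrip_example(void) {
      tran_low_t in[4] = { 16, -8, 4, 0 };
      tran_low_t coeffs[4];
      tran_low_t rec[4];
      daala_fdct4(in, coeffs);  /* forward 4-point DCT */
      daala_idct4(coeffs, rec); /* inverse transform of the coefficients */
    }
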
-
-void daala_fdct8(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  od_coeff x[8];
-  od_coeff y[8];
-  for (i = 0; i < 8; i++) x[i] = (od_coeff)input[i];
-  od_bin_fdct8(y, x, 1);
-  for (i = 0; i < 8; i++) output[i] = (tran_low_t)y[i];
-}
-
-void daala_idct8(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  od_coeff x[8];
-  od_coeff y[8];
-  for (i = 0; i < 8; i++) y[i] = (od_coeff)input[i];
-  od_bin_idct8(x, 1, y);
-  for (i = 0; i < 8; i++) output[i] = (tran_low_t)x[i];
-}
-
-void daala_fdst8(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  od_coeff x[8];
-  od_coeff y[8];
-  for (i = 0; i < 8; i++) x[i] = (od_coeff)input[i];
-  od_bin_fdst8(y, x, 1);
-  for (i = 0; i < 8; i++) output[i] = (tran_low_t)y[i];
-}
-
-void daala_idst8(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  od_coeff x[8];
-  od_coeff y[8];
-  for (i = 0; i < 8; i++) y[i] = (od_coeff)input[i];
-  od_bin_idst8(x, 1, y);
-  for (i = 0; i < 8; i++) output[i] = (tran_low_t)x[i];
-}
-
-void daala_idtx8(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  for (i = 0; i < 8; i++) output[i] = input[i];
-}
-
-void daala_fdct16(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  od_coeff x[16];
-  od_coeff y[16];
-  for (i = 0; i < 16; i++) x[i] = (od_coeff)input[i];
-  od_bin_fdct16(y, x, 1);
-  for (i = 0; i < 16; i++) output[i] = (tran_low_t)y[i];
-}
-
-void daala_idct16(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  od_coeff x[16];
-  od_coeff y[16];
-  for (i = 0; i < 16; i++) y[i] = (od_coeff)input[i];
-  od_bin_idct16(x, 1, y);
-  for (i = 0; i < 16; i++) output[i] = (tran_low_t)x[i];
-}
-
-void daala_fdst16(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  od_coeff x[16];
-  od_coeff y[16];
-  for (i = 0; i < 16; i++) x[i] = (od_coeff)input[i];
-  od_bin_fdst16(y, x, 1);
-  for (i = 0; i < 16; i++) output[i] = (tran_low_t)y[i];
-}
-
-void daala_idst16(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  od_coeff x[16];
-  od_coeff y[16];
-  for (i = 0; i < 16; i++) y[i] = (od_coeff)input[i];
-  od_bin_idst16(x, 1, y);
-  for (i = 0; i < 16; i++) output[i] = (tran_low_t)x[i];
-}
-
-void daala_idtx16(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  for (i = 0; i < 16; i++) output[i] = input[i];
-}
-
-void daala_fdct32(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  od_coeff x[32];
-  od_coeff y[32];
-  for (i = 0; i < 32; i++) x[i] = (od_coeff)input[i];
-  od_bin_fdct32(y, x, 1);
-  for (i = 0; i < 32; i++) output[i] = (tran_low_t)y[i];
-}
-
-void daala_idct32(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  od_coeff x[32];
-  od_coeff y[32];
-  for (i = 0; i < 32; i++) y[i] = (od_coeff)input[i];
-  od_bin_idct32(x, 1, y);
-  for (i = 0; i < 32; i++) output[i] = (tran_low_t)x[i];
-}
-
-void daala_fdst32(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  od_coeff x[32];
-  od_coeff y[32];
-  for (i = 0; i < 32; i++) x[i] = (od_coeff)input[i];
-  od_bin_fdst32(y, x, 1);
-  for (i = 0; i < 32; i++) output[i] = (tran_low_t)y[i];
-}
-
-void daala_idst32(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  od_coeff x[32];
-  od_coeff y[32];
-  for (i = 0; i < 32; i++) y[i] = input[i];
-  od_bin_idst32(x, 1, y);
-  for (i = 0; i < 32; i++) output[i] = (tran_low_t)x[i];
-}
-
-void daala_idtx32(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  for (i = 0; i < 32; i++) output[i] = input[i];
-}
-
-#if CONFIG_TX64X64
-void daala_fdct64(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  od_coeff x[64];
-  od_coeff y[64];
-  for (i = 0; i < 64; i++) x[i] = (od_coeff)input[i];
-  od_bin_fdct64(y, x, 1);
-  for (i = 0; i < 64; i++) output[i] = (tran_low_t)y[i];
-}
-
-void daala_idct64(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  od_coeff x[64];
-  od_coeff y[64];
-  for (i = 0; i < 64; i++) y[i] = (od_coeff)input[i];
-  od_bin_idct64(x, 1, y);
-  for (i = 0; i < 64; i++) output[i] = (tran_low_t)x[i];
-}
-
-/* Preserve the "half-right" transform behavior. */
-void daala_fdst64(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  tran_low_t inputhalf[32];
-  for (i = 0; i < 32; ++i) {
-    output[32 + i] = input[i];
-  }
-  for (i = 0; i < 32; ++i) {
-    inputhalf[i] = input[i + 32];
-  }
-  daala_fdct32(inputhalf, output);
-}
-
-/* Preserve the "half-right" transform behavior. */
-void daala_idst64(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  tran_low_t inputhalf[32];
-  for (i = 0; i < 32; ++i) {
-    inputhalf[i] = input[i];
-  }
-  for (i = 0; i < 32; ++i) {
-    output[i] = input[32 + i];
-  }
-  daala_idct32(inputhalf, output + 32);
-}
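
Read together, the two wrappers above implement "half-right" as follows: the forward 64-point DST applies the 32-point DCT to the second half of the input and stores the result in outputs 0..31, while the first half of the input is passed straight through to outputs 32..63; the inverse undoes exactly that arrangement.
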
-
-void daala_idtx64(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  for (i = 0; i < 64; i++) output[i] = input[i];
-}
-#endif
diff --git a/av1/common/daala_tx.h b/av1/common/daala_tx.h
deleted file mode 100644
index 2943802..0000000
--- a/av1/common/daala_tx.h
+++ /dev/null
@@ -1,65 +0,0 @@
-#ifndef AOM_DSP_DAALA_TX_H_
-#define AOM_DSP_DAALA_TX_H_
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "av1/common/odintrin.h"
-
-void daala_fdct4(const tran_low_t *input, tran_low_t *output);
-void daala_idct4(const tran_low_t *input, tran_low_t *output);
-void daala_fdst4(const tran_low_t *input, tran_low_t *output);
-void daala_idst4(const tran_low_t *input, tran_low_t *output);
-void daala_idtx4(const tran_low_t *input, tran_low_t *output);
-void daala_fdct8(const tran_low_t *input, tran_low_t *output);
-void daala_idct8(const tran_low_t *input, tran_low_t *output);
-void daala_fdst8(const tran_low_t *input, tran_low_t *output);
-void daala_idst8(const tran_low_t *input, tran_low_t *output);
-void daala_idtx8(const tran_low_t *input, tran_low_t *output);
-void daala_fdct16(const tran_low_t *input, tran_low_t *output);
-void daala_idct16(const tran_low_t *input, tran_low_t *output);
-void daala_fdst16(const tran_low_t *input, tran_low_t *output);
-void daala_idst16(const tran_low_t *input, tran_low_t *output);
-void daala_idtx16(const tran_low_t *input, tran_low_t *output);
-void daala_fdct32(const tran_low_t *input, tran_low_t *output);
-void daala_idct32(const tran_low_t *input, tran_low_t *output);
-void daala_fdst32(const tran_low_t *input, tran_low_t *output);
-void daala_idst32(const tran_low_t *input, tran_low_t *output);
-void daala_idtx32(const tran_low_t *input, tran_low_t *output);
-#if CONFIG_TX64X64
-void daala_fdct64(const tran_low_t *input, tran_low_t *output);
-void daala_idct64(const tran_low_t *input, tran_low_t *output);
-void daala_fdst64(const tran_low_t *input, tran_low_t *output);
-void daala_idst64(const tran_low_t *input, tran_low_t *output);
-void daala_idtx64(const tran_low_t *input, tran_low_t *output);
-#endif
-
-void od_bin_fdct4(od_coeff y[4], const od_coeff *x, int xstride);
-void od_bin_idct4(od_coeff *x, int xstride, const od_coeff y[4]);
-void od_bin_fdst4(od_coeff y[4], const od_coeff *x, int xstride);
-void od_bin_idst4(od_coeff *x, int xstride, const od_coeff y[4]);
-void od_bin_fidtx4(od_coeff y[4], const od_coeff *x, int xstride);
-void od_bin_iidtx4(od_coeff *x, int xstride, const od_coeff y[4]);
-void od_bin_fdct8(od_coeff y[8], const od_coeff *x, int xstride);
-void od_bin_idct8(od_coeff *x, int xstride, const od_coeff y[8]);
-void od_bin_fdst8(od_coeff y[8], const od_coeff *x, int xstride);
-void od_bin_idst8(od_coeff *x, int xstride, const od_coeff y[8]);
-void od_bin_fidtx8(od_coeff y[8], const od_coeff *x, int xstride);
-void od_bin_iidtx8(od_coeff *x, int xstride, const od_coeff y[8]);
-void od_bin_fdct16(od_coeff y[16], const od_coeff *x, int xstride);
-void od_bin_idct16(od_coeff *x, int xstride, const od_coeff y[16]);
-void od_bin_fdst16(od_coeff y[16], const od_coeff *x, int xstride);
-void od_bin_idst16(od_coeff *x, int xstride, const od_coeff y[16]);
-void od_bin_fidtx16(od_coeff y[16], const od_coeff *x, int xstride);
-void od_bin_iidtx16(od_coeff *x, int xstride, const od_coeff y[16]);
-void od_bin_fdct32(od_coeff y[32], const od_coeff *x, int xstride);
-void od_bin_idct32(od_coeff *x, int xstride, const od_coeff y[32]);
-void od_bin_fdst32(od_coeff y[32], const od_coeff *x, int xstride);
-void od_bin_idst32(od_coeff *x, int xstride, const od_coeff y[32]);
-void od_bin_fidtx32(od_coeff y[32], const od_coeff *x, int xstride);
-void od_bin_iidtx32(od_coeff *x, int xstride, const od_coeff y[32]);
-#if CONFIG_TX64X64
-void od_bin_fdct64(od_coeff y[64], const od_coeff *x, int xstride);
-void od_bin_idct64(od_coeff *x, int xstride, const od_coeff y[64]);
-void od_bin_fidtx64(od_coeff y[64], const od_coeff *x, int xstride);
-void od_bin_iidtx64(od_coeff *x, int xstride, const od_coeff y[64]);
-#endif
-#endif
diff --git a/av1/common/daala_tx_kernels.h b/av1/common/daala_tx_kernels.h
deleted file mode 100644
index 1559228..0000000
--- a/av1/common/daala_tx_kernels.h
+++ /dev/null
@@ -1,1629 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-/* clang-format off */
-
-#ifndef AOM_DSP_DAALA_TX_KERNELS_H_
-#define AOM_DSP_DAALA_TX_KERNELS_H_
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "av1/common/odintrin.h"
-
-#define AVG_BIAS (0)
-
-static INLINE od_coeff od_add(od_coeff p0, od_coeff p1) {
-  return p0 + p1;
-}
-
-static INLINE od_coeff od_sub(od_coeff p0, od_coeff p1) {
-  return p0 - p1;
-}
-
-static INLINE od_coeff od_add_avg(od_coeff p0, od_coeff p1) {
-  return (od_add(p0, p1) + AVG_BIAS) >> 1;
-}
-
-static INLINE od_coeff od_sub_avg(od_coeff p0, od_coeff p1) {
-  return (od_sub(p0, p1) + AVG_BIAS) >> 1;
-}
-
-static INLINE od_coeff od_rshift1(od_coeff v) {
-  return (v + (v < 0)) >> 1;
-}
-
-/* Fixed point multiply. */
-static INLINE od_coeff od_mul(od_coeff n, int c, int q) {
-  return (n*c + ((1 << q) >> 1)) >> q;
-}
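
A worked example of the rounding here (illustrative values): with c = 11585 and q = 13, c/2^q = 11585/8192 ~= Sqrt[2], so

    /* od_mul(100, 11585, 13)
         = (100*11585 + ((1 << 13) >> 1)) >> 13
         = (1158500 + 4096) >> 13
         = 141,  i.e. round(100*Sqrt[2]). */
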
-
-/* Two multiply rotation primitive (used when rotating by Pi/4). */
-static INLINE void od_rot2(od_coeff *p0, od_coeff *p1, od_coeff t, int c0,
- int q0, int c1, int q1) {
-  *p1 = od_mul(*p0, c0, q0);
-  *p0 = od_mul(t, c1, q1);
-}
-
-/* Three multiply rotation primitive. */
-static INLINE void od_rot3(od_coeff *p0, od_coeff *p1, od_coeff *t, od_coeff *u,
- int c0, int q0, int c1, int q1, int c2, int q2) {
-  *u = od_mul(*p0, c0, q0);
-  *p0 = od_mul(*p1, c1, q1);
-  *t = od_mul(*t, c2, q2);
-}
-
-#define NONE (0)
-#define AVG (!NONE)
-#define SHIFT (!NONE)
-
-#define ADD (0)
-#define SUB (1)
-
-/* Rotate by Pi/4 and add. */
-static INLINE void od_rotate_pi4_kernel(od_coeff *p0, od_coeff *p1, int c0,
- int q0, int c1, int q1, int type, int avg) {
-  od_coeff t;
-  t = type == ADD ?
-   avg ? od_add_avg(*p1, *p0) : od_add(*p1, *p0) :
-   avg ? od_sub_avg(*p1, *p0) : od_sub(*p1, *p0);
-  od_rot2(p0, p1, t, c0, q0, c1, q1);
-  *p1 = type == ADD ? od_sub(*p1, *p0) : od_add(*p1, *p0);
-}
-
-#define od_rotate_pi4_add(p0, p1, c0, q0, c1, q1) \
- od_rotate_pi4_kernel(p0, p1, c0, q0, c1, q1, ADD, NONE)
-#define od_rotate_pi4_sub(p0, p1, c0, q0, c1, q1) \
- od_rotate_pi4_kernel(p0, p1, c0, q0, c1, q1, SUB, NONE)
-
-#define od_rotate_pi4_add_avg(p0, p1, c0, q0, c1, q1) \
- od_rotate_pi4_kernel(p0, p1, c0, q0, c1, q1, ADD, AVG)
-#define od_rotate_pi4_sub_avg(p0, p1, c0, q0, c1, q1) \
- od_rotate_pi4_kernel(p0, p1, c0, q0, c1, q1, SUB, AVG)
-
-/* Rotate and add. */
-static INLINE void od_rotate_kernel(od_coeff *p0, od_coeff *p1, od_coeff v,
- int c0, int q0, int c1, int q1, int c2, int q2, int type, int avg, int shift) {
-  od_coeff u;
-  od_coeff t;
-  t = type == ADD ?
-   avg ? od_add_avg(*p1, v) : od_add(*p1, v) :
-   avg ? od_sub_avg(*p1, v) : od_sub(*p1, v);
-  od_rot3(p0, p1, &t, &u, c0, q0, c1, q1, c2, q2);
-  *p0 = od_add(*p0, t);
-  if (shift) t = od_rshift1(t);
-  *p1 = type == ADD ? od_sub(u, t) : od_add(u, t);
-}
-
-#define od_rotate_add(p0, p1, c0, q0, c1, q1, c2, q2, shift) \
- od_rotate_kernel(p0, p1, *p0, c0, q0, c1, q1, c2, q2, ADD, NONE, shift)
-#define od_rotate_sub(p0, p1, c0, q0, c1, q1, c2, q2, shift) \
- od_rotate_kernel(p0, p1, *p0, c0, q0, c1, q1, c2, q2, SUB, NONE, shift)
-
-#define od_rotate_add_avg(p0, p1, c0, q0, c1, q1, c2, q2, shift) \
- od_rotate_kernel(p0, p1, *p0, c0, q0, c1, q1, c2, q2, ADD, AVG, shift)
-#define od_rotate_sub_avg(p0, p1, c0, q0, c1, q1, c2, q2, shift) \
- od_rotate_kernel(p0, p1, *p0, c0, q0, c1, q1, c2, q2, SUB, AVG, shift)
-
-#define od_rotate_add_half(p0, p1, v, c0, q0, c1, q1, c2, q2, shift) \
- od_rotate_kernel(p0, p1, v, c0, q0, c1, q1, c2, q2, ADD, NONE, shift)
-#define od_rotate_sub_half(p0, p1, v, c0, q0, c1, q1, c2, q2, shift) \
- od_rotate_kernel(p0, p1, v, c0, q0, c1, q1, c2, q2, SUB, NONE, shift)
-
-/* Rotate and subtract with negation. */
-static INLINE void od_rotate_neg_kernel(od_coeff *p0, od_coeff *p1,
- int c0, int q0, int c1, int q1, int c2, int q2, int avg) {
-  od_coeff u;
-  od_coeff t;
-  t = avg ? od_sub_avg(*p0, *p1) : od_sub(*p0, *p1);
-  od_rot3(p0, p1, &t, &u, c0, q0, c1, q1, c2, q2);
-  *p0 = od_sub(*p0, t);
-  *p1 = od_sub(t, u);
-}
-
-#define od_rotate_neg(p0, p1, c0, q0, c1, q1, c2, q2) \
- od_rotate_neg_kernel(p0, p1, c0, q0, c1, q1, c2, q2, NONE)
-#define od_rotate_neg_avg(p0, p1, c0, q0, c1, q1, c2, q2) \
- od_rotate_neg_kernel(p0, p1, c0, q0, c1, q1, c2, q2, AVG)
-
-/* Computes the +/- addition butterfly (asymmetric output).
-   The inverse to this function is od_butterfly_add_asym().
-
-    p0 = p0 + p1;
-    p1 = p1 - p0/2; */
-static INLINE void od_butterfly_add(od_coeff *p0, od_coeff *p0h, od_coeff *p1) {
-  od_coeff p0h_;
-  *p0 = od_add(*p0, *p1);
-  p0h_ = od_rshift1(*p0);
-  *p1 = od_sub(*p1, p0h_);
-  if (p0h != NULL) *p0h = p0h_;
-}
-
-/* Computes the asymmetric +/- addition butterfly (unscaled output).
-   The inverse to this function is od_butterfly_add().
-
-    p1 = p1 + p0/2;
-    p0 = p0 - p1; */
-static INLINE void od_butterfly_add_asym(od_coeff *p0, od_coeff p0h,
- od_coeff *p1) {
-  *p1 = od_add(*p1, p0h);
-  *p0 = od_sub(*p0, *p1);
-}
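
A small numeric check of the stated inverse relationship (illustrative values): starting from p0 = 10, p1 = 3,

    /* od_butterfly_add():      p0 = 10 + 3 = 13,  p0h = 13 >> 1 = 6,  p1 = 3 - 6 = -3 */
    /* od_butterfly_add_asym(): p1 = -3 + 6 = 3,   p0 = 13 - 3 = 10                    */

so the pair round-trips exactly, provided the same halved value p0h is handed to the inverse.
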
-
-/* Computes the +/- subtraction butterfly (asymmetric output).
-   The inverse to this function is od_butterfly_sub_asym().
-
-    p0 = p0 - p1;
-    p1 = p1 + p0/2; */
-static INLINE void od_butterfly_sub(od_coeff *p0, od_coeff *p0h, od_coeff *p1) {
-  od_coeff p0h_;
-  *p0 = od_sub(*p0, *p1);
-  p0h_ = od_rshift1(*p0);
-  *p1 = od_add(*p1, p0h_);
-  if (p0h != NULL) *p0h = p0h_;
-}
-
-/* Computes the asymmetric +/- subtraction butterfly (unscaled output).
-   The inverse to this function is od_butterfly_sub().
-
-    p1 = p1 - p0/2;
-    p0 = p0 + p1; */
-static INLINE void od_butterfly_sub_asym(od_coeff *p0, od_coeff p0h,
- od_coeff *p1) {
-  *p1 = od_sub(*p1, p0h);
-  *p0 = od_add(*p0, *p1);
-}
-
-/* Computes the +/- subtract and negate butterfly (asymmetric output).
-   The inverse to this function is od_butterfly_neg_asym().
-
-    p1 = p1 - p0;
-    p0 = p0 + p1/2;
-    p1 = -p1; */
-static INLINE void od_butterfly_neg(od_coeff *p0, od_coeff *p1, od_coeff *p1h) {
-  *p1 = od_sub(*p0, *p1);
-  *p1h = od_rshift1(*p1);
-  *p0 = od_sub(*p0, *p1h);
-}
-
-/*  Computes the asymmetric +/- negate and subtract butterfly (unscaled output).
-    The inverse to this function is od_butterfly_neg().
-
-    p1 = -p1;
-    p0 = p0 - p1/2;
-    p1 = p1 + p0; */
-static INLINE void od_butterfly_neg_asym(od_coeff *p0, od_coeff *p1,
- od_coeff p1h) {
-  *p0 = od_add(*p0, p1h);
-  *p1 = od_sub(*p0, *p1);
-}
-
-/* --- 2-point Transforms --- */
-
-/**
- * 2-point orthonormal Type-II fDCT
- */
-static INLINE void od_fdct_2(od_coeff *p0, od_coeff *p1) {
-  /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4]  = 1.4142135623730951 */
-  /* 11585/8192 = 2*Cos[Pi/4]            = 1.4142135623730951 */
-  od_rotate_pi4_sub_avg(p1, p0, 11585, 13, 11585, 13);
-}
-
-/**
- * 2-point orthonormal Type-II iDCT
- */
-static INLINE void od_idct_2(od_coeff *p0, od_coeff *p1) {
-  /*  11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
-  /* 11585/16384 = Cos[Pi/4]             = 0.7071067811865475 */
-  od_rotate_pi4_add(p0, p1, 11585, 13, 11585, 14);
-}
-
-/**
- * 2-point asymmetric Type-II fDCT
- */
-static INLINE void od_fdct_2_asym(od_coeff *p0, od_coeff *p1, od_coeff p1h) {
-  od_butterfly_neg_asym(p0, p1, p1h);
-}
-
-/**
- * 2-point asymmetric Type-II iDCT
- */
-static INLINE void od_idct_2_asym(od_coeff *p0, od_coeff *p1, od_coeff *p1h) {
-  od_butterfly_neg(p0, p1, p1h);
-}
-
-/**
- * 2-point orthonormal Type-IV fDST
- */
-static INLINE void od_fdst_2(od_coeff *p0, od_coeff *p1) {
-
-  /* Stage 0 */
-
-  /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8]  = 1.3065629648763766 */
-  /*  8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8]  = 0.5411961001461971 */
-  /*   3135/4096 = 2*Cos[3*Pi/8]              = 0.7653668647301796 */
-  od_rotate_add_avg(p0, p1, 21407, 14, 8867, 14, 3135, 12, NONE);
-}
-
-/**
- * 2-point orthonormal Type-IV iDST
- */
-static INLINE void od_idst_2(od_coeff *p0, od_coeff *p1) {
-  od_fdst_2(p0, p1);
-}
-
-/**
- * 2-point asymmetric Type-IV fDST
- */
-static INLINE void od_fdst_2_asym(od_coeff *p0, od_coeff p0h, od_coeff *p1) {
-
-  /* Stage 0 */
-
-  /* 15137/16384 = (Sin[3*Pi/8] + Cos[3*Pi/8])/Sqrt[2] = 0.9238795325112867 */
-  /*   3135/4096 = (Sin[3*Pi/8] - Cos[3*Pi/8])*Sqrt[2] = 0.7653668647301795 */
-  /*  8867/16384 = Cos[3*Pi/8]*Sqrt[2]                 = 0.5411961001461971 */
-  od_rotate_add_half(p0, p1, p0h, 15137, 14, 3135, 12, 8867, 14, NONE);
-}
-
-/**
- * 2-point asymmetric Type-IV iDST
- */
-static INLINE void od_idst_2_asym(od_coeff *p0, od_coeff *p1) {
-
-  /* Stage 0 */
-
-  /* 15137/16384 = (Sin[3*Pi/8] + Cos[3*Pi/8])/Sqrt[2] = 0.9238795325112867 */
-  /*   3135/4096 = (Sin[3*Pi/8] - Cos[3*Pi/8])*Sqrt[2] = 0.7653668647301795 */
-  /*   8867/8192 = 2*Cos[3*Pi/8]*Sqrt[2]               = 1.0823922002923940 */
-  od_rotate_add_avg(p0, p1, 15137, 14, 3135, 12, 8867, 13, SHIFT);
-}
-
-/* --- 4-point Transforms --- */
-
-/**
- * 4-point orthonormal Type-II fDCT
- */
-static INLINE void od_fdct_4(od_coeff *q0, od_coeff *q1, od_coeff *q2,
- od_coeff *q3) {
-  od_coeff q1h;
-  od_coeff q3h;
-
-  /* +/- Butterflies with asymmetric output. */
-  od_butterfly_neg(q0, q3, &q3h);
-  od_butterfly_add(q1, &q1h, q2);
-
-  /* Embedded 2-point transforms with asymmetric input. */
-  od_fdct_2_asym(q0, q1, q1h);
-  od_fdst_2_asym(q3, q3h, q2);
-}
-
-/**
- * 4-point orthonormal Type-II iDCT
- */
-static INLINE void od_idct_4(od_coeff *q0, od_coeff *q2,
-                             od_coeff *q1, od_coeff *q3)  {
-  od_coeff q1h;
-
-  /* Embedded 2-point transforms with asymmetric output. */
-  od_idst_2_asym(q3, q2);
-  od_idct_2_asym(q0, q1, &q1h);
-
-  /* +/- Butterflies with asymmetric input. */
-  od_butterfly_add_asym(q1, q1h, q2);
-  od_butterfly_neg_asym(q0, q3, od_rshift1(*q3));
-}
-
-/**
- * 4-point asymmetric Type-II fDCT
- */
-static INLINE void od_fdct_4_asym(od_coeff *q0, od_coeff *q1, od_coeff q1h,
-                                  od_coeff *q2, od_coeff *q3, od_coeff q3h) {
-
-  /* +/- Butterflies with asymmetric input. */
-  od_butterfly_neg_asym(q0, q3, q3h);
-  od_butterfly_sub_asym(q1, q1h, q2);
-
-  /* Embedded 2-point orthonormal transforms. */
-  od_fdct_2(q0, q1);
-  od_fdst_2(q3, q2);
-}
-
-/**
- * 4-point asymmetric Type-II iDCT
- */
-static INLINE void od_idct_4_asym(od_coeff *q0, od_coeff *q2,
-                                  od_coeff *q1, od_coeff *q1h,
-                                  od_coeff *q3, od_coeff *q3h)  {
-
-  /* Embedded 2-point orthonormal transforms. */
-  od_idst_2(q3, q2);
-  od_idct_2(q0, q1);
-
-  /* +/- Butterflies with asymmetric output. */
-  od_butterfly_sub(q1, q1h, q2);
-  od_butterfly_neg(q0, q3, q3h);
-}
-
-/**
- * 4-point orthonormal Type-IV fDST
- */
-static INLINE void od_fdst_4(od_coeff *q0, od_coeff *q1,
-                             od_coeff *q2, od_coeff *q3) {
-
-  /* Stage 0 */
-
-  /* 13623/16384 = (Sin[7*Pi/16] + Cos[7*Pi/16])/Sqrt[2] = 0.831469612302545 */
-  /* 18205/16384 = (Sin[7*Pi/16] - Cos[7*Pi/16])*Sqrt[2] = 1.111140466039204 */
-  /*  9041/32768 = Cos[7*Pi/16]*Sqrt[2]                  = 0.275899379282943 */
-  od_rotate_add(q0, q3, 13623, 14, 18205, 14, 9041, 15, SHIFT);
-
-  /* 16069/16384 = (Sin[5*Pi/16] + Cos[5*Pi/16])/Sqrt[2] = 0.9807852804032304 */
-  /* 12785/32768 = (Sin[5*Pi/16] - Cos[5*Pi/16])*Sqrt[2] = 0.3901806440322566 */
-  /* 12873/16384 = Cos[5*Pi/16]*Sqrt[2]                  = 0.7856949583871021 */
-  od_rotate_sub(q2, q1, 16069, 14, 12785, 15, 12873, 14, SHIFT);
-
-  /* Stage 1 */
-
-  od_butterfly_sub_asym(q0, od_rshift1(*q0), q1);
-  od_butterfly_sub_asym(q2, od_rshift1(*q2), q3);
-
-  /* Stage 2 */
-
-  /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
-  /* 11585/8192 = 2*Cos[Pi/4]           = 1.4142135623730951 */
-  od_rotate_pi4_add_avg(q2, q1, 11585, 13, 11585, 13);
-}
-
-/**
- * 4-point orthonormal Type-IV iDST
- */
-static INLINE void od_idst_4(od_coeff *q0, od_coeff *q2,
-                             od_coeff *q1, od_coeff *q3) {
-  od_coeff q0h;
-  od_coeff q2h;
-
-  /* Stage 0 */
-
-  /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
-  /* 11585/8192 = 2*Cos[Pi/4]           = 1.4142135623730951 */
-  od_rotate_pi4_add_avg(q2, q1, 11585, 13, 11585, 13);
-
-  /* Stage 1 */
-
-  od_butterfly_sub(q2, &q2h, q3);
-  od_butterfly_sub(q0, &q0h, q1);
-
-  /* Stage 2 */
-
-  /* 16069/16384 = (Sin[5*Pi/16] + Cos[5*Pi/16])/Sqrt[2] = 0.9807852804032304 */
-  /* 12785/32768 = (Sin[5*Pi/16] - Cos[5*Pi/16])*Sqrt[2] = 0.3901806440322566 */
-  /* 12873/16384 = Cos[5*Pi/16]*Sqrt[2]                  = 0.7856949583871021 */
-  od_rotate_sub_half(q2, q1, q2h, 16069, 14, 12785, 15, 12873, 14, NONE);
-
-  /* 13623/16384 = (Sin[7*Pi/16] + Cos[7*Pi/16])/Sqrt[2] = 0.831469612302545 */
-  /* 18205/16384 = (Sin[7*Pi/16] - Cos[7*Pi/16])*Sqrt[2] = 1.111140466039204 */
-  /*  9041/32768 = Cos[7*Pi/16]*Sqrt[2]                  = 0.275899379282943 */
-  od_rotate_add_half(q0, q3, q0h, 13623, 14, 18205, 14, 9041, 15, NONE);
-}
-
-/**
- * 4-point asymmetric Type-IV fDST
- */
-static INLINE void od_fdst_4_asym(od_coeff *q0, od_coeff q0h, od_coeff *q1,
-                                  od_coeff *q2, od_coeff q2h, od_coeff *q3) {
-
-  /* Stage 0 */
-
-  /*  9633/16384 = (Sin[7*Pi/16] + Cos[7*Pi/16])/2 = 0.5879378012096793 */
-  /*  12873/8192 = (Sin[7*Pi/16] - Cos[7*Pi/16])*2 = 1.5713899167742045 */
-  /* 12785/32768 = Cos[7*Pi/16]*2                  = 0.3901806440322565 */
-  od_rotate_add_half(q0, q3, q0h, 9633, 14, 12873, 13, 12785, 15, SHIFT);
-
-  /* 22725/32768 = (Sin[5*Pi/16] + Cos[5*Pi/16])/2 = 0.6935199226610738 */
-  /* 18081/32768 = (Sin[5*Pi/16] - Cos[5*Pi/16])*2 = 0.5517987585658861 */
-  /* 18205/16384 = Cos[5*Pi/16]*2                  = 1.1111404660392044 */
-  od_rotate_sub_half(q2, q1, q2h, 22725, 15, 18081, 15, 18205, 14, SHIFT);
-
-  /* Stage 1 */
-
-  od_butterfly_sub_asym(q0, od_rshift1(*q0), q1);
-  od_butterfly_sub_asym(q2, od_rshift1(*q2), q3);
-
-  /* Stage 2 */
-
-  /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
-  /* 11585/8192 = 2*Cos[Pi/4]           = 1.4142135623730951 */
-  od_rotate_pi4_add_avg(q2, q1, 11585, 13, 11585, 13);
-}
-
-/**
- * 4-point asymmetric Type-IV iDST
- */
-static INLINE void od_idst_4_asym(od_coeff *q0, od_coeff *q2,
-                                  od_coeff *q1, od_coeff *q3) {
-  od_coeff q0h;
-  od_coeff q2h;
-
-  /* Stage 0 */
-
-  /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
-  /* 11585/8192 = 2*Cos[Pi/4]           = 1.4142135623730951 */
-  od_rotate_pi4_add_avg(q2, q1, 11585, 13, 11585, 13);
-
-  /* Stage 1 */
-
-  od_butterfly_sub(q2, &q2h, q3);
-  od_butterfly_sub(q0, &q0h, q1);
-
-  /* Stage 2 */
-
-  /* 22725/32768 = (Sin[5*Pi/16] + Cos[5*Pi/16])/2 = 0.6935199226610738 */
-  /* 18081/32768 = (Sin[5*Pi/16] - Cos[5*Pi/16])*2 = 0.5517987585658861 */
-  /* 18205/16384 = Cos[5*Pi/16]*2                  = 1.1111404660392044 */
-  od_rotate_sub_half(q2, q1, q2h, 22725, 15, 18081, 15, 18205, 14, SHIFT);
-
-  /*  9633/16384 = (Sin[7*Pi/16] + Cos[7*Pi/16])/2 = 0.5879378012096793 */
-  /*  12873/8192 = (Sin[7*Pi/16] - Cos[7*Pi/16])*2 = 1.5713899167742045 */
-  /* 12785/32768 = Cos[7*Pi/16]*2                  = 0.3901806440322565 */
-  od_rotate_add_half(q0, q3, q0h, 9633, 14, 12873, 13, 12785, 15, SHIFT);
-}
-
-/* --- 8-point Transforms --- */
-
-/**
- * 8-point orthonormal Type-II fDCT
- */
-static INLINE void od_fdct_8(od_coeff *r0, od_coeff *r1,
-                             od_coeff *r2, od_coeff *r3,
-                             od_coeff *r4, od_coeff *r5,
-                             od_coeff *r6, od_coeff *r7) {
-  od_coeff r1h;
-  od_coeff r3h;
-  od_coeff r5h;
-  od_coeff r7h;
-
-  /* +/- Butterflies with asymmetric output. */
-  od_butterfly_neg(r0, r7, &r7h);
-  od_butterfly_add(r1, &r1h, r6);
-  od_butterfly_neg(r2, r5, &r5h);
-  od_butterfly_add(r3, &r3h, r4);
-
-  /* Embedded 4-point forward transforms with asymmetric input. */
-  od_fdct_4_asym(r0, r1, r1h, r2, r3, r3h);
-  od_fdst_4_asym(r7, r7h, r6, r5, r5h, r4);
-}
-
-/**
- * 8-point orthonormal Type-II iDCT
- */
-static INLINE void od_idct_8(od_coeff *r0, od_coeff *r4,
-                             od_coeff *r2, od_coeff *r6,
-                             od_coeff *r1, od_coeff *r5,
-                             od_coeff *r3, od_coeff *r7) {
-  od_coeff r1h;
-  od_coeff r3h;
-
-  /* Embedded 4-point inverse transforms with asymmetric output. */
-  od_idst_4_asym(r7, r5, r6, r4);
-  od_idct_4_asym(r0, r2, r1, &r1h, r3, &r3h);
-
-  /* +/- Butterflies with asymmetric input. */
-  od_butterfly_add_asym(r3, r3h, r4);
-  od_butterfly_neg_asym(r2, r5, od_rshift1(*r5));
-  od_butterfly_add_asym(r1, r1h, r6);
-  od_butterfly_neg_asym(r0, r7, od_rshift1(*r7));
-}
-
-/**
- * 8-point asymmetric Type-II fDCT
- */
-static INLINE void od_fdct_8_asym(od_coeff *r0, od_coeff *r1, od_coeff r1h,
-                                  od_coeff *r2, od_coeff *r3, od_coeff r3h,
-                                  od_coeff *r4, od_coeff *r5, od_coeff r5h,
-                                  od_coeff *r6, od_coeff *r7, od_coeff r7h) {
-
-  /* +/- Butterflies with asymmetric input. */
-  od_butterfly_neg_asym(r0, r7, r7h);
-  od_butterfly_sub_asym(r1, r1h, r6);
-  od_butterfly_neg_asym(r2, r5, r5h);
-  od_butterfly_sub_asym(r3, r3h, r4);
-
-  /* Embedded 4-point orthonormal transforms. */
-  od_fdct_4(r0, r1, r2, r3);
-  od_fdst_4(r7, r6, r5, r4);
-}
-
-/**
- * 8-point asymmetric Type-II iDCT
- */
-static INLINE void od_idct_8_asym(od_coeff *r0, od_coeff *r4,
-                                  od_coeff *r2, od_coeff *r6,
-                                  od_coeff *r1, od_coeff *r1h,
-                                  od_coeff *r5, od_coeff *r5h,
-                                  od_coeff *r3, od_coeff *r3h,
-                                  od_coeff *r7, od_coeff *r7h)  {
-
-  /* Embedded 4-point inverse orthonormal transforms. */
-  od_idst_4(r7, r5, r6, r4);
-  od_idct_4(r0, r2, r1, r3);
-
-  /* +/- Butterflies with asymmetric output. */
-  od_butterfly_sub(r3, r3h, r4);
-  od_butterfly_neg(r2, r5, r5h);
-  od_butterfly_sub(r1, r1h, r6);
-  od_butterfly_neg(r0, r7, r7h);
-}
-
-/**
- * 8-point orthonormal Type-IV fDST
- */
-static INLINE void od_fdst_8(od_coeff *r0, od_coeff *r1,
-                             od_coeff *r2, od_coeff *r3,
-                             od_coeff *r4, od_coeff *r5,
-                             od_coeff *r6, od_coeff *r7) {
-  od_coeff r0h;
-  od_coeff r2h;
-  od_coeff r5h;
-  od_coeff r7h;
-
-  /* Stage 0 */
-
-  /* 17911/16384 = Sin[15*Pi/32] + Cos[15*Pi/32] = 1.0932018670017576 */
-  /* 14699/16384 = Sin[15*Pi/32] - Cos[15*Pi/32] = 0.8971675863426363 */
-  /*    803/8192 = Cos[15*Pi/32]                 = 0.0980171403295606 */
-  od_rotate_add(r0, r7, 17911, 14, 14699, 14, 803, 13, NONE);
-
-  /* 40869/32768 = Sin[13*Pi/32] + Cos[13*Pi/32] = 1.24722501298667123 */
-  /* 21845/32768 = Sin[13*Pi/32] - Cos[13*Pi/32] = 0.66665565847774650 */
-  /*   1189/4096 = Cos[13*Pi/32]                 = 0.29028467725446233 */
-  od_rotate_sub(r6, r1, 40869, 15, 21845, 15, 1189, 12, NONE);
-
-  /* 22173/16384 = Sin[11*Pi/32] + Cos[11*Pi/32] = 1.3533180011743526 */
-  /*   3363/8192 = Sin[11*Pi/32] - Cos[11*Pi/32] = 0.4105245275223574 */
-  /* 15447/32768 = Cos[11*Pi/32]                 = 0.47139673682599764 */
-  od_rotate_add(r2, r5, 22173, 14, 3363, 13, 15447, 15, NONE);
-
-  /* 23059/16384 = Sin[9*Pi/32] + Cos[9*Pi/32] = 1.4074037375263826 */
-  /*  2271/16384 = Sin[9*Pi/32] - Cos[9*Pi/32] = 0.1386171691990915 */
-  /*   5197/8192 = Cos[9*Pi/32]                = 0.6343932841636455 */
-  od_rotate_sub(r4, r3, 23059, 14, 2271, 14, 5197, 13, NONE);
-
-  /* Stage 1 */
-
-  od_butterfly_add(r0, &r0h, r3);
-  od_butterfly_sub(r2, &r2h, r1);
-  od_butterfly_add(r5, &r5h, r6);
-  od_butterfly_sub(r7, &r7h, r4);
-
-  /* Stage 2 */
-
-  od_butterfly_add_asym(r7, r7h, r6);
-  od_butterfly_add_asym(r5, r5h, r3);
-  od_butterfly_add_asym(r2, r2h, r4);
-  od_butterfly_sub_asym(r0, r0h, r1);
-
-  /* Stage 3 */
-
-  /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
-  /*  8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
-  /*   3135/4096 = 2*Cos[3*Pi/8]             = 0.7653668647301796 */
-  od_rotate_sub_avg(r3, r4, 21407, 14, 8867, 14, 3135, 12, NONE);
-
-  /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
-  /*  8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
-  /*   3135/4096 = 2*Cos[3*Pi/8]             = 0.7653668647301796 */
-  od_rotate_neg_avg(r2, r5, 21407, 14, 8867, 14, 3135, 12);
-
-  /* 46341/32768 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
-  /* 46341/32768 = 2*Cos[Pi/4]           = 1.4142135623730951 */
-  od_rotate_pi4_sub_avg(r1, r6, 46341, 15, 46341, 15);
-}
-
-/**
- * 8-point orthonormal Type-IV iDST
- */
-static INLINE void od_idst_8(od_coeff *r0, od_coeff *r4,
-                             od_coeff *r2, od_coeff *r6,
-                             od_coeff *r1, od_coeff *r5,
-                             od_coeff *r3, od_coeff *r7) {
-  od_coeff r0h;
-  od_coeff r2h;
-  od_coeff r5h;
-  od_coeff r7h;
-
-  /* Stage 3 */
-
-  /* 46341/32768 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
-  /* 46341/32768 = 2*Cos[Pi/4]           = 1.4142135623730951 */
-  od_rotate_pi4_add_avg(r6, r1, 11585, 13, 46341, 15);
-
-  /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
-  /*  8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
-  /*   3135/4096 = 2*Cos[3*Pi/8]             = 0.7653668647301796 */
-  od_rotate_neg_avg(r5, r2, 21407, 14, 8867, 14, 3135, 12);
-
-  /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
-  /*  8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
-  /*   3135/4096 = 2*Cos[3*Pi/8]             = 0.7653668647301796 */
-  od_rotate_add_avg(r4, r3, 21407, 14, 8867, 14, 3135, 12, NONE);
-
-  /* Stage 2 */
-
-  od_butterfly_sub(r0, &r0h, r1);
-  od_butterfly_add(r2, &r2h, r4);
-  od_butterfly_add(r5, &r5h, r3);
-  od_butterfly_add(r7, &r7h, r6);
-
-  /* Stage 1 */
-
-  od_butterfly_sub_asym(r7, r7h, r4);
-  od_butterfly_add_asym(r5, r5h, r6);
-  od_butterfly_sub_asym(r2, r2h, r1);
-  od_butterfly_add_asym(r0, r0h, r3);
-
-  /* Stage 0 */
-
-  /* 23059/16384 = Sin[9*Pi/32] + Cos[9*Pi/32] = 1.4074037375263826 */
-  /*  2271/16384 = Sin[9*Pi/32] - Cos[9*Pi/32] = 0.1386171691990915 */
-  /*   5197/8192 = Cos[9*Pi/32]                = 0.6343932841636455 */
-  od_rotate_sub(r4, r3, 23059, 14, 2271, 14, 5197, 13, NONE);
-
-  /* 22173/16384 = Sin[11*Pi/32] + Cos[11*Pi/32] = 1.3533180011743526 */
-  /*   3363/8192 = Sin[11*Pi/32] - Cos[11*Pi/32] = 0.4105245275223574 */
-  /* 15447/32768 = Cos[11*Pi/32]                 = 0.47139673682599764 */
-  od_rotate_add(r2, r5, 22173, 14, 3363, 13, 15447, 15, NONE);
-
-  /* 40869/32768 = Sin[13*Pi/32] + Cos[13*Pi/32] = 1.24722501298667123 */
-  /* 21845/32768 = Sin[13*Pi/32] - Cos[13*Pi/32] = 0.66665565847774650 */
-  /*   1189/4096 = Cos[13*Pi/32]                 = 0.29028467725446233 */
-  od_rotate_sub(r6, r1, 40869, 15, 21845, 15, 1189, 12, NONE);
-
-  /* 17911/16384 = Sin[15*Pi/32] + Cos[15*Pi/32] = 1.0932018670017576 */
-  /* 14699/16384 = Sin[15*Pi/32] - Cos[15*Pi/32] = 0.8971675863426363 */
-  /*    803/8192 = Cos[15*Pi/32]                 = 0.0980171403295606 */
-  od_rotate_add(r0, r7, 17911, 14, 14699, 14, 803, 13, NONE);
-}
-
-/**
- * 8-point asymmetric Type-IV fDST
- */
-static INLINE void od_fdst_8_asym(od_coeff *r0, od_coeff r0h, od_coeff *r1,
-                                  od_coeff *r2, od_coeff r2h, od_coeff *r3,
-                                  od_coeff *r4, od_coeff r4h, od_coeff *r5,
-                                  od_coeff *r6, od_coeff r6h, od_coeff *r7) {
-  od_coeff r5h;
-  od_coeff r7h;
-
-  /* Stage 0 */
-
-  /* 12665/16384 = (Sin[15*Pi/32] + Cos[15*Pi/32])/Sqrt[2] = 0.77301045336274 */
-  /*   5197/4096 = (Sin[15*Pi/32] - Cos[15*Pi/32])*Sqrt[2] = 1.26878656832729 */
-  /*  2271/16384 = Cos[15*Pi/32]*Sqrt[2]                   = 0.13861716919909 */
-  od_rotate_add_half(r0, r7, r0h, 12665, 14, 5197, 12, 2271, 14, NONE);
-
-  /* 28899/32768 = Sin[13*Pi/32] + Cos[13*Pi/32])/Sqrt[2] = 0.881921264348355 */
-  /* 30893/32768 = Sin[13*Pi/32] - Cos[13*Pi/32])*Sqrt[2] = 0.942793473651995 */
-  /*   3363/8192 = Cos[13*Pi/32]*Sqrt[2]                  = 0.410524527522357 */
-  od_rotate_sub_half(r6, r1, r6h, 28899, 15, 30893, 15, 3363, 13, NONE);
-
-  /* 31357/32768 = Sin[11*Pi/32] + Cos[11*Pi/32])/Sqrt[2] = 0.956940335732209 */
-  /*   1189/2048 = Sin[11*Pi/32] - Cos[11*Pi/32])*Sqrt[2] = 0.580569354508925 */
-  /* 21845/32768 = Cos[11*Pi/32]*Sqrt[2]                  = 0.666655658477747 */
-  od_rotate_add_half(r2, r5, r2h, 31357, 15, 1189, 11, 21845, 15, NONE);
-
-  /* 16305/16384 = (Sin[9*Pi/32] + Cos[9*Pi/32])/Sqrt[2] = 0.9951847266721969 */
-  /*    803/4096 = (Sin[9*Pi/32] - Cos[9*Pi/32])*Sqrt[2] = 0.1960342806591213 */
-  /* 14699/16384 = Cos[9*Pi/32]*Sqrt[2]                  = 0.8971675863426364 */
-  od_rotate_sub_half(r4, r3, r4h, 16305, 14, 803, 12, 14699, 14, NONE);
-
-  /* Stage 1 */
-
-  od_butterfly_add(r0, &r0h, r3);
-  od_butterfly_sub(r2, &r2h, r1);
-  od_butterfly_add(r5, &r5h, r6);
-  od_butterfly_sub(r7, &r7h, r4);
-
-  /* Stage 2 */
-
-  od_butterfly_add_asym(r7, r7h, r6);
-  od_butterfly_add_asym(r5, r5h, r3);
-  od_butterfly_add_asym(r2, r2h, r4);
-  od_butterfly_sub_asym(r0, r0h, r1);
-
-  /* Stage 3 */
-
-  /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
-  /*  8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
-  /*   3135/4096 = 2*Cos[3*Pi/8]             = 0.7653668647301796 */
-  od_rotate_sub_avg(r3, r4, 21407, 14, 8867, 14, 3135, 12, NONE);
-
-  /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
-  /*  8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
-  /*   3135/4096 = 2*Cos[3*Pi/8]             = 0.7653668647301796 */
-  od_rotate_neg_avg(r2, r5, 21407, 14, 8867, 14, 3135, 12);
-
-  /* 46341/32768 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
-  /* 46341/32768 = 2*Cos[Pi/4]           = 1.4142135623730951 */
-  od_rotate_pi4_sub_avg(r1, r6, 46341, 15, 46341, 15);
-}
-
-/**
- * 8-point asymmetric Type-IV iDST
- */
-static INLINE void od_idst_8_asym(od_coeff *r0, od_coeff *r4,
-                                  od_coeff *r2, od_coeff *r6,
-                                  od_coeff *r1, od_coeff *r5,
-                                  od_coeff *r3, od_coeff *r7) {
-  od_coeff r0h;
-  od_coeff r2h;
-  od_coeff r5h;
-  od_coeff r7h;
-
-  /* Stage 3 */
-
-  /* 46341/32768 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
-  /* 46341/32768 = 2*Cos[Pi/4]           = 1.4142135623730951 */
-  od_rotate_pi4_add_avg(r6, r1, 11585, 13, 11585, 13);
-
-  /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
-  /*  8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
-  /*   3135/4096 = 2*Cos[3*Pi/8]             = 0.7653668647301796 */
-  od_rotate_neg_avg(r5, r2, 21407, 14, 8867, 14, 3135, 12);
-
-  /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
-  /*  8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
-  /*   3135/4096 = 2*Cos[3*Pi/8]             = 0.7653668647301796 */
-  od_rotate_add_avg(r4, r3, 21407, 14, 8867, 14, 3135, 12, NONE);
-
-  /* Stage 2 */
-
-  od_butterfly_sub(r0, &r0h, r1);
-  od_butterfly_add(r2, &r2h, r4);
-  od_butterfly_add(r5, &r5h, r3);
-  od_butterfly_add(r7, &r7h, r6);
-
-  /* Stage 1 */
-
-  od_butterfly_sub_asym(r7, r7h, r4);
-  od_butterfly_add_asym(r5, r5h, r6);
-  od_butterfly_sub_asym(r2, r2h, r1);
-  od_butterfly_add_asym(r0, r0h, r3);
-
-  /* Stage 0 */
-
-  /* 16305/16384 = (Sin[9*Pi/32] + Cos[9*Pi/32])/Sqrt[2] = 0.9951847266721969 */
-  /*    803/4096 = (Sin[9*Pi/32] - Cos[9*Pi/32])*Sqrt[2] = 0.1960342806591213 */
-  /* 14699/16384 = Cos[9*Pi/32]*Sqrt[2]                  = 0.8971675863426364 */
-  od_rotate_sub(r4, r3, 16305, 14, 803, 12, 14699, 14, SHIFT);
-
-  /* 31357/32768 = Sin[11*Pi/32] + Cos[11*Pi/32])/Sqrt[2] = 0.956940335732209 */
-  /*   1189/2048 = Sin[11*Pi/32] - Cos[11*Pi/32])*Sqrt[2] = 0.580569354508925 */
-  /* 21845/32768 = Cos[11*Pi/32]*Sqrt[2]                  = 0.666655658477747 */
-  od_rotate_add(r2, r5, 31357, 15, 1189, 11, 21845, 15, SHIFT);
-
-  /* 28899/32768 = Sin[13*Pi/32] + Cos[13*Pi/32])/Sqrt[2] = 0.881921264348355 */
-  /* 30893/32768 = Sin[13*Pi/32] - Cos[13*Pi/32])*Sqrt[2] = 0.942793473651995 */
-  /*   3363/8192 = Cos[13*Pi/32]*Sqrt[2]                  = 0.410524527522357 */
-  od_rotate_sub(r6, r1, 28899, 15, 30893, 15, 3363, 13, SHIFT);
-
-  /* 12665/16384 = (Sin[15*Pi/32] + Cos[15*Pi/32])/Sqrt[2] = 0.77301045336274 */
-  /*   5197/4096 = (Sin[15*Pi/32] - Cos[15*Pi/32])*Sqrt[2] = 1.26878656832729 */
-  /*  2271/16384 = Cos[15*Pi/32]*Sqrt[2]                   = 0.13861716919909 */
-  od_rotate_add(r0, r7, 12665, 14, 5197, 12, 2271, 14, SHIFT);
-}
-
-/* --- 16-point Transforms --- */
-
-/**
- * 16-point orthonormal Type-II fDCT
- */
-static INLINE void od_fdct_16(od_coeff *s0, od_coeff *s1,
-                              od_coeff *s2, od_coeff *s3,
-                              od_coeff *s4, od_coeff *s5,
-                              od_coeff *s6, od_coeff *s7,
-                              od_coeff *s8, od_coeff *s9,
-                              od_coeff *sa, od_coeff *sb,
-                              od_coeff *sc, od_coeff *sd,
-                              od_coeff *se, od_coeff *sf) {
-  od_coeff s1h;
-  od_coeff s3h;
-  od_coeff s5h;
-  od_coeff s7h;
-  od_coeff s9h;
-  od_coeff sbh;
-  od_coeff sdh;
-  od_coeff sfh;
-
-  /* +/- Butterflies with asymmetric output. */
-  od_butterfly_neg(s0, sf, &sfh);
-  od_butterfly_add(s1, &s1h, se);
-  od_butterfly_neg(s2, sd, &sdh);
-  od_butterfly_add(s3, &s3h, sc);
-  od_butterfly_neg(s4, sb, &sbh);
-  od_butterfly_add(s5, &s5h, sa);
-  od_butterfly_neg(s6, s9, &s9h);
-  od_butterfly_add(s7, &s7h, s8);
-
-  /* Embedded 8-point transforms with asymmetric input. */
-  od_fdct_8_asym(s0, s1, s1h, s2, s3, s3h, s4, s5, s5h, s6, s7, s7h);
-  od_fdst_8_asym(sf, sfh, se, sd, sdh, sc, sb, sbh, sa, s9, s9h, s8);
-}
-
-/**
- * 16-point orthonormal Type-II iDCT
- */
-static INLINE void od_idct_16(od_coeff *s0, od_coeff *s8,
-                              od_coeff *s4, od_coeff *sc,
-                              od_coeff *s2, od_coeff *sa,
-                              od_coeff *s6, od_coeff *se,
-                              od_coeff *s1, od_coeff *s9,
-                              od_coeff *s5, od_coeff *sd,
-                              od_coeff *s3, od_coeff *sb,
-                              od_coeff *s7, od_coeff *sf) {
-  od_coeff s1h;
-  od_coeff s3h;
-  od_coeff s5h;
-  od_coeff s7h;
-
-  /* Embedded 8-point transforms with asymmetric output. */
-  od_idst_8_asym(sf, sb, sd, s9, se, sa, sc, s8);
-  od_idct_8_asym(s0, s4, s2, s6, s1, &s1h, s5, &s5h, s3, &s3h, s7, &s7h);
-
-  /* +/- Butterflies with asymmetric input. */
-  od_butterfly_add_asym(s7, s7h, s8);
-  od_butterfly_neg_asym(s6, s9, od_rshift1(*s9));
-  od_butterfly_add_asym(s5, s5h, sa);
-  od_butterfly_neg_asym(s4, sb, od_rshift1(*sb));
-  od_butterfly_add_asym(s3, s3h, sc);
-  od_butterfly_neg_asym(s2, sd, od_rshift1(*sd));
-  od_butterfly_add_asym(s1, s1h, se);
-  od_butterfly_neg_asym(s0, sf, od_rshift1(*sf));
-}
-
-/**
- * 16-point asymmetric Type-II fDCT
- */
-static INLINE void od_fdct_16_asym(od_coeff *s0, od_coeff *s1, od_coeff s1h,
-                                   od_coeff *s2, od_coeff *s3, od_coeff s3h,
-                                   od_coeff *s4, od_coeff *s5, od_coeff s5h,
-                                   od_coeff *s6, od_coeff *s7, od_coeff s7h,
-                                   od_coeff *s8, od_coeff *s9, od_coeff s9h,
-                                   od_coeff *sa, od_coeff *sb, od_coeff sbh,
-                                   od_coeff *sc, od_coeff *sd, od_coeff sdh,
-                                   od_coeff *se, od_coeff *sf, od_coeff sfh) {
-
-  /* +/- Butterflies with asymmetric input. */
-  od_butterfly_neg_asym(s0, sf, sfh);
-  od_butterfly_sub_asym(s1, s1h, se);
-  od_butterfly_neg_asym(s2, sd, sdh);
-  od_butterfly_sub_asym(s3, s3h, sc);
-  od_butterfly_neg_asym(s4, sb, sbh);
-  od_butterfly_sub_asym(s5, s5h, sa);
-  od_butterfly_neg_asym(s6, s9, s9h);
-  od_butterfly_sub_asym(s7, s7h, s8);
-
-  /* Embedded 8-point orthonormal transforms. */
-  od_fdct_8(s0, s1, s2, s3, s4, s5, s6, s7);
-  od_fdst_8(sf, se, sd, sc, sb, sa, s9, s8);
-}
-
-/**
- * 16-point asymmetric Type-II iDCT
- */
-static INLINE void od_idct_16_asym(od_coeff *s0, od_coeff *s8,
-                                   od_coeff *s4, od_coeff *sc,
-                                   od_coeff *s2, od_coeff *sa,
-                                   od_coeff *s6, od_coeff *se,
-                                   od_coeff *s1, od_coeff *s1h,
-                                   od_coeff *s9, od_coeff *s9h,
-                                   od_coeff *s5, od_coeff *s5h,
-                                   od_coeff *sd, od_coeff *sdh,
-                                   od_coeff *s3, od_coeff *s3h,
-                                   od_coeff *sb, od_coeff *sbh,
-                                   od_coeff *s7, od_coeff *s7h,
-                                   od_coeff *sf, od_coeff *sfh) {
-
-  /* Embedded 8-point orthonormal transforms. */
-  od_idst_8(sf, sb, sd, s9, se, sa, sc, s8);
-  od_idct_8(s0, s4, s2, s6, s1, s5, s3, s7);
-
-  /* +/- Butterflies with asymmetric output. */
-  od_butterfly_sub(s7, s7h, s8);
-  od_butterfly_neg(s6, s9, s9h);
-  od_butterfly_sub(s5, s5h, sa);
-  od_butterfly_neg(s4, sb, sbh);
-  od_butterfly_sub(s3, s3h, sc);
-  od_butterfly_neg(s2, sd, sdh);
-  od_butterfly_sub(s1, s1h, se);
-  od_butterfly_neg(s0, sf, sfh);
-}
-
-/**
- * 16-point orthonormal Type-IV fDST
- */
-static INLINE void od_fdst_16(od_coeff *s0, od_coeff *s1,
-                              od_coeff *s2, od_coeff *s3,
-                              od_coeff *s4, od_coeff *s5,
-                              od_coeff *s6, od_coeff *s7,
-                              od_coeff *s8, od_coeff *s9,
-                              od_coeff *sa, od_coeff *sb,
-                              od_coeff *sc, od_coeff *sd,
-                              od_coeff *se, od_coeff *sf) {
-  od_coeff s0h;
-  od_coeff s2h;
-  od_coeff sdh;
-  od_coeff sfh;
-
-  /* Stage 0 */
-
-  /* 24279/32768 = (Sin[31*Pi/64] + Cos[31*Pi/64])/Sqrt[2] = 0.74095112535496 */
-  /* 44011/32768 = (Sin[31*Pi/64] - Cos[31*Pi/64])*Sqrt[2] = 1.34311790969404 */
-  /*  1137/16384 = Cos[31*Pi/64]*Sqrt[2]                   = 0.06939217050794 */
-  od_rotate_add(s0, sf, 24279, 15, 44011, 15, 1137, 14, SHIFT);
-
-  /* 1645/2048 = (Sin[29*Pi/64] + Cos[29*Pi/64])/Sqrt[2] = 0.8032075314806449 */
-  /*   305/256 = (Sin[29*Pi/64] - Cos[29*Pi/64])*Sqrt[2] = 1.1913986089848667 */
-  /*  425/2048 = Cos[29*Pi/64]*Sqrt[2]                   = 0.2075082269882116 */
-  od_rotate_sub(se, s1, 1645, 11, 305, 8, 425, 11, SHIFT);
-
-  /* 14053/32768 = (Sin[27*Pi/64] + Cos[27*Pi/64])/Sqrt[2] = 0.85772861000027 */
-  /*   8423/8192 = (Sin[27*Pi/64] - Cos[27*Pi/64])*Sqrt[2] = 1.02820548838644 */
-  /*   2815/8192 = Cos[27*Pi/64]*Sqrt[2]                   = 0.34362586580705 */
-  od_rotate_add(s2, sd, 14053, 14, 8423, 13, 2815, 13, SHIFT);
-
-  /* 14811/16384 = (Sin[25*Pi/64] + Cos[25*Pi/64])/Sqrt[2] = 0.90398929312344 */
-  /*   7005/8192 = (Sin[25*Pi/64] - Cos[25*Pi/64])*Sqrt[2] = 0.85511018686056 */
-  /*   3903/8192 = Cos[25*Pi/64]*Sqrt[2]                   = 0.47643419969316 */
-  od_rotate_sub(sc, s3, 14811, 14, 7005, 13, 3903, 13, SHIFT);
-
-  /* 30853/32768 = (Sin[23*Pi/64] + Cos[23*Pi/64])/Sqrt[2] = 0.94154406518302 */
-  /* 11039/16384 = (Sin[23*Pi/64] - Cos[23*Pi/64])*Sqrt[2] = 0.67377970678444 */
-  /* 19813/32768 = Cos[23*Pi/64]*Sqrt[2]                   = 0.60465421179080 */
-  od_rotate_add(s4, sb, 30853, 15, 11039, 14, 19813, 15, SHIFT);
-
-  /* 15893/16384 = (Sin[21*Pi/64] + Cos[21*Pi/64])/Sqrt[2] = 0.97003125319454 */
-  /*   3981/8192 = (Sin[21*Pi/64] - Cos[21*Pi/64])*Sqrt[2] = 0.4859603598 */
-  /*   1489/2048 = Cos[21*Pi/64]*Sqrt[2]                   = 0.72705107329128 */
-  od_rotate_sub(sa, s5, 15893, 14, 3981, 13, 1489, 11, SHIFT);
-
-  /* 32413/32768 = (Sin[19*Pi/64] + Cos[19*Pi/64])/Sqrt[2] = 0.98917650996478 */
-  /*    601/2048 = (Sin[19*Pi/64] - Cos[19*Pi/64])*Sqrt[2] = 0.29346094891072 */
-  /* 27605/32768 = Cos[19*Pi/64]*Sqrt[2]                   = 0.84244603550942 */
-  od_rotate_add(s6, s9, 32413, 15, 601, 11, 27605, 15, SHIFT);
-
-  /* 32729/32768 = (Sin[17*Pi/64] + Cos[17*Pi/64])/Sqrt[2] = 0.99879545620517 */
-  /*    201/2048 = (Sin[17*Pi/64] - Cos[17*Pi/64])*Sqrt[2] = 0.09813534865484 */
-  /* 31121/32768 = Cos[17*Pi/64]*Sqrt[2]                   = 0.94972778187775 */
-  od_rotate_sub(s8, s7, 32729, 15, 201, 11, 31121, 15, SHIFT);
-
-  /* Stage 1 */
-
-  od_butterfly_sub_asym(s0, od_rshift1(*s0), s7);
-  od_butterfly_sub_asym(s8, od_rshift1(*s8), sf);
-  od_butterfly_add_asym(s4, od_rshift1(*s4), s3);
-  od_butterfly_add_asym(sc, od_rshift1(*sc), sb);
-  od_butterfly_sub_asym(s2, od_rshift1(*s2), s5);
-  od_butterfly_sub_asym(sa, od_rshift1(*sa), sd);
-  od_butterfly_add_asym(s6, od_rshift1(*s6), s1);
-  od_butterfly_add_asym(se, od_rshift1(*se), s9);
-
-  /* Stage 2 */
-
-  od_butterfly_add(s8, NULL, s4);
-  od_butterfly_add(s7, NULL, sb);
-  od_butterfly_sub(sa, NULL, s6);
-  od_butterfly_sub(s5, NULL, s9);
-  od_butterfly_add(s0, &s0h, s3);
-  od_butterfly_add(sd, &sdh, se);
-  od_butterfly_sub(s2, &s2h, s1);
-  od_butterfly_sub(sf, &sfh, sc);
-
-  /* Stage 3 */
-
-  /*   9633/8192 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586 */
-  /* 12873/16384 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 */
-  /* 12785/32768 = 2*Cos[7*Pi/16]              = 0.3901806440322565 */
-  od_rotate_add_avg(s8, s7, 9633, 13, 12873, 14, 12785, 15, NONE);
-
-  /* 45451/32768 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475 */
-  /*  9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431 */
-  /* 18205/32768 = Cos[5*Pi/16]                = 0.5555702330196022 */
-  od_rotate_add(s9, s6, 45451, 15, 9041, 15, 18205, 15, NONE);
-
-  /* 22725/16384 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475 */
-  /*  9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431 */
-  /* 18205/16384 = 2*Cos[5*Pi/16]              = 1.1111404660392044 */
-  od_rotate_neg_avg(s5, sa, 22725, 14, 9041, 15, 18205, 14);
-
-  /* 38531/32768 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586 */
-  /* 12873/16384 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 */
-  /*  6393/32768 = Cos[7*Pi/16]                = 0.1950903220161283 */
-  od_rotate_neg(s4, sb, 38531, 15, 12873, 14, 6393, 15);
-
-  /* Stage 4 */
-
-  od_butterfly_add_asym(s2, s2h, sc);
-  od_butterfly_sub_asym(s0, s0h, s1);
-  od_butterfly_add_asym(sf, sfh, se);
-  od_butterfly_add_asym(sd, sdh, s3);
-  od_butterfly_add_asym(s7, od_rshift1(*s7), s6);
-  od_butterfly_sub_asym(s8, od_rshift1(*s8), s9);
-  od_butterfly_sub_asym(sa, od_rshift1(*sa), sb);
-  od_butterfly_add_asym(s5, od_rshift1(*s5), s4);
-
-  /* Stage 5 */
-
-  /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
-  /*  8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
-  /*   3135/4096 = 2*Cos[3*Pi/8]             = 0.7653668647301796 */
-  od_rotate_add_avg(sc, s3, 21407, 14, 8867, 14, 3135, 12, NONE);
-
-  /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
-  /*  8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
-  /*   3135/4096 = 2*Cos[3*Pi/8]             = 0.7653668647301796 */
-  od_rotate_neg_avg(s2, sd, 21407, 14, 8867, 14, 3135, 12);
-
-  /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
-  /* 11585/8192 = 2*Cos[Pi/4]           = 1.4142135623730951 */
-  od_rotate_pi4_add_avg(sa, s5, 11585, 13, 11585, 13);
-
-  /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
-  /* 11585/8192 = 2*Cos[Pi/4]           = 1.4142135623730951 */
-  od_rotate_pi4_add_avg(s6, s9, 11585, 13, 11585, 13);
-
-  /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
-  /* 11585/8192 = 2*Cos[Pi/4]           = 1.4142135623730951 */
-  od_rotate_pi4_add_avg(se, s1, 11585, 13, 11585, 13);
-}
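
The fractions quoted in the rotation comments above are fixed-point approximations of the tabulated trigonometric values, with the shift passed alongside each constant selecting the denominator (for example 21407 with a shift of 14 stands for 21407/16384). As an illustration only, and not part of the removed Daala code, the hypothetical snippet below rounds a few of the Stage 5 values into the same Q formats; most constants in this file agree with such a rounding, though a few appear to be hand-tuned.

#include <math.h>
#include <stdio.h>

/* Hypothetical check: round a real value to the n/2^shift fixed-point
   form quoted in the rotation comments above. */
static long q_const(double value, int shift) {
  return lround(value * (double)(1 << shift));
}

int main(void) {
  const double pi = 3.14159265358979323846;
  const double s = sin(3 * pi / 8), c = cos(3 * pi / 8);
  /* Expected output: 21407 (s + c, Q14), 8867 (s - c, Q14), 3135 (2c, Q12),
     matching the constants documented before od_rotate_add_avg(sc, s3, ...). */
  printf("%ld %ld %ld\n", q_const(s + c, 14), q_const(s - c, 14),
         q_const(2 * c, 12));
  return 0;
}
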
-
-/**
- * 16-point orthonormal Type-IV iDST
- */
-static INLINE void od_idst_16(od_coeff *s0, od_coeff *s8,
-                              od_coeff *s4, od_coeff *sc,
-                              od_coeff *s2, od_coeff *sa,
-                              od_coeff *s6, od_coeff *se,
-                              od_coeff *s1, od_coeff *s9,
-                              od_coeff *s5, od_coeff *sd,
-                              od_coeff *s3, od_coeff *sb,
-                              od_coeff *s7, od_coeff *sf) {
-  od_coeff s0h;
-  od_coeff s2h;
-  od_coeff s4h;
-  od_coeff s6h;
-  od_coeff s8h;
-  od_coeff sah;
-  od_coeff sch;
-  od_coeff sdh;
-  od_coeff seh;
-  od_coeff sfh;
-
-  /* Stage 5 */
-
-  /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
-  /* 11585/8192 = 2*Cos[Pi/4]           = 1.4142135623730951 */
-  od_rotate_pi4_add_avg(s6, s9, 11585, 13, 11585, 13);
-
-  /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
-  /* 11585/8192 = 2*Cos[Pi/4]           = 1.4142135623730951 */
-  od_rotate_pi4_add_avg(sa, s5, 11585, 13, 11585, 13);
-
-  /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
-  /* 11585/8192 = 2*Cos[Pi/4]           = 1.4142135623730951 */
-  od_rotate_pi4_add_avg(se, s1, 11585, 13, 11585, 13);
-
-  /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
-  /*  8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
-  /*   3135/4096 = 2*Cos[3*Pi/8]             = 0.7653668647301796 */
-  od_rotate_add_avg(sc, s3, 21407, 14, 8867, 14, 3135, 12, NONE);
-
-  /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
-  /*  8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
-  /*   3135/4096 = 2*Cos[3*Pi/8]             = 0.7653668647301796 */
-  od_rotate_neg_avg(sd, s2, 21407, 14, 8867, 14, 3135, 12);
-
-  /* Stage 4 */
-
-  od_butterfly_add(s5, NULL, s4);
-  od_butterfly_sub(sa, NULL, sb);
-  od_butterfly_sub(s8, NULL, s9);
-  od_butterfly_add(s7, NULL, s6);
-  od_butterfly_add(sd, &sdh, s3);
-  od_butterfly_add(sf, &sfh, se);
-  od_butterfly_sub(s0, &s0h, s1);
-  od_butterfly_add(s2, &s2h, sc);
-
-  /* Stage 3 */
-
-  /*   9633/8192 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586 */
-  /* 12873/16384 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 */
-  /* 12785/32768 = 2*Cos[7*Pi/16]              = 0.3901806440322565 */
-  od_rotate_add_avg(s8, s7, 9633, 13, 12873, 14, 12785, 15, NONE);
-
-  /* 45451/32768 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475 */
-  /*  9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431 */
-  /* 18205/32768 = Cos[5*Pi/16]                = 0.5555702330196022 */
-  od_rotate_add(s9, s6, 45451, 15, 9041, 15, 18205, 15, NONE);
-
-  /* 22725/16384 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475 */
-  /*  9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431 */
-  /* 18205/16384 = 2*Cos[5*Pi/16]              = 1.1111404660392044 */
-  od_rotate_neg_avg(sa, s5, 22725, 14, 9041, 15, 18205, 14);
-
-  /* 38531/32768 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586 */
-  /* 12873/16384 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 */
-  /*  6393/32768 = Cos[7*Pi/16]                = 0.1950903220161283 */
-  od_rotate_neg(sb, s4, 38531, 15, 12873, 14, 6393, 15);
-
-  /* Stage 2 */
-
-  od_butterfly_add_asym(s8, od_rshift1(*s8), s4);
-  od_butterfly_add_asym(s7, od_rshift1(*s7), sb);
-  od_butterfly_sub_asym(sa, od_rshift1(*sa), s6);
-  od_butterfly_sub_asym(s5, od_rshift1(*s5), s9);
-  od_butterfly_add_asym(s0, s0h, s3);
-  od_butterfly_add_asym(sd, sdh, se);
-  od_butterfly_sub_asym(s2, s2h, s1);
-  od_butterfly_sub_asym(sf, sfh, sc);
-
-  /* Stage 1 */
-
-  od_butterfly_sub(s0, &s0h, s7);
-  od_butterfly_sub(s8, &s8h, sf);
-  od_butterfly_add(s4, &s4h, s3);
-  od_butterfly_add(sc, &sch, sb);
-  od_butterfly_sub(s2, &s2h, s5);
-  od_butterfly_sub(sa, &sah, sd);
-  od_butterfly_add(s6, &s6h, s1);
-  od_butterfly_add(se, &seh, s9);
-
-  /* Stage 0 */
-
-  /* 32729/32768 = (Sin[17*Pi/64] + Cos[17*Pi/64])/Sqrt[2] = 0.99879545620517 */
-  /*    201/2048 = (Sin[17*Pi/64] - Cos[17*Pi/64])*Sqrt[2] = 0.09813534865484 */
-  /* 31121/32768 = Cos[17*Pi/64]*Sqrt[2]                   = 0.94972778187775 */
-  od_rotate_sub_half(s8, s7, s8h, 32729, 15, 201, 11, 31121, 15, NONE);
-
-  /* 32413/32768 = (Sin[19*Pi/64] + Cos[19*Pi/64])/Sqrt[2] = 0.98917650996478 */
-  /*    601/2048 = (Sin[19*Pi/64] - Cos[19*Pi/64])*Sqrt[2] = 0.29346094891072 */
-  /* 27605/32768 = Cos[19*Pi/64]*Sqrt[2]                   = 0.84244603550942 */
-  od_rotate_add_half(s6, s9, s6h, 32413, 15, 601, 11, 27605, 15, NONE);
-
-  /* 15893/16384 = (Sin[21*Pi/64] + Cos[21*Pi/64])/Sqrt[2] = 0.97003125319454 */
-  /*   3981/8192 = (Sin[21*Pi/64] - Cos[21*Pi/64])*Sqrt[2] = 0.48596035980653 */
-  /*   1489/2048 = Cos[21*Pi/64]*Sqrt[2]                   = 0.72705107329128 */
-  od_rotate_sub_half(sa, s5, sah, 15893, 14, 3981, 13, 1489, 11, NONE);
-
-  /* 30853/32768 = (Sin[23*Pi/64] + Cos[23*Pi/64])/Sqrt[2] = 0.94154406518302 */
-  /* 11039/16384 = (Sin[23*Pi/64] - Cos[23*Pi/64])*Sqrt[2] = 0.67377970678444 */
-  /* 19813/32768 = Cos[23*Pi/64]*Sqrt[2]                   = 0.60465421179080 */
-  od_rotate_add_half(s4, sb, s4h, 30853, 15, 11039, 14, 19813, 15, NONE);
-
-  /* 14811/16384 = (Sin[25*Pi/64] + Cos[25*Pi/64])/Sqrt[2] = 0.90398929312344 */
-  /*   7005/8192 = (Sin[25*Pi/64] - Cos[25*Pi/64])*Sqrt[2] = 0.85511018686056 */
-  /*   3903/8192 = Cos[25*Pi/64]*Sqrt[2]                   = 0.47643419969316 */
-  od_rotate_sub_half(sc, s3, sch, 14811, 14, 7005, 13, 3903, 13, NONE);
-
-  /* 14053/32768 = (Sin[27*Pi/64] + Cos[27*Pi/64])/Sqrt[2] = 0.85772861000027 */
-  /*   8423/8192 = (Sin[27*Pi/64] - Cos[27*Pi/64])*Sqrt[2] = 1.02820548838644 */
-  /*   2815/8192 = Cos[27*Pi/64]*Sqrt[2]                   = 0.34362586580705 */
-  od_rotate_add_half(s2, sd, s2h, 14053, 14, 8423, 13, 2815, 13, NONE);
-
-  /* 1645/2048 = (Sin[29*Pi/64] + Cos[29*Pi/64])/Sqrt[2] = 0.8032075314806449 */
-  /*   305/256 = (Sin[29*Pi/64] - Cos[29*Pi/64])*Sqrt[2] = 1.1913986089848667 */
-  /*  425/2048 = Cos[29*Pi/64]*Sqrt[2]                   = 0.2075082269882116 */
-  od_rotate_sub_half(se, s1, seh, 1645, 11, 305, 8, 425, 11, NONE);
-
-  /* 24279/32768 = (Sin[31*Pi/64] + Cos[31*Pi/64])/Sqrt[2] = 0.74095112535496 */
-  /* 44011/32768 = (Sin[31*Pi/64] - Cos[31*Pi/64])*Sqrt[2] = 1.34311790969404 */
-  /*  1137/16384 = Cos[31*Pi/64]*Sqrt[2]                   = 0.06939217050794 */
-  od_rotate_add_half(s0, sf, s0h, 24279, 15, 44011, 15, 1137, 14, NONE);
-}
-
-/**
- * 16-point asymmetric Type-IV fDST
- */
-static INLINE void od_fdst_16_asym(od_coeff *s0, od_coeff s0h, od_coeff *s1,
-                                   od_coeff *s2, od_coeff s2h, od_coeff *s3,
-                                   od_coeff *s4, od_coeff s4h, od_coeff *s5,
-                                   od_coeff *s6, od_coeff s6h, od_coeff *s7,
-                                   od_coeff *s8, od_coeff s8h, od_coeff *s9,
-                                   od_coeff *sa, od_coeff sah, od_coeff *sb,
-                                   od_coeff *sc, od_coeff sch, od_coeff *sd,
-                                   od_coeff *se, od_coeff seh, od_coeff *sf) {
-  od_coeff sdh;
-  od_coeff sfh;
-
-  /* Stage 0 */
-
-  /*   1073/2048 = (Sin[31*Pi/64] + Cos[31*Pi/64])/2 = 0.5239315652662953 */
-  /* 62241/32768 = (Sin[31*Pi/64] - Cos[31*Pi/64])*2 = 1.8994555637555088 */
-  /*    201/2048 = Cos[31*Pi/64]*2                   = 0.0981353486548360 */
-  od_rotate_add_half(s0, sf, s0h, 1073, 11, 62241, 15, 201, 11, SHIFT);
-
-  /* 18611/32768 = (Sin[29*Pi/64] + Cos[29*Pi/64])/2 = 0.5679534922100714 */
-  /* 55211/32768 = (Sin[29*Pi/64] - Cos[29*Pi/64])*2 = 1.6848920710188384 */
-  /*    601/2048 = Cos[29*Pi/64]*2                   = 0.2934609489107235 */
-  od_rotate_sub_half(se, s1, seh, 18611, 15, 55211, 15, 601, 11, SHIFT);
-
-  /*  9937/16384 = (Sin[27*Pi/64] + Cos[27*Pi/64])/2 = 0.6065057165489039 */
-  /*   1489/1024 = (Sin[27*Pi/64] - Cos[27*Pi/64])*2 = 1.4541021465825602 */
-  /*   3981/8192 = Cos[27*Pi/64]*2                   = 0.4859603598065277 */
-  od_rotate_add_half(s2, sd, s2h, 9937, 14, 1489, 10, 3981, 13, SHIFT);
-
-  /* 10473/16384 = (Sin[25*Pi/64] + Cos[25*Pi/64])/2 = 0.6392169592876205 */
-  /* 39627/32768 = (Sin[25*Pi/64] - Cos[25*Pi/64])*2 = 1.2093084235816014 */
-  /* 11039/16384 = Cos[25*Pi/64]*2                   = 0.6737797067844401 */
-  od_rotate_sub_half(sc, s3, sch, 10473, 14, 39627, 15, 11039, 14, SHIFT);
-
-  /* 2727/4096 = (Sin[23*Pi/64] + Cos[23*Pi/64])/2 = 0.6657721932768628 */
-  /* 3903/4096 = (Sin[23*Pi/64] - Cos[23*Pi/64])*2 = 0.9528683993863225 */
-  /* 7005/8192 = Cos[23*Pi/64]*2                   = 0.8551101868605642 */
-  od_rotate_add_half(s4, sb, s4h, 2727, 12, 3903, 12, 7005, 13, SHIFT);
-
-  /* 5619/8192 = (Sin[21*Pi/64] + Cos[21*Pi/64])/2 = 0.6859156770967569 */
-  /* 2815/4096 = (Sin[21*Pi/64] - Cos[21*Pi/64])*2 = 0.6872517316141069 */
-  /* 8423/8192 = Cos[21*Pi/64]*2                   = 1.0282054883864433 */
-  od_rotate_sub_half(sa, s5, sah, 5619, 13, 2815, 12, 8423, 13, SHIFT);
-
-  /*   2865/4096 = (Sin[19*Pi/64] + Cos[19*Pi/64])/2 = 0.6994534179865391 */
-  /* 13599/32768 = (Sin[19*Pi/64] - Cos[19*Pi/64])*2 = 0.4150164539764232 */
-  /*     305/256 = Cos[19*Pi/64]*2                   = 1.1913986089848667 */
-  od_rotate_add_half(s6, s9, s6h, 2865, 12, 13599, 15, 305, 8, SHIFT);
-
-  /* 23143/32768 = (Sin[17*Pi/64] + Cos[17*Pi/64])/2 = 0.7062550401009887 */
-  /*   1137/8192 = (Sin[17*Pi/64] - Cos[17*Pi/64])*2 = 0.1387843410158816 */
-  /* 44011/32768 = Cos[17*Pi/64]*2                   = 1.3431179096940367 */
-  od_rotate_sub_half(s8, s7, s8h, 23143, 15, 1137, 13, 44011, 15, SHIFT);
-
-  /* Stage 1 */
-
-  od_butterfly_sub_asym(s0, od_rshift1(*s0), s7);
-  od_butterfly_sub_asym(s8, od_rshift1(*s8), sf);
-  od_butterfly_add_asym(s4, od_rshift1(*s4), s3);
-  od_butterfly_add_asym(sc, od_rshift1(*sc), sb);
-  od_butterfly_sub_asym(s2, od_rshift1(*s2), s5);
-  od_butterfly_sub_asym(sa, od_rshift1(*sa), sd);
-  od_butterfly_add_asym(s6, od_rshift1(*s6), s1);
-  od_butterfly_add_asym(se, od_rshift1(*se), s9);
-
-  /* Stage 2 */
-
-  od_butterfly_add(s8, NULL, s4);
-  od_butterfly_add(s7, NULL, sb);
-  od_butterfly_sub(sa, NULL, s6);
-  od_butterfly_sub(s5, NULL, s9);
-  od_butterfly_add(s0, &s0h, s3);
-  od_butterfly_add(sd, &sdh, se);
-  od_butterfly_sub(s2, &s2h, s1);
-  od_butterfly_sub(sf, &sfh, sc);
-
-  /* Stage 3 */
-
-  /*   9633/8192 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586 */
-  /* 12873/16384 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 */
-  /*  6393/32768 = Cos[7*Pi/16]                = 0.1950903220161283 */
-  od_rotate_add(s8, s7, 9633, 13, 12873, 14, 6393, 15, NONE);
-
-  /* 45451/32768 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475 */
-  /*  9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431 */
-  /* 18205/32768 = Cos[5*Pi/16]                = 0.5555702330196022 */
-  od_rotate_add(s9, s6, 45451, 15, 9041, 15, 18205, 15, NONE);
-
-  /*  11363/8192 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475 */
-  /*  9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431 */
-  /*   4551/8192 = Cos[5*Pi/16]                = 0.5555702330196022 */
-  od_rotate_neg(s5, sa, 11363, 13, 9041, 15, 4551, 13);
-
-  /*   9633/8192 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586 */
-  /* 12873/16384 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 */
-  /*  6393/32768 = Cos[7*Pi/16]                = 0.1950903220161283 */
-  od_rotate_neg(s4, sb, 9633, 13, 12873, 14, 6393, 15);
-
-  /* Stage 4 */
-
-  od_butterfly_add_asym(s2, s2h, sc);
-  od_butterfly_sub_asym(s0, s0h, s1);
-  od_butterfly_add_asym(sf, sfh, se);
-  od_butterfly_add_asym(sd, sdh, s3);
-  od_butterfly_add_asym(s7, od_rshift1(*s7), s6);
-  od_butterfly_sub_asym(s8, od_rshift1(*s8), s9);
-  od_butterfly_sub_asym(sa, od_rshift1(*sa), sb);
-  od_butterfly_add_asym(s5, od_rshift1(*s5), s4);
-
-  /* Stage 5 */
-
-  /*  10703/8192 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
-  /*  8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
-  /*   3135/8192 = Cos[3*Pi/8]               = 0.3826834323650898 */
-  od_rotate_add(sc, s3, 10703, 13, 8867, 14, 3135, 13, NONE);
-
-  /*  10703/8192 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
-  /*  8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
-  /*   3135/8192 = Cos[3*Pi/8]               = 0.3826834323650898 */
-  od_rotate_neg(s2, sd, 10703, 13, 8867, 14, 3135, 13);
-
-  /*  11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
-  /* 11585/16384 = Cos[Pi/4]             = 0.7071067811865475 */
-  od_rotate_pi4_add(sa, s5, 11585, 13, 11585, 14);
-
-  /*  11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
-  /* 11585/16384 = Cos[Pi/4]             = 0.7071067811865475 */
-  od_rotate_pi4_add(s6, s9, 11585, 13, 11585, 14);
-
-  /*  11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
-  /* 11585/16384 = Cos[Pi/4]             = 0.7071067811865475 */
-  od_rotate_pi4_add(se, s1, 11585, 13, 11585, 14);
-}
-
-/**
- * 16-point asymmetric Type-IV iDST
- */
-static INLINE void od_idst_16_asym(od_coeff *s0, od_coeff *s8,
-                                   od_coeff *s4, od_coeff *sc,
-                                   od_coeff *s2, od_coeff *sa,
-                                   od_coeff *s6, od_coeff *se,
-                                   od_coeff *s1, od_coeff *s9,
-                                   od_coeff *s5, od_coeff *sd,
-                                   od_coeff *s3, od_coeff *sb,
-                                   od_coeff *s7, od_coeff *sf) {
-  od_coeff s0h;
-  od_coeff s2h;
-  od_coeff s4h;
-  od_coeff s6h;
-  od_coeff s8h;
-  od_coeff sah;
-  od_coeff sch;
-  od_coeff sdh;
-  od_coeff seh;
-  od_coeff sfh;
-
-  /* Stage 5 */
-
-  /*  11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
-  /* 11585/16384 = Cos[Pi/4]             = 0.7071067811865475 */
-  od_rotate_pi4_add(s6, s9, 11585, 13, 11585, 14);
-
-  /*  11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
-  /* 11585/16384 = Cos[Pi/4]             = 0.7071067811865475 */
-  od_rotate_pi4_add(sa, s5, 11585, 13, 11585, 14);
-
-  /*  11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
-  /* 11585/16384 = Cos[Pi/4]             = 0.7071067811865475 */
-  od_rotate_pi4_add(se, s1, 11585, 13, 11585, 14);
-
-  /*  10703/8192 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
-  /*  8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
-  /*   3135/8192 = Cos[3*Pi/8]               = 0.3826834323650898 */
-  od_rotate_add(sc, s3, 10703, 13, 8867, 14, 3135, 13, NONE);
-
-  /*  10703/8192 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
-  /*  8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
-  /*   3135/8192 = Cos[3*Pi/8]               = 0.3826834323650898 */
-  od_rotate_neg(sd, s2, 10703, 13, 8867, 14, 3135, 13);
-
-  /* Stage 4 */
-
-  od_butterfly_add(s5, NULL, s4);
-  od_butterfly_sub(sa, NULL, sb);
-  od_butterfly_sub(s8, NULL, s9);
-  od_butterfly_add(s7, NULL, s6);
-  od_butterfly_add(sd, &sdh, s3);
-  od_butterfly_add(sf, &sfh, se);
-  od_butterfly_sub(s0, &s0h, s1);
-  od_butterfly_add(s2, &s2h, sc);
-
-  /* Stage 3 */
-
-  /*   9633/8192 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586 */
-  /* 12873/16384 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 */
-  /*  6393/32768 = Cos[7*Pi/16]                = 0.1950903220161283 */
-  od_rotate_neg(sb, s4, 9633, 13, 12873, 14, 6393, 15);
-
-  /*  11363/8192 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475 */
-  /*  9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431 */
-  /*   4551/8192 = Cos[5*Pi/16]                = 0.5555702330196022 */
-  od_rotate_neg(sa, s5, 11363, 13, 9041, 15, 4551, 13);
-
-  /* 22725/16384 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475 */
-  /*  9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431 */
-  /* 18205/32768 = Cos[5*Pi/16]                = 0.5555702330196022 */
-  od_rotate_add(s9, s6, 22725, 14, 9041, 15, 18205, 15, NONE);
-
-  /*   9633/8192 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586 */
-  /* 12873/16384 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 */
-  /*  6393/32768 = Cos[7*Pi/16]                = 0.1950903220161283 */
-  od_rotate_add(s8, s7, 9633, 13, 12873, 14, 6393, 15, NONE);
-
-  /* Stage 2 */
-
-  od_butterfly_add_asym(s8, od_rshift1(*s8), s4);
-  od_butterfly_add_asym(s7, od_rshift1(*s7), sb);
-  od_butterfly_sub_asym(sa, od_rshift1(*sa), s6);
-  od_butterfly_sub_asym(s5, od_rshift1(*s5), s9);
-  od_butterfly_add_asym(s0, s0h, s3);
-  od_butterfly_add_asym(sd, sdh, se);
-  od_butterfly_sub_asym(s2, s2h, s1);
-  od_butterfly_sub_asym(sf, sfh, sc);
-
-  /* Stage 1 */
-
-  od_butterfly_sub(s0, &s0h, s7);
-  od_butterfly_sub(s8, &s8h, sf);
-  od_butterfly_add(s4, &s4h, s3);
-  od_butterfly_add(sc, &sch, sb);
-  od_butterfly_sub(s2, &s2h, s5);
-  od_butterfly_sub(sa, &sah, sd);
-  od_butterfly_add(s6, &s6h, s1);
-  od_butterfly_add(se, &seh, s9);
-
-  /* Stage 0 */
-
-  /* 23143/32768 = (Sin[17*Pi/64] + Cos[17*Pi/64])/2 = 0.7062550401009887 */
-  /*   1137/8192 = (Sin[17*Pi/64] - Cos[17*Pi/64])*2 = 0.1387843410158816 */
-  /* 44011/32768 = Cos[17*Pi/64]*2                   = 1.3431179096940367 */
-  od_rotate_sub_half(s8, s7, s8h, 23143, 15, 1137, 13, 44011, 15, SHIFT);
-
-  /*   2865/4096 = (Sin[19*Pi/64] + Cos[19*Pi/64])/2 = 0.6994534179865391 */
-  /* 13599/32768 = (Sin[19*Pi/64] - Cos[19*Pi/64])*2 = 0.4150164539764232 */
-  /*     305/256 = Cos[19*Pi/64]*2                   = 1.1913986089848667 */
-  od_rotate_add_half(s6, s9, s6h, 2865, 12, 13599, 15, 305, 8, SHIFT);
-
-  /* 5619/8192 = (Sin[21*Pi/64] + Cos[21*Pi/64])/2 = 0.6859156770967569 */
-  /* 2815/4096 = (Sin[21*Pi/64] - Cos[21*Pi/64])*2 = 0.6872517316141069 */
-  /* 8423/8192 = Cos[21*Pi/64]*2                   = 1.0282054883864433 */
-  od_rotate_sub_half(sa, s5, sah, 5619, 13, 2815, 12, 8423, 13, SHIFT);
-
-  /* 2727/4096 = (Sin[23*Pi/64] + Cos[23*Pi/64])/2 = 0.6657721932768628 */
-  /* 3903/4096 = (Sin[23*Pi/64] - Cos[23*Pi/64])*2 = 0.9528683993863225 */
-  /* 7005/8192 = Cos[23*Pi/64]*2                   = 0.8551101868605642 */
-  od_rotate_add_half(s4, sb, s4h, 2727, 12, 3903, 12, 7005, 13, SHIFT);
-
-  /* 10473/16384 = (Sin[25*Pi/64] + Cos[25*Pi/64])/2 = 0.6392169592876205 */
-  /* 39627/32768 = (Sin[25*Pi/64] - Cos[25*Pi/64])*2 = 1.2093084235816014 */
-  /* 11039/16384 = Cos[25*Pi/64]*2                   = 0.6737797067844401 */
-  od_rotate_sub_half(sc, s3, sch, 10473, 14, 39627, 15, 11039, 14, SHIFT);
-
-  /*  9937/16384 = (Sin[27*Pi/64] + Cos[27*Pi/64])/2 = 0.6065057165489039 */
-  /*   1489/1024 = (Sin[27*Pi/64] - Cos[27*Pi/64])*2 = 1.4541021465825602 */
-  /*   3981/8192 = Cos[27*Pi/64]*2                   = 0.4859603598065277 */
-  od_rotate_add_half(s2, sd, s2h, 9937, 14, 1489, 10, 3981, 13, SHIFT);
-
-  /* 18611/32768 = (Sin[29*Pi/64] + Cos[29*Pi/64])/2 = 0.5679534922100714 */
-  /* 55211/32768 = (Sin[29*Pi/64] - Cos[29*Pi/64])*2 = 1.6848920710188384 */
-  /*    601/2048 = Cos[29*Pi/64]*2                   = 0.2934609489107235 */
-  od_rotate_sub_half(se, s1, seh, 18611, 15, 55211, 15, 601, 11, SHIFT);
-
-  /*   1073/2048 = (Sin[31*Pi/64] + Cos[31*Pi/64])/2 = 0.5239315652662953 */
-  /* 62241/32768 = (Sin[31*Pi/64] - Cos[31*Pi/64])*2 = 1.8994555637555088 */
-  /*    201/2048 = Cos[31*Pi/64]*2                   = 0.0981353486548360 */
-  od_rotate_add_half(s0, sf, s0h, 1073, 11, 62241, 15, 201, 11, SHIFT);
-}
-
-/* --- 32-point Transforms --- */
-
-/**
- * 32-point orthonormal Type-II fDCT
- */
-static INLINE void od_fdct_32(od_coeff *t0, od_coeff *t1,
-                              od_coeff *t2, od_coeff *t3,
-                              od_coeff *t4, od_coeff *t5,
-                              od_coeff *t6, od_coeff *t7,
-                              od_coeff *t8, od_coeff *t9,
-                              od_coeff *ta, od_coeff *tb,
-                              od_coeff *tc, od_coeff *td,
-                              od_coeff *te, od_coeff *tf,
-                              od_coeff *tg, od_coeff *th,
-                              od_coeff *ti, od_coeff *tj,
-                              od_coeff *tk, od_coeff *tl,
-                              od_coeff *tm, od_coeff *tn,
-                              od_coeff *to, od_coeff *tp,
-                              od_coeff *tq, od_coeff *tr,
-                              od_coeff *ts, od_coeff *tt,
-                              od_coeff *tu, od_coeff *tv) {
-  od_coeff t1h;
-  od_coeff t3h;
-  od_coeff t5h;
-  od_coeff t7h;
-  od_coeff t9h;
-  od_coeff tbh;
-  od_coeff tdh;
-  od_coeff tfh;
-  od_coeff thh;
-  od_coeff tjh;
-  od_coeff tlh;
-  od_coeff tnh;
-  od_coeff tph;
-  od_coeff trh;
-  od_coeff tth;
-  od_coeff tvh;
-
-  /* +/- Butterflies with asymmetric output. */
-  od_butterfly_neg(t0, tv, &tvh);
-  od_butterfly_add(t1, &t1h, tu);
-  od_butterfly_neg(t2, tt, &tth);
-  od_butterfly_add(t3, &t3h, ts);
-  od_butterfly_neg(t4, tr, &trh);
-  od_butterfly_add(t5, &t5h, tq);
-  od_butterfly_neg(t6, tp, &tph);
-  od_butterfly_add(t7, &t7h, to);
-  od_butterfly_neg(t8, tn, &tnh);
-  od_butterfly_add(t9, &t9h, tm);
-  od_butterfly_neg(ta, tl, &tlh);
-  od_butterfly_add(tb, &tbh, tk);
-  od_butterfly_neg(tc, tj, &tjh);
-  od_butterfly_add(td, &tdh, ti);
-  od_butterfly_neg(te, th, &thh);
-  od_butterfly_add(tf, &tfh, tg);
-
-  /* Embedded 16-point transforms with asymmetric input. */
-  od_fdct_16_asym(
-   t0, t1, t1h, t2, t3, t3h, t4, t5, t5h, t6, t7, t7h,
-   t8, t9, t9h, ta, tb, tbh, tc, td, tdh, te, tf, tfh);
-  od_fdst_16_asym(
-   tv, tvh, tu, tt, tth, ts, tr, trh, tq, tp, tph, to,
-   tn, tnh, tm, tl, tlh, tk, tj, tjh, ti, th, thh, tg);
-}
-
-/**
- * 32-point orthonormal Type-II iDCT
- */
-static INLINE void od_idct_32(od_coeff *t0, od_coeff *tg,
-                              od_coeff *t8, od_coeff *to,
-                              od_coeff *t4, od_coeff *tk,
-                              od_coeff *tc, od_coeff *ts,
-                              od_coeff *t2, od_coeff *ti,
-                              od_coeff *ta, od_coeff *tq,
-                              od_coeff *t6, od_coeff *tm,
-                              od_coeff *te, od_coeff *tu,
-                              od_coeff *t1, od_coeff *th,
-                              od_coeff *t9, od_coeff *tp,
-                              od_coeff *t5, od_coeff *tl,
-                              od_coeff *td, od_coeff *tt,
-                              od_coeff *t3, od_coeff *tj,
-                              od_coeff *tb, od_coeff *tr,
-                              od_coeff *t7, od_coeff *tn,
-                              od_coeff *tf, od_coeff *tv) {
-  od_coeff t1h;
-  od_coeff t3h;
-  od_coeff t5h;
-  od_coeff t7h;
-  od_coeff t9h;
-  od_coeff tbh;
-  od_coeff tdh;
-  od_coeff tfh;
-
-  /* Embedded 16-point transforms with asymmetric output. */
-  od_idst_16_asym(
-   tv, tn, tr, tj, tt, tl, tp, th, tu, tm, tq, ti, ts, tk, to, tg);
-  od_idct_16_asym(
-   t0, t8, t4, tc, t2, ta, t6, te,
-   t1, &t1h, t9, &t9h, t5, &t5h, td, &tdh,
-   t3, &t3h, tb, &tbh, t7, &t7h, tf, &tfh);
-
-  /* +/- Butterflies with asymmetric input. */
-  od_butterfly_add_asym(tf, tfh, tg);
-  od_butterfly_neg_asym(te, th, od_rshift1(*th));
-  od_butterfly_add_asym(td, tdh, ti);
-  od_butterfly_neg_asym(tc, tj, od_rshift1(*tj));
-  od_butterfly_add_asym(tb, tbh, tk);
-  od_butterfly_neg_asym(ta, tl, od_rshift1(*tl));
-  od_butterfly_add_asym(t9, t9h, tm);
-  od_butterfly_neg_asym(t8, tn, od_rshift1(*tn));
-  od_butterfly_add_asym(t7, t7h, to);
-  od_butterfly_neg_asym(t6, tp, od_rshift1(*tp));
-  od_butterfly_add_asym(t5, t5h, tq);
-  od_butterfly_neg_asym(t4, tr, od_rshift1(*tr));
-  od_butterfly_add_asym(t3, t3h, ts);
-  od_butterfly_neg_asym(t2, tt, od_rshift1(*tt));
-  od_butterfly_add_asym(t1, t1h, tu);
-  od_butterfly_neg_asym(t0, tv, od_rshift1(*tv));
-}
-
-#endif
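
Each rotation above documents three constants, (Sin + Cos), (Sin - Cos), and Cos (sometimes pre-scaled by Sqrt[2] or 2), because a planar rotation factors into three multiplies around one shared term. The sketch below illustrates that identity in fixed point, assuming simple nearest-value rounding; it is a hypothetical helper, not the removed od_rotate_* kernels, which additionally handle the SHIFT/NONE/AVG output modes and the half-value asymmetric inputs.

#include <stdint.h>

/* Multiply a by the fixed-point constant n/2^r, rounded to nearest.
   Sketch only; not the removed Daala implementation. */
static int32_t sketch_mul(int32_t a, int32_t n, int r) {
  return (int32_t)(((int64_t)a * n + (1 << (r - 1))) >> r);
}

/* Rotate (x, y) by theta with c = Cos[theta], s = Sin[theta]:
     t  = c*(x + y)
     x' = t + (s - c)*y =  c*x + s*y
     y' = t - (s + c)*x = -s*x + c*y
   Only three multiplies are needed, which is why every rotation in the
   removed file documents exactly the constants (s + c), (s - c) and c. */
static void sketch_rotate(int32_t *x, int32_t *y, int32_t spc, int spc_r,
                          int32_t smc, int smc_r, int32_t c, int c_r) {
  const int32_t t = sketch_mul(*x + *y, c, c_r);
  const int32_t xr = t + sketch_mul(*y, smc, smc_r);
  const int32_t yr = t - sketch_mul(*x, spc, spc_r);
  *x = xr;
  *y = yr;
}

Passing the constants in the same order as the comments (sum, difference, cosine) keeps the correspondence with the calls above easy to check; the true kernels differ in how they round and rescale the outputs.
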
diff --git a/av1/common/idct.c b/av1/common/idct.c
index f82b0e9..9db0333 100644
--- a/av1/common/idct.c
+++ b/av1/common/idct.c
@@ -19,22 +19,11 @@
 #include "av1/common/blockd.h"
 #include "av1/common/enums.h"
 #include "av1/common/idct.h"
-#if CONFIG_DAALA_TX4 || CONFIG_DAALA_TX8 || CONFIG_DAALA_TX16 || \
-    CONFIG_DAALA_TX32 || CONFIG_DAALA_TX64
-#include "av1/common/daala_tx.h"
-#if CONFIG_DAALA_TX
-#include "av1/common/daala_inv_txfm.h"
-#endif
-#endif
 
-#if !CONFIG_DAALA_TX
 int av1_get_tx_scale(const TX_SIZE tx_size) {
   const int pels = tx_size_2d[tx_size];
   return (pels > 256) + (pels > 1024) + (pels > 4096);
 }
-#endif
-
-#if !CONFIG_DAALA_TX
 
 // NOTE: The implementations of all inverses need to be aware of the fact
 // that input and output could be the same buffer.
@@ -63,13 +52,13 @@
   }
 }
 
-#if CONFIG_TX64X64 && (!CONFIG_DAALA_TX32 || !CONFIG_DAALA_TX64)
+#if CONFIG_TX64X64
 static void iidtx64_c(const tran_low_t *input, tran_low_t *output) {
   for (int i = 0; i < 64; ++i) {
     output[i] = (tran_low_t)dct_const_round_shift(input[i] * 4 * Sqrt2);
   }
 }
-#endif  // CONFIG_TX64X64 && (!CONFIG_DAALA_TX32 || !CONFIG_DAALA_TX64)
+#endif  // CONFIG_TX64X64
 
 // For use in lieu of ADST
 static void ihalfright32_c(const tran_low_t *input, tran_low_t *output) {
@@ -85,7 +74,7 @@
   // Note overall scaling factor is 4 times orthogonal
 }
 
-#if CONFIG_TX64X64 && (!CONFIG_DAALA_TX32 || !CONFIG_DAALA_TX64)
+#if CONFIG_TX64X64
 static const int8_t inv_stage_range_col_dct_64[12] = { 0, 0, 0, 0, 0, 0,
                                                        0, 0, 0, 0, 0, 0 };
 static const int8_t inv_stage_range_row_dct_64[12] = { 0, 0, 0, 0, 0, 0,
@@ -123,7 +112,7 @@
   aom_idct32_c(inputhalf, output + 32);
   // Note overall scaling factor is 4 * sqrt(2)  times orthogonal
 }
-#endif  // CONFIG_TX64X64 && (!CONFIG_DAALA_TX32 || !CONFIG_DAALA_TX64)
+#endif  // CONFIG_TX64X64
 
 #define FLIPUD_PTR(dest, stride, size)       \
   do {                                       \
@@ -191,31 +180,11 @@
 void av1_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                          const TxfmParam *txfm_param) {
   const TX_TYPE tx_type = txfm_param->tx_type;
-#if !CONFIG_DAALA_TX4
   if (tx_type == DCT_DCT) {
     aom_idct4x4_16_add(input, dest, stride);
     return;
   }
-#endif
   static const transform_2d IHT_4[] = {
-#if CONFIG_DAALA_TX4
-    { daala_idct4, daala_idct4 },  // DCT_DCT  = 0
-    { daala_idst4, daala_idct4 },  // ADST_DCT = 1
-    { daala_idct4, daala_idst4 },  // DCT_ADST = 2
-    { daala_idst4, daala_idst4 },  // ADST_ADST = 3
-    { daala_idst4, daala_idct4 },  // FLIPADST_DCT
-    { daala_idct4, daala_idst4 },  // DCT_FLIPADST
-    { daala_idst4, daala_idst4 },  // FLIPADST_FLIPADST
-    { daala_idst4, daala_idst4 },  // ADST_FLIPADST
-    { daala_idst4, daala_idst4 },  // FLIPADST_ADST
-    { daala_idtx4, daala_idtx4 },  // IDTX
-    { daala_idct4, daala_idtx4 },  // V_DCT
-    { daala_idtx4, daala_idct4 },  // H_DCT
-    { daala_idst4, daala_idtx4 },  // V_ADST
-    { daala_idtx4, daala_idst4 },  // H_ADST
-    { daala_idst4, daala_idtx4 },  // V_FLIPADST
-    { daala_idtx4, daala_idst4 },  // H_FLIPADST
-#else
     { aom_idct4_c, aom_idct4_c },    // DCT_DCT  = 0
     { aom_iadst4_c, aom_idct4_c },   // ADST_DCT = 1
     { aom_idct4_c, aom_iadst4_c },   // DCT_ADST = 2
@@ -232,7 +201,6 @@
     { iidtx4_c, aom_iadst4_c },      // H_ADST
     { aom_iadst4_c, iidtx4_c },      // V_FLIPADST
     { iidtx4_c, aom_iadst4_c },      // H_FLIPADST
-#endif
   };
 
   tran_low_t tmp[4][4];
@@ -242,13 +210,7 @@
 
   // inverse transform row vectors
   for (int i = 0; i < 4; ++i) {
-#if CONFIG_DAALA_TX4
-    tran_low_t temp_in[4];
-    for (int j = 0; j < 4; j++) temp_in[j] = input[j] * 2;
-    IHT_4[tx_type].rows(temp_in, out[i]);
-#else
     IHT_4[tx_type].rows(input, out[i]);
-#endif
     input += 4;
   }
 
@@ -271,11 +233,7 @@
     for (int j = 0; j < 4; ++j) {
       int d = i * stride + j;
       int s = j * outstride + i;
-#if CONFIG_DAALA_TX4
       dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
-#else
-      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
-#endif
     }
   }
 }
@@ -284,24 +242,6 @@
                          const TxfmParam *txfm_param) {
   const TX_TYPE tx_type = txfm_param->tx_type;
   static const transform_2d IHT_4x8[] = {
-#if CONFIG_DAALA_TX4 && CONFIG_DAALA_TX8
-    { daala_idct8, daala_idct4 },  // DCT_DCT  = 0
-    { daala_idst8, daala_idct4 },  // ADST_DCT = 1
-    { daala_idct8, daala_idst4 },  // DCT_ADST = 2
-    { daala_idst8, daala_idst4 },  // ADST_ADST = 3
-    { daala_idst8, daala_idct4 },  // FLIPADST_DCT
-    { daala_idct8, daala_idst4 },  // DCT_FLIPADST
-    { daala_idst8, daala_idst4 },  // FLIPADST_FLIPADST
-    { daala_idst8, daala_idst4 },  // ADST_FLIPADST
-    { daala_idst8, daala_idst4 },  // FLIPADST_ADST
-    { daala_idtx8, daala_idtx4 },  // IDTX
-    { daala_idct8, daala_idtx4 },  // V_DCT
-    { daala_idtx8, daala_idct4 },  // H_DCT
-    { daala_idst8, daala_idtx4 },  // V_ADST
-    { daala_idtx8, daala_idst4 },  // H_ADST
-    { daala_idst8, daala_idtx4 },  // V_FLIPADST
-    { daala_idtx8, daala_idst4 },  // H_FLIPADST
-#else
     { aom_idct8_c, aom_idct4_c },    // DCT_DCT
     { aom_iadst8_c, aom_idct4_c },   // ADST_DCT
     { aom_idct8_c, aom_iadst4_c },   // DCT_ADST
@@ -318,7 +258,6 @@
     { iidtx8_c, aom_iadst4_c },      // H_ADST
     { aom_iadst8_c, iidtx4_c },      // V_FLIPADST
     { iidtx8_c, aom_iadst4_c },      // H_FLIPADST
-#endif
   };
 
   const int n = 4;
@@ -336,23 +275,12 @@
 
   // inverse transform row vectors and transpose
   for (int i = 0; i < n2; ++i) {
-#if CONFIG_DAALA_TX4 && CONFIG_DAALA_TX8
-    // Daala row transform; Scaling cases 3 and 4 above
-    tran_low_t temp_in[4];
-    // Input scaling up by 1 bit
-    for (int j = 0; j < n; j++) temp_in[j] = input[j] * 2;
-    // Row transform; Daala does not scale
-    IHT_4x8[tx_type].rows(temp_in, outtmp);
-    // Transpose; no mid scaling
-    for (int j = 0; j < n; ++j) tmp[j][i] = outtmp[j];
-#else
     // AV1 row transform; Scaling case 1 only
     // Row transform (AV1 scales up .5 bits)
     IHT_4x8[tx_type].rows(input, outtmp);
     // Transpose and mid scaling up by .5 bit
     for (int j = 0; j < n; ++j)
       tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
-#endif
     input += n;
   }
 
@@ -369,13 +297,8 @@
     for (int j = 0; j < n; ++j) {
       int d = i * stride + j;
       int s = j * outstride + i;
-#if CONFIG_DAALA_TX4 && CONFIG_DAALA_TX8
-      // Output scaling cases 2, 4
-      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
-#else
       // Output scaling case 1 only
       dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
-#endif
     }
   }
 }
@@ -384,24 +307,6 @@
                          const TxfmParam *txfm_param) {
   const TX_TYPE tx_type = txfm_param->tx_type;
   static const transform_2d IHT_8x4[] = {
-#if CONFIG_DAALA_TX4 && CONFIG_DAALA_TX8
-    { daala_idct4, daala_idct8 },  // DCT_DCT  = 0
-    { daala_idst4, daala_idct8 },  // ADST_DCT = 1
-    { daala_idct4, daala_idst8 },  // DCT_ADST = 2
-    { daala_idst4, daala_idst8 },  // ADST_ADST = 3
-    { daala_idst4, daala_idct8 },  // FLIPADST_DCT
-    { daala_idct4, daala_idst8 },  // DCT_FLIPADST
-    { daala_idst4, daala_idst8 },  // FLIPADST_FLIPADST
-    { daala_idst4, daala_idst8 },  // ADST_FLIPADST
-    { daala_idst4, daala_idst8 },  // FLIPADST_ADST
-    { daala_idtx4, daala_idtx8 },  // IDTX
-    { daala_idct4, daala_idtx8 },  // V_DCT
-    { daala_idtx4, daala_idct8 },  // H_DCT
-    { daala_idst4, daala_idtx8 },  // V_ADST
-    { daala_idtx4, daala_idst8 },  // H_ADST
-    { daala_idst4, daala_idtx8 },  // V_FLIPADST
-    { daala_idtx4, daala_idst8 },  // H_FLIPADST
-#else
     { aom_idct4_c, aom_idct8_c },    // DCT_DCT
     { aom_iadst4_c, aom_idct8_c },   // ADST_DCT
     { aom_idct4_c, aom_iadst8_c },   // DCT_ADST
@@ -418,7 +323,6 @@
     { iidtx4_c, aom_iadst8_c },      // H_ADST
     { aom_iadst4_c, iidtx8_c },      // V_FLIPADST
     { iidtx4_c, aom_iadst8_c },      // H_FLIPADST
-#endif
   };
 
   const int n = 4;
@@ -436,23 +340,12 @@
 
   // inverse transform row vectors and transpose
   for (int i = 0; i < n; ++i) {
-#if CONFIG_DAALA_TX4 && CONFIG_DAALA_TX8
-    // Daala row transform; Scaling cases 3 and 4 above
-    tran_low_t temp_in[8];
-    // Input scaling up by 1 bit
-    for (int j = 0; j < n2; j++) temp_in[j] = input[j] * 2;
-    // Row transform; Daala does not scale
-    IHT_8x4[tx_type].rows(temp_in, outtmp);
-    // Transpose; no mid scaling
-    for (int j = 0; j < n2; ++j) tmp[j][i] = outtmp[j];
-#else
     // AV1 row transform; Scaling case 1 only
     // Row transform (AV1 scales up 1 bit)
     IHT_8x4[tx_type].rows(input, outtmp);
     // Transpose and mid scaling up by .5 bit
     for (int j = 0; j < n2; ++j)
       tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
-#endif
     input += n2;
   }
 
@@ -469,13 +362,8 @@
     for (int j = 0; j < n2; ++j) {
       int d = i * stride + j;
       int s = j * outstride + i;
-#if CONFIG_DAALA_TX4 && CONFIG_DAALA_TX8
-      // Output scaling cases 2, 4
-      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
-#else
       // Output scaling case 1
       dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
-#endif
     }
   }
 }
@@ -590,24 +478,6 @@
                            const TxfmParam *txfm_param) {
   const TX_TYPE tx_type = txfm_param->tx_type;
   static const transform_2d IHT_8x16[] = {
-#if CONFIG_DAALA_TX8 && CONFIG_DAALA_TX16
-    { daala_idct16, daala_idct8 },  // DCT_DCT  = 0
-    { daala_idst16, daala_idct8 },  // ADST_DCT = 1
-    { daala_idct16, daala_idst8 },  // DCT_ADST = 2
-    { daala_idst16, daala_idst8 },  // ADST_ADST = 3
-    { daala_idst16, daala_idct8 },  // FLIPADST_DCT
-    { daala_idct16, daala_idst8 },  // DCT_FLIPADST
-    { daala_idst16, daala_idst8 },  // FLIPADST_FLIPADST
-    { daala_idst16, daala_idst8 },  // ADST_FLIPADST
-    { daala_idst16, daala_idst8 },  // FLIPADST_ADST
-    { daala_idtx16, daala_idtx8 },  // IDTX
-    { daala_idct16, daala_idtx8 },  // V_DCT
-    { daala_idtx16, daala_idct8 },  // H_DCT
-    { daala_idst16, daala_idtx8 },  // V_ADST
-    { daala_idtx16, daala_idst8 },  // H_ADST
-    { daala_idst16, daala_idtx8 },  // V_FLIPADST
-    { daala_idtx16, daala_idst8 },  // H_FLIPADST
-#else
     { aom_idct16_c, aom_idct8_c },    // DCT_DCT
     { aom_iadst16_c, aom_idct8_c },   // ADST_DCT
     { aom_idct16_c, aom_iadst8_c },   // DCT_ADST
@@ -624,7 +494,6 @@
     { iidtx16_c, aom_iadst8_c },      // H_ADST
     { aom_iadst16_c, iidtx8_c },      // V_FLIPADST
     { iidtx16_c, aom_iadst8_c },      // H_FLIPADST
-#endif
   };
 
   const int n = 8;
@@ -642,22 +511,12 @@
 
   // inverse transform row vectors and transpose
   for (int i = 0; i < n2; ++i) {
-#if CONFIG_DAALA_TX8 && CONFIG_DAALA_TX16
-    tran_low_t temp_in[8];
-    // Input scaling case 4
-    for (int j = 0; j < n; j++) temp_in[j] = input[j] * 2;
-    // Row transform (Daala does not scale)
-    IHT_8x16[tx_type].rows(temp_in, outtmp);
-    // Transpose (no mid scaling)
-    for (int j = 0; j < n; ++j) tmp[j][i] = outtmp[j];
-#else
     // Case 1; no input scaling
     // Row transform (AV1 scales up 1 bit)
     IHT_8x16[tx_type].rows(input, outtmp);
     // Transpose and mid scaling up .5 bits
     for (int j = 0; j < n; ++j)
       tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
-#endif
     input += n;
   }
 
@@ -674,13 +533,8 @@
     for (int j = 0; j < n; ++j) {
       int d = i * stride + j;
       int s = j * outstride + i;
-#if CONFIG_DAALA_TX8 && CONFIG_DAALA_TX16
-      // Output scaling cases 2 and 4
-      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
-#else
       // Output scaling case 1
       dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
-#endif
     }
   }
 }
@@ -689,24 +543,6 @@
                            const TxfmParam *txfm_param) {
   const TX_TYPE tx_type = txfm_param->tx_type;
   static const transform_2d IHT_16x8[] = {
-#if CONFIG_DAALA_TX8 && CONFIG_DAALA_TX16
-    { daala_idct8, daala_idct16 },  // DCT_DCT  = 0
-    { daala_idst8, daala_idct16 },  // ADST_DCT = 1
-    { daala_idct8, daala_idst16 },  // DCT_ADST = 2
-    { daala_idst8, daala_idst16 },  // ADST_ADST = 3
-    { daala_idst8, daala_idct16 },  // FLIPADST_DCT
-    { daala_idct8, daala_idst16 },  // DCT_FLIPADST
-    { daala_idst8, daala_idst16 },  // FLIPADST_FLIPADST
-    { daala_idst8, daala_idst16 },  // ADST_FLIPADST
-    { daala_idst8, daala_idst16 },  // FLIPADST_ADST
-    { daala_idtx8, daala_idtx16 },  // IDTX
-    { daala_idct8, daala_idtx16 },  // V_DCT
-    { daala_idtx8, daala_idct16 },  // H_DCT
-    { daala_idst8, daala_idtx16 },  // V_ADST
-    { daala_idtx8, daala_idst16 },  // H_ADST
-    { daala_idst8, daala_idtx16 },  // V_FLIPADST
-    { daala_idtx8, daala_idst16 },  // H_FLIPADST
-#else
     { aom_idct8_c, aom_idct16_c },    // DCT_DCT
     { aom_iadst8_c, aom_idct16_c },   // ADST_DCT
     { aom_idct8_c, aom_iadst16_c },   // DCT_ADST
@@ -723,7 +559,6 @@
     { iidtx8_c, aom_iadst16_c },      // H_ADST
     { aom_iadst8_c, iidtx16_c },      // V_FLIPADST
     { iidtx8_c, aom_iadst16_c },      // H_FLIPADST
-#endif
   };
 
   const int n = 8;
@@ -741,16 +576,6 @@
 
   // inverse transform row vectors and transpose
   for (int i = 0; i < n; ++i) {
-#if CONFIG_DAALA_TX8 && CONFIG_DAALA_TX16
-    tran_low_t temp_in[16];
-    // Input scaling cases 3 and 4
-    for (int j = 0; j < n2; j++) temp_in[j] = input[j] * 2;
-    // Daala row TX, no scaling
-    IHT_16x8[tx_type].rows(temp_in, outtmp);
-    // Transpose and mid scaling
-    // Case 4
-    for (int j = 0; j < n2; ++j) tmp[j][i] = outtmp[j];
-#else
     // Case 1
     // No input scaling
     // Row transform, AV1 scales up by 1.5 bits
@@ -758,7 +583,6 @@
     // Transpose and mid scaling up .5 bits
     for (int j = 0; j < n2; ++j)
       tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
-#endif
     input += n2;
   }
 
@@ -775,14 +599,9 @@
     for (int j = 0; j < n2; ++j) {
       int d = i * stride + j;
       int s = j * outstride + i;
-// Output scaling
-#if CONFIG_DAALA_TX8 && CONFIG_DAALA_TX16
-      // case 4
-      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
-#else
+      // Output scaling
       // case 1
       dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
-#endif
     }
   }
 }
@@ -897,24 +716,6 @@
                             const TxfmParam *txfm_param) {
   const TX_TYPE tx_type = txfm_param->tx_type;
   static const transform_2d IHT_16x32[] = {
-#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
-    { daala_idct32, daala_idct16 },  // DCT_DCT  = 0
-    { daala_idst32, daala_idct16 },  // ADST_DCT = 1
-    { daala_idct32, daala_idst16 },  // DCT_ADST = 2
-    { daala_idst32, daala_idst16 },  // ADST_ADST = 3
-    { daala_idst32, daala_idct16 },  // FLIPADST_DCT
-    { daala_idct32, daala_idst16 },  // DCT_FLIPADST
-    { daala_idst32, daala_idst16 },  // FLIPADST_FLIPADST
-    { daala_idst32, daala_idst16 },  // ADST_FLIPADST
-    { daala_idst32, daala_idst16 },  // FLIPADST_ADST
-    { daala_idtx32, daala_idtx16 },  // IDTX
-    { daala_idct32, daala_idtx16 },  // V_DCT
-    { daala_idtx32, daala_idct16 },  // H_DCT
-    { daala_idst32, daala_idtx16 },  // V_ADST
-    { daala_idtx32, daala_idst16 },  // H_ADST
-    { daala_idst32, daala_idtx16 },  // V_FLIPADST
-    { daala_idtx32, daala_idst16 },  // H_FLIPADST
-#else
     { aom_idct32_c, aom_idct16_c },     // DCT_DCT
     { ihalfright32_c, aom_idct16_c },   // ADST_DCT
     { aom_idct32_c, aom_iadst16_c },    // DCT_ADST
@@ -931,7 +732,6 @@
     { iidtx32_c, aom_iadst16_c },       // H_ADST
     { ihalfright32_c, iidtx16_c },      // V_FLIPADST
     { iidtx32_c, aom_iadst16_c },       // H_FLIPADST
-#endif
   };
 
   const int n = 16;
@@ -943,16 +743,9 @@
 
   // inverse transform row vectors and transpose
   for (int i = 0; i < n2; ++i) {
-#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
-    tran_low_t temp_in[16];
-    for (int j = 0; j < n; j++) temp_in[j] = input[j] * 4;
-    IHT_16x32[tx_type].rows(temp_in, outtmp);
-    for (int j = 0; j < n; ++j) tmp[j][i] = outtmp[j];
-#else
     IHT_16x32[tx_type].rows(input, outtmp);
     for (int j = 0; j < n; ++j)
       tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
-#endif
     input += n;
   }
 
@@ -966,11 +759,7 @@
     for (int j = 0; j < n; ++j) {
       int d = i * stride + j;
       int s = j * outstride + i;
-#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
-      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
-#else
       dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
-#endif
     }
   }
 }
@@ -979,24 +768,6 @@
                             const TxfmParam *txfm_param) {
   const TX_TYPE tx_type = txfm_param->tx_type;
   static const transform_2d IHT_32x16[] = {
-#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
-    { daala_idct16, daala_idct32 },  // DCT_DCT  = 0
-    { daala_idst16, daala_idct32 },  // ADST_DCT = 1
-    { daala_idct16, daala_idst32 },  // DCT_ADST = 2
-    { daala_idst16, daala_idst32 },  // ADST_ADST = 3
-    { daala_idst16, daala_idct32 },  // FLIPADST_DCT
-    { daala_idct16, daala_idst32 },  // DCT_FLIPADST
-    { daala_idst16, daala_idst32 },  // FLIPADST_FLIPADST
-    { daala_idst16, daala_idst32 },  // ADST_FLIPADST
-    { daala_idst16, daala_idst32 },  // FLIPADST_ADST
-    { daala_idtx16, daala_idtx32 },  // IDTX
-    { daala_idct16, daala_idtx32 },  // V_DCT
-    { daala_idtx16, daala_idct32 },  // H_DCT
-    { daala_idst16, daala_idtx32 },  // V_ADST
-    { daala_idtx16, daala_idst32 },  // H_ADST
-    { daala_idst16, daala_idtx32 },  // V_FLIPADST
-    { daala_idtx16, daala_idst32 },  // H_FLIPADST
-#else
     { aom_idct16_c, aom_idct32_c },     // DCT_DCT
     { aom_iadst16_c, aom_idct32_c },    // ADST_DCT
     { aom_idct16_c, ihalfright32_c },   // DCT_ADST
@@ -1013,7 +784,6 @@
     { iidtx16_c, ihalfright32_c },      // H_ADST
     { aom_iadst16_c, iidtx32_c },       // V_FLIPADST
     { iidtx16_c, ihalfright32_c },      // H_FLIPADST
-#endif
   };
   const int n = 16;
   const int n2 = 32;
@@ -1024,16 +794,9 @@
 
   // inverse transform row vectors and transpose
   for (int i = 0; i < n; ++i) {
-#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
-    tran_low_t temp_in[32];
-    for (int j = 0; j < n2; j++) temp_in[j] = input[j] * 4;
-    IHT_32x16[tx_type].rows(temp_in, outtmp);
-    for (int j = 0; j < n2; ++j) tmp[j][i] = outtmp[j];
-#else
     IHT_32x16[tx_type].rows(input, outtmp);
     for (int j = 0; j < n2; ++j)
       tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
-#endif
     input += n2;
   }
 
@@ -1047,11 +810,7 @@
     for (int j = 0; j < n2; ++j) {
       int d = i * stride + j;
       int s = j * outstride + i;
-#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
-      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
-#else
       dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
-#endif
     }
   }
 }
@@ -1060,24 +819,6 @@
                          const TxfmParam *txfm_param) {
   const TX_TYPE tx_type = txfm_param->tx_type;
   static const transform_2d IHT_8[] = {
-#if CONFIG_DAALA_TX8
-    { daala_idct8, daala_idct8 },  // DCT_DCT  = 0
-    { daala_idst8, daala_idct8 },  // ADST_DCT = 1
-    { daala_idct8, daala_idst8 },  // DCT_ADST = 2
-    { daala_idst8, daala_idst8 },  // ADST_ADST = 3
-    { daala_idst8, daala_idct8 },  // FLIPADST_DCT
-    { daala_idct8, daala_idst8 },  // DCT_FLIPADST
-    { daala_idst8, daala_idst8 },  // FLIPADST_FLIPADST
-    { daala_idst8, daala_idst8 },  // ADST_FLIPADST
-    { daala_idst8, daala_idst8 },  // FLIPADST_ADST
-    { daala_idtx8, daala_idtx8 },  // IDTX
-    { daala_idct8, daala_idtx8 },  // V_DCT
-    { daala_idtx8, daala_idct8 },  // H_DCT
-    { daala_idst8, daala_idtx8 },  // V_ADST
-    { daala_idtx8, daala_idst8 },  // H_ADST
-    { daala_idst8, daala_idtx8 },  // V_FLIPADST
-    { daala_idtx8, daala_idst8 },  // H_FLIPADST
-#else
     { aom_idct8_c, aom_idct8_c },    // DCT_DCT  = 0
     { aom_iadst8_c, aom_idct8_c },   // ADST_DCT = 1
     { aom_idct8_c, aom_iadst8_c },   // DCT_ADST = 2
@@ -1094,7 +835,6 @@
     { iidtx8_c, aom_iadst8_c },      // H_ADST
     { aom_iadst8_c, iidtx8_c },      // V_FLIPADST
     { iidtx8_c, aom_iadst8_c },      // H_FLIPADST
-#endif
   };
 
   tran_low_t tmp[8][8];
@@ -1104,13 +844,7 @@
 
   // inverse transform row vectors
   for (int i = 0; i < 8; ++i) {
-#if CONFIG_DAALA_TX8
-    tran_low_t temp_in[8];
-    for (int j = 0; j < 8; j++) temp_in[j] = input[j] * 2;
-    IHT_8[tx_type].rows(temp_in, out[i]);
-#else
     IHT_8[tx_type].rows(input, out[i]);
-#endif
     input += 8;
   }
 
@@ -1133,11 +867,7 @@
     for (int j = 0; j < 8; ++j) {
       int d = i * stride + j;
       int s = j * outstride + i;
-#if CONFIG_DAALA_TX8
-      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
-#else
       dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
-#endif
     }
   }
 }
@@ -1146,24 +876,6 @@
                             const TxfmParam *txfm_param) {
   const TX_TYPE tx_type = txfm_param->tx_type;
   static const transform_2d IHT_16[] = {
-#if CONFIG_DAALA_TX16
-    { daala_idct16, daala_idct16 },  // DCT_DCT  = 0
-    { daala_idst16, daala_idct16 },  // ADST_DCT = 1
-    { daala_idct16, daala_idst16 },  // DCT_ADST = 2
-    { daala_idst16, daala_idst16 },  // ADST_ADST = 3
-    { daala_idst16, daala_idct16 },  // FLIPADST_DCT
-    { daala_idct16, daala_idst16 },  // DCT_FLIPADST
-    { daala_idst16, daala_idst16 },  // FLIPADST_FLIPADST
-    { daala_idst16, daala_idst16 },  // ADST_FLIPADST
-    { daala_idst16, daala_idst16 },  // FLIPADST_ADST
-    { daala_idtx16, daala_idtx16 },  // IDTX
-    { daala_idct16, daala_idtx16 },  // V_DCT
-    { daala_idtx16, daala_idct16 },  // H_DCT
-    { daala_idst16, daala_idtx16 },  // V_ADST
-    { daala_idtx16, daala_idst16 },  // H_ADST
-    { daala_idst16, daala_idtx16 },  // V_FLIPADST
-    { daala_idtx16, daala_idst16 },  // H_FLIPADST
-#else
     { aom_idct16_c, aom_idct16_c },    // DCT_DCT  = 0
     { aom_iadst16_c, aom_idct16_c },   // ADST_DCT = 1
     { aom_idct16_c, aom_iadst16_c },   // DCT_ADST = 2
@@ -1180,7 +892,6 @@
     { iidtx16_c, aom_iadst16_c },      // H_ADST
     { aom_iadst16_c, iidtx16_c },      // V_FLIPADST
     { iidtx16_c, aom_iadst16_c },      // H_FLIPADST
-#endif
   };
 
   tran_low_t tmp[16][16];
@@ -1190,13 +901,7 @@
 
   // inverse transform row vectors
   for (int i = 0; i < 16; ++i) {
-#if CONFIG_DAALA_TX16
-    tran_low_t temp_in[16];
-    for (int j = 0; j < 16; j++) temp_in[j] = input[j] * 2;
-    IHT_16[tx_type].rows(temp_in, out[i]);
-#else
     IHT_16[tx_type].rows(input, out[i]);
-#endif
     input += 16;
   }
 
@@ -1217,11 +922,7 @@
     for (int j = 0; j < 16; ++j) {
       int d = i * stride + j;
       int s = j * outstride + i;
-#if CONFIG_DAALA_TX16
-      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
-#else
       dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
-#endif
     }
   }
 }
@@ -1230,24 +931,6 @@
                              const TxfmParam *txfm_param) {
   const TX_TYPE tx_type = txfm_param->tx_type;
   static const transform_2d IHT_32[] = {
-#if CONFIG_DAALA_TX32
-    { daala_idct32, daala_idct32 },  // DCT_DCT
-    { daala_idst32, daala_idct32 },  // ADST_DCT
-    { daala_idct32, daala_idst32 },  // DCT_ADST
-    { daala_idst32, daala_idst32 },  // ADST_ADST
-    { daala_idst32, daala_idct32 },  // FLIPADST_DCT
-    { daala_idct32, daala_idst32 },  // DCT_FLIPADST
-    { daala_idst32, daala_idst32 },  // FLIPADST_FLIPADST
-    { daala_idst32, daala_idst32 },  // ADST_FLIPADST
-    { daala_idst32, daala_idst32 },  // FLIPADST_ADST
-    { daala_idtx32, daala_idtx32 },  // IDTX
-    { daala_idct32, daala_idtx32 },  // V_DCT
-    { daala_idtx32, daala_idct32 },  // H_DCT
-    { daala_idst32, daala_idtx32 },  // V_ADST
-    { daala_idtx32, daala_idst32 },  // H_ADST
-    { daala_idst32, daala_idtx32 },  // V_FLIPADST
-    { daala_idtx32, daala_idst32 },  // H_FLIPADST
-#else
     { aom_idct32_c, aom_idct32_c },      // DCT_DCT
     { ihalfright32_c, aom_idct32_c },    // ADST_DCT
     { aom_idct32_c, ihalfright32_c },    // DCT_ADST
@@ -1264,7 +947,6 @@
     { iidtx32_c, ihalfright32_c },       // H_ADST
     { ihalfright32_c, iidtx32_c },       // V_FLIPADST
     { iidtx32_c, ihalfright32_c },       // H_FLIPADST
-#endif
   };
 
   tran_low_t tmp[32][32];
@@ -1274,13 +956,7 @@
 
   // inverse transform row vectors
   for (int i = 0; i < 32; ++i) {
-#if CONFIG_DAALA_TX32
-    tran_low_t temp_in[32];
-    for (int j = 0; j < 32; j++) temp_in[j] = input[j] * 4;
-    IHT_32[tx_type].rows(temp_in, out[i]);
-#else
     IHT_32[tx_type].rows(input, out[i]);
-#endif
     input += 32;
   }
 
@@ -1298,11 +974,7 @@
     for (int j = 0; j < 32; ++j) {
       int d = i * stride + j;
       int s = j * outstride + i;
-#if CONFIG_DAALA_TX32
-      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
-#else
       dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
-#endif
     }
   }
 }
@@ -1312,24 +984,6 @@
                              const TxfmParam *txfm_param) {
   const TX_TYPE tx_type = txfm_param->tx_type;
   static const transform_2d IHT_64[] = {
-#if CONFIG_DAALA_TX64
-    { daala_idct64, daala_idct64 },  // DCT_DCT
-    { daala_idst64, daala_idct64 },  // ADST_DCT
-    { daala_idct64, daala_idst64 },  // DCT_ADST
-    { daala_idst64, daala_idst64 },  // ADST_ADST
-    { daala_idst64, daala_idct64 },  // FLIPADST_DCT
-    { daala_idct64, daala_idst64 },  // DCT_FLIPADST
-    { daala_idst64, daala_idst64 },  // FLIPADST_FLIPADST
-    { daala_idst64, daala_idst64 },  // ADST_FLIPADST
-    { daala_idst64, daala_idst64 },  // FLIPADST_ADST
-    { daala_idtx64, daala_idtx64 },  // IDTX
-    { daala_idct64, daala_idtx64 },  // V_DCT
-    { daala_idtx64, daala_idct64 },  // H_DCT
-    { daala_idst64, daala_idtx64 },  // V_ADST
-    { daala_idtx64, daala_idst64 },  // H_ADST
-    { daala_idst64, daala_idtx64 },  // V_FLIPADST
-    { daala_idtx64, daala_idst64 },  // H_FLIPADST
-#else
     { idct64_col_c, idct64_row_c },      // DCT_DCT
     { ihalfright64_c, idct64_row_c },    // ADST_DCT
     { idct64_col_c, ihalfright64_c },    // DCT_ADST
@@ -1346,7 +1000,6 @@
     { iidtx64_c, ihalfright64_c },       // H_ADST
     { ihalfright64_c, iidtx64_c },       // V_FLIPADST
     { iidtx64_c, ihalfright64_c },       // H_FLIPADST
-#endif
   };
 
   // TODO(urvang): Can the same array be reused, instead of using a new array?
@@ -1368,14 +1021,8 @@
 
   // inverse transform row vectors
   for (int i = 0; i < 64; ++i) {
-#if CONFIG_DAALA_TX64
-    tran_low_t temp_in[64];
-    for (int j = 0; j < 64; j++) temp_in[j] = mod_input_ptr[j] * 8;
-    IHT_64[tx_type].rows(temp_in, out[i]);
-#else
     IHT_64[tx_type].rows(mod_input_ptr, out[i]);
     for (int j = 0; j < 64; ++j) out[i][j] = ROUND_POWER_OF_TWO(out[i][j], 1);
-#endif
     mod_input_ptr += 64;
   }
 
@@ -1396,11 +1043,7 @@
     for (int j = 0; j < 64; ++j) {
       int d = i * stride + j;
       int s = j * outstride + i;
-#if CONFIG_DAALA_TX64
-      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
-#else
       dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
-#endif
     }
   }
 }
@@ -1409,24 +1052,6 @@
                              const TxfmParam *txfm_param) {
   const TX_TYPE tx_type = txfm_param->tx_type;
   static const transform_2d IHT_64x32[] = {
-#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
-    { daala_idct32, daala_idct64 },  // DCT_DCT
-    { daala_idst32, daala_idct64 },  // ADST_DCT
-    { daala_idct32, daala_idst64 },  // DCT_ADST
-    { daala_idst32, daala_idst64 },  // ADST_ADST
-    { daala_idst32, daala_idct64 },  // FLIPADST_DCT
-    { daala_idct32, daala_idst64 },  // DCT_FLIPADST
-    { daala_idst32, daala_idst64 },  // FLIPADST_FLIPADST
-    { daala_idst32, daala_idst64 },  // ADST_FLIPADST
-    { daala_idst32, daala_idst64 },  // FLIPADST_ADST
-    { daala_idtx32, daala_idtx64 },  // IDTX
-    { daala_idct32, daala_idtx64 },  // V_DCT
-    { daala_idtx32, daala_idct64 },  // H_DCT
-    { daala_idst32, daala_idtx64 },  // V_ADST
-    { daala_idtx32, daala_idst64 },  // H_ADST
-    { daala_idst32, daala_idtx64 },  // V_FLIPADST
-    { daala_idtx32, daala_idst64 },  // H_FLIPADST
-#else
     { aom_idct32_c, idct64_row_c },      // DCT_DCT
     { ihalfright32_c, idct64_row_c },    // ADST_DCT
     { aom_idct32_c, ihalfright64_c },    // DCT_ADST
@@ -1443,7 +1068,6 @@
     { iidtx32_c, ihalfright64_c },       // H_ADST
     { ihalfright32_c, iidtx64_c },       // V_FLIPADST
     { iidtx32_c, ihalfright64_c },       // H_FLIPADST
-#endif
   };
 
   // Remap 32x32 input into a modified 64x32 input by:
@@ -1465,16 +1089,9 @@
 
   // inverse transform row vectors and transpose
   for (int i = 0; i < n; ++i) {
-#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
-    tran_low_t temp_in[64];
-    for (int j = 0; j < n2; j++) temp_in[j] = mod_input_ptr[j] * 8;
-    IHT_64x32[tx_type].rows(temp_in, outtmp);
-    for (int j = 0; j < n2; ++j) tmp[j][i] = outtmp[j];
-#else
     IHT_64x32[tx_type].rows(mod_input_ptr, outtmp);
     for (int j = 0; j < n2; ++j)
       tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * InvSqrt2);
-#endif
     mod_input_ptr += n2;
   }
 
@@ -1488,11 +1105,7 @@
     for (int j = 0; j < n2; ++j) {
       int d = i * stride + j;
       int s = j * outstride + i;
-#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
-      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
-#else
       dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
-#endif
     }
   }
 }
@@ -1501,24 +1114,6 @@
                              const TxfmParam *txfm_param) {
   const TX_TYPE tx_type = txfm_param->tx_type;
   static const transform_2d IHT_32x64[] = {
-#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
-    { daala_idct64, daala_idct32 },  // DCT_DCT
-    { daala_idst64, daala_idct32 },  // ADST_DCT
-    { daala_idct64, daala_idst32 },  // DCT_ADST
-    { daala_idst64, daala_idst32 },  // ADST_ADST
-    { daala_idst64, daala_idct32 },  // FLIPADST_DCT
-    { daala_idct64, daala_idst32 },  // DCT_FLIPADST
-    { daala_idst64, daala_idst32 },  // FLIPADST_FLIPADST
-    { daala_idst64, daala_idst32 },  // ADST_FLIPADST
-    { daala_idst64, daala_idst32 },  // FLIPADST_ADST
-    { daala_idtx64, daala_idtx32 },  // IDTX
-    { daala_idct64, daala_idtx32 },  // V_DCT
-    { daala_idtx64, daala_idct32 },  // H_DCT
-    { daala_idst64, daala_idtx32 },  // V_ADST
-    { daala_idtx64, daala_idst32 },  // H_ADST
-    { daala_idst64, daala_idtx32 },  // V_FLIPADST
-    { daala_idtx64, daala_idst32 },  // H_FLIPADST
-#else
     { idct64_col_c, aom_idct32_c },      // DCT_DCT
     { ihalfright64_c, aom_idct32_c },    // ADST_DCT
     { idct64_col_c, ihalfright32_c },    // DCT_ADST
@@ -1535,7 +1130,6 @@
     { iidtx64_c, ihalfright32_c },       // H_ADST
     { ihalfright64_c, iidtx32_c },       // V_FLIPADST
     { iidtx64_c, ihalfright32_c },       // H_FLIPADST
-#endif
   };
 
   // Remap 32x32 input into a modified 32x64 input by:
@@ -1555,16 +1149,9 @@
 
   // inverse transform row vectors and transpose
   for (int i = 0; i < n2; ++i) {
-#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
-    tran_low_t temp_in[32];
-    for (int j = 0; j < n; j++) temp_in[j] = mod_input_ptr[j] * 8;
-    IHT_32x64[tx_type].rows(temp_in, outtmp);
-    for (int j = 0; j < n; ++j) tmp[j][i] = outtmp[j];
-#else
     IHT_32x64[tx_type].rows(mod_input_ptr, outtmp);
     for (int j = 0; j < n; ++j)
       tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * InvSqrt2);
-#endif
     mod_input_ptr += n;
   }
 
@@ -1578,11 +1165,7 @@
     for (int j = 0; j < n; ++j) {
       int d = i * stride + j;
       int s = j * outstride + i;
-#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
-      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
-#else
       dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
-#endif
     }
   }
 }
@@ -1722,7 +1305,6 @@
   else
     aom_idct4x4_1_add(input, dest, stride);
 }
-#endif  // !CONFIG_DAALA_TX
 
 void av1_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
                      const TxfmParam *txfm_param) {
@@ -1742,7 +1324,6 @@
     aom_highbd_iwht4x4_1_add(input, dest, stride, bd);
 }
 
-#if !CONFIG_DAALA_TX
 static const int32_t *cast_to_int32(const tran_low_t *input) {
   assert(sizeof(int32_t) == sizeof(tran_low_t));
   return (const int32_t *)input;
@@ -2045,7 +1626,6 @@
   }
 }
 #endif  // CONFIG_TX64X64
-#endif  // !CONFIG_DAALA_TX
 
 static void init_txfm_param(const MACROBLOCKD *xd, int plane, TX_SIZE tx_size,
                             TX_TYPE tx_type, int eob, int reduced_tx_set,
@@ -2067,9 +1647,6 @@
 static void av1_highbd_inv_txfm_add(const tran_low_t *input, uint8_t *dest,
                                     int stride, TxfmParam *txfm_param) {
   assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
-#if CONFIG_DAALA_TX
-  daala_inv_txfm_add(input, dest, stride, txfm_param);
-#else
   const TX_SIZE tx_size = txfm_param->tx_size;
   switch (tx_size) {
     case TX_32X32:
@@ -2136,7 +1713,6 @@
       break;
     default: assert(0 && "Invalid transform size"); break;
   }
-#endif  // CONFIG_DAALA_TX
 }
 
 static void av1_inv_txfm_add(const tran_low_t *dqcoeff, uint8_t *dst,
diff --git a/av1/common/idct.h b/av1/common/idct.h
index 6717e22..79c948a 100644
--- a/av1/common/idct.h
+++ b/av1/common/idct.h
@@ -29,10 +29,8 @@
   transform_1d cols, rows;  // vertical and horizontal
 } transform_2d;
 
-#if !CONFIG_DAALA_TX
 #define MAX_TX_SCALE 1
 int av1_get_tx_scale(const TX_SIZE tx_size);
-#endif
 
 void av1_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
                      const TxfmParam *txfm_param);
diff --git a/av1/common/quant_common.c b/av1/common/quant_common.c
index bb9997a..b5d4a87 100644
--- a/av1/common/quant_common.c
+++ b/av1/common/quant_common.c
@@ -410,8 +410,6 @@
   28143, 28687, 29247,
 };
 
-#if !CONFIG_DAALA_TX
-
 // Coefficient scaling and quantization with AV1 TX are tailored to
 // the AV1 TX transforms.  Regardless of the bit-depth of the input,
 // the transform stages scale the coefficient values up by a factor of
@@ -497,83 +495,6 @@
   return QINDEX_RANGE - 1;
 }
 
-#else   // CONFIG_DAALA_TX
-
-// Daala TX uses a constant effective coefficient depth
-// (TX_COEFF_DEPTH) regardless of input pixel bitdepth or transform
-// size. This means that coefficient scale and range are identical
-// regardless of the bit depth of the pixel input.  However, the
-// existing encoder heuristics and RDO loop were built expecting a
-// quantizer that scales with bitdepth, treating it more as a
-// proto-lambda than a quantizer.  The assumption that quantizer scale
-// increases with bitdepth is spread throughout the encoder.
-
-// For this reason, we need to be able to find an old-style 'Q3'
-// quantizer that scales with pixel depth (to be used in encoder
-// decision making) as well as the literal quantizer that is used in
-// actual quantization/dequantization.  That is centralized here.
-
-// Right now, the existing quantization code and setup are not
-// particularly well suited to Daala TX.  The scale range used by, e.g.,
-// the 12-bit lookups is intentionally larger in order to provide more
-// fine control at the top end of the quality range, as 12-bit input
-// would be assumed to offer a lower noise floor than an 8-bit input.
-// However, the 12-bit lookups assume an effective 15-bit TX depth,
-// while we intend to run Daala TX somewhere between 12 and 14.  We
-// can't simply scale it down, because this would violate the minimum
-// allowable quantizer in the current code (4).
-
-// As such, we do the simplest thing for the time being: Always use
-// the 8-bit scale range for all inputs and scale the QTX and Q3
-// returns accordingly, which will always be no-ops or upshifts.  This
-// might well work well enough; if not, we'll need to patch quantizer
-// scaling to extend the high-bitdepth quality range upward at some
-// later date.
-
-int16_t av1_dc_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth) {
-  assert(bit_depth >= 8);
-  return qindex == 0 ? dc_qlookup_Q3[0]
-                     :  // Do not scale lossless
-             dc_qlookup_Q3[clamp(qindex + delta, 0, MAXQ)] *
-                 (1 << (bit_depth - 8));
-}
-
-int16_t av1_ac_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth) {
-  assert(bit_depth >= 8);
-  return qindex == 0 ? ac_qlookup_Q3[0]
-                     :  // Do not scale lossless
-             ac_qlookup_Q3[clamp(qindex + delta, 0, MAXQ)] *
-                 (1 << (bit_depth - 8));
-}
-
-int16_t av1_dc_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) {
-  (void)bit_depth;
-  return qindex == 0 ? dc_qlookup_Q3[0]
-                     :  // Do not scale lossless
-             dc_qlookup_Q3[clamp(qindex + delta, 0, MAXQ)] *
-                 (1 << (TX_COEFF_DEPTH - 11));
-}
-
-int16_t av1_ac_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) {
-  (void)bit_depth;
-  return qindex == 0 ? ac_qlookup_Q3[0]
-                     :  // Do not scale lossless
-             ac_qlookup_Q3[clamp(qindex + delta, 0, MAXQ)] *
-                 (1 << (TX_COEFF_DEPTH - 11));
-}
-
-int16_t av1_qindex_from_ac_Q3(int ac_QTX, aom_bit_depth_t bit_depth) {
-  int i;
-  const int16_t *tab = ac_qlookup_Q3;
-  int scale = (1 << (TX_COEFF_DEPTH - 11));
-  (void)bit_depth;
-  for (i = 0; i < QINDEX_RANGE; i++) {
-    if (ac_QTX <= tab[i] * scale) return i;
-  }
-  return QINDEX_RANGE - 1;
-}
-#endif  // !CONFIG_DAALA_TX
-
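/* A minimal worked sketch of the Q3/QTX split described in the removed
   comment above.  Illustrative only: the helper name, the lookup value 40,
   the 10-bit input and the TX_COEFF_DEPTH stand-in of 14 are all assumptions,
   not values taken from the tables. */
static int16_t example_q3_vs_qtx_scale(void) {
  const int16_t ac_lookup = 40;  /* hypothetical ac_qlookup_Q3[] entry */
  const int bit_depth = 10;      /* AOM_BITS_10 */
  const int tx_coeff_depth = 14; /* stand-in for TX_COEFF_DEPTH */
  /* Q3 tracks the pixel bit depth, for the encoder's RDO heuristics. */
  const int16_t q3 = ac_lookup * (1 << (bit_depth - 8)); /* 40 << 2 = 160 */
  /* QTX is fixed by the constant coefficient depth used for dequantization. */
  const int16_t qtx =
      ac_lookup * (1 << (tx_coeff_depth - 11)); /* 40 << 3 = 320 */
  return (int16_t)(qtx - q3);
}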
 int av1_get_qindex(const struct segmentation *seg, int segment_id,
                    int base_qindex) {
   if (segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) {
diff --git a/av1/common/x86/daala_inv_txfm_avx2.c b/av1/common/x86/daala_inv_txfm_avx2.c
deleted file mode 100644
index f060bfe..0000000
--- a/av1/common/x86/daala_inv_txfm_avx2.c
+++ /dev/null
@@ -1,1607 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <tmmintrin.h>
-#include <immintrin.h>
-#include "./av1_rtcd.h"
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
-#include "av1/common/daala_tx.h"
-#include "av1/common/daala_inv_txfm.h"
-#include "av1/common/idct.h"
-
-#if CONFIG_DAALA_TX
-
-static INLINE __m128i od_unbiased_rshift1_epi16(__m128i a) {
-  return _mm_srai_epi16(_mm_add_epi16(_mm_srli_epi16(a, 15), a), 1);
-}
-
-static INLINE __m256i od_mm256_unbiased_rshift1_epi16(__m256i a) {
-  return _mm256_srai_epi16(_mm256_add_epi16(_mm256_srli_epi16(a, 15), a), 1);
-}
-
-static INLINE __m256i od_mm256_unbiased_rshift1_epi32(__m256i a) {
-  return _mm256_srai_epi32(_mm256_add_epi32(_mm256_srli_epi32(a, 31), a), 1);
-}
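/* A scalar sketch of the "unbiased" halving above; illustrative only (the
   helper is hypothetical) and assumes two's-complement arithmetic shifts.
   Adding the sign bit before the arithmetic shift turns floor division by
   two into truncation toward zero: plain -3 >> 1 is -2, but (-3 + 1) >> 1
   is -1. */
static INLINE int16_t od_unbiased_rshift1_scalar_sketch(int16_t a) {
  return (int16_t)((a + ((uint16_t)a >> 15)) >> 1);
}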
-
-static INLINE __m128i od_avg_epi16(__m128i a, __m128i b) {
-  __m128i sign_bit;
-  /*x86 only provides an unsigned PAVGW with a bias (ARM is better here).
-    We emulate a signed one by adding an offset to convert to unsigned and
-    back. We use XOR instead of addition/subtraction because it dispatches
-    better on older processors.*/
-  sign_bit = _mm_set1_epi16(0x8000);
-  return _mm_xor_si128(
-      _mm_avg_epu16(_mm_xor_si128(a, sign_bit), _mm_xor_si128(b, sign_bit)),
-      sign_bit);
-}
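/* A scalar sketch of the bias trick above; illustrative only (the helper is
   hypothetical).  XORing with 0x8000 maps int16 to uint16 while preserving
   order, so the unsigned rounded average PAVGW computes matches the signed
   rounded average once the bias is removed.  E.g. a = -2, b = 3:
   (a ^ 0x8000) = 32766, (b ^ 0x8000) = 32771, their unsigned rounded average
   is 32769, and 32769 ^ 0x8000 = 1 = (-2 + 3 + 1) >> 1. */
static INLINE int16_t od_avg_epi16_scalar_sketch(int16_t a, int16_t b) {
  const uint16_t ua = (uint16_t)a ^ 0x8000;
  const uint16_t ub = (uint16_t)b ^ 0x8000;
  const uint16_t avg = (uint16_t)((ua + ub + 1) >> 1); /* what PAVGW computes */
  return (int16_t)(avg ^ 0x8000);
}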
-
-static INLINE __m256i od_mm256_avg_epi16(__m256i a, __m256i b) {
-  __m256i sign_bit;
-  sign_bit = _mm256_set1_epi16(0x8000);
-  return _mm256_xor_si256(_mm256_avg_epu16(_mm256_xor_si256(a, sign_bit),
-                                           _mm256_xor_si256(b, sign_bit)),
-                          sign_bit);
-}
-
-static INLINE __m256i od_mm256_avg_epi32(__m256i a, __m256i b) {
-  __m256i neg1;
-  /* It's cheaper to generate -1's than 1's. */
-  neg1 = _mm256_set1_epi64x(-1);
-  /* There is no corresponding PAVGD, but we are not in danger of overflowing
-     a 32-bit register. */
-  return _mm256_srai_epi32(_mm256_add_epi32(a, _mm256_sub_epi32(b, neg1)), 1);
-}
-
-/*Like the above, but does (a - b + 1) >> 1 instead.*/
-static INLINE __m128i od_hrsub_epi16(__m128i a, __m128i b) {
-  __m128i sign_bit;
-  sign_bit = _mm_set1_epi16(0x8000);
-  return _mm_xor_si128(
-      _mm_avg_epu16(_mm_xor_si128(a, sign_bit), _mm_sub_epi16(sign_bit, b)),
-      sign_bit);
-}
-
-static INLINE __m256i od_mm256_hrsub_epi16(__m256i a, __m256i b) {
-  __m256i sign_bit;
-  sign_bit = _mm256_set1_epi16(0x8000);
-  return _mm256_xor_si256(_mm256_avg_epu16(_mm256_xor_si256(a, sign_bit),
-                                           _mm256_sub_epi16(sign_bit, b)),
-                          sign_bit);
-}
-
-static INLINE __m256i od_mm256_hrsub_epi32(__m256i a, __m256i b) {
-  __m256i neg1;
-  /* It's cheaper to generate -1's than 1's. */
-  neg1 = _mm256_set1_epi64x(-1);
-  /* There is no corresponding PAVGD, but we are not in danger of overflowing
-     a 32-bit register. */
-  return _mm256_srai_epi32(_mm256_sub_epi32(a, _mm256_add_epi32(b, neg1)), 1);
-}
-
-static INLINE void od_swap_si128(__m128i *q0, __m128i *q1) {
-  __m128i t;
-  t = *q0;
-  *q0 = *q1;
-  *q1 = t;
-}
-
-static INLINE void od_mm256_swap_si256(__m256i *q0, __m256i *q1) {
-  __m256i t;
-  t = *q0;
-  *q0 = *q1;
-  *q1 = t;
-}
-
-static INLINE __m128i od_mulhrs_epi16(__m128i a, int16_t b) {
-  return _mm_mulhrs_epi16(a, _mm_set1_epi16(b));
-}
-
-static INLINE __m128i od_mul_epi16(__m128i a, int32_t b, int r) {
-  int32_t b_q15;
-  b_q15 = b << (15 - r);
-  /* b and r are in all cases compile-time constants, so these branches
-     disappear when this function gets inlined. */
-  if (b_q15 > 32767) {
-    return _mm_add_epi16(a, od_mulhrs_epi16(a, (int16_t)(b_q15 - 32768)));
-  } else if (b_q15 < -32767) {
-    return _mm_sub_epi16(od_mulhrs_epi16(a, (int16_t)(32768 + b_q15)), a);
-  } else {
-    return od_mulhrs_epi16(a, b_q15);
-  }
-}
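/* A scalar sketch of the per-lane result above; illustrative only (the helper
   is hypothetical), assumes arithmetic right shifts and ignores 16-bit
   wraparound in the vector registers.  b is a Q(r) fixed-point constant
   rescaled to Q15, and PMULHRSW returns (a * b_q15 + (1 << 14)) >> 15, i.e.
   a * b rounded at precision r.  The out-of-range branches above split b_q15
   as +/-32768 plus a remainder; the +/-32768 part contributes exactly +/-a,
   since (a << 15) >> 15 == a. */
static INLINE int32_t od_mul_epi16_scalar_sketch(int16_t a, int32_t b, int r) {
  const int32_t b_q15 = b << (15 - r);
  return (a * b_q15 + (1 << 14)) >> 15;
}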
-
-static INLINE __m256i od_mm256_mulhrs_epi16(__m256i a, int16_t b) {
-  return _mm256_mulhrs_epi16(a, _mm256_set1_epi16(b));
-}
-
-static INLINE __m256i od_mm256_mul_epi16(__m256i a, int32_t b, int r) {
-  int32_t b_q15;
-  b_q15 = b << (15 - r);
-  /* b and r are in all cases compile-time constants, so these branches
-     disappear when this function gets inlined. */
-  if (b_q15 > 32767) {
-    return _mm256_add_epi16(a,
-                            od_mm256_mulhrs_epi16(a, (int16_t)(b_q15 - 32768)));
-  } else if (b_q15 < -32767) {
-    return _mm256_sub_epi16(od_mm256_mulhrs_epi16(a, (int16_t)(32768 + b_q15)),
-                            a);
-  } else {
-    return od_mm256_mulhrs_epi16(a, b_q15);
-  }
-}
-
-static INLINE __m256i od_mm256_mul_epi32(__m256i a, int32_t b, int r) {
-  __m256i neg1;
-  /* It's cheaper to generate -1's than 1's. */
-  neg1 = _mm256_set1_epi64x(-1);
-  /* There's no 32-bit version of PMULHRSW on x86 like there is on ARM.*/
-  a = _mm256_mullo_epi32(a, _mm256_set1_epi32(b));
-  a = _mm256_srai_epi32(a, r - 1);
-  a = _mm256_sub_epi32(a, neg1);
-  return _mm256_srai_epi32(a, 1);
-}
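/* A scalar sketch of the 32-bit rounding multiply above; illustrative only
   (the helper is hypothetical) and assumes arithmetic right shifts.  With no
   32-bit PMULHRSW available, the product is shifted down by r - 1, rounded by
   adding 1 (via subtracting -1), and shifted once more, i.e. a * b / 2^r
   rounded to nearest with ties toward +infinity. */
static INLINE int32_t od_mul_epi32_scalar_sketch(int32_t a, int32_t b, int r) {
  return (((a * b) >> (r - 1)) + 1) >> 1;
}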
-
-static INLINE __m128i od_hbd_max_epi16(int bd) {
-  return _mm_set1_epi16((1 << bd) - 1);
-}
-
-static INLINE __m256i od_mm256_hbd_max_epi16(int bd) {
-  return _mm256_set1_epi16((1 << bd) - 1);
-}
-
-static INLINE __m128i od_hbd_clamp_epi16(__m128i a, __m128i max) {
-  return _mm_max_epi16(_mm_setzero_si128(), _mm_min_epi16(a, max));
-}
-
-static INLINE __m256i od_mm256_hbd_clamp_epi16(__m256i a, __m256i max) {
-  return _mm256_max_epi16(_mm256_setzero_si256(), _mm256_min_epi16(a, max));
-}
-
-/* Loads a 4x4 buffer of 32-bit values into four SSE registers. */
-static INLINE void od_load_buffer_4x4_epi32(__m128i *q0, __m128i *q1,
-                                            __m128i *q2, __m128i *q3,
-                                            const tran_low_t *in) {
-  *q0 = _mm_loadu_si128((const __m128i *)in + 0);
-  *q1 = _mm_loadu_si128((const __m128i *)in + 1);
-  *q2 = _mm_loadu_si128((const __m128i *)in + 2);
-  *q3 = _mm_loadu_si128((const __m128i *)in + 3);
-}
-
-/* Loads a 4x4 buffer of 16-bit values into four SSE registers. */
-static INLINE void od_load_buffer_4x4_epi16(__m128i *q0, __m128i *q1,
-                                            __m128i *q2, __m128i *q3,
-                                            const int16_t *in) {
-  *q0 = _mm_loadu_si128((const __m128i *)in + 0);
-  *q1 = _mm_unpackhi_epi64(*q0, *q0);
-  *q2 = _mm_loadu_si128((const __m128i *)in + 1);
-  *q3 = _mm_unpackhi_epi64(*q2, *q2);
-}
-
-/* Loads an 8x4 buffer of 16-bit values into four SSE registers. */
-static INLINE void od_load_buffer_8x4_epi16(__m128i *q0, __m128i *q1,
-                                            __m128i *q2, __m128i *q3,
-                                            const int16_t *in, int in_stride) {
-  *q0 = _mm_loadu_si128((const __m128i *)(in + 0 * in_stride));
-  *q1 = _mm_loadu_si128((const __m128i *)(in + 1 * in_stride));
-  *q2 = _mm_loadu_si128((const __m128i *)(in + 2 * in_stride));
-  *q3 = _mm_loadu_si128((const __m128i *)(in + 3 * in_stride));
-}
-
-/* Loads an 8x4 buffer of 32-bit values and packs them into 16-bit values in
-   four SSE registers. */
-static INLINE void od_load_pack_buffer_8x4_epi32(__m128i *r0, __m128i *r1,
-                                                 __m128i *r2, __m128i *r3,
-                                                 const tran_low_t *in) {
-  __m128i r4;
-  __m128i r5;
-  __m128i r6;
-  __m128i r7;
-  *r0 = _mm_loadu_si128((const __m128i *)in + 0);
-  r4 = _mm_loadu_si128((const __m128i *)in + 1);
-  *r1 = _mm_loadu_si128((const __m128i *)in + 2);
-  r5 = _mm_loadu_si128((const __m128i *)in + 3);
-  *r2 = _mm_loadu_si128((const __m128i *)in + 4);
-  r6 = _mm_loadu_si128((const __m128i *)in + 5);
-  *r3 = _mm_loadu_si128((const __m128i *)in + 6);
-  r7 = _mm_loadu_si128((const __m128i *)in + 7);
-  *r0 = _mm_packs_epi32(*r0, r4);
-  *r1 = _mm_packs_epi32(*r1, r5);
-  *r2 = _mm_packs_epi32(*r2, r6);
-  *r3 = _mm_packs_epi32(*r3, r7);
-}
-
-/* Loads an 8x4 buffer of 32-bit values into four AVX registers. */
-static INLINE void od_load_buffer_8x4_epi32(__m256i *r0, __m256i *r1,
-                                            __m256i *r2, __m256i *r3,
-                                            const tran_low_t *in) {
-  *r0 = _mm256_loadu_si256((const __m256i *)in + 0);
-  *r1 = _mm256_loadu_si256((const __m256i *)in + 1);
-  *r2 = _mm256_loadu_si256((const __m256i *)in + 2);
-  *r3 = _mm256_loadu_si256((const __m256i *)in + 3);
-}
-
-/* Loads a 16x4 buffer of 16-bit values into four AVX registers. */
-static INLINE void od_load_buffer_16x4_epi16(__m256i *r0, __m256i *r1,
-                                             __m256i *r2, __m256i *r3,
-                                             const int16_t *in, int in_stride) {
-  *r0 = _mm256_loadu_si256((const __m256i *)(in + 0 * in_stride));
-  *r1 = _mm256_loadu_si256((const __m256i *)(in + 1 * in_stride));
-  *r2 = _mm256_loadu_si256((const __m256i *)(in + 2 * in_stride));
-  *r3 = _mm256_loadu_si256((const __m256i *)(in + 3 * in_stride));
-}
-
-/* Stores a 4x4 buffer of 16-bit values from two SSE registers.
-   Each register holds two rows of values. */
-static INLINE void od_store_buffer_4x4_epi16(int16_t *out, __m128i q0,
-                                             __m128i q1) {
-  _mm_storeu_si128((__m128i *)out + 0, q0);
-  _mm_storeu_si128((__m128i *)out + 1, q1);
-}
-
-/* Stores a 4x8 buffer of 16-bit values from four SSE registers.
-   Each register holds two rows of values. */
-static INLINE void od_store_buffer_4x8_epi16(int16_t *out, __m128i q0,
-                                             __m128i q1, __m128i q2,
-                                             __m128i q3) {
-  _mm_storeu_si128((__m128i *)out + 0, q0);
-  _mm_storeu_si128((__m128i *)out + 1, q1);
-  _mm_storeu_si128((__m128i *)out + 2, q2);
-  _mm_storeu_si128((__m128i *)out + 3, q3);
-}
-
-static INLINE void od_store_buffer_2x16_epi16(int16_t *out, __m256i r0,
-                                              __m256i r1) {
-  _mm256_storeu_si256((__m256i *)out + 0, r0);
-  _mm256_storeu_si256((__m256i *)out + 1, r1);
-}
-
-/* Loads a 4x4 buffer of 16-bit values, adds a 4x4 block of 16-bit values to
-   them, clamps to high bit depth, and stores the sum back. */
-static INLINE void od_add_store_buffer_hbd_4x4_epi16(void *output_pixels,
-                                                     int output_stride,
-                                                     __m128i q0, __m128i q1,
-                                                     __m128i q2, __m128i q3,
-                                                     int bd) {
-  uint16_t *output_pixels16;
-  __m128i p0;
-  __m128i p1;
-  __m128i p2;
-  __m128i p3;
-  __m128i max;
-  __m128i round;
-  int downshift;
-  output_pixels16 = CONVERT_TO_SHORTPTR(output_pixels);
-  max = od_hbd_max_epi16(bd);
-  downshift = TX_COEFF_DEPTH - bd;
-  round = _mm_set1_epi16((1 << downshift) >> 1);
-  p0 = _mm_loadl_epi64((const __m128i *)(output_pixels16 + 0 * output_stride));
-  p1 = _mm_loadl_epi64((const __m128i *)(output_pixels16 + 1 * output_stride));
-  p2 = _mm_loadl_epi64((const __m128i *)(output_pixels16 + 2 * output_stride));
-  p3 = _mm_loadl_epi64((const __m128i *)(output_pixels16 + 3 * output_stride));
-  q0 = _mm_srai_epi16(_mm_add_epi16(q0, round), downshift);
-  q1 = _mm_srai_epi16(_mm_add_epi16(q1, round), downshift);
-  q2 = _mm_srai_epi16(_mm_add_epi16(q2, round), downshift);
-  q3 = _mm_srai_epi16(_mm_add_epi16(q3, round), downshift);
-  p0 = od_hbd_clamp_epi16(_mm_add_epi16(p0, q0), max);
-  p1 = od_hbd_clamp_epi16(_mm_add_epi16(p1, q1), max);
-  p2 = od_hbd_clamp_epi16(_mm_add_epi16(p2, q2), max);
-  p3 = od_hbd_clamp_epi16(_mm_add_epi16(p3, q3), max);
-  _mm_storel_epi64((__m128i *)(output_pixels16 + 0 * output_stride), p0);
-  _mm_storel_epi64((__m128i *)(output_pixels16 + 1 * output_stride), p1);
-  _mm_storel_epi64((__m128i *)(output_pixels16 + 2 * output_stride), p2);
-  _mm_storel_epi64((__m128i *)(output_pixels16 + 3 * output_stride), p3);
-}
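/* A scalar sketch of the per-pixel operation above; illustrative only (the
   helper is hypothetical; no concrete TX_COEFF_DEPTH value is assumed).  Each
   residual is shifted down from TX_COEFF_DEPTH to the pixel bit depth with
   round-to-nearest, added to the prediction, then clamped to the valid
   high-bit-depth pixel range. */
static INLINE uint16_t od_add_clamp_hbd_scalar_sketch(uint16_t pred,
                                                      int16_t resid,
                                                      int downshift, int bd) {
  int32_t v = pred + ((resid + ((1 << downshift) >> 1)) >> downshift);
  if (v < 0) v = 0;
  if (v > (1 << bd) - 1) v = (1 << bd) - 1;
  return (uint16_t)v;
}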
-
-/* Loads an 8x4 buffer of 16-bit values, adds a 8x4 block of 16-bit values to
-   them, clamps to the high bit depth max, and stores the sum back. */
-static INLINE void od_add_store_buffer_hbd_8x4_epi16(void *output_pixels,
-                                                     int output_stride,
-                                                     __m128i q0, __m128i q1,
-                                                     __m128i q2, __m128i q3,
-                                                     int bd) {
-  uint16_t *output_pixels16;
-  __m128i p0;
-  __m128i p1;
-  __m128i p2;
-  __m128i p3;
-  __m128i max;
-  __m128i round;
-  int downshift;
-  output_pixels16 = CONVERT_TO_SHORTPTR(output_pixels);
-  max = od_hbd_max_epi16(bd);
-  downshift = TX_COEFF_DEPTH - bd;
-  round = _mm_set1_epi16((1 << downshift) >> 1);
-  p0 = _mm_loadu_si128((const __m128i *)(output_pixels16 + 0 * output_stride));
-  p1 = _mm_loadu_si128((const __m128i *)(output_pixels16 + 1 * output_stride));
-  p2 = _mm_loadu_si128((const __m128i *)(output_pixels16 + 2 * output_stride));
-  p3 = _mm_loadu_si128((const __m128i *)(output_pixels16 + 3 * output_stride));
-  q0 = _mm_srai_epi16(_mm_add_epi16(q0, round), downshift);
-  q1 = _mm_srai_epi16(_mm_add_epi16(q1, round), downshift);
-  q2 = _mm_srai_epi16(_mm_add_epi16(q2, round), downshift);
-  q3 = _mm_srai_epi16(_mm_add_epi16(q3, round), downshift);
-  p0 = od_hbd_clamp_epi16(_mm_add_epi16(p0, q0), max);
-  p1 = od_hbd_clamp_epi16(_mm_add_epi16(p1, q1), max);
-  p2 = od_hbd_clamp_epi16(_mm_add_epi16(p2, q2), max);
-  p3 = od_hbd_clamp_epi16(_mm_add_epi16(p3, q3), max);
-  _mm_storeu_si128((__m128i *)(output_pixels16 + 0 * output_stride), p0);
-  _mm_storeu_si128((__m128i *)(output_pixels16 + 1 * output_stride), p1);
-  _mm_storeu_si128((__m128i *)(output_pixels16 + 2 * output_stride), p2);
-  _mm_storeu_si128((__m128i *)(output_pixels16 + 3 * output_stride), p3);
-}
-
-static INLINE void od_add_store_buffer_hbd_16x4_epi16(void *output_pixels,
-                                                      int output_stride,
-                                                      __m256i r0, __m256i r1,
-                                                      __m256i r2, __m256i r3,
-                                                      int bd) {
-  uint16_t *output_pixels16;
-  __m256i p0;
-  __m256i p1;
-  __m256i p2;
-  __m256i p3;
-  __m256i max;
-  __m256i round;
-  int downshift;
-  output_pixels16 = CONVERT_TO_SHORTPTR(output_pixels);
-  max = od_mm256_hbd_max_epi16(bd);
-  downshift = TX_COEFF_DEPTH - bd;
-  round = _mm256_set1_epi16((1 << downshift) >> 1);
-  p0 = _mm256_loadu_si256(
-      (const __m256i *)(output_pixels16 + 0 * output_stride));
-  p1 = _mm256_loadu_si256(
-      (const __m256i *)(output_pixels16 + 1 * output_stride));
-  p2 = _mm256_loadu_si256(
-      (const __m256i *)(output_pixels16 + 2 * output_stride));
-  p3 = _mm256_loadu_si256(
-      (const __m256i *)(output_pixels16 + 3 * output_stride));
-  r0 = _mm256_srai_epi16(_mm256_add_epi16(r0, round), downshift);
-  r1 = _mm256_srai_epi16(_mm256_add_epi16(r1, round), downshift);
-  r2 = _mm256_srai_epi16(_mm256_add_epi16(r2, round), downshift);
-  r3 = _mm256_srai_epi16(_mm256_add_epi16(r3, round), downshift);
-  p0 = od_mm256_hbd_clamp_epi16(_mm256_add_epi16(p0, r0), max);
-  p1 = od_mm256_hbd_clamp_epi16(_mm256_add_epi16(p1, r1), max);
-  p2 = od_mm256_hbd_clamp_epi16(_mm256_add_epi16(p2, r2), max);
-  p3 = od_mm256_hbd_clamp_epi16(_mm256_add_epi16(p3, r3), max);
-  _mm256_storeu_si256((__m256i *)(output_pixels16 + 0 * output_stride), p0);
-  _mm256_storeu_si256((__m256i *)(output_pixels16 + 1 * output_stride), p1);
-  _mm256_storeu_si256((__m256i *)(output_pixels16 + 2 * output_stride), p2);
-  _mm256_storeu_si256((__m256i *)(output_pixels16 + 3 * output_stride), p3);
-}
-
-static INLINE void od_transpose_pack4x4(__m128i *q0, __m128i *q1, __m128i *q2,
-                                        __m128i *q3) {
-  __m128i a;
-  __m128i b;
-  __m128i c;
-  __m128i d;
-  /* Input:
-     q0: q30 q20 q10 q00
-     q1: q31 q21 q11 q01
-     q2: q32 q22 q12 q02
-     q3: q33 q23 q13 q03
-  */
-  /* a: q32 q22 q12 q02 q30 q20 q10 q00 */
-  a = _mm_packs_epi32(*q0, *q2);
-  /* b: q33 q23 q13 q03 q31 q21 q11 q01 */
-  b = _mm_packs_epi32(*q1, *q3);
-  /* c: q31 q30 q21 q20 q11 q10 q01 q00 */
-  c = _mm_unpacklo_epi16(a, b);
-  /* d: q33 q32 q23 q22 q13 q12 q03 q02 */
-  d = _mm_unpackhi_epi16(a, b);
-  /* We don't care about the contents of the high half of each register. */
-  /* q0: q13 q12 q11 q10 [q03 q02 q01 q00] */
-  *q0 = _mm_unpacklo_epi32(c, d);
-  /* q1: q13 q12 q11 q10 [q13 q12 q11 q10] */
-  *q1 = _mm_unpackhi_epi64(*q0, *q0);
-  /* q2: q33 q32 q31 q30 [q23 q22 q21 q20] */
-  *q2 = _mm_unpackhi_epi32(c, d);
-  /* q3: q33 q32 q31 q30 [q33 q32 q31 q30] */
-  *q3 = _mm_unpackhi_epi64(*q2, *q2);
-}
-
-static INLINE void od_transpose4x4(__m128i *q0, __m128i q1, __m128i *q2,
-                                   __m128i q3) {
-  __m128i a;
-  __m128i b;
-  /* Input:
-     q0: ... ... ... ... q30 q20 q10 q00
-     q1: ... ... ... ... q31 q21 q11 q01
-     q2: ... ... ... ... q32 q22 q12 q02
-     q3: ... ... ... ... q33 q23 q13 q03
-  */
-  /* a: q31 q30 q21 q20 q11 q10 q01 q00 */
-  a = _mm_unpacklo_epi16(*q0, q1);
-  /* b: q33 q32 q23 q22 q13 q12 q03 q02 */
-  b = _mm_unpacklo_epi16(*q2, q3);
-  /* q0: q13 q12 q11 q10 | q03 q02 q01 q00 */
-  *q0 = _mm_unpacklo_epi32(a, b);
-  /* q2: q33 q32 q31 q30 | q23 q22 q21 q20 */
-  *q2 = _mm_unpackhi_epi32(a, b);
-}
-
-static INLINE void od_transpose4x8(__m128i *r0, __m128i r1, __m128i *r2,
-                                   __m128i r3, __m128i *r4, __m128i r5,
-                                   __m128i *r6, __m128i r7) {
-  __m128i a;
-  __m128i b;
-  /* Input:
-     q0: ... ... ... ... q30 q20 q10 q00
-     q1: ... ... ... ... q31 q21 q11 q01
-     q2: ... ... ... ... q32 q22 q12 q02
-     q3: ... ... ... ... q33 q23 q13 q03
-     q4: ... ... ... ... q34 q24 q14 q04
-     q5: ... ... ... ... q35 q25 q15 q05
-     q6: ... ... ... ... q36 q26 q16 q06
-     q7: ... ... ... ... q37 q27 q17 q07
-  */
-  /* r0: r13 r12 r11 r10 r03 r02 r01 r00
-     r2: r33 r32 r31 r30 r23 r22 r21 r20 */
-  od_transpose4x4(r0, r1, r2, r3);
-  /* r4: r17 r16 r15 r14 r07 r06 r05 r04
-     r6: r37 r36 r35 r34 r27 r26 r25 r24 */
-  od_transpose4x4(r4, r5, r6, r7);
-  a = *r0;
-  b = *r2;
-  /* r0: r07 r06 r05 r04 r03 r02 r01 r00 */
-  *r0 = _mm_unpacklo_epi64(a, *r4);
-  /* r2: r17 r16 r15 r14 r13 r12 r11 r10 */
-  *r2 = _mm_unpackhi_epi64(a, *r4);
-  /* r4: r27 r26 r25 r24 r23 r22 r21 r20 */
-  *r4 = _mm_unpacklo_epi64(b, *r6);
-  /* r6: r37 r36 r35 r34 r33 r32 r31 r30 */
-  *r6 = _mm_unpackhi_epi64(b, *r6);
-}
-
-static INLINE void od_transpose8x4(__m128i *q0, __m128i *q1, __m128i *q2,
-                                   __m128i *q3) {
-  __m128i a;
-  __m128i b;
-  __m128i c;
-  __m128i d;
-  /* Input:
-     q0: q07 q06 q05 q04 q03 q02 q01 q00
-     q1: q17 q16 q15 q14 q13 q12 q11 q10
-     q2: q27 q26 q25 q24 q23 q22 q21 q20
-     q3: q37 q36 q35 q34 q33 q32 q31 q30
-  */
-  /* a: q13 q03 q12 q02 q11 q01 q10 q00 */
-  a = _mm_unpacklo_epi16(*q0, *q1);
-  /* b: q17 q07 q16 q06 q15 q05 q14 q04 */
-  b = _mm_unpackhi_epi16(*q0, *q1);
-  /* c: q33 q23 q32 q22 q31 q21 q30 q20 */
-  c = _mm_unpacklo_epi16(*q2, *q3);
-  /* d: q37 q27 q36 q26 q35 q25 q34 q24 */
-  d = _mm_unpackhi_epi16(*q2, *q3);
-  /* q0: q31 q21 q11 q01 | q30 q20 q10 q00 */
-  *q0 = _mm_unpacklo_epi32(a, c);
-  /* q1: q33 q23 q13 q03 | q32 q22 q12 q02 */
-  *q1 = _mm_unpackhi_epi32(a, c);
-  /* q2: q35 q25 q15 q05 | q34 q24 q14 q04 */
-  *q2 = _mm_unpacklo_epi32(b, d);
-  /* q3: q37 q27 q17 q07 | q36 q26 q16 q06 */
-  *q3 = _mm_unpackhi_epi32(b, d);
-}
-
-static INLINE void od_transpose_pack4x8(__m128i *q0, __m128i *q1, __m128i *q2,
-                                        __m128i *q3, __m128i q4, __m128i q5,
-                                        __m128i q6, __m128i q7) {
-  __m128i a;
-  __m128i b;
-  __m128i c;
-  __m128i d;
-  /* Input:
-     q0: q30 q20 q10 q00
-     q1: q31 q21 q11 q01
-     q2: q32 q22 q12 q02
-     q3: q33 q23 q13 q03
-     q4: q34 q24 q14 q04
-     q5: q35 q25 q15 q05
-     q6: q36 q26 q16 q06
-     q7: q37 q27 q17 q07
-  */
-  /* a: q34 q24 q14 q04 q30 q20 q10 q00 */
-  a = _mm_packs_epi32(*q0, q4);
-  /* b: q35 q25 q15 q05 q31 q21 q11 q01 */
-  b = _mm_packs_epi32(*q1, q5);
-  /* c: q36 q26 q16 q06 q32 q22 q12 q02 */
-  c = _mm_packs_epi32(*q2, q6);
-  /* d: q37 q27 q17 q07 q33 q23 q13 q03 */
-  d = _mm_packs_epi32(*q3, q7);
-  /* a: q13 q12 q11 q10 q03 q02 q01 q00
-     b: q33 q32 q31 q30 q23 q22 q21 q20
-     c: q17 q16 q15 q14 q07 q06 q05 q04
-     d: q37 q36 q35 q34 q27 q26 q25 q24 */
-  od_transpose8x4(&a, &b, &c, &d);
-  /* q0: q07 q06 q05 q04 q03 q02 q01 q00 */
-  *q0 = _mm_unpacklo_epi64(a, c);
-  /* q1: q17 q16 q15 q14 q13 q12 q11 q10 */
-  *q1 = _mm_unpackhi_epi64(a, c);
-  /* q2: q27 q26 q25 q24 q23 q22 q21 q20 */
-  *q2 = _mm_unpacklo_epi64(b, d);
-  /* q3: q37 q36 q35 q34 q33 q32 q31 q30 */
-  *q3 = _mm_unpackhi_epi64(b, d);
-}
-
-static INLINE void od_transpose_pack8x4(__m128i *r0, __m128i *r1, __m128i *r2,
-                                        __m128i *r3, __m128i *r4, __m128i *r5,
-                                        __m128i *r6, __m128i *r7) {
-  /* Input:
-     r1: r07 r06 r05 r04  r0: r03 r02 r01 r00
-     r3: r17 r16 r15 r14  r2: r13 r12 r11 r10
-     r5: r27 r26 r25 r24  r4: r23 r22 r21 r20
-     r7: r37 r36 r35 r34  r6: r33 r32 r31 r30
-  */
-  /* r0: r07 r06 r05 r04 r03 r02 r01 r00 */
-  *r0 = _mm_packs_epi32(*r0, *r1);
-  /* r2: r17 r16 r15 r14 r13 r12 r11 r10 */
-  *r2 = _mm_packs_epi32(*r2, *r3);
-  /* r4: r27 r26 r25 r24 r23 r22 r21 r20 */
-  *r4 = _mm_packs_epi32(*r4, *r5);
-  /* r6: r37 r36 r35 r34 r33 r32 r31 r30 */
-  *r6 = _mm_packs_epi32(*r6, *r7);
-  /* r0: r31 r21 r11 r01 [r30 r20 r10 r00]
-     r2: r33 r23 r13 r03 [r32 r22 r12 r02]
-     r4: r35 r25 r15 r05 [r34 r24 r14 r04]
-     r6: r37 r27 r17 r07 [r36 r26 r16 r06] */
-  od_transpose8x4(r0, r2, r4, r6);
-  /* We don't care about the contents of the high half of each register. */
-  /* r1: r31 r21 r11 r01 [r31 r21 r11 r01] */
-  *r1 = _mm_unpackhi_epi64(*r0, *r0);
-  /* r3: r33 r23 r13 r03 [r33 r23 r13 r03] */
-  *r3 = _mm_unpackhi_epi64(*r2, *r2);
-  /* r5: r35 r25 r15 r05 [r35 r25 r15 r05] */
-  *r5 = _mm_unpackhi_epi64(*r4, *r4);
-  /* r7: r37 r27 r17 r07 [r37 r27 r17 r07] */
-  *r7 = _mm_unpackhi_epi64(*r6, *r6);
-}
-
-static INLINE void od_transpose8x8_epi16(__m128i *r0, __m128i *r1, __m128i *r2,
-                                         __m128i *r3, __m128i *r4, __m128i *r5,
-                                         __m128i *r6, __m128i *r7) {
-  __m128i r8;
-  /*8x8 transpose with only 1 temporary register that takes the rows in order
-    and returns the columns in order. The compiler's own register allocator
-    will probably screw this up, but that's no reason not to pretend we might
-    be able to have nice things. This only matters when we port to pre-AVX
-    instruction sets without 3-operand instructions.*/
-  r8 = *r4;
-  *r4 = _mm_unpacklo_epi16(*r4, *r5);
-  r8 = _mm_unpackhi_epi16(r8, *r5);
-  *r5 = *r0;
-  *r0 = _mm_unpacklo_epi16(*r0, *r1);
-  *r5 = _mm_unpackhi_epi16(*r5, *r1);
-  *r1 = *r6;
-  *r6 = _mm_unpacklo_epi16(*r6, *r7);
-  *r1 = _mm_unpackhi_epi16(*r1, *r7);
-  *r7 = *r2;
-  *r2 = _mm_unpackhi_epi16(*r2, *r3);
-  *r7 = _mm_unpacklo_epi16(*r7, *r3);
-  *r3 = *r0;
-  *r0 = _mm_unpacklo_epi32(*r0, *r7);
-  *r3 = _mm_unpackhi_epi32(*r3, *r7);
-  *r7 = *r5;
-  *r5 = _mm_unpacklo_epi32(*r5, *r2);
-  *r7 = _mm_unpackhi_epi32(*r7, *r2);
-  *r2 = *r4;
-  *r4 = _mm_unpackhi_epi32(*r4, *r6);
-  *r2 = _mm_unpacklo_epi32(*r2, *r6);
-  *r6 = r8;
-  r8 = _mm_unpackhi_epi32(r8, *r1);
-  *r6 = _mm_unpacklo_epi32(*r6, *r1);
-  *r1 = *r0;
-  *r0 = _mm_unpacklo_epi64(*r0, *r2);
-  *r1 = _mm_unpackhi_epi64(*r1, *r2);
-  *r2 = *r3;
-  *r3 = _mm_unpackhi_epi64(*r3, *r4);
-  *r2 = _mm_unpacklo_epi64(*r2, *r4);
-  *r4 = *r5;
-  *r5 = _mm_unpackhi_epi64(*r5, *r6);
-  *r4 = _mm_unpacklo_epi64(*r4, *r6);
-  *r6 = *r7;
-  *r7 = _mm_unpackhi_epi64(*r7, r8);
-  *r6 = _mm_unpacklo_epi64(*r6, r8);
-}
-
-static INLINE void od_transpose8x8_epi32(__m256i *r0, __m256i *r1, __m256i *r2,
-                                         __m256i *r3, __m256i *r4, __m256i *r5,
-                                         __m256i *r6, __m256i *r7) {
-  __m256i a;
-  __m256i b;
-  __m256i c;
-  __m256i d;
-  __m256i e;
-  __m256i f;
-  __m256i g;
-  __m256i h;
-  __m256i x;
-  __m256i y;
-  a = _mm256_unpacklo_epi32(*r0, *r1);
-  b = _mm256_unpacklo_epi32(*r2, *r3);
-  c = _mm256_unpackhi_epi32(*r0, *r1);
-  d = _mm256_unpackhi_epi32(*r2, *r3);
-  e = _mm256_unpacklo_epi32(*r4, *r5);
-  f = _mm256_unpacklo_epi32(*r6, *r7);
-  g = _mm256_unpackhi_epi32(*r4, *r5);
-  h = _mm256_unpackhi_epi32(*r6, *r7);
-  x = _mm256_unpacklo_epi64(a, b);
-  y = _mm256_unpacklo_epi64(e, f);
-  *r0 = _mm256_permute2x128_si256(x, y, 0 | (2 << 4));
-  *r4 = _mm256_permute2x128_si256(x, y, 1 | (3 << 4));
-  x = _mm256_unpackhi_epi64(a, b);
-  y = _mm256_unpackhi_epi64(e, f);
-  *r1 = _mm256_permute2x128_si256(x, y, 0 | (2 << 4));
-  *r5 = _mm256_permute2x128_si256(x, y, 1 | (3 << 4));
-  x = _mm256_unpacklo_epi64(c, d);
-  y = _mm256_unpacklo_epi64(g, h);
-  *r2 = _mm256_permute2x128_si256(x, y, 0 | (2 << 4));
-  *r6 = _mm256_permute2x128_si256(x, y, 1 | (3 << 4));
-  x = _mm256_unpackhi_epi64(c, d);
-  y = _mm256_unpackhi_epi64(g, h);
-  *r3 = _mm256_permute2x128_si256(x, y, 0 | (2 << 4));
-  *r7 = _mm256_permute2x128_si256(x, y, 1 | (3 << 4));
-}
-
-/* Packs two blocks of 4x8 32-bit words into 16-bit words and returns the
-   transpose of each packed into the high and low halves of each register. */
-static INLINE void od_transpose_pack4x8x2_epi32(__m256i *out0, __m256i *out1,
-                                                __m256i *out2, __m256i *out3,
-                                                __m256i rr0, __m256i rr1,
-                                                __m256i rr2, __m256i rr3,
-                                                __m256i rr4, __m256i rr5,
-                                                __m256i rr6, __m256i rr7) {
-  __m256i a;
-  __m256i b;
-  __m256i c;
-  __m256i d;
-  __m256i w;
-  __m256i x;
-  __m256i y;
-  __m256i z;
-  /* a: r47 r46 r45 r44 r07 r06 r05 r04 | r43 r42 r41 r40 r03 r02 r01 r00 */
-  a = _mm256_packs_epi32(rr0, rr4);
-  /* b: r57 r56 r55 r54 r17 r16 r15 r14 | r53 r52 r51 r50 r13 r12 r11 r10 */
-  b = _mm256_packs_epi32(rr1, rr5);
-  /* c: r67 r66 r65 r64 r27 r26 r25 r24 | r63 r62 r61 r60 r23 r22 r21 r20 */
-  c = _mm256_packs_epi32(rr2, rr6);
-  /* d: r77 r76 r75 r74 r37 r36 r35 r34 | r73 r72 r71 r70 r33 r32 r31 r30 */
-  d = _mm256_packs_epi32(rr3, rr7);
-  /* w: r17 r07 r16 r06 r15 r05 r14 r04 | r13 r03 r12 r02 r11 r01 r10 r00 */
-  w = _mm256_unpacklo_epi16(a, b);
-  /* x: r57 r47 r56 r46 r55 r45 r54 r44 | r53 r43 r52 r42 r51 r41 r50 r40 */
-  x = _mm256_unpackhi_epi16(a, b);
-  /* y: r37 r27 r36 r26 r35 r25 r34 r24 | r33 r23 r32 r22 r31 r21 r30 r20 */
-  y = _mm256_unpacklo_epi16(c, d);
-  /* z: r77 r67 r76 r66 r75 r65 r74 r64 | r73 r63 r72 r62 r71 r61 r70 r60 */
-  z = _mm256_unpackhi_epi16(c, d);
-  /* a: r35 r25 r15 r05 r34 r24 r14 r04 | r31 r21 r11 r01 r30 r20 r10 r00 */
-  a = _mm256_unpacklo_epi32(w, y);
-  /* b: r37 r27 r17 r07 r36 r26 r16 r06 | r33 r23 r13 r03 r32 r22 r12 r02 */
-  b = _mm256_unpackhi_epi32(w, y);
-  /* c: r75 r65 r55 r45 r74 r64 r54 r44 | r71 r61 r51 r41 r70 r60 r50 r40 */
-  c = _mm256_unpacklo_epi32(x, z);
-  /* d: r77 r67 r57 r47 r76 r66 r56 r46 | r73 r63 r53 r43 r72 r62 r52 r42 */
-  d = _mm256_unpackhi_epi32(x, z);
-  /* out0: r74 r64 r54 r44 r34 r24 r14 r04 | r70 r60 r50 r40 r30 r20 r10 r00 */
-  *out0 = _mm256_unpacklo_epi64(a, c);
-  /* out1: r75 r65 r55 r45 r35 r25 r15 r05 | r71 r61 r51 r41 r31 r21 r11 r01 */
-  *out1 = _mm256_unpackhi_epi64(a, c);
-  /* out2: r76 r66 r56 r46 r36 r26 r16 r06 | r72 r62 r52 r42 r32 r22 r12 r02 */
-  *out2 = _mm256_unpacklo_epi64(b, d);
-  /* out3: r77 r67 r57 r47 r37 r27 r17 r07 | r73 r63 r53 r43 r33 r23 r13 r03 */
-  *out3 = _mm256_unpackhi_epi64(b, d);
-}
-
-static INLINE void od_transpose_pack8x8_epi32(__m256i *rr0, __m256i *rr1,
-                                              __m256i *rr2, __m256i *rr3,
-                                              __m256i rr4, __m256i rr5,
-                                              __m256i rr6, __m256i rr7) {
-  __m256i w;
-  __m256i x;
-  __m256i y;
-  __m256i z;
-  /* w: r74 r64 r54 r44 r34 r24 r14 r04 | r70 r60 r50 r40 r30 r20 r10 r00
-     x: r75 r65 r55 r45 r35 r25 r15 r05 | r71 r61 r51 r41 r31 r21 r11 r01
-     y: r76 r66 r56 r46 r36 r26 r16 r06 | r72 r62 r52 r42 r32 r22 r12 r02
-     z: r77 r67 r57 r47 r37 r27 r17 r07 | r73 r63 r53 r43 r33 r23 r13 r03 */
-  od_transpose_pack4x8x2_epi32(&w, &x, &y, &z, *rr0, *rr1, *rr2, *rr3, rr4, rr5,
-                               rr6, rr7);
-  /* rr0: r71 r61 r51 r41 r31 r21 r11 r01 | r70 r60 r50 r40 r30 r20 r10 r00 */
-  *rr0 = _mm256_permute2x128_si256(w, x, 0 | (2 << 4));
-  /* rr1: r73 r63 r53 r43 r33 r23 r13 r03 | r72 r62 r52 r42 r32 r22 r12 r02 */
-  *rr1 = _mm256_permute2x128_si256(y, z, 0 | (2 << 4));
-  /* rr2: r75 r65 r55 r45 r35 r25 r15 r05 | r74 r64 r54 r44 r34 r24 r14 r04 */
-  *rr2 = _mm256_permute2x128_si256(w, x, 1 | (3 << 4));
-  /* rr3: r77 r67 r57 r47 r37 r27 r17 r07 | r76 r66 r56 r46 r36 r26 r16 r06 */
-  *rr3 = _mm256_permute2x128_si256(y, z, 1 | (3 << 4));
-}
-
-static INLINE void od_transpose_pack8x16_epi32(
-    __m256i *ss0, __m256i *ss1, __m256i *ss2, __m256i *ss3, __m256i *ss4,
-    __m256i *ss5, __m256i *ss6, __m256i *ss7, __m256i ss8, __m256i ss9,
-    __m256i ssa, __m256i ssb, __m256i ssc, __m256i ssd, __m256i sse,
-    __m256i ssf) {
-  __m256i a;
-  __m256i b;
-  __m256i c;
-  __m256i d;
-  __m256i e;
-  __m256i f;
-  __m256i g;
-  __m256i h;
-  /* a: s74 s64 s54 s44 s34 s24 s14 s04 | s70 s60 s50 s40 s30 s20 s10 s00
-     b: s75 s65 s55 s45 s35 s25 s15 s05 | s71 s61 s51 s41 s31 s21 s11 s01
-     c: s76 s66 s56 s46 s36 s26 s16 s06 | s72 s62 s52 s42 s32 s22 s12 s02
-     d: s77 s67 s57 s47 s37 s27 s17 s07 | s73 s63 s53 s43 s33 s23 s13 s03 */
-  od_transpose_pack4x8x2_epi32(&a, &b, &c, &d, *ss0, *ss1, *ss2, *ss3, *ss4,
-                               *ss5, *ss6, *ss7);
-  /* e: sf4 se4 sd4 sc4 sb4 sa4 s94 s84 | sf0 se0 sd0 sc0 sb0 sa0 s90 s80
-     f: sf5 se5 sd5 sc5 sb5 sa5 s95 s85 | sf1 se1 sd1 sc1 sb1 sa1 s91 s81
-     g: sf6 se6 sd6 sc6 sb6 sa6 s96 s86 | sf2 se2 sd2 sc2 sb2 sa2 s92 s82
-     h: sf7 se7 sd7 sc7 sb7 sa7 s97 s87 | sf3 se3 sd3 sc3 sb3 sa3 s93 s83 */
-  od_transpose_pack4x8x2_epi32(&e, &f, &g, &h, ss8, ss9, ssa, ssb, ssc, ssd,
-                               sse, ssf);
-  /* ss0: sf0 se0 sd0 sc0 sb0 sa0 s90 s80 | s70 s60 s50 s40 s30 s20 s10 s00 */
-  *ss0 = _mm256_permute2x128_si256(a, e, 0 | (2 << 4));
-  /* ss1: sf1 se1 sd1 sc1 sb1 sa1 s91 s81 | s71 s61 s51 s41 s31 s21 s11 s01 */
-  *ss1 = _mm256_permute2x128_si256(b, f, 0 | (2 << 4));
-  /* ss2: sf2 se2 sd2 sc2 sb2 sa2 s92 s82 | s72 s62 s52 s42 s32 s22 s12 s02 */
-  *ss2 = _mm256_permute2x128_si256(c, g, 0 | (2 << 4));
-  /* ss3: sf3 se3 sd3 sc3 sb3 sa3 s93 s83 | s73 s63 s53 s43 s33 s23 s13 s03 */
-  *ss3 = _mm256_permute2x128_si256(d, h, 0 | (2 << 4));
-  /* ss4: sf4 se4 sd4 sc4 sb4 sa4 s94 s84 | s74 s64 s54 s44 s34 s24 s14 s04 */
-  *ss4 = _mm256_permute2x128_si256(a, e, 1 | (3 << 4));
-  /* ss5: sf5 se5 sd5 sc5 sb5 sa5 s95 s85 | s75 s65 s55 s45 s35 s25 s15 s05 */
-  *ss5 = _mm256_permute2x128_si256(b, f, 1 | (3 << 4));
-  /* ss6: sf6 se6 sd6 sc6 sb6 sa6 s96 s86 | s76 s66 s56 s46 s36 s26 s16 s06 */
-  *ss6 = _mm256_permute2x128_si256(c, g, 1 | (3 << 4));
-  /* ss7: sf7 se7 sd7 sc7 sb7 sa7 s97 s87 | s77 s67 s57 s47 s37 s27 s17 s07 */
-  *ss7 = _mm256_permute2x128_si256(d, h, 1 | (3 << 4));
-}
-
-#undef OD_KERNEL
-#undef OD_WORD
-#undef OD_REG
-#undef OD_ADD
-#undef OD_SUB
-#undef OD_RSHIFT1
-#undef OD_AVG
-#undef OD_HRSUB
-#undef OD_MUL
-#undef OD_SWAP
-
-/* Define 8-wide 16-bit SSSE3 kernels. */
-
-#define OD_KERNEL kernel8
-#define OD_WORD epi16
-#define OD_REG __m128i
-#define OD_ADD _mm_add_epi16
-#define OD_SUB _mm_sub_epi16
-#define OD_RSHIFT1 od_unbiased_rshift1_epi16
-#define OD_AVG od_avg_epi16
-#define OD_HRSUB od_hrsub_epi16
-#define OD_MUL od_mul_epi16
-#define OD_SWAP od_swap_si128
-
-#include "av1/common/x86/daala_tx_kernels.h"
-
-#undef OD_KERNEL
-#undef OD_REG
-#undef OD_ADD
-#undef OD_SUB
-#undef OD_RSHIFT1
-#undef OD_AVG
-#undef OD_HRSUB
-#undef OD_MUL
-#undef OD_SWAP
-
-/* Define 16-wide 16-bit AVX2 kernels. */
-
-#define OD_KERNEL kernel16
-#define OD_REG __m256i
-#define OD_ADD _mm256_add_epi16
-#define OD_SUB _mm256_sub_epi16
-#define OD_RSHIFT1 od_mm256_unbiased_rshift1_epi16
-#define OD_AVG od_mm256_avg_epi16
-#define OD_HRSUB od_mm256_hrsub_epi16
-#define OD_MUL od_mm256_mul_epi16
-#define OD_SWAP od_mm256_swap_si256
-
-#include "av1/common/x86/daala_tx_kernels.h"  // NOLINT
-
-/* Define 8-wide 32-bit AVX2 kernels. */
-
-#undef OD_KERNEL
-#undef OD_WORD
-#undef OD_ADD
-#undef OD_SUB
-#undef OD_RSHIFT1
-#undef OD_AVG
-#undef OD_HRSUB
-#undef OD_MUL
-
-#define OD_KERNEL kernel8
-#define OD_WORD epi32
-#define OD_ADD _mm256_add_epi32
-#define OD_SUB _mm256_sub_epi32
-#define OD_RSHIFT1 od_mm256_unbiased_rshift1_epi32
-#define OD_AVG od_mm256_avg_epi32
-#define OD_HRSUB od_mm256_hrsub_epi32
-#define OD_MUL od_mm256_mul_epi32
-
-#include "av1/common/x86/daala_tx_kernels.h"  // NOLINT
-
-static void od_row_iidtx_avx2(int16_t *out, int coeffs, const tran_low_t *in) {
-  int c;
-  /* The number of rows and number of columns are both multiples of 4, so the
-     total number of coefficients should be a multiple of 16. */
-  assert(!(coeffs & 0xF));
-  /* TODO(any): Use AVX2 for larger block sizes. */
-  for (c = 0; c < coeffs; c += 16) {
-    __m128i q0;
-    __m128i q1;
-    __m128i q2;
-    __m128i q3;
-    od_load_buffer_4x4_epi32(&q0, &q1, &q2, &q3, in + c);
-    q0 = _mm_packs_epi32(q0, q1);
-    q2 = _mm_packs_epi32(q2, q3);
-    od_store_buffer_4x4_epi16(out + c, q0, q2);
-  }
-}
-
-static void od_col_iidtx_add_hbd_avx2(unsigned char *output_pixels,
-                                      int output_stride, int rows, int cols,
-                                      const int16_t *in, int bd) {
-  __m128i q0;
-  __m128i q1;
-  __m128i q2;
-  __m128i q3;
-  if (cols <= 4) {
-    uint16_t *output_pixels16;
-    __m128i p0;
-    __m128i p1;
-    __m128i p2;
-    __m128i p3;
-    __m128i max;
-    __m128i round;
-    int downshift;
-    int hr;
-    output_pixels16 = CONVERT_TO_SHORTPTR(output_pixels);
-    max = od_hbd_max_epi16(bd);
-    downshift = TX_COEFF_DEPTH - bd;
-    round = _mm_set1_epi16((1 << downshift) >> 1);
-    /* Here hr counts half the number of rows, to simplify address calculations
-       when loading two rows of coefficients at once. */
-    for (hr = 0; 2 * hr < rows; hr += 2) {
-      q0 = _mm_loadu_si128((const __m128i *)in + hr + 0);
-      q2 = _mm_loadu_si128((const __m128i *)in + hr + 1);
-      p0 = _mm_loadl_epi64(
-          (const __m128i *)(output_pixels16 + (2 * hr + 0) * output_stride));
-      p1 = _mm_loadl_epi64(
-          (const __m128i *)(output_pixels16 + (2 * hr + 1) * output_stride));
-      p2 = _mm_loadl_epi64(
-          (const __m128i *)(output_pixels16 + (2 * hr + 2) * output_stride));
-      p3 = _mm_loadl_epi64(
-          (const __m128i *)(output_pixels16 + (2 * hr + 3) * output_stride));
-      q0 = _mm_srai_epi16(_mm_add_epi16(q0, round), downshift);
-      q2 = _mm_srai_epi16(_mm_add_epi16(q2, round), downshift);
-      q1 = _mm_unpackhi_epi64(q0, q0);
-      q3 = _mm_unpackhi_epi64(q2, q2);
-      p0 = od_hbd_clamp_epi16(_mm_add_epi16(p0, q0), max);
-      p1 = od_hbd_clamp_epi16(_mm_add_epi16(p1, q1), max);
-      p2 = od_hbd_clamp_epi16(_mm_add_epi16(p2, q2), max);
-      p3 = od_hbd_clamp_epi16(_mm_add_epi16(p3, q3), max);
-      _mm_storel_epi64(
-          (__m128i *)(output_pixels16 + (2 * hr + 0) * output_stride), p0);
-      _mm_storel_epi64(
-          (__m128i *)(output_pixels16 + (2 * hr + 1) * output_stride), p1);
-      _mm_storel_epi64(
-          (__m128i *)(output_pixels16 + (2 * hr + 2) * output_stride), p2);
-      _mm_storel_epi64(
-          (__m128i *)(output_pixels16 + (2 * hr + 3) * output_stride), p3);
-    }
-  } else {
-    int r;
-    for (r = 0; r < rows; r += 4) {
-      int c;
-      /* TODO(any): Use AVX2 for larger column counts. */
-      for (c = 0; c < cols; c += 8) {
-        od_load_buffer_8x4_epi16(&q0, &q1, &q2, &q3, in + r * cols + c, cols);
-        od_add_store_buffer_hbd_8x4_epi16(output_pixels + r * output_stride + c,
-                                          output_stride, q0, q1, q2, q3, bd);
-      }
-    }
-  }
-}
-
-typedef void (*od_tx4_kernel8_epi16)(__m128i *q0, __m128i *q2, __m128i *q1,
-                                     __m128i *q3);
-
-static void od_row_tx4_avx2(int16_t *out, int rows, const tran_low_t *in,
-                            od_tx4_kernel8_epi16 kernel8) {
-  __m128i q0;
-  __m128i q1;
-  __m128i q2;
-  __m128i q3;
-  if (rows <= 4) {
-    od_load_buffer_4x4_epi32(&q0, &q1, &q2, &q3, in);
-    /*TODO(any): Merge this transpose with coefficient scanning.*/
-    od_transpose_pack4x4(&q0, &q1, &q2, &q3);
-    kernel8(&q0, &q1, &q2, &q3);
-    od_transpose4x4(&q0, q2, &q1, q3);
-    od_store_buffer_4x4_epi16(out, q0, q1);
-  } else {
-    int r;
-    /* Higher row counts require 32-bit precision. */
-    assert(rows <= 16);
-    for (r = 0; r < rows; r += 8) {
-      __m128i q4;
-      __m128i q5;
-      __m128i q6;
-      __m128i q7;
-      od_load_buffer_4x4_epi32(&q0, &q1, &q2, &q3, in + 4 * r);
-      od_load_buffer_4x4_epi32(&q4, &q5, &q6, &q7, in + 4 * r + 16);
-      /*TODO(any): Merge this transpose with coefficient scanning.*/
-      od_transpose_pack4x8(&q0, &q1, &q2, &q3, q4, q5, q6, q7);
-      kernel8(&q0, &q1, &q2, &q3);
-      od_transpose8x4(&q0, &q2, &q1, &q3);
-      od_store_buffer_4x8_epi16(out + 4 * r, q0, q2, q1, q3);
-    }
-  }
-}
-
-static void od_col_tx4_add_hbd_avx2(unsigned char *output_pixels,
-                                    int output_stride, int cols,
-                                    const int16_t *in, int bd,
-                                    od_tx4_kernel8_epi16 kernel8) {
-  __m128i q0;
-  __m128i q1;
-  __m128i q2;
-  __m128i q3;
-  if (cols <= 4) {
-    od_load_buffer_4x4_epi16(&q0, &q1, &q2, &q3, in);
-    kernel8(&q0, &q1, &q2, &q3);
-    od_add_store_buffer_hbd_4x4_epi16(output_pixels, output_stride, q0, q2, q1,
-                                      q3, bd);
-  } else {
-    int c;
-    for (c = 0; c < cols; c += 8) {
-      od_load_buffer_8x4_epi16(&q0, &q1, &q2, &q3, in + c, cols);
-      kernel8(&q0, &q1, &q2, &q3);
-      od_add_store_buffer_hbd_8x4_epi16(output_pixels + c, output_stride, q0,
-                                        q2, q1, q3, bd);
-    }
-  }
-}
-
-#if 0
-static void od_row_idct4_avx2(int16_t *out, int rows, const tran_low_t *in) {
-  od_row_tx4_avx2(out, rows, in, od_idct4_kernel8_epi16);
-}
-
-static void od_col_idct4_add_hbd_avx2(unsigned char *output_pixels,
-                                      int output_stride, int cols,
-                                      const int16_t *in, int bd) {
-  od_col_tx4_add_hbd_avx2(output_pixels, output_stride, cols, in, bd,
-                          od_idct4_kernel8_epi16);
-}
-#endif
-
-static void od_row_idst4_avx2(int16_t *out, int rows, const tran_low_t *in) {
-  od_row_tx4_avx2(out, rows, in, od_idst_vii4_kernel8_epi16);
-}
-
-static void od_col_idst4_add_hbd_avx2(unsigned char *output_pixels,
-                                      int output_stride, int cols,
-                                      const int16_t *in, int bd) {
-  od_col_tx4_add_hbd_avx2(output_pixels, output_stride, cols, in, bd,
-                          od_idst_vii4_kernel8_epi16);
-}
-
-static void od_row_flip_idst4_avx2(int16_t *out, int rows,
-                                   const tran_low_t *in) {
-  od_row_tx4_avx2(out, rows, in, od_flip_idst_vii4_kernel8_epi16);
-}
-
-static void od_col_flip_idst4_add_hbd_avx2(unsigned char *output_pixels,
-                                           int output_stride, int cols,
-                                           const int16_t *in, int bd) {
-  od_col_tx4_add_hbd_avx2(output_pixels, output_stride, cols, in, bd,
-                          od_flip_idst_vii4_kernel8_epi16);
-}
-
-static void od_row_iidtx4_avx2(int16_t *out, int rows, const tran_low_t *in) {
-  od_row_iidtx_avx2(out, rows * 4, in);
-}
-
-static void od_col_iidtx4_add_hbd_avx2(unsigned char *output_pixels,
-                                       int output_stride, int cols,
-                                       const int16_t *in, int bd) {
-  od_col_iidtx_add_hbd_avx2(output_pixels, output_stride, 4, cols, in, bd);
-}
-
-typedef void (*od_tx8_kernel8_epi16)(__m128i *r0, __m128i *r4, __m128i *r2,
-                                     __m128i *r6, __m128i *r1, __m128i *r5,
-                                     __m128i *r3, __m128i *r7);
-
-typedef void (*od_tx8_mm256_kernel)(__m256i *r0, __m256i *r4, __m256i *r2,
-                                    __m256i *r6, __m256i *r1, __m256i *r5,
-                                    __m256i *r3, __m256i *r7);
-
-#if 0
-static void od_row_tx8_avx2(int16_t *out, int rows, const tran_low_t *in,
-                            od_tx8_kernel8_epi16 kernel8_epi16,
-                            od_tx8_mm256_kernel kernel8_epi32) {
-  __m128i r0;
-  __m128i r1;
-  __m128i r2;
-  __m128i r3;
-  __m128i r4;
-  __m128i r5;
-  __m128i r6;
-  __m128i r7;
-  if (rows <= 4) {
-    od_load_buffer_4x4_epi32(&r0, &r1, &r2, &r3, in);
-    od_load_buffer_4x4_epi32(&r4, &r5, &r6, &r7, in + 16);
-    /*TODO(any): Merge this transpose with coefficient scanning.*/
-    od_transpose_pack8x4(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
-    kernel8_epi16(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
-    od_transpose4x8(&r0, r4, &r2, r6, &r1, r5, &r3, r7);
-    od_store_buffer_4x4_epi16(out, r0, r2);
-    od_store_buffer_4x4_epi16(out + 16, r1, r3);
-  } else if (rows <= 8) {
-    od_load_pack_buffer_8x4_epi32(&r0, &r1, &r2, &r3, in);
-    od_load_pack_buffer_8x4_epi32(&r4, &r5, &r6, &r7, in + 32);
-    /*TODO(any): Merge this transpose with coefficient scanning.*/
-    od_transpose8x8_epi16(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
-    kernel8_epi16(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
-    od_transpose8x8_epi16(&r0, &r4, &r2, &r6, &r1, &r5, &r3, &r7);
-    od_store_buffer_4x8_epi16(out, r0, r4, r2, r6);
-    od_store_buffer_4x8_epi16(out + 32, r1, r5, r3, r7);
-  } else {
-    int r;
-    /* 16 or more rows requires 32-bit precision.
-       TODO(any): If the column TX is IDTX, then we can still use 16 bits. */
-    for (r = 0; r < rows; r += 8) {
-      __m256i rr0;
-      __m256i rr1;
-      __m256i rr2;
-      __m256i rr3;
-      __m256i rr4;
-      __m256i rr5;
-      __m256i rr6;
-      __m256i rr7;
-      od_load_buffer_8x4_epi32(&rr0, &rr1, &rr2, &rr3, in + r * 8);
-      od_load_buffer_8x4_epi32(&rr4, &rr5, &rr6, &rr7, in + r * 8 + 32);
-      od_transpose8x8_epi32(&rr0, &rr1, &rr2, &rr3, &rr4, &rr5, &rr6, &rr7);
-      kernel8_epi32(&rr0, &rr1, &rr2, &rr3, &rr4, &rr5, &rr6, &rr7);
-      od_transpose_pack8x8_epi32(&rr0, &rr4, &rr2, &rr6, rr1, rr5, rr3, rr7);
-      od_store_buffer_2x16_epi16(out + r * 8, rr0, rr4);
-      od_store_buffer_2x16_epi16(out + r * 8 + 32, rr2, rr6);
-    }
-  }
-}
-
-static void od_col_tx8_add_hbd_avx2(unsigned char *output_pixels,
-                                    int output_stride, int cols,
-                                    const int16_t *in, int bd,
-                                    od_tx8_kernel8_epi16 kernel8_epi16,
-                                    od_tx8_mm256_kernel kernel16_epi16) {
-  __m128i r0;
-  __m128i r1;
-  __m128i r2;
-  __m128i r3;
-  __m128i r4;
-  __m128i r5;
-  __m128i r6;
-  __m128i r7;
-  if (cols <= 4) {
-    od_load_buffer_4x4_epi16(&r0, &r1, &r2, &r3, in);
-    od_load_buffer_4x4_epi16(&r4, &r5, &r6, &r7, in + 16);
-    kernel8_epi16(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
-    od_add_store_buffer_hbd_4x4_epi16(output_pixels, output_stride, r0, r4, r2,
-                                      r6, bd);
-    od_add_store_buffer_hbd_4x4_epi16(output_pixels + 4 * output_stride,
-                                      output_stride, r1, r5, r3, r7, bd);
-  } else if (cols <= 8) {
-    od_load_buffer_8x4_epi16(&r0, &r1, &r2, &r3, in, cols);
-    od_load_buffer_8x4_epi16(&r4, &r5, &r6, &r7, in + 32, cols);
-    kernel8_epi16(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
-    od_add_store_buffer_hbd_8x4_epi16(output_pixels, output_stride, r0, r4, r2,
-                                      r6, bd);
-    od_add_store_buffer_hbd_8x4_epi16(output_pixels + 4 * output_stride,
-                                      output_stride, r1, r5, r3, r7, bd);
-  } else {
-    __m256i rr0;
-    __m256i rr1;
-    __m256i rr2;
-    __m256i rr3;
-    __m256i rr4;
-    __m256i rr5;
-    __m256i rr6;
-    __m256i rr7;
-    int c;
-    for (c = 0; c < cols; c += 16) {
-      od_load_buffer_16x4_epi16(&rr0, &rr1, &rr2, &rr3, in + c, cols);
-      od_load_buffer_16x4_epi16(&rr4, &rr5, &rr6, &rr7, in + 4 * cols + c,
-                                cols);
-      kernel16_epi16(&rr0, &rr1, &rr2, &rr3, &rr4, &rr5, &rr6, &rr7);
-      od_add_store_buffer_hbd_16x4_epi16(output_pixels, output_stride, rr0, rr4,
-                                         rr2, rr6, bd);
-      od_add_store_buffer_hbd_16x4_epi16(output_pixels + 4 * output_stride,
-                                         output_stride, rr1, rr5, rr3, rr7, bd);
-    }
-  }
-}
-
-static void od_row_idct8_avx2(int16_t *out, int rows, const tran_low_t *in) {
-  od_row_tx8_avx2(out, rows, in, od_idct8_kernel8_epi16,
-                  od_idct8_kernel8_epi32);
-}
-
-static void od_col_idct8_add_hbd_avx2(unsigned char *output_pixels,
-                                      int output_stride, int cols,
-                                      const int16_t *in, int bd) {
-  od_col_tx8_add_hbd_avx2(output_pixels, output_stride, cols, in, bd,
-                          od_idct8_kernel8_epi16, od_idct8_kernel16_epi16);
-}
-
-static void od_row_idst8_avx2(int16_t *out, int rows, const tran_low_t *in) {
-  od_row_tx8_avx2(out, rows, in, od_idst8_kernel8_epi16,
-                  od_idst8_kernel8_epi32);
-}
-
-static void od_col_idst8_add_hbd_avx2(unsigned char *output_pixels,
-                                      int output_stride, int cols,
-                                      const int16_t *in, int bd) {
-  od_col_tx8_add_hbd_avx2(output_pixels, output_stride, cols, in, bd,
-                          od_idst8_kernel8_epi16, od_idst8_kernel16_epi16);
-}
-
-static void od_row_flip_idst8_avx2(int16_t *out, int rows,
-                                   const tran_low_t *in) {
-  od_row_tx8_avx2(out, rows, in, od_flip_idst8_kernel8_epi16,
-                  od_flip_idst8_kernel8_epi32);
-}
-
-static void od_col_flip_idst8_add_hbd_avx2(unsigned char *output_pixels,
-                                           int output_stride, int cols,
-                                           const int16_t *in, int bd) {
-  od_col_tx8_add_hbd_avx2(output_pixels, output_stride, cols, in, bd,
-                          od_flip_idst8_kernel8_epi16,
-                          od_flip_idst8_kernel16_epi16);
-}
-#endif
-
-static void od_row_iidtx8_avx2(int16_t *out, int rows, const tran_low_t *in) {
-  od_row_iidtx_avx2(out, rows * 8, in);
-}
-
-static void od_col_iidtx8_add_hbd_avx2(unsigned char *output_pixels,
-                                       int output_stride, int cols,
-                                       const int16_t *in, int bd) {
-  od_col_iidtx_add_hbd_avx2(output_pixels, output_stride, 8, cols, in, bd);
-}
-
-typedef void (*od_tx16_kernel8_epi16)(__m128i *s0, __m128i *s4, __m128i *s2,
-                                      __m128i *s6, __m128i *s1, __m128i *s5,
-                                      __m128i *s3, __m128i *s7, __m128i *s8,
-                                      __m128i *s9, __m128i *sa, __m128i *sb,
-                                      __m128i *sc, __m128i *sd, __m128i *se,
-                                      __m128i *sf);
-
-typedef void (*od_tx16_mm256_kernel)(__m256i *s0, __m256i *s4, __m256i *s2,
-                                     __m256i *s6, __m256i *s1, __m256i *s5,
-                                     __m256i *s3, __m256i *s7, __m256i *s8,
-                                     __m256i *s9, __m256i *sa, __m256i *sb,
-                                     __m256i *sc, __m256i *sd, __m256i *se,
-                                     __m256i *sf);
-
-#if 0
-static void od_row_tx16_avx2(int16_t *out, int rows, const tran_low_t *in,
-#if CONFIG_RECT_TX_EXT
-                             od_tx16_kernel8_epi16 kernel8_epi16,
-#endif
-                             od_tx16_mm256_kernel kernel8_epi32) {
-#if CONFIG_RECT_TX_EXT
-  if (rows <= 4) {
-    __m128i s0;
-    __m128i s1;
-    __m128i s2;
-    __m128i s3;
-    __m128i s4;
-    __m128i s5;
-    __m128i s6;
-    __m128i s7;
-    __m128i s8;
-    __m128i s9;
-    __m128i sa;
-    __m128i sb;
-    __m128i sc;
-    __m128i sd;
-    __m128i se;
-    __m128i sf;
-    od_load_buffer_4x4_epi32(&s0, &s1, &s8, &s9, in);
-    od_load_buffer_4x4_epi32(&s2, &s3, &sa, &sb, in + 16);
-    od_load_buffer_4x4_epi32(&s4, &s5, &sc, &sd, in + 32);
-    od_load_buffer_4x4_epi32(&s6, &s7, &se, &sf, in + 48);
-    /*TODO(any): Merge this transpose with coefficient scanning.*/
-    od_transpose_pack8x4(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
-    od_transpose_pack8x4(&s8, &s9, &sa, &sb, &sc, &sd, &se, &sf);
-    kernel8_epi16(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &sa, &sb,
-                  &sc, &sd, &se, &sf);
-    od_transpose4x8(&s0, s8, &s4, sc, &s2, sa, &s6, se);
-    od_transpose4x8(&s1, s9, &s5, sd, &s3, sb, &s7, sf);
-    od_store_buffer_4x4_epi16(out, s0, s1);
-    od_store_buffer_4x4_epi16(out + 16, s4, s5);
-    od_store_buffer_4x4_epi16(out + 32, s2, s3);
-    od_store_buffer_4x4_epi16(out + 48, s6, s7);
-    return;
-  }
-#endif  // CONFIG_RECT_TX_EXT
-  {
-    int r;
-    /* 8 or more rows requires 32-bit precision.
-       TODO(any): If the column TX is IDTX, then we can still use 16 bits. */
-    for (r = 0; r < rows; r += 8) {
-      __m256i ss0;
-      __m256i ss1;
-      __m256i ss2;
-      __m256i ss3;
-      __m256i ss4;
-      __m256i ss5;
-      __m256i ss6;
-      __m256i ss7;
-      __m256i ss8;
-      __m256i ss9;
-      __m256i ssa;
-      __m256i ssb;
-      __m256i ssc;
-      __m256i ssd;
-      __m256i sse;
-      __m256i ssf;
-      od_load_buffer_8x4_epi32(&ss0, &ss8, &ss1, &ss9, in + r * 16);
-      od_load_buffer_8x4_epi32(&ss2, &ssa, &ss3, &ssb, in + r * 16 + 32);
-      od_load_buffer_8x4_epi32(&ss4, &ssc, &ss5, &ssd, in + r * 16 + 64);
-      od_load_buffer_8x4_epi32(&ss6, &sse, &ss7, &ssf, in + r * 16 + 96);
-      od_transpose8x8_epi32(&ss0, &ss1, &ss2, &ss3, &ss4, &ss5, &ss6, &ss7);
-      od_transpose8x8_epi32(&ss8, &ss9, &ssa, &ssb, &ssc, &ssd, &sse, &ssf);
-      kernel8_epi32(&ss0, &ss1, &ss2, &ss3, &ss4, &ss5, &ss6, &ss7, &ss8, &ss9,
-                    &ssa, &ssb, &ssc, &ssd, &sse, &ssf);
-      od_transpose_pack8x16_epi32(&ss0, &ss8, &ss4, &ssc, &ss2, &ssa, &ss6,
-                                  &sse, ss1, ss9, ss5, ssd, ss3, ssb, ss7, ssf);
-      od_store_buffer_2x16_epi16(out + r * 16, ss0, ss8);
-      od_store_buffer_2x16_epi16(out + r * 16 + 32, ss4, ssc);
-      od_store_buffer_2x16_epi16(out + r * 16 + 64, ss2, ssa);
-      od_store_buffer_2x16_epi16(out + r * 16 + 96, ss6, sse);
-    }
-  }
-}
-
-static void od_col_tx16_add_hbd_avx2(unsigned char *output_pixels,
-                                     int output_stride, int cols,
-                                     const int16_t *in, int bd,
-                                     od_tx16_kernel8_epi16 kernel8_epi16,
-                                     od_tx16_mm256_kernel kernel16_epi16) {
-  __m128i s0;
-  __m128i s1;
-  __m128i s2;
-  __m128i s3;
-  __m128i s4;
-  __m128i s5;
-  __m128i s6;
-  __m128i s7;
-  __m128i s8;
-  __m128i s9;
-  __m128i sa;
-  __m128i sb;
-  __m128i sc;
-  __m128i sd;
-  __m128i se;
-  __m128i sf;
-#if CONFIG_RECT_TX_EXT
-  if (cols <= 4) {
-    od_load_buffer_4x4_epi16(&s0, &s1, &s2, &s3, in);
-    od_load_buffer_4x4_epi16(&s4, &s5, &s6, &s7, in + 16);
-    od_load_buffer_4x4_epi16(&s8, &s9, &sa, &sb, in + 32);
-    od_load_buffer_4x4_epi16(&sc, &sd, &se, &sf, in + 48);
-    kernel8_epi16(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &sa, &sb,
-                  &sc, &sd, &se, &sf);
-    od_add_store_buffer_hbd_4x4_epi16(output_pixels, output_stride, s0, s8, s4,
-                                      sc, bd);
-    od_add_store_buffer_hbd_4x4_epi16(output_pixels + 4 * output_stride,
-                                      output_stride, s2, sa, s6, se, bd);
-    od_add_store_buffer_hbd_4x4_epi16(output_pixels + 8 * output_stride,
-                                      output_stride, s1, s9, s5, sd, bd);
-    od_add_store_buffer_hbd_4x4_epi16(output_pixels + 12 * output_stride,
-                                      output_stride, s3, sb, s7, sf, bd);
-    return;
-  }
-#endif  // CONFIG_RECT_TX_EXT
-  if (cols <= 8) {
-    od_load_buffer_8x4_epi16(&s0, &s1, &s2, &s3, in, cols);
-    od_load_buffer_8x4_epi16(&s4, &s5, &s6, &s7, in + 32, cols);
-    od_load_buffer_8x4_epi16(&s8, &s9, &sa, &sb, in + 64, cols);
-    od_load_buffer_8x4_epi16(&sc, &sd, &se, &sf, in + 96, cols);
-    kernel8_epi16(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &sa, &sb,
-                  &sc, &sd, &se, &sf);
-    od_add_store_buffer_hbd_8x4_epi16(output_pixels, output_stride, s0, s8, s4,
-                                      sc, bd);
-    od_add_store_buffer_hbd_8x4_epi16(output_pixels + 4 * output_stride,
-                                      output_stride, s2, sa, s6, se, bd);
-    od_add_store_buffer_hbd_8x4_epi16(output_pixels + 8 * output_stride,
-                                      output_stride, s1, s9, s5, sd, bd);
-    od_add_store_buffer_hbd_8x4_epi16(output_pixels + 12 * output_stride,
-                                      output_stride, s3, sb, s7, sf, bd);
-  } else {
-    __m256i ss0;
-    __m256i ss1;
-    __m256i ss2;
-    __m256i ss3;
-    __m256i ss4;
-    __m256i ss5;
-    __m256i ss6;
-    __m256i ss7;
-    __m256i ss8;
-    __m256i ss9;
-    __m256i ssa;
-    __m256i ssb;
-    __m256i ssc;
-    __m256i ssd;
-    __m256i sse;
-    __m256i ssf;
-    int c;
-    for (c = 0; c < cols; c += 16) {
-      od_load_buffer_16x4_epi16(&ss0, &ss1, &ss2, &ss3, in + c, cols);
-      od_load_buffer_16x4_epi16(&ss4, &ss5, &ss6, &ss7, in + 4 * cols + c,
-                                cols);
-      od_load_buffer_16x4_epi16(&ss8, &ss9, &ssa, &ssb, in + 8 * cols + c,
-                                cols);
-      od_load_buffer_16x4_epi16(&ssc, &ssd, &sse, &ssf, in + 12 * cols + c,
-                                cols);
-      kernel16_epi16(&ss0, &ss1, &ss2, &ss3, &ss4, &ss5, &ss6, &ss7, &ss8, &ss9,
-                     &ssa, &ssb, &ssc, &ssd, &sse, &ssf);
-      od_add_store_buffer_hbd_16x4_epi16(output_pixels, output_stride, ss0, ss8,
-                                         ss4, ssc, bd);
-      od_add_store_buffer_hbd_16x4_epi16(output_pixels + 4 * output_stride,
-                                         output_stride, ss2, ssa, ss6, sse, bd);
-      od_add_store_buffer_hbd_16x4_epi16(output_pixels + 8 * output_stride,
-                                         output_stride, ss1, ss9, ss5, ssd, bd);
-      od_add_store_buffer_hbd_16x4_epi16(output_pixels + 12 * output_stride,
-                                         output_stride, ss3, ssb, ss7, ssf, bd);
-    }
-  }
-}
-
-static void od_row_idct16_avx2(int16_t *out, int rows, const tran_low_t *in) {
-  od_row_tx16_avx2(out, rows, in,
-#if CONFIG_RECT_TX_EXT
-                   od_idct16_kernel8_epi16,
-#endif
-                   od_idct16_kernel8_epi32);
-}
-
-static void od_col_idct16_add_hbd_avx2(unsigned char *output_pixels,
-                                       int output_stride, int cols,
-                                       const int16_t *in, int bd) {
-  od_col_tx16_add_hbd_avx2(output_pixels, output_stride, cols, in, bd,
-                           od_idct16_kernel8_epi16, od_idct16_kernel16_epi16);
-}
-
-static void od_row_idst16_avx2(int16_t *out, int rows, const tran_low_t *in) {
-  od_row_tx16_avx2(out, rows, in,
-#if CONFIG_RECT_TX_EXT
-                   od_idst16_kernel8_epi16,
-#endif
-                   od_idst16_kernel8_epi32);
-}
-
-static void od_col_idst16_add_hbd_avx2(unsigned char *output_pixels,
-                                       int output_stride, int cols,
-                                       const int16_t *in, int bd) {
-  od_col_tx16_add_hbd_avx2(output_pixels, output_stride, cols, in, bd,
-                           od_idst16_kernel8_epi16, od_idst16_kernel16_epi16);
-}
-
-static void od_row_flip_idst16_avx2(int16_t *out, int rows,
-                                    const tran_low_t *in) {
-  od_row_tx16_avx2(out, rows, in,
-#if CONFIG_RECT_TX_EXT
-                   od_flip_idst16_kernel8_epi16,
-#endif
-                   od_flip_idst16_kernel8_epi32);
-}
-
-static void od_col_flip_idst16_add_hbd_avx2(unsigned char *output_pixels,
-                                            int output_stride, int cols,
-                                            const int16_t *in, int bd) {
-  od_col_tx16_add_hbd_avx2(output_pixels, output_stride, cols, in, bd,
-                           od_flip_idst16_kernel8_epi16,
-                           od_flip_idst16_kernel16_epi16);
-}
-#endif
-
-static void od_row_iidtx16_avx2(int16_t *out, int rows, const tran_low_t *in) {
-  od_row_iidtx_avx2(out, rows * 16, in);
-}
-
-static void od_col_iidtx16_add_hbd_avx2(unsigned char *output_pixels,
-                                        int output_stride, int cols,
-                                        const int16_t *in, int bd) {
-  od_col_iidtx_add_hbd_avx2(output_pixels, output_stride, 16, cols, in, bd);
-}
-
-typedef void (*daala_row_itx)(int16_t *out, int rows, const tran_low_t *in);
-typedef void (*daala_col_itx_add)(unsigned char *output_pixels,
-                                  int output_stride, int cols,
-                                  const int16_t *in, int bd);
-
-static const daala_row_itx TX_ROW_MAP[TX_SIZES][TX_TYPES] = {
-  // 4-point transforms
-  { NULL, od_row_idst4_avx2, od_row_flip_idst4_avx2, od_row_iidtx4_avx2 },
-  // 8-point transforms
-  { NULL, NULL, NULL, od_row_iidtx8_avx2 },
-  // 16-point transforms
-  { NULL, NULL, NULL, od_row_iidtx16_avx2 },
-  // 32-point transforms
-  { NULL, NULL, NULL, NULL },
-#if CONFIG_TX64X64
-  // 64-point transforms
-  { NULL, NULL, NULL, NULL },
-#endif
-};
-
-static const daala_col_itx_add TX_COL_MAP[2][TX_SIZES][TX_TYPES] = {
-  // Low bit depth output
-  {
-      // 4-point transforms
-      { NULL, NULL, NULL, NULL },
-      // 8-point transforms
-      { NULL, NULL, NULL, NULL },
-      // 16-point transforms
-      { NULL, NULL, NULL, NULL },
-      // 32-point transforms
-      { NULL, NULL, NULL, NULL },
-#if CONFIG_TX64X64
-      // 64-point transforms
-      { NULL, NULL, NULL, NULL },
-#endif
-  },
-  // High bit depth output
-  {
-      // 4-point transforms
-      { NULL, od_col_idst4_add_hbd_avx2, od_col_flip_idst4_add_hbd_avx2,
-        od_col_iidtx4_add_hbd_avx2 },
-      // 8-point transforms
-      { NULL, NULL, NULL, od_col_iidtx8_add_hbd_avx2 },
-      // 16-point transforms
-      { NULL, NULL, NULL, od_col_iidtx16_add_hbd_avx2 },
-      // 32-point transforms
-      { NULL, NULL, NULL, NULL },
-#if CONFIG_TX64X64
-      // 64-point transforms
-      { NULL, NULL, NULL, NULL },
-#endif
-  }
-};
-
-/* Define this to verify the SIMD against the C versions of the transforms.
-   This is intended to be replaced by real unit tests in the future. */
-#undef DAALA_TX_VERIFY_SIMD
-
-void daala_inv_txfm_add_avx2(const tran_low_t *input_coeffs,
-                             void *output_pixels, int output_stride,
-                             TxfmParam *txfm_param) {
-  const TX_SIZE tx_size = txfm_param->tx_size;
-  const TX_TYPE tx_type = txfm_param->tx_type;
-  assert(tx_size <= TX_SIZES_ALL);
-  assert(tx_type <= TX_TYPES);
-
-  if (txfm_param->lossless) {
-    daala_inv_txfm_add_c(input_coeffs, output_pixels, output_stride,
-                         txfm_param);
-  } else {
-    // General TX case
-    assert(sizeof(tran_low_t) == sizeof(od_coeff));
-    assert(sizeof(tran_low_t) >= 4);
-
-    // Hook into existing map translation infrastructure to select
-    // appropriate TX functions
-    const TX_SIZE col_idx = txsize_vert_map[tx_size];
-    const TX_SIZE row_idx = txsize_horz_map[tx_size];
-    assert(col_idx <= TX_SIZES);
-    assert(row_idx <= TX_SIZES);
-    assert(vtx_tab[tx_type] <= (int)TX_TYPES_1D);
-    assert(htx_tab[tx_type] <= (int)TX_TYPES_1D);
-    daala_row_itx row_tx = TX_ROW_MAP[row_idx][htx_tab[tx_type]];
-    daala_col_itx_add col_tx =
-        TX_COL_MAP[txfm_param->is_hbd][col_idx][vtx_tab[tx_type]];
-    int16_t tmpsq[MAX_TX_SQUARE];
-
-    if (row_tx == NULL || col_tx == NULL) {
-      daala_inv_txfm_add_c(input_coeffs, output_pixels, output_stride,
-                           txfm_param);
-    } else {
-      const int cols = tx_size_wide[tx_size];
-      const int rows = tx_size_high[tx_size];
-#if defined(DAALA_TX_VERIFY_SIMD)
-      unsigned char out_check_buf8[MAX_TX_SQUARE];
-      int16_t out_check_buf16[MAX_TX_SQUARE];
-      unsigned char *out_check_buf;
-      {
-        if (txfm_param->is_hbd) {
-          uint16_t *output_pixels16;
-          int r;
-          output_pixels16 = CONVERT_TO_SHORTPTR(output_pixels);
-          for (r = 0; r < rows; r++) {
-            memcpy(out_check_buf16 + r * cols,
-                   output_pixels16 + r * output_stride,
-                   cols * sizeof(*out_check_buf16));
-          }
-          out_check_buf = CONVERT_TO_BYTEPTR(out_check_buf16);
-        } else {
-          unsigned char *output_pixels8;
-          int r;
-          output_pixels8 = (unsigned char *)output_pixels;
-          for (r = 0; r < rows; r++) {
-            memcpy(out_check_buf8 + r * cols,
-                   output_pixels8 + r * output_stride,
-                   cols * sizeof(*out_check_buf8));
-          }
-          out_check_buf = out_check_buf8;
-        }
-      }
-      daala_inv_txfm_add_c(input_coeffs, out_check_buf, cols, txfm_param);
-#endif
-      // Inverse-transform rows
-      row_tx(tmpsq, rows, input_coeffs);
-      // Inverse-transform columns and sum with destination
-      col_tx(output_pixels, output_stride, cols, tmpsq, txfm_param->bd);
-#if defined(DAALA_TX_VERIFY_SIMD)
-      {
-        if (txfm_param->is_hbd) {
-          uint16_t *output_pixels16;
-          int r;
-          output_pixels16 = CONVERT_TO_SHORTPTR(output_pixels);
-          for (r = 0; r < rows; r++) {
-            if (memcmp(out_check_buf16 + r * cols,
-                       output_pixels16 + r * output_stride,
-                       cols * sizeof(*out_check_buf16))) {
-              fprintf(stderr, "%s(%i): Inverse %ix%i %i_%i TX SIMD mismatch.\n",
-                      __FILE__, __LINE__, rows, cols, vtx_tab[tx_type],
-                      htx_tab[tx_type]);
-              assert(0);
-              exit(EXIT_FAILURE);
-            }
-          }
-        } else {
-          unsigned char *output_pixels8;
-          int r;
-          output_pixels8 = (unsigned char *)output_pixels;
-          for (r = 0; r < rows; r++) {
-            if (memcmp(out_check_buf8 + r * cols,
-                       output_pixels8 + r * output_stride,
-                       cols * sizeof(*out_check_buf8))) {
-              fprintf(stderr, "%s(%i): Inverse %ix%i %i_%i TX SIMD mismatch.\n",
-                      __FILE__, __LINE__, rows, cols, vtx_tab[tx_type],
-                      htx_tab[tx_type]);
-              assert(0);
-              exit(EXIT_FAILURE);
-            }
-          }
-        }
-      }
-#endif
-    }
-  }
-}
-
-#endif
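/* Editor's note: the deleted daala_inv_txfm_add_avx2() above follows a
   table-dispatch pattern -- the per-direction SIMD kernels are looked up in
   the NULL-padded TX_ROW_MAP/TX_COL_MAP tables and the generic C transform
   is used whenever either entry is missing (the DAALA_TX_VERIFY_SIMD block
   then compared the SIMD result against the C path row by row).  A minimal
   sketch of that dispatch pattern, with simplified types and a hypothetical
   fallback -- not the deleted implementation: */
typedef void (*row_itx_fn)(int16_t *out, int rows, const int32_t *in);
typedef void (*col_itx_add_fn)(unsigned char *dst, int stride, int cols,
                               const int16_t *in, int bd);

static void inv_txfm_add_sketch(const int32_t *coeffs, unsigned char *dst,
                                int stride, int rows, int cols, int bd,
                                row_itx_fn row_tx, col_itx_add_fn col_tx) {
  int16_t tmp[64 * 64]; /* stands in for MAX_TX_SQUARE */
  if (row_tx == NULL || col_tx == NULL) {
    /* Fall back to the C path (daala_inv_txfm_add_c in the deleted code). */
    return;
  }
  /* Inverse-transform rows into the intermediate buffer... */
  row_tx(tmp, rows, coeffs);
  /* ...then inverse-transform columns and add into the destination. */
  col_tx(dst, stride, cols, tmp, bd);
}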
diff --git a/av1/common/x86/daala_tx_kernels.h b/av1/common/x86/daala_tx_kernels.h
deleted file mode 100644
index 19f620f..0000000
--- a/av1/common/x86/daala_tx_kernels.h
+++ /dev/null
@@ -1,591 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-/* This header does not use an include guard.
-   It is intentionally designed to be included multiple times.
-   The file that includes it should define the following macros:
-
-   OD_KERNEL  A label for the width of the kernel, e.g., kernel8
-   OD_WORD    A label for the size of the SIMD word, e.g., epi16
-   OD_REG     The type of a SIMD register, e.g., __m128i
-   OD_ADD     The intrinsic function for addition
-   OD_SUB     The intrinsic function for subtraction
-   OD_RSHIFT1 The function that implements an unbiased right shift by 1
-   OD_AVG     The function that implements a signed PAVG[WD]
-              I.e., (a + b + 1) >> 1, without overflow
-   OD_HRSUB   The function that implements a VHRSUB.S<16|32>
-              I.e., (a - b + 1) >> 1, without overflow
-   OD_MUL     The function that implements the multiplies
-              I.e., (a * b + ((1 << r) >> 1)) >> r, without overflow
-   OD_SWAP    The function that swaps two SIMD registers
-
-   See daala_inv_txfm_avx2.c for examples. */
-
-#define OD_KERNEL_FUNC_IMPL(name, kernel, word) name##_##kernel##_##word
-#define OD_KERNEL_FUNC_WRAPPER(name, kernel, word) \
-  OD_KERNEL_FUNC_IMPL(name, kernel, word)
-#define OD_KERNEL_FUNC(name) OD_KERNEL_FUNC_WRAPPER(name, OD_KERNEL, OD_WORD)
-
-static INLINE void OD_KERNEL_FUNC(od_rotate_add)(OD_REG *q0, OD_REG *q1, int c0,
-                                                 int r0, int c1, int r1, int c2,
-                                                 int r2, int s, int avg) {
-  OD_REG t_;
-  OD_REG u_;
-
-  if (avg)
-    t_ = OD_AVG(*q0, *q1);
-  else
-    t_ = OD_ADD(*q0, *q1);
-  u_ = OD_MUL(*q1, c0, r0);
-  *q1 = OD_MUL(*q0, c1, r1);
-  t_ = OD_MUL(t_, c2, r2);
-  if (s)
-    *q0 = OD_SUB(u_, OD_RSHIFT1(t_));
-  else
-    *q0 = OD_SUB(u_, t_);
-  *q1 = OD_ADD(*q1, t_);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_rotate_addh)(OD_REG *q0, OD_REG *q1,
-                                                  OD_REG *q1h, int c0, int r0,
-                                                  int c1, int r1, int c2,
-                                                  int r2, int s) {
-  OD_REG t_;
-  OD_REG u_;
-
-  t_ = OD_ADD(*q0, *q1h);
-  u_ = OD_MUL(*q1, c0, r0);
-  *q1 = OD_MUL(*q0, c1, r1);
-  t_ = OD_MUL(t_, c2, r2);
-  *q0 = OD_SUB(u_, t_);
-  if (s)
-    *q1 = OD_ADD(*q1, OD_RSHIFT1(t_));
-  else
-    *q1 = OD_ADD(*q1, t_);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_rotate_sub)(OD_REG *q0, OD_REG *q1, int c0,
-                                                 int r0, int c1, int r1, int c2,
-                                                 int r2, int s) {
-  OD_REG t_;
-  OD_REG u_;
-
-  t_ = OD_SUB(*q0, *q1);
-  u_ = OD_MUL(*q1, c0, r0);
-  *q1 = OD_MUL(*q0, c1, r1);
-  t_ = OD_MUL(t_, c2, r2);
-  if (s)
-    *q0 = OD_ADD(u_, OD_RSHIFT1(t_));
-  else
-    *q0 = OD_ADD(u_, t_);
-  *q1 = OD_ADD(*q1, t_);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_rotate_sub2)(OD_REG *q0, OD_REG *q1,
-                                                  int c0, int r0, int c1,
-                                                  int r1, int c2, int r2,
-                                                  int avg) {
-  OD_REG t_;
-  OD_REG u_;
-
-  if (avg)
-    t_ = OD_HRSUB(*q1, *q0);
-  else
-    t_ = OD_SUB(*q1, *q0);
-  u_ = OD_MUL(*q1, c0, r0);
-  *q1 = OD_MUL(*q0, c1, r1);
-  t_ = OD_MUL(t_, c2, r2);
-  *q0 = OD_SUB(t_, u_);
-  *q1 = OD_SUB(*q1, t_);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_rotate_subh)(OD_REG *q0, OD_REG *q1,
-                                                  OD_REG *q1h, int c0, int r0,
-                                                  int c1, int r1, int c2,
-                                                  int r2, int s) {
-  OD_REG t_;
-  OD_REG u_;
-
-  t_ = OD_SUB(*q0, *q1h);
-  u_ = OD_MUL(*q1, c0, r0);
-  *q1 = OD_MUL(*q0, c1, r1);
-  t_ = OD_MUL(t_, c2, r2);
-  *q0 = OD_ADD(u_, t_);
-  if (s)
-    *q1 = OD_ADD(*q1, OD_RSHIFT1(t_));
-  else
-    *q1 = OD_ADD(*q1, t_);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_rotate45)(OD_REG *p0, OD_REG *p1,
-                                               int avg) {
-  OD_REG t_;
-  if (avg)
-    t_ = OD_AVG(*p0, *p1);
-  else
-    t_ = OD_ADD(*p0, *p1);
-  /* 11585/8192 ~= 2*Sin[Pi/4] ~= 1.4142135623730951 */
-  *p0 = OD_MUL(*p1, 11585, 13);
-  /* 11585/8192 ~= 2*Cos[Pi/4] ~= 1.4142135623730951 */
-  if (avg)
-    *p1 = OD_MUL(t_, 11585, 13);
-  else
-    *p1 = OD_MUL(t_, 11585, 14);
-  *p0 = OD_SUB(*p0, *p1);
-}
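/* Editor's note (not part of the deleted header): the od_rotate_* helpers
   above implement a planar rotation with three multiplies instead of four,
   which is why the constant triples in the comments throughout this file
   come in (Sin+Cos, Sin-Cos, Cos) form, quantized to fixed point.  Writing
   s = Sin[a] and c = Cos[a], and ignoring the avg/shift-by-half variants,
   od_rotate_add() computes
       t   = c2*(q0 + q1)            with c2 ~= c
       q0' = c0*q1 - t = s*q1 - c*q0 with c0 ~= s + c
       q1' = c1*q0 + t = s*q0 + c*q1 with c1 ~= s - c
   A floating-point sketch of the same factorization: */
#include <math.h>
static void rotate_3mul_sketch(double q0, double q1, double a,
                               double *q0_out, double *q1_out) {
  const double s = sin(a), c = cos(a);
  const double t = c * (q0 + q1);
  *q0_out = (s + c) * q1 - t; /* == s*q1 - c*q0 */
  *q1_out = (s - c) * q0 + t; /* == s*q0 + c*q1 */
}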
-
-static INLINE void OD_KERNEL_FUNC(od_butterfly_add)(OD_REG *q0, OD_REG *q1) {
-  *q0 = OD_ADD(*q0, OD_RSHIFT1(*q1));
-  *q1 = OD_SUB(*q0, *q1);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_butterfly_add2)(OD_REG *q0, OD_REG *q1) {
-  *q0 = OD_ADD(*q0, OD_RSHIFT1(*q1));
-  *q1 = OD_SUB(*q1, *q0);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_butterfly_sub2)(OD_REG *q0, OD_REG *q1) {
-  *q0 = OD_SUB(*q0, OD_RSHIFT1(*q1));
-  *q1 = OD_ADD(*q1, *q0);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_butterfly_addh)(OD_REG *q0, OD_REG *q1,
-                                                     OD_REG *q1h) {
-  *q0 = OD_ADD(*q0, *q1h);
-  *q1 = OD_SUB(*q1, *q0);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_butterfly_subh)(OD_REG *q0, OD_REG *q1,
-                                                     OD_REG *q1h) {
-  *q0 = OD_SUB(*q0, *q1h);
-  *q1 = OD_ADD(*q1, *q0);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_butterfly_v1)(OD_REG *q0, OD_REG *q1,
-                                                   OD_REG *q1h) {
-  *q1 = OD_SUB(*q0, *q1);
-  *q1h = OD_RSHIFT1(*q1);
-  *q0 = OD_SUB(*q0, *q1h);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_butterfly_v2)(OD_REG *q0, OD_REG *q1,
-                                                   OD_REG *q1h) {
-  *q1 = OD_SUB(*q1, *q0);
-  *q1h = OD_RSHIFT1(*q1);
-  *q0 = OD_ADD(*q0, *q1h);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_butterfly_v3)(OD_REG *q0, OD_REG *q1,
-                                                   OD_REG *q1h) {
-  *q1 = OD_ADD(*q0, *q1);
-  *q1h = OD_RSHIFT1(*q1);
-  *q0 = OD_SUB(*q0, *q1h);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_idct2)(OD_REG *p0, OD_REG *p1) {
-  OD_KERNEL_FUNC(od_rotate45)(p1, p0, 0);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_idst2)(OD_REG *p0, OD_REG *p1, int neg) {
-  // Note: special case of rotation
-  OD_REG t_;
-  OD_REG u_;
-  if (neg)
-    t_ = OD_HRSUB(*p0, *p1);
-  else
-    t_ = OD_AVG(*p0, *p1);
-  /* 21407/16384 ~= Sin[3*Pi/8] + Cos[3*Pi/8] ~= 1.3065629648763766 */
-  u_ = OD_MUL(*p0, 21407, 14);
-  /* 8867/16384 ~= Sin[3*Pi/8] - Cos[3*Pi/8] ~= 0.541196100146197 */
-  *p0 = OD_MUL(*p1, 8867, 14);
-  /* 3135/4096 ~= 2*Cos[3*Pi/8] ~= 0.7653668647301796 */
-  t_ = OD_MUL(t_, 3135, 12);
-  if (neg) {
-    *p0 = OD_SUB(*p0, t_);
-    *p1 = OD_SUB(t_, u_);
-  } else {
-    *p0 = OD_ADD(*p0, t_);
-    *p1 = OD_SUB(u_, t_);
-  }
-}
-
-static INLINE void OD_KERNEL_FUNC(od_idct2_asym)(OD_REG *p0, OD_REG *p1,
-                                                 OD_REG *p1h) {
-  OD_KERNEL_FUNC(od_butterfly_v1)(p0, p1, p1h);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_idst2_asym)(OD_REG *p0, OD_REG *p1) {
-  // Note: special case of rotation
-  OD_REG t_;
-  OD_REG u_;
-  t_ = OD_AVG(*p0, *p1);
-  /* 3135/4096 ~= (Cos[Pi/8] - Sin[Pi/8])*Sqrt[2] = 0.7653668647301795 */
-  u_ = OD_MUL(*p1, 3135, 12);
-  /* 15137/16384 ~= (Cos[Pi/8] + Sin[Pi/8])/Sqrt[2] = 0.9238795325112867 */
-  *p1 = OD_MUL(*p0, 15137, 14);
-  /* 8867/8192 ~= Cos[3*Pi/8]*2*Sqrt[2] = 1.082392200292394 */
-  t_ = OD_MUL(t_, 8867, 13);
-  *p0 = OD_ADD(u_, t_);
-  *p1 = OD_SUB(*p1, OD_RSHIFT1(t_));
-}
-
-static INLINE void OD_KERNEL_FUNC(od_idct4)(OD_REG *q0, OD_REG *q2, OD_REG *q1,
-                                            OD_REG *q3) {
-  OD_REG q1h;
-  OD_KERNEL_FUNC(od_idst2_asym)(q3, q2);
-  OD_KERNEL_FUNC(od_idct2_asym)(q0, q1, &q1h);
-  OD_KERNEL_FUNC(od_butterfly_addh)(q2, q1, &q1h);
-  OD_KERNEL_FUNC(od_butterfly_add)(q0, q3);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_idct4_asym)(OD_REG *q0, OD_REG *q2,
-                                                 OD_REG *q1, OD_REG *q1h,
-                                                 OD_REG *q3, OD_REG *q3h) {
-  OD_KERNEL_FUNC(od_idst2)(q3, q2, 0);
-  OD_KERNEL_FUNC(od_idct2)(q0, q1);
-  OD_KERNEL_FUNC(od_butterfly_v2)(q2, q1, q1h);
-  OD_KERNEL_FUNC(od_butterfly_v1)(q0, q3, q3h);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_idst_vii4)(OD_REG *q0, OD_REG *q1,
-                                                OD_REG *q2, OD_REG *q3) {
-  // Note: special case
-  OD_REG t0;
-  OD_REG t1;
-  OD_REG t2;
-  OD_REG t3;
-  OD_REG t3h;
-  OD_REG t4;
-  OD_REG u4;
-  t0 = OD_SUB(*q0, *q3);
-  t1 = OD_ADD(*q0, *q2);
-  t2 = OD_ADD(*q3, OD_HRSUB(t0, *q2));
-  t3 = *q1;
-  t4 = OD_ADD(*q2, *q3);
-  /* 467/2048 ~= 2*Sin[1*Pi/9]/3 ~= 0.228013428883779 */
-  t0 = OD_MUL(t0, 467, 11);
-  /* 7021/16384 ~= 2*Sin[2*Pi/9]/3 ~= 0.428525073124360 */
-  t1 = OD_MUL(t1, 7021, 14);
-  /* 37837/32768 ~= 4*Sin[3*Pi/9]/3 ~= 1.154700538379252 */
-  t2 = OD_MUL(t2, 37837, 15);
-  /* 37837/32768 ~= 4*Sin[3*Pi/9]/3 ~= 1.154700538379252 */
-  t3 = OD_MUL(t3, 37837, 15);
-  /* 21513/32768 ~= 2*Sin[4*Pi/9]/3 ~= 0.656538502008139 */
-  t4 = OD_MUL(t4, 21513, 15);
-  t3h = OD_RSHIFT1(t3);
-  u4 = OD_ADD(t4, t3h);
-  *q0 = OD_ADD(t0, u4);
-  /* We swap q1 and q2 to correct for the bitreverse reordering that
-     od_row_tx4_avx2() does. */
-  *q2 = OD_ADD(t1, OD_SUB(t3, u4));
-  *q1 = t2;
-  *q3 = OD_ADD(t0, OD_SUB(t1, t3h));
-}
-
-static INLINE void OD_KERNEL_FUNC(od_flip_idst_vii4)(OD_REG *q0, OD_REG *q1,
-                                                     OD_REG *q2, OD_REG *q3) {
-  OD_KERNEL_FUNC(od_idst_vii4)(q0, q1, q2, q3);
-  OD_SWAP(q0, q3);
-  OD_SWAP(q1, q2);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_idst4)(OD_REG *q0, OD_REG *q1, OD_REG *q2,
-                                            OD_REG *q3) {
-  OD_REG q2h;
-  OD_REG q3h;
-  OD_KERNEL_FUNC(od_rotate45)(q2, q1, 1);
-  OD_KERNEL_FUNC(od_butterfly_v3)(q0, q2, &q2h);
-  OD_KERNEL_FUNC(od_butterfly_v3)(q1, q3, &q3h);
-  /* 16069/16384 ~= (Sin[5*Pi/16] + Cos[5*Pi/16])/Sqrt[2] ~= 0.9807852804032 */
-  /* 12785/32768 ~= (Sin[5*Pi/16] - Cos[5*Pi/16])*Sqrt[2] ~= 0.3901806440323 */
-  /* 12873/16384 ~= Cos[5*Pi/16]*Sqrt[2] ~= 0.7856949583871021 */
-  OD_KERNEL_FUNC(od_rotate_addh)
-  (q1, q2, &q2h, 16069, 14, 12785, 15, 12873, 14, 0);
-  /* 13623/16384 ~= (Sin[7*Pi/16] + Cos[7*Pi/16])/Sqrt[2] ~= 0.8314696123025 */
-  /* 18205/16384 ~= (Sin[7*Pi/16] - Cos[7*Pi/16])*Sqrt[2] ~= 1.1111404660392 */
-  /* 9041/32768 ~= Cos[7*Pi/16]*Sqrt[2] = 0.275899379282943 */
-  OD_KERNEL_FUNC(od_rotate_subh)
-  (q0, q3, &q3h, 13623, 14, 18205, 14, 9041, 15, 0);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_idst4_asym)(OD_REG *q0, OD_REG *q2,
-                                                 OD_REG *q1, OD_REG *q3) {
-  OD_REG q1h;
-  OD_REG q3h;
-  OD_KERNEL_FUNC(od_rotate45)(q1, q2, 1);
-  OD_KERNEL_FUNC(od_butterfly_v3)(q0, q1, &q1h);
-  OD_KERNEL_FUNC(od_butterfly_v3)(q2, q3, &q3h);
-  /* 45451/32768 ~= Sin[5*Pi/16] + Cos[5*Pi/16] ~= 1.3870398453221475 */
-  /* 9041/32768 ~= Sin[5*Pi/16] - Cos[5*Pi/16] ~= 0.27589937928294306 */
-  /* 18205/16384 ~= 2*Cos[5*Pi/16] ~= 1.1111404660392044 */
-  OD_KERNEL_FUNC(od_rotate_addh)
-  (q2, q1, &q1h, 45451, 15, 9041, 15, 18205, 14, 1);
-  /* 38531/32768 ~= Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586 */
-  /* 12873/16384 ~= Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 */
-  /* 12785/32768 ~= 2*Cos[7*Pi/16] = 0.3901806440322565 */
-  OD_KERNEL_FUNC(od_rotate_subh)
-  (q0, q3, &q3h, 38531, 15, 12873, 14, 12785, 15, 1);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_idct8)(OD_REG *r0, OD_REG *r4, OD_REG *r2,
-                                            OD_REG *r6, OD_REG *r1, OD_REG *r5,
-                                            OD_REG *r3, OD_REG *r7) {
-  OD_REG r1h;
-  OD_REG r3h;
-  OD_KERNEL_FUNC(od_idst4_asym)(r7, r5, r6, r4);
-  OD_KERNEL_FUNC(od_idct4_asym)(r0, r2, r1, &r1h, r3, &r3h);
-  OD_KERNEL_FUNC(od_butterfly_addh)(r4, r3, &r3h);
-  OD_KERNEL_FUNC(od_butterfly_add)(r2, r5);
-  OD_KERNEL_FUNC(od_butterfly_addh)(r6, r1, &r1h);
-  OD_KERNEL_FUNC(od_butterfly_add)(r0, r7);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_idct8_asym)(
-    OD_REG *r0, OD_REG *r4, OD_REG *r2, OD_REG *r6, OD_REG *r1, OD_REG *r1h,
-    OD_REG *r5, OD_REG *r5h, OD_REG *r3, OD_REG *r3h, OD_REG *r7, OD_REG *r7h) {
-  OD_KERNEL_FUNC(od_idst4)(r7, r5, r6, r4);
-  OD_KERNEL_FUNC(od_idct4)(r0, r2, r1, r3);
-  OD_KERNEL_FUNC(od_butterfly_v1)(r0, r7, r7h);
-  OD_KERNEL_FUNC(od_butterfly_v2)(r6, r1, r1h);
-  OD_KERNEL_FUNC(od_butterfly_v1)(r2, r5, r5h);
-  OD_KERNEL_FUNC(od_butterfly_v2)(r4, r3, r3h);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_idst8)(OD_REG *r0, OD_REG *r4, OD_REG *r2,
-                                            OD_REG *r6, OD_REG *r1, OD_REG *r5,
-                                            OD_REG *r3, OD_REG *r7) {
-  OD_REG r0h;
-  OD_REG r2h;
-  OD_REG r5h;
-  OD_REG r7h;
-  OD_KERNEL_FUNC(od_rotate45)(r1, r6, 1);
-  OD_KERNEL_FUNC(od_idst2)(r5, r2, 1);
-  OD_KERNEL_FUNC(od_idst2)(r4, r3, 0);
-  OD_KERNEL_FUNC(od_butterfly_v3)(r6, r7, &r7h);
-  OD_KERNEL_FUNC(od_butterfly_v3)(r4, r2, &r2h);
-  OD_KERNEL_FUNC(od_butterfly_v2)(r1, r0, &r0h);
-  OD_KERNEL_FUNC(od_butterfly_v3)(r3, r5, &r5h);
-  OD_KERNEL_FUNC(od_butterfly_subh)(r4, r7, &r7h);
-  OD_KERNEL_FUNC(od_butterfly_addh)(r6, r5, &r5h);
-  OD_KERNEL_FUNC(od_butterfly_addh)(r3, r0, &r0h);
-  OD_KERNEL_FUNC(od_butterfly_subh)(r1, r2, &r2h);
-  /* 17911/16384 ~= Sin[15*Pi/32] + Cos[15*Pi/32] ~= 1.0932018670017576 */
-  /* 14699/16384 ~= Sin[15*Pi/32] - Cos[15*Pi/32] ~= 0.8971675863426363 */
-  /* 803/8192 ~= Cos[15*Pi/32] ~= 0.0980171403295606 */
-  OD_KERNEL_FUNC(od_rotate_add)(r7, r0, 17911, 14, 14699, 14, 803, 13, 0, 0);
-  /* 40869/32768 ~= Sin[13*Pi/32] + Cos[13*Pi/32] ~= 1.247225012986671 */
-  /* 21845/32768 ~= Sin[13*Pi/32] - Cos[13*Pi/32] ~= 0.6666556584777465 */
-  /* 1189/4096 ~= Cos[13*Pi/32] ~= 0.29028467725446233 */
-  OD_KERNEL_FUNC(od_rotate_sub)(r1, r6, 40869, 15, 21845, 15, 1189, 12, 0);
-  /* 22173/16384 ~= Sin[11*Pi/32] + Cos[11*Pi/32] ~= 1.3533180011743526 */
-  /* 3363/8192 ~= Sin[11*Pi/32] - Cos[11*Pi/32] ~= 0.4105245275223574 */
-  /* 15447/32768 ~= Cos[11*Pi/32] ~= 0.47139673682599764 */
-  OD_KERNEL_FUNC(od_rotate_add)(r5, r2, 22173, 14, 3363, 13, 15447, 15, 0, 0);
-  /* 23059/16384 ~= Sin[9*Pi/32] + Cos[9*Pi/32] ~= 1.4074037375263826 */
-  /* 2271/16384 ~= Sin[9*Pi/32] - Cos[9*Pi/32] ~= 0.1386171691990915 */
-  /* 5197/8192 ~= Cos[9*Pi/32] ~= 0.6343932841636455 */
-  OD_KERNEL_FUNC(od_rotate_sub)(r3, r4, 23059, 14, 2271, 14, 5197, 13, 0);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_idst8_asym)(OD_REG *r0, OD_REG *r4,
-                                                 OD_REG *r2, OD_REG *r6,
-                                                 OD_REG *r1, OD_REG *r5,
-                                                 OD_REG *r3, OD_REG *r7) {
-  OD_REG r0h;
-  OD_REG r2h;
-  OD_REG r5h;
-  OD_REG r7h;
-  OD_KERNEL_FUNC(od_rotate45)(r1, r6, 1);
-  OD_KERNEL_FUNC(od_idst2)(r5, r2, 1);
-  OD_KERNEL_FUNC(od_idst2)(r4, r3, 0);
-  OD_KERNEL_FUNC(od_butterfly_v3)(r6, r7, &r7h);
-  OD_KERNEL_FUNC(od_butterfly_v3)(r4, r2, &r2h);
-  OD_KERNEL_FUNC(od_butterfly_v2)(r1, r0, &r0h);
-  OD_KERNEL_FUNC(od_butterfly_v3)(r3, r5, &r5h);
-  OD_KERNEL_FUNC(od_butterfly_subh)(r4, r7, &r7h);
-  OD_KERNEL_FUNC(od_butterfly_addh)(r6, r5, &r5h);
-  OD_KERNEL_FUNC(od_butterfly_addh)(r3, r0, &r0h);
-  OD_KERNEL_FUNC(od_butterfly_subh)(r1, r2, &r2h);
-  /* 12665/16384 ~= (Sin[15*Pi/32] + Cos[15*Pi/32])/Sqrt[2] ~= 0.77301045336 */
-  /* 5197/4096 ~= (Sin[15*Pi/32] - Cos[15*Pi/32])*Sqrt[2] ~= 1.2687865683273 */
-  /* 2271/16384 ~= Cos[15*Pi/32]*Sqrt[2] ~= 0.13861716919909148 */
-  OD_KERNEL_FUNC(od_rotate_add)(r7, r0, 12665, 14, 5197, 12, 2271, 14, 1, 0);
-  /* 28899/32768 ~= (Sin[13*Pi/32] + Cos[13*Pi/32])/Sqrt[2] ~= 0.88192126435 */
-  /* 30893/32768 ~= (Sin[13*Pi/32] - Cos[13*Pi/32])*Sqrt[2] ~= 0.94279347365 */
-  /* 3363/8192 ~= Cos[13*Pi/32]*Sqrt[2] ~= 0.41052452752235735 */
-  OD_KERNEL_FUNC(od_rotate_sub)(r1, r6, 28899, 15, 30893, 15, 3363, 13, 1);
-  /* 31357/32768 ~= (Sin[11*Pi/32] + Cos[11*Pi/32])/Sqrt[2] ~= 0.95694033573 */
-  /* 1189/2048 ~= (Sin[11*Pi/32] - Cos[11*Pi/32])*Sqrt[2] ~= 0.5805693545089 */
-  /* 21845/32768 ~= Cos[11*Pi/32] ~= 0.6666556584777465 */
-  OD_KERNEL_FUNC(od_rotate_add)(r5, r2, 31357, 15, 1189, 11, 21845, 15, 1, 0);
-  /* 16305/16384 ~= (Sin[9*Pi/32] + Cos[9*Pi/32])/Sqrt[2] ~= 0.9951847266722 */
-  /* 803/4096 ~= (Sin[9*Pi/32] - Cos[9*Pi/32])*Sqrt[2] ~= 0.1960342806591213 */
-  /* 14699/16384 ~= Cos[9*Pi/32]*Sqrt[2] ~= 0.8971675863426364 */
-  OD_KERNEL_FUNC(od_rotate_sub)(r3, r4, 16305, 14, 803, 12, 14699, 14, 1);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_flip_idst8)(OD_REG *r0, OD_REG *r4,
-                                                 OD_REG *r2, OD_REG *r6,
-                                                 OD_REG *r1, OD_REG *r5,
-                                                 OD_REG *r3, OD_REG *r7) {
-  OD_KERNEL_FUNC(od_idst8)(r0, r4, r2, r6, r1, r5, r3, r7);
-  OD_SWAP(r0, r7);
-  OD_SWAP(r4, r3);
-  OD_SWAP(r2, r5);
-  OD_SWAP(r6, r1);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_idct16)(OD_REG *s0, OD_REG *s8, OD_REG *s4,
-                                             OD_REG *sc, OD_REG *s2, OD_REG *sa,
-                                             OD_REG *s6, OD_REG *se, OD_REG *s1,
-                                             OD_REG *s9, OD_REG *s5, OD_REG *sd,
-                                             OD_REG *s3, OD_REG *sb, OD_REG *s7,
-                                             OD_REG *sf) {
-  OD_REG s1h;
-  OD_REG s3h;
-  OD_REG s5h;
-  OD_REG s7h;
-  OD_KERNEL_FUNC(od_idst8_asym)(sf, sb, sd, s9, se, sa, sc, s8);
-  OD_KERNEL_FUNC(od_idct8_asym)
-  (s0, s4, s2, s6, s1, &s1h, s5, &s5h, s3, &s3h, s7, &s7h);
-  OD_KERNEL_FUNC(od_butterfly_addh)(s8, s7, &s7h);
-  OD_KERNEL_FUNC(od_butterfly_add)(s6, s9);
-  OD_KERNEL_FUNC(od_butterfly_addh)(sa, s5, &s5h);
-  OD_KERNEL_FUNC(od_butterfly_add)(s4, sb);
-  OD_KERNEL_FUNC(od_butterfly_addh)(sc, s3, &s3h);
-  OD_KERNEL_FUNC(od_butterfly_add)(s2, sd);
-  OD_KERNEL_FUNC(od_butterfly_addh)(se, s1, &s1h);
-  OD_KERNEL_FUNC(od_butterfly_add)(s0, sf);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_idst16)(OD_REG *s0, OD_REG *s1, OD_REG *s2,
-                                             OD_REG *s3, OD_REG *s4, OD_REG *s5,
-                                             OD_REG *s6, OD_REG *s7, OD_REG *s8,
-                                             OD_REG *s9, OD_REG *sa, OD_REG *sb,
-                                             OD_REG *sc, OD_REG *sd, OD_REG *se,
-                                             OD_REG *sf) {
-  OD_REG s0h;
-  OD_REG s1h;
-  OD_REG s2h;
-  OD_REG s3h;
-  OD_REG s4h;
-  OD_REG s5h;
-  OD_REG s6h;
-  OD_REG s7h;
-  OD_REG sbh;
-  OD_REG sfh;
-  OD_REG h;
-  OD_KERNEL_FUNC(od_rotate45)(s9, s6, 1);
-  OD_KERNEL_FUNC(od_rotate45)(sa, s5, 1);
-  OD_KERNEL_FUNC(od_rotate45)(s8, s7, 1);
-  OD_KERNEL_FUNC(od_idst2)(s3, sc, 0);
-  OD_KERNEL_FUNC(od_idst2)(sb, s4, 1);
-  OD_KERNEL_FUNC(od_butterfly_v3)(s2, sa, &h);
-  OD_KERNEL_FUNC(od_butterfly_v2)(sd, s5, &h);
-  OD_KERNEL_FUNC(od_butterfly_v2)(s9, s1, &h);
-  OD_KERNEL_FUNC(od_butterfly_v3)(s6, se, &h);
-  OD_KERNEL_FUNC(od_butterfly_v3)(sc, sb, &sbh);
-  OD_KERNEL_FUNC(od_butterfly_v3)(s7, sf, &sfh);
-  OD_KERNEL_FUNC(od_butterfly_v2)(s8, s0, &s0h);
-  OD_KERNEL_FUNC(od_butterfly_v3)(s3, s4, &s4h);
-  /* 38531/32768 ~= Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586 */
-  /* 12873/16384 ~= Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 */
-  /* 6393/32768 ~= Cos[7*Pi/16] = 0.19509032201612825 */
-  OD_KERNEL_FUNC(od_rotate_sub2)
-  (s2, sd, 38531, 15, 12873, 14, 6393, 15, 0);
-  /* 22725/16384 ~= Sin[5*Pi/16] + Cos[5*Pi/16] ~= 1.3870398453221475 */
-  /* 9041/32768 ~= Sin[5*Pi/16] - Cos[5*Pi/16] ~= 0.27589937928294306 */
-  /* 18205/16384 ~= 2*Cos[5*Pi/16] ~= 1.1111404660392044 */
-  OD_KERNEL_FUNC(od_rotate_sub2)
-  (sa, s5, 22725, 14, 9041, 15, 18205, 14, 1);
-  /* 45451/32768 ~= Sin[5*Pi/16] + Cos[5*Pi/16] ~= 1.3870398453221475 */
-  /* 9041/32768 ~= Sin[5*Pi/16] - Cos[5*Pi/16] ~= 0.27589937928294306 */
-  /* 18205/32768 ~= Cos[5*Pi/16] ~= 0.5555702330196022 */
-  OD_KERNEL_FUNC(od_rotate_add)
-  (s6, s9, 45451, 15, 9041, 15, 18205, 15, 0, 0);
-  /* 9633/8192 ~= Sin[7*Pi/16] + Cos[7*Pi/16] ~= 1.1758756024193586 */
-  /* 12873/16384 ~= Sin[7*Pi/16] - Cos[7*Pi/16] ~= 0.7856949583871022 */
-  /* 12785/32768 ~= 2*Cos[7*Pi/16] ~= 0.3901806440322565 */
-  OD_KERNEL_FUNC(od_rotate_add)
-  (se, s1, 9633, 13, 12873, 14, 12785, 15, 0, 1);
-  OD_KERNEL_FUNC(od_butterfly_subh)(s8, s4, &s4h);
-  OD_KERNEL_FUNC(od_butterfly_addh)(s7, sb, &sbh);
-  OD_KERNEL_FUNC(od_butterfly_subh)(s3, sf, &sfh);
-  OD_KERNEL_FUNC(od_butterfly_addh)(sc, s0, &s0h);
-  OD_KERNEL_FUNC(od_butterfly_add2)(sd, se);
-  OD_KERNEL_FUNC(od_butterfly_add2)(s2, s1);
-  OD_KERNEL_FUNC(od_butterfly_sub2)(s6, s5);
-  OD_KERNEL_FUNC(od_butterfly_sub2)(s9, sa);
-  OD_KERNEL_FUNC(od_butterfly_v2)(se, s0, &s0h);
-  OD_KERNEL_FUNC(od_butterfly_v2)(sf, s1, &s1h);
-  OD_KERNEL_FUNC(od_butterfly_v3)(sc, s2, &s2h);
-  OD_KERNEL_FUNC(od_butterfly_v3)(sd, s3, &s3h);
-  OD_KERNEL_FUNC(od_butterfly_v2)(sa, s4, &s4h);
-  OD_KERNEL_FUNC(od_butterfly_v2)(sb, s5, &s5h);
-  OD_KERNEL_FUNC(od_butterfly_v3)(s8, s6, &s6h);
-  OD_KERNEL_FUNC(od_butterfly_v3)(s9, s7, &s7h);
-  /* 32729/32768 ~= (Sin[17*Pi/64] + Cos[17*Pi/64])/Sqrt[2] ~= 0.99879545620 */
-  /* 201/2048 ~= (Sin[17*Pi/64] - Cos[17*Pi/64])*Sqrt[2] ~= 0.09813534865484 */
-  /* 31121/32768 ~= Cos[17*Pi/64]*Sqrt[2] ~= 0.9497277818777543 */
-  OD_KERNEL_FUNC(od_rotate_subh)
-  (se, s1, &s1h, 32729, 15, 201, 11, 31121, 15, 0);
-  /* 32413/32768 ~= (Sin[19*Pi/64] + Cos[19*Pi/64])/Sqrt[2] ~= 0.98917650996 */
-  /* 601/2048 ~= (Sin[19*Pi/64] - Cos[19*Pi/64])*Sqrt[2] ~= 0.29346094891072 */
-  /* 27605/32768 ~= Cos[19*Pi/64]*Sqrt[2] ~= 0.8424460355094193 */
-  OD_KERNEL_FUNC(od_rotate_addh)
-  (s9, s6, &s6h, 32413, 15, 601, 11, 27605, 15, 0);
-  /* 15893/16384 ~= (Sin[21*Pi/64] + Cos[21*Pi/64])/Sqrt[2] ~= 0.97003125319 */
-  /* 3981/8192 ~= (Sin[21*Pi/64] - Cos[21*Pi/64])*Sqrt[2] ~= 0.4859603598065 */
-  /* 1489/2048 ~= Cos[21*Pi/64]*Sqrt[2] ~= 0.72705107329128 */
-  OD_KERNEL_FUNC(od_rotate_subh)
-  (sa, s5, &s5h, 15893, 14, 3981, 13, 1489, 11, 0);
-  /* 30853/32768 ~= (Sin[23*Pi/64] + Cos[23*Pi/64])/Sqrt[2] ~= 0.94154406518 */
-  /* 11039/16384 ~= (Sin[23*Pi/64] - Cos[23*Pi/64])*Sqrt[2] ~= 0.67377970678 */
-  /* 19813/32768 ~= Cos[23*Pi/64]*Sqrt[2] ~= 0.6046542117908008 */
-  OD_KERNEL_FUNC(od_rotate_addh)
-  (sd, s2, &s2h, 30853, 15, 11039, 14, 19813, 15, 0);
-  /* 14811/16384 ~= (Sin[25*Pi/64] + Cos[25*Pi/64])/Sqrt[2] ~= 0.90398929312 */
-  /* 7005/8192 ~= (Sin[25*Pi/64] - Cos[25*Pi/64])*Sqrt[2] ~= 0.8551101868606 */
-  /* 3903/8192 ~= Cos[25*Pi/64]*Sqrt[2] ~= 0.47643419969316125 */
-  OD_KERNEL_FUNC(od_rotate_subh)
-  (sc, s3, &s3h, 14811, 14, 7005, 13, 3903, 13, 0);
-  /* 14053/16384 ~= (Sin[27*Pi/64] + Cos[27*Pi/64])/Sqrt[2] ~= 0.85772861000 */
-  /* 8423/8192 ~= (Sin[27*Pi/64] - Cos[27*Pi/64])*Sqrt[2] ~= 1.0282054883864 */
-  /* 2815/8192 ~= Cos[27*Pi/64]*Sqrt[2] ~= 0.34362586580705035 */
-  OD_KERNEL_FUNC(od_rotate_addh)
-  (sb, s4, &s4h, 14053, 14, 8423, 13, 2815, 13, 0);
-  /* 1645/2048 ~= (Sin[29*Pi/64] + Cos[29*Pi/64])/Sqrt[2] ~= 0.8032075314806 */
-  /* 305/256 ~= (Sin[29*Pi/64] - Cos[29*Pi/64])*Sqrt[2] ~= 1.191398608984867 */
-  /* 425/2048 ~= Cos[29*Pi/64]*Sqrt[2] ~= 0.20750822698821159 */
-  OD_KERNEL_FUNC(od_rotate_subh)
-  (s8, s7, &s7h, 1645, 11, 305, 8, 425, 11, 0);
-  /* 24279/32768 ~= (Sin[31*Pi/64] + Cos[31*Pi/64])/Sqrt[2] ~= 0.74095112535 */
-  /* 44011/32768 ~= (Sin[31*Pi/64] - Cos[31*Pi/64])*Sqrt[2] ~= 1.34311790969 */
-  /* 1137/16384 ~= Cos[31*Pi/64]*Sqrt[2] ~= 0.06939217050794069 */
-  OD_KERNEL_FUNC(od_rotate_addh)
-  (sf, s0, &s0h, 24279, 15, 44011, 15, 1137, 14, 0);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_flip_idst16)(
-    OD_REG *s0, OD_REG *s1, OD_REG *s2, OD_REG *s3, OD_REG *s4, OD_REG *s5,
-    OD_REG *s6, OD_REG *s7, OD_REG *s8, OD_REG *s9, OD_REG *sa, OD_REG *sb,
-    OD_REG *sc, OD_REG *sd, OD_REG *se, OD_REG *sf) {
-  OD_KERNEL_FUNC(od_idst16)
-  (s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sa, sb, sc, sd, se, sf);
-  OD_SWAP(s0, sf);
-  OD_SWAP(s1, se);
-  OD_SWAP(s2, sd);
-  OD_SWAP(s3, sc);
-  OD_SWAP(s4, sb);
-  OD_SWAP(s5, sa);
-  OD_SWAP(s6, s9);
-  OD_SWAP(s7, s8);
-}
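/* Editor's note: the deleted daala_tx_kernels.h relied on a multiple-
   inclusion ("header as template") pattern -- the including file defines
   OD_KERNEL, OD_WORD, OD_REG and the arithmetic macros, includes the header
   once per SIMD width/word size, and each inclusion emits a complete
   od_<name>_<kernel>_<word> function family.  A tiny self-contained
   illustration of the same technique, using hypothetical names and plain
   scalar types rather than the real AVX2 macro bodies: */

/* ---- kernel_impl.h (no include guard; included once per instantiation) -- */
#define KERNEL_FUNC_IMPL(name, word) name##_##word
#define KERNEL_FUNC_WRAPPER(name, word) KERNEL_FUNC_IMPL(name, word)
#define KERNEL_FUNC(name) KERNEL_FUNC_WRAPPER(name, WORD)

static void KERNEL_FUNC(butterfly)(REG *a, REG *b) {
  REG t = ADD(*a, *b); /* sum and difference of the input pair */
  *b = SUB(*a, *b);
  *a = t;
}

#undef KERNEL_FUNC
#undef KERNEL_FUNC_WRAPPER
#undef KERNEL_FUNC_IMPL

/* ---- in the including .c file: one instantiation per word size --------- */
#include <stdint.h>

#define WORD i16
#define REG int16_t
#define ADD(a, b) ((int16_t)((a) + (b)))
#define SUB(a, b) ((int16_t)((a) - (b)))
#include "kernel_impl.h" /* emits butterfly_i16() */
#undef WORD
#undef REG
#undef ADD
#undef SUB

#define WORD i32
#define REG int32_t
#define ADD(a, b) ((a) + (b))
#define SUB(a, b) ((a) - (b))
#include "kernel_impl.h" /* emits butterfly_i32() */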
diff --git a/av1/decoder/decodetxb.c b/av1/decoder/decodetxb.c
index dfe0e21..aea54cc 100644
--- a/av1/decoder/decodetxb.c
+++ b/av1/decoder/decodetxb.c
@@ -82,19 +82,13 @@
   struct macroblockd_plane *const pd = &xd->plane[plane];
   const int16_t *const dequant = pd->seg_dequant_QTX[mbmi->segment_id];
   tran_low_t *const tcoeffs = pd->dqcoeff;
-#if !CONFIG_DAALA_TX
   const int shift = av1_get_tx_scale(tx_size);
-#endif
 #if CONFIG_NEW_QUANT
 #if !CONFIG_AOM_QM
   const tran_low_t *dqv_val = &dq_val[0][0];
 #endif  // !CONFIG_AOM_QM
 
-#if CONFIG_DAALA_TX
-  const int nq_shift = 0;
-#else
   const int nq_shift = shift;
-#endif  // CONFIG_DAALA_TX
 #endif  // CONFIG_NEW_QUANT && !CONFIG_AOM_QM
   const int bwl = get_txb_bwl(tx_size);
   const int width = get_txb_wide(tx_size);
@@ -283,9 +277,7 @@
 #endif  // CONFIG_AOM_QM
 #else
         v = level * dequant[!!c];
-#if !CONFIG_DAALA_TX
         v = v >> shift;
-#endif  // !CONFIG_DAALA_TX
 #endif  // CONFIG_NEW_QUANT
         tcoeffs[pos] = v;
       } else {
@@ -351,9 +343,7 @@
 #endif  // CONFIG_AOM_QM
 #else
         t = *level * dequant[!!pos];
-#if !CONFIG_DAALA_TX
         t = t >> shift;
-#endif  // !CONFIG_DAALA_TX
 #endif  // CONFIG_NEW_QUANT
         if (signs[pos]) t = -t;
         tcoeffs[pos] = clamp(t, min_value, max_value);
@@ -374,9 +364,7 @@
 #endif  // CONFIG_AOM_QM
 #else
       t = t * dequant[!!pos];
-#if !CONFIG_DAALA_TX
       t = t >> shift;
-#endif  // !CONFIG_DAALA_TX
 #endif  // CONFIG_NEW_QUANT
       if (signs[pos]) t = -t;
       tcoeffs[pos] = clamp(t, min_value, max_value);
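/* Editor's note: with the Daala branches gone, the dequantization in the
   decodetxb.c hunks above (and in the detokenize.c hunks below) reduces to
   the standard AV1 step -- multiply the decoded level by the quantizer step
   and shift down by the tx-size-dependent scale from av1_get_tx_scale().
   A minimal sketch with simplified names; the detokenize.c path widens to
   64 bits before the shift, as that hunk shows: */
#include <stdint.h>
static int32_t dequant_coeff_sketch(int32_t level, int32_t dqv, int shift) {
  return (int32_t)(((int64_t)level * dqv) >> shift);
}
/* e.g.  tcoeffs[pos] = dequant_coeff_sketch(level, dequant[!!c],
                                             av1_get_tx_scale(tx_size)); */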
diff --git a/av1/decoder/detokenize.c b/av1/decoder/detokenize.c
index e410fa1..0c98069 100644
--- a/av1/decoder/detokenize.c
+++ b/av1/decoder/detokenize.c
@@ -106,16 +106,10 @@
   const tran_low_t *dqv_val = &dq_val[0][0];
 #endif  // CONFIG_NEW_QUANT && !CONFIG_AOM_QM
 
-#if !CONFIG_DAALA_TX
   int dq_shift = av1_get_tx_scale(tx_size);
-#endif
 
 #if CONFIG_NEW_QUANT
-#if CONFIG_DAALA_TX
-  int nq_shift = 0;
-#else
   int nq_shift = dq_shift;
-#endif  // CONFIG_DAALA_TX
 #endif  // CONFIG_NEW_QUANT
 
   band = *band_translate++;
@@ -192,11 +186,7 @@
     v = av1_dequant_abscoeff_nuq(val, dqv, dqv_val, nq_shift);
 #endif  // CONFIG_AOM_QM
 #else
-#if !CONFIG_DAALA_TX
     v = (int)(((int64_t)val * dqv) >> dq_shift);
-#else
-    v = val * dqv;
-#endif
 #endif
 
     v = (int)check_range(av1_read_record_bit(xd->counts, r, ACCT_STR) ? -v : v,
diff --git a/av1/encoder/daala_fwd_txfm.c b/av1/encoder/daala_fwd_txfm.c
deleted file mode 100644
index 096a1b5..0000000
--- a/av1/encoder/daala_fwd_txfm.c
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "./av1_rtcd.h"
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
-#include "av1/common/daala_tx.h"
-#include "av1/encoder/daala_fwd_txfm.h"
-
-#if CONFIG_DAALA_TX
-
-// Complete Daala TX map, sans lossless which is special cased
-typedef void (*daala_ftx)(od_coeff[], const od_coeff *, int);
-
-static daala_ftx tx_map[TX_SIZES][TX_TYPES_1D] = {
-  //  4-point transforms
-  { od_bin_fdct4, od_bin_fdst4, od_bin_fdst4, od_bin_fidtx4 },
-
-  //  8-point transforms
-  { od_bin_fdct8, od_bin_fdst8, od_bin_fdst8, od_bin_fidtx8 },
-
-  //  16-point transforms
-  { od_bin_fdct16, od_bin_fdst16, od_bin_fdst16, od_bin_fidtx16 },
-
-  //  32-point transforms
-  { od_bin_fdct32, od_bin_fdst32, od_bin_fdst32, od_bin_fidtx32 },
-
-#if CONFIG_TX64X64
-  //  64-point transforms
-  { od_bin_fdct64, NULL, NULL, od_bin_fidtx64 },
-#endif
-};
-
-static int tx_flip(TX_TYPE_1D t) { return t == 2; }
-
-// Daala TX toplevel entry point, same interface as av1 low-bitdepth
-// and high-bitdepth TX (av1_fwd_txfm and av1_highbd_fwd_txfm).  This
-// same function is intended for both low and high bitdepth cases with
-// a tran_low_t of 32 bits (matching od_coeff).
-void daala_fwd_txfm(const int16_t *input_pixels, tran_low_t *output_coeffs,
-                    int input_stride, TxfmParam *txfm_param) {
-  const TX_SIZE tx_size = txfm_param->tx_size;
-  const TX_TYPE tx_type = txfm_param->tx_type;
-  assert(tx_size <= TX_SIZES_ALL);
-  assert(tx_type <= TX_TYPES);
-
-  if (txfm_param->lossless) {
-    // Transform function special-cased for lossless
-    assert(tx_type == DCT_DCT);
-    assert(tx_size == TX_4X4);
-    av1_fwht4x4(input_pixels, output_coeffs, input_stride);
-  } else {
-    // General TX case
-    const int upshift = TX_COEFF_DEPTH - txfm_param->bd;
-    assert(upshift >= 0);
-    assert(sizeof(tran_low_t) == sizeof(od_coeff));
-    assert(sizeof(tran_low_t) >= 4);
-
-    // Hook into existing map translation infrastructure to select
-    // appropriate TX functions
-    const int cols = tx_size_wide[tx_size];
-    const int rows = tx_size_high[tx_size];
-    const TX_SIZE col_idx = txsize_vert_map[tx_size];
-    const TX_SIZE row_idx = txsize_horz_map[tx_size];
-    assert(col_idx <= TX_SIZES);
-    assert(row_idx <= TX_SIZES);
-    assert(vtx_tab[tx_type] <= (int)TX_TYPES_1D);
-    assert(htx_tab[tx_type] <= (int)TX_TYPES_1D);
-    daala_ftx col_tx = tx_map[col_idx][vtx_tab[tx_type]];
-    daala_ftx row_tx = tx_map[row_idx][htx_tab[tx_type]];
-    int col_flip = tx_flip(vtx_tab[tx_type]);
-    int row_flip = tx_flip(htx_tab[tx_type]);
-    od_coeff tmp[MAX_TX_SIZE];
-    int r;
-    int c;
-
-    assert(col_tx);
-    assert(row_tx);
-
-    // Transform columns
-    for (c = 0; c < cols; ++c) {
-      // Cast and shift
-      for (r = 0; r < rows; ++r)
-        tmp[r] =
-            ((od_coeff)(input_pixels[r * input_stride + c])) * (1 << upshift);
-      if (col_flip)
-        col_tx(tmp, tmp + (rows - 1), -1);
-      else
-        col_tx(tmp, tmp, 1);
-      // No ystride in daala_tx lowlevel functions, store output vector
-      // into column the long way
-      for (r = 0; r < rows; ++r) output_coeffs[r * cols + c] = tmp[r];
-    }
-
-    // Transform rows
-    for (r = 0; r < rows; ++r) {
-      if (row_flip)
-        row_tx(output_coeffs + r * cols, output_coeffs + r * cols + cols - 1,
-               -1);
-      else
-        row_tx(output_coeffs + r * cols, output_coeffs + r * cols, 1);
-    }
-#if CONFIG_TX64X64
-    // Re-pack coeffs in the first 32x32 indices.
-    if (cols > 32) {
-      int avail_rows;
-      int avail_cols;
-      avail_rows = AOMMIN(rows, 32);
-      avail_cols = AOMMIN(cols, 32);
-      for (r = 1; r < avail_rows; r++) {
-        memmove(output_coeffs + r * avail_cols, output_coeffs + r * cols,
-                avail_cols * sizeof(*output_coeffs));
-      }
-    }
-#endif
-  }
-}
-
-#endif
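/* Editor's note: the deleted daala_fwd_txfm() above is a plain separable
   2-D transform -- upshift the residual to TX_COEFF_DEPTH, run the 1-D
   column transform, then the 1-D row transform in place on the coefficient
   buffer.  A simplified sketch of that structure (no flipping and no 64x64
   repacking; tx1d_fn is a hypothetical in-place 1-D transform callback): */
#include <stdint.h>
typedef void (*tx1d_fn)(int32_t *vec, int n);

static void fwd_txfm_2d_sketch(const int16_t *src, int src_stride,
                               int32_t *coeffs, int rows, int cols,
                               int upshift, tx1d_fn col_tx, tx1d_fn row_tx) {
  int32_t tmp[64]; /* large enough for the longest 1-D transform */
  int r, c;
  for (c = 0; c < cols; ++c) {
    /* Multiply rather than left-shift so negative residuals stay defined. */
    for (r = 0; r < rows; ++r)
      tmp[r] = (int32_t)src[r * src_stride + c] * (1 << upshift);
    col_tx(tmp, rows);
    /* The 1-D kernels take no stride, so columns go back one element at a
       time, just as in the deleted code. */
    for (r = 0; r < rows; ++r) coeffs[r * cols + c] = tmp[r];
  }
  for (r = 0; r < rows; ++r) row_tx(coeffs + r * cols, cols);
}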
diff --git a/av1/encoder/daala_fwd_txfm.h b/av1/encoder/daala_fwd_txfm.h
deleted file mode 100644
index e8f777a..0000000
--- a/av1/encoder/daala_fwd_txfm.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AV1_ENCODER_DAALA_FWD_TXFM_H_
-#define AV1_ENCODER_DAALA_FWD_TXFM_H_
-
-#include "./aom_config.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void daala_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride,
-                    TxfmParam *txfm_param);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // AV1_ENCODER_DAALA_FWD_TXFM_H_
diff --git a/av1/encoder/dct.c b/av1/encoder/dct.c
index 3e40a4d..ebaa0d1 100644
--- a/av1/encoder/dct.c
+++ b/av1/encoder/dct.c
@@ -19,10 +19,6 @@
 #include "aom_ports/mem.h"
 #include "av1/common/blockd.h"
 #include "av1/common/idct.h"
-#if CONFIG_DAALA_TX4 || CONFIG_DAALA_TX8 || CONFIG_DAALA_TX16 || \
-    CONFIG_DAALA_TX32 || CONFIG_DAALA_TX64
-#include "av1/common/daala_tx.h"
-#endif
 #include "av1/encoder/av1_fwd_txfm1d.h"
 #include "av1/encoder/av1_fwd_txfm1d_cfg.h"
 
@@ -1153,32 +1149,12 @@
 void av1_fht4x4_c(const int16_t *input, tran_low_t *output, int stride,
                   TxfmParam *txfm_param) {
   const TX_TYPE tx_type = txfm_param->tx_type;
-#if !CONFIG_DAALA_TX4
   if (tx_type == DCT_DCT) {
     aom_fdct4x4_c(input, output, stride);
     return;
   }
-#endif
   {
     static const transform_2d FHT[] = {
-#if CONFIG_DAALA_TX4
-      { daala_fdct4, daala_fdct4 },  // DCT_DCT
-      { daala_fdst4, daala_fdct4 },  // ADST_DCT
-      { daala_fdct4, daala_fdst4 },  // DCT_ADST
-      { daala_fdst4, daala_fdst4 },  // ADST_ADST
-      { daala_fdst4, daala_fdct4 },  // FLIPADST_DCT
-      { daala_fdct4, daala_fdst4 },  // DCT_FLIPADST
-      { daala_fdst4, daala_fdst4 },  // FLIPADST_FLIPADST
-      { daala_fdst4, daala_fdst4 },  // ADST_FLIPADST
-      { daala_fdst4, daala_fdst4 },  // FLIPADST_ADST
-      { daala_idtx4, daala_idtx4 },  // IDTX
-      { daala_fdct4, daala_idtx4 },  // V_DCT
-      { daala_idtx4, daala_fdct4 },  // H_DCT
-      { daala_fdst4, daala_idtx4 },  // V_ADST
-      { daala_idtx4, daala_fdst4 },  // H_ADST
-      { daala_fdst4, daala_idtx4 },  // V_FLIPADST
-      { daala_idtx4, daala_fdst4 },  // H_FLIPADST
-#else
       { fdct4, fdct4 },    // DCT_DCT
       { fadst4, fdct4 },   // ADST_DCT
       { fdct4, fadst4 },   // DCT_ADST
@@ -1195,7 +1171,6 @@
       { fidtx4, fadst4 },  // H_ADST
       { fadst4, fidtx4 },  // V_FLIPADST
       { fidtx4, fadst4 },  // H_FLIPADST
-#endif
     };
     const transform_2d ht = FHT[tx_type];
     tran_low_t out[4 * 4];
@@ -1209,9 +1184,7 @@
     for (i = 0; i < 4; ++i) {
       /* A C99-safe upshift by 4 for both Daala and VPx TX. */
       for (j = 0; j < 4; ++j) temp_in[j] = input[j * stride + i] * 16;
-#if !CONFIG_DAALA_TX4
       if (i == 0 && temp_in[0]) temp_in[0] += 1;
-#endif
       ht.cols(temp_in, temp_out);
       for (j = 0; j < 4; ++j) out[j * 4 + i] = temp_out[j];
     }
@@ -1220,13 +1193,7 @@
     for (i = 0; i < 4; ++i) {
       for (j = 0; j < 4; ++j) temp_in[j] = out[j + i * 4];
       ht.rows(temp_in, temp_out);
-#if CONFIG_DAALA_TX4
-      /* Daala TX has orthonormal scaling; shift down by only 1 to achieve
-         the usual VPx coefficient left-shift of 3. */
-      for (j = 0; j < 4; ++j) output[j + i * 4] = temp_out[j] >> 1;
-#else
       for (j = 0; j < 4; ++j) output[j + i * 4] = (temp_out[j] + 1) >> 2;
-#endif
     }
   }
 }
@@ -1235,24 +1202,6 @@
                   TxfmParam *txfm_param) {
   const TX_TYPE tx_type = txfm_param->tx_type;
   static const transform_2d FHT[] = {
-#if CONFIG_DAALA_TX4 && CONFIG_DAALA_TX8
-    { daala_fdct8, daala_fdct4 },  // DCT_DCT
-    { daala_fdst8, daala_fdct4 },  // ADST_DCT
-    { daala_fdct8, daala_fdst4 },  // DCT_ADST
-    { daala_fdst8, daala_fdst4 },  // ADST_ADST
-    { daala_fdst8, daala_fdct4 },  // FLIPADST_DCT
-    { daala_fdct8, daala_fdst4 },  // DCT_FLIPADST
-    { daala_fdst8, daala_fdst4 },  // FLIPADST_FLIPADST
-    { daala_fdst8, daala_fdst4 },  // ADST_FLIPADST
-    { daala_fdst8, daala_fdst4 },  // FLIPADST_ADST
-    { daala_idtx8, daala_idtx4 },  // IDTX
-    { daala_fdct8, daala_idtx4 },  // V_DCT
-    { daala_idtx8, daala_fdct4 },  // H_DCT
-    { daala_fdst8, daala_idtx4 },  // V_ADST
-    { daala_idtx8, daala_fdst4 },  // H_ADST
-    { daala_fdst8, daala_idtx4 },  // V_FLIPADST
-    { daala_idtx8, daala_fdst4 },  // H_FLIPADST
-#else
     { fdct8, fdct4 },    // DCT_DCT
     { fadst8, fdct4 },   // ADST_DCT
     { fdct8, fadst4 },   // DCT_ADST
@@ -1269,7 +1218,6 @@
     { fidtx8, fadst4 },  // H_ADST
     { fadst8, fidtx4 },  // V_FLIPADST
     { fidtx8, fadst4 },  // H_FLIPADST
-#endif
   };
   const transform_2d ht = FHT[tx_type];
   const int n = 4;
@@ -1290,14 +1238,9 @@
   for (i = 0; i < n2; ++i) {
     // Input scaling
     for (j = 0; j < n; ++j) {
-#if CONFIG_DAALA_TX4 && CONFIG_DAALA_TX8
-      // Input scaling when LGT is not possible, Daala only (4 above)
-      temp_in[j] = input[i * stride + j] * 16;
-#else
       // Input scaling when Daala is not possible, LGT/AV1 only (1 above)
       temp_in[j] =
           (tran_low_t)fdct_round_shift(input[i * stride + j] * 4 * Sqrt2);
-#endif
     }
     // Row transform (AV1/LGT scale up .5 bit, Daala does not scale)
     ht.rows(temp_in, temp_out);
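/* Editor's note: the AV1 path above pre-scales rectangular-transform input
   with fdct_round_shift(input * 4 * Sqrt2).  Assuming the usual libaom
   fixed-point conventions (Sqrt2 == 5793 ~= sqrt(2) * 2^12, and
   fdct_round_shift() rounding down by DCT_CONST_BITS == 14), the arithmetic
   is 4 * 5793 = 23172 ~= sqrt(2) * 2^14, i.e. roughly a sqrt(2) gain once
   the shift is applied.  A hypothetical scalar sketch of that step: */
#include <stdint.h>
static int32_t scale_rect_input_sketch(int16_t x) {
  const int32_t kSqrt2 = 5793;  /* assumed ~= sqrt(2) * 2^12 */
  const int kDctConstBits = 14; /* assumed rounding shift */
  const int64_t p = (int64_t)x * 4 * kSqrt2;
  return (int32_t)((p + (1 << (kDctConstBits - 1))) >> kDctConstBits);
}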
@@ -1321,24 +1264,6 @@
                   TxfmParam *txfm_param) {
   const TX_TYPE tx_type = txfm_param->tx_type;
   static const transform_2d FHT[] = {
-#if CONFIG_DAALA_TX4 && CONFIG_DAALA_TX8
-    { daala_fdct4, daala_fdct8 },  // DCT_DCT
-    { daala_fdst4, daala_fdct8 },  // ADST_DCT
-    { daala_fdct4, daala_fdst8 },  // DCT_ADST
-    { daala_fdst4, daala_fdst8 },  // ADST_ADST
-    { daala_fdst4, daala_fdct8 },  // FLIPADST_DCT
-    { daala_fdct4, daala_fdst8 },  // DCT_FLIPADST
-    { daala_fdst4, daala_fdst8 },  // FLIPADST_FLIPADST
-    { daala_fdst4, daala_fdst8 },  // ADST_FLIPADST
-    { daala_fdst4, daala_fdst8 },  // FLIPADST_ADST
-    { daala_idtx4, daala_idtx8 },  // IDTX
-    { daala_fdct4, daala_idtx8 },  // V_DCT
-    { daala_idtx4, daala_fdct8 },  // H_DCT
-    { daala_fdst4, daala_idtx8 },  // V_ADST
-    { daala_idtx4, daala_fdst8 },  // H_ADST
-    { daala_fdst4, daala_idtx8 },  // V_FLIPADST
-    { daala_idtx4, daala_fdst8 },  // H_FLIPADST
-#else
     { fdct4, fdct8 },    // DCT_DCT
     { fadst4, fdct8 },   // ADST_DCT
     { fdct4, fadst8 },   // DCT_ADST
@@ -1355,7 +1280,6 @@
     { fidtx4, fadst8 },  // H_ADST
     { fadst4, fidtx8 },  // V_FLIPADST
     { fidtx4, fadst8 },  // H_FLIPADST
-#endif
   };
   const transform_2d ht = FHT[tx_type];
   const int n = 4;
@@ -1375,14 +1299,9 @@
   // Columns
   for (i = 0; i < n2; ++i) {
     for (j = 0; j < n; ++j) {
-#if CONFIG_DAALA_TX4 && CONFIG_DAALA_TX8
-      // Input scaling when LGT is not possible, Daala only (4 above)
-      temp_in[j] = input[j * stride + i] * 16;
-#else
       // Input scaling when Daala is not possible, AV1/LGT only (1 above)
       temp_in[j] =
           (tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2);
-#endif
     }
     // Column transform (AV1/LGT scale up .5 bit, Daala does not scale)
     ht.cols(temp_in, temp_out);
@@ -1500,24 +1419,6 @@
                    TxfmParam *txfm_param) {
   const TX_TYPE tx_type = txfm_param->tx_type;
   static const transform_2d FHT[] = {
-#if CONFIG_DAALA_TX8 && CONFIG_DAALA_TX16
-    { daala_fdct16, daala_fdct8 },  // DCT_DCT
-    { daala_fdst16, daala_fdct8 },  // ADST_DCT
-    { daala_fdct16, daala_fdst8 },  // DCT_ADST
-    { daala_fdst16, daala_fdst8 },  // ADST_ADST
-    { daala_fdst16, daala_fdct8 },  // FLIPADST_DCT
-    { daala_fdct16, daala_fdst8 },  // DCT_FLIPADST
-    { daala_fdst16, daala_fdst8 },  // FLIPADST_FLIPADST
-    { daala_fdst16, daala_fdst8 },  // ADST_FLIPADST
-    { daala_fdst16, daala_fdst8 },  // FLIPADST_ADST
-    { daala_idtx16, daala_idtx8 },  // IDTX
-    { daala_fdct16, daala_idtx8 },  // V_DCT
-    { daala_idtx16, daala_fdct8 },  // H_DCT
-    { daala_fdst16, daala_idtx8 },  // V_ADST
-    { daala_idtx16, daala_fdst8 },  // H_ADST
-    { daala_fdst16, daala_idtx8 },  // V_FLIPADST
-    { daala_idtx16, daala_fdst8 },  // H_FLIPADST
-#else
     { fdct16, fdct8 },    // DCT_DCT
     { fadst16, fdct8 },   // ADST_DCT
     { fdct16, fadst8 },   // DCT_ADST
@@ -1534,7 +1435,6 @@
     { fidtx16, fadst8 },  // H_ADST
     { fadst16, fidtx8 },  // V_FLIPADST
     { fidtx16, fadst8 },  // H_FLIPADST
-#endif
   };
   const transform_2d ht = FHT[tx_type];
   const int n = 8;
@@ -1555,14 +1455,9 @@
   for (i = 0; i < n2; ++i) {
     // Input scaling
     for (j = 0; j < n; ++j) {
-#if CONFIG_DAALA_TX8 && CONFIG_DAALA_TX16
-      // Input scaling when LGT is not possible, Daala only (case 4 above)
-      temp_in[j] = input[i * stride + j] * 16;
-#else
       // Input scaling when Daala is not possible, LGT/AV1 only (case 1 above)
       temp_in[j] =
           (tran_low_t)fdct_round_shift(input[i * stride + j] * 4 * Sqrt2);
-#endif
     }
 
     // Row transform (AV1/LGT scale up 1 bit, Daala does not scale)
@@ -1570,13 +1465,8 @@
 
     // Mid scaling
     for (j = 0; j < n; ++j) {
-#if CONFIG_DAALA_TX8 && CONFIG_DAALA_TX16
-      // mid scaling: only cases 2 and 4 possible
-      out[j * n2 + i] = temp_out[j];
-#else
       // mid scaling: only case 1 possible
       out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
-#endif
     }
   }
 
@@ -1586,13 +1476,8 @@
     // Column transform (AV1/LGT scale up 1.5 bits, Daala does not scale)
     ht.cols(temp_in, temp_out);
     for (j = 0; j < n2; ++j) {
-#if CONFIG_DAALA_TX8 && CONFIG_DAALA_TX16
-      // Output scaling (cases 2 and 3 above)
-      output[i + j * n] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
-#else
       // Output scaling (case 1 above)
       output[i + j * n] = temp_out[j];
-#endif
     }
   }
   // Note: overall scale factor of transform is 8 times unitary
@@ -1602,24 +1487,6 @@
                    TxfmParam *txfm_param) {
   const TX_TYPE tx_type = txfm_param->tx_type;
   static const transform_2d FHT[] = {
-#if CONFIG_DAALA_TX8 && CONFIG_DAALA_TX16
-    { daala_fdct8, daala_fdct16 },  // DCT_DCT
-    { daala_fdst8, daala_fdct16 },  // ADST_DCT
-    { daala_fdct8, daala_fdst16 },  // DCT_ADST
-    { daala_fdst8, daala_fdst16 },  // ADST_ADST
-    { daala_fdst8, daala_fdct16 },  // FLIPADST_DCT
-    { daala_fdct8, daala_fdst16 },  // DCT_FLIPADST
-    { daala_fdst8, daala_fdst16 },  // FLIPADST_FLIPADST
-    { daala_fdst8, daala_fdst16 },  // ADST_FLIPADST
-    { daala_fdst8, daala_fdst16 },  // FLIPADST_ADST
-    { daala_idtx8, daala_idtx16 },  // IDTX
-    { daala_fdct8, daala_idtx16 },  // V_DCT
-    { daala_idtx8, daala_fdct16 },  // H_DCT
-    { daala_fdst8, daala_idtx16 },  // V_ADST
-    { daala_idtx8, daala_fdst16 },  // H_ADST
-    { daala_fdst8, daala_idtx16 },  // V_FLIPADST
-    { daala_idtx8, daala_fdst16 },  // H_FLIPADST
-#else
     { fdct8, fdct16 },    // DCT_DCT
     { fadst8, fdct16 },   // ADST_DCT
     { fdct8, fadst16 },   // DCT_ADST
@@ -1636,7 +1503,6 @@
     { fidtx8, fadst16 },  // H_ADST
     { fadst8, fidtx16 },  // V_FLIPADST
     { fidtx8, fadst16 },  // H_FLIPADST
-#endif
   };
   const transform_2d ht = FHT[tx_type];
   const int n = 8;
@@ -1657,14 +1523,9 @@
   for (i = 0; i < n2; ++i) {
     // Input scaling
     for (j = 0; j < n; ++j) {
-#if CONFIG_DAALA_TX8 && CONFIG_DAALA_TX16
-      // Input scaling when LGT is not possible, Daala only (4 above)
-      temp_in[j] = input[j * stride + i] * 16;
-#else
       // Input scaling when Daala is not possible, AV1/LGT only (1 above)
       temp_in[j] =
           (tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2);
-#endif
     }
 
     // Column transform (AV1/LGT scale up 1 bit, Daala does not scale)
@@ -1672,13 +1533,8 @@
 
     // Mid scaling
     for (j = 0; j < n; ++j) {
-#if CONFIG_DAALA_TX8 && CONFIG_DAALA_TX16
-      // scaling cases 2 and 4 above
-      out[j * n2 + i] = temp_out[j];
-#else
       // Scaling case 1 above
       out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
-#endif
     }
   }
 
@@ -1688,13 +1544,8 @@
     // Row transform (AV1 scales up 1.5 bits, Daala does not scale)
     ht.rows(temp_in, temp_out);
     for (j = 0; j < n2; ++j) {
-#if CONFIG_DAALA_TX8 && CONFIG_DAALA_TX16
-      // Output scaing cases 2 and 4 above
-      output[j + i * n2] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
-#else
       // Output scaling case 1 above
       output[j + i * n2] = temp_out[j];
-#endif
     }
   }
   // Note: overall scale factor of transform is 8 times unitary
@@ -1798,24 +1649,6 @@
                     TxfmParam *txfm_param) {
   const TX_TYPE tx_type = txfm_param->tx_type;
   static const transform_2d FHT[] = {
-#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
-    { daala_fdct32, daala_fdct16 },  // DCT_DCT
-    { daala_fdst32, daala_fdct16 },  // ADST_DCT
-    { daala_fdct32, daala_fdst16 },  // DCT_ADST
-    { daala_fdst32, daala_fdst16 },  // ADST_ADST
-    { daala_fdst32, daala_fdct16 },  // FLIPADST_DCT
-    { daala_fdct32, daala_fdst16 },  // DCT_FLIPADST
-    { daala_fdst32, daala_fdst16 },  // FLIPADST_FLIPADST
-    { daala_fdst32, daala_fdst16 },  // ADST_FLIPADST
-    { daala_fdst32, daala_fdst16 },  // FLIPADST_ADST
-    { daala_idtx32, daala_idtx16 },  // IDTX
-    { daala_fdct32, daala_idtx16 },  // V_DCT
-    { daala_idtx32, daala_fdct16 },  // H_DCT
-    { daala_fdst32, daala_idtx16 },  // V_ADST
-    { daala_idtx32, daala_fdst16 },  // H_ADST
-    { daala_fdst32, daala_idtx16 },  // V_FLIPADST
-    { daala_idtx32, daala_fdst16 },  // H_FLIPADST
-#else
     { fdct32, fdct16 },         // DCT_DCT
     { fhalfright32, fdct16 },   // ADST_DCT
     { fdct32, fadst16 },        // DCT_ADST
@@ -1832,7 +1665,6 @@
     { fidtx32, fadst16 },       // H_ADST
     { fhalfright32, fidtx16 },  // V_FLIPADST
     { fidtx32, fadst16 },       // H_FLIPADST
-#endif
   };
   const transform_2d ht = FHT[tx_type];
   const int n = 16;
@@ -1846,20 +1678,12 @@
   // Rows
   for (i = 0; i < n2; ++i) {
     for (j = 0; j < n; ++j) {
-#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
-      temp_in[j] = input[i * stride + j] * 16;
-#else
       temp_in[j] =
           (tran_low_t)fdct_round_shift(input[i * stride + j] * 4 * Sqrt2);
-#endif
     }
     ht.rows(temp_in, temp_out);
     for (j = 0; j < n; ++j) {
-#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
-      out[j * n2 + i] = temp_out[j];
-#else
       out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4);
-#endif
     }
   }
 
@@ -1867,12 +1691,7 @@
   for (i = 0; i < n; ++i) {
     for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
     ht.cols(temp_in, temp_out);
-#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
-    for (j = 0; j < n2; ++j)
-      output[i + j * n] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
-#else
     for (j = 0; j < n2; ++j) output[i + j * n] = temp_out[j];
-#endif
   }
   // Note: overall scale factor of transform is 4 times unitary
 }
@@ -1881,24 +1700,6 @@
                     TxfmParam *txfm_param) {
   const TX_TYPE tx_type = txfm_param->tx_type;
   static const transform_2d FHT[] = {
-#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
-    { daala_fdct16, daala_fdct32 },  // DCT_DCT
-    { daala_fdst16, daala_fdct32 },  // ADST_DCT
-    { daala_fdct16, daala_fdst32 },  // DCT_ADST
-    { daala_fdst16, daala_fdst32 },  // ADST_ADST
-    { daala_fdst16, daala_fdct32 },  // FLIPADST_DCT
-    { daala_fdct16, daala_fdst32 },  // DCT_FLIPADST
-    { daala_fdst16, daala_fdst32 },  // FLIPADST_FLIPADST
-    { daala_fdst16, daala_fdst32 },  // ADST_FLIPADST
-    { daala_fdst16, daala_fdst32 },  // FLIPADST_ADST
-    { daala_idtx16, daala_idtx32 },  // IDTX
-    { daala_fdct16, daala_idtx32 },  // V_DCT
-    { daala_idtx16, daala_fdct32 },  // H_DCT
-    { daala_fdst16, daala_idtx32 },  // V_ADST
-    { daala_idtx16, daala_fdst32 },  // H_ADST
-    { daala_fdst16, daala_idtx32 },  // V_FLIPADST
-    { daala_idtx16, daala_fdst32 },  // H_FLIPADST
-#else
     { fdct16, fdct32 },         // DCT_DCT
     { fadst16, fdct32 },        // ADST_DCT
     { fdct16, fhalfright32 },   // DCT_ADST
@@ -1915,7 +1716,6 @@
     { fidtx16, fhalfright32 },  // H_ADST
     { fadst16, fidtx32 },       // V_FLIPADST
     { fidtx16, fhalfright32 },  // H_FLIPADST
-#endif
   };
   const transform_2d ht = FHT[tx_type];
   const int n = 16;
@@ -1929,20 +1729,12 @@
   // Columns
   for (i = 0; i < n2; ++i) {
     for (j = 0; j < n; ++j) {
-#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
-      temp_in[j] = input[j * stride + i] * 16;
-#else
       temp_in[j] =
           (tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2);
-#endif
     }
     ht.cols(temp_in, temp_out);
     for (j = 0; j < n; ++j) {
-#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
-      out[j * n2 + i] = temp_out[j];
-#else
       out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4);
-#endif
     }
   }
 
@@ -1950,12 +1742,7 @@
   for (i = 0; i < n; ++i) {
     for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
     ht.rows(temp_in, temp_out);
-#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
-    for (j = 0; j < n2; ++j)
-      output[j + i * n2] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
-#else
     for (j = 0; j < n2; ++j) output[j + i * n2] = temp_out[j];
-#endif
   }
   // Note: overall scale factor of transform is 4 times unitary
 }
@@ -1963,32 +1750,12 @@
 void av1_fht8x8_c(const int16_t *input, tran_low_t *output, int stride,
                   TxfmParam *txfm_param) {
   const TX_TYPE tx_type = txfm_param->tx_type;
-#if !CONFIG_DAALA_TX8
   if (tx_type == DCT_DCT) {
     aom_fdct8x8_c(input, output, stride);
     return;
   }
-#endif
   {
     static const transform_2d FHT[] = {
-#if CONFIG_DAALA_TX8
-      { daala_fdct8, daala_fdct8 },  // DCT_DCT
-      { daala_fdst8, daala_fdct8 },  // ADST_DCT
-      { daala_fdct8, daala_fdst8 },  // DCT_ADST
-      { daala_fdst8, daala_fdst8 },  // ADST_ADST
-      { daala_fdst8, daala_fdct8 },  // FLIPADST_DCT
-      { daala_fdct8, daala_fdst8 },  // DCT_FLIPADST
-      { daala_fdst8, daala_fdst8 },  // FLIPADST_FLIPADST
-      { daala_fdst8, daala_fdst8 },  // ADST_FLIPADST
-      { daala_fdst8, daala_fdst8 },  // FLIPADST_ADST
-      { daala_idtx8, daala_idtx8 },  // IDTX
-      { daala_fdct8, daala_idtx8 },  // V_DCT
-      { daala_idtx8, daala_fdct8 },  // H_DCT
-      { daala_fdst8, daala_idtx8 },  // V_ADST
-      { daala_idtx8, daala_fdst8 },  // H_ADST
-      { daala_fdst8, daala_idtx8 },  // V_FLIPADST
-      { daala_idtx8, daala_fdst8 },  // H_FLIPADST
-#else
       { fdct8, fdct8 },    // DCT_DCT
       { fadst8, fdct8 },   // ADST_DCT
       { fdct8, fadst8 },   // DCT_ADST
@@ -2005,7 +1772,6 @@
       { fidtx8, fadst8 },  // H_ADST
       { fadst8, fidtx8 },  // V_FLIPADST
       { fidtx8, fadst8 },  // H_FLIPADST
-#endif
     };
     const transform_2d ht = FHT[tx_type];
     tran_low_t out[64];
@@ -2017,11 +1783,7 @@
 
     // Columns
     for (i = 0; i < 8; ++i) {
-#if CONFIG_DAALA_TX8
-      for (j = 0; j < 8; ++j) temp_in[j] = input[j * stride + i] * 16;
-#else
       for (j = 0; j < 8; ++j) temp_in[j] = input[j * stride + i] * 4;
-#endif
       ht.cols(temp_in, temp_out);
       for (j = 0; j < 8; ++j) out[j * 8 + i] = temp_out[j];
     }
@@ -2030,13 +1792,8 @@
     for (i = 0; i < 8; ++i) {
       for (j = 0; j < 8; ++j) temp_in[j] = out[j + i * 8];
       ht.rows(temp_in, temp_out);
-#if CONFIG_DAALA_TX8
       for (j = 0; j < 8; ++j)
         output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
-#else
-      for (j = 0; j < 8; ++j)
-        output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
-#endif
     }
   }
 }
@@ -2101,24 +1858,6 @@
                     TxfmParam *txfm_param) {
   const TX_TYPE tx_type = txfm_param->tx_type;
   static const transform_2d FHT[] = {
-#if CONFIG_DAALA_TX16
-    { daala_fdct16, daala_fdct16 },  // DCT_DCT
-    { daala_fdst16, daala_fdct16 },  // ADST_DCT
-    { daala_fdct16, daala_fdst16 },  // DCT_ADST
-    { daala_fdst16, daala_fdst16 },  // ADST_ADST
-    { daala_fdst16, daala_fdct16 },  // FLIPADST_DCT
-    { daala_fdct16, daala_fdst16 },  // DCT_FLIPADST
-    { daala_fdst16, daala_fdst16 },  // FLIPADST_FLIPADST
-    { daala_fdst16, daala_fdst16 },  // ADST_FLIPADST
-    { daala_fdst16, daala_fdst16 },  // FLIPADST_ADST
-    { daala_idtx16, daala_idtx16 },  // IDTX
-    { daala_fdct16, daala_idtx16 },  // V_DCT
-    { daala_idtx16, daala_fdct16 },  // H_DCT
-    { daala_fdst16, daala_idtx16 },  // V_ADST
-    { daala_idtx16, daala_fdst16 },  // H_ADST
-    { daala_fdst16, daala_idtx16 },  // V_FLIPADST
-    { daala_idtx16, daala_fdst16 },  // H_FLIPADST
-#else
     { fdct16, fdct16 },    // DCT_DCT
     { fadst16, fdct16 },   // ADST_DCT
     { fdct16, fadst16 },   // DCT_ADST
@@ -2135,7 +1874,6 @@
     { fidtx16, fadst16 },  // H_ADST
     { fadst16, fidtx16 },  // V_FLIPADST
     { fidtx16, fadst16 },  // H_FLIPADST
-#endif
   };
   const transform_2d ht = FHT[tx_type];
   tran_low_t out[256];
@@ -2148,19 +1886,11 @@
   // Columns
   for (i = 0; i < 16; ++i) {
     for (j = 0; j < 16; ++j) {
-#if CONFIG_DAALA_TX16
-      temp_in[j] = input[j * stride + i] * 16;
-#else
       temp_in[j] = input[j * stride + i] * 4;
-#endif
     }
     ht.cols(temp_in, temp_out);
     for (j = 0; j < 16; ++j) {
-#if CONFIG_DAALA_TX16
-      out[j * 16 + i] = temp_out[j];
-#else
       out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
-#endif
     }
   }
 
@@ -2169,11 +1899,7 @@
     for (j = 0; j < 16; ++j) temp_in[j] = out[j + i * 16];
     ht.rows(temp_in, temp_out);
     for (j = 0; j < 16; ++j) {
-#if CONFIG_DAALA_TX16
-      output[j + i * 16] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
-#else
       output[j + i * 16] = temp_out[j];
-#endif
     }
   }
 }
@@ -2187,24 +1913,6 @@
                     TxfmParam *txfm_param) {
   const TX_TYPE tx_type = txfm_param->tx_type;
   static const transform_2d FHT[] = {
-#if CONFIG_DAALA_TX32
-    { daala_fdct32, daala_fdct32 },  // DCT_DCT
-    { daala_fdst32, daala_fdct32 },  // ADST_DCT
-    { daala_fdct32, daala_fdst32 },  // DCT_ADST
-    { daala_fdst32, daala_fdst32 },  // ADST_ADST
-    { daala_fdst32, daala_fdct32 },  // FLIPADST_DCT
-    { daala_fdct32, daala_fdst32 },  // DCT_FLIPADST
-    { daala_fdst32, daala_fdst32 },  // FLIPADST_FLIPADST
-    { daala_fdst32, daala_fdst32 },  // ADST_FLIPADST
-    { daala_fdst32, daala_fdst32 },  // FLIPADST_ADST
-    { daala_idtx32, daala_idtx32 },  // IDTX
-    { daala_fdct32, daala_idtx32 },  // V_DCT
-    { daala_idtx32, daala_fdct32 },  // H_DCT
-    { daala_fdst32, daala_idtx32 },  // V_ADST
-    { daala_idtx32, daala_fdst32 },  // H_ADST
-    { daala_fdst32, daala_idtx32 },  // V_FLIPADST
-    { daala_idtx32, daala_fdst32 },  // H_FLIPADST
-#else
     { fdct32, fdct32 },              // DCT_DCT
     { fhalfright32, fdct32 },        // ADST_DCT
     { fdct32, fhalfright32 },        // DCT_ADST
@@ -2221,7 +1929,6 @@
     { fidtx32, fhalfright32 },       // H_ADST
     { fhalfright32, fidtx32 },       // V_FLIPADST
     { fidtx32, fhalfright32 },       // H_FLIPADST
-#endif
   };
   const transform_2d ht = FHT[tx_type];
   tran_low_t out[1024];
@@ -2234,19 +1941,11 @@
   // Columns
   for (i = 0; i < 32; ++i) {
     for (j = 0; j < 32; ++j) {
-#if CONFIG_DAALA_TX32
-      temp_in[j] = input[j * stride + i] * 16;
-#else
       temp_in[j] = input[j * stride + i] * 4;
-#endif
     }
     ht.cols(temp_in, temp_out);
     for (j = 0; j < 32; ++j) {
-#if CONFIG_DAALA_TX32
-      out[j * 32 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
-#else
       out[j * 32 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4);
-#endif
     }
   }
 
@@ -2260,7 +1959,7 @@
   }
 }
 
-#if CONFIG_TX64X64 && !(CONFIG_DAALA_TX64 && CONFIG_DAALA_TX32)
+#if CONFIG_TX64X64
 static void fidtx64(const tran_low_t *input, tran_low_t *output) {
   int i;
   for (i = 0; i < 64; ++i)
@@ -2312,24 +2011,6 @@
                     TxfmParam *txfm_param) {
   const TX_TYPE tx_type = txfm_param->tx_type;
   static const transform_2d FHT[] = {
-#if CONFIG_DAALA_TX64
-    { daala_fdct64, daala_fdct64 },  // DCT_DCT
-    { daala_fdst64, daala_fdct64 },  // ADST_DCT
-    { daala_fdct64, daala_fdst64 },  // DCT_ADST
-    { daala_fdst64, daala_fdst64 },  // ADST_ADST
-    { daala_fdst64, daala_fdct64 },  // FLIPADST_DCT
-    { daala_fdct64, daala_fdst64 },  // DCT_FLIPADST
-    { daala_fdst64, daala_fdst64 },  // FLIPADST_FLIPADST
-    { daala_fdst64, daala_fdst64 },  // ADST_FLIPADST
-    { daala_fdst64, daala_fdst64 },  // FLIPADST_ADST
-    { daala_idtx64, daala_idtx64 },  // IDTX
-    { daala_fdct64, daala_idtx64 },  // V_DCT
-    { daala_idtx64, daala_fdct64 },  // H_DCT
-    { daala_fdst64, daala_idtx64 },  // V_ADST
-    { daala_idtx64, daala_fdst64 },  // H_ADST
-    { daala_fdst64, daala_idtx64 },  // V_FLIPADST
-    { daala_idtx64, daala_fdst64 },  // H_FLIPADST
-#else
     { fdct64_col, fdct64_row },      // DCT_DCT
     { fhalfright64, fdct64_row },    // ADST_DCT
     { fdct64_col, fhalfright64 },    // DCT_ADST
@@ -2346,7 +2027,6 @@
     { fidtx64, fhalfright64 },       // H_ADST
     { fhalfright64, fidtx64 },       // V_FLIPADST
     { fidtx64, fhalfright64 },       // H_FLIPADST
-#endif  // CONFIG_DAALA_TX64
   };
   const transform_2d ht = FHT[tx_type];
   tran_low_t out[4096];
@@ -2357,17 +2037,10 @@
 
   // Columns
   for (i = 0; i < 64; ++i) {
-#if CONFIG_DAALA_TX64
-    for (j = 0; j < 64; ++j) temp_in[j] = input[j * stride + i] * 16;
-    ht.cols(temp_in, temp_out);
-    for (j = 0; j < 64; ++j) out[j * 64 + i] = temp_out[j];
-
-#else
     for (j = 0; j < 64; ++j) temp_in[j] = input[j * stride + i];
     ht.cols(temp_in, temp_out);
     for (j = 0; j < 64; ++j)
       out[j * 64 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
-#endif
   }
 
   // Rows
@@ -2375,12 +2048,8 @@
     for (j = 0; j < 64; ++j) temp_in[j] = out[j + i * 64];
     ht.rows(temp_in, temp_out);
     for (j = 0; j < 64; ++j)
-#if CONFIG_DAALA_TX64
-      output[j + i * 64] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 3);
-#else
       output[j + i * 64] =
           (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
-#endif
   }
 
   // Zero out top-right 32x32 area.
@@ -2399,24 +2068,6 @@
                     TxfmParam *txfm_param) {
   const TX_TYPE tx_type = txfm_param->tx_type;
   static const transform_2d FHT[] = {
-#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
-    { daala_fdct32, daala_fdct64 },  // DCT_DCT
-    { daala_fdst32, daala_fdct64 },  // ADST_DCT
-    { daala_fdct32, daala_fdst64 },  // DCT_ADST
-    { daala_fdst32, daala_fdst64 },  // ADST_ADST
-    { daala_fdst32, daala_fdct64 },  // FLIPADST_DCT
-    { daala_fdct32, daala_fdst64 },  // DCT_FLIPADST
-    { daala_fdst32, daala_fdst64 },  // FLIPADST_FLIPADST
-    { daala_fdst32, daala_fdst64 },  // ADST_FLIPADST
-    { daala_fdst32, daala_fdst64 },  // FLIPADST_ADST
-    { daala_idtx32, daala_idtx64 },  // IDTX
-    { daala_fdct32, daala_idtx64 },  // V_DCT
-    { daala_idtx32, daala_fdct64 },  // H_DCT
-    { daala_fdst32, daala_idtx64 },  // V_ADST
-    { daala_idtx32, daala_fdst64 },  // H_ADST
-    { daala_fdst32, daala_idtx64 },  // V_FLIPADST
-    { daala_idtx32, daala_fdst64 },  // H_FLIPADST
-#else
     { fdct32, fdct64_row },          // DCT_DCT
     { fhalfright32, fdct64_row },    // ADST_DCT
     { fdct32, fhalfright64 },        // DCT_ADST
@@ -2433,7 +2084,6 @@
     { fidtx32, fhalfright64 },       // H_ADST
     { fhalfright32, fidtx64 },       // V_FLIPADST
     { fidtx32, fhalfright64 },       // H_FLIPADST
-#endif
   };
   const transform_2d ht = FHT[tx_type];
   tran_low_t out[2048];
@@ -2447,19 +2097,11 @@
   // Columns
   for (i = 0; i < n2; ++i) {
     for (j = 0; j < n; ++j) {
-#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
-      temp_in[j] = input[j * stride + i] * 16;
-#else
       temp_in[j] = (tran_low_t)fdct_round_shift(input[j * stride + i] * Sqrt2);
-#endif
     }
     ht.cols(temp_in, temp_out);
     for (j = 0; j < n; ++j) {
-#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
-      out[j * n2 + i] = temp_out[j];
-#else
       out[j * n2 + i] = (tran_low_t)ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
-#endif
     }
   }
 
@@ -2468,13 +2110,8 @@
     for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
     ht.rows(temp_in, temp_out);
     for (j = 0; j < n2; ++j) {
-#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
-      output[j + i * n2] =
-          (tran_low_t)ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 3);
-#else
       output[j + i * n2] =
           (tran_low_t)ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
-#endif
     }
   }
 
@@ -2492,24 +2129,6 @@
                     TxfmParam *txfm_param) {
   const TX_TYPE tx_type = txfm_param->tx_type;
   static const transform_2d FHT[] = {
-#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
-    { daala_fdct64, daala_fdct32 },  // DCT_DCT
-    { daala_fdst64, daala_fdct32 },  // ADST_DCT
-    { daala_fdct64, daala_fdst32 },  // DCT_ADST
-    { daala_fdst64, daala_fdst32 },  // ADST_ADST
-    { daala_fdst64, daala_fdct32 },  // FLIPADST_DCT
-    { daala_fdct64, daala_fdst32 },  // DCT_FLIPADST
-    { daala_fdst64, daala_fdst32 },  // FLIPADST_FLIPADST
-    { daala_fdst64, daala_fdst32 },  // ADST_FLIPADST
-    { daala_fdst64, daala_fdst32 },  // FLIPADST_ADST
-    { daala_idtx64, daala_idtx32 },  // IDTX
-    { daala_fdct64, daala_idtx32 },  // V_DCT
-    { daala_idtx64, daala_fdct32 },  // H_DCT
-    { daala_fdst64, daala_idtx32 },  // V_ADST
-    { daala_idtx64, daala_fdst32 },  // H_ADST
-    { daala_fdst64, daala_idtx32 },  // V_FLIPADST
-    { daala_idtx64, daala_fdst32 },  // H_FLIPADST
-#else
     { fdct64_row, fdct32 },          // DCT_DCT
     { fhalfright64, fdct32 },        // ADST_DCT
     { fdct64_row, fhalfright32 },    // DCT_ADST
@@ -2526,7 +2145,6 @@
     { fidtx64, fhalfright32 },       // H_ADST
     { fhalfright64, fidtx32 },       // V_FLIPADST
     { fidtx64, fhalfright32 },       // H_FLIPADST
-#endif
   };
   const transform_2d ht = FHT[tx_type];
   tran_low_t out[32 * 64];
@@ -2540,19 +2158,11 @@
   // Rows
   for (i = 0; i < n2; ++i) {
     for (j = 0; j < n; ++j) {
-#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
-      temp_in[j] = input[i * stride + j] * 16;
-#else
       temp_in[j] = (tran_low_t)fdct_round_shift(input[i * stride + j] * Sqrt2);
-#endif
     }
     ht.rows(temp_in, temp_out);
     for (j = 0; j < n; ++j) {
-#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
-      out[j * n2 + i] = temp_out[j];
-#else
       out[j * n2 + i] = (tran_low_t)ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
-#endif
     }
   }
 
@@ -2561,11 +2171,7 @@
     for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
     ht.cols(temp_in, temp_out);
     for (j = 0; j < n2; ++j) {
-#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
-      output[i + j * n] = (tran_low_t)ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 3);
-#else
       output[i + j * n] = (tran_low_t)ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
-#endif
     }
   }
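
For reference, the non-Daala transform path kept in the hunks above pre-scales the residual by 4 * Sqrt2 through fdct_round_shift() and later drops bits again with ROUND_POWER_OF_TWO_SIGNED(). The standalone sketch below shows only that rounding arithmetic; the constants (Sqrt2 = 5793 in Q12, a 14-bit rounding shift) and the signed-rounding behaviour are assumed from the usual libaom definitions, not taken from this patch.

#include <stdint.h>
#include <stdio.h>

/* Assumed stand-ins for the libaom helpers used above: Sqrt2 is sqrt(2) in
 * Q12 and fdct_round_shift() rounds away 14 fractional bits. */
#define SQRT2_Q12 5793
#define DCT_CONST_BITS_SKETCH 14

static int32_t fdct_round_shift_sketch(int64_t v) {
  return (int32_t)((v + ((int64_t)1 << (DCT_CONST_BITS_SKETCH - 1))) >>
                   DCT_CONST_BITS_SKETCH);
}

/* Signed rounding shift: round the magnitude, keep the sign (the behaviour
 * assumed for the ROUND_POWER_OF_TWO_SIGNED() call sites above). */
static int32_t round_pow2_signed_sketch(int32_t v, int n) {
  return v < 0 ? -(((-v) + (1 << (n - 1))) >> n) : (v + (1 << (n - 1))) >> n;
}

int main(void) {
  const int16_t residual = 37;
  /* Rectangular-transform input scaling: residual * 4 * Sqrt2 >> 14, i.e. an
   * effective sqrt(2) pre-scale (4 * 5793 / 16384 ~= 1.414). */
  const int32_t scaled =
      fdct_round_shift_sketch((int64_t)residual * 4 * SQRT2_Q12);
  /* Mid/output scaling: drop two bits with signed rounding, as in the kept
   * ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2) lines. */
  const int32_t mid = round_pow2_signed_sketch(scaled, 2);
  printf("residual=%d scaled=%d mid=%d\n", residual, scaled, mid);
  return 0;
}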
 
diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c
index a9e1d32..474839a 100644
--- a/av1/encoder/encodemb.c
+++ b/av1/encoder/encodemb.c
@@ -33,9 +33,6 @@
 #include "av1/encoder/encodetxb.h"
 #endif
 #include "av1/encoder/hybrid_fwd_txfm.h"
-#if CONFIG_DAALA_TX
-#include "av1/common/daala_inv_txfm.h"
-#endif
 #include "av1/encoder/rd.h"
 #include "av1/encoder/rdopt.h"
 
@@ -143,14 +140,7 @@
       get_scan(cm, tx_size, tx_type, &xd->mi[0]->mbmi);
   const int16_t *const scan = scan_order->scan;
   const int16_t *const nb = scan_order->neighbors;
-#if CONFIG_DAALA_TX
-  // This is one of the few places where RDO is done on coeffs; it
-  // expects the coeffs to be in Q3/D11, so we need to scale them.
-  int depth_shift = (TX_COEFF_DEPTH - 11) * 2;
-  int depth_round = depth_shift > 1 ? (1 << depth_shift >> 1) : 0;
-#else
   const int shift = av1_get_tx_scale(tx_size);
-#endif
 #if CONFIG_AOM_QM
   int seg_id = xd->mi[0]->mbmi.segment_id;
   const TX_SIZE qm_tx_size = av1_get_adjusted_tx_size(tx_size);
@@ -219,19 +209,14 @@
           tail_token_costs[band_cur][ctx_cur]);
       // accu_error does not change when x==0
     } else {
-/*  Computing distortion
- */
-// compute the distortion for the first candidate
-// and the distortion for quantizing to 0.
-#if CONFIG_DAALA_TX
-      int dx0 = coeff[rc];
-      const int64_t d0 = ((int64_t)dx0 * dx0 + depth_round) >> depth_shift;
-#else
+      /*  Computing distortion
+       */
+      // compute the distortion for the first candidate
+      // and the distortion for quantizing to 0.
       int dx0 = abs(coeff[rc]) * (1 << shift);
       dx0 >>= xd->bd - 8;
 
       const int64_t d0 = (int64_t)dx0 * dx0;
-#endif
       const int x_a = x - 2 * sz - 1;
       int dqv = dequant_ptr[rc != 0];
 #if CONFIG_AOM_QM
@@ -241,33 +226,15 @@
       }
 #endif  // CONFIG_AOM_QM
 
-#if CONFIG_DAALA_TX
-      int dx = dqcoeff[rc] - coeff[rc];
-      const int64_t d2 = ((int64_t)dx * dx + depth_round) >> depth_shift;
-#else
       int dx = (dqcoeff[rc] - coeff[rc]) * (1 << shift);
       dx = signed_shift_right(dx, xd->bd - 8);
       const int64_t d2 = (int64_t)dx * dx;
-#endif
 
       /* compute the distortion for the second candidate
        * x_a = x - 2 * sz + 1;
        */
       int64_t d2_a;
       if (x_a != 0) {
-#if CONFIG_DAALA_TX
-#if CONFIG_NEW_QUANT
-#if CONFIG_AOM_QM
-        dx = av1_dequant_coeff_nuq(x_a, dqv, dq, rc != 0, 0) - coeff[rc];
-#else
-        dx = av1_dequant_coeff_nuq(x_a, dqv, dequant_val[rc != 0], 0) -
-             coeff[rc];
-#endif  // CONFIG_AOM_QM
-#else   // CONFIG_NEW_QUANT
-        dx -= (dqv + sz) ^ sz;
-#endif  // CONFIG_NEW_QUANT
-        d2_a = ((int64_t)dx * dx + depth_round) >> depth_shift;
-#else  // CONFIG_DAALA_TX
 #if CONFIG_NEW_QUANT
 #if CONFIG_AOM_QM
         dx = av1_dequant_coeff_nuq(x_a, dqv, dq, rc != 0, 0) -
@@ -281,7 +248,6 @@
         dx -= ((dqv >> (xd->bd - 8)) + sz) ^ sz;
 #endif  // CONFIG_NEW_QUANT
         d2_a = (int64_t)dx * dx;
-#endif  // CONFIG_DAALA_TX
       } else {
         d2_a = d0;
       }
@@ -354,19 +320,6 @@
       int dqc_a = 0;
       if (best_x || best_eob_x) {
         if (x_a != 0) {
-#if CONFIG_DAALA_TX
-#if CONFIG_NEW_QUANT
-#if CONFIG_AOM_QM
-          dqc_a = av1_dequant_abscoeff_nuq(abs(x_a), dqv, dq, rc != 0, 0);
-#else
-          dqc_a =
-              av1_dequant_abscoeff_nuq(abs(x_a), dqv, dequant_val[rc != 0], 0);
-#endif  // CONFIG_AOM_QM
-          if (sz) dqc_a = -dqc_a;
-#else
-          dqc_a = x_a * dqv;
-#endif  // CONFIG_NEW_QUANT
-#else   // CONFIG_DAALA_TX
 #if CONFIG_NEW_QUANT
 #if CONFIG_AOM_QM
           dqc_a = av1_dequant_abscoeff_nuq(abs(x_a), dqv, dq, rc != 0, shift);
@@ -381,7 +334,6 @@
           else
             dqc_a = (x_a * dqv) >> shift;
 #endif  // CONFIG_NEW_QUANT
-#endif  // CONFIG_DAALA_TX
         } else {
           dqc_a = 0;
         }
@@ -524,11 +476,7 @@
 
   src_diff =
       &p->src_diff[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]];
-#if CONFIG_DAALA_TX
-  qparam.log_scale = 0;
-#else
   qparam.log_scale = av1_get_tx_scale(tx_size);
-#endif
   qparam.tx_size = tx_size;
 #if CONFIG_NEW_QUANT
   qparam.dq = get_dq_profile(cm->dq_type, x->qindex, is_inter, plane_type);
@@ -554,13 +502,8 @@
   if (xform_quant_idx != AV1_XFORM_QUANT_SKIP_QUANT) {
     const int n_coeffs = av1_get_max_eob(tx_size);
     if (LIKELY(!x->skip_block)) {
-#if CONFIG_DAALA_TX
-      quant_func_list[xform_quant_idx][1](coeff, n_coeffs, p, qcoeff, dqcoeff,
-                                          eob, scan_order, &qparam);
-#else
       quant_func_list[xform_quant_idx][txfm_param.is_hbd](
           coeff, n_coeffs, p, qcoeff, dqcoeff, eob, scan_order, &qparam);
-#endif
     } else {
       av1_quantize_skip(n_coeffs, qcoeff, dqcoeff, eob);
     }
@@ -740,9 +683,6 @@
     txfm_param.tx_set_type = get_ext_tx_set_type(
         txfm_param.tx_size, plane_bsize, is_inter_block(&xd->mi[0]->mbmi),
         cm->reduced_tx_set_used);
-#if CONFIG_DAALA_TX
-    daala_inv_txfm_add(dqcoeff, dst, pd->dst.stride, &txfm_param);
-#else
     if (txfm_param.is_hbd) {
       av1_highbd_inv_txfm_add_4x4(dqcoeff, dst, pd->dst.stride, &txfm_param);
       return;
@@ -752,7 +692,6 @@
     } else {
       av1_idct4x4_add(dqcoeff, dst, pd->dst.stride, &txfm_param);
     }
-#endif
   }
 }
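
The coefficient-domain error terms that the coefficient-optimization code keeps above scale the difference between the original and dequantized coefficient by (1 << shift), bring it down to 8-bit depth with a signed shift, and then square it. Below is a minimal sketch of just that arithmetic, with illustrative names; the real code uses signed_shift_right(), whose magnitude-preserving behaviour is assumed here.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Sketch of the retained distortion term: (dqcoeff - coeff) scaled by the
 * tx scale, normalized to 8-bit depth, then squared. Names are illustrative. */
static int64_t coeff_distortion_sketch(int32_t coeff, int32_t dqcoeff,
                                       int shift, int bd) {
  int32_t dx = (dqcoeff - coeff) * (1 << shift);
  /* Shift the magnitude and restore the sign, mirroring what
   * signed_shift_right(dx, bd - 8) is assumed to do. */
  dx = dx >= 0 ? dx >> (bd - 8) : -((-dx) >> (bd - 8));
  return (int64_t)dx * dx;
}

int main(void) {
  /* Example: 10-bit coding, tx scale of 1, dequantization error of 5. */
  printf("d2=%" PRId64 "\n", coeff_distortion_sketch(120, 125, 1, 10));
  return 0;
}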
 
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 9f7ac56..1ffb462 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -4660,75 +4660,6 @@
   }
 }
 
-#if 0 && CONFIG_INTERNAL_STATS
-static void output_frame_level_debug_stats(AV1_COMP *cpi) {
-  AV1_COMMON *const cm = &cpi->common;
-  FILE *const f = fopen("tmp.stt", cm->current_video_frame ? "a" : "w");
-  int64_t recon_err;
-
-  aom_clear_system_state();
-
-  recon_err = aom_get_y_sse(cpi->source, get_frame_new_buffer(cm));
-
-  if (cpi->twopass.total_left_stats.coded_error != 0.0)
-    fprintf(f, "%10u %dx%d %d %d %10d %10d %10d %10d"
-       "%10"PRId64" %10"PRId64" %5d %5d %10"PRId64" "
-       "%10"PRId64" %10"PRId64" %10d "
-       "%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf"
-        "%6d %6d %5d %5d %5d "
-        "%10"PRId64" %10.3lf"
-        "%10lf %8u %10"PRId64" %10d %10d %10d\n",
-        cpi->common.current_video_frame,
-        cm->width, cm->height,
-        cpi->rc.source_alt_ref_pending,
-        cpi->rc.source_alt_ref_active,
-        cpi->rc.this_frame_target,
-        cpi->rc.projected_frame_size,
-        cpi->rc.projected_frame_size / cpi->common.MBs,
-        (cpi->rc.projected_frame_size - cpi->rc.this_frame_target),
-        cpi->rc.vbr_bits_off_target,
-        cpi->rc.vbr_bits_off_target_fast,
-        cpi->twopass.extend_minq,
-        cpi->twopass.extend_minq_fast,
-        cpi->rc.total_target_vs_actual,
-        (cpi->rc.starting_buffer_level - cpi->rc.bits_off_target),
-        cpi->rc.total_actual_bits, cm->base_qindex,
-        av1_convert_qindex_to_q(cm->base_qindex, cm->bit_depth),
-        (double)av1_dc_quant(cm->base_qindex, 0, cm->bit_depth) / 4.0,
-        av1_convert_qindex_to_q(cpi->twopass.active_worst_quality,
-                                cm->bit_depth),
-        cpi->rc.avg_q,
-        av1_convert_qindex_to_q(cpi->oxcf.cq_level, cm->bit_depth),
-        cpi->refresh_last_frame, cpi->refresh_golden_frame,
-        cpi->refresh_alt_ref_frame, cm->frame_type, cpi->rc.gfu_boost,
-        cpi->twopass.bits_left,
-        cpi->twopass.total_left_stats.coded_error,
-        cpi->twopass.bits_left /
-            (1 + cpi->twopass.total_left_stats.coded_error),
-        cpi->tot_recode_hits, recon_err, cpi->rc.kf_boost,
-        cpi->twopass.kf_zeromotion_pct,
-        cpi->twopass.fr_content_type);
-
-  fclose(f);
-
-  if (0) {
-    FILE *const fmodes = fopen("Modes.stt", "a");
-    int i;
-
-    fprintf(fmodes, "%6d:%1d:%1d:%1d ", cpi->common.current_video_frame,
-            cm->frame_type, cpi->refresh_golden_frame,
-            cpi->refresh_alt_ref_frame);
-
-    for (i = 0; i < MAX_MODES; ++i)
-      fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]);
-
-    fprintf(fmodes, "\n");
-
-    fclose(fmodes);
-  }
-}
-#endif
-
 static void set_mv_search_params(AV1_COMP *cpi) {
   const AV1_COMMON *const cm = &cpi->common;
   const unsigned int max_mv_def = AOMMIN(cm->width, cm->height);
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index a47aff9..3793c6c 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -766,14 +766,6 @@
 }
 
 // TODO(zoeliu): To set up cpi->oxcf.enable_auto_brf
-#if 0 && CONFIG_EXT_REFS
-static INLINE int is_bwdref_enabled(const AV1_COMP *const cpi) {
-  // NOTE(zoeliu): The enabling of bi-predictive frames depends on the use of
-  //               alt_ref, and now will be off when the alt_ref interval is
-  //               not sufficiently large.
-  return is_altref_enabled(cpi) && cpi->oxcf.enable_auto_brf;
-}
-#endif  // CONFIG_EXT_REFS
 
 static INLINE void set_ref_ptrs(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                 MV_REFERENCE_FRAME ref0,
diff --git a/av1/encoder/encodetxb.c b/av1/encoder/encodetxb.c
index 9e4ffd4..8d2323b 100644
--- a/av1/encoder/encodetxb.c
+++ b/av1/encoder/encodetxb.c
@@ -161,16 +161,8 @@
 
 static INLINE int64_t get_coeff_dist(tran_low_t tcoeff, tran_low_t dqcoeff,
                                      int shift) {
-#if CONFIG_DAALA_TX
-  int depth_shift = (TX_COEFF_DEPTH - 11) * 2;
-  int depth_round = depth_shift > 1 ? (1 << (depth_shift - 1)) : 0;
-  const int64_t diff = tcoeff - dqcoeff;
-  const int64_t error = (diff * diff + depth_round) >> depth_shift;
-  (void)shift;
-#else
   const int64_t diff = (tcoeff - dqcoeff) * (1 << shift);
   const int64_t error = diff * diff;
-#endif
   return error;
 }
 
@@ -2066,11 +2058,7 @@
   const LV_MAP_EOB_COST txb_eob_costs =
       x->eob_costs[eob_multi_size][plane_type];
 
-#if CONFIG_DAALA_TX
-  const int shift = 0;
-#else
   const int shift = av1_get_tx_scale(tx_size);
-#endif
   const int64_t rdmult =
       ((x->rdmult * plane_rd_mult[is_inter][plane_type] << (2 * (xd->bd - 8))) +
        2) >>
diff --git a/av1/encoder/hybrid_fwd_txfm.c b/av1/encoder/hybrid_fwd_txfm.c
index 4183231..befb53f 100644
--- a/av1/encoder/hybrid_fwd_txfm.c
+++ b/av1/encoder/hybrid_fwd_txfm.c
@@ -15,9 +15,6 @@
 
 #include "av1/common/idct.h"
 #include "av1/encoder/hybrid_fwd_txfm.h"
-#if CONFIG_DAALA_TX
-#include "av1/encoder/daala_fwd_txfm.h"
-#else
 
 static void fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
                          int diff_stride, TxfmParam *txfm_param) {
@@ -27,84 +24,47 @@
     return;
   }
 
-#if CONFIG_DAALA_TX4
-  // only C version has LGTs
-  av1_fht4x4_c(src_diff, coeff, diff_stride, txfm_param);
-#else
   av1_fht4x4(src_diff, coeff, diff_stride, txfm_param);
-#endif
 }
 
 static void fwd_txfm_4x8(const int16_t *src_diff, tran_low_t *coeff,
                          int diff_stride, TxfmParam *txfm_param) {
-#if (CONFIG_DAALA_TX4 && CONFIG_DAALA_TX8)
-  av1_fht4x8_c(src_diff, coeff, diff_stride, txfm_param);
-#else
   av1_fht4x8(src_diff, coeff, diff_stride, txfm_param);
-#endif
 }
 
 static void fwd_txfm_8x4(const int16_t *src_diff, tran_low_t *coeff,
                          int diff_stride, TxfmParam *txfm_param) {
-#if (CONFIG_DAALA_TX4 && CONFIG_DAALA_TX8)
-  av1_fht8x4_c(src_diff, coeff, diff_stride, txfm_param);
-#else
   av1_fht8x4(src_diff, coeff, diff_stride, txfm_param);
-#endif
 }
 
 static void fwd_txfm_8x16(const int16_t *src_diff, tran_low_t *coeff,
                           int diff_stride, TxfmParam *txfm_param) {
-#if (CONFIG_DAALA_TX8 && CONFIG_DAALA_TX16)
-  av1_fht8x16_c(src_diff, coeff, diff_stride, txfm_param);
-#else
   av1_fht8x16(src_diff, coeff, diff_stride, txfm_param);
-#endif
 }
 
 static void fwd_txfm_16x8(const int16_t *src_diff, tran_low_t *coeff,
                           int diff_stride, TxfmParam *txfm_param) {
-#if (CONFIG_DAALA_TX8 && CONFIG_DAALA_TX16)
-  av1_fht16x8_c(src_diff, coeff, diff_stride, txfm_param);
-#else
   av1_fht16x8(src_diff, coeff, diff_stride, txfm_param);
-#endif
 }
 
 static void fwd_txfm_16x32(const int16_t *src_diff, tran_low_t *coeff,
                            int diff_stride, TxfmParam *txfm_param) {
-#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
-  av1_fht16x32_c(src_diff, coeff, diff_stride, txfm_param);
-#else
   av1_fht16x32(src_diff, coeff, diff_stride, txfm_param);
-#endif
 }
 
 static void fwd_txfm_32x16(const int16_t *src_diff, tran_low_t *coeff,
                            int diff_stride, TxfmParam *txfm_param) {
-#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
-  av1_fht32x16_c(src_diff, coeff, diff_stride, txfm_param);
-#else
   av1_fht32x16(src_diff, coeff, diff_stride, txfm_param);
-#endif
 }
 
 static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
                          int diff_stride, TxfmParam *txfm_param) {
-#if CONFIG_DAALA_TX8
-  av1_fht8x8_c(src_diff, coeff, diff_stride, txfm_param);
-#else
   av1_fht8x8(src_diff, coeff, diff_stride, txfm_param);
-#endif
 }
 
 static void fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
                            int diff_stride, TxfmParam *txfm_param) {
-#if CONFIG_DAALA_TX16
-  av1_fht16x16_c(src_diff, coeff, diff_stride, txfm_param);
-#else
   av1_fht16x16(src_diff, coeff, diff_stride, txfm_param);
-#endif  // CONFIG_DAALA_TX16
 }
 
 static void fwd_txfm_32x32(const int16_t *src_diff, tran_low_t *coeff,
@@ -572,14 +532,10 @@
   }
 }
 #endif  // CONFIG_TX64X64
-#endif  // CONFIG_DAALA_TXFM
 
 void av1_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride,
                   TxfmParam *txfm_param) {
   assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
-#if CONFIG_DAALA_TX
-  daala_fwd_txfm(src_diff, coeff, diff_stride, txfm_param);
-#else
   const TX_SIZE tx_size = txfm_param->tx_size;
   switch (tx_size) {
 #if CONFIG_TX64X64
@@ -635,15 +591,11 @@
       break;
     default: assert(0); break;
   }
-#endif
 }
 
 void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff,
                          int diff_stride, TxfmParam *txfm_param) {
   assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
-#if CONFIG_DAALA_TX
-  daala_fwd_txfm(src_diff, coeff, diff_stride, txfm_param);
-#else
   const TX_SIZE tx_size = txfm_param->tx_size;
   switch (tx_size) {
 #if CONFIG_TX64X64
@@ -707,5 +659,4 @@
       break;
     default: assert(0); break;
   }
-#endif
 }
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 40ab03d..30d069c 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -1509,12 +1509,7 @@
                                  int64_t *ssz, int bd) {
   int i;
   int64_t error = 0, sqcoeff = 0;
-#if CONFIG_DAALA_TX
-  (void)bd;
-  int shift = 2 * (TX_COEFF_DEPTH - 11);
-#else
   int shift = 2 * (bd - 8);
-#endif
   int rounding = shift > 0 ? 1 << (shift - 1) : 0;
 
   for (i = 0; i < block_size; i++) {
@@ -1701,26 +1696,17 @@
     // not involve an inverse transform, but it is less accurate.
     const int buffer_length = av1_get_max_eob(tx_size);
     int64_t this_sse;
-// TX-domain results need to shift down to Q2/D10 to match pixel
-// domain distortion values which are in Q2^2
-#if CONFIG_DAALA_TX
-    int shift = (TX_COEFF_DEPTH - 10) * 2;
-#else
+    // TX-domain results need to shift down to Q2/D10 to match pixel
+    // domain distortion values which are in Q2^2
     int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2;
-#endif
     tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
     tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
 
-#if CONFIG_DAALA_TX
-    *out_dist = av1_highbd_block_error(coeff, dqcoeff, buffer_length, &this_sse,
-                                       xd->bd);
-#else
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
       *out_dist = av1_highbd_block_error(coeff, dqcoeff, buffer_length,
                                          &this_sse, xd->bd);
     else
       *out_dist = av1_block_error(coeff, dqcoeff, buffer_length, &this_sse);
-#endif
 
     *out_dist = RIGHT_SIGNED_SHIFT(*out_dist, shift);
     *out_sse = RIGHT_SIGNED_SHIFT(this_sse, shift);
@@ -1977,28 +1963,19 @@
     av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
                     AV1_XFORM_QUANT_FP);
 
-/// TX-domain results need to shift down to Q2/D10 to match pixel
-// domain distortion values which are in Q2^2
-#if CONFIG_DAALA_TX
-    const int shift = (TX_COEFF_DEPTH - 10) * 2;
-#else
+    // TX-domain results need to shift down to Q2/D10 to match pixel
+    // domain distortion values which are in Q2^2
     const int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2;
-#endif
     tran_low_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block);
     tran_low_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block);
     const int buffer_length = av1_get_max_eob(tx_size);
     int64_t tmp_dist;
     int64_t tmp;
-#if CONFIG_DAALA_TX
-    tmp_dist =
-        av1_highbd_block_error(coeff, dqcoeff, buffer_length, &tmp, xd->bd);
-#else
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
       tmp_dist =
           av1_highbd_block_error(coeff, dqcoeff, buffer_length, &tmp, xd->bd);
     else
       tmp_dist = av1_block_error(coeff, dqcoeff, buffer_length, &tmp);
-#endif
     tmp_dist = RIGHT_SIGNED_SHIFT(tmp_dist, shift);
 
     if (
@@ -3729,13 +3706,9 @@
     av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
                     AV1_XFORM_QUANT_FP);
 
-// TX-domain results need to shift down to Q2/D10 to match pixel
-// domain distortion values which are in Q2^2
-#if CONFIG_DAALA_TX
-    const int shift = (TX_COEFF_DEPTH - 10) * 2;
-#else
+    // TX-domain results need to shift down to Q2/D10 to match pixel
+    // domain distortion values which are in Q2^2
     const int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2;
-#endif
     tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
     const int buffer_length = av1_get_max_eob(tx_size);
     int64_t tmp_dist, tmp_sse;
@@ -3748,16 +3721,11 @@
         x->tune_metric != AOM_TUNE_PSNR;
 #endif  // CONFIG_DIST_8X8
 
-#if CONFIG_DAALA_TX
-    tmp_dist =
-        av1_highbd_block_error(coeff, dqcoeff, buffer_length, &tmp_sse, xd->bd);
-#else
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
       tmp_dist = av1_highbd_block_error(coeff, dqcoeff, buffer_length, &tmp_sse,
                                         xd->bd);
     else
       tmp_dist = av1_block_error(coeff, dqcoeff, buffer_length, &tmp_sse);
-#endif
 
     tmp_dist = RIGHT_SIGNED_SHIFT(tmp_dist, shift);
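
In the kept RD path above, TX-domain SSE is brought down to the Q2^2 pixel-domain scale with a single shift of (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2 bits, applied via RIGHT_SIGNED_SHIFT(). A small sketch of that last step follows; the rounding of the shift helper and the example values (MAX_TX_SCALE = 1, tx scale 0) are assumptions, not taken from this patch.

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in for RIGHT_SIGNED_SHIFT: rounded right shift of a
 * (non-negative here) squared-error sum. The real macro's exact rounding is
 * assumed. */
static int64_t right_signed_shift_sketch(int64_t value, int shift) {
  return shift > 0 ? (value + ((int64_t)1 << (shift - 1))) >> shift : value;
}

int main(void) {
  const int max_tx_scale = 1; /* assumed value of MAX_TX_SCALE */
  const int tx_scale = 0;     /* e.g. what av1_get_tx_scale() returns for small tx */
  const int shift = (max_tx_scale - tx_scale) * 2;
  const int64_t tx_domain_sse = 1000; /* hypothetical coefficient-domain SSE */
  printf("pixel-domain dist=%lld\n",
         (long long)right_signed_shift_sketch(tx_domain_sse, shift));
  return 0;
}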
 
diff --git a/build/cmake/aom_config_defaults.cmake b/build/cmake/aom_config_defaults.cmake
index 2efe62f..79638ca 100644
--- a/build/cmake/aom_config_defaults.cmake
+++ b/build/cmake/aom_config_defaults.cmake
@@ -95,14 +95,6 @@
 set(CONFIG_CFL 1 CACHE NUMBER "AV1 experiment flag.")
 set(CONFIG_CICP 0 CACHE NUMBER "AV1 experiment flag.")
 set(CONFIG_COLORSPACE_HEADERS 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_DAALA_TX 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_DAALA_TX16 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_DAALA_TX32 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_DAALA_TX4 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_DAALA_TX64 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_DAALA_TX8 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_DAALA_TX_DST32 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_DAALA_TX_DST8 0 CACHE NUMBER "AV1 experiment flag.")
 set(CONFIG_DEBLOCK_13TAP 1 CACHE NUMBER "AV1 experiment flag.")
 set(CONFIG_DEPENDENT_HORZTILEGROUPS 0 CACHE NUMBER "AV1 experiment flag.")
 set(CONFIG_DEPENDENT_HORZTILES 0 CACHE NUMBER "AV1 experiment flag.")
diff --git a/build/cmake/aom_experiment_deps.cmake b/build/cmake/aom_experiment_deps.cmake
index e6204a7..c6808e4 100644
--- a/build/cmake/aom_experiment_deps.cmake
+++ b/build/cmake/aom_experiment_deps.cmake
@@ -32,55 +32,6 @@
     endif ()
   endif ()
 
-  if (CONFIG_DAALA_TX)
-     set(CONFIG_DAALA_TX_DST32 1)
-     set(CONFIG_DAALA_TX4 1)
-     set(CONFIG_DAALA_TX8 1)
-     set(CONFIG_DAALA_TX16 1)
-     set(CONFIG_DAALA_TX32 1)
-     set(CONFIG_DAALA_TX64 1)
-  endif ()
-
-  if (NOT CONFIG_DAALA_TX)
-     set(CONFIG_DAALA_TX_DST32 0)
-     set(CONFIG_DAALA_TX4 0)
-     set(CONFIG_DAALA_TX8 0)
-     set(CONFIG_DAALA_TX16 0)
-     set(CONFIG_DAALA_TX32 0)
-     set(CONFIG_DAALA_TX64 0)
-  endif ()
-
-  if (CONFIG_DAALA_TX_DST8)
-    if (NOT CONFIG_DAALA_TX8)
-      set(CONFIG_DAALA_TX_DST8 0)
-      message("--- DAALA_TX_DST8 requires DAALA_TX8: disabled DAALA_TX_DST8")
-    endif ()
-  endif ()
-
-  if (CONFIG_DAALA_TX_DST32)
-    if (NOT CONFIG_DAALA_TX32)
-      set(CONFIG_DAALA_TX_DST32 0)
-      message("--- DAALA_TX_DST32 requires DAALA_TX32: disabled DAALA_TX_DST32")
-    endif ()
-  endif ()
-
-  if (CONFIG_DAALA_TX64)
-    if (NOT CONFIG_TX64X64)
-      set(CONFIG_DAALA_TX64 0)
-      message("--- DAALA_TX64 requires TX64X64: disabled DAALA_TX64")
-    endif ()
-  endif ()
-
-  if (CONFIG_DAALA_TX4 OR CONFIG_DAALA_TX8 OR CONFIG_DAALA_TX16 OR
-      CONFIG_DAALA_TX32 OR CONFIG_DAALA_TX64)
-    if (NOT CONFIG_LOWBITDEPTH)
-      change_config_and_warn(CONFIG_LOWBITDEPTH 1 CONFIG_DAALA_TXx)
-    endif ()
-    if (CONFIG_TXMG)
-      change_config_and_warn(CONFIG_TXMG 0 CONFIG_DAALA_DCTx)
-    endif ()
-  endif ()
-
   if (CONFIG_EXT_INTRA_MOD)
     if (NOT CONFIG_INTRA_EDGE)
       change_config_and_warn(CONFIG_INTRA_EDGE 1 CONFIG_EXT_INTRA_MOD)
diff --git a/test/av1_dct_test.cc b/test/av1_dct_test.cc
index c8a474f..fd68a54 100644
--- a/test/av1_dct_test.cc
+++ b/test/av1_dct_test.cc
@@ -23,10 +23,6 @@
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 1
 #define AV1_DCT_GTEST
 #include "av1/encoder/dct.c"
-#if CONFIG_DAALA_TX4 || CONFIG_DAALA_TX8 || CONFIG_DAALA_TX16 || \
-    CONFIG_DAALA_Tx32
-#include "av1/common/daala_tx.c"
-#endif
 
 using libaom_test::ACMRandom;
 
diff --git a/test/av1_fht16x16_test.cc b/test/av1_fht16x16_test.cc
index 93440f7..24f4c78 100644
--- a/test/av1_fht16x16_test.cc
+++ b/test/av1_fht16x16_test.cc
@@ -23,7 +23,6 @@
 
 using libaom_test::ACMRandom;
 
-#if !CONFIG_DAALA_TX
 namespace {
 typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                         const TxfmParam *txfm_param);
@@ -161,7 +160,7 @@
 
 using std::tr1::make_tuple;
 
-#if HAVE_SSE2 && !CONFIG_DAALA_TX16
+#if HAVE_SSE2
 const Ht16x16Param kArrayHt16x16Param_sse2[] = {
   make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, DCT_DCT,
              AOM_BITS_8, 256),
@@ -200,7 +199,7 @@
                         ::testing::ValuesIn(kArrayHt16x16Param_sse2));
 #endif  // HAVE_SSE2
 
-#if HAVE_AVX2 && !CONFIG_DAALA_TX16
+#if HAVE_AVX2
 const Ht16x16Param kArrayHt16x16Param_avx2[] = {
   make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, DCT_DCT,
              AOM_BITS_8, 256),
@@ -239,7 +238,7 @@
                         ::testing::ValuesIn(kArrayHt16x16Param_avx2));
 #endif  // HAVE_AVX2
 
-#if HAVE_SSE4_1 && !CONFIG_DAALA_TX16
+#if HAVE_SSE4_1
 const HighbdHt16x16Param kArrayHBDHt16x16Param_sse4_1[] = {
   make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, DCT_DCT, 10),
   make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, DCT_DCT, 12),
@@ -262,7 +261,6 @@
 };
 INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdTrans16x16HT,
                         ::testing::ValuesIn(kArrayHBDHt16x16Param_sse4_1));
-#endif  // HAVE_SSE4_1 && !CONFIG_DAALA_TX16
+#endif  // HAVE_SSE4_1
 
 }  // namespace
-#endif  // !CONFIG_DAALA_TX
diff --git a/test/av1_fht16x32_test.cc b/test/av1_fht16x32_test.cc
index 871dc8b..ac89d54 100644
--- a/test/av1_fht16x32_test.cc
+++ b/test/av1_fht16x32_test.cc
@@ -23,8 +23,6 @@
 
 using libaom_test::ACMRandom;
 
-#if !CONFIG_DAALA_TX
-
 namespace {
 typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                         const TxfmParam *txfm_param);
@@ -153,5 +151,3 @@
 #endif  // HAVE_SSE2
 
 }  // namespace
-
-#endif  // !CONFIG_DAALA_TX
diff --git a/test/av1_fht16x8_test.cc b/test/av1_fht16x8_test.cc
index b32cf8e..991cdb4 100644
--- a/test/av1_fht16x8_test.cc
+++ b/test/av1_fht16x8_test.cc
@@ -23,7 +23,6 @@
 
 using libaom_test::ACMRandom;
 
-#if !CONFIG_DAALA_TX
 namespace {
 typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                         const TxfmParam *txfm_param);
@@ -150,5 +149,3 @@
 #endif  // HAVE_SSE2
 
 }  // namespace
-
-#endif  // !CONFIG_DAALA_TX
diff --git a/test/av1_fht32x16_test.cc b/test/av1_fht32x16_test.cc
index 177ddf2..3caa129 100644
--- a/test/av1_fht32x16_test.cc
+++ b/test/av1_fht32x16_test.cc
@@ -23,8 +23,6 @@
 
 using libaom_test::ACMRandom;
 
-#if !CONFIG_DAALA_TX
-
 namespace {
 typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                         const TxfmParam *txfm_param);
@@ -153,4 +151,3 @@
 #endif  // HAVE_SSE2
 
 }  // namespace
-#endif  // !CONFIG_DAALA_TX
diff --git a/test/av1_fht32x32_test.cc b/test/av1_fht32x32_test.cc
index 2fc4db8..b31d2f4 100644
--- a/test/av1_fht32x32_test.cc
+++ b/test/av1_fht32x32_test.cc
@@ -48,7 +48,7 @@
   av1_fwd_txfm2d_32x32_c(in, out, stride, tx_type, bd);
 }
 
-#if (HAVE_SSE2 || HAVE_AVX2) && !CONFIG_DAALA_TX32
+#if (HAVE_SSE2 || HAVE_AVX2)
 void dummy_inv_txfm(const tran_low_t *in, uint8_t *out, int stride,
                     const TxfmParam *txfm_param) {
   (void)in;
@@ -161,7 +161,7 @@
 
 using std::tr1::make_tuple;
 
-#if HAVE_SSE2 && !CONFIG_DAALA_TX32
+#if HAVE_SSE2
 const Ht32x32Param kArrayHt32x32Param_sse2[] = {
   make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, DCT_DCT, AOM_BITS_8, 1024),
   make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, ADST_DCT, AOM_BITS_8, 1024),
@@ -187,9 +187,9 @@
 };
 INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans32x32HT,
                         ::testing::ValuesIn(kArrayHt32x32Param_sse2));
-#endif  // HAVE_SSE2 && !CONFIG_DAALA_TX32
+#endif  // HAVE_SSE2
 
-#if HAVE_AVX2 && !CONFIG_DAALA_TX32
+#if HAVE_AVX2
 const Ht32x32Param kArrayHt32x32Param_avx2[] = {
   make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, DCT_DCT, AOM_BITS_8, 1024),
   make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, ADST_DCT, AOM_BITS_8, 1024),
@@ -215,5 +215,5 @@
 };
 INSTANTIATE_TEST_CASE_P(AVX2, AV1Trans32x32HT,
                         ::testing::ValuesIn(kArrayHt32x32Param_avx2));
-#endif  // HAVE_AVX2 && !CONFIG_DAALA_TX32
+#endif  // HAVE_AVX2
 }  // namespace
diff --git a/test/av1_fht4x4_test.cc b/test/av1_fht4x4_test.cc
index 91c9798..ef61bce 100644
--- a/test/av1_fht4x4_test.cc
+++ b/test/av1_fht4x4_test.cc
@@ -23,7 +23,6 @@
 
 using libaom_test::ACMRandom;
 
-#if !CONFIG_DAALA_TX
 namespace {
 typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                         const TxfmParam *txfm_param);
@@ -164,7 +163,7 @@
 
 using std::tr1::make_tuple;
 
-#if HAVE_SSE2 && !CONFIG_DAALA_TX4
+#if HAVE_SSE2
 const Ht4x4Param kArrayHt4x4Param_sse2[] = {
   make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, DCT_DCT, AOM_BITS_8,
              16),
@@ -198,7 +197,7 @@
                         ::testing::ValuesIn(kArrayHt4x4Param_sse2));
 #endif  // HAVE_SSE2
 
-#if HAVE_SSE4_1 && !CONFIG_DAALA_TX4
+#if HAVE_SSE4_1
 const HighbdHt4x4Param kArrayHighbdHt4x4Param[] = {
   make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, DCT_DCT, 10),
   make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, DCT_DCT, 12),
@@ -223,7 +222,6 @@
 INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdTrans4x4HT,
                         ::testing::ValuesIn(kArrayHighbdHt4x4Param));
 
-#endif  // HAVE_SSE4_1 && !CONFIG_DAALA_TX4
+#endif  // HAVE_SSE4_1
 
 }  // namespace
-#endif  // !CONFIG_DAALA_TX
diff --git a/test/av1_fht4x8_test.cc b/test/av1_fht4x8_test.cc
index 00a2916..8be6aa0 100644
--- a/test/av1_fht4x8_test.cc
+++ b/test/av1_fht4x8_test.cc
@@ -23,7 +23,6 @@
 
 using libaom_test::ACMRandom;
 
-#if !CONFIG_DAALA_TX
 namespace {
 typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                         const TxfmParam *txfm_param);
@@ -140,4 +139,3 @@
 #endif  // HAVE_SSE2
 
 }  // namespace
-#endif  // !CONFIG_DAALA_TX
diff --git a/test/av1_fht64x64_test.cc b/test/av1_fht64x64_test.cc
index d833324..a611c5b 100644
--- a/test/av1_fht64x64_test.cc
+++ b/test/av1_fht64x64_test.cc
@@ -20,7 +20,7 @@
 #include "test/transform_test_base.h"
 #include "test/util.h"
 
-#if CONFIG_TX64X64 && !CONFIG_DAALA_TX
+#if CONFIG_TX64X64
 
 using libaom_test::ACMRandom;
 
diff --git a/test/av1_fht8x16_test.cc b/test/av1_fht8x16_test.cc
index a706a98..fc5748d 100644
--- a/test/av1_fht8x16_test.cc
+++ b/test/av1_fht8x16_test.cc
@@ -22,7 +22,6 @@
 
 using libaom_test::ACMRandom;
 
-#if !CONFIG_DAALA_TX
 namespace {
 typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                         const TxfmParam *txfm_param);
@@ -149,4 +148,3 @@
 #endif  // HAVE_SSE2
 
 }  // namespace
-#endif  // !CONFIG_DAALA_TX
diff --git a/test/av1_fht8x4_test.cc b/test/av1_fht8x4_test.cc
index 6806edb..e88bc35 100644
--- a/test/av1_fht8x4_test.cc
+++ b/test/av1_fht8x4_test.cc
@@ -22,7 +22,6 @@
 
 using libaom_test::ACMRandom;
 
-#if !CONFIG_DAALA_TX
 namespace {
 typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                         const TxfmParam *txfm_param);
@@ -139,4 +138,3 @@
 #endif  // HAVE_SSE2
 
 }  // namespace
-#endif  // !CONFIG_DAALA_TX
diff --git a/test/av1_fht8x8_test.cc b/test/av1_fht8x8_test.cc
index a73053f..6510099 100644
--- a/test/av1_fht8x8_test.cc
+++ b/test/av1_fht8x8_test.cc
@@ -23,7 +23,6 @@
 
 using libaom_test::ACMRandom;
 
-#if !CONFIG_DAALA_TX
 namespace {
 typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                         const TxfmParam *txfm_param);
@@ -164,7 +163,7 @@
 
 using std::tr1::make_tuple;
 
-#if HAVE_SSE2 && !CONFIG_DAALA_TX8
+#if HAVE_SSE2
 const Ht8x8Param kArrayHt8x8Param_sse2[] = {
   make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, DCT_DCT, AOM_BITS_8,
              64),
@@ -198,7 +197,7 @@
                         ::testing::ValuesIn(kArrayHt8x8Param_sse2));
 #endif  // HAVE_SSE2
 
-#if HAVE_SSE4_1 && !CONFIG_DAALA_TX8
+#if HAVE_SSE4_1
 const HighbdHt8x8Param kArrayHBDHt8x8Param_sse4_1[] = {
   make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, DCT_DCT, 10),
   make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, DCT_DCT, 12),
@@ -221,7 +220,6 @@
 };
 INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdTrans8x8HT,
                         ::testing::ValuesIn(kArrayHBDHt8x8Param_sse4_1));
-#endif  // HAVE_SSE4_1 && !CONFIG_DAALA_TX8
+#endif  // HAVE_SSE4_1
 
 }  // namespace
-#endif  // !CONFIG_DAALA_TX
diff --git a/test/av1_highbd_iht_test.cc b/test/av1_highbd_iht_test.cc
index 81e635a..761193e 100644
--- a/test/av1_highbd_iht_test.cc
+++ b/test/av1_highbd_iht_test.cc
@@ -137,22 +137,16 @@
 
 using std::tr1::make_tuple;
 
-#if HAVE_SSE4_1 && !(CONFIG_DAALA_TX4 && CONFIG_DAALA_TX8 && CONFIG_DAALA_TX16)
-#if !CONFIG_DAALA_TX4
+#if HAVE_SSE4_1
 #define PARAM_LIST_4X4                                   \
   &av1_fwd_txfm2d_4x4_c, &av1_inv_txfm2d_add_4x4_sse4_1, \
       &av1_inv_txfm2d_add_4x4_c, 16
-#endif
-#if !CONFIG_DAALA_TX8
 #define PARAM_LIST_8X8                                   \
   &av1_fwd_txfm2d_8x8_c, &av1_inv_txfm2d_add_8x8_sse4_1, \
       &av1_inv_txfm2d_add_8x8_c, 64
-#endif
-#if !CONFIG_DAALA_TX16
 #define PARAM_LIST_16X16                                     \
   &av1_fwd_txfm2d_16x16_c, &av1_inv_txfm2d_add_16x16_sse4_1, \
       &av1_inv_txfm2d_add_16x16_c, 256
-#endif
 #if CONFIG_TX64X64
 #define PARAM_LIST_64X64                                     \
   &av1_fwd_txfm2d_64x64_c, &av1_inv_txfm2d_add_64x64_sse4_1, \
@@ -160,8 +154,7 @@
 #endif
 
 const IHbdHtParam kArrayIhtParam[] = {
-// 16x16
-#if !CONFIG_DAALA_TX16
+  // 16x16
   make_tuple(PARAM_LIST_16X16, DCT_DCT, 10),
   make_tuple(PARAM_LIST_16X16, DCT_DCT, 12),
   make_tuple(PARAM_LIST_16X16, ADST_DCT, 10),
@@ -180,9 +173,7 @@
   make_tuple(PARAM_LIST_16X16, ADST_FLIPADST, 12),
   make_tuple(PARAM_LIST_16X16, FLIPADST_ADST, 10),
   make_tuple(PARAM_LIST_16X16, FLIPADST_ADST, 12),
-#endif
-// 8x8
-#if !CONFIG_DAALA_TX8
+  // 8x8
   make_tuple(PARAM_LIST_8X8, DCT_DCT, 10),
   make_tuple(PARAM_LIST_8X8, DCT_DCT, 12),
   make_tuple(PARAM_LIST_8X8, ADST_DCT, 10),
@@ -201,9 +192,7 @@
   make_tuple(PARAM_LIST_8X8, ADST_FLIPADST, 12),
   make_tuple(PARAM_LIST_8X8, FLIPADST_ADST, 10),
   make_tuple(PARAM_LIST_8X8, FLIPADST_ADST, 12),
-#endif
-// 4x4
-#if !CONFIG_DAALA_TX4
+  // 4x4
   make_tuple(PARAM_LIST_4X4, DCT_DCT, 10),
   make_tuple(PARAM_LIST_4X4, DCT_DCT, 12),
   make_tuple(PARAM_LIST_4X4, ADST_DCT, 10),
@@ -222,7 +211,6 @@
   make_tuple(PARAM_LIST_4X4, ADST_FLIPADST, 12),
   make_tuple(PARAM_LIST_4X4, FLIPADST_ADST, 10),
   make_tuple(PARAM_LIST_4X4, FLIPADST_ADST, 12),
-#endif
 #if CONFIG_TX64X64
   make_tuple(PARAM_LIST_64X64, DCT_DCT, 10),
   make_tuple(PARAM_LIST_64X64, DCT_DCT, 12),
@@ -231,10 +219,9 @@
 
 INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdInvHTNxN,
                         ::testing::ValuesIn(kArrayIhtParam));
-#endif  // HAVE_SSE4_1 &&
-        //  !(CONFIG_DAALA_TX4 && CONFIG_DAALA_TX8 && CONFIG_DAALA_TX16)
+#endif  // HAVE_SSE4_1
 
-#if HAVE_AVX2 && !CONFIG_DAALA_TX32
+#if HAVE_AVX2
 #define PARAM_LIST_32X32                                   \
   &av1_fwd_txfm2d_32x32_c, &av1_inv_txfm2d_add_32x32_avx2, \
       &av1_inv_txfm2d_add_32x32_c, 1024
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index 5ae7d69..572feb8 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -30,7 +30,6 @@
 
 using libaom_test::ACMRandom;
 
-#if !CONFIG_DAALA_TX
 namespace {
 
 const int kNumCoeffs = 256;
@@ -753,7 +752,6 @@
                         ::testing::Values(make_tuple(&aom_fdct16x16_sse2,
                                                      &aom_idct16x16_256_add_c,
                                                      DCT_DCT, AOM_BITS_8)));
-#if !CONFIG_DAALA_TX16
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans16x16HT,
     ::testing::Values(make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_c,
@@ -764,7 +762,5 @@
                                  DCT_ADST, AOM_BITS_8),
                       make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_c,
                                  ADST_ADST, AOM_BITS_8)));
-#endif
 #endif  // HAVE_SSE2
 }  // namespace
-#endif  // !CONFIG_DAALA_TX
diff --git a/test/error_block_test.cc b/test/error_block_test.cc
index 0b1052f..aa083d5 100644
--- a/test/error_block_test.cc
+++ b/test/error_block_test.cc
@@ -155,7 +155,7 @@
       << "First failed at test case " << first_failure;
 }
 
-#if (HAVE_SSE2 || HAVE_AVX) && !CONFIG_DAALA_TX
+#if (HAVE_SSE2 || HAVE_AVX)
 using std::tr1::make_tuple;
 
 INSTANTIATE_TEST_CASE_P(
diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index 8d5ce9b..63f1601 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -29,7 +29,6 @@
 
 using libaom_test::ACMRandom;
 
-#if !CONFIG_DAALA_TX
 namespace {
 typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride);
 typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride);
@@ -245,7 +244,7 @@
                       make_tuple(&av1_fwht4x4_c, &aom_iwht4x4_16_add_c, DCT_DCT,
                                  AOM_BITS_8, 16)));
 
-#if HAVE_SSE2 && !CONFIG_DAALA_TX4
+#if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans4x4WHT,
     ::testing::Values(make_tuple(&av1_fwht4x4_c, &aom_iwht4x4_16_add_c, DCT_DCT,
@@ -254,7 +253,7 @@
                                  DCT_DCT, AOM_BITS_8, 16)));
 #endif
 
-#if HAVE_SSE2 && !CONFIG_DAALA_TX4
+#if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans4x4HT,
     ::testing::Values(make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_c,
@@ -265,7 +264,6 @@
                                  DCT_ADST, AOM_BITS_8, 16),
                       make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_c,
                                  ADST_ADST, AOM_BITS_8, 16)));
-#endif  // HAVE_SSE2 && !CONFIG_DAALA_TX4
+#endif  // HAVE_SSE2
 
 }  // namespace
-#endif  // !CONFIG_DAALA_TX
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index 096a3c6..050408f 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -29,7 +29,6 @@
 
 using libaom_test::ACMRandom;
 
-#if !CONFIG_DAALA_TX
 namespace {
 
 const int kNumCoeffs = 64;
@@ -595,7 +594,6 @@
                         ::testing::Values(make_tuple(&aom_fdct8x8_sse2,
                                                      &aom_idct8x8_64_add_c,
                                                      DCT_DCT, AOM_BITS_8)));
-#if !CONFIG_DAALA_TX8
 INSTANTIATE_TEST_CASE_P(
     SSE2, FwdTrans8x8HT,
     ::testing::Values(make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_c,
@@ -606,7 +604,6 @@
                                  DCT_ADST, AOM_BITS_8),
                       make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_c,
                                  ADST_ADST, AOM_BITS_8)));
-#endif  // !CONFIG_DAALA_TX8
 #endif  // HAVE_SSE2
 
 #if HAVE_SSSE3 && ARCH_X86_64
@@ -617,4 +614,3 @@
 #endif
 
 }  // namespace
-#endif  // !CONFIG_DAALA_TX