Remove DAALA_TX experiment
This experiment has been abandoned for AV1.
Change-Id: Ief8ed6a51a5e7bac17838ebb7a88d88bbf90a96f
diff --git a/aom_dsp/inv_txfm.h b/aom_dsp/inv_txfm.h
index dcd3fa1..14cc989 100644
--- a/aom_dsp/inv_txfm.h
+++ b/aom_dsp/inv_txfm.h
@@ -28,13 +28,6 @@
}
static INLINE tran_high_t check_range(tran_high_t input, int bd) {
-#if CONFIG_DAALA_TX
- // Daala TX coeffs cover a different range from AV1 TX
- // all depths: 19 bit integer
- const int32_t int_max = (1 << (TX_COEFF_DEPTH + 6)) - 1;
- const int32_t int_min = -int_max - 1;
- (void)bd;
-#else
// AV1 TX case
// - 8 bit: signed 16 bit integer
// - 10 bit: signed 18 bit integer
@@ -42,7 +35,6 @@
// - max quantization error = 1828 << (bd - 8)
const int32_t int_max = (1 << (7 + bd)) - 1 + (914 << (bd - 7));
const int32_t int_min = -int_max - 1;
-#endif
#if CONFIG_COEFFICIENT_RANGE_CHECKING
assert(int_min <= input);
assert(input <= int_max);
@@ -57,9 +49,6 @@
void aom_idct8_c(const tran_low_t *input, tran_low_t *output);
void aom_idct16_c(const tran_low_t *input, tran_low_t *output);
void aom_idct32_c(const tran_low_t *input, tran_low_t *output);
-#if CONFIG_TX64X64 && CONFIG_DAALA_TX64
-void aom_idct64_c(const tran_low_t *input, tran_low_t *output);
-#endif
void aom_iadst4_c(const tran_low_t *input, tran_low_t *output);
void aom_iadst8_c(const tran_low_t *input, tran_low_t *output);
void aom_iadst16_c(const tran_low_t *input, tran_low_t *output);
diff --git a/av1/av1.cmake b/av1/av1.cmake
index c26aa3e..74122c9 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -70,16 +70,6 @@
"${AOM_ROOT}/av1/common/tile_common.c"
"${AOM_ROOT}/av1/common/tile_common.h")
-if (CONFIG_DAALA_TX)
- set(AOM_AV1_COMMON_SOURCES
- ${AOM_AV1_COMMON_SOURCES}
- "${AOM_ROOT}/av1/common/daala_tx.c"
- "${AOM_ROOT}/av1/common/daala_tx.h"
- "${AOM_ROOT}/av1/common/daala_tx_kernels.h"
- "${AOM_ROOT}/av1/common/daala_inv_txfm.c"
- "${AOM_ROOT}/av1/common/daala_inv_txfm.h")
-endif ()
-
set(AOM_AV1_DECODER_SOURCES
"${AOM_ROOT}/av1/av1_dx_iface.c"
"${AOM_ROOT}/av1/decoder/decodeframe.c"
@@ -165,13 +155,6 @@
"${AOM_ROOT}/av1/encoder/tokenize.c"
"${AOM_ROOT}/av1/encoder/tokenize.h")
-if (CONFIG_DAALA_TX)
- set(AOM_AV1_ENCODER_SOURCES
- ${AOM_AV1_ENCODER_SOURCES}
- "${AOM_ROOT}/av1/encoder/daala_fwd_txfm.c"
- "${AOM_ROOT}/av1/encoder/daala_fwd_txfm.h")
-endif ()
-
set(AOM_AV1_COMMON_INTRIN_SSE2
"${AOM_ROOT}/av1/common/x86/idct_intrin_sse2.c")
@@ -186,12 +169,6 @@
set(AOM_AV1_COMMON_INTRIN_AVX2
"${AOM_ROOT}/av1/common/x86/highbd_inv_txfm_avx2.c"
"${AOM_ROOT}/av1/common/x86/hybrid_inv_txfm_avx2.c")
-if (CONFIG_DAALA_TX)
- set(AOM_AV1_COMMON_INTRIN_AVX2
- ${AOM_AV1_COMMON_INTRIN_AVX2}
- "${AOM_ROOT}/av1/common/x86/daala_tx_kernels.h"
- "${AOM_ROOT}/av1/common/x86/daala_inv_txfm_avx2.c")
-endif ()
set(AOM_AV1_COMMON_INTRIN_MSA
"${AOM_ROOT}/av1/common/mips/msa/av1_idct16x16_msa.c"
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index ee41933..9f0dbc4 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -28,10 +28,6 @@
#include "av1/common/restoration.h"
#endif
-#if CONFIG_DAALA_TX
-#include "av1/common/daala_inv_txfm.h"
-#endif
-
#if CONFIG_CFL
#include "av1/common/cfl.h"
#endif
@@ -95,11 +91,11 @@
# Inverse dct
#
add_proto qw/void av1_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
-if (aom_config("CONFIG_DAALA_TX4") ne "yes") {
+{
specialize qw/av1_iht4x4_16_add sse2/;
}
-if (aom_config("CONFIG_DAALA_TX") ne "yes") {
+{
add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
specialize qw/av1_iht4x8_32_add sse2/;
@@ -127,13 +123,13 @@
add_proto qw/void av1_iht32x8_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
add_proto qw/void av1_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
- if (aom_config("CONFIG_DAALA_TX8") ne "yes") {
+ {
specialize qw/av1_iht8x8_64_add sse2/;
}
add_proto qw/void av1_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, const struct txfm_param *param";
- if (aom_config("CONFIG_DAALA_TX16") ne "yes") {
+ {
specialize qw/av1_iht16x16_256_add sse2 avx2/;
}
@@ -247,19 +243,19 @@
add_proto qw/void av1_inv_txfm2d_add_16x32/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
add_proto qw/void av1_inv_txfm2d_add_32x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
add_proto qw/void av1_inv_txfm2d_add_4x4/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
-if (aom_config("CONFIG_DAALA_TX4") ne "yes") {
+{
specialize qw/av1_inv_txfm2d_add_4x4 sse4_1/;
}
add_proto qw/void av1_inv_txfm2d_add_8x8/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
-if (aom_config("CONFIG_DAALA_TX8") ne "yes") {
+{
specialize qw/av1_inv_txfm2d_add_8x8 sse4_1/;
}
add_proto qw/void av1_inv_txfm2d_add_16x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
-if (aom_config("CONFIG_DAALA_TX16") ne "yes") {
+{
specialize qw/av1_inv_txfm2d_add_16x16 sse4_1/;
}
add_proto qw/void av1_inv_txfm2d_add_32x32/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
-if (aom_config("CONFIG_DAALA_TX32") ne "yes") {
+{
specialize qw/av1_inv_txfm2d_add_32x32 avx2/;
}
if (aom_config("CONFIG_TX64X64") eq "yes") {
@@ -300,24 +296,24 @@
# fdct functions
add_proto qw/void av1_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param";
- if (aom_config("CONFIG_DAALA_TX4") ne "yes") {
+ {
specialize qw/av1_fht4x4 sse2/;
}
add_proto qw/void av1_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
add_proto qw/void av1_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param";
- if (aom_config("CONFIG_DAALA_TX8") ne "yes") {
+ {
specialize qw/av1_fht8x8 sse2/;
}
add_proto qw/void av1_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param";
- if (aom_config("CONFIG_DAALA_TX16") ne "yes") {
+ {
specialize qw/av1_fht16x16 sse2 avx2/;
}
add_proto qw/void av1_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param";
- if (aom_config("CONFIG_DAALA_TX32") ne "yes") {
+ {
specialize qw/av1_fht32x32 sse2 avx2/;
}
@@ -370,19 +366,19 @@
add_proto qw/void av1_fwd_txfm2d_8x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
add_proto qw/void av1_fwd_txfm2d_32x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
add_proto qw/void av1_fwd_txfm2d_4x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
- if (aom_config("CONFIG_DAALA_TX4") ne "yes") {
+ {
specialize qw/av1_fwd_txfm2d_4x4 sse4_1/;
}
add_proto qw/void av1_fwd_txfm2d_8x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
- if (aom_config("CONFIG_DAALA_TX8") ne "yes") {
+ {
specialize qw/av1_fwd_txfm2d_8x8 sse4_1/;
}
add_proto qw/void av1_fwd_txfm2d_16x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
- if (aom_config("CONFIG_DAALA_TX16") ne "yes") {
+ {
specialize qw/av1_fwd_txfm2d_16x16 sse4_1/;
}
add_proto qw/void av1_fwd_txfm2d_32x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
- if (aom_config("CONFIG_DAALA_TX32") ne "yes") {
+ {
specialize qw/av1_fwd_txfm2d_32x32 sse4_1/;
}
@@ -433,7 +429,7 @@
}
add_proto qw/int64_t av1_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
- if (aom_config("CONFIG_DAALA_TX") ne "yes") {
+ {
specialize qw/av1_highbd_block_error sse2/;
}
@@ -579,12 +575,6 @@
}
-# DAALA_TX functions
-if (aom_config("CONFIG_DAALA_TX") eq "yes") {
- add_proto qw/void daala_inv_txfm_add/, "const tran_low_t *input_coeffs, void *output_pixels, int output_stride, TxfmParam *txfm_param";
- specialize qw/daala_inv_txfm_add avx2/;
-}
-
# CFL
if (aom_config("CONFIG_CFL") eq "yes") {
add_proto qw/void av1_cfl_subtract/, "int16_t *pred_buf_q3, int width, int height, int16_t avg_q3";
diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index 2a82927..0cd4ec4 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -765,16 +765,6 @@
#endif // USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4
if (use_reduced_set)
return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DTT4_IDTX;
-#if CONFIG_DAALA_TX_DST32
- if (tx_size_sqr_up > TX_32X32)
- return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DCTONLY;
- if (is_inter)
- return (tx_size_sqr >= TX_16X16 ? EXT_TX_SET_DTT9_IDTX_1DDCT
- : EXT_TX_SET_ALL16);
- else
- return (tx_size_sqr >= TX_16X16 ? EXT_TX_SET_DTT4_IDTX
- : EXT_TX_SET_DTT4_IDTX_1DDCT);
-#endif
if (tx_size_sqr_up == TX_32X32)
return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DCTONLY;
if (is_inter)
@@ -970,12 +960,8 @@
if (is_inter_block(mbmi) && !av1_ext_tx_used[tx_set_type][mbmi->tx_type])
return DCT_DCT;
-#if CONFIG_DAALA_TX_DST32
- if (xd->lossless[mbmi->segment_id] || txsize_sqr_map[tx_size] > TX_32X32)
-#else
if (xd->lossless[mbmi->segment_id] || txsize_sqr_map[tx_size] > TX_32X32 ||
(txsize_sqr_map[tx_size] >= TX_32X32 && !is_inter_block(mbmi)))
-#endif
return DCT_DCT;
if (plane_type == PLANE_TYPE_Y) {
return mbmi->tx_type;
@@ -1345,7 +1331,7 @@
if (tx_size == TX_16X64 || tx_size == TX_64X16) {
return 512;
}
-#endif // CONFIG_TX64X64 && !CONFIG_DAALA_TX
+#endif // CONFIG_TX64X64
return tx_size_2d[tx_size];
}
diff --git a/av1/common/daala_inv_txfm.c b/av1/common/daala_inv_txfm.c
deleted file mode 100644
index 04092e0..0000000
--- a/av1/common/daala_inv_txfm.c
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "./av1_rtcd.h"
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
-#include "av1/common/daala_tx.h"
-#include "av1/common/daala_inv_txfm.h"
-#include "av1/common/idct.h"
-
-#if CONFIG_DAALA_TX
-
-// Complete Daala TX map, sans lossless which is special cased
-typedef void (*daala_itx)(od_coeff *, int, const od_coeff[]);
-
-static daala_itx tx_map[TX_SIZES][TX_TYPES] = {
- // 4-point transforms
- { od_bin_idct4, od_bin_idst4, od_bin_idst4, od_bin_iidtx4 },
-
- // 8-point transforms
- { od_bin_idct8, od_bin_idst8, od_bin_idst8, od_bin_iidtx8 },
-
- // 16-point transforms
- { od_bin_idct16, od_bin_idst16, od_bin_idst16, od_bin_iidtx16 },
-
- // 32-point transforms
- { od_bin_idct32, od_bin_idst32, od_bin_idst32, od_bin_iidtx32 },
-
-#if CONFIG_TX64X64
- // 64-point transforms
- { od_bin_idct64, NULL, NULL, od_bin_iidtx64 },
-#endif
-};
-
-static int tx_flip(TX_TYPE_1D t) { return t == FLIPADST_1D; }
-
-// Daala TX toplevel inverse entry point. This same function is
-// intended for both low and high bitdepth cases with a tran_low_t of
-// 32 bits (matching od_coeff), and a passed-in pixel buffer of either
-// bytes (hbd=0) or shorts (hbd=1).
-void daala_inv_txfm_add_c(const tran_low_t *input_coeffs, void *output_pixels,
- int output_stride, TxfmParam *txfm_param) {
- const TX_SIZE tx_size = txfm_param->tx_size;
- const TX_TYPE tx_type = txfm_param->tx_type;
- const int px_depth = txfm_param->bd;
- assert(tx_size <= TX_SIZES_ALL);
- assert(tx_type <= TX_TYPES);
-
- if (txfm_param->lossless) {
- // Transform function special-cased for lossless
- assert(tx_type == DCT_DCT);
- assert(tx_size == TX_4X4);
- if (txfm_param->is_hbd)
- // Note that the output pointer in the prototype is uint8, but the
- // function converts to short internally
- av1_highbd_iwht4x4_add(input_coeffs, output_pixels, output_stride,
- txfm_param->eob, px_depth);
- else
- av1_iwht4x4_add(input_coeffs, output_pixels, output_stride, txfm_param);
- } else {
- // General TX case
- const int downshift = TX_COEFF_DEPTH - px_depth;
- assert(downshift >= 0);
- assert(sizeof(tran_low_t) == sizeof(od_coeff));
- assert(sizeof(tran_low_t) >= 4);
-
- // Hook into existing map translation infrastructure to select
- // appropriate TX functions
- const int cols = tx_size_wide[tx_size];
- const int rows = tx_size_high[tx_size];
- const TX_SIZE col_idx = txsize_vert_map[tx_size];
- const TX_SIZE row_idx = txsize_horz_map[tx_size];
- assert(col_idx <= TX_SIZES);
- assert(row_idx <= TX_SIZES);
- assert(vtx_tab[tx_type] <= (int)TX_TYPES_1D);
- assert(htx_tab[tx_type] <= (int)TX_TYPES_1D);
- daala_itx col_tx = tx_map[col_idx][vtx_tab[tx_type]];
- daala_itx row_tx = tx_map[row_idx][htx_tab[tx_type]];
- int col_flip = tx_flip(vtx_tab[tx_type]);
- int row_flip = tx_flip(htx_tab[tx_type]);
- od_coeff tmpsq[MAX_TX_SQUARE];
-#if CONFIG_TX64X64
- tran_low_t pad_input[MAX_TX_SQUARE];
-#endif
- int r;
- int c;
-
- assert(col_tx);
- assert(row_tx);
-
-#if CONFIG_TX64X64
- if (rows > 32 || cols > 32) {
- int avail_rows;
- int avail_cols;
- // TODO(urvang): Can the same array be reused, instead of using a new
- // array?
- // Remap 32x32 input into a modified input by:
- // - Copying over these values in top-left 32x32 locations.
- // - Setting the rest of the locations to 0.
- avail_rows = AOMMIN(rows, 32);
- avail_cols = AOMMIN(cols, 32);
- for (r = 0; r < avail_rows; r++) {
- memcpy(pad_input + r * cols, input_coeffs + r * avail_cols,
- avail_cols * sizeof(*pad_input));
- if (cols > avail_cols) {
- memset(pad_input + r * cols + avail_cols, 0,
- (cols - avail_cols) * sizeof(*pad_input));
- }
- }
- if (rows > avail_rows) {
- memset(pad_input + avail_rows * cols, 0,
- (rows - avail_rows) * cols * sizeof(*pad_input));
- }
- input_coeffs = pad_input;
- }
-#endif
-
- // Inverse-transform rows
- for (r = 0; r < rows; ++r) {
- // The output addressing transposes
- if (row_flip)
- row_tx(tmpsq + r + (rows * cols) - rows, -rows,
- input_coeffs + r * cols);
- else
- row_tx(tmpsq + r, rows, input_coeffs + r * cols);
- }
-
- // Inverse-transform columns
- for (c = 0; c < cols; ++c) {
- // Above transposed, so our cols are now rows
- if (col_flip)
- col_tx(tmpsq + c * rows + rows - 1, -1, tmpsq + c * rows);
- else
- col_tx(tmpsq + c * rows, 1, tmpsq + c * rows);
- }
-
- // Sum with destination according to bit depth
- // The tmpsq array is currently transposed relative to output
- if (txfm_param->is_hbd) {
- // Destination array is shorts
- uint16_t *out16 = CONVERT_TO_SHORTPTR(output_pixels);
- for (r = 0; r < rows; ++r)
- for (c = 0; c < cols; ++c)
- out16[r * output_stride + c] = highbd_clip_pixel_add(
- out16[r * output_stride + c],
- (tmpsq[c * rows + r] + (1 << downshift >> 1)) >> downshift,
- px_depth);
- } else {
- // Destination array is bytes
- uint8_t *out8 = (uint8_t *)output_pixels;
- for (r = 0; r < rows; ++r)
- for (c = 0; c < cols; ++c)
- out8[r * output_stride + c] = clip_pixel_add(
- out8[r * output_stride + c],
- (tmpsq[c * rows + r] + (1 << downshift >> 1)) >> downshift);
- }
- }
-}
-
-#endif
diff --git a/av1/common/daala_inv_txfm.h b/av1/common/daala_inv_txfm.h
deleted file mode 100644
index 3e0df30..0000000
--- a/av1/common/daala_inv_txfm.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AV1_ENCODER_DAALA_INV_TXFM_H_
-#define AV1_ENCODER_DAALA_INV_TXFM_H_
-
-#include "./aom_config.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void daala_inv_txfm_add_c(const tran_low_t *input_coeffs, void *output_pixels,
- int output_stride, TxfmParam *txfm_param);
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // AV1_ENCODER_DAALA_INV_TXFM_H_
diff --git a/av1/common/daala_tx.c b/av1/common/daala_tx.c
deleted file mode 100644
index 854011b..0000000
--- a/av1/common/daala_tx.c
+++ /dev/null
@@ -1,5527 +0,0 @@
-#include "av1/common/daala_tx.h"
-#include "av1/common/odintrin.h"
-#include "av1/common/daala_tx_kernels.h"
-
-/* clang-format off */
-
-#define OD_RSHIFT1(_a) (((_a) + ((_a) < 0)) >> 1)
-#define OD_PAVG(_a, _b) (((_a) + (_b) + 1) >> 1)
-
-/* TODO: Daala DCT overflow checks need to be ported as a later test */
-# if defined(OD_DCT_CHECK_OVERFLOW)
-# else
-# define OD_DCT_OVERFLOW_CHECK(val, scale, offset, idx)
-# endif
-
-#define OD_FDCT_2_PR(p0, p1) \
- /* Embedded 2-point orthonormal Type-II fDCT. */ \
- do { \
- /* 13573/32768 ~= Tan[pi/8] ~= 0.414213562373095 */ \
- OD_DCT_OVERFLOW_CHECK(p1, 13573, 16384, 100); \
- p0 -= (p1*13573 + 16384) >> 15; \
- /* 5793/8192 ~= Sin[pi/4] ~= 0.707106781186547 */ \
- OD_DCT_OVERFLOW_CHECK(p0, 5793, 4096, 101); \
- p1 += (p0*5793 + 4096) >> 13; \
- /* 3393/8192 ~= Tan[pi/8] ~= 0.414213562373095 */ \
- OD_DCT_OVERFLOW_CHECK(p1, 3393, 4096, 102); \
- p0 -= (p1*3393 + 4096) >> 13; \
- } \
- while (0)
-
-#define OD_IDCT_2_PR(p0, p1) \
- /* Embedded 2-point orthonormal Type-II iDCT. */ \
- do { \
- /* 3393/8192 ~= Tan[pi/8] ~= 0.414213562373095 */ \
- p0 += (p1*3393 + 4096) >> 13; \
- /* 5793/8192 ~= Sin[pi/4] ~= 0.707106781186547 */ \
- p1 -= (p0*5793 + 4096) >> 13; \
- /* 13573/32768 ~= Tan[pi/8] ~= 0.414213562373095 */ \
- p0 += (p1*13573 + 16384) >> 15; \
- } \
- while (0)
-
-#define OD_FDCT_2_ASYM_PR(p0, p1, p1h) \
- /* Embedded 2-point asymmetric Type-II fDCT. */ \
- do { \
- p0 += p1h; \
- p1 = p0 - p1; \
- } \
- while (0)
-
-#define OD_IDCT_2_ASYM_PR(p0, p1, p1h) \
- /* Embedded 2-point asymmetric Type-II iDCT. */ \
- do { \
- p1 = p0 - p1; \
- p1h = OD_RSHIFT1(p1); \
- p0 -= p1h; \
- } \
- while (0)
-
-#define OD_FDST_2_PR(p0, p1) \
- /* Embedded 2-point orthonormal Type-IV fDST. */ \
- do { \
- /* 10947/16384 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
- OD_DCT_OVERFLOW_CHECK(p1, 10947, 8192, 103); \
- p0 -= (p1*10947 + 8192) >> 14; \
- /* 473/512 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
- OD_DCT_OVERFLOW_CHECK(p0, 473, 256, 104); \
- p1 += (p0*473 + 256) >> 9; \
- /* 10947/16384 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
- OD_DCT_OVERFLOW_CHECK(p1, 10947, 8192, 105); \
- p0 -= (p1*10947 + 8192) >> 14; \
- } \
- while (0)
-
-#define OD_IDST_2_PR(p0, p1) \
- /* Embedded 2-point orthonormal Type-IV iDST. */ \
- do { \
- /* 10947/16384 ~= Tan[3*Pi/16]) ~= 0.668178637919299 */ \
- p0 += (p1*10947 + 8192) >> 14; \
- /* 473/512 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
- p1 -= (p0*473 + 256) >> 9; \
- /* 10947/16384 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
- p0 += (p1*10947 + 8192) >> 14; \
- } \
- while (0)
-
-#define OD_FDST_2_ASYM_PR(p0, p1) \
- /* Embedded 2-point asymmetric Type-IV fDST. */ \
- do { \
- /* 11507/16384 ~= 4*Sin[Pi/8] - 2*Tan[Pi/8] ~= 0.702306604714169 */ \
- OD_DCT_OVERFLOW_CHECK(p1, 11507, 8192, 187); \
- p0 -= (p1*11507 + 8192) >> 14; \
- /* 669/1024 ~= Cos[Pi/8]/Sqrt[2] ~= 0.653281482438188 */ \
- OD_DCT_OVERFLOW_CHECK(p0, 669, 512, 188); \
- p1 += (p0*669 + 512) >> 10; \
- /* 4573/4096 ~= 4*Sin[Pi/8] - Tan[Pi/8] ~= 1.11652016708726 */ \
- OD_DCT_OVERFLOW_CHECK(p1, 4573, 2048, 189); \
- p0 -= (p1*4573 + 2048) >> 12; \
- } \
- while (0)
-
-#define OD_IDST_2_ASYM_PR(p0, p1) \
- /* Embedded 2-point asymmetric Type-IV iDST. */ \
- do { \
- /* 4573/4096 ~= 4*Sin[Pi/8] - Tan[Pi/8] ~= 1.11652016708726 */ \
- p0 += (p1*4573 + 2048) >> 12; \
- /* 669/1024 ~= Cos[Pi/8]/Sqrt[2] ~= 0.653281482438188 */ \
- p1 -= (p0*669 + 512) >> 10; \
- /* 11507/16384 ~= 4*Sin[Pi/8] - 2*Tan[Pi/8] ~= 0.702306604714169 */ \
- p0 += (p1*11507 + 8192) >> 14; \
- } \
- while (0)
-
-#define OD_FDCT_4_PR(q0, q2, q1, q3) \
- /* Embedded 4-point orthonormal Type-II fDCT. */ \
- do { \
- int q2h; \
- int q3h; \
- q3 = q0 - q3; \
- q3h = OD_RSHIFT1(q3); \
- q0 -= q3h; \
- q2 += q1; \
- q2h = OD_RSHIFT1(q2); \
- q1 = q2h - q1; \
- OD_FDCT_2_ASYM_PR(q0, q2, q2h); \
- OD_FDST_2_ASYM_PR(q3, q1); \
- } \
- while (0)
-
-#define OD_IDCT_4_PR(q0, q2, q1, q3) \
- /* Embedded 4-point orthonormal Type-II iDCT. */ \
- do { \
- int q1h; \
- int q3h; \
- OD_IDST_2_ASYM_PR(q3, q2); \
- OD_IDCT_2_ASYM_PR(q0, q1, q1h); \
- q3h = OD_RSHIFT1(q3); \
- q0 += q3h; \
- q3 = q0 - q3; \
- q2 = q1h - q2; \
- q1 -= q2; \
- } \
- while (0)
-
-#define OD_FDCT_4_ASYM_PR(q0, q2, q2h, q1, q3, q3h) \
- /* Embedded 4-point asymmetric Type-II fDCT. */ \
- do { \
- q0 += q3h; \
- q3 = q0 - q3; \
- q1 = q2h - q1; \
- q2 = q1 - q2; \
- OD_FDCT_2_PR(q0, q2); \
- OD_FDST_2_PR(q3, q1); \
- } \
- while (0)
-
-#define OD_IDCT_4_ASYM_PR(q0, q2, q1, q1h, q3, q3h) \
- /* Embedded 4-point asymmetric Type-II iDCT. */ \
- do { \
- OD_IDST_2_PR(q3, q2); \
- OD_IDCT_2_PR(q0, q1); \
- q1 = q2 - q1; \
- q1h = OD_RSHIFT1(q1); \
- q2 = q1h - q2; \
- q3 = q0 - q3; \
- q3h = OD_RSHIFT1(q3); \
- q0 -= q3h; \
- } \
- while (0)
-
-#define OD_FDST_4_PR(q0, q2, q1, q3) \
- /* Embedded 4-point orthonormal Type-IV fDST. */ \
- do { \
- int q0h; \
- int q1h; \
- /* 13573/32768 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
- OD_DCT_OVERFLOW_CHECK(q1, 13573, 16384, 190); \
- q2 += (q1*13573 + 16384) >> 15; \
- /* 5793/8192 ~= Sin[Pi/4] ~= 0.707106781186547 */ \
- OD_DCT_OVERFLOW_CHECK(q2, 5793, 4096, 191); \
- q1 -= (q2*5793 + 4096) >> 13; \
- /* 3393/8192 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
- OD_DCT_OVERFLOW_CHECK(q1, 3393, 4096, 192); \
- q2 += (q1*3393 + 4096) >> 13; \
- q0 += q2; \
- q0h = OD_RSHIFT1(q0); \
- q2 = q0h - q2; \
- q1 += q3; \
- q1h = OD_RSHIFT1(q1); \
- q3 -= q1h; \
- /* 537/1024 ~= (1/Sqrt[2] - Cos[3*Pi/16]/2)/Sin[3*Pi/16] ~=
- 0.524455699240090 */ \
- OD_DCT_OVERFLOW_CHECK(q1, 537, 512, 193); \
- q2 -= (q1*537 + 512) >> 10; \
- /* 1609/2048 ~= Sqrt[2]*Sin[3*Pi/16] ~= 0.785694958387102 */ \
- OD_DCT_OVERFLOW_CHECK(q2, 1609, 1024, 194); \
- q1 += (q2*1609 + 1024) >> 11; \
- /* 7335/32768 ~= (1/Sqrt[2] - Cos[3*Pi/16])/Sin[3*Pi/16] ~=
- 0.223847182092655 */ \
- OD_DCT_OVERFLOW_CHECK(q1, 7335, 16384, 195); \
- q2 += (q1*7335 + 16384) >> 15; \
- /* 5091/8192 ~= (1/Sqrt[2] - Cos[7*Pi/16]/2)/Sin[7*Pi/16] ~=
- 0.6215036383171189 */ \
- OD_DCT_OVERFLOW_CHECK(q0, 5091, 4096, 196); \
- q3 += (q0*5091 + 4096) >> 13; \
- /* 5681/4096 ~= Sqrt[2]*Sin[7*Pi/16] ~= 1.38703984532215 */ \
- OD_DCT_OVERFLOW_CHECK(q3, 5681, 2048, 197); \
- q0 -= (q3*5681 + 2048) >> 12; \
- /* 4277/8192 ~= (1/Sqrt[2] - Cos[7*Pi/16])/Sin[7*Pi/16] ~=
- 0.52204745462729 */ \
- OD_DCT_OVERFLOW_CHECK(q0, 4277, 4096, 198); \
- q3 += (q0*4277 + 4096) >> 13; \
- } \
- while (0)
-
-#define OD_IDST_4_PR(q0, q2, q1, q3) \
- /* Embedded 4-point orthonormal Type-IV iDST. */ \
- do { \
- int q0h; \
- int q2h; \
- /* 4277/8192 ~= (1/Sqrt[2] - Cos[7*Pi/16])/Sin[7*Pi/16] ~=
- 0.52204745462729 */ \
- q3 -= (q0*4277 + 4096) >> 13; \
- /* 5681/4096 ~= Sqrt[2]*Sin[7*Pi/16] ~= 1.38703984532215 */ \
- q0 += (q3*5681 + 2048) >> 12; \
- /* 5091/8192 ~= (1/Sqrt[2] - Cos[7*Pi/16]/2)/Sin[7*Pi/16] ~=
- 0.6215036383171189 */ \
- q3 -= (q0*5091 + 4096) >> 13; \
- /* 7335/32768 ~= (1/Sqrt[2] - Cos[3*Pi/16])/Sin[3*Pi/16] ~=
- 0.223847182092655 */ \
- q1 -= (q2*7335 + 16384) >> 15; \
- /* 1609/2048 ~= Sqrt[2]*Sin[3*Pi/16] ~= 0.785694958387102 */ \
- q2 -= (q1*1609 + 1024) >> 11; \
- /* 537/1024 ~= (1/Sqrt[2] - Cos[3*Pi/16]/2)/Sin[3*Pi/16] ~=
- 0.524455699240090 */ \
- q1 += (q2*537 + 512) >> 10; \
- q2h = OD_RSHIFT1(q2); \
- q3 += q2h; \
- q2 -= q3; \
- q0h = OD_RSHIFT1(q0); \
- q1 = q0h - q1; \
- q0 -= q1; \
- /* 3393/8192 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
- q1 -= (q2*3393 + 4096) >> 13; \
- /* 5793/8192 ~= Sin[Pi/4] ~= 0.707106781186547 */ \
- q2 += (q1*5793 + 4096) >> 13; \
- /* 13573/32768 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
- q1 -= (q2*13573 + 16384) >> 15; \
- } \
- while (0)
-
-#define OD_FDST_4_ASYM_PR(t0, t0h, t2, t1, t3) \
- /* Embedded 4-point asymmetric Type-IV fDST. */ \
- do { \
- /* 7489/8192 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
- OD_DCT_OVERFLOW_CHECK(t1, 7489, 4096, 106); \
- t2 -= (t1*7489 + 4096) >> 13; \
- /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
- OD_DCT_OVERFLOW_CHECK(t1, 11585, 8192, 107); \
- t1 += (t2*11585 + 8192) >> 14; \
- /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
- OD_DCT_OVERFLOW_CHECK(t1, 19195, 16384, 108); \
- t2 += (t1*19195 + 16384) >> 15; \
- t3 += OD_RSHIFT1(t2); \
- t2 -= t3; \
- t1 = t0h - t1; \
- t0 -= t1; \
- /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \
- OD_DCT_OVERFLOW_CHECK(t0, 6723, 4096, 109); \
- t3 += (t0*6723 + 4096) >> 13; \
- /* 8035/8192 ~= Sin[7*Pi/16] ~= 0.980785280403230 */ \
- OD_DCT_OVERFLOW_CHECK(t3, 8035, 4096, 110); \
- t0 -= (t3*8035 + 4096) >> 13; \
- /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \
- OD_DCT_OVERFLOW_CHECK(t0, 6723, 4096, 111); \
- t3 += (t0*6723 + 4096) >> 13; \
- /* 8757/16384 ~= Tan[5*Pi/32] ~= 0.534511135950792 */ \
- OD_DCT_OVERFLOW_CHECK(t1, 8757, 8192, 112); \
- t2 += (t1*8757 + 8192) >> 14; \
- /* 6811/8192 ~= Sin[5*Pi/16] ~= 0.831469612302545 */ \
- OD_DCT_OVERFLOW_CHECK(t2, 6811, 4096, 113); \
- t1 -= (t2*6811 + 4096) >> 13; \
- /* 8757/16384 ~= Tan[5*Pi/32] ~= 0.534511135950792 */ \
- OD_DCT_OVERFLOW_CHECK(t1, 8757, 8192, 114); \
- t2 += (t1*8757 + 8192) >> 14; \
- } \
- while (0)
-
-#define OD_IDST_4_ASYM_PR(t0, t0h, t2, t1, t3) \
- /* Embedded 4-point asymmetric Type-IV iDST. */ \
- do { \
- /* 8757/16384 ~= Tan[5*Pi/32] ~= 0.534511135950792 */ \
- t1 -= (t2*8757 + 8192) >> 14; \
- /* 6811/8192 ~= Sin[5*Pi/16] ~= 0.831469612302545 */ \
- t2 += (t1*6811 + 4096) >> 13; \
- /* 8757/16384 ~= Tan[5*Pi/32] ~= 0.534511135950792 */ \
- t1 -= (t2*8757 + 8192) >> 14; \
- /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \
- t3 -= (t0*6723 + 4096) >> 13; \
- /* 8035/8192 ~= Sin[7*Pi/16] ~= 0.980785280403230 */ \
- t0 += (t3*8035 + 4096) >> 13; \
- /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \
- t3 -= (t0*6723 + 4096) >> 13; \
- t0 += t2; \
- t0h = OD_RSHIFT1(t0); \
- t2 = t0h - t2; \
- t1 += t3; \
- t3 -= OD_RSHIFT1(t1); \
- /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
- t1 -= (t2*19195 + 16384) >> 15; \
- /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
- t2 -= (t1*11585 + 8192) >> 14; \
- /* 7489/8192 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
- t1 += (t2*7489 + 4096) >> 13; \
- } \
- while (0)
-
-#define OD_FDCT_8_PR(r0, r4, r2, r6, r1, r5, r3, r7) \
- /* Embedded 8-point orthonormal Type-II fDCT. */ \
- do { \
- int r4h; \
- int r5h; \
- int r6h; \
- int r7h; \
- r7 = r0 - r7; \
- r7h = OD_RSHIFT1(r7); \
- r0 -= r7h; \
- r6 += r1; \
- r6h = OD_RSHIFT1(r6); \
- r1 = r6h - r1; \
- r5 = r2 - r5; \
- r5h = OD_RSHIFT1(r5); \
- r2 -= r5h; \
- r4 += r3; \
- r4h = OD_RSHIFT1(r4); \
- r3 = r4h - r3; \
- OD_FDCT_4_ASYM_PR(r0, r4, r4h, r2, r6, r6h); \
- OD_FDST_4_ASYM_PR(r7, r7h, r3, r5, r1); \
- } \
- while (0)
-
-#define OD_IDCT_8_PR(r0, r4, r2, r6, r1, r5, r3, r7) \
- /* Embedded 8-point orthonormal Type-II iDCT. */ \
- do { \
- int r1h; \
- int r3h; \
- int r5h; \
- int r7h; \
- OD_IDST_4_ASYM_PR(r7, r7h, r5, r6, r4); \
- OD_IDCT_4_ASYM_PR(r0, r2, r1, r1h, r3, r3h); \
- r0 += r7h; \
- r7 = r0 - r7; \
- r6 = r1h - r6; \
- r1 -= r6; \
- r5h = OD_RSHIFT1(r5); \
- r2 += r5h; \
- r5 = r2 - r5; \
- r4 = r3h - r4; \
- r3 -= r4; \
- } \
- while (0)
-
-#define OD_FDCT_8_ASYM_PR(r0, r4, r4h, r2, r6, r6h, r1, r5, r5h, r3, r7, r7h) \
- /* Embedded 8-point asymmetric Type-II fDCT. */ \
- do { \
- r0 += r7h; \
- r7 = r0 - r7; \
- r1 = r6h - r1; \
- r6 -= r1; \
- r2 += r5h; \
- r5 = r2 - r5; \
- r3 = r4h - r3; \
- r4 -= r3; \
- OD_FDCT_4_PR(r0, r4, r2, r6); \
- OD_FDST_4_PR(r7, r3, r5, r1); \
- } \
- while (0)
-
-#define OD_IDCT_8_ASYM_PR(r0, r4, r2, r6, r1, r1h, r5, r5h, r3, r3h, r7, r7h) \
- /* Embedded 8-point asymmetric Type-II iDCT. */ \
- do { \
- OD_IDST_4_PR(r7, r5, r6, r4); \
- OD_IDCT_4_PR(r0, r2, r1, r3); \
- r7 = r0 - r7; \
- r7h = OD_RSHIFT1(r7); \
- r0 -= r7h; \
- r1 += r6; \
- r1h = OD_RSHIFT1(r1); \
- r6 = r1h - r6; \
- r5 = r2 - r5; \
- r5h = OD_RSHIFT1(r5); \
- r2 -= r5h; \
- r3 += r4; \
- r3h = OD_RSHIFT1(r3); \
- r4 = r3h - r4; \
- } \
- while (0)
-
-#define OD_FDST_8_PR(t0, t4, t2, t6, t1, t5, t3, t7) \
- /* Embedded 8-point orthonormal Type-IV fDST. */ \
- do { \
- int t0h; \
- int t2h; \
- int t5h; \
- int t7h; \
- /* 13573/32768 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
- OD_DCT_OVERFLOW_CHECK(t1, 13573, 16384, 115); \
- t6 -= (t1*13573 + 16384) >> 15; \
- /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186547 */ \
- OD_DCT_OVERFLOW_CHECK(t6, 11585, 8192, 116); \
- t1 += (t6*11585 + 8192) >> 14; \
- /* 13573/32768 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
- OD_DCT_OVERFLOW_CHECK(t1, 13573, 16384, 117); \
- t6 -= (t1*13573 + 16384) >> 15; \
- /* 21895/32768 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
- OD_DCT_OVERFLOW_CHECK(t2, 21895, 16384, 118); \
- t5 -= (t2*21895 + 16384) >> 15; \
- /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
- OD_DCT_OVERFLOW_CHECK(t5, 15137, 8192, 119); \
- t2 += (t5*15137 + 8192) >> 14; \
- /* 10947/16384 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
- OD_DCT_OVERFLOW_CHECK(t2, 10947, 8192, 120); \
- t5 -= (t2*10947 + 8192) >> 14; \
- /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \
- OD_DCT_OVERFLOW_CHECK(t3, 3259, 8192, 121); \
- t4 -= (t3*3259 + 8192) >> 14; \
- /* 3135/8192 ~= Sin[Pi/8] ~= 0.382683432365090 */ \
- OD_DCT_OVERFLOW_CHECK(t4, 3135, 4096, 122); \
- t3 += (t4*3135 + 4096) >> 13; \
- /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \
- OD_DCT_OVERFLOW_CHECK(t3, 3259, 8192, 123); \
- t4 -= (t3*3259 + 8192) >> 14; \
- t7 += t1; \
- t7h = OD_RSHIFT1(t7); \
- t1 -= t7h; \
- t2 = t3 - t2; \
- t2h = OD_RSHIFT1(t2); \
- t3 -= t2h; \
- t0 -= t6; \
- t0h = OD_RSHIFT1(t0); \
- t6 += t0h; \
- t5 = t4 - t5; \
- t5h = OD_RSHIFT1(t5); \
- t4 -= t5h; \
- t1 += t5h; \
- t5 = t1 - t5; \
- t4 += t0h; \
- t0 -= t4; \
- t6 -= t2h; \
- t2 += t6; \
- t3 -= t7h; \
- t7 += t3; \
- /* TODO: Can we move this into another operation */ \
- t7 = -t7; \
- /* 7425/8192 ~= Tan[15*Pi/64] ~= 0.906347169019147 */ \
- OD_DCT_OVERFLOW_CHECK(t7, 7425, 4096, 124); \
- t0 -= (t7*7425 + 4096) >> 13; \
- /* 8153/8192 ~= Sin[15*Pi/32] ~= 0.995184726672197 */ \
- OD_DCT_OVERFLOW_CHECK(t0, 8153, 4096, 125); \
- t7 += (t0*8153 + 4096) >> 13; \
- /* 7425/8192 ~= Tan[15*Pi/64] ~= 0.906347169019147 */ \
- OD_DCT_OVERFLOW_CHECK(t7, 7425, 4096, 126); \
- t0 -= (t7*7425 + 4096) >> 13; \
- /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.148335987538347 */ \
- OD_DCT_OVERFLOW_CHECK(t1, 4861, 16384, 127); \
- t6 -= (t1*4861 + 16384) >> 15; \
- /* 1189/4096 ~= Sin[3*Pi/32] ~= 0.290284677254462 */ \
- OD_DCT_OVERFLOW_CHECK(t6, 1189, 2048, 128); \
- t1 += (t6*1189 + 2048) >> 12; \
- /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.148335987538347 */ \
- OD_DCT_OVERFLOW_CHECK(t1, 4861, 16384, 129); \
- t6 -= (t1*4861 + 16384) >> 15; \
- /* 2455/4096 ~= Tan[11*Pi/64] ~= 0.599376933681924 */ \
- OD_DCT_OVERFLOW_CHECK(t5, 2455, 2048, 130); \
- t2 -= (t5*2455 + 2048) >> 12; \
- /* 7225/8192 ~= Sin[11*Pi/32] ~= 0.881921264348355 */ \
- OD_DCT_OVERFLOW_CHECK(t2, 7225, 4096, 131); \
- t5 += (t2*7225 + 4096) >> 13; \
- /* 2455/4096 ~= Tan[11*Pi/64] ~= 0.599376933681924 */ \
- OD_DCT_OVERFLOW_CHECK(t5, 2455, 2048, 132); \
- t2 -= (t5*2455 + 2048) >> 12; \
- /* 11725/32768 ~= Tan[7*Pi/64] ~= 0.357805721314524 */ \
- OD_DCT_OVERFLOW_CHECK(t3, 11725, 16384, 133); \
- t4 -= (t3*11725 + 16384) >> 15; \
- /* 5197/8192 ~= Sin[7*Pi/32] ~= 0.634393284163645 */ \
- OD_DCT_OVERFLOW_CHECK(t4, 5197, 4096, 134); \
- t3 += (t4*5197 + 4096) >> 13; \
- /* 11725/32768 ~= Tan[7*Pi/64] ~= 0.357805721314524 */ \
- OD_DCT_OVERFLOW_CHECK(t3, 11725, 16384, 135); \
- t4 -= (t3*11725 + 16384) >> 15; \
- } \
- while (0)
-
-#define OD_IDST_8_PR(t0, t4, t2, t6, t1, t5, t3, t7) \
- /* Embedded 8-point orthonormal Type-IV iDST. */ \
- do { \
- int t0h; \
- int t2h; \
- int t5h_; \
- int t7h_; \
- /* 11725/32768 ~= Tan[7*Pi/64] ~= 0.357805721314524 */ \
- t1 += (t6*11725 + 16384) >> 15; \
- /* 5197/8192 ~= Sin[7*Pi/32] ~= 0.634393284163645 */ \
- t6 -= (t1*5197 + 4096) >> 13; \
- /* 11725/32768 ~= Tan[7*Pi/64] ~= 0.357805721314524 */ \
- t1 += (t6*11725 + 16384) >> 15; \
- /* 2455/4096 ~= Tan[11*Pi/64] ~= 0.599376933681924 */ \
- t2 += (t5*2455 + 2048) >> 12; \
- /* 7225/8192 ~= Sin[11*Pi/32] ~= 0.881921264348355 */ \
- t5 -= (t2*7225 + 4096) >> 13; \
- /* 2455/4096 ~= Tan[11*Pi/64] ~= 0.599376933681924 */ \
- t2 += (t5*2455 + 2048) >> 12; \
- /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.148335987538347 */ \
- t3 += (t4*4861 + 16384) >> 15; \
- /* 1189/4096 ~= Sin[3*Pi/32] ~= 0.290284677254462 */ \
- t4 -= (t3*1189 + 2048) >> 12; \
- /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.148335987538347 */ \
- t3 += (t4*4861 + 16384) >> 15; \
- /* 7425/8192 ~= Tan[15*Pi/64] ~= 0.906347169019147 */ \
- t0 += (t7*7425 + 4096) >> 13; \
- /* 8153/8192 ~= Sin[15*Pi/32] ~= 0.995184726672197 */ \
- t7 -= (t0*8153 + 4096) >> 13; \
- /* 7425/8192 ~= Tan[15*Pi/64] ~= 0.906347169019147 */ \
- t0 += (t7*7425 + 4096) >> 13; \
- /* TODO: Can we move this into another operation */ \
- t7 = -t7; \
- t7 -= t6; \
- t7h_ = OD_RSHIFT1(t7); \
- t6 += t7h_; \
- t2 -= t3; \
- t2h = OD_RSHIFT1(t2); \
- t3 += t2h; \
- t0 += t1; \
- t0h = OD_RSHIFT1(t0); \
- t1 -= t0h; \
- t5 = t4 - t5; \
- t5h_ = OD_RSHIFT1(t5); \
- t4 -= t5h_; \
- t1 += t5h_; \
- t5 = t1 - t5; \
- t3 -= t0h; \
- t0 += t3; \
- t6 += t2h; \
- t2 = t6 - t2; \
- t4 += t7h_; \
- t7 -= t4; \
- /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \
- t1 += (t6*3259 + 8192) >> 14; \
- /* 3135/8192 ~= Sin[Pi/8] ~= 0.382683432365090 */ \
- t6 -= (t1*3135 + 4096) >> 13; \
- /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \
- t1 += (t6*3259 + 8192) >> 14; \
- /* 10947/16384 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
- t5 += (t2*10947 + 8192) >> 14; \
- /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
- t2 -= (t5*15137 + 8192) >> 14; \
- /* 21895/32768 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
- t5 += (t2*21895 + 16384) >> 15; \
- /* 13573/32768 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
- t3 += (t4*13573 + 16384) >> 15; \
- /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186547 */ \
- t4 -= (t3*11585 + 8192) >> 14; \
- /* 13573/32768 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
- t3 += (t4*13573 + 16384) >> 15; \
- } \
- while (0)
-
-/* Rewrite this so that t0h can be passed in. */
-#define OD_FDST_8_ASYM_PR(t0, t4, t2, t6, t1, t5, t3, t7) \
- /* Embedded 8-point asymmetric Type-IV fDST. */ \
- do { \
- int t0h; \
- int t2h; \
- int t5h; \
- int t7h; \
- /* 1035/2048 ~= (Sqrt[2] - Cos[7*Pi/32])/(2*Sin[7*Pi/32]) */ \
- OD_DCT_OVERFLOW_CHECK(t1, 1035, 1024, 199); \
- t6 += (t1*1035 + 1024) >> 11; \
- /* 3675/4096 ~= Sqrt[2]*Sin[7*Pi/32] */ \
- OD_DCT_OVERFLOW_CHECK(t6, 3675, 2048, 200); \
- t1 -= (t6*3675 + 2048) >> 12; \
- /* 851/8192 ~= (Cos[7*Pi/32] - 1/Sqrt[2])/Sin[7*Pi/32] */ \
- OD_DCT_OVERFLOW_CHECK(t1, 851, 4096, 201); \
- t6 -= (t1*851 + 4096) >> 13; \
- /* 4379/8192 ~= (Sqrt[2] - Sin[5*Pi/32])/(2*Cos[5*Pi/32]) */ \
- OD_DCT_OVERFLOW_CHECK(t2, 4379, 4096, 202); \
- t5 += (t2*4379 + 4096) >> 13; \
- /* 10217/8192 ~= Sqrt[2]*Cos[5*Pi/32] */ \
- OD_DCT_OVERFLOW_CHECK(t5, 10217, 4096, 203); \
- t2 -= (t5*10217 + 4096) >> 13; \
- /* 4379/16384 ~= (1/Sqrt[2] - Sin[5*Pi/32])/Cos[5*Pi/32] */ \
- OD_DCT_OVERFLOW_CHECK(t2, 4379, 8192, 204); \
- t5 += (t2*4379 + 8192) >> 14; \
- /* 12905/16384 ~= (Sqrt[2] - Cos[3*Pi/32])/(2*Sin[3*Pi/32]) */ \
- OD_DCT_OVERFLOW_CHECK(t3, 12905, 8192, 205); \
- t4 += (t3*12905 + 8192) >> 14; \
- /* 3363/8192 ~= Sqrt[2]*Sin[3*Pi/32] */ \
- OD_DCT_OVERFLOW_CHECK(t4, 3363, 4096, 206); \
- t3 -= (t4*3363 + 4096) >> 13; \
- /* 3525/4096 ~= (Cos[3*Pi/32] - 1/Sqrt[2])/Sin[3*Pi/32] */ \
- OD_DCT_OVERFLOW_CHECK(t3, 3525, 2048, 207); \
- t4 -= (t3*3525 + 2048) >> 12; \
- /* 5417/8192 ~= (Sqrt[2] - Sin[Pi/32])/(2*Cos[Pi/32]) */ \
- OD_DCT_OVERFLOW_CHECK(t0, 5417, 4096, 208); \
- t7 += (t0*5417 + 4096) >> 13; \
- /* 5765/4096 ~= Sqrt[2]*Cos[Pi/32] */ \
- OD_DCT_OVERFLOW_CHECK(t7, 5765, 2048, 209); \
- t0 -= (t7*5765 + 2048) >> 12; \
- /* 2507/4096 ~= (1/Sqrt[2] - Sin[Pi/32])/Cos[Pi/32] */ \
- OD_DCT_OVERFLOW_CHECK(t0, 2507, 2048, 210); \
- t7 += (t0*2507 + 2048) >> 12; \
- t0 += t1; \
- t0h = OD_RSHIFT1(t0); \
- t1 -= t0h; \
- t2 -= t3; \
- t2h = OD_RSHIFT1(t2); \
- t3 += t2h; \
- t5 -= t4; \
- t5h = OD_RSHIFT1(t5); \
- t4 += t5h; \
- t7 += t6; \
- t7h = OD_RSHIFT1(t7); \
- t6 = t7h - t6; \
- t4 = t7h - t4; \
- t7 -= t4; \
- t1 += t5h; \
- t5 = t1 - t5; \
- t6 += t2h; \
- t2 = t6 - t2; \
- t3 -= t0h; \
- t0 += t3; \
- /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \
- OD_DCT_OVERFLOW_CHECK(t6, 3259, 8192, 211); \
- t1 += (t6*3259 + 8192) >> 14; \
- /* 3135/8192 ~= Sin[Pi/8] ~= 0.382683432365090 */ \
- OD_DCT_OVERFLOW_CHECK(t1, 3135, 4096, 212); \
- t6 -= (t1*3135 + 4096) >> 13; \
- /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \
- OD_DCT_OVERFLOW_CHECK(t6, 3259, 8192, 213); \
- t1 += (t6*3259 + 8192) >> 14; \
- /* 2737/4096 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
- OD_DCT_OVERFLOW_CHECK(t2, 2737, 2048, 214); \
- t5 += (t2*2737 + 2048) >> 12; \
- /* 473/512 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
- OD_DCT_OVERFLOW_CHECK(t5, 473, 256, 215); \
- t2 -= (t5*473 + 256) >> 9; \
- /* 2737/4096 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
- OD_DCT_OVERFLOW_CHECK(t2, 2737, 2048, 216); \
- t5 += (t2*2737 + 2048) >> 12; \
- /* 3393/8192 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
- OD_DCT_OVERFLOW_CHECK(t4, 3393, 4096, 217); \
- t3 += (t4*3393 + 4096) >> 13; \
- /* 5793/8192 ~= Sin[Pi/4] ~= 0.707106781186547 */ \
- OD_DCT_OVERFLOW_CHECK(t3, 5793, 4096, 218); \
- t4 -= (t3*5793 + 4096) >> 13; \
- /* 3393/8192 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
- OD_DCT_OVERFLOW_CHECK(t4, 3393, 4096, 219); \
- t3 += (t4*3393 + 4096) >> 13; \
- } \
- while (0)
-
-#define OD_IDST_8_ASYM_PR(t0, t4, t2, t6, t1, t5, t3, t7) \
- /* Embedded 8-point asymmetric Type-IV iDST. */ \
- do { \
- int t0h; \
- int t2h; \
- int t5h__; \
- int t7h__; \
- /* 3393/8192 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
- t6 -= (t1*3393 + 4096) >> 13; \
- /* 5793/8192 ~= Sin[Pi/4] ~= 0.707106781186547 */ \
- t1 += (t6*5793 + 4096) >> 13; \
- /* 3393/8192 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
- t6 -= (t1*3393 + 4096) >> 13; \
- /* 2737/4096 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
- t5 -= (t2*2737 + 2048) >> 12; \
- /* 473/512 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
- t2 += (t5*473 + 256) >> 9; \
- /* 2737/4096 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
- t5 -= (t2*2737 + 2048) >> 12; \
- /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \
- t4 -= (t3*3259 + 8192) >> 14; \
- /* 3135/8192 ~= Sin[Pi/8] ~= 0.382683432365090 */ \
- t3 += (t4*3135 + 4096) >> 13; \
- /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \
- t4 -= (t3*3259 + 8192) >> 14; \
- t0 -= t6; \
- t0h = OD_RSHIFT1(t0); \
- t6 += t0h; \
- t2 = t3 - t2; \
- t2h = OD_RSHIFT1(t2); \
- t3 -= t2h; \
- t5 = t4 - t5; \
- t5h__ = OD_RSHIFT1(t5); \
- t4 -= t5h__; \
- t7 += t1; \
- t7h__ = OD_RSHIFT1(t7); \
- t1 = t7h__ - t1; \
- t3 = t7h__ - t3; \
- t7 -= t3; \
- t1 -= t5h__; \
- t5 += t1; \
- t6 -= t2h; \
- t2 += t6; \
- t4 += t0h; \
- t0 -= t4; \
- /* 2507/4096 ~= (1/Sqrt[2] - Sin[Pi/32])/Cos[Pi/32] */ \
- t7 -= (t0*2507 + 2048) >> 12; \
- /* 5765/4096 ~= Sqrt[2]*Cos[Pi/32] */ \
- t0 += (t7*5765 + 2048) >> 12; \
- /* 5417/8192 ~= (Sqrt[2] - Sin[Pi/32])/(2*Cos[Pi/32]) */ \
- t7 -= (t0*5417 + 4096) >> 13; \
- /* 3525/4096 ~= (Cos[3*Pi/32] - 1/Sqrt[2])/Sin[3*Pi/32] */ \
- t1 += (t6*3525 + 2048) >> 12; \
- /* 3363/8192 ~= Sqrt[2]*Sin[3*Pi/32] */ \
- t6 += (t1*3363 + 4096) >> 13; \
- /* 12905/16384 ~= (1/Sqrt[2] - Cos[3*Pi/32]/1)/Sin[3*Pi/32] */ \
- t1 -= (t6*12905 + 8192) >> 14; \
- /* 4379/16384 ~= (1/Sqrt[2] - Sin[5*Pi/32])/Cos[5*Pi/32] */ \
- t5 -= (t2*4379 + 8192) >> 14; \
- /* 10217/8192 ~= Sqrt[2]*Cos[5*Pi/32] */ \
- t2 += (t5*10217 + 4096) >> 13; \
- /* 4379/8192 ~= (Sqrt[2] - Sin[5*Pi/32])/(2*Cos[5*Pi/32]) */ \
- t5 -= (t2*4379 + 4096) >> 13; \
- /* 851/8192 ~= (Cos[7*Pi/32] - 1/Sqrt[2])/Sin[7*Pi/32] */ \
- t3 += (t4*851 + 4096) >> 13; \
- /* 3675/4096 ~= Sqrt[2]*Sin[7*Pi/32] */ \
- t4 += (t3*3675 + 2048) >> 12; \
- /* 1035/2048 ~= (Sqrt[2] - Cos[7*Pi/32])/(2*Sin[7*Pi/32]) */ \
- t3 -= (t4*1035 + 1024) >> 11; \
- } \
- while (0)
-
-#define OD_FDCT_16_PR(s0, s8, s4, sc, s2, sa, s6, se, \
- s1, s9, s5, sd, s3, sb, s7, sf) \
- /* Embedded 16-point orthonormal Type-II fDCT. */ \
- do { \
- int s8h; \
- int sah; \
- int sch; \
- int seh; \
- int sfh; \
- sf = s0 - sf; \
- sfh = OD_RSHIFT1(sf); \
- s0 -= sfh; \
- se += s1; \
- seh = OD_RSHIFT1(se); \
- s1 = seh - s1; \
- sd = s2 - sd; \
- s2 -= OD_RSHIFT1(sd); \
- sc += s3; \
- sch = OD_RSHIFT1(sc); \
- s3 = sch - s3; \
- sb = s4 - sb; \
- s4 -= OD_RSHIFT1(sb); \
- sa += s5; \
- sah = OD_RSHIFT1(sa); \
- s5 = sah - s5; \
- s9 = s6 - s9; \
- s6 -= OD_RSHIFT1(s9); \
- s8 += s7; \
- s8h = OD_RSHIFT1(s8); \
- s7 = s8h - s7; \
- OD_FDCT_8_ASYM_PR(s0, s8, s8h, s4, sc, sch, s2, sa, sah, s6, se, seh); \
- OD_FDST_8_ASYM_PR(sf, s7, sb, s3, sd, s5, s9, s1); \
- } \
- while (0)
-
-#define OD_IDCT_16_PR(s0, s8, s4, sc, s2, sa, s6, se, \
- s1, s9, s5, sd, s3, sb, s7, sf) \
- /* Embedded 16-point orthonormal Type-II iDCT. */ \
- do { \
- int s1h; \
- int s3h; \
- int s5h; \
- int s7h; \
- int sfh; \
- OD_IDST_8_ASYM_PR(sf, sb, sd, s9, se, sa, sc, s8); \
- OD_IDCT_8_ASYM_PR(s0, s4, s2, s6, s1, s1h, s5, s5h, s3, s3h, s7, s7h); \
- sfh = OD_RSHIFT1(sf); \
- s0 += sfh; \
- sf = s0 - sf; \
- se = s1h - se; \
- s1 -= se; \
- s2 += OD_RSHIFT1(sd); \
- sd = s2 - sd; \
- sc = s3h - sc; \
- s3 -= sc; \
- s4 += OD_RSHIFT1(sb); \
- sb = s4 - sb; \
- sa = s5h - sa; \
- s5 -= sa; \
- s6 += OD_RSHIFT1(s9); \
- s9 = s6 - s9; \
- s8 = s7h - s8; \
- s7 -= s8; \
- } \
- while (0)
-
-#define OD_FDCT_16_ASYM_PR(t0, t8, t8h, t4, tc, tch, t2, ta, tah, t6, te, teh, \
- t1, t9, t9h, t5, td, tdh, t3, tb, tbh, t7, tf, tfh) \
- /* Embedded 16-point asymmetric Type-II fDCT. */ \
- do { \
- t0 += tfh; \
- tf = t0 - tf; \
- t1 -= teh; \
- te += t1; \
- t2 += tdh; \
- td = t2 - td; \
- t3 -= tch; \
- tc += t3; \
- t4 += tbh; \
- tb = t4 - tb; \
- t5 -= tah; \
- ta += t5; \
- t6 += t9h; \
- t9 = t6 - t9; \
- t7 -= t8h; \
- t8 += t7; \
- OD_FDCT_8_PR(t0, t8, t4, tc, t2, ta, t6, te); \
- OD_FDST_8_PR(tf, t7, tb, t3, td, t5, t9, t1); \
- } \
- while (0)
-
-#define OD_IDCT_16_ASYM_PR(t0, t8, t4, tc, t2, ta, t6, te, \
- t1, t1h, t9, t9h, t5, t5h, td, tdh, t3, t3h, tb, tbh, t7, t7h, tf, tfh) \
- /* Embedded 16-point asymmetric Type-II iDCT. */ \
- do { \
- OD_IDST_8_PR(tf, tb, td, t9, te, ta, tc, t8); \
- OD_IDCT_8_PR(t0, t4, t2, t6, t1, t5, t3, t7); \
- t1 -= te; \
- t1h = OD_RSHIFT1(t1); \
- te += t1h; \
- t9 = t6 - t9; \
- t9h = OD_RSHIFT1(t9); \
- t6 -= t9h; \
- t5 -= ta; \
- t5h = OD_RSHIFT1(t5); \
- ta += t5h; \
- td = t2 - td; \
- tdh = OD_RSHIFT1(td); \
- t2 -= tdh; \
- t3 -= tc; \
- t3h = OD_RSHIFT1(t3); \
- tc += t3h; \
- tb = t4 - tb; \
- tbh = OD_RSHIFT1(tb); \
- t4 -= tbh; \
- t7 -= t8; \
- t7h = OD_RSHIFT1(t7); \
- t8 += t7h; \
- tf = t0 - tf; \
- tfh = OD_RSHIFT1(tf); \
- t0 -= tfh; \
- } \
- while (0)
-
-#define OD_FDST_16_PR(s0, s8, s4, sc, s2, sa, s6, se, \
- s1, s9, s5, sd, s3, sb, s7, sf) \
- /* Embedded 16-point orthonormal Type-IV fDST. */ \
- do { \
- int s0h; \
- int s2h; \
- int sdh; \
- int sfh; \
- /* 13573/32768 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
- OD_DCT_OVERFLOW_CHECK(s3, 13573, 16384, 220); \
- s1 += (se*13573 + 16384) >> 15; \
- /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186547 */ \
- OD_DCT_OVERFLOW_CHECK(s1, 11585, 8192, 221); \
- se -= (s1*11585 + 8192) >> 14; \
- /* 13573/32768 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
- OD_DCT_OVERFLOW_CHECK(s3, 13573, 16384, 222); \
- s1 += (se*13573 + 16384) >> 15; \
- /* 21895/32768 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
- OD_DCT_OVERFLOW_CHECK(s2, 21895, 16384, 223); \
- sd += (s2*21895 + 16384) >> 15; \
- /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
- OD_DCT_OVERFLOW_CHECK(sd, 15137, 16384, 224); \
- s2 -= (sd*15137 + 8192) >> 14; \
- /* 21895/32768 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
- OD_DCT_OVERFLOW_CHECK(s2, 21895, 16384, 225); \
- sd += (s2*21895 + 16384) >> 15; \
- /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \
- OD_DCT_OVERFLOW_CHECK(s3, 3259, 8192, 226); \
- sc += (s3*3259 + 8192) >> 14; \
- /* 3135/8192 ~= Sin[Pi/8] ~= 0.382683432365090 */ \
- OD_DCT_OVERFLOW_CHECK(sc, 3135, 4096, 227); \
- s3 -= (sc*3135 + 4096) >> 13; \
- /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \
- OD_DCT_OVERFLOW_CHECK(s3, 3259, 8192, 228); \
- sc += (s3*3259 + 8192) >> 14; \
- /* 13573/32768 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
- OD_DCT_OVERFLOW_CHECK(s5, 13573, 16384, 229); \
- sa += (s5*13573 + 16384) >> 15; \
- /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186547 */ \
- OD_DCT_OVERFLOW_CHECK(sa, 11585, 8192, 230); \
- s5 -= (sa*11585 + 8192) >> 14; \
- /* 13573/32768 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
- OD_DCT_OVERFLOW_CHECK(s5, 13573, 16384, 231); \
- sa += (s5*13573 + 16384) >> 15; \
- /* 13573/32768 ~= Tan[pi/8] ~= 0.414213562373095 */ \
- OD_DCT_OVERFLOW_CHECK(s9, 13573, 16384, 232); \
- s6 += (s9*13573 + 16384) >> 15; \
- /* 11585/16384 ~= Sin[pi/4] ~= 0.707106781186547 */ \
- OD_DCT_OVERFLOW_CHECK(s6, 11585, 8192, 233); \
- s9 -= (s6*11585 + 8192) >> 14; \
- /* 13573/32768 ~= Tan[pi/8] ~= 0.414213562373095 */ \
- OD_DCT_OVERFLOW_CHECK(s9, 13573, 16384, 234); \
- s6 += (s9*13573 + 16384) >> 15; \
- sf += se; \
- sfh = OD_RSHIFT1(sf); \
- se = sfh - se; \
- s0 += s1; \
- s0h = OD_RSHIFT1(s0); \
- s1 = s0h - s1; \
- s2 = s3 - s2; \
- s2h = OD_RSHIFT1(s2); \
- s3 -= s2h; \
- sd -= sc; \
- sdh = OD_RSHIFT1(sd); \
- sc += sdh; \
- sa = s4 - sa; \
- s4 -= OD_RSHIFT1(sa); \
- s5 += sb; \
- sb = OD_RSHIFT1(s5) - sb; \
- s8 += s6; \
- s6 -= OD_RSHIFT1(s8); \
- s7 = s9 - s7; \
- s9 -= OD_RSHIFT1(s7); \
- /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \
- OD_DCT_OVERFLOW_CHECK(sb, 6723, 4096, 235); \
- s4 += (sb*6723 + 4096) >> 13; \
- /* 16069/16384 ~= Sin[7*Pi/16] ~= 0.980785280403230 */ \
- OD_DCT_OVERFLOW_CHECK(s4, 16069, 8192, 236); \
- sb -= (s4*16069 + 8192) >> 14; \
- /* 6723/8192 ~= Tan[7*Pi/32]) ~= 0.820678790828660 */ \
- OD_DCT_OVERFLOW_CHECK(sb, 6723, 4096, 237); \
- s4 += (sb*6723 + 4096) >> 13; \
- /* 8757/16384 ~= Tan[5*Pi/32]) ~= 0.534511135950792 */ \
- OD_DCT_OVERFLOW_CHECK(s5, 8757, 8192, 238); \
- sa += (s5*8757 + 8192) >> 14; \
- /* 6811/8192 ~= Sin[5*Pi/16] ~= 0.831469612302545 */ \
- OD_DCT_OVERFLOW_CHECK(sa, 6811, 4096, 239); \
- s5 -= (sa*6811 + 4096) >> 13; \
- /* 8757/16384 ~= Tan[5*Pi/32] ~= 0.534511135950792 */ \
- OD_DCT_OVERFLOW_CHECK(s5, 8757, 8192, 240); \
- sa += (s5*8757 + 8192) >> 14; \
- /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \
- OD_DCT_OVERFLOW_CHECK(s9, 2485, 4096, 241); \
- s6 += (s9*2485 + 4096) >> 13; \
- /* 4551/8192 ~= Sin[3*Pi/16] ~= 0.555570233019602 */ \
- OD_DCT_OVERFLOW_CHECK(s6, 4551, 4096, 242); \
- s9 -= (s6*4551 + 4096) >> 13; \
- /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \
- OD_DCT_OVERFLOW_CHECK(s9, 2485, 4096, 243); \
- s6 += (s9*2485 + 4096) >> 13; \
- /* 3227/32768 ~= Tan[Pi/32] ~= 0.09849140335716425 */ \
- OD_DCT_OVERFLOW_CHECK(s8, 3227, 16384, 244); \
- s7 += (s8*3227 + 16384) >> 15; \
- /* 6393/32768 ~= Sin[Pi/16] ~= 0.19509032201612825 */ \
- OD_DCT_OVERFLOW_CHECK(s7, 6393, 16384, 245); \
- s8 -= (s7*6393 + 16384) >> 15; \
- /* 3227/32768 ~= Tan[Pi/32] ~= 0.09849140335716425 */ \
- OD_DCT_OVERFLOW_CHECK(s8, 3227, 16384, 246); \
- s7 += (s8*3227 + 16384) >> 15; \
- s1 -= s2h; \
- s2 += s1; \
- se += sdh; \
- sd = se - sd; \
- s3 += sfh; \
- sf -= s3; \
- sc = s0h - sc; \
- s0 -= sc; \
- sb += OD_RSHIFT1(s8); \
- s8 = sb - s8; \
- s4 += OD_RSHIFT1(s7); \
- s7 -= s4; \
- s6 += OD_RSHIFT1(s5); \
- s5 = s6 - s5; \
- s9 -= OD_RSHIFT1(sa); \
- sa += s9; \
- s8 += s0; \
- s0 -= OD_RSHIFT1(s8); \
- sf += s7; \
- s7 = OD_RSHIFT1(sf) - s7; \
- s1 -= s6; \
- s6 += OD_RSHIFT1(s1); \
- s9 += se; \
- se = OD_RSHIFT1(s9) - se; \
- s2 += sa; \
- sa = OD_RSHIFT1(s2) - sa; \
- s5 += sd; \
- sd -= OD_RSHIFT1(s5); \
- s4 = sc - s4; \
- sc -= OD_RSHIFT1(s4); \
- s3 -= sb; \
- sb += OD_RSHIFT1(s3); \
- /* 2799/4096 ~= (1/Sqrt[2] - Cos[31*Pi/64]/2)/Sin[31*Pi/64] */ \
- OD_DCT_OVERFLOW_CHECK(sf, 2799, 2048, 247); \
- s0 -= (sf*2799 + 2048) >> 12; \
- /* 2893/2048 ~= Sqrt[2]*Sin[31*Pi/64] */ \
- OD_DCT_OVERFLOW_CHECK(s0, 2893, 1024, 248); \
- sf += (s0*2893 + 1024) >> 11; \
- /* 5397/8192 ~= (Cos[Pi/4] - Cos[31*Pi/64])/Sin[31*Pi/64] */ \
- OD_DCT_OVERFLOW_CHECK(sf, 5397, 4096, 249); \
- s0 -= (sf*5397 + 4096) >> 13; \
- /* 41/64 ~= (1/Sqrt[2] - Cos[29*Pi/64]/2)/Sin[29*Pi/64] */ \
- OD_DCT_OVERFLOW_CHECK(s1, 41, 32, 250); \
- se += (s1*41 + 32) >> 6; \
- /* 2865/2048 ~= Sqrt[2]*Sin[29*Pi/64] */ \
- OD_DCT_OVERFLOW_CHECK(se, 2865, 1024, 251); \
- s1 -= (se*2865 + 1024) >> 11; \
- /* 4641/8192 ~= (1/Sqrt[2] - Cos[29*Pi/64])/Sin[29*Pi/64] */ \
- OD_DCT_OVERFLOW_CHECK(s1, 4641, 4096, 252); \
- se += (s1*4641 + 4096) >> 13; \
- /* 2473/4096 ~= (1/Sqrt[2] - Cos[27*Pi/64]/2)/Sin[27*Pi/64] */ \
- OD_DCT_OVERFLOW_CHECK(s2, 2473, 2048, 253); \
- sd += (s2*2473 + 2048) >> 12; \
- /* 5619/4096 ~= Sqrt[2]*Sin[27*Pi/64] */ \
- OD_DCT_OVERFLOW_CHECK(sd, 5619, 2048, 254); \
- s2 -= (sd*5619 + 2048) >> 12; \
- /* 7839/16384 ~= (1/Sqrt[2] - Cos[27*Pi/64])/Sin[27*Pi/64] */ \
- OD_DCT_OVERFLOW_CHECK(s2, 7839, 8192, 255); \
- sd += (s2*7839 + 8192) >> 14; \
- /* 5747/8192 ~= (1/Sqrt[2] - Cos[7*Pi/64]/2)/Sin[7*Pi/64] */ \
- OD_DCT_OVERFLOW_CHECK(s3, 5747, 4096, 256); \
- sc -= (s3*5747 + 4096) >> 13; \
- /* 3903/8192 ~= Sqrt[2]*Sin[7*Pi/64] ~= */ \
- OD_DCT_OVERFLOW_CHECK(sc, 3903, 4096, 257); \
- s3 += (sc*3903 + 4096) >> 13; \
- /* 5701/8192 ~= (1/Sqrt[2] - Cos[7*Pi/64])/Sin[7*Pi/64] */ \
- OD_DCT_OVERFLOW_CHECK(s3, 5701, 4096, 258); \
- sc += (s3*5701 + 4096) >> 13; \
- /* 4471/8192 ~= (1/Sqrt[2] - Cos[23*Pi/64]/2)/Sin[23*Pi/64] */ \
- OD_DCT_OVERFLOW_CHECK(s4, 4471, 4096, 259); \
- sb += (s4*4471 + 4096) >> 13; \
- /* 1309/1024 ~= Sqrt[2]*Sin[23*Pi/64] */ \
- OD_DCT_OVERFLOW_CHECK(sb, 1309, 512, 260); \
- s4 -= (sb*1309 + 512) >> 10; \
- /* 5067/16384 ~= (1/Sqrt[2] - Cos[23*Pi/64])/Sin[23*Pi/64] */ \
- OD_DCT_OVERFLOW_CHECK(s4, 5067, 8192, 261); \
- sb += (s4*5067 + 8192) >> 14; \
- /* 2217/4096 ~= (1/Sqrt[2] - Cos[11*Pi/64]/2)/Sin[11*Pi/64] */ \
- OD_DCT_OVERFLOW_CHECK(s5, 2217, 2048, 262); \
- sa -= (s5*2217 + 2048) >> 12; \
- /* 1489/2048 ~= Sqrt[2]*Sin[11*Pi/64] ~= 0.72705107329128 */ \
- OD_DCT_OVERFLOW_CHECK(sa, 1489, 1024, 263); \
- s5 += (sa*1489 + 1024) >> 11; \
- /* 75/256 ~= (1/Sqrt[2] - Cos[11*Pi/64])/Sin[11*Pi/64] */ \
- OD_DCT_OVERFLOW_CHECK(s5, 75, 128, 264); \
- sa += (s5*75 + 128) >> 8; \
- /* 2087/4096 ~= (1/Sqrt[2] - Cos[19*Pi/64]/2)/Sin[19*Pi/64] */ \
- OD_DCT_OVERFLOW_CHECK(s9, 2087, 2048, 265); \
- s6 -= (s9*2087 + 2048) >> 12; \
- /* 4653/4096 ~= Sqrt[2]*Sin[19*Pi/64] */ \
- OD_DCT_OVERFLOW_CHECK(s6, 4653, 2048, 266); \
- s9 += (s6*4653 + 2048) >> 12; \
- /* 4545/32768 ~= (1/Sqrt[2] - Cos[19*Pi/64])/Sin[19*Pi/64] */ \
- OD_DCT_OVERFLOW_CHECK(s9, 4545, 16384, 267); \
- s6 -= (s9*4545 + 16384) >> 15; \
- /* 2053/4096 ~= (1/Sqrt[2] - Cos[15*Pi/64]/2)/Sin[15*Pi/64] */ \
- OD_DCT_OVERFLOW_CHECK(s8, 2053, 2048, 268); \
- s7 += (s8*2053 + 2048) >> 12; \
- /* 1945/2048 ~= Sqrt[2]*Sin[15*Pi/64] */ \
- OD_DCT_OVERFLOW_CHECK(s7, 1945, 1024, 269); \
- s8 -= (s7*1945 + 1024) >> 11; \
- /* 1651/32768 ~= (1/Sqrt[2] - Cos[15*Pi/64])/Sin[15*Pi/64] */ \
- OD_DCT_OVERFLOW_CHECK(s8, 1651, 16384, 270); \
- s7 -= (s8*1651 + 16384) >> 15; \
- } \
- while (0)
-
-#define OD_IDST_16_PR(s0, s8, s4, sc, s2, sa, s6, se, \
- s1, s9, s5, sd, s3, sb, s7, sf) \
- /* Embedded 16-point orthonormal Type-IV iDST. */ \
- do { \
- int s0h; \
- int s4h; \
- int sbh; \
- int sfh; \
- /* 1651/32768 ~= (1/Sqrt[2] - Cos[15*Pi/64])/Sin[15*Pi/64] */ \
- se += (s1*1651 + 16384) >> 15; \
- /* 1945/2048 ~= Sqrt[2]*Sin[15*Pi/64] */ \
- s1 += (se*1945 + 1024) >> 11; \
- /* 2053/4096 ~= (1/Sqrt[2] - Cos[15*Pi/64]/2)/Sin[15*Pi/64] */ \
- se -= (s1*2053 + 2048) >> 12; \
- /* 4545/32768 ~= (1/Sqrt[2] - Cos[19*Pi/64])/Sin[19*Pi/64] */ \
- s6 += (s9*4545 + 16384) >> 15; \
- /* 4653/32768 ~= Sqrt[2]*Sin[19*Pi/64] */ \
- s9 -= (s6*4653 + 2048) >> 12; \
- /* 2087/4096 ~= (1/Sqrt[2] - Cos[19*Pi/64]/2)/Sin[19*Pi/64] */ \
- s6 += (s9*2087 + 2048) >> 12; \
- /* 75/256 ~= (1/Sqrt[2] - Cos[11*Pi/64])/Sin[11*Pi/64] */ \
- s5 -= (sa*75 + 128) >> 8; \
- /* 1489/2048 ~= Sqrt[2]*Sin[11*Pi/64] */ \
- sa -= (s5*1489 + 1024) >> 11; \
- /* 2217/4096 ~= (1/Sqrt[2] - Cos[11*Pi/64]/2)/Sin[11*Pi/64] */ \
- s5 += (sa*2217 + 2048) >> 12; \
- /* 5067/16384 ~= (1/Sqrt[2] - Cos[23*Pi/64])/Sin[23*Pi/64] */ \
- sd -= (s2*5067 + 8192) >> 14; \
- /* 1309/1024 ~= Sqrt[2]*Sin[23*Pi/64] */ \
- s2 += (sd*1309 + 512) >> 10; \
- /* 4471/8192 ~= (1/Sqrt[2] - Cos[23*Pi/64]/2)/Sin[23*Pi/64] */ \
- sd -= (s2*4471 + 4096) >> 13; \
- /* 5701/8192 ~= (1/Sqrt[2] - Cos[7*Pi/64])/Sin[7*Pi/64] */ \
- s3 -= (sc*5701 + 4096) >> 13; \
- /* 3903/8192 ~= Sqrt[2]*Sin[7*Pi/64] */ \
- sc -= (s3*3903 + 4096) >> 13; \
- /* 5747/8192 ~= (1/Sqrt[2] - Cos[7*Pi/64]/2)/Sin[7*Pi/64] */ \
- s3 += (sc*5747 + 4096) >> 13; \
- /* 7839/16384 ~= (1/Sqrt[2] - Cos[27*Pi/64])/Sin[27*Pi/64] */ \
- sb -= (s4*7839 + 8192) >> 14; \
- /* 5619/4096 ~= Sqrt[2]*Sin[27*Pi/64] */ \
- s4 += (sb*5619 + 2048) >> 12; \
- /* 2473/4096 ~= (1/Sqrt[2] - Cos[27*Pi/64]/2)/Sin[27*Pi/64] */ \
- sb -= (s4*2473 + 2048) >> 12; \
- /* 4641/8192 ~= (1/Sqrt[2] - Cos[29*Pi/64])/Sin[29*Pi/64] */ \
- s7 -= (s8*4641 + 4096) >> 13; \
- /* 2865/2048 ~= Sqrt[2]*Sin[29*Pi/64] */ \
- s8 += (s7*2865 + 1024) >> 11; \
- /* 41/64 ~= (1/Sqrt[2] - Cos[29*Pi/64]/2)/Sin[29*Pi/64] */ \
- s7 -= (s8*41 + 32) >> 6; \
- /* 5397/8192 ~= (Cos[Pi/4] - Cos[31*Pi/64])/Sin[31*Pi/64] */ \
- s0 += (sf*5397 + 4096) >> 13; \
- /* 2893/2048 ~= Sqrt[2]*Sin[31*Pi/64] */ \
- sf -= (s0*2893 + 1024) >> 11; \
- /* 2799/4096 ~= (1/Sqrt[2] - Cos[31*Pi/64]/2)/Sin[31*Pi/64] */ \
- s0 += (sf*2799 + 2048) >> 12; \
- sd -= OD_RSHIFT1(sc); \
- sc += sd; \
- s3 += OD_RSHIFT1(s2); \
- s2 = s3 - s2; \
- sb += OD_RSHIFT1(sa); \
- sa -= sb; \
- s5 = OD_RSHIFT1(s4) - s5; \
- s4 -= s5; \
- s7 = OD_RSHIFT1(s9) - s7; \
- s9 -= s7; \
- s6 -= OD_RSHIFT1(s8); \
- s8 += s6; \
- se = OD_RSHIFT1(sf) - se; \
- sf -= se; \
- s0 += OD_RSHIFT1(s1); \
- s1 -= s0; \
- s5 -= s9; \
- s9 += OD_RSHIFT1(s5); \
- sa = s6 - sa; \
- s6 -= OD_RSHIFT1(sa); \
- se += s2; \
- s2 -= OD_RSHIFT1(se); \
- s1 = sd - s1; \
- sd -= OD_RSHIFT1(s1); \
- s0 += s3; \
- s0h = OD_RSHIFT1(s0); \
- s3 = s0h - s3; \
- sf += sc; \
- sfh = OD_RSHIFT1(sf); \
- sc -= sfh; \
- sb = s7 - sb; \
- sbh = OD_RSHIFT1(sb); \
- s7 -= sbh; \
- s4 -= s8; \
- s4h = OD_RSHIFT1(s4); \
- s8 += s4h; \
- /* 3227/32768 ~= Tan[Pi/32] ~= 0.09849140335716425 */ \
- se -= (s1*3227 + 16384) >> 15; \
- /* 6393/32768 ~= Sin[Pi/16] ~= 0.19509032201612825 */ \
- s1 += (se*6393 + 16384) >> 15; \
- /* 3227/32768 ~= Tan[Pi/32] ~= 0.09849140335716425 */ \
- se -= (s1*3227 + 16384) >> 15; \
- /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \
- s6 -= (s9*2485 + 4096) >> 13; \
- /* 4551/8192 ~= Sin[3*Pi/16] ~= 0.555570233019602 */ \
- s9 += (s6*4551 + 4096) >> 13; \
- /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \
- s6 -= (s9*2485 + 4096) >> 13; \
- /* 8757/16384 ~= Tan[5*Pi/32] ~= 0.534511135950792 */ \
- s5 -= (sa*8757 + 8192) >> 14; \
- /* 6811/8192 ~= Sin[5*Pi/16] ~= 0.831469612302545 */ \
- sa += (s5*6811 + 4096) >> 13; \
- /* 8757/16384 ~= Tan[5*Pi/32]) ~= 0.534511135950792 */ \
- s5 -= (sa*8757 + 8192) >> 14; \
- /* 6723/8192 ~= Tan[7*Pi/32]) ~= 0.820678790828660 */ \
- s2 -= (sd*6723 + 4096) >> 13; \
- /* 16069/16384 ~= Sin[7*Pi/16] ~= 0.980785280403230 */ \
- sd += (s2*16069 + 8192) >> 14; \
- /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \
- s2 -= (sd*6723 + 4096) >> 13; \
- s9 += OD_RSHIFT1(se); \
- se = s9 - se; \
- s6 += OD_RSHIFT1(s1); \
- s1 -= s6; \
- sd = OD_RSHIFT1(sa) - sd; \
- sa -= sd; \
- s2 += OD_RSHIFT1(s5); \
- s5 = s2 - s5; \
- s3 -= sbh; \
- sb += s3; \
- sc += s4h; \
- s4 = sc - s4; \
- s8 = s0h - s8; \
- s0 -= s8; \
- s7 = sfh - s7; \
- sf -= s7; \
- /* 13573/32768 ~= Tan[pi/8] ~= 0.414213562373095 */ \
- s6 -= (s9*13573 + 16384) >> 15; \
- /* 11585/16384 ~= Sin[pi/4] ~= 0.707106781186547 */ \
- s9 += (s6*11585 + 8192) >> 14; \
- /* 13573/32768 ~= Tan[pi/8] ~= 0.414213562373095 */ \
- s6 -= (s9*13573 + 16384) >> 15; \
- /* 13573/32768 ~= Tan[pi/8] ~= 0.414213562373095 */ \
- s5 -= (sa*13573 + 16384) >> 15; \
- /* 11585/16384 ~= Sin[pi/4] ~= 0.707106781186547 */ \
- sa += (s5*11585 + 8192) >> 14; \
- /* 13573/32768 ~= Tan[pi/8] ~= 0.414213562373095 */ \
- s5 -= (sa*13573 + 16384) >> 15; \
- /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \
- s3 -= (sc*3259 + 8192) >> 14; \
- /* 3135/8192 ~= Sin[Pi/8] ~= 0.382683432365090 */ \
- sc += (s3*3135 + 4096) >> 13; \
- /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \
- s3 -= (sc*3259 + 8192) >> 14; \
- /* 21895/32768 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
- sb -= (s4*21895 + 16384) >> 15; \
- /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
- s4 += (sb*15137 + 8192) >> 14; \
- /* 21895/32768 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
- sb -= (s4*21895 + 16384) >> 15; \
- /* 13573/32768 ~= Tan[pi/8] ~= 0.414213562373095 */ \
- s8 -= (s7*13573 + 16384) >> 15; \
- /* 11585/16384 ~= Sin[pi/4] ~= 0.707106781186547 */ \
- s7 += (s8*11585 + 8192) >> 14; \
- /* 13573/32768 ~= Tan[pi/8] ~= 0.414213562373095 */ \
- s8 -= (s7*13573 + 16384) >> 15; \
- } \
- while (0)
-
-/* TODO: rewrite this to match OD_FDST_16. */
-#define OD_FDST_16_ASYM_PR(t0, t0h, t8, t4, t4h, tc, t2, ta, t6, te, \
- t1, t9, t5, td, t3, tb, t7, t7h, tf) \
- /* Embedded 16-point asymmetric Type-IV fDST. */ \
- do { \
- int t2h; \
- int t3h; \
- int t6h; \
- int t8h; \
- int t9h; \
- int tch; \
- int tdh; \
- /* TODO: Can we move these into another operation */ \
- t8 = -t8; \
- t9 = -t9; \
- ta = -ta; \
- tb = -tb; \
- td = -td; \
- /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
- OD_DCT_OVERFLOW_CHECK(te, 13573, 8192, 136); \
- t1 -= (te*13573 + 8192) >> 14; \
- /* 11585/32768 ~= Sin[Pi/4]/2 ~= 0.353553390593274 */ \
- OD_DCT_OVERFLOW_CHECK(t1, 11585, 16384, 137); \
- te += (t1*11585 + 16384) >> 15; \
- /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
- OD_DCT_OVERFLOW_CHECK(te, 13573, 8192, 138); \
- t1 -= (te*13573 + 8192) >> 14; \
- /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
- OD_DCT_OVERFLOW_CHECK(td, 4161, 8192, 139); \
- t2 += (td*4161 + 8192) >> 14; \
- /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
- OD_DCT_OVERFLOW_CHECK(t2, 15137, 8192, 140); \
- td -= (t2*15137 + 8192) >> 14; \
- /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
- OD_DCT_OVERFLOW_CHECK(td, 14341, 8192, 141); \
- t2 += (td*14341 + 8192) >> 14; \
- /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
- OD_DCT_OVERFLOW_CHECK(t3, 14341, 8192, 142); \
- tc -= (t3*14341 + 8192) >> 14; \
- /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
- OD_DCT_OVERFLOW_CHECK(tc, 15137, 8192, 143); \
- t3 += (tc*15137 + 8192) >> 14; \
- /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
- OD_DCT_OVERFLOW_CHECK(t3, 4161, 8192, 144); \
- tc -= (t3*4161 + 8192) >> 14; \
- te = t0h - te; \
- t0 -= te; \
- tf = OD_RSHIFT1(t1) - tf; \
- t1 -= tf; \
- /* TODO: Can we move this into another operation */ \
- tc = -tc; \
- t2 = OD_RSHIFT1(tc) - t2; \
- tc -= t2; \
- t3 = OD_RSHIFT1(td) - t3; \
- td = t3 - td; \
- /* 7489/8192 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
- OD_DCT_OVERFLOW_CHECK(t6, 7489, 4096, 145); \
- t9 -= (t6*7489 + 4096) >> 13; \
- /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
- OD_DCT_OVERFLOW_CHECK(t9, 11585, 8192, 146); \
- t6 += (t9*11585 + 8192) >> 14; \
- /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
- OD_DCT_OVERFLOW_CHECK(t6, 19195, 16384, 147); \
- t9 += (t6*19195 + 16384) >> 15; \
- t8 += OD_RSHIFT1(t9); \
- t9 -= t8; \
- t6 = t7h - t6; \
- t7 -= t6; \
- /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \
- OD_DCT_OVERFLOW_CHECK(t7, 6723, 4096, 148); \
- t8 += (t7*6723 + 4096) >> 13; \
- /* 16069/16384 ~= Sin[7*Pi/16] ~= 0.980785280403230 */ \
- OD_DCT_OVERFLOW_CHECK(t8, 16069, 8192, 149); \
- t7 -= (t8*16069 + 8192) >> 14; \
- /* 6723/8192 ~= Tan[7*Pi/32]) ~= 0.820678790828660 */ \
- OD_DCT_OVERFLOW_CHECK(t7, 6723, 4096, 150); \
- t8 += (t7*6723 + 4096) >> 13; \
- /* 17515/32768 ~= Tan[5*Pi/32]) ~= 0.534511135950792 */ \
- OD_DCT_OVERFLOW_CHECK(t6, 17515, 16384, 151); \
- t9 += (t6*17515 + 16384) >> 15; \
- /* 13623/16384 ~= Sin[5*Pi/16] ~= 0.831469612302545 */ \
- OD_DCT_OVERFLOW_CHECK(t9, 13623, 8192, 152); \
- t6 -= (t9*13623 + 8192) >> 14; \
- /* 17515/32768 ~= Tan[5*Pi/32] ~= 0.534511135950792 */ \
- OD_DCT_OVERFLOW_CHECK(t6, 17515, 16384, 153); \
- t9 += (t6*17515 + 16384) >> 15; \
- /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
- OD_DCT_OVERFLOW_CHECK(ta, 13573, 8192, 154); \
- t5 += (ta*13573 + 8192) >> 14; \
- /* 11585/32768 ~= Sin[Pi/4]/2 ~= 0.353553390593274 */ \
- OD_DCT_OVERFLOW_CHECK(t5, 11585, 16384, 155); \
- ta -= (t5*11585 + 16384) >> 15; \
- /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
- OD_DCT_OVERFLOW_CHECK(ta, 13573, 8192, 156); \
- t5 += (ta*13573 + 8192) >> 14; \
- tb += OD_RSHIFT1(t5); \
- t5 = tb - t5; \
- ta += t4h; \
- t4 -= ta; \
- /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \
- OD_DCT_OVERFLOW_CHECK(t5, 2485, 4096, 157); \
- ta += (t5*2485 + 4096) >> 13; \
- /* 18205/32768 ~= Sin[3*Pi/16] ~= 0.555570233019602 */ \
- OD_DCT_OVERFLOW_CHECK(ta, 18205, 16384, 158); \
- t5 -= (ta*18205 + 16384) >> 15; \
- /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \
- OD_DCT_OVERFLOW_CHECK(t5, 2485, 4096, 159); \
- ta += (t5*2485 + 4096) >> 13; \
- /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \
- OD_DCT_OVERFLOW_CHECK(t4, 6723, 4096, 160); \
- tb -= (t4*6723 + 4096) >> 13; \
- /* 16069/16384 ~= Sin[7*Pi/16] ~= 0.980785280403230 */ \
- OD_DCT_OVERFLOW_CHECK(tb, 16069, 8192, 161); \
- t4 += (tb*16069 + 8192) >> 14; \
- /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \
- OD_DCT_OVERFLOW_CHECK(t4, 6723, 4096, 162); \
- tb -= (t4*6723 + 4096) >> 13; \
- /* TODO: Can we move this into another operation */ \
- t5 = -t5; \
- tc -= tf; \
- tch = OD_RSHIFT1(tc); \
- tf += tch; \
- t3 += t0; \
- t3h = OD_RSHIFT1(t3); \
- t0 -= t3h; \
- td -= t1; \
- tdh = OD_RSHIFT1(td); \
- t1 += tdh; \
- t2 += te; \
- t2h = OD_RSHIFT1(t2); \
- te -= t2h; \
- t8 += t4; \
- t8h = OD_RSHIFT1(t8); \
- t4 = t8h - t4; \
- t7 = tb - t7; \
- t7h = OD_RSHIFT1(t7); \
- tb = t7h - tb; \
- t6 -= ta; \
- t6h = OD_RSHIFT1(t6); \
- ta += t6h; \
- t9 = t5 - t9; \
- t9h = OD_RSHIFT1(t9); \
- t5 -= t9h; \
- t0 -= t7h; \
- t7 += t0; \
- tf += t8h; \
- t8 -= tf; \
- te -= t6h; \
- t6 += te; \
- t1 += t9h; \
- t9 -= t1; \
- tb -= tch; \
- tc += tb; \
- t4 += t3h; \
- t3 -= t4; \
- ta -= tdh; \
- td += ta; \
- t5 = t2h - t5; \
- t2 -= t5; \
- /* TODO: Can we move these into another operation */ \
- t8 = -t8; \
- t9 = -t9; \
- ta = -ta; \
- tb = -tb; \
- tc = -tc; \
- td = -td; \
- tf = -tf; \
- /* 7799/8192 ~= Tan[31*Pi/128] ~= 0.952079146700925 */ \
- OD_DCT_OVERFLOW_CHECK(tf, 7799, 4096, 163); \
- t0 -= (tf*7799 + 4096) >> 13; \
- /* 4091/4096 ~= Sin[31*Pi/64] ~= 0.998795456205172 */ \
- OD_DCT_OVERFLOW_CHECK(t0, 4091, 2048, 164); \
- tf += (t0*4091 + 2048) >> 12; \
- /* 7799/8192 ~= Tan[31*Pi/128] ~= 0.952079146700925 */ \
- OD_DCT_OVERFLOW_CHECK(tf, 7799, 4096, 165); \
- t0 -= (tf*7799 + 4096) >> 13; \
- /* 2417/32768 ~= Tan[3*Pi/128] ~= 0.0737644315224493 */ \
- OD_DCT_OVERFLOW_CHECK(te, 2417, 16384, 166); \
- t1 += (te*2417 + 16384) >> 15; \
- /* 601/4096 ~= Sin[3*Pi/64] ~= 0.146730474455362 */ \
- OD_DCT_OVERFLOW_CHECK(t1, 601, 2048, 167); \
- te -= (t1*601 + 2048) >> 12; \
- /* 2417/32768 ~= Tan[3*Pi/128] ~= 0.0737644315224493 */ \
- OD_DCT_OVERFLOW_CHECK(te, 2417, 16384, 168); \
- t1 += (te*2417 + 16384) >> 15; \
- /* 14525/32768 ~= Tan[17*Pi/128] ~= 0.443269513890864 */ \
- OD_DCT_OVERFLOW_CHECK(t8, 14525, 16384, 169); \
- t7 -= (t8*14525 + 16384) >> 15; \
- /* 3035/4096 ~= Sin[17*Pi/64] ~= 0.740951125354959 */ \
- OD_DCT_OVERFLOW_CHECK(t7, 3035, 2048, 170); \
- t8 += (t7*3035 + 2048) >> 12; \
- /* 7263/16384 ~= Tan[17*Pi/128] ~= 0.443269513890864 */ \
- OD_DCT_OVERFLOW_CHECK(t8, 7263, 8192, 171); \
- t7 -= (t8*7263 + 8192) >> 14; \
- /* 6393/8192 ~= Tan[27*Pi/128] ~= 0.780407659653944 */ \
- OD_DCT_OVERFLOW_CHECK(td, 6393, 4096, 172); \
- t2 -= (td*6393 + 4096) >> 13; \
- /* 3973/4096 ~= Sin[27*Pi/64] ~= 0.970031253194544 */ \
- OD_DCT_OVERFLOW_CHECK(t2, 3973, 2048, 173); \
- td += (t2*3973 + 2048) >> 12; \
- /* 6393/8192 ~= Tan[27*Pi/128] ~= 0.780407659653944 */ \
- OD_DCT_OVERFLOW_CHECK(td, 6393, 4096, 174); \
- t2 -= (td*6393 + 4096) >> 13; \
- /* 9281/16384 ~= Tan[21*Pi/128] ~= 0.566493002730344 */ \
- OD_DCT_OVERFLOW_CHECK(ta, 9281, 8192, 175); \
- t5 -= (ta*9281 + 8192) >> 14; \
- /* 7027/8192 ~= Sin[21*Pi/64] ~= 0.857728610000272 */ \
- OD_DCT_OVERFLOW_CHECK(t5, 7027, 4096, 176); \
- ta += (t5*7027 + 4096) >> 13; \
- /* 9281/16384 ~= Tan[21*Pi/128] ~= 0.566493002730344 */ \
- OD_DCT_OVERFLOW_CHECK(ta, 9281, 8192, 177); \
- t5 -= (ta*9281 + 8192) >> 14; \
- /* 11539/16384 ~= Tan[25*Pi/128] ~= 0.704279460865044 */ \
- OD_DCT_OVERFLOW_CHECK(tc, 11539, 8192, 178); \
- t3 -= (tc*11539 + 8192) >> 14; \
- /* 7713/8192 ~= Sin[25*Pi/64] ~= 0.941544065183021 */ \
- OD_DCT_OVERFLOW_CHECK(t3, 7713, 4096, 179); \
- tc += (t3*7713 + 4096) >> 13; \
- /* 11539/16384 ~= Tan[25*Pi/128] ~= 0.704279460865044 */ \
- OD_DCT_OVERFLOW_CHECK(tc, 11539, 8192, 180); \
- t3 -= (tc*11539 + 8192) >> 14; \
- /* 10375/16384 ~= Tan[23*Pi/128] ~= 0.633243016177569 */ \
- OD_DCT_OVERFLOW_CHECK(tb, 10375, 8192, 181); \
- t4 -= (tb*10375 + 8192) >> 14; \
- /* 7405/8192 ~= Sin[23*Pi/64] ~= 0.903989293123443 */ \
- OD_DCT_OVERFLOW_CHECK(t4, 7405, 4096, 182); \
- tb += (t4*7405 + 4096) >> 13; \
- /* 10375/16384 ~= Tan[23*Pi/128] ~= 0.633243016177569 */ \
- OD_DCT_OVERFLOW_CHECK(tb, 10375, 8192, 183); \
- t4 -= (tb*10375 + 8192) >> 14; \
- /* 8247/16384 ~= Tan[19*Pi/128] ~= 0.503357699799294 */ \
- OD_DCT_OVERFLOW_CHECK(t9, 8247, 8192, 184); \
- t6 -= (t9*8247 + 8192) >> 14; \
- /* 1645/2048 ~= Sin[19*Pi/64] ~= 0.803207531480645 */ \
- OD_DCT_OVERFLOW_CHECK(t6, 1645, 1024, 185); \
- t9 += (t6*1645 + 1024) >> 11; \
- /* 8247/16384 ~= Tan[19*Pi/128] ~= 0.503357699799294 */ \
- OD_DCT_OVERFLOW_CHECK(t9, 8247, 8192, 186); \
- t6 -= (t9*8247 + 8192) >> 14; \
- } \
- while (0)
-
-#define OD_IDST_16_ASYM_PR(t0, t0h, t8, t4, tc, t2, t2h, ta, t6, te, teh, \
- t1, t9, t5, td, t3, tb, t7, tf) \
- /* Embedded 16-point asymmetric Type-IV iDST. */ \
- do { \
- int t1h_; \
- int t3h_; \
- int t4h; \
- int t6h; \
- int t9h_; \
- int tbh_; \
- int tch; \
- /* 8247/16384 ~= Tan[19*Pi/128] ~= 0.503357699799294 */ \
- t6 += (t9*8247 + 8192) >> 14; \
- /* 1645/2048 ~= Sin[19*Pi/64] ~= 0.803207531480645 */ \
- t9 -= (t6*1645 + 1024) >> 11; \
- /* 8247/16384 ~= Tan[19*Pi/128] ~= 0.503357699799294 */ \
- t6 += (t9*8247 + 8192) >> 14; \
- /* 10375/16384 ~= Tan[23*Pi/128] ~= 0.633243016177569 */ \
- t2 += (td*10375 + 8192) >> 14; \
- /* 7405/8192 ~= Sin[23*Pi/64] ~= 0.903989293123443 */ \
- td -= (t2*7405 + 4096) >> 13; \
- /* 10375/16384 ~= Tan[23*Pi/128] ~= 0.633243016177569 */ \
- t2 += (td*10375 + 8192) >> 14; \
- /* 11539/16384 ~= Tan[25*Pi/128] ~= 0.704279460865044 */ \
- tc += (t3*11539 + 8192) >> 14; \
- /* 7713/8192 ~= Sin[25*Pi/64] ~= 0.941544065183021 */ \
- t3 -= (tc*7713 + 4096) >> 13; \
- /* 11539/16384 ~= Tan[25*Pi/128] ~= 0.704279460865044 */ \
- tc += (t3*11539 + 8192) >> 14; \
- /* 9281/16384 ~= Tan[21*Pi/128] ~= 0.566493002730344 */ \
- ta += (t5*9281 + 8192) >> 14; \
- /* 7027/8192 ~= Sin[21*Pi/64] ~= 0.857728610000272 */ \
- t5 -= (ta*7027 + 4096) >> 13; \
- /* 9281/16384 ~= Tan[21*Pi/128] ~= 0.566493002730344 */ \
- ta += (t5*9281 + 8192) >> 14; \
- /* 6393/8192 ~= Tan[27*Pi/128] ~= 0.780407659653944 */ \
- t4 += (tb*6393 + 4096) >> 13; \
- /* 3973/4096 ~= Sin[27*Pi/64] ~= 0.970031253194544 */ \
- tb -= (t4*3973 + 2048) >> 12; \
- /* 6393/8192 ~= Tan[27*Pi/128] ~= 0.780407659653944 */ \
- t4 += (tb*6393 + 4096) >> 13; \
- /* 7263/16384 ~= Tan[17*Pi/128] ~= 0.443269513890864 */ \
- te += (t1*7263 + 8192) >> 14; \
- /* 3035/4096 ~= Sin[17*Pi/64] ~= 0.740951125354959 */ \
- t1 -= (te*3035 + 2048) >> 12; \
- /* 14525/32768 ~= Tan[17*Pi/128] ~= 0.443269513890864 */ \
- te += (t1*14525 + 16384) >> 15; \
- /* 2417/32768 ~= Tan[3*Pi/128] ~= 0.0737644315224493 */ \
- t8 -= (t7*2417 + 16384) >> 15; \
- /* 601/4096 ~= Sin[3*Pi/64] ~= 0.146730474455362 */ \
- t7 += (t8*601 + 2048) >> 12; \
- /* 2417/32768 ~= Tan[3*Pi/128] ~= 0.0737644315224493 */ \
- t8 -= (t7*2417 + 16384) >> 15; \
- /* 7799/8192 ~= Tan[31*Pi/128] ~= 0.952079146700925 */ \
- t0 += (tf*7799 + 4096) >> 13; \
- /* 4091/4096 ~= Sin[31*Pi/64] ~= 0.998795456205172 */ \
- tf -= (t0*4091 + 2048) >> 12; \
- /* 7799/8192 ~= Tan[31*Pi/128] ~= 0.952079146700925 */ \
- t0 += (tf*7799 + 4096) >> 13; \
- /* TODO: Can we move these into another operation */ \
- t1 = -t1; \
- t3 = -t3; \
- t5 = -t5; \
- t9 = -t9; \
- tb = -tb; \
- td = -td; \
- tf = -tf; \
- t4 += ta; \
- t4h = OD_RSHIFT1(t4); \
- ta = t4h - ta; \
- tb -= t5; \
- tbh_ = OD_RSHIFT1(tb); \
- t5 += tbh_; \
- tc += t2; \
- tch = OD_RSHIFT1(tc); \
- t2 -= tch; \
- t3 -= td; \
- t3h_ = OD_RSHIFT1(t3); \
- td += t3h_; \
- t9 += t8; \
- t9h_ = OD_RSHIFT1(t9); \
- t8 -= t9h_; \
- t6 -= t7; \
- t6h = OD_RSHIFT1(t6); \
- t7 += t6h; \
- t1 += tf; \
- t1h_ = OD_RSHIFT1(t1); \
- tf -= t1h_; \
- te -= t0; \
- teh = OD_RSHIFT1(te); \
- t0 += teh; \
- ta += t9h_; \
- t9 = ta - t9; \
- t5 -= t6h; \
- t6 += t5; \
- td = teh - td; \
- te = td - te; \
- t2 = t1h_ - t2; \
- t1 -= t2; \
- t7 += t4h; \
- t4 -= t7; \
- t8 -= tbh_; \
- tb += t8; \
- t0 += tch; \
- tc -= t0; \
- tf -= t3h_; \
- t3 += tf; \
- /* TODO: Can we move this into another operation */ \
- ta = -ta; \
- /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \
- td += (t2*6723 + 4096) >> 13; \
- /* 16069/16384 ~= Sin[7*Pi/16] ~= 0.980785280403230 */ \
- t2 -= (td*16069 + 8192) >> 14; \
- /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \
- td += (t2*6723 + 4096) >> 13; \
- /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \
- t5 -= (ta*2485 + 4096) >> 13; \
- /* 18205/32768 ~= Sin[3*Pi/16] ~= 0.555570233019602 */ \
- ta += (t5*18205 + 16384) >> 15; \
- /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \
- t5 -= (ta*2485 + 4096) >> 13; \
- t2 += t5; \
- t2h = OD_RSHIFT1(t2); \
- t5 -= t2h; \
- ta = td - ta; \
- td -= OD_RSHIFT1(ta); \
- /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
- ta -= (t5*13573 + 8192) >> 14; \
- /* 11585/32768 ~= Sin[Pi/4]/2 ~= 0.353553390593274 */ \
- t5 += (ta*11585 + 16384) >> 15; \
- /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
- ta -= (t5*13573 + 8192) >> 14; \
- /* 17515/32768 ~= Tan[5*Pi/32] ~= 0.534511135950792 */ \
- t9 -= (t6*17515 + 16384) >> 15; \
- /* 13623/16384 ~= Sin[5*Pi/16] ~= 0.831469612302545 */ \
- t6 += (t9*13623 + 8192) >> 14; \
- /* 17515/32768 ~= Tan[5*Pi/32]) ~= 0.534511135950792 */ \
- t9 -= (t6*17515 + 16384) >> 15; \
- /* 6723/8192 ~= Tan[7*Pi/32]) ~= 0.820678790828660 */ \
- t1 -= (te*6723 + 4096) >> 13; \
- /* 16069/16384 ~= Sin[7*Pi/16] ~= 0.980785280403230 */ \
- te += (t1*16069 + 8192) >> 14; \
- /* 6723/8192 ~= Tan[7*Pi/32]) ~= 0.820678790828660 */ \
- t1 -= (te*6723 + 4096) >> 13; \
- te += t6; \
- teh = OD_RSHIFT1(te); \
- t6 = teh - t6; \
- t9 += t1; \
- t1 -= OD_RSHIFT1(t9); \
- /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
- t9 -= (t6*19195 + 16384) >> 15; \
- /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
- t6 -= (t9*11585 + 8192) >> 14; \
- /* 7489/8192 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
- t9 += (t6*7489 + 4096) >> 13; \
- tb = tc - tb; \
- tc = OD_RSHIFT1(tb) - tc; \
- t3 += t4; \
- t4 = OD_RSHIFT1(t3) - t4; \
- /* TODO: Can we move this into another operation */ \
- t3 = -t3; \
- t8 += tf; \
- tf = OD_RSHIFT1(t8) - tf; \
- t0 += t7; \
- t0h = OD_RSHIFT1(t0); \
- t7 = t0h - t7; \
- /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
- t3 += (tc*4161 + 8192) >> 14; \
- /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
- tc -= (t3*15137 + 8192) >> 14; \
- /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
- t3 += (tc*14341 + 8192) >> 14; \
- /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
- t4 -= (tb*14341 + 8192) >> 14; \
- /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
- tb += (t4*15137 + 8192) >> 14; \
- /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
- t4 -= (tb*4161 + 8192) >> 14; \
- /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
- t8 += (t7*13573 + 8192) >> 14; \
- /* 11585/32768 ~= Sin[Pi/4]/2 ~= 0.353553390593274 */ \
- t7 -= (t8*11585 + 16384) >> 15; \
- /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
- t8 += (t7*13573 + 8192) >> 14; \
- /* TODO: Can we move these into another operation */ \
- t1 = -t1; \
- t5 = -t5; \
- t9 = -t9; \
- tb = -tb; \
- td = -td; \
- } \
- while (0)
-
-#define OD_FDCT_32_PR(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, t6, tm, \
- te, tu, t1, th, t9, tp, t5, tl, td, tt, t3, tj, tb, tr, t7, tn, tf, tv) \
- /* Embedded 32-point orthonormal Type-II fDCT. */ \
- do { \
- int tgh; \
- int thh; \
- int tih; \
- int tkh; \
- int tmh; \
- int tnh; \
- int toh; \
- int tqh; \
- int tsh; \
- int tuh; \
- int tvh; \
- tv = t0 - tv; \
- tvh = OD_RSHIFT1(tv); \
- t0 -= tvh; \
- tu += t1; \
- tuh = OD_RSHIFT1(tu); \
- t1 = tuh - t1; \
- tt = t2 - tt; \
- t2 -= OD_RSHIFT1(tt); \
- ts += t3; \
- tsh = OD_RSHIFT1(ts); \
- t3 = tsh - t3; \
- tr = t4 - tr; \
- t4 -= OD_RSHIFT1(tr); \
- tq += t5; \
- tqh = OD_RSHIFT1(tq); \
- t5 = tqh - t5; \
- tp = t6 - tp; \
- t6 -= OD_RSHIFT1(tp); \
- to += t7; \
- toh = OD_RSHIFT1(to); \
- t7 = toh - t7; \
- tn = t8 - tn; \
- tnh = OD_RSHIFT1(tn); \
- t8 -= tnh; \
- tm += t9; \
- tmh = OD_RSHIFT1(tm); \
- t9 = tmh - t9; \
- tl = ta - tl; \
- ta -= OD_RSHIFT1(tl); \
- tk += tb; \
- tkh = OD_RSHIFT1(tk); \
- tb = tkh - tb; \
- tj = tc - tj; \
- tc -= OD_RSHIFT1(tj); \
- ti += td; \
- tih = OD_RSHIFT1(ti); \
- td = tih - td; \
- th = te - th; \
- thh = OD_RSHIFT1(th); \
- te -= thh; \
- tg += tf; \
- tgh = OD_RSHIFT1(tg); \
- tf = tgh - tf; \
- OD_FDCT_16_ASYM_PR(t0, tg, tgh, t8, to, toh, t4, tk, tkh, tc, ts, tsh, \
- t2, ti, tih, ta, tq, tqh, t6, tm, tmh, te, tu, tuh); \
- OD_FDST_16_ASYM_PR(tv, tvh, tf, tn, tnh, t7, tr, tb, tj, t3, \
- tt, td, tl, t5, tp, t9, th, thh, t1); \
- } \
- while (0)
-
-#define OD_IDCT_32_PR(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, t6, tm, \
- te, tu, t1, th, t9, tp, t5, tl, td, tt, t3, tj, tb, tr, t7, tn, tf, tv) \
- /* Embedded 32-point orthonormal Type-II iDCT. */ \
- do { \
- int t1h; \
- int t3h; \
- int t5h; \
- int t7h; \
- int t9h; \
- int tbh; \
- int tdh; \
- int tfh; \
- int thh; \
- int tth; \
- int tvh; \
- OD_IDST_16_ASYM_PR(tv, tvh, tn, tr, tj, tt, tth, tl, tp, th, thh, \
- tu, tm, tq, ti, ts, tk, to, tg); \
- OD_IDCT_16_ASYM_PR(t0, t8, t4, tc, t2, ta, t6, te, \
- t1, t1h, t9, t9h, t5, t5h, td, tdh, t3, t3h, tb, tbh, t7, t7h, tf, tfh); \
- tu = t1h - tu; \
- t1 -= tu; \
- te += thh; \
- th = te - th; \
- tm = t9h - tm; \
- t9 -= tm; \
- t6 += OD_RSHIFT1(tp); \
- tp = t6 - tp; \
- tq = t5h - tq; \
- t5 -= tq; \
- ta += OD_RSHIFT1(tl); \
- tl = ta - tl; \
- ti = tdh - ti; \
- td -= ti; \
- t2 += tth; \
- tt = t2 - tt; \
- ts = t3h - ts; \
- t3 -= ts; \
- tc += OD_RSHIFT1(tj); \
- tj = tc - tj; \
- tk = tbh - tk; \
- tb -= tk; \
- t4 += OD_RSHIFT1(tr); \
- tr = t4 - tr; \
- to = t7h - to; \
- t7 -= to; \
- t8 += OD_RSHIFT1(tn); \
- tn = t8 - tn; \
- tg = tfh - tg; \
- tf -= tg; \
- t0 += tvh; \
- tv = t0 - tv; \
- } \
- while (0)
-
-/* Embedded 32-point orthonormal Type-IV fDST. */
-#define OD_FDST_32_PR(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, ta, tb, tc, td, \
- te, tf, tg, th, ti, tj, tk, tl, tm, tn, to, tp, tq, tr, ts, tt, tu, tv) \
- /* 117 "muls", 117 + 128 = 245 adds, 36 shifts */ \
- do { \
- od_coeff t0h; \
- od_coeff t1h; \
- od_coeff t2h; \
- od_coeff t3h; \
- od_coeff t4h; \
- od_coeff t6h; \
- od_coeff t8h; \
- od_coeff t9h; \
- od_coeff tah; \
- od_coeff tbh; \
- od_coeff tch; \
- od_coeff tdh; \
- od_coeff teh; \
- od_coeff tfh; \
- od_coeff tgh; \
- od_coeff thh; \
- od_coeff tih; \
- od_coeff tjh; \
- od_coeff tkh; \
- od_coeff tlh; \
- od_coeff tmh; \
- od_coeff tnh; \
- od_coeff tph; \
- od_coeff trh; \
- od_coeff tsh; \
- od_coeff tth; \
- od_coeff tuh; \
- od_coeff tvh; \
- /* Stage 0 */ \
- tp += (t6*659 + 2048) >> 12; \
- t6 -= (tp*10279 + 16384) >> 15; \
- tp += (t6*659 + 2048) >> 12; \
- th += (te*3045 + 4096) >> 13; \
- te -= (th*21403 + 16384) >> 15; \
- th += (te*3045 + 4096) >> 13; \
- t9 += (tm*20191 + 16384) >> 15; \
- tm -= (t9*29269 + 16384) >> 15; \
- t9 += (tm*20191 + 16384) >> 15; \
- tu += (t1*1207 + 16384) >> 15; \
- t1 -= (tu*2411 + 16384) >> 15; \
- tu += (t1*1207 + 16384) >> 15; \
- t4 += (tr*13113 + 8192) >> 14; \
- tr -= (t4*7993 + 4096) >> 13; \
- t4 += (tr*13113 + 8192) >> 14; \
- tj += (tc*10381 + 16384) >> 15; \
- tc -= (tj*4717 + 4096) >> 13; \
- tj += (tc*10381 + 16384) >> 15; \
- tb += (tk*18035 + 16384) >> 15; \
- tk -= (tb*6921 + 4096) >> 13; \
- tb += (tk*18035 + 16384) >> 15; \
- ts += (t3*1411 + 8192) >> 14; \
- t3 -= (ts*2801 + 8192) >> 14; \
- ts += (t3*1411 + 8192) >> 14; \
- tq += (t5*2225 + 8192) >> 14; \
- t5 -= (tq*2185 + 4096) >> 13; \
- tq += (t5*2225 + 8192) >> 14; \
- ti += (td*11273 + 16384) >> 15; \
- td -= (ti*315 + 256) >> 9; \
- ti += (td*11273 + 16384) >> 15; \
- tl += (ta*8637 + 16384) >> 15; \
- ta -= (tl*16151 + 16384) >> 15; \
- tl += (ta*8637 + 16384) >> 15; \
- tt += (t2*2013 + 16384) >> 15; \
- t2 -= (tt*4011 + 16384) >> 15; \
- tt += (t2*2013 + 16384) >> 15; \
- to += (t7*6101 + 16384) >> 15; \
- t7 -= (to*11793 + 16384) >> 15; \
- to += (t7*6101 + 16384) >> 15; \
- t8 += (tn*10659 + 8192) >> 14; \
- tn -= (t8*29957 + 16384) >> 15; \
- t8 += (tn*10659 + 8192) >> 14; \
- tg += (tf*819 + 1024) >> 11; \
- tf -= (tg*22595 + 16384) >> 15; \
- tg += (tf*819 + 1024) >> 11; \
- t0 += (tv*31973 + 16384) >> 15; \
- tv -= (t0*16379 + 8192) >> 14; \
- t0 += (tv*31973 + 16384) >> 15; \
- /* Stage 1 */ \
- tj -= ts; \
- tjh = OD_RSHIFT1(tj); \
- ts += tjh; \
- tr = tk - tr; \
- trh = OD_RSHIFT1(tr); \
- tk = trh - tk; \
- tc += t3; \
- tch = OD_RSHIFT1(tc); \
- t3 -= tch; \
- t4 += tb; \
- t4h = OD_RSHIFT1(t4); \
- tb -= t4h; \
- tv += tf; \
- tvh = OD_RSHIFT1(tv); \
- tf -= tvh; \
- t8 -= to; \
- t8h = OD_RSHIFT1(t8); \
- to += t8h; \
- t0 += tg; \
- t0h = OD_RSHIFT1(t0); \
- tg -= t0h; \
- tn = t7 - tn; \
- tnh = OD_RSHIFT1(tn); \
- t7 -= tnh; \
- th -= tu; \
- thh = OD_RSHIFT1(th); \
- tu += thh; \
- t6 += tm; \
- t6h = OD_RSHIFT1(t6); \
- tm = t6h - tm; \
- te += t1; \
- teh = OD_RSHIFT1(te); \
- t1 -= teh; \
- tp += t9; \
- tph = OD_RSHIFT1(tp); \
- t9 -= tph; \
- t2 -= td; \
- t2h = OD_RSHIFT1(t2); \
- td += t2h; \
- tl = tq - tl; \
- tlh = OD_RSHIFT1(tl); \
- tq -= tlh; \
- tt += ti; \
- tth = OD_RSHIFT1(tt); \
- ti -= tth; \
- ta += t5; \
- tah = OD_RSHIFT1(ta); \
- t5 -= tah; \
- /* Stage 2 */ \
- tm -= thh; \
- th += tm; \
- t9 = teh - t9; \
- te -= t9; \
- td = tlh - td; \
- tl -= td; \
- ti += tah; \
- ta -= ti; \
- tk = tjh - tk; \
- tj -= tk; \
- tb -= tch; \
- tc += tb; \
- tg += tnh; \
- tn = tg - tn; \
- tf += t8h; \
- t8 = tf - t8; \
- t3 -= trh; \
- tr += t3; \
- ts += t4h; \
- t4 -= ts; \
- to -= t0h; \
- t0 += to; \
- t7 = tvh - t7; \
- tv = t7 - tv; \
- t1 -= t6h; \
- t6 += t1; \
- tu += tph; \
- tp -= tu; \
- tq -= tth; \
- tt += tq; \
- t5 += t2h; \
- t2 -= t5; \
- /* Stage 3 */ \
- tj += (tc*11725 + 16384) >> 15; \
- tc -= (tj*5197 + 4096) >> 13; \
- tj += (tc*11725 + 16384) >> 15; \
- td += (ti*513 + 1024) >> 11; \
- ti -= (td*15447 + 16384) >> 15; \
- td += (ti*513 + 1024) >> 11; \
- th += (te*4861 + 16384) >> 15; \
- te -= (th*1189 + 2048) >> 12; \
- th += (te*4861 + 16384) >> 15; \
- tg += (tf*805 + 8192) >> 14; \
- tf -= (tg*803 + 4096) >> 13; \
- tg += (tf*805 + 8192) >> 14; \
- tb += (tk*7749 + 8192) >> 14; \
- tk -= (tb*12665 + 8192) >> 14; \
- tb += (tk*7749 + 8192) >> 14; \
- tl += (ta*2455 + 2048) >> 12; \
- ta -= (tl*28899 + 16384) >> 15; \
- tl += (ta*2455 + 2048) >> 12; \
- t9 += (tm*12151 + 8192) >> 14; \
- tm -= (t9*31357 + 16384) >> 15; \
- t9 += (tm*12151 + 8192) >> 14; \
- tn += (t8*29699 + 16384) >> 15; \
- t8 -= (tn*16305 + 8192) >> 14; \
- tn += (t8*29699 + 16384) >> 15; \
- /* Stage 4 */ \
- tf -= tc; \
- tfh = OD_RSHIFT1(tf); \
- tc += tfh; \
- ti = th - ti; \
- tih = OD_RSHIFT1(ti); \
- th -= tih; \
- tg += tj; \
- tgh = OD_RSHIFT1(tg); \
- tj = tgh - tj; \
- td -= te; \
- tdh = OD_RSHIFT1(td); \
- te += tdh; \
- tm = ta - tm; \
- tmh = OD_RSHIFT1(tm); \
- ta = tmh - ta; \
- t9 += tl; \
- t9h = OD_RSHIFT1(t9); \
- tl -= t9h; \
- tb += t8; \
- tbh = OD_RSHIFT1(tb); \
- t8 -= tbh; \
- tk += tn; \
- tkh = OD_RSHIFT1(tk); \
- tn -= tkh; \
- t1 -= t2; \
- t1h = OD_RSHIFT1(t1); \
- t2 += t1h; \
- t3 += tv; \
- t3h = OD_RSHIFT1(t3); \
- tv -= t3h; \
- tu += tt; \
- tuh = OD_RSHIFT1(tu); \
- tt -= tuh; \
- ts -= t0; \
- tsh = OD_RSHIFT1(ts); \
- t0 += tsh; \
- tq = t6 - tq; \
- t6 -= OD_RSHIFT1(tq); \
- to += tr; \
- tr = OD_RSHIFT1(to) - tr; \
- t7 = t4 - t7; \
- t4 -= OD_RSHIFT1(t7); \
- t5 -= tp; \
- tp += OD_RSHIFT1(t5); \
- /* Stage 5 */ \
- tp += (t6*2485 + 4096) >> 13; \
- t6 -= (tp*18205 + 16384) >> 15; \
- tp += (t6*2485 + 4096) >> 13; \
- to += (t7*3227 + 16384) >> 15; \
- t7 -= (to*6393 + 16384) >> 15; \
- to += (t7*3227 + 16384) >> 15; \
- tq += (t5*17515 + 16384) >> 15; \
- t5 -= (tq*13623 + 8192) >> 14; \
- tq += (t5*17515 + 16384) >> 15; \
- t4 += (tr*6723 + 4096) >> 13; \
- tr -= (t4*16069 + 8192) >> 14; \
- t4 += (tr*6723 + 4096) >> 13; \
- /* Stage 6 */ \
- tj += tdh; \
- td -= tj; \
- tc -= tih; \
- ti += tc; \
- th = tgh - th; \
- tg -= th; \
- te += tfh; \
- tf -= te; \
- tl = tkh - tl; \
- tk -= tl; \
- ta += tbh; \
- tb -= ta; \
- tn -= tmh; \
- tm += tn; \
- t8 += t9h; \
- t9 = t8 - t9; \
- tt = t3h - tt; \
- t3 -= tt; \
- t2 -= tsh; \
- ts += t2; \
- tv -= t1h; \
- t1 += tv; \
- t0 += tuh; \
- tu -= t0; \
- tp = OD_RSHIFT1(to) - tp; \
- to -= tp; \
- t6 += OD_RSHIFT1(t7); \
- t7 -= t6; \
- t4 = OD_RSHIFT1(tq) - t4; \
- tq -= t4; \
- tr += OD_RSHIFT1(t5); \
- t5 = tr - t5; \
- /* Stage 7 */ \
- td += (ti*21895 + 16384) >> 15; \
- ti -= (td*15137 + 8192) >> 14; \
- td += (ti*21895 + 16384) >> 15; \
- tj += (tc*21895 + 16384) >> 15; \
- tc -= (tj*15137 + 8192) >> 14; \
- tj += (tc*21895 + 16384) >> 15; \
- th += (te*13573 + 16384) >> 15; \
- te -= (th*11585 + 8192) >> 14; \
- th += (te*13573 + 16384) >> 15; \
- tb += (tk*21895 + 16384) >> 15; \
- tk -= (tb*15137 + 8192) >> 14; \
- tb += (tk*21895 + 16384) >> 15; \
- ta += (tl*3259 + 8192) >> 14; \
- tl -= (ta*3135 + 4096) >> 13; \
- ta += (tl*3259 + 8192) >> 14; \
- t9 += (tm*13573 + 16384) >> 15; \
- tm -= (t9*11585 + 8192) >> 14; \
- t9 += (tm*13573 + 16384) >> 15; \
- ts += (t3*3259 + 8192) >> 14; \
- t3 -= (ts*3135 + 4096) >> 13; \
- ts += (t3*3259 + 8192) >> 14; \
- t2 += (tt*3259 + 8192) >> 14; \
- tt -= (t2*3135 + 4096) >> 13; \
- t2 += (tt*3259 + 8192) >> 14; \
- tu += (t1*13573 + 16384) >> 15; \
- t1 -= (tu*11585 + 8192) >> 14; \
- tu += (t1*13573 + 16384) >> 15; \
- tp += (t6*13573 + 16384) >> 15; \
- t6 -= (tp*11585 + 8192) >> 14; \
- tp += (t6*13573 + 16384) >> 15; \
- tq += (t5*13573 + 16384) >> 15; \
- t5 -= (tq*11585 + 8192) >> 14; \
- tq += (t5*13573 + 16384) >> 15; \
- } \
- while (0)
-
-/* Embedded 32-point orthonormal Type-IV iDST. */
-#define OD_IDST_32_PR(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, t6, tm, \
- te, tu, t1, th, t9, tp, t5, tl, td, tt, t3, tj, tb, tr, t7, tn, tf, tv) \
- /* 117 "muls", 117 + 128 = 245 adds, 36 shifts */ \
- do { \
- od_coeff t0h; \
- od_coeff t1h; \
- od_coeff t2h; \
- od_coeff t3h; \
- od_coeff t4h; \
- od_coeff t6h; \
- od_coeff t8h; \
- od_coeff t9h; \
- od_coeff tah; \
- od_coeff tbh; \
- od_coeff tch; \
- od_coeff tdh; \
- od_coeff teh; \
- od_coeff tfh; \
- od_coeff tgh; \
- od_coeff thh; \
- od_coeff tih; \
- od_coeff tjh; \
- od_coeff tkh; \
- od_coeff tlh; \
- od_coeff tmh; \
- od_coeff tnh; \
- od_coeff tph; \
- od_coeff trh; \
- od_coeff tsh; \
- od_coeff tth; \
- od_coeff tuh; \
- od_coeff tvh; \
- /* Stage 0 */ \
- tq -= (t5*13573 + 16384) >> 15; \
- t5 += (tq*11585 + 8192) >> 14; \
- tq -= (t5*13573 + 16384) >> 15; \
- tp -= (t6*13573 + 16384) >> 15; \
- t6 += (tp*11585 + 8192) >> 14; \
- tp -= (t6*13573 + 16384) >> 15; \
- tu -= (t1*13573 + 16384) >> 15; \
- t1 += (tu*11585 + 8192) >> 14; \
- tu -= (t1*13573 + 16384) >> 15; \
- t2 -= (tt*3259 + 8192) >> 14; \
- tt += (t2*3135 + 4096) >> 13; \
- t2 -= (tt*3259 + 8192) >> 14; \
- ts -= (t3*3259 + 8192) >> 14; \
- t3 += (ts*3135 + 4096) >> 13; \
- ts -= (t3*3259 + 8192) >> 14; \
- t9 -= (tm*13573 + 16384) >> 15; \
- tm += (t9*11585 + 8192) >> 14; \
- t9 -= (tm*13573 + 16384) >> 15; \
- ta -= (tl*3259 + 8192) >> 14; \
- tl += (ta*3135 + 4096) >> 13; \
- ta -= (tl*3259 + 8192) >> 14; \
- tb -= (tk*21895 + 16384) >> 15; \
- tk += (tb*15137 + 8192) >> 14; \
- tb -= (tk*21895 + 16384) >> 15; \
- th -= (te*13573 + 16384) >> 15; \
- te += (th*11585 + 8192) >> 14; \
- th -= (te*13573 + 16384) >> 15; \
- tj -= (tc*21895 + 16384) >> 15; \
- tc += (tj*15137 + 8192) >> 14; \
- tj -= (tc*21895 + 16384) >> 15; \
- td -= (ti*21895 + 16384) >> 15; \
- ti += (td*15137 + 8192) >> 14; \
- td -= (ti*21895 + 16384) >> 15; \
- /* Stage 1 */ \
- t5 = tr - t5; \
- tr -= OD_RSHIFT1(t5); \
- tq += t4; \
- t4 = OD_RSHIFT1(tq) - t4; \
- t7 += t6; \
- t6 -= OD_RSHIFT1(t7); \
- to += tp; \
- tp = OD_RSHIFT1(to) - tp; \
- tu += t0; \
- tuh = OD_RSHIFT1(tu); \
- t0 -= tuh; \
- t1 -= tv; \
- t1h = OD_RSHIFT1(t1); \
- tv += t1h; \
- ts -= t2; \
- tsh = OD_RSHIFT1(ts); \
- t2 += tsh; \
- t3 += tt; \
- t3h = OD_RSHIFT1(t3); \
- tt = t3h - tt; \
- t9 = t8 - t9; \
- t9h = OD_RSHIFT1(t9); \
- t8 -= t9h; \
- tm -= tn; \
- tmh = OD_RSHIFT1(tm); \
- tn += tmh; \
- tb += ta; \
- tbh = OD_RSHIFT1(tb); \
- ta -= tbh; \
- tk += tl; \
- tkh = OD_RSHIFT1(tk); \
- tl = tkh - tl; \
- tf += te; \
- tfh = OD_RSHIFT1(tf); \
- te -= tfh; \
- tg += th; \
- tgh = OD_RSHIFT1(tg); \
- th = tgh - th; \
- ti -= tc; \
- tih = OD_RSHIFT1(ti); \
- tc += tih; \
- td += tj; \
- tdh = OD_RSHIFT1(td); \
- tj -= tdh; \
- /* Stage 2 */ \
- t4 -= (tr*6723 + 4096) >> 13; \
- tr += (t4*16069 + 8192) >> 14; \
- t4 -= (tr*6723 + 4096) >> 13; \
- tq -= (t5*17515 + 16384) >> 15; \
- t5 += (tq*13623 + 8192) >> 14; \
- tq -= (t5*17515 + 16384) >> 15; \
- to -= (t7*3227 + 16384) >> 15; \
- t7 += (to*6393 + 16384) >> 15; \
- to -= (t7*3227 + 16384) >> 15; \
- tp -= (t6*2485 + 4096) >> 13; \
- t6 += (tp*18205 + 16384) >> 15; \
- tp -= (t6*2485 + 4096) >> 13; \
- /* Stage 3 */ \
- tp -= OD_RSHIFT1(t5); \
- t5 += tp; \
- t4 += OD_RSHIFT1(t7); \
- t7 = t4 - t7; \
- tr = OD_RSHIFT1(to) - tr; \
- to -= tr; \
- t6 += OD_RSHIFT1(tq); \
- tq = t6 - tq; \
- t0 -= tsh; \
- ts += t0; \
- tt += tuh; \
- tu -= tt; \
- tv += t3h; \
- t3 -= tv; \
- t2 -= t1h; \
- t1 += t2; \
- tn += tkh; \
- tk -= tn; \
- t8 += tbh; \
- tb -= t8; \
- tl += t9h; \
- t9 -= tl; \
- ta = tmh - ta; \
- tm = ta - tm; \
- te -= tdh; \
- td += te; \
- tj = tgh - tj; \
- tg -= tj; \
- th += tih; \
- ti = th - ti; \
- tc -= tfh; \
- tf += tc; \
- /* Stage 4 */ \
- tn -= (t8*29699 + 16384) >> 15; \
- t8 += (tn*16305 + 8192) >> 14; \
- tn -= (t8*29699 + 16384) >> 15; \
- t9 -= (tm*12151 + 8192) >> 14; \
- tm += (t9*31357 + 16384) >> 15; \
- t9 -= (tm*12151 + 8192) >> 14; \
- tl -= (ta*2455 + 2048) >> 12; \
- ta += (tl*28899 + 16384) >> 15; \
- tl -= (ta*2455 + 2048) >> 12; \
- tb -= (tk*7749 + 8192) >> 14; \
- tk += (tb*12665 + 8192) >> 14; \
- tb -= (tk*7749 + 8192) >> 14; \
- tg -= (tf*805 + 8192) >> 14; \
- tf += (tg*803 + 4096) >> 13; \
- tg -= (tf*805 + 8192) >> 14; \
- th -= (te*4861 + 16384) >> 15; \
- te += (th*1189 + 2048) >> 12; \
- th -= (te*4861 + 16384) >> 15; \
- td -= (ti*513 + 1024) >> 11; \
- ti += (td*15447 + 16384) >> 15; \
- td -= (ti*513 + 1024) >> 11; \
- tj -= (tc*11725 + 16384) >> 15; \
- tc += (tj*5197 + 4096) >> 13; \
- tj -= (tc*11725 + 16384) >> 15; \
- /* Stage 5 */ \
- t2 += t5; \
- t2h = OD_RSHIFT1(t2); \
- t5 -= t2h; \
- tt -= tq; \
- tth = OD_RSHIFT1(tt); \
- tq += tth; \
- tp += tu; \
- tph = OD_RSHIFT1(tp); \
- tu -= tph; \
- t6 -= t1; \
- t6h = OD_RSHIFT1(t6); \
- t1 += t6h; \
- tv = t7 - tv; \
- tvh = OD_RSHIFT1(tv); \
- t7 = tvh - t7; \
- t0 -= to; \
- t0h = OD_RSHIFT1(t0); \
- to += t0h; \
- t4 += ts; \
- t4h = OD_RSHIFT1(t4); \
- ts -= t4h; \
- tr -= t3; \
- trh = OD_RSHIFT1(tr); \
- t3 += trh; \
- t8 = tf - t8; \
- t8h = OD_RSHIFT1(t8); \
- tf -= t8h; \
- tn = tg - tn; \
- tnh = OD_RSHIFT1(tn); \
- tg -= tnh; \
- tc -= tb; \
- tch = OD_RSHIFT1(tc); \
- tb += tch; \
- tj += tk; \
- tjh = OD_RSHIFT1(tj); \
- tk = tjh - tk; \
- ta += ti; \
- tah = OD_RSHIFT1(ta); \
- ti -= tah; \
- tl += td; \
- tlh = OD_RSHIFT1(tl); \
- td = tlh - td; \
- te += t9; \
- teh = OD_RSHIFT1(te); \
- t9 = teh - t9; \
- th -= tm; \
- thh = OD_RSHIFT1(th); \
- tm += thh; \
- /* Stage 6 */ \
- t5 += tah; \
- ta -= t5; \
- ti += tth; \
- tt -= ti; \
- tq += tlh; \
- tl = tq - tl; \
- td -= t2h; \
- t2 += td; \
- t9 += tph; \
- tp -= t9; \
- t1 += teh; \
- te -= t1; \
- tm = t6h - tm; \
- t6 -= tm; \
- tu -= thh; \
- th += tu; \
- t7 += tnh; \
- tn = t7 - tn; \
- tg += t0h; \
- t0 -= tg; \
- to -= t8h; \
- t8 += to; \
- tf += tvh; \
- tv -= tf; \
- tb += t4h; \
- t4 -= tb; \
- t3 += tch; \
- tc -= t3; \
- tk = trh - tk; \
- tr = tk - tr; \
- ts -= tjh; \
- tj += ts; \
- /* Stage 7 */ \
- t0 -= (tv*31973 + 16384) >> 15; \
- tv += (t0*16379 + 8192) >> 14; \
- t0 -= (tv*31973 + 16384) >> 15; \
- tg -= (tf*819 + 1024) >> 11; \
- tf += (tg*22595 + 16384) >> 15; \
- tg -= (tf*819 + 1024) >> 11; \
- t8 -= (tn*10659 + 8192) >> 14; \
- tn += (t8*29957 + 16384) >> 15; \
- t8 -= (tn*10659 + 8192) >> 14; \
- to -= (t7*6101 + 16384) >> 15; \
- t7 += (to*11793 + 16384) >> 15; \
- to -= (t7*6101 + 16384) >> 15; \
- tt -= (t2*2013 + 16384) >> 15; \
- t2 += (tt*4011 + 16384) >> 15; \
- tt -= (t2*2013 + 16384) >> 15; \
- tl -= (ta*8637 + 16384) >> 15; \
- ta += (tl*16151 + 16384) >> 15; \
- tl -= (ta*8637 + 16384) >> 15; \
- ti -= (td*11273 + 16384) >> 15; \
- td += (ti*315 + 256) >> 9; \
- ti -= (td*11273 + 16384) >> 15; \
- tq -= (t5*2225 + 8192) >> 14; \
- t5 += (tq*2185 + 4096) >> 13; \
- tq -= (t5*2225 + 8192) >> 14; \
- ts -= (t3*1411 + 8192) >> 14; \
- t3 += (ts*2801 + 8192) >> 14; \
- ts -= (t3*1411 + 8192) >> 14; \
- tb -= (tk*18035 + 16384) >> 15; \
- tk += (tb*6921 + 4096) >> 13; \
- tb -= (tk*18035 + 16384) >> 15; \
- tj -= (tc*10381 + 16384) >> 15; \
- tc += (tj*4717 + 4096) >> 13; \
- tj -= (tc*10381 + 16384) >> 15; \
- t4 -= (tr*13113 + 8192) >> 14; \
- tr += (t4*7993 + 4096) >> 13; \
- t4 -= (tr*13113 + 8192) >> 14; \
- tu -= (t1*1207 + 16384) >> 15; \
- t1 += (tu*2411 + 16384) >> 15; \
- tu -= (t1*1207 + 16384) >> 15; \
- t9 -= (tm*20191 + 16384) >> 15; \
- tm += (t9*29269 + 16384) >> 15; \
- t9 -= (tm*20191 + 16384) >> 15; \
- th -= (te*3045 + 4096) >> 13; \
- te += (th*21403 + 16384) >> 15; \
- th -= (te*3045 + 4096) >> 13; \
- tp -= (t6*659 + 2048) >> 12; \
- t6 += (tp*10279 + 16384) >> 15; \
- tp -= (t6*659 + 2048) >> 12; \
- } \
- while (0)
-
-#if CONFIG_TX64X64
-#define OD_FDCT_32_ASYM_PR(t0, tg, tgh, t8, to, toh, t4, tk, tkh, tc, ts, tsh, \
- t2, ti, tih, ta, tq, tqh, t6, tm, tmh, te, tu, tuh, t1, th, thh, \
- t9, tp, tph, t5, tl, tlh, td, tt, tth, t3, tj, tjh, tb, tr, trh, \
- t7, tn, tnh, tf, tv, tvh) \
- /* Embedded 32-point asymmetric Type-II fDCT. */ \
- do { \
- t0 += tvh; \
- tv = t0 - tv; \
- t1 = tuh - t1; \
- tu -= t1; \
- t2 += tth; \
- tt = t2 - tt; \
- t3 = tsh - t3; \
- ts -= t3; \
- t4 += trh; \
- tr = t4 - tr; \
- t5 = tqh - t5; \
- tq -= t5; \
- t6 += tph; \
- tp = t6 - tp; \
- t7 = toh - t7; \
- to -= t7; \
- t8 += tnh; \
- tn = t8 - tn; \
- t9 = tmh - t9; \
- tm -= t9; \
- ta += tlh; \
- tl = ta - tl; \
- tb = tkh - tb; \
- tk -= tb; \
- tc += tjh; \
- tj = tc - tj; \
- td = tih - td; \
- ti -= td; \
- te += thh; \
- th = te - th; \
- tf = tgh - tf; \
- tg -= tf; \
- OD_FDCT_16_PR(t0, tg, t8, to, t4, tk, tc, ts, \
- t2, ti, ta, tq, t6, tm, te, tu); \
- OD_FDST_16_PR(tv, tf, tn, t7, tr, tb, tj, t3, \
- tt, td, tl, t5, tp, t9, th, t1); \
- } \
- while (0)
-
-#define OD_IDCT_32_ASYM_PR(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, \
- t6, tm, te, tu, t1, t1h, th, thh, t9, t9h, tp, tph, t5, t5h, tl, tlh, \
- td, tdh, tt, tth, t3, t3h, tj, tjh, tb, tbh, tr, trh, t7, t7h, tn, tnh, \
- tf, tfh, tv, tvh) \
- /* Embedded 32-point asymmetric Type-II iDCT. */ \
- do { \
- OD_IDST_16_PR(tv, tn, tr, tj, tt, tl, tp, th, \
- tu, tm, tq, ti, ts, tk, to, tg); \
- OD_IDCT_16_PR(t0, t8, t4, tc, t2, ta, t6, te, \
- t1, t9, t5, td, t3, tb, t7, tf); \
- tv = t0 - tv; \
- tvh = OD_RSHIFT1(tv); \
- t0 -= tvh; \
- t1 += tu; \
- t1h = OD_RSHIFT1(t1); \
- tu = t1h - tu; \
- tt = t2 - tt; \
- tth = OD_RSHIFT1(tt); \
- t2 -= tth; \
- t3 += ts; \
- t3h = OD_RSHIFT1(t3); \
- ts = t3h - ts; \
- tr = t4 - tr; \
- trh = OD_RSHIFT1(tr); \
- t4 -= trh; \
- t5 += tq; \
- t5h = OD_RSHIFT1(t5); \
- tq = t5h - tq; \
- tp = t6 - tp; \
- tph = OD_RSHIFT1(tp); \
- t6 -= tph; \
- t7 += to; \
- t7h = OD_RSHIFT1(t7); \
- to = t7h - to; \
- tn = t8 - tn; \
- tnh = OD_RSHIFT1(tn); \
- t8 -= tnh; \
- t9 += tm; \
- t9h = OD_RSHIFT1(t9); \
- tm = t9h - tm; \
- tl = ta - tl; \
- tlh = OD_RSHIFT1(tl); \
- ta -= tlh; \
- tb += tk; \
- tbh = OD_RSHIFT1(tb); \
- tk = tbh - tk; \
- tj = tc - tj; \
- tjh = OD_RSHIFT1(tj); \
- tc -= tjh; \
- td += ti; \
- tdh = OD_RSHIFT1(td); \
- ti = tdh - ti; \
- th = te - th; \
- thh = OD_RSHIFT1(th); \
- te -= thh; \
- tf += tg; \
- tfh = OD_RSHIFT1(tf); \
- tg = tfh - tg; \
- } \
- while (0)
-
-#define OD_FDST_32_ASYM_PR(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, t6, \
- tm, te, tu, t1, th, t9, tp, t5, tl, td, tt, t3, tj, tb, tr, t7, tn, tf, tv) \
- /* Embedded 32-point asymmetric Type-IV fDST. */ \
- do { \
- int t0h; \
- int t1h; \
- int t4h; \
- int t5h; \
- int tqh; \
- int trh; \
- int tuh; \
- int tvh; \
- \
- tu = -tu; \
- \
- /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
- OD_DCT_OVERFLOW_CHECK(tq, 13573, 8192, 271); \
- t5 -= (tq*13573 + 8192) >> 14; \
- /* 11585/32768 ~= Sin[Pi/4]/2 ~= 0.353553390593274 */ \
- OD_DCT_OVERFLOW_CHECK(t5, 11585, 16384, 272); \
- tq += (t5*11585 + 16384) >> 15; \
- /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
- OD_DCT_OVERFLOW_CHECK(tq, 13573, 8192, 273); \
- t5 -= (tq*13573 + 8192) >> 14; \
- /* 29957/32768 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
- OD_DCT_OVERFLOW_CHECK(t6, 29957, 16384, 274); \
- tp += (t6*29957 + 16384) >> 15; \
- /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
- OD_DCT_OVERFLOW_CHECK(tp, 11585, 8192, 275); \
- t6 -= (tp*11585 + 8192) >> 14; \
- /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
- OD_DCT_OVERFLOW_CHECK(t6, 19195, 16384, 276); \
- tp -= (t6*19195 + 16384) >> 15; \
- /* 29957/32768 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
- OD_DCT_OVERFLOW_CHECK(t1, 29957, 16384, 277); \
- tu += (t1*29957 + 16384) >> 15; \
- /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
- OD_DCT_OVERFLOW_CHECK(tu, 11585, 8192, 278); \
- t1 -= (tu*11585 + 8192) >> 14; \
- /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
- OD_DCT_OVERFLOW_CHECK(t1, 19195, 16384, 279); \
- tu -= (t1*19195 + 16384) >> 15; \
- /* 28681/32768 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
- OD_DCT_OVERFLOW_CHECK(t2, 28681, 16384, 280); \
- tt += (t2*28681 + 16384) >> 15; \
- /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
- OD_DCT_OVERFLOW_CHECK(tt, 15137, 8192, 281); \
- t2 -= (tt*15137 + 8192) >> 14; \
- /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
- OD_DCT_OVERFLOW_CHECK(t2, 4161, 8192, 282); \
- tt += (t2*4161 + 8192) >> 14; \
- /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
- OD_DCT_OVERFLOW_CHECK(ts, 4161, 8192, 283); \
- t3 += (ts*4161 + 8192) >> 14; \
- /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
- OD_DCT_OVERFLOW_CHECK(t3, 15137, 8192, 284); \
- ts -= (t3*15137 + 8192) >> 14; \
- /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
- OD_DCT_OVERFLOW_CHECK(ts, 14341, 8192, 285); \
- t3 += (ts*14341 + 8192) >> 14; \
- /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
- OD_DCT_OVERFLOW_CHECK(tm, 19195, 16384, 286); \
- t9 -= (tm*19195 + 16384) >> 15; \
- /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
- OD_DCT_OVERFLOW_CHECK(t9, 11585, 8192, 287); \
- tm -= (t9*11585 + 8192) >> 14; \
- /* 7489/8192 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
- OD_DCT_OVERFLOW_CHECK(tm, 7489, 4096, 288); \
- t9 += (tm*7489 + 4096) >> 13; \
- /* 3259/8192 ~= 2*Tan[Pi/16] ~= 0.397824734759316 */ \
- OD_DCT_OVERFLOW_CHECK(tl, 3259, 4096, 289); \
- ta += (tl*3259 + 4096) >> 13; \
- /* 3135/16384 ~= Sin[Pi/8]/2 ~= 0.1913417161825449 */ \
- OD_DCT_OVERFLOW_CHECK(ta, 3135, 8192, 290); \
- tl -= (ta*3135 + 8192) >> 14; \
- /* 3259/8192 ~= 2*Tan[Pi/16] ~= 0.397824734759316 */ \
- OD_DCT_OVERFLOW_CHECK(tl, 3259, 4096, 291); \
- ta += (tl*3259 + 4096) >> 13; \
- /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
- OD_DCT_OVERFLOW_CHECK(tk, 4161, 8192, 292); \
- tb += (tk*4161 + 8192) >> 14; \
- /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
- OD_DCT_OVERFLOW_CHECK(tb, 15137, 8192, 293); \
- tk -= (tb*15137 + 8192) >> 14; \
- /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
- OD_DCT_OVERFLOW_CHECK(tk, 14341, 8192, 294); \
- tb += (tk*14341 + 8192) >> 14; \
- /* 29957/32768 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
- OD_DCT_OVERFLOW_CHECK(te, 29957, 16384, 295); \
- th += (te*29957 + 16384) >> 15; \
- /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
- OD_DCT_OVERFLOW_CHECK(th, 11585, 8192, 296); \
- te -= (th*11585 + 8192) >> 14; \
- /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
- OD_DCT_OVERFLOW_CHECK(te, 19195, 16384, 297); \
- th -= (te*19195 + 16384) >> 15; \
- /* 28681/32768 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
- OD_DCT_OVERFLOW_CHECK(tc, 28681, 16384, 298); \
- tj += (tc*28681 + 16384) >> 15; \
- /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
- OD_DCT_OVERFLOW_CHECK(tj, 15137, 8192, 299); \
- tc -= (tj*15137 + 8192) >> 14; \
- /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
- OD_DCT_OVERFLOW_CHECK(tc, 4161, 8192, 300); \
- tj += (tc*4161 + 8192) >> 14; \
- /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
- OD_DCT_OVERFLOW_CHECK(ti, 4161, 8192, 301); \
- td += (ti*4161 + 8192) >> 14; \
- /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
- OD_DCT_OVERFLOW_CHECK(td, 15137, 8192, 302); \
- ti -= (td*15137 + 8192) >> 14; \
- /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
- OD_DCT_OVERFLOW_CHECK(ti, 14341, 8192, 303); \
- td += (ti*14341 + 8192) >> 14; \
- \
- t1 = -t1; \
- t2 = -t2; \
- t3 = -t3; \
- td = -td; \
- tg = -tg; \
- to = -to; \
- ts = -ts; \
- \
- tr -= OD_RSHIFT1(t5); \
- t5 += tr; \
- tq -= OD_RSHIFT1(t4); /* pass */ \
- t4 += tq; \
- t6 -= OD_RSHIFT1(t7); \
- t7 += t6; \
- to -= OD_RSHIFT1(tp); /* pass */ \
- tp += to; \
- t1 += OD_RSHIFT1(t0); /* pass */ \
- t0 -= t1; \
- tv -= OD_RSHIFT1(tu); \
- tu += tv; \
- t3 -= OD_RSHIFT1(tt); \
- tt += t3; \
- t2 += OD_RSHIFT1(ts); \
- ts -= t2; \
- t9 -= OD_RSHIFT1(t8); /* pass */ \
- t8 += t9; \
- tn += OD_RSHIFT1(tm); \
- tm -= tn; \
- tb += OD_RSHIFT1(ta); \
- ta -= tb; \
- tl -= OD_RSHIFT1(tk); \
- tk += tl; \
- te -= OD_RSHIFT1(tf); /* pass */ \
- tf += te; \
- tg -= OD_RSHIFT1(th); \
- th += tg; \
- tc -= OD_RSHIFT1(ti); \
- ti += tc; \
- td += OD_RSHIFT1(tj); \
- tj -= td; \
- \
- t4 = -t4; \
- \
- /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.8206787908286602 */ \
- OD_DCT_OVERFLOW_CHECK(tr, 6723, 4096, 304); \
- t4 += (tr*6723 + 4096) >> 13; \
- /* 16069/16384 ~= Sin[7*Pi/16] ~= 0.9807852804032304 */ \
- OD_DCT_OVERFLOW_CHECK(t4, 16069, 8192, 305); \
- tr -= (t4*16069 + 8192) >> 14; \
- /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.8206787908286602 */ \
- OD_DCT_OVERFLOW_CHECK(tr, 6723, 4096, 306); \
- t4 += (tr*6723 + 4096) >> 13; \
- /* 17515/32768 ~= Tan[5*Pi/32] ~= 0.5345111359507916 */ \
- OD_DCT_OVERFLOW_CHECK(tq, 17515, 16384, 307); \
- t5 += (tq*17515 + 16384) >> 15; \
- /* 13623/16384 ~= Sin[5*Pi/16] ~= 0.8314696123025452 */ \
- OD_DCT_OVERFLOW_CHECK(t5, 13623, 8192, 308); \
- tq -= (t5*13623 + 8192) >> 14; \
- /* 17515/32768 ~= Tan[5*Pi/32] ~= 0.5345111359507916 */ \
- OD_DCT_OVERFLOW_CHECK(tq, 17515, 16384, 309); \
- t5 += (tq*17515 + 16384) >> 15; \
- /* 3227/32768 ~= Tan[Pi/32] ~= 0.09849140335716425 */ \
- OD_DCT_OVERFLOW_CHECK(to, 3227, 16384, 310); \
- t7 += (to*3227 + 16384) >> 15; \
- /* 6393/32768 ~= Sin[Pi/16] ~= 0.19509032201612825 */ \
- OD_DCT_OVERFLOW_CHECK(t7, 6393, 16384, 311); \
- to -= (t7*6393 + 16384) >> 15; \
- /* 3227/32768 ~= Tan[Pi/32] ~= 0.09849140335716425 */ \
- OD_DCT_OVERFLOW_CHECK(to, 3227, 16384, 312); \
- t7 += (to*3227 + 16384) >> 15; \
- /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \
- OD_DCT_OVERFLOW_CHECK(tp, 2485, 4096, 313); \
- t6 += (tp*2485 + 4096) >> 13; \
- /* 18205/32768 ~= Sin[3*Pi/16] ~= 0.555570233019602 */ \
- OD_DCT_OVERFLOW_CHECK(t6, 18205, 16384, 314); \
- tp -= (t6*18205 + 16384) >> 15; \
- /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \
- OD_DCT_OVERFLOW_CHECK(tp, 2485, 4096, 315); \
- t6 += (tp*2485 + 4096) >> 13; \
- \
- t5 = -t5; \
- \
- tr += to; \
- trh = OD_RSHIFT1(tr); \
- to -= trh; \
- t4 += t7; \
- t4h = OD_RSHIFT1(t4); \
- t7 -= t4h; \
- t5 += tp; \
- t5h = OD_RSHIFT1(t5); \
- tp -= t5h; \
- tq += t6; \
- tqh = OD_RSHIFT1(tq); \
- t6 -= tqh; \
- t0 -= t3; \
- t0h = OD_RSHIFT1(t0); \
- t3 += t0h; \
- tv -= ts; \
- tvh = OD_RSHIFT1(tv); \
- ts += tvh; \
- tu += tt; \
- tuh = OD_RSHIFT1(tu); \
- tt -= tuh; \
- t1 -= t2; \
- t1h = OD_RSHIFT1(t1); \
- t2 += t1h; \
- t8 += tb; \
- tb -= OD_RSHIFT1(t8); \
- tn += tk; \
- tk -= OD_RSHIFT1(tn); \
- t9 += tl; \
- tl -= OD_RSHIFT1(t9); \
- tm -= ta; \
- ta += OD_RSHIFT1(tm); \
- tc -= tf; \
- tf += OD_RSHIFT1(tc); \
- tj += tg; \
- tg -= OD_RSHIFT1(tj); \
- td -= te; \
- te += OD_RSHIFT1(td); \
- ti += th; \
- th -= OD_RSHIFT1(ti); \
- \
- t9 = -t9; \
- tl = -tl; \
- \
- /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \
- OD_DCT_OVERFLOW_CHECK(tn, 805, 8192, 316); \
- t8 += (tn*805 + 8192) >> 14; \
- /* 803/8192 ~= Sin[Pi/32] ~= 0.0980171403295606 */ \
- OD_DCT_OVERFLOW_CHECK(t8, 803, 4096, 317); \
- tn -= (t8*803 + 4096) >> 13; \
- /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \
- OD_DCT_OVERFLOW_CHECK(tn, 805, 8192, 318); \
- t8 += (tn*805 + 8192) >> 14; \
- /* 11725/32768 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \
- OD_DCT_OVERFLOW_CHECK(tb, 11725, 16384, 319); \
- tk += (tb*11725 + 16384) >> 15; \
- /* 5197/8192 ~= Sin[7*Pi/32] ~= 0.6343932841636455 */ \
- OD_DCT_OVERFLOW_CHECK(tk, 5197, 4096, 320); \
- tb -= (tk*5197 + 4096) >> 13; \
- /* 11725/32768 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \
- OD_DCT_OVERFLOW_CHECK(tb, 11725, 16384, 321); \
- tk += (tb*11725 + 16384) >> 15; \
- /* 2455/4096 ~= Tan[11*Pi/64] ~= 0.5993769336819237 */ \
- OD_DCT_OVERFLOW_CHECK(tl, 2455, 2048, 322); \
- ta += (tl*2455 + 2048) >> 12; \
- /* 14449/16384 ~= Sin[11*Pi/32] ~= 0.881921264348355 */ \
- OD_DCT_OVERFLOW_CHECK(ta, 14449, 8192, 323); \
- tl -= (ta*14449 + 8192) >> 14; \
- /* 2455/4096 ~= Tan[11*Pi/64] ~= 0.5993769336819237 */ \
- OD_DCT_OVERFLOW_CHECK(tl, 2455, 2048, 324); \
- ta += (tl*2455 + 2048) >> 12; \
- /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \
- OD_DCT_OVERFLOW_CHECK(tm, 4861, 16384, 325); \
- t9 += (tm*4861 + 16384) >> 15; \
- /* 1189/4096 ~= Sin[3*Pi/32] ~= 0.29028467725446233 */ \
- OD_DCT_OVERFLOW_CHECK(t9, 1189, 2048, 326); \
- tm -= (t9*1189 + 2048) >> 12; \
- /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \
- OD_DCT_OVERFLOW_CHECK(tm, 4861, 16384, 327); \
- t9 += (tm*4861 + 16384) >> 15; \
- /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \
- OD_DCT_OVERFLOW_CHECK(tg, 805, 8192, 328); \
- tf += (tg*805 + 8192) >> 14; \
- /* 803/8192 ~= Sin[Pi/32] ~= 0.0980171403295606 */ \
- OD_DCT_OVERFLOW_CHECK(tf, 803, 4096, 329); \
- tg -= (tf*803 + 4096) >> 13; \
- /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \
- OD_DCT_OVERFLOW_CHECK(tg, 805, 8192, 330); \
- tf += (tg*805 + 8192) >> 14; \
- /* 2931/8192 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \
- OD_DCT_OVERFLOW_CHECK(tj, 2931, 4096, 331); \
- tc += (tj*2931 + 4096) >> 13; \
- /* 5197/8192 ~= Sin[7*Pi/32] ~= 0.6343932841636455 */ \
- OD_DCT_OVERFLOW_CHECK(tc, 5197, 4096, 332); \
- tj -= (tc*5197 + 4096) >> 13; \
- /* 2931/8192 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \
- OD_DCT_OVERFLOW_CHECK(tj, 2931, 4096, 333); \
- tc += (tj*2931 + 4096) >> 13; \
- /* 513/2048 ~= Tan[5*Pi/64] ~= 0.25048696019130545 */ \
- OD_DCT_OVERFLOW_CHECK(ti, 513, 1024, 334); \
- td += (ti*513 + 1024) >> 11; \
- /* 7723/16384 ~= Sin[5*Pi/32] ~= 0.47139673682599764 */ \
- OD_DCT_OVERFLOW_CHECK(td, 7723, 8192, 335); \
- ti -= (td*7723 + 8192) >> 14; \
- /* 513/2048 ~= Tan[5*Pi/64] ~= 0.25048696019130545 */ \
- OD_DCT_OVERFLOW_CHECK(ti, 513, 1024, 336); \
- td += (ti*513 + 1024) >> 11; \
- /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \
- OD_DCT_OVERFLOW_CHECK(th, 4861, 16384, 337); \
- te += (th*4861 + 16384) >> 15; \
- /* 1189/4096 ~= Sin[3*Pi/32] ~= 0.29028467725446233 */ \
- OD_DCT_OVERFLOW_CHECK(te, 1189, 2048, 338); \
- th -= (te*1189 + 2048) >> 12; \
- /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \
- OD_DCT_OVERFLOW_CHECK(th, 4861, 16384, 339); \
- te += (th*4861 + 16384) >> 15; \
- \
- ta = -ta; \
- tb = -tb; \
- \
- tt += t5h; \
- t5 -= tt; \
- t2 -= tqh; \
- tq += t2; \
- tp += t1h; \
- t1 -= tp; \
- t6 -= tuh; \
- tu += t6; \
- t7 += tvh; \
- tv -= t7; \
- to += t0h; \
- t0 -= to; \
- t3 -= t4h; \
- t4 += t3; \
- ts += trh; \
- tr -= ts; \
- tf -= OD_RSHIFT1(tn); \
- tn += tf; \
- tg -= OD_RSHIFT1(t8); \
- t8 += tg; \
- tk += OD_RSHIFT1(tc); \
- tc -= tk; \
- tb += OD_RSHIFT1(tj); \
- tj -= tb; \
- ta += OD_RSHIFT1(ti); \
- ti -= ta; \
- tl += OD_RSHIFT1(td); \
- td -= tl; \
- te -= OD_RSHIFT1(tm); \
- tm += te; \
- th -= OD_RSHIFT1(t9); \
- t9 += th; \
- ta -= t5; \
- t5 += OD_RSHIFT1(ta); \
- tq -= tl; \
- tl += OD_RSHIFT1(tq); \
- t2 -= ti; \
- ti += OD_RSHIFT1(t2); \
- td -= tt; \
- tt += OD_RSHIFT1(td); \
- tm += tp; \
- tp -= OD_RSHIFT1(tm); \
- t6 += t9; \
- t9 -= OD_RSHIFT1(t6); \
- te -= tu; \
- tu += OD_RSHIFT1(te); \
- t1 -= th; \
- th += OD_RSHIFT1(t1); \
- t0 -= tg; \
- tg += OD_RSHIFT1(t0); \
- tf += tv; \
- tv -= OD_RSHIFT1(tf); \
- t8 -= t7; \
- t7 += OD_RSHIFT1(t8); \
- to -= tn; \
- tn += OD_RSHIFT1(to); \
- t4 -= tk; \
- tk += OD_RSHIFT1(t4); \
- tb -= tr; \
- tr += OD_RSHIFT1(tb); \
- t3 -= tj; \
- tj += OD_RSHIFT1(t3); \
- tc -= ts; \
- ts += OD_RSHIFT1(tc); \
- \
- tr = -tr; \
- ts = -ts; \
- tt = -tt; \
- tu = -tu; \
- \
- /* 2847/4096 ~= (1/Sqrt[2] - Cos[63*Pi/128]/2)/Sin[63*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(t0, 2847, 2048, 340); \
- tv += (t0*2847 + 2048) >> 12; \
- /* 5791/4096 ~= Sqrt[2]*Sin[63*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(tv, 5791, 2048, 341); \
- t0 -= (tv*5791 + 2048) >> 12; \
- /* 5593/8192 ~= (1/Sqrt[2] - Cos[63*Pi/128])/Sin[63*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(t0, 5593, 4096, 342); \
- tv += (t0*5593 + 4096) >> 13; \
- /* 4099/8192 ~= (1/Sqrt[2] - Cos[31*Pi/128]/2)/Sin[31*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(tf, 4099, 4096, 343); \
- tg -= (tf*4099 + 4096) >> 13; \
- /* 1997/2048 ~= Sqrt[2]*Sin[31*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(tg, 1997, 1024, 344); \
- tf += (tg*1997 + 1024) >> 11; \
- /* -815/32768 ~= (1/Sqrt[2] - Cos[31*Pi/128])/Sin[31*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(tf, 815, 16384, 345); \
- tg += (tf*815 + 16384) >> 15; \
- /* 2527/4096 ~= (1/Sqrt[2] - Cos[17*Pi/128]/2)/Sin[17*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(t8, 2527, 2048, 346); \
- tn -= (t8*2527 + 2048) >> 12; \
- /* 4695/8192 ~= Sqrt[2]*Sin[17*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(tn, 4695, 4096, 347); \
- t8 += (tn*4695 + 4096) >> 13; \
- /* -4187/8192 ~= (1/Sqrt[2] - Cos[17*Pi/128])/Sin[17*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(t8, 4187, 4096, 348); \
- tn += (t8*4187 + 4096) >> 13; \
- /* 5477/8192 ~= (1/Sqrt[2] - Cos[15*Pi/128]/2)/Sin[15*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(to, 5477, 4096, 349); \
- t7 += (to*5477 + 4096) >> 13; \
- /* 4169/8192 ~= Sqrt[2]*Sin[15*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(t7, 4169, 4096, 350); \
- to -= (t7*4169 + 4096) >> 13; \
- /* -2571/4096 ~= (1/Sqrt[2] - Cos[15*Pi/128])/Sin[15*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(to, 2571, 2048, 351); \
- t7 -= (to*2571 + 2048) >> 12; \
- /* 5331/8192 ~= (1/Sqrt[2] - Cos[59*Pi/128]/2)/Sin[59*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(t2, 5331, 4096, 352); \
- tt += (t2*5331 + 4096) >> 13; \
- /* 5749/4096 ~= Sqrt[2]*Sin[59*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(tt, 5749, 2048, 353); \
- t2 -= (tt*5749 + 2048) >> 12; \
- /* 2413/4096 ~= (1/Sqrt[2] - Cos[59*Pi/128])/Sin[59*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(t2, 2413, 2048, 354); \
- tt += (t2*2413 + 2048) >> 12; \
- /* 4167/8192 ~= (1/Sqrt[2] - Cos[27*Pi/128]/2)/Sin[27*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(td, 4167, 4096, 355); \
- ti -= (td*4167 + 4096) >> 13; \
- /* 891/1024 ~= Sqrt[2]*Sin[27*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(ti, 891, 512, 356); \
- td += (ti*891 + 512) >> 10; \
- /* -4327/32768 ~= (1/Sqrt[2] - Cos[27*Pi/128])/Sin[27*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(td, 4327, 16384, 357); \
- ti += (td*4327 + 16384) >> 15; \
- /* 2261/4096 ~= (1/Sqrt[2] - Cos[21*Pi/128]/2)/Sin[21*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(ta, 2261, 2048, 358); \
- tl -= (ta*2261 + 2048) >> 12; \
- /* 2855/4096 ~= Sqrt[2]*Sin[21*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(tl, 2855, 2048, 359); \
- ta += (tl*2855 + 2048) >> 12; \
- /* -5417/16384 ~= (1/Sqrt[2] - Cos[21*Pi/128])/Sin[21*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(ta, 5417, 8192, 360); \
- tl += (ta*5417 + 8192) >> 14; \
- /* 3459/4096 ~= (1/Sqrt[2] - Cos[11*Pi/128]/2)/Sin[11*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(tq, 3459, 2048, 361); \
- t5 += (tq*3459 + 2048) >> 12; \
- /* 1545/4096 ~= Sqrt[2]*Sin[11*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(t5, 1545, 2048, 362); \
- tq -= (t5*1545 + 2048) >> 12; \
- /* -1971/2048 ~= (1/Sqrt[2] - Cos[11*Pi/128])/Sin[11*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(tq, 1971, 1024, 363); \
- t5 -= (tq*1971 + 1024) >> 11; \
- /* 323/512 ~= (1/Sqrt[2] - Cos[57*Pi/128]/2)/Sin[57*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(t3, 323, 256, 364); \
- ts += (t3*323 + 256) >> 9; \
- /* 5707/4096 ~= Sqrt[2]*Sin[57*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(ts, 5707, 2048, 365); \
- t3 -= (ts*5707 + 2048) >> 12; \
- /* 2229/4096 ~= (1/Sqrt[2] - Cos[57*Pi/128])/Sin[57*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(t3, 2229, 2048, 366); \
- ts += (t3*2229 + 2048) >> 12; \
- /* 1061/2048 ~= (1/Sqrt[2] - Cos[25*Pi/128]/2)/Sin[25*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(tc, 1061, 1024, 367); \
- tj -= (tc*1061 + 1024) >> 11; \
- /* 6671/8192 ~= Sqrt[2]*Sin[25*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(tj, 6671, 4096, 368); \
- tc += (tj*6671 + 4096) >> 13; \
- /* -6287/32768 ~= (1/Sqrt[2] - Cos[25*Pi/128])/Sin[25*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(tc, 6287, 16384, 369); \
- tj += (tc*6287 + 16384) >> 15; \
- /* 4359/8192 ~= (1/Sqrt[2] - Cos[23*Pi/128]/2)/Sin[23*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(tb, 4359, 4096, 370); \
- tk -= (tb*4359 + 4096) >> 13; \
- /* 3099/4096 ~= Sqrt[2]*Sin[23*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(tk, 3099, 2048, 371); \
- tb += (tk*3099 + 2048) >> 12; \
- /* -2109/8192 ~= (1/Sqrt[2] - Cos[23*Pi/128])/Sin[23*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(tb, 2109, 4096, 372); \
- tk += (tb*2109 + 4096) >> 13; \
- /* 5017/8192 ~= (1/Sqrt[2] - Cos[55*Pi/128]/2)/Sin[55*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(t4, 5017, 4096, 373); \
- tr += (t4*5017 + 4096) >> 13; \
- /* 1413/1024 ~= Sqrt[2]*Sin[55*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(tr, 1413, 512, 374); \
- t4 -= (tr*1413 + 512) >> 10; \
- /* 8195/16384 ~= (1/Sqrt[2] - Cos[55*Pi/128])/Sin[55*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(t4, 8195, 8192, 375); \
- tr += (t4*8195 + 8192) >> 14; \
- /* 2373/4096 ~= (1/Sqrt[2] - Cos[19*Pi/128]/2)/Sin[19*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(tm, 2373, 2048, 376); \
- t9 += (tm*2373 + 2048) >> 12; \
- /* 5209/8192 ~= Sqrt[2]*Sin[19*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(t9, 5209, 4096, 377); \
- tm -= (t9*5209 + 4096) >> 13; \
- /* -3391/8192 ~= (1/Sqrt[2] - Cos[19*Pi/128])/Sin[19*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(tm, 3391, 4096, 378); \
- t9 -= (tm*3391 + 4096) >> 13; \
- /* 1517/2048 ~= (1/Sqrt[2] - Cos[13*Pi/128]/2)/Sin[13*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(t6, 1517, 1024, 379); \
- tp -= (t6*1517 + 1024) >> 11; \
- /* 1817/4096 ~= Sqrt[2]*Sin[13*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(tp, 1817, 2048, 380); \
- t6 += (tp*1817 + 2048) >> 12; \
- /* -6331/8192 ~= (1/Sqrt[2] - Cos[13*Pi/128])/Sin[13*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(t6, 6331, 4096, 381); \
- tp += (t6*6331 + 4096) >> 13; \
- /* 515/1024 ~= (1/Sqrt[2] - Cos[29*Pi/128]/2)/Sin[29*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(te, 515, 512, 382); \
- th -= (te*515 + 512) >> 10; \
- /* 7567/8192 ~= Sqrt[2]*Sin[29*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(th, 7567, 4096, 383); \
- te += (th*7567 + 4096) >> 13; \
- /* -2513/32768 ~= (1/Sqrt[2] - Cos[29*Pi/128])/Sin[29*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(te, 2513, 16384, 384); \
- th += (te*2513 + 16384) >> 15; \
- /* 2753/4096 ~= (1/Sqrt[2] - Cos[61*Pi/128]/2)/Sin[61*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(t1, 2753, 2048, 385); \
- tu += (t1*2753 + 2048) >> 12; \
- /* 5777/4096 ~= Sqrt[2]*Sin[61*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(tu, 5777, 2048, 386); \
- t1 -= (tu*5777 + 2048) >> 12; \
- /* 1301/2048 ~= (1/Sqrt[2] - Cos[61*Pi/128])/Sin[61*Pi/128] */ \
- OD_DCT_OVERFLOW_CHECK(t1, 1301, 1024, 387); \
- tu += (t1*1301 + 1024) >> 11; \
- } \
- while (0)
-
-#define OD_IDST_32_ASYM_PR(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, t6, \
- tm, te, tu, t1, th, t9, tp, t5, tl, td, tt, t3, tj, tb, tr, t7, tn, tf, tv) \
- /* Embedded 32-point asymmetric Type-IV iDST. */ \
- do { \
- int t0h; \
- int t4h; \
- int tbh; \
- int tfh; \
- int tgh; \
- int tkh; \
- int trh; \
- int tvh; \
- /* 1301/2048 ~= (1/Sqrt[2] - Cos[61*Pi/128])/Sin[61*Pi/128] */ \
- tf -= (tg*1301 + 1024) >> 11; \
- /* 5777/4096 ~= Sqrt[2]*Sin[61*Pi/128] */ \
- tg += (tf*5777 + 2048) >> 12; \
- /* 2753/4096 ~= (1/Sqrt[2] - Cos[61*Pi/128]/2)/Sin[61*Pi/128] */ \
- tf -= (tg*2753 + 2048) >> 12; \
- /* -2513/32768 ~= (1/Sqrt[2] - Cos[29*Pi/128])/Sin[29*Pi/128] */ \
- th -= (te*2513 + 16384) >> 15; \
- /* 7567/8192 ~= Sqrt[2]*Sin[29*Pi/128] */ \
- te -= (th*7567 + 4096) >> 13; \
- /* 515/1024 ~= (1/Sqrt[2] - Cos[29*Pi/128]/2)/Sin[29*Pi/128] */ \
- th += (te*515 + 512) >> 10; \
- /* -6331/8192 ~= (1/Sqrt[2] - Cos[13*Pi/128])/Sin[13*Pi/128] */ \
- tj -= (tc*6331 + 4096) >> 13; \
- /* 1817/4096 ~= Sqrt[2]*Sin[13*Pi/128] */ \
- tc -= (tj*1817 + 2048) >> 12; \
- /* 1517/2048 ~= (1/Sqrt[2] - Cos[13*Pi/128]/2)/Sin[13*Pi/128] */ \
- tj += (tc*1517 + 1024) >> 11; \
- /* -3391/8192 ~= (1/Sqrt[2] - Cos[19*Pi/128])/Sin[19*Pi/128] */ \
- ti += (td*3391 + 4096) >> 13; \
- /* 5209/8192 ~= Sqrt[2]*Sin[19*Pi/128] */ \
- td += (ti*5209 + 4096) >> 13; \
- /* 2373/4096 ~= (1/Sqrt[2] - Cos[19*Pi/128]/2)/Sin[19*Pi/128] */ \
- ti -= (td*2373 + 2048) >> 12; \
- /* 8195/16384 ~= (1/Sqrt[2] - Cos[55*Pi/128])/Sin[55*Pi/128] */ \
- tr -= (t4*8195 + 8192) >> 14; \
- /* 1413/1024 ~= Sqrt[2]*Sin[55*Pi/128] */ \
- t4 += (tr*1413 + 512) >> 10; \
- /* 5017/8192 ~= (1/Sqrt[2] - Cos[55*Pi/128]/2)/Sin[55*Pi/128] */ \
- tr -= (t4*5017 + 4096) >> 13; \
- /* -2109/8192 ~= (1/Sqrt[2] - Cos[23*Pi/128])/Sin[23*Pi/128] */ \
- t5 -= (tq*2109 + 4096) >> 13; \
- /* 3099/4096 ~= Sqrt[2]*Sin[23*Pi/128] */ \
- tq -= (t5*3099 + 2048) >> 12; \
- /* 4359/8192 ~= (1/Sqrt[2] - Cos[23*Pi/128]/2)/Sin[23*Pi/128] */ \
- t5 += (tq*4359 + 4096) >> 13; \
- /* -6287/32768 ~= (1/Sqrt[2] - Cos[25*Pi/128])/Sin[25*Pi/128] */ \
- tp -= (t6*6287 + 16384) >> 15; \
- /* 6671/8192 ~= Sqrt[2]*Sin[25*Pi/128] */ \
- t6 -= (tp*6671 + 4096) >> 13; \
- /* 1061/2048 ~= (1/Sqrt[2] - Cos[25*Pi/128]/2)/Sin[25*Pi/128] */ \
- tp += (t6*1061 + 1024) >> 11; \
- /* 2229/4096 ~= (1/Sqrt[2] - Cos[57*Pi/128])/Sin[57*Pi/128] */ \
- t7 -= (to*2229 + 2048) >> 12; \
- /* 5707/4096 ~= Sqrt[2]*Sin[57*Pi/128] */ \
- to += (t7*5707 + 2048) >> 12; \
- /* 323/512 ~= (1/Sqrt[2] - Cos[57*Pi/128]/2)/Sin[57*Pi/128] */ \
- t7 -= (to*323 + 256) >> 9; \
- /* -1971/2048 ~= (1/Sqrt[2] - Cos[11*Pi/128])/Sin[11*Pi/128] */ \
- tk += (tb*1971 + 1024) >> 11; \
- /* 1545/4096 ~= Sqrt[2]*Sin[11*Pi/128] */ \
- tb += (tk*1545 + 2048) >> 12; \
- /* 3459/4096 ~= (1/Sqrt[2] - Cos[11*Pi/128]/2)/Sin[11*Pi/128] */ \
- tk -= (tb*3459 + 2048) >> 12; \
- /* -5417/16384 ~= (1/Sqrt[2] - Cos[21*Pi/128])/Sin[21*Pi/128] */ \
- tl -= (ta*5417 + 8192) >> 14; \
- /* 2855/4096 ~= Sqrt[2]*Sin[21*Pi/128] */ \
- ta -= (tl*2855 + 2048) >> 12; \
- /* 2261/4096 ~= (1/Sqrt[2] - Cos[21*Pi/128]/2)/Sin[21*Pi/128] */ \
- tl += (ta*2261 + 2048) >> 12; \
- /* -4327/32768 ~= (1/Sqrt[2] - Cos[27*Pi/128])/Sin[27*Pi/128] */ \
- t9 -= (tm*4327 + 16384) >> 15; \
- /* 891/1024 ~= Sqrt[2]*Sin[27*Pi/128] */ \
- tm -= (t9*891 + 512) >> 10; \
- /* 4167/8192 ~= (1/Sqrt[2] - Cos[27*Pi/128]/2)/Sin[27*Pi/128] */ \
- t9 += (tm*4167 + 4096) >> 13; \
- /* 2413/4096 ~= (1/Sqrt[2] - Cos[59*Pi/128])/Sin[59*Pi/128] */ \
- tn -= (t8*2413 + 2048) >> 12; \
- /* 5749/4096 ~= Sqrt[2]*Sin[59*Pi/128] */ \
- t8 += (tn*5749 + 2048) >> 12; \
- /* 5331/8192 ~= (1/Sqrt[2] - Cos[59*Pi/128]/2)/Sin[59*Pi/128] */ \
- tn -= (t8*5331 + 4096) >> 13; \
- /* -2571/4096 ~= (1/Sqrt[2] - Cos[15*Pi/128])/Sin[15*Pi/128] */ \
- ts += (t3*2571 + 2048) >> 12; \
- /* 4169/8192 ~= Sqrt[2]*Sin[15*Pi/128] */ \
- t3 += (ts*4169 + 4096) >> 13; \
- /* 5477/8192 ~= (1/Sqrt[2] - Cos[15*Pi/128]/2)/Sin[15*Pi/128] */ \
- ts -= (t3*5477 + 4096) >> 13; \
- /* -4187/8192 ~= (1/Sqrt[2] - Cos[17*Pi/128])/Sin[17*Pi/128] */ \
- tt -= (t2*4187 + 4096) >> 13; \
- /* 4695/8192 ~= Sqrt[2]*Sin[17*Pi/128] */ \
- t2 -= (tt*4695 + 4096) >> 13; \
- /* 2527/4096 ~= (1/Sqrt[2] - Cos[17*Pi/128]/2)/Sin[17*Pi/128] */ \
- tt += (t2*2527 + 2048) >> 12; \
- /* -815/32768 ~= (1/Sqrt[2] - Cos[31*Pi/128])/Sin[31*Pi/128] */ \
- t1 -= (tu*815 + 16384) >> 15; \
- /* 1997/2048 ~= Sqrt[2]*Sin[31*Pi/128] */ \
- tu -= (t1*1997 + 1024) >> 11; \
- /* 4099/8192 ~= (1/Sqrt[2] - Cos[31*Pi/128]/2)/Sin[31*Pi/128] */ \
- t1 += (tu*4099 + 4096) >> 13; \
- /* 5593/8192 ~= (1/Sqrt[2] - Cos[63*Pi/128])/Sin[63*Pi/128] */ \
- tv -= (t0*5593 + 4096) >> 13; \
- /* 5791/4096 ~= Sqrt[2]*Sin[63*Pi/128] */ \
- t0 += (tv*5791 + 2048) >> 12; \
- /* 2847/4096 ~= (1/Sqrt[2] - Cos[63*Pi/128]/2)/Sin[63*Pi/128] */ \
- tv -= (t0*2847 + 2048) >> 12; \
- \
- t7 = -t7; \
- tf = -tf; \
- tn = -tn; \
- tr = -tr; \
- \
- t7 -= OD_RSHIFT1(t6); \
- t6 += t7; \
- tp -= OD_RSHIFT1(to); \
- to += tp; \
- tr -= OD_RSHIFT1(tq); \
- tq += tr; \
- t5 -= OD_RSHIFT1(t4); \
- t4 += t5; \
- tt -= OD_RSHIFT1(t3); \
- t3 += tt; \
- ts -= OD_RSHIFT1(t2); \
- t2 += ts; \
- tv += OD_RSHIFT1(tu); \
- tu -= tv; \
- t1 -= OD_RSHIFT1(t0); \
- t0 += t1; \
- th -= OD_RSHIFT1(tg); \
- tg += th; \
- tf -= OD_RSHIFT1(te); \
- te += tf; \
- ti += OD_RSHIFT1(tc); \
- tc -= ti; \
- tj += OD_RSHIFT1(td); \
- td -= tj; \
- tn -= OD_RSHIFT1(tm); \
- tm += tn; \
- t9 -= OD_RSHIFT1(t8); \
- t8 += t9; \
- tl -= OD_RSHIFT1(tb); \
- tb += tl; \
- tk -= OD_RSHIFT1(ta); \
- ta += tk; \
- \
- ti -= th; \
- th += OD_RSHIFT1(ti); \
- td -= te; \
- te += OD_RSHIFT1(td); \
- tm += tl; \
- tl -= OD_RSHIFT1(tm); \
- t9 += ta; \
- ta -= OD_RSHIFT1(t9); \
- tp += tq; \
- tq -= OD_RSHIFT1(tp); \
- t6 += t5; \
- t5 -= OD_RSHIFT1(t6); \
- t2 -= t1; \
- t1 += OD_RSHIFT1(t2); \
- tt -= tu; \
- tu += OD_RSHIFT1(tt); \
- tr += t7; \
- trh = OD_RSHIFT1(tr); \
- t7 -= trh; \
- t4 -= to; \
- t4h = OD_RSHIFT1(t4); \
- to += t4h; \
- t0 += t3; \
- t0h = OD_RSHIFT1(t0); \
- t3 -= t0h; \
- tv += ts; \
- tvh = OD_RSHIFT1(tv); \
- ts -= tvh; \
- tf -= tc; \
- tfh = OD_RSHIFT1(tf); \
- tc += tfh; \
- tg += tj; \
- tgh = OD_RSHIFT1(tg); \
- tj -= tgh; \
- tb -= t8; \
- tbh = OD_RSHIFT1(tb); \
- t8 += tbh; \
- tk += tn; \
- tkh = OD_RSHIFT1(tk); \
- tn -= tkh; \
- \
- ta = -ta; \
- tq = -tq; \
- \
- /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \
- te -= (th*4861 + 16384) >> 15; \
- /* 1189/4096 ~= Sin[3*Pi/32] ~= 0.29028467725446233 */ \
- th += (te*1189 + 2048) >> 12; \
- /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \
- te -= (th*4861 + 16384) >> 15; \
- /* 513/2048 ~= Tan[5*Pi/64] ~= 0.25048696019130545 */ \
- tm -= (t9*513 + 1024) >> 11; \
- /* 7723/16384 ~= Sin[5*Pi/32] ~= 0.47139673682599764 */ \
- t9 += (tm*7723 + 8192) >> 14; \
- /* 513/2048 ~= Tan[5*Pi/64] ~= 0.25048696019130545 */ \
- tm -= (t9*513 + 1024) >> 11; \
- /* 2931/8192 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \
- t6 -= (tp*2931 + 4096) >> 13; \
- /* 5197/8192 ~= Sin[7*Pi/32] ~= 0.6343932841636455 */ \
- tp += (t6*5197 + 4096) >> 13; \
- /* 2931/8192 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \
- t6 -= (tp*2931 + 4096) >> 13; \
- /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \
- tu -= (t1*805 + 8192) >> 14; \
- /* 803/8192 ~= Sin[Pi/32] ~= 0.0980171403295606 */ \
- t1 += (tu*803 + 4096) >> 13; \
- /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \
- tu -= (t1*805 + 8192) >> 14; \
- /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \
- ti -= (td*4861 + 16384) >> 15; \
- /* 1189/4096 ~= Sin[3*Pi/32] ~= 0.29028467725446233 */ \
- td += (ti*1189 + 2048) >> 12; \
- /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \
- ti -= (td*4861 + 16384) >> 15; \
- /* 2455/4096 ~= Tan[11*Pi/64] ~= 0.5993769336819237 */ \
- ta -= (tl*2455 + 2048) >> 12; \
- /* 14449/16384 ~= Sin[11*Pi/32] ~= 0.881921264348355 */ \
- tl += (ta*14449 + 8192) >> 14; \
- /* 2455/4096 ~= Tan[11*Pi/64] ~= 0.5993769336819237 */ \
- ta -= (tl*2455 + 2048) >> 12; \
- /* 11725/32768 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \
- t5 -= (tq*11725 + 16384) >> 15; \
- /* 5197/8192 ~= Sin[7*Pi/32] ~= 0.6343932841636455 */ \
- tq += (t5*5197 + 4096) >> 13; \
- /* 11725/32768 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \
- t5 -= (tq*11725 + 16384) >> 15; \
- /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \
- t2 -= (tt*805 + 8192) >> 14; \
- /* 803/8192 ~= Sin[Pi/32] ~= 0.0980171403295606 */ \
- tt += (t2*803 + 4096) >> 13; \
- /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \
- t2 -= (tt*805 + 8192) >> 14; \
- \
- tl = -tl; \
- ti = -ti; \
- \
- th += OD_RSHIFT1(t9); \
- t9 -= th; \
- te -= OD_RSHIFT1(tm); \
- tm += te; \
- t1 += OD_RSHIFT1(tp); \
- tp -= t1; \
- tu -= OD_RSHIFT1(t6); \
- t6 += tu; \
- ta -= OD_RSHIFT1(td); \
- td += ta; \
- tl += OD_RSHIFT1(ti); \
- ti -= tl; \
- t5 += OD_RSHIFT1(tt); \
- tt -= t5; \
- tq += OD_RSHIFT1(t2); \
- t2 -= tq; \
- \
- t8 -= tgh; \
- tg += t8; \
- tn += tfh; \
- tf -= tn; \
- t7 -= tvh; \
- tv += t7; \
- to -= t0h; \
- t0 += to; \
- tc += tbh; \
- tb -= tc; \
- tj += tkh; \
- tk -= tj; \
- ts += t4h; \
- t4 -= ts; \
- t3 += trh; \
- tr -= t3; \
- \
- tk = -tk; \
- \
- /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \
- tc -= (tj*2485 + 4096) >> 13; \
- /* 18205/32768 ~= Sin[3*Pi/16] ~= 0.555570233019602 */ \
- tj += (tc*18205 + 16384) >> 15; \
- /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \
- tc -= (tj*2485 + 4096) >> 13; \
- /* 3227/32768 ~= Tan[Pi/32] ~= 0.09849140335716425 */ \
- ts -= (t3*3227 + 16384) >> 15; \
- /* 6393/32768 ~= Sin[Pi/16] ~= 0.19509032201612825 */ \
- t3 += (ts*6393 + 16384) >> 15; \
- /* 3227/32768 ~= Tan[Pi/32] ~= 0.09849140335716425 */ \
- ts -= (t3*3227 + 16384) >> 15; \
- /* 17515/32768 ~= Tan[5*Pi/32] ~= 0.5345111359507916 */ \
- tk -= (tb*17515 + 16384) >> 15; \
- /* 13623/16384 ~= Sin[5*Pi/16] ~= 0.8314696123025452 */ \
- tb += (tk*13623 + 8192) >> 14; \
- /* 17515/32768 ~= Tan[5*Pi/32] ~= 0.5345111359507916 */ \
- tk -= (tb*17515 + 16384) >> 15; \
- /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.8206787908286602 */ \
- t4 -= (tr*6723 + 4096) >> 13; \
- /* 16069/16384 ~= Sin[7*Pi/16] ~= 0.9807852804032304 */ \
- tr += (t4*16069 + 8192) >> 14; \
- /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.8206787908286602 */ \
- t4 -= (tr*6723 + 4096) >> 13; \
- \
- t4 = -t4; \
- \
- tp += tm; \
- tm -= OD_RSHIFT1(tp); \
- t9 -= t6; \
- t6 += OD_RSHIFT1(t9); \
- th -= t1; \
- t1 += OD_RSHIFT1(th); \
- tu -= te; \
- te += OD_RSHIFT1(tu); /* pass */ \
- t5 -= tl; \
- tl += OD_RSHIFT1(t5); \
- ta += tq; \
- tq -= OD_RSHIFT1(ta); \
- td += tt; \
- tt -= OD_RSHIFT1(td); \
- t2 -= ti; \
- ti += OD_RSHIFT1(t2); /* pass */ \
- t7 += t8; \
- t8 -= OD_RSHIFT1(t7); \
- tn -= to; \
- to += OD_RSHIFT1(tn); \
- tf -= tv; \
- tv += OD_RSHIFT1(tf); \
- t0 += tg; \
- tg -= OD_RSHIFT1(t0); /* pass */ \
- tj -= t3; \
- t3 += OD_RSHIFT1(tj); /* pass */ \
- ts -= tc; \
- tc += OD_RSHIFT1(ts); \
- t4 -= tb; \
- tb += OD_RSHIFT1(t4); /* pass */ \
- tk -= tr; \
- tr += OD_RSHIFT1(tk); \
- \
- t1 = -t1; \
- t3 = -t3; \
- t7 = -t7; \
- t8 = -t8; \
- tg = -tg; \
- tm = -tm; \
- to = -to; \
- \
- /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
- tm -= (t9*14341 + 8192) >> 14; \
- /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
- t9 += (tm*15137 + 8192) >> 14; \
- /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
- tm -= (t9*4161 + 8192) >> 14; \
- /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
- tp -= (t6*4161 + 8192) >> 14; \
- /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
- t6 += (tp*15137 + 8192) >> 14; \
- /* 28681/32768 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
- tp -= (t6*28681 + 16384) >> 15; \
- /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
- th += (te*19195 + 16384) >> 15; \
- /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
- te += (th*11585 + 8192) >> 14; \
- /* 29957/32768 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
- th -= (te*29957 + 16384) >> 15; \
- /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
- tq -= (t5*14341 + 8192) >> 14; \
- /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
- t5 += (tq*15137 + 8192) >> 14; \
- /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
- tq -= (t5*4161 + 8192) >> 14; \
- /* 3259/8192 ~= 2*Tan[Pi/16] ~= 0.397824734759316 */ \
- ta -= (tl*3259 + 4096) >> 13; \
- /* 3135/16384 ~= Sin[Pi/8]/2 ~= 0.1913417161825449 */ \
- tl += (ta*3135 + 8192) >> 14; \
- /* 3259/8192 ~= 2*Tan[Pi/16] ~= 0.397824734759316 */ \
- ta -= (tl*3259 + 4096) >> 13; \
- /* 7489/8192 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
- ti -= (td*7489 + 4096) >> 13; \
- /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
- td += (ti*11585 + 8192) >> 14; \
- /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
- ti += (td*19195 + 16384) >> 15; \
- /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
- to -= (t7*14341 + 8192) >> 14; \
- /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
- t7 += (to*15137 + 8192) >> 14; \
- /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
- to -= (t7*4161 + 8192) >> 14; \
- /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
- tn -= (t8*4161 + 8192) >> 14; \
- /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
- t8 += (tn*15137 + 8192) >> 14; \
- /* 28681/32768 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
- tn -= (t8*28681 + 16384) >> 15; \
- /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
- tf += (tg*19195 + 16384) >> 15; \
- /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
- tg += (tf*11585 + 8192) >> 14; \
- /* 29957/32768 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
- tf -= (tg*29957 + 16384) >> 15; \
- /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
- tj += (tc*19195 + 16384) >> 15; \
- /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
- tc += (tj*11585 + 8192) >> 14; \
- /* 29957/32768 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
- tj -= (tc*29957 + 16384) >> 15; \
- /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
- tk += (tb*13573 + 8192) >> 14; \
- /* 11585/32768 ~= Sin[Pi/4]/2 ~= 0.353553390593274 */ \
- tb -= (tk*11585 + 16384) >> 15; \
- /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
- tk += (tb*13573 + 8192) >> 14; \
- \
- tf = -tf; \
- \
- } \
- while (0)
-
-#define OD_FDCT_64_PR(u0, uw, ug, uM, u8, uE, uo, uU, u4, uA, uk, uQ, uc, uI, \
- us, uY, u2, uy, ui, uO, ua, uG, uq, uW, u6, uC, um, uS, ue, uK, uu, u_, u1, \
- ux, uh, uN, u9, uF, up, uV, u5, uB, ul, uR, ud, uJ, ut, uZ, u3, uz, uj, uP, \
- ub, uH, ur, uX, u7, uD, un, uT, uf, uL, uv, u) \
- /* Embedded 64-point orthonormal Type-II fDCT. */ \
- do { \
- int uwh; \
- int uxh; \
- int uyh; \
- int uzh; \
- int uAh; \
- int uBh; \
- int uCh; \
- int uDh; \
- int uEh; \
- int uFh; \
- int uGh; \
- int uHh; \
- int uIh; \
- int uJh; \
- int uKh; \
- int uLh; \
- int uMh; \
- int uNh; \
- int uOh; \
- int uPh; \
- int uQh; \
- int uRh; \
- int uSh; \
- int uTh; \
- int uUh; \
- int uVh; \
- int uWh; \
- int uXh; \
- int uYh; \
- int uZh; \
- int u_h; \
- int uh_; \
- u = u0 - u; \
- uh_ = OD_RSHIFT1(u); \
- u0 -= uh_; \
- u_ += u1; \
- u_h = OD_RSHIFT1(u_); \
- u1 = u_h - u1; \
- uZ = u2 - uZ; \
- uZh = OD_RSHIFT1(uZ); \
- u2 -= uZh; \
- uY += u3; \
- uYh = OD_RSHIFT1(uY); \
- u3 = uYh - u3; \
- uX = u4 - uX; \
- uXh = OD_RSHIFT1(uX); \
- u4 -= uXh; \
- uW += u5; \
- uWh = OD_RSHIFT1(uW); \
- u5 = uWh - u5; \
- uV = u6 - uV; \
- uVh = OD_RSHIFT1(uV); \
- u6 -= uVh; \
- uU += u7; \
- uUh = OD_RSHIFT1(uU); \
- u7 = uUh - u7; \
- uT = u8 - uT; \
- uTh = OD_RSHIFT1(uT); \
- u8 -= uTh; \
- uS += u9; \
- uSh = OD_RSHIFT1(uS); \
- u9 = uSh - u9; \
- uR = ua - uR; \
- uRh = OD_RSHIFT1(uR); \
- ua -= uRh; \
- uQ += ub; \
- uQh = OD_RSHIFT1(uQ); \
- ub = uQh - ub; \
- uP = uc - uP; \
- uPh = OD_RSHIFT1(uP); \
- uc -= uPh; \
- uO += ud; \
- uOh = OD_RSHIFT1(uO); \
- ud = uOh - ud; \
- uN = ue - uN; \
- uNh = OD_RSHIFT1(uN); \
- ue -= uNh; \
- uM += uf; \
- uMh = OD_RSHIFT1(uM); \
- uf = uMh - uf; \
- uL = ug - uL; \
- uLh = OD_RSHIFT1(uL); \
- ug -= uLh; \
- uK += uh; \
- uKh = OD_RSHIFT1(uK); \
- uh = uKh - uh; \
- uJ = ui - uJ; \
- uJh = OD_RSHIFT1(uJ); \
- ui -= uJh; \
- uI += uj; \
- uIh = OD_RSHIFT1(uI); \
- uj = uIh - uj; \
- uH = uk - uH; \
- uHh = OD_RSHIFT1(uH); \
- uk -= uHh; \
- uG += ul; \
- uGh = OD_RSHIFT1(uG); \
- ul = uGh - ul; \
- uF = um - uF; \
- uFh = OD_RSHIFT1(uF); \
- um -= uFh; \
- uE += un; \
- uEh = OD_RSHIFT1(uE); \
- un = uEh - un; \
- uD = uo - uD; \
- uDh = OD_RSHIFT1(uD); \
- uo -= uDh; \
- uC += up; \
- uCh = OD_RSHIFT1(uC); \
- up = uCh - up; \
- uB = uq - uB; \
- uBh = OD_RSHIFT1(uB); \
- uq -= uBh; \
- uA += ur; \
- uAh = OD_RSHIFT1(uA); \
- ur = uAh - ur; \
- uz = us - uz; \
- uzh = OD_RSHIFT1(uz); \
- us -= uzh; \
- uy += ut; \
- uyh = OD_RSHIFT1(uy); \
- ut = uyh - ut; \
- ux = uu - ux; \
- uxh = OD_RSHIFT1(ux); \
- uu -= uxh; \
- uw += uv; \
- uwh = OD_RSHIFT1(uw); \
- uv = uwh - uv; \
- OD_FDCT_32_ASYM_PR(u0, uw, uwh, ug, uM, uMh, u8, uE, uEh, uo, uU, uUh, \
- u4, uA, uAh, uk, uQ, uQh, uc, uI, uIh, us, uY, uYh, u2, uy, uyh, \
- ui, uO, uOh, ua, uG, uGh, uq, uW, uWh, u6, uC, uCh, um, uS, uSh, \
- ue, uK, uKh, uu, u_, u_h); \
- OD_FDST_32_ASYM_PR(u, uv, uL, uf, uT, un, uD, u7, uX, ur, uH, ub, uP, uj, \
- uz, u3, uZ, ut, uJ, ud, uR, ul, uB, u5, uV, up, uF, u9, uN, uh, ux, u1); \
- } \
- while (0)
-
-#define OD_IDCT_64_PR(u0, uw, ug, uM, u8, uE, uo, uU, u4, uA, uk, uQ, uc, uI, \
- us, uY, u2, uy, ui, uO, ua, uG, uq, uW, u6, uC, um, uS, ue, uK, uu, u_, u1, \
- ux, uh, uN, u9, uF, up, uV, u5, uB, ul, uR, ud, uJ, ut, uZ, u3, uz, uj, uP, \
- ub, uH, ur, uX, u7, uD, un, uT, uf, uL, uv, u) \
- /* Embedded 64-point orthonormal Type-II fDCT. */ \
- do { \
- int u1h; \
- int u3h; \
- int u5h; \
- int u7h; \
- int u9h; \
- int ubh; \
- int udh; \
- int ufh; \
- int uhh; \
- int ujh; \
- int ulh; \
- int unh; \
- int uph; \
- int urh; \
- int uth; \
- int uvh; \
- int uxh; \
- int uzh; \
- int uBh; \
- int uDh; \
- int uFh; \
- int uHh; \
- int uJh; \
- int uLh; \
- int uNh; \
- int uPh; \
- int uRh; \
- int uTh; \
- int uVh; \
- int uXh; \
- int uZh; \
- int uh_; \
- OD_IDST_32_ASYM_PR(u, uL, uT, uD, uX, uH, uP, uz, uZ, uJ, uR, uB, uV, uF, \
- uN, ux, u_, uK, uS, uC, uW, uG, uO, uy, uY, uI, uQ, uA, uU, uE, uM, uw); \
- OD_IDCT_32_ASYM_PR(u0, ug, u8, uo, u4, uk, uc, us, u2, ui, ua, uq, u6, um, \
- ue, uu, u1, u1h, uh, uhh, u9, u9h, up, uph, u5, u5h, ul, ulh, ud, udh, \
- ut, uth, u3, u3h, uj, ujh, ub, ubh, ur, urh, u7, u7h, un, unh, uf, ufh, \
- uv, uvh); \
- uh_ = OD_RSHIFT1(u); \
- u0 += uh_; \
- u = u0 - u; \
- u_ = u1h - u_; \
- u1 -= u_; \
- uZh = OD_RSHIFT1(uZ); \
- u2 += uZh; \
- uZ = u2 - uZ; \
- uY = u3h - uY; \
- u3 -= uY; \
- uXh = OD_RSHIFT1(uX); \
- u4 += uXh; \
- uX = u4 - uX; \
- uW = u5h - uW; \
- u5 -= uW; \
- uVh = OD_RSHIFT1(uV); \
- u6 += uVh; \
- uV = u6 - uV; \
- uU = u7h - uU; \
- u7 -= uU; \
- uTh = OD_RSHIFT1(uT); \
- u8 += uTh; \
- uT = u8 - uT; \
- uS = u9h - uS; \
- u9 -= uS; \
- uRh = OD_RSHIFT1(uR); \
- ua += uRh; \
- uR = ua - uR; \
- uQ = ubh - uQ; \
- ub -= uQ; \
- uPh = OD_RSHIFT1(uP); \
- uc += uPh; \
- uP = uc - uP; \
- uO = udh - uO; \
- ud -= uO; \
- uNh = OD_RSHIFT1(uN); \
- ue += uNh; \
- uN = ue - uN; \
- uM = ufh - uM; \
- uf -= uM; \
- uLh = OD_RSHIFT1(uL); \
- ug += uLh; \
- uL = ug - uL; \
- uK = uhh - uK; \
- uh -= uK; \
- uJh = OD_RSHIFT1(uJ); \
- ui += uJh; \
- uJ = ui - uJ; \
- uI = ujh - uI; \
- uj -= uI; \
- uHh = OD_RSHIFT1(uH); \
- uk += uHh; \
- uH = uk - uH; \
- uG = ulh - uG; \
- ul -= uG; \
- uFh = OD_RSHIFT1(uF); \
- um += uFh; \
- uF = um - uF; \
- uE = unh - uE; \
- un -= uE; \
- uDh = OD_RSHIFT1(uD); \
- uo += uDh; \
- uD = uo - uD; \
- uC = uph - uC; \
- up -= uC; \
- uBh = OD_RSHIFT1(uB); \
- uq += uBh; \
- uB = uq - uB; \
- uA = urh - uA; \
- ur -= uA; \
- uzh = OD_RSHIFT1(uz); \
- us += uzh; \
- uz = us - uz; \
- uy = uth - uy; \
- ut -= uy; \
- uxh = OD_RSHIFT1(ux); \
- uu += uxh; \
- ux = uu - ux; \
- uw = uvh - uw; \
- uv -= uw; \
- } while (0)
-#endif
-
-/* 4-point orthonormal Type-II fDCT. */
-void od_bin_fdct4(od_coeff y[4], const od_coeff *x, int xstride) {
- int q0;
- int q1;
- int q2;
- int q3;
- q0 = x[0*xstride];
- q1 = x[1*xstride];
- q2 = x[2*xstride];
- q3 = x[3*xstride];
- od_fdct_4(&q0, &q1, &q2, &q3);
- y[0] = (od_coeff)q0;
- y[1] = (od_coeff)q2;
- y[2] = (od_coeff)q1;
- y[3] = (od_coeff)q3;
-}
-
-/* 4-point orthonormal Type-II iDCT. */
-void od_bin_idct4(od_coeff *x, int xstride, const od_coeff y[4]) {
- int q0;
- int q1;
- int q2;
- int q3;
- q0 = y[0];
- q2 = y[1];
- q1 = y[2];
- q3 = y[3];
- od_idct_4(&q0, &q2, &q1, &q3);
- x[0*xstride] = (od_coeff)q0;
- x[1*xstride] = (od_coeff)q1;
- x[2*xstride] = (od_coeff)q2;
- x[3*xstride] = (od_coeff)q3;
-}
-
-/* 4-point orthonormal Type-VII fDST. */
-void od_bin_fdst4(od_coeff y[4], const od_coeff *x, int xstride) {
- /* 11 adds, 5 "muls", 2 shifts.*/
- int q0;
- int q1;
- int q2;
- int q3;
- int t0;
- int t1;
- int t2;
- int t3;
- int t3h;
- int t4;
- int u4;
- q0 = x[0*xstride];
- q1 = x[1*xstride];
- q2 = x[2*xstride];
- q3 = x[3*xstride];
- t0 = q1 + q3;
- /*When used in a 4x16 transform, the following line could overflow 16 bits
- in SIMD unless implemented using PAVGW or VRHSUB.S16.*/
- t1 = q1 + OD_PAVG(q0, -t0);
- t2 = q0 - q1;
- t3 = q2;
- t4 = q0 + q3;
- /* 7021/16384 ~= 2*Sin[2*Pi/9]/3 ~= 0.428525073124360 */
- t0 = (t0*7021 + 8192) >> 14;
- /* 37837/32768 ~= 4*Sin[3*Pi/9]/3 ~= 1.154700538379252 */
- t1 = (t1*37837 + 16384) >> 15;
- /* 21513/32768 ~= 2*Sin[4*Pi/9]/3 ~= 0.656538502008139 */
- t2 = (t2*21513 + 16384) >> 15;
- /* 37837/32768 ~= 4*Sin[3*Pi/9]/3 ~= 1.154700538379252 */
- t3 = (t3*37837 + 16384) >> 15;
- /* 467/2048 ~= 2*Sin[1*Pi/9]/3 ~= 0.228013428883779 */
- t4 = (t4*467 + 1024) >> 11;
- t3h = OD_RSHIFT1(t3);
- u4 = t4 + t3h;
- q0 = t0 + u4;
- q1 = t1;
- q2 = t0 + t2 - t3h;
- q3 = t2 + t3 - u4;
- y[0] = (od_coeff)q0;
- y[1] = (od_coeff)q1;
- y[2] = (od_coeff)q2;
- y[3] = (od_coeff)q3;
-}
-
-/* 4-point orthonormal Type-VII iDST. */
-void od_bin_idst4(od_coeff *x, int xstride, const od_coeff y[4]) {
- /* 11 adds, 5 "muls".*/
- int q0;
- int q1;
- int q2;
- int q3;
- int t0;
- int t1;
- int t2;
- int t3;
- int t3h;
- int t4;
- int u4;
- q0 = y[0];
- q1 = y[1];
- q2 = y[2];
- q3 = y[3];
- t0 = q0 - q3;
- t1 = q0 + q2;
- t2 = q3 + OD_PAVG(t0, -q2);
- t3 = q1;
- t4 = q2 + q3;
- /* 467/2048 ~= 2*Sin[1*Pi/9]/3 ~= 0.228013428883779 */
- t0 = (t0*467 + 1024) >> 11;
- /* 7021/16384 ~= 2*Sin[2*Pi/9]/3 ~= 0.428525073124360 */
- t1 = (t1*7021 + 8192) >> 14;
- /* 37837/32768 ~= 4*Sin[3*Pi/9]/3 ~= 1.154700538379252 */
- t2 = (t2*37837 + 16384) >> 15;
- /* 37837/32768 ~= 4*Sin[3*Pi/9]/3 ~= 1.154700538379252 */
- t3 = (t3*37837 + 16384) >> 15;
- /* 21513/32768 ~= 2*Sin[4*Pi/9]/3 ~= 0.656538502008139 */
- t4 = (t4*21513 + 16384) >> 15;
- t3h = OD_RSHIFT1(t3);
- u4 = t4 + t3h;
- q0 = t0 + u4;
- q1 = t1 + t3 - u4;
- q2 = t2;
- q3 = t0 + t1 - t3h;
- x[0*xstride] = q0;
- x[1*xstride] = q1;
- x[2*xstride] = q2;
- x[3*xstride] = q3;
-}
-
-void od_bin_fdct8(od_coeff y[8], const od_coeff *x, int xstride) {
- int r0;
- int r1;
- int r2;
- int r3;
- int r4;
- int r5;
- int r6;
- int r7;
- r0 = x[0*xstride];
- r1 = x[1*xstride];
- r2 = x[2*xstride];
- r3 = x[3*xstride];
- r4 = x[4*xstride];
- r5 = x[5*xstride];
- r6 = x[6*xstride];
- r7 = x[7*xstride];
- od_fdct_8(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
- y[0] = (od_coeff)r0;
- y[1] = (od_coeff)r4;
- y[2] = (od_coeff)r2;
- y[3] = (od_coeff)r6;
- y[4] = (od_coeff)r1;
- y[5] = (od_coeff)r5;
- y[6] = (od_coeff)r3;
- y[7] = (od_coeff)r7;
-}
-
-void od_bin_idct8(od_coeff *x, int xstride, const od_coeff y[8]) {
- int r0;
- int r1;
- int r2;
- int r3;
- int r4;
- int r5;
- int r6;
- int r7;
- r0 = y[0];
- r4 = y[1];
- r2 = y[2];
- r6 = y[3];
- r1 = y[4];
- r5 = y[5];
- r3 = y[6];
- r7 = y[7];
- od_idct_8(&r0, &r4, &r2, &r6, &r1, &r5, &r3, &r7);
- x[0*xstride] = (od_coeff)r0;
- x[1*xstride] = (od_coeff)r1;
- x[2*xstride] = (od_coeff)r2;
- x[3*xstride] = (od_coeff)r3;
- x[4*xstride] = (od_coeff)r4;
- x[5*xstride] = (od_coeff)r5;
- x[6*xstride] = (od_coeff)r6;
- x[7*xstride] = (od_coeff)r7;
-}
-
-#if !CONFIG_DAALA_TX_DST8
-void od_bin_fdst8(od_coeff y[8], const od_coeff *x, int xstride) {
- int r0;
- int r1;
- int r2;
- int r3;
- int r4;
- int r5;
- int r6;
- int r7;
- r0 = x[0*xstride];
- r1 = x[1*xstride];
- r2 = x[2*xstride];
- r3 = x[3*xstride];
- r4 = x[4*xstride];
- r5 = x[5*xstride];
- r6 = x[6*xstride];
- r7 = x[7*xstride];
- od_fdst_8(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
- y[0] = (od_coeff)r0;
- y[1] = (od_coeff)r4;
- y[2] = (od_coeff)r2;
- y[3] = (od_coeff)r6;
- y[4] = (od_coeff)r1;
- y[5] = (od_coeff)r5;
- y[6] = (od_coeff)r3;
- y[7] = (od_coeff)r7;
-}
-
-void od_bin_idst8(od_coeff *x, int xstride, const od_coeff y[8]) {
- int r0;
- int r1;
- int r2;
- int r3;
- int r4;
- int r5;
- int r6;
- int r7;
- r0 = y[0];
- r4 = y[1];
- r2 = y[2];
- r6 = y[3];
- r1 = y[4];
- r5 = y[5];
- r3 = y[6];
- r7 = y[7];
- od_idst_8(&r0, &r4, &r2, &r6, &r1, &r5, &r3, &r7);
- x[0*xstride] = (od_coeff)r0;
- x[1*xstride] = (od_coeff)r1;
- x[2*xstride] = (od_coeff)r2;
- x[3*xstride] = (od_coeff)r3;
- x[4*xstride] = (od_coeff)r4;
- x[5*xstride] = (od_coeff)r5;
- x[6*xstride] = (od_coeff)r6;
- x[7*xstride] = (od_coeff)r7;
-}
-#else
-const int OD_DST_8_PERM[8] = { 0, 7, 1, 6, 2, 5, 3, 4 };
-
-/* Computes the Polynomial Product Y(z) ≡ X(z)*H(z) modulo (z^8 + 1) using
- Nussbaumer's "short" algorithm [1].
- The monomial coefficients in Y(z) are exactly the values of an acyclic
- convolution of the monomial coefficients of X(z) and H(z).
- Since H(z) is fixed, the multiplication terms are constant and precomputed.
-
- [1] Nussbaumer, Henri J. "Fast Fourier Transform and Convolution Algorithms"
- Springer-Verlag: Berlin, Heidelberg, New York (1981) pages 76-78. */
-static void od_poly_prod_8(od_coeff y[8], const od_coeff x[8]) {
- /* 21 "muls", 76 adds, 21 shifts */
- od_coeff q0;
- od_coeff q1;
- od_coeff q2;
- od_coeff q3;
- od_coeff q4;
- od_coeff q5;
- od_coeff q6;
- od_coeff q7;
- od_coeff q8;
- od_coeff q9;
- od_coeff q10;
- od_coeff q11;
- od_coeff q12;
- od_coeff q13;
- od_coeff q14;
- od_coeff q15;
- od_coeff q16;
- od_coeff q17;
- od_coeff q18;
- od_coeff q19;
- od_coeff q20;
- od_coeff r0;
- od_coeff r1;
- od_coeff r2;
- od_coeff r3;
- od_coeff r4;
- od_coeff r5;
- od_coeff r6;
- od_coeff r7;
- od_coeff t0;
- od_coeff t1;
- od_coeff t2;
- od_coeff t3;
- od_coeff t4;
- od_coeff t5;
- od_coeff t6;
- od_coeff t7;
- od_coeff u0;
- od_coeff u1;
- od_coeff u1h;
- od_coeff u2;
- od_coeff u2h;
- od_coeff u3;
- od_coeff u4;
- od_coeff u4h;
- od_coeff u5;
- od_coeff u6;
- od_coeff u7;
- od_coeff u7h;
- od_coeff u8;
- od_coeff u9;
- od_coeff u10;
- od_coeff u11;
- od_coeff u12;
- od_coeff u13;
- od_coeff u14;
- od_coeff u15;
- od_coeff u16;
- od_coeff u17;
- od_coeff u18;
- od_coeff u19;
- od_coeff u20;
- od_coeff u21;
- od_coeff u22;
- od_coeff u23;
- od_coeff u24;
- od_coeff u25;
- od_coeff u26;
- od_coeff u27;
- t0 = x[0];
- t1 = x[1];
- t2 = x[2];
- t3 = x[3];
- t4 = x[4];
- t5 = x[5];
- t6 = x[6];
- t7 = x[7];
- /* Stage 0 Butterfly */
- u7 = t0 - t7;
- u7h = OD_RSHIFT1(u7);
- u0 = t0 - u7h;
- u2 = t2 - t6;
- u2h = OD_RSHIFT1(u2);
- u6 = t2 - u2h;
- u4 = t4 + t5;
- u4h = OD_RSHIFT1(u4);
- u5 = t4 - u4h;
- u1 = t3 - t1;
- u1h = OD_RSHIFT1(u1);
- u3 = t3 - u1h;
- /* Stage 1 Butterfly */
- q0 = u0 + u2h;
- q1 = q0 - u2;
- q4 = u3 + u4h;
- q5 = q4 - u4;
- q2 = u7h + u5;
- q7 = u7 - q2;
- q6 = u1h + u6;
- q3 = u1 - q6;
- /* Stage 2 Half-Butterfly */
- /*The intermediate sums can overflow 16 bits, but all SIMD instruction sets
- should be able to compute them without issue (i.e., using PAVGW or
- V{R}HADD.S16).*/
- q8 = (q0 + q4 + 1) >> 1;
- q9 = (q1 + q5) >> 1;
- q10 = (q2 + q3 + 1) >> 1;
- q11 = (q7 + q6) >> 1;
- /* Stage 3 */
- q12 = t0 + t3;
- q13 = t0;
- q14 = t3;
- q15 = t5 - t6;
- q16 = t6;
- q17 = t5;
- r0 = t2 + t4;
- r1 = t2 - OD_RSHIFT1(r0);
- r2 = (r1 - q15 + 1) >> 1;
- r3 = OD_RSHIFT1(t0);
- r4 = (r3 - t1 + 1) >> 1;
- /* q18 = (q6 - q4)/2 + (t0 - q15)/4
- = (t0 + t2 - t4)/4 - (t1 + t5 - t6)/2 */
- q18 = r2 + r4;
- r5 = t5 - (q15 >> 1);
- r6 = (r0 + t3 + 1) >> 1;
- r7 = (t7 + r6 + 1) >> 1;
- /* q19 = (q7 - q0)/2 + (t5 + t6 - t3)/4
- = (t5 + t6 - t7)/2 - (t2 + t3 + t4)/4 */
- q19 = r5 - r7;
- q20 = (q18 - q19) >> 1;
- /* Stage 4 */
- q0 = (-5995*q0 + 8192) >> 14;
- q1 = (-1373*q1 + 4096) >> 13;
- q2 = (22891*q2 + 16384) >> 15;
- q3 = (-217*q3 + 512) >> 10;
- q4 = (13427*q4 + 16384) >> 15;
- q5 = (-11013*q5 + 8192) >> 14;
- q6 = (1373*q6 + 1024) >> 11;
- q7 = (-14077*q7 + 16384) >> 15;
- q8 = (-1437*q8 + 16384) >> 15;
- q9 = (27519*q9 + 16384) >> 15;
- q10 = (-15947*q10 + 16384) >> 15;
- q11 = (-7891*q11 + 16384) >> 15;
- q12 = (4897*q12 + 16384) >> 15;
- q13 = (-5079*q13 + 8192) >> 14;
- q14 = (365*q14 + 16384) >> 15;
- q15 = (3325*q15 + 8192) >> 14;
- q16 = (-5225*q16 + 8192) >> 14;
- q17 = (-1425*q17 + 8192) >> 14;
- q18 = (3453*q18 + 16384) >> 15;
- q19 = (-8421*q19 + 8192) >> 14;
- q20 = (-20295*q20 + 16384) >> 15;
- /* Stage 5 */
- u0 = q0 + q8;
- u1 = q1 + q9;
- u2 = q2 + q10;
- u3 = q3 + q10;
- u4 = q4 + q8;
- u5 = q5 + q9;
- u6 = q6 + q11;
- u7 = q7 + q11;
- /* Stage 6 */
- u10 = u0 + u1;
- u11 = u0 - u1;
- u12 = u2 + u7;
- u13 = u2 - u7;
- u14 = u3 + u6;
- u15 = u3 - u6;
- u16 = u5 + u4;
- u17 = u5 - u4;
- /* Stage 7 */
- u8 = q19 + q20;
- u9 = q19 - q18;
- u18 = q12 + u8;
- u19 = u18 + q13;
- u20 = u18 + q14;
- u21 = 2*u9;
- u22 = q15 + u21;
- u23 = q16 - u22;
- u24 = u22 + q17;
- u25 = 2*u8;
- u26 = 2*u25;
- u27 = u25 - u9;
- /* Stage 8 */
- y[0] = u14 + u16 + u20;
- y[1] = u12 - u10 - u25;
- y[2] = u9 + u13 - u17;
- y[3] = u9 - u10 - u12 - u19;
- y[4] = u15 - u11 - u27;
- y[5] = u23 - u11 - u15;
- y[6] = u13 + u17 - u24 + u26;
- y[7] = u16 - u14 + u21 - u25;
-}
-
-void od_bin_fdst8(od_coeff y[8], const od_coeff *x, int xstride) {
- int i;
- od_coeff xp[8];
- od_coeff yp[8];
- for (i = 0; i < 8; i++) xp[i] = x[i*xstride];
- od_poly_prod_8(yp, xp);
- for (i = 0; i < 8; i++) y[OD_DST_8_PERM[i]] = yp[i];
-}
-
-void od_bin_idst8(od_coeff *x, int xstride, const od_coeff y[8]) {
- int i;
- od_coeff xp[8];
- od_coeff yp[8];
- for (i = 0; i < 8; i++) yp[i] = y[OD_DST_8_PERM[i]];
- od_poly_prod_8(xp, yp);
- for (i = 0; i < 8; i++) x[i*xstride] = xp[i];
-}
-#endif
-void od_bin_fdct16(od_coeff y[16], const od_coeff *x, int xstride) {
- int s0;
- int s1;
- int s2;
- int s3;
- int s4;
- int s5;
- int s6;
- int s7;
- int s8;
- int s9;
- int sa;
- int sb;
- int sc;
- int sd;
- int se;
- int sf;
- s0 = x[0*xstride];
- s1 = x[1*xstride];
- s2 = x[2*xstride];
- s3 = x[3*xstride];
- s4 = x[4*xstride];
- s5 = x[5*xstride];
- s6 = x[6*xstride];
- s7 = x[7*xstride];
- s8 = x[8*xstride];
- s9 = x[9*xstride];
- sa = x[10*xstride];
- sb = x[11*xstride];
- sc = x[12*xstride];
- sd = x[13*xstride];
- se = x[14*xstride];
- sf = x[15*xstride];
- od_fdct_16(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7,
- &s8, &s9, &sa, &sb, &sc, &sd, &se, &sf);
- y[0] = (od_coeff)s0;
- y[1] = (od_coeff)s8;
- y[2] = (od_coeff)s4;
- y[3] = (od_coeff)sc;
- y[4] = (od_coeff)s2;
- y[5] = (od_coeff)sa;
- y[6] = (od_coeff)s6;
- y[7] = (od_coeff)se;
- y[8] = (od_coeff)s1;
- y[9] = (od_coeff)s9;
- y[10] = (od_coeff)s5;
- y[11] = (od_coeff)sd;
- y[12] = (od_coeff)s3;
- y[13] = (od_coeff)sb;
- y[14] = (od_coeff)s7;
- y[15] = (od_coeff)sf;
-}
-
-void od_bin_idct16(od_coeff *x, int xstride, const od_coeff y[16]) {
- int s0;
- int s1;
- int s2;
- int s3;
- int s4;
- int s5;
- int s6;
- int s7;
- int s8;
- int s9;
- int sa;
- int sb;
- int sc;
- int sd;
- int se;
- int sf;
- s0 = y[0];
- s8 = y[1];
- s4 = y[2];
- sc = y[3];
- s2 = y[4];
- sa = y[5];
- s6 = y[6];
- se = y[7];
- s1 = y[8];
- s9 = y[9];
- s5 = y[10];
- sd = y[11];
- s3 = y[12];
- sb = y[13];
- s7 = y[14];
- sf = y[15];
- od_idct_16(&s0, &s8, &s4, &sc, &s2, &sa, &s6, &se,
- &s1, &s9, &s5, &sd, &s3, &sb, &s7, &sf);
- x[0*xstride] = (od_coeff)s0;
- x[1*xstride] = (od_coeff)s1;
- x[2*xstride] = (od_coeff)s2;
- x[3*xstride] = (od_coeff)s3;
- x[4*xstride] = (od_coeff)s4;
- x[5*xstride] = (od_coeff)s5;
- x[6*xstride] = (od_coeff)s6;
- x[7*xstride] = (od_coeff)s7;
- x[8*xstride] = (od_coeff)s8;
- x[9*xstride] = (od_coeff)s9;
- x[10*xstride] = (od_coeff)sa;
- x[11*xstride] = (od_coeff)sb;
- x[12*xstride] = (od_coeff)sc;
- x[13*xstride] = (od_coeff)sd;
- x[14*xstride] = (od_coeff)se;
- x[15*xstride] = (od_coeff)sf;
-}
-
-void od_bin_fdst16(od_coeff y[16], const od_coeff *x, int xstride) {
- int s0;
- int s1;
- int s2;
- int s3;
- int s4;
- int s5;
- int s6;
- int s7;
- int s8;
- int s9;
- int sa;
- int sb;
- int sc;
- int sd;
- int se;
- int sf;
- s0 = x[0*xstride];
- s1 = x[1*xstride];
- s2 = x[2*xstride];
- s3 = x[3*xstride];
- s4 = x[4*xstride];
- s5 = x[5*xstride];
- s6 = x[6*xstride];
- s7 = x[7*xstride];
- s8 = x[8*xstride];
- s9 = x[9*xstride];
- sa = x[10*xstride];
- sb = x[11*xstride];
- sc = x[12*xstride];
- sd = x[13*xstride];
- se = x[14*xstride];
- sf = x[15*xstride];
- od_fdst_16(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7,
- &s8, &s9, &sa, &sb, &sc, &sd, &se, &sf);
- y[0] = (od_coeff)s0;
- y[1] = (od_coeff)s8;
- y[2] = (od_coeff)s4;
- y[3] = (od_coeff)sc;
- y[4] = (od_coeff)s2;
- y[5] = (od_coeff)sa;
- y[6] = (od_coeff)s6;
- y[7] = (od_coeff)se;
- y[8] = (od_coeff)s1;
- y[9] = (od_coeff)s9;
- y[10] = (od_coeff)s5;
- y[11] = (od_coeff)sd;
- y[12] = (od_coeff)s3;
- y[13] = (od_coeff)sb;
- y[14] = (od_coeff)s7;
- y[15] = (od_coeff)sf;
-}
-
-void od_bin_idst16(od_coeff *x, int xstride, const od_coeff y[16]) {
- int s0;
- int s1;
- int s2;
- int s3;
- int s4;
- int s5;
- int s6;
- int s7;
- int s8;
- int s9;
- int sa;
- int sb;
- int sc;
- int sd;
- int se;
- int sf;
- s0 = y[0];
- s8 = y[1];
- s4 = y[2];
- sc = y[3];
- s2 = y[4];
- sa = y[5];
- s6 = y[6];
- se = y[7];
- s1 = y[8];
- s9 = y[9];
- s5 = y[10];
- sd = y[11];
- s3 = y[12];
- sb = y[13];
- s7 = y[14];
- sf = y[15];
- od_idst_16(&s0, &s8, &s4, &sc, &s2, &sa, &s6, &se,
- &s1, &s9, &s5, &sd, &s3, &sb, &s7, &sf);
- x[0*xstride] = (od_coeff)s0;
- x[1*xstride] = (od_coeff)s1;
- x[2*xstride] = (od_coeff)s2;
- x[3*xstride] = (od_coeff)s3;
- x[4*xstride] = (od_coeff)s4;
- x[5*xstride] = (od_coeff)s5;
- x[6*xstride] = (od_coeff)s6;
- x[7*xstride] = (od_coeff)s7;
- x[8*xstride] = (od_coeff)s8;
- x[9*xstride] = (od_coeff)s9;
- x[10*xstride] = (od_coeff)sa;
- x[11*xstride] = (od_coeff)sb;
- x[12*xstride] = (od_coeff)sc;
- x[13*xstride] = (od_coeff)sd;
- x[14*xstride] = (od_coeff)se;
- x[15*xstride] = (od_coeff)sf;
-}
-
-void od_bin_fdct32(od_coeff y[32], const od_coeff *x, int xstride) {
- /*215 adds, 38 shifts, 87 "muls".*/
- int t0;
- int t1;
- int t2;
- int t3;
- int t4;
- int t5;
- int t6;
- int t7;
- int t8;
- int t9;
- int ta;
- int tb;
- int tc;
- int td;
- int te;
- int tf;
- int tg;
- int th;
- int ti;
- int tj;
- int tk;
- int tl;
- int tm;
- int tn;
- int to;
- int tp;
- int tq;
- int tr;
- int ts;
- int tt;
- int tu;
- int tv;
- t0 = x[0*xstride];
- tg = x[1*xstride];
- t8 = x[2*xstride];
- to = x[3*xstride];
- t4 = x[4*xstride];
- tk = x[5*xstride];
- tc = x[6*xstride];
- ts = x[7*xstride];
- t2 = x[8*xstride];
- ti = x[9*xstride];
- ta = x[10*xstride];
- tq = x[11*xstride];
- t6 = x[12*xstride];
- tm = x[13*xstride];
- te = x[14*xstride];
- tu = x[15*xstride];
- t1 = x[16*xstride];
- th = x[17*xstride];
- t9 = x[18*xstride];
- tp = x[19*xstride];
- t5 = x[20*xstride];
- tl = x[21*xstride];
- td = x[22*xstride];
- tt = x[23*xstride];
- t3 = x[24*xstride];
- tj = x[25*xstride];
- tb = x[26*xstride];
- tr = x[27*xstride];
- t7 = x[28*xstride];
- tn = x[29*xstride];
- tf = x[30*xstride];
- tv = x[31*xstride];
- od_fdct_32(
- &t0, &tg, &t8, &to, &t4, &tk, &tc, &ts, &t2, &ti, &ta, &tq, &t6, &tm, &te,
- &tu, &t1, &th, &t9, &tp, &t5, &tl, &td, &tt, &t3, &tj, &tb, &tr, &t7, &tn,
- &tf, &tv);
- y[0] = (od_coeff)t0;
- y[1] = (od_coeff)t1;
- y[2] = (od_coeff)t2;
- y[3] = (od_coeff)t3;
- y[4] = (od_coeff)t4;
- y[5] = (od_coeff)t5;
- y[6] = (od_coeff)t6;
- y[7] = (od_coeff)t7;
- y[8] = (od_coeff)t8;
- y[9] = (od_coeff)t9;
- y[10] = (od_coeff)ta;
- y[11] = (od_coeff)tb;
- y[12] = (od_coeff)tc;
- y[13] = (od_coeff)td;
- y[14] = (od_coeff)te;
- y[15] = (od_coeff)tf;
- y[16] = (od_coeff)tg;
- y[17] = (od_coeff)th;
- y[18] = (od_coeff)ti;
- y[19] = (od_coeff)tj;
- y[20] = (od_coeff)tk;
- y[21] = (od_coeff)tl;
- y[22] = (od_coeff)tm;
- y[23] = (od_coeff)tn;
- y[24] = (od_coeff)to;
- y[25] = (od_coeff)tp;
- y[26] = (od_coeff)tq;
- y[27] = (od_coeff)tr;
- y[28] = (od_coeff)ts;
- y[29] = (od_coeff)tt;
- y[30] = (od_coeff)tu;
- y[31] = (od_coeff)tv;
-}
-
-void od_bin_idct32(od_coeff *x, int xstride, const od_coeff y[32]) {
- int t0;
- int t1;
- int t2;
- int t3;
- int t4;
- int t5;
- int t6;
- int t7;
- int t8;
- int t9;
- int ta;
- int tb;
- int tc;
- int td;
- int te;
- int tf;
- int tg;
- int th;
- int ti;
- int tj;
- int tk;
- int tl;
- int tm;
- int tn;
- int to;
- int tp;
- int tq;
- int tr;
- int ts;
- int tt;
- int tu;
- int tv;
- t0 = y[0];
- tg = y[1];
- t8 = y[2];
- to = y[3];
- t4 = y[4];
- tk = y[5];
- tc = y[6];
- ts = y[7];
- t2 = y[8];
- ti = y[9];
- ta = y[10];
- tq = y[11];
- t6 = y[12];
- tm = y[13];
- te = y[14];
- tu = y[15];
- t1 = y[16];
- th = y[17];
- t9 = y[18];
- tp = y[19];
- t5 = y[20];
- tl = y[21];
- td = y[22];
- tt = y[23];
- t3 = y[24];
- tj = y[25];
- tb = y[26];
- tr = y[27];
- t7 = y[28];
- tn = y[29];
- tf = y[30];
- tv = y[31];
- od_idct_32(
- &t0, &tg, &t8, &to, &t4, &tk, &tc, &ts, &t2, &ti, &ta, &tq, &t6, &tm, &te,
- &tu, &t1, &th, &t9, &tp, &t5, &tl, &td, &tt, &t3, &tj, &tb, &tr, &t7, &tn,
- &tf, &tv);
- x[0*xstride] = (od_coeff)t0;
- x[1*xstride] = (od_coeff)t1;
- x[2*xstride] = (od_coeff)t2;
- x[3*xstride] = (od_coeff)t3;
- x[4*xstride] = (od_coeff)t4;
- x[5*xstride] = (od_coeff)t5;
- x[6*xstride] = (od_coeff)t6;
- x[7*xstride] = (od_coeff)t7;
- x[8*xstride] = (od_coeff)t8;
- x[9*xstride] = (od_coeff)t9;
- x[10*xstride] = (od_coeff)ta;
- x[11*xstride] = (od_coeff)tb;
- x[12*xstride] = (od_coeff)tc;
- x[13*xstride] = (od_coeff)td;
- x[14*xstride] = (od_coeff)te;
- x[15*xstride] = (od_coeff)tf;
- x[16*xstride] = (od_coeff)tg;
- x[17*xstride] = (od_coeff)th;
- x[18*xstride] = (od_coeff)ti;
- x[19*xstride] = (od_coeff)tj;
- x[20*xstride] = (od_coeff)tk;
- x[21*xstride] = (od_coeff)tl;
- x[22*xstride] = (od_coeff)tm;
- x[23*xstride] = (od_coeff)tn;
- x[24*xstride] = (od_coeff)to;
- x[25*xstride] = (od_coeff)tp;
- x[26*xstride] = (od_coeff)tq;
- x[27*xstride] = (od_coeff)tr;
- x[28*xstride] = (od_coeff)ts;
- x[29*xstride] = (od_coeff)tt;
- x[30*xstride] = (od_coeff)tu;
- x[31*xstride] = (od_coeff)tv;
-}
-
-void od_bin_fdst32(od_coeff y[32], const od_coeff *x, int xstride) {
- od_coeff t0;
- od_coeff t1;
- od_coeff t2;
- od_coeff t3;
- od_coeff t4;
- od_coeff t5;
- od_coeff t6;
- od_coeff t7;
- od_coeff t8;
- od_coeff t9;
- od_coeff ta;
- od_coeff tb;
- od_coeff tc;
- od_coeff td;
- od_coeff te;
- od_coeff tf;
- od_coeff tg;
- od_coeff th;
- od_coeff ti;
- od_coeff tj;
- od_coeff tk;
- od_coeff tl;
- od_coeff tm;
- od_coeff tn;
- od_coeff to;
- od_coeff tp;
- od_coeff tq;
- od_coeff tr;
- od_coeff ts;
- od_coeff tt;
- od_coeff tu;
- od_coeff tv;
- #if !CONFIG_DAALA_TX_DST32
- assert(0 && "od_bin_fdst32() called when !CONFIG_DAALA_TX_DST32");
- #endif
- t0 = x[0*xstride];
- t1 = x[1*xstride];
- t2 = x[2*xstride];
- t3 = x[3*xstride];
- t4 = x[4*xstride];
- t5 = x[5*xstride];
- t6 = x[6*xstride];
- t7 = x[7*xstride];
- t8 = x[8*xstride];
- t9 = x[9*xstride];
- ta = x[10*xstride];
- tb = x[11*xstride];
- tc = x[12*xstride];
- td = x[13*xstride];
- te = x[14*xstride];
- tf = x[15*xstride];
- tg = x[16*xstride];
- th = x[17*xstride];
- ti = x[18*xstride];
- tj = x[19*xstride];
- tk = x[20*xstride];
- tl = x[21*xstride];
- tm = x[22*xstride];
- tn = x[23*xstride];
- to = x[24*xstride];
- tp = x[25*xstride];
- tq = x[26*xstride];
- tr = x[27*xstride];
- ts = x[28*xstride];
- tt = x[29*xstride];
- tu = x[30*xstride];
- tv = x[31*xstride];
- OD_FDST_32_PR(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, ta, tb, tc, td, te, tf,
- tg, th, ti, tj, tk, tl, tm, tn, to, tp, tq, tr, ts, tt, tu, tv);
- y[0] = t0;
- y[1] = tg;
- y[2] = t8;
- y[3] = to;
- y[4] = t4;
- y[5] = tk;
- y[6] = tc;
- y[7] = ts;
- y[8] = t2;
- y[9] = ti;
- y[10] = ta;
- y[11] = tq;
- y[12] = t6;
- y[13] = tm;
- y[14] = te;
- y[15] = tu;
- y[16] = t1;
- y[17] = th;
- y[18] = t9;
- y[19] = tp;
- y[20] = t5;
- y[21] = tl;
- y[22] = td;
- y[23] = tt;
- y[24] = t3;
- y[25] = tj;
- y[26] = tb;
- y[27] = tr;
- y[28] = t7;
- y[29] = tn;
- y[30] = tf;
- y[31] = tv;
-}
-
-void od_bin_idst32(od_coeff *x, int xstride, const od_coeff y[32]) {
- od_coeff t0;
- od_coeff t1;
- od_coeff t2;
- od_coeff t3;
- od_coeff t4;
- od_coeff t5;
- od_coeff t6;
- od_coeff t7;
- od_coeff t8;
- od_coeff t9;
- od_coeff ta;
- od_coeff tb;
- od_coeff tc;
- od_coeff td;
- od_coeff te;
- od_coeff tf;
- od_coeff tg;
- od_coeff th;
- od_coeff ti;
- od_coeff tj;
- od_coeff tk;
- od_coeff tl;
- od_coeff tm;
- od_coeff tn;
- od_coeff to;
- od_coeff tp;
- od_coeff tq;
- od_coeff tr;
- od_coeff ts;
- od_coeff tt;
- od_coeff tu;
- od_coeff tv;
- #if !CONFIG_DAALA_TX_DST32
- assert(0 && "od_bin_idst32() called when !CONFIG_DAALA_TX_DST32");
- #endif
- t0 = y[0];
- tg = y[1];
- t8 = y[2];
- to = y[3];
- t4 = y[4];
- tk = y[5];
- tc = y[6];
- ts = y[7];
- t2 = y[8];
- ti = y[9];
- ta = y[10];
- tq = y[11];
- t6 = y[12];
- tm = y[13];
- te = y[14];
- tu = y[15];
- t1 = y[16];
- th = y[17];
- t9 = y[18];
- tp = y[19];
- t5 = y[20];
- tl = y[21];
- td = y[22];
- tt = y[23];
- t3 = y[24];
- tj = y[25];
- tb = y[26];
- tr = y[27];
- t7 = y[28];
- tn = y[29];
- tf = y[30];
- tv = y[31];
- OD_IDST_32_PR(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, t6, tm, te, tu,
- t1, th, t9, tp, t5, tl, td, tt, t3, tj, tb, tr, t7, tn, tf, tv);
- x[0*xstride] = t0;
- x[1*xstride] = t1;
- x[2*xstride] = t2;
- x[3*xstride] = t3;
- x[4*xstride] = t4;
- x[5*xstride] = t5;
- x[6*xstride] = t6;
- x[7*xstride] = t7;
- x[8*xstride] = t8;
- x[9*xstride] = t9;
- x[10*xstride] = ta;
- x[11*xstride] = tb;
- x[12*xstride] = tc;
- x[13*xstride] = td;
- x[14*xstride] = te;
- x[15*xstride] = tf;
- x[16*xstride] = tg;
- x[17*xstride] = th;
- x[18*xstride] = ti;
- x[19*xstride] = tj;
- x[20*xstride] = tk;
- x[21*xstride] = tl;
- x[22*xstride] = tm;
- x[23*xstride] = tn;
- x[24*xstride] = to;
- x[25*xstride] = tp;
- x[26*xstride] = tq;
- x[27*xstride] = tr;
- x[28*xstride] = ts;
- x[29*xstride] = tt;
- x[30*xstride] = tu;
- x[31*xstride] = tv;
-}
-
-#if CONFIG_TX64X64
-void od_bin_fdct64(od_coeff y[64], const od_coeff *x, int xstride) {
- int t0;
- int t1;
- int t2;
- int t3;
- int t4;
- int t5;
- int t6;
- int t7;
- int t8;
- int t9;
- int ta;
- int tb;
- int tc;
- int td;
- int te;
- int tf;
- int tg;
- int th;
- int ti;
- int tj;
- int tk;
- int tl;
- int tm;
- int tn;
- int to;
- int tp;
- int tq;
- int tr;
- int ts;
- int tt;
- int tu;
- int tv;
- int tw;
- int tx;
- int ty;
- int tz;
- int tA;
- int tB;
- int tC;
- int tD;
- int tE;
- int tF;
- int tG;
- int tH;
- int tI;
- int tJ;
- int tK;
- int tL;
- int tM;
- int tN;
- int tO;
- int tP;
- int tQ;
- int tR;
- int tS;
- int tT;
- int tU;
- int tV;
- int tW;
- int tX;
- int tY;
- int tZ;
- int t_;
- int t;
- t0 = x[0*xstride];
- tw = x[1*xstride];
- tg = x[2*xstride];
- tM = x[3*xstride];
- t8 = x[4*xstride];
- tE = x[5*xstride];
- to = x[6*xstride];
- tU = x[7*xstride];
- t4 = x[8*xstride];
- tA = x[9*xstride];
- tk = x[10*xstride];
- tQ = x[11*xstride];
- tc = x[12*xstride];
- tI = x[13*xstride];
- ts = x[14*xstride];
- tY = x[15*xstride];
- t2 = x[16*xstride];
- ty = x[17*xstride];
- ti = x[18*xstride];
- tO = x[19*xstride];
- ta = x[20*xstride];
- tG = x[21*xstride];
- tq = x[22*xstride];
- tW = x[23*xstride];
- t6 = x[24*xstride];
- tC = x[25*xstride];
- tm = x[26*xstride];
- tS = x[27*xstride];
- te = x[28*xstride];
- tK = x[29*xstride];
- tu = x[30*xstride];
- t_ = x[31*xstride];
- t1 = x[32*xstride];
- tx = x[33*xstride];
- th = x[34*xstride];
- tN = x[35*xstride];
- t9 = x[36*xstride];
- tF = x[37*xstride];
- tp = x[38*xstride];
- tV = x[39*xstride];
- t5 = x[40*xstride];
- tB = x[41*xstride];
- tl = x[42*xstride];
- tR = x[43*xstride];
- td = x[44*xstride];
- tJ = x[45*xstride];
- tt = x[46*xstride];
- tZ = x[47*xstride];
- t3 = x[48*xstride];
- tz = x[49*xstride];
- tj = x[50*xstride];
- tP = x[51*xstride];
- tb = x[52*xstride];
- tH = x[53*xstride];
- tr = x[54*xstride];
- tX = x[55*xstride];
- t7 = x[56*xstride];
- tD = x[57*xstride];
- tn = x[58*xstride];
- tT = x[59*xstride];
- tf = x[60*xstride];
- tL = x[61*xstride];
- tv = x[62*xstride];
- t = x[63*xstride];
- OD_FDCT_64_PR(t0, tw, tg, tM, t8, tE, to, tU, t4, tA, tk, tQ, tc, tI, ts, tY,
- t2, ty, ti, tO, ta, tG, tq, tW, t6, tC, tm, tS, te, tK, tu, t_, t1, tx, th,
- tN, t9, tF, tp, tV, t5, tB, tl, tR, td, tJ, tt, tZ, t3, tz, tj, tP, tb, tH,
- tr, tX, t7, tD, tn, tT, tf, tL, tv, t);
- y[0] = (od_coeff)t0;
- y[1] = (od_coeff)t1;
- y[2] = (od_coeff)t2;
- y[3] = (od_coeff)t3;
- y[4] = (od_coeff)t4;
- y[5] = (od_coeff)t5;
- y[6] = (od_coeff)t6;
- y[7] = (od_coeff)t7;
- y[8] = (od_coeff)t8;
- y[9] = (od_coeff)t9;
- y[10] = (od_coeff)ta;
- y[11] = (od_coeff)tb;
- y[12] = (od_coeff)tc;
- y[13] = (od_coeff)td;
- y[14] = (od_coeff)te;
- y[15] = (od_coeff)tf;
- y[16] = (od_coeff)tg;
- y[17] = (od_coeff)th;
- y[18] = (od_coeff)ti;
- y[19] = (od_coeff)tj;
- y[20] = (od_coeff)tk;
- y[21] = (od_coeff)tl;
- y[22] = (od_coeff)tm;
- y[23] = (od_coeff)tn;
- y[24] = (od_coeff)to;
- y[25] = (od_coeff)tp;
- y[26] = (od_coeff)tq;
- y[27] = (od_coeff)tr;
- y[28] = (od_coeff)ts;
- y[29] = (od_coeff)tt;
- y[30] = (od_coeff)tu;
- y[31] = (od_coeff)tv;
- y[32] = (od_coeff)tw;
- y[33] = (od_coeff)tx;
- y[34] = (od_coeff)ty;
- y[35] = (od_coeff)tz;
- y[36] = (od_coeff)tA;
- y[37] = (od_coeff)tB;
- y[38] = (od_coeff)tC;
- y[39] = (od_coeff)tD;
- y[40] = (od_coeff)tE;
- y[41] = (od_coeff)tF;
- y[41] = (od_coeff)tF;
- y[42] = (od_coeff)tG;
- y[43] = (od_coeff)tH;
- y[44] = (od_coeff)tI;
- y[45] = (od_coeff)tJ;
- y[46] = (od_coeff)tK;
- y[47] = (od_coeff)tL;
- y[48] = (od_coeff)tM;
- y[49] = (od_coeff)tN;
- y[50] = (od_coeff)tO;
- y[51] = (od_coeff)tP;
- y[52] = (od_coeff)tQ;
- y[53] = (od_coeff)tR;
- y[54] = (od_coeff)tS;
- y[55] = (od_coeff)tT;
- y[56] = (od_coeff)tU;
- y[57] = (od_coeff)tV;
- y[58] = (od_coeff)tW;
- y[59] = (od_coeff)tX;
- y[60] = (od_coeff)tY;
- y[61] = (od_coeff)tZ;
- y[62] = (od_coeff)t_;
- y[63] = (od_coeff)t;
-}
-
-void od_bin_idct64(od_coeff *x, int xstride, const od_coeff y[64]) {
- int t0;
- int t1;
- int t2;
- int t3;
- int t4;
- int t5;
- int t6;
- int t7;
- int t8;
- int t9;
- int ta;
- int tb;
- int tc;
- int td;
- int te;
- int tf;
- int tg;
- int th;
- int ti;
- int tj;
- int tk;
- int tl;
- int tm;
- int tn;
- int to;
- int tp;
- int tq;
- int tr;
- int ts;
- int tt;
- int tu;
- int tv;
- int tw;
- int tx;
- int ty;
- int tz;
- int tA;
- int tB;
- int tC;
- int tD;
- int tE;
- int tF;
- int tG;
- int tH;
- int tI;
- int tJ;
- int tK;
- int tL;
- int tM;
- int tN;
- int tO;
- int tP;
- int tQ;
- int tR;
- int tS;
- int tT;
- int tU;
- int tV;
- int tW;
- int tX;
- int tY;
- int tZ;
- int t_;
- int t;
- t0 = y[0];
- tw = y[1];
- tg = y[2];
- tM = y[3];
- t8 = y[4];
- tE = y[5];
- to = y[6];
- tU = y[7];
- t4 = y[8];
- tA = y[9];
- tk = y[10];
- tQ = y[11];
- tc = y[12];
- tI = y[13];
- ts = y[14];
- tY = y[15];
- t2 = y[16];
- ty = y[17];
- ti = y[18];
- tO = y[19];
- ta = y[20];
- tG = y[21];
- tq = y[22];
- tW = y[23];
- t6 = y[24];
- tC = y[25];
- tm = y[26];
- tS = y[27];
- te = y[28];
- tK = y[29];
- tu = y[30];
- t_ = y[31];
- t1 = y[32];
- tx = y[33];
- th = y[34];
- tN = y[35];
- t9 = y[36];
- tF = y[37];
- tp = y[38];
- tV = y[39];
- t5 = y[40];
- tB = y[41];
- tl = y[42];
- tR = y[43];
- td = y[44];
- tJ = y[45];
- tt = y[46];
- tZ = y[47];
- t3 = y[48];
- tz = y[49];
- tj = y[50];
- tP = y[51];
- tb = y[52];
- tH = y[53];
- tr = y[54];
- tX = y[55];
- t7 = y[56];
- tD = y[57];
- tn = y[58];
- tT = y[59];
- tf = y[60];
- tL = y[61];
- tv = y[62];
- t = y[63];
- OD_IDCT_64_PR(t0, tw, tg, tM, t8, tE, to, tU, t4, tA, tk, tQ, tc, tI, ts, tY,
- t2, ty, ti, tO, ta, tG, tq, tW, t6, tC, tm, tS, te, tK, tu, t_, t1, tx, th,
- tN, t9, tF, tp, tV, t5, tB, tl, tR, td, tJ, tt, tZ, t3, tz, tj, tP, tb, tH,
- tr, tX, t7, tD, tn, tT, tf, tL, tv, t);
- x[0*xstride] = (od_coeff)t0;
- x[1*xstride] = (od_coeff)t1;
- x[2*xstride] = (od_coeff)t2;
- x[3*xstride] = (od_coeff)t3;
- x[4*xstride] = (od_coeff)t4;
- x[5*xstride] = (od_coeff)t5;
- x[6*xstride] = (od_coeff)t6;
- x[7*xstride] = (od_coeff)t7;
- x[8*xstride] = (od_coeff)t8;
- x[9*xstride] = (od_coeff)t9;
- x[10*xstride] = (od_coeff)ta;
- x[11*xstride] = (od_coeff)tb;
- x[12*xstride] = (od_coeff)tc;
- x[13*xstride] = (od_coeff)td;
- x[14*xstride] = (od_coeff)te;
- x[15*xstride] = (od_coeff)tf;
- x[16*xstride] = (od_coeff)tg;
- x[17*xstride] = (od_coeff)th;
- x[18*xstride] = (od_coeff)ti;
- x[19*xstride] = (od_coeff)tj;
- x[20*xstride] = (od_coeff)tk;
- x[21*xstride] = (od_coeff)tl;
- x[22*xstride] = (od_coeff)tm;
- x[23*xstride] = (od_coeff)tn;
- x[24*xstride] = (od_coeff)to;
- x[25*xstride] = (od_coeff)tp;
- x[26*xstride] = (od_coeff)tq;
- x[27*xstride] = (od_coeff)tr;
- x[28*xstride] = (od_coeff)ts;
- x[29*xstride] = (od_coeff)tt;
- x[30*xstride] = (od_coeff)tu;
- x[31*xstride] = (od_coeff)tv;
- x[32*xstride] = (od_coeff)tw;
- x[33*xstride] = (od_coeff)tx;
- x[34*xstride] = (od_coeff)ty;
- x[35*xstride] = (od_coeff)tz;
- x[36*xstride] = (od_coeff)tA;
- x[37*xstride] = (od_coeff)tB;
- x[38*xstride] = (od_coeff)tC;
- x[39*xstride] = (od_coeff)tD;
- x[40*xstride] = (od_coeff)tE;
- x[41*xstride] = (od_coeff)tF;
- x[41*xstride] = (od_coeff)tF;
- x[42*xstride] = (od_coeff)tG;
- x[43*xstride] = (od_coeff)tH;
- x[44*xstride] = (od_coeff)tI;
- x[45*xstride] = (od_coeff)tJ;
- x[46*xstride] = (od_coeff)tK;
- x[47*xstride] = (od_coeff)tL;
- x[48*xstride] = (od_coeff)tM;
- x[49*xstride] = (od_coeff)tN;
- x[50*xstride] = (od_coeff)tO;
- x[51*xstride] = (od_coeff)tP;
- x[52*xstride] = (od_coeff)tQ;
- x[53*xstride] = (od_coeff)tR;
- x[54*xstride] = (od_coeff)tS;
- x[55*xstride] = (od_coeff)tT;
- x[56*xstride] = (od_coeff)tU;
- x[57*xstride] = (od_coeff)tV;
- x[58*xstride] = (od_coeff)tW;
- x[59*xstride] = (od_coeff)tX;
- x[60*xstride] = (od_coeff)tY;
- x[61*xstride] = (od_coeff)tZ;
- x[62*xstride] = (od_coeff)t_;
- x[63*xstride] = (od_coeff)t;
-}
-#endif
-
-void od_bin_fidtx4(od_coeff y[4], const od_coeff *x, int xstride) {
- int i;
- for (i = 0; i < 4; i++)
- y[i] = x[i*xstride];
-}
-
-void od_bin_fidtx8(od_coeff y[8], const od_coeff *x, int xstride) {
- int i;
- for (i = 0; i < 8; i++)
- y[i] = x[i*xstride];
-}
-
-void od_bin_fidtx16(od_coeff y[16], const od_coeff *x, int xstride) {
- int i;
- for (i = 0; i < 16; i++)
- y[i] = x[i*xstride];
-}
-
-void od_bin_fidtx32(od_coeff y[32], const od_coeff *x, int xstride) {
- int i;
- for (i = 0; i < 32; i++)
- y[i] = x[i*xstride];
-}
-
-#if CONFIG_TX64X64
-void od_bin_fidtx64(od_coeff y[64], const od_coeff *x, int xstride) {
- int i;
- for (i = 0; i < 64; i++)
- y[i] = x[i*xstride];
-}
-#endif
-
-void od_bin_iidtx4(od_coeff *x, int xstride, const od_coeff y[4]) {
- int i;
- for (i = 0; i < 4; i++)
- x[i*xstride] = y[i];
-}
-
-void od_bin_iidtx8(od_coeff *x, int xstride, const od_coeff y[8]) {
- int i;
- for (i = 0; i < 8; i++)
- x[i*xstride] = y[i];
-}
-
-void od_bin_iidtx16(od_coeff *x, int xstride, const od_coeff y[16]) {
- int i;
- for (i = 0; i < 16; i++)
- x[i*xstride] = y[i];
-}
-
-void od_bin_iidtx32(od_coeff *x, int xstride, const od_coeff y[32]) {
- int i;
- for (i = 0; i < 32; i++)
- x[i*xstride] = y[i];
-}
-
-#if CONFIG_TX64X64
-void od_bin_iidtx64(od_coeff *x, int xstride, const od_coeff y[64]) {
- int i;
- for (i = 0; i < 64; i++)
- x[i*xstride] = y[i];
-}
-#endif
-
-// Below are intermediate wrappers that handle the case when
-// tran_low_t is a smaller type than od_coeff
-void daala_fdct4(const tran_low_t *input, tran_low_t *output) {
- int i;
- od_coeff x[4];
- od_coeff y[4];
- for (i = 0; i < 4; i++) x[i] = (od_coeff)input[i];
- od_bin_fdct4(y, x, 1);
- for (i = 0; i < 4; i++) output[i] = (tran_low_t)y[i];
-}
-
-void daala_idct4(const tran_low_t *input, tran_low_t *output) {
- int i;
- od_coeff x[4];
- od_coeff y[4];
- for (i = 0; i < 4; i++) y[i] = input[i];
- od_bin_idct4(x, 1, y);
- for (i = 0; i < 4; i++) output[i] = (tran_low_t)x[i];
-}
-
-void daala_fdst4(const tran_low_t *input, tran_low_t *output) {
- int i;
- od_coeff x[4];
- od_coeff y[4];
- for (i = 0; i < 4; i++) x[i] = (od_coeff)input[i];
- od_bin_fdst4(y, x, 1);
- for (i = 0; i < 4; i++) output[i] = (tran_low_t)y[i];
-}
-
-void daala_idst4(const tran_low_t *input, tran_low_t *output) {
- int i;
- od_coeff x[4];
- od_coeff y[4];
- for (i = 0; i < 4; i++) y[i] = input[i];
- od_bin_idst4(x, 1, y);
- for (i = 0; i < 4; i++) output[i] = (tran_low_t)x[i];
-}
-
-void daala_idtx4(const tran_low_t *input, tran_low_t *output) {
- int i;
- for (i = 0; i < 4; i++) output[i] = input[i];
-}
-
-void daala_fdct8(const tran_low_t *input, tran_low_t *output) {
- int i;
- od_coeff x[8];
- od_coeff y[8];
- for (i = 0; i < 8; i++) x[i] = (od_coeff)input[i];
- od_bin_fdct8(y, x, 1);
- for (i = 0; i < 8; i++) output[i] = (tran_low_t)y[i];
-}
-
-void daala_idct8(const tran_low_t *input, tran_low_t *output) {
- int i;
- od_coeff x[8];
- od_coeff y[8];
- for (i = 0; i < 8; i++) y[i] = (od_coeff)input[i];
- od_bin_idct8(x, 1, y);
- for (i = 0; i < 8; i++) output[i] = (tran_low_t)x[i];
-}
-
-void daala_fdst8(const tran_low_t *input, tran_low_t *output) {
- int i;
- od_coeff x[8];
- od_coeff y[8];
- for (i = 0; i < 8; i++) x[i] = (od_coeff)input[i];
- od_bin_fdst8(y, x, 1);
- for (i = 0; i < 8; i++) output[i] = (tran_low_t)y[i];
-}
-
-void daala_idst8(const tran_low_t *input, tran_low_t *output) {
- int i;
- od_coeff x[8];
- od_coeff y[8];
- for (i = 0; i < 8; i++) y[i] = (od_coeff)input[i];
- od_bin_idst8(x, 1, y);
- for (i = 0; i < 8; i++) output[i] = (tran_low_t)x[i];
-}
-
-void daala_idtx8(const tran_low_t *input, tran_low_t *output) {
- int i;
- for (i = 0; i < 8; i++) output[i] = input[i];
-}
-
-void daala_fdct16(const tran_low_t *input, tran_low_t *output) {
- int i;
- od_coeff x[16];
- od_coeff y[16];
- for (i = 0; i < 16; i++) x[i] = (od_coeff)input[i];
- od_bin_fdct16(y, x, 1);
- for (i = 0; i < 16; i++) output[i] = (tran_low_t)y[i];
-}
-
-void daala_idct16(const tran_low_t *input, tran_low_t *output) {
- int i;
- od_coeff x[16];
- od_coeff y[16];
- for (i = 0; i < 16; i++) y[i] = (od_coeff)input[i];
- od_bin_idct16(x, 1, y);
- for (i = 0; i < 16; i++) output[i] = (tran_low_t)x[i];
-}
-
-void daala_fdst16(const tran_low_t *input, tran_low_t *output) {
- int i;
- od_coeff x[16];
- od_coeff y[16];
- for (i = 0; i < 16; i++) x[i] = (od_coeff)input[i];
- od_bin_fdst16(y, x, 1);
- for (i = 0; i < 16; i++) output[i] = (tran_low_t)y[i];
-}
-
-void daala_idst16(const tran_low_t *input, tran_low_t *output) {
- int i;
- od_coeff x[16];
- od_coeff y[16];
- for (i = 0; i < 16; i++) y[i] = (od_coeff)input[i];
- od_bin_idst16(x, 1, y);
- for (i = 0; i < 16; i++) output[i] = (tran_low_t)x[i];
-}
-
-void daala_idtx16(const tran_low_t *input, tran_low_t *output) {
- int i;
- for (i = 0; i < 16; i++) output[i] = input[i];
-}
-
-void daala_fdct32(const tran_low_t *input, tran_low_t *output) {
- int i;
- od_coeff x[32];
- od_coeff y[32];
- for (i = 0; i < 32; i++) x[i] = (od_coeff)input[i];
- od_bin_fdct32(y, x, 1);
- for (i = 0; i < 32; i++) output[i] = (tran_low_t)y[i];
-}
-
-void daala_idct32(const tran_low_t *input, tran_low_t *output) {
- int i;
- od_coeff x[32];
- od_coeff y[32];
- for (i = 0; i < 32; i++) y[i] = (od_coeff)input[i];
- od_bin_idct32(x, 1, y);
- for (i = 0; i < 32; i++) output[i] = (tran_low_t)x[i];
-}
-
-void daala_fdst32(const tran_low_t *input, tran_low_t *output) {
- int i;
- od_coeff x[32];
- od_coeff y[32];
- for (i = 0; i < 32; i++) x[i] = (od_coeff)input[i];
- od_bin_fdst32(y, x, 1);
- for (i = 0; i < 32; i++) output[i] = (tran_low_t)y[i];
-}
-
-void daala_idst32(const tran_low_t *input, tran_low_t *output) {
- int i;
- od_coeff x[32];
- od_coeff y[32];
- for (i = 0; i < 32; i++) y[i] = input[i];
- od_bin_idst32(x, 1, y);
- for (i = 0; i < 32; i++) output[i] = (tran_low_t)x[i];
-}
-
-void daala_idtx32(const tran_low_t *input, tran_low_t *output) {
- int i;
- for (i = 0; i < 32; i++) output[i] = input[i];
-}
-
-#if CONFIG_TX64X64
-void daala_fdct64(const tran_low_t *input, tran_low_t *output) {
- int i;
- od_coeff x[64];
- od_coeff y[64];
- for (i = 0; i < 64; i++) x[i] = (od_coeff)input[i];
- od_bin_fdct64(y, x, 1);
- for (i = 0; i < 64; i++) output[i] = (tran_low_t)y[i];
-}
-
-void daala_idct64(const tran_low_t *input, tran_low_t *output) {
- int i;
- od_coeff x[64];
- od_coeff y[64];
- for (i = 0; i < 64; i++) y[i] = (od_coeff)input[i];
- od_bin_idct64(x, 1, y);
- for (i = 0; i < 64; i++) output[i] = (tran_low_t)x[i];
-}
-
-/* Preserve the "half-right" transform behavior. */
-void daala_fdst64(const tran_low_t *input, tran_low_t *output) {
- int i;
- tran_low_t inputhalf[32];
- for (i = 0; i < 32; ++i) {
- output[32 + i] = input[i];
- }
- for (i = 0; i < 32; ++i) {
- inputhalf[i] = input[i + 32];
- }
- daala_fdct32(inputhalf, output);
-}
-
-/* Preserve the "half-right" transform behavior. */
-void daala_idst64(const tran_low_t *input, tran_low_t *output) {
- int i;
- tran_low_t inputhalf[32];
- for (i = 0; i < 32; ++i) {
- inputhalf[i] = input[i];
- }
- for (i = 0; i < 32; ++i) {
- output[i] = input[32 + i];
- }
- daala_idct32(inputhalf, output + 32);
-}
-
-void daala_idtx64(const tran_low_t *input, tran_low_t *output) {
- int i;
- for (i = 0; i < 64; i++) output[i] = input[i];
-}
-#endif
diff --git a/av1/common/daala_tx.h b/av1/common/daala_tx.h
deleted file mode 100644
index 2943802..0000000
--- a/av1/common/daala_tx.h
+++ /dev/null
@@ -1,65 +0,0 @@
-#ifndef AOM_DSP_DAALA_TX_H_
-#define AOM_DSP_DAALA_TX_H_
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "av1/common/odintrin.h"
-
-void daala_fdct4(const tran_low_t *input, tran_low_t *output);
-void daala_idct4(const tran_low_t *input, tran_low_t *output);
-void daala_fdst4(const tran_low_t *input, tran_low_t *output);
-void daala_idst4(const tran_low_t *input, tran_low_t *output);
-void daala_idtx4(const tran_low_t *input, tran_low_t *output);
-void daala_fdct8(const tran_low_t *input, tran_low_t *output);
-void daala_idct8(const tran_low_t *input, tran_low_t *output);
-void daala_fdst8(const tran_low_t *input, tran_low_t *output);
-void daala_idst8(const tran_low_t *input, tran_low_t *output);
-void daala_idtx8(const tran_low_t *input, tran_low_t *output);
-void daala_fdct16(const tran_low_t *input, tran_low_t *output);
-void daala_idct16(const tran_low_t *input, tran_low_t *output);
-void daala_fdst16(const tran_low_t *input, tran_low_t *output);
-void daala_idst16(const tran_low_t *input, tran_low_t *output);
-void daala_idtx16(const tran_low_t *input, tran_low_t *output);
-void daala_fdct32(const tran_low_t *input, tran_low_t *output);
-void daala_idct32(const tran_low_t *input, tran_low_t *output);
-void daala_fdst32(const tran_low_t *input, tran_low_t *output);
-void daala_idst32(const tran_low_t *input, tran_low_t *output);
-void daala_idtx32(const tran_low_t *input, tran_low_t *output);
-#if CONFIG_TX64X64
-void daala_fdct64(const tran_low_t *input, tran_low_t *output);
-void daala_idct64(const tran_low_t *input, tran_low_t *output);
-void daala_fdst64(const tran_low_t *input, tran_low_t *output);
-void daala_idst64(const tran_low_t *input, tran_low_t *output);
-void daala_idtx64(const tran_low_t *input, tran_low_t *output);
-#endif
-
-void od_bin_fdct4(od_coeff y[4], const od_coeff *x, int xstride);
-void od_bin_idct4(od_coeff *x, int xstride, const od_coeff y[4]);
-void od_bin_fdst4(od_coeff y[4], const od_coeff *x, int xstride);
-void od_bin_idst4(od_coeff *x, int xstride, const od_coeff y[4]);
-void od_bin_fidtx4(od_coeff y[4], const od_coeff *x, int xstride);
-void od_bin_iidtx4(od_coeff *x, int xstride, const od_coeff y[4]);
-void od_bin_fdct8(od_coeff y[8], const od_coeff *x, int xstride);
-void od_bin_idct8(od_coeff *x, int xstride, const od_coeff y[8]);
-void od_bin_fdst8(od_coeff y[8], const od_coeff *x, int xstride);
-void od_bin_idst8(od_coeff *x, int xstride, const od_coeff y[8]);
-void od_bin_fidtx8(od_coeff y[8], const od_coeff *x, int xstride);
-void od_bin_iidtx8(od_coeff *x, int xstride, const od_coeff y[8]);
-void od_bin_fdct16(od_coeff y[16], const od_coeff *x, int xstride);
-void od_bin_idct16(od_coeff *x, int xstride, const od_coeff y[16]);
-void od_bin_fdst16(od_coeff y[16], const od_coeff *x, int xstride);
-void od_bin_idst16(od_coeff *x, int xstride, const od_coeff y[16]);
-void od_bin_fidtx16(od_coeff y[16], const od_coeff *x, int xstride);
-void od_bin_iidtx16(od_coeff *x, int xstride, const od_coeff y[16]);
-void od_bin_fdct32(od_coeff y[32], const od_coeff *x, int xstride);
-void od_bin_idct32(od_coeff *x, int xstride, const od_coeff y[32]);
-void od_bin_fdst32(od_coeff y[32], const od_coeff *x, int xstride);
-void od_bin_idst32(od_coeff *x, int xstride, const od_coeff y[32]);
-void od_bin_fidtx32(od_coeff y[32], const od_coeff *x, int xstride);
-void od_bin_iidtx32(od_coeff *x, int xstride, const od_coeff y[32]);
-#if CONFIG_TX64X64
-void od_bin_fdct64(od_coeff y[64], const od_coeff *x, int xstride);
-void od_bin_idct64(od_coeff *x, int xstride, const od_coeff y[64]);
-void od_bin_fidtx64(od_coeff y[64], const od_coeff *x, int xstride);
-void od_bin_iidtx64(od_coeff *x, int xstride, const od_coeff y[64]);
-#endif
-#endif
diff --git a/av1/common/daala_tx_kernels.h b/av1/common/daala_tx_kernels.h
deleted file mode 100644
index 1559228..0000000
--- a/av1/common/daala_tx_kernels.h
+++ /dev/null
@@ -1,1629 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-/* clang-format off */
-
-#ifndef AOM_DSP_DAALA_TX_KERNELS_H_
-#define AOM_DSP_DAALA_TX_KERNELS_H_
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "av1/common/odintrin.h"
-
-#define AVG_BIAS (0)
-
-static INLINE od_coeff od_add(od_coeff p0, od_coeff p1) {
- return p0 + p1;
-}
-
-static INLINE od_coeff od_sub(od_coeff p0, od_coeff p1) {
- return p0 - p1;
-}
-
-static INLINE od_coeff od_add_avg(od_coeff p0, od_coeff p1) {
- return (od_add(p0, p1) + AVG_BIAS) >> 1;
-}
-
-static INLINE od_coeff od_sub_avg(od_coeff p0, od_coeff p1) {
- return (od_sub(p0, p1) + AVG_BIAS) >> 1;
-}
-
-static INLINE od_coeff od_rshift1(od_coeff v) {
- return (v + (v < 0)) >> 1;
-}
-
-/* Fixed point multiply. */
-static INLINE od_coeff od_mul(od_coeff n, int c, int q) {
- return (n*c + ((1 << q) >> 1)) >> q;
-}
-
-/* Two multiply rotation primative (used when rotating by Pi/4). */
-static INLINE void od_rot2(od_coeff *p0, od_coeff *p1, od_coeff t, int c0,
- int q0, int c1, int q1) {
- *p1 = od_mul(*p0, c0, q0);
- *p0 = od_mul(t, c1, q1);
-}
-
-/* Three multiply rotation primative. */
-static INLINE void od_rot3(od_coeff *p0, od_coeff *p1, od_coeff *t, od_coeff *u,
- int c0, int q0, int c1, int q1, int c2, int q2) {
- *u = od_mul(*p0, c0, q0);
- *p0 = od_mul(*p1, c1, q1);
- *t = od_mul(*t, c2, q2);
-}
-
-#define NONE (0)
-#define AVG (!NONE)
-#define SHIFT (!NONE)
-
-#define ADD (0)
-#define SUB (1)
-
-/* Rotate by Pi/4 and add. */
-static INLINE void od_rotate_pi4_kernel(od_coeff *p0, od_coeff *p1, int c0,
- int q0, int c1, int q1, int type, int avg) {
- od_coeff t;
- t = type == ADD ?
- avg ? od_add_avg(*p1, *p0) : od_add(*p1, *p0) :
- avg ? od_sub_avg(*p1, *p0) : od_sub(*p1, *p0);
- od_rot2(p0, p1, t, c0, q0, c1, q1);
- *p1 = type == ADD ? od_sub(*p1, *p0) : od_add(*p1, *p0);
-}
-
-#define od_rotate_pi4_add(p0, p1, c0, q0, c1, q1) \
- od_rotate_pi4_kernel(p0, p1, c0, q0, c1, q1, ADD, NONE)
-#define od_rotate_pi4_sub(p0, p1, c0, q0, c1, q1) \
- od_rotate_pi4_kernel(p0, p1, c0, q0, c1, q1, SUB, NONE)
-
-#define od_rotate_pi4_add_avg(p0, p1, c0, q0, c1, q1) \
- od_rotate_pi4_kernel(p0, p1, c0, q0, c1, q1, ADD, AVG)
-#define od_rotate_pi4_sub_avg(p0, p1, c0, q0, c1, q1) \
- od_rotate_pi4_kernel(p0, p1, c0, q0, c1, q1, SUB, AVG)
-
-/* Rotate and add. */
-static INLINE void od_rotate_kernel(od_coeff *p0, od_coeff *p1, od_coeff v,
- int c0, int q0, int c1, int q1, int c2, int q2, int type, int avg, int shift) {
- od_coeff u;
- od_coeff t;
- t = type == ADD ?
- avg ? od_add_avg(*p1, v) : od_add(*p1, v) :
- avg ? od_sub_avg(*p1, v) : od_sub(*p1, v);
- od_rot3(p0, p1, &t, &u, c0, q0, c1, q1, c2, q2);
- *p0 = od_add(*p0, t);
- if (shift) t = od_rshift1(t);
- *p1 = type == ADD ? od_sub(u, t) : od_add(u, t);
-}
-
-#define od_rotate_add(p0, p1, c0, q0, c1, q1, c2, q2, shift) \
- od_rotate_kernel(p0, p1, *p0, c0, q0, c1, q1, c2, q2, ADD, NONE, shift)
-#define od_rotate_sub(p0, p1, c0, q0, c1, q1, c2, q2, shift) \
- od_rotate_kernel(p0, p1, *p0, c0, q0, c1, q1, c2, q2, SUB, NONE, shift)
-
-#define od_rotate_add_avg(p0, p1, c0, q0, c1, q1, c2, q2, shift) \
- od_rotate_kernel(p0, p1, *p0, c0, q0, c1, q1, c2, q2, ADD, AVG, shift)
-#define od_rotate_sub_avg(p0, p1, c0, q0, c1, q1, c2, q2, shift) \
- od_rotate_kernel(p0, p1, *p0, c0, q0, c1, q1, c2, q2, SUB, AVG, shift)
-
-#define od_rotate_add_half(p0, p1, v, c0, q0, c1, q1, c2, q2, shift) \
- od_rotate_kernel(p0, p1, v, c0, q0, c1, q1, c2, q2, ADD, NONE, shift)
-#define od_rotate_sub_half(p0, p1, v, c0, q0, c1, q1, c2, q2, shift) \
- od_rotate_kernel(p0, p1, v, c0, q0, c1, q1, c2, q2, SUB, NONE, shift)
-
-/* Rotate and subtract with negation. */
-static INLINE void od_rotate_neg_kernel(od_coeff *p0, od_coeff *p1,
- int c0, int q0, int c1, int q1, int c2, int q2, int avg) {
- od_coeff u;
- od_coeff t;
- t = avg ? od_sub_avg(*p0, *p1) : od_sub(*p0, *p1);
- od_rot3(p0, p1, &t, &u, c0, q0, c1, q1, c2, q2);
- *p0 = od_sub(*p0, t);
- *p1 = od_sub(t, u);
-}
-
-#define od_rotate_neg(p0, p1, c0, q0, c1, q1, c2, q2) \
- od_rotate_neg_kernel(p0, p1, c0, q0, c1, q1, c2, q2, NONE)
-#define od_rotate_neg_avg(p0, p1, c0, q0, c1, q1, c2, q2) \
- od_rotate_neg_kernel(p0, p1, c0, q0, c1, q1, c2, q2, AVG)
-
-/* Computes the +/- addition butterfly (asymmetric output).
- The inverse to this function is od_butterfly_add_asym().
-
- p0 = p0 + p1;
- p1 = p1 - p0/2; */
-static INLINE void od_butterfly_add(od_coeff *p0, od_coeff *p0h, od_coeff *p1) {
- od_coeff p0h_;
- *p0 = od_add(*p0, *p1);
- p0h_ = od_rshift1(*p0);
- *p1 = od_sub(*p1, p0h_);
- if (p0h != NULL) *p0h = p0h_;
-}
-
-/* Computes the asymmetric +/- addition butterfly (unscaled output).
- The inverse to this function is od_butterfly_add().
-
- p1 = p1 + p0/2;
- p0 = p0 - p1; */
-static INLINE void od_butterfly_add_asym(od_coeff *p0, od_coeff p0h,
- od_coeff *p1) {
- *p1 = od_add(*p1, p0h);
- *p0 = od_sub(*p0, *p1);
-}
-
-/* Computes the +/- subtraction butterfly (asymmetric output).
- The inverse to this function is od_butterfly_sub_asym().
-
- p0 = p0 - p1;
- p1 = p1 + p0/2; */
-static INLINE void od_butterfly_sub(od_coeff *p0, od_coeff *p0h, od_coeff *p1) {
- od_coeff p0h_;
- *p0 = od_sub(*p0, *p1);
- p0h_ = od_rshift1(*p0);
- *p1 = od_add(*p1, p0h_);
- if (p0h != NULL) *p0h = p0h_;
-}
-
-/* Computes the asymmetric +/- subtraction butterfly (unscaled output).
- The inverse to this function is od_butterfly_sub().
-
- p1 = p1 - p0/2;
- p0 = p0 + p1; */
-static INLINE void od_butterfly_sub_asym(od_coeff *p0, od_coeff p0h,
- od_coeff *p1) {
- *p1 = od_sub(*p1, p0h);
- *p0 = od_add(*p0, *p1);
-}
-
-/* Computes the +/- subtract and negate butterfly (asymmetric output).
- The inverse to this function is od_butterfly_neg_asym().
-
- p1 = p1 - p0;
- p0 = p0 + p1/2;
- p1 = -p1; */
-static INLINE void od_butterfly_neg(od_coeff *p0, od_coeff *p1, od_coeff *p1h) {
- *p1 = od_sub(*p0, *p1);
- *p1h = od_rshift1(*p1);
- *p0 = od_sub(*p0, *p1h);
-}
-
-/* Computes the asymmetric +/- negate and subtract butterfly (unscaled output).
- The inverse to this function is od_butterfly_neg().
-
- p1 = -p1;
- p0 = p0 - p1/2;
- p1 = p1 + p0; */
-static INLINE void od_butterfly_neg_asym(od_coeff *p0, od_coeff *p1,
- od_coeff p1h) {
- *p0 = od_add(*p0, p1h);
- *p1 = od_sub(*p0, *p1);
-}
-
-/* --- 2-point Transforms --- */
-
-/**
- * 2-point orthonormal Type-II fDCT
- */
-static INLINE void od_fdct_2(od_coeff *p0, od_coeff *p1) {
- /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
- /* 11585/8192 = 2*Cos[Pi/4] = 1.4142135623730951 */
- od_rotate_pi4_sub_avg(p1, p0, 11585, 13, 11585, 13);
-}
-
-/**
- * 2-point orthonormal Type-II iDCT
- */
-static INLINE void od_idct_2(od_coeff *p0, od_coeff *p1) {
- /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
- /* 11585/16384 = Cos[Pi/4] = 0.7071067811865475 */
- od_rotate_pi4_add(p0, p1, 11585, 13, 11585, 14);
-}
-
-/**
- * 2-point asymmetric Type-II fDCT
- */
-static INLINE void od_fdct_2_asym(od_coeff *p0, od_coeff *p1, od_coeff p1h) {
- od_butterfly_neg_asym(p0, p1, p1h);
-}
-
-/**
- * 2-point asymmetric Type-II iDCT
- */
-static INLINE void od_idct_2_asym(od_coeff *p0, od_coeff *p1, od_coeff *p1h) {
- od_butterfly_neg(p0, p1, p1h);
-}
-
-/**
- * 2-point orthonormal Type-IV fDCT
- */
-static INLINE void od_fdst_2(od_coeff *p0, od_coeff *p1) {
-
- /* Stage 0 */
-
- /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
- /* 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461971 */
- /* 3135/4096 = 2*Cos[3*Pi/8] = 0.7653668647301796 */
- od_rotate_add_avg(p0, p1, 21407, 14, 8867, 14, 3135, 12, NONE);
-}
-
-/**
- * 2-point orthonormal Type-IV iDCT
- */
-static INLINE void od_idst_2(od_coeff *p0, od_coeff *p1) {
- od_fdst_2(p0, p1);
-}
-
-/**
- * 2-point asymmetric Type-IV fDCT
- */
-static INLINE void od_fdst_2_asym(od_coeff *p0, od_coeff p0h, od_coeff *p1) {
-
- /* Stage 0 */
-
- /* 15137/16384 = (Sin[3*Pi/8] + Cos[3*Pi/8])/Sqrt[2] = 0.9238795325112867 */
- /* 3135/4096 = (Sin[3*Pi/8] - Cos[3*Pi/8])*Sqrt[2] = 0.7653668647301795 */
- /* 8867/16384 = Cos[3*Pi/8]*Sqrt[2] = 0.5411961001461971 */
- od_rotate_add_half(p0, p1, p0h, 15137, 14, 3135, 12, 8867, 14, NONE);
-}
-
-/**
- * 2-point asymmetric Type-IV iDCT
- */
-static INLINE void od_idst_2_asym(od_coeff *p0, od_coeff *p1) {
-
- /* Stage 0 */
-
- /* 15137/16384 = (Sin[3*Pi/8] + Cos[3*Pi/8])/Sqrt[2] = 0.9238795325112867 */
- /* 3135/4096 = (Sin[3*Pi/8] - Cos[3*Pi/8])*Sqrt[2] = 0.7653668647301795 */
- /* 8867/8192 = 2*Cos[3*Pi/8]*Sqrt[2] = 1.0823922002923940 */
- od_rotate_add_avg(p0, p1, 15137, 14, 3135, 12, 8867, 13, SHIFT);
-}
-
-/* --- 4-point Transforms --- */
-
-/**
- * 4-point orthonormal Type-II fDCT
- */
-static INLINE void od_fdct_4(od_coeff *q0, od_coeff *q1, od_coeff *q2,
- od_coeff *q3) {
- od_coeff q1h;
- od_coeff q3h;
-
- /* +/- Butterflies with asymmetric output. */
- od_butterfly_neg(q0, q3, &q3h);
- od_butterfly_add(q1, &q1h, q2);
-
- /* Embedded 2-point transforms with asymmetric input. */
- od_fdct_2_asym(q0, q1, q1h);
- od_fdst_2_asym(q3, q3h, q2);
-}
-
-/**
- * 4-point orthonormal Type-II iDCT
- */
-static INLINE void od_idct_4(od_coeff *q0, od_coeff *q2,
- od_coeff *q1, od_coeff *q3) {
- od_coeff q1h;
-
- /* Embedded 2-point transforms with asymmetric output. */
- od_idst_2_asym(q3, q2);
- od_idct_2_asym(q0, q1, &q1h);
-
- /* +/- Butterflies with asymmetric input. */
- od_butterfly_add_asym(q1, q1h, q2);
- od_butterfly_neg_asym(q0, q3, od_rshift1(*q3));
-}
-
-/**
- * 4-point asymmetric Type-II fDCT
- */
-static INLINE void od_fdct_4_asym(od_coeff *q0, od_coeff *q1, od_coeff q1h,
- od_coeff *q2, od_coeff *q3, od_coeff q3h) {
-
- /* +/- Butterflies with asymmetric input. */
- od_butterfly_neg_asym(q0, q3, q3h);
- od_butterfly_sub_asym(q1, q1h, q2);
-
- /* Embedded 2-point orthonormal transforms. */
- od_fdct_2(q0, q1);
- od_fdst_2(q3, q2);
-}
-
-/**
- * 4-point asymmetric Type-II iDCT
- */
-static INLINE void od_idct_4_asym(od_coeff *q0, od_coeff *q2,
- od_coeff *q1, od_coeff *q1h,
- od_coeff *q3, od_coeff *q3h) {
-
- /* Embedded 2-point orthonormal transforms. */
- od_idst_2(q3, q2);
- od_idct_2(q0, q1);
-
- /* +/- Butterflies with asymmetric output. */
- od_butterfly_sub(q1, q1h, q2);
- od_butterfly_neg(q0, q3, q3h);
-}
-
-/**
- * 4-point orthonormal Type-IV fDST
- */
-static INLINE void od_fdst_4(od_coeff *q0, od_coeff *q1,
- od_coeff *q2, od_coeff *q3) {
-
- /* Stage 0 */
-
- /* 13623/16384 = (Sin[7*Pi/16] + Cos[7*Pi/16])/Sqrt[2] = 0.831469612302545 */
- /* 18205/16384 = (Sin[7*Pi/16] - Cos[7*Pi/16])*Sqrt[2] = 1.111140466039204 */
- /* 9041/32768 = Cos[7*Pi/16]*Sqrt[2] = 0.275899379282943 */
- od_rotate_add(q0, q3, 13623, 14, 18205, 14, 9041, 15, SHIFT);
-
- /* 16069/16384 = (Sin[5*Pi/16] + Cos[5*Pi/16])/Sqrt[2] = 0.9807852804032304 */
- /* 12785/32768 = (Sin[5*Pi/16] - Cos[5*Pi/16])*Sqrt[2] = 0.3901806440322566 */
- /* 12873/16384 = Cos[5*Pi/16]*Sqrt[2] = 0.7856949583871021 */
- od_rotate_sub(q2, q1, 16069, 14, 12785, 15, 12873, 14, SHIFT);
-
- /* Stage 1 */
-
- od_butterfly_sub_asym(q0, od_rshift1(*q0), q1);
- od_butterfly_sub_asym(q2, od_rshift1(*q2), q3);
-
- /* Stage 2 */
-
- /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
- /* 11585/8192 = 2*Cos[Pi/4] = 1.4142135623730951 */
- od_rotate_pi4_add_avg(q2, q1, 11585, 13, 11585, 13);
-}
-
-/**
- * 4-point orthonormal Type-IV iDST
- */
-static INLINE void od_idst_4(od_coeff *q0, od_coeff *q2,
- od_coeff *q1, od_coeff *q3) {
- od_coeff q0h;
- od_coeff q2h;
-
- /* Stage 0 */
-
- /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
- /* 11585/8192 = 2*Cos[Pi/4] = 1.4142135623730951 */
- od_rotate_pi4_add_avg(q2, q1, 11585, 13, 11585, 13);
-
- /* Stage 1 */
-
- od_butterfly_sub(q2, &q2h, q3);
- od_butterfly_sub(q0, &q0h, q1);
-
- /* Stage 2 */
-
- /* 16069/16384 = (Sin[5*Pi/16] + Cos[5*Pi/16])/Sqrt[2] = 0.9807852804032304 */
- /* 12785/32768 = (Sin[5*Pi/16] - Cos[5*Pi/16])*Sqrt[2] = 0.3901806440322566 */
- /* 12873/16384 = Cos[5*Pi/16]*Sqrt[2] = 0.7856949583871021 */
- od_rotate_sub_half(q2, q1, q2h, 16069, 14, 12785, 15, 12873, 14, NONE);
-
- /* 13623/16384 = (Sin[7*Pi/16] + Cos[7*Pi/16])/Sqrt[2] = 0.831469612302545 */
- /* 18205/16384 = (Sin[7*Pi/16] - Cos[7*Pi/16])*Sqrt[2] = 1.111140466039204 */
- /* 9041/32768 = Cos[7*Pi/16]*Sqrt[2] = 0.275899379282943 */
- od_rotate_add_half(q0, q3, q0h, 13623, 14, 18205, 14, 9041, 15, NONE);
-}
-
-/**
- * 4-point asymmetric Type-IV fDST
- */
-static INLINE void od_fdst_4_asym(od_coeff *q0, od_coeff q0h, od_coeff *q1,
- od_coeff *q2, od_coeff q2h, od_coeff *q3) {
-
- /* Stage 0 */
-
- /* 9633/16384 = (Sin[7*Pi/16] + Cos[7*Pi/16])/2 = 0.5879378012096793 */
- /* 12873/8192 = (Sin[7*Pi/16] - Cos[7*Pi/16])*2 = 1.5713899167742045 */
- /* 12785/32768 = Cos[7*Pi/16]*2 = 0.3901806440322565 */
- od_rotate_add_half(q0, q3, q0h, 9633, 14, 12873, 13, 12785, 15, SHIFT);
-
- /* 22725/32768 = (Sin[5*Pi/16] + Cos[5*Pi/16])/2 = 0.6935199226610738 */
- /* 18081/32768 = (Sin[5*Pi/16] - Cos[5*Pi/16])*2 = 0.5517987585658861 */
- /* 18205/16384 = Cos[5*Pi/16]*2 = 1.1111404660392044 */
- od_rotate_sub_half(q2, q1, q2h, 22725, 15, 18081, 15, 18205, 14, SHIFT);
-
- /* Stage 1 */
-
- od_butterfly_sub_asym(q0, od_rshift1(*q0), q1);
- od_butterfly_sub_asym(q2, od_rshift1(*q2), q3);
-
- /* Stage 2 */
-
- /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
- /* 11585/8192 = 2*Cos[Pi/4] = 1.4142135623730951 */
- od_rotate_pi4_add_avg(q2, q1, 11585, 13, 11585, 13);
-}
-
-/**
- * 4-point asymmetric Type-IV iDST
- */
-static INLINE void od_idst_4_asym(od_coeff *q0, od_coeff *q2,
- od_coeff *q1, od_coeff *q3) {
- od_coeff q0h;
- od_coeff q2h;
-
- /* Stage 0 */
-
- /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
- /* 11585/8192 = 2*Cos[Pi/4] = 1.4142135623730951 */
- od_rotate_pi4_add_avg(q2, q1, 11585, 13, 11585, 13);
-
- /* Stage 1 */
-
- od_butterfly_sub(q2, &q2h, q3);
- od_butterfly_sub(q0, &q0h, q1);
-
- /* Stage 2 */
-
- /* 22725/32768 = (Sin[5*Pi/16] + Cos[5*Pi/16])/2 = 0.6935199226610738 */
- /* 18081/32768 = (Sin[5*Pi/16] - Cos[5*Pi/16])*2 = 0.5517987585658861 */
- /* 18205/16384 = Cos[5*Pi/16]*2 = 1.1111404660392044 */
- od_rotate_sub_half(q2, q1, q2h, 22725, 15, 18081, 15, 18205, 14, SHIFT);
-
- /* 9633/16384 = (Sin[7*Pi/16] + Cos[7*Pi/16])/2 = 0.5879378012096793 */
- /* 12873/8192 = (Sin[7*Pi/16] - Cos[7*Pi/16])*2 = 1.5713899167742045 */
- /* 12785/32768 = Cos[7*Pi/16]*2 = 0.3901806440322565 */
- od_rotate_add_half(q0, q3, q0h, 9633, 14, 12873, 13, 12785, 15, SHIFT);
-}
-
-/* --- 8-point Transforms --- */
-
-/**
- * 8-point orthonormal Type-II fDCT
- */
-static INLINE void od_fdct_8(od_coeff *r0, od_coeff *r1,
- od_coeff *r2, od_coeff *r3,
- od_coeff *r4, od_coeff *r5,
- od_coeff *r6, od_coeff *r7) {
- od_coeff r1h;
- od_coeff r3h;
- od_coeff r5h;
- od_coeff r7h;
-
- /* +/- Butterflies with asymmetric output. */
- od_butterfly_neg(r0, r7, &r7h);
- od_butterfly_add(r1, &r1h, r6);
- od_butterfly_neg(r2, r5, &r5h);
- od_butterfly_add(r3, &r3h, r4);
-
- /* Embedded 4-point forward transforms with asymmetric input. */
- od_fdct_4_asym(r0, r1, r1h, r2, r3, r3h);
- od_fdst_4_asym(r7, r7h, r6, r5, r5h, r4);
-}
-
-/**
- * 8-point orthonormal Type-II iDCT
- */
-static INLINE void od_idct_8(od_coeff *r0, od_coeff *r4,
- od_coeff *r2, od_coeff *r6,
- od_coeff *r1, od_coeff *r5,
- od_coeff *r3, od_coeff *r7) {
- od_coeff r1h;
- od_coeff r3h;
-
- /* Embedded 4-point inverse transforms with asymmetric output. */
- od_idst_4_asym(r7, r5, r6, r4);
- od_idct_4_asym(r0, r2, r1, &r1h, r3, &r3h);
-
- /* +/- Butterflies with asymmetric input. */
- od_butterfly_add_asym(r3, r3h, r4);
- od_butterfly_neg_asym(r2, r5, od_rshift1(*r5));
- od_butterfly_add_asym(r1, r1h, r6);
- od_butterfly_neg_asym(r0, r7, od_rshift1(*r7));
-}
-
-/**
- * 8-point asymmetric Type-II fDCT
- */
-static INLINE void od_fdct_8_asym(od_coeff *r0, od_coeff *r1, od_coeff r1h,
- od_coeff *r2, od_coeff *r3, od_coeff r3h,
- od_coeff *r4, od_coeff *r5, od_coeff r5h,
- od_coeff *r6, od_coeff *r7, od_coeff r7h) {
-
- /* +/- Butterflies with asymmetric input. */
- od_butterfly_neg_asym(r0, r7, r7h);
- od_butterfly_sub_asym(r1, r1h, r6);
- od_butterfly_neg_asym(r2, r5, r5h);
- od_butterfly_sub_asym(r3, r3h, r4);
-
- /* Embedded 4-point orthonormal transforms. */
- od_fdct_4(r0, r1, r2, r3);
- od_fdst_4(r7, r6, r5, r4);
-}
-
-/**
- * 8-point asymmetric Type-II iDCT
- */
-static INLINE void od_idct_8_asym(od_coeff *r0, od_coeff *r4,
- od_coeff *r2, od_coeff *r6,
- od_coeff *r1, od_coeff *r1h,
- od_coeff *r5, od_coeff *r5h,
- od_coeff *r3, od_coeff *r3h,
- od_coeff *r7, od_coeff *r7h) {
-
- /* Embedded 4-point inverse orthonormal transforms. */
- od_idst_4(r7, r5, r6, r4);
- od_idct_4(r0, r2, r1, r3);
-
- /* +/- Butterflies with asymmetric output. */
- od_butterfly_sub(r3, r3h, r4);
- od_butterfly_neg(r2, r5, r5h);
- od_butterfly_sub(r1, r1h, r6);
- od_butterfly_neg(r0, r7, r7h);
-}
-
-/**
- * 8-point orthonormal Type-IV fDST
- */
-static INLINE void od_fdst_8(od_coeff *r0, od_coeff *r1,
- od_coeff *r2, od_coeff *r3,
- od_coeff *r4, od_coeff *r5,
- od_coeff *r6, od_coeff *r7) {
- od_coeff r0h;
- od_coeff r2h;
- od_coeff r5h;
- od_coeff r7h;
-
- /* Stage 0 */
-
- /* 17911/16384 = Sin[15*Pi/32] + Cos[15*Pi/32] = 1.0932018670017576 */
- /* 14699/16384 = Sin[15*Pi/32] - Cos[15*Pi/32] = 0.8971675863426363 */
- /* 803/8192 = Cos[15*Pi/32] = 0.0980171403295606 */
- od_rotate_add(r0, r7, 17911, 14, 14699, 14, 803, 13, NONE);
-
- /* 40869/32768 = Sin[13*Pi/32] + Cos[13*Pi/32] = 1.24722501298667123 */
- /* 21845/32768 = Sin[13*Pi/32] - Cos[13*Pi/32] = 0.66665565847774650 */
- /* 1189/4096 = Cos[13*Pi/32] = 0.29028467725446233 */
- od_rotate_sub(r6, r1, 40869, 15, 21845, 15, 1189, 12, NONE);
-
- /* 22173/16384 = Sin[11*Pi/32] + Cos[11*Pi/32] = 1.3533180011743526 */
- /* 3363/8192 = Sin[11*Pi/32] - Cos[11*Pi/32] = 0.4105245275223574 */
- /* 15447/32768 = Cos[11*Pi/32] = 0.47139673682599764 */
- od_rotate_add(r2, r5, 22173, 14, 3363, 13, 15447, 15, NONE);
-
- /* 23059/16384 = Sin[9*Pi/32] + Cos[9*Pi/32] = 1.4074037375263826 */
- /* 2271/16384 = Sin[9*Pi/32] - Cos[9*Pi/32] = 0.1386171691990915 */
- /* 5197/8192 = Cos[9*Pi/32] = 0.6343932841636455 */
- od_rotate_sub(r4, r3, 23059, 14, 2271, 14, 5197, 13, NONE);
-
- /* Stage 1 */
-
- od_butterfly_add(r0, &r0h, r3);
- od_butterfly_sub(r2, &r2h, r1);
- od_butterfly_add(r5, &r5h, r6);
- od_butterfly_sub(r7, &r7h, r4);
-
- /* Stage 2 */
-
- od_butterfly_add_asym(r7, r7h, r6);
- od_butterfly_add_asym(r5, r5h, r3);
- od_butterfly_add_asym(r2, r2h, r4);
- od_butterfly_sub_asym(r0, r0h, r1);
-
- /* Stage 3 */
-
- /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
- /* 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
- /* 3135/4096 = 2*Cos[3*Pi/8] = 0.7653668647301796 */
- od_rotate_sub_avg(r3, r4, 21407, 14, 8867, 14, 3135, 12, NONE);
-
- /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
- /* 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
- /* 3135/4096 = 2*Cos[3*Pi/8] = 0.7653668647301796 */
- od_rotate_neg_avg(r2, r5, 21407, 14, 8867, 14, 3135, 12);
-
- /* 46341/32768 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
- /* 46341/32768 = 2*Cos[Pi/4] = 1.4142135623730951 */
- od_rotate_pi4_sub_avg(r1, r6, 46341, 15, 46341, 15);
-}
-
-/**
- * 8-point orthonormal Type-IV iDST
- */
-static INLINE void od_idst_8(od_coeff *r0, od_coeff *r4,
- od_coeff *r2, od_coeff *r6,
- od_coeff *r1, od_coeff *r5,
- od_coeff *r3, od_coeff *r7) {
- od_coeff r0h;
- od_coeff r2h;
- od_coeff r5h;
- od_coeff r7h;
-
- /* Stage 3 */
-
- /* 46341/32768 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
- /* 46341/32768 = 2*Cos[Pi/4] = 1.4142135623730951 */
- od_rotate_pi4_add_avg(r6, r1, 11585, 13, 46341, 15);
-
- /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
- /* 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
- /* 3135/4096 = 2*Cos[3*Pi/8] = 0.7653668647301796 */
- od_rotate_neg_avg(r5, r2, 21407, 14, 8867, 14, 3135, 12);
-
- /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
- /* 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
- /* 3135/4096 = 2*Cos[3*Pi/8] = 0.7653668647301796 */
- od_rotate_add_avg(r4, r3, 21407, 14, 8867, 14, 3135, 12, NONE);
-
- /* Stage 2 */
-
- od_butterfly_sub(r0, &r0h, r1);
- od_butterfly_add(r2, &r2h, r4);
- od_butterfly_add(r5, &r5h, r3);
- od_butterfly_add(r7, &r7h, r6);
-
- /* Stage 1 */
-
- od_butterfly_sub_asym(r7, r7h, r4);
- od_butterfly_add_asym(r5, r5h, r6);
- od_butterfly_sub_asym(r2, r2h, r1);
- od_butterfly_add_asym(r0, r0h, r3);
-
- /* Stage 0 */
-
- /* 23059/16384 = Sin[9*Pi/32] + Cos[9*Pi/32] = 1.4074037375263826 */
- /* 2271/16384 = Sin[9*Pi/32] - Cos[9*Pi/32] = 0.1386171691990915 */
- /* 5197/8192 = Cos[9*Pi/32] = 0.6343932841636455 */
- od_rotate_sub(r4, r3, 23059, 14, 2271, 14, 5197, 13, NONE);
-
- /* 22173/16384 = Sin[11*Pi/32] + Cos[11*Pi/32] = 1.3533180011743526 */
- /* 3363/8192 = Sin[11*Pi/32] - Cos[11*Pi/32] = 0.4105245275223574 */
- /* 15447/32768 = Cos[11*Pi/32] = 0.47139673682599764 */
- od_rotate_add(r2, r5, 22173, 14, 3363, 13, 15447, 15, NONE);
-
- /* 40869/32768 = Sin[13*Pi/32] + Cos[13*Pi/32] = 1.24722501298667123 */
- /* 21845/32768 = Sin[13*Pi/32] - Cos[13*Pi/32] = 0.66665565847774650 */
- /* 1189/4096 = Cos[13*Pi/32] = 0.29028467725446233 */
- od_rotate_sub(r6, r1, 40869, 15, 21845, 15, 1189, 12, NONE);
-
- /* 17911/16384 = Sin[15*Pi/32] + Cos[15*Pi/32] = 1.0932018670017576 */
- /* 14699/16384 = Sin[15*Pi/32] - Cos[15*Pi/32] = 0.8971675863426363 */
- /* 803/8192 = Cos[15*Pi/32] = 0.0980171403295606 */
- od_rotate_add(r0, r7, 17911, 14, 14699, 14, 803, 13, NONE);
-}
-
-/**
- * 8-point asymmetric Type-IV fDST
- */
-static INLINE void od_fdst_8_asym(od_coeff *r0, od_coeff r0h, od_coeff *r1,
- od_coeff *r2, od_coeff r2h, od_coeff *r3,
- od_coeff *r4, od_coeff r4h, od_coeff *r5,
- od_coeff *r6, od_coeff r6h, od_coeff *r7) {
- od_coeff r5h;
- od_coeff r7h;
-
- /* Stage 0 */
-
- /* 12665/16384 = (Sin[15*Pi/32] + Cos[15*Pi/32])/Sqrt[2] = 0.77301045336274 */
- /* 5197/4096 = (Sin[15*Pi/32] - Cos[15*Pi/32])*Sqrt[2] = 1.26878656832729 */
- /* 2271/16384 = Cos[15*Pi/32]*Sqrt[2] = 0.13861716919909 */
- od_rotate_add_half(r0, r7, r0h, 12665, 14, 5197, 12, 2271, 14, NONE);
-
- /* 28899/32768 = Sin[13*Pi/32] + Cos[13*Pi/32])/Sqrt[2] = 0.881921264348355 */
- /* 30893/32768 = Sin[13*Pi/32] - Cos[13*Pi/32])*Sqrt[2] = 0.942793473651995 */
- /* 3363/8192 = Cos[13*Pi/32]*Sqrt[2] = 0.410524527522357 */
- od_rotate_sub_half(r6, r1, r6h, 28899, 15, 30893, 15, 3363, 13, NONE);
-
- /* 31357/32768 = Sin[11*Pi/32] + Cos[11*Pi/32])/Sqrt[2] = 0.956940335732209 */
- /* 1189/2048 = Sin[11*Pi/32] - Cos[11*Pi/32])*Sqrt[2] = 0.580569354508925 */
- /* 21845/32768 = Cos[11*Pi/32]*Sqrt[2] = 0.666655658477747 */
- od_rotate_add_half(r2, r5, r2h, 31357, 15, 1189, 11, 21845, 15, NONE);
-
- /* 16305/16384 = (Sin[9*Pi/32] + Cos[9*Pi/32])/Sqrt[2] = 0.9951847266721969 */
- /* 803/4096 = (Sin[9*Pi/32] - Cos[9*Pi/32])*Sqrt[2] = 0.1960342806591213 */
- /* 14699/16384 = Cos[9*Pi/32]*Sqrt[2] = 0.8971675863426364 */
- od_rotate_sub_half(r4, r3, r4h, 16305, 14, 803, 12, 14699, 14, NONE);
-
- /* Stage 1 */
-
- od_butterfly_add(r0, &r0h, r3);
- od_butterfly_sub(r2, &r2h, r1);
- od_butterfly_add(r5, &r5h, r6);
- od_butterfly_sub(r7, &r7h, r4);
-
- /* Stage 2 */
-
- od_butterfly_add_asym(r7, r7h, r6);
- od_butterfly_add_asym(r5, r5h, r3);
- od_butterfly_add_asym(r2, r2h, r4);
- od_butterfly_sub_asym(r0, r0h, r1);
-
- /* Stage 3 */
-
- /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
- /* 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
- /* 3135/4096 = 2*Cos[3*Pi/8] = 0.7653668647301796 */
- od_rotate_sub_avg(r3, r4, 21407, 14, 8867, 14, 3135, 12, NONE);
-
- /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
- /* 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
- /* 3135/4096 = 2*Cos[3*Pi/8] = 0.7653668647301796 */
- od_rotate_neg_avg(r2, r5, 21407, 14, 8867, 14, 3135, 12);
-
- /* 46341/32768 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
- /* 46341/32768 = 2*Cos[Pi/4] = 1.4142135623730951 */
- od_rotate_pi4_sub_avg(r1, r6, 46341, 15, 46341, 15);
-}
-
-/**
- * 8-point asymmetric Type-IV iDST
- */
-static INLINE void od_idst_8_asym(od_coeff *r0, od_coeff *r4,
- od_coeff *r2, od_coeff *r6,
- od_coeff *r1, od_coeff *r5,
- od_coeff *r3, od_coeff *r7) {
- od_coeff r0h;
- od_coeff r2h;
- od_coeff r5h;
- od_coeff r7h;
-
- /* Stage 3 */
-
- /* 46341/32768 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
- /* 46341/32768 = 2*Cos[Pi/4] = 1.4142135623730951 */
- od_rotate_pi4_add_avg(r6, r1, 11585, 13, 11585, 13);
-
- /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
- /* 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
- /* 3135/4096 = 2*Cos[3*Pi/8] = 0.7653668647301796 */
- od_rotate_neg_avg(r5, r2, 21407, 14, 8867, 14, 3135, 12);
-
- /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
- /* 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
- /* 3135/4096 = 2*Cos[3*Pi/8] = 0.7653668647301796 */
- od_rotate_add_avg(r4, r3, 21407, 14, 8867, 14, 3135, 12, NONE);
-
- /* Stage 2 */
-
- od_butterfly_sub(r0, &r0h, r1);
- od_butterfly_add(r2, &r2h, r4);
- od_butterfly_add(r5, &r5h, r3);
- od_butterfly_add(r7, &r7h, r6);
-
- /* Stage 1 */
-
- od_butterfly_sub_asym(r7, r7h, r4);
- od_butterfly_add_asym(r5, r5h, r6);
- od_butterfly_sub_asym(r2, r2h, r1);
- od_butterfly_add_asym(r0, r0h, r3);
-
- /* Stage 0 */
-
- /* 16305/16384 = (Sin[9*Pi/32] + Cos[9*Pi/32])/Sqrt[2] = 0.9951847266721969 */
- /* 803/4096 = (Sin[9*Pi/32] - Cos[9*Pi/32])*Sqrt[2] = 0.1960342806591213 */
- /* 14699/16384 = Cos[9*Pi/32]*Sqrt[2] = 0.8971675863426364 */
- od_rotate_sub(r4, r3, 16305, 14, 803, 12, 14699, 14, SHIFT);
-
- /* 31357/32768 = Sin[11*Pi/32] + Cos[11*Pi/32])/Sqrt[2] = 0.956940335732209 */
- /* 1189/2048 = Sin[11*Pi/32] - Cos[11*Pi/32])*Sqrt[2] = 0.580569354508925 */
- /* 21845/32768 = Cos[11*Pi/32]*Sqrt[2] = 0.666655658477747 */
- od_rotate_add(r2, r5, 31357, 15, 1189, 11, 21845, 15, SHIFT);
-
- /* 28899/32768 = Sin[13*Pi/32] + Cos[13*Pi/32])/Sqrt[2] = 0.881921264348355 */
- /* 30893/32768 = Sin[13*Pi/32] - Cos[13*Pi/32])*Sqrt[2] = 0.942793473651995 */
- /* 3363/8192 = Cos[13*Pi/32]*Sqrt[2] = 0.410524527522357 */
- od_rotate_sub(r6, r1, 28899, 15, 30893, 15, 3363, 13, SHIFT);
-
- /* 12665/16384 = (Sin[15*Pi/32] + Cos[15*Pi/32])/Sqrt[2] = 0.77301045336274 */
- /* 5197/4096 = (Sin[15*Pi/32] - Cos[15*Pi/32])*Sqrt[2] = 1.26878656832729 */
- /* 2271/16384 = Cos[15*Pi/32]*Sqrt[2] = 0.13861716919909 */
- od_rotate_add(r0, r7, 12665, 14, 5197, 12, 2271, 14, SHIFT);
-}
-
-/* --- 16-point Transforms --- */
-
-/**
- * 16-point orthonormal Type-II fDCT
- */
-static INLINE void od_fdct_16(od_coeff *s0, od_coeff *s1,
- od_coeff *s2, od_coeff *s3,
- od_coeff *s4, od_coeff *s5,
- od_coeff *s6, od_coeff *s7,
- od_coeff *s8, od_coeff *s9,
- od_coeff *sa, od_coeff *sb,
- od_coeff *sc, od_coeff *sd,
- od_coeff *se, od_coeff *sf) {
- od_coeff s1h;
- od_coeff s3h;
- od_coeff s5h;
- od_coeff s7h;
- od_coeff s9h;
- od_coeff sbh;
- od_coeff sdh;
- od_coeff sfh;
-
- /* +/- Butterflies with asymmetric output. */
- od_butterfly_neg(s0, sf, &sfh);
- od_butterfly_add(s1, &s1h, se);
- od_butterfly_neg(s2, sd, &sdh);
- od_butterfly_add(s3, &s3h, sc);
- od_butterfly_neg(s4, sb, &sbh);
- od_butterfly_add(s5, &s5h, sa);
- od_butterfly_neg(s6, s9, &s9h);
- od_butterfly_add(s7, &s7h, s8);
-
- /* Embedded 8-point transforms with asymmetric input. */
- od_fdct_8_asym(s0, s1, s1h, s2, s3, s3h, s4, s5, s5h, s6, s7, s7h);
- od_fdst_8_asym(sf, sfh, se, sd, sdh, sc, sb, sbh, sa, s9, s9h, s8);
-}
-
-/**
- * 16-point orthonormal Type-II iDCT
- */
-static INLINE void od_idct_16(od_coeff *s0, od_coeff *s8,
- od_coeff *s4, od_coeff *sc,
- od_coeff *s2, od_coeff *sa,
- od_coeff *s6, od_coeff *se,
- od_coeff *s1, od_coeff *s9,
- od_coeff *s5, od_coeff *sd,
- od_coeff *s3, od_coeff *sb,
- od_coeff *s7, od_coeff *sf) {
- od_coeff s1h;
- od_coeff s3h;
- od_coeff s5h;
- od_coeff s7h;
-
- /* Embedded 8-point transforms with asymmetric output. */
- od_idst_8_asym(sf, sb, sd, s9, se, sa, sc, s8);
- od_idct_8_asym(s0, s4, s2, s6, s1, &s1h, s5, &s5h, s3, &s3h, s7, &s7h);
-
- /* +/- Butterflies with asymmetric input. */
- od_butterfly_add_asym(s7, s7h, s8);
- od_butterfly_neg_asym(s6, s9, od_rshift1(*s9));
- od_butterfly_add_asym(s5, s5h, sa);
- od_butterfly_neg_asym(s4, sb, od_rshift1(*sb));
- od_butterfly_add_asym(s3, s3h, sc);
- od_butterfly_neg_asym(s2, sd, od_rshift1(*sd));
- od_butterfly_add_asym(s1, s1h, se);
- od_butterfly_neg_asym(s0, sf, od_rshift1(*sf));
-}
-
-/**
- * 16-point asymmetric Type-II fDCT
- */
-static INLINE void od_fdct_16_asym(od_coeff *s0, od_coeff *s1, od_coeff s1h,
- od_coeff *s2, od_coeff *s3, od_coeff s3h,
- od_coeff *s4, od_coeff *s5, od_coeff s5h,
- od_coeff *s6, od_coeff *s7, od_coeff s7h,
- od_coeff *s8, od_coeff *s9, od_coeff s9h,
- od_coeff *sa, od_coeff *sb, od_coeff sbh,
- od_coeff *sc, od_coeff *sd, od_coeff sdh,
- od_coeff *se, od_coeff *sf, od_coeff sfh) {
-
- /* +/- Butterflies with asymmetric input. */
- od_butterfly_neg_asym(s0, sf, sfh);
- od_butterfly_sub_asym(s1, s1h, se);
- od_butterfly_neg_asym(s2, sd, sdh);
- od_butterfly_sub_asym(s3, s3h, sc);
- od_butterfly_neg_asym(s4, sb, sbh);
- od_butterfly_sub_asym(s5, s5h, sa);
- od_butterfly_neg_asym(s6, s9, s9h);
- od_butterfly_sub_asym(s7, s7h, s8);
-
- /* Embedded 8-point orthonormal transforms. */
- od_fdct_8(s0, s1, s2, s3, s4, s5, s6, s7);
- od_fdst_8(sf, se, sd, sc, sb, sa, s9, s8);
-}
-
-/**
- * 16-point asymmetric Type-II iDCT
- */
-static INLINE void od_idct_16_asym(od_coeff *s0, od_coeff *s8,
- od_coeff *s4, od_coeff *sc,
- od_coeff *s2, od_coeff *sa,
- od_coeff *s6, od_coeff *se,
- od_coeff *s1, od_coeff *s1h,
- od_coeff *s9, od_coeff *s9h,
- od_coeff *s5, od_coeff *s5h,
- od_coeff *sd, od_coeff *sdh,
- od_coeff *s3, od_coeff *s3h,
- od_coeff *sb, od_coeff *sbh,
- od_coeff *s7, od_coeff *s7h,
- od_coeff *sf, od_coeff *sfh) {
-
- /* Embedded 8-point orthonormal transforms. */
- od_idst_8(sf, sb, sd, s9, se, sa, sc, s8);
- od_idct_8(s0, s4, s2, s6, s1, s5, s3, s7);
-
- /* +/- Butterflies with asymmetric output. */
- od_butterfly_sub(s7, s7h, s8);
- od_butterfly_neg(s6, s9, s9h);
- od_butterfly_sub(s5, s5h, sa);
- od_butterfly_neg(s4, sb, sbh);
- od_butterfly_sub(s3, s3h, sc);
- od_butterfly_neg(s2, sd, sdh);
- od_butterfly_sub(s1, s1h, se);
- od_butterfly_neg(s0, sf, sfh);
-}
-
-/**
- * 16-point orthonormal Type-IV fDST
- */
-static INLINE void od_fdst_16(od_coeff *s0, od_coeff *s1,
- od_coeff *s2, od_coeff *s3,
- od_coeff *s4, od_coeff *s5,
- od_coeff *s6, od_coeff *s7,
- od_coeff *s8, od_coeff *s9,
- od_coeff *sa, od_coeff *sb,
- od_coeff *sc, od_coeff *sd,
- od_coeff *se, od_coeff *sf) {
- od_coeff s0h;
- od_coeff s2h;
- od_coeff sdh;
- od_coeff sfh;
-
- /* Stage 0 */
-
- /* 24279/32768 = (Sin[31*Pi/64] + Cos[31*Pi/64])/Sqrt[2] = 0.74095112535496 */
- /* 44011/32768 = (Sin[31*Pi/64] - Cos[31*Pi/64])*Sqrt[2] = 1.34311790969404 */
- /* 1137/16384 = Cos[31*Pi/64]*Sqrt[2] = 0.06939217050794 */
- od_rotate_add(s0, sf, 24279, 15, 44011, 15, 1137, 14, SHIFT);
-
- /* 1645/2048 = (Sin[29*Pi/64] + Cos[29*Pi/64])/Sqrt[2] = 0.8032075314806449 */
- /* 305/256 = (Sin[29*Pi/64] - Cos[29*Pi/64])*Sqrt[2] = 1.1913986089848667 */
- /* 425/2048 = Cos[29*Pi/64]*Sqrt[2] = 0.2075082269882116 */
- od_rotate_sub(se, s1, 1645, 11, 305, 8, 425, 11, SHIFT);
-
- /* 14053/32768 = (Sin[27*Pi/64] + Cos[27*Pi/64])/Sqrt[2] = 0.85772861000027 */
- /* 8423/8192 = (Sin[27*Pi/64] - Cos[27*Pi/64])*Sqrt[2] = 1.02820548838644 */
- /* 2815/8192 = Cos[27*Pi/64]*Sqrt[2] = 0.34362586580705 */
- od_rotate_add(s2, sd, 14053, 14, 8423, 13, 2815, 13, SHIFT);
-
- /* 14811/16384 = (Sin[25*Pi/64] + Cos[25*Pi/64])/Sqrt[2] = 0.90398929312344 */
- /* 7005/8192 = (Sin[25*Pi/64] - Cos[25*Pi/64])*Sqrt[2] = 0.85511018686056 */
- /* 3903/8192 = Cos[25*Pi/64]*Sqrt[2] = 0.47643419969316 */
- od_rotate_sub(sc, s3, 14811, 14, 7005, 13, 3903, 13, SHIFT);
-
- /* 30853/32768 = (Sin[23*Pi/64] + Cos[23*Pi/64])/Sqrt[2] = 0.94154406518302 */
- /* 11039/16384 = (Sin[23*Pi/64] - Cos[23*Pi/64])*Sqrt[2] = 0.67377970678444 */
- /* 19813/32768 = Cos[23*Pi/64]*Sqrt[2] = 0.60465421179080 */
- od_rotate_add(s4, sb, 30853, 15, 11039, 14, 19813, 15, SHIFT);
-
- /* 15893/16384 = (Sin[21*Pi/64] + Cos[21*Pi/64])/Sqrt[2] = 0.97003125319454 */
- /* 3981/8192 = (Sin[21*Pi/64] - Cos[21*Pi/64])*Sqrt[2] = 0.89716758634264 */
- /* 1489/2048 = Cos[21*Pi/64]*Sqrt[2] = 0.72705107329128 */
- od_rotate_sub(sa, s5, 15893, 14, 3981, 13, 1489, 11, SHIFT);
-
- /* 32413/32768 = (Sin[19*Pi/64] + Cos[19*Pi/64])/Sqrt[2] = 0.98917650996478 */
- /* 601/2048 = (Sin[19*Pi/64] - Cos[19*Pi/64])*Sqrt[2] = 0.29346094891072 */
- /* 27605/32768 = Cos[19*Pi/64]*Sqrt[2] = 0.84244603550942 */
- od_rotate_add(s6, s9, 32413, 15, 601, 11, 27605, 15, SHIFT);
-
- /* 32729/32768 = (Sin[17*Pi/64] + Cos[17*Pi/64])/Sqrt[2] = 0.99879545620517 */
- /* 201/2048 = (Sin[17*Pi/64] - Cos[17*Pi/64])*Sqrt[2] = 0.09813534865484 */
- /* 31121/32768 = Cos[17*Pi/64]*Sqrt[2] = 0.94972778187775 */
- od_rotate_sub(s8, s7, 32729, 15, 201, 11, 31121, 15, SHIFT);
-
- /* Stage 1 */
-
- od_butterfly_sub_asym(s0, od_rshift1(*s0), s7);
- od_butterfly_sub_asym(s8, od_rshift1(*s8), sf);
- od_butterfly_add_asym(s4, od_rshift1(*s4), s3);
- od_butterfly_add_asym(sc, od_rshift1(*sc), sb);
- od_butterfly_sub_asym(s2, od_rshift1(*s2), s5);
- od_butterfly_sub_asym(sa, od_rshift1(*sa), sd);
- od_butterfly_add_asym(s6, od_rshift1(*s6), s1);
- od_butterfly_add_asym(se, od_rshift1(*se), s9);
-
- /* Stage 2 */
-
- od_butterfly_add(s8, NULL, s4);
- od_butterfly_add(s7, NULL, sb);
- od_butterfly_sub(sa, NULL, s6);
- od_butterfly_sub(s5, NULL, s9);
- od_butterfly_add(s0, &s0h, s3);
- od_butterfly_add(sd, &sdh, se);
- od_butterfly_sub(s2, &s2h, s1);
- od_butterfly_sub(sf, &sfh, sc);
-
- /* Stage 3 */
-
- /* 9633/8192 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586 */
- /* 12873/16384 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 */
- /* 12785/32768 = 2*Cos[7*Pi/16] = 0.3901806440322565 */
- od_rotate_add_avg(s8, s7, 9633, 13, 12873, 14, 12785, 15, NONE);
-
- /* 45451/32768 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475 */
- /* 9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431 */
- /* 18205/32768 = Cos[5*Pi/16] = 0.5555702330196022 */
- od_rotate_add(s9, s6, 45451, 15, 9041, 15, 18205, 15, NONE);
-
- /* 22725/16384 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475 */
- /* 9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431 */
- /* 18205/32768 = 2*Cos[5*Pi/16] = 1.1111404660392044 */
- od_rotate_neg_avg(s5, sa, 22725, 14, 9041, 15, 18205, 14);
-
- /* 38531/32768 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586 */
- /* 12873/16384 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 */
- /* 6393/32768 = Cos[7*Pi/16] = 0.1950903220161283 */
- od_rotate_neg(s4, sb, 38531, 15, 12873, 14, 6393, 15);
-
- /* Stage 4 */
-
- od_butterfly_add_asym(s2, s2h, sc);
- od_butterfly_sub_asym(s0, s0h, s1);
- od_butterfly_add_asym(sf, sfh, se);
- od_butterfly_add_asym(sd, sdh, s3);
- od_butterfly_add_asym(s7, od_rshift1(*s7), s6);
- od_butterfly_sub_asym(s8, od_rshift1(*s8), s9);
- od_butterfly_sub_asym(sa, od_rshift1(*sa), sb);
- od_butterfly_add_asym(s5, od_rshift1(*s5), s4);
-
- /* Stage 5 */
-
- /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
- /* 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
- /* 3135/4096 = 2*Cos[7*Pi/8] = 0.7653668647301796 */
- od_rotate_add_avg(sc, s3, 21407, 14, 8867, 14, 3135, 12, NONE);
-
- /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3870398453221475 */
- /* 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
- /* 3135/4096 = 2*Cos[3*Pi/8] = 0.7653668647301796 */
- od_rotate_neg_avg(s2, sd, 21407, 14, 8867, 14, 3135, 12);
-
- /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
- /* 11585/8192 = 2*Cos[Pi/4] = 1.4142135623730951 */
- od_rotate_pi4_add_avg(sa, s5, 11585, 13, 11585, 13);
-
- /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
- /* 11585/8192 = 2*Cos[Pi/4] = 1.4142135623730951 */
- od_rotate_pi4_add_avg(s6, s9, 11585, 13, 11585, 13);
-
- /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
- /* 11585/8192 = 2*Cos[Pi/4] = 1.4142135623730951 */
- od_rotate_pi4_add_avg(se, s1, 11585, 13, 11585, 13);
-}
-
-/**
- * 16-point orthonormal Type-IV iDST
- */
-static INLINE void od_idst_16(od_coeff *s0, od_coeff *s8,
- od_coeff *s4, od_coeff *sc,
- od_coeff *s2, od_coeff *sa,
- od_coeff *s6, od_coeff *se,
- od_coeff *s1, od_coeff *s9,
- od_coeff *s5, od_coeff *sd,
- od_coeff *s3, od_coeff *sb,
- od_coeff *s7, od_coeff *sf) {
- od_coeff s0h;
- od_coeff s2h;
- od_coeff s4h;
- od_coeff s6h;
- od_coeff s8h;
- od_coeff sah;
- od_coeff sch;
- od_coeff sdh;
- od_coeff seh;
- od_coeff sfh;
-
- /* Stage 5 */
-
- /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
- /* 11585/8192 = 2*Cos[Pi/4] = 1.4142135623730951 */
- od_rotate_pi4_add_avg(s6, s9, 11585, 13, 11585, 13);
-
- /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
- /* 11585/8192 = 2*Cos[Pi/4] = 1.4142135623730951 */
- od_rotate_pi4_add_avg(sa, s5, 11585, 13, 11585, 13);
-
- /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
- /* 11585/8192 = 2*Cos[Pi/4] = 1.4142135623730951 */
- od_rotate_pi4_add_avg(se, s1, 11585, 13, 11585, 13);
-
- /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
- /* 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
- /* 3135/4096 = 2*Cos[7*Pi/8] = 0.7653668647301796 */
- od_rotate_add_avg(sc, s3, 21407, 14, 8867, 14, 3135, 12, NONE);
-
- /* 21407/16384 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3870398453221475 */
- /* 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
- /* 3135/4096 = 2*Cos[3*Pi/8] = 0.7653668647301796 */
- od_rotate_neg_avg(sd, s2, 21407, 14, 8867, 14, 3135, 12);
-
- /* Stage 4 */
-
- od_butterfly_add(s5, NULL, s4);
- od_butterfly_sub(sa, NULL, sb);
- od_butterfly_sub(s8, NULL, s9);
- od_butterfly_add(s7, NULL, s6);
- od_butterfly_add(sd, &sdh, s3);
- od_butterfly_add(sf, &sfh, se);
- od_butterfly_sub(s0, &s0h, s1);
- od_butterfly_add(s2, &s2h, sc);
-
- /* Stage 3 */
-
- /* 9633/8192 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586 */
- /* 12873/16384 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 */
- /* 12785/32768 = 2*Cos[7*Pi/16] = 0.3901806440322565 */
- od_rotate_add_avg(s8, s7, 9633, 13, 12873, 14, 12785, 15, NONE);
-
- /* 45451/32768 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475 */
- /* 9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431 */
- /* 18205/32768 = Cos[5*Pi/16] = 0.5555702330196022 */
- od_rotate_add(s9, s6, 45451, 15, 9041, 15, 18205, 15, NONE);
-
- /* 22725/16384 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475 */
- /* 9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431 */
- /* 18205/32768 = 2*Cos[5*Pi/16] = 1.1111404660392044 */
- od_rotate_neg_avg(sa, s5, 22725, 14, 9041, 15, 18205, 14);
-
- /* 38531/32768 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586 */
- /* 12873/16384 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 */
- /* 6393/32768 = Cos[7*Pi/16] = 0.1950903220161283 */
- od_rotate_neg(sb, s4, 38531, 15, 12873, 14, 6393, 15);
-
- /* Stage 2 */
-
- od_butterfly_add_asym(s8, od_rshift1(*s8), s4);
- od_butterfly_add_asym(s7, od_rshift1(*s7), sb);
- od_butterfly_sub_asym(sa, od_rshift1(*sa), s6);
- od_butterfly_sub_asym(s5, od_rshift1(*s5), s9);
- od_butterfly_add_asym(s0, s0h, s3);
- od_butterfly_add_asym(sd, sdh, se);
- od_butterfly_sub_asym(s2, s2h, s1);
- od_butterfly_sub_asym(sf, sfh, sc);
-
- /* Stage 1 */
-
- od_butterfly_sub(s0, &s0h, s7);
- od_butterfly_sub(s8, &s8h, sf);
- od_butterfly_add(s4, &s4h, s3);
- od_butterfly_add(sc, &sch, sb);
- od_butterfly_sub(s2, &s2h, s5);
- od_butterfly_sub(sa, &sah, sd);
- od_butterfly_add(s6, &s6h, s1);
- od_butterfly_add(se, &seh, s9);
-
- /* Stage 0 */
-
- /* 32729/32768 = (Sin[17*Pi/64] + Cos[17*Pi/64])/Sqrt[2] = 0.99879545620517 */
- /* 201/2048 = (Sin[17*Pi/64] - Cos[17*Pi/64])*Sqrt[2] = 0.09813534865484 */
- /* 31121/32768 = Cos[17*Pi/64]*Sqrt[2] = 0.94972778187775 */
- od_rotate_sub_half(s8, s7, s8h, 32729, 15, 201, 11, 31121, 15, NONE);
-
- /* 32413/32768 = (Sin[19*Pi/64] + Cos[19*Pi/64])/Sqrt[2] = 0.98917650996478 */
- /* 601/2048 = (Sin[19*Pi/64] - Cos[19*Pi/64])*Sqrt[2] = 0.29346094891072 */
- /* 27605/32768 = Cos[19*Pi/64]*Sqrt[2] = 0.84244603550942 */
- od_rotate_add_half(s6, s9, s6h, 32413, 15, 601, 11, 27605, 15, NONE);
-
- /* 15893/16384 = (Sin[21*Pi/64] + Cos[21*Pi/64])/Sqrt[2] = 0.97003125319454 */
- /* 3981/8192 = (Sin[21*Pi/64] - Cos[21*Pi/64])*Sqrt[2] = 0.89716758634264 */
- /* 1489/2048 = Cos[21*Pi/64]*Sqrt[2] = 0.72705107329128 */
- od_rotate_sub_half(sa, s5, sah, 15893, 14, 3981, 13, 1489, 11, NONE);
-
- /* 30853/32768 = (Sin[23*Pi/64] + Cos[23*Pi/64])/Sqrt[2] = 0.94154406518302 */
- /* 11039/16384 = (Sin[23*Pi/64] - Cos[23*Pi/64])*Sqrt[2] = 0.67377970678444 */
- /* 19813/32768 = Cos[23*Pi/64]*Sqrt[2] = 0.60465421179080 */
- od_rotate_add_half(s4, sb, s4h, 30853, 15, 11039, 14, 19813, 15, NONE);
-
- /* 14811/16384 = (Sin[25*Pi/64] + Cos[25*Pi/64])/Sqrt[2] = 0.90398929312344 */
- /* 7005/8192 = (Sin[25*Pi/64] - Cos[25*Pi/64])*Sqrt[2] = 0.85511018686056 */
- /* 3903/8192 = Cos[25*Pi/64]*Sqrt[2] = 0.47643419969316 */
- od_rotate_sub_half(sc, s3, sch, 14811, 14, 7005, 13, 3903, 13, NONE);
-
- /* 14053/32768 = (Sin[27*Pi/64] + Cos[27*Pi/64])/Sqrt[2] = 0.85772861000027 */
- /* 8423/8192 = (Sin[27*Pi/64] - Cos[27*Pi/64])*Sqrt[2] = 1.02820548838644 */
- /* 2815/8192 = Cos[27*Pi/64]*Sqrt[2] = 0.34362586580705 */
- od_rotate_add_half(s2, sd, s2h, 14053, 14, 8423, 13, 2815, 13, NONE);
-
- /* 1645/2048 = (Sin[29*Pi/64] + Cos[29*Pi/64])/Sqrt[2] = 0.8032075314806449 */
- /* 305/256 = (Sin[29*Pi/64] - Cos[29*Pi/64])*Sqrt[2] = 1.1913986089848667 */
- /* 425/2048 = Cos[29*Pi/64]*Sqrt[2] = 0.2075082269882116 */
- od_rotate_sub_half(se, s1, seh, 1645, 11, 305, 8, 425, 11, NONE);
-
- /* 24279/32768 = (Sin[31*Pi/64] + Cos[31*Pi/64])/Sqrt[2] = 0.74095112535496 */
- /* 44011/32768 = (Sin[31*Pi/64] - Cos[31*Pi/64])*Sqrt[2] = 1.34311790969404 */
- /* 1137/16384 = Cos[31*Pi/64]*Sqrt[2] = 0.06939217050794 */
- od_rotate_add_half(s0, sf, s0h, 24279, 15, 44011, 15, 1137, 14, NONE);
-}
-
-/**
- * 16-point asymmetric Type-IV fDST
- */
-static INLINE void od_fdst_16_asym(od_coeff *s0, od_coeff s0h, od_coeff *s1,
- od_coeff *s2, od_coeff s2h, od_coeff *s3,
- od_coeff *s4, od_coeff s4h, od_coeff *s5,
- od_coeff *s6, od_coeff s6h, od_coeff *s7,
- od_coeff *s8, od_coeff s8h, od_coeff *s9,
- od_coeff *sa, od_coeff sah, od_coeff *sb,
- od_coeff *sc, od_coeff sch, od_coeff *sd,
- od_coeff *se, od_coeff seh, od_coeff *sf) {
- od_coeff sdh;
- od_coeff sfh;
-
- /* Stage 0 */
-
- /* 1073/2048 = (Sin[31*Pi/64] + Cos[31*Pi/64])/2 = 0.5239315652662953 */
- /* 62241/32768 = (Sin[31*Pi/64] - Cos[31*Pi/64])*2 = 1.8994555637555088 */
- /* 201/16384 = Cos[31*Pi/64]*2 = 0.0981353486548360 */
- od_rotate_add_half(s0, sf, s0h, 1073, 11, 62241, 15, 201, 11, SHIFT);
-
- /* 18611/32768 = (Sin[29*Pi/64] + Cos[29*Pi/64])/2 = 0.5679534922100714 */
- /* 55211/32768 = (Sin[29*Pi/64] - Cos[29*Pi/64])*2 = 1.6848920710188384 */
- /* 601/2048 = Cos[29*Pi/64]*2 = 0.2934609489107235 */
- od_rotate_sub_half(se, s1, seh, 18611, 15, 55211, 15, 601, 11, SHIFT);
-
- /* 9937/16384 = (Sin[27*Pi/64] + Cos[27*Pi/64])/2 = 0.6065057165489039 */
- /* 1489/1024 = (Sin[27*Pi/64] - Cos[27*Pi/64])*2 = 1.4541021465825602 */
- /* 3981/8192 = Cos[27*Pi/64]*2 = 0.4859603598065277 */
- od_rotate_add_half(s2, sd, s2h, 9937, 14, 1489, 10, 3981, 13, SHIFT);
-
- /* 10473/16384 = (Sin[25*Pi/64] + Cos[25*Pi/64])/2 = 0.6392169592876205 */
- /* 39627/32768 = (Sin[25*Pi/64] - Cos[25*Pi/64])*2 = 1.2093084235816014 */
- /* 11039/16384 = Cos[25*Pi/64]*2 = 0.6737797067844401 */
- od_rotate_sub_half(sc, s3, sch, 10473, 14, 39627, 15, 11039, 14, SHIFT);
-
- /* 2727/4096 = (Sin[23*Pi/64] + Cos[23*Pi/64])/2 = 0.6657721932768628 */
- /* 3903/4096 = (Sin[23*Pi/64] - Cos[23*Pi/64])*2 = 0.9528683993863225 */
- /* 7005/8192 = Cos[23*Pi/64]*2 = 0.8551101868605642 */
- od_rotate_add_half(s4, sb, s4h, 2727, 12, 3903, 12, 7005, 13, SHIFT);
-
- /* 5619/8192 = (Sin[21*Pi/64] + Cos[21*Pi/64])/2 = 0.6859156770967569 */
- /* 2815/4096 = (Sin[21*Pi/64] - Cos[21*Pi/64])*2 = 0.6872517316141069 */
- /* 8423/8192 = Cos[21*Pi/64]*2 = 1.0282054883864433 */
- od_rotate_sub_half(sa, s5, sah, 5619, 13, 2815, 12, 8423, 13, SHIFT);
-
- /* 2865/4096 = (Sin[19*Pi/64] + Cos[19*Pi/64])/2 = 0.6994534179865391 */
- /* 13588/32768 = (Sin[19*Pi/64] - Cos[19*Pi/64])*2 = 0.4150164539764232 */
- /* 305/256 = Cos[19*Pi/64]*2 = 1.1913986089848667 */
- od_rotate_add_half(s6, s9, s6h, 2865, 12, 13599, 15, 305, 8, SHIFT);
-
- /* 23143/32768 = (Sin[17*Pi/64] + Cos[17*Pi/64])/2 = 0.7062550401009887 */
- /* 1137/8192 = (Sin[17*Pi/64] - Cos[17*Pi/64])*2 = 0.1387843410158816 */
- /* 44011/32768 = Cos[17*Pi/64]*2 = 1.3431179096940367 */
- od_rotate_sub_half(s8, s7, s8h, 23143, 15, 1137, 13, 44011, 15, SHIFT);
-
- /* Stage 1 */
-
- od_butterfly_sub_asym(s0, od_rshift1(*s0), s7);
- od_butterfly_sub_asym(s8, od_rshift1(*s8), sf);
- od_butterfly_add_asym(s4, od_rshift1(*s4), s3);
- od_butterfly_add_asym(sc, od_rshift1(*sc), sb);
- od_butterfly_sub_asym(s2, od_rshift1(*s2), s5);
- od_butterfly_sub_asym(sa, od_rshift1(*sa), sd);
- od_butterfly_add_asym(s6, od_rshift1(*s6), s1);
- od_butterfly_add_asym(se, od_rshift1(*se), s9);
-
- /* Stage 2 */
-
- od_butterfly_add(s8, NULL, s4);
- od_butterfly_add(s7, NULL, sb);
- od_butterfly_sub(sa, NULL, s6);
- od_butterfly_sub(s5, NULL, s9);
- od_butterfly_add(s0, &s0h, s3);
- od_butterfly_add(sd, &sdh, se);
- od_butterfly_sub(s2, &s2h, s1);
- od_butterfly_sub(sf, &sfh, sc);
-
- /* Stage 3 */
-
- /* 9633/8192 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586 */
- /* 12873/16384 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 */
- /* 6393/32768 = Cos[7*Pi/16] = 0.1950903220161283 */
- od_rotate_add(s8, s7, 9633, 13, 12873, 14, 6393, 15, NONE);
-
- /* 45451/32768 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475 */
- /* 9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431 */
- /* 18205/32768 = Cos[5*Pi/16] = 0.5555702330196022 */
- od_rotate_add(s9, s6, 45451, 15, 9041, 15, 18205, 15, NONE);
-
- /* 11363/8192 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475 */
- /* 9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431 */
- /* 4551/8192 = Cos[5*Pi/16] = 0.5555702330196022 */
- od_rotate_neg(s5, sa, 11363, 13, 9041, 15, 4551, 13);
-
- /* 9633/32768 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586 */
- /* 12873/16384 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 */
- /* 6393/32768 = Cos[7*Pi/16] = 0.1950903220161283 */
- od_rotate_neg(s4, sb, 9633, 13, 12873, 14, 6393, 15);
-
- /* Stage 4 */
-
- od_butterfly_add_asym(s2, s2h, sc);
- od_butterfly_sub_asym(s0, s0h, s1);
- od_butterfly_add_asym(sf, sfh, se);
- od_butterfly_add_asym(sd, sdh, s3);
- od_butterfly_add_asym(s7, od_rshift1(*s7), s6);
- od_butterfly_sub_asym(s8, od_rshift1(*s8), s9);
- od_butterfly_sub_asym(sa, od_rshift1(*sa), sb);
- od_butterfly_add_asym(s5, od_rshift1(*s5), s4);
-
- /* Stage 5 */
-
- /* 10703/8192 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
- /* 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
- /* 3135/8192 = Cos[7*Pi/8] = 0.3826834323650898 */
- od_rotate_add(sc, s3, 10703, 13, 8867, 14, 3135, 13, NONE);
-
- /* 10703/8192 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3870398453221475 */
- /* 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
- /* 3135/8192 = Cos[3*Pi/8] = 0.3826834323650898 */
- od_rotate_neg(s2, sd, 10703, 13, 8867, 14, 3135, 13);
-
- /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
- /* 11585/16384 = Cos[Pi/4] = 0.7071067811865475 */
- od_rotate_pi4_add(sa, s5, 11585, 13, 11585, 14);
-
- /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
- /* 11585/16384 = Cos[Pi/4] = 0.7071067811865475 */
- od_rotate_pi4_add(s6, s9, 11585, 13, 11585, 14);
-
- /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
- /* 11585/16384 = Cos[Pi/4] = 0.7071067811865475 */
- od_rotate_pi4_add(se, s1, 11585, 13, 11585, 14);
-}
-
-/**
- * 16-point asymmetric Type-IV iDST
- */
-static INLINE void od_idst_16_asym(od_coeff *s0, od_coeff *s8,
- od_coeff *s4, od_coeff *sc,
- od_coeff *s2, od_coeff *sa,
- od_coeff *s6, od_coeff *se,
- od_coeff *s1, od_coeff *s9,
- od_coeff *s5, od_coeff *sd,
- od_coeff *s3, od_coeff *sb,
- od_coeff *s7, od_coeff *sf) {
- od_coeff s0h;
- od_coeff s2h;
- od_coeff s4h;
- od_coeff s6h;
- od_coeff s8h;
- od_coeff sah;
- od_coeff sch;
- od_coeff sdh;
- od_coeff seh;
- od_coeff sfh;
-
- /* Stage 5 */
-
- /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
- /* 11585/16384 = Cos[Pi/4] = 0.7071067811865475 */
- od_rotate_pi4_add(s6, s9, 11585, 13, 11585, 14);
-
- /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
- /* 11585/16384 = 2*Cos[Pi/4] = 0.7071067811865475 */
- od_rotate_pi4_add(sa, s5, 11585, 13, 11585, 14);
-
- /* 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 */
- /* 11585/16384 = 2*Cos[Pi/4] = 0.7071067811865475 */
- od_rotate_pi4_add(se, s1, 11585, 13, 11585, 14);
-
- /* 10703/8192 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 */
- /* 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
- /* 3135/8192 = Cos[7*Pi/8] = 0.7653668647301796 */
- od_rotate_add(sc, s3, 10703, 13, 8867, 14, 3135, 13, NONE);
-
- /* 10703/8192 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3870398453221475 */
- /* 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 */
- /* 3135/8192 = Cos[3*Pi/8] = 0.7653668647301796 */
- od_rotate_neg(sd, s2, 10703, 13, 8867, 14, 3135, 13);
-
- /* Stage 4 */
-
- od_butterfly_add(s5, NULL, s4);
- od_butterfly_sub(sa, NULL, sb);
- od_butterfly_sub(s8, NULL, s9);
- od_butterfly_add(s7, NULL, s6);
- od_butterfly_add(sd, &sdh, s3);
- od_butterfly_add(sf, &sfh, se);
- od_butterfly_sub(s0, &s0h, s1);
- od_butterfly_add(s2, &s2h, sc);
-
- /* Stage 3 */
-
- /* 9633/8192 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586 */
- /* 12873/16384 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 */
- /* 6393/32768 = Cos[7*Pi/16] = 0.1950903220161283 */
- od_rotate_neg(sb, s4, 9633, 13, 12873, 14, 6393, 15);
-
- /* 11363/8192 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475 */
- /* 9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431 */
- /* 4551/8192 = Cos[5*Pi/16] = 0.5555702330196022 */
- od_rotate_neg(sa, s5, 11363, 13, 9041, 15, 4551, 13);
-
- /* 22725/16384 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475 */
- /* 9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431 */
- /* 18205/32768 = Cos[5*Pi/16] = 0.5555702330196022 */
- od_rotate_add(s9, s6, 22725, 14, 9041, 15, 18205, 15, NONE);
-
- /* 9633/8192 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586 */
- /* 12873/16384 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 */
- /* 6393/32768 = Cos[7*Pi/16] = 0.1950903220161283 */
- od_rotate_add(s8, s7, 9633, 13, 12873, 14, 6393, 15, NONE);
-
- /* Stage 2 */
-
- od_butterfly_add_asym(s8, od_rshift1(*s8), s4);
- od_butterfly_add_asym(s7, od_rshift1(*s7), sb);
- od_butterfly_sub_asym(sa, od_rshift1(*sa), s6);
- od_butterfly_sub_asym(s5, od_rshift1(*s5), s9);
- od_butterfly_add_asym(s0, s0h, s3);
- od_butterfly_add_asym(sd, sdh, se);
- od_butterfly_sub_asym(s2, s2h, s1);
- od_butterfly_sub_asym(sf, sfh, sc);
-
- /* Stage 1 */
-
- od_butterfly_sub(s0, &s0h, s7);
- od_butterfly_sub(s8, &s8h, sf);
- od_butterfly_add(s4, &s4h, s3);
- od_butterfly_add(sc, &sch, sb);
- od_butterfly_sub(s2, &s2h, s5);
- od_butterfly_sub(sa, &sah, sd);
- od_butterfly_add(s6, &s6h, s1);
- od_butterfly_add(se, &seh, s9);
-
- /* Stage 0 */
-
- /* 23143/32768 = (Sin[17*Pi/64] + Cos[17*Pi/64])/2 = 0.7062550401009887 */
- /* 1137/8192 = (Sin[17*Pi/64] - Cos[17*Pi/64])*2 = 0.1387843410158816 */
- /* 44011/32768 = Cos[17*Pi/64]*2 = 1.3431179096940367 */
- od_rotate_sub_half(s8, s7, s8h, 23143, 15, 1137, 13, 44011, 15, SHIFT);
-
- /* 2865/4096 = (Sin[19*Pi/64] + Cos[19*Pi/64])/2 = 0.6994534179865391 */
- /* 13599/32768 = (Sin[19*Pi/64] - Cos[19*Pi/64])*2 = 0.4150164539764232 */
- /* 305/256 = Cos[19*Pi/64]*2 = 1.1913986089848667 */
- od_rotate_add_half(s6, s9, s6h, 2865, 12, 13599, 15, 305, 8, SHIFT);
-
- /* 5619/8192 = (Sin[21*Pi/64] + Cos[21*Pi/64])/2 = 0.6859156770967569 */
- /* 2815/4096 = (Sin[21*Pi/64] - Cos[21*Pi/64])*2 = 0.6872517316141069 */
- /* 8423/8192 = Cos[21*Pi/64]*2 = 1.0282054883864433 */
- od_rotate_sub_half(sa, s5, sah, 5619, 13, 2815, 12, 8423, 13, SHIFT);
-
- /* 2727/4096 = (Sin[23*Pi/64] + Cos[23*Pi/64])/2 = 0.6657721932768628 */
- /* 3903/4096 = (Sin[23*Pi/64] - Cos[23*Pi/64])*2 = 0.9528683993863225 */
- /* 7005/8192 = Cos[23*Pi/64]*2 = 0.8551101868605642 */
- od_rotate_add_half(s4, sb, s4h, 2727, 12, 3903, 12, 7005, 13, SHIFT);
-
- /* 10473/16384 = (Sin[25*Pi/64] + Cos[25*Pi/64])/2 = 0.6392169592876205 */
- /* 39627/32768 = (Sin[25*Pi/64] - Cos[25*Pi/64])*2 = 1.2093084235816014 */
- /* 11039/16384 = Cos[25*Pi/64]*2 = 0.6737797067844401 */
- od_rotate_sub_half(sc, s3, sch, 10473, 14, 39627, 15, 11039, 14, SHIFT);
-
- /* 9937/16384 = (Sin[27*Pi/64] + Cos[27*Pi/64])/2 = 0.6065057165489039 */
- /* 1489/1024 = (Sin[27*Pi/64] - Cos[27*Pi/64])*2 = 1.4541021465825602 */
- /* 3981/8192 = Cos[27*Pi/64]*2 = 0.4859603598065277 */
- od_rotate_add_half(s2, sd, s2h, 9937, 14, 1489, 10, 3981, 13, SHIFT);
-
- /* 18611/32768 = (Sin[29*Pi/64] + Cos[29*Pi/64])/2 = 0.5679534922100714 */
- /* 55211/32768 = (Sin[29*Pi/64] - Cos[29*Pi/64])*2 = 1.6848920710188384 */
- /* 601/2048 = Cos[29*Pi/64]*2 = 0.2934609489107235 */
- od_rotate_sub_half(se, s1, seh, 18611, 15, 55211, 15, 601, 11, SHIFT);
-
- /* 1073/2048 = (Sin[31*Pi/64] + Cos[31*Pi/64])/2 = 0.5239315652662953 */
- /* 62241/32768 = (Sin[31*Pi/64] - Cos[31*Pi/64])*2 = 1.8994555637555088 */
- /* 201/2048 = Cos[31*Pi/64]*2 = 0.0981353486548360 */
- od_rotate_add_half(s0, sf, s0h, 1073, 11, 62241, 15, 201, 11, SHIFT);
-}
-
-/* --- 32-point Transforms --- */
-
-/**
- * 32-point orthonormal Type-II fDCT
- */
-static INLINE void od_fdct_32(od_coeff *t0, od_coeff *t1,
- od_coeff *t2, od_coeff *t3,
- od_coeff *t4, od_coeff *t5,
- od_coeff *t6, od_coeff *t7,
- od_coeff *t8, od_coeff *t9,
- od_coeff *ta, od_coeff *tb,
- od_coeff *tc, od_coeff *td,
- od_coeff *te, od_coeff *tf,
- od_coeff *tg, od_coeff *th,
- od_coeff *ti, od_coeff *tj,
- od_coeff *tk, od_coeff *tl,
- od_coeff *tm, od_coeff *tn,
- od_coeff *to, od_coeff *tp,
- od_coeff *tq, od_coeff *tr,
- od_coeff *ts, od_coeff *tt,
- od_coeff *tu, od_coeff *tv) {
- od_coeff t1h;
- od_coeff t3h;
- od_coeff t5h;
- od_coeff t7h;
- od_coeff t9h;
- od_coeff tbh;
- od_coeff tdh;
- od_coeff tfh;
- od_coeff thh;
- od_coeff tjh;
- od_coeff tlh;
- od_coeff tnh;
- od_coeff tph;
- od_coeff trh;
- od_coeff tth;
- od_coeff tvh;
-
- /* +/- Butterflies with asymmetric output. */
- od_butterfly_neg(t0, tv, &tvh);
- od_butterfly_add(t1, &t1h, tu);
- od_butterfly_neg(t2, tt, &tth);
- od_butterfly_add(t3, &t3h, ts);
- od_butterfly_neg(t4, tr, &trh);
- od_butterfly_add(t5, &t5h, tq);
- od_butterfly_neg(t6, tp, &tph);
- od_butterfly_add(t7, &t7h, to);
- od_butterfly_neg(t8, tn, &tnh);
- od_butterfly_add(t9, &t9h, tm);
- od_butterfly_neg(ta, tl, &tlh);
- od_butterfly_add(tb, &tbh, tk);
- od_butterfly_neg(tc, tj, &tjh);
- od_butterfly_add(td, &tdh, ti);
- od_butterfly_neg(te, th, &thh);
- od_butterfly_add(tf, &tfh, tg);
-
- /* Embedded 16-point transforms with asymmetric input. */
- od_fdct_16_asym(
- t0, t1, t1h, t2, t3, t3h, t4, t5, t5h, t6, t7, t7h,
- t8, t9, t9h, ta, tb, tbh, tc, td, tdh, te, tf, tfh);
- od_fdst_16_asym(
- tv, tvh, tu, tt, tth, ts, tr, trh, tq, tp, tph, to,
- tn, tnh, tm, tl, tlh, tk, tj, tjh, ti, th, thh, tg);
-}
-
-/**
- * 32-point orthonormal Type-II iDCT
- */
-static INLINE void od_idct_32(od_coeff *t0, od_coeff *tg,
- od_coeff *t8, od_coeff *to,
- od_coeff *t4, od_coeff *tk,
- od_coeff *tc, od_coeff *ts,
- od_coeff *t2, od_coeff *ti,
- od_coeff *ta, od_coeff *tq,
- od_coeff *t6, od_coeff *tm,
- od_coeff *te, od_coeff *tu,
- od_coeff *t1, od_coeff *th,
- od_coeff *t9, od_coeff *tp,
- od_coeff *t5, od_coeff *tl,
- od_coeff *td, od_coeff *tt,
- od_coeff *t3, od_coeff *tj,
- od_coeff *tb, od_coeff *tr,
- od_coeff *t7, od_coeff *tn,
- od_coeff *tf, od_coeff *tv) {
- od_coeff t1h;
- od_coeff t3h;
- od_coeff t5h;
- od_coeff t7h;
- od_coeff t9h;
- od_coeff tbh;
- od_coeff tdh;
- od_coeff tfh;
-
- /* Embedded 16-point transforms with asymmetric output. */
- od_idst_16_asym(
- tv, tn, tr, tj, tt, tl, tp, th, tu, tm, tq, ti, ts, tk, to, tg);
- od_idct_16_asym(
- t0, t8, t4, tc, t2, ta, t6, te,
- t1, &t1h, t9, &t9h, t5, &t5h, td, &tdh,
- t3, &t3h, tb, &tbh, t7, &t7h, tf, &tfh);
-
- /* +/- Butterflies with asymmetric input. */
- od_butterfly_add_asym(tf, tfh, tg);
- od_butterfly_neg_asym(te, th, od_rshift1(*th));
- od_butterfly_add_asym(td, tdh, ti);
- od_butterfly_neg_asym(tc, tj, od_rshift1(*tj));
- od_butterfly_add_asym(tb, tbh, tk);
- od_butterfly_neg_asym(ta, tl, od_rshift1(*tl));
- od_butterfly_add_asym(t9, t9h, tm);
- od_butterfly_neg_asym(t8, tn, od_rshift1(*tn));
- od_butterfly_add_asym(t7, t7h, to);
- od_butterfly_neg_asym(t6, tp, od_rshift1(*tp));
- od_butterfly_add_asym(t5, t5h, tq);
- od_butterfly_neg_asym(t4, tr, od_rshift1(*tr));
- od_butterfly_add_asym(t3, t3h, ts);
- od_butterfly_neg_asym(t2, tt, od_rshift1(*tt));
- od_butterfly_add_asym(t1, t1h, tu);
- od_butterfly_neg_asym(t0, tv, od_rshift1(*tv));
-}
-
-#endif
diff --git a/av1/common/idct.c b/av1/common/idct.c
index f82b0e9..9db0333 100644
--- a/av1/common/idct.c
+++ b/av1/common/idct.c
@@ -19,22 +19,11 @@
#include "av1/common/blockd.h"
#include "av1/common/enums.h"
#include "av1/common/idct.h"
-#if CONFIG_DAALA_TX4 || CONFIG_DAALA_TX8 || CONFIG_DAALA_TX16 || \
- CONFIG_DAALA_TX32 || CONFIG_DAALA_TX64
-#include "av1/common/daala_tx.h"
-#if CONFIG_DAALA_TX
-#include "av1/common/daala_inv_txfm.h"
-#endif
-#endif
-#if !CONFIG_DAALA_TX
int av1_get_tx_scale(const TX_SIZE tx_size) {
const int pels = tx_size_2d[tx_size];
return (pels > 256) + (pels > 1024) + (pels > 4096);
}
-#endif
-
-#if !CONFIG_DAALA_TX
// NOTE: The implementation of all inverses need to be aware of the fact
// that input and output could be the same buffer.
@@ -63,13 +52,13 @@
}
}
-#if CONFIG_TX64X64 && (!CONFIG_DAALA_TX32 || !CONFIG_DAALA_TX64)
+#if CONFIG_TX64X64
static void iidtx64_c(const tran_low_t *input, tran_low_t *output) {
for (int i = 0; i < 64; ++i) {
output[i] = (tran_low_t)dct_const_round_shift(input[i] * 4 * Sqrt2);
}
}
-#endif // CONFIG_TX64X64 && (!CONFIG_DAALA_TX32 || !CONFIG_DAALA_TX64)
+#endif // CONFIG_TX64X64
// For use in lieu of ADST
static void ihalfright32_c(const tran_low_t *input, tran_low_t *output) {
@@ -85,7 +74,7 @@
// Note overall scaling factor is 4 times orthogonal
}
-#if CONFIG_TX64X64 && (!CONFIG_DAALA_TX32 || !CONFIG_DAALA_TX64)
+#if CONFIG_TX64X64
static const int8_t inv_stage_range_col_dct_64[12] = { 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0 };
static const int8_t inv_stage_range_row_dct_64[12] = { 0, 0, 0, 0, 0, 0,
@@ -123,7 +112,7 @@
aom_idct32_c(inputhalf, output + 32);
// Note overall scaling factor is 4 * sqrt(2) times orthogonal
}
-#endif // CONFIG_TX64X64 && (!CONFIG_DAALA_TX32 || !CONFIG_DAALA_TX64)
+#endif // CONFIG_TX64X64
#define FLIPUD_PTR(dest, stride, size) \
do { \
@@ -191,31 +180,11 @@
void av1_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,
const TxfmParam *txfm_param) {
const TX_TYPE tx_type = txfm_param->tx_type;
-#if !CONFIG_DAALA_TX4
if (tx_type == DCT_DCT) {
aom_idct4x4_16_add(input, dest, stride);
return;
}
-#endif
static const transform_2d IHT_4[] = {
-#if CONFIG_DAALA_TX4
- { daala_idct4, daala_idct4 }, // DCT_DCT = 0
- { daala_idst4, daala_idct4 }, // ADST_DCT = 1
- { daala_idct4, daala_idst4 }, // DCT_ADST = 2
- { daala_idst4, daala_idst4 }, // ADST_ADST = 3
- { daala_idst4, daala_idct4 }, // FLIPADST_DCT
- { daala_idct4, daala_idst4 }, // DCT_FLIPADST
- { daala_idst4, daala_idst4 }, // FLIPADST_FLIPADST
- { daala_idst4, daala_idst4 }, // ADST_FLIPADST
- { daala_idst4, daala_idst4 }, // FLIPADST_ADST
- { daala_idtx4, daala_idtx4 }, // IDTX
- { daala_idct4, daala_idtx4 }, // V_DCT
- { daala_idtx4, daala_idct4 }, // H_DCT
- { daala_idst4, daala_idtx4 }, // V_ADST
- { daala_idtx4, daala_idst4 }, // H_ADST
- { daala_idst4, daala_idtx4 }, // V_FLIPADST
- { daala_idtx4, daala_idst4 }, // H_FLIPADST
-#else
{ aom_idct4_c, aom_idct4_c }, // DCT_DCT = 0
{ aom_iadst4_c, aom_idct4_c }, // ADST_DCT = 1
{ aom_idct4_c, aom_iadst4_c }, // DCT_ADST = 2
@@ -232,7 +201,6 @@
{ iidtx4_c, aom_iadst4_c }, // H_ADST
{ aom_iadst4_c, iidtx4_c }, // V_FLIPADST
{ iidtx4_c, aom_iadst4_c }, // H_FLIPADST
-#endif
};
tran_low_t tmp[4][4];
@@ -242,13 +210,7 @@
// inverse transform row vectors
for (int i = 0; i < 4; ++i) {
-#if CONFIG_DAALA_TX4
- tran_low_t temp_in[4];
- for (int j = 0; j < 4; j++) temp_in[j] = input[j] * 2;
- IHT_4[tx_type].rows(temp_in, out[i]);
-#else
IHT_4[tx_type].rows(input, out[i]);
-#endif
input += 4;
}
@@ -271,11 +233,7 @@
for (int j = 0; j < 4; ++j) {
int d = i * stride + j;
int s = j * outstride + i;
-#if CONFIG_DAALA_TX4
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
-#else
- dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
-#endif
}
}
}
@@ -284,24 +242,6 @@
const TxfmParam *txfm_param) {
const TX_TYPE tx_type = txfm_param->tx_type;
static const transform_2d IHT_4x8[] = {
-#if CONFIG_DAALA_TX4 && CONFIG_DAALA_TX8
- { daala_idct8, daala_idct4 }, // DCT_DCT = 0
- { daala_idst8, daala_idct4 }, // ADST_DCT = 1
- { daala_idct8, daala_idst4 }, // DCT_ADST = 2
- { daala_idst8, daala_idst4 }, // ADST_ADST = 3
- { daala_idst8, daala_idct4 }, // FLIPADST_DCT
- { daala_idct8, daala_idst4 }, // DCT_FLIPADST
- { daala_idst8, daala_idst4 }, // FLIPADST_FLIPADST
- { daala_idst8, daala_idst4 }, // ADST_FLIPADST
- { daala_idst8, daala_idst4 }, // FLIPADST_ADST
- { daala_idtx8, daala_idtx4 }, // IDTX
- { daala_idct8, daala_idtx4 }, // V_DCT
- { daala_idtx8, daala_idct4 }, // H_DCT
- { daala_idst8, daala_idtx4 }, // V_ADST
- { daala_idtx8, daala_idst4 }, // H_ADST
- { daala_idst8, daala_idtx4 }, // V_FLIPADST
- { daala_idtx8, daala_idst4 }, // H_FLIPADST
-#else
{ aom_idct8_c, aom_idct4_c }, // DCT_DCT
{ aom_iadst8_c, aom_idct4_c }, // ADST_DCT
{ aom_idct8_c, aom_iadst4_c }, // DCT_ADST
@@ -318,7 +258,6 @@
{ iidtx8_c, aom_iadst4_c }, // H_ADST
{ aom_iadst8_c, iidtx4_c }, // V_FLIPADST
{ iidtx8_c, aom_iadst4_c }, // H_FLIPADST
-#endif
};
const int n = 4;
@@ -336,23 +275,12 @@
// inverse transform row vectors and transpose
for (int i = 0; i < n2; ++i) {
-#if CONFIG_DAALA_TX4 && CONFIG_DAALA_TX8
- // Daala row transform; Scaling cases 3 and 4 above
- tran_low_t temp_in[4];
- // Input scaling up by 1 bit
- for (int j = 0; j < n; j++) temp_in[j] = input[j] * 2;
- // Row transform; Daala does not scale
- IHT_4x8[tx_type].rows(temp_in, outtmp);
- // Transpose; no mid scaling
- for (int j = 0; j < n; ++j) tmp[j][i] = outtmp[j];
-#else
// AV1 row transform; Scaling case 1 only
// Row transform (AV1 scales up .5 bits)
IHT_4x8[tx_type].rows(input, outtmp);
// Transpose and mid scaling up by .5 bit
for (int j = 0; j < n; ++j)
tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
-#endif
input += n;
}
@@ -369,13 +297,8 @@
for (int j = 0; j < n; ++j) {
int d = i * stride + j;
int s = j * outstride + i;
-#if CONFIG_DAALA_TX4 && CONFIG_DAALA_TX8
- // Output scaling cases 2, 4
- dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
-#else
// Output scaling case 1 only
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
-#endif
}
}
}
@@ -384,24 +307,6 @@
const TxfmParam *txfm_param) {
const TX_TYPE tx_type = txfm_param->tx_type;
static const transform_2d IHT_8x4[] = {
-#if CONFIG_DAALA_TX4 && CONFIG_DAALA_TX8
- { daala_idct4, daala_idct8 }, // DCT_DCT = 0
- { daala_idst4, daala_idct8 }, // ADST_DCT = 1
- { daala_idct4, daala_idst8 }, // DCT_ADST = 2
- { daala_idst4, daala_idst8 }, // ADST_ADST = 3
- { daala_idst4, daala_idct8 }, // FLIPADST_DCT
- { daala_idct4, daala_idst8 }, // DCT_FLIPADST
- { daala_idst4, daala_idst8 }, // FLIPADST_FLIPADST
- { daala_idst4, daala_idst8 }, // ADST_FLIPADST
- { daala_idst4, daala_idst8 }, // FLIPADST_ADST
- { daala_idtx4, daala_idtx8 }, // IDTX
- { daala_idct4, daala_idtx8 }, // V_DCT
- { daala_idtx4, daala_idct8 }, // H_DCT
- { daala_idst4, daala_idtx8 }, // V_ADST
- { daala_idtx4, daala_idst8 }, // H_ADST
- { daala_idst4, daala_idtx8 }, // V_FLIPADST
- { daala_idtx4, daala_idst8 }, // H_FLIPADST
-#else
{ aom_idct4_c, aom_idct8_c }, // DCT_DCT
{ aom_iadst4_c, aom_idct8_c }, // ADST_DCT
{ aom_idct4_c, aom_iadst8_c }, // DCT_ADST
@@ -418,7 +323,6 @@
{ iidtx4_c, aom_iadst8_c }, // H_ADST
{ aom_iadst4_c, iidtx8_c }, // V_FLIPADST
{ iidtx4_c, aom_iadst8_c }, // H_FLIPADST
-#endif
};
const int n = 4;
@@ -436,23 +340,12 @@
// inverse transform row vectors and transpose
for (int i = 0; i < n; ++i) {
-#if CONFIG_DAALA_TX4 && CONFIG_DAALA_TX8
- // Daala row transform; Scaling cases 3 and 4 above
- tran_low_t temp_in[8];
- // Input scaling up by 1 bit
- for (int j = 0; j < n2; j++) temp_in[j] = input[j] * 2;
- // Row transform; Daala does not scale
- IHT_8x4[tx_type].rows(temp_in, outtmp);
- // Transpose; no mid scaling
- for (int j = 0; j < n2; ++j) tmp[j][i] = outtmp[j];
-#else
// AV1 row transform; Scaling case 1 only
// Row transform (AV1 scales up 1 bit)
IHT_8x4[tx_type].rows(input, outtmp);
// Transpose and mid scaling up by .5 bit
for (int j = 0; j < n2; ++j)
tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
-#endif
input += n2;
}
@@ -469,13 +362,8 @@
for (int j = 0; j < n2; ++j) {
int d = i * stride + j;
int s = j * outstride + i;
-#if CONFIG_DAALA_TX4 && CONFIG_DAALA_TX8
- // Output scaling cases 2, 4
- dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
-#else
// Output scaling case 1
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
-#endif
}
}
}
@@ -590,24 +478,6 @@
const TxfmParam *txfm_param) {
const TX_TYPE tx_type = txfm_param->tx_type;
static const transform_2d IHT_8x16[] = {
-#if CONFIG_DAALA_TX8 && CONFIG_DAALA_TX16
- { daala_idct16, daala_idct8 }, // DCT_DCT = 0
- { daala_idst16, daala_idct8 }, // ADST_DCT = 1
- { daala_idct16, daala_idst8 }, // DCT_ADST = 2
- { daala_idst16, daala_idst8 }, // ADST_ADST = 3
- { daala_idst16, daala_idct8 }, // FLIPADST_DCT
- { daala_idct16, daala_idst8 }, // DCT_FLIPADST
- { daala_idst16, daala_idst8 }, // FLIPADST_FLIPADST
- { daala_idst16, daala_idst8 }, // ADST_FLIPADST
- { daala_idst16, daala_idst8 }, // FLIPADST_ADST
- { daala_idtx16, daala_idtx8 }, // IDTX
- { daala_idct16, daala_idtx8 }, // V_DCT
- { daala_idtx16, daala_idct8 }, // H_DCT
- { daala_idst16, daala_idtx8 }, // V_ADST
- { daala_idtx16, daala_idst8 }, // H_ADST
- { daala_idst16, daala_idtx8 }, // V_FLIPADST
- { daala_idtx16, daala_idst8 }, // H_FLIPADST
-#else
{ aom_idct16_c, aom_idct8_c }, // DCT_DCT
{ aom_iadst16_c, aom_idct8_c }, // ADST_DCT
{ aom_idct16_c, aom_iadst8_c }, // DCT_ADST
@@ -624,7 +494,6 @@
{ iidtx16_c, aom_iadst8_c }, // H_ADST
{ aom_iadst16_c, iidtx8_c }, // V_FLIPADST
{ iidtx16_c, aom_iadst8_c }, // H_FLIPADST
-#endif
};
const int n = 8;
@@ -642,22 +511,12 @@
// inverse transform row vectors and transpose
for (int i = 0; i < n2; ++i) {
-#if CONFIG_DAALA_TX8 && CONFIG_DAALA_TX16
- tran_low_t temp_in[8];
- // Input scaling case 4
- for (int j = 0; j < n; j++) temp_in[j] = input[j] * 2;
- // Row transform (Daala does not scale)
- IHT_8x16[tx_type].rows(temp_in, outtmp);
- // Transpose (no mid scaling)
- for (int j = 0; j < n; ++j) tmp[j][i] = outtmp[j];
-#else
// Case 1; no input scaling
// Row transform (AV1 scales up 1 bit)
IHT_8x16[tx_type].rows(input, outtmp);
// Transpose and mid scaling up .5 bits
for (int j = 0; j < n; ++j)
tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
-#endif
input += n;
}
@@ -674,13 +533,8 @@
for (int j = 0; j < n; ++j) {
int d = i * stride + j;
int s = j * outstride + i;
-#if CONFIG_DAALA_TX8 && CONFIG_DAALA_TX16
- // Output scaling cases 2 and 4
- dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
-#else
// Output scaling case 1
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
-#endif
}
}
}
@@ -689,24 +543,6 @@
const TxfmParam *txfm_param) {
const TX_TYPE tx_type = txfm_param->tx_type;
static const transform_2d IHT_16x8[] = {
-#if CONFIG_DAALA_TX8 && CONFIG_DAALA_TX16
- { daala_idct8, daala_idct16 }, // DCT_DCT = 0
- { daala_idst8, daala_idct16 }, // ADST_DCT = 1
- { daala_idct8, daala_idst16 }, // DCT_ADST = 2
- { daala_idst8, daala_idst16 }, // ADST_ADST = 3
- { daala_idst8, daala_idct16 }, // FLIPADST_DCT
- { daala_idct8, daala_idst16 }, // DCT_FLIPADST
- { daala_idst8, daala_idst16 }, // FLIPADST_FLIPADST
- { daala_idst8, daala_idst16 }, // ADST_FLIPADST
- { daala_idst8, daala_idst16 }, // FLIPADST_ADST
- { daala_idtx8, daala_idtx16 }, // IDTX
- { daala_idct8, daala_idtx16 }, // V_DCT
- { daala_idtx8, daala_idct16 }, // H_DCT
- { daala_idst8, daala_idtx16 }, // V_ADST
- { daala_idtx8, daala_idst16 }, // H_ADST
- { daala_idst8, daala_idtx16 }, // V_FLIPADST
- { daala_idtx8, daala_idst16 }, // H_FLIPADST
-#else
{ aom_idct8_c, aom_idct16_c }, // DCT_DCT
{ aom_iadst8_c, aom_idct16_c }, // ADST_DCT
{ aom_idct8_c, aom_iadst16_c }, // DCT_ADST
@@ -723,7 +559,6 @@
{ iidtx8_c, aom_iadst16_c }, // H_ADST
{ aom_iadst8_c, iidtx16_c }, // V_FLIPADST
{ iidtx8_c, aom_iadst16_c }, // H_FLIPADST
-#endif
};
const int n = 8;
@@ -741,16 +576,6 @@
// inverse transform row vectors and transpose
for (int i = 0; i < n; ++i) {
-#if CONFIG_DAALA_TX8 && CONFIG_DAALA_TX16
- tran_low_t temp_in[16];
- // Input scaling cases 3 and 4
- for (int j = 0; j < n2; j++) temp_in[j] = input[j] * 2;
- // Daala row TX, no scaling
- IHT_16x8[tx_type].rows(temp_in, outtmp);
- // Transpose and mid scaling
- // Case 4
- for (int j = 0; j < n2; ++j) tmp[j][i] = outtmp[j];
-#else
// Case 1
// No input scaling
// Row transform, AV1 scales up by 1.5 bits
@@ -758,7 +583,6 @@
// Transpose and mid scaling up .5 bits
for (int j = 0; j < n2; ++j)
tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
-#endif
input += n2;
}
@@ -775,14 +599,9 @@
for (int j = 0; j < n2; ++j) {
int d = i * stride + j;
int s = j * outstride + i;
-// Output scaling
-#if CONFIG_DAALA_TX8 && CONFIG_DAALA_TX16
- // case 4
- dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
-#else
+ // Output scaling
// case 1
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
-#endif
}
}
}
@@ -897,24 +716,6 @@
const TxfmParam *txfm_param) {
const TX_TYPE tx_type = txfm_param->tx_type;
static const transform_2d IHT_16x32[] = {
-#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
- { daala_idct32, daala_idct16 }, // DCT_DCT = 0
- { daala_idst32, daala_idct16 }, // ADST_DCT = 1
- { daala_idct32, daala_idst16 }, // DCT_ADST = 2
- { daala_idst32, daala_idst16 }, // ADST_ADST = 3
- { daala_idst32, daala_idct16 }, // FLIPADST_DCT
- { daala_idct32, daala_idst16 }, // DCT_FLIPADST
- { daala_idst32, daala_idst16 }, // FLIPADST_FLIPADST
- { daala_idst32, daala_idst16 }, // ADST_FLIPADST
- { daala_idst32, daala_idst16 }, // FLIPADST_ADST
- { daala_idtx32, daala_idtx16 }, // IDTX
- { daala_idct32, daala_idtx16 }, // V_DCT
- { daala_idtx32, daala_idct16 }, // H_DCT
- { daala_idst32, daala_idtx16 }, // V_ADST
- { daala_idtx32, daala_idst16 }, // H_ADST
- { daala_idst32, daala_idtx16 }, // V_FLIPADST
- { daala_idtx32, daala_idst16 }, // H_FLIPADST
-#else
{ aom_idct32_c, aom_idct16_c }, // DCT_DCT
{ ihalfright32_c, aom_idct16_c }, // ADST_DCT
{ aom_idct32_c, aom_iadst16_c }, // DCT_ADST
@@ -931,7 +732,6 @@
{ iidtx32_c, aom_iadst16_c }, // H_ADST
{ ihalfright32_c, iidtx16_c }, // V_FLIPADST
{ iidtx32_c, aom_iadst16_c }, // H_FLIPADST
-#endif
};
const int n = 16;
@@ -943,16 +743,9 @@
// inverse transform row vectors and transpose
for (int i = 0; i < n2; ++i) {
-#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
- tran_low_t temp_in[16];
- for (int j = 0; j < n; j++) temp_in[j] = input[j] * 4;
- IHT_16x32[tx_type].rows(temp_in, outtmp);
- for (int j = 0; j < n; ++j) tmp[j][i] = outtmp[j];
-#else
IHT_16x32[tx_type].rows(input, outtmp);
for (int j = 0; j < n; ++j)
tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
-#endif
input += n;
}
@@ -966,11 +759,7 @@
for (int j = 0; j < n; ++j) {
int d = i * stride + j;
int s = j * outstride + i;
-#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
- dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
-#else
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
-#endif
}
}
}
@@ -979,24 +768,6 @@
const TxfmParam *txfm_param) {
const TX_TYPE tx_type = txfm_param->tx_type;
static const transform_2d IHT_32x16[] = {
-#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
- { daala_idct16, daala_idct32 }, // DCT_DCT = 0
- { daala_idst16, daala_idct32 }, // ADST_DCT = 1
- { daala_idct16, daala_idst32 }, // DCT_ADST = 2
- { daala_idst16, daala_idst32 }, // ADST_ADST = 3
- { daala_idst16, daala_idct32 }, // FLIPADST_DCT
- { daala_idct16, daala_idst32 }, // DCT_FLIPADST
- { daala_idst16, daala_idst32 }, // FLIPADST_FLIPADST
- { daala_idst16, daala_idst32 }, // ADST_FLIPADST
- { daala_idst16, daala_idst32 }, // FLIPADST_ADST
- { daala_idtx16, daala_idtx32 }, // IDTX
- { daala_idct16, daala_idtx32 }, // V_DCT
- { daala_idtx16, daala_idct32 }, // H_DCT
- { daala_idst16, daala_idtx32 }, // V_ADST
- { daala_idtx16, daala_idst32 }, // H_ADST
- { daala_idst16, daala_idtx32 }, // V_FLIPADST
- { daala_idtx16, daala_idst32 }, // H_FLIPADST
-#else
{ aom_idct16_c, aom_idct32_c }, // DCT_DCT
{ aom_iadst16_c, aom_idct32_c }, // ADST_DCT
{ aom_idct16_c, ihalfright32_c }, // DCT_ADST
@@ -1013,7 +784,6 @@
{ iidtx16_c, ihalfright32_c }, // H_ADST
{ aom_iadst16_c, iidtx32_c }, // V_FLIPADST
{ iidtx16_c, ihalfright32_c }, // H_FLIPADST
-#endif
};
const int n = 16;
const int n2 = 32;
@@ -1024,16 +794,9 @@
// inverse transform row vectors and transpose
for (int i = 0; i < n; ++i) {
-#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
- tran_low_t temp_in[32];
- for (int j = 0; j < n2; j++) temp_in[j] = input[j] * 4;
- IHT_32x16[tx_type].rows(temp_in, outtmp);
- for (int j = 0; j < n2; ++j) tmp[j][i] = outtmp[j];
-#else
IHT_32x16[tx_type].rows(input, outtmp);
for (int j = 0; j < n2; ++j)
tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
-#endif
input += n2;
}
@@ -1047,11 +810,7 @@
for (int j = 0; j < n2; ++j) {
int d = i * stride + j;
int s = j * outstride + i;
-#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
- dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
-#else
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
-#endif
}
}
}
@@ -1060,24 +819,6 @@
const TxfmParam *txfm_param) {
const TX_TYPE tx_type = txfm_param->tx_type;
static const transform_2d IHT_8[] = {
-#if CONFIG_DAALA_TX8
- { daala_idct8, daala_idct8 }, // DCT_DCT = 0
- { daala_idst8, daala_idct8 }, // ADST_DCT = 1
- { daala_idct8, daala_idst8 }, // DCT_ADST = 2
- { daala_idst8, daala_idst8 }, // ADST_ADST = 3
- { daala_idst8, daala_idct8 }, // FLIPADST_DCT
- { daala_idct8, daala_idst8 }, // DCT_FLIPADST
- { daala_idst8, daala_idst8 }, // FLIPADST_FLIPADST
- { daala_idst8, daala_idst8 }, // ADST_FLIPADST
- { daala_idst8, daala_idst8 }, // FLIPADST_ADST
- { daala_idtx8, daala_idtx8 }, // IDTX
- { daala_idct8, daala_idtx8 }, // V_DCT
- { daala_idtx8, daala_idct8 }, // H_DCT
- { daala_idst8, daala_idtx8 }, // V_ADST
- { daala_idtx8, daala_idst8 }, // H_ADST
- { daala_idst8, daala_idtx8 }, // V_FLIPADST
- { daala_idtx8, daala_idst8 }, // H_FLIPADST
-#else
{ aom_idct8_c, aom_idct8_c }, // DCT_DCT = 0
{ aom_iadst8_c, aom_idct8_c }, // ADST_DCT = 1
{ aom_idct8_c, aom_iadst8_c }, // DCT_ADST = 2
@@ -1094,7 +835,6 @@
{ iidtx8_c, aom_iadst8_c }, // H_ADST
{ aom_iadst8_c, iidtx8_c }, // V_FLIPADST
{ iidtx8_c, aom_iadst8_c }, // H_FLIPADST
-#endif
};
tran_low_t tmp[8][8];
@@ -1104,13 +844,7 @@
// inverse transform row vectors
for (int i = 0; i < 8; ++i) {
-#if CONFIG_DAALA_TX8
- tran_low_t temp_in[8];
- for (int j = 0; j < 8; j++) temp_in[j] = input[j] * 2;
- IHT_8[tx_type].rows(temp_in, out[i]);
-#else
IHT_8[tx_type].rows(input, out[i]);
-#endif
input += 8;
}
@@ -1133,11 +867,7 @@
for (int j = 0; j < 8; ++j) {
int d = i * stride + j;
int s = j * outstride + i;
-#if CONFIG_DAALA_TX8
- dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
-#else
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
-#endif
}
}
}
@@ -1146,24 +876,6 @@
const TxfmParam *txfm_param) {
const TX_TYPE tx_type = txfm_param->tx_type;
static const transform_2d IHT_16[] = {
-#if CONFIG_DAALA_TX16
- { daala_idct16, daala_idct16 }, // DCT_DCT = 0
- { daala_idst16, daala_idct16 }, // ADST_DCT = 1
- { daala_idct16, daala_idst16 }, // DCT_ADST = 2
- { daala_idst16, daala_idst16 }, // ADST_ADST = 3
- { daala_idst16, daala_idct16 }, // FLIPADST_DCT
- { daala_idct16, daala_idst16 }, // DCT_FLIPADST
- { daala_idst16, daala_idst16 }, // FLIPADST_FLIPADST
- { daala_idst16, daala_idst16 }, // ADST_FLIPADST
- { daala_idst16, daala_idst16 }, // FLIPADST_ADST
- { daala_idtx16, daala_idtx16 }, // IDTX
- { daala_idct16, daala_idtx16 }, // V_DCT
- { daala_idtx16, daala_idct16 }, // H_DCT
- { daala_idst16, daala_idtx16 }, // V_ADST
- { daala_idtx16, daala_idst16 }, // H_ADST
- { daala_idst16, daala_idtx16 }, // V_FLIPADST
- { daala_idtx16, daala_idst16 }, // H_FLIPADST
-#else
{ aom_idct16_c, aom_idct16_c }, // DCT_DCT = 0
{ aom_iadst16_c, aom_idct16_c }, // ADST_DCT = 1
{ aom_idct16_c, aom_iadst16_c }, // DCT_ADST = 2
@@ -1180,7 +892,6 @@
{ iidtx16_c, aom_iadst16_c }, // H_ADST
{ aom_iadst16_c, iidtx16_c }, // V_FLIPADST
{ iidtx16_c, aom_iadst16_c }, // H_FLIPADST
-#endif
};
tran_low_t tmp[16][16];
@@ -1190,13 +901,7 @@
// inverse transform row vectors
for (int i = 0; i < 16; ++i) {
-#if CONFIG_DAALA_TX16
- tran_low_t temp_in[16];
- for (int j = 0; j < 16; j++) temp_in[j] = input[j] * 2;
- IHT_16[tx_type].rows(temp_in, out[i]);
-#else
IHT_16[tx_type].rows(input, out[i]);
-#endif
input += 16;
}
@@ -1217,11 +922,7 @@
for (int j = 0; j < 16; ++j) {
int d = i * stride + j;
int s = j * outstride + i;
-#if CONFIG_DAALA_TX16
- dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
-#else
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
-#endif
}
}
}
@@ -1230,24 +931,6 @@
const TxfmParam *txfm_param) {
const TX_TYPE tx_type = txfm_param->tx_type;
static const transform_2d IHT_32[] = {
-#if CONFIG_DAALA_TX32
- { daala_idct32, daala_idct32 }, // DCT_DCT
- { daala_idst32, daala_idct32 }, // ADST_DCT
- { daala_idct32, daala_idst32 }, // DCT_ADST
- { daala_idst32, daala_idst32 }, // ADST_ADST
- { daala_idst32, daala_idct32 }, // FLIPADST_DCT
- { daala_idct32, daala_idst32 }, // DCT_FLIPADST
- { daala_idst32, daala_idst32 }, // FLIPADST_FLIPADST
- { daala_idst32, daala_idst32 }, // ADST_FLIPADST
- { daala_idst32, daala_idst32 }, // FLIPADST_ADST
- { daala_idtx32, daala_idtx32 }, // IDTX
- { daala_idct32, daala_idtx32 }, // V_DCT
- { daala_idtx32, daala_idct32 }, // H_DCT
- { daala_idst32, daala_idtx32 }, // V_ADST
- { daala_idtx32, daala_idst32 }, // H_ADST
- { daala_idst32, daala_idtx32 }, // V_FLIPADST
- { daala_idtx32, daala_idst32 }, // H_FLIPADST
-#else
{ aom_idct32_c, aom_idct32_c }, // DCT_DCT
{ ihalfright32_c, aom_idct32_c }, // ADST_DCT
{ aom_idct32_c, ihalfright32_c }, // DCT_ADST
@@ -1264,7 +947,6 @@
{ iidtx32_c, ihalfright32_c }, // H_ADST
{ ihalfright32_c, iidtx32_c }, // V_FLIPADST
{ iidtx32_c, ihalfright32_c }, // H_FLIPADST
-#endif
};
tran_low_t tmp[32][32];
@@ -1274,13 +956,7 @@
// inverse transform row vectors
for (int i = 0; i < 32; ++i) {
-#if CONFIG_DAALA_TX32
- tran_low_t temp_in[32];
- for (int j = 0; j < 32; j++) temp_in[j] = input[j] * 4;
- IHT_32[tx_type].rows(temp_in, out[i]);
-#else
IHT_32[tx_type].rows(input, out[i]);
-#endif
input += 32;
}
@@ -1298,11 +974,7 @@
for (int j = 0; j < 32; ++j) {
int d = i * stride + j;
int s = j * outstride + i;
-#if CONFIG_DAALA_TX32
- dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
-#else
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
-#endif
}
}
}
@@ -1312,24 +984,6 @@
const TxfmParam *txfm_param) {
const TX_TYPE tx_type = txfm_param->tx_type;
static const transform_2d IHT_64[] = {
-#if CONFIG_DAALA_TX64
- { daala_idct64, daala_idct64 }, // DCT_DCT
- { daala_idst64, daala_idct64 }, // ADST_DCT
- { daala_idct64, daala_idst64 }, // DCT_ADST
- { daala_idst64, daala_idst64 }, // ADST_ADST
- { daala_idst64, daala_idct64 }, // FLIPADST_DCT
- { daala_idct64, daala_idst64 }, // DCT_FLIPADST
- { daala_idst64, daala_idst64 }, // FLIPADST_FLIPADST
- { daala_idst64, daala_idst64 }, // ADST_FLIPADST
- { daala_idst64, daala_idst64 }, // FLIPADST_ADST
- { daala_idtx64, daala_idtx64 }, // IDTX
- { daala_idct64, daala_idtx64 }, // V_DCT
- { daala_idtx64, daala_idct64 }, // H_DCT
- { daala_idst64, daala_idtx64 }, // V_ADST
- { daala_idtx64, daala_idst64 }, // H_ADST
- { daala_idst64, daala_idtx64 }, // V_FLIPADST
- { daala_idtx64, daala_idst64 }, // H_FLIPADST
-#else
{ idct64_col_c, idct64_row_c }, // DCT_DCT
{ ihalfright64_c, idct64_row_c }, // ADST_DCT
{ idct64_col_c, ihalfright64_c }, // DCT_ADST
@@ -1346,7 +1000,6 @@
{ iidtx64_c, ihalfright64_c }, // H_ADST
{ ihalfright64_c, iidtx64_c }, // V_FLIPADST
{ iidtx64_c, ihalfright64_c }, // H_FLIPADST
-#endif
};
// TODO(urvang): Can the same array be reused, instead of using a new array?
@@ -1368,14 +1021,8 @@
// inverse transform row vectors
for (int i = 0; i < 64; ++i) {
-#if CONFIG_DAALA_TX64
- tran_low_t temp_in[64];
- for (int j = 0; j < 64; j++) temp_in[j] = mod_input_ptr[j] * 8;
- IHT_64[tx_type].rows(temp_in, out[i]);
-#else
IHT_64[tx_type].rows(mod_input_ptr, out[i]);
for (int j = 0; j < 64; ++j) out[i][j] = ROUND_POWER_OF_TWO(out[i][j], 1);
-#endif
mod_input_ptr += 64;
}
@@ -1396,11 +1043,7 @@
for (int j = 0; j < 64; ++j) {
int d = i * stride + j;
int s = j * outstride + i;
-#if CONFIG_DAALA_TX64
- dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
-#else
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
-#endif
}
}
}
@@ -1409,24 +1052,6 @@
const TxfmParam *txfm_param) {
const TX_TYPE tx_type = txfm_param->tx_type;
static const transform_2d IHT_64x32[] = {
-#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
- { daala_idct32, daala_idct64 }, // DCT_DCT
- { daala_idst32, daala_idct64 }, // ADST_DCT
- { daala_idct32, daala_idst64 }, // DCT_ADST
- { daala_idst32, daala_idst64 }, // ADST_ADST
- { daala_idst32, daala_idct64 }, // FLIPADST_DCT
- { daala_idct32, daala_idst64 }, // DCT_FLIPADST
- { daala_idst32, daala_idst64 }, // FLIPADST_FLIPADST
- { daala_idst32, daala_idst64 }, // ADST_FLIPADST
- { daala_idst32, daala_idst64 }, // FLIPADST_ADST
- { daala_idtx32, daala_idtx64 }, // IDTX
- { daala_idct32, daala_idtx64 }, // V_DCT
- { daala_idtx32, daala_idct64 }, // H_DCT
- { daala_idst32, daala_idtx64 }, // V_ADST
- { daala_idtx32, daala_idst64 }, // H_ADST
- { daala_idst32, daala_idtx64 }, // V_FLIPADST
- { daala_idtx32, daala_idst64 }, // H_FLIPADST
-#else
{ aom_idct32_c, idct64_row_c }, // DCT_DCT
{ ihalfright32_c, idct64_row_c }, // ADST_DCT
{ aom_idct32_c, ihalfright64_c }, // DCT_ADST
@@ -1443,7 +1068,6 @@
{ iidtx32_c, ihalfright64_c }, // H_ADST
{ ihalfright32_c, iidtx64_c }, // V_FLIPADST
{ iidtx32_c, ihalfright64_c }, // H_FLIPADST
-#endif
};
// Remap 32x32 input into a modified 64x32 input by:
@@ -1465,16 +1089,9 @@
// inverse transform row vectors and transpose
for (int i = 0; i < n; ++i) {
-#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
- tran_low_t temp_in[64];
- for (int j = 0; j < n2; j++) temp_in[j] = mod_input_ptr[j] * 8;
- IHT_64x32[tx_type].rows(temp_in, outtmp);
- for (int j = 0; j < n2; ++j) tmp[j][i] = outtmp[j];
-#else
IHT_64x32[tx_type].rows(mod_input_ptr, outtmp);
for (int j = 0; j < n2; ++j)
tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * InvSqrt2);
-#endif
mod_input_ptr += n2;
}
@@ -1488,11 +1105,7 @@
for (int j = 0; j < n2; ++j) {
int d = i * stride + j;
int s = j * outstride + i;
-#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
- dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
-#else
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
-#endif
}
}
}
@@ -1501,24 +1114,6 @@
const TxfmParam *txfm_param) {
const TX_TYPE tx_type = txfm_param->tx_type;
static const transform_2d IHT_32x64[] = {
-#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
- { daala_idct64, daala_idct32 }, // DCT_DCT
- { daala_idst64, daala_idct32 }, // ADST_DCT
- { daala_idct64, daala_idst32 }, // DCT_ADST
- { daala_idst64, daala_idst32 }, // ADST_ADST
- { daala_idst64, daala_idct32 }, // FLIPADST_DCT
- { daala_idct64, daala_idst32 }, // DCT_FLIPADST
- { daala_idst64, daala_idst32 }, // FLIPADST_FLIPADST
- { daala_idst64, daala_idst32 }, // ADST_FLIPADST
- { daala_idst64, daala_idst32 }, // FLIPADST_ADST
- { daala_idtx64, daala_idtx32 }, // IDTX
- { daala_idct64, daala_idtx32 }, // V_DCT
- { daala_idtx64, daala_idct32 }, // H_DCT
- { daala_idst64, daala_idtx32 }, // V_ADST
- { daala_idtx64, daala_idst32 }, // H_ADST
- { daala_idst64, daala_idtx32 }, // V_FLIPADST
- { daala_idtx64, daala_idst32 }, // H_FLIPADST
-#else
{ idct64_col_c, aom_idct32_c }, // DCT_DCT
{ ihalfright64_c, aom_idct32_c }, // ADST_DCT
{ idct64_col_c, ihalfright32_c }, // DCT_ADST
@@ -1535,7 +1130,6 @@
{ iidtx64_c, ihalfright32_c }, // H_ADST
{ ihalfright64_c, iidtx32_c }, // V_FLIPADST
{ iidtx64_c, ihalfright32_c }, // H_FLIPADST
-#endif
};
// Remap 32x32 input into a modified 32x64 input by:
@@ -1555,16 +1149,9 @@
// inverse transform row vectors and transpose
for (int i = 0; i < n2; ++i) {
-#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
- tran_low_t temp_in[32];
- for (int j = 0; j < n; j++) temp_in[j] = mod_input_ptr[j] * 8;
- IHT_32x64[tx_type].rows(temp_in, outtmp);
- for (int j = 0; j < n; ++j) tmp[j][i] = outtmp[j];
-#else
IHT_32x64[tx_type].rows(mod_input_ptr, outtmp);
for (int j = 0; j < n; ++j)
tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * InvSqrt2);
-#endif
mod_input_ptr += n;
}
@@ -1578,11 +1165,7 @@
for (int j = 0; j < n; ++j) {
int d = i * stride + j;
int s = j * outstride + i;
-#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
- dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
-#else
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
-#endif
}
}
}
@@ -1722,7 +1305,6 @@
else
aom_idct4x4_1_add(input, dest, stride);
}
-#endif // !CONFIG_DAALA_TX
void av1_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
const TxfmParam *txfm_param) {
@@ -1742,7 +1324,6 @@
aom_highbd_iwht4x4_1_add(input, dest, stride, bd);
}
-#if !CONFIG_DAALA_TX
static const int32_t *cast_to_int32(const tran_low_t *input) {
assert(sizeof(int32_t) == sizeof(tran_low_t));
return (const int32_t *)input;
@@ -2045,7 +1626,6 @@
}
}
#endif // CONFIG_TX64X64
-#endif // !CONFIG_DAALA_TX
static void init_txfm_param(const MACROBLOCKD *xd, int plane, TX_SIZE tx_size,
TX_TYPE tx_type, int eob, int reduced_tx_set,
@@ -2067,9 +1647,6 @@
static void av1_highbd_inv_txfm_add(const tran_low_t *input, uint8_t *dest,
int stride, TxfmParam *txfm_param) {
assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
-#if CONFIG_DAALA_TX
- daala_inv_txfm_add(input, dest, stride, txfm_param);
-#else
const TX_SIZE tx_size = txfm_param->tx_size;
switch (tx_size) {
case TX_32X32:
@@ -2136,7 +1713,6 @@
break;
default: assert(0 && "Invalid transform size"); break;
}
-#endif // CONFIG_DAALA_TX
}
static void av1_inv_txfm_add(const tran_low_t *dqcoeff, uint8_t *dst,
diff --git a/av1/common/idct.h b/av1/common/idct.h
index 6717e22..79c948a 100644
--- a/av1/common/idct.h
+++ b/av1/common/idct.h
@@ -29,10 +29,8 @@
transform_1d cols, rows; // vertical and horizontal
} transform_2d;
-#if !CONFIG_DAALA_TX
#define MAX_TX_SCALE 1
int av1_get_tx_scale(const TX_SIZE tx_size);
-#endif
void av1_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
const TxfmParam *txfm_param);
diff --git a/av1/common/quant_common.c b/av1/common/quant_common.c
index bb9997a..b5d4a87 100644
--- a/av1/common/quant_common.c
+++ b/av1/common/quant_common.c
@@ -410,8 +410,6 @@
28143, 28687, 29247,
};
-#if !CONFIG_DAALA_TX
-
// Coefficient scaling and quantization with AV1 TX are tailored to
// the AV1 TX transforms. Regardless of the bit-depth of the input,
// the transform stages scale the coefficient values up by a factor of
@@ -497,83 +495,6 @@
return QINDEX_RANGE - 1;
}
-#else // CONFIG_DAALA_TX
-
-// Daala TX uses a constant effective coefficient depth
-// (TX_COEFF_DEPTH) regardless of input pixel bitdepth or transform
-// size. This means that coefficient scale and range is identical
-// regardless of the bit depth of the pixel input. However, the
-// existing encoder heuristics and RDO loop were built expecting a
-// quantizer that scales with bitdepth, treating it more as a
-// proto-lambda than a quantizer. The assumption that quantizer scale
-// increases with bitdepth is spread throughout the encoder.
-
-// For this reason, we need to be able to find an old-style 'Q3'
-// quantizer that scales with pixel depth (to be used in encoder
-// decision making) as well as the literal quantizer that is used in
-// actual quantization/dequantization. That is centralized here.
-
-// Right now, the existing quantization code and setup are not
-// particularly well suited to Daala TX. The scale range used by, eg,
-// the 12 bit lookups is intentionally larger in order to provide more
-// fine control at the top end of the quality range, as 12-bit input
-// would be assumed to offer a lower noise floor than an 8-bit input.
-// However, the 12-bit lookups assume an effective 15-bit TX depth,
-// while we intend to run Daala TX somewhere between 12 and 14. We
-// can't simply scale it down, because this would violate the minimum
-// allowable quantizer in the current code (4).
-
-// As such, we do the simplest thing for the time being: Always use
-// the 8-bit scale range for all inputs and scale the QTX and Q3
-// returns accordingly, which will always be no-ops or upshifts. This
-// might well work well enough; if not, we'll need to patch quantizer
-// scaling to extend the high-bitdepth quality range upward at some
-// later date.
-
-int16_t av1_dc_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth) {
- assert(bit_depth >= 8);
- return qindex == 0 ? dc_qlookup_Q3[0]
- : // Do not scale lossless
- dc_qlookup_Q3[clamp(qindex + delta, 0, MAXQ)] *
- (1 << (bit_depth - 8));
-}
-
-int16_t av1_ac_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth) {
- assert(bit_depth >= 8);
- return qindex == 0 ? ac_qlookup_Q3[0]
- : // Do not scale lossless
- ac_qlookup_Q3[clamp(qindex + delta, 0, MAXQ)] *
- (1 << (bit_depth - 8));
-}
-
-int16_t av1_dc_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) {
- (void)bit_depth;
- return qindex == 0 ? dc_qlookup_Q3[0]
- : // Do not scale lossless
- dc_qlookup_Q3[clamp(qindex + delta, 0, MAXQ)] *
- (1 << (TX_COEFF_DEPTH - 11));
-}
-
-int16_t av1_ac_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) {
- (void)bit_depth;
- return qindex == 0 ? ac_qlookup_Q3[0]
- : // Do not scale lossless
- ac_qlookup_Q3[clamp(qindex + delta, 0, MAXQ)] *
- (1 << (TX_COEFF_DEPTH - 11));
-}
-
-int16_t av1_qindex_from_ac_Q3(int ac_QTX, aom_bit_depth_t bit_depth) {
- int i;
- const int16_t *tab = ac_qlookup_Q3;
- int scale = (1 << (TX_COEFF_DEPTH - 11));
- (void)bit_depth;
- for (i = 0; i < QINDEX_RANGE; i++) {
- if (ac_QTX <= tab[i] * scale) return i;
- }
- return QINDEX_RANGE - 1;
-}
-#endif // !CONFIG_DAALA_TX
-
int av1_get_qindex(const struct segmentation *seg, int segment_id,
int base_qindex) {
if (segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) {
diff --git a/av1/common/x86/daala_inv_txfm_avx2.c b/av1/common/x86/daala_inv_txfm_avx2.c
deleted file mode 100644
index f060bfe..0000000
--- a/av1/common/x86/daala_inv_txfm_avx2.c
+++ /dev/null
@@ -1,1607 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <tmmintrin.h>
-#include <immintrin.h>
-#include "./av1_rtcd.h"
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
-#include "av1/common/daala_tx.h"
-#include "av1/common/daala_inv_txfm.h"
-#include "av1/common/idct.h"
-
-#if CONFIG_DAALA_TX
-
-static INLINE __m128i od_unbiased_rshift1_epi16(__m128i a) {
- return _mm_srai_epi16(_mm_add_epi16(_mm_srli_epi16(a, 15), a), 1);
-}
-
-static INLINE __m256i od_mm256_unbiased_rshift1_epi16(__m256i a) {
- return _mm256_srai_epi16(_mm256_add_epi16(_mm256_srli_epi16(a, 15), a), 1);
-}
-
-static INLINE __m256i od_mm256_unbiased_rshift1_epi32(__m256i a) {
- return _mm256_srai_epi32(_mm256_add_epi32(_mm256_srli_epi32(a, 31), a), 1);
-}
-
-static INLINE __m128i od_avg_epi16(__m128i a, __m128i b) {
- __m128i sign_bit;
- /*x86 only provides an unsigned PAVGW with a bias (ARM is better here).
- We emulate a signed one by adding an offset to convert to unsigned and
- back. We use XOR instead of addition/subtraction because it dispatches
- better on older processors.*/
- sign_bit = _mm_set1_epi16(0x8000);
- return _mm_xor_si128(
- _mm_avg_epu16(_mm_xor_si128(a, sign_bit), _mm_xor_si128(b, sign_bit)),
- sign_bit);
-}
-
-static INLINE __m256i od_mm256_avg_epi16(__m256i a, __m256i b) {
- __m256i sign_bit;
- sign_bit = _mm256_set1_epi16(0x8000);
- return _mm256_xor_si256(_mm256_avg_epu16(_mm256_xor_si256(a, sign_bit),
- _mm256_xor_si256(b, sign_bit)),
- sign_bit);
-}
-
-static INLINE __m256i od_mm256_avg_epi32(__m256i a, __m256i b) {
- __m256i neg1;
- /* It's cheaper to generate -1's than 1's. */
- neg1 = _mm256_set1_epi64x(-1);
- /* There is no corresponding PAVGD, but we are not in danger of overflowing
- a 32-bit register. */
- return _mm256_srai_epi32(_mm256_add_epi32(a, _mm256_sub_epi32(b, neg1)), 1);
-}
-
-/*Like the above, but does (a - b + 1) >> 1 instead.*/
-static INLINE __m128i od_hrsub_epi16(__m128i a, __m128i b) {
- __m128i sign_bit;
- sign_bit = _mm_set1_epi16(0x8000);
- return _mm_xor_si128(
- _mm_avg_epu16(_mm_xor_si128(a, sign_bit), _mm_sub_epi16(sign_bit, b)),
- sign_bit);
-}
-
-static INLINE __m256i od_mm256_hrsub_epi16(__m256i a, __m256i b) {
- __m256i sign_bit;
- sign_bit = _mm256_set1_epi16(0x8000);
- return _mm256_xor_si256(_mm256_avg_epu16(_mm256_xor_si256(a, sign_bit),
- _mm256_sub_epi16(sign_bit, b)),
- sign_bit);
-}
-
-static INLINE __m256i od_mm256_hrsub_epi32(__m256i a, __m256i b) {
- __m256i neg1;
- /* It's cheaper to generate -1's than 1's. */
- neg1 = _mm256_set1_epi64x(-1);
- /* There is no corresponding PAVGD, but we are not in danger of overflowing
- a 32-bit register. */
- return _mm256_srai_epi32(_mm256_sub_epi32(a, _mm256_add_epi32(b, neg1)), 1);
-}
-
-static INLINE void od_swap_si128(__m128i *q0, __m128i *q1) {
- __m128i t;
- t = *q0;
- *q0 = *q1;
- *q1 = t;
-}
-
-static INLINE void od_mm256_swap_si256(__m256i *q0, __m256i *q1) {
- __m256i t;
- t = *q0;
- *q0 = *q1;
- *q1 = t;
-}
-
-static INLINE __m128i od_mulhrs_epi16(__m128i a, int16_t b) {
- return _mm_mulhrs_epi16(a, _mm_set1_epi16(b));
-}
-
-static INLINE __m128i od_mul_epi16(__m128i a, int32_t b, int r) {
- int32_t b_q15;
- b_q15 = b << (15 - r);
- /* b and r are in all cases compile-time constants, so these branches
- disappear when this function gets inlined. */
- if (b_q15 > 32767) {
- return _mm_add_epi16(a, od_mulhrs_epi16(a, (int16_t)(b_q15 - 32768)));
- } else if (b_q15 < -32767) {
- return _mm_sub_epi16(od_mulhrs_epi16(a, (int16_t)(32768 + b_q15)), a);
- } else {
- return od_mulhrs_epi16(a, b_q15);
- }
-}
-
-static INLINE __m256i od_mm256_mulhrs_epi16(__m256i a, int16_t b) {
- return _mm256_mulhrs_epi16(a, _mm256_set1_epi16(b));
-}
-
-static INLINE __m256i od_mm256_mul_epi16(__m256i a, int32_t b, int r) {
- int32_t b_q15;
- b_q15 = b << (15 - r);
- /* b and r are in all cases compile-time constants, so these branches
- disappear when this function gets inlined. */
- if (b_q15 > 32767) {
- return _mm256_add_epi16(a,
- od_mm256_mulhrs_epi16(a, (int16_t)(b_q15 - 32768)));
- } else if (b_q15 < -32767) {
- return _mm256_sub_epi16(od_mm256_mulhrs_epi16(a, (int16_t)(32768 + b_q15)),
- a);
- } else {
- return od_mm256_mulhrs_epi16(a, b_q15);
- }
-}
-
-static INLINE __m256i od_mm256_mul_epi32(__m256i a, int32_t b, int r) {
- __m256i neg1;
- /* It's cheaper to generate -1's than 1's. */
- neg1 = _mm256_set1_epi64x(-1);
- /* There's no 32-bit version of PMULHRSW on x86 like there is on ARM .*/
- a = _mm256_mullo_epi32(a, _mm256_set1_epi32(b));
- a = _mm256_srai_epi32(a, r - 1);
- a = _mm256_sub_epi32(a, neg1);
- return _mm256_srai_epi32(a, 1);
-}
-
-static INLINE __m128i od_hbd_max_epi16(int bd) {
- return _mm_set1_epi16((1 << bd) - 1);
-}
-
-static INLINE __m256i od_mm256_hbd_max_epi16(int bd) {
- return _mm256_set1_epi16((1 << bd) - 1);
-}
-
-static INLINE __m128i od_hbd_clamp_epi16(__m128i a, __m128i max) {
- return _mm_max_epi16(_mm_setzero_si128(), _mm_min_epi16(a, max));
-}
-
-static INLINE __m256i od_mm256_hbd_clamp_epi16(__m256i a, __m256i max) {
- return _mm256_max_epi16(_mm256_setzero_si256(), _mm256_min_epi16(a, max));
-}
-
-/* Loads a 4x4 buffer of 32-bit values into four SSE registers. */
-static INLINE void od_load_buffer_4x4_epi32(__m128i *q0, __m128i *q1,
- __m128i *q2, __m128i *q3,
- const tran_low_t *in) {
- *q0 = _mm_loadu_si128((const __m128i *)in + 0);
- *q1 = _mm_loadu_si128((const __m128i *)in + 1);
- *q2 = _mm_loadu_si128((const __m128i *)in + 2);
- *q3 = _mm_loadu_si128((const __m128i *)in + 3);
-}
-
-/* Loads a 4x4 buffer of 16-bit values into four SSE registers. */
-static INLINE void od_load_buffer_4x4_epi16(__m128i *q0, __m128i *q1,
- __m128i *q2, __m128i *q3,
- const int16_t *in) {
- *q0 = _mm_loadu_si128((const __m128i *)in + 0);
- *q1 = _mm_unpackhi_epi64(*q0, *q0);
- *q2 = _mm_loadu_si128((const __m128i *)in + 1);
- *q3 = _mm_unpackhi_epi64(*q2, *q2);
-}
-
-/* Loads an 8x4 buffer of 16-bit values into four SSE registers. */
-static INLINE void od_load_buffer_8x4_epi16(__m128i *q0, __m128i *q1,
- __m128i *q2, __m128i *q3,
- const int16_t *in, int in_stride) {
- *q0 = _mm_loadu_si128((const __m128i *)(in + 0 * in_stride));
- *q1 = _mm_loadu_si128((const __m128i *)(in + 1 * in_stride));
- *q2 = _mm_loadu_si128((const __m128i *)(in + 2 * in_stride));
- *q3 = _mm_loadu_si128((const __m128i *)(in + 3 * in_stride));
-}
-
-/* Loads an 8x4 buffer of 32-bit values and packs them into 16-bit values in
- four SSE registers. */
-static INLINE void od_load_pack_buffer_8x4_epi32(__m128i *r0, __m128i *r1,
- __m128i *r2, __m128i *r3,
- const tran_low_t *in) {
- __m128i r4;
- __m128i r5;
- __m128i r6;
- __m128i r7;
- *r0 = _mm_loadu_si128((const __m128i *)in + 0);
- r4 = _mm_loadu_si128((const __m128i *)in + 1);
- *r1 = _mm_loadu_si128((const __m128i *)in + 2);
- r5 = _mm_loadu_si128((const __m128i *)in + 3);
- *r2 = _mm_loadu_si128((const __m128i *)in + 4);
- r6 = _mm_loadu_si128((const __m128i *)in + 5);
- *r3 = _mm_loadu_si128((const __m128i *)in + 6);
- r7 = _mm_loadu_si128((const __m128i *)in + 7);
- *r0 = _mm_packs_epi32(*r0, r4);
- *r1 = _mm_packs_epi32(*r1, r5);
- *r2 = _mm_packs_epi32(*r2, r6);
- *r3 = _mm_packs_epi32(*r3, r7);
-}
-
-/* Loads an 8x4 buffer of 32-bit values into four AVX registers. */
-static INLINE void od_load_buffer_8x4_epi32(__m256i *r0, __m256i *r1,
- __m256i *r2, __m256i *r3,
- const tran_low_t *in) {
- *r0 = _mm256_loadu_si256((const __m256i *)in + 0);
- *r1 = _mm256_loadu_si256((const __m256i *)in + 1);
- *r2 = _mm256_loadu_si256((const __m256i *)in + 2);
- *r3 = _mm256_loadu_si256((const __m256i *)in + 3);
-}
-
-/* Loads a 16x4 buffer of 16-bit values into four AVX registers. */
-static INLINE void od_load_buffer_16x4_epi16(__m256i *r0, __m256i *r1,
- __m256i *r2, __m256i *r3,
- const int16_t *in, int in_stride) {
- *r0 = _mm256_loadu_si256((const __m256i *)(in + 0 * in_stride));
- *r1 = _mm256_loadu_si256((const __m256i *)(in + 1 * in_stride));
- *r2 = _mm256_loadu_si256((const __m256i *)(in + 2 * in_stride));
- *r3 = _mm256_loadu_si256((const __m256i *)(in + 3 * in_stride));
-}
-
-/* Stores a 4x4 buffer of 16-bit values from two SSE registers.
- Each register holds two rows of values. */
-static INLINE void od_store_buffer_4x4_epi16(int16_t *out, __m128i q0,
- __m128i q1) {
- _mm_storeu_si128((__m128i *)out + 0, q0);
- _mm_storeu_si128((__m128i *)out + 1, q1);
-}
-
-/* Stores a 4x8 buffer of 16-bit values from four SSE registers.
- Each register holds two rows of values. */
-static INLINE void od_store_buffer_4x8_epi16(int16_t *out, __m128i q0,
- __m128i q1, __m128i q2,
- __m128i q3) {
- _mm_storeu_si128((__m128i *)out + 0, q0);
- _mm_storeu_si128((__m128i *)out + 1, q1);
- _mm_storeu_si128((__m128i *)out + 2, q2);
- _mm_storeu_si128((__m128i *)out + 3, q3);
-}
-
-static INLINE void od_store_buffer_2x16_epi16(int16_t *out, __m256i r0,
- __m256i r1) {
- _mm256_storeu_si256((__m256i *)out + 0, r0);
- _mm256_storeu_si256((__m256i *)out + 1, r1);
-}
-
-/* Loads a 4x4 buffer of 16-bit values, adds a 4x4 block of 16-bit values to
- them, clamps to high bit depth, and stores the sum back. */
-static INLINE void od_add_store_buffer_hbd_4x4_epi16(void *output_pixels,
- int output_stride,
- __m128i q0, __m128i q1,
- __m128i q2, __m128i q3,
- int bd) {
- uint16_t *output_pixels16;
- __m128i p0;
- __m128i p1;
- __m128i p2;
- __m128i p3;
- __m128i max;
- __m128i round;
- int downshift;
- output_pixels16 = CONVERT_TO_SHORTPTR(output_pixels);
- max = od_hbd_max_epi16(bd);
- downshift = TX_COEFF_DEPTH - bd;
- round = _mm_set1_epi16((1 << downshift) >> 1);
- p0 = _mm_loadl_epi64((const __m128i *)(output_pixels16 + 0 * output_stride));
- p1 = _mm_loadl_epi64((const __m128i *)(output_pixels16 + 1 * output_stride));
- p2 = _mm_loadl_epi64((const __m128i *)(output_pixels16 + 2 * output_stride));
- p3 = _mm_loadl_epi64((const __m128i *)(output_pixels16 + 3 * output_stride));
- q0 = _mm_srai_epi16(_mm_add_epi16(q0, round), downshift);
- q1 = _mm_srai_epi16(_mm_add_epi16(q1, round), downshift);
- q2 = _mm_srai_epi16(_mm_add_epi16(q2, round), downshift);
- q3 = _mm_srai_epi16(_mm_add_epi16(q3, round), downshift);
- p0 = od_hbd_clamp_epi16(_mm_add_epi16(p0, q0), max);
- p1 = od_hbd_clamp_epi16(_mm_add_epi16(p1, q1), max);
- p2 = od_hbd_clamp_epi16(_mm_add_epi16(p2, q2), max);
- p3 = od_hbd_clamp_epi16(_mm_add_epi16(p3, q3), max);
- _mm_storel_epi64((__m128i *)(output_pixels16 + 0 * output_stride), p0);
- _mm_storel_epi64((__m128i *)(output_pixels16 + 1 * output_stride), p1);
- _mm_storel_epi64((__m128i *)(output_pixels16 + 2 * output_stride), p2);
- _mm_storel_epi64((__m128i *)(output_pixels16 + 3 * output_stride), p3);
-}
-
-/* Loads an 8x4 buffer of 16-bit values, adds a 8x4 block of 16-bit values to
- them, clamps to the high bit depth max, and stores the sum back. */
-static INLINE void od_add_store_buffer_hbd_8x4_epi16(void *output_pixels,
- int output_stride,
- __m128i q0, __m128i q1,
- __m128i q2, __m128i q3,
- int bd) {
- uint16_t *output_pixels16;
- __m128i p0;
- __m128i p1;
- __m128i p2;
- __m128i p3;
- __m128i max;
- __m128i round;
- int downshift;
- output_pixels16 = CONVERT_TO_SHORTPTR(output_pixels);
- max = od_hbd_max_epi16(bd);
- downshift = TX_COEFF_DEPTH - bd;
- round = _mm_set1_epi16((1 << downshift) >> 1);
- p0 = _mm_loadu_si128((const __m128i *)(output_pixels16 + 0 * output_stride));
- p1 = _mm_loadu_si128((const __m128i *)(output_pixels16 + 1 * output_stride));
- p2 = _mm_loadu_si128((const __m128i *)(output_pixels16 + 2 * output_stride));
- p3 = _mm_loadu_si128((const __m128i *)(output_pixels16 + 3 * output_stride));
- q0 = _mm_srai_epi16(_mm_add_epi16(q0, round), downshift);
- q1 = _mm_srai_epi16(_mm_add_epi16(q1, round), downshift);
- q2 = _mm_srai_epi16(_mm_add_epi16(q2, round), downshift);
- q3 = _mm_srai_epi16(_mm_add_epi16(q3, round), downshift);
- p0 = od_hbd_clamp_epi16(_mm_add_epi16(p0, q0), max);
- p1 = od_hbd_clamp_epi16(_mm_add_epi16(p1, q1), max);
- p2 = od_hbd_clamp_epi16(_mm_add_epi16(p2, q2), max);
- p3 = od_hbd_clamp_epi16(_mm_add_epi16(p3, q3), max);
- _mm_storeu_si128((__m128i *)(output_pixels16 + 0 * output_stride), p0);
- _mm_storeu_si128((__m128i *)(output_pixels16 + 1 * output_stride), p1);
- _mm_storeu_si128((__m128i *)(output_pixels16 + 2 * output_stride), p2);
- _mm_storeu_si128((__m128i *)(output_pixels16 + 3 * output_stride), p3);
-}
-
-static INLINE void od_add_store_buffer_hbd_16x4_epi16(void *output_pixels,
- int output_stride,
- __m256i r0, __m256i r1,
- __m256i r2, __m256i r3,
- int bd) {
- uint16_t *output_pixels16;
- __m256i p0;
- __m256i p1;
- __m256i p2;
- __m256i p3;
- __m256i max;
- __m256i round;
- int downshift;
- output_pixels16 = CONVERT_TO_SHORTPTR(output_pixels);
- max = od_mm256_hbd_max_epi16(bd);
- downshift = TX_COEFF_DEPTH - bd;
- round = _mm256_set1_epi16((1 << downshift) >> 1);
- p0 = _mm256_loadu_si256(
- (const __m256i *)(output_pixels16 + 0 * output_stride));
- p1 = _mm256_loadu_si256(
- (const __m256i *)(output_pixels16 + 1 * output_stride));
- p2 = _mm256_loadu_si256(
- (const __m256i *)(output_pixels16 + 2 * output_stride));
- p3 = _mm256_loadu_si256(
- (const __m256i *)(output_pixels16 + 3 * output_stride));
- r0 = _mm256_srai_epi16(_mm256_add_epi16(r0, round), downshift);
- r1 = _mm256_srai_epi16(_mm256_add_epi16(r1, round), downshift);
- r2 = _mm256_srai_epi16(_mm256_add_epi16(r2, round), downshift);
- r3 = _mm256_srai_epi16(_mm256_add_epi16(r3, round), downshift);
- p0 = od_mm256_hbd_clamp_epi16(_mm256_add_epi16(p0, r0), max);
- p1 = od_mm256_hbd_clamp_epi16(_mm256_add_epi16(p1, r1), max);
- p2 = od_mm256_hbd_clamp_epi16(_mm256_add_epi16(p2, r2), max);
- p3 = od_mm256_hbd_clamp_epi16(_mm256_add_epi16(p3, r3), max);
- _mm256_storeu_si256((__m256i *)(output_pixels16 + 0 * output_stride), p0);
- _mm256_storeu_si256((__m256i *)(output_pixels16 + 1 * output_stride), p1);
- _mm256_storeu_si256((__m256i *)(output_pixels16 + 2 * output_stride), p2);
- _mm256_storeu_si256((__m256i *)(output_pixels16 + 3 * output_stride), p3);
-}
-
-static INLINE void od_transpose_pack4x4(__m128i *q0, __m128i *q1, __m128i *q2,
- __m128i *q3) {
- __m128i a;
- __m128i b;
- __m128i c;
- __m128i d;
- /* Input:
- q0: q30 q20 q10 q00
- q1: q31 q21 q11 q01
- q2: q32 q22 q12 q02
- q3: q33 q23 q13 q03
- */
- /* a: q32 q22 q12 q02 q30 q20 q10 q00 */
- a = _mm_packs_epi32(*q0, *q2);
- /* b: q33 q23 q13 q03 q31 q21 q11 q01 */
- b = _mm_packs_epi32(*q1, *q3);
- /* c: q31 q30 q21 q20 q11 q10 q01 q00 */
- c = _mm_unpacklo_epi16(a, b);
- /* d: q33 q32 q23 q22 q13 q12 q03 q02 */
- d = _mm_unpackhi_epi16(a, b);
- /* We don't care about the contents of the high half of each register. */
- /* q0: q13 q12 q11 q10 [q03 q02 q01 q00] */
- *q0 = _mm_unpacklo_epi32(c, d);
- /* q1: q13 q12 q11 q10 [q13 q12 q11 q10] */
- *q1 = _mm_unpackhi_epi64(*q0, *q0);
- /* q2: q33 q32 q31 q30 [q23 q22 q21 q20] */
- *q2 = _mm_unpackhi_epi32(c, d);
- /* q3: q33 q32 q31 q30 [q33 q32 q31 q30] */
- *q3 = _mm_unpackhi_epi64(*q2, *q2);
-}
-
-static INLINE void od_transpose4x4(__m128i *q0, __m128i q1, __m128i *q2,
- __m128i q3) {
- __m128i a;
- __m128i b;
- /* Input:
- q0: ... ... ... ... q30 q20 q10 q00
- q1: ... ... ... ... q31 q21 q11 q01
- q2: ... ... ... ... q32 q22 q12 q02
- q3: ... ... ... ... q33 q23 q13 q03
- */
- /* a: q31 q30 q21 q20 q11 q10 q01 q00 */
- a = _mm_unpacklo_epi16(*q0, q1);
- /* b: q33 q32 q23 q22 q13 q12 q03 q02 */
- b = _mm_unpacklo_epi16(*q2, q3);
- /* q0: q13 q12 q11 q10 | q03 q02 q01 q00 */
- *q0 = _mm_unpacklo_epi32(a, b);
- /* q2: q33 q32 q31 q30 | q23 q22 q21 q20 */
- *q2 = _mm_unpackhi_epi32(a, b);
-}
-
-static inline void od_transpose4x8(__m128i *r0, __m128i r1, __m128i *r2,
- __m128i r3, __m128i *r4, __m128i r5,
- __m128i *r6, __m128i r7) {
- __m128i a;
- __m128i b;
- /* Input:
- q0: ... ... ... ... q30 q20 q10 q00
- q1: ... ... ... ... q31 q21 q11 q01
- q2: ... ... ... ... q32 q22 q12 q02
- q3: ... ... ... ... q33 q23 q13 q03
- q4: ... ... ... ... q34 q24 q14 q04
- q5: ... ... ... ... q35 q25 q15 q05
- q6: ... ... ... ... q36 q26 q16 q06
- q7: ... ... ... ... q37 q27 q17 q07
- */
- /* r0: r13 r12 11 r10 r03 r02 r01 r00
- r2: r33 r32 31 r30 r23 r22 r21 r20 */
- od_transpose4x4(r0, r1, r2, r3);
- /* r4: r17 r16 15 r14 r07 r06 r05 r04
- r6: r37 r36 35 r34 r27 r26 r25 r24 */
- od_transpose4x4(r4, r5, r6, r7);
- a = *r0;
- b = *r2;
- /* r0: r07 r06 r05 r04 r04 r02 r01 r00 */
- *r0 = _mm_unpacklo_epi64(a, *r4);
- /* r2: r17 r16 r15 r14 r14 r12 r11 r10 */
- *r2 = _mm_unpackhi_epi64(a, *r4);
- /* r4: r27 r26 r25 r24 r24 r22 r21 r20 */
- *r4 = _mm_unpacklo_epi64(b, *r6);
- /* r6: r37 r36 r35 r34 r34 r32 r31 r30 */
- *r6 = _mm_unpackhi_epi64(b, *r6);
-}
-
-static INLINE void od_transpose8x4(__m128i *q0, __m128i *q1, __m128i *q2,
- __m128i *q3) {
- __m128i a;
- __m128i b;
- __m128i c;
- __m128i d;
- /* Input:
- q0: q07 q06 q05 q04 q03 q02 q01 q00
- q1: q17 q16 q15 q14 q13 q12 q11 q10
- q2: q27 q26 q25 q24 q23 q22 q21 q20
- q3: q37 q36 q35 q34 q33 q32 q31 q30
- */
- /* a: q13 q03 q12 q02 q11 q01 q10 q00 */
- a = _mm_unpacklo_epi16(*q0, *q1);
- /* b: q17 q07 q16 q06 q15 q05 q14 q04 */
- b = _mm_unpackhi_epi16(*q0, *q1);
- /* c: q33 q23 q32 q22 q31 q21 q30 q20 */
- c = _mm_unpacklo_epi16(*q2, *q3);
- /* d: q37 q27 q36 q26 q35 q25 q34 q24 */
- d = _mm_unpackhi_epi16(*q2, *q3);
- /* q0: q31 q21 q11 q01 | q30 q20 q10 q00 */
- *q0 = _mm_unpacklo_epi32(a, c);
- /* q1: q33 q23 q13 q03 | q32 q22 q12 q02 */
- *q1 = _mm_unpackhi_epi32(a, c);
- /* q2: q35 q25 q15 q05 | q34 q24 q14 q04 */
- *q2 = _mm_unpacklo_epi32(b, d);
- /* q3: q37 q27 q17 q07 | q36 q26 q16 q06 */
- *q3 = _mm_unpackhi_epi32(b, d);
-}
-
-static INLINE void od_transpose_pack4x8(__m128i *q0, __m128i *q1, __m128i *q2,
- __m128i *q3, __m128i q4, __m128i q5,
- __m128i q6, __m128i q7) {
- __m128i a;
- __m128i b;
- __m128i c;
- __m128i d;
- /* Input:
- q0: q30 q20 q10 q00
- q1: q31 q21 q11 q01
- q2: q32 q22 q12 q02
- q3: q33 q23 q13 q03
- q4: q34 q24 q14 q04
- q5: q35 q25 q15 q05
- q6: q36 q26 q16 q06
- q7: q37 q27 q17 q07
- */
- /* a: q34 q24 q14 q04 q30 q20 q10 q00 */
- a = _mm_packs_epi32(*q0, q4);
- /* b: q35 q25 q15 q05 q31 q21 q11 q01 */
- b = _mm_packs_epi32(*q1, q5);
- /* c: q36 q26 q16 q06 q32 q22 q12 q02 */
- c = _mm_packs_epi32(*q2, q6);
- /* d: q37 q27 q17 q07 q33 q23 q13 q03 */
- d = _mm_packs_epi32(*q3, q7);
- /* a: q13 q12 q11 q10 q03 q02 q01 q00
- b: q33 q32 q31 q30 q33 q22 q21 q20
- c: q53 q52 q51 q50 q43 q42 q41 q40
- d: q73 q72 q71 q70 q63 q62 q61 q60 */
- od_transpose8x4(&a, &b, &c, &d);
- /* q0: q07 q06 q05 q04 q03 q02 q01 q00 */
- *q0 = _mm_unpacklo_epi64(a, c);
- /* q1: q17 q16 q15 q14 q13 q12 q11 q10 */
- *q1 = _mm_unpackhi_epi64(a, c);
- /* q2: q27 q26 q25 q24 q23 q22 q21 q20 */
- *q2 = _mm_unpacklo_epi64(b, d);
- /* q3: q37 q36 q35 q34 q33 q32 q31 q30 */
- *q3 = _mm_unpackhi_epi64(b, d);
-}
-
-static INLINE void od_transpose_pack8x4(__m128i *r0, __m128i *r1, __m128i *r2,
- __m128i *r3, __m128i *r4, __m128i *r5,
- __m128i *r6, __m128i *r7) {
- /* Input:
- r1: r07 r06 r05 r04 r0: r03 r02 r01 r00
- r3: r17 r16 r15 r14 r2: r13 r12 r11 r10
- r5: r27 r26 r25 r24 r4: r23 r22 r21 r20
- r7: r37 r36 r35 r34 r6: r33 r32 r31 r30
- */
- /* r0: r07 r06 r05 r04 r03 r02 r01 r00 */
- *r0 = _mm_packs_epi32(*r0, *r1);
- /* r2: r17 r16 r15 r14 r13 r12 r11 r10 */
- *r2 = _mm_packs_epi32(*r2, *r3);
- /* r4: r27 r26 r25 r24 r23 r22 r21 r20 */
- *r4 = _mm_packs_epi32(*r4, *r5);
- /* r6: r37 r36 r35 r34 r33 r32 r31 r30 */
- *r6 = _mm_packs_epi32(*r6, *r7);
- /* r0: r31 r21 r11 r01 [r30 r20 r10 r00]
- r2: r33 r23 r13 r03 [r32 r22 r12 r02]
- r4: r35 r25 r15 r05 [r34 r24 r14 r04]
- r6: r37 r27 r17 r07 [r36 r26 r16 r06] */
- od_transpose8x4(r0, r2, r4, r6);
- /* We don't care about the contents of the high half of each register. */
- /* r1: r31 r21 r11 r01 [r31 r21 r11 r01] */
- *r1 = _mm_unpackhi_epi64(*r0, *r0);
- /* r3: r33 r23 r13 r03 [r33 r23 r13 r03] */
- *r3 = _mm_unpackhi_epi64(*r2, *r2);
- /* r5: r35 r25 r15 r05 [r35 r25 r15 r05] */
- *r5 = _mm_unpackhi_epi64(*r4, *r4);
- /* r7: r37 r27 r17 r07 [r37 r27 r17 r07] */
- *r7 = _mm_unpackhi_epi64(*r6, *r6);
-}
-
-static INLINE void od_transpose8x8_epi16(__m128i *r0, __m128i *r1, __m128i *r2,
- __m128i *r3, __m128i *r4, __m128i *r5,
- __m128i *r6, __m128i *r7) {
- __m128i r8;
- /*8x8 transpose with only 1 temporary register that takes the rows in order
- and returns the columns in order. The compiler's own register allocator
- will probably screw this up, but that's no reason not to pretend we might
- be able to have nice things. This only matters when we port to pre-AVX
- instruction sets without 3-operand instructions.*/
- r8 = *r4;
- *r4 = _mm_unpacklo_epi16(*r4, *r5);
- r8 = _mm_unpackhi_epi16(r8, *r5);
- *r5 = *r0;
- *r0 = _mm_unpacklo_epi16(*r0, *r1);
- *r5 = _mm_unpackhi_epi16(*r5, *r1);
- *r1 = *r6;
- *r6 = _mm_unpacklo_epi16(*r6, *r7);
- *r1 = _mm_unpackhi_epi16(*r1, *r7);
- *r7 = *r2;
- *r2 = _mm_unpackhi_epi16(*r2, *r3);
- *r7 = _mm_unpacklo_epi16(*r7, *r3);
- *r3 = *r0;
- *r0 = _mm_unpacklo_epi32(*r0, *r7);
- *r3 = _mm_unpackhi_epi32(*r3, *r7);
- *r7 = *r5;
- *r5 = _mm_unpacklo_epi32(*r5, *r2);
- *r7 = _mm_unpackhi_epi32(*r7, *r2);
- *r2 = *r4;
- *r4 = _mm_unpackhi_epi32(*r4, *r6);
- *r2 = _mm_unpacklo_epi32(*r2, *r6);
- *r6 = r8;
- r8 = _mm_unpackhi_epi32(r8, *r1);
- *r6 = _mm_unpacklo_epi32(*r6, *r1);
- *r1 = *r0;
- *r0 = _mm_unpacklo_epi64(*r0, *r2);
- *r1 = _mm_unpackhi_epi64(*r1, *r2);
- *r2 = *r3;
- *r3 = _mm_unpackhi_epi64(*r3, *r4);
- *r2 = _mm_unpacklo_epi64(*r2, *r4);
- *r4 = *r5;
- *r5 = _mm_unpackhi_epi64(*r5, *r6);
- *r4 = _mm_unpacklo_epi64(*r4, *r6);
- *r6 = *r7;
- *r7 = _mm_unpackhi_epi64(*r7, r8);
- *r6 = _mm_unpacklo_epi64(*r6, r8);
-}
-
-static INLINE void od_transpose8x8_epi32(__m256i *r0, __m256i *r1, __m256i *r2,
- __m256i *r3, __m256i *r4, __m256i *r5,
- __m256i *r6, __m256i *r7) {
- __m256i a;
- __m256i b;
- __m256i c;
- __m256i d;
- __m256i e;
- __m256i f;
- __m256i g;
- __m256i h;
- __m256i x;
- __m256i y;
- a = _mm256_unpacklo_epi32(*r0, *r1);
- b = _mm256_unpacklo_epi32(*r2, *r3);
- c = _mm256_unpackhi_epi32(*r0, *r1);
- d = _mm256_unpackhi_epi32(*r2, *r3);
- e = _mm256_unpacklo_epi32(*r4, *r5);
- f = _mm256_unpacklo_epi32(*r6, *r7);
- g = _mm256_unpackhi_epi32(*r4, *r5);
- h = _mm256_unpackhi_epi32(*r6, *r7);
- x = _mm256_unpacklo_epi64(a, b);
- y = _mm256_unpacklo_epi64(e, f);
- *r0 = _mm256_permute2x128_si256(x, y, 0 | (2 << 4));
- *r4 = _mm256_permute2x128_si256(x, y, 1 | (3 << 4));
- x = _mm256_unpackhi_epi64(a, b);
- y = _mm256_unpackhi_epi64(e, f);
- *r1 = _mm256_permute2x128_si256(x, y, 0 | (2 << 4));
- *r5 = _mm256_permute2x128_si256(x, y, 1 | (3 << 4));
- x = _mm256_unpacklo_epi64(c, d);
- y = _mm256_unpacklo_epi64(g, h);
- *r2 = _mm256_permute2x128_si256(x, y, 0 | (2 << 4));
- *r6 = _mm256_permute2x128_si256(x, y, 1 | (3 << 4));
- x = _mm256_unpackhi_epi64(c, d);
- y = _mm256_unpackhi_epi64(g, h);
- *r3 = _mm256_permute2x128_si256(x, y, 0 | (2 << 4));
- *r7 = _mm256_permute2x128_si256(x, y, 1 | (3 << 4));
-}
-
-/* Packs two blocks of 4x8 32-bit words into 16-bit words and returns the
- transpose of each packed into the high and low halves of each register. */
-static INLINE void od_transpose_pack4x8x2_epi32(__m256i *out0, __m256i *out1,
- __m256i *out2, __m256i *out3,
- __m256i rr0, __m256i rr1,
- __m256i rr2, __m256i rr3,
- __m256i rr4, __m256i rr5,
- __m256i rr6, __m256i rr7) {
- __m256i a;
- __m256i b;
- __m256i c;
- __m256i d;
- __m256i w;
- __m256i x;
- __m256i y;
- __m256i z;
- /* a: r47 r46 r45 r44 r07 r06 r05 r04 | r43 r42 r41 r40 r03 r02 r01 r00 */
- a = _mm256_packs_epi32(rr0, rr4);
- /* b: r57 r56 r55 r54 r17 r16 r15 r14 | r53 r52 r51 r50 r13 r12 r11 r10 */
- b = _mm256_packs_epi32(rr1, rr5);
- /* c: r67 r66 r65 r64 r27 r26 r25 r24 | r63 r62 r61 r60 r23 r22 r21 r20 */
- c = _mm256_packs_epi32(rr2, rr6);
- /* d: r77 r76 r75 r74 r37 r36 r35 r34 | r73 r72 r71 r70 r33 r32 r31 r30 */
- d = _mm256_packs_epi32(rr3, rr7);
- /* w: r17 r07 r16 r06 r15 r05 r14 r04 | r13 r03 r12 r02 r11 r01 r10 r00 */
- w = _mm256_unpacklo_epi16(a, b);
- /* x: r57 r47 r56 r46 r55 r45 r54 r44 | r53 r43 r52 r42 r51 r41 r50 r40 */
- x = _mm256_unpackhi_epi16(a, b);
- /* y: r37 r27 r36 r26 r35 r25 r34 r24 | r33 r23 r32 r22 r31 r21 r30 r20 */
- y = _mm256_unpacklo_epi16(c, d);
- /* z: r77 r67 r76 r66 r75 r65 r74 r64 | r73 r63 r72 r62 r71 r61 r70 r60 */
- z = _mm256_unpackhi_epi16(c, d);
- /* a: r35 r25 r15 r05 r34 r24 r14 r04 | r31 r21 r11 r01 r30 r20 r10 r00 */
- a = _mm256_unpacklo_epi32(w, y);
- /* b: r77 r67 r57 r47 r76 r66 r56 r46 | r33 r23 r13 r03 r32 r22 r12 r02 */
- b = _mm256_unpackhi_epi32(w, y);
- /* c: r75 r65 r55 r45 r74 r64 r54 r44 | r71 r61 r51 r41 r70 r60 r50 r40 */
- c = _mm256_unpacklo_epi32(x, z);
- /* d: r77 r67 r57 r47 r76 r66 r56 r46 | r73 r63 r53 r43 r72 r62 r52 r42 */
- d = _mm256_unpackhi_epi32(x, z);
- /* out0: r74 r64 r54 r44 r34 r24 r14 r04 | r70 r60 r50 r40 r30 r20 r10 r00 */
- *out0 = _mm256_unpacklo_epi64(a, c);
- /* out1: r75 r65 r55 r45 r35 r25 r15 r05 | r71 r61 r51 r41 r31 r21 r11 r01 */
- *out1 = _mm256_unpackhi_epi64(a, c);
- /* out2: r76 r66 r56 r46 r36 r26 r16 r06 | r72 r62 r52 r42 r32 r22 r12 r02 */
- *out2 = _mm256_unpacklo_epi64(b, d);
- /* out3: r77 r67 r57 r47 r37 r27 r17 r07 | r73 r63 r53 r43 r33 r23 r13 r03 */
- *out3 = _mm256_unpackhi_epi64(b, d);
-}
-
-static INLINE void od_transpose_pack8x8_epi32(__m256i *rr0, __m256i *rr1,
- __m256i *rr2, __m256i *rr3,
- __m256i rr4, __m256i rr5,
- __m256i rr6, __m256i rr7) {
- __m256i w;
- __m256i x;
- __m256i y;
- __m256i z;
- /* w: r74 r64 r54 r44 r34 r24 r14 r04 | r70 r60 r50 r40 r30 r20 r10 r00
- x: r75 r65 r55 r45 r35 r25 r15 r05 | r71 r61 r51 r41 r31 r21 r11 r01
- y: r76 r66 r56 r46 r36 r26 r16 r06 | r72 r62 r52 r42 r32 r22 r12 r02
- z: r77 r67 r57 r47 r37 r27 r17 r07 | r73 r63 r53 r43 r33 r23 r13 r03 */
- od_transpose_pack4x8x2_epi32(&w, &x, &y, &z, *rr0, *rr1, *rr2, *rr3, rr4, rr5,
- rr6, rr7);
- /* rr0: r71 r61 r51 r41 r31 r21 r11 r01 | r70 r60 r50 r40 r30 r20 r10 r00 */
- *rr0 = _mm256_permute2x128_si256(w, x, 0 | (2 << 4));
- /* rr1: r73 r63 r53 r43 r33 r23 r13 r03 | r72 r62 r52 r42 r32 r22 r12 r02 */
- *rr1 = _mm256_permute2x128_si256(y, z, 0 | (2 << 4));
- /* rr2: r75 r65 r55 r45 r35 r25 r15 r05 r74 r64 r54 r44 r34 r24 r14 r04 */
- *rr2 = _mm256_permute2x128_si256(w, x, 1 | (3 << 4));
- /* rr3: r77 r67 r57 r47 r37 r27 r17 r07 r76 r66 r56 r46 r36 r26 r16 r06 */
- *rr3 = _mm256_permute2x128_si256(y, z, 1 | (3 << 4));
-}
-
-static INLINE void od_transpose_pack8x16_epi32(
- __m256i *ss0, __m256i *ss1, __m256i *ss2, __m256i *ss3, __m256i *ss4,
- __m256i *ss5, __m256i *ss6, __m256i *ss7, __m256i ss8, __m256i ss9,
- __m256i ssa, __m256i ssb, __m256i ssc, __m256i ssd, __m256i sse,
- __m256i ssf) {
- __m256i a;
- __m256i b;
- __m256i c;
- __m256i d;
- __m256i e;
- __m256i f;
- __m256i g;
- __m256i h;
- /* ss0: s74 s64 s54 s44 s34 s24 s14 s04 | s70 s60 s50 s40 s30 s20 s10 s00
- ss2: s75 s65 s55 s45 s35 s25 s15 s05 | s71 s61 s51 s41 s31 s21 s11 s01
- ss4: s76 s66 s56 s46 s36 s26 s16 s06 | s72 s62 s52 s42 s32 s22 s12 s02
- ss6: s77 s67 s57 s47 s37 s27 s17 s07 | s73 s63 s53 s43 s33 s23 s13 s03 */
- od_transpose_pack4x8x2_epi32(&a, &b, &c, &d, *ss0, *ss1, *ss2, *ss3, *ss4,
- *ss5, *ss6, *ss7);
- /* ss8: sf4 se4 sd4 sc4 sb4 sa4 s94 s84 | sf0 se0 sd0 sc0 sb0 sa0 s90 s80
- ssa: sf5 se5 sd5 sc5 sb5 sa5 s95 s85 | sf1 se1 sd1 sc1 sb1 sa1 s91 s81
- ssc: sf6 se6 sd6 sc6 sb6 sa6 s96 s86 | sf2 se2 sd2 sc2 sb2 sa2 s92 s82
- sse: sf7 se7 sd7 sc7 sb7 sa7 s97 s87 | sf3 se3 sd3 sc3 sb3 sa3 s93 s83 */
- od_transpose_pack4x8x2_epi32(&e, &f, &g, &h, ss8, ss9, ssa, ssb, ssc, ssd,
- sse, ssf);
- /* ss0: sf0 se0 sd0 sc0 sb0 sa0 s90 s80 | s70 s60 s50 s40 s30 s20 s10 s00 */
- *ss0 = _mm256_permute2x128_si256(a, e, 0 | (2 << 4));
- /* ss1: sf1 se1 sd1 sc1 sb1 sa1 s91 s81 | s71 s61 s51 s41 s31 s21 s11 s01 */
- *ss1 = _mm256_permute2x128_si256(b, f, 0 | (2 << 4));
- /* ss2: sf2 se2 sd2 sc2 sb2 sa2 s92 s82 | s72 s62 s52 s42 s32 s22 s12 s02 */
- *ss2 = _mm256_permute2x128_si256(c, g, 0 | (2 << 4));
- /* ss3: sf3 se3 sd3 sc3 sb3 sa3 s93 s83 | s73 s63 s53 s43 s33 s23 s13 s03 */
- *ss3 = _mm256_permute2x128_si256(d, h, 0 | (2 << 4));
- /* ss4: sf4 se4 sd4 sc4 sb4 sa4 s94 s84 | s74 s64 s54 s44 s34 s24 s14 s04 */
- *ss4 = _mm256_permute2x128_si256(a, e, 1 | (3 << 4));
- /* ss5: sf5 se5 sd5 sc5 sb5 sa5 s95 s85 | s75 s65 s55 s45 s35 s25 s15 s05 */
- *ss5 = _mm256_permute2x128_si256(b, f, 1 | (3 << 4));
- /* ss6: rf6 re6 rd6 rc6 rb6 ra6 r96 r82 | r76 r66 r56 r46 r36 r26 r16 r06 */
- *ss6 = _mm256_permute2x128_si256(c, g, 1 | (3 << 4));
- /* ss7: rf7 re7 rd7 rc7 rb7 ra7 r97 r87 | r77 r67 r57 r47 r37 r27 r17 r07 */
- *ss7 = _mm256_permute2x128_si256(d, h, 1 | (3 << 4));
-}
-
-#undef OD_KERNEL
-#undef OD_WORD
-#undef OD_REG
-#undef OD_ADD
-#undef OD_SUB
-#undef OD_RSHIFT1
-#undef OD_AVG
-#undef OD_HRSUB
-#undef OD_MUL
-#undef OD_SWAP
-
-/* Define 8-wide 16-bit SSSE3 kernels. */
-
-#define OD_KERNEL kernel8
-#define OD_WORD epi16
-#define OD_REG __m128i
-#define OD_ADD _mm_add_epi16
-#define OD_SUB _mm_sub_epi16
-#define OD_RSHIFT1 od_unbiased_rshift1_epi16
-#define OD_AVG od_avg_epi16
-#define OD_HRSUB od_hrsub_epi16
-#define OD_MUL od_mul_epi16
-#define OD_SWAP od_swap_si128
-
-#include "av1/common/x86/daala_tx_kernels.h"
-
-#undef OD_KERNEL
-#undef OD_REG
-#undef OD_ADD
-#undef OD_SUB
-#undef OD_RSHIFT1
-#undef OD_AVG
-#undef OD_HRSUB
-#undef OD_MUL
-#undef OD_SWAP
-
-/* Define 16-wide 16-bit AVX2 kernels. */
-
-#define OD_KERNEL kernel16
-#define OD_REG __m256i
-#define OD_ADD _mm256_add_epi16
-#define OD_SUB _mm256_sub_epi16
-#define OD_RSHIFT1 od_mm256_unbiased_rshift1_epi16
-#define OD_AVG od_mm256_avg_epi16
-#define OD_HRSUB od_mm256_hrsub_epi16
-#define OD_MUL od_mm256_mul_epi16
-#define OD_SWAP od_mm256_swap_si256
-
-#include "av1/common/x86/daala_tx_kernels.h" // NOLINT
-
-/* Define 8-wide 32-bit AVX2 kernels. */
-
-#undef OD_KERNEL
-#undef OD_WORD
-#undef OD_ADD
-#undef OD_SUB
-#undef OD_RSHIFT1
-#undef OD_AVG
-#undef OD_HRSUB
-#undef OD_MUL
-
-#define OD_KERNEL kernel8
-#define OD_WORD epi32
-#define OD_ADD _mm256_add_epi32
-#define OD_SUB _mm256_sub_epi32
-#define OD_RSHIFT1 od_mm256_unbiased_rshift1_epi32
-#define OD_AVG od_mm256_avg_epi32
-#define OD_HRSUB od_mm256_hrsub_epi32
-#define OD_MUL od_mm256_mul_epi32
-
-#include "av1/common/x86/daala_tx_kernels.h" // NOLINT
-
-static void od_row_iidtx_avx2(int16_t *out, int coeffs, const tran_low_t *in) {
- int c;
- /* The number of rows and number of columns are both multiples of 4, so the
- total number of coefficients should be a multiple of 16. */
- assert(!(coeffs & 0xF));
- /* TODO(any): Use AVX2 for larger block sizes. */
- for (c = 0; c < coeffs; c += 16) {
- __m128i q0;
- __m128i q1;
- __m128i q2;
- __m128i q3;
- od_load_buffer_4x4_epi32(&q0, &q1, &q2, &q3, in + c);
- q0 = _mm_packs_epi32(q0, q1);
- q2 = _mm_packs_epi32(q2, q3);
- od_store_buffer_4x4_epi16(out + c, q0, q2);
- }
-}
-
-static void od_col_iidtx_add_hbd_avx2(unsigned char *output_pixels,
- int output_stride, int rows, int cols,
- const int16_t *in, int bd) {
- __m128i q0;
- __m128i q1;
- __m128i q2;
- __m128i q3;
- if (cols <= 4) {
- uint16_t *output_pixels16;
- __m128i p0;
- __m128i p1;
- __m128i p2;
- __m128i p3;
- __m128i max;
- __m128i round;
- int downshift;
- int hr;
- output_pixels16 = CONVERT_TO_SHORTPTR(output_pixels);
- max = od_hbd_max_epi16(bd);
- downshift = TX_COEFF_DEPTH - bd;
- round = _mm_set1_epi16((1 << downshift) >> 1);
- /* Here hr counts half the number of rows, to simplify address calculations
- when loading two rows of coefficients at once. */
- for (hr = 0; 2 * hr < rows; hr += 2) {
- q0 = _mm_loadu_si128((const __m128i *)in + hr + 0);
- q2 = _mm_loadu_si128((const __m128i *)in + hr + 1);
- p0 = _mm_loadl_epi64(
- (const __m128i *)(output_pixels16 + (2 * hr + 0) * output_stride));
- p1 = _mm_loadl_epi64(
- (const __m128i *)(output_pixels16 + (2 * hr + 1) * output_stride));
- p2 = _mm_loadl_epi64(
- (const __m128i *)(output_pixels16 + (2 * hr + 2) * output_stride));
- p3 = _mm_loadl_epi64(
- (const __m128i *)(output_pixels16 + (2 * hr + 3) * output_stride));
- q0 = _mm_srai_epi16(_mm_add_epi16(q0, round), downshift);
- q2 = _mm_srai_epi16(_mm_add_epi16(q2, round), downshift);
- q1 = _mm_unpackhi_epi64(q0, q0);
- q3 = _mm_unpackhi_epi64(q2, q2);
- p0 = od_hbd_clamp_epi16(_mm_add_epi16(p0, q0), max);
- p1 = od_hbd_clamp_epi16(_mm_add_epi16(p1, q1), max);
- p2 = od_hbd_clamp_epi16(_mm_add_epi16(p2, q2), max);
- p3 = od_hbd_clamp_epi16(_mm_add_epi16(p3, q3), max);
- _mm_storel_epi64(
- (__m128i *)(output_pixels16 + (2 * hr + 0) * output_stride), p0);
- _mm_storel_epi64(
- (__m128i *)(output_pixels16 + (2 * hr + 1) * output_stride), p1);
- _mm_storel_epi64(
- (__m128i *)(output_pixels16 + (2 * hr + 2) * output_stride), p2);
- _mm_storel_epi64(
- (__m128i *)(output_pixels16 + (2 * hr + 3) * output_stride), p3);
- }
- } else {
- int r;
- for (r = 0; r < rows; r += 4) {
- int c;
- /* TODO(any): Use AVX2 for larger column counts. */
- for (c = 0; c < cols; c += 8) {
- od_load_buffer_8x4_epi16(&q0, &q1, &q2, &q3, in + r * cols + c, cols);
- od_add_store_buffer_hbd_8x4_epi16(output_pixels + r * output_stride + c,
- output_stride, q0, q1, q2, q3, bd);
- }
- }
- }
-}
-
-typedef void (*od_tx4_kernel8_epi16)(__m128i *q0, __m128i *q2, __m128i *q1,
- __m128i *q3);
-
-static void od_row_tx4_avx2(int16_t *out, int rows, const tran_low_t *in,
- od_tx4_kernel8_epi16 kernel8) {
- __m128i q0;
- __m128i q1;
- __m128i q2;
- __m128i q3;
- if (rows <= 4) {
- od_load_buffer_4x4_epi32(&q0, &q1, &q2, &q3, in);
- /*TODO(any): Merge this transpose with coefficient scanning.*/
- od_transpose_pack4x4(&q0, &q1, &q2, &q3);
- kernel8(&q0, &q1, &q2, &q3);
- od_transpose4x4(&q0, q2, &q1, q3);
- od_store_buffer_4x4_epi16(out, q0, q1);
- } else {
- int r;
- /* Higher row counts require 32-bit precision. */
- assert(rows <= 16);
- for (r = 0; r < rows; r += 8) {
- __m128i q4;
- __m128i q5;
- __m128i q6;
- __m128i q7;
- od_load_buffer_4x4_epi32(&q0, &q1, &q2, &q3, in + 4 * r);
- od_load_buffer_4x4_epi32(&q4, &q5, &q6, &q7, in + 4 * r + 16);
- /*TODO(any): Merge this transpose with coefficient scanning.*/
- od_transpose_pack4x8(&q0, &q1, &q2, &q3, q4, q5, q6, q7);
- kernel8(&q0, &q1, &q2, &q3);
- od_transpose8x4(&q0, &q2, &q1, &q3);
- od_store_buffer_4x8_epi16(out + 4 * r, q0, q2, q1, q3);
- }
- }
-}
-
-static void od_col_tx4_add_hbd_avx2(unsigned char *output_pixels,
- int output_stride, int cols,
- const int16_t *in, int bd,
- od_tx4_kernel8_epi16 kernel8) {
- __m128i q0;
- __m128i q1;
- __m128i q2;
- __m128i q3;
- if (cols <= 4) {
- od_load_buffer_4x4_epi16(&q0, &q1, &q2, &q3, in);
- kernel8(&q0, &q1, &q2, &q3);
- od_add_store_buffer_hbd_4x4_epi16(output_pixels, output_stride, q0, q2, q1,
- q3, bd);
- } else {
- int c;
- for (c = 0; c < cols; c += 8) {
- od_load_buffer_8x4_epi16(&q0, &q1, &q2, &q3, in + c, cols);
- kernel8(&q0, &q1, &q2, &q3);
- od_add_store_buffer_hbd_8x4_epi16(output_pixels + c, output_stride, q0,
- q2, q1, q3, bd);
- }
- }
-}
-
-#if 0
-static void od_row_idct4_avx2(int16_t *out, int rows, const tran_low_t *in) {
- od_row_tx4_avx2(out, rows, in, od_idct4_kernel8_epi16);
-}
-
-static void od_col_idct4_add_hbd_avx2(unsigned char *output_pixels,
- int output_stride, int cols,
- const int16_t *in, int bd) {
- od_col_tx4_add_hbd_avx2(output_pixels, output_stride, cols, in, bd,
- od_idct4_kernel8_epi16);
-}
-#endif
-
-static void od_row_idst4_avx2(int16_t *out, int rows, const tran_low_t *in) {
- od_row_tx4_avx2(out, rows, in, od_idst_vii4_kernel8_epi16);
-}
-
-static void od_col_idst4_add_hbd_avx2(unsigned char *output_pixels,
- int output_stride, int cols,
- const int16_t *in, int bd) {
- od_col_tx4_add_hbd_avx2(output_pixels, output_stride, cols, in, bd,
- od_idst_vii4_kernel8_epi16);
-}
-
-static void od_row_flip_idst4_avx2(int16_t *out, int rows,
- const tran_low_t *in) {
- od_row_tx4_avx2(out, rows, in, od_flip_idst_vii4_kernel8_epi16);
-}
-
-static void od_col_flip_idst4_add_hbd_avx2(unsigned char *output_pixels,
- int output_stride, int cols,
- const int16_t *in, int bd) {
- od_col_tx4_add_hbd_avx2(output_pixels, output_stride, cols, in, bd,
- od_flip_idst_vii4_kernel8_epi16);
-}
-
-static void od_row_iidtx4_avx2(int16_t *out, int rows, const tran_low_t *in) {
- od_row_iidtx_avx2(out, rows * 4, in);
-}
-
-static void od_col_iidtx4_add_hbd_avx2(unsigned char *output_pixels,
- int output_stride, int cols,
- const int16_t *in, int bd) {
- od_col_iidtx_add_hbd_avx2(output_pixels, output_stride, 4, cols, in, bd);
-}
-
-typedef void (*od_tx8_kernel8_epi16)(__m128i *r0, __m128i *r4, __m128i *r2,
- __m128i *r6, __m128i *r1, __m128i *r5,
- __m128i *r3, __m128i *r7);
-
-typedef void (*od_tx8_mm256_kernel)(__m256i *r0, __m256i *r4, __m256i *r2,
- __m256i *r6, __m256i *r1, __m256i *r5,
- __m256i *r3, __m256i *r7);
-
-#if 0
-static void od_row_tx8_avx2(int16_t *out, int rows, const tran_low_t *in,
- od_tx8_kernel8_epi16 kernel8_epi16,
- od_tx8_mm256_kernel kernel8_epi32) {
- __m128i r0;
- __m128i r1;
- __m128i r2;
- __m128i r3;
- __m128i r4;
- __m128i r5;
- __m128i r6;
- __m128i r7;
- if (rows <= 4) {
- od_load_buffer_4x4_epi32(&r0, &r1, &r2, &r3, in);
- od_load_buffer_4x4_epi32(&r4, &r5, &r6, &r7, in + 16);
- /*TODO(any): Merge this transpose with coefficient scanning.*/
- od_transpose_pack8x4(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
- kernel8_epi16(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
- od_transpose4x8(&r0, r4, &r2, r6, &r1, r5, &r3, r7);
- od_store_buffer_4x4_epi16(out, r0, r2);
- od_store_buffer_4x4_epi16(out + 16, r1, r3);
- } else if (rows <= 8) {
- od_load_pack_buffer_8x4_epi32(&r0, &r1, &r2, &r3, in);
- od_load_pack_buffer_8x4_epi32(&r4, &r5, &r6, &r7, in + 32);
- /*TODO(any): Merge this transpose with coefficient scanning.*/
- od_transpose8x8_epi16(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
- kernel8_epi16(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
- od_transpose8x8_epi16(&r0, &r4, &r2, &r6, &r1, &r5, &r3, &r7);
- od_store_buffer_4x8_epi16(out, r0, r4, r2, r6);
- od_store_buffer_4x8_epi16(out + 32, r1, r5, r3, r7);
- } else {
- int r;
- /* 16 or more rows requires 32-bit precision.
- TODO(any): If the column TX is IDTX, then we can still use 16 bits. */
- for (r = 0; r < rows; r += 8) {
- __m256i rr0;
- __m256i rr1;
- __m256i rr2;
- __m256i rr3;
- __m256i rr4;
- __m256i rr5;
- __m256i rr6;
- __m256i rr7;
- od_load_buffer_8x4_epi32(&rr0, &rr1, &rr2, &rr3, in + r * 8);
- od_load_buffer_8x4_epi32(&rr4, &rr5, &rr6, &rr7, in + r * 8 + 32);
- od_transpose8x8_epi32(&rr0, &rr1, &rr2, &rr3, &rr4, &rr5, &rr6, &rr7);
- kernel8_epi32(&rr0, &rr1, &rr2, &rr3, &rr4, &rr5, &rr6, &rr7);
- od_transpose_pack8x8_epi32(&rr0, &rr4, &rr2, &rr6, rr1, rr5, rr3, rr7);
- od_store_buffer_2x16_epi16(out + r * 8, rr0, rr4);
- od_store_buffer_2x16_epi16(out + r * 8 + 32, rr2, rr6);
- }
- }
-}
-
-static void od_col_tx8_add_hbd_avx2(unsigned char *output_pixels,
- int output_stride, int cols,
- const int16_t *in, int bd,
- od_tx8_kernel8_epi16 kernel8_epi16,
- od_tx8_mm256_kernel kernel16_epi16) {
- __m128i r0;
- __m128i r1;
- __m128i r2;
- __m128i r3;
- __m128i r4;
- __m128i r5;
- __m128i r6;
- __m128i r7;
- if (cols <= 4) {
- od_load_buffer_4x4_epi16(&r0, &r1, &r2, &r3, in);
- od_load_buffer_4x4_epi16(&r4, &r5, &r6, &r7, in + 16);
- kernel8_epi16(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
- od_add_store_buffer_hbd_4x4_epi16(output_pixels, output_stride, r0, r4, r2,
- r6, bd);
- od_add_store_buffer_hbd_4x4_epi16(output_pixels + 4 * output_stride,
- output_stride, r1, r5, r3, r7, bd);
- } else if (cols <= 8) {
- od_load_buffer_8x4_epi16(&r0, &r1, &r2, &r3, in, cols);
- od_load_buffer_8x4_epi16(&r4, &r5, &r6, &r7, in + 32, cols);
- kernel8_epi16(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
- od_add_store_buffer_hbd_8x4_epi16(output_pixels, output_stride, r0, r4, r2,
- r6, bd);
- od_add_store_buffer_hbd_8x4_epi16(output_pixels + 4 * output_stride,
- output_stride, r1, r5, r3, r7, bd);
- } else {
- __m256i rr0;
- __m256i rr1;
- __m256i rr2;
- __m256i rr3;
- __m256i rr4;
- __m256i rr5;
- __m256i rr6;
- __m256i rr7;
- int c;
- for (c = 0; c < cols; c += 16) {
- od_load_buffer_16x4_epi16(&rr0, &rr1, &rr2, &rr3, in + c, cols);
- od_load_buffer_16x4_epi16(&rr4, &rr5, &rr6, &rr7, in + 4 * cols + c,
- cols);
- kernel16_epi16(&rr0, &rr1, &rr2, &rr3, &rr4, &rr5, &rr6, &rr7);
- od_add_store_buffer_hbd_16x4_epi16(output_pixels, output_stride, rr0, rr4,
- rr2, rr6, bd);
- od_add_store_buffer_hbd_16x4_epi16(output_pixels + 4 * output_stride,
- output_stride, rr1, rr5, rr3, rr7, bd);
- }
- }
-}
-
-static void od_row_idct8_avx2(int16_t *out, int rows, const tran_low_t *in) {
- od_row_tx8_avx2(out, rows, in, od_idct8_kernel8_epi16,
- od_idct8_kernel8_epi32);
-}
-
-static void od_col_idct8_add_hbd_avx2(unsigned char *output_pixels,
- int output_stride, int cols,
- const int16_t *in, int bd) {
- od_col_tx8_add_hbd_avx2(output_pixels, output_stride, cols, in, bd,
- od_idct8_kernel8_epi16, od_idct8_kernel16_epi16);
-}
-
-static void od_row_idst8_avx2(int16_t *out, int rows, const tran_low_t *in) {
- od_row_tx8_avx2(out, rows, in, od_idst8_kernel8_epi16,
- od_idst8_kernel8_epi32);
-}
-
-static void od_col_idst8_add_hbd_avx2(unsigned char *output_pixels,
- int output_stride, int cols,
- const int16_t *in, int bd) {
- od_col_tx8_add_hbd_avx2(output_pixels, output_stride, cols, in, bd,
- od_idst8_kernel8_epi16, od_idst8_kernel16_epi16);
-}
-
-static void od_row_flip_idst8_avx2(int16_t *out, int rows,
- const tran_low_t *in) {
- od_row_tx8_avx2(out, rows, in, od_flip_idst8_kernel8_epi16,
- od_flip_idst8_kernel8_epi32);
-}
-
-static void od_col_flip_idst8_add_hbd_avx2(unsigned char *output_pixels,
- int output_stride, int cols,
- const int16_t *in, int bd) {
- od_col_tx8_add_hbd_avx2(output_pixels, output_stride, cols, in, bd,
- od_flip_idst8_kernel8_epi16,
- od_flip_idst8_kernel16_epi16);
-}
-#endif
-
-static void od_row_iidtx8_avx2(int16_t *out, int rows, const tran_low_t *in) {
- od_row_iidtx_avx2(out, rows * 8, in);
-}
-
-static void od_col_iidtx8_add_hbd_avx2(unsigned char *output_pixels,
- int output_stride, int cols,
- const int16_t *in, int bd) {
- od_col_iidtx_add_hbd_avx2(output_pixels, output_stride, 8, cols, in, bd);
-}
-
-typedef void (*od_tx16_kernel8_epi16)(__m128i *s0, __m128i *s4, __m128i *s2,
- __m128i *s6, __m128i *s1, __m128i *s5,
- __m128i *s3, __m128i *s7, __m128i *s8,
- __m128i *s9, __m128i *sa, __m128i *sb,
- __m128i *sc, __m128i *sd, __m128i *se,
- __m128i *sf);
-
-typedef void (*od_tx16_mm256_kernel)(__m256i *s0, __m256i *s4, __m256i *s2,
- __m256i *s6, __m256i *s1, __m256i *s5,
- __m256i *s3, __m256i *s7, __m256i *s8,
- __m256i *s9, __m256i *sa, __m256i *sb,
- __m256i *sc, __m256i *sd, __m256i *se,
- __m256i *sf);
-
-#if 0
-static void od_row_tx16_avx2(int16_t *out, int rows, const tran_low_t *in,
-#if CONFIG_RECT_TX_EXT
- od_tx16_kernel8_epi16 kernel8_epi16,
-#endif
- od_tx16_mm256_kernel kernel8_epi32) {
-#if CONFIG_RECT_TX_EXT
- if (rows <= 4) {
- __m128i s0;
- __m128i s1;
- __m128i s2;
- __m128i s3;
- __m128i s4;
- __m128i s5;
- __m128i s6;
- __m128i s7;
- __m128i s8;
- __m128i s9;
- __m128i sa;
- __m128i sb;
- __m128i sc;
- __m128i sd;
- __m128i se;
- __m128i sf;
- od_load_buffer_4x4_epi32(&s0, &s1, &s8, &s9, in);
- od_load_buffer_4x4_epi32(&s2, &s3, &sa, &sb, in + 16);
- od_load_buffer_4x4_epi32(&s4, &s5, &sc, &sd, in + 32);
- od_load_buffer_4x4_epi32(&s6, &s7, &se, &sf, in + 48);
- /*TODO(any): Merge this transpose with coefficient scanning.*/
- od_transpose_pack8x4(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
- od_transpose_pack8x4(&s8, &s9, &sa, &sb, &sc, &sd, &se, &sf);
- kernel8_epi16(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &sa, &sb,
- &sc, &sd, &se, &sf);
- od_transpose4x8(&s0, s8, &s4, sc, &s2, sa, &s6, se);
- od_transpose4x8(&s1, s9, &s5, sd, &s3, sb, &s7, sf);
- od_store_buffer_4x4_epi16(out, s0, s1);
- od_store_buffer_4x4_epi16(out + 16, s4, s5);
- od_store_buffer_4x4_epi16(out + 32, s2, s3);
- od_store_buffer_4x4_epi16(out + 48, s6, s7);
- return;
- }
-#endif // CONFIG_RECT_TX_EXT
- {
- int r;
- /* 8 or more rows requires 32-bit precision.
- TODO(any): If the column TX is IDTX, then we can still use 16 bits. */
- for (r = 0; r < rows; r += 8) {
- __m256i ss0;
- __m256i ss1;
- __m256i ss2;
- __m256i ss3;
- __m256i ss4;
- __m256i ss5;
- __m256i ss6;
- __m256i ss7;
- __m256i ss8;
- __m256i ss9;
- __m256i ssa;
- __m256i ssb;
- __m256i ssc;
- __m256i ssd;
- __m256i sse;
- __m256i ssf;
- od_load_buffer_8x4_epi32(&ss0, &ss8, &ss1, &ss9, in + r * 16);
- od_load_buffer_8x4_epi32(&ss2, &ssa, &ss3, &ssb, in + r * 16 + 32);
- od_load_buffer_8x4_epi32(&ss4, &ssc, &ss5, &ssd, in + r * 16 + 64);
- od_load_buffer_8x4_epi32(&ss6, &sse, &ss7, &ssf, in + r * 16 + 96);
- od_transpose8x8_epi32(&ss0, &ss1, &ss2, &ss3, &ss4, &ss5, &ss6, &ss7);
- od_transpose8x8_epi32(&ss8, &ss9, &ssa, &ssb, &ssc, &ssd, &sse, &ssf);
- kernel8_epi32(&ss0, &ss1, &ss2, &ss3, &ss4, &ss5, &ss6, &ss7, &ss8, &ss9,
- &ssa, &ssb, &ssc, &ssd, &sse, &ssf);
- od_transpose_pack8x16_epi32(&ss0, &ss8, &ss4, &ssc, &ss2, &ssa, &ss6,
- &sse, ss1, ss9, ss5, ssd, ss3, ssb, ss7, ssf);
- od_store_buffer_2x16_epi16(out + r * 16, ss0, ss8);
- od_store_buffer_2x16_epi16(out + r * 16 + 32, ss4, ssc);
- od_store_buffer_2x16_epi16(out + r * 16 + 64, ss2, ssa);
- od_store_buffer_2x16_epi16(out + r * 16 + 96, ss6, sse);
- }
- }
-}
-
-static void od_col_tx16_add_hbd_avx2(unsigned char *output_pixels,
- int output_stride, int cols,
- const int16_t *in, int bd,
- od_tx16_kernel8_epi16 kernel8_epi16,
- od_tx16_mm256_kernel kernel16_epi16) {
- __m128i s0;
- __m128i s1;
- __m128i s2;
- __m128i s3;
- __m128i s4;
- __m128i s5;
- __m128i s6;
- __m128i s7;
- __m128i s8;
- __m128i s9;
- __m128i sa;
- __m128i sb;
- __m128i sc;
- __m128i sd;
- __m128i se;
- __m128i sf;
-#if CONFIG_RECT_TX_EXT
- if (cols <= 4) {
- od_load_buffer_4x4_epi16(&s0, &s1, &s2, &s3, in);
- od_load_buffer_4x4_epi16(&s4, &s5, &s6, &s7, in + 16);
- od_load_buffer_4x4_epi16(&s8, &s9, &sa, &sb, in + 32);
- od_load_buffer_4x4_epi16(&sc, &sd, &se, &sf, in + 48);
- kernel8_epi16(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &sa, &sb,
- &sc, &sd, &se, &sf);
- od_add_store_buffer_hbd_4x4_epi16(output_pixels, output_stride, s0, s8, s4,
- sc, bd);
- od_add_store_buffer_hbd_4x4_epi16(output_pixels + 4 * output_stride,
- output_stride, s2, sa, s6, se, bd);
- od_add_store_buffer_hbd_4x4_epi16(output_pixels + 8 * output_stride,
- output_stride, s1, s9, s5, sd, bd);
- od_add_store_buffer_hbd_4x4_epi16(output_pixels + 12 * output_stride,
- output_stride, s3, sb, s7, sf, bd);
- return;
- }
-#endif // CONFIG_RECT_TX_EXT
- if (cols <= 8) {
- od_load_buffer_8x4_epi16(&s0, &s1, &s2, &s3, in, cols);
- od_load_buffer_8x4_epi16(&s4, &s5, &s6, &s7, in + 32, cols);
- od_load_buffer_8x4_epi16(&s8, &s9, &sa, &sb, in + 64, cols);
- od_load_buffer_8x4_epi16(&sc, &sd, &se, &sf, in + 96, cols);
- kernel8_epi16(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &sa, &sb,
- &sc, &sd, &se, &sf);
- od_add_store_buffer_hbd_8x4_epi16(output_pixels, output_stride, s0, s8, s4,
- sc, bd);
- od_add_store_buffer_hbd_8x4_epi16(output_pixels + 4 * output_stride,
- output_stride, s2, sa, s6, se, bd);
- od_add_store_buffer_hbd_8x4_epi16(output_pixels + 8 * output_stride,
- output_stride, s1, s9, s5, sd, bd);
- od_add_store_buffer_hbd_8x4_epi16(output_pixels + 12 * output_stride,
- output_stride, s3, sb, s7, sf, bd);
- } else {
- __m256i ss0;
- __m256i ss1;
- __m256i ss2;
- __m256i ss3;
- __m256i ss4;
- __m256i ss5;
- __m256i ss6;
- __m256i ss7;
- __m256i ss8;
- __m256i ss9;
- __m256i ssa;
- __m256i ssb;
- __m256i ssc;
- __m256i ssd;
- __m256i sse;
- __m256i ssf;
- int c;
- for (c = 0; c < cols; c += 16) {
- od_load_buffer_16x4_epi16(&ss0, &ss1, &ss2, &ss3, in + c, cols);
- od_load_buffer_16x4_epi16(&ss4, &ss5, &ss6, &ss7, in + 4 * cols + c,
- cols);
- od_load_buffer_16x4_epi16(&ss8, &ss9, &ssa, &ssb, in + 8 * cols + c,
- cols);
- od_load_buffer_16x4_epi16(&ssc, &ssd, &sse, &ssf, in + 12 * cols + c,
- cols);
- kernel16_epi16(&ss0, &ss1, &ss2, &ss3, &ss4, &ss5, &ss6, &ss7, &ss8, &ss9,
- &ssa, &ssb, &ssc, &ssd, &sse, &ssf);
- od_add_store_buffer_hbd_16x4_epi16(output_pixels, output_stride, ss0, ss8,
- ss4, ssc, bd);
- od_add_store_buffer_hbd_16x4_epi16(output_pixels + 4 * output_stride,
- output_stride, ss2, ssa, ss6, sse, bd);
- od_add_store_buffer_hbd_16x4_epi16(output_pixels + 8 * output_stride,
- output_stride, ss1, ss9, ss5, ssd, bd);
- od_add_store_buffer_hbd_16x4_epi16(output_pixels + 12 * output_stride,
- output_stride, ss3, ssb, ss7, ssf, bd);
- }
- }
-}
-
-static void od_row_idct16_avx2(int16_t *out, int rows, const tran_low_t *in) {
- od_row_tx16_avx2(out, rows, in,
-#if CONFIG_RECT_TX_EXT
- od_idct16_kernel8_epi16,
-#endif
- od_idct16_kernel8_epi32);
-}
-
-static void od_col_idct16_add_hbd_avx2(unsigned char *output_pixels,
- int output_stride, int cols,
- const int16_t *in, int bd) {
- od_col_tx16_add_hbd_avx2(output_pixels, output_stride, cols, in, bd,
- od_idct16_kernel8_epi16, od_idct16_kernel16_epi16);
-}
-
-static void od_row_idst16_avx2(int16_t *out, int rows, const tran_low_t *in) {
- od_row_tx16_avx2(out, rows, in,
-#if CONFIG_RECT_TX_EXT
- od_idst16_kernel8_epi16,
-#endif
- od_idst16_kernel8_epi32);
-}
-
-static void od_col_idst16_add_hbd_avx2(unsigned char *output_pixels,
- int output_stride, int cols,
- const int16_t *in, int bd) {
- od_col_tx16_add_hbd_avx2(output_pixels, output_stride, cols, in, bd,
- od_idst16_kernel8_epi16, od_idst16_kernel16_epi16);
-}
-
-static void od_row_flip_idst16_avx2(int16_t *out, int rows,
- const tran_low_t *in) {
- od_row_tx16_avx2(out, rows, in,
-#if CONFIG_RECT_TX_EXT
- od_flip_idst16_kernel8_epi16,
-#endif
- od_flip_idst16_kernel8_epi32);
-}
-
-static void od_col_flip_idst16_add_hbd_avx2(unsigned char *output_pixels,
- int output_stride, int cols,
- const int16_t *in, int bd) {
- od_col_tx16_add_hbd_avx2(output_pixels, output_stride, cols, in, bd,
- od_flip_idst16_kernel8_epi16,
- od_flip_idst16_kernel16_epi16);
-}
-#endif
-
-static void od_row_iidtx16_avx2(int16_t *out, int rows, const tran_low_t *in) {
- od_row_iidtx_avx2(out, rows * 16, in);
-}
-
-static void od_col_iidtx16_add_hbd_avx2(unsigned char *output_pixels,
- int output_stride, int cols,
- const int16_t *in, int bd) {
- od_col_iidtx_add_hbd_avx2(output_pixels, output_stride, 16, cols, in, bd);
-}
-
-typedef void (*daala_row_itx)(int16_t *out, int rows, const tran_low_t *in);
-typedef void (*daala_col_itx_add)(unsigned char *output_pixels,
- int output_stride, int cols,
- const int16_t *in, int bd);
-
-static const daala_row_itx TX_ROW_MAP[TX_SIZES][TX_TYPES] = {
- // 4-point transforms
- { NULL, od_row_idst4_avx2, od_row_flip_idst4_avx2, od_row_iidtx4_avx2 },
- // 8-point transforms
- { NULL, NULL, NULL, od_row_iidtx8_avx2 },
- // 16-point transforms
- { NULL, NULL, NULL, od_row_iidtx16_avx2 },
- // 32-point transforms
- { NULL, NULL, NULL, NULL },
-#if CONFIG_TX64X64
- // 64-point transforms
- { NULL, NULL, NULL, NULL },
-#endif
-};
-
-static const daala_col_itx_add TX_COL_MAP[2][TX_SIZES][TX_TYPES] = {
- // Low bit depth output
- {
- // 4-point transforms
- { NULL, NULL, NULL, NULL },
- // 8-point transforms
- { NULL, NULL, NULL, NULL },
- // 16-point transforms
- { NULL, NULL, NULL, NULL },
- // 32-point transforms
- { NULL, NULL, NULL, NULL },
-#if CONFIG_TX64X64
- // 64-point transforms
- { NULL, NULL, NULL, NULL },
-#endif
- },
- // High bit depth output
- {
- // 4-point transforms
- { NULL, od_col_idst4_add_hbd_avx2, od_col_flip_idst4_add_hbd_avx2,
- od_col_iidtx4_add_hbd_avx2 },
- // 8-point transforms
- { NULL, NULL, NULL, od_col_iidtx8_add_hbd_avx2 },
- // 16-point transforms
- { NULL, NULL, NULL, od_col_iidtx16_add_hbd_avx2 },
- // 32-point transforms
- { NULL, NULL, NULL, NULL },
-#if CONFIG_TX64X64
- // 64-point transforms
- { NULL, NULL, NULL, NULL },
-#endif
- }
-};
-
-/* Define this to verify the SIMD against the C versions of the transforms.
- This is intended to be replaced by real unit tests in the future. */
-#undef DAALA_TX_VERIFY_SIMD
-
-void daala_inv_txfm_add_avx2(const tran_low_t *input_coeffs,
- void *output_pixels, int output_stride,
- TxfmParam *txfm_param) {
- const TX_SIZE tx_size = txfm_param->tx_size;
- const TX_TYPE tx_type = txfm_param->tx_type;
- assert(tx_size <= TX_SIZES_ALL);
- assert(tx_type <= TX_TYPES);
-
- if (txfm_param->lossless) {
- daala_inv_txfm_add_c(input_coeffs, output_pixels, output_stride,
- txfm_param);
- } else {
- // General TX case
- assert(sizeof(tran_low_t) == sizeof(od_coeff));
- assert(sizeof(tran_low_t) >= 4);
-
- // Hook into existing map translation infrastructure to select
- // appropriate TX functions
- const TX_SIZE col_idx = txsize_vert_map[tx_size];
- const TX_SIZE row_idx = txsize_horz_map[tx_size];
- assert(col_idx <= TX_SIZES);
- assert(row_idx <= TX_SIZES);
- assert(vtx_tab[tx_type] <= (int)TX_TYPES_1D);
- assert(htx_tab[tx_type] <= (int)TX_TYPES_1D);
- daala_row_itx row_tx = TX_ROW_MAP[row_idx][htx_tab[tx_type]];
- daala_col_itx_add col_tx =
- TX_COL_MAP[txfm_param->is_hbd][col_idx][vtx_tab[tx_type]];
- int16_t tmpsq[MAX_TX_SQUARE];
-
- if (row_tx == NULL || col_tx == NULL) {
- daala_inv_txfm_add_c(input_coeffs, output_pixels, output_stride,
- txfm_param);
- } else {
- const int cols = tx_size_wide[tx_size];
- const int rows = tx_size_high[tx_size];
-#if defined(DAALA_TX_VERIFY_SIMD)
- unsigned char out_check_buf8[MAX_TX_SQUARE];
- int16_t out_check_buf16[MAX_TX_SQUARE];
- unsigned char *out_check_buf;
- {
- if (txfm_param->is_hbd) {
- uint16_t *output_pixels16;
- int r;
- output_pixels16 = CONVERT_TO_SHORTPTR(output_pixels);
- for (r = 0; r < rows; r++) {
- memcpy(out_check_buf16 + r * cols,
- output_pixels16 + r * output_stride,
- cols * sizeof(*out_check_buf16));
- }
- out_check_buf = CONVERT_TO_BYTEPTR(out_check_buf16);
- } else {
- unsigned char *output_pixels8;
- int r;
- output_pixels8 = (unsigned char *)output_pixels;
- for (r = 0; r < rows; r++) {
- memcpy(out_check_buf8 + r * cols,
- output_pixels8 + r * output_stride,
- cols * sizeof(*out_check_buf8));
- }
- out_check_buf = out_check_buf8;
- }
- }
- daala_inv_txfm_add_c(input_coeffs, out_check_buf, cols, txfm_param);
-#endif
- // Inverse-transform rows
- row_tx(tmpsq, rows, input_coeffs);
- // Inverse-transform columns and sum with destination
- col_tx(output_pixels, output_stride, cols, tmpsq, txfm_param->bd);
-#if defined(DAALA_TX_VERIFY_SIMD)
- {
- if (txfm_param->is_hbd) {
- uint16_t *output_pixels16;
- int r;
- output_pixels16 = CONVERT_TO_SHORTPTR(output_pixels);
- for (r = 0; r < rows; r++) {
- if (memcmp(out_check_buf16 + r * cols,
- output_pixels16 + r * output_stride,
- cols * sizeof(*out_check_buf16))) {
- fprintf(stderr, "%s(%i): Inverse %ix%i %i_%i TX SIMD mismatch.\n",
- __FILE__, __LINE__, rows, cols, vtx_tab[tx_type],
- htx_tab[tx_type]);
- assert(0);
- exit(EXIT_FAILURE);
- }
- }
- } else {
- unsigned char *output_pixels8;
- int r;
- output_pixels8 = (unsigned char *)output_pixels;
- for (r = 0; r < rows; r++) {
- if (memcmp(out_check_buf8 + r * cols,
- output_pixels8 + r * output_stride,
- cols * sizeof(*out_check_buf8))) {
- fprintf(stderr, "%s(%i): Inverse %ix%i %i_%i TX SIMD mismatch.\n",
- __FILE__, __LINE__, rows, cols, vtx_tab[tx_type],
- htx_tab[tx_type]);
- assert(0);
- exit(EXIT_FAILURE);
- }
- }
- }
- }
-#endif
- }
- }
-}
-
-#endif
diff --git a/av1/common/x86/daala_tx_kernels.h b/av1/common/x86/daala_tx_kernels.h
deleted file mode 100644
index 19f620f..0000000
--- a/av1/common/x86/daala_tx_kernels.h
+++ /dev/null
@@ -1,591 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-/* This header does not use an include guard.
- It is intentionally designed to be included multiple times.
- The file that includes it should define the following macros:
-
- OD_KERNEL A label for the width of the kernel, e.g., kernel8
- OD_WORD A label for the size of the SIMD word, e.g., epi16
- OD_REG The type of a SIMD register, e.g., __m128i
- OD_ADD The intrinsic function for addition
- OD_SUB The intrinsic function for subtraction
- OD_RSHIFT1 The function that implements an unbiased right shift by 1
- OD_AVG The function that implements a signed PAVG[WD]
- I.e., (a + b + 1) >> 1, without overflow
- OD_HRSUB The function that implements a VHRSUB.S<16|32>
- I.e., (a - b + 1) >> 1, without overflow
- OD_MUL The function that implements the multiplies
- I.e., (a * b + ((1 << r) >> 1)) >> r, without overflow
- OD_SWAP The function that swaps two SIMD registers
-
- See daala_inv_txfm_avx2.c for examples. */
-
-#define OD_KERNEL_FUNC_IMPL(name, kernel, word) name##_##kernel##_##word
-#define OD_KERNEL_FUNC_WRAPPER(name, kernel, word) \
- OD_KERNEL_FUNC_IMPL(name, kernel, word)
-#define OD_KERNEL_FUNC(name) OD_KERNEL_FUNC_WRAPPER(name, OD_KERNEL, OD_WORD)
-
-static INLINE void OD_KERNEL_FUNC(od_rotate_add)(OD_REG *q0, OD_REG *q1, int c0,
- int r0, int c1, int r1, int c2,
- int r2, int s, int avg) {
- OD_REG t_;
- OD_REG u_;
-
- if (avg)
- t_ = OD_AVG(*q0, *q1);
- else
- t_ = OD_ADD(*q0, *q1);
- u_ = OD_MUL(*q1, c0, r0);
- *q1 = OD_MUL(*q0, c1, r1);
- t_ = OD_MUL(t_, c2, r2);
- if (s)
- *q0 = OD_SUB(u_, OD_RSHIFT1(t_));
- else
- *q0 = OD_SUB(u_, t_);
- *q1 = OD_ADD(*q1, t_);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_rotate_addh)(OD_REG *q0, OD_REG *q1,
- OD_REG *q1h, int c0, int r0,
- int c1, int r1, int c2,
- int r2, int s) {
- OD_REG t_;
- OD_REG u_;
-
- t_ = OD_ADD(*q0, *q1h);
- u_ = OD_MUL(*q1, c0, r0);
- *q1 = OD_MUL(*q0, c1, r1);
- t_ = OD_MUL(t_, c2, r2);
- *q0 = OD_SUB(u_, t_);
- if (s)
- *q1 = OD_ADD(*q1, OD_RSHIFT1(t_));
- else
- *q1 = OD_ADD(*q1, t_);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_rotate_sub)(OD_REG *q0, OD_REG *q1, int c0,
- int r0, int c1, int r1, int c2,
- int r2, int s) {
- OD_REG t_;
- OD_REG u_;
-
- t_ = OD_SUB(*q0, *q1);
- u_ = OD_MUL(*q1, c0, r0);
- *q1 = OD_MUL(*q0, c1, r1);
- t_ = OD_MUL(t_, c2, r2);
- if (s)
- *q0 = OD_ADD(u_, OD_RSHIFT1(t_));
- else
- *q0 = OD_ADD(u_, t_);
- *q1 = OD_ADD(*q1, t_);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_rotate_sub2)(OD_REG *q0, OD_REG *q1,
- int c0, int r0, int c1,
- int r1, int c2, int r2,
- int avg) {
- OD_REG t_;
- OD_REG u_;
-
- if (avg)
- t_ = OD_HRSUB(*q1, *q0);
- else
- t_ = OD_SUB(*q1, *q0);
- u_ = OD_MUL(*q1, c0, r0);
- *q1 = OD_MUL(*q0, c1, r1);
- t_ = OD_MUL(t_, c2, r2);
- *q0 = OD_SUB(t_, u_);
- *q1 = OD_SUB(*q1, t_);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_rotate_subh)(OD_REG *q0, OD_REG *q1,
- OD_REG *q1h, int c0, int r0,
- int c1, int r1, int c2,
- int r2, int s) {
- OD_REG t_;
- OD_REG u_;
-
- t_ = OD_SUB(*q0, *q1h);
- u_ = OD_MUL(*q1, c0, r0);
- *q1 = OD_MUL(*q0, c1, r1);
- t_ = OD_MUL(t_, c2, r2);
- *q0 = OD_ADD(u_, t_);
- if (s)
- *q1 = OD_ADD(*q1, OD_RSHIFT1(t_));
- else
- *q1 = OD_ADD(*q1, t_);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_rotate45)(OD_REG *p0, OD_REG *p1,
- int avg) {
- OD_REG t_;
- if (avg)
- t_ = OD_AVG(*p0, *p1);
- else
- t_ = OD_ADD(*p0, *p1);
- /* 11585/8192 ~= 2*Sin[Pi/4] ~= 1.4142135623730951 */
- *p0 = OD_MUL(*p1, 11585, 13);
- /* 11585/8192 ~= 2*Cos[Pi/4] ~= 1.4142135623730951 */
- if (avg)
- *p1 = OD_MUL(t_, 11585, 13);
- else
- *p1 = OD_MUL(t_, 11585, 14);
- *p0 = OD_SUB(*p0, *p1);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_butterfly_add)(OD_REG *q0, OD_REG *q1) {
- *q0 = OD_ADD(*q0, OD_RSHIFT1(*q1));
- *q1 = OD_SUB(*q0, *q1);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_butterfly_add2)(OD_REG *q0, OD_REG *q1) {
- *q0 = OD_ADD(*q0, OD_RSHIFT1(*q1));
- *q1 = OD_SUB(*q1, *q0);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_butterfly_sub2)(OD_REG *q0, OD_REG *q1) {
- *q0 = OD_SUB(*q0, OD_RSHIFT1(*q1));
- *q1 = OD_ADD(*q1, *q0);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_butterfly_addh)(OD_REG *q0, OD_REG *q1,
- OD_REG *q1h) {
- *q0 = OD_ADD(*q0, *q1h);
- *q1 = OD_SUB(*q1, *q0);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_butterfly_subh)(OD_REG *q0, OD_REG *q1,
- OD_REG *q1h) {
- *q0 = OD_SUB(*q0, *q1h);
- *q1 = OD_ADD(*q1, *q0);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_butterfly_v1)(OD_REG *q0, OD_REG *q1,
- OD_REG *q1h) {
- *q1 = OD_SUB(*q0, *q1);
- *q1h = OD_RSHIFT1(*q1);
- *q0 = OD_SUB(*q0, *q1h);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_butterfly_v2)(OD_REG *q0, OD_REG *q1,
- OD_REG *q1h) {
- *q1 = OD_SUB(*q1, *q0);
- *q1h = OD_RSHIFT1(*q1);
- *q0 = OD_ADD(*q0, *q1h);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_butterfly_v3)(OD_REG *q0, OD_REG *q1,
- OD_REG *q1h) {
- *q1 = OD_ADD(*q0, *q1);
- *q1h = OD_RSHIFT1(*q1);
- *q0 = OD_SUB(*q0, *q1h);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_idct2)(OD_REG *p0, OD_REG *p1) {
- OD_KERNEL_FUNC(od_rotate45)(p1, p0, 0);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_idst2)(OD_REG *p0, OD_REG *p1, int neg) {
- // Note: special case of rotation
- OD_REG t_;
- OD_REG u_;
- if (neg)
- t_ = OD_HRSUB(*p0, *p1);
- else
- t_ = OD_AVG(*p0, *p1);
- /* 21407/16384 ~= Sin[3*Pi/8] + Cos[3*Pi/8] ~= 1.3065629648763766 */
- u_ = OD_MUL(*p0, 21407, 14);
- /* 8867/16384 ~= Sin[3*Pi/8] - Cos[3*Pi/8] ~= 0.541196100146197 */
- *p0 = OD_MUL(*p1, 8867, 14);
- /* 3135/4096 ~= 2*Cos[3*Pi/8] ~= 0.7653668647301796 */
- t_ = OD_MUL(t_, 3135, 12);
- if (neg) {
- *p0 = OD_SUB(*p0, t_);
- *p1 = OD_SUB(t_, u_);
- } else {
- *p0 = OD_ADD(*p0, t_);
- *p1 = OD_SUB(u_, t_);
- }
-}
-
-static INLINE void OD_KERNEL_FUNC(od_idct2_asym)(OD_REG *p0, OD_REG *p1,
- OD_REG *p1h) {
- OD_KERNEL_FUNC(od_butterfly_v1)(p0, p1, p1h);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_idst2_asym)(OD_REG *p0, OD_REG *p1) {
- // Note: special case of rotation
- OD_REG t_;
- OD_REG u_;
- t_ = OD_AVG(*p0, *p1);
- /* 3135/4096 ~= (Cos[Pi/8] - Sin[Pi/8])*Sqrt[2] = 0.7653668647301795 */
- u_ = OD_MUL(*p1, 3135, 12);
- /* 15137/16384 ~= (Cos[Pi/8] + Sin[Pi/8])/Sqrt[2] = 0.9238795325112867 */
- *p1 = OD_MUL(*p0, 15137, 14);
- /* 8867/8192 ~= Cos[3*Pi/8]*2*Sqrt[2] = 1.082392200292394 */
- t_ = OD_MUL(t_, 8867, 13);
- *p0 = OD_ADD(u_, t_);
- *p1 = OD_SUB(*p1, OD_RSHIFT1(t_));
-}
-
-static INLINE void OD_KERNEL_FUNC(od_idct4)(OD_REG *q0, OD_REG *q2, OD_REG *q1,
- OD_REG *q3) {
- OD_REG q1h;
- OD_KERNEL_FUNC(od_idst2_asym)(q3, q2);
- OD_KERNEL_FUNC(od_idct2_asym)(q0, q1, &q1h);
- OD_KERNEL_FUNC(od_butterfly_addh)(q2, q1, &q1h);
- OD_KERNEL_FUNC(od_butterfly_add)(q0, q3);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_idct4_asym)(OD_REG *q0, OD_REG *q2,
- OD_REG *q1, OD_REG *q1h,
- OD_REG *q3, OD_REG *q3h) {
- OD_KERNEL_FUNC(od_idst2)(q3, q2, 0);
- OD_KERNEL_FUNC(od_idct2)(q0, q1);
- OD_KERNEL_FUNC(od_butterfly_v2)(q2, q1, q1h);
- OD_KERNEL_FUNC(od_butterfly_v1)(q0, q3, q3h);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_idst_vii4)(OD_REG *q0, OD_REG *q1,
- OD_REG *q2, OD_REG *q3) {
- // Note: special case
- OD_REG t0;
- OD_REG t1;
- OD_REG t2;
- OD_REG t3;
- OD_REG t3h;
- OD_REG t4;
- OD_REG u4;
- t0 = OD_SUB(*q0, *q3);
- t1 = OD_ADD(*q0, *q2);
- t2 = OD_ADD(*q3, OD_HRSUB(t0, *q2));
- t3 = *q1;
- t4 = OD_ADD(*q2, *q3);
- /* 467/2048 ~= 2*Sin[1*Pi/9]/3 ~= 0.228013428883779 */
- t0 = OD_MUL(t0, 467, 11);
- /* 7021/16384 ~= 2*Sin[2*Pi/9]/3 ~= 0.428525073124360 */
- t1 = OD_MUL(t1, 7021, 14);
- /* 37837/32768 ~= 4*Sin[3*Pi/9]/3 ~= 1.154700538379252 */
- t2 = OD_MUL(t2, 37837, 15);
- /* 37837/32768 ~= 4*Sin[3*Pi/9]/3 ~= 1.154700538379252 */
- t3 = OD_MUL(t3, 37837, 15);
- /* 21513/32768 ~= 2*Sin[4*Pi/9]/3 ~= 0.656538502008139 */
- t4 = OD_MUL(t4, 21513, 15);
- t3h = OD_RSHIFT1(t3);
- u4 = OD_ADD(t4, t3h);
- *q0 = OD_ADD(t0, u4);
- /* We swap q1 and q2 to correct for the bitreverse reordering that
- od_row_tx4_avx2() does. */
- *q2 = OD_ADD(t1, OD_SUB(t3, u4));
- *q1 = t2;
- *q3 = OD_ADD(t0, OD_SUB(t1, t3h));
-}
-
-static INLINE void OD_KERNEL_FUNC(od_flip_idst_vii4)(OD_REG *q0, OD_REG *q1,
- OD_REG *q2, OD_REG *q3) {
- OD_KERNEL_FUNC(od_idst_vii4)(q0, q1, q2, q3);
- OD_SWAP(q0, q3);
- OD_SWAP(q1, q2);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_idst4)(OD_REG *q0, OD_REG *q1, OD_REG *q2,
- OD_REG *q3) {
- OD_REG q2h;
- OD_REG q3h;
- OD_KERNEL_FUNC(od_rotate45)(q2, q1, 1);
- OD_KERNEL_FUNC(od_butterfly_v3)(q0, q2, &q2h);
- OD_KERNEL_FUNC(od_butterfly_v3)(q1, q3, &q3h);
- /* 16069/16384 ~= (Sin[5*Pi/16] + Cos[5*Pi/16])/Sqrt[2] ~= 0.9807852804032 */
- /* 12785/32768 ~= (Sin[5*Pi/16] - Cos[5*Pi/16])*Sqrt[2] ~= 0.3901806440323 */
- /* 12873/16384 ~= Cos[5*Pi/16]*Sqrt[2] ~= 0.7856949583871021 */
- OD_KERNEL_FUNC(od_rotate_addh)
- (q1, q2, &q2h, 16069, 14, 12785, 15, 12873, 14, 0);
- /* 13623/16384 ~= (Sin[7*Pi/16] + Cos[7*Pi/16])/Sqrt[2] ~= 0.8314696123025 */
- /* 18205/16384 ~= (Sin[7*Pi/16] - Cos[7*Pi/16])*Sqrt[2] ~= 1.1111404660392 */
- /* 9041/32768 ~= Cos[7*Pi/16]*Sqrt[2] = 0.275899379282943 */
- OD_KERNEL_FUNC(od_rotate_subh)
- (q0, q3, &q3h, 13623, 14, 18205, 14, 9041, 15, 0);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_idst4_asym)(OD_REG *q0, OD_REG *q2,
- OD_REG *q1, OD_REG *q3) {
- OD_REG q1h;
- OD_REG q3h;
- OD_KERNEL_FUNC(od_rotate45)(q1, q2, 1);
- OD_KERNEL_FUNC(od_butterfly_v3)(q0, q1, &q1h);
- OD_KERNEL_FUNC(od_butterfly_v3)(q2, q3, &q3h);
- /* 45451/32768 ~= Sin[5*Pi/16] + Cos[5*Pi/16] ~= 1.3870398453221475 */
- /* 9041/32768 ~= Sin[5*Pi/16] - Cos[5*Pi/16] ~= 0.27589937928294306 */
- /* 18205/16384 ~= 2*Cos[5*Pi/16] ~= 1.1111404660392044 */
- OD_KERNEL_FUNC(od_rotate_addh)
- (q2, q1, &q1h, 45451, 15, 9041, 15, 18205, 14, 1);
- /* 38531/32768 ~= Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586 */
- /* 12873/16384 ~= Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 */
- /* 12785/32768 ~= 2*Cos[7*Pi/16] = 0.3901806440322565 */
- OD_KERNEL_FUNC(od_rotate_subh)
- (q0, q3, &q3h, 38531, 15, 12873, 14, 12785, 15, 1);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_idct8)(OD_REG *r0, OD_REG *r4, OD_REG *r2,
- OD_REG *r6, OD_REG *r1, OD_REG *r5,
- OD_REG *r3, OD_REG *r7) {
- OD_REG r1h;
- OD_REG r3h;
- OD_KERNEL_FUNC(od_idst4_asym)(r7, r5, r6, r4);
- OD_KERNEL_FUNC(od_idct4_asym)(r0, r2, r1, &r1h, r3, &r3h);
- OD_KERNEL_FUNC(od_butterfly_addh)(r4, r3, &r3h);
- OD_KERNEL_FUNC(od_butterfly_add)(r2, r5);
- OD_KERNEL_FUNC(od_butterfly_addh)(r6, r1, &r1h);
- OD_KERNEL_FUNC(od_butterfly_add)(r0, r7);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_idct8_asym)(
- OD_REG *r0, OD_REG *r4, OD_REG *r2, OD_REG *r6, OD_REG *r1, OD_REG *r1h,
- OD_REG *r5, OD_REG *r5h, OD_REG *r3, OD_REG *r3h, OD_REG *r7, OD_REG *r7h) {
- OD_KERNEL_FUNC(od_idst4)(r7, r5, r6, r4);
- OD_KERNEL_FUNC(od_idct4)(r0, r2, r1, r3);
- OD_KERNEL_FUNC(od_butterfly_v1)(r0, r7, r7h);
- OD_KERNEL_FUNC(od_butterfly_v2)(r6, r1, r1h);
- OD_KERNEL_FUNC(od_butterfly_v1)(r2, r5, r5h);
- OD_KERNEL_FUNC(od_butterfly_v2)(r4, r3, r3h);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_idst8)(OD_REG *r0, OD_REG *r4, OD_REG *r2,
- OD_REG *r6, OD_REG *r1, OD_REG *r5,
- OD_REG *r3, OD_REG *r7) {
- OD_REG r0h;
- OD_REG r2h;
- OD_REG r5h;
- OD_REG r7h;
- OD_KERNEL_FUNC(od_rotate45)(r1, r6, 1);
- OD_KERNEL_FUNC(od_idst2)(r5, r2, 1);
- OD_KERNEL_FUNC(od_idst2)(r4, r3, 0);
- OD_KERNEL_FUNC(od_butterfly_v3)(r6, r7, &r7h);
- OD_KERNEL_FUNC(od_butterfly_v3)(r4, r2, &r2h);
- OD_KERNEL_FUNC(od_butterfly_v2)(r1, r0, &r0h);
- OD_KERNEL_FUNC(od_butterfly_v3)(r3, r5, &r5h);
- OD_KERNEL_FUNC(od_butterfly_subh)(r4, r7, &r7h);
- OD_KERNEL_FUNC(od_butterfly_addh)(r6, r5, &r5h);
- OD_KERNEL_FUNC(od_butterfly_addh)(r3, r0, &r0h);
- OD_KERNEL_FUNC(od_butterfly_subh)(r1, r2, &r2h);
- /* 17911/16384 ~= Sin[15*Pi/32] + Cos[15*Pi/32] ~= 1.0932018670017576 */
- /* 14699/16384 ~= Sin[15*Pi/32] - Cos[15*Pi/32] ~= 0.8971675863426363 */
- /* 803/8192 ~= Cos[15*Pi/32] ~= 0.0980171403295606 */
- OD_KERNEL_FUNC(od_rotate_add)(r7, r0, 17911, 14, 14699, 14, 803, 13, 0, 0);
- /* 40869/32768 ~= Sin[13*Pi/32] + Cos[13*Pi/32] ~= 1.247225012986671 */
- /* 21845/32768 ~= Sin[13*Pi/32] - Cos[13*Pi/32] ~= 0.6666556584777465 */
- /* 1189/4096 ~= Cos[13*Pi/32] ~= 0.29028467725446233 */
- OD_KERNEL_FUNC(od_rotate_sub)(r1, r6, 40869, 15, 21845, 15, 1189, 12, 0);
- /* 22173/16384 ~= Sin[11*Pi/32] + Cos[11*Pi/32] ~= 1.3533180011743526 */
- /* 3363/8192 ~= Sin[11*Pi/32] - Cos[11*Pi/32] ~= 0.4105245275223574 */
- /* 15447/32768 ~= Cos[11*Pi/32] ~= 0.47139673682599764 */
- OD_KERNEL_FUNC(od_rotate_add)(r5, r2, 22173, 14, 3363, 13, 15447, 15, 0, 0);
- /* 23059/16384 ~= Sin[9*Pi/32] + Cos[9*Pi/32] ~= 1.4074037375263826 */
- /* 2271/16384 ~= Sin[9*Pi/32] - Cos[9*Pi/32] ~= 0.1386171691990915 */
- /* 5197/8192 ~= Cos[9*Pi/32] ~= 0.6343932841636455 */
- OD_KERNEL_FUNC(od_rotate_sub)(r3, r4, 23059, 14, 2271, 14, 5197, 13, 0);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_idst8_asym)(OD_REG *r0, OD_REG *r4,
- OD_REG *r2, OD_REG *r6,
- OD_REG *r1, OD_REG *r5,
- OD_REG *r3, OD_REG *r7) {
- OD_REG r0h;
- OD_REG r2h;
- OD_REG r5h;
- OD_REG r7h;
- OD_KERNEL_FUNC(od_rotate45)(r1, r6, 1);
- OD_KERNEL_FUNC(od_idst2)(r5, r2, 1);
- OD_KERNEL_FUNC(od_idst2)(r4, r3, 0);
- OD_KERNEL_FUNC(od_butterfly_v3)(r6, r7, &r7h);
- OD_KERNEL_FUNC(od_butterfly_v3)(r4, r2, &r2h);
- OD_KERNEL_FUNC(od_butterfly_v2)(r1, r0, &r0h);
- OD_KERNEL_FUNC(od_butterfly_v3)(r3, r5, &r5h);
- OD_KERNEL_FUNC(od_butterfly_subh)(r4, r7, &r7h);
- OD_KERNEL_FUNC(od_butterfly_addh)(r6, r5, &r5h);
- OD_KERNEL_FUNC(od_butterfly_addh)(r3, r0, &r0h);
- OD_KERNEL_FUNC(od_butterfly_subh)(r1, r2, &r2h);
- /* 12665/16384 ~= (Sin[15*Pi/32] + Cos[15*Pi/32])/Sqrt[2] ~= 0.77301045336 */
- /* 5197/4096 ~= (Sin[15*Pi/32] - Cos[15*Pi/32])*Sqrt[2] ~= 1.2687865683273 */
- /* 2271/16384 ~= Cos[15*Pi/32]*Sqrt[2] ~= 0.13861716919909148 */
- OD_KERNEL_FUNC(od_rotate_add)(r7, r0, 12665, 14, 5197, 12, 2271, 14, 1, 0);
- /* 28899/32768 ~= (Sin[13*Pi/32] + Cos[13*Pi/32])/Sqrt[2] ~= 0.88192126435 */
- /* 30893/32768 ~= (Sin[13*Pi/32] - Cos[13*Pi/32])*Sqrt[2] ~= 0.94279347365 */
- /* 3363/8192 ~= Cos[13*Pi/32]*Sqrt[2] ~= 0.41052452752235735 */
- OD_KERNEL_FUNC(od_rotate_sub)(r1, r6, 28899, 15, 30893, 15, 3363, 13, 1);
- /* 31357/32768 ~= (Sin[11*Pi/32] + Cos[11*Pi/32])/Sqrt[2] ~= 0.95694033573 */
- /* 1189/2048 ~= (Sin[11*Pi/32] - Cos[11*Pi/32])*Sqrt[2] ~= 0.5805693545089 */
- /* 21845/32768 ~= Cos[11*Pi/32] ~= 0.6666556584777465 */
- OD_KERNEL_FUNC(od_rotate_add)(r5, r2, 31357, 15, 1189, 11, 21845, 15, 1, 0);
- /* 16305/16384 ~= (Sin[9*Pi/32] + Cos[9*Pi/32])/Sqrt[2] ~= 0.9951847266722 */
- /* 803/4096 ~= (Sin[9*Pi/32] - Cos[9*Pi/32])*Sqrt[2] ~= 0.1960342806591213 */
- /* 14699/16384 ~= Cos[9*Pi/32]*Sqrt[2] ~= 0.8971675863426364 */
- OD_KERNEL_FUNC(od_rotate_sub)(r3, r4, 16305, 14, 803, 12, 14699, 14, 1);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_flip_idst8)(OD_REG *r0, OD_REG *r4,
- OD_REG *r2, OD_REG *r6,
- OD_REG *r1, OD_REG *r5,
- OD_REG *r3, OD_REG *r7) {
- OD_KERNEL_FUNC(od_idst8)(r0, r4, r2, r6, r1, r5, r3, r7);
- OD_SWAP(r0, r7);
- OD_SWAP(r4, r3);
- OD_SWAP(r2, r5);
- OD_SWAP(r6, r1);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_idct16)(OD_REG *s0, OD_REG *s8, OD_REG *s4,
- OD_REG *sc, OD_REG *s2, OD_REG *sa,
- OD_REG *s6, OD_REG *se, OD_REG *s1,
- OD_REG *s9, OD_REG *s5, OD_REG *sd,
- OD_REG *s3, OD_REG *sb, OD_REG *s7,
- OD_REG *sf) {
- OD_REG s1h;
- OD_REG s3h;
- OD_REG s5h;
- OD_REG s7h;
- OD_KERNEL_FUNC(od_idst8_asym)(sf, sb, sd, s9, se, sa, sc, s8);
- OD_KERNEL_FUNC(od_idct8_asym)
- (s0, s4, s2, s6, s1, &s1h, s5, &s5h, s3, &s3h, s7, &s7h);
- OD_KERNEL_FUNC(od_butterfly_addh)(s8, s7, &s7h);
- OD_KERNEL_FUNC(od_butterfly_add)(s6, s9);
- OD_KERNEL_FUNC(od_butterfly_addh)(sa, s5, &s5h);
- OD_KERNEL_FUNC(od_butterfly_add)(s4, sb);
- OD_KERNEL_FUNC(od_butterfly_addh)(sc, s3, &s3h);
- OD_KERNEL_FUNC(od_butterfly_add)(s2, sd);
- OD_KERNEL_FUNC(od_butterfly_addh)(se, s1, &s1h);
- OD_KERNEL_FUNC(od_butterfly_add)(s0, sf);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_idst16)(OD_REG *s0, OD_REG *s1, OD_REG *s2,
- OD_REG *s3, OD_REG *s4, OD_REG *s5,
- OD_REG *s6, OD_REG *s7, OD_REG *s8,
- OD_REG *s9, OD_REG *sa, OD_REG *sb,
- OD_REG *sc, OD_REG *sd, OD_REG *se,
- OD_REG *sf) {
- OD_REG s0h;
- OD_REG s1h;
- OD_REG s2h;
- OD_REG s3h;
- OD_REG s4h;
- OD_REG s5h;
- OD_REG s6h;
- OD_REG s7h;
- OD_REG sbh;
- OD_REG sfh;
- OD_REG h;
- OD_KERNEL_FUNC(od_rotate45)(s9, s6, 1);
- OD_KERNEL_FUNC(od_rotate45)(sa, s5, 1);
- OD_KERNEL_FUNC(od_rotate45)(s8, s7, 1);
- OD_KERNEL_FUNC(od_idst2)(s3, sc, 0);
- OD_KERNEL_FUNC(od_idst2)(sb, s4, 1);
- OD_KERNEL_FUNC(od_butterfly_v3)(s2, sa, &h);
- OD_KERNEL_FUNC(od_butterfly_v2)(sd, s5, &h);
- OD_KERNEL_FUNC(od_butterfly_v2)(s9, s1, &h);
- OD_KERNEL_FUNC(od_butterfly_v3)(s6, se, &h);
- OD_KERNEL_FUNC(od_butterfly_v3)(sc, sb, &sbh);
- OD_KERNEL_FUNC(od_butterfly_v3)(s7, sf, &sfh);
- OD_KERNEL_FUNC(od_butterfly_v2)(s8, s0, &s0h);
- OD_KERNEL_FUNC(od_butterfly_v3)(s3, s4, &s4h);
- /* 38531/32768 ~= Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586 */
- /* 12873/16384 ~= Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 */
- /* 6393/32768 ~= Cos[7*Pi/16] = 0.19509032201612825 */
- OD_KERNEL_FUNC(od_rotate_sub2)
- (s2, sd, 38531, 15, 12873, 14, 6393, 15, 0);
- /* 22725/16384 ~= Sin[5*Pi/16] + Cos[5*Pi/16] ~= 1.3870398453221475 */
- /* 9041/32768 ~= Sin[5*Pi/16] - Cos[5*Pi/16] ~= 0.27589937928294306 */
- /* 18205/16384 ~= 2*Cos[5*Pi/16] ~= 1.1111404660392044 */
- OD_KERNEL_FUNC(od_rotate_sub2)
- (sa, s5, 22725, 14, 9041, 15, 18205, 14, 1);
- /* 45451/32768 ~= Sin[5*Pi/16] + Cos[5*Pi/16] ~= 1.3870398453221475 */
- /* 9041/32768 ~= Sin[5*Pi/16] - Cos[5*Pi/16] ~= 0.27589937928294306 */
- /* 18205/32768 ~= Cos[5*Pi/16] ~= 0.5555702330196022 */
- OD_KERNEL_FUNC(od_rotate_add)
- (s6, s9, 45451, 15, 9041, 15, 18205, 15, 0, 0);
- /* 9633/8192 ~= Sin[7*Pi/16] + Cos[7*Pi/16] ~= 1.1758756024193586 */
- /* 12873/16384 ~= Sin[7*Pi/16] - Cos[7*Pi/16] ~= 0.7856949583871022 */
- /* 12785/32768 ~= 2*Cos[7*Pi/16] ~= 0.3901806440322565 */
- OD_KERNEL_FUNC(od_rotate_add)
- (se, s1, 9633, 13, 12873, 14, 12785, 15, 0, 1);
- OD_KERNEL_FUNC(od_butterfly_subh)(s8, s4, &s4h);
- OD_KERNEL_FUNC(od_butterfly_addh)(s7, sb, &sbh);
- OD_KERNEL_FUNC(od_butterfly_subh)(s3, sf, &sfh);
- OD_KERNEL_FUNC(od_butterfly_addh)(sc, s0, &s0h);
- OD_KERNEL_FUNC(od_butterfly_add2)(sd, se);
- OD_KERNEL_FUNC(od_butterfly_add2)(s2, s1);
- OD_KERNEL_FUNC(od_butterfly_sub2)(s6, s5);
- OD_KERNEL_FUNC(od_butterfly_sub2)(s9, sa);
- OD_KERNEL_FUNC(od_butterfly_v2)(se, s0, &s0h);
- OD_KERNEL_FUNC(od_butterfly_v2)(sf, s1, &s1h);
- OD_KERNEL_FUNC(od_butterfly_v3)(sc, s2, &s2h);
- OD_KERNEL_FUNC(od_butterfly_v3)(sd, s3, &s3h);
- OD_KERNEL_FUNC(od_butterfly_v2)(sa, s4, &s4h);
- OD_KERNEL_FUNC(od_butterfly_v2)(sb, s5, &s5h);
- OD_KERNEL_FUNC(od_butterfly_v3)(s8, s6, &s6h);
- OD_KERNEL_FUNC(od_butterfly_v3)(s9, s7, &s7h);
- /* 32729/32768 ~= (Sin[17*Pi/64] + Cos[17*Pi/64])/Sqrt[2] ~= 0.99879545620 */
- /* 201/2048 ~= (Sin[17*Pi/64] - Cos[17*Pi/64])*Sqrt[2] ~= 0.09813534865484 */
- /* 31121/32768 ~= Cos[17*Pi/64]*Sqrt[2] ~= 0.9497277818777543 */
- OD_KERNEL_FUNC(od_rotate_subh)
- (se, s1, &s1h, 32729, 15, 201, 11, 31121, 15, 0);
- /* 32413/32768 ~= (Sin[19*Pi/64] + Cos[19*Pi/64])/Sqrt[2] ~= 0.98917650996 */
- /* 601/2048 ~= (Sin[19*Pi/64] - Cos[19*Pi/64])*Sqrt[2] ~= 0.29346094891072 */
- /* 27605/32768 ~= Cos[19*Pi/64]*Sqrt[2] ~= 0.8424460355094193 */
- OD_KERNEL_FUNC(od_rotate_addh)
- (s9, s6, &s6h, 32413, 15, 601, 11, 27605, 15, 0);
- /* 15893/16384 ~= (Sin[21*Pi/64] + Cos[21*Pi/64])/Sqrt[2] ~= 0.97003125319 */
- /* 3981/8192 ~= (Sin[21*Pi/64] - Cos[21*Pi/64])*Sqrt[2] ~= 0.4859603598065 */
- /* 1489/2048 ~= Cos[21*Pi/64]*Sqrt[2] ~= 0.72705107329128 */
- OD_KERNEL_FUNC(od_rotate_subh)
- (sa, s5, &s5h, 15893, 14, 3981, 13, 1489, 11, 0);
- /* 30853/32768 ~= (Sin[23*Pi/64] + Cos[23*Pi/64])/Sqrt[2] ~= 0.94154406518 */
- /* 11039/16384 ~= (Sin[23*Pi/64] - Cos[23*Pi/64])*Sqrt[2] ~= 0.67377970678 */
- /* 19813/32768 ~= Cos[23*Pi/64]*Sqrt[2] ~= 0.6046542117908008 */
- OD_KERNEL_FUNC(od_rotate_addh)
- (sd, s2, &s2h, 30853, 15, 11039, 14, 19813, 15, 0);
- /* 14811/16384 ~= (Sin[25*Pi/64] + Cos[25*Pi/64])/Sqrt[2] ~= 0.90398929312 */
- /* 7005/8192 ~= (Sin[25*Pi/64] - Cos[25*Pi/64])*Sqrt[2] ~= 0.8551101868606 */
- /* 3903/8192 ~= Cos[25*Pi/64]*Sqrt[2] ~= 0.47643419969316125 */
- OD_KERNEL_FUNC(od_rotate_subh)
- (sc, s3, &s3h, 14811, 14, 7005, 13, 3903, 13, 0);
- /* 14053/16384 ~= (Sin[27*Pi/64] + Cos[27*Pi/64])/Sqrt[2] ~= 0.85772861000 */
- /* 8423/8192 ~= (Sin[27*Pi/64] - Cos[27*Pi/64])*Sqrt[2] ~= 1.0282054883864 */
- /* 2815/8192 ~= Cos[27*Pi/64]*Sqrt[2] ~= 0.34362586580705035 */
- OD_KERNEL_FUNC(od_rotate_addh)
- (sb, s4, &s4h, 14053, 14, 8423, 13, 2815, 13, 0);
- /* 1645/2048 ~= (Sin[29*Pi/64] + Cos[29*Pi/64])/Sqrt[2] ~= 0.8032075314806 */
- /* 305/256 ~= (Sin[29*Pi/64] - Cos[29*Pi/64])*Sqrt[2] ~= 1.191398608984867 */
- /* 425/2048 ~= Cos[29*Pi/64]*Sqrt[2] ~= 0.20750822698821159 */
- OD_KERNEL_FUNC(od_rotate_subh)
- (s8, s7, &s7h, 1645, 11, 305, 8, 425, 11, 0);
- /* 24279/32768 ~= (Sin[31*Pi/64] + Cos[31*Pi/64])/Sqrt[2] ~= 0.74095112535 */
- /* 44011/32768 ~= (Sin[31*Pi/64] - Cos[31*Pi/64])*Sqrt[2] ~= 1.34311790969 */
- /* 1137/16384 ~= Cos[31*Pi/64]*Sqrt[2] ~= 0.06939217050794069 */
- OD_KERNEL_FUNC(od_rotate_addh)
- (sf, s0, &s0h, 24279, 15, 44011, 15, 1137, 14, 0);
-}
-
-static INLINE void OD_KERNEL_FUNC(od_flip_idst16)(
- OD_REG *s0, OD_REG *s1, OD_REG *s2, OD_REG *s3, OD_REG *s4, OD_REG *s5,
- OD_REG *s6, OD_REG *s7, OD_REG *s8, OD_REG *s9, OD_REG *sa, OD_REG *sb,
- OD_REG *sc, OD_REG *sd, OD_REG *se, OD_REG *sf) {
- OD_KERNEL_FUNC(od_idst16)
- (s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sa, sb, sc, sd, se, sf);
- OD_SWAP(s0, sf);
- OD_SWAP(s1, se);
- OD_SWAP(s2, sd);
- OD_SWAP(s3, sc);
- OD_SWAP(s4, sb);
- OD_SWAP(s5, sa);
- OD_SWAP(s6, s9);
- OD_SWAP(s7, s8);
-}
diff --git a/av1/decoder/decodetxb.c b/av1/decoder/decodetxb.c
index dfe0e21..aea54cc 100644
--- a/av1/decoder/decodetxb.c
+++ b/av1/decoder/decodetxb.c
@@ -82,19 +82,13 @@
struct macroblockd_plane *const pd = &xd->plane[plane];
const int16_t *const dequant = pd->seg_dequant_QTX[mbmi->segment_id];
tran_low_t *const tcoeffs = pd->dqcoeff;
-#if !CONFIG_DAALA_TX
const int shift = av1_get_tx_scale(tx_size);
-#endif
#if CONFIG_NEW_QUANT
#if !CONFIG_AOM_QM
const tran_low_t *dqv_val = &dq_val[0][0];
#endif // !CONFIG_AOM_QM
-#if CONFIG_DAALA_TX
- const int nq_shift = 0;
-#else
const int nq_shift = shift;
-#endif // CONFIG_DAALA_TX
#endif // CONFIG_NEW_QUANT && !CONFIG_AOM_QM
const int bwl = get_txb_bwl(tx_size);
const int width = get_txb_wide(tx_size);
@@ -283,9 +277,7 @@
#endif // CONFIG_AOM_QM
#else
v = level * dequant[!!c];
-#if !CONFIG_DAALA_TX
v = v >> shift;
-#endif // !CONFIG_DAALA_TX
#endif // CONFIG_NEW_QUANT
tcoeffs[pos] = v;
} else {
@@ -351,9 +343,7 @@
#endif // CONFIG_AOM_QM
#else
t = *level * dequant[!!pos];
-#if !CONFIG_DAALA_TX
t = t >> shift;
-#endif // !CONFIG_DAALA_TX
#endif // CONFIG_NEW_QUANT
if (signs[pos]) t = -t;
tcoeffs[pos] = clamp(t, min_value, max_value);
@@ -374,9 +364,7 @@
#endif // CONFIG_AOM_QM
#else
t = t * dequant[!!pos];
-#if !CONFIG_DAALA_TX
t = t >> shift;
-#endif // !CONFIG_DAALA_TX
#endif // CONFIG_NEW_QUANT
if (signs[pos]) t = -t;
tcoeffs[pos] = clamp(t, min_value, max_value);
diff --git a/av1/decoder/detokenize.c b/av1/decoder/detokenize.c
index e410fa1..0c98069 100644
--- a/av1/decoder/detokenize.c
+++ b/av1/decoder/detokenize.c
@@ -106,16 +106,10 @@
const tran_low_t *dqv_val = &dq_val[0][0];
#endif // CONFIG_NEW_QUANT && !CONFIG_AOM_QM
-#if !CONFIG_DAALA_TX
int dq_shift = av1_get_tx_scale(tx_size);
-#endif
#if CONFIG_NEW_QUANT
-#if CONFIG_DAALA_TX
- int nq_shift = 0;
-#else
int nq_shift = dq_shift;
-#endif // CONFIG_DAALA_TX
#endif // CONFIG_NEW_QUANT
band = *band_translate++;
@@ -192,11 +186,7 @@
v = av1_dequant_abscoeff_nuq(val, dqv, dqv_val, nq_shift);
#endif // CONFIG_AOM_QM
#else
-#if !CONFIG_DAALA_TX
v = (int)(((int64_t)val * dqv) >> dq_shift);
-#else
- v = val * dqv;
-#endif
#endif
v = (int)check_range(av1_read_record_bit(xd->counts, r, ACCT_STR) ? -v : v,
diff --git a/av1/encoder/daala_fwd_txfm.c b/av1/encoder/daala_fwd_txfm.c
deleted file mode 100644
index 096a1b5..0000000
--- a/av1/encoder/daala_fwd_txfm.c
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "./av1_rtcd.h"
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
-#include "av1/common/daala_tx.h"
-#include "av1/encoder/daala_fwd_txfm.h"
-
-#if CONFIG_DAALA_TX
-
-// Complete Daala TX map, sans lossless which is special cased
-typedef void (*daala_ftx)(od_coeff[], const od_coeff *, int);
-
-static daala_ftx tx_map[TX_SIZES][TX_TYPES_1D] = {
- // 4-point transforms
- { od_bin_fdct4, od_bin_fdst4, od_bin_fdst4, od_bin_fidtx4 },
-
- // 8-point transforms
- { od_bin_fdct8, od_bin_fdst8, od_bin_fdst8, od_bin_fidtx8 },
-
- // 16-point transforms
- { od_bin_fdct16, od_bin_fdst16, od_bin_fdst16, od_bin_fidtx16 },
-
- // 32-point transforms
- { od_bin_fdct32, od_bin_fdst32, od_bin_fdst32, od_bin_fidtx32 },
-
-#if CONFIG_TX64X64
- // 64-point transforms
- { od_bin_fdct64, NULL, NULL, od_bin_fidtx64 },
-#endif
-};
-
-static int tx_flip(TX_TYPE_1D t) { return t == 2; }
-
-// Daala TX toplevel entry point, same interface as av1 low-bidepth
-// and high-bitdepth TX (av1_fwd_txfm and av1_highbd_fwd_txfm). This
-// same function is intended for both low and high bitdepth cases with
-// a tran_low_t of 32 bits (matching od_coeff).
-void daala_fwd_txfm(const int16_t *input_pixels, tran_low_t *output_coeffs,
- int input_stride, TxfmParam *txfm_param) {
- const TX_SIZE tx_size = txfm_param->tx_size;
- const TX_TYPE tx_type = txfm_param->tx_type;
- assert(tx_size <= TX_SIZES_ALL);
- assert(tx_type <= TX_TYPES);
-
- if (txfm_param->lossless) {
- // Transform function special-cased for lossless
- assert(tx_type == DCT_DCT);
- assert(tx_size == TX_4X4);
- av1_fwht4x4(input_pixels, output_coeffs, input_stride);
- } else {
- // General TX case
- const int upshift = TX_COEFF_DEPTH - txfm_param->bd;
- assert(upshift >= 0);
- assert(sizeof(tran_low_t) == sizeof(od_coeff));
- assert(sizeof(tran_low_t) >= 4);
-
- // Hook into existing map translation infrastructure to select
- // appropriate TX functions
- const int cols = tx_size_wide[tx_size];
- const int rows = tx_size_high[tx_size];
- const TX_SIZE col_idx = txsize_vert_map[tx_size];
- const TX_SIZE row_idx = txsize_horz_map[tx_size];
- assert(col_idx <= TX_SIZES);
- assert(row_idx <= TX_SIZES);
- assert(vtx_tab[tx_type] <= (int)TX_TYPES_1D);
- assert(htx_tab[tx_type] <= (int)TX_TYPES_1D);
- daala_ftx col_tx = tx_map[col_idx][vtx_tab[tx_type]];
- daala_ftx row_tx = tx_map[row_idx][htx_tab[tx_type]];
- int col_flip = tx_flip(vtx_tab[tx_type]);
- int row_flip = tx_flip(htx_tab[tx_type]);
- od_coeff tmp[MAX_TX_SIZE];
- int r;
- int c;
-
- assert(col_tx);
- assert(row_tx);
-
- // Transform columns
- for (c = 0; c < cols; ++c) {
- // Cast and shift
- for (r = 0; r < rows; ++r)
- tmp[r] =
- ((od_coeff)(input_pixels[r * input_stride + c])) * (1 << upshift);
- if (col_flip)
- col_tx(tmp, tmp + (rows - 1), -1);
- else
- col_tx(tmp, tmp, 1);
- // No ystride in daala_tx lowlevel functions, store output vector
- // into column the long way
- for (r = 0; r < rows; ++r) output_coeffs[r * cols + c] = tmp[r];
- }
-
- // Transform rows
- for (r = 0; r < rows; ++r) {
- if (row_flip)
- row_tx(output_coeffs + r * cols, output_coeffs + r * cols + cols - 1,
- -1);
- else
- row_tx(output_coeffs + r * cols, output_coeffs + r * cols, 1);
- }
-#if CONFIG_TX64X64
- // Re-pack coeffs in the first 32x32 indices.
- if (cols > 32) {
- int avail_rows;
- int avail_cols;
- avail_rows = AOMMIN(rows, 32);
- avail_cols = AOMMIN(cols, 32);
- for (r = 1; r < avail_rows; r++) {
- memmove(output_coeffs + r * avail_cols, output_coeffs + r * cols,
- avail_cols * sizeof(*output_coeffs));
- }
- }
-#endif
- }
-}
-
-#endif
diff --git a/av1/encoder/daala_fwd_txfm.h b/av1/encoder/daala_fwd_txfm.h
deleted file mode 100644
index e8f777a..0000000
--- a/av1/encoder/daala_fwd_txfm.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AV1_ENCODER_DAALA_FWD_TXFM_H_
-#define AV1_ENCODER_DAALA_FWD_TXFM_H_
-
-#include "./aom_config.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void daala_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride,
- TxfmParam *txfm_param);
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // AV1_ENCODER_DAALA_FWD_TXFM_H_
diff --git a/av1/encoder/dct.c b/av1/encoder/dct.c
index 3e40a4d..ebaa0d1 100644
--- a/av1/encoder/dct.c
+++ b/av1/encoder/dct.c
@@ -19,10 +19,6 @@
#include "aom_ports/mem.h"
#include "av1/common/blockd.h"
#include "av1/common/idct.h"
-#if CONFIG_DAALA_TX4 || CONFIG_DAALA_TX8 || CONFIG_DAALA_TX16 || \
- CONFIG_DAALA_TX32 || CONFIG_DAALA_TX64
-#include "av1/common/daala_tx.h"
-#endif
#include "av1/encoder/av1_fwd_txfm1d.h"
#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
@@ -1153,32 +1149,12 @@
void av1_fht4x4_c(const int16_t *input, tran_low_t *output, int stride,
TxfmParam *txfm_param) {
const TX_TYPE tx_type = txfm_param->tx_type;
-#if !CONFIG_DAALA_TX4
if (tx_type == DCT_DCT) {
aom_fdct4x4_c(input, output, stride);
return;
}
-#endif
{
static const transform_2d FHT[] = {
-#if CONFIG_DAALA_TX4
- { daala_fdct4, daala_fdct4 }, // DCT_DCT
- { daala_fdst4, daala_fdct4 }, // ADST_DCT
- { daala_fdct4, daala_fdst4 }, // DCT_ADST
- { daala_fdst4, daala_fdst4 }, // ADST_ADST
- { daala_fdst4, daala_fdct4 }, // FLIPADST_DCT
- { daala_fdct4, daala_fdst4 }, // DCT_FLIPADST
- { daala_fdst4, daala_fdst4 }, // FLIPADST_FLIPADST
- { daala_fdst4, daala_fdst4 }, // ADST_FLIPADST
- { daala_fdst4, daala_fdst4 }, // FLIPADST_ADST
- { daala_idtx4, daala_idtx4 }, // IDTX
- { daala_fdct4, daala_idtx4 }, // V_DCT
- { daala_idtx4, daala_fdct4 }, // H_DCT
- { daala_fdst4, daala_idtx4 }, // V_ADST
- { daala_idtx4, daala_fdst4 }, // H_ADST
- { daala_fdst4, daala_idtx4 }, // V_FLIPADST
- { daala_idtx4, daala_fdst4 }, // H_FLIPADST
-#else
{ fdct4, fdct4 }, // DCT_DCT
{ fadst4, fdct4 }, // ADST_DCT
{ fdct4, fadst4 }, // DCT_ADST
@@ -1195,7 +1171,6 @@
{ fidtx4, fadst4 }, // H_ADST
{ fadst4, fidtx4 }, // V_FLIPADST
{ fidtx4, fadst4 }, // H_FLIPADST
-#endif
};
const transform_2d ht = FHT[tx_type];
tran_low_t out[4 * 4];
@@ -1209,9 +1184,7 @@
for (i = 0; i < 4; ++i) {
/* A C99-safe upshift by 4 for both Daala and VPx TX. */
for (j = 0; j < 4; ++j) temp_in[j] = input[j * stride + i] * 16;
-#if !CONFIG_DAALA_TX4
if (i == 0 && temp_in[0]) temp_in[0] += 1;
-#endif
ht.cols(temp_in, temp_out);
for (j = 0; j < 4; ++j) out[j * 4 + i] = temp_out[j];
}
@@ -1220,13 +1193,7 @@
for (i = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j) temp_in[j] = out[j + i * 4];
ht.rows(temp_in, temp_out);
-#if CONFIG_DAALA_TX4
- /* Daala TX has orthonormal scaling; shift down by only 1 to achieve
- the usual VPx coefficient left-shift of 3. */
- for (j = 0; j < 4; ++j) output[j + i * 4] = temp_out[j] >> 1;
-#else
for (j = 0; j < 4; ++j) output[j + i * 4] = (temp_out[j] + 1) >> 2;
-#endif
}
}
}
@@ -1235,24 +1202,6 @@
TxfmParam *txfm_param) {
const TX_TYPE tx_type = txfm_param->tx_type;
static const transform_2d FHT[] = {
-#if CONFIG_DAALA_TX4 && CONFIG_DAALA_TX8
- { daala_fdct8, daala_fdct4 }, // DCT_DCT
- { daala_fdst8, daala_fdct4 }, // ADST_DCT
- { daala_fdct8, daala_fdst4 }, // DCT_ADST
- { daala_fdst8, daala_fdst4 }, // ADST_ADST
- { daala_fdst8, daala_fdct4 }, // FLIPADST_DCT
- { daala_fdct8, daala_fdst4 }, // DCT_FLIPADST
- { daala_fdst8, daala_fdst4 }, // FLIPADST_FLIPADST
- { daala_fdst8, daala_fdst4 }, // ADST_FLIPADST
- { daala_fdst8, daala_fdst4 }, // FLIPADST_ADST
- { daala_idtx8, daala_idtx4 }, // IDTX
- { daala_fdct8, daala_idtx4 }, // V_DCT
- { daala_idtx8, daala_fdct4 }, // H_DCT
- { daala_fdst8, daala_idtx4 }, // V_ADST
- { daala_idtx8, daala_fdst4 }, // H_ADST
- { daala_fdst8, daala_idtx4 }, // V_FLIPADST
- { daala_idtx8, daala_fdst4 }, // H_FLIPADST
-#else
{ fdct8, fdct4 }, // DCT_DCT
{ fadst8, fdct4 }, // ADST_DCT
{ fdct8, fadst4 }, // DCT_ADST
@@ -1269,7 +1218,6 @@
{ fidtx8, fadst4 }, // H_ADST
{ fadst8, fidtx4 }, // V_FLIPADST
{ fidtx8, fadst4 }, // H_FLIPADST
-#endif
};
const transform_2d ht = FHT[tx_type];
const int n = 4;
@@ -1290,14 +1238,9 @@
for (i = 0; i < n2; ++i) {
// Input scaling
for (j = 0; j < n; ++j) {
-#if CONFIG_DAALA_TX4 && CONFIG_DAALA_TX8
- // Input scaling when LGT is not possible, Daala only (4 above)
- temp_in[j] = input[i * stride + j] * 16;
-#else
// Input scaling when Daala is not possible, LGT/AV1 only (1 above)
temp_in[j] =
(tran_low_t)fdct_round_shift(input[i * stride + j] * 4 * Sqrt2);
-#endif
}
// Row transform (AV1/LGT scale up .5 bit, Daala does not scale)
ht.rows(temp_in, temp_out);
@@ -1321,24 +1264,6 @@
TxfmParam *txfm_param) {
const TX_TYPE tx_type = txfm_param->tx_type;
static const transform_2d FHT[] = {
-#if CONFIG_DAALA_TX4 && CONFIG_DAALA_TX8
- { daala_fdct4, daala_fdct8 }, // DCT_DCT
- { daala_fdst4, daala_fdct8 }, // ADST_DCT
- { daala_fdct4, daala_fdst8 }, // DCT_ADST
- { daala_fdst4, daala_fdst8 }, // ADST_ADST
- { daala_fdst4, daala_fdct8 }, // FLIPADST_DCT
- { daala_fdct4, daala_fdst8 }, // DCT_FLIPADST
- { daala_fdst4, daala_fdst8 }, // FLIPADST_FLIPADST
- { daala_fdst4, daala_fdst8 }, // ADST_FLIPADST
- { daala_fdst4, daala_fdst8 }, // FLIPADST_ADST
- { daala_idtx4, daala_idtx8 }, // IDTX
- { daala_fdct4, daala_idtx8 }, // V_DCT
- { daala_idtx4, daala_fdct8 }, // H_DCT
- { daala_fdst4, daala_idtx8 }, // V_ADST
- { daala_idtx4, daala_fdst8 }, // H_ADST
- { daala_fdst4, daala_idtx8 }, // V_FLIPADST
- { daala_idtx4, daala_fdst8 }, // H_FLIPADST
-#else
{ fdct4, fdct8 }, // DCT_DCT
{ fadst4, fdct8 }, // ADST_DCT
{ fdct4, fadst8 }, // DCT_ADST
@@ -1355,7 +1280,6 @@
{ fidtx4, fadst8 }, // H_ADST
{ fadst4, fidtx8 }, // V_FLIPADST
{ fidtx4, fadst8 }, // H_FLIPADST
-#endif
};
const transform_2d ht = FHT[tx_type];
const int n = 4;
@@ -1375,14 +1299,9 @@
// Columns
for (i = 0; i < n2; ++i) {
for (j = 0; j < n; ++j) {
-#if CONFIG_DAALA_TX4 && CONFIG_DAALA_TX8
- // Input scaling when LGT is not possible, Daala only (4 above)
- temp_in[j] = input[j * stride + i] * 16;
-#else
// Input scaling when Daala is not possible, AV1/LGT only (1 above)
temp_in[j] =
(tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2);
-#endif
}
// Column transform (AV1/LGT scale up .5 bit, Daala does not scale)
ht.cols(temp_in, temp_out);
@@ -1500,24 +1419,6 @@
TxfmParam *txfm_param) {
const TX_TYPE tx_type = txfm_param->tx_type;
static const transform_2d FHT[] = {
-#if CONFIG_DAALA_TX8 && CONFIG_DAALA_TX16
- { daala_fdct16, daala_fdct8 }, // DCT_DCT
- { daala_fdst16, daala_fdct8 }, // ADST_DCT
- { daala_fdct16, daala_fdst8 }, // DCT_ADST
- { daala_fdst16, daala_fdst8 }, // ADST_ADST
- { daala_fdst16, daala_fdct8 }, // FLIPADST_DCT
- { daala_fdct16, daala_fdst8 }, // DCT_FLIPADST
- { daala_fdst16, daala_fdst8 }, // FLIPADST_FLIPADST
- { daala_fdst16, daala_fdst8 }, // ADST_FLIPADST
- { daala_fdst16, daala_fdst8 }, // FLIPADST_ADST
- { daala_idtx16, daala_idtx8 }, // IDTX
- { daala_fdct16, daala_idtx8 }, // V_DCT
- { daala_idtx16, daala_fdct8 }, // H_DCT
- { daala_fdst16, daala_idtx8 }, // V_ADST
- { daala_idtx16, daala_fdst8 }, // H_ADST
- { daala_fdst16, daala_idtx8 }, // V_FLIPADST
- { daala_idtx16, daala_fdst8 }, // H_FLIPADST
-#else
{ fdct16, fdct8 }, // DCT_DCT
{ fadst16, fdct8 }, // ADST_DCT
{ fdct16, fadst8 }, // DCT_ADST
@@ -1534,7 +1435,6 @@
{ fidtx16, fadst8 }, // H_ADST
{ fadst16, fidtx8 }, // V_FLIPADST
{ fidtx16, fadst8 }, // H_FLIPADST
-#endif
};
const transform_2d ht = FHT[tx_type];
const int n = 8;
@@ -1555,14 +1455,9 @@
for (i = 0; i < n2; ++i) {
// Input scaling
for (j = 0; j < n; ++j) {
-#if CONFIG_DAALA_TX8 && CONFIG_DAALA_TX16
- // Input scaling when LGT is not possible, Daala only (case 4 above)
- temp_in[j] = input[i * stride + j] * 16;
-#else
// Input scaling when Daala is not possible, LGT/AV1 only (case 1 above)
temp_in[j] =
(tran_low_t)fdct_round_shift(input[i * stride + j] * 4 * Sqrt2);
-#endif
}
// Row transform (AV1/LGT scale up 1 bit, Daala does not scale)
@@ -1570,13 +1465,8 @@
// Mid scaling
for (j = 0; j < n; ++j) {
-#if CONFIG_DAALA_TX8 && CONFIG_DAALA_TX16
- // mid scaling: only cases 2 and 4 possible
- out[j * n2 + i] = temp_out[j];
-#else
// mid scaling: only case 1 possible
out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
-#endif
}
}
@@ -1586,13 +1476,8 @@
// Column transform (AV1/LGT scale up 1.5 bits, Daala does not scale)
ht.cols(temp_in, temp_out);
for (j = 0; j < n2; ++j) {
-#if CONFIG_DAALA_TX8 && CONFIG_DAALA_TX16
- // Output scaling (cases 2 and 3 above)
- output[i + j * n] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
-#else
// Output scaling (case 1 above)
output[i + j * n] = temp_out[j];
-#endif
}
}
// Note: overall scale factor of transform is 8 times unitary
@@ -1602,24 +1487,6 @@
TxfmParam *txfm_param) {
const TX_TYPE tx_type = txfm_param->tx_type;
static const transform_2d FHT[] = {
-#if CONFIG_DAALA_TX8 && CONFIG_DAALA_TX16
- { daala_fdct8, daala_fdct16 }, // DCT_DCT
- { daala_fdst8, daala_fdct16 }, // ADST_DCT
- { daala_fdct8, daala_fdst16 }, // DCT_ADST
- { daala_fdst8, daala_fdst16 }, // ADST_ADST
- { daala_fdst8, daala_fdct16 }, // FLIPADST_DCT
- { daala_fdct8, daala_fdst16 }, // DCT_FLIPADST
- { daala_fdst8, daala_fdst16 }, // FLIPADST_FLIPADST
- { daala_fdst8, daala_fdst16 }, // ADST_FLIPADST
- { daala_fdst8, daala_fdst16 }, // FLIPADST_ADST
- { daala_idtx8, daala_idtx16 }, // IDTX
- { daala_fdct8, daala_idtx16 }, // V_DCT
- { daala_idtx8, daala_fdct16 }, // H_DCT
- { daala_fdst8, daala_idtx16 }, // V_ADST
- { daala_idtx8, daala_fdst16 }, // H_ADST
- { daala_fdst8, daala_idtx16 }, // V_FLIPADST
- { daala_idtx8, daala_fdst16 }, // H_FLIPADST
-#else
{ fdct8, fdct16 }, // DCT_DCT
{ fadst8, fdct16 }, // ADST_DCT
{ fdct8, fadst16 }, // DCT_ADST
@@ -1636,7 +1503,6 @@
{ fidtx8, fadst16 }, // H_ADST
{ fadst8, fidtx16 }, // V_FLIPADST
{ fidtx8, fadst16 }, // H_FLIPADST
-#endif
};
const transform_2d ht = FHT[tx_type];
const int n = 8;
@@ -1657,14 +1523,9 @@
for (i = 0; i < n2; ++i) {
// Input scaling
for (j = 0; j < n; ++j) {
-#if CONFIG_DAALA_TX8 && CONFIG_DAALA_TX16
- // Input scaling when LGT is not possible, Daala only (4 above)
- temp_in[j] = input[j * stride + i] * 16;
-#else
// Input scaling when Daala is not possible, AV1/LGT only (1 above)
temp_in[j] =
(tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2);
-#endif
}
// Column transform (AV1/LGT scale up 1 bit, Daala does not scale)
@@ -1672,13 +1533,8 @@
// Mid scaling
for (j = 0; j < n; ++j) {
-#if CONFIG_DAALA_TX8 && CONFIG_DAALA_TX16
- // scaling cases 2 and 4 above
- out[j * n2 + i] = temp_out[j];
-#else
// Scaling case 1 above
out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
-#endif
}
}
@@ -1688,13 +1544,8 @@
// Row transform (AV1 scales up 1.5 bits, Daala does not scale)
ht.rows(temp_in, temp_out);
for (j = 0; j < n2; ++j) {
-#if CONFIG_DAALA_TX8 && CONFIG_DAALA_TX16
- // Output scaing cases 2 and 4 above
- output[j + i * n2] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
-#else
// Ouptut scaling case 1 above
output[j + i * n2] = temp_out[j];
-#endif
}
}
// Note: overall scale factor of transform is 8 times unitary
@@ -1798,24 +1649,6 @@
TxfmParam *txfm_param) {
const TX_TYPE tx_type = txfm_param->tx_type;
static const transform_2d FHT[] = {
-#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
- { daala_fdct32, daala_fdct16 }, // DCT_DCT
- { daala_fdst32, daala_fdct16 }, // ADST_DCT
- { daala_fdct32, daala_fdst16 }, // DCT_ADST
- { daala_fdst32, daala_fdst16 }, // ADST_ADST
- { daala_fdst32, daala_fdct16 }, // FLIPADST_DCT
- { daala_fdct32, daala_fdst16 }, // DCT_FLIPADST
- { daala_fdst32, daala_fdst16 }, // FLIPADST_FLIPADST
- { daala_fdst32, daala_fdst16 }, // ADST_FLIPADST
- { daala_fdst32, daala_fdst16 }, // FLIPADST_ADST
- { daala_idtx32, daala_idtx16 }, // IDTX
- { daala_fdct32, daala_idtx16 }, // V_DCT
- { daala_idtx32, daala_fdct16 }, // H_DCT
- { daala_fdst32, daala_idtx16 }, // V_ADST
- { daala_idtx32, daala_fdst16 }, // H_ADST
- { daala_fdst32, daala_idtx16 }, // V_FLIPADST
- { daala_idtx32, daala_fdst16 }, // H_FLIPADST
-#else
{ fdct32, fdct16 }, // DCT_DCT
{ fhalfright32, fdct16 }, // ADST_DCT
{ fdct32, fadst16 }, // DCT_ADST
@@ -1832,7 +1665,6 @@
{ fidtx32, fadst16 }, // H_ADST
{ fhalfright32, fidtx16 }, // V_FLIPADST
{ fidtx32, fadst16 }, // H_FLIPADST
-#endif
};
const transform_2d ht = FHT[tx_type];
const int n = 16;
@@ -1846,20 +1678,12 @@
// Rows
for (i = 0; i < n2; ++i) {
for (j = 0; j < n; ++j) {
-#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
- temp_in[j] = input[i * stride + j] * 16;
-#else
temp_in[j] =
(tran_low_t)fdct_round_shift(input[i * stride + j] * 4 * Sqrt2);
-#endif
}
ht.rows(temp_in, temp_out);
for (j = 0; j < n; ++j) {
-#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
- out[j * n2 + i] = temp_out[j];
-#else
out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4);
-#endif
}
}
@@ -1867,12 +1691,7 @@
for (i = 0; i < n; ++i) {
for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
ht.cols(temp_in, temp_out);
-#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
- for (j = 0; j < n2; ++j)
- output[i + j * n] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
-#else
for (j = 0; j < n2; ++j) output[i + j * n] = temp_out[j];
-#endif
}
// Note: overall scale factor of transform is 4 times unitary
}
@@ -1881,24 +1700,6 @@
TxfmParam *txfm_param) {
const TX_TYPE tx_type = txfm_param->tx_type;
static const transform_2d FHT[] = {
-#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
- { daala_fdct16, daala_fdct32 }, // DCT_DCT
- { daala_fdst16, daala_fdct32 }, // ADST_DCT
- { daala_fdct16, daala_fdst32 }, // DCT_ADST
- { daala_fdst16, daala_fdst32 }, // ADST_ADST
- { daala_fdst16, daala_fdct32 }, // FLIPADST_DCT
- { daala_fdct16, daala_fdst32 }, // DCT_FLIPADST
- { daala_fdst16, daala_fdst32 }, // FLIPADST_FLIPADST
- { daala_fdst16, daala_fdst32 }, // ADST_FLIPADST
- { daala_fdst16, daala_fdst32 }, // FLIPADST_ADST
- { daala_idtx16, daala_idtx32 }, // IDTX
- { daala_fdct16, daala_idtx32 }, // V_DCT
- { daala_idtx16, daala_fdct32 }, // H_DCT
- { daala_fdst16, daala_idtx32 }, // V_ADST
- { daala_idtx16, daala_fdst32 }, // H_ADST
- { daala_fdst16, daala_idtx32 }, // V_FLIPADST
- { daala_idtx16, daala_fdst32 }, // H_FLIPADST
-#else
{ fdct16, fdct32 }, // DCT_DCT
{ fadst16, fdct32 }, // ADST_DCT
{ fdct16, fhalfright32 }, // DCT_ADST
@@ -1915,7 +1716,6 @@
{ fidtx16, fhalfright32 }, // H_ADST
{ fadst16, fidtx32 }, // V_FLIPADST
{ fidtx16, fhalfright32 }, // H_FLIPADST
-#endif
};
const transform_2d ht = FHT[tx_type];
const int n = 16;
@@ -1929,20 +1729,12 @@
// Columns
for (i = 0; i < n2; ++i) {
for (j = 0; j < n; ++j) {
-#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
- temp_in[j] = input[j * stride + i] * 16;
-#else
temp_in[j] =
(tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2);
-#endif
}
ht.cols(temp_in, temp_out);
for (j = 0; j < n; ++j) {
-#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
- out[j * n2 + i] = temp_out[j];
-#else
out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4);
-#endif
}
}
@@ -1950,12 +1742,7 @@
for (i = 0; i < n; ++i) {
for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
ht.rows(temp_in, temp_out);
-#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
- for (j = 0; j < n2; ++j)
- output[j + i * n2] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
-#else
for (j = 0; j < n2; ++j) output[j + i * n2] = temp_out[j];
-#endif
}
// Note: overall scale factor of transform is 4 times unitary
}
@@ -1963,32 +1750,12 @@
void av1_fht8x8_c(const int16_t *input, tran_low_t *output, int stride,
TxfmParam *txfm_param) {
const TX_TYPE tx_type = txfm_param->tx_type;
-#if !CONFIG_DAALA_TX8
if (tx_type == DCT_DCT) {
aom_fdct8x8_c(input, output, stride);
return;
}
-#endif
{
static const transform_2d FHT[] = {
-#if CONFIG_DAALA_TX8
- { daala_fdct8, daala_fdct8 }, // DCT_DCT
- { daala_fdst8, daala_fdct8 }, // ADST_DCT
- { daala_fdct8, daala_fdst8 }, // DCT_ADST
- { daala_fdst8, daala_fdst8 }, // ADST_ADST
- { daala_fdst8, daala_fdct8 }, // FLIPADST_DCT
- { daala_fdct8, daala_fdst8 }, // DCT_FLIPADST
- { daala_fdst8, daala_fdst8 }, // FLIPADST_FLIPADST
- { daala_fdst8, daala_fdst8 }, // ADST_FLIPADST
- { daala_fdst8, daala_fdst8 }, // FLIPADST_ADST
- { daala_idtx8, daala_idtx8 }, // IDTX
- { daala_fdct8, daala_idtx8 }, // V_DCT
- { daala_idtx8, daala_fdct8 }, // H_DCT
- { daala_fdst8, daala_idtx8 }, // V_ADST
- { daala_idtx8, daala_fdst8 }, // H_ADST
- { daala_fdst8, daala_idtx8 }, // V_FLIPADST
- { daala_idtx8, daala_fdst8 }, // H_FLIPADST
-#else
{ fdct8, fdct8 }, // DCT_DCT
{ fadst8, fdct8 }, // ADST_DCT
{ fdct8, fadst8 }, // DCT_ADST
@@ -2005,7 +1772,6 @@
{ fidtx8, fadst8 }, // H_ADST
{ fadst8, fidtx8 }, // V_FLIPADST
{ fidtx8, fadst8 }, // H_FLIPADST
-#endif
};
const transform_2d ht = FHT[tx_type];
tran_low_t out[64];
@@ -2017,11 +1783,7 @@
// Columns
for (i = 0; i < 8; ++i) {
-#if CONFIG_DAALA_TX8
- for (j = 0; j < 8; ++j) temp_in[j] = input[j * stride + i] * 16;
-#else
for (j = 0; j < 8; ++j) temp_in[j] = input[j * stride + i] * 4;
-#endif
ht.cols(temp_in, temp_out);
for (j = 0; j < 8; ++j) out[j * 8 + i] = temp_out[j];
}
@@ -2030,13 +1792,8 @@
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j) temp_in[j] = out[j + i * 8];
ht.rows(temp_in, temp_out);
-#if CONFIG_DAALA_TX8
for (j = 0; j < 8; ++j)
output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
-#else
- for (j = 0; j < 8; ++j)
- output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
-#endif
}
}
}
@@ -2101,24 +1858,6 @@
TxfmParam *txfm_param) {
const TX_TYPE tx_type = txfm_param->tx_type;
static const transform_2d FHT[] = {
-#if CONFIG_DAALA_TX16
- { daala_fdct16, daala_fdct16 }, // DCT_DCT
- { daala_fdst16, daala_fdct16 }, // ADST_DCT
- { daala_fdct16, daala_fdst16 }, // DCT_ADST
- { daala_fdst16, daala_fdst16 }, // ADST_ADST
- { daala_fdst16, daala_fdct16 }, // FLIPADST_DCT
- { daala_fdct16, daala_fdst16 }, // DCT_FLIPADST
- { daala_fdst16, daala_fdst16 }, // FLIPADST_FLIPADST
- { daala_fdst16, daala_fdst16 }, // ADST_FLIPADST
- { daala_fdst16, daala_fdst16 }, // FLIPADST_ADST
- { daala_idtx16, daala_idtx16 }, // IDTX
- { daala_fdct16, daala_idtx16 }, // V_DCT
- { daala_idtx16, daala_fdct16 }, // H_DCT
- { daala_fdst16, daala_idtx16 }, // V_ADST
- { daala_idtx16, daala_fdst16 }, // H_ADST
- { daala_fdst16, daala_idtx16 }, // V_FLIPADST
- { daala_idtx16, daala_fdst16 }, // H_FLIPADST
-#else
{ fdct16, fdct16 }, // DCT_DCT
{ fadst16, fdct16 }, // ADST_DCT
{ fdct16, fadst16 }, // DCT_ADST
@@ -2135,7 +1874,6 @@
{ fidtx16, fadst16 }, // H_ADST
{ fadst16, fidtx16 }, // V_FLIPADST
{ fidtx16, fadst16 }, // H_FLIPADST
-#endif
};
const transform_2d ht = FHT[tx_type];
tran_low_t out[256];
@@ -2148,19 +1886,11 @@
// Columns
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j) {
-#if CONFIG_DAALA_TX16
- temp_in[j] = input[j * stride + i] * 16;
-#else
temp_in[j] = input[j * stride + i] * 4;
-#endif
}
ht.cols(temp_in, temp_out);
for (j = 0; j < 16; ++j) {
-#if CONFIG_DAALA_TX16
- out[j * 16 + i] = temp_out[j];
-#else
out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
-#endif
}
}
@@ -2169,11 +1899,7 @@
for (j = 0; j < 16; ++j) temp_in[j] = out[j + i * 16];
ht.rows(temp_in, temp_out);
for (j = 0; j < 16; ++j) {
-#if CONFIG_DAALA_TX16
- output[j + i * 16] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
-#else
output[j + i * 16] = temp_out[j];
-#endif
}
}
}
@@ -2187,24 +1913,6 @@
TxfmParam *txfm_param) {
const TX_TYPE tx_type = txfm_param->tx_type;
static const transform_2d FHT[] = {
-#if CONFIG_DAALA_TX32
- { daala_fdct32, daala_fdct32 }, // DCT_DCT
- { daala_fdst32, daala_fdct32 }, // ADST_DCT
- { daala_fdct32, daala_fdst32 }, // DCT_ADST
- { daala_fdst32, daala_fdst32 }, // ADST_ADST
- { daala_fdst32, daala_fdct32 }, // FLIPADST_DCT
- { daala_fdct32, daala_fdst32 }, // DCT_FLIPADST
- { daala_fdst32, daala_fdst32 }, // FLIPADST_FLIPADST
- { daala_fdst32, daala_fdst32 }, // ADST_FLIPADST
- { daala_fdst32, daala_fdst32 }, // FLIPADST_ADST
- { daala_idtx32, daala_idtx32 }, // IDTX
- { daala_fdct32, daala_idtx32 }, // V_DCT
- { daala_idtx32, daala_fdct32 }, // H_DCT
- { daala_fdst32, daala_idtx32 }, // V_ADST
- { daala_idtx32, daala_fdst32 }, // H_ADST
- { daala_fdst32, daala_idtx32 }, // V_FLIPADST
- { daala_idtx32, daala_fdst32 }, // H_FLIPADST
-#else
{ fdct32, fdct32 }, // DCT_DCT
{ fhalfright32, fdct32 }, // ADST_DCT
{ fdct32, fhalfright32 }, // DCT_ADST
@@ -2221,7 +1929,6 @@
{ fidtx32, fhalfright32 }, // H_ADST
{ fhalfright32, fidtx32 }, // V_FLIPADST
{ fidtx32, fhalfright32 }, // H_FLIPADST
-#endif
};
const transform_2d ht = FHT[tx_type];
tran_low_t out[1024];
@@ -2234,19 +1941,11 @@
// Columns
for (i = 0; i < 32; ++i) {
for (j = 0; j < 32; ++j) {
-#if CONFIG_DAALA_TX32
- temp_in[j] = input[j * stride + i] * 16;
-#else
temp_in[j] = input[j * stride + i] * 4;
-#endif
}
ht.cols(temp_in, temp_out);
for (j = 0; j < 32; ++j) {
-#if CONFIG_DAALA_TX32
- out[j * 32 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
-#else
out[j * 32 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4);
-#endif
}
}
@@ -2260,7 +1959,7 @@
}
}
-#if CONFIG_TX64X64 && !(CONFIG_DAALA_TX64 && CONFIG_DAALA_TX32)
+#if CONFIG_TX64X64
static void fidtx64(const tran_low_t *input, tran_low_t *output) {
int i;
for (i = 0; i < 64; ++i)
@@ -2312,24 +2011,6 @@
TxfmParam *txfm_param) {
const TX_TYPE tx_type = txfm_param->tx_type;
static const transform_2d FHT[] = {
-#if CONFIG_DAALA_TX64
- { daala_fdct64, daala_fdct64 }, // DCT_DCT
- { daala_fdst64, daala_fdct64 }, // ADST_DCT
- { daala_fdct64, daala_fdst64 }, // DCT_ADST
- { daala_fdst64, daala_fdst64 }, // ADST_ADST
- { daala_fdst64, daala_fdct64 }, // FLIPADST_DCT
- { daala_fdct64, daala_fdst64 }, // DCT_FLIPADST
- { daala_fdst64, daala_fdst64 }, // FLIPADST_FLIPADST
- { daala_fdst64, daala_fdst64 }, // ADST_FLIPADST
- { daala_fdst64, daala_fdst64 }, // FLIPADST_ADST
- { daala_idtx64, daala_idtx64 }, // IDTX
- { daala_fdct64, daala_idtx64 }, // V_DCT
- { daala_idtx64, daala_fdct64 }, // H_DCT
- { daala_fdst64, daala_idtx64 }, // V_ADST
- { daala_idtx64, daala_fdst64 }, // H_ADST
- { daala_fdst64, daala_idtx64 }, // V_FLIPADST
- { daala_idtx64, daala_fdst64 }, // H_FLIPADST
-#else
{ fdct64_col, fdct64_row }, // DCT_DCT
{ fhalfright64, fdct64_row }, // ADST_DCT
{ fdct64_col, fhalfright64 }, // DCT_ADST
@@ -2346,7 +2027,6 @@
{ fidtx64, fhalfright64 }, // H_ADST
{ fhalfright64, fidtx64 }, // V_FLIPADST
{ fidtx64, fhalfright64 }, // H_FLIPADST
-#endif // CONFIG_DAALA_TX64
};
const transform_2d ht = FHT[tx_type];
tran_low_t out[4096];
@@ -2357,17 +2037,10 @@
// Columns
for (i = 0; i < 64; ++i) {
-#if CONFIG_DAALA_TX64
- for (j = 0; j < 64; ++j) temp_in[j] = input[j * stride + i] * 16;
- ht.cols(temp_in, temp_out);
- for (j = 0; j < 64; ++j) out[j * 64 + i] = temp_out[j];
-
-#else
for (j = 0; j < 64; ++j) temp_in[j] = input[j * stride + i];
ht.cols(temp_in, temp_out);
for (j = 0; j < 64; ++j)
out[j * 64 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
-#endif
}
// Rows
@@ -2375,12 +2048,8 @@
for (j = 0; j < 64; ++j) temp_in[j] = out[j + i * 64];
ht.rows(temp_in, temp_out);
for (j = 0; j < 64; ++j)
-#if CONFIG_DAALA_TX64
- output[j + i * 64] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 3);
-#else
output[j + i * 64] =
(tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
-#endif
}
// Zero out top-right 32x32 area.
@@ -2399,24 +2068,6 @@
TxfmParam *txfm_param) {
const TX_TYPE tx_type = txfm_param->tx_type;
static const transform_2d FHT[] = {
-#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
- { daala_fdct32, daala_fdct64 }, // DCT_DCT
- { daala_fdst32, daala_fdct64 }, // ADST_DCT
- { daala_fdct32, daala_fdst64 }, // DCT_ADST
- { daala_fdst32, daala_fdst64 }, // ADST_ADST
- { daala_fdst32, daala_fdct64 }, // FLIPADST_DCT
- { daala_fdct32, daala_fdst64 }, // DCT_FLIPADST
- { daala_fdst32, daala_fdst64 }, // FLIPADST_FLIPADST
- { daala_fdst32, daala_fdst64 }, // ADST_FLIPADST
- { daala_fdst32, daala_fdst64 }, // FLIPADST_ADST
- { daala_idtx32, daala_idtx64 }, // IDTX
- { daala_fdct32, daala_idtx64 }, // V_DCT
- { daala_idtx32, daala_fdct64 }, // H_DCT
- { daala_fdst32, daala_idtx64 }, // V_ADST
- { daala_idtx32, daala_fdst64 }, // H_ADST
- { daala_fdst32, daala_idtx64 }, // V_FLIPADST
- { daala_idtx32, daala_fdst64 }, // H_FLIPADST
-#else
{ fdct32, fdct64_row }, // DCT_DCT
{ fhalfright32, fdct64_row }, // ADST_DCT
{ fdct32, fhalfright64 }, // DCT_ADST
@@ -2433,7 +2084,6 @@
{ fidtx32, fhalfright64 }, // H_ADST
{ fhalfright32, fidtx64 }, // V_FLIPADST
{ fidtx32, fhalfright64 }, // H_FLIPADST
-#endif
};
const transform_2d ht = FHT[tx_type];
tran_low_t out[2048];
@@ -2447,19 +2097,11 @@
// Columns
for (i = 0; i < n2; ++i) {
for (j = 0; j < n; ++j) {
-#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
- temp_in[j] = input[j * stride + i] * 16;
-#else
temp_in[j] = (tran_low_t)fdct_round_shift(input[j * stride + i] * Sqrt2);
-#endif
}
ht.cols(temp_in, temp_out);
for (j = 0; j < n; ++j) {
-#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
- out[j * n2 + i] = temp_out[j];
-#else
out[j * n2 + i] = (tran_low_t)ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
-#endif
}
}
@@ -2468,13 +2110,8 @@
for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
ht.rows(temp_in, temp_out);
for (j = 0; j < n2; ++j) {
-#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
- output[j + i * n2] =
- (tran_low_t)ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 3);
-#else
output[j + i * n2] =
(tran_low_t)ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
-#endif
}
}
@@ -2492,24 +2129,6 @@
TxfmParam *txfm_param) {
const TX_TYPE tx_type = txfm_param->tx_type;
static const transform_2d FHT[] = {
-#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
- { daala_fdct64, daala_fdct32 }, // DCT_DCT
- { daala_fdst64, daala_fdct32 }, // ADST_DCT
- { daala_fdct64, daala_fdst32 }, // DCT_ADST
- { daala_fdst64, daala_fdst32 }, // ADST_ADST
- { daala_fdst64, daala_fdct32 }, // FLIPADST_DCT
- { daala_fdct64, daala_fdst32 }, // DCT_FLIPADST
- { daala_fdst64, daala_fdst32 }, // FLIPADST_FLIPADST
- { daala_fdst64, daala_fdst32 }, // ADST_FLIPADST
- { daala_fdst64, daala_fdst32 }, // FLIPADST_ADST
- { daala_idtx64, daala_idtx32 }, // IDTX
- { daala_fdct64, daala_idtx32 }, // V_DCT
- { daala_idtx64, daala_fdct32 }, // H_DCT
- { daala_fdst64, daala_idtx32 }, // V_ADST
- { daala_idtx64, daala_fdst32 }, // H_ADST
- { daala_fdst64, daala_idtx32 }, // V_FLIPADST
- { daala_idtx64, daala_fdst32 }, // H_FLIPADST
-#else
{ fdct64_row, fdct32 }, // DCT_DCT
{ fhalfright64, fdct32 }, // ADST_DCT
{ fdct64_row, fhalfright32 }, // DCT_ADST
@@ -2526,7 +2145,6 @@
{ fidtx64, fhalfright32 }, // H_ADST
{ fhalfright64, fidtx32 }, // V_FLIPADST
{ fidtx64, fhalfright32 }, // H_FLIPADST
-#endif
};
const transform_2d ht = FHT[tx_type];
tran_low_t out[32 * 64];
@@ -2540,19 +2158,11 @@
// Rows
for (i = 0; i < n2; ++i) {
for (j = 0; j < n; ++j) {
-#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
- temp_in[j] = input[i * stride + j] * 16;
-#else
temp_in[j] = (tran_low_t)fdct_round_shift(input[i * stride + j] * Sqrt2);
-#endif
}
ht.rows(temp_in, temp_out);
for (j = 0; j < n; ++j) {
-#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
- out[j * n2 + i] = temp_out[j];
-#else
out[j * n2 + i] = (tran_low_t)ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
-#endif
}
}
@@ -2561,11 +2171,7 @@
for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
ht.cols(temp_in, temp_out);
for (j = 0; j < n2; ++j) {
-#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
- output[i + j * n] = (tran_low_t)ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 3);
-#else
output[i + j * n] = (tran_low_t)ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
-#endif
}
}
diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c
index a9e1d32..474839a 100644
--- a/av1/encoder/encodemb.c
+++ b/av1/encoder/encodemb.c
@@ -33,9 +33,6 @@
#include "av1/encoder/encodetxb.h"
#endif
#include "av1/encoder/hybrid_fwd_txfm.h"
-#if CONFIG_DAALA_TX
-#include "av1/common/daala_inv_txfm.h"
-#endif
#include "av1/encoder/rd.h"
#include "av1/encoder/rdopt.h"
@@ -143,14 +140,7 @@
get_scan(cm, tx_size, tx_type, &xd->mi[0]->mbmi);
const int16_t *const scan = scan_order->scan;
const int16_t *const nb = scan_order->neighbors;
-#if CONFIG_DAALA_TX
- // This is one of the few places where RDO is done on coeffs; it
- // expects the coeffs to be in Q3/D11, so we need to scale them.
- int depth_shift = (TX_COEFF_DEPTH - 11) * 2;
- int depth_round = depth_shift > 1 ? (1 << depth_shift >> 1) : 0;
-#else
const int shift = av1_get_tx_scale(tx_size);
-#endif
#if CONFIG_AOM_QM
int seg_id = xd->mi[0]->mbmi.segment_id;
const TX_SIZE qm_tx_size = av1_get_adjusted_tx_size(tx_size);
@@ -219,19 +209,14 @@
tail_token_costs[band_cur][ctx_cur]);
// accu_error does not change when x==0
} else {
-/* Computing distortion
- */
-// compute the distortion for the first candidate
-// and the distortion for quantizing to 0.
-#if CONFIG_DAALA_TX
- int dx0 = coeff[rc];
- const int64_t d0 = ((int64_t)dx0 * dx0 + depth_round) >> depth_shift;
-#else
+ /* Computing distortion
+ */
+ // compute the distortion for the first candidate
+ // and the distortion for quantizing to 0.
int dx0 = abs(coeff[rc]) * (1 << shift);
dx0 >>= xd->bd - 8;
const int64_t d0 = (int64_t)dx0 * dx0;
-#endif
const int x_a = x - 2 * sz - 1;
int dqv = dequant_ptr[rc != 0];
#if CONFIG_AOM_QM
@@ -241,33 +226,15 @@
}
#endif // CONFIG_AOM_QM
-#if CONFIG_DAALA_TX
- int dx = dqcoeff[rc] - coeff[rc];
- const int64_t d2 = ((int64_t)dx * dx + depth_round) >> depth_shift;
-#else
int dx = (dqcoeff[rc] - coeff[rc]) * (1 << shift);
dx = signed_shift_right(dx, xd->bd - 8);
const int64_t d2 = (int64_t)dx * dx;
-#endif
/* compute the distortion for the second candidate
* x_a = x - 2 * sz + 1;
*/
int64_t d2_a;
if (x_a != 0) {
-#if CONFIG_DAALA_TX
-#if CONFIG_NEW_QUANT
-#if CONFIG_AOM_QM
- dx = av1_dequant_coeff_nuq(x_a, dqv, dq, rc != 0, 0) - coeff[rc];
-#else
- dx = av1_dequant_coeff_nuq(x_a, dqv, dequant_val[rc != 0], 0) -
- coeff[rc];
-#endif // CONFIG_AOM_QM
-#else // CONFIG_NEW_QUANT
- dx -= (dqv + sz) ^ sz;
-#endif // CONFIG_NEW_QUANT
- d2_a = ((int64_t)dx * dx + depth_round) >> depth_shift;
-#else // CONFIG_DAALA_TX
#if CONFIG_NEW_QUANT
#if CONFIG_AOM_QM
dx = av1_dequant_coeff_nuq(x_a, dqv, dq, rc != 0, 0) -
@@ -281,7 +248,6 @@
dx -= ((dqv >> (xd->bd - 8)) + sz) ^ sz;
#endif // CONFIG_NEW_QUANT
d2_a = (int64_t)dx * dx;
-#endif // CONFIG_DAALA_TX
} else {
d2_a = d0;
}
@@ -354,19 +320,6 @@
int dqc_a = 0;
if (best_x || best_eob_x) {
if (x_a != 0) {
-#if CONFIG_DAALA_TX
-#if CONFIG_NEW_QUANT
-#if CONFIG_AOM_QM
- dqc_a = av1_dequant_abscoeff_nuq(abs(x_a), dqv, dq, rc != 0, 0);
-#else
- dqc_a =
- av1_dequant_abscoeff_nuq(abs(x_a), dqv, dequant_val[rc != 0], 0);
-#endif // CONFIG_AOM_QM
- if (sz) dqc_a = -dqc_a;
-#else
- dqc_a = x_a * dqv;
-#endif // CONFIG_NEW_QUANT
-#else // CONFIG_DAALA_TX
#if CONFIG_NEW_QUANT
#if CONFIG_AOM_QM
dqc_a = av1_dequant_abscoeff_nuq(abs(x_a), dqv, dq, rc != 0, shift);
@@ -381,7 +334,6 @@
else
dqc_a = (x_a * dqv) >> shift;
#endif // CONFIG_NEW_QUANT
-#endif // CONFIG_DAALA_TX
} else {
dqc_a = 0;
}
@@ -524,11 +476,7 @@
src_diff =
&p->src_diff[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]];
-#if CONFIG_DAALA_TX
- qparam.log_scale = 0;
-#else
qparam.log_scale = av1_get_tx_scale(tx_size);
-#endif
qparam.tx_size = tx_size;
#if CONFIG_NEW_QUANT
qparam.dq = get_dq_profile(cm->dq_type, x->qindex, is_inter, plane_type);
@@ -554,13 +502,8 @@
if (xform_quant_idx != AV1_XFORM_QUANT_SKIP_QUANT) {
const int n_coeffs = av1_get_max_eob(tx_size);
if (LIKELY(!x->skip_block)) {
-#if CONFIG_DAALA_TX
- quant_func_list[xform_quant_idx][1](coeff, n_coeffs, p, qcoeff, dqcoeff,
- eob, scan_order, &qparam);
-#else
quant_func_list[xform_quant_idx][txfm_param.is_hbd](
coeff, n_coeffs, p, qcoeff, dqcoeff, eob, scan_order, &qparam);
-#endif
} else {
av1_quantize_skip(n_coeffs, qcoeff, dqcoeff, eob);
}
@@ -740,9 +683,6 @@
txfm_param.tx_set_type = get_ext_tx_set_type(
txfm_param.tx_size, plane_bsize, is_inter_block(&xd->mi[0]->mbmi),
cm->reduced_tx_set_used);
-#if CONFIG_DAALA_TX
- daala_inv_txfm_add(dqcoeff, dst, pd->dst.stride, &txfm_param);
-#else
if (txfm_param.is_hbd) {
av1_highbd_inv_txfm_add_4x4(dqcoeff, dst, pd->dst.stride, &txfm_param);
return;
@@ -752,7 +692,6 @@
} else {
av1_idct4x4_add(dqcoeff, dst, pd->dst.stride, &txfm_param);
}
-#endif
}
}
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 9f7ac56..1ffb462 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -4660,75 +4660,6 @@
}
}
-#if 0 && CONFIG_INTERNAL_STATS
-static void output_frame_level_debug_stats(AV1_COMP *cpi) {
- AV1_COMMON *const cm = &cpi->common;
- FILE *const f = fopen("tmp.stt", cm->current_video_frame ? "a" : "w");
- int64_t recon_err;
-
- aom_clear_system_state();
-
- recon_err = aom_get_y_sse(cpi->source, get_frame_new_buffer(cm));
-
- if (cpi->twopass.total_left_stats.coded_error != 0.0)
- fprintf(f, "%10u %dx%d %d %d %10d %10d %10d %10d"
- "%10"PRId64" %10"PRId64" %5d %5d %10"PRId64" "
- "%10"PRId64" %10"PRId64" %10d "
- "%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf"
- "%6d %6d %5d %5d %5d "
- "%10"PRId64" %10.3lf"
- "%10lf %8u %10"PRId64" %10d %10d %10d\n",
- cpi->common.current_video_frame,
- cm->width, cm->height,
- cpi->rc.source_alt_ref_pending,
- cpi->rc.source_alt_ref_active,
- cpi->rc.this_frame_target,
- cpi->rc.projected_frame_size,
- cpi->rc.projected_frame_size / cpi->common.MBs,
- (cpi->rc.projected_frame_size - cpi->rc.this_frame_target),
- cpi->rc.vbr_bits_off_target,
- cpi->rc.vbr_bits_off_target_fast,
- cpi->twopass.extend_minq,
- cpi->twopass.extend_minq_fast,
- cpi->rc.total_target_vs_actual,
- (cpi->rc.starting_buffer_level - cpi->rc.bits_off_target),
- cpi->rc.total_actual_bits, cm->base_qindex,
- av1_convert_qindex_to_q(cm->base_qindex, cm->bit_depth),
- (double)av1_dc_quant(cm->base_qindex, 0, cm->bit_depth) / 4.0,
- av1_convert_qindex_to_q(cpi->twopass.active_worst_quality,
- cm->bit_depth),
- cpi->rc.avg_q,
- av1_convert_qindex_to_q(cpi->oxcf.cq_level, cm->bit_depth),
- cpi->refresh_last_frame, cpi->refresh_golden_frame,
- cpi->refresh_alt_ref_frame, cm->frame_type, cpi->rc.gfu_boost,
- cpi->twopass.bits_left,
- cpi->twopass.total_left_stats.coded_error,
- cpi->twopass.bits_left /
- (1 + cpi->twopass.total_left_stats.coded_error),
- cpi->tot_recode_hits, recon_err, cpi->rc.kf_boost,
- cpi->twopass.kf_zeromotion_pct,
- cpi->twopass.fr_content_type);
-
- fclose(f);
-
- if (0) {
- FILE *const fmodes = fopen("Modes.stt", "a");
- int i;
-
- fprintf(fmodes, "%6d:%1d:%1d:%1d ", cpi->common.current_video_frame,
- cm->frame_type, cpi->refresh_golden_frame,
- cpi->refresh_alt_ref_frame);
-
- for (i = 0; i < MAX_MODES; ++i)
- fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]);
-
- fprintf(fmodes, "\n");
-
- fclose(fmodes);
- }
-}
-#endif
-
static void set_mv_search_params(AV1_COMP *cpi) {
const AV1_COMMON *const cm = &cpi->common;
const unsigned int max_mv_def = AOMMIN(cm->width, cm->height);
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index a47aff9..3793c6c 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -766,14 +766,6 @@
}
// TODO(zoeliu): To set up cpi->oxcf.enable_auto_brf
-#if 0 && CONFIG_EXT_REFS
-static INLINE int is_bwdref_enabled(const AV1_COMP *const cpi) {
- // NOTE(zoeliu): The enabling of bi-predictive frames depends on the use of
- // alt_ref, and now will be off when the alt_ref interval is
- // not sufficiently large.
- return is_altref_enabled(cpi) && cpi->oxcf.enable_auto_brf;
-}
-#endif // CONFIG_EXT_REFS
static INLINE void set_ref_ptrs(const AV1_COMMON *cm, MACROBLOCKD *xd,
MV_REFERENCE_FRAME ref0,
diff --git a/av1/encoder/encodetxb.c b/av1/encoder/encodetxb.c
index 9e4ffd4..8d2323b 100644
--- a/av1/encoder/encodetxb.c
+++ b/av1/encoder/encodetxb.c
@@ -161,16 +161,8 @@
static INLINE int64_t get_coeff_dist(tran_low_t tcoeff, tran_low_t dqcoeff,
int shift) {
-#if CONFIG_DAALA_TX
- int depth_shift = (TX_COEFF_DEPTH - 11) * 2;
- int depth_round = depth_shift > 1 ? (1 << (depth_shift - 1)) : 0;
- const int64_t diff = tcoeff - dqcoeff;
- const int64_t error = (diff * diff + depth_round) >> depth_shift;
- (void)shift;
-#else
const int64_t diff = (tcoeff - dqcoeff) * (1 << shift);
const int64_t error = diff * diff;
-#endif
return error;
}
@@ -2066,11 +2058,7 @@
const LV_MAP_EOB_COST txb_eob_costs =
x->eob_costs[eob_multi_size][plane_type];
-#if CONFIG_DAALA_TX
- const int shift = 0;
-#else
const int shift = av1_get_tx_scale(tx_size);
-#endif
const int64_t rdmult =
((x->rdmult * plane_rd_mult[is_inter][plane_type] << (2 * (xd->bd - 8))) +
2) >>
diff --git a/av1/encoder/hybrid_fwd_txfm.c b/av1/encoder/hybrid_fwd_txfm.c
index 4183231..befb53f 100644
--- a/av1/encoder/hybrid_fwd_txfm.c
+++ b/av1/encoder/hybrid_fwd_txfm.c
@@ -15,9 +15,6 @@
#include "av1/common/idct.h"
#include "av1/encoder/hybrid_fwd_txfm.h"
-#if CONFIG_DAALA_TX
-#include "av1/encoder/daala_fwd_txfm.h"
-#else
static void fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
int diff_stride, TxfmParam *txfm_param) {
@@ -27,84 +24,47 @@
return;
}
-#if CONFIG_DAALA_TX4
- // only C version has LGTs
- av1_fht4x4_c(src_diff, coeff, diff_stride, txfm_param);
-#else
av1_fht4x4(src_diff, coeff, diff_stride, txfm_param);
-#endif
}
static void fwd_txfm_4x8(const int16_t *src_diff, tran_low_t *coeff,
int diff_stride, TxfmParam *txfm_param) {
-#if (CONFIG_DAALA_TX4 && CONFIG_DAALA_TX8)
- av1_fht4x8_c(src_diff, coeff, diff_stride, txfm_param);
-#else
av1_fht4x8(src_diff, coeff, diff_stride, txfm_param);
-#endif
}
static void fwd_txfm_8x4(const int16_t *src_diff, tran_low_t *coeff,
int diff_stride, TxfmParam *txfm_param) {
-#if (CONFIG_DAALA_TX4 && CONFIG_DAALA_TX8)
- av1_fht8x4_c(src_diff, coeff, diff_stride, txfm_param);
-#else
av1_fht8x4(src_diff, coeff, diff_stride, txfm_param);
-#endif
}
static void fwd_txfm_8x16(const int16_t *src_diff, tran_low_t *coeff,
int diff_stride, TxfmParam *txfm_param) {
-#if (CONFIG_DAALA_TX8 && CONFIG_DAALA_TX16)
- av1_fht8x16_c(src_diff, coeff, diff_stride, txfm_param);
-#else
av1_fht8x16(src_diff, coeff, diff_stride, txfm_param);
-#endif
}
static void fwd_txfm_16x8(const int16_t *src_diff, tran_low_t *coeff,
int diff_stride, TxfmParam *txfm_param) {
-#if (CONFIG_DAALA_TX8 && CONFIG_DAALA_TX16)
- av1_fht16x8_c(src_diff, coeff, diff_stride, txfm_param);
-#else
av1_fht16x8(src_diff, coeff, diff_stride, txfm_param);
-#endif
}
static void fwd_txfm_16x32(const int16_t *src_diff, tran_low_t *coeff,
int diff_stride, TxfmParam *txfm_param) {
-#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
- av1_fht16x32_c(src_diff, coeff, diff_stride, txfm_param);
-#else
av1_fht16x32(src_diff, coeff, diff_stride, txfm_param);
-#endif
}
static void fwd_txfm_32x16(const int16_t *src_diff, tran_low_t *coeff,
int diff_stride, TxfmParam *txfm_param) {
-#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
- av1_fht32x16_c(src_diff, coeff, diff_stride, txfm_param);
-#else
av1_fht32x16(src_diff, coeff, diff_stride, txfm_param);
-#endif
}
static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
int diff_stride, TxfmParam *txfm_param) {
-#if CONFIG_DAALA_TX8
- av1_fht8x8_c(src_diff, coeff, diff_stride, txfm_param);
-#else
av1_fht8x8(src_diff, coeff, diff_stride, txfm_param);
-#endif
}
static void fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
int diff_stride, TxfmParam *txfm_param) {
-#if CONFIG_DAALA_TX16
- av1_fht16x16_c(src_diff, coeff, diff_stride, txfm_param);
-#else
av1_fht16x16(src_diff, coeff, diff_stride, txfm_param);
-#endif // CONFIG_DAALA_TX16
}
static void fwd_txfm_32x32(const int16_t *src_diff, tran_low_t *coeff,
@@ -572,14 +532,10 @@
}
}
#endif // CONFIG_TX64X64
-#endif // CONFIG_DAALA_TXFM
void av1_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride,
TxfmParam *txfm_param) {
assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
-#if CONFIG_DAALA_TX
- daala_fwd_txfm(src_diff, coeff, diff_stride, txfm_param);
-#else
const TX_SIZE tx_size = txfm_param->tx_size;
switch (tx_size) {
#if CONFIG_TX64X64
@@ -635,15 +591,11 @@
break;
default: assert(0); break;
}
-#endif
}
void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff,
int diff_stride, TxfmParam *txfm_param) {
assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
-#if CONFIG_DAALA_TX
- daala_fwd_txfm(src_diff, coeff, diff_stride, txfm_param);
-#else
const TX_SIZE tx_size = txfm_param->tx_size;
switch (tx_size) {
#if CONFIG_TX64X64
@@ -707,5 +659,4 @@
break;
default: assert(0); break;
}
-#endif
}
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 40ab03d..30d069c 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -1509,12 +1509,7 @@
int64_t *ssz, int bd) {
int i;
int64_t error = 0, sqcoeff = 0;
-#if CONFIG_DAALA_TX
- (void)bd;
- int shift = 2 * (TX_COEFF_DEPTH - 11);
-#else
int shift = 2 * (bd - 8);
-#endif
int rounding = shift > 0 ? 1 << (shift - 1) : 0;
for (i = 0; i < block_size; i++) {
@@ -1701,26 +1696,17 @@
// not involve an inverse transform, but it is less accurate.
const int buffer_length = av1_get_max_eob(tx_size);
int64_t this_sse;
-// TX-domain results need to shift down to Q2/D10 to match pixel
-// domain distortion values which are in Q2^2
-#if CONFIG_DAALA_TX
- int shift = (TX_COEFF_DEPTH - 10) * 2;
-#else
+ // TX-domain results need to shift down to Q2/D10 to match pixel
+ // domain distortion values which are in Q2^2
int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2;
-#endif
tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-#if CONFIG_DAALA_TX
- *out_dist = av1_highbd_block_error(coeff, dqcoeff, buffer_length, &this_sse,
- xd->bd);
-#else
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
*out_dist = av1_highbd_block_error(coeff, dqcoeff, buffer_length,
&this_sse, xd->bd);
else
*out_dist = av1_block_error(coeff, dqcoeff, buffer_length, &this_sse);
-#endif
*out_dist = RIGHT_SIGNED_SHIFT(*out_dist, shift);
*out_sse = RIGHT_SIGNED_SHIFT(this_sse, shift);
@@ -1977,28 +1963,19 @@
av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
AV1_XFORM_QUANT_FP);
-/// TX-domain results need to shift down to Q2/D10 to match pixel
-// domain distortion values which are in Q2^2
-#if CONFIG_DAALA_TX
- const int shift = (TX_COEFF_DEPTH - 10) * 2;
-#else
+ /// TX-domain results need to shift down to Q2/D10 to match pixel
+ // domain distortion values which are in Q2^2
const int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2;
-#endif
tran_low_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block);
tran_low_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block);
const int buffer_length = av1_get_max_eob(tx_size);
int64_t tmp_dist;
int64_t tmp;
-#if CONFIG_DAALA_TX
- tmp_dist =
- av1_highbd_block_error(coeff, dqcoeff, buffer_length, &tmp, xd->bd);
-#else
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
tmp_dist =
av1_highbd_block_error(coeff, dqcoeff, buffer_length, &tmp, xd->bd);
else
tmp_dist = av1_block_error(coeff, dqcoeff, buffer_length, &tmp);
-#endif
tmp_dist = RIGHT_SIGNED_SHIFT(tmp_dist, shift);
if (
@@ -3729,13 +3706,9 @@
av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
AV1_XFORM_QUANT_FP);
-// TX-domain results need to shift down to Q2/D10 to match pixel
-// domain distortion values which are in Q2^2
-#if CONFIG_DAALA_TX
- const int shift = (TX_COEFF_DEPTH - 10) * 2;
-#else
+ // TX-domain results need to shift down to Q2/D10 to match pixel
+ // domain distortion values which are in Q2^2
const int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2;
-#endif
tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
const int buffer_length = av1_get_max_eob(tx_size);
int64_t tmp_dist, tmp_sse;
@@ -3748,16 +3721,11 @@
x->tune_metric != AOM_TUNE_PSNR;
#endif // CONFIG_DIST_8X8
-#if CONFIG_DAALA_TX
- tmp_dist =
- av1_highbd_block_error(coeff, dqcoeff, buffer_length, &tmp_sse, xd->bd);
-#else
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
tmp_dist = av1_highbd_block_error(coeff, dqcoeff, buffer_length, &tmp_sse,
xd->bd);
else
tmp_dist = av1_block_error(coeff, dqcoeff, buffer_length, &tmp_sse);
-#endif
tmp_dist = RIGHT_SIGNED_SHIFT(tmp_dist, shift);
diff --git a/build/cmake/aom_config_defaults.cmake b/build/cmake/aom_config_defaults.cmake
index 2efe62f..79638ca 100644
--- a/build/cmake/aom_config_defaults.cmake
+++ b/build/cmake/aom_config_defaults.cmake
@@ -95,14 +95,6 @@
set(CONFIG_CFL 1 CACHE NUMBER "AV1 experiment flag.")
set(CONFIG_CICP 0 CACHE NUMBER "AV1 experiment flag.")
set(CONFIG_COLORSPACE_HEADERS 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_DAALA_TX 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_DAALA_TX16 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_DAALA_TX32 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_DAALA_TX4 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_DAALA_TX64 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_DAALA_TX8 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_DAALA_TX_DST32 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_DAALA_TX_DST8 0 CACHE NUMBER "AV1 experiment flag.")
set(CONFIG_DEBLOCK_13TAP 1 CACHE NUMBER "AV1 experiment flag.")
set(CONFIG_DEPENDENT_HORZTILEGROUPS 0 CACHE NUMBER "AV1 experiment flag.")
set(CONFIG_DEPENDENT_HORZTILES 0 CACHE NUMBER "AV1 experiment flag.")
diff --git a/build/cmake/aom_experiment_deps.cmake b/build/cmake/aom_experiment_deps.cmake
index e6204a7..c6808e4 100644
--- a/build/cmake/aom_experiment_deps.cmake
+++ b/build/cmake/aom_experiment_deps.cmake
@@ -32,55 +32,6 @@
endif ()
endif ()
- if (CONFIG_DAALA_TX)
- set(CONFIG_DAALA_TX_DST32 1)
- set(CONFIG_DAALA_TX4 1)
- set(CONFIG_DAALA_TX8 1)
- set(CONFIG_DAALA_TX16 1)
- set(CONFIG_DAALA_TX32 1)
- set(CONFIG_DAALA_TX64 1)
- endif ()
-
- if (NOT CONFIG_DAALA_TX)
- set(CONFIG_DAALA_TX_DST32 0)
- set(CONFIG_DAALA_TX4 0)
- set(CONFIG_DAALA_TX8 0)
- set(CONFIG_DAALA_TX16 0)
- set(CONFIG_DAALA_TX32 0)
- set(CONFIG_DAALA_TX64 0)
- endif ()
-
- if (CONFIG_DAALA_TX_DST8)
- if (NOT CONFIG_DAALA_TX8)
- set(CONFIG_DAALA_TX_DST8 0)
- message("--- DAALA_TX_DST8 requires DAALA_TX8: disabled DAALA_TX_DST8")
- endif ()
- endif ()
-
- if (CONFIG_DAALA_TX_DST32)
- if (NOT CONFIG_DAALA_TX32)
- set(CONFIG_DAALA_TX_DST32 0)
- message("--- DAALA_TX_DST32 requires DAALA_TX32: disabled DAALA_TX_DST32")
- endif ()
- endif ()
-
- if (CONFIG_DAALA_TX64)
- if (NOT CONFIG_TX64X64)
- set(CONFIG_DAALA_TX64 0)
- message("--- DAALA_TX64 requires TX64X64: disabled DAALA_TX64")
- endif ()
- endif ()
-
- if (CONFIG_DAALA_TX4 OR CONFIG_DAALA_TX8 OR CONFIG_DAALA_TX16 OR
- CONFIG_DAALA_TX32 OR CONFIG_DAALA_TX64)
- if (NOT CONFIG_LOWBITDEPTH)
- change_config_and_warn(CONFIG_LOWBITDEPTH 1 CONFIG_DAALA_TXx)
- endif ()
- if (CONFIG_TXMG)
- change_config_and_warn(CONFIG_TXMG 0 CONFIG_DAALA_DCTx)
- endif ()
- endif ()
-
if (CONFIG_EXT_INTRA_MOD)
if (NOT CONFIG_INTRA_EDGE)
change_config_and_warn(CONFIG_INTRA_EDGE 1 CONFIG_EXT_INTRA_MOD)
diff --git a/test/av1_dct_test.cc b/test/av1_dct_test.cc
index c8a474f..fd68a54 100644
--- a/test/av1_dct_test.cc
+++ b/test/av1_dct_test.cc
@@ -23,10 +23,6 @@
#define CONFIG_COEFFICIENT_RANGE_CHECKING 1
#define AV1_DCT_GTEST
#include "av1/encoder/dct.c"
-#if CONFIG_DAALA_TX4 || CONFIG_DAALA_TX8 || CONFIG_DAALA_TX16 || \
- CONFIG_DAALA_Tx32
-#include "av1/common/daala_tx.c"
-#endif
using libaom_test::ACMRandom;
diff --git a/test/av1_fht16x16_test.cc b/test/av1_fht16x16_test.cc
index 93440f7..24f4c78 100644
--- a/test/av1_fht16x16_test.cc
+++ b/test/av1_fht16x16_test.cc
@@ -23,7 +23,6 @@
using libaom_test::ACMRandom;
-#if !CONFIG_DAALA_TX
namespace {
typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
const TxfmParam *txfm_param);
@@ -161,7 +160,7 @@
using std::tr1::make_tuple;
-#if HAVE_SSE2 && !CONFIG_DAALA_TX16
+#if HAVE_SSE2
const Ht16x16Param kArrayHt16x16Param_sse2[] = {
make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, DCT_DCT,
AOM_BITS_8, 256),
@@ -200,7 +199,7 @@
::testing::ValuesIn(kArrayHt16x16Param_sse2));
#endif // HAVE_SSE2
-#if HAVE_AVX2 && !CONFIG_DAALA_TX16
+#if HAVE_AVX2
const Ht16x16Param kArrayHt16x16Param_avx2[] = {
make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, DCT_DCT,
AOM_BITS_8, 256),
@@ -239,7 +238,7 @@
::testing::ValuesIn(kArrayHt16x16Param_avx2));
#endif // HAVE_AVX2
-#if HAVE_SSE4_1 && !CONFIG_DAALA_TX16
+#if HAVE_SSE4_1
const HighbdHt16x16Param kArrayHBDHt16x16Param_sse4_1[] = {
make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, DCT_DCT, 10),
make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, DCT_DCT, 12),
@@ -262,7 +261,6 @@
};
INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdTrans16x16HT,
::testing::ValuesIn(kArrayHBDHt16x16Param_sse4_1));
-#endif // HAVE_SSE4_1 && !CONFIG_DAALA_TX16
+#endif // HAVE_SSE4_1
} // namespace
-#endif // !CONFIG_DAALA_TX
diff --git a/test/av1_fht16x32_test.cc b/test/av1_fht16x32_test.cc
index 871dc8b..ac89d54 100644
--- a/test/av1_fht16x32_test.cc
+++ b/test/av1_fht16x32_test.cc
@@ -23,8 +23,6 @@
using libaom_test::ACMRandom;
-#if !CONFIG_DAALA_TX
-
namespace {
typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
const TxfmParam *txfm_param);
@@ -153,5 +151,3 @@
#endif // HAVE_SSE2
} // namespace
-
-#endif // !CONFIG_DAALA_TX
diff --git a/test/av1_fht16x8_test.cc b/test/av1_fht16x8_test.cc
index b32cf8e..991cdb4 100644
--- a/test/av1_fht16x8_test.cc
+++ b/test/av1_fht16x8_test.cc
@@ -23,7 +23,6 @@
using libaom_test::ACMRandom;
-#if !CONFIG_DAALA_TX
namespace {
typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
const TxfmParam *txfm_param);
@@ -150,5 +149,3 @@
#endif // HAVE_SSE2
} // namespace
-
-#endif // !CONFIG_DAALA_TX
diff --git a/test/av1_fht32x16_test.cc b/test/av1_fht32x16_test.cc
index 177ddf2..3caa129 100644
--- a/test/av1_fht32x16_test.cc
+++ b/test/av1_fht32x16_test.cc
@@ -23,8 +23,6 @@
using libaom_test::ACMRandom;
-#if !CONFIG_DAALA_TX
-
namespace {
typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
const TxfmParam *txfm_param);
@@ -153,4 +151,3 @@
#endif // HAVE_SSE2
} // namespace
-#endif // !CONFIG_DAALA_TX
diff --git a/test/av1_fht32x32_test.cc b/test/av1_fht32x32_test.cc
index 2fc4db8..b31d2f4 100644
--- a/test/av1_fht32x32_test.cc
+++ b/test/av1_fht32x32_test.cc
@@ -48,7 +48,7 @@
av1_fwd_txfm2d_32x32_c(in, out, stride, tx_type, bd);
}
-#if (HAVE_SSE2 || HAVE_AVX2) && !CONFIG_DAALA_TX32
+#if (HAVE_SSE2 || HAVE_AVX2)
void dummy_inv_txfm(const tran_low_t *in, uint8_t *out, int stride,
const TxfmParam *txfm_param) {
(void)in;
@@ -161,7 +161,7 @@
using std::tr1::make_tuple;
-#if HAVE_SSE2 && !CONFIG_DAALA_TX32
+#if HAVE_SSE2
const Ht32x32Param kArrayHt32x32Param_sse2[] = {
make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, DCT_DCT, AOM_BITS_8, 1024),
make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, ADST_DCT, AOM_BITS_8, 1024),
@@ -187,9 +187,9 @@
};
INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans32x32HT,
::testing::ValuesIn(kArrayHt32x32Param_sse2));
-#endif // HAVE_SSE2 && !CONFIG_DAALA_TX32
+#endif // HAVE_SSE2
-#if HAVE_AVX2 && !CONFIG_DAALA_TX32
+#if HAVE_AVX2
const Ht32x32Param kArrayHt32x32Param_avx2[] = {
make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, DCT_DCT, AOM_BITS_8, 1024),
make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, ADST_DCT, AOM_BITS_8, 1024),
@@ -215,5 +215,5 @@
};
INSTANTIATE_TEST_CASE_P(AVX2, AV1Trans32x32HT,
::testing::ValuesIn(kArrayHt32x32Param_avx2));
-#endif // HAVE_AVX2 && !CONFIG_DAALA_TX32
+#endif // HAVE_AVX2
} // namespace
diff --git a/test/av1_fht4x4_test.cc b/test/av1_fht4x4_test.cc
index 91c9798..ef61bce 100644
--- a/test/av1_fht4x4_test.cc
+++ b/test/av1_fht4x4_test.cc
@@ -23,7 +23,6 @@
using libaom_test::ACMRandom;
-#if !CONFIG_DAALA_TX
namespace {
typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
const TxfmParam *txfm_param);
@@ -164,7 +163,7 @@
using std::tr1::make_tuple;
-#if HAVE_SSE2 && !CONFIG_DAALA_TX4
+#if HAVE_SSE2
const Ht4x4Param kArrayHt4x4Param_sse2[] = {
make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, DCT_DCT, AOM_BITS_8,
16),
@@ -198,7 +197,7 @@
::testing::ValuesIn(kArrayHt4x4Param_sse2));
#endif // HAVE_SSE2
-#if HAVE_SSE4_1 && !CONFIG_DAALA_TX4
+#if HAVE_SSE4_1
const HighbdHt4x4Param kArrayHighbdHt4x4Param[] = {
make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, DCT_DCT, 10),
make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, DCT_DCT, 12),
@@ -223,7 +222,6 @@
INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdTrans4x4HT,
::testing::ValuesIn(kArrayHighbdHt4x4Param));
-#endif // HAVE_SSE4_1 && !CONFIG_DAALA_TX4
+#endif // HAVE_SSE4_1
} // namespace
-#endif // !CONFIG_DAALA_TX
diff --git a/test/av1_fht4x8_test.cc b/test/av1_fht4x8_test.cc
index 00a2916..8be6aa0 100644
--- a/test/av1_fht4x8_test.cc
+++ b/test/av1_fht4x8_test.cc
@@ -23,7 +23,6 @@
using libaom_test::ACMRandom;
-#if !CONFIG_DAALA_TX
namespace {
typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
const TxfmParam *txfm_param);
@@ -140,4 +139,3 @@
#endif // HAVE_SSE2
} // namespace
-#endif // !CONFIG_DAALA_TX
diff --git a/test/av1_fht64x64_test.cc b/test/av1_fht64x64_test.cc
index d833324..a611c5b 100644
--- a/test/av1_fht64x64_test.cc
+++ b/test/av1_fht64x64_test.cc
@@ -20,7 +20,7 @@
#include "test/transform_test_base.h"
#include "test/util.h"
-#if CONFIG_TX64X64 && !CONFIG_DAALA_TX
+#if CONFIG_TX64X64
using libaom_test::ACMRandom;
diff --git a/test/av1_fht8x16_test.cc b/test/av1_fht8x16_test.cc
index a706a98..fc5748d 100644
--- a/test/av1_fht8x16_test.cc
+++ b/test/av1_fht8x16_test.cc
@@ -22,7 +22,6 @@
using libaom_test::ACMRandom;
-#if !CONFIG_DAALA_TX
namespace {
typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
const TxfmParam *txfm_param);
@@ -149,4 +148,3 @@
#endif // HAVE_SSE2
} // namespace
-#endif // !CONFIG_DAALA_TX
diff --git a/test/av1_fht8x4_test.cc b/test/av1_fht8x4_test.cc
index 6806edb..e88bc35 100644
--- a/test/av1_fht8x4_test.cc
+++ b/test/av1_fht8x4_test.cc
@@ -22,7 +22,6 @@
using libaom_test::ACMRandom;
-#if !CONFIG_DAALA_TX
namespace {
typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
const TxfmParam *txfm_param);
@@ -139,4 +138,3 @@
#endif // HAVE_SSE2
} // namespace
-#endif // !CONFIG_DAALA_TX
diff --git a/test/av1_fht8x8_test.cc b/test/av1_fht8x8_test.cc
index a73053f..6510099 100644
--- a/test/av1_fht8x8_test.cc
+++ b/test/av1_fht8x8_test.cc
@@ -23,7 +23,6 @@
using libaom_test::ACMRandom;
-#if !CONFIG_DAALA_TX
namespace {
typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
const TxfmParam *txfm_param);
@@ -164,7 +163,7 @@
using std::tr1::make_tuple;
-#if HAVE_SSE2 && !CONFIG_DAALA_TX8
+#if HAVE_SSE2
const Ht8x8Param kArrayHt8x8Param_sse2[] = {
make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, DCT_DCT, AOM_BITS_8,
64),
@@ -198,7 +197,7 @@
::testing::ValuesIn(kArrayHt8x8Param_sse2));
#endif // HAVE_SSE2
-#if HAVE_SSE4_1 && !CONFIG_DAALA_TX8
+#if HAVE_SSE4_1
const HighbdHt8x8Param kArrayHBDHt8x8Param_sse4_1[] = {
make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, DCT_DCT, 10),
make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, DCT_DCT, 12),
@@ -221,7 +220,6 @@
};
INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdTrans8x8HT,
::testing::ValuesIn(kArrayHBDHt8x8Param_sse4_1));
-#endif // HAVE_SSE4_1 && !CONFIG_DAALA_TX8
+#endif // HAVE_SSE4_1
} // namespace
-#endif // !CONFIG_DAALA_TX
diff --git a/test/av1_highbd_iht_test.cc b/test/av1_highbd_iht_test.cc
index 81e635a..761193e 100644
--- a/test/av1_highbd_iht_test.cc
+++ b/test/av1_highbd_iht_test.cc
@@ -137,22 +137,16 @@
using std::tr1::make_tuple;
-#if HAVE_SSE4_1 && !(CONFIG_DAALA_TX4 && CONFIG_DAALA_TX8 && CONFIG_DAALA_TX16)
-#if !CONFIG_DAALA_TX4
+#if HAVE_SSE4_1
#define PARAM_LIST_4X4 \
&av1_fwd_txfm2d_4x4_c, &av1_inv_txfm2d_add_4x4_sse4_1, \
&av1_inv_txfm2d_add_4x4_c, 16
-#endif
-#if !CONFIG_DAALA_TX8
#define PARAM_LIST_8X8 \
&av1_fwd_txfm2d_8x8_c, &av1_inv_txfm2d_add_8x8_sse4_1, \
&av1_inv_txfm2d_add_8x8_c, 64
-#endif
-#if !CONFIG_DAALA_TX16
#define PARAM_LIST_16X16 \
&av1_fwd_txfm2d_16x16_c, &av1_inv_txfm2d_add_16x16_sse4_1, \
&av1_inv_txfm2d_add_16x16_c, 256
-#endif
#if CONFIG_TX64X64
#define PARAM_LIST_64X64 \
&av1_fwd_txfm2d_64x64_c, &av1_inv_txfm2d_add_64x64_sse4_1, \
@@ -160,8 +154,7 @@
#endif
const IHbdHtParam kArrayIhtParam[] = {
-// 16x16
-#if !CONFIG_DAALA_TX16
+ // 16x16
make_tuple(PARAM_LIST_16X16, DCT_DCT, 10),
make_tuple(PARAM_LIST_16X16, DCT_DCT, 12),
make_tuple(PARAM_LIST_16X16, ADST_DCT, 10),
@@ -180,9 +173,7 @@
make_tuple(PARAM_LIST_16X16, ADST_FLIPADST, 12),
make_tuple(PARAM_LIST_16X16, FLIPADST_ADST, 10),
make_tuple(PARAM_LIST_16X16, FLIPADST_ADST, 12),
-#endif
-// 8x8
-#if !CONFIG_DAALA_TX8
+ // 8x8
make_tuple(PARAM_LIST_8X8, DCT_DCT, 10),
make_tuple(PARAM_LIST_8X8, DCT_DCT, 12),
make_tuple(PARAM_LIST_8X8, ADST_DCT, 10),
@@ -201,9 +192,7 @@
make_tuple(PARAM_LIST_8X8, ADST_FLIPADST, 12),
make_tuple(PARAM_LIST_8X8, FLIPADST_ADST, 10),
make_tuple(PARAM_LIST_8X8, FLIPADST_ADST, 12),
-#endif
-// 4x4
-#if !CONFIG_DAALA_TX4
+ // 4x4
make_tuple(PARAM_LIST_4X4, DCT_DCT, 10),
make_tuple(PARAM_LIST_4X4, DCT_DCT, 12),
make_tuple(PARAM_LIST_4X4, ADST_DCT, 10),
@@ -222,7 +211,6 @@
make_tuple(PARAM_LIST_4X4, ADST_FLIPADST, 12),
make_tuple(PARAM_LIST_4X4, FLIPADST_ADST, 10),
make_tuple(PARAM_LIST_4X4, FLIPADST_ADST, 12),
-#endif
#if CONFIG_TX64X64
make_tuple(PARAM_LIST_64X64, DCT_DCT, 10),
make_tuple(PARAM_LIST_64X64, DCT_DCT, 12),
@@ -231,10 +219,9 @@
INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdInvHTNxN,
::testing::ValuesIn(kArrayIhtParam));
-#endif // HAVE_SSE4_1 &&
- // !(CONFIG_DAALA_TX4 && CONFIG_DAALA_TX8 && CONFIG_DAALA_TX16)
+#endif // HAVE_SSE4_1
-#if HAVE_AVX2 && !CONFIG_DAALA_TX32
+#if HAVE_AVX2
#define PARAM_LIST_32X32 \
&av1_fwd_txfm2d_32x32_c, &av1_inv_txfm2d_add_32x32_avx2, \
&av1_inv_txfm2d_add_32x32_c, 1024
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index 5ae7d69..572feb8 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -30,7 +30,6 @@
using libaom_test::ACMRandom;
-#if !CONFIG_DAALA_TX
namespace {
const int kNumCoeffs = 256;
@@ -753,7 +752,6 @@
::testing::Values(make_tuple(&aom_fdct16x16_sse2,
&aom_idct16x16_256_add_c,
DCT_DCT, AOM_BITS_8)));
-#if !CONFIG_DAALA_TX16
INSTANTIATE_TEST_CASE_P(
SSE2, Trans16x16HT,
::testing::Values(make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_c,
@@ -764,7 +762,5 @@
DCT_ADST, AOM_BITS_8),
make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_c,
ADST_ADST, AOM_BITS_8)));
-#endif
#endif // HAVE_SSE2
} // namespace
-#endif // !CONFIG_DAALA_TX
diff --git a/test/error_block_test.cc b/test/error_block_test.cc
index 0b1052f..aa083d5 100644
--- a/test/error_block_test.cc
+++ b/test/error_block_test.cc
@@ -155,7 +155,7 @@
<< "First failed at test case " << first_failure;
}
-#if (HAVE_SSE2 || HAVE_AVX) && !CONFIG_DAALA_TX
+#if (HAVE_SSE2 || HAVE_AVX)
using std::tr1::make_tuple;
INSTANTIATE_TEST_CASE_P(
diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index 8d5ce9b..63f1601 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -29,7 +29,6 @@
using libaom_test::ACMRandom;
-#if !CONFIG_DAALA_TX
namespace {
typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride);
typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride);
@@ -245,7 +244,7 @@
make_tuple(&av1_fwht4x4_c, &aom_iwht4x4_16_add_c, DCT_DCT,
AOM_BITS_8, 16)));
-#if HAVE_SSE2 && !CONFIG_DAALA_TX4
+#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(
SSE2, Trans4x4WHT,
::testing::Values(make_tuple(&av1_fwht4x4_c, &aom_iwht4x4_16_add_c, DCT_DCT,
@@ -254,7 +253,7 @@
DCT_DCT, AOM_BITS_8, 16)));
#endif
-#if HAVE_SSE2 && !CONFIG_DAALA_TX4
+#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(
SSE2, Trans4x4HT,
::testing::Values(make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_c,
@@ -265,7 +264,6 @@
DCT_ADST, AOM_BITS_8, 16),
make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_c,
ADST_ADST, AOM_BITS_8, 16)));
-#endif // HAVE_SSE2 && !CONFIG_DAALA_TX4
+#endif // HAVE_SSE2
} // namespace
-#endif // !CONFIG_DAALA_TX
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index 096a3c6..050408f 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -29,7 +29,6 @@
using libaom_test::ACMRandom;
-#if !CONFIG_DAALA_TX
namespace {
const int kNumCoeffs = 64;
@@ -595,7 +594,6 @@
::testing::Values(make_tuple(&aom_fdct8x8_sse2,
&aom_idct8x8_64_add_c,
DCT_DCT, AOM_BITS_8)));
-#if !CONFIG_DAALA_TX8
INSTANTIATE_TEST_CASE_P(
SSE2, FwdTrans8x8HT,
::testing::Values(make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_c,
@@ -606,7 +604,6 @@
DCT_ADST, AOM_BITS_8),
make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_c,
ADST_ADST, AOM_BITS_8)));
-#endif // !CONFIG_DAALA_TX8
#endif // HAVE_SSE2
#if HAVE_SSSE3 && ARCH_X86_64
@@ -617,4 +614,3 @@
#endif
} // namespace
-#endif // !CONFIG_DAALA_TX