blob: 38cdd22fffa6a53b7fa65cccac31ebf2ab3b62be [file] [log] [blame]
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <math.h>
#include "./aom_dsp_rtcd.h"
#include "./av1_rtcd.h"
#include "aom_dsp/inv_txfm.h"
#include "aom_ports/mem.h"
#include "av1/common/av1_inv_txfm1d_cfg.h"
#include "av1/common/blockd.h"
#include "av1/common/enums.h"
#include "av1/common/idct.h"
#if CONFIG_DAALA_TX4 || CONFIG_DAALA_TX8 || CONFIG_DAALA_TX16 || \
CONFIG_DAALA_TX32 || CONFIG_DAALA_TX64
#include "av1/common/daala_tx.h"
#endif
// Returns a scale level in the range 0..3 for the given transform size.
// The value looked up in tx_size_2d (presumably the block's pixel count)
// earns one level for each of the thresholds 256, 1024 and 4096 that it
// strictly exceeds.
int av1_get_tx_scale(const TX_SIZE tx_size) {
  const int num_pixels = tx_size_2d[tx_size];
  int scale = 0;
  if (num_pixels > 256) ++scale;
  if (num_pixels > 1024) ++scale;
  if (num_pixels > 4096) ++scale;
  return scale;
}
// NOTE: The implementations of all inverse transforms need to be aware of the
// fact that input and output could be the same buffer.
// 4-point inverse identity transform: each coefficient is multiplied by Sqrt2
// and passed through dct_const_round_shift(). Elements are processed one at a
// time, so input and output may be the same buffer.
static void iidtx4_c(const tran_low_t *input, tran_low_t *output) {
  for (int k = 0; k != 4; ++k) {
    const tran_low_t coeff = input[k];
    output[k] = (tran_low_t)dct_const_round_shift(coeff * Sqrt2);
  }
}
// 8-point inverse identity transform: every coefficient is doubled.
// Elements are processed one at a time, so input and output may be the same
// buffer.
static void iidtx8_c(const tran_low_t *input, tran_low_t *output) {
  int k = 0;
  while (k < 8) {
    output[k] = input[k] * 2;
    ++k;
  }
}
// 16-point inverse identity transform: each coefficient is multiplied by
// 2 * Sqrt2 and passed through dct_const_round_shift(). Elements are
// processed one at a time, so input and output may be the same buffer.
static void iidtx16_c(const tran_low_t *input, tran_low_t *output) {
  for (int k = 0; k != 16; ++k) {
    const tran_low_t coeff = input[k];
    output[k] = (tran_low_t)dct_const_round_shift(coeff * 2 * Sqrt2);
  }
}
// 32-point inverse identity transform: every coefficient is multiplied by 4.
// Elements are processed one at a time, so input and output may be the same
// buffer.
static void iidtx32_c(const tran_low_t *input, tran_low_t *output) {
  int k = 0;
  while (k < 32) {
    output[k] = input[k] * 4;
    ++k;
  }
}
#if CONFIG_TX64X64 && !CONFIG_DAALA_TX64
// 64-point inverse identity transform: each coefficient is multiplied by
// 4 * Sqrt2 and passed through dct_const_round_shift(). Elements are
// processed one at a time, so input and output may be the same buffer.
static void iidtx64_c(const tran_low_t *input, tran_low_t *output) {
  for (int k = 0; k != 64; ++k) {
    const tran_low_t coeff = input[k];
    output[k] = (tran_low_t)dct_const_round_shift(coeff * 4 * Sqrt2);
  }
}
#endif // CONFIG_TX64X64
// For use in lieu of ADST
// 32-point "half-right" inverse transform, used in lieu of a 32-point ADST.
// The first 16 outputs are the last 16 inputs times 4; the last 16 outputs
// are aom_idct16_c() applied to the first 16 inputs pre-scaled by sqrt(2).
// Overall scaling factor is 4 times orthogonal.
static void ihalfright32_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t scaled_lo[16];
  // Snapshot the first half (times sqrt(2)) before any output is written, so
  // that the function still works when input and output are the same buffer.
  for (int k = 0; k < 16; ++k) {
    const tran_low_t coeff = input[k];
    scaled_lo[k] = (tran_low_t)dct_const_round_shift(coeff * Sqrt2);
  }
  // The second half of the input becomes the first half of the output.
  const tran_low_t *const upper = input + 16;
  for (int k = 0; k < 16; ++k) {
    output[k] = upper[k] * 4;
  }
  aom_idct16_c(scaled_lo, &output[16]);
}
#if CONFIG_TX64X64 && !CONFIG_DAALA_TX64
// 64-point inverse DCT for the column pass. Copies the coefficients into an
// int32_t work buffer, runs av1_idct64_new() with the column cos-bit and
// stage-range tables, and copies the result back as tran_low_t.
static void idct64_col_c(const tran_low_t *input, tran_low_t *output) {
  int32_t coeffs[64];
  int32_t result[64];
  for (int k = 0; k < 64; ++k) {
    coeffs[k] = (int32_t)input[k];
  }
  av1_idct64_new(coeffs, result, inv_cos_bit_col_dct_64,
                 inv_stage_range_col_dct_64);
  for (int k = 0; k < 64; ++k) {
    output[k] = (tran_low_t)result[k];
  }
}
// 64-point inverse DCT for the row pass. Copies the coefficients into an
// int32_t work buffer, runs av1_idct64_new() with the row cos-bit and
// stage-range tables, and copies the result back as tran_low_t.
static void idct64_row_c(const tran_low_t *input, tran_low_t *output) {
  int32_t coeffs[64];
  int32_t result[64];
  for (int k = 0; k < 64; ++k) {
    coeffs[k] = (int32_t)input[k];
  }
  av1_idct64_new(coeffs, result, inv_cos_bit_row_dct_64,
                 inv_stage_range_row_dct_64);
  for (int k = 0; k < 64; ++k) {
    output[k] = (tran_low_t)result[k];
  }
}
// For use in lieu of ADST
// 64-point "half-right" inverse transform, used in lieu of a 64-point ADST.
// The first 32 outputs are the last 32 inputs scaled by 4 * sqrt(2); the last
// 32 outputs are aom_idct32_c() applied to the first 32 inputs pre-scaled by
// sqrt(2). Overall scaling factor is 4 * sqrt(2) times orthogonal.
static void ihalfright64_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t scaled_lo[32];
  // Snapshot the first half (times sqrt(2)) before any output is written, so
  // that the function still works when input and output are the same buffer.
  for (int k = 0; k < 32; ++k) {
    const tran_low_t coeff = input[k];
    scaled_lo[k] = (tran_low_t)dct_const_round_shift(coeff * Sqrt2);
  }
  // The second half of the input becomes the first half of the output.
  const tran_low_t *const upper = input + 32;
  for (int k = 0; k < 32; ++k) {
    const tran_low_t coeff = upper[k];
    output[k] = (tran_low_t)dct_const_round_shift(coeff * 4 * Sqrt2);
  }
  aom_idct32_c(scaled_lo, &output[32]);
}
#endif // CONFIG_TX64X64
// Inverse identity transform and add.
// Inverse identity transform and add (8-bit destination).
//   input:   bsx * bsy coefficients, row-major with a row pitch of bsx
//   dest:    destination pixels, row pitch `stride`
//   tx_type: only IDTX is handled; any other type leaves dest untouched
// Each coefficient is right-shifted by 3, 2 or 1 bits (one bit less for each
// of the area thresholds 256 and 1024 that bsx * bsy exceeds) and added to
// the destination pixel with clipping.
static void inv_idtx_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                           int bsx, int bsy, TX_TYPE tx_type) {
  if (tx_type != IDTX) return;

  const int num_pels = bsx * bsy;
  int shift = 3;
  if (num_pels > 256) --shift;
  if (num_pels > 1024) --shift;

  const tran_low_t *src_row = input;
  uint8_t *dst_row = dest;
  for (int row = 0; row < bsy; ++row) {
    for (int col = 0; col < bsx; ++col) {
      dst_row[col] = clip_pixel_add(dst_row[col], src_row[col] >> shift);
    }
    dst_row += stride;
    src_row += bsx;
  }
}
// Flips a strided buffer view upside down: advances `dest` by (size - 1)
// strides and then negates `stride`, so that stepping by the new stride walks
// the same rows in reverse order.
// `dest` and `stride` must be modifiable lvalues. Each of them is evaluated
// more than once, so the arguments must not have side effects.
#define FLIPUD_PTR(dest, stride, size) \
do { \
(dest) = (dest) + ((size)-1) * (stride); \
(stride) = -(stride); \
} while (0)
// Adjusts the destination and/or source pointer/stride pairs in place so that
// a subsequent "add transpose of src to dst" loop realises the flips implied
// by tx_type.
//
// Note that the transpose of src will be added to dst. To flip the addends
// left/right (in dst coordinates) the src view is flipped upside down over
// sizex rows; to flip them up/down the dst view is flipped upside down over
// sizey rows. Transform types without a FLIPADST component change nothing;
// an unknown tx_type triggers assert(0).
static void maybe_flip_strides(uint8_t **dst, int *dstride, tran_low_t **src,
                               int *sstride, TX_TYPE tx_type, int sizey,
                               int sizex) {
  int flip_ud = 0;  // up/down flip, applied to dst
  int flip_lr = 0;  // left/right flip, applied to src

  switch (tx_type) {
    case FLIPADST_DCT:
    case FLIPADST_ADST:
    case V_FLIPADST: flip_ud = 1; break;
    case DCT_FLIPADST:
    case ADST_FLIPADST:
    case H_FLIPADST: flip_lr = 1; break;
    case FLIPADST_FLIPADST:
      flip_ud = 1;
      flip_lr = 1;
      break;
    case DCT_DCT:
    case ADST_DCT:
    case DCT_ADST:
    case ADST_ADST:
    case IDTX:
    case V_DCT:
    case H_DCT:
    case V_ADST:
    case H_ADST: break;
    default: assert(0); break;
  }

  if (flip_ud) {
    FLIPUD_PTR(*dst, *dstride, sizey);
  }
  if (flip_lr) {
    FLIPUD_PTR(*src, *sstride, sizex);
  }
}
#if CONFIG_HIGHBITDEPTH
#if CONFIG_TX64X64
// Inverse identity transform and add (high bit-depth destination).
//   input:   bsx * bsy coefficients, row-major with a row pitch of bsx
//   dest8:   destination, converted with CONVERT_TO_SHORTPTR() to uint16_t
//            pixels with row pitch `stride`
//   tx_type: only IDTX is handled; any other type leaves the output untouched
//   bd:      bit depth forwarded to highbd_clip_pixel_add()
// Each coefficient is right-shifted by 3, 2 or 1 bits (one bit less for each
// of the area thresholds 256 and 1024 that bsx * bsy exceeds) before the add.
static void highbd_inv_idtx_add_c(const tran_low_t *input, uint8_t *dest8,
                                  int stride, int bsx, int bsy, TX_TYPE tx_type,
                                  int bd) {
  if (tx_type != IDTX) return;

  const int num_pels = bsx * bsy;
  int shift = 3;
  if (num_pels > 256) --shift;
  if (num_pels > 1024) --shift;

  const tran_low_t *src_row = input;
  uint16_t *dst_row = CONVERT_TO_SHORTPTR(dest8);
  for (int row = 0; row < bsy; ++row) {
    for (int col = 0; col < bsx; ++col) {
      dst_row[col] =
          highbd_clip_pixel_add(dst_row[col], src_row[col] >> shift, bd);
    }
    dst_row += stride;
    src_row += bsx;
  }
}
#endif // CONFIG_TX64X64
#endif // CONFIG_HIGHBITDEPTH
#if CONFIG_LGT || CONFIG_LGT_FROM_PRED
// 4-point inverse line-graph transform (LGT).
//   input:  4 coefficients
//   output: 4 results; may alias input because all sums are accumulated in a
//           local array before anything is written
//   lgtmtx: 4x4 row-major transform matrix; must not be NULL. With
//           CONFIG_LGT_FROM_PRED, a first element equal to DCT4 or ADST4
//           selects the butterfly DCT/ADST implementation instead.
void ilgt4(const tran_low_t *input, tran_low_t *output,
           const tran_high_t *lgtmtx) {
  assert(lgtmtx != NULL);
#if CONFIG_LGT_FROM_PRED
  // For DCT/ADST, use butterfly implementations
  if (lgtmtx[0] == DCT4) {
    aom_idct4_c(input, output);
    return;
  }
  if (lgtmtx[0] == ADST4) {
    aom_iadst4_c(input, output);
    return;
  }
#endif  // CONFIG_LGT_FROM_PRED

  // evaluate s[j] = sum of lgtmtx[i * 4 + j] * input[i] over i = 0,...,3
  tran_high_t s[4] = { 0 };
  for (int i = 0; i < 4; ++i) {
    for (int j = 0; j < 4; ++j) s[j] += lgtmtx[i * 4 + j] * input[i];
  }
  for (int i = 0; i < 4; ++i) output[i] = WRAPLOW(dct_const_round_shift(s[i]));
}
// 8-point inverse line-graph transform (LGT).
//   input:  8 coefficients
//   output: 8 results; may alias input because all sums are accumulated in a
//           local array before anything is written
//   lgtmtx: 8x8 row-major transform matrix; must not be NULL. With
//           CONFIG_LGT_FROM_PRED, a first element equal to DCT8 or ADST8
//           selects the butterfly DCT/ADST implementation instead.
void ilgt8(const tran_low_t *input, tran_low_t *output,
           const tran_high_t *lgtmtx) {
  assert(lgtmtx != NULL);
#if CONFIG_LGT_FROM_PRED
  // For DCT/ADST, use butterfly implementations
  if (lgtmtx[0] == DCT8) {
    aom_idct8_c(input, output);
    return;
  }
  if (lgtmtx[0] == ADST8) {
    aom_iadst8_c(input, output);
    return;
  }
#endif  // CONFIG_LGT_FROM_PRED

  // evaluate s[j] = sum of lgtmtx[i * 8 + j] * input[i] over i = 0,...,7
  tran_high_t s[8] = { 0 };
  for (int i = 0; i < 8; ++i) {
    for (int j = 0; j < 8; ++j) s[j] += lgtmtx[i * 8 + j] * input[i];
  }
  for (int i = 0; i < 8; ++i) output[i] = WRAPLOW(dct_const_round_shift(s[i]));
}
#endif // CONFIG_LGT || CONFIG_LGT_FROM_PRED
#if CONFIG_LGT
// get_lgt4 and get_lgt8 return 1 and pick a lgt matrix if LGT is chosen to
// apply. Otherwise they return 0
// Decides whether a 4-point LGT replaces the 1-D transform in one direction.
//   is_col: non-zero to inspect the vertical 1-D type (vtx_tab), zero for the
//           horizontal one (htx_tab)
//   lgtmtx: lgtmtx[0] receives the chosen matrix, or NULL when LGT is not used
// Returns 1 when the 1-D type is ADST_1D or FLIPADST_1D (lgt4_170 for inter
// blocks, lgt4_140 otherwise), and 0 for every other type.
int get_lgt4(const TxfmParam *txfm_param, int is_col,
             const tran_high_t **lgtmtx) {
  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);

  const int tx_1d = is_col ? vtx_tab[txfm_param->tx_type]
                           : htx_tab[txfm_param->tx_type];
  if (tx_1d != ADST_1D && tx_1d != FLIPADST_1D) {
    lgtmtx[0] = NULL;
    return 0;
  }

  lgtmtx[0] = txfm_param->is_inter ? &lgt4_170[0][0] : &lgt4_140[0][0];
  return 1;
}
// Decides whether an 8-point LGT replaces the 1-D transform in one direction.
//   is_col: non-zero to inspect the vertical 1-D type (vtx_tab), zero for the
//           horizontal one (htx_tab)
//   lgtmtx: lgtmtx[0] receives the chosen matrix, or NULL when LGT is not used
// Returns 1 when the 1-D type is ADST_1D or FLIPADST_1D (lgt8_170 for inter
// blocks, lgt8_150 otherwise), and 0 for every other type.
int get_lgt8(const TxfmParam *txfm_param, int is_col,
             const tran_high_t **lgtmtx) {
  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);

  const int tx_1d = is_col ? vtx_tab[txfm_param->tx_type]
                           : htx_tab[txfm_param->tx_type];
  if (tx_1d != ADST_1D && tx_1d != FLIPADST_1D) {
    lgtmtx[0] = NULL;
    return 0;
  }

  lgtmtx[0] = txfm_param->is_inter ? &lgt8_170[0][0] : &lgt8_150[0][0];
  return 1;
}
#endif // CONFIG_LGT
#if CONFIG_LGT_FROM_PRED
// Inverse "LGT" for lengths 16 and 32. No real LGT is evaluated here: the
// first element of lgtmtx is used as a tag that selects one of the existing
// inverse transforms (16-point DCT/ADST, 32-point DCT, or the 32-point
// half-right transform). Any other tag triggers assert(0).
void ilgt16up(const tran_low_t *input, tran_low_t *output,
              const tran_high_t *lgtmtx) {
  const tran_high_t tag = lgtmtx[0];
  if (tag == DCT16) {
    aom_idct16_c(input, output);
    return;
  }
  if (tag == ADST16) {
    aom_iadst16_c(input, output);
    return;
  }
  if (tag == DCT32) {
    aom_idct32_c(input, output);
    return;
  }
  if (tag == ADST32) {
    ihalfright32_c(input, output);
    return;
  }
  assert(0);
}
// Finds the largest jump between neighbouring samples of a 1-D array.
//   arr:          n samples (read only)
//   n:            number of samples
//   idx_max_diff: receives the index i (1 <= i < n) maximising
//                 |arr[i] - arr[i - 1]|; the first such index wins on ties.
//                 Receives -1 when every difference is zero or n < 2.
void get_discontinuity_1d(uint8_t *arr, int n, int *idx_max_diff) {
  int max_diff = 0;
  *idx_max_diff = -1;
  for (int i = 1; i < n; ++i) {
    const int step = abs(arr[i] - arr[i - 1]);
    if (step > max_diff) {
      max_diff = step;
      *idx_max_diff = i;
    }
  }
}
// Finds the strongest edge between neighbouring lines of a 2-D block.
//   dst:          pixel block with row pitch `stride` (read only)
//   n:            number of lines along the scanned direction
//   is_col:       non-zero compares row i with row i - 1 (vertical
//                 differences); zero compares column i with column i - 1
//                 (horizontal differences)
//   ntx:          number of samples compared per line
//   idx_max_diff: receives the index i (1 <= i < n) with the largest sum of
//                 squared differences; the first such index wins on ties.
//                 Receives -1 when all differences are zero or n < 2.
void get_discontinuity_2d(uint8_t *dst, int stride, int n, int is_col,
                          int *idx_max_diff, int ntx) {
  int max_diff = 0;
  *idx_max_diff = -1;
  for (int i = 1; i < n; ++i) {
    // sq_sum / ntx is the i-th average squared difference; the division is
    // skipped because only the ordering matters.
    int sq_sum = 0;
    for (int j = 0; j < ntx; ++j) {
      const int diff =
          is_col ? dst[i * stride + j] - dst[(i - 1) * stride + j]   // vertical
                 : dst[j * stride + i] - dst[j * stride + i - 1];    // horizontal
      sq_sum += diff * diff;
    }
    if (sq_sum > max_diff) {
      max_diff = sq_sum;
      *idx_max_diff = i;
    }
  }
}
// Maps a prediction mode and a transform direction to a self-loop strength
// index in the range 0..3.
//   mode:   prediction mode of the block
//   is_col: non-zero for the column (vertical) transform, zero for the row
//           (horizontal) transform
// The directional modes use a finer mapping when LGT_SL_INTRA is enabled.
// Every mode not listed (the inter modes) maps to 0.
int idx_selfloop_wrt_mode(PREDICTION_MODE mode, int is_col) {
// 0: no self-loop
// 1: small self-loop
// 2: medium self-loop
// 3: large self-loop
switch (mode) {
case DC_PRED:
case SMOOTH_PRED:
// prediction is good for both directions: large SLs for row and col
return 3;
case PAETH_PRED: return 0;
#if CONFIG_SMOOTH_HV
case SMOOTH_H_PRED:
#endif
case H_PRED:
// prediction is good for H direction: large SL for row only
return is_col ? 0 : 3;
#if CONFIG_SMOOTH_HV
case SMOOTH_V_PRED:
#endif
case V_PRED:
// prediction is good for V direction: large SL for col only
return is_col ? 3 : 0;
#if LGT_SL_INTRA
// directional mode: choose SL based on the direction
case D45_PRED: return is_col ? 2 : 0;
case D63_PRED: return is_col ? 3 : 0;
case D117_PRED: return is_col ? 3 : 1;
case D135_PRED: return 2;
case D153_PRED: return is_col ? 1 : 3;
case D207_PRED: return is_col ? 0 : 3;
#else
// without LGT_SL_INTRA the directional modes are split into two groups:
// large SL for col only, or large SL for row only
case D45_PRED:
case D63_PRED:
case D117_PRED: return is_col ? 3 : 0;
case D135_PRED:
case D153_PRED:
case D207_PRED: return is_col ? 0 : 3;
#endif
// inter: no SL
default: return 0;
}
}
// Selects the 4-point LGT matrix for one transform direction from the
// prediction mode and, for some modes, from a discontinuity ("break point")
// detected in the pixels at txfm_param->dst.
//   txfm_param: supplies mode, dst and stride (plus tx_set_type/tx_type for
//               the sanity assert)
//   is_col:     non-zero for the column transform, zero for the row transform
//   lgtmtx:     lgtmtx[0] receives the selected matrix (always non-NULL)
//   ntx:        number of samples per line passed to get_discontinuity_2d()
void get_lgt4_from_pred(const TxfmParam *txfm_param, int is_col,
const tran_high_t **lgtmtx, int ntx) {
assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
PREDICTION_MODE mode = txfm_param->mode;
int stride = txfm_param->stride;
uint8_t *dst = txfm_param->dst;
// bp is the break point index; -1 means "no break point found/used"
int bp = -1;
uint8_t arr[4];
// Each lgt4mtx_arr[k][i] corresponds to a line graph with a self-loop on
// the first node, and possibly a weak edge within the line graph. i is
// the index of the weak edge (between the i-th and (i+1)-th pixels, i=0
// means no weak edge). k corresponds to the first self-loop's weight
const tran_high_t *lgt4mtx_arr[4][4] = {
{ &lgt4_000[0][0], &lgt4_000w1[0][0], &lgt4_000w2[0][0],
&lgt4_000w3[0][0] },
{ &lgt4_060[0][0], &lgt4_060_000w1[0][0], &lgt4_060_000w2[0][0],
&lgt4_060_000w3[0][0] },
{ &lgt4_100[0][0], &lgt4_100_000w1[0][0], &lgt4_100_000w2[0][0],
&lgt4_100_000w3[0][0] },
{ &lgt4_150[0][0], &lgt4_150_000w1[0][0], &lgt4_150_000w2[0][0],
&lgt4_150_000w3[0][0] },
};
// initialize to DCT or some LGTs, and then change later if necessary
int idx_sl = idx_selfloop_wrt_mode(mode, is_col);
lgtmtx[0] = lgt4mtx_arr[idx_sl][0];
// find the break point and replace the line graph by the one with a
// break point
if (mode == DC_PRED || mode == SMOOTH_PRED) {
// Do not use break point, since 1) is_left_available and is_top_available
// in DC_PRED are not known by txfm_param for now, so accessing
// both boundaries anyway may cause a mismatch 2) DC prediction
// typically yields very smooth residues so having the break point
// does not usually improve the RD result.
return;
} else if (mode == PAETH_PRED) {
// PAETH_PRED: use both 1D top boundary and 1D left boundary
// NOTE(review): the samples read here are dst's own first column
// (is_col) or first row (!is_col), not dst[-1] / dst[-stride] -- confirm
// that dst holds what the comment above calls the "boundary".
if (is_col)
for (int i = 0; i < 4; ++i) arr[i] = dst[i * stride];
else
for (int i = 0; i < 4; ++i) arr[i] = dst[i];
get_discontinuity_1d(&arr[0], 4, &bp);
} else if (mode == V_PRED) {
// V_PRED: use 1D top boundary only
// (only the row transform gets a break point; samples are dst's first row)
if (is_col) return;
for (int i = 0; i < 4; ++i) arr[i] = dst[i];
get_discontinuity_1d(&arr[0], 4, &bp);
} else if (mode == H_PRED) {
// H_PRED: use 1D left boundary only
// (only the column transform gets a break point; samples are dst's first
// column)
if (!is_col) return;
for (int i = 0; i < 4; ++i) arr[i] = dst[i * stride];
get_discontinuity_1d(&arr[0], 4, &bp);
#if CONFIG_SMOOTH_HV
} else if (mode == SMOOTH_V_PRED) {
// SMOOTH_V_PRED: row transform only; reads the row just above dst
if (is_col) return;
for (int i = 0; i < 4; ++i) arr[i] = dst[-stride + i];
get_discontinuity_1d(&arr[0], 4, &bp);
} else if (mode == SMOOTH_H_PRED) {
// SMOOTH_H_PRED: column transform only; reads the column just left of dst
if (!is_col) return;
for (int i = 0; i < 4; ++i) arr[i] = dst[i * stride - 1];
get_discontinuity_1d(&arr[0], 4, &bp);
#endif
} else if (mode == D45_PRED || mode == D63_PRED || mode == D117_PRED) {
// directional modes closer to vertical (maybe include D135 later)
if (!is_col) get_discontinuity_2d(dst, stride, 4, 0, &bp, ntx);
} else if (mode == D135_PRED || mode == D153_PRED || mode == D207_PRED) {
// directional modes closer to horizontal
if (is_col) get_discontinuity_2d(dst, stride, 4, 1, &bp, ntx);
} else if (mode > PAETH_PRED) {
// inter
get_discontinuity_2d(dst, stride, 4, is_col, &bp, ntx);
}
// A break point was found: switch to the matrix with a weak edge at bp.
// Without LGT_SL_INTRA the self-loop index is dropped (row 0) in that case.
#if LGT_SL_INTRA
if (bp != -1) lgtmtx[0] = lgt4mtx_arr[idx_sl][bp];
#else
if (bp != -1) lgtmtx[0] = lgt4mtx_arr[0][bp];
#endif
}
// Selects the 8-point LGT matrix for one transform direction from the
// prediction mode and, for some modes, from a discontinuity ("break point")
// detected in the pixels at txfm_param->dst.
//   txfm_param: supplies mode, dst and stride (plus tx_set_type/tx_type for
//               the sanity assert)
//   is_col:     non-zero for the column transform, zero for the row transform
//   lgtmtx:     lgtmtx[0] receives the selected matrix (always non-NULL)
//   ntx:        number of samples per line passed to get_discontinuity_2d()
void get_lgt8_from_pred(const TxfmParam *txfm_param, int is_col,
const tran_high_t **lgtmtx, int ntx) {
assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
PREDICTION_MODE mode = txfm_param->mode;
int stride = txfm_param->stride;
uint8_t *dst = txfm_param->dst;
// bp is the break point index; -1 means "no break point found/used"
int bp = -1;
uint8_t arr[8];
// lgt8mtx_arr[k][i]: k is the self-loop index returned by
// idx_selfloop_wrt_mode(), i is the break point index (0 = no weak edge)
const tran_high_t *lgt8mtx_arr[4][8] = {
{ &lgt8_000[0][0], &lgt8_000w1[0][0], &lgt8_000w2[0][0], &lgt8_000w3[0][0],
&lgt8_000w4[0][0], &lgt8_000w5[0][0], &lgt8_000w6[0][0],
&lgt8_000w7[0][0] },
{ &lgt8_060[0][0], &lgt8_060_000w1[0][0], &lgt8_060_000w2[0][0],
&lgt8_060_000w3[0][0], &lgt8_060_000w4[0][0], &lgt8_060_000w5[0][0],
&lgt8_060_000w6[0][0], &lgt8_060_000w7[0][0] },
{ &lgt8_100[0][0], &lgt8_100_000w1[0][0], &lgt8_100_000w2[0][0],
&lgt8_100_000w3[0][0], &lgt8_100_000w4[0][0], &lgt8_100_000w5[0][0],
&lgt8_100_000w6[0][0], &lgt8_100_000w7[0][0] },
{ &lgt8_150[0][0], &lgt8_150_000w1[0][0], &lgt8_150_000w2[0][0],
&lgt8_150_000w3[0][0], &lgt8_150_000w4[0][0], &lgt8_150_000w5[0][0],
&lgt8_150_000w6[0][0], &lgt8_150_000w7[0][0] },
};
// start from the matrix without a weak edge; replaced below if a break
// point is found
int idx_sl = idx_selfloop_wrt_mode(mode, is_col);
lgtmtx[0] = lgt8mtx_arr[idx_sl][0];
if (mode == DC_PRED || mode == SMOOTH_PRED) {
// no break point search for these modes
return;
} else if (mode == PAETH_PRED) {
// 1D search along dst's first column (is_col) or first row (!is_col)
if (is_col)
for (int i = 0; i < 8; ++i) arr[i] = dst[i * stride];
else
for (int i = 0; i < 8; ++i) arr[i] = dst[i];
get_discontinuity_1d(&arr[0], 8, &bp);
} else if (mode == V_PRED) {
// row transform only; 1D search along dst's first row
if (is_col) return;
for (int i = 0; i < 8; ++i) arr[i] = dst[i];
get_discontinuity_1d(&arr[0], 8, &bp);
} else if (mode == H_PRED) {
// column transform only; 1D search along dst's first column
if (!is_col) return;
for (int i = 0; i < 8; ++i) arr[i] = dst[i * stride];
get_discontinuity_1d(&arr[0], 8, &bp);
#if CONFIG_SMOOTH_HV
} else if (mode == SMOOTH_V_PRED) {
// row transform only; 1D search along the row just above dst
if (is_col) return;
for (int i = 0; i < 8; ++i) arr[i] = dst[-stride + i];
get_discontinuity_1d(&arr[0], 8, &bp);
} else if (mode == SMOOTH_H_PRED) {
// column transform only; 1D search along the column just left of dst
if (!is_col) return;
for (int i = 0; i < 8; ++i) arr[i] = dst[i * stride - 1];
get_discontinuity_1d(&arr[0], 8, &bp);
#endif
} else if (mode == D45_PRED || mode == D63_PRED || mode == D117_PRED) {
// row transform only; 2D search over horizontal differences
if (!is_col) get_discontinuity_2d(dst, stride, 8, 0, &bp, ntx);
} else if (mode == D135_PRED || mode == D153_PRED || mode == D207_PRED) {
// column transform only; 2D search over vertical differences
if (is_col) get_discontinuity_2d(dst, stride, 8, 1, &bp, ntx);
} else if (mode > PAETH_PRED) {
// inter
get_discontinuity_2d(dst, stride, 8, is_col, &bp, ntx);
}
// A break point was found: switch to the matrix with a weak edge at bp.
// Without LGT_SL_INTRA the self-loop index is dropped (row 0) in that case.
#if LGT_SL_INTRA
if (bp != -1) lgtmtx[0] = lgt8mtx_arr[idx_sl][bp];
#else
if (bp != -1) lgtmtx[0] = lgt8mtx_arr[0][bp];
#endif
}
// Since LGTs with length >8 are not implemented now, the following function
// will just call DCT or ADST
// Selects the "matrix" for 1-D lengths 16 and 32 from the prediction mode.
// The result is always one of two placeholders: lgt16_000/lgt32_000 (named
// dctmtx below) or lgt16_200/lgt32_200 (named adstmtx below).
//   txfm_param: supplies mode and tx_size (plus tx_set_type/tx_type for the
//               sanity assert)
//   is_col:     non-zero for the column transform (length = block height),
//               zero for the row transform (length = block width)
//   lgtmtx:     lgtmtx[0] receives the selected matrix
//   ntx:        unused
void get_lgt16up_from_pred(const TxfmParam *txfm_param, int is_col,
const tran_high_t **lgtmtx, int ntx) {
assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
int tx_length = is_col ? tx_size_high[txfm_param->tx_size]
: tx_size_wide[txfm_param->tx_size];
assert(tx_length == 16 || tx_length == 32);
PREDICTION_MODE mode = txfm_param->mode;
(void)ntx;
const tran_high_t *dctmtx =
tx_length == 16 ? &lgt16_000[0][0] : &lgt32_000[0][0];
const tran_high_t *adstmtx =
tx_length == 16 ? &lgt16_200[0][0] : &lgt32_200[0][0];
switch (mode) {
case DC_PRED:
case PAETH_PRED:
case SMOOTH_PRED:
// prediction from both top and left -> ADST
lgtmtx[0] = adstmtx;
break;
case V_PRED:
case D45_PRED:
case D63_PRED:
case D117_PRED:
#if CONFIG_SMOOTH_HV
case SMOOTH_V_PRED:
#endif
// prediction from the top more than from the left -> ADST for columns,
// DCT for rows
lgtmtx[0] = is_col ? adstmtx : dctmtx;
break;
case H_PRED:
case D135_PRED:
case D153_PRED:
case D207_PRED:
#if CONFIG_SMOOTH_HV
case SMOOTH_H_PRED:
#endif
// prediction from the left more than from the top -> DCT for columns,
// ADST for rows
lgtmtx[0] = is_col ? dctmtx : adstmtx;
break;
// every other mode (inter): DCT
default: lgtmtx[0] = dctmtx; break;
}
}
// 1-D inverse LGT function: transforms `input` into `output` using `lgtmtx`.
typedef void (*IlgtFunc)(const tran_low_t *input, tran_low_t *output,
const tran_high_t *lgtmtx);
// Inverse LGT per 1-D length, indexed by log2(length) - 2, i.e. entries for
// lengths 4, 8, 16 and 32.
static IlgtFunc ilgt_func[4] = { ilgt4, ilgt8, ilgt16up, ilgt16up };
// Matrix selector: stores the matrix chosen for one direction in lgtmtx[0].
typedef void (*GetLgtFunc)(const TxfmParam *txfm_param, int is_col,
const tran_high_t **lgtmtx, int ntx);
// Matrix selector per 1-D length, indexed the same way as ilgt_func.
static GetLgtFunc get_lgt_func[4] = { get_lgt4_from_pred, get_lgt8_from_pred,
get_lgt16up_from_pred,
get_lgt16up_from_pred };
// this inline function corresponds to the up scaling before the transpose
// operation in the av1_iht* functions
static INLINE tran_low_t inv_upscale_wrt_txsize(const tran_high_t val,
                                                const TX_SIZE tx_size) {
  // Sizes that pass the value through unchanged.
  if (tx_size == TX_4X4 || tx_size == TX_8X8 || tx_size == TX_4X16 ||
      tx_size == TX_16X4 || tx_size == TX_8X32 || tx_size == TX_32X8) {
    return (tran_low_t)val;
  }
  // Sizes with a 2:1 aspect ratio get an extra factor of sqrt(2).
  if (tx_size == TX_4X8 || tx_size == TX_8X4 || tx_size == TX_8X16 ||
      tx_size == TX_16X8) {
    return (tran_low_t)dct_const_round_shift(val * Sqrt2);
  }
  // Any other size is not supported here.
  assert(0);
  return 0;
}
// This inline function corresponds to the bit shift before summing with the
// destination in the av1_iht* functions
static INLINE tran_low_t inv_downscale_wrt_txsize(const tran_low_t val,
                                                  const TX_SIZE tx_size) {
  // Number of bits of rounding right shift applied for this size.
  int bits;
  switch (tx_size) {
    case TX_4X4: bits = 4; break;
    case TX_4X8:
    case TX_8X4:
    case TX_8X8:
    case TX_4X16:
    case TX_16X4: bits = 5; break;
    case TX_8X16:
    case TX_16X8:
    case TX_8X32:
    case TX_32X8: bits = 6; break;
    default:
      // Any other size is not supported here.
      assert(0);
      return 0;
  }
  return ROUND_POWER_OF_TWO(val, bits);
}
// 2-D inverse LGT with the 1-D matrices derived from the prediction, followed
// by an add into the destination.
//   input:      w * h coefficients, row-major with a row pitch of w
//   dest:       destination pixels with row pitch `stride`; the reconstructed
//               residual is added with clipping
//   txfm_param: supplies tx_size and everything the matrix selectors need
// Only sizes with w <= 8 or h <= 8 are supported (see the assert and the
// buffer sizes below).
void ilgt2d_from_pred_add(const tran_low_t *input, uint8_t *dest, int stride,
const TxfmParam *txfm_param) {
const TX_SIZE tx_size = txfm_param->tx_size;
const int w = tx_size_wide[tx_size];
const int h = tx_size_high[tx_size];
const int wlog2 = tx_size_wide_log2[tx_size];
const int hlog2 = tx_size_high_log2[tx_size];
assert(w <= 8 || h <= 8);
int i, j;
// largest 1D size allowed for LGT: 32
// largest 2D size allowed for LGT: 8x32=256
tran_low_t tmp[256], out[256], temp1d[32];
const tran_high_t *lgtmtx_col[1];
const tran_high_t *lgtmtx_row[1];
// tables are indexed by log2(length) - 2; the column selector is given the
// width as its line length and the row selector the height
get_lgt_func[hlog2 - 2](txfm_param, 1, lgtmtx_col, w);
get_lgt_func[wlog2 - 2](txfm_param, 0, lgtmtx_row, h);
// for inverse transform, to be consistent with av1_iht functions, we always
// apply row transforms first and column transforms second, but both
// row-first and column-first versions are implemented here for future
// tests (use different lgtmtx_col[i], and choose row or column tx first
// depending on transforms).
// NOTE(review): the branch enabled by "#if 1" below runs the column
// transforms first, which contradicts the sentence above -- confirm which
// order is intended.
#if 1
// inverse column transforms
for (i = 0; i < w; ++i) {
// transpose
for (j = 0; j < h; ++j) tmp[i * h + j] = input[j * w + i];
ilgt_func[hlog2 - 2](&tmp[i * h], temp1d, lgtmtx_col[0]);
// upscale, and store in place
for (j = 0; j < h; ++j)
tmp[i * h + j] = inv_upscale_wrt_txsize(temp1d[j], tx_size);
}
// inverse row transforms
for (i = 0; i < h; ++i) {
// gather row i from the column-major tmp
for (j = 0; j < w; ++j) temp1d[j] = tmp[j * h + i];
ilgt_func[wlog2 - 2](temp1d, &out[i * w], lgtmtx_row[0]);
}
// downscale + sum with the destination
for (i = 0; i < h; ++i) {
for (j = 0; j < w; ++j) {
int d = i * stride + j;
int s = i * w + j;
dest[d] =
clip_pixel_add(dest[d], inv_downscale_wrt_txsize(out[s], tx_size));
}
}
#else
// inverse row transforms
for (i = 0; i < h; ++i) {
ilgt_func[wlog2 - 2](input, temp1d, lgtmtx_row[0]);
// upscale and transpose (tmp[j*h+i] <--> tmp[j][i])
for (j = 0; j < w; ++j)
tmp[j * h + i] = inv_upscale_wrt_txsize(temp1d[j], tx_size);
input += w;
}
// inverse column transforms
for (i = 0; i < w; ++i)
ilgt_func[hlog2 - 2](&tmp[i * h], &out[i * h], lgtmtx_col[0]);
// here, out[] is the transpose of 2D block of transform coefficients
// downscale + transpose + sum with dest
for (i = 0; i < h; ++i) {
for (j = 0; j < w; ++j) {
int d = i * stride + j;
int s = j * h + i;
dest[d] =
clip_pixel_add(dest[d], inv_downscale_wrt_txsize(out[s], tx_size));
}
}
#endif
}
#endif // CONFIG_LGT_FROM_PRED
// Inverse 4x4 hybrid transform, added to the destination.
//   input:      16 coefficients, row-major (4 per row)
//   dest:       destination pixels with row pitch `stride`; the residual is
//               rounded (shift by 4) and added with clipping
//   txfm_param: supplies tx_type (and, with CONFIG_LGT, the data get_lgt4()
//               needs)
// Rows are transformed first, the intermediate block is transposed, then the
// columns are transformed. Without CONFIG_DAALA_TX4, DCT_DCT is delegated to
// aom_idct4x4_16_add().
void av1_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                         const TxfmParam *txfm_param) {
  const TX_TYPE tx_type = txfm_param->tx_type;
#if CONFIG_MRC_TX
  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
#endif  // CONFIG_MRC_TX
#if !CONFIG_DAALA_TX4
  if (tx_type == DCT_DCT) {
    aom_idct4x4_16_add(input, dest, stride);
    return;
  }
#endif
  // 1-D transform pair per tx_type. Judging by the V_*/H_* rows the first
  // entry is the vertical (column) transform and the second the horizontal
  // (row) transform; the struct layout itself is not visible here.
  static const transform_2d IHT_4[] = {
#if CONFIG_DAALA_TX4
    { daala_idct4, daala_idct4 },  // DCT_DCT = 0
    { daala_idst4, daala_idct4 },  // ADST_DCT = 1
    { daala_idct4, daala_idst4 },  // DCT_ADST = 2
    { daala_idst4, daala_idst4 },  // ADST_ADST = 3
    { daala_idst4, daala_idct4 },  // FLIPADST_DCT
    { daala_idct4, daala_idst4 },  // DCT_FLIPADST
    { daala_idst4, daala_idst4 },  // FLIPADST_FLIPADST
    { daala_idst4, daala_idst4 },  // ADST_FLIPADST
    { daala_idst4, daala_idst4 },  // FLIPADST_ADST
    { daala_idtx4, daala_idtx4 },  // IDTX
    { daala_idct4, daala_idtx4 },  // V_DCT
    { daala_idtx4, daala_idct4 },  // H_DCT
    { daala_idst4, daala_idtx4 },  // V_ADST
    { daala_idtx4, daala_idst4 },  // H_ADST
    { daala_idst4, daala_idtx4 },  // V_FLIPADST
    { daala_idtx4, daala_idst4 },  // H_FLIPADST
#else
    { aom_idct4_c, aom_idct4_c },    // DCT_DCT = 0
    { aom_iadst4_c, aom_idct4_c },   // ADST_DCT = 1
    { aom_idct4_c, aom_iadst4_c },   // DCT_ADST = 2
    { aom_iadst4_c, aom_iadst4_c },  // ADST_ADST = 3
    { aom_iadst4_c, aom_idct4_c },   // FLIPADST_DCT
    { aom_idct4_c, aom_iadst4_c },   // DCT_FLIPADST
    { aom_iadst4_c, aom_iadst4_c },  // FLIPADST_FLIPADST
    { aom_iadst4_c, aom_iadst4_c },  // ADST_FLIPADST
    { aom_iadst4_c, aom_iadst4_c },  // FLIPADST_ADST
    { iidtx4_c, iidtx4_c },          // IDTX
    { aom_idct4_c, iidtx4_c },       // V_DCT
    { iidtx4_c, aom_idct4_c },       // H_DCT
    { aom_iadst4_c, iidtx4_c },      // V_ADST
    { iidtx4_c, aom_iadst4_c },      // H_ADST
    { aom_iadst4_c, iidtx4_c },      // V_FLIPADST
    { iidtx4_c, aom_iadst4_c },      // H_FLIPADST
#endif
  };

  int i, j;
  tran_low_t tmp[4][4];
  tran_low_t out[4][4];
  tran_low_t *outp = &out[0][0];
  int outstride = 4;

#if CONFIG_DCT_ONLY
  assert(tx_type == DCT_DCT);
#endif

#if CONFIG_LGT
  const tran_high_t *lgtmtx_col[1];
  const tran_high_t *lgtmtx_row[1];
  int use_lgt_col = get_lgt4(txfm_param, 1, lgtmtx_col);
  int use_lgt_row = get_lgt4(txfm_param, 0, lgtmtx_row);
#endif

  // inverse transform row vectors
  for (i = 0; i < 4; ++i) {
#if CONFIG_DAALA_TX4
    // Daala path: the input is scaled up by one bit before the row transform
    tran_low_t temp_in[4];
    for (j = 0; j < 4; j++) temp_in[j] = input[j] * 2;
    IHT_4[tx_type].rows(temp_in, out[i]);
#else
#if CONFIG_LGT
    if (use_lgt_row)
      ilgt4(input, out[i], lgtmtx_row[0]);
    else
#endif
      IHT_4[tx_type].rows(input, out[i]);
#endif
    input += 4;
  }

  // transpose
  for (i = 0; i < 4; i++) {
    for (j = 0; j < 4; j++) {
      tmp[j][i] = out[i][j];
    }
  }

  // inverse transform column vectors
  for (i = 0; i < 4; ++i) {
#if CONFIG_LGT
    if (use_lgt_col)
      ilgt4(tmp[i], out[i], lgtmtx_col[0]);
    else
#endif
      IHT_4[tx_type].cols(tmp[i], out[i]);
  }

  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 4, 4);

  // Sum with the destination. out[] holds one transformed column per row, so
  // it is read transposed. The rounding shift is 4 bits both with and without
  // CONFIG_DAALA_TX4.
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < 4; ++j) {
      int d = i * stride + j;
      int s = j * outstride + i;
      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
    }
  }
}
// Inverse 4x8 (4 wide, 8 high) hybrid transform, added to the destination.
//   input:      32 coefficients, row-major (8 rows of 4)
//   dest:       destination pixels with row pitch `stride`; the residual is
//               rounded and added with clipping
//   txfm_param: supplies tx_type (and, with CONFIG_LGT, the data the
//               get_lgt*() selectors need)
// The 8 rows are transformed with the 4-point row transform and stored
// transposed; the 4 columns are then transformed with the 8-point column
// transform.
void av1_iht4x8_32_add_c(const tran_low_t *input, uint8_t *dest, int stride,
const TxfmParam *txfm_param) {
const TX_TYPE tx_type = txfm_param->tx_type;
#if CONFIG_MRC_TX
assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
#endif // CONFIG_MRC_TX
#if CONFIG_DCT_ONLY
assert(tx_type == DCT_DCT);
#endif
// 1-D transform pair per tx_type: first entry is the 8-point transform used
// via .cols below, second entry the 4-point transform used via .rows.
static const transform_2d IHT_4x8[] = {
#if CONFIG_DAALA_TX4 && CONFIG_DAALA_TX8
{ daala_idct8, daala_idct4 }, // DCT_DCT = 0
{ daala_idst8, daala_idct4 }, // ADST_DCT = 1
{ daala_idct8, daala_idst4 }, // DCT_ADST = 2
{ daala_idst8, daala_idst4 }, // ADST_ADST = 3
{ daala_idst8, daala_idct4 }, // FLIPADST_DCT
{ daala_idct8, daala_idst4 }, // DCT_FLIPADST
{ daala_idst8, daala_idst4 }, // FLIPADST_FLIPADST
{ daala_idst8, daala_idst4 }, // ADST_FLIPADST
{ daala_idst8, daala_idst4 }, // FLIPADST_ADST
{ daala_idtx8, daala_idtx4 }, // IDTX
{ daala_idct8, daala_idtx4 }, // V_DCT
{ daala_idtx8, daala_idct4 }, // H_DCT
{ daala_idst8, daala_idtx4 }, // V_ADST
{ daala_idtx8, daala_idst4 }, // H_ADST
{ daala_idst8, daala_idtx4 }, // V_FLIPADST
{ daala_idtx8, daala_idst4 }, // H_FLIPADST
#else
{ aom_idct8_c, aom_idct4_c }, // DCT_DCT
{ aom_iadst8_c, aom_idct4_c }, // ADST_DCT
{ aom_idct8_c, aom_iadst4_c }, // DCT_ADST
{ aom_iadst8_c, aom_iadst4_c }, // ADST_ADST
{ aom_iadst8_c, aom_idct4_c }, // FLIPADST_DCT
{ aom_idct8_c, aom_iadst4_c }, // DCT_FLIPADST
{ aom_iadst8_c, aom_iadst4_c }, // FLIPADST_FLIPADST
{ aom_iadst8_c, aom_iadst4_c }, // ADST_FLIPADST
{ aom_iadst8_c, aom_iadst4_c }, // FLIPADST_ADST
{ iidtx8_c, iidtx4_c }, // IDTX
{ aom_idct8_c, iidtx4_c }, // V_DCT
{ iidtx8_c, aom_idct4_c }, // H_DCT
{ aom_iadst8_c, iidtx4_c }, // V_ADST
{ iidtx8_c, aom_iadst4_c }, // H_ADST
{ aom_iadst8_c, iidtx4_c }, // V_FLIPADST
{ iidtx8_c, aom_iadst4_c }, // H_FLIPADST
#endif
};
// n = block width (row transform length), n2 = block height (column
// transform length)
const int n = 4;
const int n2 = 8;
int i, j;
// tmp[col][row] holds the transposed row-transform output; out[col][row]
// holds the column-transform output
tran_low_t out[4][8], tmp[4][8], outtmp[4];
tran_low_t *outp = &out[0][0];
int outstride = n2;
#if CONFIG_LGT
const tran_high_t *lgtmtx_col[1];
const tran_high_t *lgtmtx_row[1];
int use_lgt_col = get_lgt8(txfm_param, 1, lgtmtx_col);
int use_lgt_row = get_lgt4(txfm_param, 0, lgtmtx_row);
#endif
// Multi-way scaling matrix (bits):
// LGT/AV1 row,col input+0, rowTX+.5, mid+.5, colTX+1, out-5 == -3
// LGT row, Daala col input+0, rowTX+.5, mid+.5, colTX+0, out-4 == -3
// Daala row, LGT col input+1, rowTX+0, mid+0, colTX+1, out-5 == -3
// Daala row,col input+1, rowTX+0, mid+0, colTX+0, out-4 == -3
// inverse transform row vectors and transpose
for (i = 0; i < n2; ++i) {
#if CONFIG_LGT
if (use_lgt_row) {
// Scaling cases 1 and 2 above
// No input scaling
// Row transform (LGT; scales up .5 bits)
ilgt4(input, outtmp, lgtmtx_row[0]);
// Transpose and mid scaling up by .5 bit
for (j = 0; j < n; ++j)
tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
} else {
#endif
#if CONFIG_DAALA_TX4 && CONFIG_DAALA_TX8
// Daala row transform; Scaling cases 3 and 4 above
tran_low_t temp_in[4];
// Input scaling up by 1 bit
for (j = 0; j < n; j++) temp_in[j] = input[j] * 2;
// Row transform; Daala does not scale
IHT_4x8[tx_type].rows(temp_in, outtmp);
// Transpose; no mid scaling
for (j = 0; j < n; ++j) tmp[j][i] = outtmp[j];
#else
// AV1 row transform; Scaling case 1 only
// Row transform (AV1 scales up .5 bits)
IHT_4x8[tx_type].rows(input, outtmp);
// Transpose and mid scaling up by .5 bit
for (j = 0; j < n; ++j)
tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
#endif
#if CONFIG_LGT
}
#endif
input += n;
}
// inverse transform column vectors
// AV1/LGT column TX scales up by 1 bit, Daala does not scale
for (i = 0; i < n; ++i) {
#if CONFIG_LGT
if (use_lgt_col)
ilgt8(tmp[i], out[i], lgtmtx_col[0]);
else
#endif
IHT_4x8[tx_type].cols(tmp[i], out[i]);
}
maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n2, n);
// Sum with the destination
// (out[] is stored one column per row, so it is read transposed)
for (i = 0; i < n2; ++i) {
for (j = 0; j < n; ++j) {
int d = i * stride + j;
int s = j * outstride + i;
#if CONFIG_DAALA_TX4 && CONFIG_DAALA_TX8
#if CONFIG_LGT
if (use_lgt_col)
// Output Scaling cases 1, 3
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
else
#endif
// Output scaling cases 2, 4
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
#else
// Output scaling case 1 only
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
#endif
}
}
}
// Inverse 8x4 (8 wide, 4 high) hybrid transform, added to the destination.
//   input:      32 coefficients, row-major (4 rows of 8)
//   dest:       destination pixels with row pitch `stride`; the residual is
//               rounded and added with clipping
//   txfm_param: supplies tx_type (and, with CONFIG_LGT, the data the
//               get_lgt*() selectors need)
// The 4 rows are transformed with the 8-point row transform and stored
// transposed; the 8 columns are then transformed with the 4-point column
// transform.
void av1_iht8x4_32_add_c(const tran_low_t *input, uint8_t *dest, int stride,
const TxfmParam *txfm_param) {
const TX_TYPE tx_type = txfm_param->tx_type;
#if CONFIG_MRC_TX
assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
#endif // CONFIG_MRC_TX
#if CONFIG_DCT_ONLY
assert(tx_type == DCT_DCT);
#endif
// 1-D transform pair per tx_type: first entry is the 4-point transform used
// via .cols below, second entry the 8-point transform used via .rows.
static const transform_2d IHT_8x4[] = {
#if CONFIG_DAALA_TX4 && CONFIG_DAALA_TX8
{ daala_idct4, daala_idct8 }, // DCT_DCT = 0
{ daala_idst4, daala_idct8 }, // ADST_DCT = 1
{ daala_idct4, daala_idst8 }, // DCT_ADST = 2
{ daala_idst4, daala_idst8 }, // ADST_ADST = 3
{ daala_idst4, daala_idct8 }, // FLIPADST_DCT
{ daala_idct4, daala_idst8 }, // DCT_FLIPADST
{ daala_idst4, daala_idst8 }, // FLIPADST_FLIPADST
{ daala_idst4, daala_idst8 }, // ADST_FLIPADST
{ daala_idst4, daala_idst8 }, // FLIPADST_ADST
{ daala_idtx4, daala_idtx8 }, // IDTX
{ daala_idct4, daala_idtx8 }, // V_DCT
{ daala_idtx4, daala_idct8 }, // H_DCT
{ daala_idst4, daala_idtx8 }, // V_ADST
{ daala_idtx4, daala_idst8 }, // H_ADST
{ daala_idst4, daala_idtx8 }, // V_FLIPADST
{ daala_idtx4, daala_idst8 }, // H_FLIPADST
#else
{ aom_idct4_c, aom_idct8_c }, // DCT_DCT
{ aom_iadst4_c, aom_idct8_c }, // ADST_DCT
{ aom_idct4_c, aom_iadst8_c }, // DCT_ADST
{ aom_iadst4_c, aom_iadst8_c }, // ADST_ADST
{ aom_iadst4_c, aom_idct8_c }, // FLIPADST_DCT
{ aom_idct4_c, aom_iadst8_c }, // DCT_FLIPADST
{ aom_iadst4_c, aom_iadst8_c }, // FLIPADST_FLIPADST
{ aom_iadst4_c, aom_iadst8_c }, // ADST_FLIPADST
{ aom_iadst4_c, aom_iadst8_c }, // FLIPADST_ADST
{ iidtx4_c, iidtx8_c }, // IDTX
{ aom_idct4_c, iidtx8_c }, // V_DCT
{ iidtx4_c, aom_idct8_c }, // H_DCT
{ aom_iadst4_c, iidtx8_c }, // V_ADST
{ iidtx4_c, aom_iadst8_c }, // H_ADST
{ aom_iadst4_c, iidtx8_c }, // V_FLIPADST
{ iidtx4_c, aom_iadst8_c }, // H_FLIPADST
#endif
};
// n = block height (column transform length), n2 = block width (row
// transform length)
const int n = 4;
const int n2 = 8;
int i, j;
// tmp[col][row] holds the transposed row-transform output; out[col][row]
// holds the column-transform output
tran_low_t out[8][4], tmp[8][4], outtmp[8];
tran_low_t *outp = &out[0][0];
int outstride = n;
#if CONFIG_LGT
const tran_high_t *lgtmtx_col[1];
const tran_high_t *lgtmtx_row[1];
int use_lgt_col = get_lgt4(txfm_param, 1, lgtmtx_col);
int use_lgt_row = get_lgt8(txfm_param, 0, lgtmtx_row);
#endif
// Multi-way scaling matrix (bits):
// LGT/AV1 row,col input+0, rowTX+1, mid+.5, colTX+.5, out-5 == -3
// LGT row, Daala col input+0, rowTX+1, mid+.5, colTX+.5, out-4 == -3
// Daala row, LGT col input+1, rowTX+0, mid+0, colTX+1, out-5 == -3
// Daala row,col input+1, rowTX+0, mid+0, colTX+0, out-4 == -3
// NOTE(review): as written, the second row of the table sums to -2 rather
// than -3, and the third row lists colTX+1 although the column-transform
// comment below says LGT scales up by .5 bits here. Confirm the intended
// scaling for the mixed LGT/Daala configurations.
// inverse transform row vectors and transpose
for (i = 0; i < n; ++i) {
#if CONFIG_LGT
if (use_lgt_row) {
// Scaling cases 1 and 2 above
// No input scaling
// Row transform (LGT; scales up 1 bit)
ilgt8(input, outtmp, lgtmtx_row[0]);
// Transpose and mid scaling up by .5 bit
for (j = 0; j < n2; ++j)
tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
} else {
#endif
#if CONFIG_DAALA_TX4 && CONFIG_DAALA_TX8
// Daala row transform; Scaling cases 3 and 4 above
tran_low_t temp_in[8];
// Input scaling up by 1 bit
for (j = 0; j < n2; j++) temp_in[j] = input[j] * 2;
// Row transform; Daala does not scale
IHT_8x4[tx_type].rows(temp_in, outtmp);
// Transpose; no mid scaling
for (j = 0; j < n2; ++j) tmp[j][i] = outtmp[j];
#else
// AV1 row transform; Scaling case 1 only
// Row transform (AV1 scales up 1 bit)
IHT_8x4[tx_type].rows(input, outtmp);
// Transpose and mid scaling up by .5 bit
for (j = 0; j < n2; ++j)
tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
#endif
#if CONFIG_LGT
}
#endif
input += n2;
}
// inverse transform column vectors
// AV1 and LGT scale up by .5 bits; Daala does not scale
for (i = 0; i < n2; ++i) {
#if CONFIG_LGT
if (use_lgt_col)
ilgt4(tmp[i], out[i], lgtmtx_col[0]);
else
#endif
IHT_8x4[tx_type].cols(tmp[i], out[i]);
}
maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n, n2);
// Sum with the destination
// (out[] is stored one column per row, so it is read transposed)
for (i = 0; i < n; ++i) {
for (j = 0; j < n2; ++j) {
int d = i * stride + j;
int s = j * outstride + i;
#if CONFIG_DAALA_TX4 && CONFIG_DAALA_TX8
#if CONFIG_LGT
if (use_lgt_col)
// Output scaling cases 1, 3
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
else
#endif
// Output scaling cases 2, 4
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
#else
// Output scaling case 1
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
#endif
}
}
}
// Inverse 4x16 hybrid transform (4 samples wide, 16 tall). Runs the row
// transform on every 4-sample input row, the column transform on every
// 16-sample column, then adds the rounded result into dest with clipping.
//   input      - 64 coefficients, read as 16 consecutive rows of 4.
//   dest       - 8-bit destination block, updated in place.
//   stride     - step between successive rows of dest.
//   txfm_param - supplies tx_type (and the LGT choice under CONFIG_LGT).
void av1_iht4x16_64_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                          const TxfmParam *txfm_param) {
  const TX_TYPE tx_type = txfm_param->tx_type;
#if CONFIG_MRC_TX
  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
#endif  // CONFIG_MRC_TX
#if CONFIG_DCT_ONLY
  assert(tx_type == DCT_DCT);
#endif
  // Indexed by tx_type; .cols is run on 16-sample columns and .rows on
  // 4-sample rows below.
  static const transform_2d IHT_4x16[] = {
    { aom_idct16_c, aom_idct4_c },    // DCT_DCT
    { aom_iadst16_c, aom_idct4_c },   // ADST_DCT
    { aom_idct16_c, aom_iadst4_c },   // DCT_ADST
    { aom_iadst16_c, aom_iadst4_c },  // ADST_ADST
    { aom_iadst16_c, aom_idct4_c },   // FLIPADST_DCT
    { aom_idct16_c, aom_iadst4_c },   // DCT_FLIPADST
    { aom_iadst16_c, aom_iadst4_c },  // FLIPADST_FLIPADST
    { aom_iadst16_c, aom_iadst4_c },  // ADST_FLIPADST
    { aom_iadst16_c, aom_iadst4_c },  // FLIPADST_ADST
    { iidtx16_c, iidtx4_c },          // IDTX
    { aom_idct16_c, iidtx4_c },       // V_DCT
    { iidtx16_c, aom_idct4_c },       // H_DCT
    { aom_iadst16_c, iidtx4_c },      // V_ADST
    { iidtx16_c, aom_iadst4_c },      // H_ADST
    { aom_iadst16_c, iidtx4_c },      // V_FLIPADST
    { iidtx16_c, aom_iadst4_c },      // H_FLIPADST
  };
  const transform_2d *const txfm = &IHT_4x16[tx_type];
  const int width = 4;
  const int height = 16;
  tran_low_t row_out[4];
  tran_low_t transposed[4][16];
  tran_low_t result[4][16];
  tran_low_t *outp = &result[0][0];
  int outstride = height;
#if CONFIG_LGT
  const tran_high_t *lgtmtx_row[1];
  const int use_lgt_row = get_lgt4(txfm_param, 0, lgtmtx_row);
#endif

  // Row pass: transform each input row and store it transposed.
  for (int r = 0; r < height; ++r) {
    const tran_low_t *const coeffs = input + r * width;
#if CONFIG_LGT
    if (use_lgt_row) {
      ilgt4(coeffs, row_out, lgtmtx_row[0]);
    } else {
      txfm->rows(coeffs, row_out);
    }
#else
    txfm->rows(coeffs, row_out);
#endif
    for (int c = 0; c < width; ++c) transposed[c][r] = row_out[c];
  }

  // Column pass.
  for (int c = 0; c < width; ++c) txfm->cols(transposed[c], result[c]);

  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, height, width);

  // Accumulate into the destination; result is held column-major.
  for (int r = 0; r < height; ++r) {
    uint8_t *const dst_row = dest + r * stride;
    for (int c = 0; c < width; ++c) {
      const tran_low_t residual = outp[c * outstride + r];
      dst_row[c] = clip_pixel_add(dst_row[c], ROUND_POWER_OF_TWO(residual, 5));
    }
  }
}
// Inverse 16x4 hybrid transform (16 samples wide, 4 tall). Runs the row
// transform on every 16-sample input row, the column transform on every
// 4-sample column, then adds the rounded result into dest with clipping.
//   input      - 64 coefficients, read as 4 consecutive rows of 16.
//   dest       - 8-bit destination block, updated in place.
//   stride     - step between successive rows of dest.
//   txfm_param - supplies tx_type (and the LGT choice under CONFIG_LGT).
void av1_iht16x4_64_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                          const TxfmParam *txfm_param) {
  const TX_TYPE tx_type = txfm_param->tx_type;
#if CONFIG_MRC_TX
  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
#endif  // CONFIG_MRC_TX
#if CONFIG_DCT_ONLY
  assert(tx_type == DCT_DCT);
#endif
  // Indexed by tx_type; .cols is run on 4-sample columns and .rows on
  // 16-sample rows below.
  static const transform_2d IHT_16x4[] = {
    { aom_idct4_c, aom_idct16_c },    // DCT_DCT
    { aom_iadst4_c, aom_idct16_c },   // ADST_DCT
    { aom_idct4_c, aom_iadst16_c },   // DCT_ADST
    { aom_iadst4_c, aom_iadst16_c },  // ADST_ADST
    { aom_iadst4_c, aom_idct16_c },   // FLIPADST_DCT
    { aom_idct4_c, aom_iadst16_c },   // DCT_FLIPADST
    { aom_iadst4_c, aom_iadst16_c },  // FLIPADST_FLIPADST
    { aom_iadst4_c, aom_iadst16_c },  // ADST_FLIPADST
    { aom_iadst4_c, aom_iadst16_c },  // FLIPADST_ADST
    { iidtx4_c, iidtx16_c },          // IDTX
    { aom_idct4_c, iidtx16_c },       // V_DCT
    { iidtx4_c, aom_idct16_c },       // H_DCT
    { aom_iadst4_c, iidtx16_c },      // V_ADST
    { iidtx4_c, aom_iadst16_c },      // H_ADST
    { aom_iadst4_c, iidtx16_c },      // V_FLIPADST
    { iidtx4_c, aom_iadst16_c },      // H_FLIPADST
  };
  const transform_2d *const txfm = &IHT_16x4[tx_type];
  const int height = 4;
  const int width = 16;
  tran_low_t row_out[16];
  tran_low_t transposed[16][4];
  tran_low_t result[16][4];
  tran_low_t *outp = &result[0][0];
  int outstride = height;
#if CONFIG_LGT
  const tran_high_t *lgtmtx_col[1];
  const int use_lgt_col = get_lgt4(txfm_param, 1, lgtmtx_col);
#endif

  // Row pass: transform each input row and store it transposed.
  for (int r = 0; r < height; ++r) {
    txfm->rows(input + r * width, row_out);
    for (int c = 0; c < width; ++c) transposed[c][r] = row_out[c];
  }

  // Column pass, optionally through the 4-point LGT.
  for (int c = 0; c < width; ++c) {
#if CONFIG_LGT
    if (use_lgt_col) {
      ilgt4(transposed[c], result[c], lgtmtx_col[0]);
    } else {
      txfm->cols(transposed[c], result[c]);
    }
#else
    txfm->cols(transposed[c], result[c]);
#endif
  }

  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, height, width);

  // Accumulate into the destination; result is held column-major.
  for (int r = 0; r < height; ++r) {
    uint8_t *const dst_row = dest + r * stride;
    for (int c = 0; c < width; ++c) {
      const tran_low_t residual = outp[c * outstride + r];
      dst_row[c] = clip_pixel_add(dst_row[c], ROUND_POWER_OF_TWO(residual, 5));
    }
  }
}
// Inverse 8x16 hybrid transform (8 samples wide, 16 tall): transforms the
// 16 input rows of 8 coefficients, then the 8 columns of 16, and adds the
// rounded result into dest with pixel clipping.
//   input      - 128 coefficients, read as 16 consecutive rows of 8.
//   dest       - 8-bit destination block, updated in place.
//   stride     - step between successive rows of dest.
//   txfm_param - supplies tx_type (and the row LGT choice under CONFIG_LGT).
// The scaling applied before, between and after the two passes depends on
// which transform family (AV1, LGT, Daala) is compiled in; see the scaling
// matrix comment below.
void av1_iht8x16_128_add_c(const tran_low_t *input, uint8_t *dest, int stride,
const TxfmParam *txfm_param) {
const TX_TYPE tx_type = txfm_param->tx_type;
#if CONFIG_MRC_TX
assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
#endif // CONFIG_MRC_TX
#if CONFIG_DCT_ONLY
assert(tx_type == DCT_DCT);
#endif
// Indexed by tx_type; .cols is applied to 16-sample columns and .rows to
// 8-sample rows below.
static const transform_2d IHT_8x16[] = {
#if CONFIG_DAALA_TX8 && CONFIG_DAALA_TX16
{ daala_idct16, daala_idct8 }, // DCT_DCT = 0
{ daala_idst16, daala_idct8 }, // ADST_DCT = 1
{ daala_idct16, daala_idst8 }, // DCT_ADST = 2
{ daala_idst16, daala_idst8 }, // ADST_ADST = 3
{ daala_idst16, daala_idct8 }, // FLIPADST_DCT
{ daala_idct16, daala_idst8 }, // DCT_FLIPADST
{ daala_idst16, daala_idst8 }, // FLIPADST_FLIPADST
{ daala_idst16, daala_idst8 }, // ADST_FLIPADST
{ daala_idst16, daala_idst8 }, // FLIPADST_ADST
{ daala_idtx16, daala_idtx8 }, // IDTX
{ daala_idct16, daala_idtx8 }, // V_DCT
{ daala_idtx16, daala_idct8 }, // H_DCT
{ daala_idst16, daala_idtx8 }, // V_ADST
{ daala_idtx16, daala_idst8 }, // H_ADST
{ daala_idst16, daala_idtx8 }, // V_FLIPADST
{ daala_idtx16, daala_idst8 }, // H_FLIPADST
#else
{ aom_idct16_c, aom_idct8_c }, // DCT_DCT
{ aom_iadst16_c, aom_idct8_c }, // ADST_DCT
{ aom_idct16_c, aom_iadst8_c }, // DCT_ADST
{ aom_iadst16_c, aom_iadst8_c }, // ADST_ADST
{ aom_iadst16_c, aom_idct8_c }, // FLIPADST_DCT
{ aom_idct16_c, aom_iadst8_c }, // DCT_FLIPADST
{ aom_iadst16_c, aom_iadst8_c }, // FLIPADST_FLIPADST
{ aom_iadst16_c, aom_iadst8_c }, // ADST_FLIPADST
{ aom_iadst16_c, aom_iadst8_c }, // FLIPADST_ADST
{ iidtx16_c, iidtx8_c }, // IDTX
{ aom_idct16_c, iidtx8_c }, // V_DCT
{ iidtx16_c, aom_idct8_c }, // H_DCT
{ aom_iadst16_c, iidtx8_c }, // V_ADST
{ iidtx16_c, aom_iadst8_c }, // H_ADST
{ aom_iadst16_c, iidtx8_c }, // V_FLIPADST
{ iidtx16_c, aom_iadst8_c }, // H_FLIPADST
#endif
};
// n is the block width (row length), n2 the block height (column length).
const int n = 8;
const int n2 = 16;
int i, j;
// tmp holds the transposed row-pass output (one column per tmp[j]); out
// receives the column-pass output; outtmp is scratch for a single row.
tran_low_t out[8][16], tmp[8][16], outtmp[8];
tran_low_t *outp = &out[0][0];
int outstride = n2;
#if CONFIG_LGT
const tran_high_t *lgtmtx_row[1];
int use_lgt_row = get_lgt8(txfm_param, 0, lgtmtx_row);
#endif
// Multi-way scaling matrix (bits):
// LGT/AV1 row, AV1 col input+0, rowTX+1, mid+.5, colTX+1.5, out-6 == -3
// LGT row, Daala col input+0, rowTX+1, mid+0, colTX+0, out-4 == -3
// Daala row, LGT col N/A (no 16-point LGT)
// Daala row,col input+1, rowTX+0, mid+0, colTX+0, out-4 == -3
// inverse transform row vectors and transpose
for (i = 0; i < n2; ++i) {
#if CONFIG_LGT
if (use_lgt_row) {
// Scaling cases 1 and 2 above
// No input scaling
// Row transform (LGT; scales up 1 bit)
ilgt8(input, outtmp, lgtmtx_row[0]);
// Transpose and mid scaling
for (j = 0; j < n; ++j) {
#if CONFIG_DAALA_TX8 && CONFIG_DAALA_TX16
// Mid scaling case 2
tmp[j][i] = outtmp[j];
#else
// Mid scaling case 1
tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
#endif
}
} else {
#endif
// Non-LGT row path; the braces of the else above are only present when
// CONFIG_LGT is enabled.
#if CONFIG_DAALA_TX8 && CONFIG_DAALA_TX16
tran_low_t temp_in[8];
// Input scaling case 4
for (j = 0; j < n; j++) temp_in[j] = input[j] * 2;
// Row transform (Daala does not scale)
IHT_8x16[tx_type].rows(temp_in, outtmp);
// Transpose (no mid scaling)
for (j = 0; j < n; ++j) tmp[j][i] = outtmp[j];
#else
// Case 1; no input scaling
// Row transform (AV1 scales up 1 bit)
IHT_8x16[tx_type].rows(input, outtmp);
// Transpose and mid scaling up .5 bits
for (j = 0; j < n; ++j)
tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
#endif
#if CONFIG_LGT
}
#endif
// Advance to the next row of n input coefficients.
input += n;
}
// inverse transform column vectors
// AV1 column TX scales up by 1.5 bit, Daala does not scale
for (i = 0; i < n; ++i) {
IHT_8x16[tx_type].cols(tmp[i], out[i]);
}
maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n2, n);
// Sum with the destination
for (i = 0; i < n2; ++i) {
for (j = 0; j < n; ++j) {
// d indexes dest row-major; s indexes the column-major result buffer.
int d = i * stride + j;
int s = j * outstride + i;
#if CONFIG_DAALA_TX8 && CONFIG_DAALA_TX16
// Output scaling cases 2 and 4
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
#else
// Output scaling case 1
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
#endif
}
}
}
// Inverse 16x8 hybrid transform (16 samples wide, 8 tall): transforms the
// 8 input rows of 16 coefficients, then the 16 columns of 8, and adds the
// rounded result into dest with pixel clipping.
//   input      - 128 coefficients, read as 8 consecutive rows of 16.
//   dest       - 8-bit destination block, updated in place.
//   stride     - step between successive rows of dest.
//   txfm_param - supplies tx_type (and the column LGT choice under
//                CONFIG_LGT).
// The scaling applied before, between and after the two passes depends on
// which transform family (AV1, LGT, Daala) is compiled in; see the scaling
// matrix comment below.
void av1_iht16x8_128_add_c(const tran_low_t *input, uint8_t *dest, int stride,
const TxfmParam *txfm_param) {
const TX_TYPE tx_type = txfm_param->tx_type;
#if CONFIG_MRC_TX
assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
#endif // CONFIG_MRC_TX
#if CONFIG_DCT_ONLY
assert(tx_type == DCT_DCT);
#endif
// Indexed by tx_type; .cols is applied to 8-sample columns and .rows to
// 16-sample rows below.
static const transform_2d IHT_16x8[] = {
#if CONFIG_DAALA_TX8 && CONFIG_DAALA_TX16
{ daala_idct8, daala_idct16 }, // DCT_DCT = 0
{ daala_idst8, daala_idct16 }, // ADST_DCT = 1
{ daala_idct8, daala_idst16 }, // DCT_ADST = 2
{ daala_idst8, daala_idst16 }, // ADST_ADST = 3
{ daala_idst8, daala_idct16 }, // FLIPADST_DCT
{ daala_idct8, daala_idst16 }, // DCT_FLIPADST
{ daala_idst8, daala_idst16 }, // FLIPADST_FLIPADST
{ daala_idst8, daala_idst16 }, // ADST_FLIPADST
{ daala_idst8, daala_idst16 }, // FLIPADST_ADST
{ daala_idtx8, daala_idtx16 }, // IDTX
{ daala_idct8, daala_idtx16 }, // V_DCT
{ daala_idtx8, daala_idct16 }, // H_DCT
{ daala_idst8, daala_idtx16 }, // V_ADST
{ daala_idtx8, daala_idst16 }, // H_ADST
{ daala_idst8, daala_idtx16 }, // V_FLIPADST
{ daala_idtx8, daala_idst16 }, // H_FLIPADST
#else
{ aom_idct8_c, aom_idct16_c }, // DCT_DCT
{ aom_iadst8_c, aom_idct16_c }, // ADST_DCT
{ aom_idct8_c, aom_iadst16_c }, // DCT_ADST
{ aom_iadst8_c, aom_iadst16_c }, // ADST_ADST
{ aom_iadst8_c, aom_idct16_c }, // FLIPADST_DCT
{ aom_idct8_c, aom_iadst16_c }, // DCT_FLIPADST
{ aom_iadst8_c, aom_iadst16_c }, // FLIPADST_FLIPADST
{ aom_iadst8_c, aom_iadst16_c }, // ADST_FLIPADST
{ aom_iadst8_c, aom_iadst16_c }, // FLIPADST_ADST
{ iidtx8_c, iidtx16_c }, // IDTX
{ aom_idct8_c, iidtx16_c }, // V_DCT
{ iidtx8_c, aom_idct16_c }, // H_DCT
{ aom_iadst8_c, iidtx16_c }, // V_ADST
{ iidtx8_c, aom_iadst16_c }, // H_ADST
{ aom_iadst8_c, iidtx16_c }, // V_FLIPADST
{ iidtx8_c, aom_iadst16_c }, // H_FLIPADST
#endif
};
// n is the block height (column length), n2 the block width (row length).
const int n = 8;
const int n2 = 16;
int i, j;
// tmp holds the transposed row-pass output (one column per tmp[j]); out
// receives the column-pass output; outtmp is scratch for a single row.
tran_low_t out[16][8], tmp[16][8], outtmp[16];
tran_low_t *outp = &out[0][0];
int outstride = n;
#if CONFIG_LGT
const tran_high_t *lgtmtx_col[1];
int use_lgt_col = get_lgt8(txfm_param, 1, lgtmtx_col);
#endif
// Multi-way scaling matrix (bits):
// AV1 row, LGT/AV1 col input+0, rowTX+1.5, mid+.5, colTX+1, out-6 == -3
// LGT row, Daala col N/A (no 16-point LGT)
// Daala row, LGT col input+1, rowTX+0, mid+1, colTX+1, out-6 == -3
// Daala row, col input+1, rowTX+0, mid+0, colTX+0, out-4 == -3
// inverse transform row vectors and transpose
for (i = 0; i < n; ++i) {
#if CONFIG_DAALA_TX8 && CONFIG_DAALA_TX16
tran_low_t temp_in[16];
// Input scaling cases 3 and 4
for (j = 0; j < n2; j++) temp_in[j] = input[j] * 2;
// Daala row TX, no scaling
IHT_16x8[tx_type].rows(temp_in, outtmp);
// Transpose and mid scaling
#if CONFIG_LGT
if (use_lgt_col)
// Case 3
for (j = 0; j < n2; ++j) tmp[j][i] = outtmp[j] * 2;
else
#endif
// Case 4
for (j = 0; j < n2; ++j) tmp[j][i] = outtmp[j];
#else
// Case 1
// No input scaling
// Row transform, AV1 scales up by 1.5 bits
IHT_16x8[tx_type].rows(input, outtmp);
// Transpose and mid scaling up .5 bits
for (j = 0; j < n2; ++j)
tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
#endif
// Advance to the next row of n2 input coefficients.
input += n2;
}
// inverse transform column vectors
// AV1/LGT scales up by 1 bit, Daala does not scale
for (i = 0; i < n2; ++i) {
#if CONFIG_LGT
if (use_lgt_col)
ilgt8(tmp[i], out[i], lgtmtx_col[0]);
else
#endif
IHT_16x8[tx_type].cols(tmp[i], out[i]);
}
maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n, n2);
// Sum with the destination
for (i = 0; i < n; ++i) {
for (j = 0; j < n2; ++j) {
// d indexes dest row-major; s indexes the column-major result buffer.
int d = i * stride + j;
int s = j * outstride + i;
// Output scaling
#if CONFIG_DAALA_TX8 && CONFIG_DAALA_TX16
#if CONFIG_LGT
if (use_lgt_col)
// case 3
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
else
#endif
// case 4
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
#else
// case 1
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
#endif
}
}
}
// Inverse 8x32 hybrid transform (8 samples wide, 32 tall). Runs the row
// transform on every 8-sample input row, the column transform on every
// 32-sample column, then adds the rounded result into dest with clipping.
//   input      - 256 coefficients, read as 32 consecutive rows of 8.
//   dest       - 8-bit destination block, updated in place.
//   stride     - step between successive rows of dest.
//   txfm_param - supplies tx_type (and the LGT choice under CONFIG_LGT).
void av1_iht8x32_256_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                           const TxfmParam *txfm_param) {
  const TX_TYPE tx_type = txfm_param->tx_type;
#if CONFIG_MRC_TX
  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
#endif  // CONFIG_MRC_TX
#if CONFIG_DCT_ONLY
  assert(tx_type == DCT_DCT);
#endif
  // Indexed by tx_type; .cols is run on 32-sample columns and .rows on
  // 8-sample rows below.
  static const transform_2d IHT_8x32[] = {
    { aom_idct32_c, aom_idct8_c },     // DCT_DCT
    { ihalfright32_c, aom_idct8_c },   // ADST_DCT
    { aom_idct32_c, aom_iadst8_c },    // DCT_ADST
    { ihalfright32_c, aom_iadst8_c },  // ADST_ADST
    { ihalfright32_c, aom_idct8_c },   // FLIPADST_DCT
    { aom_idct32_c, aom_iadst8_c },    // DCT_FLIPADST
    { ihalfright32_c, aom_iadst8_c },  // FLIPADST_FLIPADST
    { ihalfright32_c, aom_iadst8_c },  // ADST_FLIPADST
    { ihalfright32_c, aom_iadst8_c },  // FLIPADST_ADST
    { iidtx32_c, iidtx8_c },           // IDTX
    { aom_idct32_c, iidtx8_c },        // V_DCT
    { iidtx32_c, aom_idct8_c },        // H_DCT
    { ihalfright32_c, iidtx8_c },      // V_ADST
    { iidtx32_c, aom_iadst8_c },       // H_ADST
    { ihalfright32_c, iidtx8_c },      // V_FLIPADST
    { iidtx32_c, aom_iadst8_c },       // H_FLIPADST
  };
  const transform_2d *const txfm = &IHT_8x32[tx_type];
  const int width = 8;
  const int height = 32;
  tran_low_t row_out[8];
  tran_low_t transposed[8][32];
  tran_low_t result[8][32];
  tran_low_t *outp = &result[0][0];
  int outstride = height;
#if CONFIG_LGT
  const tran_high_t *lgtmtx_row[1];
  const int use_lgt_row = get_lgt8(txfm_param, 0, lgtmtx_row);
#endif

  // Row pass: transform each input row and store it transposed.
  for (int r = 0; r < height; ++r) {
    const tran_low_t *const coeffs = input + r * width;
#if CONFIG_LGT
    if (use_lgt_row) {
      ilgt8(coeffs, row_out, lgtmtx_row[0]);
    } else {
      txfm->rows(coeffs, row_out);
    }
#else
    txfm->rows(coeffs, row_out);
#endif
    for (int c = 0; c < width; ++c) transposed[c][r] = row_out[c];
  }

  // Column pass.
  for (int c = 0; c < width; ++c) txfm->cols(transposed[c], result[c]);

  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, height, width);

  // Accumulate into the destination; result is held column-major.
  for (int r = 0; r < height; ++r) {
    uint8_t *const dst_row = dest + r * stride;
    for (int c = 0; c < width; ++c) {
      const tran_low_t residual = outp[c * outstride + r];
      dst_row[c] = clip_pixel_add(dst_row[c], ROUND_POWER_OF_TWO(residual, 6));
    }
  }
}
// Inverse 32x8 hybrid transform (32 samples wide, 8 tall). Runs the row
// transform on every 32-sample input row, the column transform on every
// 8-sample column, then adds the rounded result into dest with clipping.
//   input      - 256 coefficients, read as 8 consecutive rows of 32.
//   dest       - 8-bit destination block, updated in place.
//   stride     - step between successive rows of dest.
//   txfm_param - supplies tx_type (and the LGT choice under CONFIG_LGT).
void av1_iht32x8_256_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                           const TxfmParam *txfm_param) {
  const TX_TYPE tx_type = txfm_param->tx_type;
#if CONFIG_MRC_TX
  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
#endif  // CONFIG_MRC_TX
#if CONFIG_DCT_ONLY
  assert(tx_type == DCT_DCT);
#endif
  // Indexed by tx_type; .cols is run on 8-sample columns and .rows on
  // 32-sample rows below.
  static const transform_2d IHT_32x8[] = {
    { aom_idct8_c, aom_idct32_c },     // DCT_DCT
    { aom_iadst8_c, aom_idct32_c },    // ADST_DCT
    { aom_idct8_c, ihalfright32_c },   // DCT_ADST
    { aom_iadst8_c, ihalfright32_c },  // ADST_ADST
    { aom_iadst8_c, aom_idct32_c },    // FLIPADST_DCT
    { aom_idct8_c, ihalfright32_c },   // DCT_FLIPADST
    { aom_iadst8_c, ihalfright32_c },  // FLIPADST_FLIPADST
    { aom_iadst8_c, ihalfright32_c },  // ADST_FLIPADST
    { aom_iadst8_c, ihalfright32_c },  // FLIPADST_ADST
    { iidtx8_c, iidtx32_c },           // IDTX
    { aom_idct8_c, iidtx32_c },        // V_DCT
    { iidtx8_c, aom_idct32_c },        // H_DCT
    { aom_iadst8_c, iidtx32_c },       // V_ADST
    { iidtx8_c, ihalfright32_c },      // H_ADST
    { aom_iadst8_c, iidtx32_c },       // V_FLIPADST
    { iidtx8_c, ihalfright32_c },      // H_FLIPADST
  };
  const int n = 8;    // block height (column length)
  const int n4 = 32;  // block width (row length)
  int i, j;
  tran_low_t out[32][8], tmp[32][8], outtmp[32];
  tran_low_t *outp = &out[0][0];
  int outstride = n;
#if CONFIG_LGT
  const tran_high_t *lgtmtx_col[1];
  // Columns are 8 samples long and are transformed with ilgt8() below, so
  // the matrix must come from the 8-point selector. (This previously called
  // get_lgt4(), pairing a 4-point selection with the 8-point transform.)
  int use_lgt_col = get_lgt8(txfm_param, 1, lgtmtx_col);
#endif

  // Inverse transform row vectors and transpose.
  for (i = 0; i < n; ++i) {
    IHT_32x8[tx_type].rows(input, outtmp);
    for (j = 0; j < n4; ++j) tmp[j][i] = outtmp[j];
    input += n4;
  }

  // Inverse transform column vectors, optionally through the 8-point LGT.
  for (i = 0; i < n4; ++i) {
#if CONFIG_LGT
    if (use_lgt_col)
      ilgt8(tmp[i], out[i], lgtmtx_col[0]);
    else
#endif
      IHT_32x8[tx_type].cols(tmp[i], out[i]);
  }

  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n, n4);

  // Sum with the destination; out is held column-major.
  for (i = 0; i < n; ++i) {
    for (j = 0; j < n4; ++j) {
      int d = i * stride + j;
      int s = j * outstride + i;
      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
    }
  }
}
// Inverse 16x32 hybrid transform (16 samples wide, 32 tall). Runs the row
// transform on every 16-sample input row, rescales while transposing, runs
// the column transform on every 32-sample column, then adds the rounded
// result into dest with clipping.
//   input      - 512 coefficients, read as 32 consecutive rows of 16.
//   dest       - 8-bit destination block, updated in place.
//   stride     - step between successive rows of dest.
//   txfm_param - supplies tx_type.
void av1_iht16x32_512_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                            const TxfmParam *txfm_param) {
  const TX_TYPE tx_type = txfm_param->tx_type;
#if CONFIG_MRC_TX
  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
#endif  // CONFIG_MRC_TX
#if CONFIG_DCT_ONLY
  assert(tx_type == DCT_DCT);
#endif
  // Indexed by tx_type; .cols is run on 32-sample columns and .rows on
  // 16-sample rows below.
  static const transform_2d IHT_16x32[] = {
#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
    { daala_idct32, daala_idct16 },  // DCT_DCT = 0
    { daala_idst32, daala_idct16 },  // ADST_DCT = 1
    { daala_idct32, daala_idst16 },  // DCT_ADST = 2
    { daala_idst32, daala_idst16 },  // ADST_ADST = 3
    { daala_idst32, daala_idct16 },  // FLIPADST_DCT
    { daala_idct32, daala_idst16 },  // DCT_FLIPADST
    { daala_idst32, daala_idst16 },  // FLIPADST_FLIPADST
    { daala_idst32, daala_idst16 },  // ADST_FLIPADST
    { daala_idst32, daala_idst16 },  // FLIPADST_ADST
    { daala_idtx32, daala_idtx16 },  // IDTX
    { daala_idct32, daala_idtx16 },  // V_DCT
    { daala_idtx32, daala_idct16 },  // H_DCT
    { daala_idst32, daala_idtx16 },  // V_ADST
    { daala_idtx32, daala_idst16 },  // H_ADST
    { daala_idst32, daala_idtx16 },  // V_FLIPADST
    { daala_idtx32, daala_idst16 },  // H_FLIPADST
#else
    { aom_idct32_c, aom_idct16_c },     // DCT_DCT
    { ihalfright32_c, aom_idct16_c },   // ADST_DCT
    { aom_idct32_c, aom_iadst16_c },    // DCT_ADST
    { ihalfright32_c, aom_iadst16_c },  // ADST_ADST
    { ihalfright32_c, aom_idct16_c },   // FLIPADST_DCT
    { aom_idct32_c, aom_iadst16_c },    // DCT_FLIPADST
    { ihalfright32_c, aom_iadst16_c },  // FLIPADST_FLIPADST
    { ihalfright32_c, aom_iadst16_c },  // ADST_FLIPADST
    { ihalfright32_c, aom_iadst16_c },  // FLIPADST_ADST
    { iidtx32_c, iidtx16_c },           // IDTX
    { aom_idct32_c, iidtx16_c },        // V_DCT
    { iidtx32_c, aom_idct16_c },        // H_DCT
    { ihalfright32_c, iidtx16_c },      // V_ADST
    { iidtx32_c, aom_iadst16_c },       // H_ADST
    { ihalfright32_c, iidtx16_c },      // V_FLIPADST
    { iidtx32_c, aom_iadst16_c },       // H_FLIPADST
#endif
  };
  const transform_2d *const txfm = &IHT_16x32[tx_type];
  const int width = 16;
  const int height = 32;
  tran_low_t row_out[16];
  tran_low_t transposed[16][32];
  tran_low_t result[16][32];
  tran_low_t *outp = &result[0][0];
  int outstride = height;

  // Row pass: transform each input row, rescale, and store it transposed.
  for (int r = 0; r < height; ++r) {
    const tran_low_t *const coeffs = input + r * width;
#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
    // Daala build: double the input, then multiply the row output by 4.
    tran_low_t scaled_in[16];
    for (int c = 0; c < width; ++c) scaled_in[c] = coeffs[c] * 2;
    txfm->rows(scaled_in, row_out);
    for (int c = 0; c < width; ++c) transposed[c][r] = row_out[c] * 4;
#else
    // Default build: multiply the row output by Sqrt2 with rounding.
    txfm->rows(coeffs, row_out);
    for (int c = 0; c < width; ++c) {
      transposed[c][r] = (tran_low_t)dct_const_round_shift(row_out[c] * Sqrt2);
    }
#endif
  }

  // Column pass.
  for (int c = 0; c < width; ++c) txfm->cols(transposed[c], result[c]);

  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, height, width);

  // Accumulate into the destination; result is held column-major.
  for (int r = 0; r < height; ++r) {
    uint8_t *const dst_row = dest + r * stride;
    for (int c = 0; c < width; ++c) {
      const tran_low_t residual = outp[c * outstride + r];
#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
      dst_row[c] = clip_pixel_add(dst_row[c], ROUND_POWER_OF_TWO(residual, 5));
#else
      dst_row[c] = clip_pixel_add(dst_row[c], ROUND_POWER_OF_TWO(residual, 6));
#endif
    }
  }
}
// Inverse 32x16 hybrid transform (32 samples wide, 16 tall). Runs the row
// transform on every 32-sample input row, rescales while transposing, runs
// the column transform on every 16-sample column, then adds the rounded
// result into dest with clipping.
//   input      - 512 coefficients, read as 16 consecutive rows of 32.
//   dest       - 8-bit destination block, updated in place.
//   stride     - step between successive rows of dest.
//   txfm_param - supplies tx_type.
void av1_iht32x16_512_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                            const TxfmParam *txfm_param) {
  const TX_TYPE tx_type = txfm_param->tx_type;
#if CONFIG_MRC_TX
  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
#endif  // CONFIG_MRC_TX
#if CONFIG_DCT_ONLY
  assert(tx_type == DCT_DCT);
#endif
  // Indexed by tx_type; .cols is run on 16-sample columns and .rows on
  // 32-sample rows below.
  static const transform_2d IHT_32x16[] = {
#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
    { daala_idct16, daala_idct32 },  // DCT_DCT = 0
    { daala_idst16, daala_idct32 },  // ADST_DCT = 1
    { daala_idct16, daala_idst32 },  // DCT_ADST = 2
    { daala_idst16, daala_idst32 },  // ADST_ADST = 3
    { daala_idst16, daala_idct32 },  // FLIPADST_DCT
    { daala_idct16, daala_idst32 },  // DCT_FLIPADST
    { daala_idst16, daala_idst32 },  // FLIPADST_FLIPADST
    { daala_idst16, daala_idst32 },  // ADST_FLIPADST
    { daala_idst16, daala_idst32 },  // FLIPADST_ADST
    { daala_idtx16, daala_idtx32 },  // IDTX
    { daala_idct16, daala_idtx32 },  // V_DCT
    { daala_idtx16, daala_idct32 },  // H_DCT
    { daala_idst16, daala_idtx32 },  // V_ADST
    { daala_idtx16, daala_idst32 },  // H_ADST
    { daala_idst16, daala_idtx32 },  // V_FLIPADST
    { daala_idtx16, daala_idst32 },  // H_FLIPADST
#else
    { aom_idct16_c, aom_idct32_c },     // DCT_DCT
    { aom_iadst16_c, aom_idct32_c },    // ADST_DCT
    { aom_idct16_c, ihalfright32_c },   // DCT_ADST
    { aom_iadst16_c, ihalfright32_c },  // ADST_ADST
    { aom_iadst16_c, aom_idct32_c },    // FLIPADST_DCT
    { aom_idct16_c, ihalfright32_c },   // DCT_FLIPADST
    { aom_iadst16_c, ihalfright32_c },  // FLIPADST_FLIPADST
    { aom_iadst16_c, ihalfright32_c },  // ADST_FLIPADST
    { aom_iadst16_c, ihalfright32_c },  // FLIPADST_ADST
    { iidtx16_c, iidtx32_c },           // IDTX
    { aom_idct16_c, iidtx32_c },        // V_DCT
    { iidtx16_c, aom_idct32_c },        // H_DCT
    { aom_iadst16_c, iidtx32_c },       // V_ADST
    { iidtx16_c, ihalfright32_c },      // H_ADST
    { aom_iadst16_c, iidtx32_c },       // V_FLIPADST
    { iidtx16_c, ihalfright32_c },      // H_FLIPADST
#endif
  };
  const transform_2d *const txfm = &IHT_32x16[tx_type];
  const int height = 16;
  const int width = 32;
  tran_low_t row_out[32];
  tran_low_t transposed[32][16];
  tran_low_t result[32][16];
  tran_low_t *outp = &result[0][0];
  int outstride = height;

  // Row pass: transform each input row, rescale, and store it transposed.
  for (int r = 0; r < height; ++r) {
    const tran_low_t *const coeffs = input + r * width;
#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
    // Daala build: double the input, then multiply the row output by 4.
    tran_low_t scaled_in[32];
    for (int c = 0; c < width; ++c) scaled_in[c] = coeffs[c] * 2;
    txfm->rows(scaled_in, row_out);
    for (int c = 0; c < width; ++c) transposed[c][r] = row_out[c] * 4;
#else
    // Default build: multiply the row output by Sqrt2 with rounding.
    txfm->rows(coeffs, row_out);
    for (int c = 0; c < width; ++c) {
      transposed[c][r] = (tran_low_t)dct_const_round_shift(row_out[c] * Sqrt2);
    }
#endif
  }

  // Column pass.
  for (int c = 0; c < width; ++c) txfm->cols(transposed[c], result[c]);

  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, height, width);

  // Accumulate into the destination; result is held column-major.
  for (int r = 0; r < height; ++r) {
    uint8_t *const dst_row = dest + r * stride;
    for (int c = 0; c < width; ++c) {
      const tran_low_t residual = outp[c * outstride + r];
#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
      dst_row[c] = clip_pixel_add(dst_row[c], ROUND_POWER_OF_TWO(residual, 5));
#else
      dst_row[c] = clip_pixel_add(dst_row[c], ROUND_POWER_OF_TWO(residual, 6));
#endif
    }
  }
}
// Inverse 8x8 hybrid transform. Runs the row transform on each of the 8
// input rows, transposes, runs the column transform on each of the 8
// columns, then adds the rounded result into dest with clipping.
//   input      - 64 coefficients, read as 8 consecutive rows of 8.
//   dest       - 8-bit destination block, updated in place.
//   stride     - step between successive rows of dest.
//   txfm_param - supplies tx_type (and the LGT choices under CONFIG_LGT).
void av1_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                         const TxfmParam *txfm_param) {
  const TX_TYPE tx_type = txfm_param->tx_type;
#if CONFIG_MRC_TX
  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
#endif  // CONFIG_MRC_TX
#if CONFIG_DCT_ONLY
  assert(tx_type == DCT_DCT);
#endif
  // Indexed by tx_type; both members are 8-point transforms.
  static const transform_2d IHT_8[] = {
#if CONFIG_DAALA_TX8
    { daala_idct8, daala_idct8 },  // DCT_DCT = 0
    { daala_idst8, daala_idct8 },  // ADST_DCT = 1
    { daala_idct8, daala_idst8 },  // DCT_ADST = 2
    { daala_idst8, daala_idst8 },  // ADST_ADST = 3
    { daala_idst8, daala_idct8 },  // FLIPADST_DCT
    { daala_idct8, daala_idst8 },  // DCT_FLIPADST
    { daala_idst8, daala_idst8 },  // FLIPADST_FLIPADST
    { daala_idst8, daala_idst8 },  // ADST_FLIPADST
    { daala_idst8, daala_idst8 },  // FLIPADST_ADST
    { daala_idtx8, daala_idtx8 },  // IDTX
    { daala_idct8, daala_idtx8 },  // V_DCT
    { daala_idtx8, daala_idct8 },  // H_DCT
    { daala_idst8, daala_idtx8 },  // V_ADST
    { daala_idtx8, daala_idst8 },  // H_ADST
    { daala_idst8, daala_idtx8 },  // V_FLIPADST
    { daala_idtx8, daala_idst8 },  // H_FLIPADST
#else
    { aom_idct8_c, aom_idct8_c },    // DCT_DCT = 0
    { aom_iadst8_c, aom_idct8_c },   // ADST_DCT = 1
    { aom_idct8_c, aom_iadst8_c },   // DCT_ADST = 2
    { aom_iadst8_c, aom_iadst8_c },  // ADST_ADST = 3
    { aom_iadst8_c, aom_idct8_c },   // FLIPADST_DCT
    { aom_idct8_c, aom_iadst8_c },   // DCT_FLIPADST
    { aom_iadst8_c, aom_iadst8_c },  // FLIPADST_FLIPADST
    { aom_iadst8_c, aom_iadst8_c },  // ADST_FLIPADST
    { aom_iadst8_c, aom_iadst8_c },  // FLIPADST_ADST
    { iidtx8_c, iidtx8_c },          // IDTX
    { aom_idct8_c, iidtx8_c },       // V_DCT
    { iidtx8_c, aom_idct8_c },       // H_DCT
    { aom_iadst8_c, iidtx8_c },      // V_ADST
    { iidtx8_c, aom_iadst8_c },      // H_ADST
    { aom_iadst8_c, iidtx8_c },      // V_FLIPADST
    { iidtx8_c, aom_iadst8_c },      // H_FLIPADST
#endif
  };
  const transform_2d *const txfm = &IHT_8[tx_type];
  tran_low_t transposed[8][8];
  tran_low_t result[8][8];
  tran_low_t *outp = &result[0][0];
  int outstride = 8;
#if CONFIG_LGT
  const tran_high_t *lgtmtx_col[1];
  const tran_high_t *lgtmtx_row[1];
  int use_lgt_col = get_lgt8(txfm_param, 1, lgtmtx_col);
  int use_lgt_row = get_lgt8(txfm_param, 0, lgtmtx_row);
#endif

  // Row pass, written straight into result[].
  for (int r = 0; r < 8; ++r) {
    const tran_low_t *const coeffs = input + 8 * r;
#if CONFIG_DAALA_TX8
    // Daala build: the input is doubled before the row transform.
    tran_low_t doubled[8];
    for (int c = 0; c < 8; ++c) doubled[c] = coeffs[c] * 2;
    txfm->rows(doubled, result[r]);
#elif CONFIG_LGT
    if (use_lgt_row) {
      ilgt8(coeffs, result[r], lgtmtx_row[0]);
    } else {
      txfm->rows(coeffs, result[r]);
    }
#else
    txfm->rows(coeffs, result[r]);
#endif
  }

  // Transpose so that each column becomes a contiguous vector.
  for (int c = 0; c < 8; ++c) {
    for (int r = 0; r < 8; ++r) transposed[c][r] = result[r][c];
  }

  // Column pass, optionally through the 8-point LGT.
  for (int c = 0; c < 8; ++c) {
#if CONFIG_LGT
    if (use_lgt_col) {
      ilgt8(transposed[c], result[c], lgtmtx_col[0]);
    } else {
      txfm->cols(transposed[c], result[c]);
    }
#else
    txfm->cols(transposed[c], result[c]);
#endif
  }

  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 8, 8);

  // Accumulate into the destination; result is held column-major.
  for (int r = 0; r < 8; ++r) {
    uint8_t *const dst_row = dest + r * stride;
    for (int c = 0; c < 8; ++c) {
      const tran_low_t residual = outp[c * outstride + r];
#if CONFIG_DAALA_TX8
      dst_row[c] = clip_pixel_add(dst_row[c], ROUND_POWER_OF_TWO(residual, 4));
#else
      dst_row[c] = clip_pixel_add(dst_row[c], ROUND_POWER_OF_TWO(residual, 5));
#endif
    }
  }
}
// Inverse 16x16 hybrid transform. Runs the row transform on each of the 16
// input rows, transposes, runs the column transform on each of the 16
// columns, then adds the rounded result into dest with clipping.
//   input      - 256 coefficients, read as 16 consecutive rows of 16.
//   dest       - 8-bit destination block, updated in place.
//   stride     - step between successive rows of dest.
//   txfm_param - supplies tx_type.
void av1_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                            const TxfmParam *txfm_param) {
  const TX_TYPE tx_type = txfm_param->tx_type;
#if CONFIG_MRC_TX
  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
#endif  // CONFIG_MRC_TX
#if CONFIG_DCT_ONLY
  assert(tx_type == DCT_DCT);
#endif
  // Indexed by tx_type; both members are 16-point transforms.
  static const transform_2d IHT_16[] = {
#if CONFIG_DAALA_TX16
    { daala_idct16, daala_idct16 },  // DCT_DCT = 0
    { daala_idst16, daala_idct16 },  // ADST_DCT = 1
    { daala_idct16, daala_idst16 },  // DCT_ADST = 2
    { daala_idst16, daala_idst16 },  // ADST_ADST = 3
    { daala_idst16, daala_idct16 },  // FLIPADST_DCT
    { daala_idct16, daala_idst16 },  // DCT_FLIPADST
    { daala_idst16, daala_idst16 },  // FLIPADST_FLIPADST
    { daala_idst16, daala_idst16 },  // ADST_FLIPADST
    { daala_idst16, daala_idst16 },  // FLIPADST_ADST
    { daala_idtx16, daala_idtx16 },  // IDTX
    { daala_idct16, daala_idtx16 },  // V_DCT
    { daala_idtx16, daala_idct16 },  // H_DCT
    { daala_idst16, daala_idtx16 },  // V_ADST
    { daala_idtx16, daala_idst16 },  // H_ADST
    { daala_idst16, daala_idtx16 },  // V_FLIPADST
    { daala_idtx16, daala_idst16 },  // H_FLIPADST
#else
    { aom_idct16_c, aom_idct16_c },    // DCT_DCT = 0
    { aom_iadst16_c, aom_idct16_c },   // ADST_DCT = 1
    { aom_idct16_c, aom_iadst16_c },   // DCT_ADST = 2
    { aom_iadst16_c, aom_iadst16_c },  // ADST_ADST = 3
    { aom_iadst16_c, aom_idct16_c },   // FLIPADST_DCT
    { aom_idct16_c, aom_iadst16_c },   // DCT_FLIPADST
    { aom_iadst16_c, aom_iadst16_c },  // FLIPADST_FLIPADST
    { aom_iadst16_c, aom_iadst16_c },  // ADST_FLIPADST
    { aom_iadst16_c, aom_iadst16_c },  // FLIPADST_ADST
    { iidtx16_c, iidtx16_c },          // IDTX
    { aom_idct16_c, iidtx16_c },       // V_DCT
    { iidtx16_c, aom_idct16_c },       // H_DCT
    { aom_iadst16_c, iidtx16_c },      // V_ADST
    { iidtx16_c, aom_iadst16_c },      // H_ADST
    { aom_iadst16_c, iidtx16_c },      // V_FLIPADST
    { iidtx16_c, aom_iadst16_c },      // H_FLIPADST
#endif
  };
  const transform_2d *const txfm = &IHT_16[tx_type];
  tran_low_t transposed[16][16];
  tran_low_t result[16][16];
  tran_low_t *outp = &result[0][0];
  int outstride = 16;

  // Row pass, written straight into result[].
  for (int r = 0; r < 16; ++r) {
    const tran_low_t *const coeffs = input + 16 * r;
#if CONFIG_DAALA_TX16
    // Daala build: the input is doubled before the row transform.
    tran_low_t doubled[16];
    for (int c = 0; c < 16; ++c) doubled[c] = coeffs[c] * 2;
    txfm->rows(doubled, result[r]);
#else
    txfm->rows(coeffs, result[r]);
#endif
  }

  // Transpose so that each column becomes a contiguous vector.
  for (int c = 0; c < 16; ++c) {
    for (int r = 0; r < 16; ++r) transposed[c][r] = result[r][c];
  }

  // Column pass.
  for (int c = 0; c < 16; ++c) txfm->cols(transposed[c], result[c]);

  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 16, 16);

  // Accumulate into the destination; result is held column-major.
  for (int r = 0; r < 16; ++r) {
    uint8_t *const dst_row = dest + r * stride;
    for (int c = 0; c < 16; ++c) {
      const tran_low_t residual = outp[c * outstride + r];
#if CONFIG_DAALA_TX16
      dst_row[c] = clip_pixel_add(dst_row[c], ROUND_POWER_OF_TWO(residual, 4));
#else
      dst_row[c] = clip_pixel_add(dst_row[c], ROUND_POWER_OF_TWO(residual, 6));
#endif
    }
  }
}
// Inverse 32x32 hybrid transform. Runs the row transform on each of the 32
// input rows, transposes (with an extra x4 scale in the Daala build), runs
// the column transform on each of the 32 columns, then adds the rounded
// result into dest with clipping.
//   input      - 1024 coefficients, read as 32 consecutive rows of 32.
//   dest       - 8-bit destination block, updated in place.
//   stride     - step between successive rows of dest.
//   txfm_param - supplies tx_type.
void av1_iht32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                             const TxfmParam *txfm_param) {
  const TX_TYPE tx_type = txfm_param->tx_type;
#if CONFIG_DCT_ONLY
  assert(tx_type == DCT_DCT);
#endif
  // Indexed by tx_type; both members are 32-point transforms.
  static const transform_2d IHT_32[] = {
#if CONFIG_DAALA_TX32
    { daala_idct32, daala_idct32 },  // DCT_DCT
    { daala_idst32, daala_idct32 },  // ADST_DCT
    { daala_idct32, daala_idst32 },  // DCT_ADST
    { daala_idst32, daala_idst32 },  // ADST_ADST
    { daala_idst32, daala_idct32 },  // FLIPADST_DCT
    { daala_idct32, daala_idst32 },  // DCT_FLIPADST
    { daala_idst32, daala_idst32 },  // FLIPADST_FLIPADST
    { daala_idst32, daala_idst32 },  // ADST_FLIPADST
    { daala_idst32, daala_idst32 },  // FLIPADST_ADST
    { daala_idtx32, daala_idtx32 },  // IDTX
    { daala_idct32, daala_idtx32 },  // V_DCT
    { daala_idtx32, daala_idct32 },  // H_DCT
    { daala_idst32, daala_idtx32 },  // V_ADST
    { daala_idtx32, daala_idst32 },  // H_ADST
    { daala_idst32, daala_idtx32 },  // V_FLIPADST
    { daala_idtx32, daala_idst32 },  // H_FLIPADST
#else
    { aom_idct32_c, aom_idct32_c },      // DCT_DCT
    { ihalfright32_c, aom_idct32_c },    // ADST_DCT
    { aom_idct32_c, ihalfright32_c },    // DCT_ADST
    { ihalfright32_c, ihalfright32_c },  // ADST_ADST
    { ihalfright32_c, aom_idct32_c },    // FLIPADST_DCT
    { aom_idct32_c, ihalfright32_c },    // DCT_FLIPADST
    { ihalfright32_c, ihalfright32_c },  // FLIPADST_FLIPADST
    { ihalfright32_c, ihalfright32_c },  // ADST_FLIPADST
    { ihalfright32_c, ihalfright32_c },  // FLIPADST_ADST
    { iidtx32_c, iidtx32_c },            // IDTX
    { aom_idct32_c, iidtx32_c },         // V_DCT
    { iidtx32_c, aom_idct32_c },         // H_DCT
    { ihalfright32_c, iidtx32_c },       // V_ADST
    { iidtx32_c, ihalfright32_c },       // H_ADST
    { ihalfright32_c, iidtx32_c },       // V_FLIPADST
    { iidtx32_c, ihalfright32_c },       // H_FLIPADST
#endif
  };
  const transform_2d *const txfm = &IHT_32[tx_type];
  tran_low_t transposed[32][32];
  tran_low_t result[32][32];
  tran_low_t *outp = &result[0][0];
  int outstride = 32;

  // Row pass, written straight into result[].
  for (int r = 0; r < 32; ++r) {
    const tran_low_t *const coeffs = input + 32 * r;
#if CONFIG_DAALA_TX32
    // Daala build: the input is doubled before the row transform.
    tran_low_t doubled[32];
    for (int c = 0; c < 32; ++c) doubled[c] = coeffs[c] * 2;
    txfm->rows(doubled, result[r]);
#else
    txfm->rows(coeffs, result[r]);
#endif
  }

  // Transpose so that each column becomes a contiguous vector.
  for (int c = 0; c < 32; ++c) {
    for (int r = 0; r < 32; ++r) {
#if CONFIG_DAALA_TX32
      transposed[c][r] = result[r][c] * 4;
#else
      transposed[c][r] = result[r][c];
#endif
    }
  }

  // Column pass.
  for (int c = 0; c < 32; ++c) txfm->cols(transposed[c], result[c]);

  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 32, 32);

  // Accumulate into the destination; result is held column-major.
  for (int r = 0; r < 32; ++r) {
    uint8_t *const dst_row = dest + r * stride;
    for (int c = 0; c < 32; ++c) {
      const tran_low_t residual = outp[c * outstride + r];
#if CONFIG_DAALA_TX32
      dst_row[c] = clip_pixel_add(dst_row[c], ROUND_POWER_OF_TWO(residual, 5));
#else
      dst_row[c] = clip_pixel_add(dst_row[c], ROUND_POWER_OF_TWO(residual, 6));
#endif
    }
  }
}
#if CONFIG_TX64X64
// Inverse 64x64 hybrid transform. Runs the row transform on each of the 64
// input rows (halving the row output with rounding in the non-Daala build),
// transposes, runs the column transform on each of the 64 columns, then adds
// the rounded result into dest with clipping.
//   input      - 4096 coefficients, read as 64 consecutive rows of 64.
//   dest       - 8-bit destination block, updated in place.
//   stride     - step between successive rows of dest.
//   txfm_param - supplies tx_type.
void av1_iht64x64_4096_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                             const TxfmParam *txfm_param) {
  const TX_TYPE tx_type = txfm_param->tx_type;
#if CONFIG_MRC_TX
  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
#endif  // CONFIG_MRC_TX
#if CONFIG_DCT_ONLY
  assert(tx_type == DCT_DCT);
#endif
  // Indexed by tx_type; both members are 64-point transforms.
  static const transform_2d IHT_64[] = {
#if CONFIG_DAALA_TX64
    { daala_idct64, daala_idct64 },  // DCT_DCT
    { daala_idst64, daala_idct64 },  // ADST_DCT
    { daala_idct64, daala_idst64 },  // DCT_ADST
    { daala_idst64, daala_idst64 },  // ADST_ADST
    { daala_idst64, daala_idct64 },  // FLIPADST_DCT
    { daala_idct64, daala_idst64 },  // DCT_FLIPADST
    { daala_idst64, daala_idst64 },  // FLIPADST_FLIPADST
    { daala_idst64, daala_idst64 },  // ADST_FLIPADST
    { daala_idst64, daala_idst64 },  // FLIPADST_ADST
    { daala_idtx64, daala_idtx64 },  // IDTX
    { daala_idct64, daala_idtx64 },  // V_DCT
    { daala_idtx64, daala_idct64 },  // H_DCT
    { daala_idst64, daala_idtx64 },  // V_ADST
    { daala_idtx64, daala_idst64 },  // H_ADST
    { daala_idst64, daala_idtx64 },  // V_FLIPADST
    { daala_idtx64, daala_idst64 },  // H_FLIPADST
#else
    { idct64_col_c, idct64_row_c },      // DCT_DCT
    { ihalfright64_c, idct64_row_c },    // ADST_DCT
    { idct64_col_c, ihalfright64_c },    // DCT_ADST
    { ihalfright64_c, ihalfright64_c },  // ADST_ADST
    { ihalfright64_c, idct64_row_c },    // FLIPADST_DCT
    { idct64_col_c, ihalfright64_c },    // DCT_FLIPADST
    { ihalfright64_c, ihalfright64_c },  // FLIPADST_FLIPADST
    { ihalfright64_c, ihalfright64_c },  // ADST_FLIPADST
    { ihalfright64_c, ihalfright64_c },  // FLIPADST_ADST
    { iidtx64_c, iidtx64_c },            // IDTX
    { idct64_col_c, iidtx64_c },         // V_DCT
    { iidtx64_c, idct64_row_c },         // H_DCT
    { ihalfright64_c, iidtx64_c },       // V_ADST
    { iidtx64_c, ihalfright64_c },       // H_ADST
    { ihalfright64_c, iidtx64_c },       // V_FLIPADST
    { iidtx64_c, ihalfright64_c },       // H_FLIPADST
#endif
  };
  const transform_2d *const txfm = &IHT_64[tx_type];
  tran_low_t transposed[64][64];
  tran_low_t result[64][64];
  tran_low_t *outp = &result[0][0];
  int outstride = 64;

  // Row pass, written straight into result[].
  for (int r = 0; r < 64; ++r) {
    const tran_low_t *const coeffs = input + 64 * r;
#if CONFIG_DAALA_TX64
    // Daala build: the input is doubled; the intermediate is not rescaled.
    tran_low_t doubled[64];
    for (int c = 0; c < 64; ++c) doubled[c] = coeffs[c] * 2;
    txfm->rows(doubled, result[r]);
#else
    // Default build: halve the row output with rounding.
    txfm->rows(coeffs, result[r]);
    for (int c = 0; c < 64; ++c) {
      result[r][c] = ROUND_POWER_OF_TWO(result[r][c], 1);
    }
#endif
  }

  // Transpose so that each column becomes a contiguous vector.
  for (int c = 0; c < 64; ++c) {
    for (int r = 0; r < 64; ++r) transposed[c][r] = result[r][c];
  }

  // Column pass.
  for (int c = 0; c < 64; ++c) txfm->cols(transposed[c], result[c]);

  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 64, 64);

  // Accumulate into the destination; result is held column-major.
  for (int r = 0; r < 64; ++r) {
    uint8_t *const dst_row = dest + r * stride;
    for (int c = 0; c < 64; ++c) {
      const tran_low_t residual = outp[c * outstride + r];
#if CONFIG_DAALA_TX64
      dst_row[c] = clip_pixel_add(dst_row[c], ROUND_POWER_OF_TWO(residual, 2));
#else
      dst_row[c] = clip_pixel_add(dst_row[c], ROUND_POWER_OF_TWO(residual, 5));
#endif
    }
  }
}
// Inverse hybrid transform for a block of 32 rows by 64 columns, with
// accumulation into dest.
//
// input      : 32 rows of 64 coefficients, stored contiguously row by row.
// dest       : destination samples; each one is combined with the rounded
//              residual through clip_pixel_add().
// stride     : distance, in samples, between consecutive rows of dest.
// txfm_param : only txfm_param->tx_type is read; it selects the 1-D
//              row/column transform pair from the IHT_64x32 table.
void av1_iht64x32_2048_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                             const TxfmParam *txfm_param) {
  const TX_TYPE tx_type = txfm_param->tx_type;
#if CONFIG_MRC_TX
  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
#endif  // CONFIG_MRC_TX
#if CONFIG_DCT_ONLY
  assert(tx_type == DCT_DCT);
#endif
  // 1-D transform pairs, indexed by tx_type.
  static const transform_2d IHT_64x32[] = {
    { aom_idct32_c, idct64_row_c },      // DCT_DCT
    { ihalfright32_c, idct64_row_c },    // ADST_DCT
    { aom_idct32_c, ihalfright64_c },    // DCT_ADST
    { ihalfright32_c, ihalfright64_c },  // ADST_ADST
    { ihalfright32_c, idct64_row_c },    // FLIPADST_DCT
    { aom_idct32_c, ihalfright64_c },    // DCT_FLIPADST
    { ihalfright32_c, ihalfright64_c },  // FLIPADST_FLIPADST
    { ihalfright32_c, ihalfright64_c },  // ADST_FLIPADST
    { ihalfright32_c, ihalfright64_c },  // FLIPADST_ADST
    { iidtx32_c, iidtx64_c },            // IDTX
    { aom_idct32_c, iidtx64_c },         // V_DCT
    { iidtx32_c, idct64_row_c },         // H_DCT
    { ihalfright32_c, iidtx64_c },       // V_ADST
    { iidtx32_c, ihalfright64_c },       // H_ADST
    { ihalfright32_c, iidtx64_c },       // V_FLIPADST
    { iidtx32_c, ihalfright64_c },       // H_FLIPADST
  };
  const int num_rows = 32;
  const int num_cols = 64;
  tran_low_t out[64][32];
  tran_low_t transposed[64][32];
  tran_low_t row_out[64];
  tran_low_t *outp = &out[0][0];
  int outstride = num_rows;
  int r, c;

  // Row pass: transform each 64-coefficient row into a scratch line, scale
  // every value by InvSqrt2 (with rounding) and store it transposed.
  for (r = 0; r < num_rows; ++r) {
    IHT_64x32[tx_type].rows(input + r * num_cols, row_out);
    for (c = 0; c < num_cols; ++c) {
      transposed[c][r] =
          (tran_low_t)dct_const_round_shift(row_out[c] * InvSqrt2);
    }
  }

  // Column pass: one 32-point transform per column of the block.
  for (c = 0; c < num_cols; ++c) {
    IHT_64x32[tx_type].cols(transposed[c], out[c]);
  }

  // May adjust dest/stride and outp/outstride depending on tx_type.
  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, num_rows,
                     num_cols);

  // Sum with the destination. The residual is read transposed relative to
  // dest: destination (r, c) takes outp[c * outstride + r].
  for (r = 0; r < num_rows; ++r) {
    for (c = 0; c < num_cols; ++c) {
      const int dst_idx = r * stride + c;
      const int src_idx = c * outstride + r;
      dest[dst_idx] =
          clip_pixel_add(dest[dst_idx], ROUND_POWER_OF_TWO(outp[src_idx], 5));
    }
  }
}
void av1_iht32x64_2048_add_c(const tran_low_t *input, uint8_t *dest, int stride,
const TxfmParam *txfm_param) {
const TX_TYPE tx_type = txfm_param->tx_type;
#if CONFIG_MRC_TX
assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
#endif // CONFIG_MRC_TX
#if CONFIG_DCT_ONLY
assert(tx_type == DCT_DCT);
#endif
static const transform_2d IHT_32x64[] = {
{ idct64_col_c, aom_idct32_c }, // DCT_DCT
{ ihalfright64_c, aom_idct32_c }, // ADST_DCT
{ idct64_col_c, ihalfright32_c }, // DCT_ADST
{ ihalfright64_c, ihalfright32_c }, // ADST_ADST
{ ihalfright64_c, aom_idct32_c }, // FLIPADST_DCT
{ idct64_col_c, ihalfright32_c }, // DCT_FLIPADST
{ ihalfright64_c, ihalfright32_c }, // FLIPADST_FLIPADST
{ ihalfright64_c, ihalfright32_c }, // ADST_FLIPADST
{ ihalfright64_c, ihalfright32_c }, // FLIPADST_ADST
{ iidtx64_c, iidtx32_c }, // IDTX
{ idct64_col_c, iidtx32_c }, // V_DCT
{ iidtx64_c, aom_idct32_c }, // H_DCT
{ ihalfright64_c, iidtx32_c }, // V_ADST
{ iidtx64_c, ihalfright32_c }, // H_ADST
{ ihalfright64_c, iidtx32_c }, // V_FLIPADST
{ iidtx64_c, ihalfright32_c }, // H_FLIPADST
};
const int n = 32;
const int n2 = 64;
int i, j;
tran_low_t out[32][64], tmp[32][64], outtmp[32];
tran_low_t *outp = &out[0][0];
int outstride = n2;
// inverse transform row vectors and transpose
for (i = 0; i < n2; ++i) {
IHT_32x64[tx_type].rows(input, outtmp);
for (j = 0; j < n; ++j)
tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * InvSqrt2);
input += n;
}
// inverse transform column vectors
for (i = 0; i < n; ++i) IHT_32x64[tx_type].cols(tmp[i], out[i]);
maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n2, n);
// Sum with the destination
for (i = 0; i < n2; ++i) {