blob: 4238e651b043495b5e56ea6d866e5e082dacdcc1 [file] [log] [blame]
/*
* Copyright (c) 2017, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_X86_INV_TXFM_COMMON_AVX2_H
#define AOM_DSP_X86_INV_TXFM_COMMON_AVX2_H
#include <immintrin.h>
#include "aom_dsp/txfm_common.h"
#include "aom_dsp/x86/txfm_common_avx2.h"
/*
 * Loads one row of coefficients from `coeff` into the 256-bit vector `*in`.
 *
 * With CONFIG_HIGHBITDEPTH, the 16 elements coeff[0..15] are each cast to
 * int16_t and placed in consecutive 16-bit lanes, coeff[0] in the lowest lane.
 * Otherwise 256 bits are read straight from `coeff` with an unaligned load
 * (presumably tran_low_t is 16 bits wide in that configuration, making this
 * 16 coefficients as well -- confirm against the tran_low_t definition).
 */
static INLINE void load_coeff(const tran_low_t *coeff, __m256i *in) {
#if CONFIG_HIGHBITDEPTH
  int16_t narrowed[16];
  int k;
  /* Narrow element by element, then pick the 16 values up in memory order. */
  for (k = 0; k < 16; ++k) {
    narrowed[k] = (int16_t)coeff[k];
  }
  *in = _mm256_loadu_si256((const __m256i *)narrowed);
#else
  *in = _mm256_loadu_si256((const __m256i *)coeff);
#endif
}
/*
 * Fills in[0..15] from a coefficient buffer: entry `row` is produced by
 * load_coeff() reading from coeff + 16 * row.
 */
static INLINE void load_buffer_16x16(const tran_low_t *coeff, __m256i *in) {
  int row;
  for (row = 0; row < 16; ++row) {
    load_coeff(coeff + 16 * row, &in[row]);
  }
}
/*
 * Adds the sixteen 16-bit values in `*res` to the sixteen bytes at `output`
 * and writes the sums back to `output`.
 *
 * The bytes are zero-extended to 16 bits, added to `*res` with a wrapping
 * 16-bit add, then packed back to bytes with unsigned saturation (results are
 * clamped to 0..255). Both the load and the store are unaligned.
 */
static INLINE void recon_and_store(const __m256i *res, uint8_t *output) {
  const __m128i pred8 = _mm_loadu_si128((const __m128i *)output);
  /* Zero-extend all 16 bytes at once: bytes 0-7 land in the low 128-bit half,
   * bytes 8-15 in the high half. */
  const __m256i pred16 = _mm256_cvtepu8_epi16(pred8);
  const __m256i sum = _mm256_add_epi16(pred16, *res);
  const __m128i packed = _mm_packus_epi16(_mm256_castsi256_si128(sum),
                                          _mm256_extractf128_si256(sum, 1));
  _mm_storeu_si128((__m128i *)output, packed);
}
/* Bit count of the arithmetic right shift applied before reconstruction. */
#define IDCT_ROUNDING_POS (6)

/*
 * Rounds `num` vectors of 16-bit values and adds each one to a 16-byte row of
 * `output`.
 *
 * For every row r in [0, num): in[r] gets half of (1 << IDCT_ROUNDING_POS)
 * added with signed saturation, is shifted right arithmetically by
 * IDCT_ROUNDING_POS, and is then handed to recon_and_store() together with
 * output + r * stride. The rounded value is written back, so `in` is modified.
 */
static INLINE void store_buffer_16xN(__m256i *in, const int stride,
                                     uint8_t *output, int num) {
  const __m256i half = _mm256_set1_epi16(1 << (IDCT_ROUNDING_POS - 1));
  int row;
  for (row = 0; row < num; ++row) {
    const __m256i biased = _mm256_adds_epi16(in[row], half);
    in[row] = _mm256_srai_epi16(biased, IDCT_ROUNDING_POS);
    recon_and_store(&in[row], output + row * stride);
  }
}
/*
 * Interleaves the 16-bit lanes of `*a0` and `*a1` (low and high halves of
 * each 128-bit lane) and passes the two interleaved vectors to butter_fly()
 * twice: once with `c0`, storing the result in `*b0`, and once with `c1`,
 * storing the result in `*b1`.
 *
 * NOTE(review): butter_fly() is defined outside this header; `c0`/`c1` are
 * presumably packed constant pairs for it -- confirm against its definition.
 */
static INLINE void unpack_butter_fly(const __m256i *a0, const __m256i *a1,
                                     const __m256i *c0, const __m256i *c1,
                                     __m256i *b0, __m256i *b1) {
  /* Both inputs are fully read before either output is written. */
  __m256i lo = _mm256_unpacklo_epi16(*a0, *a1);
  __m256i hi = _mm256_unpackhi_epi16(*a0, *a1);

  *b0 = butter_fly(&lo, &hi, c0);
  *b1 = butter_fly(&lo, &hi, c1);
}
/*
 * Declaration only; the definition is not part of this header.
 * NOTE(review): judging by the name this is presumably a 16-point inverse DCT
 * working in place on the vectors at `in` -- confirm against the
 * implementation, including how many vectors `in` must hold.
 */
void av1_idct16_avx2(__m256i *in);
#endif // AOM_DSP_X86_INV_TXFM_COMMON_AVX2_H