blob: afb7f503d84bd67cea0ff583e8ba7ecb8dd58690 [file] [log] [blame]
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include "./av1_rtcd.h"
#include "aom_dsp/x86/inv_txfm_sse2.h"
#include "aom_dsp/x86/synonyms.h"
#include "aom_dsp/x86/txfm_common_sse2.h"
#include "aom_ports/mem.h"
#include "av1/common/enums.h"
static INLINE void fliplr_4x4(__m128i *in /*in[2]*/) {
in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
in[0] = _mm_shufflehi_epi16(in[0], 0x1b);
in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
in[1] = _mm_shufflehi_epi16(in[1], 0x1b);
}
static INLINE void fliplr_8x8(__m128i *in /*in[8]*/) {
in[0] = mm_reverse_epi16(in[0]);
in[1] = mm_reverse_epi16(in[1]);
in[2] = mm_reverse_epi16(in[2]);
in[3] = mm_reverse_epi16(in[3]);
in[4] = mm_reverse_epi16(in[4]);
in[5] = mm_reverse_epi16(in[5]);
in[6] = mm_reverse_epi16(in[6]);
in[7] = mm_reverse_epi16(in[7]);
}
static INLINE void fliplr_16x8(__m128i *in /*in[16]*/) {
fliplr_8x8(&in[0]);
fliplr_8x8(&in[8]);
}
#define FLIPLR_16x16(in0, in1) \
do { \
__m128i *tmp; \
fliplr_16x8(in0); \
fliplr_16x8(in1); \
tmp = (in0); \
(in0) = (in1); \
(in1) = tmp; \
} while (0)
#define FLIPUD_PTR(dest, stride, size) \
do { \
(dest) = (dest) + ((size)-1) * (stride); \
(stride) = -(stride); \
} while (0)
void av1_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
const TxfmParam *txfm_param) {
__m128i in[2];
const __m128i zero = _mm_setzero_si128();
const __m128i eight = _mm_set1_epi16(8);
const TX_TYPE tx_type = txfm_param->tx_type;
in[0] = load_input_data(input);
in[1] = load_input_data(input + 8);
switch (tx_type) {
case DCT_DCT:
aom_idct4_sse2(in);
aom_idct4_sse2(in);
break;
case ADST_DCT:
aom_idct4_sse2(in);
aom_iadst4_sse2(in);
break;
case DCT_ADST:
aom_iadst4_sse2(in);
aom_idct4_sse2(in);
break;
case ADST_ADST:
aom_iadst4_sse2(in);
aom_iadst4_sse2(in);
break;
case FLIPADST_DCT:
aom_idct4_sse2(in);
aom_iadst4_sse2(in);
FLIPUD_PTR(dest, stride, 4);
break;
case DCT_FLIPADST:
aom_iadst4_sse2(in);
aom_idct4_sse2(in);
fliplr_4x4(in);
break;
case FLIPADST_FLIPADST:
aom_iadst4_sse2(in);
aom_iadst4_sse2(in);
FLIPUD_PTR(dest, stride, 4);
fliplr_4x4(in);
break;
case ADST_FLIPADST:
aom_iadst4_sse2(in);
aom_iadst4_sse2(in);
fliplr_4x4(in);
break;
case FLIPADST_ADST:
aom_iadst4_sse2(in);
aom_iadst4_sse2(in);
FLIPUD_PTR(dest, stride, 4);
break;
default: assert(0); break;
}
// Final round and shift
in[0] = _mm_add_epi16(in[0], eight);
in[1] = _mm_add_epi16(in[1], eight);
in[0] = _mm_srai_epi16(in[0], 4);
in[1] = _mm_srai_epi16(in[1], 4);
// Reconstruction and Store
{
__m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 0));
__m128i d1 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 1));
__m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
__m128i d3 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3));
d0 = _mm_unpacklo_epi32(d0, d1);
d2 = _mm_unpacklo_epi32(d2, d3);
d0 = _mm_unpacklo_epi8(d0, zero);
d2 = _mm_unpacklo_epi8(d2, zero);
d0 = _mm_add_epi16(d0, in[0]);
d2 = _mm_add_epi16(d2, in[1]);
d0 = _mm_packus_epi16(d0, d2);
// store result[0]
*(int *)dest = _mm_cvtsi128_si32(d0);
// store result[1]
d0 = _mm_srli_si128(d0, 4);
*(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
// store result[2]
d0 = _mm_srli_si128(d0, 4);
*(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
// store result[3]
d0 = _mm_srli_si128(d0, 4);
*(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
}
}