blob: 8bcc8be82d9bd42cae9ac4015837d8d289b74585 [file] [log] [blame] [edit]
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved.
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <assert.h>
#include <smmintrin.h> /* SSE4.1 */
#include "aom_dsp/txfm_common.h"
#include "aom_dsp/x86/transpose_sse2.h"
#include "aom_dsp/x86/txfm_common_sse2.h"
#include "aom_ports/mem.h"
#include "av1/common/av1_txfm.h"
#include "av1/common/x86/highbd_txfm_utility_sse4.h"
#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
#include "av1/encoder/x86/av1_txfm1d_sse4.h"
#include "config/aom_config.h"
#include "config/av1_rtcd.h"
// Runs one lifting pass over rows[0..3], treating each of the four 32-bit
// lanes as an independent 4-point column. Results are written back in place
// in the order (a, c, d, b), which is the order the caller stores them.
static inline void fwht4_lifting_pass_sse4_1(__m128i *rows) {
  const __m128i sum_ab = _mm_add_epi32(rows[0], rows[1]);   // a + b
  const __m128i diff_dc = _mm_sub_epi32(rows[3], rows[2]);  // d - c
  // e = ((a + b) - (d - c)) >> 1, using an arithmetic (sign-preserving) shift.
  const __m128i half = _mm_srai_epi32(_mm_sub_epi32(sum_ab, diff_dc), 1);
  const __m128i new_b = _mm_sub_epi32(half, rows[1]);  // e - b
  const __m128i new_c = _mm_sub_epi32(half, rows[2]);  // e - c

  // All reads of the incoming rows are done; it is now safe to overwrite them.
  rows[0] = _mm_sub_epi32(sum_ab, new_c);   // (a + b) - new_c
  rows[1] = new_c;
  rows[2] = _mm_add_epi32(diff_dc, new_b);  // (d - c) + new_b
  rows[3] = new_b;
}

// SSE4.1 4x4 forward transform (presumably the Walsh-Hadamard transform, going
// by the function name).
//
//   input  - four rows of four int16_t samples; consecutive rows are `stride`
//            elements apart.
//   output - receives 16 coefficients as four unaligned 128-bit stores at
//            element offsets 0, 4, 8 and 12 (assumes tran_low_t is 32 bits
//            wide -- confirm against its definition).
//   stride - distance between input rows, in int16_t elements.
//
// The lifting pass is applied twice with a 4x4 transpose in between, and every
// result is left-shifted by UNIT_QUANT_SHIFT before being stored.
void av1_fwht4x4_sse4_1(const int16_t *input, tran_low_t *output, int stride) {
  __m128i rows[4];

  // Load 64 bits (four int16_t) per row and sign-extend them to four int32_t.
  for (int r = 0; r < 4; ++r) {
    const __m128i packed =
        _mm_loadl_epi64((const __m128i *)(input + r * stride));
    rows[r] = _mm_cvtepi16_epi32(packed);
  }

  // First pass, transpose, second pass. No transpose follows the second pass.
  fwht4_lifting_pass_sse4_1(rows);
  transpose_32bit_4x4(rows, rows);
  fwht4_lifting_pass_sse4_1(rows);

  // Scale each row and write it out.
  for (int r = 0; r < 4; ++r) {
    const __m128i scaled = _mm_slli_epi32(rows[r], UNIT_QUANT_SHIFT);
    _mm_storeu_si128((__m128i *)(output + 4 * r), scaled);
  }
}