blob: b84c94822f59f6f738346296dd198031fe9a21d9 [file] [log] [blame] [edit]
/*
* Copyright (c) 2021, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 3-Clause Clear License
* and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
* License was not distributed with this source code in the LICENSE file, you
* can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the
* Alliance for Open Media Patent License 1.0 was not distributed with this
* source code in the PATENTS file, you can obtain it at
* aomedia.org/license/patent-license/.
*/
#include <assert.h>
#include <smmintrin.h> /* SSE4.1 */
#include "av1/common/common_data.h"
#include "config/aom_config.h"
#include "config/av1_rtcd.h"
#include "av1/common/av1_txfm.h"
#include "aom_dsp/txfm_common.h"
#include "aom_dsp/x86/txfm_common_sse2.h"
#include "aom_ports/mem.h"
// Forward secondary transform
void fwd_stxfm_sse4_1(tran_low_t *src, tran_low_t *dst,
const PREDICTION_MODE mode, const uint8_t stx_idx,
const int size, const int bd) {
// Secondary transform kernels are stored as 32-bit integers to match SIMD
// processing needs. This avoids on-the-fly conversion from int16_t to int32_t
// during execution by letting SIMD variants directly load the pre-converted
// filter weights.
assert(stx_idx < 4);
const int32_t *kernel = (size == 0) ? ist_4x4_kernel_int32[mode][stx_idx][0]
: ist_8x8_kernel_int32[mode][stx_idx][0];
int coef;
int *out = dst;
int shift = 7;
int reduced_width, reduced_height;
if (size == 0) {
reduced_height = IST_4x4_HEIGHT;
reduced_width = IST_4x4_WIDTH;
} else {
reduced_height = (size == 1)
? IST_8x8_HEIGHT_RED
: ((size == 3) ? IST_ADST_NZ_CNT : IST_8x8_HEIGHT);
reduced_width = IST_8x8_WIDTH;
}
for (int j = 0; j < reduced_height; j++) {
int *srcPtr = src;
const int32_t *kernel_tmp = kernel;
__m128i tmpSum = _mm_setzero_si128();
for (int i = 0; i < reduced_width; i += 4) {
__m128i tmpBlk = _mm_loadu_si128((__m128i *)(srcPtr + i));
__m128i tmpT = _mm_loadu_si128((__m128i *)(kernel_tmp + i));
tmpSum = _mm_add_epi32(tmpSum, _mm_mullo_epi32(tmpBlk, tmpT));
}
tmpSum = _mm_add_epi32(tmpSum,
_mm_shuffle_epi32(tmpSum, _MM_SHUFFLE(2, 3, 0, 1)));
tmpSum = _mm_add_epi32(tmpSum,
_mm_shuffle_epi32(tmpSum, _MM_SHUFFLE(1, 0, 3, 2)));
coef = _mm_cvtsi128_si32(tmpSum);
*out++ = clamp_value(ROUND_POWER_OF_TWO_SIGNED(coef, shift), 8 + bd);
kernel += reduced_width;
}
}