| /* |
| * Copyright (c) 2016, Alliance for Open Media. All rights reserved |
| * |
| * This source code is subject to the terms of the BSD 2 Clause License and |
| * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
| * was not distributed with this source code in the LICENSE file, you can |
| * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
| * Media Patent License 1.0 was not distributed with this source code in the |
| * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
| */ |
| |
| #include <smmintrin.h> |
| #include <stdint.h> |
| |
| #include "./av1_rtcd.h" |
| #include "aom_dsp/aom_dsp_common.h" |
| |
| // Coefficient quantization phase 1 |
| // param[0-2] : rounding/quan/dequan constants |
| static INLINE void quantize_coeff_phase1(__m128i *coeff, const __m128i *param, |
| const int shift, const int scale, |
| __m128i *qcoeff, __m128i *dquan, |
| __m128i *sign) { |
| const __m128i zero = _mm_setzero_si128(); |
| const __m128i one = _mm_set1_epi32(1); |
| |
| *sign = _mm_cmplt_epi32(*coeff, zero); |
| *sign = _mm_or_si128(*sign, one); |
| *coeff = _mm_abs_epi32(*coeff); |
| |
| qcoeff[0] = _mm_add_epi32(*coeff, param[0]); |
| qcoeff[1] = _mm_unpackhi_epi32(qcoeff[0], zero); |
| qcoeff[0] = _mm_unpacklo_epi32(qcoeff[0], zero); |
| |
| qcoeff[0] = _mm_mul_epi32(qcoeff[0], param[1]); |
| qcoeff[0] = _mm_srli_epi64(qcoeff[0], shift); |
| dquan[0] = _mm_mul_epi32(qcoeff[0], param[2]); |
| dquan[0] = _mm_srli_epi64(dquan[0], scale); |
| } |
| |
| // Coefficient quantization phase 2 |
| static INLINE void quantize_coeff_phase2(__m128i *qcoeff, __m128i *dquan, |
| const __m128i *sign, |
| const __m128i *param, const int shift, |
| const int scale, tran_low_t *qAddr, |
| tran_low_t *dqAddr) { |
| __m128i mask0L = _mm_set_epi32(-1, -1, 0, 0); |
| __m128i mask0H = _mm_set_epi32(0, 0, -1, -1); |
| |
| qcoeff[1] = _mm_mul_epi32(qcoeff[1], param[1]); |
| qcoeff[1] = _mm_srli_epi64(qcoeff[1], shift); |
| dquan[1] = _mm_mul_epi32(qcoeff[1], param[2]); |
| dquan[1] = _mm_srli_epi64(dquan[1], scale); |
| |
| // combine L&H |
| qcoeff[0] = _mm_shuffle_epi32(qcoeff[0], 0xd8); |
| qcoeff[1] = _mm_shuffle_epi32(qcoeff[1], 0x8d); |
| |
| qcoeff[0] = _mm_and_si128(qcoeff[0], mask0H); |
| qcoeff[1] = _mm_and_si128(qcoeff[1], mask0L); |
| |
| dquan[0] = _mm_shuffle_epi32(dquan[0], 0xd8); |
| dquan[1] = _mm_shuffle_epi32(dquan[1], 0x8d); |
| |
| dquan[0] = _mm_and_si128(dquan[0], mask0H); |
| dquan[1] = _mm_and_si128(dquan[1], mask0L); |
| |
| qcoeff[0] = _mm_or_si128(qcoeff[0], qcoeff[1]); |
| dquan[0] = _mm_or_si128(dquan[0], dquan[1]); |
| |
| qcoeff[0] = _mm_sign_epi32(qcoeff[0], *sign); |
| dquan[0] = _mm_sign_epi32(dquan[0], *sign); |
| |
| _mm_storeu_si128((__m128i *)qAddr, qcoeff[0]); |
| _mm_storeu_si128((__m128i *)dqAddr, dquan[0]); |
| } |
| |
| static INLINE void find_eob(tran_low_t *qcoeff_ptr, const int16_t *iscan, |
| __m128i *eob) { |
| const __m128i zero = _mm_setzero_si128(); |
| __m128i mask, iscanIdx; |
| const __m128i q0 = _mm_loadu_si128((__m128i const *)qcoeff_ptr); |
| const __m128i q1 = _mm_loadu_si128((__m128i const *)(qcoeff_ptr + 4)); |
| __m128i nz_flag0 = _mm_cmpeq_epi32(q0, zero); |
| __m128i nz_flag1 = _mm_cmpeq_epi32(q1, zero); |
| |
| nz_flag0 = _mm_cmpeq_epi32(nz_flag0, zero); |
| nz_flag1 = _mm_cmpeq_epi32(nz_flag1, zero); |
| |
| mask = _mm_packs_epi32(nz_flag0, nz_flag1); |
| iscanIdx = _mm_loadu_si128((__m128i const *)iscan); |
| iscanIdx = _mm_sub_epi16(iscanIdx, mask); |
| iscanIdx = _mm_and_si128(iscanIdx, mask); |
| *eob = _mm_max_epi16(*eob, iscanIdx); |
| } |
| |
| static INLINE uint16_t get_accumulated_eob(__m128i *eob) { |
| __m128i eob_shuffled; |
| uint16_t eobValue; |
| eob_shuffled = _mm_shuffle_epi32(*eob, 0xe); |
| *eob = _mm_max_epi16(*eob, eob_shuffled); |
| eob_shuffled = _mm_shufflelo_epi16(*eob, 0xe); |
| *eob = _mm_max_epi16(*eob, eob_shuffled); |
| eob_shuffled = _mm_shufflelo_epi16(*eob, 0x1); |
| *eob = _mm_max_epi16(*eob, eob_shuffled); |
| eobValue = _mm_extract_epi16(*eob, 0); |
| return eobValue; |
| } |
| |
| void av1_highbd_quantize_fp_sse4_1( |
| const tran_low_t *coeff_ptr, intptr_t count, int skip_block, |
| const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, |
| const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, |
| tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, |
| const int16_t *scan, const int16_t *iscan, int log_scale) { |
| __m128i coeff[2], qcoeff[2], dequant[2], qparam[3], coeff_sign; |
| __m128i eob = _mm_setzero_si128(); |
| const tran_low_t *src = coeff_ptr; |
| tran_low_t *quanAddr = qcoeff_ptr; |
| tran_low_t *dquanAddr = dqcoeff_ptr; |
| const int shift = 16 - log_scale; |
| const int coeff_stride = 4; |
| const int quan_stride = coeff_stride; |
| (void)skip_block; |
| (void)zbin_ptr; |
| (void)quant_shift_ptr; |
| (void)scan; |
| |
| memset(quanAddr, 0, count * sizeof(quanAddr[0])); |
| memset(dquanAddr, 0, count * sizeof(dquanAddr[0])); |
| |
| if (!skip_block) { |
| coeff[0] = _mm_loadu_si128((__m128i const *)src); |
| |
| qparam[0] = |
| _mm_set_epi32(round_ptr[1] >> log_scale, round_ptr[1] >> log_scale, |
| round_ptr[1] >> log_scale, round_ptr[0] >> log_scale); |
| qparam[1] = _mm_set_epi64x(quant_ptr[1], quant_ptr[0]); |
| qparam[2] = _mm_set_epi64x(dequant_ptr[1], dequant_ptr[0]); |
| |
| // DC and first 3 AC |
| quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant, |
| &coeff_sign); |
| |
| // update round/quan/dquan for AC |
| qparam[0] = _mm_unpackhi_epi64(qparam[0], qparam[0]); |
| qparam[1] = _mm_set_epi64x(quant_ptr[1], quant_ptr[1]); |
| qparam[2] = _mm_set_epi64x(dequant_ptr[1], dequant_ptr[1]); |
| |
| quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, |
| log_scale, quanAddr, dquanAddr); |
| |
| // next 4 AC |
| coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride)); |
| quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, dequant, |
| &coeff_sign); |
| quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, |
| log_scale, quanAddr + quan_stride, |
| dquanAddr + quan_stride); |
| |
| find_eob(quanAddr, iscan, &eob); |
| |
| count -= 8; |
| |
| // loop for the rest of AC |
| while (count > 0) { |
| src += coeff_stride << 1; |
| quanAddr += quan_stride << 1; |
| dquanAddr += quan_stride << 1; |
| iscan += quan_stride << 1; |
| |
| coeff[0] = _mm_loadu_si128((__m128i const *)src); |
| coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride)); |
| |
| quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, |
| dequant, &coeff_sign); |
| quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, |
| log_scale, quanAddr, dquanAddr); |
| |
| quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, |
| dequant, &coeff_sign); |
| quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, |
| log_scale, quanAddr + quan_stride, |
| dquanAddr + quan_stride); |
| |
| find_eob(quanAddr, iscan, &eob); |
| |
| count -= 8; |
| } |
| *eob_ptr = get_accumulated_eob(&eob); |
| } else { |
| *eob_ptr = 0; |
| } |
| } |