Blame - aom_dsp/x86/highbd_quantize_intrin_sse2.c - avm

blob: ecde8c284f029cd39e860b52c3a5028fbc337a5e [file] [log] [blame]

Yaowu Xu	c27fc14	2016-08-22 16:08:15 -0700	[diff] [blame^]	1	/*
				2	* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
				3	*
				4	* Use of this source code is governed by a BSD-style license
				5	* that can be found in the LICENSE file in the root of the source
				6	* tree. An additional intellectual property rights grant can be found
				7	* in the file PATENTS. All contributing project authors may
				8	* be found in the AUTHORS file in the root of the source tree.
				9	*/
				10
				11	#include <emmintrin.h>
				12
				13	#include "./vpx_dsp_rtcd.h"
				14	#include "aom_dsp/vpx_dsp_common.h"
				15	#include "aom_mem/vpx_mem.h"
				16	#include "aom_ports/mem.h"
				17
				18	#if CONFIG_VP9_HIGHBITDEPTH
				19	void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
				20	int skip_block, const int16_t *zbin_ptr,
				21	const int16_t *round_ptr,
				22	const int16_t *quant_ptr,
				23	const int16_t *quant_shift_ptr,
				24	tran_low_t qcoeff_ptr, tran_low_t dqcoeff_ptr,
				25	const int16_t dequant_ptr, uint16_t eob_ptr,
				26	const int16_t scan, const int16_t iscan) {
				27	int i, j, non_zero_regs = (int)count / 4, eob_i = -1;
				28	__m128i zbins[2];
				29	__m128i nzbins[2];
				30
				31	zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1],
				32	(int)zbin_ptr[0]);
				33	zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]);
				34
				35	nzbins[0] = _mm_setzero_si128();
				36	nzbins[1] = _mm_setzero_si128();
				37	nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
				38	nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
				39
				40	(void)scan;
				41
				42	memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
				43	memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
				44
				45	if (!skip_block) {
				46	// Pre-scan pass
				47	for (i = ((int)count / 4) - 1; i >= 0; i--) {
				48	__m128i coeffs, cmp1, cmp2;
				49	int test;
				50	coeffs = _mm_load_si128((const __m128i )(coeff_ptr + i 4));
				51	cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
				52	cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
				53	cmp1 = _mm_and_si128(cmp1, cmp2);
				54	test = _mm_movemask_epi8(cmp1);
				55	if (test == 0xffff)
				56	non_zero_regs--;
				57	else
				58	break;
				59	}
				60
				61	// Quantization pass:
				62	for (i = 0; i < non_zero_regs; i++) {
				63	__m128i coeffs, coeffs_sign, tmp1, tmp2;
				64	int test;
				65	int abs_coeff[4];
				66	int coeff_sign[4];
				67
				68	coeffs = _mm_load_si128((const __m128i )(coeff_ptr + i 4));
				69	coeffs_sign = _mm_srai_epi32(coeffs, 31);
				70	coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
				71	tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
				72	tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
				73	tmp1 = _mm_or_si128(tmp1, tmp2);
				74	test = _mm_movemask_epi8(tmp1);
				75	_mm_storeu_si128((__m128i *)abs_coeff, coeffs);
				76	_mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign);
				77
				78	for (j = 0; j < 4; j++) {
				79	if (test & (1 << (4 * j))) {
				80	int k = 4 * i + j;
				81	const int64_t tmp1 = abs_coeff[j] + round_ptr[k != 0];
				82	const int64_t tmp2 = ((tmp1 * quant_ptr[k != 0]) >> 16) + tmp1;
				83	const uint32_t abs_qcoeff =
				84	(uint32_t)((tmp2 * quant_shift_ptr[k != 0]) >> 16);
				85	qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];
				86	dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
				87	if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
				88	}
				89	}
				90	}
				91	}
				92	*eob_ptr = eob_i + 1;
				93	}
				94
				95	void vpx_highbd_quantize_b_32x32_sse2(
				96	const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
				97	const int16_t zbin_ptr, const int16_t round_ptr, const int16_t *quant_ptr,
				98	const int16_t quant_shift_ptr, tran_low_t qcoeff_ptr,
				99	tran_low_t dqcoeff_ptr, const int16_t dequant_ptr, uint16_t *eob_ptr,
				100	const int16_t scan, const int16_t iscan) {
				101	__m128i zbins[2];
				102	__m128i nzbins[2];
				103	int idx = 0;
				104	int idx_arr[1024];
				105	int i, eob = -1;
				106	const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1);
				107	const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1);
				108	(void)scan;
				109	zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp);
				110	zbins[1] = _mm_set1_epi32(zbin1_tmp);
				111
				112	nzbins[0] = _mm_setzero_si128();
				113	nzbins[1] = _mm_setzero_si128();
				114	nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
				115	nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
				116
				117	memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
				118	memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
				119
				120	if (!skip_block) {
				121	// Pre-scan pass
				122	for (i = 0; i < n_coeffs / 4; i++) {
				123	__m128i coeffs, cmp1, cmp2;
				124	int test;
				125	coeffs = _mm_load_si128((const __m128i )(coeff_ptr + i 4));
				126	cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
				127	cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
				128	cmp1 = _mm_and_si128(cmp1, cmp2);
				129	test = _mm_movemask_epi8(cmp1);
				130	if (!(test & 0xf)) idx_arr[idx++] = i * 4;
				131	if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1;
				132	if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2;
				133	if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3;
				134	}
				135
				136	// Quantization pass: only process the coefficients selected in
				137	// pre-scan pass. Note: idx can be zero.
				138	for (i = 0; i < idx; i++) {
				139	const int rc = idx_arr[i];
				140	const int coeff = coeff_ptr[rc];
				141	const int coeff_sign = (coeff >> 31);
				142	const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
				143	const int64_t tmp1 =
				144	abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
				145	const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
				146	const uint32_t abs_qcoeff =
				147	(uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
				148	qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;
				149	dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
				150	if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
				151	}
				152	}
				153	*eob_ptr = eob + 1;
				154	}
				155	#endif