Blame - aom_dsp/x86/highbd_adaptive_quantize_sse2.c - aom

blob: 8f31f3596f633e0bc2b12c44c7d5d676c8202580 [file] [log] [blame]

Remya	d9f3528	2019-06-03 10:52:34 +0530	[diff] [blame]	1	/*
				2	* Copyright (c) 2019, Alliance for Open Media. All rights reserved
				3	*
				4	* This source code is subject to the terms of the BSD 2 Clause License and
				5	* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
				6	* was not distributed with this source code in the LICENSE file, you can
				7	* obtain it at www.aomedia.org/license/software. If the Alliance for Open
				8	* Media Patent License 1.0 was not distributed with this source code in the
				9	* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
				10	*/
				11
				12	#include <emmintrin.h>
				13	#include "config/aom_dsp_rtcd.h"
				14
				15	#include "aom/aom_integer.h"
				16	#include "aom_dsp/x86/quantize_x86.h"
				17	#include "av1/encoder/av1_quantize.h"
				18
				19	static INLINE __m128i highbd_invert_sign_64bit_sse2(__m128i a, __m128i sign) {
				20	a = _mm_xor_si128(a, sign);
				21	return _mm_sub_epi64(a, sign);
				22	}
				23
				24	static INLINE void highbd_mul_shift_sse2(const __m128i x, const __m128i y,
				25	__m128i *p, const int shift) {
				26	__m128i sign = _mm_srai_epi32(*y, 31);
				27	__m128i sign_lo = _mm_unpacklo_epi32(sign, sign);
				28	__m128i sign_hi = _mm_unpackhi_epi32(sign, sign);
				29	__m128i abs_y = invert_sign_32_sse2(*y, sign);
				30	__m128i prod_lo = _mm_mul_epu32(*x, abs_y);
				31	__m128i prod_hi = _mm_srli_epi64(*x, 32);
				32	const __m128i mult_hi = _mm_srli_epi64(abs_y, 32);
				33	prod_hi = _mm_mul_epu32(prod_hi, mult_hi);
				34	prod_lo = highbd_invert_sign_64bit_sse2(prod_lo, sign_lo);
				35	prod_hi = highbd_invert_sign_64bit_sse2(prod_hi, sign_hi);
				36
				37	prod_lo = _mm_srli_epi64(prod_lo, shift);
				38	const __m128i mask = _mm_set_epi32(0, -1, 0, -1);
				39	prod_lo = _mm_and_si128(prod_lo, mask);
				40	prod_hi = _mm_srli_epi64(prod_hi, shift);
				41
				42	prod_hi = _mm_slli_epi64(prod_hi, 32);
				43	*p = _mm_or_si128(prod_lo, prod_hi);
				44	}
				45
				46	static INLINE void highbd_calculate_qcoeff(__m128i coeff, const __m128i round,
				47	const __m128i *quant,
				48	const __m128i *shift,
				49	const int *log_scale) {
				50	__m128i tmp, qcoeff;
				51	qcoeff = _mm_add_epi32(coeff, round);
				52	highbd_mul_shift_sse2(&qcoeff, quant, &tmp, 16);
				53	qcoeff = _mm_add_epi32(tmp, qcoeff);
				54	highbd_mul_shift_sse2(&qcoeff, shift, coeff, 16 - *log_scale);
				55	}
				56
				// Folds the iscan values of the selected lanes into a running per-lane
				// maximum.
				//   cmp_mask0 - eight 16-bit lanes, non-zero bits marking selected lanes
				//               (callers pass all-ones / all-zero compare results).
				//   iscan_ptr - eight int16 values, read with an aligned 128-bit load, so
				//               the address must be 16-byte aligned.
				//   is_found  - set to 1 when any bit of *cmp_mask0 is set; otherwise left
				//               untouched.
				//   mask      - in/out: updated to max(*mask, *cmp_mask0 & iscan) per signed
				//               16-bit lane. With no selected lane the candidate is zero.
				static INLINE void highbd_update_mask1(__m128i *cmp_mask0,
				                                       const int16_t *iscan_ptr, int *is_found,
				                                       __m128i *mask) {
				  __m128i temp_mask = _mm_setzero_si128();
				  if (_mm_movemask_epi8(*cmp_mask0)) {
				    const __m128i iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr));
				    temp_mask = _mm_and_si128(*cmp_mask0, iscan0);
				    *is_found = 1;
				  }
				  *mask = _mm_max_epi16(temp_mask, *mask);
				}
				69
				70	static INLINE void highbd_update_mask0(__m128i qcoeff0, __m128i qcoeff1,
				71	__m128i *threshold,
				72	const int16_t iscan_ptr, int is_found,
				73	__m128i *mask) {
				74	__m128i coeff[2], cmp_mask0, cmp_mask1;
				75
				76	coeff[0] = _mm_slli_epi32(*qcoeff0, AOM_QM_BITS);
				77	cmp_mask0 = _mm_cmpgt_epi32(coeff[0], threshold[0]);
				78	coeff[1] = _mm_slli_epi32(*qcoeff1, AOM_QM_BITS);
				79	cmp_mask1 = _mm_cmpgt_epi32(coeff[1], threshold[1]);
				80
				81	cmp_mask0 = _mm_packs_epi32(cmp_mask0, cmp_mask1);
				82
				83	highbd_update_mask1(&cmp_mask0, iscan_ptr, is_found, mask);
				84	}
				85
				86	static INLINE __m128i highbd_calculate_dqcoeff(__m128i qcoeff, __m128i dequant,
				87	const int log_scale) {
				88	__m128i coeff_sign = _mm_srai_epi32(qcoeff, 31);
				89	__m128i abs_coeff = invert_sign_32_sse2(qcoeff, coeff_sign);
				90	highbd_mul_shift_sse2(&abs_coeff, &dequant, &abs_coeff, log_scale);
				91	return invert_sign_32_sse2(abs_coeff, coeff_sign);
				92	}
				93
Remya	7a3af0c	2019-06-04 11:51:32 +0530	[diff] [blame]	94	void aom_highbd_quantize_b_adaptive_sse2(
				95	const tran_low_t coeff_ptr, intptr_t n_coeffs, const int16_t zbin_ptr,
				96	const int16_t round_ptr, const int16_t quant_ptr,
				97	const int16_t quant_shift_ptr, tran_low_t qcoeff_ptr,
				98	tran_low_t dqcoeff_ptr, const int16_t dequant_ptr, uint16_t *eob_ptr,
				99	const int16_t scan, const int16_t iscan) {
				100	int index = 8;
				101	const int log_scale = 0;
				102	int non_zero_count = 0;
				103	int non_zero_count_prescan_add_zero = 0;
				104	int is_found0 = 0, is_found1 = 0;
				105	int eob = -1;
				106	const __m128i zero = _mm_setzero_si128();
				107	const __m128i one = _mm_set1_epi32(1);
				108	__m128i zbin, round, quant, dequant, shift;
				109	__m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
				110	__m128i qcoeff0, qcoeff1;
				111	__m128i cmp_mask0, cmp_mask1, cmp_mask;
				112	__m128i all_zero;
				113	__m128i mask0 = zero, mask1 = zero;
				114
				115	int prescan_add[2];
				116	int thresh[4];
				117	const qm_val_t wt = (1 << AOM_QM_BITS);
				118	for (int i = 0; i < 2; ++i) {
				119	prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
				120	thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1;
				121	}
				122	thresh[2] = thresh[3] = thresh[1];
				123	__m128i threshold[2];
				124	threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);
				125	threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);
				126
				127	#if SKIP_EOB_FACTOR_ADJUST
				128	int first = -1;
				129	#endif
				130	// Setup global values.
				131	zbin = _mm_load_si128((const __m128i *)zbin_ptr);
				132	round = _mm_load_si128((const __m128i *)round_ptr);
				133	quant = _mm_load_si128((const __m128i *)quant_ptr);
				134	dequant = _mm_load_si128((const __m128i *)dequant_ptr);
				135	shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
				136
				137	__m128i zbin_sign = _mm_srai_epi16(zbin, 15);
				138	__m128i round_sign = _mm_srai_epi16(round, 15);
				139	__m128i quant_sign = _mm_srai_epi16(quant, 15);
				140	__m128i dequant_sign = _mm_srai_epi16(dequant, 15);
				141	__m128i shift_sign = _mm_srai_epi16(shift, 15);
				142
				143	zbin = _mm_unpacklo_epi16(zbin, zbin_sign);
				144	round = _mm_unpacklo_epi16(round, round_sign);
				145	quant = _mm_unpacklo_epi16(quant, quant_sign);
				146	dequant = _mm_unpacklo_epi16(dequant, dequant_sign);
				147	shift = _mm_unpacklo_epi16(shift, shift_sign);
				148	zbin = _mm_sub_epi32(zbin, one);
				149
				150	// Do DC and first 15 AC.
				151	coeff0 = _mm_load_si128((__m128i *)(coeff_ptr));
				152	coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + 4));
				153
				154	coeff0_sign = _mm_srai_epi32(coeff0, 31);
				155	coeff1_sign = _mm_srai_epi32(coeff1, 31);
				156	qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);
				157	qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);
				158
				159	highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);
				160
				161	cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
				162	zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
				163	cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
				164	cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
				165	highbd_update_mask1(&cmp_mask, iscan, &is_found1, &mask1);
				166
				167	threshold[0] = threshold[1];
				168	all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
				169	if (_mm_movemask_epi8(all_zero) == 0) {
				170	_mm_store_si128((__m128i *)(qcoeff_ptr), zero);
				171	_mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
				172	_mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
				173	_mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
				174
				175	round = _mm_unpackhi_epi64(round, round);
				176	quant = _mm_unpackhi_epi64(quant, quant);
				177	shift = _mm_unpackhi_epi64(shift, shift);
				178	dequant = _mm_unpackhi_epi64(dequant, dequant);
				179	} else {
				180	highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);
				181
				182	round = _mm_unpackhi_epi64(round, round);
				183	quant = _mm_unpackhi_epi64(quant, quant);
				184	shift = _mm_unpackhi_epi64(shift, shift);
				185	highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);
				186
				187	// Reinsert signs
				188	qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
				189	qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);
				190
				191	// Mask out zbin threshold coeffs
				192	qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
				193	qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
				194
				195	_mm_store_si128((__m128i *)(qcoeff_ptr), qcoeff0);
				196	_mm_store_si128((__m128i *)(qcoeff_ptr + 4), qcoeff1);
				197
				198	coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
				199	dequant = _mm_unpackhi_epi64(dequant, dequant);
				200	coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);
				201	_mm_store_si128((__m128i *)(dqcoeff_ptr), coeff0);
				202	_mm_store_si128((__m128i *)(dqcoeff_ptr + 4), coeff1);
				203	}
				204
				205	// AC only loop.
				206	while (index < n_coeffs) {
				207	coeff0 = _mm_load_si128((__m128i *)(coeff_ptr + index));
				208	coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + index + 4));
				209
				210	coeff0_sign = _mm_srai_epi32(coeff0, 31);
				211	coeff1_sign = _mm_srai_epi32(coeff1, 31);
				212	qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);
				213	qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);
				214
				215	highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index,
				216	&is_found0, &mask0);
				217
				218	cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
				219	cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
				220	cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
				221	highbd_update_mask1(&cmp_mask, iscan + index, &is_found1, &mask1);
				222
				223	all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
				224	if (_mm_movemask_epi8(all_zero) == 0) {
				225	_mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
				226	_mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
				227	_mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
				228	_mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
				229	index += 8;
				230	continue;
				231	}
				232	highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);
				233	highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);
				234
				235	qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
				236	qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);
				237
				238	qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
				239	qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
				240
				241	_mm_store_si128((__m128i *)(qcoeff_ptr + index), qcoeff0);
				242	_mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), qcoeff1);
				243
				244	coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
				245	coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);
				246
				247	_mm_store_si128((__m128i *)(dqcoeff_ptr + index), coeff0);
				248	_mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), coeff1);
				249
				250	index += 8;
				251	}
				252	if (is_found0) non_zero_count = calculate_non_zero_count(mask0);
				253	if (is_found1)
				254	non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1);
				255
				256	for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
				257	const int rc = scan[i];
				258	qcoeff_ptr[rc] = 0;
				259	dqcoeff_ptr[rc] = 0;
				260	}
				261
				262	for (int i = non_zero_count - 1; i >= 0; i--) {
				263	const int rc = scan[i];
				264	if (qcoeff_ptr[rc]) {
				265	eob = i;
				266	break;
				267	}
				268	}
				269
				270	*eob_ptr = eob + 1;
				271	#if SKIP_EOB_FACTOR_ADJUST
				272	// TODO(Aniket): Experiment the following loop with intrinsic by combining
				273	// with the quantization loop above
				274	for (int i = 0; i < non_zero_count; i++) {
				275	const int rc = scan[i];
				276	const int qcoeff = qcoeff_ptr[rc];
				277	if (qcoeff) {
				278	first = i;
				279	break;
				280	}
				281	}
				282	if ((eob_ptr - 1) >= 0 && first == (eob_ptr - 1)) {
				283	const int rc = scan[(*eob_ptr - 1)];
				284	if (qcoeff_ptr[rc] == 1 \|\| qcoeff_ptr[rc] == -1) {
				285	const int coeff = coeff_ptr[rc] * wt;
Yaowu Xu	9db5b8d	2020-04-10 08:58:15 -0700	[diff] [blame]	286	const int coeff_sign = AOMSIGN(coeff);
Remya	7a3af0c	2019-06-04 11:51:32 +0530	[diff] [blame]	287	const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
				288	const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
				289	const int prescan_add_val =
				290	ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
				291	if (abs_coeff <
				292	(zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
				293	qcoeff_ptr[rc] = 0;
				294	dqcoeff_ptr[rc] = 0;
				295	*eob_ptr = 0;
				296	}
				297	}
				298	}
				299	#endif
				300	}
				301
Remya	014f91a	2019-06-04 11:10:10 +0530	[diff] [blame]	302	void aom_highbd_quantize_b_32x32_adaptive_sse2(
				303	const tran_low_t coeff_ptr, intptr_t n_coeffs, const int16_t zbin_ptr,
				304	const int16_t round_ptr, const int16_t quant_ptr,
				305	const int16_t quant_shift_ptr, tran_low_t qcoeff_ptr,
				306	tran_low_t dqcoeff_ptr, const int16_t dequant_ptr, uint16_t *eob_ptr,
				307	const int16_t scan, const int16_t iscan) {
				308	int index = 8;
				309	const int log_scale = 1;
				310	int non_zero_count = 0;
				311	int non_zero_count_prescan_add_zero = 0;
				312	int is_found0 = 0, is_found1 = 0;
				313	int eob = -1;
				314	const __m128i zero = _mm_setzero_si128();
				315	const __m128i one = _mm_set1_epi32(1);
				316	const __m128i log_scale_vec = _mm_set1_epi32(log_scale);
				317	__m128i zbin, round, quant, dequant, shift;
				318	__m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
				319	__m128i qcoeff0, qcoeff1;
				320	__m128i cmp_mask0, cmp_mask1, cmp_mask;
				321	__m128i all_zero;
				322	__m128i mask0 = zero, mask1 = zero;
				323
				324	const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
				325	ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
				326	int prescan_add[2];
				327	int thresh[4];
				328	const qm_val_t wt = (1 << AOM_QM_BITS);
				329	for (int i = 0; i < 2; ++i) {
				330	prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
				331	thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1;
				332	}
				333	thresh[2] = thresh[3] = thresh[1];
				334	__m128i threshold[2];
				335	threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);
				336	threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);
				337
				338	#if SKIP_EOB_FACTOR_ADJUST
				339	int first = -1;
				340	#endif
				341	// Setup global values.
				342	zbin = _mm_load_si128((const __m128i *)zbin_ptr);
				343	round = _mm_load_si128((const __m128i *)round_ptr);
				344	quant = _mm_load_si128((const __m128i *)quant_ptr);
				345	dequant = _mm_load_si128((const __m128i *)dequant_ptr);
				346	shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
				347
				348	__m128i zbin_sign = _mm_srai_epi16(zbin, 15);
				349	__m128i round_sign = _mm_srai_epi16(round, 15);
				350	__m128i quant_sign = _mm_srai_epi16(quant, 15);
				351	__m128i dequant_sign = _mm_srai_epi16(dequant, 15);
				352	__m128i shift_sign = _mm_srai_epi16(shift, 15);
				353
				354	zbin = _mm_unpacklo_epi16(zbin, zbin_sign);
				355	round = _mm_unpacklo_epi16(round, round_sign);
				356	quant = _mm_unpacklo_epi16(quant, quant_sign);
				357	dequant = _mm_unpacklo_epi16(dequant, dequant_sign);
				358	shift = _mm_unpacklo_epi16(shift, shift_sign);
				359
				360	// Shift with rounding.
				361	zbin = _mm_add_epi32(zbin, log_scale_vec);
				362	round = _mm_add_epi32(round, log_scale_vec);
				363	zbin = _mm_srli_epi32(zbin, log_scale);
				364	round = _mm_srli_epi32(round, log_scale);
				365	zbin = _mm_sub_epi32(zbin, one);
				366
				367	// Do DC and first 15 AC.
				368	coeff0 = _mm_load_si128((__m128i *)(coeff_ptr));
				369	coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + 4));
				370
				371	coeff0_sign = _mm_srai_epi32(coeff0, 31);
				372	coeff1_sign = _mm_srai_epi32(coeff1, 31);
				373	qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);
				374	qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);
				375
				376	highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);
				377
				378	cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
				379	zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
				380	cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
				381	cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
				382	highbd_update_mask1(&cmp_mask, iscan, &is_found1, &mask1);
				383
				384	threshold[0] = threshold[1];
				385	all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
				386	if (_mm_movemask_epi8(all_zero) == 0) {
				387	_mm_store_si128((__m128i *)(qcoeff_ptr), zero);
				388	_mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
				389	_mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
				390	_mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
				391
				392	round = _mm_unpackhi_epi64(round, round);
				393	quant = _mm_unpackhi_epi64(quant, quant);
				394	shift = _mm_unpackhi_epi64(shift, shift);
				395	dequant = _mm_unpackhi_epi64(dequant, dequant);
				396	} else {
				397	highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);
				398
				399	round = _mm_unpackhi_epi64(round, round);
				400	quant = _mm_unpackhi_epi64(quant, quant);
				401	shift = _mm_unpackhi_epi64(shift, shift);
				402	highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);
				403
				404	// Reinsert signs
				405	qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
				406	qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);
				407
				408	// Mask out zbin threshold coeffs
				409	qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
				410	qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
				411
				412	_mm_store_si128((__m128i *)(qcoeff_ptr), qcoeff0);
				413	_mm_store_si128((__m128i *)(qcoeff_ptr + 4), qcoeff1);
				414
				415	coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
				416	dequant = _mm_unpackhi_epi64(dequant, dequant);
				417	coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);
				418	_mm_store_si128((__m128i *)(dqcoeff_ptr), coeff0);
				419	_mm_store_si128((__m128i *)(dqcoeff_ptr + 4), coeff1);
				420	}
				421
				422	// AC only loop.
				423	while (index < n_coeffs) {
				424	coeff0 = _mm_load_si128((__m128i *)(coeff_ptr + index));
				425	coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + index + 4));
				426
				427	coeff0_sign = _mm_srai_epi32(coeff0, 31);
				428	coeff1_sign = _mm_srai_epi32(coeff1, 31);
				429	qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);
				430	qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);
				431
				432	highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index,
				433	&is_found0, &mask0);
				434
				435	cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
				436	cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
				437	cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
				438	highbd_update_mask1(&cmp_mask, iscan + index, &is_found1, &mask1);
				439
				440	all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
				441	if (_mm_movemask_epi8(all_zero) == 0) {
				442	_mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
				443	_mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
				444	_mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
				445	_mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
				446	index += 8;
				447	continue;
				448	}
				449	highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);
				450	highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);
				451
				452	qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
				453	qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);
				454
				455	qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
				456	qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
				457
				458	_mm_store_si128((__m128i *)(qcoeff_ptr + index), qcoeff0);
				459	_mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), qcoeff1);
				460
				461	coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
				462	coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);
				463
				464	_mm_store_si128((__m128i *)(dqcoeff_ptr + index), coeff0);
				465	_mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), coeff1);
				466
				467	index += 8;
				468	}
				469	if (is_found0) non_zero_count = calculate_non_zero_count(mask0);
				470	if (is_found1)
				471	non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1);
				472
				473	for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
				474	const int rc = scan[i];
				475	qcoeff_ptr[rc] = 0;
				476	dqcoeff_ptr[rc] = 0;
				477	}
				478
				479	for (int i = non_zero_count - 1; i >= 0; i--) {
				480	const int rc = scan[i];
				481	if (qcoeff_ptr[rc]) {
				482	eob = i;
				483	break;
				484	}
				485	}
				486
				487	*eob_ptr = eob + 1;
				488	#if SKIP_EOB_FACTOR_ADJUST
				489	// TODO(Aniket): Experiment the following loop with intrinsic by combining
				490	// with the quantization loop above
				491	for (int i = 0; i < non_zero_count; i++) {
				492	const int rc = scan[i];
				493	const int qcoeff = qcoeff_ptr[rc];
				494	if (qcoeff) {
				495	first = i;
				496	break;
				497	}
				498	}
				499	if ((eob_ptr - 1) >= 0 && first == (eob_ptr - 1)) {
				500	const int rc = scan[(*eob_ptr - 1)];
				501	if (qcoeff_ptr[rc] == 1 \|\| qcoeff_ptr[rc] == -1) {
				502	const int coeff = coeff_ptr[rc] * wt;
Yaowu Xu	9db5b8d	2020-04-10 08:58:15 -0700	[diff] [blame]	503	const int coeff_sign = AOMSIGN(coeff);
Remya	014f91a	2019-06-04 11:10:10 +0530	[diff] [blame]	504	const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
				505	const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
				506	const int prescan_add_val =
				507	ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
				508	if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
				509	qcoeff_ptr[rc] = 0;
				510	dqcoeff_ptr[rc] = 0;
				511	*eob_ptr = 0;
				512	}
				513	}
				514	}
				515	#endif
				516	}
				517
Remya	d9f3528	2019-06-03 10:52:34 +0530	[diff] [blame]	518	void aom_highbd_quantize_b_64x64_adaptive_sse2(
				519	const tran_low_t coeff_ptr, intptr_t n_coeffs, const int16_t zbin_ptr,
				520	const int16_t round_ptr, const int16_t quant_ptr,
				521	const int16_t quant_shift_ptr, tran_low_t qcoeff_ptr,
				522	tran_low_t dqcoeff_ptr, const int16_t dequant_ptr, uint16_t *eob_ptr,
				523	const int16_t scan, const int16_t iscan) {
				524	int index = 8;
				525	const int log_scale = 2;
				526	int non_zero_count = 0;
				527	int non_zero_count_prescan_add_zero = 0;
				528	int is_found0 = 0, is_found1 = 0;
				529	int eob = -1;
				530	const __m128i zero = _mm_setzero_si128();
				531	const __m128i one = _mm_set1_epi32(1);
				532	const __m128i log_scale_vec = _mm_set1_epi32(log_scale);
				533	__m128i zbin, round, quant, dequant, shift;
				534	__m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
				535	__m128i qcoeff0, qcoeff1;
				536	__m128i cmp_mask0, cmp_mask1, cmp_mask;
				537	__m128i all_zero;
				538	__m128i mask0 = zero, mask1 = zero;
				539
				540	const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
				541	ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
				542	int prescan_add[2];
				543	int thresh[4];
				544	const qm_val_t wt = (1 << AOM_QM_BITS);
				545	for (int i = 0; i < 2; ++i) {
				546	prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
				547	thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1;
				548	}
				549	thresh[2] = thresh[3] = thresh[1];
				550	__m128i threshold[2];
				551	threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);
				552	threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);
				553
				554	#if SKIP_EOB_FACTOR_ADJUST
				555	int first = -1;
				556	#endif
				557	// Setup global values.
				558	zbin = _mm_load_si128((const __m128i *)zbin_ptr);
				559	round = _mm_load_si128((const __m128i *)round_ptr);
				560	quant = _mm_load_si128((const __m128i *)quant_ptr);
				561	dequant = _mm_load_si128((const __m128i *)dequant_ptr);
				562	shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
				563
				564	__m128i zbin_sign = _mm_srai_epi16(zbin, 15);
				565	__m128i round_sign = _mm_srai_epi16(round, 15);
				566	__m128i quant_sign = _mm_srai_epi16(quant, 15);
				567	__m128i dequant_sign = _mm_srai_epi16(dequant, 15);
				568	__m128i shift_sign = _mm_srai_epi16(shift, 15);
				569
				570	zbin = _mm_unpacklo_epi16(zbin, zbin_sign);
				571	round = _mm_unpacklo_epi16(round, round_sign);
				572	quant = _mm_unpacklo_epi16(quant, quant_sign);
				573	dequant = _mm_unpacklo_epi16(dequant, dequant_sign);
				574	shift = _mm_unpacklo_epi16(shift, shift_sign);
				575
				576	// Shift with rounding.
				577	zbin = _mm_add_epi32(zbin, log_scale_vec);
				578	round = _mm_add_epi32(round, log_scale_vec);
				579	zbin = _mm_srli_epi32(zbin, log_scale);
				580	round = _mm_srli_epi32(round, log_scale);
				581	zbin = _mm_sub_epi32(zbin, one);
				582
				583	// Do DC and first 15 AC.
				584	coeff0 = _mm_load_si128((__m128i *)(coeff_ptr));
				585	coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + 4));
				586
				587	coeff0_sign = _mm_srai_epi32(coeff0, 31);
				588	coeff1_sign = _mm_srai_epi32(coeff1, 31);
				589	qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);
				590	qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);
				591
				592	highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);
				593
				594	cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
				595	zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
				596	cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
				597	cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
				598	highbd_update_mask1(&cmp_mask, iscan, &is_found1, &mask1);
				599
				600	threshold[0] = threshold[1];
				601	all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
				602	if (_mm_movemask_epi8(all_zero) == 0) {
				603	_mm_store_si128((__m128i *)(qcoeff_ptr), zero);
				604	_mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
				605	_mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
				606	_mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
				607
				608	round = _mm_unpackhi_epi64(round, round);
				609	quant = _mm_unpackhi_epi64(quant, quant);
				610	shift = _mm_unpackhi_epi64(shift, shift);
				611	dequant = _mm_unpackhi_epi64(dequant, dequant);
				612	} else {
				613	highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);
				614
				615	round = _mm_unpackhi_epi64(round, round);
				616	quant = _mm_unpackhi_epi64(quant, quant);
				617	shift = _mm_unpackhi_epi64(shift, shift);
				618	highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);
				619
				620	// Reinsert signs
				621	qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
				622	qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);
				623
				624	// Mask out zbin threshold coeffs
				625	qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
				626	qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
				627
				628	_mm_store_si128((__m128i *)(qcoeff_ptr), qcoeff0);
				629	_mm_store_si128((__m128i *)(qcoeff_ptr + 4), qcoeff1);
				630
				631	coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
				632	dequant = _mm_unpackhi_epi64(dequant, dequant);
				633	coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);
				634	_mm_store_si128((__m128i *)(dqcoeff_ptr), coeff0);
				635	_mm_store_si128((__m128i *)(dqcoeff_ptr + 4), coeff1);
				636	}
				637
				638	// AC only loop.
				639	while (index < n_coeffs) {
				640	coeff0 = _mm_load_si128((__m128i *)(coeff_ptr + index));
				641	coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + index + 4));
				642
				643	coeff0_sign = _mm_srai_epi32(coeff0, 31);
				644	coeff1_sign = _mm_srai_epi32(coeff1, 31);
				645	qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);
				646	qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);
				647
				648	highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index,
				649	&is_found0, &mask0);
				650
				651	cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
				652	cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
				653	cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
				654	highbd_update_mask1(&cmp_mask, iscan + index, &is_found1, &mask1);
				655
				656	all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
				657	if (_mm_movemask_epi8(all_zero) == 0) {
				658	_mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
				659	_mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
				660	_mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
				661	_mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
				662	index += 8;
				663	continue;
				664	}
				665	highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);
				666	highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);
				667
				668	qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
				669	qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);
				670
				671	qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
				672	qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
				673
				674	_mm_store_si128((__m128i *)(qcoeff_ptr + index), qcoeff0);
				675	_mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), qcoeff1);
				676
				677	coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
				678	coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);
				679
				680	_mm_store_si128((__m128i *)(dqcoeff_ptr + index), coeff0);
				681	_mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), coeff1);
				682
				683	index += 8;
				684	}
				685	if (is_found0) non_zero_count = calculate_non_zero_count(mask0);
				686	if (is_found1)
				687	non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1);
				688
				689	for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
				690	const int rc = scan[i];
				691	qcoeff_ptr[rc] = 0;
				692	dqcoeff_ptr[rc] = 0;
				693	}
				694
				695	for (int i = non_zero_count - 1; i >= 0; i--) {
				696	const int rc = scan[i];
				697	if (qcoeff_ptr[rc]) {
				698	eob = i;
				699	break;
				700	}
				701	}
				702
				703	*eob_ptr = eob + 1;
				704	#if SKIP_EOB_FACTOR_ADJUST
				705	// TODO(Aniket): Experiment the following loop with intrinsic by combining
				706	// with the quantization loop above
				707	for (int i = 0; i < non_zero_count; i++) {
				708	const int rc = scan[i];
				709	const int qcoeff = qcoeff_ptr[rc];
				710	if (qcoeff) {
				711	first = i;
				712	break;
				713	}
				714	}
				715	if ((eob_ptr - 1) >= 0 && first == (eob_ptr - 1)) {
				716	const int rc = scan[(*eob_ptr - 1)];
				717	if (qcoeff_ptr[rc] == 1 \|\| qcoeff_ptr[rc] == -1) {
				718	const int coeff = coeff_ptr[rc] * wt;
Yaowu Xu	9db5b8d	2020-04-10 08:58:15 -0700	[diff] [blame]	719	const int coeff_sign = AOMSIGN(coeff);
Remya	d9f3528	2019-06-03 10:52:34 +0530	[diff] [blame]	720	const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
				721	const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
				722	const int prescan_add_val =
				723	ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
				724	if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
				725	qcoeff_ptr[rc] = 0;
				726	dqcoeff_ptr[rc] = 0;
				727	*eob_ptr = 0;
				728	}
				729	}
				730	}
				731	#endif
				732	}