/*
 * Copyright (c) 2019, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <immintrin.h>

#include "config/aom_dsp_rtcd.h"

#include "aom/aom_integer.h"
#include "aom_dsp/quantize.h"
#include "aom_dsp/x86/quantize_x86.h"

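// Loads the first 8 entries of each 16-bit quantization table and
// sign-extends them to 32 bits. zbin is decremented by 1 up front so that
// the strict _mm256_cmpgt_epi32() used later behaves as "greater than or
// equal to zbin".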
static INLINE void highbd_load_b_values_avx2(
    const int16_t *zbin_ptr, __m256i *zbin, const int16_t *round_ptr,
    __m256i *round, const int16_t *quant_ptr, __m256i *quant,
    const int16_t *dequant_ptr, __m256i *dequant, const int16_t *shift_ptr,
    __m256i *shift) {
  *zbin = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)zbin_ptr));
  *zbin = _mm256_sub_epi32(*zbin, _mm256_set1_epi32(1));
  *round = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)round_ptr));
  *quant = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)quant_ptr));
  *dequant =
      _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)dequant_ptr));
  *shift = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)shift_ptr));
}

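// Folds one comparison result into the running end-of-block mask. cmp_mask
// holds all-ones in the 16-bit lanes of coefficients that passed the
// comparison; ANDing it with iscan keeps their scan indices, and the
// element-wise max accumulates the largest passing index in *mask.
// *is_found records that at least one coefficient passed.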
static INLINE void highbd_update_mask1_avx2(__m256i *cmp_mask,
                                            const int16_t *iscan_ptr,
                                            int *is_found, __m256i *mask) {
  __m256i temp_mask = _mm256_setzero_si256();
  if (_mm256_movemask_epi8(*cmp_mask)) {
    __m256i iscan = _mm256_loadu_si256((const __m256i *)(iscan_ptr));
    temp_mask = _mm256_and_si256(*cmp_mask, iscan);
    *is_found = 1;
  }
  *mask = _mm256_max_epi16(temp_mask, *mask);
}

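// Prescan check: compares |coeff| * (1 << AOM_QM_BITS) for 16 coefficients
// against the per-position prescan thresholds, packs the two 32-bit masks
// down to 16 bits in scan order, and accumulates the result via
// highbd_update_mask1_avx2().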
static INLINE void highbd_update_mask0_avx2(__m256i *qcoeff0, __m256i *qcoeff1,
                                            __m256i *threshold,
                                            const int16_t *iscan_ptr,
                                            int *is_found, __m256i *mask) {
  __m256i coeff[2], cmp_mask0, cmp_mask1;
  coeff[0] = _mm256_slli_epi32(*qcoeff0, AOM_QM_BITS);
  cmp_mask0 = _mm256_cmpgt_epi32(coeff[0], threshold[0]);
  coeff[1] = _mm256_slli_epi32(*qcoeff1, AOM_QM_BITS);
  cmp_mask1 = _mm256_cmpgt_epi32(coeff[1], threshold[1]);
  cmp_mask0 =
      _mm256_permute4x64_epi64(_mm256_packs_epi32(cmp_mask0, cmp_mask1), 0xd8);
  highbd_update_mask1_avx2(&cmp_mask0, iscan_ptr, is_found, mask);
}

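// Per-lane 32x32 -> 64-bit multiply followed by a right shift, keeping the
// low 32 bits of each shifted product. Even and odd lanes are multiplied
// separately with _mm256_mul_epi32() and re-interleaved with a blend. A
// scalar sketch of one lane, for reference:
//   p[i] = (int32_t)(((int64_t)x[i] * y[i]) >> shift);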
static INLINE void highbd_mul_shift_avx2(const __m256i *x, const __m256i *y,
                                         __m256i *p, const int shift) {
  __m256i prod_lo = _mm256_mul_epi32(*x, *y);
  __m256i prod_hi = _mm256_srli_epi64(*x, 32);
  const __m256i mult_hi = _mm256_srli_epi64(*y, 32);
  prod_hi = _mm256_mul_epi32(prod_hi, mult_hi);

  prod_lo = _mm256_srli_epi64(prod_lo, shift);
  prod_hi = _mm256_srli_epi64(prod_hi, shift);

  prod_hi = _mm256_slli_epi64(prod_hi, 32);
  *p = _mm256_blend_epi32(prod_lo, prod_hi, 0xaa);
}

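// Quantizes a vector of absolute coefficient values in place. A scalar
// sketch of one lane, for reference:
//   tmp0 = abs_coeff + round;
//   tmp1 = (int32_t)(((int64_t)tmp0 * quant) >> 16) + tmp0;
//   qcoeff = (int32_t)(((int64_t)tmp1 * shift) >> (16 - log_scale));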
static INLINE void highbd_calculate_qcoeff_avx2(__m256i *coeff,
                                                const __m256i *round,
                                                const __m256i *quant,
                                                const __m256i *shift,
                                                const int *log_scale) {
  __m256i tmp, qcoeff;
  qcoeff = _mm256_add_epi32(*coeff, *round);
  highbd_mul_shift_avx2(&qcoeff, quant, &tmp, 16);
  qcoeff = _mm256_add_epi32(tmp, qcoeff);
  highbd_mul_shift_avx2(&qcoeff, shift, coeff, 16 - *log_scale);
}

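// Dequantization for log_scale == 0: dqcoeff = qcoeff * dequant, keeping
// the low 32 bits of each product.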
static INLINE __m256i highbd_calculate_dqcoeff_avx2(__m256i qcoeff,
                                                    __m256i dequant) {
  return _mm256_mullo_epi32(qcoeff, dequant);
}

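// Dequantization for log_scale != 0 (the 32x32 path): the magnitude is
// computed as |qcoeff| * dequant >> log_scale and the sign of qcoeff is
// then reapplied.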
static INLINE __m256i highbd_calculate_dqcoeff_log_scale_avx2(
    __m256i qcoeff, __m256i dequant, const int log_scale) {
  __m256i abs_coeff = _mm256_abs_epi32(qcoeff);
  highbd_mul_shift_avx2(&abs_coeff, &dequant, &abs_coeff, log_scale);
  return _mm256_sign_epi32(abs_coeff, qcoeff);
}

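// Stores 16 32-bit coefficients (two registers of eight) to coeff_ptr,
// which must be 32-byte aligned for the aligned stores used here.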
static INLINE void highbd_store_coefficients_avx2(__m256i coeff0,
                                                  __m256i coeff1,
                                                  tran_low_t *coeff_ptr) {
  _mm256_store_si256((__m256i *)(coeff_ptr), coeff0);
  _mm256_store_si256((__m256i *)(coeff_ptr + 8), coeff1);
}

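// High bit-depth adaptive quantize_b, processing 16 coefficients per
// iteration. Two running masks are kept: mask0 marks coefficients above the
// larger prescan threshold (zbin * wt + prescan_add) and mask1 marks those
// above zbin. After the main loop, quantized coefficients that lie beyond
// the last prescan hit are zeroed, and the eob is derived from what
// remains.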
void aom_highbd_quantize_b_adaptive_avx2(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan) {
  int index = 16;
  int non_zero_count = 0;
  int non_zero_count_prescan_add_zero = 0;
  int is_found0 = 0, is_found1 = 0;
  int eob = -1;
  const __m256i zero = _mm256_setzero_si256();
  __m256i zbin, round, quant, dequant, shift;
  __m256i coeff0, qcoeff0, coeff1, qcoeff1;
  __m256i cmp_mask, mask0 = zero, mask1 = zero;
  __m128i temp_mask0, temp_mask1;
  int prescan_add[2];
  int thresh[2];
  const int log_scale = 0;
  const qm_val_t wt = (1 << AOM_QM_BITS);
  for (int i = 0; i < 2; ++i) {
    prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
    thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1;
  }
  __m256i threshold[2];
  threshold[0] = _mm256_set1_epi32(thresh[0]);
  threshold[1] = _mm256_set1_epi32(thresh[1]);
  threshold[0] = _mm256_blend_epi32(threshold[0], threshold[1], 0xfe);

#if SKIP_EOB_FACTOR_ADJUST
  int first = -1;
#endif

  // Set up global values.
  highbd_load_b_values_avx2(zbin_ptr, &zbin, round_ptr, &round, quant_ptr,
                            &quant, dequant_ptr, &dequant, quant_shift_ptr,
                            &shift);

  // Do DC and first 15 AC.
  coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr));
  qcoeff0 = _mm256_abs_epi32(coeff0);
  coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + 8));
  qcoeff1 = _mm256_abs_epi32(coeff1);
  highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0,
                           &mask0);
  __m256i temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin);
  zbin = _mm256_unpackhi_epi64(zbin, zbin);
  __m256i temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin);
  cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8);
  highbd_update_mask1_avx2(&cmp_mask, iscan, &is_found1, &mask1);
  threshold[0] = threshold[1];
  if (_mm256_movemask_epi8(cmp_mask) == 0) {
    _mm256_store_si256((__m256i *)(qcoeff_ptr), zero);
    _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), zero);
    _mm256_store_si256((__m256i *)(dqcoeff_ptr), zero);
    _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), zero);
    round = _mm256_unpackhi_epi64(round, round);
    quant = _mm256_unpackhi_epi64(quant, quant);
    shift = _mm256_unpackhi_epi64(shift, shift);
    dequant = _mm256_unpackhi_epi64(dequant, dequant);
  } else {
    highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale);
    round = _mm256_unpackhi_epi64(round, round);
    quant = _mm256_unpackhi_epi64(quant, quant);
    shift = _mm256_unpackhi_epi64(shift, shift);
    highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale);
    // Reinsert signs.
    qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0);
    qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1);
    // Zero out coefficients that did not exceed the zbin threshold.
    qcoeff0 = _mm256_and_si256(qcoeff0, temp0);
    qcoeff1 = _mm256_and_si256(qcoeff1, temp1);
    highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr);
    coeff0 = highbd_calculate_dqcoeff_avx2(qcoeff0, dequant);
    dequant = _mm256_unpackhi_epi64(dequant, dequant);
    coeff1 = highbd_calculate_dqcoeff_avx2(qcoeff1, dequant);
    highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr);
  }

  // AC only loop.
  while (index < n_coeffs) {
    coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr + index));
    qcoeff0 = _mm256_abs_epi32(coeff0);
    coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + index + 8));
    qcoeff1 = _mm256_abs_epi32(coeff1);
    highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan + index,
                             &is_found0, &mask0);
    temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin);
    temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin);
    cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8);
    highbd_update_mask1_avx2(&cmp_mask, iscan + index, &is_found1, &mask1);
    if (_mm256_movemask_epi8(cmp_mask) == 0) {
      _mm256_store_si256((__m256i *)(qcoeff_ptr + index), zero);
      _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), zero);
      _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), zero);
      _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), zero);
      index += 16;
      continue;
    }
    highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale);
    highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale);
    qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0);
    qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1);
    qcoeff0 = _mm256_and_si256(qcoeff0, temp0);
    qcoeff1 = _mm256_and_si256(qcoeff1, temp1);
    highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr + index);
    coeff0 = highbd_calculate_dqcoeff_avx2(qcoeff0, dequant);
    coeff1 = highbd_calculate_dqcoeff_avx2(qcoeff1, dequant);
    highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr + index);
    index += 16;
  }
  if (is_found0) {
    temp_mask0 = _mm_max_epi16(_mm256_castsi256_si128(mask0),
                               _mm256_extracti128_si256(mask0, 1));
    non_zero_count = calculate_non_zero_count(temp_mask0);
  }
  if (is_found1) {
    temp_mask1 = _mm_max_epi16(_mm256_castsi256_si128(mask1),
                               _mm256_extracti128_si256(mask1, 1));
    non_zero_count_prescan_add_zero = calculate_non_zero_count(temp_mask1);
  }

  for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
    const int rc = scan[i];
    qcoeff_ptr[rc] = 0;
    dqcoeff_ptr[rc] = 0;
  }

  for (int i = non_zero_count - 1; i >= 0; i--) {
    const int rc = scan[i];
    if (qcoeff_ptr[rc]) {
      eob = i;
      break;
    }
  }

  *eob_ptr = eob + 1;
#if SKIP_EOB_FACTOR_ADJUST
  // TODO(Aniket): Experiment with combining the following loop with the
  // quantization loop above using intrinsics.
  for (int i = 0; i < non_zero_count; i++) {
    const int rc = scan[i];
    const int qcoeff = qcoeff_ptr[rc];
    if (qcoeff) {
      first = i;
      break;
    }
  }
  if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
    const int rc = scan[(*eob_ptr - 1)];
    if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
      const int coeff = coeff_ptr[rc] * wt;
      const int coeff_sign = AOMSIGN(coeff);
      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
      const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
      const int prescan_add_val =
          ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
      if (abs_coeff <
          (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
        qcoeff_ptr[rc] = 0;
        dqcoeff_ptr[rc] = 0;
        *eob_ptr = 0;
      }
    }
  }
#endif
}

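// 32x32 variant of aom_highbd_quantize_b_adaptive_avx2: log_scale is 1, so
// zbin and round are halved with rounding before use, and dequantized
// values are computed as sign(qcoeff) * (|qcoeff| * dequant >> 1).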
void aom_highbd_quantize_b_32x32_adaptive_avx2(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan) {
  int index = 16;
  int non_zero_count = 0;
  int non_zero_count_prescan_add_zero = 0;
  int is_found0 = 0, is_found1 = 0;
  int eob = -1;
  const int log_scale = 1;
  const __m256i zero = _mm256_setzero_si256();
  __m256i zbin, round, quant, dequant, shift;
  __m256i coeff0, qcoeff0, coeff1, qcoeff1;
  __m256i cmp_mask, mask0 = zero, mask1 = zero;
  __m128i temp_mask0, temp_mask1;
  const __m256i one = _mm256_set1_epi32(1);
  const __m256i log_scale_vec = _mm256_set1_epi32(log_scale);
  int prescan_add[2];
  int thresh[2];
  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
  const qm_val_t wt = (1 << AOM_QM_BITS);
  for (int i = 0; i < 2; ++i) {
    prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
    thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1;
  }
  __m256i threshold[2];
  threshold[0] = _mm256_set1_epi32(thresh[0]);
  threshold[1] = _mm256_set1_epi32(thresh[1]);
  threshold[0] = _mm256_blend_epi32(threshold[0], threshold[1], 0xfe);

#if SKIP_EOB_FACTOR_ADJUST
  int first = -1;
#endif

  // Set up global values.
  zbin = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)zbin_ptr));
  round = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)round_ptr));
  quant = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)quant_ptr));
  dequant =
      _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)dequant_ptr));
  shift =
      _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)quant_shift_ptr));

  // Shift with rounding.
  zbin = _mm256_add_epi32(zbin, log_scale_vec);
  round = _mm256_add_epi32(round, log_scale_vec);
  zbin = _mm256_srli_epi32(zbin, log_scale);
  round = _mm256_srli_epi32(round, log_scale);
  zbin = _mm256_sub_epi32(zbin, one);

  // Do DC and first 15 AC.
  coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr));
  qcoeff0 = _mm256_abs_epi32(coeff0);
  coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + 8));
  qcoeff1 = _mm256_abs_epi32(coeff1);
  highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0,
                           &mask0);
  __m256i temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin);
  zbin = _mm256_permute2x128_si256(zbin, zbin, 0x11);
  __m256i temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin);
  cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8);
  highbd_update_mask1_avx2(&cmp_mask, iscan, &is_found1, &mask1);
  threshold[0] = threshold[1];
  if (_mm256_movemask_epi8(cmp_mask) == 0) {
    _mm256_store_si256((__m256i *)(qcoeff_ptr), zero);
    _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), zero);
    _mm256_store_si256((__m256i *)(dqcoeff_ptr), zero);
    _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), zero);
    round = _mm256_permute2x128_si256(round, round, 0x11);
    quant = _mm256_permute2x128_si256(quant, quant, 0x11);
    shift = _mm256_permute2x128_si256(shift, shift, 0x11);
    dequant = _mm256_permute2x128_si256(dequant, dequant, 0x11);
  } else {
    highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale);
    round = _mm256_permute2x128_si256(round, round, 0x11);
    quant = _mm256_permute2x128_si256(quant, quant, 0x11);
    shift = _mm256_permute2x128_si256(shift, shift, 0x11);
    highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale);
    // Reinsert signs.
    qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0);
    qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1);
    // Zero out coefficients that did not exceed the zbin threshold.
    qcoeff0 = _mm256_and_si256(qcoeff0, temp0);
    qcoeff1 = _mm256_and_si256(qcoeff1, temp1);
    highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr);
    coeff0 =
        highbd_calculate_dqcoeff_log_scale_avx2(qcoeff0, dequant, log_scale);
    dequant = _mm256_permute2x128_si256(dequant, dequant, 0x11);
    coeff1 =
        highbd_calculate_dqcoeff_log_scale_avx2(qcoeff1, dequant, log_scale);
    highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr);
  }

  // AC only loop.
  while (index < n_coeffs) {
    coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr + index));
    qcoeff0 = _mm256_abs_epi32(coeff0);
    coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + index + 8));
    qcoeff1 = _mm256_abs_epi32(coeff1);
    highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan + index,
                             &is_found0, &mask0);
    temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin);
    temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin);
    cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8);
    highbd_update_mask1_avx2(&cmp_mask, iscan + index, &is_found1, &mask1);
    if (_mm256_movemask_epi8(cmp_mask) == 0) {
      _mm256_store_si256((__m256i *)(qcoeff_ptr + index), zero);
      _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), zero);
      _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), zero);
      _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), zero);
      index += 16;
      continue;
    }
    highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale);
    highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale);
    qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0);
    qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1);
    qcoeff0 = _mm256_and_si256(qcoeff0, temp0);
    qcoeff1 = _mm256_and_si256(qcoeff1, temp1);
    highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr + index);
    coeff0 =
        highbd_calculate_dqcoeff_log_scale_avx2(qcoeff0, dequant, log_scale);
    coeff1 =
        highbd_calculate_dqcoeff_log_scale_avx2(qcoeff1, dequant, log_scale);
    highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr + index);
    index += 16;
  }
  if (is_found0) {
    temp_mask0 = _mm_max_epi16(_mm256_castsi256_si128(mask0),
                               _mm256_extracti128_si256(mask0, 1));
    non_zero_count = calculate_non_zero_count(temp_mask0);
  }
  if (is_found1) {
    temp_mask1 = _mm_max_epi16(_mm256_castsi256_si128(mask1),
                               _mm256_extracti128_si256(mask1, 1));
    non_zero_count_prescan_add_zero = calculate_non_zero_count(temp_mask1);
  }

  for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
    const int rc = scan[i];
    qcoeff_ptr[rc] = 0;
    dqcoeff_ptr[rc] = 0;
  }

  for (int i = non_zero_count - 1; i >= 0; i--) {
    const int rc = scan[i];
    if (qcoeff_ptr[rc]) {
      eob = i;
      break;
    }
  }

  *eob_ptr = eob + 1;
#if SKIP_EOB_FACTOR_ADJUST
  // TODO(Aniket): Experiment with combining the following loop with the
  // quantization loop above using intrinsics.
  for (int i = 0; i < non_zero_count; i++) {
    const int rc = scan[i];
    const int qcoeff = qcoeff_ptr[rc];
    if (qcoeff) {
      first = i;
      break;
    }
  }
  if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
    const int rc = scan[(*eob_ptr - 1)];
    if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
      const int coeff = coeff_ptr[rc] * wt;
      const int coeff_sign = AOMSIGN(coeff);
      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
      const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
      const int prescan_add_val =
          ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
      if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
        qcoeff_ptr[rc] = 0;
        dqcoeff_ptr[rc] = 0;
        *eob_ptr = 0;
      }
    }
  }
#endif
}