/*
 * Copyright (c) 2019, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11
#include <emmintrin.h>
#include "config/aom_dsp_rtcd.h"

#include "aom/aom_integer.h"
#include "aom_dsp/x86/quantize_x86.h"
#include "av1/encoder/av1_quantize.h"
18
19static INLINE __m128i highbd_invert_sign_64bit_sse2(__m128i a, __m128i sign) {
20 a = _mm_xor_si128(a, sign);
21 return _mm_sub_epi64(a, sign);
22}
23
// Per-lane multiply of four 32-bit values with a 64-bit intermediate:
// *p = (x * y) >> shift for each lane. Only y's sign is handled explicitly;
// x is assumed non-negative (all call sites pass absolute coefficient
// values) — TODO confirm before reusing elsewhere.
//
// NOTE(review): sign_lo/sign_hi below carry the signs of source elements
// {0,1} and {2,3}, while prod_lo/prod_hi hold the products of elements
// {0,2} and {1,3} (a _mm_mul_epu32 property). The pairing mismatch is
// benign here because every visible caller passes a positive y
// (quant/dequant/shift tables), making all sign lanes zero — verify if a
// caller with mixed-sign y is ever added.
static INLINE void highbd_mul_shift_sse2(const __m128i *x, const __m128i *y,
                                         __m128i *p, const int shift) {
  // Extract y's per-lane sign and take |y| so the unsigned 32x32->64
  // multiply (_mm_mul_epu32) can be used; signs are reapplied to the
  // 64-bit products afterwards.
  __m128i sign = _mm_srai_epi32(*y, 31);
  __m128i sign_lo = _mm_unpacklo_epi32(sign, sign);
  __m128i sign_hi = _mm_unpackhi_epi32(sign, sign);
  __m128i abs_y = invert_sign_32_sse2(*y, sign);
  // _mm_mul_epu32 multiplies 32-bit lanes 0 and 2; shifting right by 32
  // moves lanes 1 and 3 into position for the second multiply.
  __m128i prod_lo = _mm_mul_epu32(*x, abs_y);
  __m128i prod_hi = _mm_srli_epi64(*x, 32);
  const __m128i mult_hi = _mm_srli_epi64(abs_y, 32);
  prod_hi = _mm_mul_epu32(prod_hi, mult_hi);
  prod_lo = highbd_invert_sign_64bit_sse2(prod_lo, sign_lo);
  prod_hi = highbd_invert_sign_64bit_sse2(prod_hi, sign_hi);

  // Shift both 64-bit product pairs down, then repack the four 32-bit
  // results: keep bits [31:0] of the even-element products and move the
  // odd-element products back up into 32-bit lanes 1 and 3.
  prod_lo = _mm_srli_epi64(prod_lo, shift);
  const __m128i mask = _mm_set_epi32(0, -1, 0, -1);
  prod_lo = _mm_and_si128(prod_lo, mask);
  prod_hi = _mm_srli_epi64(prod_hi, shift);

  prod_hi = _mm_slli_epi64(prod_hi, 32);
  *p = _mm_or_si128(prod_lo, prod_hi);
}
45
// Quantize four absolute coefficient values (32-bit lanes, in/out through
// *coeff):
//   tmp1 = coeff + round
//   tmp2 = (tmp1 * quant) >> 16
//   *coeff = ((tmp1 + tmp2) * shift) >> (16 - log_scale)
// Callers pass coefficients with the sign already removed and reapply it
// afterwards.
static INLINE void highbd_calculate_qcoeff(__m128i *coeff, const __m128i *round,
                                           const __m128i *quant,
                                           const __m128i *shift,
                                           const int *log_scale) {
  __m128i tmp, qcoeff;
  qcoeff = _mm_add_epi32(*coeff, *round);
  highbd_mul_shift_sse2(&qcoeff, quant, &tmp, 16);
  qcoeff = _mm_add_epi32(tmp, qcoeff);
  highbd_mul_shift_sse2(&qcoeff, shift, coeff, 16 - *log_scale);
}
56
57static INLINE void highbd_update_mask1(__m128i *cmp_mask0,
58 const int16_t *iscan_ptr, int *is_found,
59 __m128i *mask) {
60 __m128i temp_mask = _mm_setzero_si128();
61 if (_mm_movemask_epi8(*cmp_mask0)) {
62 __m128i iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr));
63 __m128i mask0 = _mm_and_si128(*cmp_mask0, iscan0);
64 temp_mask = mask0;
65 *is_found = 1;
66 }
67 *mask = _mm_max_epi16(temp_mask, *mask);
68}
69
70static INLINE void highbd_update_mask0(__m128i *qcoeff0, __m128i *qcoeff1,
71 __m128i *threshold,
72 const int16_t *iscan_ptr, int *is_found,
73 __m128i *mask) {
74 __m128i coeff[2], cmp_mask0, cmp_mask1;
75
76 coeff[0] = _mm_slli_epi32(*qcoeff0, AOM_QM_BITS);
77 cmp_mask0 = _mm_cmpgt_epi32(coeff[0], threshold[0]);
78 coeff[1] = _mm_slli_epi32(*qcoeff1, AOM_QM_BITS);
79 cmp_mask1 = _mm_cmpgt_epi32(coeff[1], threshold[1]);
80
81 cmp_mask0 = _mm_packs_epi32(cmp_mask0, cmp_mask1);
82
83 highbd_update_mask1(&cmp_mask0, iscan_ptr, is_found, mask);
84}
85
86static INLINE __m128i highbd_calculate_dqcoeff(__m128i qcoeff, __m128i dequant,
87 const int log_scale) {
88 __m128i coeff_sign = _mm_srai_epi32(qcoeff, 31);
89 __m128i abs_coeff = invert_sign_32_sse2(qcoeff, coeff_sign);
90 highbd_mul_shift_sse2(&abs_coeff, &dequant, &abs_coeff, log_scale);
91 return invert_sign_32_sse2(abs_coeff, coeff_sign);
92}
93
// Adaptive quantization of a high-bitdepth coefficient block with
// log_scale == 0, processing 8 coefficients (two __m128i of 32-bit values)
// per iteration. Besides the regular zbin test, a stricter "prescan"
// threshold (zbin + a dequant-derived margin) is tracked so trailing
// coefficients that only marginally clear zbin are zeroed out afterwards;
// with SKIP_EOB_FACTOR_ADJUST a lone +/-1 eob coefficient may also be
// dropped. Outputs: qcoeff_ptr, dqcoeff_ptr, *eob_ptr.
void aom_highbd_quantize_b_adaptive_sse2(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan) {
  int index = 8;
  const int log_scale = 0;
  int non_zero_count = 0;
  int non_zero_count_prescan_add_zero = 0;
  int is_found0 = 0, is_found1 = 0;
  int eob = -1;
  const __m128i zero = _mm_setzero_si128();
  const __m128i one = _mm_set1_epi32(1);
  __m128i zbin, round, quant, dequant, shift;
  __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
  __m128i qcoeff0, qcoeff1;
  __m128i cmp_mask0, cmp_mask1, cmp_mask;
  __m128i all_zero;
  // mask0 tracks iscan positions exceeding the prescan threshold, mask1
  // tracks positions exceeding plain zbin.
  __m128i mask0 = zero, mask1 = zero;

  int prescan_add[2];
  int thresh[4];
  const qm_val_t wt = (1 << AOM_QM_BITS);
  // Build the prescan thresholds (index 0 = DC, 1 = AC), pre-scaled by
  // AOM_QM_BITS; the -1 turns the _mm_cmpgt_epi32 test below into ">=".
  for (int i = 0; i < 2; ++i) {
    prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
    thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1;
  }
  thresh[2] = thresh[3] = thresh[1];
  __m128i threshold[2];
  threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);
  threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);

#if SKIP_EOB_FACTOR_ADJUST
  int first = -1;
#endif
  // Setup global values.
  zbin = _mm_load_si128((const __m128i *)zbin_ptr);
  round = _mm_load_si128((const __m128i *)round_ptr);
  quant = _mm_load_si128((const __m128i *)quant_ptr);
  dequant = _mm_load_si128((const __m128i *)dequant_ptr);
  shift = _mm_load_si128((const __m128i *)quant_shift_ptr);

  // Sign-extend the 16-bit quantizer parameters to 32-bit lanes; lane 0
  // holds the DC value and lane 1 the AC value of each parameter.
  __m128i zbin_sign = _mm_srai_epi16(zbin, 15);
  __m128i round_sign = _mm_srai_epi16(round, 15);
  __m128i quant_sign = _mm_srai_epi16(quant, 15);
  __m128i dequant_sign = _mm_srai_epi16(dequant, 15);
  __m128i shift_sign = _mm_srai_epi16(shift, 15);

  zbin = _mm_unpacklo_epi16(zbin, zbin_sign);
  round = _mm_unpacklo_epi16(round, round_sign);
  quant = _mm_unpacklo_epi16(quant, quant_sign);
  dequant = _mm_unpacklo_epi16(dequant, dequant_sign);
  shift = _mm_unpacklo_epi16(shift, shift_sign);
  // -1 so the "greater than" compare below implements ">= zbin".
  zbin = _mm_sub_epi32(zbin, one);

  // Do DC and first 15 AC.
  coeff0 = _mm_load_si128((__m128i *)(coeff_ptr));
  coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + 4));

  // Work on absolute values; the signs are reinserted after quantization.
  coeff0_sign = _mm_srai_epi32(coeff0, 31);
  coeff1_sign = _mm_srai_epi32(coeff1, 31);
  qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);
  qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);

  highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);

  cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
  cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
  cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
  highbd_update_mask1(&cmp_mask, iscan, &is_found1, &mask1);

  // From here on only the AC threshold applies.
  threshold[0] = threshold[1];
  all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
  if (_mm_movemask_epi8(all_zero) == 0) {
    // All 8 coefficients are below zbin: store zeros, but still advance the
    // quantizer parameters from their DC to their AC lanes.
    _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
    _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
    _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
    _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);

    round = _mm_unpackhi_epi64(round, round);
    quant = _mm_unpackhi_epi64(quant, quant);
    shift = _mm_unpackhi_epi64(shift, shift);
    dequant = _mm_unpackhi_epi64(dequant, dequant);
  } else {
    // First group uses DC parameters in lane 0, then the parameters are
    // broadcast to their AC values for everything that follows.
    highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);

    round = _mm_unpackhi_epi64(round, round);
    quant = _mm_unpackhi_epi64(quant, quant);
    shift = _mm_unpackhi_epi64(shift, shift);
    highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);

    // Reinsert signs
    qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
    qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);

    // Mask out zbin threshold coeffs
    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

    _mm_store_si128((__m128i *)(qcoeff_ptr), qcoeff0);
    _mm_store_si128((__m128i *)(qcoeff_ptr + 4), qcoeff1);

    coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
    dequant = _mm_unpackhi_epi64(dequant, dequant);
    coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);
    _mm_store_si128((__m128i *)(dqcoeff_ptr), coeff0);
    _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), coeff1);
  }

  // AC only loop.
  while (index < n_coeffs) {
    coeff0 = _mm_load_si128((__m128i *)(coeff_ptr + index));
    coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + index + 4));

    coeff0_sign = _mm_srai_epi32(coeff0, 31);
    coeff1_sign = _mm_srai_epi32(coeff1, 31);
    qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);
    qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);

    highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index,
                        &is_found0, &mask0);

    cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
    cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
    cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
    highbd_update_mask1(&cmp_mask, iscan + index, &is_found1, &mask1);

    // Skip the arithmetic entirely when the whole group is below zbin.
    all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
    if (_mm_movemask_epi8(all_zero) == 0) {
      _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
      _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
      index += 8;
      continue;
    }
    highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);
    highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);

    qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
    qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);

    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

    _mm_store_si128((__m128i *)(qcoeff_ptr + index), qcoeff0);
    _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), qcoeff1);

    coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
    coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);

    _mm_store_si128((__m128i *)(dqcoeff_ptr + index), coeff0);
    _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), coeff1);

    index += 8;
  }
  // Convert the tracked iscan masks into coefficient counts (helper from
  // quantize_x86.h).
  if (is_found0) non_zero_count = calculate_non_zero_count(mask0);
  if (is_found1)
    non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1);

  // Zero out trailing coefficients that cleared zbin but not the prescan
  // threshold.
  for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
    const int rc = scan[i];
    qcoeff_ptr[rc] = 0;
    dqcoeff_ptr[rc] = 0;
  }

  // Locate the last surviving non-zero coefficient in scan order.
  for (int i = non_zero_count - 1; i >= 0; i--) {
    const int rc = scan[i];
    if (qcoeff_ptr[rc]) {
      eob = i;
      break;
    }
  }

  *eob_ptr = eob + 1;
#if SKIP_EOB_FACTOR_ADJUST
  // TODO(Aniket): Experiment the following loop with intrinsic by combining
  // with the quantization loop above
  // If the first and last surviving coefficients coincide and the value
  // quantized to +/-1, re-test it against a stricter threshold and, if it
  // fails, drop the block entirely (eob = 0).
  for (int i = 0; i < non_zero_count; i++) {
    const int rc = scan[i];
    const int qcoeff = qcoeff_ptr[rc];
    if (qcoeff) {
      first = i;
      break;
    }
  }
  if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
    const int rc = scan[(*eob_ptr - 1)];
    if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
      const int coeff = coeff_ptr[rc] * wt;
      const int coeff_sign = AOMSIGN(coeff);
      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
      const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
      const int prescan_add_val =
          ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
      if (abs_coeff <
          (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
        qcoeff_ptr[rc] = 0;
        dqcoeff_ptr[rc] = 0;
        *eob_ptr = 0;
      }
    }
  }
#endif
}
301
// 32x32 variant of aom_highbd_quantize_b_adaptive_sse2 (log_scale == 1):
// identical structure, but zbin and round are halved with rounding before
// use and the quantization/dequantization shifts are adjusted by log_scale.
// See the log_scale == 0 variant for detailed flow.
void aom_highbd_quantize_b_32x32_adaptive_sse2(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan) {
  int index = 8;
  const int log_scale = 1;
  int non_zero_count = 0;
  int non_zero_count_prescan_add_zero = 0;
  int is_found0 = 0, is_found1 = 0;
  int eob = -1;
  const __m128i zero = _mm_setzero_si128();
  const __m128i one = _mm_set1_epi32(1);
  const __m128i log_scale_vec = _mm_set1_epi32(log_scale);
  __m128i zbin, round, quant, dequant, shift;
  __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
  __m128i qcoeff0, qcoeff1;
  __m128i cmp_mask0, cmp_mask1, cmp_mask;
  __m128i all_zero;
  // mask0 tracks iscan positions exceeding the prescan threshold, mask1
  // tracks positions exceeding plain zbin.
  __m128i mask0 = zero, mask1 = zero;

  // Scalar zbins (DC, AC), pre-scaled down by log_scale for the prescan
  // thresholds and the SKIP_EOB_FACTOR_ADJUST re-test below.
  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
  int prescan_add[2];
  int thresh[4];
  const qm_val_t wt = (1 << AOM_QM_BITS);
  // Prescan thresholds, pre-scaled by AOM_QM_BITS; -1 turns the
  // _mm_cmpgt_epi32 test into ">=".
  for (int i = 0; i < 2; ++i) {
    prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
    thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1;
  }
  thresh[2] = thresh[3] = thresh[1];
  __m128i threshold[2];
  threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);
  threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);

#if SKIP_EOB_FACTOR_ADJUST
  int first = -1;
#endif
  // Setup global values.
  zbin = _mm_load_si128((const __m128i *)zbin_ptr);
  round = _mm_load_si128((const __m128i *)round_ptr);
  quant = _mm_load_si128((const __m128i *)quant_ptr);
  dequant = _mm_load_si128((const __m128i *)dequant_ptr);
  shift = _mm_load_si128((const __m128i *)quant_shift_ptr);

  // Sign-extend the 16-bit quantizer parameters to 32-bit lanes; lane 0
  // holds the DC value and lane 1 the AC value of each parameter.
  __m128i zbin_sign = _mm_srai_epi16(zbin, 15);
  __m128i round_sign = _mm_srai_epi16(round, 15);
  __m128i quant_sign = _mm_srai_epi16(quant, 15);
  __m128i dequant_sign = _mm_srai_epi16(dequant, 15);
  __m128i shift_sign = _mm_srai_epi16(shift, 15);

  zbin = _mm_unpacklo_epi16(zbin, zbin_sign);
  round = _mm_unpacklo_epi16(round, round_sign);
  quant = _mm_unpacklo_epi16(quant, quant_sign);
  dequant = _mm_unpacklo_epi16(dequant, dequant_sign);
  shift = _mm_unpacklo_epi16(shift, shift_sign);

  // Shift with rounding.
  zbin = _mm_add_epi32(zbin, log_scale_vec);
  round = _mm_add_epi32(round, log_scale_vec);
  zbin = _mm_srli_epi32(zbin, log_scale);
  round = _mm_srli_epi32(round, log_scale);
  // -1 so the "greater than" compare below implements ">= zbin".
  zbin = _mm_sub_epi32(zbin, one);

  // Do DC and first 15 AC.
  coeff0 = _mm_load_si128((__m128i *)(coeff_ptr));
  coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + 4));

  // Work on absolute values; the signs are reinserted after quantization.
  coeff0_sign = _mm_srai_epi32(coeff0, 31);
  coeff1_sign = _mm_srai_epi32(coeff1, 31);
  qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);
  qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);

  highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);

  cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
  cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
  cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
  highbd_update_mask1(&cmp_mask, iscan, &is_found1, &mask1);

  // From here on only the AC threshold applies.
  threshold[0] = threshold[1];
  all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
  if (_mm_movemask_epi8(all_zero) == 0) {
    // All 8 coefficients are below zbin: store zeros, but still advance the
    // quantizer parameters from their DC to their AC lanes.
    _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
    _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
    _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
    _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);

    round = _mm_unpackhi_epi64(round, round);
    quant = _mm_unpackhi_epi64(quant, quant);
    shift = _mm_unpackhi_epi64(shift, shift);
    dequant = _mm_unpackhi_epi64(dequant, dequant);
  } else {
    // First group uses DC parameters in lane 0, then the parameters are
    // broadcast to their AC values for everything that follows.
    highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);

    round = _mm_unpackhi_epi64(round, round);
    quant = _mm_unpackhi_epi64(quant, quant);
    shift = _mm_unpackhi_epi64(shift, shift);
    highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);

    // Reinsert signs
    qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
    qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);

    // Mask out zbin threshold coeffs
    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

    _mm_store_si128((__m128i *)(qcoeff_ptr), qcoeff0);
    _mm_store_si128((__m128i *)(qcoeff_ptr + 4), qcoeff1);

    coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
    dequant = _mm_unpackhi_epi64(dequant, dequant);
    coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);
    _mm_store_si128((__m128i *)(dqcoeff_ptr), coeff0);
    _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), coeff1);
  }

  // AC only loop.
  while (index < n_coeffs) {
    coeff0 = _mm_load_si128((__m128i *)(coeff_ptr + index));
    coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + index + 4));

    coeff0_sign = _mm_srai_epi32(coeff0, 31);
    coeff1_sign = _mm_srai_epi32(coeff1, 31);
    qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);
    qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);

    highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index,
                        &is_found0, &mask0);

    cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
    cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
    cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
    highbd_update_mask1(&cmp_mask, iscan + index, &is_found1, &mask1);

    // Skip the arithmetic entirely when the whole group is below zbin.
    all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
    if (_mm_movemask_epi8(all_zero) == 0) {
      _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
      _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
      index += 8;
      continue;
    }
    highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);
    highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);

    qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
    qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);

    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

    _mm_store_si128((__m128i *)(qcoeff_ptr + index), qcoeff0);
    _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), qcoeff1);

    coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
    coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);

    _mm_store_si128((__m128i *)(dqcoeff_ptr + index), coeff0);
    _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), coeff1);

    index += 8;
  }
  // Convert the tracked iscan masks into coefficient counts.
  if (is_found0) non_zero_count = calculate_non_zero_count(mask0);
  if (is_found1)
    non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1);

  // Zero out trailing coefficients that cleared zbin but not the prescan
  // threshold.
  for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
    const int rc = scan[i];
    qcoeff_ptr[rc] = 0;
    dqcoeff_ptr[rc] = 0;
  }

  // Locate the last surviving non-zero coefficient in scan order.
  for (int i = non_zero_count - 1; i >= 0; i--) {
    const int rc = scan[i];
    if (qcoeff_ptr[rc]) {
      eob = i;
      break;
    }
  }

  *eob_ptr = eob + 1;
#if SKIP_EOB_FACTOR_ADJUST
  // TODO(Aniket): Experiment the following loop with intrinsic by combining
  // with the quantization loop above
  // If the first and last surviving coefficients coincide and the value
  // quantized to +/-1, re-test it against a stricter threshold and, if it
  // fails, drop the block entirely (eob = 0).
  for (int i = 0; i < non_zero_count; i++) {
    const int rc = scan[i];
    const int qcoeff = qcoeff_ptr[rc];
    if (qcoeff) {
      first = i;
      break;
    }
  }
  if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
    const int rc = scan[(*eob_ptr - 1)];
    if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
      const int coeff = coeff_ptr[rc] * wt;
      const int coeff_sign = AOMSIGN(coeff);
      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
      const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
      const int prescan_add_val =
          ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
      if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
        qcoeff_ptr[rc] = 0;
        dqcoeff_ptr[rc] = 0;
        *eob_ptr = 0;
      }
    }
  }
#endif
}
517
// 64x64 variant of aom_highbd_quantize_b_adaptive_sse2 (log_scale == 2):
// identical structure, but zbin and round are divided by 4 with rounding
// before use and the quantization/dequantization shifts are adjusted by
// log_scale. See the log_scale == 0 variant for detailed flow.
void aom_highbd_quantize_b_64x64_adaptive_sse2(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan) {
  int index = 8;
  const int log_scale = 2;
  int non_zero_count = 0;
  int non_zero_count_prescan_add_zero = 0;
  int is_found0 = 0, is_found1 = 0;
  int eob = -1;
  const __m128i zero = _mm_setzero_si128();
  const __m128i one = _mm_set1_epi32(1);
  const __m128i log_scale_vec = _mm_set1_epi32(log_scale);
  __m128i zbin, round, quant, dequant, shift;
  __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
  __m128i qcoeff0, qcoeff1;
  __m128i cmp_mask0, cmp_mask1, cmp_mask;
  __m128i all_zero;
  // mask0 tracks iscan positions exceeding the prescan threshold, mask1
  // tracks positions exceeding plain zbin.
  __m128i mask0 = zero, mask1 = zero;

  // Scalar zbins (DC, AC), pre-scaled down by log_scale for the prescan
  // thresholds and the SKIP_EOB_FACTOR_ADJUST re-test below.
  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
  int prescan_add[2];
  int thresh[4];
  const qm_val_t wt = (1 << AOM_QM_BITS);
  // Prescan thresholds, pre-scaled by AOM_QM_BITS; -1 turns the
  // _mm_cmpgt_epi32 test into ">=".
  for (int i = 0; i < 2; ++i) {
    prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
    thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1;
  }
  thresh[2] = thresh[3] = thresh[1];
  __m128i threshold[2];
  threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);
  threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);

#if SKIP_EOB_FACTOR_ADJUST
  int first = -1;
#endif
  // Setup global values.
  zbin = _mm_load_si128((const __m128i *)zbin_ptr);
  round = _mm_load_si128((const __m128i *)round_ptr);
  quant = _mm_load_si128((const __m128i *)quant_ptr);
  dequant = _mm_load_si128((const __m128i *)dequant_ptr);
  shift = _mm_load_si128((const __m128i *)quant_shift_ptr);

  // Sign-extend the 16-bit quantizer parameters to 32-bit lanes; lane 0
  // holds the DC value and lane 1 the AC value of each parameter.
  __m128i zbin_sign = _mm_srai_epi16(zbin, 15);
  __m128i round_sign = _mm_srai_epi16(round, 15);
  __m128i quant_sign = _mm_srai_epi16(quant, 15);
  __m128i dequant_sign = _mm_srai_epi16(dequant, 15);
  __m128i shift_sign = _mm_srai_epi16(shift, 15);

  zbin = _mm_unpacklo_epi16(zbin, zbin_sign);
  round = _mm_unpacklo_epi16(round, round_sign);
  quant = _mm_unpacklo_epi16(quant, quant_sign);
  dequant = _mm_unpacklo_epi16(dequant, dequant_sign);
  shift = _mm_unpacklo_epi16(shift, shift_sign);

  // Shift with rounding.
  zbin = _mm_add_epi32(zbin, log_scale_vec);
  round = _mm_add_epi32(round, log_scale_vec);
  zbin = _mm_srli_epi32(zbin, log_scale);
  round = _mm_srli_epi32(round, log_scale);
  // -1 so the "greater than" compare below implements ">= zbin".
  zbin = _mm_sub_epi32(zbin, one);

  // Do DC and first 15 AC.
  coeff0 = _mm_load_si128((__m128i *)(coeff_ptr));
  coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + 4));

  // Work on absolute values; the signs are reinserted after quantization.
  coeff0_sign = _mm_srai_epi32(coeff0, 31);
  coeff1_sign = _mm_srai_epi32(coeff1, 31);
  qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);
  qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);

  highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);

  cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
  cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
  cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
  highbd_update_mask1(&cmp_mask, iscan, &is_found1, &mask1);

  // From here on only the AC threshold applies.
  threshold[0] = threshold[1];
  all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
  if (_mm_movemask_epi8(all_zero) == 0) {
    // All 8 coefficients are below zbin: store zeros, but still advance the
    // quantizer parameters from their DC to their AC lanes.
    _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
    _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
    _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
    _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);

    round = _mm_unpackhi_epi64(round, round);
    quant = _mm_unpackhi_epi64(quant, quant);
    shift = _mm_unpackhi_epi64(shift, shift);
    dequant = _mm_unpackhi_epi64(dequant, dequant);
  } else {
    // First group uses DC parameters in lane 0, then the parameters are
    // broadcast to their AC values for everything that follows.
    highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);

    round = _mm_unpackhi_epi64(round, round);
    quant = _mm_unpackhi_epi64(quant, quant);
    shift = _mm_unpackhi_epi64(shift, shift);
    highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);

    // Reinsert signs
    qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
    qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);

    // Mask out zbin threshold coeffs
    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

    _mm_store_si128((__m128i *)(qcoeff_ptr), qcoeff0);
    _mm_store_si128((__m128i *)(qcoeff_ptr + 4), qcoeff1);

    coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
    dequant = _mm_unpackhi_epi64(dequant, dequant);
    coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);
    _mm_store_si128((__m128i *)(dqcoeff_ptr), coeff0);
    _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), coeff1);
  }

  // AC only loop.
  while (index < n_coeffs) {
    coeff0 = _mm_load_si128((__m128i *)(coeff_ptr + index));
    coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + index + 4));

    coeff0_sign = _mm_srai_epi32(coeff0, 31);
    coeff1_sign = _mm_srai_epi32(coeff1, 31);
    qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);
    qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);

    highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index,
                        &is_found0, &mask0);

    cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
    cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
    cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
    highbd_update_mask1(&cmp_mask, iscan + index, &is_found1, &mask1);

    // Skip the arithmetic entirely when the whole group is below zbin.
    all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
    if (_mm_movemask_epi8(all_zero) == 0) {
      _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
      _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
      index += 8;
      continue;
    }
    highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);
    highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);

    qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
    qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);

    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

    _mm_store_si128((__m128i *)(qcoeff_ptr + index), qcoeff0);
    _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), qcoeff1);

    coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
    coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);

    _mm_store_si128((__m128i *)(dqcoeff_ptr + index), coeff0);
    _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), coeff1);

    index += 8;
  }
  // Convert the tracked iscan masks into coefficient counts.
  if (is_found0) non_zero_count = calculate_non_zero_count(mask0);
  if (is_found1)
    non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1);

  // Zero out trailing coefficients that cleared zbin but not the prescan
  // threshold.
  for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
    const int rc = scan[i];
    qcoeff_ptr[rc] = 0;
    dqcoeff_ptr[rc] = 0;
  }

  // Locate the last surviving non-zero coefficient in scan order.
  for (int i = non_zero_count - 1; i >= 0; i--) {
    const int rc = scan[i];
    if (qcoeff_ptr[rc]) {
      eob = i;
      break;
    }
  }

  *eob_ptr = eob + 1;
#if SKIP_EOB_FACTOR_ADJUST
  // TODO(Aniket): Experiment the following loop with intrinsic by combining
  // with the quantization loop above
  // If the first and last surviving coefficients coincide and the value
  // quantized to +/-1, re-test it against a stricter threshold and, if it
  // fails, drop the block entirely (eob = 0).
  for (int i = 0; i < non_zero_count; i++) {
    const int rc = scan[i];
    const int qcoeff = qcoeff_ptr[rc];
    if (qcoeff) {
      first = i;
      break;
    }
  }
  if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
    const int rc = scan[(*eob_ptr - 1)];
    if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
      const int coeff = coeff_ptr[rc] * wt;
      const int coeff_sign = AOMSIGN(coeff);
      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
      const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
      const int prescan_add_val =
          ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
      if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
        qcoeff_ptr[rc] = 0;
        dqcoeff_ptr[rc] = 0;
        *eob_ptr = 0;
      }
    }
  }
#endif
}