/*
 * Copyright (c) 2019, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <immintrin.h>

#include "config/aom_dsp_rtcd.h"

#include "aom/aom_integer.h"
#include "aom_dsp/quantize.h"
#include "aom_dsp/x86/quantize_x86.h"

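// Loads the first 8 entries of each 16-bit quantization table and
// sign-extends them to 32 bits. zbin is decremented by 1 up front so that
// the strict _mm256_cmpgt_epi32() used later behaves as "greater than or
// equal to zbin".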
static INLINE void highbd_load_b_values_avx2(
    const int16_t *zbin_ptr, __m256i *zbin, const int16_t *round_ptr,
    __m256i *round, const int16_t *quant_ptr, __m256i *quant,
    const int16_t *dequant_ptr, __m256i *dequant, const int16_t *shift_ptr,
    __m256i *shift) {
  *zbin = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)zbin_ptr));
  *zbin = _mm256_sub_epi32(*zbin, _mm256_set1_epi32(1));
  *round = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)round_ptr));
  *quant = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)quant_ptr));
  *dequant =
      _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)dequant_ptr));
  *shift = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)shift_ptr));
}

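// Folds one comparison result into the running end-of-block mask. cmp_mask
// holds all-ones in the 16-bit lanes of coefficients that passed the
// comparison; ANDing it with iscan keeps their scan indices, and the
// element-wise max accumulates the largest passing index in *mask.
// *is_found records that at least one coefficient passed.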
static INLINE void highbd_update_mask1_avx2(__m256i *cmp_mask,
                                            const int16_t *iscan_ptr,
                                            int *is_found, __m256i *mask) {
  __m256i temp_mask = _mm256_setzero_si256();
  if (_mm256_movemask_epi8(*cmp_mask)) {
    __m256i iscan = _mm256_loadu_si256((const __m256i *)(iscan_ptr));
    temp_mask = _mm256_and_si256(*cmp_mask, iscan);
    *is_found = 1;
  }
  *mask = _mm256_max_epi16(temp_mask, *mask);
}

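// Prescan check: compares |coeff| * (1 << AOM_QM_BITS) for 16 coefficients
// against the per-position prescan thresholds, packs the two 32-bit masks
// down to 16 bits in scan order, and accumulates the result via
// highbd_update_mask1_avx2().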
static INLINE void highbd_update_mask0_avx2(__m256i *qcoeff0, __m256i *qcoeff1,
                                            __m256i *threshold,
                                            const int16_t *iscan_ptr,
                                            int *is_found, __m256i *mask) {
  __m256i coeff[2], cmp_mask0, cmp_mask1;
  coeff[0] = _mm256_slli_epi32(*qcoeff0, AOM_QM_BITS);
  cmp_mask0 = _mm256_cmpgt_epi32(coeff[0], threshold[0]);
  coeff[1] = _mm256_slli_epi32(*qcoeff1, AOM_QM_BITS);
  cmp_mask1 = _mm256_cmpgt_epi32(coeff[1], threshold[1]);
  cmp_mask0 =
      _mm256_permute4x64_epi64(_mm256_packs_epi32(cmp_mask0, cmp_mask1), 0xd8);
  highbd_update_mask1_avx2(&cmp_mask0, iscan_ptr, is_found, mask);
}

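// Per-lane 32x32 -> 64-bit multiply followed by a right shift, keeping the
// low 32 bits of each shifted product. Even and odd lanes are multiplied
// separately with _mm256_mul_epi32() and re-interleaved with a blend. A
// scalar sketch of one lane, for reference:
//   p[i] = (int32_t)(((int64_t)x[i] * y[i]) >> shift);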
static INLINE void highbd_mul_shift_avx2(const __m256i *x, const __m256i *y,
                                         __m256i *p, const int shift) {
  __m256i prod_lo = _mm256_mul_epi32(*x, *y);
  __m256i prod_hi = _mm256_srli_epi64(*x, 32);
  const __m256i mult_hi = _mm256_srli_epi64(*y, 32);
  prod_hi = _mm256_mul_epi32(prod_hi, mult_hi);

  prod_lo = _mm256_srli_epi64(prod_lo, shift);
  prod_hi = _mm256_srli_epi64(prod_hi, shift);

  prod_hi = _mm256_slli_epi64(prod_hi, 32);
  *p = _mm256_blend_epi32(prod_lo, prod_hi, 0xaa);
}

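// Quantizes a vector of absolute coefficient values in place. A scalar
// sketch of one lane, for reference:
//   tmp0 = abs_coeff + round;
//   tmp1 = (int32_t)(((int64_t)tmp0 * quant) >> 16) + tmp0;
//   qcoeff = (int32_t)(((int64_t)tmp1 * shift) >> (16 - log_scale));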
static INLINE void highbd_calculate_qcoeff_avx2(__m256i *coeff,
                                                const __m256i *round,
                                                const __m256i *quant,
                                                const __m256i *shift,
                                                const int *log_scale) {
  __m256i tmp, qcoeff;
  qcoeff = _mm256_add_epi32(*coeff, *round);
  highbd_mul_shift_avx2(&qcoeff, quant, &tmp, 16);
  qcoeff = _mm256_add_epi32(tmp, qcoeff);
  highbd_mul_shift_avx2(&qcoeff, shift, coeff, 16 - *log_scale);
}

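// Dequantization for log_scale == 0: dqcoeff = qcoeff * dequant, keeping
// the low 32 bits of each product.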
static INLINE __m256i highbd_calculate_dqcoeff_avx2(__m256i qcoeff,
                                                    __m256i dequant) {
  return _mm256_mullo_epi32(qcoeff, dequant);
}

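// Dequantization for log_scale != 0 (the 32x32 path): the magnitude is
// computed as |qcoeff| * dequant >> log_scale and the sign of qcoeff is
// then reapplied.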
static INLINE __m256i highbd_calculate_dqcoeff_log_scale_avx2(
    __m256i qcoeff, __m256i dequant, const int log_scale) {
  __m256i abs_coeff = _mm256_abs_epi32(qcoeff);
  highbd_mul_shift_avx2(&abs_coeff, &dequant, &abs_coeff, log_scale);
  return _mm256_sign_epi32(abs_coeff, qcoeff);
}

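// Stores 16 32-bit coefficients (two registers of eight) to coeff_ptr,
// which must be 32-byte aligned for the aligned stores used here.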
static INLINE void highbd_store_coefficients_avx2(__m256i coeff0,
                                                  __m256i coeff1,
                                                  tran_low_t *coeff_ptr) {
  _mm256_store_si256((__m256i *)(coeff_ptr), coeff0);
  _mm256_store_si256((__m256i *)(coeff_ptr + 8), coeff1);
}

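// High bit-depth adaptive quantize_b, processing 16 coefficients per
// iteration. Two running masks are kept: mask0 marks coefficients above the
// larger prescan threshold (zbin * wt + prescan_add) and mask1 marks those
// above zbin. After the main loop, quantized coefficients that lie beyond
// the last prescan hit are zeroed, and the eob is derived from what
// remains.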
void aom_highbd_quantize_b_adaptive_avx2(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan) {
  int index = 16;
  int non_zero_count = 0;
  int non_zero_count_prescan_add_zero = 0;
  int is_found0 = 0, is_found1 = 0;
  int eob = -1;
  const __m256i zero = _mm256_setzero_si256();
  __m256i zbin, round, quant, dequant, shift;
  __m256i coeff0, qcoeff0, coeff1, qcoeff1;
  __m256i cmp_mask, mask0 = zero, mask1 = zero;
  __m128i temp_mask0, temp_mask1;
  int prescan_add[2];
  int thresh[2];
  const int log_scale = 0;
  const qm_val_t wt = (1 << AOM_QM_BITS);
  for (int i = 0; i < 2; ++i) {
    prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
    thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1;
  }
  __m256i threshold[2];
  threshold[0] = _mm256_set1_epi32(thresh[0]);
  threshold[1] = _mm256_set1_epi32(thresh[1]);
  threshold[0] = _mm256_blend_epi32(threshold[0], threshold[1], 0xfe);

#if SKIP_EOB_FACTOR_ADJUST
  int first = -1;
#endif

  // Set up global values.
  highbd_load_b_values_avx2(zbin_ptr, &zbin, round_ptr, &round, quant_ptr,
                            &quant, dequant_ptr, &dequant, quant_shift_ptr,
                            &shift);

  // Do DC and first 15 AC.
  coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr));
  qcoeff0 = _mm256_abs_epi32(coeff0);
  coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + 8));
  qcoeff1 = _mm256_abs_epi32(coeff1);
  highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0,
                           &mask0);
  __m256i temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin);
  zbin = _mm256_unpackhi_epi64(zbin, zbin);
  __m256i temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin);
  cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8);
  highbd_update_mask1_avx2(&cmp_mask, iscan, &is_found1, &mask1);
  threshold[0] = threshold[1];
  if (_mm256_movemask_epi8(cmp_mask) == 0) {
    _mm256_store_si256((__m256i *)(qcoeff_ptr), zero);
    _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), zero);
    _mm256_store_si256((__m256i *)(dqcoeff_ptr), zero);
    _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), zero);
    round = _mm256_unpackhi_epi64(round, round);
    quant = _mm256_unpackhi_epi64(quant, quant);
    shift = _mm256_unpackhi_epi64(shift, shift);
    dequant = _mm256_unpackhi_epi64(dequant, dequant);
  } else {
    highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale);
    round = _mm256_unpackhi_epi64(round, round);
    quant = _mm256_unpackhi_epi64(quant, quant);
    shift = _mm256_unpackhi_epi64(shift, shift);
    highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale);
    // Reinsert signs.
    qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0);
    qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1);
    // Zero out coefficients that did not exceed the zbin threshold.
    qcoeff0 = _mm256_and_si256(qcoeff0, temp0);
    qcoeff1 = _mm256_and_si256(qcoeff1, temp1);
    highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr);
    coeff0 = highbd_calculate_dqcoeff_avx2(qcoeff0, dequant);
    dequant = _mm256_unpackhi_epi64(dequant, dequant);
    coeff1 = highbd_calculate_dqcoeff_avx2(qcoeff1, dequant);
    highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr);
  }

  // AC only loop.
  while (index < n_coeffs) {
    coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr + index));
    qcoeff0 = _mm256_abs_epi32(coeff0);
    coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + index + 8));
    qcoeff1 = _mm256_abs_epi32(coeff1);
    highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan + index,
                             &is_found0, &mask0);
    temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin);
    temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin);
    cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8);
    highbd_update_mask1_avx2(&cmp_mask, iscan + index, &is_found1, &mask1);
    if (_mm256_movemask_epi8(cmp_mask) == 0) {
      _mm256_store_si256((__m256i *)(qcoeff_ptr + index), zero);
      _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), zero);
      _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), zero);
      _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), zero);
      index += 16;
      continue;
    }
    highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale);
    highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale);
    qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0);
    qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1);
    qcoeff0 = _mm256_and_si256(qcoeff0, temp0);
    qcoeff1 = _mm256_and_si256(qcoeff1, temp1);
    highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr + index);
    coeff0 = highbd_calculate_dqcoeff_avx2(qcoeff0, dequant);
    coeff1 = highbd_calculate_dqcoeff_avx2(qcoeff1, dequant);
    highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr + index);
    index += 16;
  }
  if (is_found0) {
    temp_mask0 = _mm_max_epi16(_mm256_castsi256_si128(mask0),
                               _mm256_extracti128_si256(mask0, 1));
    non_zero_count = calculate_non_zero_count(temp_mask0);
  }
  if (is_found1) {
    temp_mask1 = _mm_max_epi16(_mm256_castsi256_si128(mask1),
                               _mm256_extracti128_si256(mask1, 1));
    non_zero_count_prescan_add_zero = calculate_non_zero_count(temp_mask1);
  }

  for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
    const int rc = scan[i];
    qcoeff_ptr[rc] = 0;
    dqcoeff_ptr[rc] = 0;
  }

  for (int i = non_zero_count - 1; i >= 0; i--) {
    const int rc = scan[i];
    if (qcoeff_ptr[rc]) {
      eob = i;
      break;
    }
  }

  *eob_ptr = eob + 1;
#if SKIP_EOB_FACTOR_ADJUST
  // TODO(Aniket): Experiment with combining the following loop with the
  // quantization loop above using intrinsics.
  for (int i = 0; i < non_zero_count; i++) {
    const int rc = scan[i];
    const int qcoeff = qcoeff_ptr[rc];
    if (qcoeff) {
      first = i;
      break;
    }
  }
  if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
    const int rc = scan[(*eob_ptr - 1)];
    if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
      const int coeff = coeff_ptr[rc] * wt;
      const int coeff_sign = AOMSIGN(coeff);
      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
      const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
      const int prescan_add_val =
          ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
      if (abs_coeff <
          (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
        qcoeff_ptr[rc] = 0;
        dqcoeff_ptr[rc] = 0;
        *eob_ptr = 0;
      }
    }
  }
#endif
}

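// 32x32 variant of aom_highbd_quantize_b_adaptive_avx2: log_scale is 1, so
// zbin and round are halved with rounding before use, and dequantized
// values are computed as sign(qcoeff) * (|qcoeff| * dequant >> 1).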
void aom_highbd_quantize_b_32x32_adaptive_avx2(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan) {
  int index = 16;
  int non_zero_count = 0;
  int non_zero_count_prescan_add_zero = 0;
  int is_found0 = 0, is_found1 = 0;
  int eob = -1;
  const int log_scale = 1;
  const __m256i zero = _mm256_setzero_si256();
  __m256i zbin, round, quant, dequant, shift;
  __m256i coeff0, qcoeff0, coeff1, qcoeff1;
  __m256i cmp_mask, mask0 = zero, mask1 = zero;
  __m128i temp_mask0, temp_mask1;
  const __m256i one = _mm256_set1_epi32(1);
  const __m256i log_scale_vec = _mm256_set1_epi32(log_scale);
  int prescan_add[2];
  int thresh[2];
  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
  const qm_val_t wt = (1 << AOM_QM_BITS);
  for (int i = 0; i < 2; ++i) {
    prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
    thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1;
  }
  __m256i threshold[2];
  threshold[0] = _mm256_set1_epi32(thresh[0]);
  threshold[1] = _mm256_set1_epi32(thresh[1]);
  threshold[0] = _mm256_blend_epi32(threshold[0], threshold[1], 0xfe);

#if SKIP_EOB_FACTOR_ADJUST
  int first = -1;
#endif

  // Set up global values.
  zbin = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)zbin_ptr));
  round = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)round_ptr));
  quant = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)quant_ptr));
  dequant =
      _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)dequant_ptr));
  shift =
      _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)quant_shift_ptr));

  // Shift with rounding.
  zbin = _mm256_add_epi32(zbin, log_scale_vec);
  round = _mm256_add_epi32(round, log_scale_vec);
  zbin = _mm256_srli_epi32(zbin, log_scale);
  round = _mm256_srli_epi32(round, log_scale);
  zbin = _mm256_sub_epi32(zbin, one);

  // Do DC and first 15 AC.
  coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr));
  qcoeff0 = _mm256_abs_epi32(coeff0);
  coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + 8));
  qcoeff1 = _mm256_abs_epi32(coeff1);
  highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0,
                           &mask0);
  __m256i temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin);
  zbin = _mm256_permute2x128_si256(zbin, zbin, 0x11);
  __m256i temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin);
  cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8);
  highbd_update_mask1_avx2(&cmp_mask, iscan, &is_found1, &mask1);
  threshold[0] = threshold[1];
  if (_mm256_movemask_epi8(cmp_mask) == 0) {
    _mm256_store_si256((__m256i *)(qcoeff_ptr), zero);
    _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), zero);
    _mm256_store_si256((__m256i *)(dqcoeff_ptr), zero);
    _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), zero);
    round = _mm256_permute2x128_si256(round, round, 0x11);
    quant = _mm256_permute2x128_si256(quant, quant, 0x11);
    shift = _mm256_permute2x128_si256(shift, shift, 0x11);
    dequant = _mm256_permute2x128_si256(dequant, dequant, 0x11);
  } else {
    highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale);
    round = _mm256_permute2x128_si256(round, round, 0x11);
    quant = _mm256_permute2x128_si256(quant, quant, 0x11);
    shift = _mm256_permute2x128_si256(shift, shift, 0x11);
    highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale);
    // Reinsert signs.
    qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0);
    qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1);
    // Zero out coefficients that did not exceed the zbin threshold.
    qcoeff0 = _mm256_and_si256(qcoeff0, temp0);
    qcoeff1 = _mm256_and_si256(qcoeff1, temp1);
    highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr);
    coeff0 =
        highbd_calculate_dqcoeff_log_scale_avx2(qcoeff0, dequant, log_scale);
    dequant = _mm256_permute2x128_si256(dequant, dequant, 0x11);
    coeff1 =
        highbd_calculate_dqcoeff_log_scale_avx2(qcoeff1, dequant, log_scale);
    highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr);
  }

  // AC only loop.
  while (index < n_coeffs) {
    coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr + index));
    qcoeff0 = _mm256_abs_epi32(coeff0);
    coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + index + 8));
    qcoeff1 = _mm256_abs_epi32(coeff1);
    highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan + index,
                             &is_found0, &mask0);
    temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin);
    temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin);
    cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8);
    highbd_update_mask1_avx2(&cmp_mask, iscan + index, &is_found1, &mask1);
    if (_mm256_movemask_epi8(cmp_mask) == 0) {
      _mm256_store_si256((__m256i *)(qcoeff_ptr + index), zero);
      _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), zero);
      _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), zero);
      _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), zero);
      index += 16;
      continue;
    }
    highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale);
    highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale);
    qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0);
    qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1);
    qcoeff0 = _mm256_and_si256(qcoeff0, temp0);
    qcoeff1 = _mm256_and_si256(qcoeff1, temp1);
    highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr + index);
    coeff0 =
        highbd_calculate_dqcoeff_log_scale_avx2(qcoeff0, dequant, log_scale);
    coeff1 =
        highbd_calculate_dqcoeff_log_scale_avx2(qcoeff1, dequant, log_scale);
    highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr + index);
    index += 16;
  }
  if (is_found0) {
    temp_mask0 = _mm_max_epi16(_mm256_castsi256_si128(mask0),
                               _mm256_extracti128_si256(mask0, 1));
    non_zero_count = calculate_non_zero_count(temp_mask0);
  }
  if (is_found1) {
    temp_mask1 = _mm_max_epi16(_mm256_castsi256_si128(mask1),
                               _mm256_extracti128_si256(mask1, 1));
    non_zero_count_prescan_add_zero = calculate_non_zero_count(temp_mask1);
  }

  for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
    const int rc = scan[i];
    qcoeff_ptr[rc] = 0;
    dqcoeff_ptr[rc] = 0;
  }

  for (int i = non_zero_count - 1; i >= 0; i--) {
    const int rc = scan[i];
    if (qcoeff_ptr[rc]) {
      eob = i;
      break;
    }
  }

  *eob_ptr = eob + 1;
#if SKIP_EOB_FACTOR_ADJUST
  // TODO(Aniket): Experiment with combining the following loop with the
  // quantization loop above using intrinsics.
  for (int i = 0; i < non_zero_count; i++) {
    const int rc = scan[i];
    const int qcoeff = qcoeff_ptr[rc];
    if (qcoeff) {
      first = i;
      break;
    }
  }
  if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
    const int rc = scan[(*eob_ptr - 1)];
    if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
      const int coeff = coeff_ptr[rc] * wt;
      const int coeff_sign = AOMSIGN(coeff);
      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
      const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
      const int prescan_add_val =
          ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
      if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
        qcoeff_ptr[rc] = 0;
        dqcoeff_ptr[rc] = 0;
        *eob_ptr = 0;
      }
    }
  }
#endif
}