Peter de Rivaz | a7b2d09 | 2014-10-16 13:38:46 +0100 | [diff] [blame] | 1 | /* |
Yunqing Wang | 38f1fbb | 2015-07-17 12:05:42 -0700 | [diff] [blame] | 2 | * Copyright (c) 2015 The WebM project authors. All Rights Reserved. |
Peter de Rivaz | a7b2d09 | 2014-10-16 13:38:46 +0100 | [diff] [blame] | 3 | * |
| 4 | * Use of this source code is governed by a BSD-style license |
| 5 | * that can be found in the LICENSE file in the root of the source |
| 6 | * tree. An additional intellectual property rights grant can be found |
| 7 | * in the file PATENTS. All contributing project authors may |
| 8 | * be found in the AUTHORS file in the root of the source tree. |
| 9 | */ |
| 10 | |
| 11 | #include <emmintrin.h> |
| 12 | |
Yunqing Wang | 38f1fbb | 2015-07-17 12:05:42 -0700 | [diff] [blame] | 13 | #include "vpx_dsp/vpx_dsp_common.h" |
| 14 | #include "vpx_mem/vpx_mem.h" |
Johann | 1d7ccd5 | 2015-05-11 19:09:22 -0700 | [diff] [blame] | 15 | #include "vpx_ports/mem.h" |
Peter de Rivaz | a7b2d09 | 2014-10-16 13:38:46 +0100 | [diff] [blame] | 16 | |
Yaowu Xu | 3246fc0 | 2016-01-20 16:13:04 -0800 | [diff] [blame] | 17 | #if CONFIG_VPX_HIGHBITDEPTH |
clang-format | 99e28b8 | 2016-01-27 12:42:45 -0800 | [diff] [blame^] | 18 | void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, |
| 19 | int skip_block, const int16_t *zbin_ptr, |
Peter de Rivaz | a7b2d09 | 2014-10-16 13:38:46 +0100 | [diff] [blame] | 20 | const int16_t *round_ptr, |
| 21 | const int16_t *quant_ptr, |
| 22 | const int16_t *quant_shift_ptr, |
clang-format | 99e28b8 | 2016-01-27 12:42:45 -0800 | [diff] [blame^] | 23 | tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, |
| 24 | const int16_t *dequant_ptr, uint16_t *eob_ptr, |
| 25 | const int16_t *scan, const int16_t *iscan) { |
Peter de Rivaz | a7b2d09 | 2014-10-16 13:38:46 +0100 | [diff] [blame] | 26 | int i, j, non_zero_regs = (int)count / 4, eob_i = -1; |
| 27 | __m128i zbins[2]; |
| 28 | __m128i nzbins[2]; |
| 29 | |
clang-format | 99e28b8 | 2016-01-27 12:42:45 -0800 | [diff] [blame^] | 30 | zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1], |
Jingning Han | d0f2377 | 2014-12-22 09:35:29 -0800 | [diff] [blame] | 31 | (int)zbin_ptr[0]); |
| 32 | zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]); |
Peter de Rivaz | a7b2d09 | 2014-10-16 13:38:46 +0100 | [diff] [blame] | 33 | |
| 34 | nzbins[0] = _mm_setzero_si128(); |
| 35 | nzbins[1] = _mm_setzero_si128(); |
| 36 | nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]); |
| 37 | nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); |
| 38 | |
| 39 | (void)scan; |
| 40 | |
James Zern | f58011a | 2015-04-23 20:47:40 -0700 | [diff] [blame] | 41 | memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); |
| 42 | memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr)); |
Peter de Rivaz | a7b2d09 | 2014-10-16 13:38:46 +0100 | [diff] [blame] | 43 | |
| 44 | if (!skip_block) { |
| 45 | // Pre-scan pass |
| 46 | for (i = ((int)count / 4) - 1; i >= 0; i--) { |
| 47 | __m128i coeffs, cmp1, cmp2; |
| 48 | int test; |
| 49 | coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); |
| 50 | cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); |
| 51 | cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); |
| 52 | cmp1 = _mm_and_si128(cmp1, cmp2); |
| 53 | test = _mm_movemask_epi8(cmp1); |
| 54 | if (test == 0xffff) |
| 55 | non_zero_regs--; |
| 56 | else |
| 57 | break; |
| 58 | } |
| 59 | |
| 60 | // Quantization pass: |
| 61 | for (i = 0; i < non_zero_regs; i++) { |
| 62 | __m128i coeffs, coeffs_sign, tmp1, tmp2; |
| 63 | int test; |
| 64 | int abs_coeff[4]; |
| 65 | int coeff_sign[4]; |
| 66 | |
| 67 | coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); |
| 68 | coeffs_sign = _mm_srai_epi32(coeffs, 31); |
clang-format | 99e28b8 | 2016-01-27 12:42:45 -0800 | [diff] [blame^] | 69 | coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign); |
Peter de Rivaz | a7b2d09 | 2014-10-16 13:38:46 +0100 | [diff] [blame] | 70 | tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]); |
| 71 | tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]); |
| 72 | tmp1 = _mm_or_si128(tmp1, tmp2); |
| 73 | test = _mm_movemask_epi8(tmp1); |
clang-format | 99e28b8 | 2016-01-27 12:42:45 -0800 | [diff] [blame^] | 74 | _mm_storeu_si128((__m128i *)abs_coeff, coeffs); |
| 75 | _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign); |
Peter de Rivaz | a7b2d09 | 2014-10-16 13:38:46 +0100 | [diff] [blame] | 76 | |
| 77 | for (j = 0; j < 4; j++) { |
| 78 | if (test & (1 << (4 * j))) { |
| 79 | int k = 4 * i + j; |
Yaowu Xu | b58c99e | 2015-07-07 18:29:14 -0700 | [diff] [blame] | 80 | const int64_t tmp1 = abs_coeff[j] + round_ptr[k != 0]; |
| 81 | const int64_t tmp2 = ((tmp1 * quant_ptr[k != 0]) >> 16) + tmp1; |
| 82 | const uint32_t abs_qcoeff = |
| 83 | (uint32_t)((tmp2 * quant_shift_ptr[k != 0]) >> 16); |
| 84 | qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j]; |
Peter de Rivaz | a7b2d09 | 2014-10-16 13:38:46 +0100 | [diff] [blame] | 85 | dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0]; |
clang-format | 99e28b8 | 2016-01-27 12:42:45 -0800 | [diff] [blame^] | 86 | if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i; |
Peter de Rivaz | a7b2d09 | 2014-10-16 13:38:46 +0100 | [diff] [blame] | 87 | } |
| 88 | } |
| 89 | } |
| 90 | } |
| 91 | *eob_ptr = eob_i + 1; |
| 92 | } |
| 93 | |
clang-format | 99e28b8 | 2016-01-27 12:42:45 -0800 | [diff] [blame^] | 94 | void vpx_highbd_quantize_b_32x32_sse2( |
| 95 | const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, |
| 96 | const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, |
| 97 | const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, |
| 98 | tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, |
| 99 | const int16_t *scan, const int16_t *iscan) { |
Peter de Rivaz | a7b2d09 | 2014-10-16 13:38:46 +0100 | [diff] [blame] | 100 | __m128i zbins[2]; |
| 101 | __m128i nzbins[2]; |
| 102 | int idx = 0; |
| 103 | int idx_arr[1024]; |
| 104 | int i, eob = -1; |
Jingning Han | d0f2377 | 2014-12-22 09:35:29 -0800 | [diff] [blame] | 105 | const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1); |
| 106 | const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1); |
Peter de Rivaz | a7b2d09 | 2014-10-16 13:38:46 +0100 | [diff] [blame] | 107 | (void)scan; |
clang-format | 99e28b8 | 2016-01-27 12:42:45 -0800 | [diff] [blame^] | 108 | zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); |
Jingning Han | d0f2377 | 2014-12-22 09:35:29 -0800 | [diff] [blame] | 109 | zbins[1] = _mm_set1_epi32(zbin1_tmp); |
Peter de Rivaz | a7b2d09 | 2014-10-16 13:38:46 +0100 | [diff] [blame] | 110 | |
| 111 | nzbins[0] = _mm_setzero_si128(); |
| 112 | nzbins[1] = _mm_setzero_si128(); |
| 113 | nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]); |
| 114 | nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); |
| 115 | |
James Zern | f58011a | 2015-04-23 20:47:40 -0700 | [diff] [blame] | 116 | memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); |
| 117 | memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); |
Peter de Rivaz | a7b2d09 | 2014-10-16 13:38:46 +0100 | [diff] [blame] | 118 | |
| 119 | if (!skip_block) { |
| 120 | // Pre-scan pass |
| 121 | for (i = 0; i < n_coeffs / 4; i++) { |
| 122 | __m128i coeffs, cmp1, cmp2; |
| 123 | int test; |
| 124 | coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); |
| 125 | cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); |
| 126 | cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); |
| 127 | cmp1 = _mm_and_si128(cmp1, cmp2); |
| 128 | test = _mm_movemask_epi8(cmp1); |
clang-format | 99e28b8 | 2016-01-27 12:42:45 -0800 | [diff] [blame^] | 129 | if (!(test & 0xf)) idx_arr[idx++] = i * 4; |
| 130 | if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1; |
| 131 | if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2; |
| 132 | if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3; |
Peter de Rivaz | a7b2d09 | 2014-10-16 13:38:46 +0100 | [diff] [blame] | 133 | } |
| 134 | |
| 135 | // Quantization pass: only process the coefficients selected in |
| 136 | // pre-scan pass. Note: idx can be zero. |
| 137 | for (i = 0; i < idx; i++) { |
| 138 | const int rc = idx_arr[i]; |
| 139 | const int coeff = coeff_ptr[rc]; |
| 140 | const int coeff_sign = (coeff >> 31); |
Yaowu Xu | b58c99e | 2015-07-07 18:29:14 -0700 | [diff] [blame] | 141 | const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; |
clang-format | 99e28b8 | 2016-01-27 12:42:45 -0800 | [diff] [blame^] | 142 | const int64_t tmp1 = |
| 143 | abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); |
Yaowu Xu | b58c99e | 2015-07-07 18:29:14 -0700 | [diff] [blame] | 144 | const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; |
| 145 | const uint32_t abs_qcoeff = |
| 146 | (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); |
| 147 | qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign; |
Peter de Rivaz | a7b2d09 | 2014-10-16 13:38:46 +0100 | [diff] [blame] | 148 | dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; |
clang-format | 99e28b8 | 2016-01-27 12:42:45 -0800 | [diff] [blame^] | 149 | if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob; |
Peter de Rivaz | a7b2d09 | 2014-10-16 13:38:46 +0100 | [diff] [blame] | 150 | } |
| 151 | } |
| 152 | *eob_ptr = eob + 1; |
| 153 | } |
| 154 | #endif |