blob: 6fb8891e1ca45f92e623df909028b4ad61463bc6 [file] [log] [blame]
/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */
10
11#include <emmintrin.h>
12
Yunqing Wang38f1fbb2015-07-17 12:05:42 -070013#include "vpx_dsp/vpx_dsp_common.h"
14#include "vpx_mem/vpx_mem.h"
Johann1d7ccd52015-05-11 19:09:22 -070015#include "vpx_ports/mem.h"
Peter de Rivaza7b2d092014-10-16 13:38:46 +010016
Yaowu Xu3246fc02016-01-20 16:13:04 -080017#if CONFIG_VPX_HIGHBITDEPTH
clang-format99e28b82016-01-27 12:42:45 -080018void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
19 int skip_block, const int16_t *zbin_ptr,
Peter de Rivaza7b2d092014-10-16 13:38:46 +010020 const int16_t *round_ptr,
21 const int16_t *quant_ptr,
22 const int16_t *quant_shift_ptr,
clang-format99e28b82016-01-27 12:42:45 -080023 tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
24 const int16_t *dequant_ptr, uint16_t *eob_ptr,
25 const int16_t *scan, const int16_t *iscan) {
Peter de Rivaza7b2d092014-10-16 13:38:46 +010026 int i, j, non_zero_regs = (int)count / 4, eob_i = -1;
27 __m128i zbins[2];
28 __m128i nzbins[2];
29
clang-format99e28b82016-01-27 12:42:45 -080030 zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1],
Jingning Hand0f23772014-12-22 09:35:29 -080031 (int)zbin_ptr[0]);
32 zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]);
Peter de Rivaza7b2d092014-10-16 13:38:46 +010033
34 nzbins[0] = _mm_setzero_si128();
35 nzbins[1] = _mm_setzero_si128();
36 nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
37 nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
38
39 (void)scan;
40
James Zernf58011a2015-04-23 20:47:40 -070041 memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
42 memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
Peter de Rivaza7b2d092014-10-16 13:38:46 +010043
44 if (!skip_block) {
45 // Pre-scan pass
46 for (i = ((int)count / 4) - 1; i >= 0; i--) {
47 __m128i coeffs, cmp1, cmp2;
48 int test;
49 coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
50 cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
51 cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
52 cmp1 = _mm_and_si128(cmp1, cmp2);
53 test = _mm_movemask_epi8(cmp1);
54 if (test == 0xffff)
55 non_zero_regs--;
56 else
57 break;
58 }
59
60 // Quantization pass:
61 for (i = 0; i < non_zero_regs; i++) {
62 __m128i coeffs, coeffs_sign, tmp1, tmp2;
63 int test;
64 int abs_coeff[4];
65 int coeff_sign[4];
66
67 coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
68 coeffs_sign = _mm_srai_epi32(coeffs, 31);
clang-format99e28b82016-01-27 12:42:45 -080069 coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
Peter de Rivaza7b2d092014-10-16 13:38:46 +010070 tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
71 tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
72 tmp1 = _mm_or_si128(tmp1, tmp2);
73 test = _mm_movemask_epi8(tmp1);
clang-format99e28b82016-01-27 12:42:45 -080074 _mm_storeu_si128((__m128i *)abs_coeff, coeffs);
75 _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign);
Peter de Rivaza7b2d092014-10-16 13:38:46 +010076
77 for (j = 0; j < 4; j++) {
78 if (test & (1 << (4 * j))) {
79 int k = 4 * i + j;
Yaowu Xub58c99e2015-07-07 18:29:14 -070080 const int64_t tmp1 = abs_coeff[j] + round_ptr[k != 0];
81 const int64_t tmp2 = ((tmp1 * quant_ptr[k != 0]) >> 16) + tmp1;
82 const uint32_t abs_qcoeff =
83 (uint32_t)((tmp2 * quant_shift_ptr[k != 0]) >> 16);
84 qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];
Peter de Rivaza7b2d092014-10-16 13:38:46 +010085 dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
clang-format99e28b82016-01-27 12:42:45 -080086 if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
Peter de Rivaza7b2d092014-10-16 13:38:46 +010087 }
88 }
89 }
90 }
91 *eob_ptr = eob_i + 1;
92}
93
clang-format99e28b82016-01-27 12:42:45 -080094void vpx_highbd_quantize_b_32x32_sse2(
95 const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
96 const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
97 const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
98 tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
99 const int16_t *scan, const int16_t *iscan) {
Peter de Rivaza7b2d092014-10-16 13:38:46 +0100100 __m128i zbins[2];
101 __m128i nzbins[2];
102 int idx = 0;
103 int idx_arr[1024];
104 int i, eob = -1;
Jingning Hand0f23772014-12-22 09:35:29 -0800105 const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1);
106 const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1);
Peter de Rivaza7b2d092014-10-16 13:38:46 +0100107 (void)scan;
clang-format99e28b82016-01-27 12:42:45 -0800108 zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp);
Jingning Hand0f23772014-12-22 09:35:29 -0800109 zbins[1] = _mm_set1_epi32(zbin1_tmp);
Peter de Rivaza7b2d092014-10-16 13:38:46 +0100110
111 nzbins[0] = _mm_setzero_si128();
112 nzbins[1] = _mm_setzero_si128();
113 nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
114 nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
115
James Zernf58011a2015-04-23 20:47:40 -0700116 memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
117 memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
Peter de Rivaza7b2d092014-10-16 13:38:46 +0100118
119 if (!skip_block) {
120 // Pre-scan pass
121 for (i = 0; i < n_coeffs / 4; i++) {
122 __m128i coeffs, cmp1, cmp2;
123 int test;
124 coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
125 cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
126 cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
127 cmp1 = _mm_and_si128(cmp1, cmp2);
128 test = _mm_movemask_epi8(cmp1);
clang-format99e28b82016-01-27 12:42:45 -0800129 if (!(test & 0xf)) idx_arr[idx++] = i * 4;
130 if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1;
131 if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2;
132 if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3;
Peter de Rivaza7b2d092014-10-16 13:38:46 +0100133 }
134
135 // Quantization pass: only process the coefficients selected in
136 // pre-scan pass. Note: idx can be zero.
137 for (i = 0; i < idx; i++) {
138 const int rc = idx_arr[i];
139 const int coeff = coeff_ptr[rc];
140 const int coeff_sign = (coeff >> 31);
Yaowu Xub58c99e2015-07-07 18:29:14 -0700141 const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
clang-format99e28b82016-01-27 12:42:45 -0800142 const int64_t tmp1 =
143 abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
Yaowu Xub58c99e2015-07-07 18:29:14 -0700144 const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
145 const uint32_t abs_qcoeff =
146 (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
147 qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;
Peter de Rivaza7b2d092014-10-16 13:38:46 +0100148 dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
clang-format99e28b82016-01-27 12:42:45 -0800149 if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
Peter de Rivaza7b2d092014-10-16 13:38:46 +0100150 }
151 }
152 *eob_ptr = eob + 1;
153}
154#endif