/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <immintrin.h>

#include "config/aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
#include "aom_dsp/x86/bitdepth_conversion_sse2.h"
#include "aom_dsp/x86/mem_sse2.h"
#include "aom_ports/mem.h"

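// Sign-extends the eight 16-bit lanes of 'in' to 32 bits, writing the low
// four lanes to *out_lo and the high four lanes to *out_hi. 'zero' is
// expected to be _mm_setzero_si128().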
static INLINE void sign_extend_16bit_to_32bit_sse2(__m128i in, __m128i zero,
                                                   __m128i *out_lo,
                                                   __m128i *out_hi) {
  const __m128i sign_bits = _mm_cmplt_epi16(in, zero);
  *out_lo = _mm_unpacklo_epi16(in, sign_bits);
  *out_hi = _mm_unpackhi_epi16(in, sign_bits);
}

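// Computes (a ^ sign) - sign per 32-bit lane: lanes where 'sign' is all ones
// are negated, lanes where 'sign' is zero pass through. Used below to take
// absolute values.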
static INLINE __m128i invert_sign_32_sse2(__m128i a, __m128i sign) {
  a = _mm_xor_si128(a, sign);
  return _mm_sub_epi32(a, sign);
}

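// Computes the minimum and maximum absolute difference between the 8x8 block
// at s (stride p) and the 8x8 block at d (stride dp), over all 64 positions.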
void aom_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
                         int *min, int *max) {
  __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
  u0 = _mm_setzero_si128();
  // Row 0
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff0 = _mm_max_epi16(diff, negdiff);
  // Row 1
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(absdiff0, absdiff);
  minabsdiff = _mm_min_epi16(absdiff0, absdiff);
  // Row 2
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 3
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 4
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 5
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 6
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 7
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);

  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8));
  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32));
  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16));
  *max = _mm_extract_epi16(maxabsdiff, 0);

  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8));
  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32));
  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16));
  *min = _mm_extract_epi16(minabsdiff, 0);
}

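// Returns the rounded average of the 8x8 block at s (stride p), computed as
// (sum + 32) >> 6. Pairs of rows are summed with _mm_sad_epu8 against zero.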
unsigned int aom_avg_8x8_sse2(const uint8_t *s, int p) {
  __m128i sum0, sum1, s0, s1, s2, s3, u0;
  unsigned int avg = 0;
  u0 = _mm_setzero_si128();
  s0 = loadh_epi64((const __m128i *)(s + p),
                   _mm_loadl_epi64((const __m128i *)(s)));
  s1 = loadh_epi64((const __m128i *)(s + 3 * p),
                   _mm_loadl_epi64((const __m128i *)(s + 2 * p)));
  s2 = loadh_epi64((const __m128i *)(s + 5 * p),
                   _mm_loadl_epi64((const __m128i *)(s + 4 * p)));
  s3 = loadh_epi64((const __m128i *)(s + 7 * p),
                   _mm_loadl_epi64((const __m128i *)(s + 6 * p)));
  s0 = _mm_sad_epu8(s0, u0);
  s1 = _mm_sad_epu8(s1, u0);
  s2 = _mm_sad_epu8(s2, u0);
  s3 = _mm_sad_epu8(s3, u0);

  sum0 = _mm_add_epi16(s0, s1);
  sum1 = _mm_add_epi16(s2, s3);
  sum0 = _mm_add_epi16(sum0, sum1);
  sum0 = _mm_add_epi16(sum0, _mm_srli_si128(sum0, 8));
  avg = _mm_cvtsi128_si32(sum0);
  return (avg + 32) >> 6;
}

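// Computes the rounded averages of two horizontally adjacent 8x8 blocks in
// the 16x8 region starting at s (stride p), writing them to avg[0] and
// avg[1]. The SAD against zero keeps the two 8-pixel sums in separate 64-bit
// halves of the register.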
void calc_avg_8x8_dual_sse2(const uint8_t *s, int p, int *avg) {
  __m128i sum0, sum1, s0, s1, s2, s3, u0;
  u0 = _mm_setzero_si128();
  s0 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s)), u0);
  s1 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + p)), u0);
  s2 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 2 * p)), u0);
  s3 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 3 * p)), u0);
  sum0 = _mm_add_epi16(s0, s1);
  sum1 = _mm_add_epi16(s2, s3);
  s0 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 4 * p)), u0);
  s1 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 5 * p)), u0);
  s2 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 6 * p)), u0);
  s3 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 7 * p)), u0);
  sum0 = _mm_add_epi16(sum0, _mm_add_epi16(s0, s1));
  sum1 = _mm_add_epi16(sum1, _mm_add_epi16(s2, s3));
  sum0 = _mm_add_epi16(sum0, sum1);

  // (avg + 32) >> 6
  __m128i rounding = _mm_set1_epi32(32);
  sum0 = _mm_add_epi32(sum0, rounding);
  sum0 = _mm_srli_epi32(sum0, 6);
  avg[0] = _mm_cvtsi128_si32(sum0);
  avg[1] = _mm_extract_epi16(sum0, 4);
}

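// Computes the rounded averages of the four 8x8 sub-blocks of the 16x16 block
// at offset (x16_idx, y16_idx) from s, writing them to avg[0..3].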
void aom_avg_8x8_quad_sse2(const uint8_t *s, int p, int x16_idx, int y16_idx,
                           int *avg) {
  const uint8_t *s_ptr = s + y16_idx * p + x16_idx;
  for (int k = 0; k < 2; k++) {
    calc_avg_8x8_dual_sse2(s_ptr, p, avg + k * 2);
    s_ptr += 8 * p;
  }
}

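// Returns the rounded average of the 4x4 block at s (stride p), computed as
// (sum + 8) >> 4.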
unsigned int aom_avg_4x4_sse2(const uint8_t *s, int p) {
  __m128i s0, s1, u0;
  unsigned int avg = 0;
  u0 = _mm_setzero_si128();
  s0 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)(s)),
                          _mm_cvtsi32_si128(*(const int *)(s + p)));
  s1 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)(s + p * 2)),
                          _mm_cvtsi32_si128(*(const int *)(s + p * 3)));
  s0 = _mm_sad_epu8(s0, u0);
  s1 = _mm_sad_epu8(s1, u0);
  s0 = _mm_add_epi16(s0, s1);
  avg = _mm_cvtsi128_si32(s0);
  return (avg + 8) >> 4;
}

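// One pass of a 4-point Hadamard transform on four vectors of 16-bit values;
// the butterfly stage is right-shifted by 1 to keep results in 16-bit range.
// When iter == 0 the 4x4 block held in the low halves of the registers is
// also transposed so the second pass operates on the other dimension.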
static INLINE void hadamard_col4_sse2(__m128i *in, int iter) {
  const __m128i a0 = in[0];
  const __m128i a1 = in[1];
  const __m128i a2 = in[2];
  const __m128i a3 = in[3];
  const __m128i b0 = _mm_srai_epi16(_mm_add_epi16(a0, a1), 1);
  const __m128i b1 = _mm_srai_epi16(_mm_sub_epi16(a0, a1), 1);
  const __m128i b2 = _mm_srai_epi16(_mm_add_epi16(a2, a3), 1);
  const __m128i b3 = _mm_srai_epi16(_mm_sub_epi16(a2, a3), 1);
  in[0] = _mm_add_epi16(b0, b2);
  in[1] = _mm_add_epi16(b1, b3);
  in[2] = _mm_sub_epi16(b0, b2);
  in[3] = _mm_sub_epi16(b1, b3);

  if (iter == 0) {
    const __m128i ba = _mm_unpacklo_epi16(in[0], in[1]);
    const __m128i dc = _mm_unpacklo_epi16(in[2], in[3]);
    const __m128i dcba_lo = _mm_unpacklo_epi32(ba, dc);
    const __m128i dcba_hi = _mm_unpackhi_epi32(ba, dc);
    in[0] = dcba_lo;
    in[1] = _mm_srli_si128(dcba_lo, 8);
    in[2] = dcba_hi;
    in[3] = _mm_srli_si128(dcba_hi, 8);
  }
}

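// 4x4 Hadamard transform of src_diff (stride src_stride), written to coeff as
// 16 tran_low_t values. Each of the two passes includes a right shift by 1 to
// keep intermediate values within 16-bit range.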
void aom_hadamard_4x4_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
                           tran_low_t *coeff) {
  __m128i src[4];
  src[0] = _mm_loadl_epi64((const __m128i *)src_diff);
  src[1] = _mm_loadl_epi64((const __m128i *)(src_diff += src_stride));
  src[2] = _mm_loadl_epi64((const __m128i *)(src_diff += src_stride));
  src[3] = _mm_loadl_epi64((const __m128i *)(src_diff + src_stride));

  hadamard_col4_sse2(src, 0);
  hadamard_col4_sse2(src, 1);

  store_tran_low(_mm_unpacklo_epi64(src[0], src[1]), coeff);
  coeff += 8;
  store_tran_low(_mm_unpacklo_epi64(src[2], src[3]), coeff);
}

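// One pass of an 8-point Hadamard transform on eight vectors of 16-bit
// values. When iter == 0 the result is also transposed (8x8, 16-bit) so the
// second pass processes the other dimension.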
static INLINE void hadamard_col8_sse2(__m128i *in, int iter) {
  __m128i a0 = in[0];
  __m128i a1 = in[1];
  __m128i a2 = in[2];
  __m128i a3 = in[3];
  __m128i a4 = in[4];
  __m128i a5 = in[5];
  __m128i a6 = in[6];
  __m128i a7 = in[7];

  __m128i b0 = _mm_add_epi16(a0, a1);
  __m128i b1 = _mm_sub_epi16(a0, a1);
  __m128i b2 = _mm_add_epi16(a2, a3);
  __m128i b3 = _mm_sub_epi16(a2, a3);
  __m128i b4 = _mm_add_epi16(a4, a5);
  __m128i b5 = _mm_sub_epi16(a4, a5);
  __m128i b6 = _mm_add_epi16(a6, a7);
  __m128i b7 = _mm_sub_epi16(a6, a7);

  a0 = _mm_add_epi16(b0, b2);
  a1 = _mm_add_epi16(b1, b3);
  a2 = _mm_sub_epi16(b0, b2);
  a3 = _mm_sub_epi16(b1, b3);
  a4 = _mm_add_epi16(b4, b6);
  a5 = _mm_add_epi16(b5, b7);
  a6 = _mm_sub_epi16(b4, b6);
  a7 = _mm_sub_epi16(b5, b7);

  if (iter == 0) {
    b0 = _mm_add_epi16(a0, a4);
    b7 = _mm_add_epi16(a1, a5);
    b3 = _mm_add_epi16(a2, a6);
    b4 = _mm_add_epi16(a3, a7);
    b2 = _mm_sub_epi16(a0, a4);
    b6 = _mm_sub_epi16(a1, a5);
    b1 = _mm_sub_epi16(a2, a6);
    b5 = _mm_sub_epi16(a3, a7);

    a0 = _mm_unpacklo_epi16(b0, b1);
    a1 = _mm_unpacklo_epi16(b2, b3);
    a2 = _mm_unpackhi_epi16(b0, b1);
    a3 = _mm_unpackhi_epi16(b2, b3);
    a4 = _mm_unpacklo_epi16(b4, b5);
    a5 = _mm_unpacklo_epi16(b6, b7);
    a6 = _mm_unpackhi_epi16(b4, b5);
    a7 = _mm_unpackhi_epi16(b6, b7);

    b0 = _mm_unpacklo_epi32(a0, a1);
    b1 = _mm_unpacklo_epi32(a4, a5);
    b2 = _mm_unpackhi_epi32(a0, a1);
    b3 = _mm_unpackhi_epi32(a4, a5);
    b4 = _mm_unpacklo_epi32(a2, a3);
    b5 = _mm_unpacklo_epi32(a6, a7);
    b6 = _mm_unpackhi_epi32(a2, a3);
    b7 = _mm_unpackhi_epi32(a6, a7);

    in[0] = _mm_unpacklo_epi64(b0, b1);
    in[1] = _mm_unpackhi_epi64(b0, b1);
    in[2] = _mm_unpacklo_epi64(b2, b3);
    in[3] = _mm_unpackhi_epi64(b2, b3);
    in[4] = _mm_unpacklo_epi64(b4, b5);
    in[5] = _mm_unpackhi_epi64(b4, b5);
    in[6] = _mm_unpacklo_epi64(b6, b7);
    in[7] = _mm_unpackhi_epi64(b6, b7);
  } else {
    in[0] = _mm_add_epi16(a0, a4);
    in[7] = _mm_add_epi16(a1, a5);
    in[3] = _mm_add_epi16(a2, a6);
    in[4] = _mm_add_epi16(a3, a7);
    in[2] = _mm_sub_epi16(a0, a4);
    in[6] = _mm_sub_epi16(a1, a5);
    in[1] = _mm_sub_epi16(a2, a6);
    in[5] = _mm_sub_epi16(a3, a7);
  }
}

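// 8x8 Hadamard transform of src_diff (stride src_stride). If is_final is
// nonzero, the result is written to coeff as tran_low_t via store_tran_low();
// otherwise coeff is treated as an int16_t buffer for an intermediate stage.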
static INLINE void hadamard_8x8_sse2(const int16_t *src_diff,
                                     ptrdiff_t src_stride, tran_low_t *coeff,
                                     int is_final) {
  __m128i src[8];
  src[0] = _mm_load_si128((const __m128i *)src_diff);
  src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[7] = _mm_load_si128((const __m128i *)(src_diff + src_stride));

  hadamard_col8_sse2(src, 0);
  hadamard_col8_sse2(src, 1);

  if (is_final) {
    store_tran_low(src[0], coeff);
    coeff += 8;
    store_tran_low(src[1], coeff);
    coeff += 8;
    store_tran_low(src[2], coeff);
    coeff += 8;
    store_tran_low(src[3], coeff);
    coeff += 8;
    store_tran_low(src[4], coeff);
    coeff += 8;
    store_tran_low(src[5], coeff);
    coeff += 8;
    store_tran_low(src[6], coeff);
    coeff += 8;
    store_tran_low(src[7], coeff);
  } else {
    int16_t *coeff16 = (int16_t *)coeff;
    _mm_store_si128((__m128i *)coeff16, src[0]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[1]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[2]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[3]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[4]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[5]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[6]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[7]);
  }
}

void aom_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
                           tran_low_t *coeff) {
  hadamard_8x8_sse2(src_diff, src_stride, coeff, 1);
}

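// Low-precision 8x8 Hadamard transform: the same computation as
// hadamard_8x8_sse2(), but the output is always written as int16_t.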
static INLINE void hadamard_lp_8x8_sse2(const int16_t *src_diff,
                                        ptrdiff_t src_stride, int16_t *coeff) {
  __m128i src[8];
  src[0] = _mm_load_si128((const __m128i *)src_diff);
  src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[7] = _mm_load_si128((const __m128i *)(src_diff + src_stride));

  hadamard_col8_sse2(src, 0);
  hadamard_col8_sse2(src, 1);

  _mm_store_si128((__m128i *)coeff, src[0]);
  coeff += 8;
  _mm_store_si128((__m128i *)coeff, src[1]);
  coeff += 8;
  _mm_store_si128((__m128i *)coeff, src[2]);
  coeff += 8;
  _mm_store_si128((__m128i *)coeff, src[3]);
  coeff += 8;
  _mm_store_si128((__m128i *)coeff, src[4]);
  coeff += 8;
  _mm_store_si128((__m128i *)coeff, src[5]);
  coeff += 8;
  _mm_store_si128((__m128i *)coeff, src[6]);
  coeff += 8;
  _mm_store_si128((__m128i *)coeff, src[7]);
}

void aom_hadamard_lp_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
                              int16_t *coeff) {
  hadamard_lp_8x8_sse2(src_diff, src_stride, coeff);
}

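// Applies the low-precision 8x8 Hadamard transform to two horizontally
// adjacent 8x8 blocks, writing 64 coefficients per block.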
void aom_hadamard_lp_8x8_dual_sse2(const int16_t *src_diff,
                                   ptrdiff_t src_stride, int16_t *coeff) {
  for (int i = 0; i < 2; i++) {
    hadamard_lp_8x8_sse2(src_diff + (i * 8), src_stride, coeff + (i * 64));
  }
}

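// Low-precision 16x16 Hadamard transform: four 8x8 transforms on the
// quadrants followed by a cross-block butterfly stage with a right shift by 1.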
void aom_hadamard_lp_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
                                int16_t *coeff) {
  for (int idx = 0; idx < 4; ++idx) {
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
    hadamard_lp_8x8_sse2(src_ptr, src_stride, coeff + idx * 64);
  }

  int16_t *t_coeff = coeff;
  for (int idx = 0; idx < 64; idx += 8) {
    __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
    __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 64));
    __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 128));
    __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 192));

    __m128i b0 = _mm_add_epi16(coeff0, coeff1);
    __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
    __m128i b2 = _mm_add_epi16(coeff2, coeff3);
    __m128i b3 = _mm_sub_epi16(coeff2, coeff3);

    b0 = _mm_srai_epi16(b0, 1);
    b1 = _mm_srai_epi16(b1, 1);
    b2 = _mm_srai_epi16(b2, 1);
    b3 = _mm_srai_epi16(b3, 1);

    coeff0 = _mm_add_epi16(b0, b2);
    coeff1 = _mm_add_epi16(b1, b3);
    coeff2 = _mm_sub_epi16(b0, b2);
    coeff3 = _mm_sub_epi16(b1, b3);

    _mm_store_si128((__m128i *)t_coeff, coeff0);
    _mm_store_si128((__m128i *)(t_coeff + 64), coeff1);
    _mm_store_si128((__m128i *)(t_coeff + 128), coeff2);
    _mm_store_si128((__m128i *)(t_coeff + 192), coeff3);

    t_coeff += 8;
  }
}

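// 16x16 Hadamard transform built from four 8x8 transforms plus a cross-block
// butterfly stage with a right shift by 1. With is_final == 0 the output is
// written as int16_t so the 32x32 transform can reuse it directly.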
static INLINE void hadamard_16x16_sse2(const int16_t *src_diff,
                                       ptrdiff_t src_stride, tran_low_t *coeff,
                                       int is_final) {
  // For high bitdepths, it is unnecessary to store_tran_low
  // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
  // next stage. Output to an intermediate buffer first, then store_tran_low()
  // in the final stage.
  DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]);
  int16_t *t_coeff = temp_coeff;
  int16_t *coeff16 = (int16_t *)coeff;
  int idx;
  for (idx = 0; idx < 4; ++idx) {
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
    hadamard_8x8_sse2(src_ptr, src_stride, (tran_low_t *)(t_coeff + idx * 64),
                      0);
  }

  for (idx = 0; idx < 64; idx += 8) {
    __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
    __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 64));
    __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 128));
    __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 192));

    __m128i b0 = _mm_add_epi16(coeff0, coeff1);
    __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
    __m128i b2 = _mm_add_epi16(coeff2, coeff3);
    __m128i b3 = _mm_sub_epi16(coeff2, coeff3);

    b0 = _mm_srai_epi16(b0, 1);
    b1 = _mm_srai_epi16(b1, 1);
    b2 = _mm_srai_epi16(b2, 1);
    b3 = _mm_srai_epi16(b3, 1);

    coeff0 = _mm_add_epi16(b0, b2);
    coeff1 = _mm_add_epi16(b1, b3);
    coeff2 = _mm_sub_epi16(b0, b2);
    coeff3 = _mm_sub_epi16(b1, b3);

    if (is_final) {
      store_tran_low_offset_4(coeff0, coeff);
      store_tran_low_offset_4(coeff1, coeff + 64);
      store_tran_low_offset_4(coeff2, coeff + 128);
      store_tran_low_offset_4(coeff3, coeff + 192);
      coeff += 4;
    } else {
      _mm_store_si128((__m128i *)coeff16, coeff0);
      _mm_store_si128((__m128i *)(coeff16 + 64), coeff1);
      _mm_store_si128((__m128i *)(coeff16 + 128), coeff2);
      _mm_store_si128((__m128i *)(coeff16 + 192), coeff3);
      coeff16 += 8;
    }

    t_coeff += 8;
    // Increment the pointer additionally by 0 and 8 in alternate iterations
    // (instead of 8) to ensure coherency with the implementation of
    // store_tran_low_offset_4().
    coeff += (((idx >> 3) & 1) << 3);
  }
}

void aom_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
                             tran_low_t *coeff) {
  hadamard_16x16_sse2(src_diff, src_stride, coeff, 1);
}

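// 32x32 Hadamard transform built from four 16x16 stages. The final butterfly
// is done in 32 bits (after sign extension) to avoid 16-bit overflow, shifted
// right by 2, then packed back to 16 bits before store_tran_low_offset_4().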
void aom_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
                             tran_low_t *coeff) {
  // For high bitdepths, it is unnecessary to store_tran_low
  // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
  // next stage. Output to an intermediate buffer first, then store_tran_low()
  // in the final stage.
  DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]);
  int16_t *t_coeff = temp_coeff;
  int idx;
  __m128i coeff0_lo, coeff1_lo, coeff2_lo, coeff3_lo, b0_lo, b1_lo, b2_lo,
      b3_lo;
  __m128i coeff0_hi, coeff1_hi, coeff2_hi, coeff3_hi, b0_hi, b1_hi, b2_hi,
      b3_hi;
  __m128i b0, b1, b2, b3;
  const __m128i zero = _mm_setzero_si128();
  for (idx = 0; idx < 4; ++idx) {
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
    hadamard_16x16_sse2(src_ptr, src_stride,
                        (tran_low_t *)(t_coeff + idx * 256), 0);
  }

  for (idx = 0; idx < 256; idx += 8) {
    __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
    __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 256));
    __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 512));
    __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 768));

    // Sign extend 16 bit to 32 bit.
    sign_extend_16bit_to_32bit_sse2(coeff0, zero, &coeff0_lo, &coeff0_hi);
    sign_extend_16bit_to_32bit_sse2(coeff1, zero, &coeff1_lo, &coeff1_hi);
    sign_extend_16bit_to_32bit_sse2(coeff2, zero, &coeff2_lo, &coeff2_hi);
    sign_extend_16bit_to_32bit_sse2(coeff3, zero, &coeff3_lo, &coeff3_hi);

    b0_lo = _mm_add_epi32(coeff0_lo, coeff1_lo);
    b0_hi = _mm_add_epi32(coeff0_hi, coeff1_hi);

    b1_lo = _mm_sub_epi32(coeff0_lo, coeff1_lo);
    b1_hi = _mm_sub_epi32(coeff0_hi, coeff1_hi);

    b2_lo = _mm_add_epi32(coeff2_lo, coeff3_lo);
    b2_hi = _mm_add_epi32(coeff2_hi, coeff3_hi);

    b3_lo = _mm_sub_epi32(coeff2_lo, coeff3_lo);
    b3_hi = _mm_sub_epi32(coeff2_hi, coeff3_hi);

    b0_lo = _mm_srai_epi32(b0_lo, 2);
    b1_lo = _mm_srai_epi32(b1_lo, 2);
    b2_lo = _mm_srai_epi32(b2_lo, 2);
    b3_lo = _mm_srai_epi32(b3_lo, 2);

    b0_hi = _mm_srai_epi32(b0_hi, 2);
    b1_hi = _mm_srai_epi32(b1_hi, 2);
    b2_hi = _mm_srai_epi32(b2_hi, 2);
    b3_hi = _mm_srai_epi32(b3_hi, 2);

    b0 = _mm_packs_epi32(b0_lo, b0_hi);
    b1 = _mm_packs_epi32(b1_lo, b1_hi);
    b2 = _mm_packs_epi32(b2_lo, b2_hi);
    b3 = _mm_packs_epi32(b3_lo, b3_hi);

    coeff0 = _mm_add_epi16(b0, b2);
    coeff1 = _mm_add_epi16(b1, b3);
    store_tran_low_offset_4(coeff0, coeff);
    store_tran_low_offset_4(coeff1, coeff + 256);

    coeff2 = _mm_sub_epi16(b0, b2);
    coeff3 = _mm_sub_epi16(b1, b3);
    store_tran_low_offset_4(coeff2, coeff + 512);
    store_tran_low_offset_4(coeff3, coeff + 768);

    // Increment the pointer by 4 and 12 in alternate iterations (instead of
    // 8) to ensure coherency with the implementation of
    // store_tran_low_offset_4().
    coeff += (4 + (((idx >> 3) & 1) << 3));
    t_coeff += 8;
  }
}

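// Sums the absolute values of 'length' tran_low_t coefficients, four per
// iteration; 'length' is assumed to be a multiple of 4 and 'coeff' aligned.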
int aom_satd_sse2(const tran_low_t *coeff, int length) {
  int i;
  const __m128i zero = _mm_setzero_si128();
  __m128i accum = zero;

  for (i = 0; i < length; i += 4) {
    const __m128i src_line = _mm_load_si128((const __m128i *)coeff);
    const __m128i coeff_sign = _mm_srai_epi32(src_line, 31);
    const __m128i abs_coeff = invert_sign_32_sse2(src_line, coeff_sign);
    accum = _mm_add_epi32(accum, abs_coeff);
    coeff += 4;
  }

  {  // cascading summation of accum
    __m128i hi = _mm_srli_si128(accum, 8);
    accum = _mm_add_epi32(accum, hi);
    hi = _mm_srli_epi64(accum, 32);
    accum = _mm_add_epi32(accum, hi);
  }

  return _mm_cvtsi128_si32(accum);
}

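// Sums the absolute values of 'length' 16-bit coefficients, 16 per iteration;
// 'length' is assumed to be a multiple of 16.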
int aom_satd_lp_sse2(const int16_t *coeff, int length) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i one = _mm_set1_epi16(1);
  __m128i accum = zero;

  for (int i = 0; i < length; i += 16) {
    const __m128i src_line0 = _mm_loadu_si128((const __m128i *)coeff);
    const __m128i src_line1 = _mm_loadu_si128((const __m128i *)(coeff + 8));
    const __m128i inv0 = _mm_sub_epi16(zero, src_line0);
    const __m128i inv1 = _mm_sub_epi16(zero, src_line1);
    const __m128i abs0 = _mm_max_epi16(src_line0, inv0);  // abs(src_line)
    const __m128i abs1 = _mm_max_epi16(src_line1, inv1);  // abs(src_line)
    const __m128i sum0 = _mm_madd_epi16(abs0, one);
    const __m128i sum1 = _mm_madd_epi16(abs1, one);
    accum = _mm_add_epi32(accum, sum0);
    accum = _mm_add_epi32(accum, sum1);
    coeff += 16;
  }

  {  // cascading summation of accum
    __m128i hi = _mm_srli_si128(accum, 8);
    accum = _mm_add_epi32(accum, hi);
    hi = _mm_srli_epi64(accum, 32);
    accum = _mm_add_epi32(accum, hi);
  }

  return _mm_cvtsi128_si32(accum);
}

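// For each column of the width x height block at 'ref', sums the pixels down
// the rows, shifts the sum right by norm_factor, and stores one 16-bit value
// per column in hbuf.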
void aom_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref,
                          const int ref_stride, const int width,
                          const int height, int norm_factor) {
  // The SIMD implementation assumes width and height are multiples of 16 and
  // 2 respectively. For any odd width or height, SIMD support needs to be
  // added.
  assert(width % 16 == 0 && height % 2 == 0);
  __m128i zero = _mm_setzero_si128();

  for (int wd = 0; wd < width; wd += 16) {
    const uint8_t *ref_tmp = ref + wd;
    int16_t *hbuf_tmp = hbuf + wd;
    __m128i s0 = zero;
    __m128i s1 = zero;
    int idx = 0;
    do {
      __m128i src_line = _mm_loadu_si128((const __m128i *)ref_tmp);
      __m128i t0 = _mm_unpacklo_epi8(src_line, zero);
      __m128i t1 = _mm_unpackhi_epi8(src_line, zero);
      s0 = _mm_add_epi16(s0, t0);
      s1 = _mm_add_epi16(s1, t1);
      ref_tmp += ref_stride;

      src_line = _mm_loadu_si128((const __m128i *)ref_tmp);
      t0 = _mm_unpacklo_epi8(src_line, zero);
      t1 = _mm_unpackhi_epi8(src_line, zero);
      s0 = _mm_add_epi16(s0, t0);
      s1 = _mm_add_epi16(s1, t1);
      ref_tmp += ref_stride;
      idx += 2;
    } while (idx < height);

    s0 = _mm_srai_epi16(s0, norm_factor);
    s1 = _mm_srai_epi16(s1, norm_factor);
    _mm_storeu_si128((__m128i *)(hbuf_tmp), s0);
    _mm_storeu_si128((__m128i *)(hbuf_tmp + 8), s1);
  }
}

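// For each row of the width x height block at 'ref', sums the row's pixels
// with _mm_sad_epu8, shifts right by norm_factor, and stores one 16-bit value
// per row in vbuf.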
void aom_int_pro_col_sse2(int16_t *vbuf, const uint8_t *ref,
                          const int ref_stride, const int width,
                          const int height, int norm_factor) {
  // The SIMD implementation assumes width is a multiple of 16.
  assert(width % 16 == 0);

  for (int ht = 0; ht < height; ht++) {
    const uint8_t *ref_tmp = ref + (ht * ref_stride);
    __m128i zero = _mm_setzero_si128();
    __m128i s0 = zero;
    __m128i s1, src_line;
    for (int i = 0; i < width; i += 16) {
      src_line = _mm_loadu_si128((const __m128i *)ref_tmp);
      s1 = _mm_sad_epu8(src_line, zero);
      s0 = _mm_add_epi16(s0, s1);
      ref_tmp += 16;
    }

    s1 = _mm_srli_si128(s0, 8);
    s0 = _mm_add_epi16(s0, s1);
    vbuf[ht] = _mm_cvtsi128_si32(s0) >> norm_factor;
  }
}