/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <tmmintrin.h>

#include "config/aom_dsp_rtcd.h"

#include "aom_dsp/intrapred_common.h"

// -----------------------------------------------------------------------------
// PAETH_PRED

// Return 8 16-bit pixels in one row
static INLINE __m128i paeth_8x1_pred(const __m128i *left, const __m128i *top,
                                     const __m128i *topleft) {
  const __m128i base = _mm_sub_epi16(_mm_add_epi16(*top, *left), *topleft);

  __m128i pl = _mm_abs_epi16(_mm_sub_epi16(base, *left));
  __m128i pt = _mm_abs_epi16(_mm_sub_epi16(base, *top));
  __m128i ptl = _mm_abs_epi16(_mm_sub_epi16(base, *topleft));

  __m128i mask1 = _mm_cmpgt_epi16(pl, pt);
  mask1 = _mm_or_si128(mask1, _mm_cmpgt_epi16(pl, ptl));
  __m128i mask2 = _mm_cmpgt_epi16(pt, ptl);

  pl = _mm_andnot_si128(mask1, *left);

  ptl = _mm_and_si128(mask2, *topleft);
  pt = _mm_andnot_si128(mask2, *top);
  pt = _mm_or_si128(pt, ptl);
  pt = _mm_and_si128(mask1, pt);

  return _mm_or_si128(pl, pt);
}

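// For reference only: an unused scalar sketch of the Paeth selection that the
// masks above implement (the helper name is illustrative, not an AV1 API).
// mask1 rejects 'left' whenever pl > pt or pl > ptl, and mask2 chooses between
// top and top-left, so ties resolve in the order left, top, top-left.
static INLINE uint8_t paeth_scalar_sketch(uint8_t left, uint8_t top,
                                          uint8_t topleft) {
  const int base = (int)top + (int)left - (int)topleft;
  const int dl = base - (int)left;
  const int dt = base - (int)top;
  const int dtl = base - (int)topleft;
  const int pl = dl < 0 ? -dl : dl;
  const int pt = dt < 0 ? -dt : dt;
  const int ptl = dtl < 0 ? -dtl : dtl;
  if (pl <= pt && pl <= ptl) return left;
  return (pt <= ptl) ? top : topleft;
}
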
void aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
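  // 'rep' is a shuffle control for _mm_shuffle_epi8: in every 16-bit lane the
  // low byte is the index of the left-column pixel to broadcast and the high
  // byte is 0x80, which zero-fills that byte. Adding 'one' per row advances
  // the broadcast from left[0] to left[1], and so on. The same pattern is
  // reused by the Paeth and smooth predictors below.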
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 4; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 8; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_load_si128((const __m128i *)left);
  const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);

  for (int i = 0; i < 16; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 4; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 8; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_load_si128((const __m128i *)left);
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 16; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);

  for (int j = 0; j < 2; ++j) {
    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
    __m128i rep = _mm_set1_epi16(0x8000);
    for (int i = 0; i < 16; ++i) {
      const __m128i l16 = _mm_shuffle_epi8(l, rep);
      const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

      _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
      dst += stride;
      rep = _mm_add_epi16(rep, one);
    }
  }
}

// Return 16 8-bit pixels in one row
static INLINE __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0,
                                      const __m128i *top1,
                                      const __m128i *topleft) {
  const __m128i p0 = paeth_8x1_pred(left, top0, topleft);
  const __m128i p1 = paeth_8x1_pred(left, top1, topleft);
  return _mm_packus_epi16(p0, p1);
}

void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
  const __m128i t = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);

  for (int i = 0; i < 4; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  const __m128i t = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 8; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m128i l = _mm_load_si128((const __m128i *)left);
  const __m128i t = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 16; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m128i l = _mm_load_si128((const __m128i *)left);
  const __m128i t = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l16;

  int i;
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }

  l = _mm_load_si128((const __m128i *)(left + 16));
  rep = _mm_set1_epi16(0x8000);
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i t = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);

  for (int j = 0; j < 4; ++j) {
    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
    __m128i rep = _mm_set1_epi16(0x8000);
    for (int i = 0; i < 16; ++i) {
      const __m128i l16 = _mm_shuffle_epi8(l, rep);
      const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
      _mm_store_si128((__m128i *)dst, row);
      dst += stride;
      rep = _mm_add_epi16(rep, one);
    }
  }
}

void aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);

  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);
  const __m128i l = _mm_loadl_epi64((const __m128i *)left);
  __m128i l16;

  for (int i = 0; i < 8; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);

    _mm_store_si128((__m128i *)dst, r32l);
    _mm_store_si128((__m128i *)(dst + 16), r32h);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);

  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l = _mm_load_si128((const __m128i *)left);
  __m128i l16;

  int i;
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);

    _mm_store_si128((__m128i *)dst, r32l);
    _mm_store_si128((__m128i *)(dst + 16), r32h);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);

  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l = _mm_load_si128((const __m128i *)left);
  __m128i l16;

  int i;
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);

    _mm_store_si128((__m128i *)dst, r32l);
    _mm_store_si128((__m128i *)(dst + 16), r32h);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }

  rep = _mm_set1_epi16(0x8000);
  l = _mm_load_si128((const __m128i *)(left + 16));
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);

    _mm_store_si128((__m128i *)dst, r32l);
    _mm_store_si128((__m128i *)(dst + 16), r32h);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);

  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l16;

  int i, j;
  for (j = 0; j < 4; ++j) {
    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
    __m128i rep = _mm_set1_epi16(0x8000);
    for (i = 0; i < 16; ++i) {
      l16 = _mm_shuffle_epi8(l, rep);
      const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
      const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);

      _mm_store_si128((__m128i *)dst, r32l);
      _mm_store_si128((__m128i *)(dst + 16), r32h);
      dst += stride;
      rep = _mm_add_epi16(rep, one);
    }
  }
}

void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);
  const __m128i cl = _mm_unpacklo_epi8(c, zero);
  const __m128i ch = _mm_unpackhi_epi8(c, zero);
  const __m128i dl = _mm_unpacklo_epi8(d, zero);
  const __m128i dh = _mm_unpackhi_epi8(d, zero);

  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l16;

  int i, j;
  for (j = 0; j < 2; ++j) {
    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
    __m128i rep = _mm_set1_epi16(0x8000);
    for (i = 0; i < 16; ++i) {
      l16 = _mm_shuffle_epi8(l, rep);
      const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
      const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
      const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
      const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);

      _mm_store_si128((__m128i *)dst, r0);
      _mm_store_si128((__m128i *)(dst + 16), r1);
      _mm_store_si128((__m128i *)(dst + 32), r2);
      _mm_store_si128((__m128i *)(dst + 48), r3);
      dst += stride;
      rep = _mm_add_epi16(rep, one);
    }
  }
}

void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);
  const __m128i cl = _mm_unpacklo_epi8(c, zero);
  const __m128i ch = _mm_unpackhi_epi8(c, zero);
  const __m128i dl = _mm_unpacklo_epi8(d, zero);
  const __m128i dh = _mm_unpackhi_epi8(d, zero);

  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l16;

  int i, j;
  for (j = 0; j < 4; ++j) {
    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
    __m128i rep = _mm_set1_epi16(0x8000);
    for (i = 0; i < 16; ++i) {
      l16 = _mm_shuffle_epi8(l, rep);
      const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
      const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
      const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
      const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);

      _mm_store_si128((__m128i *)dst, r0);
      _mm_store_si128((__m128i *)(dst + 16), r1);
      _mm_store_si128((__m128i *)(dst + 32), r2);
      _mm_store_si128((__m128i *)(dst + 48), r3);
      dst += stride;
      rep = _mm_add_epi16(rep, one);
    }
  }
}

void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);
  const __m128i cl = _mm_unpacklo_epi8(c, zero);
  const __m128i ch = _mm_unpackhi_epi8(c, zero);
  const __m128i dl = _mm_unpacklo_epi8(d, zero);
  const __m128i dh = _mm_unpackhi_epi8(d, zero);

  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l16;

  int i;
  const __m128i l = _mm_load_si128((const __m128i *)left);
  __m128i rep = _mm_set1_epi16(0x8000);
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
    const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
    const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
    const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);

    _mm_store_si128((__m128i *)dst, r0);
    _mm_store_si128((__m128i *)(dst + 16), r1);
    _mm_store_si128((__m128i *)(dst + 32), r2);
    _mm_store_si128((__m128i *)(dst + 48), r3);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

// -----------------------------------------------------------------------------
// SMOOTH_PRED

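// For reference only: an unused scalar sketch of the value every SMOOTH
// predictor below computes for pixel (r, c). It blends a vertical
// interpolation (above[c] toward the bottom-left pixel) with a horizontal one
// (left[r] toward the top-right pixel) using the sm_weight_arrays tables from
// intrapred_common.h. The helper name is illustrative, not an AV1 API.
static INLINE uint8_t smooth_scalar_sketch(const uint8_t *above,
                                           const uint8_t *left, int bw, int bh,
                                           int r, int c) {
  const uint8_t *const w_h = sm_weight_arrays + bh;
  const uint8_t *const w_w = sm_weight_arrays + bw;
  const int scale = 1 << sm_weight_log2_scale;
  const int pred = w_h[r] * above[c] + (scale - w_h[r]) * left[bh - 1] +
                   w_w[c] * left[r] + (scale - w_w[c]) * above[bw - 1];
  return (uint8_t)((pred + (1 << sm_weight_log2_scale)) >>
                   (1 + sm_weight_log2_scale));
}
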
// pixels[0]: above and below_pred interleave vector
// pixels[1]: left vector
// pixels[2]: right_pred vector
static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left,
                                 int height, __m128i *pixels) {
  __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
  if (height == 4)
    pixels[1] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
  else if (height == 8)
    pixels[1] = _mm_loadl_epi64(((const __m128i *)left));
  else
    pixels[1] = _mm_loadu_si128(((const __m128i *)left));

  pixels[2] = _mm_set1_epi16((uint16_t)above[3]);

  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
  const __m128i zero = _mm_setzero_si128();
  d = _mm_unpacklo_epi8(d, zero);
  pixels[0] = _mm_unpacklo_epi16(d, bp);
}

// weight_h[0]: weight_h vector
// weight_h[1]: scale - weight_h vector
// weight_h[2]: same as [0], second half for height = 16 only
// weight_h[3]: same as [1], second half for height = 16 only
// weight_w[0]: weights_w and scale - weights_w interleave vector
static INLINE void load_weight_w4(const uint8_t *weight_array, int height,
                                  __m128i *weight_h, __m128i *weight_w) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
  const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]);
  weight_h[0] = _mm_unpacklo_epi8(t, zero);
  weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
  weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);

  if (height == 8) {
    const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]);
    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
  } else if (height == 16) {
    const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
    weight_h[2] = _mm_unpackhi_epi8(weight, zero);
    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
  }
}

static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *wh,
                                   const __m128i *ww, int h, uint8_t *dst,
                                   ptrdiff_t stride, int second_half) {
  const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
  const __m128i one = _mm_set1_epi16(1);
  const __m128i inc = _mm_set1_epi16(0x202);
  const __m128i gat = _mm_set1_epi32(0xc080400);
  __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000);
  __m128i d = _mm_set1_epi16(0x100);

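  // Per-row selectors: 'd' picks the 16-bit height weight for row i out of
  // wh[0]/wh[1] ('inc' bumps the byte indices by 2 each row), 'rep' broadcasts
  // left[i] (or left[i + 8] for the second half) out of pixel[1], and 'gat'
  // gathers the low byte of each 32-bit sum into the four output pixels.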
  for (int i = 0; i < h; ++i) {
    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
    __m128i s = _mm_madd_epi16(pixel[0], wh_sc);

    __m128i b = _mm_shuffle_epi8(pixel[1], rep);
    b = _mm_unpacklo_epi16(b, pixel[2]);
    __m128i sum = _mm_madd_epi16(b, ww[0]);

    sum = _mm_add_epi32(s, sum);
    sum = _mm_add_epi32(sum, round);
    sum = _mm_srai_epi32(sum, 1 + sm_weight_log2_scale);

    sum = _mm_shuffle_epi8(sum, gat);
    *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
    dst += stride;

    rep = _mm_add_epi16(rep, one);
    d = _mm_add_epi16(d, inc);
  }
}

void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i pixels[3];
  load_pixel_w4(above, left, 4, pixels);

  __m128i wh[4], ww[2];
  load_weight_w4(sm_weight_arrays, 4, wh, ww);

  smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0);
}

void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i pixels[3];
  load_pixel_w4(above, left, 8, pixels);

  __m128i wh[4], ww[2];
  load_weight_w4(sm_weight_arrays, 8, wh, ww);

  smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
}

void aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m128i pixels[3];
  load_pixel_w4(above, left, 16, pixels);

  __m128i wh[4], ww[2];
  load_weight_w4(sm_weight_arrays, 16, wh, ww);

  smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_pred_4xh(pixels, &wh[2], ww, 8, dst, stride, 1);
}

// pixels[0]: above and below_pred interleave vector, first half
// pixels[1]: above and below_pred interleave vector, second half
// pixels[2]: left vector
// pixels[3]: right_pred vector
// pixels[4]: above and below_pred interleave vector, first half
// pixels[5]: above and below_pred interleave vector, second half
// pixels[6]: left vector + 16
// pixels[7]: right_pred vector
static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left,
                                 int height, __m128i *pixels) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
  __m128i d = _mm_loadl_epi64((const __m128i *)above);
  d = _mm_unpacklo_epi8(d, zero);
  pixels[0] = _mm_unpacklo_epi16(d, bp);
  pixels[1] = _mm_unpackhi_epi16(d, bp);

  pixels[3] = _mm_set1_epi16((uint16_t)above[7]);

  if (height == 4) {
    pixels[2] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
  } else if (height == 8) {
    pixels[2] = _mm_loadl_epi64((const __m128i *)left);
  } else if (height == 16) {
    pixels[2] = _mm_load_si128((const __m128i *)left);
  } else {
    pixels[2] = _mm_load_si128((const __m128i *)left);
    pixels[4] = pixels[0];
    pixels[5] = pixels[1];
    pixels[6] = _mm_load_si128((const __m128i *)(left + 16));
    pixels[7] = pixels[3];
  }
}

// weight_h[0]: weight_h vector
// weight_h[1]: scale - weight_h vector
// weight_h[2]: same as [0], offset 8
// weight_h[3]: same as [1], offset 8
// weight_h[4]: same as [0], offset 16
// weight_h[5]: same as [1], offset 16
// weight_h[6]: same as [0], offset 24
// weight_h[7]: same as [1], offset 24
// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
static INLINE void load_weight_w8(const uint8_t *weight_array, int height,
                                  __m128i *weight_h, __m128i *weight_w) {
  const __m128i zero = _mm_setzero_si128();
  const int we_offset = height < 8 ? 4 : 8;
  __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[we_offset]);
  weight_h[0] = _mm_unpacklo_epi8(we, zero);
  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
  weight_h[1] = _mm_sub_epi16(d, weight_h[0]);

  if (height == 4) {
    we = _mm_srli_si128(we, 4);
    __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
    __m128i tmp2 = _mm_sub_epi16(d, tmp1);
    weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
    weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
  } else {
    weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
    weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
  }

  if (height == 16) {
    we = _mm_loadu_si128((const __m128i *)&weight_array[16]);
    weight_h[0] = _mm_unpacklo_epi8(we, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
    weight_h[2] = _mm_unpackhi_epi8(we, zero);
    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
  } else if (height == 32) {
    const __m128i weight_lo =
        _mm_loadu_si128((const __m128i *)&weight_array[32]);
    weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
    weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
    const __m128i weight_hi =
        _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
    weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
    weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
    weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
    weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
  }
}

static INLINE void smooth_pred_8xh(const __m128i *pixels, const __m128i *wh,
                                   const __m128i *ww, int h, uint8_t *dst,
                                   ptrdiff_t stride, int second_half) {
  const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
  const __m128i one = _mm_set1_epi16(1);
  const __m128i inc = _mm_set1_epi16(0x202);
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);

  __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000);
  __m128i d = _mm_set1_epi16(0x100);

  int i;
  for (i = 0; i < h; ++i) {
    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
    __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
    __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);

    __m128i b = _mm_shuffle_epi8(pixels[2], rep);
    b = _mm_unpacklo_epi16(b, pixels[3]);
    __m128i sum0 = _mm_madd_epi16(b, ww[0]);
    __m128i sum1 = _mm_madd_epi16(b, ww[1]);

    s0 = _mm_add_epi32(s0, sum0);
    s0 = _mm_add_epi32(s0, round);
    s0 = _mm_srai_epi32(s0, 1 + sm_weight_log2_scale);

    s1 = _mm_add_epi32(s1, sum1);
    s1 = _mm_add_epi32(s1, round);
    s1 = _mm_srai_epi32(s1, 1 + sm_weight_log2_scale);

    sum0 = _mm_packus_epi16(s0, s1);
    sum0 = _mm_shuffle_epi8(sum0, gat);
    _mm_storel_epi64((__m128i *)dst, sum0);
    dst += stride;

    rep = _mm_add_epi16(rep, one);
    d = _mm_add_epi16(d, inc);
  }
}

void aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i pixels[4];
  load_pixel_w8(above, left, 4, pixels);

  __m128i wh[4], ww[2];
  load_weight_w8(sm_weight_arrays, 4, wh, ww);

  smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0);
}

void aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i pixels[4];
  load_pixel_w8(above, left, 8, pixels);

  __m128i wh[4], ww[2];
  load_weight_w8(sm_weight_arrays, 8, wh, ww);

  smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
}

void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m128i pixels[4];
  load_pixel_w8(above, left, 16, pixels);

  __m128i wh[4], ww[2];
  load_weight_w8(sm_weight_arrays, 16, wh, ww);

  smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1);
}

void aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m128i pixels[8];
  load_pixel_w8(above, left, 32, pixels);

  __m128i wh[8], ww[2];
  load_weight_w8(sm_weight_arrays, 32, wh, ww);

  smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1);
  dst += stride << 3;
  smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1);
}

static INLINE void smooth_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left, uint32_t bw,
                                        uint32_t bh) {
  const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
  const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
  const __m128i zero = _mm_setzero_si128();
  const __m128i scale_value =
      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
  const __m128i bottom_left = _mm_cvtsi32_si128((uint32_t)left[bh - 1]);
  const __m128i dup16 = _mm_set1_epi32(0x01000100);
  const __m128i top_right =
      _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)above[bw - 1]), dup16);
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
  const __m128i round = _mm_set1_epi32((uint16_t)(1 << sm_weight_log2_scale));

  for (uint32_t y = 0; y < bh; ++y) {
    const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
    const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
    const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y);
    __m128i pred_scaled_bl = _mm_mullo_epi16(scale_m_weights_y, bottom_left);
    const __m128i wl_y =
        _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0);
    pred_scaled_bl = _mm_add_epi32(pred_scaled_bl, round);
    pred_scaled_bl = _mm_shuffle_epi32(pred_scaled_bl, 0);

    for (uint32_t x = 0; x < bw; x += 8) {
      const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x));
      const __m128i weights_x =
          _mm_loadl_epi64((const __m128i *)(sm_weights_w + x));
      const __m128i tw_x = _mm_unpacklo_epi8(top_x, weights_x);
      const __m128i tw_x_lo = _mm_unpacklo_epi8(tw_x, zero);
      const __m128i tw_x_hi = _mm_unpackhi_epi8(tw_x, zero);

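      // The madd pairs (top[x], weights_w[x]) with (weights_h[y], left[y]), so
      // each 32-bit lane holds top[x] * weights_h[y] + weights_w[x] * left[y].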
      __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y);
      __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y);

      const __m128i scale_m_weights_x =
          _mm_sub_epi16(scale_value, _mm_unpacklo_epi8(weights_x, zero));
      const __m128i swxtr = _mm_mullo_epi16(scale_m_weights_x, top_right);
      const __m128i swxtr_lo = _mm_unpacklo_epi16(swxtr, zero);
      const __m128i swxtr_hi = _mm_unpackhi_epi16(swxtr, zero);

      pred_lo = _mm_add_epi32(pred_lo, pred_scaled_bl);
      pred_hi = _mm_add_epi32(pred_hi, pred_scaled_bl);

      pred_lo = _mm_add_epi32(pred_lo, swxtr_lo);
      pred_hi = _mm_add_epi32(pred_hi, swxtr_hi);

      pred_lo = _mm_srai_epi32(pred_lo, (1 + sm_weight_log2_scale));
      pred_hi = _mm_srai_epi32(pred_hi, (1 + sm_weight_log2_scale));

      __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
      pred = _mm_shuffle_epi8(pred, gat);
      _mm_storel_epi64((__m128i *)(dst + x), pred);
    }
    dst += stride;
  }
}

void aom_smooth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 4);
}

void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 8);
}

void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 16);
}

void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 32);
}

void aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 32, 8);
}

void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 32, 16);
}

void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 32, 32);
}

void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 32, 64);
}

void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 64, 64);
}

void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 64, 32);
}

void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 64, 16);
}

void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 64);
}

// -----------------------------------------------------------------------------
// SMOOTH_V_PRED

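// For reference only: an unused scalar sketch of the SMOOTH_V prediction
// computed below, interpolating above[c] toward the bottom-left pixel with
// the height weights from sm_weight_arrays (the helper name is illustrative).
static INLINE uint8_t smooth_v_scalar_sketch(const uint8_t *above,
                                             const uint8_t *left, int bh,
                                             int r, int c) {
  const uint8_t *const w_h = sm_weight_arrays + bh;
  const int scale = 1 << sm_weight_log2_scale;
  const int pred = w_h[r] * above[c] + (scale - w_h[r]) * left[bh - 1];
  return (uint8_t)((pred + (1 << (sm_weight_log2_scale - 1))) >>
                   sm_weight_log2_scale);
}
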
// pixels[0]: above and below_pred interleave vector
static INLINE void load_pixel_v_w4(const uint8_t *above, const uint8_t *left,
                                   int height, __m128i *pixels) {
  const __m128i zero = _mm_setzero_si128();
  __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
  d = _mm_unpacklo_epi8(d, zero);
  pixels[0] = _mm_unpacklo_epi16(d, bp);
}

// weights[0]: weights_h vector
// weights[1]: scale - weights_h vector
static INLINE void load_weight_v_w4(const uint8_t *weight_array, int height,
                                    __m128i *weights) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));

  if (height == 4) {
    const __m128i weight =
        _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]);
    weights[0] = _mm_unpacklo_epi8(weight, zero);
    weights[1] = _mm_sub_epi16(d, weights[0]);
  } else if (height == 8) {
    const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]);
    weights[0] = _mm_unpacklo_epi8(weight, zero);
    weights[1] = _mm_sub_epi16(d, weights[0]);
  } else {
    const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
    weights[0] = _mm_unpacklo_epi8(weight, zero);
    weights[1] = _mm_sub_epi16(d, weights[0]);
    weights[2] = _mm_unpackhi_epi8(weight, zero);
    weights[3] = _mm_sub_epi16(d, weights[2]);
  }
}

static INLINE void smooth_v_pred_4xh(const __m128i *pixel,
                                     const __m128i *weight, int h,
                                     uint8_t *dst, ptrdiff_t stride) {
  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
  const __m128i inc = _mm_set1_epi16(0x202);
  const __m128i gat = _mm_set1_epi32(0xc080400);
  __m128i d = _mm_set1_epi16(0x100);

  for (int i = 0; i < h; ++i) {
    const __m128i wg_wg = _mm_shuffle_epi8(weight[0], d);
    const __m128i sc_sc = _mm_shuffle_epi8(weight[1], d);
    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
    __m128i sum = _mm_madd_epi16(pixel[0], wh_sc);
    sum = _mm_add_epi32(sum, pred_round);
    sum = _mm_srai_epi32(sum, sm_weight_log2_scale);
    sum = _mm_shuffle_epi8(sum, gat);
    *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
    dst += stride;
    d = _mm_add_epi16(d, inc);
  }
}

void aom_smooth_v_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i pixels;
  load_pixel_v_w4(above, left, 4, &pixels);

  __m128i weights[2];
  load_weight_v_w4(sm_weight_arrays, 4, weights);

  smooth_v_pred_4xh(&pixels, weights, 4, dst, stride);
}

void aom_smooth_v_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i pixels;
  load_pixel_v_w4(above, left, 8, &pixels);

  __m128i weights[2];
  load_weight_v_w4(sm_weight_arrays, 8, weights);

  smooth_v_pred_4xh(&pixels, weights, 8, dst, stride);
}

void aom_smooth_v_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  __m128i pixels;
  load_pixel_v_w4(above, left, 16, &pixels);

  __m128i weights[4];
  load_weight_v_w4(sm_weight_arrays, 16, weights);

  smooth_v_pred_4xh(&pixels, weights, 8, dst, stride);
  dst += stride << 3;
  smooth_v_pred_4xh(&pixels, &weights[2], 8, dst, stride);
}

// pixels[0]: above and below_pred interleave vector, first half
// pixels[1]: above and below_pred interleave vector, second half
static INLINE void load_pixel_v_w8(const uint8_t *above, const uint8_t *left,
                                   int height, __m128i *pixels) {
  const __m128i zero = _mm_setzero_si128();
  __m128i d = _mm_loadl_epi64((const __m128i *)above);
  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
  d = _mm_unpacklo_epi8(d, zero);
  pixels[0] = _mm_unpacklo_epi16(d, bp);
  pixels[1] = _mm_unpackhi_epi16(d, bp);
}

// weight_h[0]: weight_h vector
// weight_h[1]: scale - weight_h vector
// weight_h[2]: same as [0], offset 8
// weight_h[3]: same as [1], offset 8
// weight_h[4]: same as [0], offset 16
// weight_h[5]: same as [1], offset 16
// weight_h[6]: same as [0], offset 24
// weight_h[7]: same as [1], offset 24
static INLINE void load_weight_v_w8(const uint8_t *weight_array, int height,
                                    __m128i *weight_h) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));

  if (height < 16) {
    const int offset = height < 8 ? 4 : 8;
    const __m128i weight =
        _mm_loadu_si128((const __m128i *)&weight_array[offset]);
    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
  } else if (height == 16) {
    const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
    weight_h[2] = _mm_unpackhi_epi8(weight, zero);
    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
  } else {
    const __m128i weight_lo =
        _mm_loadu_si128((const __m128i *)&weight_array[32]);
    weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
    weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
    const __m128i weight_hi =
        _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
    weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
    weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
    weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
    weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
  }
}

static INLINE void smooth_v_pred_8xh(const __m128i *pixels, const __m128i *wh,
                                     int h, uint8_t *dst, ptrdiff_t stride) {
  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
  const __m128i inc = _mm_set1_epi16(0x202);
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
  __m128i d = _mm_set1_epi16(0x100);

  for (int i = 0; i < h; ++i) {
    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
    __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
    __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);

    s0 = _mm_add_epi32(s0, pred_round);
    s0 = _mm_srai_epi32(s0, sm_weight_log2_scale);

    s1 = _mm_add_epi32(s1, pred_round);
    s1 = _mm_srai_epi32(s1, sm_weight_log2_scale);

    __m128i sum01 = _mm_packus_epi16(s0, s1);
    sum01 = _mm_shuffle_epi8(sum01, gat);
    _mm_storel_epi64((__m128i *)dst, sum01);
    dst += stride;

    d = _mm_add_epi16(d, inc);
  }
}

void aom_smooth_v_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_v_w8(above, left, 4, pixels);

  __m128i wh[2];
  load_weight_v_w8(sm_weight_arrays, 4, wh);

  smooth_v_pred_8xh(pixels, wh, 4, dst, stride);
}

void aom_smooth_v_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_v_w8(above, left, 8, pixels);

  __m128i wh[2];
  load_weight_v_w8(sm_weight_arrays, 8, wh);

  smooth_v_pred_8xh(pixels, wh, 8, dst, stride);
}

void aom_smooth_v_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_v_w8(above, left, 16, pixels);

  __m128i wh[4];
  load_weight_v_w8(sm_weight_arrays, 16, wh);

  smooth_v_pred_8xh(pixels, wh, 8, dst, stride);
  dst += stride << 3;
  smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride);
}

void aom_smooth_v_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_v_w8(above, left, 32, pixels);

  __m128i wh[8];
  load_weight_v_w8(sm_weight_arrays, 32, wh);

  smooth_v_pred_8xh(pixels, &wh[0], 8, dst, stride);
  dst += stride << 3;
  smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride);
  dst += stride << 3;
  smooth_v_pred_8xh(pixels, &wh[4], 8, dst, stride);
  dst += stride << 3;
  smooth_v_pred_8xh(pixels, &wh[6], 8, dst, stride);
}

static INLINE void smooth_v_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *above,
                                          const uint8_t *left, uint32_t bw,
                                          uint32_t bh) {
  const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
  const __m128i zero = _mm_setzero_si128();
  const __m128i scale_value =
      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
  const __m128i dup16 = _mm_set1_epi32(0x01000100);
  const __m128i bottom_left =
      _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)left[bh - 1]), dup16);
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
  const __m128i round =
      _mm_set1_epi32((uint16_t)(1 << (sm_weight_log2_scale - 1)));

  for (uint32_t y = 0; y < bh; ++y) {
    const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
    const __m128i scale_m_weights_y =
        _mm_shuffle_epi8(_mm_sub_epi16(scale_value, weights_y), dup16);
    const __m128i wl_y =
        _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, bottom_left), 0);

    for (uint32_t x = 0; x < bw; x += 8) {
      const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x));
      // 8 -> 16
      const __m128i tw_x = _mm_unpacklo_epi8(top_x, zero);
      const __m128i tw_x_lo = _mm_unpacklo_epi16(tw_x, scale_m_weights_y);
      const __m128i tw_x_hi = _mm_unpackhi_epi16(tw_x, scale_m_weights_y);
      // top_x * weights_y + scale_m_weights_y * bottom_left
      __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y);
      __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y);

      pred_lo = _mm_add_epi32(pred_lo, round);
      pred_hi = _mm_add_epi32(pred_hi, round);
      pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale);
      pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale);

      __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
      pred = _mm_shuffle_epi8(pred, gat);
      _mm_storel_epi64((__m128i *)(dst + x), pred);
    }
    dst += stride;
  }
}

void aom_smooth_v_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 16, 4);
}

void aom_smooth_v_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 16, 8);
}

void aom_smooth_v_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 16, 16);
}

void aom_smooth_v_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 16, 32);
}

void aom_smooth_v_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 32, 8);
}

void aom_smooth_v_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 32, 16);
}

void aom_smooth_v_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 32, 32);
}

void aom_smooth_v_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 32, 64);
}

void aom_smooth_v_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 64, 64);
}

void aom_smooth_v_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 64, 32);
}

void aom_smooth_v_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 64, 16);
}

void aom_smooth_v_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 16, 64);
}

// -----------------------------------------------------------------------------
// SMOOTH_H_PRED

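// For reference only: an unused scalar sketch of the SMOOTH_H prediction
// computed below, interpolating left[r] toward the top-right pixel with the
// width weights from sm_weight_arrays (the helper name is illustrative).
static INLINE uint8_t smooth_h_scalar_sketch(const uint8_t *above,
                                             const uint8_t *left, int bw,
                                             int r, int c) {
  const uint8_t *const w_w = sm_weight_arrays + bw;
  const int scale = 1 << sm_weight_log2_scale;
  const int pred = w_w[c] * left[r] + (scale - w_w[c]) * above[bw - 1];
  return (uint8_t)((pred + (1 << (sm_weight_log2_scale - 1))) >>
                   sm_weight_log2_scale);
}
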
1371// pixels[0]: left vector
1372// pixels[1]: right_pred vector
1373static INLINE void load_pixel_h_w4(const uint8_t *above, const uint8_t *left,
1374 int height, __m128i *pixels) {
1375 if (height == 4)
1376 pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
1377 else if (height == 8)
1378 pixels[0] = _mm_loadl_epi64(((const __m128i *)left));
1379 else
1380 pixels[0] = _mm_loadu_si128(((const __m128i *)left));
1381 pixels[1] = _mm_set1_epi16((uint16_t)above[3]);
1382}
1383
1384// weights[0]: weights_w and scale - weights_w interleave vector
1385static INLINE void load_weight_h_w4(const uint8_t *weight_array, int height,
1386 __m128i *weights) {
1387 (void)height;
1388 const __m128i t = _mm_loadu_si128((const __m128i *)&weight_array[4]);
1389 const __m128i zero = _mm_setzero_si128();
1390
1391 const __m128i weights_0 = _mm_unpacklo_epi8(t, zero);
1392 const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
1393 const __m128i weights_1 = _mm_sub_epi16(d, weights_0);
1394 weights[0] = _mm_unpacklo_epi16(weights_0, weights_1);
1395}
1396
1397static INLINE void smooth_h_pred_4xh(const __m128i *pixel,
1398 const __m128i *weight, int h, uint8_t *dst,
1399 ptrdiff_t stride) {
1400 const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
1401 const __m128i one = _mm_set1_epi16(1);
1402 const __m128i gat = _mm_set1_epi32(0xc080400);
1403 __m128i rep = _mm_set1_epi16(0x8000);

  for (int i = 0; i < h; ++i) {
    __m128i b = _mm_shuffle_epi8(pixel[0], rep);
    b = _mm_unpacklo_epi16(b, pixel[1]);
    __m128i sum = _mm_madd_epi16(b, weight[0]);

    sum = _mm_add_epi32(sum, pred_round);
    sum = _mm_srai_epi32(sum, sm_weight_log2_scale);

    sum = _mm_shuffle_epi8(sum, gat);
    *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
    dst += stride;

    rep = _mm_add_epi16(rep, one);
  }
}

void aom_smooth_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_h_w4(above, left, 4, pixels);

  __m128i weights;
  load_weight_h_w4(sm_weight_arrays, 4, &weights);

  smooth_h_pred_4xh(pixels, &weights, 4, dst, stride);
}

void aom_smooth_h_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_h_w4(above, left, 8, pixels);

  __m128i weights;
  load_weight_h_w4(sm_weight_arrays, 8, &weights);

  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
}

void aom_smooth_h_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_h_w4(above, left, 16, pixels);

  __m128i weights;
  load_weight_h_w4(sm_weight_arrays, 8, &weights);

  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
  dst += stride << 3;

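  // Rows 8..15 reuse the same 8-row kernel: shifting the 16 left pixels down
  // by 8 bytes exposes left[8..15] at byte offsets 0..7, where the pshufb
  // selector in smooth_h_pred_4xh expects them.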
  pixels[0] = _mm_srli_si128(pixels[0], 8);
  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
}

// pixels[0]: left vector
// pixels[1]: right_pred vector
// pixels[2]: left vector + 16
// pixels[3]: right_pred vector
static INLINE void load_pixel_h_w8(const uint8_t *above, const uint8_t *left,
                                   int height, __m128i *pixels) {
  pixels[1] = _mm_set1_epi16((uint16_t)above[7]);

  if (height == 4) {
    pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
  } else if (height == 8) {
    pixels[0] = _mm_loadl_epi64((const __m128i *)left);
  } else if (height == 16) {
    pixels[0] = _mm_load_si128((const __m128i *)left);
  } else {
    pixels[0] = _mm_load_si128((const __m128i *)left);
    pixels[2] = _mm_load_si128((const __m128i *)(left + 16));
    pixels[3] = pixels[1];
  }
}

// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
static INLINE void load_weight_h_w8(const uint8_t *weight_array, int height,
                                    __m128i *weight_w) {
  (void)height;
  const __m128i zero = _mm_setzero_si128();
  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
  const __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[8]);
  const __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
  const __m128i tmp2 = _mm_sub_epi16(d, tmp1);
  weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
  weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
}

static INLINE void smooth_h_pred_8xh(const __m128i *pixels, const __m128i *ww,
                                     int h, uint8_t *dst, ptrdiff_t stride,
                                     int second_half) {
  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
  const __m128i one = _mm_set1_epi16(1);
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
  __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000);
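  // For the second half of a tall block the selector starts at byte 8 (low
  // byte 0x08), so the broadcast walks left[8..15] of the same 16-byte left
  // vector instead of left[0..7].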

  for (int i = 0; i < h; ++i) {
    __m128i b = _mm_shuffle_epi8(pixels[0], rep);
    b = _mm_unpacklo_epi16(b, pixels[1]);
    __m128i sum0 = _mm_madd_epi16(b, ww[0]);
    __m128i sum1 = _mm_madd_epi16(b, ww[1]);

    sum0 = _mm_add_epi32(sum0, pred_round);
    sum0 = _mm_srai_epi32(sum0, sm_weight_log2_scale);

    sum1 = _mm_add_epi32(sum1, pred_round);
    sum1 = _mm_srai_epi32(sum1, sm_weight_log2_scale);

    sum0 = _mm_packus_epi16(sum0, sum1);
    sum0 = _mm_shuffle_epi8(sum0, gat);
    _mm_storel_epi64((__m128i *)dst, sum0);
    dst += stride;

    rep = _mm_add_epi16(rep, one);
  }
}

void aom_smooth_h_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_h_w8(above, left, 4, pixels);

  __m128i ww[2];
  load_weight_h_w8(sm_weight_arrays, 4, ww);

  smooth_h_pred_8xh(pixels, ww, 4, dst, stride, 0);
}

void aom_smooth_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_h_w8(above, left, 8, pixels);

  __m128i ww[2];
  load_weight_h_w8(sm_weight_arrays, 8, ww);

  smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0);
}

void aom_smooth_h_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_h_w8(above, left, 16, pixels);

  __m128i ww[2];
  load_weight_h_w8(sm_weight_arrays, 16, ww);

  smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 1);
}

void aom_smooth_h_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  __m128i pixels[4];
  load_pixel_h_w8(above, left, 32, pixels);

  __m128i ww[2];
  load_weight_h_w8(sm_weight_arrays, 32, ww);

  smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 1);
  dst += stride << 3;
  smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 1);
}

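// Generic kernel for the blocks that are at least 16 pixels wide, working 8
// columns at a time: each row broadcasts the (top_right, left[y]) pair into
// every 32-bit lane, and each group of 8 column weights is expanded into
// (scale - w[x], w[x]) pairs so that _mm_madd_epi16 yields
//   (scale - w[x]) * top_right + w[x] * left[y]
// per column, which is then rounded, shifted and packed back to bytes.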
static INLINE void smooth_h_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *above,
                                          const uint8_t *left, uint32_t bw,
                                          uint32_t bh) {
  const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
  const __m128i zero = _mm_setzero_si128();
  const __m128i scale_value =
      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
  const __m128i top_right = _mm_cvtsi32_si128((uint32_t)above[bw - 1]);
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));

  for (uint32_t y = 0; y < bh; ++y) {
    const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
    const __m128i tr_ly =
        _mm_shuffle_epi32(_mm_unpacklo_epi16(top_right, left_y), 0);

    for (uint32_t x = 0; x < bw; x += 8) {
      const __m128i weights_x =
          _mm_loadl_epi64((const __m128i *)(sm_weights_w + x));
      const __m128i weights_xw = _mm_unpacklo_epi8(weights_x, zero);
      const __m128i scale_m_weights_x = _mm_sub_epi16(scale_value, weights_xw);
      const __m128i wx_lo = _mm_unpacklo_epi16(scale_m_weights_x, weights_xw);
      const __m128i wx_hi = _mm_unpackhi_epi16(scale_m_weights_x, weights_xw);
      __m128i pred_lo = _mm_madd_epi16(wx_lo, tr_ly);
      __m128i pred_hi = _mm_madd_epi16(wx_hi, tr_ly);

      pred_lo = _mm_add_epi32(pred_lo, pred_round);
      pred_hi = _mm_add_epi32(pred_hi, pred_round);

      pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale);
      pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale);

      __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
      pred = _mm_shuffle_epi8(pred, gat);
      _mm_storel_epi64((__m128i *)(dst + x), pred);
    }
    dst += stride;
  }
}

void aom_smooth_h_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 16, 4);
}

void aom_smooth_h_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 16, 8);
}

void aom_smooth_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 16, 16);
}

void aom_smooth_h_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 16, 32);
}

void aom_smooth_h_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 16, 64);
}

void aom_smooth_h_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 32, 8);
}

void aom_smooth_h_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 32, 16);
}

void aom_smooth_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 32, 32);
}

void aom_smooth_h_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 32, 64);
}

void aom_smooth_h_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 64, 64);
}

void aom_smooth_h_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 64, 32);
}

void aom_smooth_h_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 64, 16);
}