/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <string.h>  // memcpy(), used by the Load4()/Store4() helpers below.
#include <tmmintrin.h>

#include "config/aom_dsp_rtcd.h"

#include "aom_dsp/intrapred_common.h"

// -----------------------------------------------------------------------------
// PAETH_PRED

// Return 8 16-bit pixels in one row
static INLINE __m128i paeth_8x1_pred(const __m128i *left, const __m128i *top,
                                     const __m128i *topleft) {
  const __m128i base = _mm_sub_epi16(_mm_add_epi16(*top, *left), *topleft);

  __m128i pl = _mm_abs_epi16(_mm_sub_epi16(base, *left));
  __m128i pt = _mm_abs_epi16(_mm_sub_epi16(base, *top));
  __m128i ptl = _mm_abs_epi16(_mm_sub_epi16(base, *topleft));

  __m128i mask1 = _mm_cmpgt_epi16(pl, pt);
  mask1 = _mm_or_si128(mask1, _mm_cmpgt_epi16(pl, ptl));
  __m128i mask2 = _mm_cmpgt_epi16(pt, ptl);

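  // mask1 is set where left is not the smallest distance, and mask2 is set
  // where topleft beats top; together they realize the tie-break order left,
  // then top, then topleft.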
  pl = _mm_andnot_si128(mask1, *left);

  ptl = _mm_and_si128(mask2, *topleft);
  pt = _mm_andnot_si128(mask2, *top);
  pt = _mm_or_si128(pt, ptl);
  pt = _mm_and_si128(mask1, pt);

  return _mm_or_si128(pl, pt);
}
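
// For reference, the selection rule vectorized above as a minimal scalar
// sketch (illustrative only; nothing in this file calls it):
static INLINE uint8_t paeth_scalar_sketch(uint8_t left, uint8_t top,
                                          uint8_t topleft) {
  const int base = top + left - topleft;
  const int pl = base > left ? base - left : left - base;
  const int pt = base > top ? base - top : top - base;
  const int ptl = base > topleft ? base - topleft : topleft - base;
  // Tie-break order matches paeth_8x1_pred(): left, then top, then topleft.
  if (pl <= pt && pl <= ptl) return left;
  return (pt <= ptl) ? top : topleft;
}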

void aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
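  // Every 16-bit lane of |rep| holds 0x8000 + i: with _mm_shuffle_epi8 the low
  // control byte selects byte i of |l| while the high control byte (sign bit
  // set) yields zero, broadcasting left[i] zero-extended to all lanes. Adding
  // |one| per row advances to the next left pixel.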
  __m128i rep = _mm_set1_epi16((short)0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 4; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    *(int *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  __m128i rep = _mm_set1_epi16((short)0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 8; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    *(int *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_load_si128((const __m128i *)left);
  const __m128i t = _mm_cvtsi32_si128(((const int *)above)[0]);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  __m128i rep = _mm_set1_epi16((short)0x8000);
  const __m128i one = _mm_set1_epi16(1);

  for (int i = 0; i < 16; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    *(int *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  __m128i rep = _mm_set1_epi16((short)0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 4; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  __m128i rep = _mm_set1_epi16((short)0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 8; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_load_si128((const __m128i *)left);
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  __m128i rep = _mm_set1_epi16((short)0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 16; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);

  for (int j = 0; j < 2; ++j) {
    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
    __m128i rep = _mm_set1_epi16((short)0x8000);
    for (int i = 0; i < 16; ++i) {
      const __m128i l16 = _mm_shuffle_epi8(l, rep);
      const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

      _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
      dst += stride;
      rep = _mm_add_epi16(rep, one);
    }
  }
}

// Return 16 8-bit pixels in one row
static INLINE __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0,
                                      const __m128i *top1,
                                      const __m128i *topleft) {
  const __m128i p0 = paeth_8x1_pred(left, top0, topleft);
  const __m128i p1 = paeth_8x1_pred(left, top1, topleft);
  return _mm_packus_epi16(p0, p1);
}

void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_cvtsi32_si128(((const int *)left)[0]);
  const __m128i t = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  __m128i rep = _mm_set1_epi16((short)0x8000);
  const __m128i one = _mm_set1_epi16(1);

  for (int i = 0; i < 4; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  const __m128i t = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  __m128i rep = _mm_set1_epi16((short)0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 8; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m128i l = _mm_load_si128((const __m128i *)left);
  const __m128i t = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  __m128i rep = _mm_set1_epi16((short)0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 16; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m128i l = _mm_load_si128((const __m128i *)left);
  const __m128i t = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  __m128i rep = _mm_set1_epi16((short)0x8000);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l16;

  int i;
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }

  l = _mm_load_si128((const __m128i *)(left + 16));
  rep = _mm_set1_epi16((short)0x8000);
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i t = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);

  for (int j = 0; j < 4; ++j) {
    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
    __m128i rep = _mm_set1_epi16((short)0x8000);
    for (int i = 0; i < 16; ++i) {
      const __m128i l16 = _mm_shuffle_epi8(l, rep);
      const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
      _mm_store_si128((__m128i *)dst, row);
      dst += stride;
      rep = _mm_add_epi16(rep, one);
    }
  }
}

void aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);

  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  __m128i rep = _mm_set1_epi16((short)0x8000);
  const __m128i one = _mm_set1_epi16(1);
  const __m128i l = _mm_loadl_epi64((const __m128i *)left);
  __m128i l16;

  for (int i = 0; i < 8; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);

    _mm_store_si128((__m128i *)dst, r32l);
    _mm_store_si128((__m128i *)(dst + 16), r32h);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);

  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  __m128i rep = _mm_set1_epi16((short)0x8000);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l = _mm_load_si128((const __m128i *)left);
  __m128i l16;

  int i;
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);

    _mm_store_si128((__m128i *)dst, r32l);
    _mm_store_si128((__m128i *)(dst + 16), r32h);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);

  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  __m128i rep = _mm_set1_epi16((short)0x8000);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l = _mm_load_si128((const __m128i *)left);
  __m128i l16;

  int i;
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);

    _mm_store_si128((__m128i *)dst, r32l);
    _mm_store_si128((__m128i *)(dst + 16), r32h);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }

  rep = _mm_set1_epi16((short)0x8000);
  l = _mm_load_si128((const __m128i *)(left + 16));
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);

    _mm_store_si128((__m128i *)dst, r32l);
    _mm_store_si128((__m128i *)(dst + 16), r32h);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);

  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l16;

  int i, j;
  for (j = 0; j < 4; ++j) {
    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
    __m128i rep = _mm_set1_epi16((short)0x8000);
    for (i = 0; i < 16; ++i) {
      l16 = _mm_shuffle_epi8(l, rep);
      const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
      const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);

      _mm_store_si128((__m128i *)dst, r32l);
      _mm_store_si128((__m128i *)(dst + 16), r32h);
      dst += stride;
      rep = _mm_add_epi16(rep, one);
    }
  }
}

void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);
  const __m128i cl = _mm_unpacklo_epi8(c, zero);
  const __m128i ch = _mm_unpackhi_epi8(c, zero);
  const __m128i dl = _mm_unpacklo_epi8(d, zero);
  const __m128i dh = _mm_unpackhi_epi8(d, zero);

  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l16;

  int i, j;
  for (j = 0; j < 2; ++j) {
    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
    __m128i rep = _mm_set1_epi16((short)0x8000);
    for (i = 0; i < 16; ++i) {
      l16 = _mm_shuffle_epi8(l, rep);
      const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
      const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
      const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
      const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);

      _mm_store_si128((__m128i *)dst, r0);
      _mm_store_si128((__m128i *)(dst + 16), r1);
      _mm_store_si128((__m128i *)(dst + 32), r2);
      _mm_store_si128((__m128i *)(dst + 48), r3);
      dst += stride;
      rep = _mm_add_epi16(rep, one);
    }
  }
}

void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);
  const __m128i cl = _mm_unpacklo_epi8(c, zero);
  const __m128i ch = _mm_unpackhi_epi8(c, zero);
  const __m128i dl = _mm_unpacklo_epi8(d, zero);
  const __m128i dh = _mm_unpackhi_epi8(d, zero);

  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l16;

  int i, j;
  for (j = 0; j < 4; ++j) {
    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
    __m128i rep = _mm_set1_epi16((short)0x8000);
    for (i = 0; i < 16; ++i) {
      l16 = _mm_shuffle_epi8(l, rep);
      const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
      const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
      const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
      const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);

      _mm_store_si128((__m128i *)dst, r0);
      _mm_store_si128((__m128i *)(dst + 16), r1);
      _mm_store_si128((__m128i *)(dst + 32), r2);
      _mm_store_si128((__m128i *)(dst + 48), r3);
      dst += stride;
      rep = _mm_add_epi16(rep, one);
    }
  }
}

void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);
  const __m128i cl = _mm_unpacklo_epi8(c, zero);
  const __m128i ch = _mm_unpackhi_epi8(c, zero);
  const __m128i dl = _mm_unpacklo_epi8(d, zero);
  const __m128i dh = _mm_unpackhi_epi8(d, zero);

  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l16;

  int i;
  const __m128i l = _mm_load_si128((const __m128i *)left);
  __m128i rep = _mm_set1_epi16((short)0x8000);
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
    const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
    const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
    const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);

    _mm_store_si128((__m128i *)dst, r0);
    _mm_store_si128((__m128i *)(dst + 16), r1);
    _mm_store_si128((__m128i *)(dst + 32), r2);
    _mm_store_si128((__m128i *)(dst + 48), r3);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

// -----------------------------------------------------------------------------
// SMOOTH_PRED

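// The smooth predictor blends a vertical and a horizontal interpolation;
// per pixel, with scale s = (1 << SMOOTH_WEIGHT_LOG2_SCALE) = 256:
//   pred(x, y) = (w_h[y] * top[x] + (s - w_h[y]) * bottom_left +
//                 w_w[x] * left[y] + (s - w_w[x]) * top_right + s) >> 9
// The helpers below pre-interleave pixels and weights so each row reduces to
// _mm_madd_epi16 operations.
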
// pixels[0]: above and below_pred interleave vector
// pixels[1]: left vector
// pixels[2]: right_pred vector
static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left,
                                 int height, __m128i *pixels) {
  __m128i d = _mm_cvtsi32_si128(((const int *)above)[0]);
  if (height == 4)
    pixels[1] = _mm_cvtsi32_si128(((const int *)left)[0]);
  else if (height == 8)
    pixels[1] = _mm_loadl_epi64(((const __m128i *)left));
  else
    pixels[1] = _mm_loadu_si128(((const __m128i *)left));

  pixels[2] = _mm_set1_epi16((int16_t)above[3]);

  const __m128i bp = _mm_set1_epi16((int16_t)left[height - 1]);
  const __m128i zero = _mm_setzero_si128();
  d = _mm_unpacklo_epi8(d, zero);
  pixels[0] = _mm_unpacklo_epi16(d, bp);
}

// weight_h[0]: weight_h vector
// weight_h[1]: scale - weight_h vector
// weight_h[2]: same as [0], second half for height = 16 only
// weight_h[3]: same as [1], second half for height = 16 only
// weight_w[0]: weights_w and scale - weights_w interleave vector
static INLINE void load_weight_w4(int height, __m128i *weight_h,
                                  __m128i *weight_w) {
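  // Weights for block dimension n start at smooth_weights[n - 4]: offset 0
  // holds the length-4 weights, offset 4 the length-8 weights, and offset 12
  // the length-16 weights.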
  const __m128i zero = _mm_setzero_si128();
  const __m128i d = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
  const __m128i t = _mm_cvtsi32_si128(((const int *)smooth_weights)[0]);
  weight_h[0] = _mm_unpacklo_epi8(t, zero);
  weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
  weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);

  if (height == 8) {
    const __m128i weight = _mm_loadl_epi64((const __m128i *)&smooth_weights[4]);
    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
  } else if (height == 16) {
    const __m128i weight =
        _mm_loadu_si128((const __m128i *)&smooth_weights[12]);
    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
    weight_h[2] = _mm_unpackhi_epi8(weight, zero);
    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
  }
}

static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *wh,
                                   const __m128i *ww, int h, uint8_t *dst,
                                   ptrdiff_t stride, int second_half) {
  const __m128i round = _mm_set1_epi32((1 << SMOOTH_WEIGHT_LOG2_SCALE));
  const __m128i one = _mm_set1_epi16(1);
  const __m128i inc = _mm_set1_epi16(0x202);
  const __m128i gat = _mm_set1_epi32(0xc080400);
  __m128i rep = second_half ? _mm_set1_epi16((short)0x8008)
                            : _mm_set1_epi16((short)0x8000);
  __m128i d = _mm_set1_epi16(0x100);
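  // |d| is another pshufb control: byte pair (2i, 2i + 1) in every lane
  // replicates the i-th 16-bit weight of wh[] across the register, and |inc|
  // (0x0202) advances both bytes to the next weight each row. |gat| gathers
  // byte 0 of each 32-bit sum into the low dword for the 4-pixel store.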

  for (int i = 0; i < h; ++i) {
    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
    __m128i s = _mm_madd_epi16(pixel[0], wh_sc);

    __m128i b = _mm_shuffle_epi8(pixel[1], rep);
    b = _mm_unpacklo_epi16(b, pixel[2]);
    __m128i sum = _mm_madd_epi16(b, ww[0]);

    sum = _mm_add_epi32(s, sum);
    sum = _mm_add_epi32(sum, round);
    sum = _mm_srai_epi32(sum, 1 + SMOOTH_WEIGHT_LOG2_SCALE);

    sum = _mm_shuffle_epi8(sum, gat);
    *(int *)dst = _mm_cvtsi128_si32(sum);
    dst += stride;

    rep = _mm_add_epi16(rep, one);
    d = _mm_add_epi16(d, inc);
  }
}

void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i pixels[3];
  load_pixel_w4(above, left, 4, pixels);

  __m128i wh[4], ww[2];
  load_weight_w4(4, wh, ww);

  smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0);
}

void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i pixels[3];
  load_pixel_w4(above, left, 8, pixels);

  __m128i wh[4], ww[2];
  load_weight_w4(8, wh, ww);

  smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
}

void aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m128i pixels[3];
  load_pixel_w4(above, left, 16, pixels);

  __m128i wh[4], ww[2];
  load_weight_w4(16, wh, ww);

  smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_pred_4xh(pixels, &wh[2], ww, 8, dst, stride, 1);
}

// pixels[0]: above and below_pred interleave vector, first half
// pixels[1]: above and below_pred interleave vector, second half
// pixels[2]: left vector
// pixels[3]: right_pred vector
// pixels[4]: above and below_pred interleave vector, first half
// pixels[5]: above and below_pred interleave vector, second half
// pixels[6]: left vector + 16
// pixels[7]: right_pred vector
static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left,
                                 int height, __m128i *pixels) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i bp = _mm_set1_epi16((int16_t)left[height - 1]);
  __m128i d = _mm_loadl_epi64((const __m128i *)above);
  d = _mm_unpacklo_epi8(d, zero);
  pixels[0] = _mm_unpacklo_epi16(d, bp);
  pixels[1] = _mm_unpackhi_epi16(d, bp);

  pixels[3] = _mm_set1_epi16((int16_t)above[7]);

  if (height == 4) {
    pixels[2] = _mm_cvtsi32_si128(((const int *)left)[0]);
  } else if (height == 8) {
    pixels[2] = _mm_loadl_epi64((const __m128i *)left);
  } else if (height == 16) {
    pixels[2] = _mm_load_si128((const __m128i *)left);
  } else {
    pixels[2] = _mm_load_si128((const __m128i *)left);
    pixels[4] = pixels[0];
    pixels[5] = pixels[1];
    pixels[6] = _mm_load_si128((const __m128i *)(left + 16));
    pixels[7] = pixels[3];
  }
}

// weight_h[0]: weight_h vector
// weight_h[1]: scale - weight_h vector
// weight_h[2]: same as [0], offset 8
// weight_h[3]: same as [1], offset 8
// weight_h[4]: same as [0], offset 16
// weight_h[5]: same as [1], offset 16
// weight_h[6]: same as [0], offset 24
// weight_h[7]: same as [1], offset 24
// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
static INLINE void load_weight_w8(int height, __m128i *weight_h,
                                  __m128i *weight_w) {
  const __m128i zero = _mm_setzero_si128();
  const int we_offset = height < 8 ? 0 : 4;
  __m128i we = _mm_loadu_si128((const __m128i *)&smooth_weights[we_offset]);
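  // For height 4 this single 16-byte load covers both tables: the length-4
  // weights sit in bytes 0..3 and the length-8 weights in bytes 4..11.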
  weight_h[0] = _mm_unpacklo_epi8(we, zero);
  const __m128i d = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
  weight_h[1] = _mm_sub_epi16(d, weight_h[0]);

  if (height == 4) {
    we = _mm_srli_si128(we, 4);
    __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
    __m128i tmp2 = _mm_sub_epi16(d, tmp1);
    weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
    weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
  } else {
    weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
    weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
  }

  if (height == 16) {
    we = _mm_loadu_si128((const __m128i *)&smooth_weights[12]);
    weight_h[0] = _mm_unpacklo_epi8(we, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
    weight_h[2] = _mm_unpackhi_epi8(we, zero);
    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
  } else if (height == 32) {
    const __m128i weight_lo =
        _mm_loadu_si128((const __m128i *)&smooth_weights[28]);
    weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
    weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
    const __m128i weight_hi =
        _mm_loadu_si128((const __m128i *)&smooth_weights[28 + 16]);
    weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
    weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
    weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
    weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
  }
}

static INLINE void smooth_pred_8xh(const __m128i *pixels, const __m128i *wh,
                                   const __m128i *ww, int h, uint8_t *dst,
                                   ptrdiff_t stride, int second_half) {
  const __m128i round = _mm_set1_epi32((1 << SMOOTH_WEIGHT_LOG2_SCALE));
  const __m128i one = _mm_set1_epi16(1);
  const __m128i inc = _mm_set1_epi16(0x202);
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);

  __m128i rep = second_half ? _mm_set1_epi16((short)0x8008)
                            : _mm_set1_epi16((short)0x8000);
  __m128i d = _mm_set1_epi16(0x100);

  int i;
  for (i = 0; i < h; ++i) {
    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
    __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
    __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);

    __m128i b = _mm_shuffle_epi8(pixels[2], rep);
    b = _mm_unpacklo_epi16(b, pixels[3]);
    __m128i sum0 = _mm_madd_epi16(b, ww[0]);
    __m128i sum1 = _mm_madd_epi16(b, ww[1]);

    s0 = _mm_add_epi32(s0, sum0);
    s0 = _mm_add_epi32(s0, round);
    s0 = _mm_srai_epi32(s0, 1 + SMOOTH_WEIGHT_LOG2_SCALE);

    s1 = _mm_add_epi32(s1, sum1);
    s1 = _mm_add_epi32(s1, round);
    s1 = _mm_srai_epi32(s1, 1 + SMOOTH_WEIGHT_LOG2_SCALE);

    sum0 = _mm_packus_epi16(s0, s1);
    sum0 = _mm_shuffle_epi8(sum0, gat);
    _mm_storel_epi64((__m128i *)dst, sum0);
    dst += stride;

    rep = _mm_add_epi16(rep, one);
    d = _mm_add_epi16(d, inc);
  }
}

void aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i pixels[4];
  load_pixel_w8(above, left, 4, pixels);

  __m128i wh[4], ww[2];
  load_weight_w8(4, wh, ww);

  smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0);
}

void aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i pixels[4];
  load_pixel_w8(above, left, 8, pixels);

  __m128i wh[4], ww[2];
  load_weight_w8(8, wh, ww);

  smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
}

void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m128i pixels[4];
  load_pixel_w8(above, left, 16, pixels);

  __m128i wh[4], ww[2];
  load_weight_w8(16, wh, ww);

  smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1);
}

void aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m128i pixels[8];
  load_pixel_w8(above, left, 32, pixels);

  __m128i wh[8], ww[2];
  load_weight_w8(32, wh, ww);

  smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1);
  dst += stride << 3;
  smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1);
}

// TODO(slavarnway): Visual Studio only supports restrict when /std:c11
// (available in 2019+) or greater is specified; __restrict can be used in that
// case. This should be moved to rtcd and used consistently between the
// function declarations and definitions to avoid warnings in Visual Studio
// when defining LIBAOM_RESTRICT to restrict or __restrict.
#if defined(_MSC_VER)
#define LIBAOM_RESTRICT
#else
#define LIBAOM_RESTRICT restrict
#endif

static AOM_FORCE_INLINE __m128i Load4(const void *src) {
  // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32
  // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a
  // movss instruction.
  //
  // Until compiler support of _mm_loadu_si32 is widespread, use of
  // _mm_loadu_si32 is banned.
  int val;
  memcpy(&val, src, sizeof(val));
  return _mm_cvtsi32_si128(val);
}

static AOM_FORCE_INLINE __m128i LoadLo8(const void *a) {
  return _mm_loadl_epi64((const __m128i *)(a));
}

static AOM_FORCE_INLINE __m128i LoadUnaligned16(const void *a) {
  return _mm_loadu_si128((const __m128i *)(a));
}

static AOM_FORCE_INLINE void Store4(void *dst, const __m128i x) {
  const int val = _mm_cvtsi128_si32(x);
  memcpy(dst, &val, sizeof(val));
}

static AOM_FORCE_INLINE void StoreLo8(void *a, const __m128i v) {
  _mm_storel_epi64((__m128i *)(a), v);
}

static AOM_FORCE_INLINE void StoreUnaligned16(void *a, const __m128i v) {
  _mm_storeu_si128((__m128i *)(a), v);
}

static AOM_FORCE_INLINE __m128i cvtepu8_epi16(__m128i x) {
  return _mm_unpacklo_epi8((x), _mm_setzero_si128());
}

static AOM_FORCE_INLINE __m128i cvtepu8_epi32(__m128i x) {
  const __m128i tmp = _mm_unpacklo_epi8((x), _mm_setzero_si128());
  return _mm_unpacklo_epi16(tmp, _mm_setzero_si128());
}

static AOM_FORCE_INLINE __m128i cvtepu16_epi32(__m128i x) {
  return _mm_unpacklo_epi16((x), _mm_setzero_si128());
}

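// The inner loop below emits 8 output pixels per iteration, so |width| must be
// a multiple of 8; all callers in this file use widths of 16, 32, or 64.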
943void smooth_predictor_wxh(uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
944 const uint8_t *LIBAOM_RESTRICT top_row,
945 const uint8_t *LIBAOM_RESTRICT left_column, int width,
946 int height) {
947 const uint8_t *const sm_weights_h = smooth_weights + height - 4;
948 const uint8_t *const sm_weights_w = smooth_weights + width - 4;
Scott LaVarnway7cb2db12018-03-12 06:49:03 -0700949 const __m128i zero = _mm_setzero_si128();
Scott LaVarnwayfd650502022-11-18 08:45:49 -0800950 const __m128i scale_value = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
951 const __m128i bottom_left = _mm_cvtsi32_si128(left_column[height - 1]);
952 const __m128i top_right = _mm_set1_epi16(top_row[width - 1]);
953 const __m128i round = _mm_set1_epi32(1 << SMOOTH_WEIGHT_LOG2_SCALE);
954 for (int y = 0; y < height; ++y) {
James Zern24836c72022-08-26 18:38:48 -0700955 const __m128i weights_y = _mm_cvtsi32_si128(sm_weights_h[y]);
Scott LaVarnwayfd650502022-11-18 08:45:49 -0800956 const __m128i left_y = _mm_cvtsi32_si128(left_column[y]);
Scott LaVarnway7cb2db12018-03-12 06:49:03 -0700957 const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y);
Scott LaVarnwayfd650502022-11-18 08:45:49 -0800958 __m128i scaled_bottom_left =
959 _mm_mullo_epi16(scale_m_weights_y, bottom_left);
960 const __m128i weight_left_y =
Scott LaVarnway7cb2db12018-03-12 06:49:03 -0700961 _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0);
Scott LaVarnwayfd650502022-11-18 08:45:49 -0800962 scaled_bottom_left = _mm_add_epi32(scaled_bottom_left, round);
963 scaled_bottom_left = _mm_shuffle_epi32(scaled_bottom_left, 0);
964 for (int x = 0; x < width; x += 8) {
965 const __m128i top_x = LoadLo8(top_row + x);
966 const __m128i weights_x = LoadLo8(sm_weights_w + x);
967 const __m128i top_weights_x = _mm_unpacklo_epi8(top_x, weights_x);
968 const __m128i top_weights_x_lo = cvtepu8_epi16(top_weights_x);
969 const __m128i top_weights_x_hi = _mm_unpackhi_epi8(top_weights_x, zero);
Scott LaVarnway7cb2db12018-03-12 06:49:03 -0700970
Scott LaVarnwayfd650502022-11-18 08:45:49 -0800971 // Here opposite weights and pixels are multiplied, where the order of
972 // interleaving is indicated in the names.
973 __m128i pred_lo = _mm_madd_epi16(top_weights_x_lo, weight_left_y);
974 __m128i pred_hi = _mm_madd_epi16(top_weights_x_hi, weight_left_y);
Scott LaVarnway7cb2db12018-03-12 06:49:03 -0700975
Scott LaVarnwayfd650502022-11-18 08:45:49 -0800976 // |scaled_bottom_left| is always scaled by the same weight each row, so
977 // we only derive |scaled_top_right| values here.
978 const __m128i inverted_weights_x =
979 _mm_sub_epi16(scale_value, cvtepu8_epi16(weights_x));
980 const __m128i scaled_top_right =
981 _mm_mullo_epi16(inverted_weights_x, top_right);
982 const __m128i scaled_top_right_lo = cvtepu16_epi32(scaled_top_right);
983 const __m128i scaled_top_right_hi =
984 _mm_unpackhi_epi16(scaled_top_right, zero);
985 pred_lo = _mm_add_epi32(pred_lo, scaled_bottom_left);
986 pred_hi = _mm_add_epi32(pred_hi, scaled_bottom_left);
987 pred_lo = _mm_add_epi32(pred_lo, scaled_top_right_lo);
988 pred_hi = _mm_add_epi32(pred_hi, scaled_top_right_hi);
Scott LaVarnway7cb2db12018-03-12 06:49:03 -0700989
Scott LaVarnwayfd650502022-11-18 08:45:49 -0800990 // The round value for RightShiftWithRounding was added with
991 // |scaled_bottom_left|.
992 pred_lo = _mm_srli_epi32(pred_lo, (1 + SMOOTH_WEIGHT_LOG2_SCALE));
993 pred_hi = _mm_srli_epi32(pred_hi, (1 + SMOOTH_WEIGHT_LOG2_SCALE));
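      // Each 32-bit lane now holds an 8-bit value: the first pack narrows
      // 32 -> 16 bits and the second 16 -> 8, leaving the row's eight output
      // pixels in the low 8 bytes.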
      const __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
      StoreLo8(dst + x, _mm_packus_epi16(pred, pred));
    }
    dst += stride;
  }
}

void aom_smooth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 4);
}

void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 8);
}

void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 16);
}

void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 32);
}

void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 64);
}

void aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 32, 8);
}

void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 32, 16);
}

void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 32, 32);
}

void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 32, 64);
}

void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 64, 16);
}

void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 64, 32);
}

void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 64, 64);
}

// -----------------------------------------------------------------------------
// Smooth horizontal/vertical helper functions.

// For Horizontal, pixels1 and pixels2 are the same repeated value. For
// Vertical, weights1 and weights2 are the same, and scaled_corner1 and
// scaled_corner2 are the same.
static AOM_FORCE_INLINE void write_smooth_directional_sum16(
    uint8_t *LIBAOM_RESTRICT dst, const __m128i pixels1, const __m128i pixels2,
    const __m128i weights1, const __m128i weights2,
    const __m128i scaled_corner1, const __m128i scaled_corner2,
    const __m128i round) {
  const __m128i weighted_px1 = _mm_mullo_epi16(pixels1, weights1);
  const __m128i weighted_px2 = _mm_mullo_epi16(pixels2, weights2);
  const __m128i pred_sum1 = _mm_add_epi16(scaled_corner1, weighted_px1);
  const __m128i pred_sum2 = _mm_add_epi16(scaled_corner2, weighted_px2);
  // Equivalent to RightShiftWithRounding(pred[x][y], 8).
  const __m128i pred1 = _mm_srli_epi16(_mm_add_epi16(pred_sum1, round), 8);
  const __m128i pred2 = _mm_srli_epi16(_mm_add_epi16(pred_sum2, round), 8);
  StoreUnaligned16(dst, _mm_packus_epi16(pred1, pred2));
}

static AOM_FORCE_INLINE __m128i smooth_directional_sum8(
    const __m128i pixels, const __m128i weights, const __m128i scaled_corner) {
  const __m128i weighted_px = _mm_mullo_epi16(pixels, weights);
  return _mm_add_epi16(scaled_corner, weighted_px);
}

static AOM_FORCE_INLINE void write_smooth_directional_sum8(
    uint8_t *LIBAOM_RESTRICT dst, const __m128i *pixels, const __m128i *weights,
    const __m128i *scaled_corner, const __m128i *round) {
  const __m128i pred_sum =
      smooth_directional_sum8(*pixels, *weights, *scaled_corner);
  // Equivalent to RightShiftWithRounding(pred[x][y], 8).
  const __m128i pred = _mm_srli_epi16(_mm_add_epi16(pred_sum, *round), 8);
  StoreLo8(dst, _mm_packus_epi16(pred, pred));
}

// -----------------------------------------------------------------------------
// SMOOTH_V_PRED

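// Per pixel, the vertical-only blend is:
//   pred(x, y) = (w[y] * top[x] + (256 - w[y]) * bottom_left + 128) >> 8
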
Scott LaVarnwayb8662ce2022-12-06 10:08:58 -08001113static AOM_FORCE_INLINE void load_smooth_vertical_pixels4(
1114 const uint8_t *LIBAOM_RESTRICT above, const uint8_t *LIBAOM_RESTRICT left,
1115 const int height, __m128i *pixels) {
1116 __m128i top = Load4(above);
1117 const __m128i bottom_left = _mm_set1_epi16(left[height - 1]);
1118 top = cvtepu8_epi16(top);
1119 pixels[0] = _mm_unpacklo_epi16(top, bottom_left);
Scott LaVarnway7db820e2018-03-26 17:01:29 -07001120}
1121
Scott LaVarnwayb8662ce2022-12-06 10:08:58 -08001122// |weight_array| alternates weight vectors from the table with their inverted
1123// (256-w) counterparts. This is precomputed by the compiler when the weights
1124// table is visible to this module. Removing this visibility can cut speed by up
1125// to half in both 4xH and 8xH transforms.
1126static AOM_FORCE_INLINE void load_smooth_vertical_weights4(
1127 const uint8_t *LIBAOM_RESTRICT weight_array, const int height,
1128 __m128i *weights) {
1129 const __m128i inverter = _mm_set1_epi16(256);
Scott LaVarnway7db820e2018-03-26 17:01:29 -07001130
1131 if (height == 4) {
Scott LaVarnwayb8662ce2022-12-06 10:08:58 -08001132 const __m128i weight = Load4(weight_array);
1133 weights[0] = cvtepu8_epi16(weight);
1134 weights[1] = _mm_sub_epi16(inverter, weights[0]);
Scott LaVarnway7db820e2018-03-26 17:01:29 -07001135 } else if (height == 8) {
Scott LaVarnwayb8662ce2022-12-06 10:08:58 -08001136 const __m128i weight = LoadLo8(weight_array + 4);
1137 weights[0] = cvtepu8_epi16(weight);
1138 weights[1] = _mm_sub_epi16(inverter, weights[0]);
Scott LaVarnway7db820e2018-03-26 17:01:29 -07001139 } else {
Scott LaVarnwayb8662ce2022-12-06 10:08:58 -08001140 const __m128i weight = LoadUnaligned16(weight_array + 12);
1141 const __m128i zero = _mm_setzero_si128();
1142 weights[0] = cvtepu8_epi16(weight);
1143 weights[1] = _mm_sub_epi16(inverter, weights[0]);
Scott LaVarnway7db820e2018-03-26 17:01:29 -07001144 weights[2] = _mm_unpackhi_epi8(weight, zero);
Scott LaVarnwayb8662ce2022-12-06 10:08:58 -08001145 weights[3] = _mm_sub_epi16(inverter, weights[2]);
Scott LaVarnway7db820e2018-03-26 17:01:29 -07001146 }
1147}
1148
Scott LaVarnwayb8662ce2022-12-06 10:08:58 -08001149static AOM_FORCE_INLINE void write_smooth_vertical4xh(
1150 const __m128i *pixel, const __m128i *weight, const int height,
1151 uint8_t *LIBAOM_RESTRICT dst, const ptrdiff_t stride) {
1152 const __m128i pred_round = _mm_set1_epi32(128);
1153 const __m128i mask_increment = _mm_set1_epi16(0x0202);
1154 const __m128i cvtepu8_epi32 = _mm_set1_epi32(0xC080400);
1155 __m128i y_select = _mm_set1_epi16(0x0100);
Scott LaVarnway7db820e2018-03-26 17:01:29 -07001156
Scott LaVarnwayb8662ce2022-12-06 10:08:58 -08001157 for (int y = 0; y < height; ++y) {
1158 const __m128i weight_y = _mm_shuffle_epi8(weight[0], y_select);
1159 const __m128i inverted_weight_y = _mm_shuffle_epi8(weight[1], y_select);
1160 const __m128i alternate_weights =
1161 _mm_unpacklo_epi16(weight_y, inverted_weight_y);
1162 // Here the pixel vector is top_row[0], corner, top_row[1], corner, ...
1163 // The madd instruction yields four results of the form:
1164 // (top_row[x] * weight[y] + corner * inverted_weight[y])
1165 __m128i sum = _mm_madd_epi16(pixel[0], alternate_weights);
Scott LaVarnway7db820e2018-03-26 17:01:29 -07001166 sum = _mm_add_epi32(sum, pred_round);
Scott LaVarnwayb8662ce2022-12-06 10:08:58 -08001167 sum = _mm_srai_epi32(sum, 8);
1168 sum = _mm_shuffle_epi8(sum, cvtepu8_epi32);
1169 Store4(dst, sum);
Scott LaVarnway7db820e2018-03-26 17:01:29 -07001170 dst += stride;
Scott LaVarnwayb8662ce2022-12-06 10:08:58 -08001171 y_select = _mm_add_epi16(y_select, mask_increment);
Scott LaVarnway7db820e2018-03-26 17:01:29 -07001172 }
1173}
1174
void aom_smooth_v_predictor_4x4_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  __m128i pixels;
  load_smooth_vertical_pixels4(top_row, left_column, 4, &pixels);

  __m128i weights[2];
  load_smooth_vertical_weights4(smooth_weights, 4, weights);

  write_smooth_vertical4xh(&pixels, weights, 4, dst, stride);
}

void aom_smooth_v_predictor_4x8_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  __m128i pixels;
  load_smooth_vertical_pixels4(top_row, left_column, 8, &pixels);

  __m128i weights[2];
  load_smooth_vertical_weights4(smooth_weights, 8, weights);

  write_smooth_vertical4xh(&pixels, weights, 8, dst, stride);
}

void aom_smooth_v_predictor_4x16_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  __m128i pixels;
  load_smooth_vertical_pixels4(top_row, left_column, 16, &pixels);

  __m128i weights[4];
  load_smooth_vertical_weights4(smooth_weights, 16, weights);

  write_smooth_vertical4xh(&pixels, weights, 8, dst, stride);
  dst += stride << 3;
  write_smooth_vertical4xh(&pixels, &weights[2], 8, dst, stride);
}

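// For 8-wide and wider blocks, the (256 - w[y]) * bottom_left term is
// precomputed per row (scaled_bottom_left), so each row costs one multiply
// (w[y] * top[x]) plus the rounding shift inside the
// write_smooth_directional_sum* helpers. As the loads suggest, the weights
// for a block of height h start at smooth_weights + h - 4. A scalar sketch
// of the computation (illustrative only; w8[] names the height-h weight
// slice and is not a symbol in this file):
//   for (int y = 0; y < h; ++y)
//     for (int x = 0; x < w; ++x)
//       dst[y * stride + x] =
//           (w8[y] * top_row[x] + (256 - w8[y]) * left_column[h - 1] +
//            128) >> 8;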
void aom_smooth_v_predictor_8x4_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i bottom_left = _mm_set1_epi16(left_column[3]);
  const __m128i weights = cvtepu8_epi16(Load4(smooth_weights));
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
  const __m128i scaled_bottom_left =
      _mm_mullo_epi16(inverted_weights, bottom_left);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  __m128i y_select = _mm_set1_epi32(0x01000100);
  const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
  __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
  __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
  write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
                                &round);
  dst += stride;
  y_select = _mm_set1_epi32(0x03020302);
  weights_y = _mm_shuffle_epi8(weights, y_select);
  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
  write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
                                &round);
  dst += stride;
  y_select = _mm_set1_epi32(0x05040504);
  weights_y = _mm_shuffle_epi8(weights, y_select);
  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
  write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
                                &round);
  dst += stride;
  y_select = _mm_set1_epi32(0x07060706);
  weights_y = _mm_shuffle_epi8(weights, y_select);
  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
  write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
                                &round);
}

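// The y_mask loop values 0x01000100, 0x03020302, ..., 0x0F0E0F0E enumerate
// the shuffle selectors for the eight 16-bit lanes of a weight vector, one
// selector per row.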
void aom_smooth_v_predictor_8x8_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i bottom_left = _mm_set1_epi16(left_column[7]);
  const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
  const __m128i scaled_bottom_left =
      _mm_mullo_epi16(inverted_weights, bottom_left);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left, y_select);
    write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
                                  &round);
    dst += stride;
  }
}

void aom_smooth_v_predictor_8x16_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
  const __m128i weights = LoadUnaligned16(smooth_weights + 12);

  const __m128i weights1 = cvtepu8_epi16(weights);
  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i scaled_bottom_left1 =
      _mm_mullo_epi16(inverted_weights1, bottom_left);
  const __m128i scaled_bottom_left2 =
      _mm_mullo_epi16(inverted_weights2, bottom_left);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
    write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
                                  &round);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
    write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
                                  &round);
    dst += stride;
  }
}

void aom_smooth_v_predictor_8x32_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
  const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
  const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
  const __m128i weights1 = cvtepu8_epi16(weights_lo);
  const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
  const __m128i weights3 = cvtepu8_epi16(weights_hi);
  const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
  const __m128i scaled_bottom_left1 =
      _mm_mullo_epi16(inverted_weights1, bottom_left);
  const __m128i scaled_bottom_left2 =
      _mm_mullo_epi16(inverted_weights2, bottom_left);
  const __m128i scaled_bottom_left3 =
      _mm_mullo_epi16(inverted_weights3, bottom_left);
  const __m128i scaled_bottom_left4 =
      _mm_mullo_epi16(inverted_weights4, bottom_left);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
    write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
                                  &round);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
    write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
                                  &round);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left3, y_select);
    write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
                                  &round);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left4, y_select);
    write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
                                  &round);
    dst += stride;
  }
}

void aom_smooth_v_predictor_16x4_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i bottom_left = _mm_set1_epi16(left_column[3]);
  const __m128i weights = cvtepu8_epi16(Load4(smooth_weights));
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
  const __m128i scaled_bottom_left =
      _mm_mullo_epi16(inverted_weights, bottom_left);
  const __m128i round = _mm_set1_epi16(128);
  const __m128i top = LoadUnaligned16(top_row);
  const __m128i top_lo = cvtepu8_epi16(top);
  const __m128i top_hi = cvtepu8_epi16(_mm_srli_si128(top, 8));

  __m128i y_select = _mm_set1_epi32(0x01000100);
  __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
  __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
  write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
                                 scaled_bottom_left_y, scaled_bottom_left_y,
                                 round);
  dst += stride;
  y_select = _mm_set1_epi32(0x03020302);
  weights_y = _mm_shuffle_epi8(weights, y_select);
  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
  write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
                                 scaled_bottom_left_y, scaled_bottom_left_y,
                                 round);
  dst += stride;
  y_select = _mm_set1_epi32(0x05040504);
  weights_y = _mm_shuffle_epi8(weights, y_select);
  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
  write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
                                 scaled_bottom_left_y, scaled_bottom_left_y,
                                 round);
  dst += stride;
  y_select = _mm_set1_epi32(0x07060706);
  weights_y = _mm_shuffle_epi8(weights, y_select);
  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
  write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
                                 scaled_bottom_left_y, scaled_bottom_left_y,
                                 round);
}

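// The 16-wide kernels split the top row into low and high 8-lane halves and
// emit each row with a single write_smooth_directional_sum16() call.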
void aom_smooth_v_predictor_16x8_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i bottom_left = _mm_set1_epi16(left_column[7]);
  const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
  const __m128i scaled_bottom_left =
      _mm_mullo_epi16(inverted_weights, bottom_left);
  const __m128i round = _mm_set1_epi16(128);
  const __m128i top = LoadUnaligned16(top_row);
  const __m128i top_lo = cvtepu8_epi16(top);
  const __m128i top_hi = cvtepu8_epi16(_mm_srli_si128(top, 8));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left, y_select);
    write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    dst += stride;
  }
}

void aom_smooth_v_predictor_16x16_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
  const __m128i zero = _mm_setzero_si128();
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
  const __m128i weights_lo = cvtepu8_epi16(weights);
  const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
  const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
  const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
  const __m128i scaled_bottom_left_lo =
      _mm_mullo_epi16(inverted_weights_lo, bottom_left);
  const __m128i scaled_bottom_left_hi =
      _mm_mullo_epi16(inverted_weights_hi, bottom_left);
  const __m128i round = _mm_set1_epi16(128);

  const __m128i top = LoadUnaligned16(top_row);
  const __m128i top_lo = cvtepu8_epi16(top);
  const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
    write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
    write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    dst += stride;
  }
}

void aom_smooth_v_predictor_16x32_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
  const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
  const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i zero = _mm_setzero_si128();
  const __m128i weights1 = cvtepu8_epi16(weights_lo);
  const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
  const __m128i weights3 = cvtepu8_epi16(weights_hi);
  const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
  const __m128i scaled_bottom_left1 =
      _mm_mullo_epi16(inverted_weights1, bottom_left);
  const __m128i scaled_bottom_left2 =
      _mm_mullo_epi16(inverted_weights2, bottom_left);
  const __m128i scaled_bottom_left3 =
      _mm_mullo_epi16(inverted_weights3, bottom_left);
  const __m128i scaled_bottom_left4 =
      _mm_mullo_epi16(inverted_weights4, bottom_left);
  const __m128i round = _mm_set1_epi16(128);

  const __m128i top = LoadUnaligned16(top_row);
  const __m128i top_lo = cvtepu8_epi16(top);
  const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
    write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
    write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left3, y_select);
    write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left4, y_select);
    write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    dst += stride;
  }
}

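// The x64 variants below walk their 64 weights in four 16-row chunks
// (weights_base_ptr + left_offset), running the same pair of 8-row loops
// for each chunk.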
void aom_smooth_v_predictor_16x64_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i bottom_left = _mm_set1_epi16(left_column[63]);
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i round = _mm_set1_epi16(128);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top = LoadUnaligned16(top_row);
  const __m128i top_lo = cvtepu8_epi16(top);
  const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
  const uint8_t *weights_base_ptr = smooth_weights + 60;
  for (int left_offset = 0; left_offset < 64; left_offset += 16) {
    const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
    const __m128i weights_lo = cvtepu8_epi16(weights);
    const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
    const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
    const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
    const __m128i scaled_bottom_left_lo =
        _mm_mullo_epi16(inverted_weights_lo, bottom_left);
    const __m128i scaled_bottom_left_hi =
        _mm_mullo_epi16(inverted_weights_hi, bottom_left);

    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
      const __m128i y_select = _mm_set1_epi32(y_mask);
      const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
      const __m128i scaled_bottom_left_y =
          _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
      write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
                                     scaled_bottom_left_y, scaled_bottom_left_y,
                                     round);
      dst += stride;
    }
    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
      const __m128i y_select = _mm_set1_epi32(y_mask);
      const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
      const __m128i scaled_bottom_left_y =
          _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
      write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
                                     scaled_bottom_left_y, scaled_bottom_left_y,
                                     round);
      dst += stride;
    }
  }
}

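// The 32-wide kernels write each row as two 16-pixel halves (dst and
// dst + 16), with the top row expanded once into four 8-lane vectors
// top1..top4.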
void aom_smooth_v_predictor_32x8_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i bottom_left = _mm_set1_epi16(left_column[7]);
  const __m128i top_lo = LoadUnaligned16(top_row);
  const __m128i top_hi = LoadUnaligned16(top_row + 16);
  const __m128i top1 = cvtepu8_epi16(top_lo);
  const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
  const __m128i top3 = cvtepu8_epi16(top_hi);
  const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
  const __m128i scaled_bottom_left =
      _mm_mullo_epi16(inverted_weights, bottom_left);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left, y_select);
    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    dst += stride;
  }
}

void aom_smooth_v_predictor_32x16_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
  const __m128i top_lo = LoadUnaligned16(top_row);
  const __m128i top_hi = LoadUnaligned16(top_row + 16);
  const __m128i top1 = cvtepu8_epi16(top_lo);
  const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
  const __m128i top3 = cvtepu8_epi16(top_hi);
  const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
  const __m128i weights1 = cvtepu8_epi16(weights);
  const __m128i weights2 = _mm_unpackhi_epi8(weights, zero);
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i scaled_bottom_left1 =
      _mm_mullo_epi16(inverted_weights1, bottom_left);
  const __m128i scaled_bottom_left2 =
      _mm_mullo_epi16(inverted_weights2, bottom_left);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    dst += stride;
  }
}

void aom_smooth_v_predictor_32x32_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
  const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
  const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
  const __m128i zero = _mm_setzero_si128();
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i top_lo = LoadUnaligned16(top_row);
  const __m128i top_hi = LoadUnaligned16(top_row + 16);
  const __m128i top1 = cvtepu8_epi16(top_lo);
  const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
  const __m128i top3 = cvtepu8_epi16(top_hi);
  const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
  const __m128i weights1 = cvtepu8_epi16(weights_lo);
  const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
  const __m128i weights3 = cvtepu8_epi16(weights_hi);
  const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
  const __m128i scaled_bottom_left1 =
      _mm_mullo_epi16(inverted_weights1, bottom_left);
  const __m128i scaled_bottom_left2 =
      _mm_mullo_epi16(inverted_weights2, bottom_left);
  const __m128i scaled_bottom_left3 =
      _mm_mullo_epi16(inverted_weights3, bottom_left);
  const __m128i scaled_bottom_left4 =
      _mm_mullo_epi16(inverted_weights4, bottom_left);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left3, y_select);
    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left4, y_select);
    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    dst += stride;
  }
}

void aom_smooth_v_predictor_32x64_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i bottom_left = _mm_set1_epi16(left_column[63]);
  const __m128i top_lo = LoadUnaligned16(top_row);
  const __m128i top_hi = LoadUnaligned16(top_row + 16);
  const __m128i top1 = cvtepu8_epi16(top_lo);
  const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
  const __m128i top3 = cvtepu8_epi16(top_hi);
  const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  const uint8_t *weights_base_ptr = smooth_weights + 60;
  for (int left_offset = 0; left_offset < 64; left_offset += 16) {
    const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
    const __m128i weights_lo = cvtepu8_epi16(weights);
    const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
    const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
    const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
    const __m128i scaled_bottom_left_lo =
        _mm_mullo_epi16(inverted_weights_lo, bottom_left);
    const __m128i scaled_bottom_left_hi =
        _mm_mullo_epi16(inverted_weights_hi, bottom_left);

    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
      const __m128i y_select = _mm_set1_epi32(y_mask);
      const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
      const __m128i scaled_bottom_left_y =
          _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
      write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
                                     scaled_bottom_left_y, scaled_bottom_left_y,
                                     round);
      write_smooth_directional_sum16(dst + 16, top3, top4, weights_y,
                                     weights_y, scaled_bottom_left_y,
                                     scaled_bottom_left_y, round);
      dst += stride;
    }
    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
      const __m128i y_select = _mm_set1_epi32(y_mask);
      const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
      const __m128i scaled_bottom_left_y =
          _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
      write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
                                     scaled_bottom_left_y, scaled_bottom_left_y,
                                     round);
      write_smooth_directional_sum16(dst + 16, top3, top4, weights_y,
                                     weights_y, scaled_bottom_left_y,
                                     scaled_bottom_left_y, round);
      dst += stride;
    }
  }
}

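// The 64-wide kernels store four 16-pixel segments per row; the top row is
// expanded once into eight 8-lane vectors top1..top8 so no pixels are
// reloaded inside the row loops.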
void aom_smooth_v_predictor_64x16_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top_lolo = LoadUnaligned16(top_row);
  const __m128i top_lohi = LoadUnaligned16(top_row + 16);
  const __m128i top1 = cvtepu8_epi16(top_lolo);
  const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
  const __m128i top3 = cvtepu8_epi16(top_lohi);
  const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);

  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
  const __m128i weights1 = cvtepu8_epi16(weights);
  const __m128i weights2 = _mm_unpackhi_epi8(weights, zero);
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i top_hilo = LoadUnaligned16(top_row + 32);
  const __m128i top_hihi = LoadUnaligned16(top_row + 48);
  const __m128i top5 = cvtepu8_epi16(top_hilo);
  const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
  const __m128i top7 = cvtepu8_epi16(top_hihi);
  const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
  const __m128i scaled_bottom_left1 =
      _mm_mullo_epi16(inverted_weights1, bottom_left);
  const __m128i scaled_bottom_left2 =
      _mm_mullo_epi16(inverted_weights2, bottom_left);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    dst += stride;
  }
}

void aom_smooth_v_predictor_64x32_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
  const __m128i top_lolo = LoadUnaligned16(top_row);
  const __m128i top_lohi = LoadUnaligned16(top_row + 16);
  const __m128i top1 = cvtepu8_epi16(top_lolo);
  const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
  const __m128i top3 = cvtepu8_epi16(top_lohi);
  const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
  const __m128i top_hilo = LoadUnaligned16(top_row + 32);
  const __m128i top_hihi = LoadUnaligned16(top_row + 48);
  const __m128i top5 = cvtepu8_epi16(top_hilo);
  const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
  const __m128i top7 = cvtepu8_epi16(top_hihi);
  const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
  const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
  const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
  const __m128i weights1 = cvtepu8_epi16(weights_lo);
  const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
  const __m128i weights3 = cvtepu8_epi16(weights_hi);
  const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
  const __m128i scaled_bottom_left1 =
      _mm_mullo_epi16(inverted_weights1, bottom_left);
  const __m128i scaled_bottom_left2 =
      _mm_mullo_epi16(inverted_weights2, bottom_left);
  const __m128i scaled_bottom_left3 =
      _mm_mullo_epi16(inverted_weights3, bottom_left);
  const __m128i scaled_bottom_left4 =
      _mm_mullo_epi16(inverted_weights4, bottom_left);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));

  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left3, y_select);
    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left4, y_select);
    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    dst += stride;
  }
}

void aom_smooth_v_predictor_64x64_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i bottom_left = _mm_set1_epi16(left_column[63]);
  const __m128i top_lolo = LoadUnaligned16(top_row);
  const __m128i top_lohi = LoadUnaligned16(top_row + 16);
  const __m128i top1 = cvtepu8_epi16(top_lolo);
  const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
  const __m128i top3 = cvtepu8_epi16(top_lohi);
  const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
  const __m128i top_hilo = LoadUnaligned16(top_row + 32);
  const __m128i top_hihi = LoadUnaligned16(top_row + 48);
  const __m128i top5 = cvtepu8_epi16(top_hilo);
  const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
  const __m128i top7 = cvtepu8_epi16(top_hihi);
  const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i round = _mm_set1_epi16(128);
  const uint8_t *weights_base_ptr = smooth_weights + 60;
  for (int left_offset = 0; left_offset < 64; left_offset += 16) {
    const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
    const __m128i weights_lo = cvtepu8_epi16(weights);
    const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
    const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
    const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
    const __m128i scaled_bottom_left_lo =
        _mm_mullo_epi16(inverted_weights_lo, bottom_left);
    const __m128i scaled_bottom_left_hi =
        _mm_mullo_epi16(inverted_weights_hi, bottom_left);
    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
      const __m128i y_select = _mm_set1_epi32(y_mask);
      const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
      const __m128i scaled_bottom_left_y =
          _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
      write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
                                     scaled_bottom_left_y, scaled_bottom_left_y,
                                     round);
      write_smooth_directional_sum16(dst + 16, top3, top4, weights_y,
                                     weights_y, scaled_bottom_left_y,
                                     scaled_bottom_left_y, round);
      write_smooth_directional_sum16(dst + 32, top5, top6, weights_y,
                                     weights_y, scaled_bottom_left_y,
                                     scaled_bottom_left_y, round);
      write_smooth_directional_sum16(dst + 48, top7, top8, weights_y,
                                     weights_y, scaled_bottom_left_y,
                                     scaled_bottom_left_y, round);
      dst += stride;
    }
    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
      const __m128i y_select = _mm_set1_epi32(y_mask);
      const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
      const __m128i scaled_bottom_left_y =
          _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
      write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
                                     scaled_bottom_left_y, scaled_bottom_left_y,
                                     round);
      write_smooth_directional_sum16(dst + 16, top3, top4, weights_y,
                                     weights_y, scaled_bottom_left_y,
                                     scaled_bottom_left_y, round);
      write_smooth_directional_sum16(dst + 32, top5, top6, weights_y,
                                     weights_y, scaled_bottom_left_y,
                                     scaled_bottom_left_y, round);
      write_smooth_directional_sum16(dst + 48, top7, top8, weights_y,
                                     weights_y, scaled_bottom_left_y,
                                     scaled_bottom_left_y, round);
      dst += stride;
    }
  }
}

// -----------------------------------------------------------------------------
// SMOOTH_H_PRED
static AOM_FORCE_INLINE void write_smooth_horizontal_sum4(
    uint8_t *LIBAOM_RESTRICT dst, const __m128i *left_y, const __m128i *weights,
    const __m128i *scaled_top_right, const __m128i *round) {
  const __m128i weighted_left_y = _mm_mullo_epi16(*left_y, *weights);
  const __m128i pred_sum = _mm_add_epi32(*scaled_top_right, weighted_left_y);
  // Equivalent to RightShiftWithRounding(pred[x][y], 8).
  const __m128i pred = _mm_srli_epi32(_mm_add_epi32(pred_sum, *round), 8);
  const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
  Store4(dst, _mm_shuffle_epi8(pred, cvtepi32_epi8));
}

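// SMOOTH_H mirrors SMOOTH_V across the diagonal, blending each column with
// the top-right pixel:
//   pred(x, y) = (w[x] * left[y] + (256 - w[x]) * top_right + 128) >> 8
// The x-dependent terms are fixed for the whole block, so w[x] and
// (256 - w[x]) * top_right are computed once and only left[y] changes per
// row.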
void aom_smooth_h_predictor_4x4_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i top_right = _mm_set1_epi32(top_row[3]);
  const __m128i left = cvtepu8_epi32(Load4(left_column));
  const __m128i weights = cvtepu8_epi32(Load4(smooth_weights));
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  __m128i left_y = _mm_shuffle_epi32(left, 0);
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
                               &round);
  dst += stride;
  left_y = _mm_shuffle_epi32(left, 0x55);
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
                               &round);
  dst += stride;
  left_y = _mm_shuffle_epi32(left, 0xaa);
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
                               &round);
  dst += stride;
  left_y = _mm_shuffle_epi32(left, 0xff);
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
                               &round);
}

void aom_smooth_h_predictor_4x8_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i top_right = _mm_set1_epi32(top_row[3]);
  const __m128i weights = cvtepu8_epi32(Load4(smooth_weights));
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  __m128i left = cvtepu8_epi32(Load4(left_column));
  __m128i left_y = _mm_shuffle_epi32(left, 0);
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
                               &round);
  dst += stride;
  left_y = _mm_shuffle_epi32(left, 0x55);
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
                               &round);
  dst += stride;
  left_y = _mm_shuffle_epi32(left, 0xaa);
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
                               &round);
  dst += stride;
  left_y = _mm_shuffle_epi32(left, 0xff);
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
                               &round);
  dst += stride;

  left = cvtepu8_epi32(Load4(left_column + 4));
  left_y = _mm_shuffle_epi32(left, 0);
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
                               &round);
  dst += stride;
  left_y = _mm_shuffle_epi32(left, 0x55);
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
                               &round);
  dst += stride;
  left_y = _mm_shuffle_epi32(left, 0xaa);
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
                               &round);
  dst += stride;
  left_y = _mm_shuffle_epi32(left, 0xff);
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
                               &round);
}

void aom_smooth_h_predictor_4x16_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i top_right = _mm_set1_epi32(top_row[3]);
  const __m128i weights = cvtepu8_epi32(Load4(smooth_weights));
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  __m128i left = cvtepu8_epi32(Load4(left_column));
  __m128i left_y = _mm_shuffle_epi32(left, 0);
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
                               &round);
  dst += stride;
  left_y = _mm_shuffle_epi32(left, 0x55);
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
                               &round);
  dst += stride;
  left_y = _mm_shuffle_epi32(left, 0xaa);
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
                               &round);
  dst += stride;
  left_y = _mm_shuffle_epi32(left, 0xff);
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
                               &round);
  dst += stride;

  left = cvtepu8_epi32(Load4(left_column + 4));
  left_y = _mm_shuffle_epi32(left, 0);
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
                               &round);
  dst += stride;
  left_y = _mm_shuffle_epi32(left, 0x55);
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
                               &round);
  dst += stride;
  left_y = _mm_shuffle_epi32(left, 0xaa);
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
                               &round);
  dst += stride;
  left_y = _mm_shuffle_epi32(left, 0xff);
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
                               &round);
  dst += stride;

  left = cvtepu8_epi32(Load4(left_column + 8));
  left_y = _mm_shuffle_epi32(left, 0);
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
                               &round);
  dst += stride;
  left_y = _mm_shuffle_epi32(left, 0x55);
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
                               &round);
  dst += stride;
  left_y = _mm_shuffle_epi32(left, 0xaa);
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
                               &round);
  dst += stride;
  left_y = _mm_shuffle_epi32(left, 0xff);
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
                               &round);
  dst += stride;

  left = cvtepu8_epi32(Load4(left_column + 12));
  left_y = _mm_shuffle_epi32(left, 0);
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
                               &round);
  dst += stride;
  left_y = _mm_shuffle_epi32(left, 0x55);
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
                               &round);
  dst += stride;
  left_y = _mm_shuffle_epi32(left, 0xaa);
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
                               &round);
  dst += stride;
  left_y = _mm_shuffle_epi32(left, 0xff);
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
                               &round);
}

// In the shared write_smooth_directional_sum8() helper: for SMOOTH_H,
// |pixels| is the repeated left value for the row; for SMOOTH_V, |pixels| is
// a segment of the top row or the whole top row, and |weights| is repeated.
void aom_smooth_h_predictor_8x4_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i top_right = _mm_set1_epi16(top_row[7]);
  const __m128i left = cvtepu8_epi16(Load4(left_column));
  const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  __m128i y_select = _mm_set1_epi32(0x01000100);
  __m128i left_y = _mm_shuffle_epi8(left, y_select);
  write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
                                &round);
  dst += stride;
  y_select = _mm_set1_epi32(0x03020302);
  left_y = _mm_shuffle_epi8(left, y_select);
  write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
                                &round);
  dst += stride;
  y_select = _mm_set1_epi32(0x05040504);
  left_y = _mm_shuffle_epi8(left, y_select);
  write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
                                &round);
  dst += stride;
  y_select = _mm_set1_epi32(0x07060706);
  left_y = _mm_shuffle_epi8(left, y_select);
  write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
                                &round);
}

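// The remaining 8-wide SMOOTH_H kernels reuse the y_mask enumeration from
// the SMOOTH_V code, but shuffle it against the left-column pixels rather
// than the weights, which stay constant down a column.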
void aom_smooth_h_predictor_8x8_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i top_right = _mm_set1_epi16(top_row[7]);
  const __m128i left = cvtepu8_epi16(LoadLo8(left_column));
  const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
                                  &round);
    dst += stride;
  }
}

void aom_smooth_h_predictor_8x16_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i top_right = _mm_set1_epi16(top_row[7]);
  const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  __m128i left = cvtepu8_epi16(LoadLo8(left_column));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
                                  &round);
    dst += stride;
  }
  left = cvtepu8_epi16(LoadLo8(left_column + 8));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
                                  &round);
    dst += stride;
  }
}

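// Blocks taller than 8 rows consume the left column in 8-pixel chunks; each
// chunk is widened to 16 bits once and swept by the same 8-iteration row
// loop (two chunks for 8x16 above, four for 8x32 below).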
void aom_smooth_h_predictor_8x32_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i top_right = _mm_set1_epi16(top_row[7]);
  const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  __m128i left = cvtepu8_epi16(LoadLo8(left_column));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
                                  &round);
    dst += stride;
  }
  left = cvtepu8_epi16(LoadLo8(left_column + 8));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
                                  &round);
    dst += stride;
  }
  left = cvtepu8_epi16(LoadLo8(left_column + 16));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
                                  &round);
    dst += stride;
  }
  left = cvtepu8_epi16(LoadLo8(left_column + 24));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
                                  &round);
    dst += stride;
  }
}

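// The 16-wide predictors split each row into two 8-lane halves: |weights1|
// and |weights2| hold the low and high eight weights widened to 16 bits, and
// write_smooth_directional_sum16 is handed both halves (with their
// precomputed top-right terms, and the row's left value passed for both) to
// produce one 16-pixel row per call.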
void aom_smooth_h_predictor_16x4_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i top_right = _mm_set1_epi16(top_row[15]);
  const __m128i left = cvtepu8_epi16(Load4(left_column));
  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i weights1 = cvtepu8_epi16(weights);
  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i scaled_top_right1 =
      _mm_mullo_epi16(inverted_weights1, top_right);
  const __m128i scaled_top_right2 =
      _mm_mullo_epi16(inverted_weights2, top_right);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  __m128i y_mask = _mm_set1_epi32(0x01000100);
  __m128i left_y = _mm_shuffle_epi8(left, y_mask);
  write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                 scaled_top_right1, scaled_top_right2, round);
  dst += stride;
  y_mask = _mm_set1_epi32(0x03020302);
  left_y = _mm_shuffle_epi8(left, y_mask);
  write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                 scaled_top_right1, scaled_top_right2, round);
  dst += stride;
  y_mask = _mm_set1_epi32(0x05040504);
  left_y = _mm_shuffle_epi8(left, y_mask);
  write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                 scaled_top_right1, scaled_top_right2, round);
  dst += stride;
  y_mask = _mm_set1_epi32(0x07060706);
  left_y = _mm_shuffle_epi8(left, y_mask);
  write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                 scaled_top_right1, scaled_top_right2, round);
}

void aom_smooth_h_predictor_16x8_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i top_right = _mm_set1_epi16(top_row[15]);
  const __m128i left = cvtepu8_epi16(LoadLo8(left_column));
  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i weights1 = cvtepu8_epi16(weights);
  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i scaled_top_right1 =
      _mm_mullo_epi16(inverted_weights1, top_right);
  const __m128i scaled_top_right2 =
      _mm_mullo_epi16(inverted_weights2, top_right);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                   scaled_top_right1, scaled_top_right2, round);
    dst += stride;
  }
}

void aom_smooth_h_predictor_16x16_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i top_right = _mm_set1_epi16(top_row[15]);
  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i weights1 = cvtepu8_epi16(weights);
  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i scaled_top_right1 =
      _mm_mullo_epi16(inverted_weights1, top_right);
  const __m128i scaled_top_right2 =
      _mm_mullo_epi16(inverted_weights2, top_right);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  __m128i left = cvtepu8_epi16(LoadLo8(left_column));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                   scaled_top_right1, scaled_top_right2, round);
    dst += stride;
  }
  left = cvtepu8_epi16(LoadLo8(left_column + 8));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                   scaled_top_right1, scaled_top_right2, round);
    dst += stride;
  }
}

void aom_smooth_h_predictor_16x32_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i top_right = _mm_set1_epi16(top_row[15]);
  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i weights1 = cvtepu8_epi16(weights);
  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i scaled_top_right1 =
      _mm_mullo_epi16(inverted_weights1, top_right);
  const __m128i scaled_top_right2 =
      _mm_mullo_epi16(inverted_weights2, top_right);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  __m128i left = cvtepu8_epi16(LoadLo8(left_column));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                   scaled_top_right1, scaled_top_right2, round);
    dst += stride;
  }
  left = cvtepu8_epi16(LoadLo8(left_column + 8));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                   scaled_top_right1, scaled_top_right2, round);
    dst += stride;
  }
  left = cvtepu8_epi16(LoadLo8(left_column + 16));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                   scaled_top_right1, scaled_top_right2, round);
    dst += stride;
  }
  left = cvtepu8_epi16(LoadLo8(left_column + 24));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                   scaled_top_right1, scaled_top_right2, round);
    dst += stride;
  }
}

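// For 16x64 the four copies of the row loop collapse into an outer loop over
// the left column, stepping 8 pixels per iteration.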
void aom_smooth_h_predictor_16x64_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i top_right = _mm_set1_epi16(top_row[15]);
  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i weights1 = cvtepu8_epi16(weights);
  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i scaled_top_right1 =
      _mm_mullo_epi16(inverted_weights1, top_right);
  const __m128i scaled_top_right2 =
      _mm_mullo_epi16(inverted_weights2, top_right);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  for (int left_offset = 0; left_offset < 64; left_offset += 8) {
    const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
      const __m128i y_select = _mm_set1_epi32(y_mask);
      const __m128i left_y = _mm_shuffle_epi8(left, y_select);
      write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                     scaled_top_right1, scaled_top_right2,
                                     round);
      dst += stride;
    }
  }
}

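// 32-wide rows need four weight vectors (two 16-byte loads at
// smooth_weights + 28 and + 44) and are written as two
// write_smooth_directional_sum16 calls covering dst and dst + 16.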
void aom_smooth_h_predictor_32x8_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i top_right = _mm_set1_epi16(top_row[31]);
  const __m128i left = cvtepu8_epi16(LoadLo8(left_column));
  const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
  const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i weights1 = cvtepu8_epi16(weights_lo);
  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
  const __m128i weights3 = cvtepu8_epi16(weights_hi);
  const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
  const __m128i scaled_top_right1 =
      _mm_mullo_epi16(inverted_weights1, top_right);
  const __m128i scaled_top_right2 =
      _mm_mullo_epi16(inverted_weights2, top_right);
  const __m128i scaled_top_right3 =
      _mm_mullo_epi16(inverted_weights3, top_right);
  const __m128i scaled_top_right4 =
      _mm_mullo_epi16(inverted_weights4, top_right);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    __m128i y_select = _mm_set1_epi32(y_mask);
    __m128i left_y = _mm_shuffle_epi8(left, y_select);
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                   scaled_top_right1, scaled_top_right2, round);
    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
                                   scaled_top_right3, scaled_top_right4, round);
    dst += stride;
  }
}

void aom_smooth_h_predictor_32x16_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i top_right = _mm_set1_epi16(top_row[31]);
  const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column));
  const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
  const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i weights1 = cvtepu8_epi16(weights_lo);
  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
  const __m128i weights3 = cvtepu8_epi16(weights_hi);
  const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
  const __m128i scaled_top_right1 =
      _mm_mullo_epi16(inverted_weights1, top_right);
  const __m128i scaled_top_right2 =
      _mm_mullo_epi16(inverted_weights2, top_right);
  const __m128i scaled_top_right3 =
      _mm_mullo_epi16(inverted_weights3, top_right);
  const __m128i scaled_top_right4 =
      _mm_mullo_epi16(inverted_weights4, top_right);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    __m128i y_select = _mm_set1_epi32(y_mask);
    __m128i left_y = _mm_shuffle_epi8(left1, y_select);
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                   scaled_top_right1, scaled_top_right2, round);
    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
                                   scaled_top_right3, scaled_top_right4, round);
    dst += stride;
  }
  const __m128i left2 = cvtepu8_epi16(LoadLo8(left_column + 8));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    __m128i y_select = _mm_set1_epi32(y_mask);
    __m128i left_y = _mm_shuffle_epi8(left2, y_select);
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                   scaled_top_right1, scaled_top_right2, round);
    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
                                   scaled_top_right3, scaled_top_right4, round);
    dst += stride;
  }
}

void aom_smooth_h_predictor_32x32_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i top_right = _mm_set1_epi16(top_row[31]);
  const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
  const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i weights1 = cvtepu8_epi16(weights_lo);
  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
  const __m128i weights3 = cvtepu8_epi16(weights_hi);
  const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
  const __m128i scaled_top_right1 =
      _mm_mullo_epi16(inverted_weights1, top_right);
  const __m128i scaled_top_right2 =
      _mm_mullo_epi16(inverted_weights2, top_right);
  const __m128i scaled_top_right3 =
      _mm_mullo_epi16(inverted_weights3, top_right);
  const __m128i scaled_top_right4 =
      _mm_mullo_epi16(inverted_weights4, top_right);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  __m128i left = cvtepu8_epi16(LoadLo8(left_column));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    __m128i y_select = _mm_set1_epi32(y_mask);
    __m128i left_y = _mm_shuffle_epi8(left, y_select);
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                   scaled_top_right1, scaled_top_right2, round);
    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
                                   scaled_top_right3, scaled_top_right4, round);
    dst += stride;
  }
  left = cvtepu8_epi16(LoadLo8(left_column + 8));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    __m128i y_select = _mm_set1_epi32(y_mask);
    __m128i left_y = _mm_shuffle_epi8(left, y_select);
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                   scaled_top_right1, scaled_top_right2, round);
    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
                                   scaled_top_right3, scaled_top_right4, round);
    dst += stride;
  }
  left = cvtepu8_epi16(LoadLo8(left_column + 16));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    __m128i y_select = _mm_set1_epi32(y_mask);
    __m128i left_y = _mm_shuffle_epi8(left, y_select);
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                   scaled_top_right1, scaled_top_right2, round);
    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
                                   scaled_top_right3, scaled_top_right4, round);
    dst += stride;
  }
  left = cvtepu8_epi16(LoadLo8(left_column + 24));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    __m128i y_select = _mm_set1_epi32(y_mask);
    __m128i left_y = _mm_shuffle_epi8(left, y_select);
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                   scaled_top_right1, scaled_top_right2, round);
    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
                                   scaled_top_right3, scaled_top_right4, round);
    dst += stride;
  }
}

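// Like 16x64, the 64-row variants below iterate over the left column with an
// outer left_offset loop instead of unrolling four chunk loops.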
void aom_smooth_h_predictor_32x64_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i top_right = _mm_set1_epi16(top_row[31]);
  const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
  const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i weights1 = cvtepu8_epi16(weights_lo);
  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
  const __m128i weights3 = cvtepu8_epi16(weights_hi);
  const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
  const __m128i scaled_top_right1 =
      _mm_mullo_epi16(inverted_weights1, top_right);
  const __m128i scaled_top_right2 =
      _mm_mullo_epi16(inverted_weights2, top_right);
  const __m128i scaled_top_right3 =
      _mm_mullo_epi16(inverted_weights3, top_right);
  const __m128i scaled_top_right4 =
      _mm_mullo_epi16(inverted_weights4, top_right);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  for (int left_offset = 0; left_offset < 64; left_offset += 8) {
    const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
      const __m128i y_select = _mm_set1_epi32(y_mask);
      const __m128i left_y = _mm_shuffle_epi8(left, y_select);
      write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                     scaled_top_right1, scaled_top_right2,
                                     round);
      write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3,
                                     weights4, scaled_top_right3,
                                     scaled_top_right4, round);
      dst += stride;
    }
  }
}

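// 64-wide rows extend the same scheme to eight weight vectors (loads at
// smooth_weights + 60, + 76, + 92 and + 108) and four
// write_smooth_directional_sum16 calls per row (dst through dst + 48).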
void aom_smooth_h_predictor_64x16_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i top_right = _mm_set1_epi16(top_row[63]);
  const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column));
  const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60);
  const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76);
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i weights1 = cvtepu8_epi16(weights_lolo);
  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
  const __m128i weights3 = cvtepu8_epi16(weights_lohi);
  const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
  const __m128i scaled_top_right1 =
      _mm_mullo_epi16(inverted_weights1, top_right);
  const __m128i scaled_top_right2 =
      _mm_mullo_epi16(inverted_weights2, top_right);
  const __m128i scaled_top_right3 =
      _mm_mullo_epi16(inverted_weights3, top_right);
  const __m128i scaled_top_right4 =
      _mm_mullo_epi16(inverted_weights4, top_right);
  const __m128i weights_hilo = LoadUnaligned16(smooth_weights + 92);
  const __m128i weights_hihi = LoadUnaligned16(smooth_weights + 108);
  const __m128i weights5 = cvtepu8_epi16(weights_hilo);
  const __m128i weights6 = cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
  const __m128i weights7 = cvtepu8_epi16(weights_hihi);
  const __m128i weights8 = cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
  const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
  const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
  const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
  const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
  const __m128i scaled_top_right5 =
      _mm_mullo_epi16(inverted_weights5, top_right);
  const __m128i scaled_top_right6 =
      _mm_mullo_epi16(inverted_weights6, top_right);
  const __m128i scaled_top_right7 =
      _mm_mullo_epi16(inverted_weights7, top_right);
  const __m128i scaled_top_right8 =
      _mm_mullo_epi16(inverted_weights8, top_right);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    __m128i y_select = _mm_set1_epi32(y_mask);
    __m128i left_y = _mm_shuffle_epi8(left1, y_select);
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                   scaled_top_right1, scaled_top_right2, round);
    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
                                   scaled_top_right3, scaled_top_right4, round);
    write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
                                   scaled_top_right5, scaled_top_right6, round);
    write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
                                   scaled_top_right7, scaled_top_right8, round);
    dst += stride;
  }
  const __m128i left2 = cvtepu8_epi16(LoadLo8(left_column + 8));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    __m128i y_select = _mm_set1_epi32(y_mask);
    __m128i left_y = _mm_shuffle_epi8(left2, y_select);
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                   scaled_top_right1, scaled_top_right2, round);
    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
                                   scaled_top_right3, scaled_top_right4, round);
    write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
                                   scaled_top_right5, scaled_top_right6, round);
    write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
                                   scaled_top_right7, scaled_top_right8, round);
    dst += stride;
  }
}

void aom_smooth_h_predictor_64x32_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i top_right = _mm_set1_epi16(top_row[63]);
  const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column));
  const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60);
  const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76);
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i weights1 = cvtepu8_epi16(weights_lolo);
  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
  const __m128i weights3 = cvtepu8_epi16(weights_lohi);
  const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
  const __m128i scaled_top_right1 =
      _mm_mullo_epi16(inverted_weights1, top_right);
  const __m128i scaled_top_right2 =
      _mm_mullo_epi16(inverted_weights2, top_right);
  const __m128i scaled_top_right3 =
      _mm_mullo_epi16(inverted_weights3, top_right);
  const __m128i scaled_top_right4 =
      _mm_mullo_epi16(inverted_weights4, top_right);
  const __m128i weights_hilo = LoadUnaligned16(smooth_weights + 92);
  const __m128i weights_hihi = LoadUnaligned16(smooth_weights + 108);
  const __m128i weights5 = cvtepu8_epi16(weights_hilo);
  const __m128i weights6 = cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
  const __m128i weights7 = cvtepu8_epi16(weights_hihi);
  const __m128i weights8 = cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
  const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
  const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
  const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
  const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
  const __m128i scaled_top_right5 =
      _mm_mullo_epi16(inverted_weights5, top_right);
  const __m128i scaled_top_right6 =
      _mm_mullo_epi16(inverted_weights6, top_right);
  const __m128i scaled_top_right7 =
      _mm_mullo_epi16(inverted_weights7, top_right);
  const __m128i scaled_top_right8 =
      _mm_mullo_epi16(inverted_weights8, top_right);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left1, y_select);
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                   scaled_top_right1, scaled_top_right2, round);
    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
                                   scaled_top_right3, scaled_top_right4, round);
    write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
                                   scaled_top_right5, scaled_top_right6, round);
    write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
                                   scaled_top_right7, scaled_top_right8, round);
    dst += stride;
  }
  const __m128i left2 = cvtepu8_epi16(LoadLo8(left_column + 8));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left2, y_select);
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                   scaled_top_right1, scaled_top_right2, round);
    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
                                   scaled_top_right3, scaled_top_right4, round);
    write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
                                   scaled_top_right5, scaled_top_right6, round);
    write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
                                   scaled_top_right7, scaled_top_right8, round);
    dst += stride;
  }
  const __m128i left3 = cvtepu8_epi16(LoadLo8(left_column + 16));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left3, y_select);
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                   scaled_top_right1, scaled_top_right2, round);
    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
                                   scaled_top_right3, scaled_top_right4, round);
    write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
                                   scaled_top_right5, scaled_top_right6, round);
    write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
                                   scaled_top_right7, scaled_top_right8, round);
    dst += stride;
  }
  const __m128i left4 = cvtepu8_epi16(LoadLo8(left_column + 24));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left4, y_select);
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                   scaled_top_right1, scaled_top_right2, round);
    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
                                   scaled_top_right3, scaled_top_right4, round);
    write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
                                   scaled_top_right5, scaled_top_right6, round);
    write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
                                   scaled_top_right7, scaled_top_right8, round);
    dst += stride;
  }
}

void aom_smooth_h_predictor_64x64_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i top_right = _mm_set1_epi16(top_row[63]);
  const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60);
  const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76);
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i weights1 = cvtepu8_epi16(weights_lolo);
  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
  const __m128i weights3 = cvtepu8_epi16(weights_lohi);
  const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
  const __m128i scaled_top_right1 =
      _mm_mullo_epi16(inverted_weights1, top_right);
  const __m128i scaled_top_right2 =
      _mm_mullo_epi16(inverted_weights2, top_right);
  const __m128i scaled_top_right3 =
      _mm_mullo_epi16(inverted_weights3, top_right);
  const __m128i scaled_top_right4 =
      _mm_mullo_epi16(inverted_weights4, top_right);
  const __m128i weights_hilo = LoadUnaligned16(smooth_weights + 92);
  const __m128i weights_hihi = LoadUnaligned16(smooth_weights + 108);
  const __m128i weights5 = cvtepu8_epi16(weights_hilo);
  const __m128i weights6 = cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
  const __m128i weights7 = cvtepu8_epi16(weights_hihi);
  const __m128i weights8 = cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
  const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
  const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
  const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
  const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
  const __m128i scaled_top_right5 =
      _mm_mullo_epi16(inverted_weights5, top_right);
  const __m128i scaled_top_right6 =
      _mm_mullo_epi16(inverted_weights6, top_right);
  const __m128i scaled_top_right7 =
      _mm_mullo_epi16(inverted_weights7, top_right);
  const __m128i scaled_top_right8 =
      _mm_mullo_epi16(inverted_weights8, top_right);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  for (int left_offset = 0; left_offset < 64; left_offset += 8) {
    const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
      const __m128i y_select = _mm_set1_epi32(y_mask);
      const __m128i left_y = _mm_shuffle_epi8(left, y_select);
      write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                     scaled_top_right1, scaled_top_right2,
                                     round);
      write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3,
                                     weights4, scaled_top_right3,
                                     scaled_top_right4, round);
      write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5,
                                     weights6, scaled_top_right5,
                                     scaled_top_right6, round);
      write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7,
                                     weights8, scaled_top_right7,
                                     scaled_top_right8, round);
      dst += stride;
    }
  }
}