/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <stdlib.h>
#include <string.h>
#include <tmmintrin.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"

#include "aom/aom_integer.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/blend.h"
#include "aom_dsp/x86/masked_variance_intrin_ssse3.h"
#include "aom_dsp/x86/synonyms.h"
#include "aom_ports/mem.h"

// For width a multiple of 16
static void bilinear_filter(const uint8_t *src, int src_stride, int xoffset,
                            int yoffset, uint8_t *dst, int w, int h);

static void bilinear_filter8xh(const uint8_t *src, int src_stride, int xoffset,
                               int yoffset, uint8_t *dst, int h);

static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset,
                               int yoffset, uint8_t *dst, int h);

// For width a multiple of 16
static void masked_variance(const uint8_t *src_ptr, int src_stride,
                            const uint8_t *a_ptr, int a_stride,
                            const uint8_t *b_ptr, int b_stride,
                            const uint8_t *m_ptr, int m_stride, int width,
                            int height, unsigned int *sse, int *sum_);

static void masked_variance8xh(const uint8_t *src_ptr, int src_stride,
                               const uint8_t *a_ptr, const uint8_t *b_ptr,
                               const uint8_t *m_ptr, int m_stride, int height,
                               unsigned int *sse, int *sum_);

static void masked_variance4xh(const uint8_t *src_ptr, int src_stride,
                               const uint8_t *a_ptr, const uint8_t *b_ptr,
                               const uint8_t *m_ptr, int m_stride, int height,
                               unsigned int *sse, int *sum_);

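// Each MASK_SUBPIX_VAR*_SSSE3 macro below expands to one
// aom_masked_sub_pixel_variance<W>x<H>_ssse3() function: the source block is
// bilinear-filtered into 'temp' at the given sub-pixel offset, 'temp' and
// 'second_pred' are blended under the 64-weight mask (roles swapped when
// 'invert_mask' is set), and the variance of the blend against 'ref' is
// returned as sse - sum^2 / (W * H).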
#define MASK_SUBPIX_VAR_SSSE3(W, H)                                   \
  unsigned int aom_masked_sub_pixel_variance##W##x##H##_ssse3(        \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,   \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
      const uint8_t *msk, int msk_stride, int invert_mask,            \
      unsigned int *sse) {                                            \
    int sum;                                                          \
    uint8_t temp[(H + 1) * W];                                        \
                                                                      \
    bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H);   \
                                                                      \
    if (!invert_mask)                                                 \
      masked_variance(ref, ref_stride, temp, W, second_pred, W, msk,  \
                      msk_stride, W, H, sse, &sum);                   \
    else                                                              \
      masked_variance(ref, ref_stride, second_pred, W, temp, W, msk,  \
                      msk_stride, W, H, sse, &sum);                   \
    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));         \
  }

#define MASK_SUBPIX_VAR8XH_SSSE3(H)                                            \
  unsigned int aom_masked_sub_pixel_variance8x##H##_ssse3(                     \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,          \
      const uint8_t *msk, int msk_stride, int invert_mask,                     \
      unsigned int *sse) {                                                     \
    int sum;                                                                   \
    uint8_t temp[(H + 1) * 8];                                                 \
                                                                               \
    bilinear_filter8xh(src, src_stride, xoffset, yoffset, temp, H);            \
                                                                               \
    if (!invert_mask)                                                          \
      masked_variance8xh(ref, ref_stride, temp, second_pred, msk, msk_stride,  \
                         H, sse, &sum);                                        \
    else                                                                       \
      masked_variance8xh(ref, ref_stride, second_pred, temp, msk, msk_stride,  \
                         H, sse, &sum);                                        \
    return *sse - (uint32_t)(((int64_t)sum * sum) / (8 * H));                  \
  }

#define MASK_SUBPIX_VAR4XH_SSSE3(H)                                            \
  unsigned int aom_masked_sub_pixel_variance4x##H##_ssse3(                     \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,          \
      const uint8_t *msk, int msk_stride, int invert_mask,                     \
      unsigned int *sse) {                                                     \
    int sum;                                                                   \
    uint8_t temp[(H + 1) * 4];                                                 \
                                                                               \
    bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H);            \
                                                                               \
    if (!invert_mask)                                                          \
      masked_variance4xh(ref, ref_stride, temp, second_pred, msk, msk_stride,  \
                         H, sse, &sum);                                        \
    else                                                                       \
      masked_variance4xh(ref, ref_stride, second_pred, temp, msk, msk_stride,  \
                         H, sse, &sum);                                        \
    return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H));                  \
  }

MASK_SUBPIX_VAR_SSSE3(128, 128)
MASK_SUBPIX_VAR_SSSE3(128, 64)
MASK_SUBPIX_VAR_SSSE3(64, 128)
MASK_SUBPIX_VAR_SSSE3(64, 64)
MASK_SUBPIX_VAR_SSSE3(64, 32)
MASK_SUBPIX_VAR_SSSE3(32, 64)
MASK_SUBPIX_VAR_SSSE3(32, 32)
MASK_SUBPIX_VAR_SSSE3(32, 16)
MASK_SUBPIX_VAR_SSSE3(16, 32)
MASK_SUBPIX_VAR_SSSE3(16, 16)
MASK_SUBPIX_VAR_SSSE3(16, 8)
MASK_SUBPIX_VAR8XH_SSSE3(16)
MASK_SUBPIX_VAR8XH_SSSE3(8)
MASK_SUBPIX_VAR8XH_SSSE3(4)
MASK_SUBPIX_VAR4XH_SSSE3(8)
MASK_SUBPIX_VAR4XH_SSSE3(4)
MASK_SUBPIX_VAR4XH_SSSE3(16)
MASK_SUBPIX_VAR_SSSE3(16, 4)
MASK_SUBPIX_VAR8XH_SSSE3(32)
MASK_SUBPIX_VAR_SSSE3(32, 8)
MASK_SUBPIX_VAR_SSSE3(64, 16)
MASK_SUBPIX_VAR_SSSE3(16, 64)

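// Apply the 2-tap filter in 'filter' to 16 pixel pairs: 'a' holds the first
// tap's pixels and 'b' the second tap's. _mm_maddubs_epi16 forms
// a[i] * f0 + b[i] * f1 in 16 bits, the results are rounded by FILTER_BITS,
// and the two halves are packed back down to 8 bits.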
static INLINE __m128i filter_block(const __m128i a, const __m128i b,
                                   const __m128i filter) {
  __m128i v0 = _mm_unpacklo_epi8(a, b);
  v0 = _mm_maddubs_epi16(v0, filter);
  v0 = xx_roundn_epu16(v0, FILTER_BITS);

  __m128i v1 = _mm_unpackhi_epi8(a, b);
  v1 = _mm_maddubs_epi16(v1, filter);
  v1 = xx_roundn_epu16(v1, FILTER_BITS);

  return _mm_packus_epi16(v0, v1);
}

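// Horizontal then vertical bilinear filter for widths that are a multiple of
// 16. Each direction has three cases: offset 0 is a plain copy, offset 4
// (half-pel) is a pixel average, and any other offset uses the 2-tap filter
// from bilinear_filters_2t. The horizontal pass writes h + 1 rows into 'dst'
// so the vertical pass has the extra row it needs.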
static void bilinear_filter(const uint8_t *src, int src_stride, int xoffset,
                            int yoffset, uint8_t *dst, int w, int h) {
  int i, j;
  // Horizontal filter
  if (xoffset == 0) {
    uint8_t *b = dst;
    for (i = 0; i < h + 1; ++i) {
      for (j = 0; j < w; j += 16) {
        __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
        _mm_storeu_si128((__m128i *)&b[j], x);
      }
      src += src_stride;
      b += w;
    }
  } else if (xoffset == 4) {
    uint8_t *b = dst;
    for (i = 0; i < h + 1; ++i) {
      for (j = 0; j < w; j += 16) {
        __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
        __m128i y = _mm_loadu_si128((__m128i *)&src[j + 16]);
        __m128i z = _mm_alignr_epi8(y, x, 1);
        _mm_storeu_si128((__m128i *)&b[j], _mm_avg_epu8(x, z));
      }
      src += src_stride;
      b += w;
    }
  } else {
    uint8_t *b = dst;
    const uint8_t *hfilter = bilinear_filters_2t[xoffset];
    const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8));
    for (i = 0; i < h + 1; ++i) {
      for (j = 0; j < w; j += 16) {
        const __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
        const __m128i y = _mm_loadu_si128((__m128i *)&src[j + 16]);
        const __m128i z = _mm_alignr_epi8(y, x, 1);
        const __m128i res = filter_block(x, z, hfilter_vec);
        _mm_storeu_si128((__m128i *)&b[j], res);
      }

      src += src_stride;
      b += w;
    }
  }

  // Vertical filter
  if (yoffset == 0) {
    // The data is already in 'dst', so no need to filter
  } else if (yoffset == 4) {
    for (i = 0; i < h; ++i) {
      for (j = 0; j < w; j += 16) {
        __m128i x = _mm_loadu_si128((__m128i *)&dst[j]);
        __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]);
        _mm_storeu_si128((__m128i *)&dst[j], _mm_avg_epu8(x, y));
      }
      dst += w;
    }
  } else {
    const uint8_t *vfilter = bilinear_filters_2t[yoffset];
    const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8));
    for (i = 0; i < h; ++i) {
      for (j = 0; j < w; j += 16) {
        const __m128i x = _mm_loadu_si128((__m128i *)&dst[j]);
        const __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]);
        const __m128i res = filter_block(x, y, vfilter_vec);
        _mm_storeu_si128((__m128i *)&dst[j], res);
      }

      dst += w;
    }
  }
}

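// Same 2-tap filter as filter_block(), but fed from two independent row pairs
// (a0/b0 and a1/b1) so that narrower blocks can still fill a whole 128-bit
// register per call.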
static INLINE __m128i filter_block_2rows(const __m128i *a0, const __m128i *b0,
                                         const __m128i *a1, const __m128i *b1,
                                         const __m128i *filter) {
  __m128i v0 = _mm_unpacklo_epi8(*a0, *b0);
  v0 = _mm_maddubs_epi16(v0, *filter);
  v0 = xx_roundn_epu16(v0, FILTER_BITS);

  __m128i v1 = _mm_unpacklo_epi8(*a1, *b1);
  v1 = _mm_maddubs_epi16(v1, *filter);
  v1 = xx_roundn_epu16(v1, FILTER_BITS);

  return _mm_packus_epi16(v0, v1);
}

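// 8-wide version of bilinear_filter(). The 2-tap branches process two rows
// per iteration; since the horizontal pass must produce h + 1 rows, the final
// row is filtered on its own after the loop.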
static void bilinear_filter8xh(const uint8_t *src, int src_stride, int xoffset,
                               int yoffset, uint8_t *dst, int h) {
  int i;
  // Horizontal filter
  if (xoffset == 0) {
    uint8_t *b = dst;
    for (i = 0; i < h + 1; ++i) {
      __m128i x = _mm_loadl_epi64((__m128i *)src);
      _mm_storel_epi64((__m128i *)b, x);
      src += src_stride;
      b += 8;
    }
  } else if (xoffset == 4) {
    uint8_t *b = dst;
    for (i = 0; i < h + 1; ++i) {
      __m128i x = _mm_loadu_si128((__m128i *)src);
      __m128i z = _mm_srli_si128(x, 1);
      _mm_storel_epi64((__m128i *)b, _mm_avg_epu8(x, z));
      src += src_stride;
      b += 8;
    }
  } else {
    uint8_t *b = dst;
    const uint8_t *hfilter = bilinear_filters_2t[xoffset];
    const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8));
    for (i = 0; i < h; i += 2) {
      const __m128i x0 = _mm_loadu_si128((__m128i *)src);
      const __m128i z0 = _mm_srli_si128(x0, 1);
      const __m128i x1 = _mm_loadu_si128((__m128i *)&src[src_stride]);
      const __m128i z1 = _mm_srli_si128(x1, 1);
      const __m128i res = filter_block_2rows(&x0, &z0, &x1, &z1, &hfilter_vec);
      _mm_storeu_si128((__m128i *)b, res);

      src += src_stride * 2;
      b += 16;
    }
    // Handle i = h separately
    const __m128i x0 = _mm_loadu_si128((__m128i *)src);
    const __m128i z0 = _mm_srli_si128(x0, 1);

    __m128i v0 = _mm_unpacklo_epi8(x0, z0);
    v0 = _mm_maddubs_epi16(v0, hfilter_vec);
    v0 = xx_roundn_epu16(v0, FILTER_BITS);

    _mm_storel_epi64((__m128i *)b, _mm_packus_epi16(v0, v0));
  }

  // Vertical filter
  if (yoffset == 0) {
    // The data is already in 'dst', so no need to filter
  } else if (yoffset == 4) {
    for (i = 0; i < h; ++i) {
      __m128i x = _mm_loadl_epi64((__m128i *)dst);
      __m128i y = _mm_loadl_epi64((__m128i *)&dst[8]);
      _mm_storel_epi64((__m128i *)dst, _mm_avg_epu8(x, y));
      dst += 8;
    }
  } else {
    const uint8_t *vfilter = bilinear_filters_2t[yoffset];
    const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8));
    for (i = 0; i < h; i += 2) {
      const __m128i x = _mm_loadl_epi64((__m128i *)dst);
      const __m128i y = _mm_loadl_epi64((__m128i *)&dst[8]);
      const __m128i z = _mm_loadl_epi64((__m128i *)&dst[16]);
      const __m128i res = filter_block_2rows(&x, &y, &y, &z, &vfilter_vec);
      _mm_storeu_si128((__m128i *)dst, res);

      dst += 16;
    }
  }
}

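// 4-wide version of bilinear_filter(). The 2-tap branches pack four rows into
// each 128-bit register, and again the extra (h + 1)-th row of the horizontal
// pass is produced separately after the loop.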
static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset,
                               int yoffset, uint8_t *dst, int h) {
  int i;
  // Horizontal filter
  if (xoffset == 0) {
    uint8_t *b = dst;
    for (i = 0; i < h + 1; ++i) {
      __m128i x = xx_loadl_32((__m128i *)src);
      xx_storel_32((__m128i *)b, x);
      src += src_stride;
      b += 4;
    }
  } else if (xoffset == 4) {
    uint8_t *b = dst;
    for (i = 0; i < h + 1; ++i) {
      __m128i x = _mm_loadl_epi64((__m128i *)src);
      __m128i z = _mm_srli_si128(x, 1);
      xx_storel_32((__m128i *)b, _mm_avg_epu8(x, z));
      src += src_stride;
      b += 4;
    }
  } else {
    uint8_t *b = dst;
    const uint8_t *hfilter = bilinear_filters_2t[xoffset];
    const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8));
    for (i = 0; i < h; i += 4) {
      const __m128i x0 = _mm_loadl_epi64((__m128i *)src);
      const __m128i z0 = _mm_srli_si128(x0, 1);
      const __m128i x1 = _mm_loadl_epi64((__m128i *)&src[src_stride]);
      const __m128i z1 = _mm_srli_si128(x1, 1);
      const __m128i x2 = _mm_loadl_epi64((__m128i *)&src[src_stride * 2]);
      const __m128i z2 = _mm_srli_si128(x2, 1);
      const __m128i x3 = _mm_loadl_epi64((__m128i *)&src[src_stride * 3]);
      const __m128i z3 = _mm_srli_si128(x3, 1);

      const __m128i a0 = _mm_unpacklo_epi32(x0, x1);
      const __m128i b0 = _mm_unpacklo_epi32(z0, z1);
      const __m128i a1 = _mm_unpacklo_epi32(x2, x3);
      const __m128i b1 = _mm_unpacklo_epi32(z2, z3);
      const __m128i res = filter_block_2rows(&a0, &b0, &a1, &b1, &hfilter_vec);
      _mm_storeu_si128((__m128i *)b, res);

      src += src_stride * 4;
      b += 16;
    }
    // Handle i = h separately
    const __m128i x = _mm_loadl_epi64((__m128i *)src);
    const __m128i z = _mm_srli_si128(x, 1);

    __m128i v0 = _mm_unpacklo_epi8(x, z);
    v0 = _mm_maddubs_epi16(v0, hfilter_vec);
    v0 = xx_roundn_epu16(v0, FILTER_BITS);

    xx_storel_32((__m128i *)b, _mm_packus_epi16(v0, v0));
  }

  // Vertical filter
  if (yoffset == 0) {
    // The data is already in 'dst', so no need to filter
  } else if (yoffset == 4) {
    for (i = 0; i < h; ++i) {
      __m128i x = xx_loadl_32((__m128i *)dst);
      __m128i y = xx_loadl_32((__m128i *)&dst[4]);
      xx_storel_32((__m128i *)dst, _mm_avg_epu8(x, y));
      dst += 4;
    }
  } else {
    const uint8_t *vfilter = bilinear_filters_2t[yoffset];
    const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8));
    for (i = 0; i < h; i += 4) {
      const __m128i a = xx_loadl_32((__m128i *)dst);
      const __m128i b = xx_loadl_32((__m128i *)&dst[4]);
      const __m128i c = xx_loadl_32((__m128i *)&dst[8]);
      const __m128i d = xx_loadl_32((__m128i *)&dst[12]);
      const __m128i e = xx_loadl_32((__m128i *)&dst[16]);

      const __m128i a0 = _mm_unpacklo_epi32(a, b);
      const __m128i b0 = _mm_unpacklo_epi32(b, c);
      const __m128i a1 = _mm_unpacklo_epi32(c, d);
      const __m128i b1 = _mm_unpacklo_epi32(d, e);
      const __m128i res = filter_block_2rows(&a0, &b0, &a1, &b1, &vfilter_vec);
      _mm_storeu_si128((__m128i *)dst, res);

      dst += 16;
    }
  }
}

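// Blend one group of 16 pixels from 'a' and 'b' using the 64-weight mask,
// i.e. pred = (m * a + (64 - m) * b + 32) >> 6, subtract 'src', and
// accumulate the differences into *sum and the squared differences into
// *sum_sq as four 32-bit partial sums each.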
static INLINE void accumulate_block(const __m128i *src, const __m128i *a,
                                    const __m128i *b, const __m128i *m,
                                    __m128i *sum, __m128i *sum_sq) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i one = _mm_set1_epi16(1);
  const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
  const __m128i m_inv = _mm_sub_epi8(mask_max, *m);

  // Calculate 16 predicted pixels.
  // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
  // is 64 * 255, so we have plenty of space to add rounding constants.
  const __m128i data_l = _mm_unpacklo_epi8(*a, *b);
  const __m128i mask_l = _mm_unpacklo_epi8(*m, m_inv);
  __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l);
  pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS);

  const __m128i data_r = _mm_unpackhi_epi8(*a, *b);
  const __m128i mask_r = _mm_unpackhi_epi8(*m, m_inv);
  __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r);
  pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS);

  const __m128i src_l = _mm_unpacklo_epi8(*src, zero);
  const __m128i src_r = _mm_unpackhi_epi8(*src, zero);
  const __m128i diff_l = _mm_sub_epi16(pred_l, src_l);
  const __m128i diff_r = _mm_sub_epi16(pred_r, src_r);

  // Update partial sums and partial sums of squares
  *sum =
      _mm_add_epi32(*sum, _mm_madd_epi16(_mm_add_epi16(diff_l, diff_r), one));
  *sum_sq =
      _mm_add_epi32(*sum_sq, _mm_add_epi32(_mm_madd_epi16(diff_l, diff_l),
                                           _mm_madd_epi16(diff_r, diff_r)));
}

static void masked_variance(const uint8_t *src_ptr, int src_stride,
                            const uint8_t *a_ptr, int a_stride,
                            const uint8_t *b_ptr, int b_stride,
                            const uint8_t *m_ptr, int m_stride, int width,
                            int height, unsigned int *sse, int *sum_) {
  int x, y;
  __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();

  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x += 16) {
      const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
      const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
      const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
      const __m128i m = _mm_loadu_si128((const __m128i *)&m_ptr[x]);
      accumulate_block(&src, &a, &b, &m, &sum, &sum_sq);
    }

    src_ptr += src_stride;
    a_ptr += a_stride;
    b_ptr += b_stride;
    m_ptr += m_stride;
  }
  // Reduce down to a single sum and sum of squares
  sum = _mm_hadd_epi32(sum, sum_sq);
  sum = _mm_hadd_epi32(sum, sum);
  *sum_ = _mm_cvtsi128_si32(sum);
  *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
}

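// 8-wide variant: two rows of 'src' and of the mask are packed into a single
// register per iteration. 'a_ptr' and 'b_ptr' are contiguous 8-wide buffers,
// so they simply advance by 16 bytes per pair of rows.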
static void masked_variance8xh(const uint8_t *src_ptr, int src_stride,
                               const uint8_t *a_ptr, const uint8_t *b_ptr,
                               const uint8_t *m_ptr, int m_stride, int height,
                               unsigned int *sse, int *sum_) {
  int y;
  __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();

  for (y = 0; y < height; y += 2) {
    __m128i src = _mm_unpacklo_epi64(
        _mm_loadl_epi64((const __m128i *)src_ptr),
        _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
    const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr);
    const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr);
    const __m128i m =
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)m_ptr),
                           _mm_loadl_epi64((const __m128i *)&m_ptr[m_stride]));
    accumulate_block(&src, &a, &b, &m, &sum, &sum_sq);

    src_ptr += src_stride * 2;
    a_ptr += 16;
    b_ptr += 16;
    m_ptr += m_stride * 2;
  }
  // Reduce down to a single sum and sum of squares
  sum = _mm_hadd_epi32(sum, sum_sq);
  sum = _mm_hadd_epi32(sum, sum);
  *sum_ = _mm_cvtsi128_si32(sum);
  *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
}

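// 4-wide variant: four rows of 'src' and of the mask are gathered with 32-bit
// loads into a single register per iteration.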
static void masked_variance4xh(const uint8_t *src_ptr, int src_stride,
                               const uint8_t *a_ptr, const uint8_t *b_ptr,
                               const uint8_t *m_ptr, int m_stride, int height,
                               unsigned int *sse, int *sum_) {
  int y;
  __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();

  for (y = 0; y < height; y += 4) {
    // Load four rows at a time
    __m128i src =
        _mm_setr_epi32(*(uint32_t *)src_ptr, *(uint32_t *)&src_ptr[src_stride],
                       *(uint32_t *)&src_ptr[src_stride * 2],
                       *(uint32_t *)&src_ptr[src_stride * 3]);
    const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr);
    const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr);
    const __m128i m = _mm_setr_epi32(
        *(uint32_t *)m_ptr, *(uint32_t *)&m_ptr[m_stride],
        *(uint32_t *)&m_ptr[m_stride * 2], *(uint32_t *)&m_ptr[m_stride * 3]);
    accumulate_block(&src, &a, &b, &m, &sum, &sum_sq);

    src_ptr += src_stride * 4;
    a_ptr += 16;
    b_ptr += 16;
    m_ptr += m_stride * 4;
  }
  // Reduce down to a single sum and sum of squares
  sum = _mm_hadd_epi32(sum, sum_sq);
  sum = _mm_hadd_epi32(sum, sum);
  *sum_ = _mm_cvtsi128_si32(sum);
  *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
}

#if CONFIG_AV1_HIGHBITDEPTH
// For width a multiple of 8
static void highbd_bilinear_filter(const uint16_t *src, int src_stride,
                                   int xoffset, int yoffset, uint16_t *dst,
                                   int w, int h);

static void highbd_bilinear_filter4xh(const uint16_t *src, int src_stride,
                                      int xoffset, int yoffset, uint16_t *dst,
                                      int h);

// For width a multiple of 8
static void highbd_masked_variance(const uint16_t *src_ptr, int src_stride,
                                   const uint16_t *a_ptr, int a_stride,
                                   const uint16_t *b_ptr, int b_stride,
                                   const uint8_t *m_ptr, int m_stride,
                                   int width, int height, uint64_t *sse,
                                   int *sum_);

static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride,
                                      const uint16_t *a_ptr,
                                      const uint16_t *b_ptr,
                                      const uint8_t *m_ptr, int m_stride,
                                      int height, int *sse, int *sum_);

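// The high-bitdepth macros follow the same pattern as the 8-bit ones, but
// expand to 8-, 10- and 12-bit functions per block size. The 10-bit variants
// scale the accumulated sse by 1/16 and sum by 1/4, and the 12-bit variants
// by 1/256 and 1/16, so the returned values are on the same scale as the
// 8-bit case; the variance is clamped at zero in case rounding pushes it
// negative.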
#define HIGHBD_MASK_SUBPIX_VAR_SSSE3(W, H)                                   \
  unsigned int aom_highbd_8_masked_sub_pixel_variance##W##x##H##_ssse3(      \
      const uint8_t *src8, int src_stride, int xoffset, int yoffset,         \
      const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8,      \
      const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) {  \
    uint64_t sse64;                                                          \
    int sum;                                                                 \
    uint16_t temp[(H + 1) * W];                                              \
    const uint16_t *src = CONVERT_TO_SHORTPTR(src8);                         \
    const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                         \
    const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8);         \
                                                                             \
    highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H);   \
                                                                             \
    if (!invert_mask)                                                        \
      highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk,  \
                             msk_stride, W, H, &sse64, &sum);                \
    else                                                                     \
      highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk,  \
                             msk_stride, W, H, &sse64, &sum);                \
    *sse = (uint32_t)sse64;                                                  \
    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));                \
  }                                                                          \
  unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_ssse3(     \
      const uint8_t *src8, int src_stride, int xoffset, int yoffset,         \
      const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8,      \
      const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) {  \
    uint64_t sse64;                                                          \
    int sum;                                                                 \
    int64_t var;                                                             \
    uint16_t temp[(H + 1) * W];                                              \
    const uint16_t *src = CONVERT_TO_SHORTPTR(src8);                         \
    const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                         \
    const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8);         \
                                                                             \
    highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H);   \
                                                                             \
    if (!invert_mask)                                                        \
      highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk,  \
                             msk_stride, W, H, &sse64, &sum);                \
    else                                                                     \
      highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk,  \
                             msk_stride, W, H, &sse64, &sum);                \
    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 4);                           \
    sum = ROUND_POWER_OF_TWO(sum, 2);                                        \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));                \
    return (var >= 0) ? (uint32_t)var : 0;                                   \
  }                                                                          \
  unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_ssse3(     \
      const uint8_t *src8, int src_stride, int xoffset, int yoffset,         \
      const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8,      \
      const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) {  \
    uint64_t sse64;                                                          \
    int sum;                                                                 \
    int64_t var;                                                             \
    uint16_t temp[(H + 1) * W];                                              \
    const uint16_t *src = CONVERT_TO_SHORTPTR(src8);                         \
    const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                         \
    const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8);         \
                                                                             \
    highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H);   \
                                                                             \
    if (!invert_mask)                                                        \
      highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk,  \
                             msk_stride, W, H, &sse64, &sum);                \
    else                                                                     \
      highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk,  \
                             msk_stride, W, H, &sse64, &sum);                \
    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 8);                           \
    sum = ROUND_POWER_OF_TWO(sum, 4);                                        \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));                \
    return (var >= 0) ? (uint32_t)var : 0;                                   \
  }

#define HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(H)                                   \
  unsigned int aom_highbd_8_masked_sub_pixel_variance4x##H##_ssse3(          \
      const uint8_t *src8, int src_stride, int xoffset, int yoffset,         \
      const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8,      \
      const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) {  \
    int sse_;                                                                \
    int sum;                                                                 \
    uint16_t temp[(H + 1) * 4];                                              \
    const uint16_t *src = CONVERT_TO_SHORTPTR(src8);                         \
    const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                         \
    const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8);         \
                                                                             \
    highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H);   \
                                                                             \
    if (!invert_mask)                                                        \
      highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk,     \
                                msk_stride, H, &sse_, &sum);                 \
    else                                                                     \
      highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk,     \
                                msk_stride, H, &sse_, &sum);                 \
    *sse = (uint32_t)sse_;                                                   \
    return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H));                \
  }                                                                          \
  unsigned int aom_highbd_10_masked_sub_pixel_variance4x##H##_ssse3(         \
      const uint8_t *src8, int src_stride, int xoffset, int yoffset,         \
      const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8,      \
      const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) {  \
    int sse_;                                                                \
    int sum;                                                                 \
    int64_t var;                                                             \
    uint16_t temp[(H + 1) * 4];                                              \
    const uint16_t *src = CONVERT_TO_SHORTPTR(src8);                         \
    const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                         \
    const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8);         \
                                                                             \
    highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H);   \
                                                                             \
    if (!invert_mask)                                                        \
      highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk,     \
                                msk_stride, H, &sse_, &sum);                 \
    else                                                                     \
      highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk,     \
                                msk_stride, H, &sse_, &sum);                 \
    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_, 4);                            \
    sum = ROUND_POWER_OF_TWO(sum, 2);                                        \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (4 * H));                \
    return (var >= 0) ? (uint32_t)var : 0;                                   \
  }                                                                          \
  unsigned int aom_highbd_12_masked_sub_pixel_variance4x##H##_ssse3(         \
      const uint8_t *src8, int src_stride, int xoffset, int yoffset,         \
      const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8,      \
      const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) {  \
    int sse_;                                                                \
    int sum;                                                                 \
    int64_t var;                                                             \
    uint16_t temp[(H + 1) * 4];                                              \
    const uint16_t *src = CONVERT_TO_SHORTPTR(src8);                         \
    const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                         \
    const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8);         \
                                                                             \
    highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H);   \
                                                                             \
    if (!invert_mask)                                                        \
      highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk,     \
                                msk_stride, H, &sse_, &sum);                 \
    else                                                                     \
      highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk,     \
                                msk_stride, H, &sse_, &sum);                 \
    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_, 8);                            \
    sum = ROUND_POWER_OF_TWO(sum, 4);                                        \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (4 * H));                \
    return (var >= 0) ? (uint32_t)var : 0;                                   \
  }

HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 128)
HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 64)
HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 128)
HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 64)
HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 32)
HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 64)
HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 32)
HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 16)
HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 32)
HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 16)
HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 8)
HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 16)
HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 8)
HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 4)
HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(8)
HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(4)
HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(16)
HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 4)
HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 32)
HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 8)
HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 64)
HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 16)

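// High-bitdepth counterpart of filter_block(): pixels are 16 bits, so the
// taps are applied with _mm_madd_epi16 using 32-bit intermediates, rounded by
// FILTER_BITS, and packed back to 16 bits.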
static INLINE __m128i highbd_filter_block(const __m128i a, const __m128i b,
                                          const __m128i filter) {
  __m128i v0 = _mm_unpacklo_epi16(a, b);
  v0 = _mm_madd_epi16(v0, filter);
  v0 = xx_roundn_epu32(v0, FILTER_BITS);

  __m128i v1 = _mm_unpackhi_epi16(a, b);
  v1 = _mm_madd_epi16(v1, filter);
  v1 = xx_roundn_epu32(v1, FILTER_BITS);

  return _mm_packs_epi32(v0, v1);
}

static void highbd_bilinear_filter(const uint16_t *src, int src_stride,
                                   int xoffset, int yoffset, uint16_t *dst,
                                   int w, int h) {
  int i, j;
  // Horizontal filter
  if (xoffset == 0) {
    uint16_t *b = dst;
    for (i = 0; i < h + 1; ++i) {
      for (j = 0; j < w; j += 8) {
        __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
        _mm_storeu_si128((__m128i *)&b[j], x);
      }
      src += src_stride;
      b += w;
    }
  } else if (xoffset == 4) {
    uint16_t *b = dst;
    for (i = 0; i < h + 1; ++i) {
      for (j = 0; j < w; j += 8) {
        __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
        __m128i y = _mm_loadu_si128((__m128i *)&src[j + 8]);
        __m128i z = _mm_alignr_epi8(y, x, 2);
        _mm_storeu_si128((__m128i *)&b[j], _mm_avg_epu16(x, z));
      }
      src += src_stride;
      b += w;
    }
  } else {
    uint16_t *b = dst;
    const uint8_t *hfilter = bilinear_filters_2t[xoffset];
    const __m128i hfilter_vec = _mm_set1_epi32(hfilter[0] | (hfilter[1] << 16));
    for (i = 0; i < h + 1; ++i) {
      for (j = 0; j < w; j += 8) {
        const __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
        const __m128i y = _mm_loadu_si128((__m128i *)&src[j + 8]);
        const __m128i z = _mm_alignr_epi8(y, x, 2);
        const __m128i res = highbd_filter_block(x, z, hfilter_vec);
        _mm_storeu_si128((__m128i *)&b[j], res);
      }

      src += src_stride;
      b += w;
    }
  }

  // Vertical filter
  if (yoffset == 0) {
    // The data is already in 'dst', so no need to filter
  } else if (yoffset == 4) {
    for (i = 0; i < h; ++i) {
      for (j = 0; j < w; j += 8) {
        __m128i x = _mm_loadu_si128((__m128i *)&dst[j]);
        __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]);
        _mm_storeu_si128((__m128i *)&dst[j], _mm_avg_epu16(x, y));
      }
      dst += w;
    }
  } else {
    const uint8_t *vfilter = bilinear_filters_2t[yoffset];
    const __m128i vfilter_vec = _mm_set1_epi32(vfilter[0] | (vfilter[1] << 16));
    for (i = 0; i < h; ++i) {
      for (j = 0; j < w; j += 8) {
        const __m128i x = _mm_loadu_si128((__m128i *)&dst[j]);
        const __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]);
        const __m128i res = highbd_filter_block(x, y, vfilter_vec);
        _mm_storeu_si128((__m128i *)&dst[j], res);
      }

      dst += w;
    }
  }
}

static INLINE __m128i highbd_filter_block_2rows(const __m128i *a0,
                                                const __m128i *b0,
                                                const __m128i *a1,
                                                const __m128i *b1,
                                                const __m128i *filter) {
  __m128i v0 = _mm_unpacklo_epi16(*a0, *b0);
  v0 = _mm_madd_epi16(v0, *filter);
  v0 = xx_roundn_epu32(v0, FILTER_BITS);

  __m128i v1 = _mm_unpacklo_epi16(*a1, *b1);
  v1 = _mm_madd_epi16(v1, *filter);
  v1 = xx_roundn_epu32(v1, FILTER_BITS);

  return _mm_packs_epi32(v0, v1);
}

static void highbd_bilinear_filter4xh(const uint16_t *src, int src_stride,
                                      int xoffset, int yoffset, uint16_t *dst,
                                      int h) {
  int i;
  // Horizontal filter
  if (xoffset == 0) {
    uint16_t *b = dst;
    for (i = 0; i < h + 1; ++i) {
      __m128i x = _mm_loadl_epi64((__m128i *)src);
      _mm_storel_epi64((__m128i *)b, x);
      src += src_stride;
      b += 4;
    }
  } else if (xoffset == 4) {
    uint16_t *b = dst;
    for (i = 0; i < h + 1; ++i) {
      __m128i x = _mm_loadu_si128((__m128i *)src);
      __m128i z = _mm_srli_si128(x, 2);
      _mm_storel_epi64((__m128i *)b, _mm_avg_epu16(x, z));
      src += src_stride;
      b += 4;
    }
  } else {
    uint16_t *b = dst;
    const uint8_t *hfilter = bilinear_filters_2t[xoffset];
    const __m128i hfilter_vec = _mm_set1_epi32(hfilter[0] | (hfilter[1] << 16));
    for (i = 0; i < h; i += 2) {
      const __m128i x0 = _mm_loadu_si128((__m128i *)src);
      const __m128i z0 = _mm_srli_si128(x0, 2);
      const __m128i x1 = _mm_loadu_si128((__m128i *)&src[src_stride]);
      const __m128i z1 = _mm_srli_si128(x1, 2);
      const __m128i res =
          highbd_filter_block_2rows(&x0, &z0, &x1, &z1, &hfilter_vec);
      _mm_storeu_si128((__m128i *)b, res);

      src += src_stride * 2;
      b += 8;
    }
    // Process i = h separately
    __m128i x = _mm_loadu_si128((__m128i *)src);
    __m128i z = _mm_srli_si128(x, 2);

    __m128i v0 = _mm_unpacklo_epi16(x, z);
    v0 = _mm_madd_epi16(v0, hfilter_vec);
    v0 = xx_roundn_epu32(v0, FILTER_BITS);

    _mm_storel_epi64((__m128i *)b, _mm_packs_epi32(v0, v0));
  }

  // Vertical filter
  if (yoffset == 0) {
    // The data is already in 'dst', so no need to filter
  } else if (yoffset == 4) {
    for (i = 0; i < h; ++i) {
      __m128i x = _mm_loadl_epi64((__m128i *)dst);
      __m128i y = _mm_loadl_epi64((__m128i *)&dst[4]);
      _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(x, y));
      dst += 4;
    }
  } else {
    const uint8_t *vfilter = bilinear_filters_2t[yoffset];
    const __m128i vfilter_vec = _mm_set1_epi32(vfilter[0] | (vfilter[1] << 16));
    for (i = 0; i < h; i += 2) {
      const __m128i x = _mm_loadl_epi64((__m128i *)dst);
      const __m128i y = _mm_loadl_epi64((__m128i *)&dst[4]);
      const __m128i z = _mm_loadl_epi64((__m128i *)&dst[8]);
      const __m128i res =
          highbd_filter_block_2rows(&x, &y, &y, &z, &vfilter_vec);
      _mm_storeu_si128((__m128i *)dst, res);

      dst += 8;
    }
  }
}

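// High-bitdepth masked variance for widths that are a multiple of 8. The
// blend is the same as in the 8-bit path,
// pred = (m * a + (64 - m) * b + 32) >> 6, but it is evaluated with
// _mm_madd_epi16 on 16-bit pixels and 32-bit intermediates.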
static void highbd_masked_variance(const uint16_t *src_ptr, int src_stride,
                                   const uint16_t *a_ptr, int a_stride,
                                   const uint16_t *b_ptr, int b_stride,
                                   const uint8_t *m_ptr, int m_stride,
                                   int width, int height, uint64_t *sse,
                                   int *sum_) {
  int x, y;
  // Note on bit widths:
  // The maximum value of 'sum' is (2^12 - 1) * 128 * 128 =~ 2^26,
  // so this can be kept as four 32-bit values.
  // But the maximum value of 'sum_sq' is (2^12 - 1)^2 * 128 * 128 =~ 2^38,
  // so this must be stored as two 64-bit values.
  __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
  const __m128i round_const =
      _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
  const __m128i zero = _mm_setzero_si128();

  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x += 8) {
      const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
      const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
      const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
      const __m128i m =
          _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&m_ptr[x]), zero);
      const __m128i m_inv = _mm_sub_epi16(mask_max, m);

      // Calculate 8 predicted pixels.
      const __m128i data_l = _mm_unpacklo_epi16(a, b);
      const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
      __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
      pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
                              AOM_BLEND_A64_ROUND_BITS);

      const __m128i data_r = _mm_unpackhi_epi16(a, b);
      const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
      __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
      pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
                              AOM_BLEND_A64_ROUND_BITS);

      const __m128i src_l = _mm_unpacklo_epi16(src, zero);
      const __m128i src_r = _mm_unpackhi_epi16(src, zero);
      __m128i diff_l = _mm_sub_epi32(pred_l, src_l);
      __m128i diff_r = _mm_sub_epi32(pred_r, src_r);

      // Update partial sums and partial sums of squares
      sum = _mm_add_epi32(sum, _mm_add_epi32(diff_l, diff_r));
      // A trick: Now each entry of diff_l and diff_r is stored in a 32-bit
      // field, but the range of values is only [-(2^12 - 1), 2^12 - 1].
      // So we can re-pack into 16-bit fields and use _mm_madd_epi16
      // to calculate the squares and partially sum them.
      const __m128i tmp = _mm_packs_epi32(diff_l, diff_r);
      const __m128i prod = _mm_madd_epi16(tmp, tmp);
      // Then we want to sign-extend to 64 bits and accumulate
      const __m128i sign = _mm_srai_epi32(prod, 31);
      const __m128i tmp_0 = _mm_unpacklo_epi32(prod, sign);
      const __m128i tmp_1 = _mm_unpackhi_epi32(prod, sign);
      sum_sq = _mm_add_epi64(sum_sq, _mm_add_epi64(tmp_0, tmp_1));
    }

    src_ptr += src_stride;
    a_ptr += a_stride;
    b_ptr += b_stride;
    m_ptr += m_stride;
  }
  // Reduce down to a single sum and sum of squares
  sum = _mm_hadd_epi32(sum, zero);
  sum = _mm_hadd_epi32(sum, zero);
  *sum_ = _mm_cvtsi128_si32(sum);
  sum_sq = _mm_add_epi64(sum_sq, _mm_srli_si128(sum_sq, 8));
  _mm_storel_epi64((__m128i *)sse, sum_sq);
}

static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride,
                                      const uint16_t *a_ptr,
                                      const uint16_t *b_ptr,
                                      const uint8_t *m_ptr, int m_stride,
                                      int height, int *sse, int *sum_) {
  int y;
  // Note: For this function, h <= 8 (or maybe 16 if we add 4:1 partitions).
  // So the maximum value of sum is (2^12 - 1) * 4 * 16 =~ 2^18
  // and the maximum value of sum_sq is (2^12 - 1)^2 * 4 * 16 =~ 2^30.
  // So we can safely pack sum_sq into 32-bit fields, which is slightly more
  // convenient.
  __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
  const __m128i round_const =
      _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
  const __m128i zero = _mm_setzero_si128();

  for (y = 0; y < height; y += 2) {
    __m128i src = _mm_unpacklo_epi64(
        _mm_loadl_epi64((const __m128i *)src_ptr),
        _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
    const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr);
    const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr);
    const __m128i m = _mm_unpacklo_epi8(
        _mm_unpacklo_epi32(
            _mm_cvtsi32_si128(*(const uint32_t *)m_ptr),
            _mm_cvtsi32_si128(*(const uint32_t *)&m_ptr[m_stride])),
        zero);
    const __m128i m_inv = _mm_sub_epi16(mask_max, m);

    const __m128i data_l = _mm_unpacklo_epi16(a, b);
    const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
    __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
    pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
                            AOM_BLEND_A64_ROUND_BITS);

    const __m128i data_r = _mm_unpackhi_epi16(a, b);
    const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
    __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
    pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
                            AOM_BLEND_A64_ROUND_BITS);

    const __m128i src_l = _mm_unpacklo_epi16(src, zero);
    const __m128i src_r = _mm_unpackhi_epi16(src, zero);
    __m128i diff_l = _mm_sub_epi32(pred_l, src_l);
    __m128i diff_r = _mm_sub_epi32(pred_r, src_r);

    // Update partial sums and partial sums of squares
    sum = _mm_add_epi32(sum, _mm_add_epi32(diff_l, diff_r));
    const __m128i tmp = _mm_packs_epi32(diff_l, diff_r);
    const __m128i prod = _mm_madd_epi16(tmp, tmp);
    sum_sq = _mm_add_epi32(sum_sq, prod);

    src_ptr += src_stride * 2;
    a_ptr += 8;
    b_ptr += 8;
    m_ptr += m_stride * 2;
  }
  // Reduce down to a single sum and sum of squares
  sum = _mm_hadd_epi32(sum, sum_sq);
  sum = _mm_hadd_epi32(sum, zero);
  *sum_ = _mm_cvtsi128_si32(sum);
  *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
}
#endif  // CONFIG_AV1_HIGHBITDEPTH

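// Build the masked compound predictor: each output pixel is the 64-weight
// blend of 'ref' and 'pred' under 'mask' (with 'invert_mask' swapping their
// roles), using the comp_mask_pred_8_ssse3 / comp_mask_pred_16_ssse3 helpers
// from masked_variance_intrin_ssse3.h. Widths of 8 and 16 get dedicated
// loops; wider blocks are processed 32 pixels at a time.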
void aom_comp_mask_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
                              int width, int height, const uint8_t *ref,
                              int ref_stride, const uint8_t *mask,
                              int mask_stride, int invert_mask) {
  const uint8_t *src0 = invert_mask ? pred : ref;
  const uint8_t *src1 = invert_mask ? ref : pred;
  const int stride0 = invert_mask ? width : ref_stride;
  const int stride1 = invert_mask ? ref_stride : width;
  assert(height % 2 == 0);
  int i = 0;
  if (width == 8) {
    comp_mask_pred_8_ssse3(comp_pred, height, src0, stride0, src1, stride1,
                           mask, mask_stride);
  } else if (width == 16) {
    do {
      comp_mask_pred_16_ssse3(src0, src1, mask, comp_pred);
      comp_mask_pred_16_ssse3(src0 + stride0, src1 + stride1,
                              mask + mask_stride, comp_pred + width);
      comp_pred += (width << 1);
      src0 += (stride0 << 1);
      src1 += (stride1 << 1);
      mask += (mask_stride << 1);
      i += 2;
    } while (i < height);
  } else {
    do {
      for (int x = 0; x < width; x += 32) {
        comp_mask_pred_16_ssse3(src0 + x, src1 + x, mask + x, comp_pred);
        comp_mask_pred_16_ssse3(src0 + x + 16, src1 + x + 16, mask + x + 16,
                                comp_pred + 16);
        comp_pred += 32;
      }
      src0 += (stride0);
      src1 += (stride1);
      mask += (mask_stride);
      i += 1;
    } while (i < height);
  }
}