/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <stdlib.h>
#include <emmintrin.h>
#include <tmmintrin.h>

#include "vpx_ports/mem.h"
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"

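// Masked SAD: each absolute pixel difference |src - ref| is weighted by the
// corresponding mask value before being summed. Mask values are assumed to
// lie in [0, 64] (in effect a Q6 weight), so the weighted sum is rounded
// back with (sum + 31) >> 6. A scalar sketch of the computation the kernels
// below implement (illustrative only, not part of the build):
//
//   unsigned int masked_sad(const uint8_t *src, int src_stride,
//                           const uint8_t *ref, int ref_stride,
//                           const uint8_t *msk, int msk_stride,
//                           int width, int height) {
//     unsigned int sad = 0;
//     int x, y;
//     for (y = 0; y < height; y++)
//       for (x = 0; x < width; x++)
//         sad += abs(src[y * src_stride + x] - ref[y * ref_stride + x]) *
//                msk[y * msk_stride + x];
//     return (sad + 31) >> 6;
//   }
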
// Load two 8-pixel rows into the low and high halves of a single register.
static INLINE __m128i width8_load_2rows(const uint8_t *ptr, int stride) {
  __m128i temp1 = _mm_loadl_epi64((const __m128i*)ptr);
  __m128i temp2 = _mm_loadl_epi64((const __m128i*)(ptr + stride));
  return _mm_unpacklo_epi64(temp1, temp2);
}

// Load four 4-pixel rows into a single register.
static INLINE __m128i width4_load_4rows(const uint8_t *ptr, int stride) {
  __m128i temp1 = _mm_cvtsi32_si128(*(const uint32_t*)ptr);
  __m128i temp2 = _mm_cvtsi32_si128(*(const uint32_t*)(ptr + stride));
  __m128i temp3 = _mm_unpacklo_epi32(temp1, temp2);
  temp1 = _mm_cvtsi32_si128(*(const uint32_t*)(ptr + stride * 2));
  temp2 = _mm_cvtsi32_si128(*(const uint32_t*)(ptr + stride * 3));
  temp1 = _mm_unpacklo_epi32(temp1, temp2);
  return _mm_unpacklo_epi64(temp3, temp1);
}

static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride,
                                            const uint8_t *b_ptr, int b_stride,
                                            const uint8_t *m_ptr, int m_stride,
                                            int width, int height);

static INLINE unsigned int masked_sad8xh_ssse3(const uint8_t *a_ptr,
                                               int a_stride,
                                               const uint8_t *b_ptr,
                                               int b_stride,
                                               const uint8_t *m_ptr,
                                               int m_stride,
                                               int height);

static INLINE unsigned int masked_sad4xh_ssse3(const uint8_t *a_ptr,
                                               int a_stride,
                                               const uint8_t *b_ptr,
                                               int b_stride,
                                               const uint8_t *m_ptr,
                                               int m_stride,
                                               int height);

#define MASKSADMXN_SSSE3(m, n) \
unsigned int vpx_masked_sad##m##x##n##_ssse3(const uint8_t *src, \
                                             int src_stride, \
                                             const uint8_t *ref, \
                                             int ref_stride, \
                                             const uint8_t *msk, \
                                             int msk_stride) { \
  return masked_sad_ssse3(src, src_stride, ref, ref_stride, msk, msk_stride, \
                          m, n); \
}
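
// For example, MASKSADMXN_SSSE3(16, 8) expands to the wrapper
//   unsigned int vpx_masked_sad16x8_ssse3(const uint8_t *src, int src_stride,
//                                         const uint8_t *ref, int ref_stride,
//                                         const uint8_t *msk, int msk_stride);
// which forwards to masked_sad_ssse3() with width 16 and height 8.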

#if CONFIG_EXT_PARTITION
MASKSADMXN_SSSE3(128, 128)
MASKSADMXN_SSSE3(128, 64)
MASKSADMXN_SSSE3(64, 128)
#endif  // CONFIG_EXT_PARTITION
MASKSADMXN_SSSE3(64, 64)
MASKSADMXN_SSSE3(64, 32)
MASKSADMXN_SSSE3(32, 64)
MASKSADMXN_SSSE3(32, 32)
MASKSADMXN_SSSE3(32, 16)
MASKSADMXN_SSSE3(16, 32)
MASKSADMXN_SSSE3(16, 16)
MASKSADMXN_SSSE3(16, 8)

#define MASKSAD8XN_SSSE3(n) \
unsigned int vpx_masked_sad8x##n##_ssse3(const uint8_t *src, \
                                         int src_stride, \
                                         const uint8_t *ref, \
                                         int ref_stride, \
                                         const uint8_t *msk, \
                                         int msk_stride) { \
  return masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, msk, \
                             msk_stride, n); \
}

MASKSAD8XN_SSSE3(16)
MASKSAD8XN_SSSE3(8)
MASKSAD8XN_SSSE3(4)

#define MASKSAD4XN_SSSE3(n) \
unsigned int vpx_masked_sad4x##n##_ssse3(const uint8_t *src, int src_stride, \
                                         const uint8_t *ref, int ref_stride, \
                                         const uint8_t *msk, int msk_stride) { \
  return masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk, \
                             msk_stride, n); \
}

MASKSAD4XN_SSSE3(8)
MASKSAD4XN_SSSE3(4)

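// Illustrative call (buffer names are hypothetical): for 16x16 blocks stored
// contiguously with a stride of 16, the masked SAD would be
//   unsigned int sad = vpx_masked_sad16x16_ssse3(src, 16, ref, 16, msk, 16);
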
// For width a multiple of 16
// Assumes mask values in m are <= 64
static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride,
                                            const uint8_t *b_ptr, int b_stride,
                                            const uint8_t *m_ptr, int m_stride,
                                            int width, int height) {
  int y, x;
  __m128i a, b, m, temp1, temp2;
  __m128i res = _mm_setzero_si128();
  __m128i one = _mm_set1_epi16(1);
  // For each row
  for (y = 0; y < height; y++) {
    // Covering the full width
    for (x = 0; x < width; x += 16) {
      // Load a, b and m into xmm registers
      a = _mm_loadu_si128((const __m128i*)(a_ptr + x));
      b = _mm_loadu_si128((const __m128i*)(b_ptr + x));
      m = _mm_loadu_si128((const __m128i*)(m_ptr + x));

      // Compute the absolute difference |a - b| via saturating subtraction
      // in both directions
      temp1 = _mm_subs_epu8(a, b);
      temp2 = _mm_subs_epu8(b, a);
      temp1 = _mm_or_si128(temp1, temp2);

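      // _mm_maddubs_epi16 multiplies the unsigned absolute differences
      // (at most 255) by the signed mask bytes (at most 64) and adds
      // adjacent pairs into 16-bit lanes; 2 * 255 * 64 = 32640 fits in
      // int16_t, so the saturating add never actually saturates.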
      // Multiply by m and add adjacent products together
      temp2 = _mm_maddubs_epi16(temp1, m);
      // Widen the 16-bit row results to 32 bits and add to the running total
      res = _mm_add_epi32(res, _mm_madd_epi16(temp2, one));
    }
    // Move on to the next row
    a_ptr += a_stride;
    b_ptr += b_stride;
    m_ptr += m_stride;
  }
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  // sad = (sad + 31) >> 6;
  return (_mm_cvtsi128_si32(res) + 31) >> 6;
}

static INLINE unsigned int masked_sad8xh_ssse3(const uint8_t *a_ptr,
                                               int a_stride,
                                               const uint8_t *b_ptr,
                                               int b_stride,
                                               const uint8_t *m_ptr,
                                               int m_stride,
                                               int height) {
  int y;
  __m128i a, b, m, temp1, temp2, row_res;
  __m128i res = _mm_setzero_si128();
  __m128i one = _mm_set1_epi16(1);
  // Add the masked SAD for two rows at a time
  for (y = 0; y < height; y += 2) {
    // Load a, b and m into xmm registers
    a = width8_load_2rows(a_ptr, a_stride);
    b = width8_load_2rows(b_ptr, b_stride);
    m = width8_load_2rows(m_ptr, m_stride);

    // Compute the absolute difference |a - b|
    temp1 = _mm_subs_epu8(a, b);
    temp2 = _mm_subs_epu8(b, a);
    temp1 = _mm_or_si128(temp1, temp2);

    // Multiply by m and add adjacent products together
    row_res = _mm_maddubs_epi16(temp1, m);

    // Widen the 16-bit row results to 32 bits and add to the running total
    res = _mm_add_epi32(res, _mm_madd_epi16(row_res, one));

    // Move on to the next two rows
    a_ptr += a_stride * 2;
    b_ptr += b_stride * 2;
    m_ptr += m_stride * 2;
  }
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  // sad = (sad + 31) >> 6;
  return (_mm_cvtsi128_si32(res) + 31) >> 6;
}

static INLINE unsigned int masked_sad4xh_ssse3(const uint8_t *a_ptr,
                                               int a_stride,
                                               const uint8_t *b_ptr,
                                               int b_stride,
                                               const uint8_t *m_ptr,
                                               int m_stride,
                                               int height) {
  int y;
  __m128i a, b, m, temp1, temp2, row_res;
  __m128i res = _mm_setzero_si128();
  __m128i one = _mm_set1_epi16(1);
  // Add the masked SAD for four rows at a time
  for (y = 0; y < height; y += 4) {
    // Load a, b and m into xmm registers
    a = width4_load_4rows(a_ptr, a_stride);
    b = width4_load_4rows(b_ptr, b_stride);
    m = width4_load_4rows(m_ptr, m_stride);

    // Compute the absolute difference |a - b|
    temp1 = _mm_subs_epu8(a, b);
    temp2 = _mm_subs_epu8(b, a);
    temp1 = _mm_or_si128(temp1, temp2);

    // Multiply by m and add adjacent products together
    row_res = _mm_maddubs_epi16(temp1, m);

    // Widen the 16-bit row results to 32 bits and add to the running total
    res = _mm_add_epi32(res, _mm_madd_epi16(row_res, one));

    // Move on to the next four rows
    a_ptr += a_stride * 4;
    b_ptr += b_stride * 4;
    m_ptr += m_stride * 4;
  }
  // Horizontally add the four 32-bit partial sums
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  // sad = (sad + 31) >> 6;
  return (_mm_cvtsi128_si32(res) + 31) >> 6;
}

#if CONFIG_VP9_HIGHBITDEPTH
// Load two 4-pixel (16-bit) rows into a single register; a 4-pixel row of
// 16-bit pixels occupies 8 bytes, so two rows fill one xmm register.
static INLINE __m128i highbd_width4_load_2rows(const uint16_t *ptr,
                                               int stride) {
  __m128i temp1 = _mm_loadl_epi64((const __m128i*)ptr);
  __m128i temp2 = _mm_loadl_epi64((const __m128i*)(ptr + stride));
  return _mm_unpacklo_epi64(temp1, temp2);
}

static INLINE unsigned int highbd_masked_sad_ssse3(const uint8_t *a8_ptr,
                                                   int a_stride,
                                                   const uint8_t *b8_ptr,
                                                   int b_stride,
                                                   const uint8_t *m_ptr,
                                                   int m_stride,
                                                   int width, int height);

static INLINE unsigned int highbd_masked_sad4xh_ssse3(const uint8_t *a8_ptr,
                                                      int a_stride,
                                                      const uint8_t *b8_ptr,
                                                      int b_stride,
                                                      const uint8_t *m_ptr,
                                                      int m_stride,
                                                      int height);

#define HIGHBD_MASKSADMXN_SSSE3(m, n) \
unsigned int vpx_highbd_masked_sad##m##x##n##_ssse3(const uint8_t *src, \
                                                    int src_stride, \
                                                    const uint8_t *ref, \
                                                    int ref_stride, \
                                                    const uint8_t *msk, \
                                                    int msk_stride) { \
  return highbd_masked_sad_ssse3(src, src_stride, ref, ref_stride, msk, \
                                 msk_stride, m, n); \
}

#if CONFIG_EXT_PARTITION
HIGHBD_MASKSADMXN_SSSE3(128, 128)
HIGHBD_MASKSADMXN_SSSE3(128, 64)
HIGHBD_MASKSADMXN_SSSE3(64, 128)
#endif  // CONFIG_EXT_PARTITION
HIGHBD_MASKSADMXN_SSSE3(64, 64)
HIGHBD_MASKSADMXN_SSSE3(64, 32)
HIGHBD_MASKSADMXN_SSSE3(32, 64)
HIGHBD_MASKSADMXN_SSSE3(32, 32)
HIGHBD_MASKSADMXN_SSSE3(32, 16)
HIGHBD_MASKSADMXN_SSSE3(16, 32)
HIGHBD_MASKSADMXN_SSSE3(16, 16)
HIGHBD_MASKSADMXN_SSSE3(16, 8)
HIGHBD_MASKSADMXN_SSSE3(8, 16)
HIGHBD_MASKSADMXN_SSSE3(8, 8)
HIGHBD_MASKSADMXN_SSSE3(8, 4)

#define HIGHBD_MASKSAD4XN_SSSE3(n) \
unsigned int vpx_highbd_masked_sad4x##n##_ssse3(const uint8_t *src, \
                                                int src_stride, \
                                                const uint8_t *ref, \
                                                int ref_stride, \
                                                const uint8_t *msk, \
                                                int msk_stride) { \
  return highbd_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk, \
                                    msk_stride, n); \
}

HIGHBD_MASKSAD4XN_SSSE3(8)
HIGHBD_MASKSAD4XN_SSSE3(4)

// For width a multiple of 8
// Assumes mask values in m are <= 64
static INLINE unsigned int highbd_masked_sad_ssse3(const uint8_t *a8_ptr,
                                                   int a_stride,
                                                   const uint8_t *b8_ptr,
                                                   int b_stride,
                                                   const uint8_t *m_ptr,
                                                   int m_stride,
                                                   int width, int height) {
  int y, x;
  __m128i a, b, m, temp1, temp2;
  // Recover the uint16_t pixel pointers from the uint8_t handles
  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr);
  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8_ptr);
  __m128i res = _mm_setzero_si128();
  // For each row
  for (y = 0; y < height; y++) {
    // Covering the full width
    for (x = 0; x < width; x += 8) {
      // Load a and b, and zero-extend the 8-bit mask values to 16 bits
      a = _mm_loadu_si128((const __m128i*)(a_ptr + x));
      b = _mm_loadu_si128((const __m128i*)(b_ptr + x));
      m = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(m_ptr + x)),
                            _mm_setzero_si128());

      // Compute the absolute difference |a - b|
      temp1 = _mm_subs_epu16(a, b);
      temp2 = _mm_subs_epu16(b, a);
      temp1 = _mm_or_si128(temp1, temp2);

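      // _mm_madd_epi16 multiplies the 16-bit absolute differences (at most
      // 4095 for 12-bit input) by the 16-bit mask values (at most 64) and
      // adds adjacent pairs into 32-bit lanes, so no per-pair overflow is
      // possible.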
      // Multiply by m, add adjacent products, and add to the running total
      res = _mm_add_epi32(res, _mm_madd_epi16(temp1, m));
    }
    // Move on to the next row
    a_ptr += a_stride;
    b_ptr += b_stride;
    m_ptr += m_stride;
  }
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  // sad = (sad + 31) >> 6;
  return (_mm_cvtsi128_si32(res) + 31) >> 6;
}

static INLINE unsigned int highbd_masked_sad4xh_ssse3(const uint8_t *a8_ptr,
                                                      int a_stride,
                                                      const uint8_t *b8_ptr,
                                                      int b_stride,
                                                      const uint8_t *m_ptr,
                                                      int m_stride,
                                                      int height) {
  int y;
  __m128i a, b, m, temp1, temp2;
  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr);
  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8_ptr);
  __m128i res = _mm_setzero_si128();
  // Add the masked SAD for two rows at a time
  for (y = 0; y < height; y += 2) {
    // Load a and b, and zero-extend the 8-bit mask values to 16 bits
    a = highbd_width4_load_2rows(a_ptr, a_stride);
    b = highbd_width4_load_2rows(b_ptr, b_stride);
    temp1 = _mm_loadl_epi64((const __m128i*)m_ptr);
    temp2 = _mm_loadl_epi64((const __m128i*)(m_ptr + m_stride));
    m = _mm_unpacklo_epi8(_mm_unpacklo_epi32(temp1, temp2),
                          _mm_setzero_si128());

    // Compute the absolute difference |a - b|
    temp1 = _mm_subs_epu16(a, b);
    temp2 = _mm_subs_epu16(b, a);
    temp1 = _mm_or_si128(temp1, temp2);

    // Multiply by m, add adjacent products, and add to the running total
    res = _mm_add_epi32(res, _mm_madd_epi16(temp1, m));

    // Move on to the next two rows
    a_ptr += a_stride * 2;
    b_ptr += b_stride * 2;
    m_ptr += m_stride * 2;
  }
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  // sad = (sad + 31) >> 6;
  return (_mm_cvtsi128_si32(res) + 31) >> 6;
}
#endif  // CONFIG_VP9_HIGHBITDEPTH