/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <stdlib.h>
#include <emmintrin.h>
#include <tmmintrin.h>

#include "aom_ports/mem.h"
#include "./aom_config.h"
#include "aom/aom_integer.h"

static INLINE __m128i width8_load_2rows(const uint8_t *ptr, int stride) {
  __m128i temp1 = _mm_loadl_epi64((const __m128i *)ptr);
  __m128i temp2 = _mm_loadl_epi64((const __m128i *)(ptr + stride));
  return _mm_unpacklo_epi64(temp1, temp2);
}

static INLINE __m128i width4_load_4rows(const uint8_t *ptr, int stride) {
  __m128i temp1 = _mm_cvtsi32_si128(*(const uint32_t *)ptr);
  __m128i temp2 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride));
  __m128i temp3 = _mm_unpacklo_epi32(temp1, temp2);
  temp1 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride * 2));
  temp2 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride * 3));
  temp1 = _mm_unpacklo_epi32(temp1, temp2);
  return _mm_unpacklo_epi64(temp3, temp1);
}

static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride,
                                            const uint8_t *b_ptr, int b_stride,
                                            const uint8_t *m_ptr, int m_stride,
                                            int width, int height);

static INLINE unsigned int masked_sad8xh_ssse3(
    const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
    const uint8_t *m_ptr, int m_stride, int height);

static INLINE unsigned int masked_sad4xh_ssse3(
    const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
    const uint8_t *m_ptr, int m_stride, int height);

#define MASKSADMXN_SSSE3(m, n)                                                 \
  unsigned int aom_masked_sad##m##x##n##_ssse3(                                \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,  \
      const uint8_t *msk, int msk_stride) {                                    \
    return masked_sad_ssse3(src, src_stride, ref, ref_stride, msk, msk_stride, \
                            m, n);                                             \
  }
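
// For example, MASKSADMXN_SSSE3(16, 16) below expands to
// aom_masked_sad16x16_ssse3(), which simply forwards its arguments to the
// generic masked_sad_ssse3() kernel with width = 16 and height = 16.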

#if CONFIG_EXT_PARTITION
MASKSADMXN_SSSE3(128, 128)
MASKSADMXN_SSSE3(128, 64)
MASKSADMXN_SSSE3(64, 128)
#endif  // CONFIG_EXT_PARTITION
MASKSADMXN_SSSE3(64, 64)
MASKSADMXN_SSSE3(64, 32)
MASKSADMXN_SSSE3(32, 64)
MASKSADMXN_SSSE3(32, 32)
MASKSADMXN_SSSE3(32, 16)
MASKSADMXN_SSSE3(16, 32)
MASKSADMXN_SSSE3(16, 16)
MASKSADMXN_SSSE3(16, 8)

#define MASKSAD8XN_SSSE3(n)                                                   \
  unsigned int aom_masked_sad8x##n##_ssse3(                                   \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      const uint8_t *msk, int msk_stride) {                                   \
    return masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, msk,         \
                               msk_stride, n);                                \
  }

MASKSAD8XN_SSSE3(16)
MASKSAD8XN_SSSE3(8)
MASKSAD8XN_SSSE3(4)

#define MASKSAD4XN_SSSE3(n)                                                   \
  unsigned int aom_masked_sad4x##n##_ssse3(                                   \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      const uint8_t *msk, int msk_stride) {                                   \
    return masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk,         \
                               msk_stride, n);                                \
  }

MASKSAD4XN_SSSE3(8)
MASKSAD4XN_SSSE3(4)

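// A plain-C reference sketch of the computation every SSSE3 kernel below
// implements. The helper name masked_sad_c_sketch is illustrative only and
// the function is unused by the build; mask values are assumed to be <= 64,
// matching the kernels, so the final shift is a rounded division by 64.
static INLINE unsigned int masked_sad_c_sketch(const uint8_t *a, int a_stride,
                                               const uint8_t *b, int b_stride,
                                               const uint8_t *m, int m_stride,
                                               int width, int height) {
  int y, x;
  unsigned int sad = 0;
  for (y = 0; y < height; y++) {
    // Accumulate the mask-weighted absolute differences for one row.
    for (x = 0; x < width; x++) sad += m[x] * abs(a[x] - b[x]);
    a += a_stride;
    b += b_stride;
    m += m_stride;
  }
  return (sad + 31) >> 6;  // Rounded division by 64, as in the kernels below.
}
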
// For width a multiple of 16
// Assumes values in m are <= 64
static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride,
                                            const uint8_t *b_ptr, int b_stride,
                                            const uint8_t *m_ptr, int m_stride,
                                            int width, int height) {
  int y, x;
  __m128i a, b, m, temp1, temp2;
  __m128i res = _mm_setzero_si128();
  __m128i one = _mm_set1_epi16(1);
  // For each row
  for (y = 0; y < height; y++) {
    // Covering the full width
    for (x = 0; x < width; x += 16) {
      // Load a, b, m in xmm registers
      a = _mm_loadu_si128((const __m128i *)(a_ptr + x));
      b = _mm_loadu_si128((const __m128i *)(b_ptr + x));
      m = _mm_loadu_si128((const __m128i *)(m_ptr + x));

      // Calculate the difference between a & b
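      // (unsigned saturating subtraction clamps the negative direction to
      // zero, so OR-ing the two one-sided results yields |a - b| per byte)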
      temp1 = _mm_subs_epu8(a, b);
      temp2 = _mm_subs_epu8(b, a);
      temp1 = _mm_or_si128(temp1, temp2);

      // Multiply by m and add together
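      // (_mm_maddubs_epi16 multiplies unsigned bytes by signed bytes and sums
      // adjacent pairs into 16 bits; with |a - b| <= 255 and m <= 64, the
      // worst case 2 * 255 * 64 = 32640 fits in a signed 16-bit lane)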
      temp2 = _mm_maddubs_epi16(temp1, m);
      // Widen the row result to 32-bit integers & add to the running total
      res = _mm_add_epi32(res, _mm_madd_epi16(temp2, one));
    }
    // Move on to the next row
    a_ptr += a_stride;
    b_ptr += b_stride;
    m_ptr += m_stride;
  }
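  // Horizontally add the four 32-bit partial sums down to a single value in
  // lane 0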
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  // sad = (sad + 31) >> 6;
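  // (the mask maximum is 64 = 2^6, so this is a rounded division by 64)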
  return (_mm_cvtsi128_si32(res) + 31) >> 6;
}

static INLINE unsigned int masked_sad8xh_ssse3(
    const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
    const uint8_t *m_ptr, int m_stride, int height) {
  int y;
  __m128i a, b, m, temp1, temp2, row_res;
  __m128i res = _mm_setzero_si128();
  __m128i one = _mm_set1_epi16(1);
  // Add the masked SAD for 2 rows at a time
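  // (each pair of 8-pixel rows is packed into one 16-byte register so the
  // full 128-bit datapath stays busy)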
  for (y = 0; y < height; y += 2) {
    // Load a, b, m in xmm registers
    a = width8_load_2rows(a_ptr, a_stride);
    b = width8_load_2rows(b_ptr, b_stride);
    m = width8_load_2rows(m_ptr, m_stride);

    // Calculate the difference between a & b
    temp1 = _mm_subs_epu8(a, b);
    temp2 = _mm_subs_epu8(b, a);
    temp1 = _mm_or_si128(temp1, temp2);

    // Multiply by m and add together
    row_res = _mm_maddubs_epi16(temp1, m);

    // Widen the row result to 32-bit integers & add to the running total
    res = _mm_add_epi32(res, _mm_madd_epi16(row_res, one));

    // Move on to the next rows
    a_ptr += a_stride * 2;
    b_ptr += b_stride * 2;
    m_ptr += m_stride * 2;
  }
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  // sad = (sad + 31) >> 6;
  return (_mm_cvtsi128_si32(res) + 31) >> 6;
}

static INLINE unsigned int masked_sad4xh_ssse3(
    const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
    const uint8_t *m_ptr, int m_stride, int height) {
  int y;
  __m128i a, b, m, temp1, temp2, row_res;
  __m128i res = _mm_setzero_si128();
  __m128i one = _mm_set1_epi16(1);
  // Add the masked SAD for 4 rows at a time
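  // (four 4-pixel rows fill one 16-byte register)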
  for (y = 0; y < height; y += 4) {
    // Load a, b, m in xmm registers
    a = width4_load_4rows(a_ptr, a_stride);
    b = width4_load_4rows(b_ptr, b_stride);
    m = width4_load_4rows(m_ptr, m_stride);

    // Calculate the difference between a & b
    temp1 = _mm_subs_epu8(a, b);
    temp2 = _mm_subs_epu8(b, a);
    temp1 = _mm_or_si128(temp1, temp2);

    // Multiply by m and add together
    row_res = _mm_maddubs_epi16(temp1, m);

    // Widen the row result to 32-bit integers & add to the running total
    res = _mm_add_epi32(res, _mm_madd_epi16(row_res, one));

    // Move on to the next rows
    a_ptr += a_stride * 4;
    b_ptr += b_stride * 4;
    m_ptr += m_stride * 4;
  }
  // Horizontally add the 32-bit partial sums into lane 0
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  // sad = (sad + 31) >> 6;
  return (_mm_cvtsi128_si32(res) + 31) >> 6;
}

#if CONFIG_AOM_HIGHBITDEPTH
static INLINE __m128i highbd_width4_load_2rows(const uint16_t *ptr,
                                               int stride) {
  __m128i temp1 = _mm_loadl_epi64((const __m128i *)ptr);
  __m128i temp2 = _mm_loadl_epi64((const __m128i *)(ptr + stride));
  return _mm_unpacklo_epi64(temp1, temp2);
}

static INLINE unsigned int highbd_masked_sad_ssse3(
    const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
    const uint8_t *m_ptr, int m_stride, int width, int height);

static INLINE unsigned int highbd_masked_sad4xh_ssse3(
    const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
    const uint8_t *m_ptr, int m_stride, int height);

#define HIGHBD_MASKSADMXN_SSSE3(m, n)                                         \
  unsigned int aom_highbd_masked_sad##m##x##n##_ssse3(                        \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      const uint8_t *msk, int msk_stride) {                                   \
    return highbd_masked_sad_ssse3(src, src_stride, ref, ref_stride, msk,     \
                                   msk_stride, m, n);                         \
  }

#if CONFIG_EXT_PARTITION
HIGHBD_MASKSADMXN_SSSE3(128, 128)
HIGHBD_MASKSADMXN_SSSE3(128, 64)
HIGHBD_MASKSADMXN_SSSE3(64, 128)
#endif  // CONFIG_EXT_PARTITION
HIGHBD_MASKSADMXN_SSSE3(64, 64)
HIGHBD_MASKSADMXN_SSSE3(64, 32)
HIGHBD_MASKSADMXN_SSSE3(32, 64)
HIGHBD_MASKSADMXN_SSSE3(32, 32)
HIGHBD_MASKSADMXN_SSSE3(32, 16)
HIGHBD_MASKSADMXN_SSSE3(16, 32)
HIGHBD_MASKSADMXN_SSSE3(16, 16)
HIGHBD_MASKSADMXN_SSSE3(16, 8)
HIGHBD_MASKSADMXN_SSSE3(8, 16)
HIGHBD_MASKSADMXN_SSSE3(8, 8)
HIGHBD_MASKSADMXN_SSSE3(8, 4)

#define HIGHBD_MASKSAD4XN_SSSE3(n)                                            \
  unsigned int aom_highbd_masked_sad4x##n##_ssse3(                            \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      const uint8_t *msk, int msk_stride) {                                   \
    return highbd_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk,  \
                                      msk_stride, n);                         \
  }

HIGHBD_MASKSAD4XN_SSSE3(8)
HIGHBD_MASKSAD4XN_SSSE3(4)

// For width a multiple of 8
// Assumes values in m are <= 64
static INLINE unsigned int highbd_masked_sad_ssse3(
    const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
    const uint8_t *m_ptr, int m_stride, int width, int height) {
  int y, x;
  __m128i a, b, m, temp1, temp2;
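  // High-bitdepth buffers hold uint16_t samples behind uint8_t pointers;
  // CONVERT_TO_SHORTPTR recovers the underlying 16-bit sample pointers.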
  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr);
  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8_ptr);
  __m128i res = _mm_setzero_si128();
  // For each row
  for (y = 0; y < height; y++) {
    // Covering the full width
    for (x = 0; x < width; x += 8) {
      // Load a, b, m in xmm registers
      a = _mm_loadu_si128((const __m128i *)(a_ptr + x));
      b = _mm_loadu_si128((const __m128i *)(b_ptr + x));
      m = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(m_ptr + x)),
                            _mm_setzero_si128());
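      // (the 8 mask bytes are zero-extended to 16 bits so they can be fed
      // straight to _mm_madd_epi16 against the 16-bit differences below)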

      // Calculate the difference between a & b
      temp1 = _mm_subs_epu16(a, b);
      temp2 = _mm_subs_epu16(b, a);
      temp1 = _mm_or_si128(temp1, temp2);

      // Multiply by m and add adjacent pairs into the running total
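      // (with samples of at most 12 bits, |a - b| * m <= 4095 * 64 = 262080,
      // so the paired 32-bit sums in _mm_madd_epi16 cannot overflow)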
      res = _mm_add_epi32(res, _mm_madd_epi16(temp1, m));
    }
    // Move on to the next row
    a_ptr += a_stride;
    b_ptr += b_stride;
    m_ptr += m_stride;
  }
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  // sad = (sad + 31) >> 6;
  return (_mm_cvtsi128_si32(res) + 31) >> 6;
}

static INLINE unsigned int highbd_masked_sad4xh_ssse3(
    const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
    const uint8_t *m_ptr, int m_stride, int height) {
  int y;
  __m128i a, b, m, temp1, temp2;
  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr);
  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8_ptr);
  __m128i res = _mm_setzero_si128();
  // Add the masked SAD for 2 rows at a time
  for (y = 0; y < height; y += 2) {
    // Load a, b, m in xmm registers
    a = highbd_width4_load_2rows(a_ptr, a_stride);
    b = highbd_width4_load_2rows(b_ptr, b_stride);
    temp1 = _mm_loadl_epi64((const __m128i *)m_ptr);
    temp2 = _mm_loadl_epi64((const __m128i *)(m_ptr + m_stride));
    m = _mm_unpacklo_epi8(_mm_unpacklo_epi32(temp1, temp2),
                          _mm_setzero_si128());
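    // (only the first 4 bytes of each 8-byte mask load are valid; the
    // unpacks gather both rows into the low 8 bytes, then zero-extend)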

    // Calculate the difference between a & b
    temp1 = _mm_subs_epu16(a, b);
    temp2 = _mm_subs_epu16(b, a);
    temp1 = _mm_or_si128(temp1, temp2);

    // Multiply by m and add adjacent pairs into the running total
    res = _mm_add_epi32(res, _mm_madd_epi16(temp1, m));

    // Move on to the next rows
    a_ptr += a_stride * 2;
    b_ptr += b_stride * 2;
    m_ptr += m_stride * 2;
  }
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  // sad = (sad + 31) >> 6;
  return (_mm_cvtsi128_si32(res) + 31) >> 6;
}
#endif  // CONFIG_AOM_HIGHBITDEPTH