/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "config/aom_dsp_rtcd.h"

#include "aom_dsp/mips/macros_msa.h"

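// Helper that inserts four 32-bit words (in0..in3) into word lanes 0..3 of
// the vector 'out' via the MSA insve.w instruction. Note: not referenced by
// the functions below.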
#define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out)       \
  {                                                        \
    out = (RTYPE)__msa_insve_w((v4i32)out, 0, (v4i32)in0); \
    out = (RTYPE)__msa_insve_w((v4i32)out, 1, (v4i32)in1); \
    out = (RTYPE)__msa_insve_w((v4i32)out, 2, (v4i32)in2); \
    out = (RTYPE)__msa_insve_w((v4i32)out, 3, (v4i32)in3); \
  }
#define SAD_INSVE_W4_UB(...) SAD_INSVE_W4(v16u8, __VA_ARGS__)

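// SAD for 4-pixel-wide blocks: four rows are loaded as 32-bit words and
// packed into a single vector, so one absolute-difference/horizontal-add
// pair covers a full 4x4 tile per loop iteration.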
static uint32_t sad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);

    diff = __msa_asub_u_b(src, ref);
    sad += __msa_hadd_u_h(diff, diff);
  }

  return HADD_UH_U32(sad);
}

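// SAD for 8-pixel-wide blocks: pairs of 8-byte rows are packed into full
// 16-byte vectors with PCKEV_D4_UB before accumulation.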
static uint32_t sad_8width_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);

    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}

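// SAD for 16-pixel-wide blocks: one vector per row, four rows per loop
// iteration.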
static uint32_t sad_16width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(src, src_stride, src0, src1);
    src += (2 * src_stride);
    LD_UB2(ref, ref_stride, ref0, ref1);
    ref += (2 * ref_stride);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, src_stride, src0, src1);
    src += (2 * src_stride);
    LD_UB2(ref, ref_stride, ref0, ref1);
    ref += (2 * ref_stride);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}

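// SAD for 32-pixel-wide blocks: two 16-byte vectors per row; the loop body
// is unrolled over four rows.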
static uint32_t sad_32width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}

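// SAD for 64-pixel-wide blocks: four 16-byte vectors per row, accumulated
// in two v8u16 registers (sad0 for the left 32 columns, sad1 for the right
// 32) and combined at the end.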
static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  uint32_t sad = 0;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad = HADD_UH_U32(sad0);
  sad += HADD_UH_U32(sad1);

  return sad;
}

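// The *_x4d helpers compare one source block against four reference blocks
// (aref_ptr[0..3]) in a single pass, writing the four SADs to sad_array.
// This 4-wide variant mirrors sad_4width_msa above.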
static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *const aref_ptr[],
                               int32_t ref_stride, int32_t height,
                               uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    INSERT_W4_UB(src0, src1, src2, src3, src);
    src_ptr += (4 * src_stride);

    LW4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref0_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad0 += __msa_hadd_u_h(diff, diff);

    LW4(ref1_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref1_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    LW4(ref2_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref2_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    LW4(ref3_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref3_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

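// 8-wide x4d variant: rows are paired with PCKEV_D2_UB, as in
// sad_8width_msa.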
static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *const aref_ptr[],
                               int32_t ref_stride, int32_t height,
                               uint32_t *sad_array) {
  int32_t ht_cnt;
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
  v16u8 ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_UB4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref0_ptr += (4 * ref_stride);
    LD_UB4(ref1_ptr, ref_stride, ref4, ref5, ref6, ref7);
    ref1_ptr += (4 * ref_stride);
    LD_UB4(ref2_ptr, ref_stride, ref8, ref9, ref10, ref11);
    ref2_ptr += (4 * ref_stride);
    LD_UB4(ref3_ptr, ref_stride, ref12, ref13, ref14, ref15);
    ref3_ptr += (4 * ref_stride);

    PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref5, ref4, ref7, ref6, ref0, ref1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref9, ref8, ref11, ref10, ref0, ref1);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref13, ref12, ref15, ref14, ref0, ref1);
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

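// 16-wide x4d variant: two rows per loop iteration, one vector per row.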
static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  int32_t ht_cnt;
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  v16u8 src, ref0, ref1, ref2, ref3, diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref0 = LD_UB(ref0_ptr);
    ref0_ptr += ref_stride;
    ref1 = LD_UB(ref1_ptr);
    ref1_ptr += ref_stride;
    ref2 = LD_UB(ref2_ptr);
    ref2_ptr += ref_stride;
    ref3 = LD_UB(ref3_ptr);
    ref3_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref1);
    sad1 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref2);
    sad2 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref3);
    sad3 += __msa_hadd_u_h(diff, diff);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref0 = LD_UB(ref0_ptr);
    ref0_ptr += ref_stride;
    ref1 = LD_UB(ref1_ptr);
    ref1_ptr += ref_stride;
    ref2 = LD_UB(ref2_ptr);
    ref2_ptr += ref_stride;
    ref3 = LD_UB(ref3_ptr);
    ref3_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref1);
    sad1 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref2);
    sad2 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref3);
    sad3 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

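// 32-wide x4d variant: one row (two vectors) per loop iteration.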
static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB2(src, 16, src0, src1);
    src += src_stride;

    LD_UB2(ref0_ptr, 16, ref0, ref1);
    ref0_ptr += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref1_ptr, 16, ref0, ref1);
    ref1_ptr += ref_stride;
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref2_ptr, 16, ref0, ref1);
    ref2_ptr += ref_stride;
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref3_ptr, 16, ref0, ref1);
    ref3_ptr += ref_stride;
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

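// 64-wide x4d variant: two v8u16 accumulators per reference (left/right
// halves of each row), summed into sad_array at the end.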
static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 sad0_0 = { 0 };
  v8u16 sad0_1 = { 0 };
  v8u16 sad1_0 = { 0 };
  v8u16 sad1_1 = { 0 };
  v8u16 sad2_0 = { 0 };
  v8u16 sad2_1 = { 0 };
  v8u16 sad3_0 = { 0 };
  v8u16 sad3_1 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;

    LD_UB4(ref0_ptr, 16, ref0, ref1, ref2, ref3);
    ref0_ptr += ref_stride;
    sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref1_ptr, 16, ref0, ref1, ref2, ref3);
    ref1_ptr += ref_stride;
    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref2_ptr, 16, ref0, ref1, ref2, ref3);
    ref2_ptr += ref_stride;
    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref3_ptr, 16, ref0, ref1, ref2, ref3);
    ref3_ptr += ref_stride;
    sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad_array[0] = HADD_UH_U32(sad0_0);
  sad_array[0] += HADD_UH_U32(sad0_1);
  sad_array[1] = HADD_UH_U32(sad1_0);
  sad_array[1] += HADD_UH_U32(sad1_1);
  sad_array[2] = HADD_UH_U32(sad2_0);
  sad_array[2] += HADD_UH_U32(sad2_1);
  sad_array[3] = HADD_UH_U32(sad3_0);
  sad_array[3] += HADD_UH_U32(sad3_1);
}

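// The avgsad_* helpers implement the aom_sad*_avg semantics: the reference
// is first averaged with a second predictor (comp = aver(sec_pred, ref)),
// and the SAD is taken between the source and that average.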
static uint32_t avgsad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                  const uint8_t *ref_ptr, int32_t ref_stride,
                                  int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff, pred, comp;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);
    pred = LD_UB(sec_pred);
    sec_pred += 16;

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);

    comp = __msa_aver_u_b(pred, ref);
    diff = __msa_asub_u_b(src, comp);
    sad += __msa_hadd_u_h(diff, diff);
  }

  return HADD_UH_U32(sad);
}

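// 8-wide average-SAD: sec_pred holds packed 8-wide rows, so four rows
// consume 32 bytes per iteration.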
static uint32_t avgsad_8width_msa(const uint8_t *src, int32_t src_stride,
                                  const uint8_t *ref, int32_t ref_stride,
                                  int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 diff0, diff1, pred0, pred1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, diff0, diff1);
    sad += SAD_UB2_UH(src0, src1, diff0, diff1);
  }

  return HADD_UH_U32(sad);
}

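// 16-wide average-SAD: eight rows per loop iteration (height >> 3).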
static uint32_t avgsad_16width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3, comp0, comp1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 3); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += (4 * 16);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);

    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += (4 * 16);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);
  }

  return HADD_UH_U32(sad);
}

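// 32-wide average-SAD: four 32-byte rows of src, ref and sec_pred per
// iteration.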
static uint32_t avgsad_32width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
  v16u8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
  v16u8 comp0, comp1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src2, src4, src6);
    LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);

    LD_UB4(ref, ref_stride, ref0, ref2, ref4, ref6);
    LD_UB4(ref + 16, ref_stride, ref1, ref3, ref5, ref7);
    ref += (4 * ref_stride);

    LD_UB4(sec_pred, 32, pred0, pred2, pred4, pred6);
    LD_UB4(sec_pred + 16, 32, pred1, pred3, pred5, pred7);
    sec_pred += (4 * 32);

    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);
    AVER_UB2_UB(pred4, ref4, pred5, ref5, comp0, comp1);
    sad += SAD_UB2_UH(src4, src5, comp0, comp1);
    AVER_UB2_UB(pred6, ref6, pred7, ref7, comp0, comp1);
    sad += SAD_UB2_UH(src6, src7, comp0, comp1);
  }

  return HADD_UH_U32(sad);
}

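// 64-wide average-SAD: four 64-byte rows per iteration; the v8u16 partial
// sums are widened to 32 bits before the final reduction.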
static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 comp0, comp1, comp2, comp3;
  v16u8 pred0, pred1, pred2, pred3;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v4u32 sad;

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
  }

  sad = __msa_hadd_u_w(sad0, sad0);
  sad += __msa_hadd_u_w(sad1, sad1);

  return HADD_SW_S32(sad);
}

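// The macros below stamp out the exported aom_sad<w>x<h>_msa,
// aom_sad<w>x<h>x4d_msa and aom_sad<w>x<h>_avg_msa entry points declared in
// aom_dsp_rtcd.h, each forwarding to the width-specialized helper above
// with the height baked in.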
#define AOM_SAD_4xHEIGHT_MSA(height)                                         \
  uint32_t aom_sad4x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                   const uint8_t *ref, int32_t ref_stride) { \
    return sad_4width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define AOM_SAD_8xHEIGHT_MSA(height)                                         \
  uint32_t aom_sad8x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                   const uint8_t *ref, int32_t ref_stride) { \
    return sad_8width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define AOM_SAD_16xHEIGHT_MSA(height)                                         \
  uint32_t aom_sad16x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_16width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define AOM_SAD_32xHEIGHT_MSA(height)                                         \
  uint32_t aom_sad32x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_32width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define AOM_SAD_64xHEIGHT_MSA(height)                                         \
  uint32_t aom_sad64x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_64width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define AOM_SAD_4xHEIGHTx4D_MSA(height)                                   \
  void aom_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *const refs[],            \
                                  int32_t ref_stride, uint32_t *sads) {   \
    sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define AOM_SAD_8xHEIGHTx4D_MSA(height)                                   \
  void aom_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *const refs[],            \
                                  int32_t ref_stride, uint32_t *sads) {   \
    sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define AOM_SAD_16xHEIGHTx4D_MSA(height)                                   \
  void aom_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[],            \
                                   int32_t ref_stride, uint32_t *sads) {   \
    sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define AOM_SAD_32xHEIGHTx4D_MSA(height)                                   \
  void aom_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[],            \
                                   int32_t ref_stride, uint32_t *sads) {   \
    sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define AOM_SAD_64xHEIGHTx4D_MSA(height)                                   \
  void aom_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[],            \
                                   int32_t ref_stride, uint32_t *sads) {   \
    sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define AOM_AVGSAD_4xHEIGHT_MSA(height)                                        \
  uint32_t aom_sad4x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
                                       const uint8_t *ref, int32_t ref_stride, \
                                       const uint8_t *second_pred) {           \
    return avgsad_4width_msa(src, src_stride, ref, ref_stride, height,         \
                             second_pred);                                     \
  }

#define AOM_AVGSAD_8xHEIGHT_MSA(height)                                        \
  uint32_t aom_sad8x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
                                       const uint8_t *ref, int32_t ref_stride, \
                                       const uint8_t *second_pred) {           \
    return avgsad_8width_msa(src, src_stride, ref, ref_stride, height,         \
                             second_pred);                                     \
  }

#define AOM_AVGSAD_16xHEIGHT_MSA(height)                                \
  uint32_t aom_sad16x##height##_avg_msa(                                \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
      int32_t ref_stride, const uint8_t *second_pred) {                 \
    return avgsad_16width_msa(src, src_stride, ref, ref_stride, height, \
                              second_pred);                             \
  }

#define AOM_AVGSAD_32xHEIGHT_MSA(height)                                \
  uint32_t aom_sad32x##height##_avg_msa(                                \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
      int32_t ref_stride, const uint8_t *second_pred) {                 \
    return avgsad_32width_msa(src, src_stride, ref, ref_stride, height, \
                              second_pred);                             \
  }

#define AOM_AVGSAD_64xHEIGHT_MSA(height)                                \
  uint32_t aom_sad64x##height##_avg_msa(                                \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
      int32_t ref_stride, const uint8_t *second_pred) {                 \
    return avgsad_64width_msa(src, src_stride, ref, ref_stride, height, \
                              second_pred);                             \
  }

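// Each invocation below defines one entry point. For example,
// AOM_SAD_64xHEIGHT_MSA(64) expands to:
//   uint32_t aom_sad64x64_msa(const uint8_t *src, int32_t src_stride,
//                             const uint8_t *ref, int32_t ref_stride) {
//     return sad_64width_msa(src, src_stride, ref, ref_stride, 64);
//   }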
/* clang-format off */
// 64x64
AOM_SAD_64xHEIGHT_MSA(64)
AOM_SAD_64xHEIGHTx4D_MSA(64)
AOM_AVGSAD_64xHEIGHT_MSA(64)

// 64x32
AOM_SAD_64xHEIGHT_MSA(32)
AOM_SAD_64xHEIGHTx4D_MSA(32)
AOM_AVGSAD_64xHEIGHT_MSA(32)

// 32x64
AOM_SAD_32xHEIGHT_MSA(64)
AOM_SAD_32xHEIGHTx4D_MSA(64)
AOM_AVGSAD_32xHEIGHT_MSA(64)

// 32x32
AOM_SAD_32xHEIGHT_MSA(32)
AOM_SAD_32xHEIGHTx4D_MSA(32)
AOM_AVGSAD_32xHEIGHT_MSA(32)

// 32x16
AOM_SAD_32xHEIGHT_MSA(16)
AOM_SAD_32xHEIGHTx4D_MSA(16)
AOM_AVGSAD_32xHEIGHT_MSA(16)

// 16x32
AOM_SAD_16xHEIGHT_MSA(32)
AOM_SAD_16xHEIGHTx4D_MSA(32)
AOM_AVGSAD_16xHEIGHT_MSA(32)

// 16x16
AOM_SAD_16xHEIGHT_MSA(16)
AOM_SAD_16xHEIGHTx4D_MSA(16)
AOM_AVGSAD_16xHEIGHT_MSA(16)

// 16x8
AOM_SAD_16xHEIGHT_MSA(8)
AOM_SAD_16xHEIGHTx4D_MSA(8)
AOM_AVGSAD_16xHEIGHT_MSA(8)

// 8x16
AOM_SAD_8xHEIGHT_MSA(16)
AOM_SAD_8xHEIGHTx4D_MSA(16)
AOM_AVGSAD_8xHEIGHT_MSA(16)

// 8x8
AOM_SAD_8xHEIGHT_MSA(8)
AOM_SAD_8xHEIGHTx4D_MSA(8)
AOM_AVGSAD_8xHEIGHT_MSA(8)

// 8x4
AOM_SAD_8xHEIGHT_MSA(4)
AOM_SAD_8xHEIGHTx4D_MSA(4)
AOM_AVGSAD_8xHEIGHT_MSA(4)

// 4x8
AOM_SAD_4xHEIGHT_MSA(8)
AOM_SAD_4xHEIGHTx4D_MSA(8)
AOM_AVGSAD_4xHEIGHT_MSA(8)

// 4x4
AOM_SAD_4xHEIGHT_MSA(4)
AOM_SAD_4xHEIGHTx4D_MSA(4)
AOM_AVGSAD_4xHEIGHT_MSA(4)
/* clang-format on */