/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "./aom_dsp_rtcd.h"
#include "aom_dsp/mips/macros_msa.h"
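
/* CALC_MSE_B accumulates the sum of squared src/ref differences into the
 * v4i32 accumulator 'var'. CALC_MSE_AVG_B additionally adds the signed
 * per-pixel differences into 'sub' so callers can also recover the block
 * sum. Both build on the interleave/hsub/dot-product helpers from
 * macros_msa.h. */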
#define CALC_MSE_B(src, ref, var)                                   \
  {                                                                 \
    v16u8 src_l0_m, src_l1_m;                                       \
    v8i16 res_l0_m, res_l1_m;                                       \
                                                                    \
    ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m);                      \
    HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m);            \
    DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
  }

#define CALC_MSE_AVG_B(src, ref, var, sub)                          \
  {                                                                 \
    v16u8 src_l0_m, src_l1_m;                                       \
    v8i16 res_l0_m, res_l1_m;                                       \
                                                                    \
    ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m);                      \
    HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m);            \
    DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
                                                                    \
    sub += res_l0_m + res_l1_m;                                     \
  }
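
/* variance = SSE - (sum * sum) / (width * height), with 'shift' equal to
 * log2(width * height). The LARGE variant squares the sum in 64 bits so the
 * intermediate cannot overflow for blocks of 16x32 and larger. */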
#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift)

#define VARIANCE_LARGE_WxH(sse, diff, shift) \
  sse - (((int64_t)diff * diff) >> shift)
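
/* The sse_diff_*width_msa helpers return the sum of squared differences over
 * 'height' rows and store the signed sum of differences in *diff; the width
 * variants differ only in how rows are packed into 16-byte vectors. */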
static uint32_t sse_diff_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                    const uint8_t *ref_ptr, int32_t ref_stride,
                                    int32_t height, int32_t *diff) {
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  int32_t ht_cnt;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    CALC_MSE_AVG_B(src, ref, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sse_diff_8width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                    const uint8_t *ref_ptr, int32_t ref_stride,
                                    int32_t height, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sse_diff_16width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                     const uint8_t *ref_ptr, int32_t ref_stride,
                                     int32_t height, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src, ref;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src, ref, var, avg);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src, ref, var, avg);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src, ref, var, avg);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src, ref, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sse_diff_32width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                     const uint8_t *ref_ptr, int32_t ref_stride,
                                     int32_t height, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}
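
/* Fixed-size helpers for the largest blocks. The running difference sums are
 * spread over several v8i16 accumulators (avg0..avg3) so each 16-bit lane
 * stays within int16 range across 32 or 64 rows. */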
static uint32_t sse_diff_32x64_msa(const uint8_t *src_ptr, int32_t src_stride,
                                   const uint8_t *ref_ptr, int32_t ref_stride,
                                   int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8i16 avg0 = { 0 };
  v8i16 avg1 = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = 16; ht_cnt--;) {
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
  }

  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sse_diff_64x32_msa(const uint8_t *src_ptr, int32_t src_stride,
                                   const uint8_t *ref_ptr, int32_t ref_stride,
                                   int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8i16 avg0 = { 0 };
  v8i16 avg1 = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = 16; ht_cnt--;) {
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src2, ref2, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src3, ref3, var, avg1);

    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src2, ref2, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src3, ref3, var, avg1);
  }

  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sse_diff_64x64_msa(const uint8_t *src_ptr, int32_t src_stride,
                                   const uint8_t *ref_ptr, int32_t ref_stride,
                                   int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8i16 avg0 = { 0 };
  v8i16 avg1 = { 0 };
  v8i16 avg2 = { 0 };
  v8i16 avg3 = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = 32; ht_cnt--;) {
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;

    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src2, ref2, var, avg2);
    CALC_MSE_AVG_B(src3, ref3, var, avg3);
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src2, ref2, var, avg2);
    CALC_MSE_AVG_B(src3, ref3, var, avg3);
  }

  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);
  vec += __msa_hadd_s_w(avg2, avg2);
  vec += __msa_hadd_s_w(avg3, avg3);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}
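
/* Sum of squares of 256 consecutive int16 values (a 16x16 block), accumulated
 * in 64-bit lanes before the final scalar reduction. */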
static uint32_t get_mb_ss_msa(const int16_t *src) {
  uint32_t sum, cnt;
  v8i16 src0, src1, src2, src3;
  v4i32 src0_l, src1_l, src2_l, src3_l;
  v4i32 src0_r, src1_r, src2_r, src3_r;
  v2i64 sq_src_l = { 0 };
  v2i64 sq_src_r = { 0 };

  for (cnt = 8; cnt--;) {
    LD_SH4(src, 8, src0, src1, src2, src3);
    src += 4 * 8;

    UNPCK_SH_SW(src0, src0_l, src0_r);
    UNPCK_SH_SW(src1, src1_l, src1_r);
    UNPCK_SH_SW(src2, src2_l, src2_r);
    UNPCK_SH_SW(src3, src3_l, src3_r);

    DPADD_SD2_SD(src0_l, src0_r, sq_src_l, sq_src_r);
    DPADD_SD2_SD(src1_l, src1_r, sq_src_l, sq_src_r);
    DPADD_SD2_SD(src2_l, src2_r, sq_src_l, sq_src_r);
    DPADD_SD2_SD(src3_l, src3_r, sq_src_l, sq_src_r);
  }

  sq_src_l += __msa_splati_d(sq_src_l, 1);
  sq_src_r += __msa_splati_d(sq_src_r, 1);

  sum = __msa_copy_s_d(sq_src_l, 0);
  sum += __msa_copy_s_d(sq_src_r, 0);

  return sum;
}
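
/* The sse_*width_msa helpers mirror the sse_diff_* versions above but return
 * only the sum of squared differences; no difference sum is kept. */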
static uint32_t sse_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v4i32 var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    CALC_MSE_B(src, ref, var);
  }

  return HADD_SW_S32(var);
}

static uint32_t sse_8width_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v4i32 var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src1, ref1, var);
  }

  return HADD_SW_S32(var);
}

static uint32_t sse_16width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *ref_ptr, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src, ref;
  v4i32 var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_B(src, ref, var);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_B(src, ref, var);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_B(src, ref, var);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_B(src, ref, var);
  }

  return HADD_SW_S32(var);
}

static uint32_t sse_32width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *ref_ptr, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v4i32 var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src1, ref1, var);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src1, ref1, var);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src1, ref1, var);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src1, ref1, var);
  }

  return HADD_SW_S32(var);
}

static uint32_t sse_64width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *ref_ptr, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v4i32 var = { 0 };

  for (ht_cnt = height >> 1; ht_cnt--;) {
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src2, ref2, var);
    CALC_MSE_B(src1, ref1, var);
    CALC_MSE_B(src3, ref3, var);

    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src2, ref2, var);
    CALC_MSE_B(src1, ref1, var);
    CALC_MSE_B(src3, ref3, var);
  }

  return HADD_SW_S32(var);
}
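
/* 4x4 SSE computed in a single pass: all sixteen src and ref pixels are
 * gathered into one vector pair, subtracted, and dot-product reduced. */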
uint32_t aom_get4x4sse_cs_msa(const uint8_t *src_ptr, int32_t src_stride,
                              const uint8_t *ref_ptr, int32_t ref_stride) {
  uint32_t err = 0;
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  v16i8 src = { 0 };
  v16i8 ref = { 0 };
  v16u8 src_vec0, src_vec1;
  v8i16 diff0, diff1;
  v4i32 err0 = { 0 };
  v4i32 err1 = { 0 };

  LW4(src_ptr, src_stride, src0, src1, src2, src3);
  LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
  INSERT_W4_SB(src0, src1, src2, src3, src);
  INSERT_W4_SB(ref0, ref1, ref2, ref3, ref);
  ILVRL_B2_UB(src, ref, src_vec0, src_vec1);
  HSUB_UB2_SH(src_vec0, src_vec1, diff0, diff1);
  DPADD_SH2_SW(diff0, diff1, diff0, diff1, err0, err1);
  err = HADD_SW_S32(err0);
  err += HADD_SW_S32(err1);

  return err;
}
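
/* Per-block-size wrappers; the last argument is log2(width * height) for the
 * corresponding block. */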
#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4);
#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5);
#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5);
#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6);
#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7);
#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7);
#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8);

#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10);
#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);
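
/* Expands to the exported aom_variance<wd>x<ht>_msa() function: compute the
 * SSE and difference sum, then apply the matching VARIANCE_<wd>Wx<ht>H
 * reduction. */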
#define AOM_VARIANCE_WDXHT_MSA(wd, ht)                                         \
  uint32_t aom_variance##wd##x##ht##_msa(                                      \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,              \
      int32_t ref_stride, uint32_t *sse) {                                     \
    int32_t diff;                                                              \
                                                                               \
    *sse =                                                                     \
        sse_diff_##wd##width_msa(src, src_stride, ref, ref_stride, ht, &diff); \
                                                                               \
    return VARIANCE_##wd##Wx##ht##H(*sse, diff);                               \
  }

/* clang-format off */
AOM_VARIANCE_WDXHT_MSA(4, 4)
AOM_VARIANCE_WDXHT_MSA(4, 8)

AOM_VARIANCE_WDXHT_MSA(8, 4)
AOM_VARIANCE_WDXHT_MSA(8, 8)
AOM_VARIANCE_WDXHT_MSA(8, 16)

AOM_VARIANCE_WDXHT_MSA(16, 8)
AOM_VARIANCE_WDXHT_MSA(16, 16)
AOM_VARIANCE_WDXHT_MSA(16, 32)

AOM_VARIANCE_WDXHT_MSA(32, 16)
AOM_VARIANCE_WDXHT_MSA(32, 32)
/* clang-format on */

uint32_t aom_variance32x64_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               uint32_t *sse) {
  int32_t diff;

  *sse = sse_diff_32x64_msa(src, src_stride, ref, ref_stride, &diff);

  return VARIANCE_32Wx64H(*sse, diff);
}

uint32_t aom_variance64x32_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               uint32_t *sse) {
  int32_t diff;

  *sse = sse_diff_64x32_msa(src, src_stride, ref, ref_stride, &diff);

  return VARIANCE_64Wx32H(*sse, diff);
}

uint32_t aom_variance64x64_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               uint32_t *sse) {
  int32_t diff;

  *sse = sse_diff_64x64_msa(src, src_stride, ref, ref_stride, &diff);

  return VARIANCE_64Wx64H(*sse, diff);
}
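
/* MSE entry points: these return (and store through *sse) the raw sum of
 * squared differences for the block; the mean is not removed. */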
uint32_t aom_mse8x8_msa(const uint8_t *src, int32_t src_stride,
                        const uint8_t *ref, int32_t ref_stride, uint32_t *sse) {
  *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 8);

  return *sse;
}

uint32_t aom_mse8x16_msa(const uint8_t *src, int32_t src_stride,
                         const uint8_t *ref, int32_t ref_stride,
                         uint32_t *sse) {
  *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 16);

  return *sse;
}

uint32_t aom_mse16x8_msa(const uint8_t *src, int32_t src_stride,
                         const uint8_t *ref, int32_t ref_stride,
                         uint32_t *sse) {
  *sse = sse_16width_msa(src, src_stride, ref, ref_stride, 8);

  return *sse;
}

uint32_t aom_mse16x16_msa(const uint8_t *src, int32_t src_stride,
                          const uint8_t *ref, int32_t ref_stride,
                          uint32_t *sse) {
  *sse = sse_16width_msa(src, src_stride, ref, ref_stride, 16);

  return *sse;
}
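
/* The get*var helpers report both the SSE and the signed difference sum;
 * callers are expected to combine the pair into a variance themselves. */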
void aom_get8x8var_msa(const uint8_t *src, int32_t src_stride,
                       const uint8_t *ref, int32_t ref_stride, uint32_t *sse,
                       int32_t *sum) {
  *sse = sse_diff_8width_msa(src, src_stride, ref, ref_stride, 8, sum);
}

void aom_get16x16var_msa(const uint8_t *src, int32_t src_stride,
                         const uint8_t *ref, int32_t ref_stride, uint32_t *sse,
                         int32_t *sum) {
  *sse = sse_diff_16width_msa(src, src_stride, ref, ref_stride, 16, sum);
}

uint32_t aom_get_mb_ss_msa(const int16_t *src) { return get_mb_ss_msa(src); }