/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>

#include "config/aom_dsp_rtcd.h"

#include "aom_dsp/mips/aom_convolve_msa.h"

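/* All of the horizontal paths below share the same shape: shuffle masks
 * from mc_filt_mask_arr gather the sliding windows of source pixels, the
 * windows are multiplied against splatted filter taps with dot-product
 * macros, and the 16-bit sums are rounded (SRARI by FILTER_BITS),
 * saturated and packed back down to 8-bit pixels. The XORI_*_128 steps
 * flip the sign bit so unsigned pixels can go through the signed multiply
 * path; PCKEV_XORI128_UB undoes the flip while packing. These routines
 * are selected at run time through aom_dsp_rtcd.h on MSA-capable MIPS
 * targets. */
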
static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16u8 mask0, mask1, mask2, mask3, out;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v8i16 filt, out0, out1;

  mask0 = LD_UB(&mc_filt_mask_arr[16]);
  src -= 3; /* back up to tap 0 of the 8-tap window */

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  /* flip sign bits so unsigned pixels can use the signed multiply path */
  XORI_B4_128_SB(src0, src1, src2, src3);
  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filt0, filt1, filt2, filt3, out0, out1);
  /* round, saturate and pack the 16-bit sums back to unsigned pixels */
  SRARI_H2_SH(out0, out1, FILTER_BITS);
  SAT_SH2_SH(out0, out1, 7);
  out = PCKEV_XORI128_UB(out0, out1);
  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}
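
/* For reference, a minimal scalar sketch of what the vector code above
 * computes for one 4x4 block. The guard macro is hypothetical and the
 * function is not part of the build; it assumes the conventions of the
 * MSA path: the 8 taps sum to 128, FILTER_BITS == 7, and tap 3 lands on
 * the output pixel. */
#ifdef AOM_CONVOLVE_MSA_REFERENCE_SKETCH
static void scalar_hz_8t_4x4(const uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             const int8_t *filter) {
  int row, col, k;
  for (row = 0; row < 4; ++row) {
    for (col = 0; col < 4; ++col) {
      int32_t sum = 0;
      /* 8-tap window, backed up 3 pixels like the MSA path's src -= 3 */
      for (k = 0; k < 8; ++k) sum += filter[k] * src[col - 3 + k];
      /* round by FILTER_BITS, then clamp to the 8-bit pixel range */
      sum = (sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS;
      dst[col] = (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
    }
    src += src_stride;
    dst += dst_stride;
  }
}
#endif /* AOM_CONVOLVE_MSA_REFERENCE_SKETCH */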

static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16i8 filt0, filt1, filt2, filt3;
  v16i8 src0, src1, src2, src3;
  v16u8 mask0, mask1, mask2, mask3, out;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[16]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  XORI_B4_128_SB(src0, src1, src2, src3);
  src += (4 * src_stride);
  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filt0, filt1, filt2, filt3, out0, out1);
  LD_SB4(src, src_stride, src0, src1, src2, src3);
  XORI_B4_128_SB(src0, src1, src2, src3);
  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filt0, filt1, filt2, filt3, out2, out3);
  SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
  SAT_SH4_SH(out0, out1, out2, out3, 7);
  out = PCKEV_XORI128_UB(out0, out1);
  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
  dst += (4 * dst_stride);
  out = PCKEV_XORI128_UB(out2, out3);
  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}

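/* 4-wide blocks are only expected with heights 4 and 8, so the dispatch
 * below should be exhaustive; any other height would return without
 * writing to dst. */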
static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                int8_t *filter, int32_t height) {
  if (4 == height) {
    common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
  } else if (8 == height) {
    common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
  }
}

static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  XORI_B4_128_SB(src0, src1, src2, src3);
  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filt0, filt1, filt2, filt3, out0, out1, out2,
                             out3);
  SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
  SAT_SH4_SH(out0, out1, out2, out3, 7);
  tmp0 = PCKEV_XORI128_UB(out0, out1);
  tmp1 = PCKEV_XORI128_UB(out2, out3);
  ST8x4_UB(tmp0, tmp1, dst, dst_stride);
}

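/* Handles 8-wide blocks whose height is a multiple of 4, producing four
 * rows of output per loop iteration. */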
static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    tmp0 = PCKEV_XORI128_UB(out0, out1);
    tmp1 = PCKEV_XORI128_UB(out2, out3);
    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
    dst += (4 * dst_stride);
  }
}

static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                int8_t *filter, int32_t height) {
  if (4 == height) {
    common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter);
  } else {
    common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
  }
}

static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, out;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = (height >> 1); loop_cnt--;) {
    LD_SB2(src, src_stride, src0, src2);
    LD_SB2(src + 8, src_stride, src1, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (2 * src_stride);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_UB(out, dst);
    dst += dst_stride;
    out = PCKEV_XORI128_UB(out2, out3);
    ST_UB(out, dst);
    dst += dst_stride;
  }
}

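/* In the 32- and 64-wide paths, the vector covering pixels 8..23 of each
 * 32-pixel run is not loaded from memory: __msa_sldi_b slides the
 * concatenation of the two neighbouring loads by 8 bytes to synthesize
 * it, avoiding one unaligned load per run. */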
static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, out;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = (height >> 1); loop_cnt--;) {
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 24);
    src1 = __msa_sldi_b(src2, src0, 8);
    src += src_stride;
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);

    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 24);
    src1 = __msa_sldi_b(src2, src0, 8);
    src += src_stride;

    out = PCKEV_XORI128_UB(out0, out1);
    ST_UB(out, dst);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_UB(out, dst + 16);
    dst += dst_stride;

    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_UB(out, dst);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_UB(out, dst + 16);
    dst += dst_stride;
  }
}

static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  int32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, out;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = height; loop_cnt--;) {
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 24);
    src1 = __msa_sldi_b(src2, src0, 8);

    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_UB(out, dst);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_UB(out, dst + 16);

    src0 = LD_SB(src + 32);
    src2 = LD_SB(src + 48);
    src3 = LD_SB(src + 56);
    src1 = __msa_sldi_b(src2, src0, 8);
    src += src_stride;

    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_UB(out, dst + 32);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_UB(out, dst + 48);
    dst += dst_stride;
  }
}

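/* The 2-tap (bilinear) paths below stay entirely in the unsigned domain:
 * both taps are non-negative and sum to 128, so the sign-flip and
 * saturation steps of the 8-tap paths are unnecessary. The two 8-bit taps
 * are packed into one halfword and splatted, so a single unsigned byte
 * dot product applies both taps at once. */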
static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16i8 src0, src1, src2, src3, mask;
  v16u8 filt0, vec0, vec1, res0, res1;
  v8u16 vec2, vec3, filt;

  mask = LD_SB(&mc_filt_mask_arr[16]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
  DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
  SRARI_H2_UH(vec2, vec3, FILTER_BITS);
  PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
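
/* A minimal scalar sketch of the bilinear path above, under the same
 * hypothetical guard as the 8-tap sketch; illustrative only. Note that
 * filter here points at the two nonzero taps (&filt_hor[3] in the entry
 * point below). */
#ifdef AOM_CONVOLVE_MSA_REFERENCE_SKETCH
static void scalar_hz_2t_4x4(const uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             const int8_t *filter) {
  int row, col;
  for (row = 0; row < 4; ++row) {
    for (col = 0; col < 4; ++col) {
      /* two non-negative taps summing to 128, so no clamping is needed */
      const uint32_t sum = (uint32_t)(filter[0] * src[col]) +
                           (uint32_t)(filter[1] * src[col + 1]);
      dst[col] = (uint8_t)((sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS);
    }
    src += src_stride;
    dst += dst_stride;
  }
}
#endif /* AOM_CONVOLVE_MSA_REFERENCE_SKETCH */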

static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16u8 vec0, vec1, vec2, vec3, filt0;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16i8 res0, res1, res2, res3;
  v8u16 vec4, vec5, vec6, vec7, filt;

  mask = LD_SB(&mc_filt_mask_arr[16]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
  VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
              vec6, vec7);
  SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
              res3);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
  dst += (4 * dst_stride);
  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
}

static void common_hz_2t_4w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                int8_t *filter, int32_t height) {
  if (4 == height) {
    common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
  } else if (8 == height) {
    common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
  }
}

static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16u8 filt0;
  v16i8 src0, src1, src2, src3, mask;
  v8u16 vec0, vec1, vec2, vec3, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
  PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
  ST8x4_UB(src0, src1, dst, dst_stride);
}

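/* Fully unrolled for the heights expected here: two four-row passes cover
 * height 8, and the conditional block at the end adds the remaining eight
 * rows for height 16. */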
static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     int8_t *filter, int32_t height) {
  v16u8 filt0;
  v16i8 src0, src1, src2, src3, mask, out0, out1;
  v8u16 vec0, vec1, vec2, vec3, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  src += (4 * src_stride);

  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  src += (4 * src_stride);

  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
  ST8x4_UB(out0, out1, dst, dst_stride);
  dst += (4 * dst_stride);

  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
  ST8x4_UB(out0, out1, dst, dst_stride);
  dst += (4 * dst_stride);

  if (16 == height) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    LD_SB4(src, src_stride, src0, src1, src2, src3);

    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
    ST8x4_UB(out0, out1, dst, dst_stride);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
    ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
  }
}

static void common_hz_2t_8w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                int8_t *filter, int32_t height) {
  if (4 == height) {
    common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
  } else {
    common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
  }
}

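/* The first four rows are peeled off and computed before the loop (hence
 * loop_cnt starts at (height >> 2) - 1), presumably so the next group's
 * loads can issue while the first results are stored. */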
static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  loop_cnt = (height >> 2) - 1;

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src2, src4, src6);
  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
  src += (4 * src_stride);

  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
              out2, out3);
  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
              out6, out7);
  SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
  SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
  PCKEV_ST_SB(out0, out1, dst);
  dst += dst_stride;
  PCKEV_ST_SB(out2, out3, dst);
  dst += dst_stride;
  PCKEV_ST_SB(out4, out5, dst);
  dst += dst_stride;
  PCKEV_ST_SB(out6, out7, dst);
  dst += dst_stride;

  for (; loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
                out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
                out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    PCKEV_ST_SB(out0, out1, dst);
    dst += dst_stride;
    PCKEV_ST_SB(out2, out3, dst);
    dst += dst_stride;
    PCKEV_ST_SB(out4, out5, dst);
    dst += dst_stride;
    PCKEV_ST_SB(out6, out7, dst);
    dst += dst_stride;
  }
}

static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  for (loop_cnt = height >> 1; loop_cnt--;) {
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 24);
    src1 = __msa_sldi_b(src2, src0, 8);
    src += src_stride;
    src4 = LD_SB(src);
    src6 = LD_SB(src + 16);
    src7 = LD_SB(src + 24);
    src5 = __msa_sldi_b(src6, src4, 8);
    src += src_stride;

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
                out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
                out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    PCKEV_ST_SB(out0, out1, dst);
    PCKEV_ST_SB(out2, out3, dst + 16);
    dst += dst_stride;
    PCKEV_ST_SB(out4, out5, dst);
    PCKEV_ST_SB(out6, out7, dst + 16);
    dst += dst_stride;
  }
}

static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  for (loop_cnt = height; loop_cnt--;) {
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src4 = LD_SB(src + 32);
    src6 = LD_SB(src + 48);
    src7 = LD_SB(src + 56);
    SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
    src += src_stride;

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
                out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
                out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    PCKEV_ST_SB(out0, out1, dst);
    PCKEV_ST_SB(out2, out3, dst + 16);
    PCKEV_ST_SB(out4, out5, dst + 32);
    PCKEV_ST_SB(out6, out7, dst + 48);
    dst += dst_stride;
  }
}

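/* Entry point, wired up through aom_dsp_rtcd.h. The int32_t aliasing
 * below inspects two 16-bit taps at once: a zero first pair is taken to
 * mean the filter is bilinear, with its nonzero taps at filt_hor[3] and
 * filt_hor[4]. The assert appears to reject the all-pass copy filter (a
 * lone tap of 128), which is expected to be handled by a copy path rather
 * than by convolution. */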
void aom_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int x_step_q4,
                             const int16_t *filter_y, int y_step_q4, int w,
                             int h) {
  int8_t cnt, filt_hor[8];

  assert(x_step_q4 == 16); /* integer-pel stepping only; no scaling */
  assert(((const int32_t *)filter_x)[1] != 0x800000);

  /* narrow the 16-bit taps to 8 bits for the MSA dot products */
  for (cnt = 0; cnt < 8; ++cnt) {
    filt_hor[cnt] = filter_x[cnt];
  }

  if (((const int32_t *)filter_x)[0] == 0) {
    /* taps 0 and 1 are zero: use the 2-tap bilinear paths */
    switch (w) {
      case 4:
        common_hz_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                            &filt_hor[3], h);
        break;
      case 8:
        common_hz_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                            &filt_hor[3], h);
        break;
      case 16:
        common_hz_2t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                             &filt_hor[3], h);
        break;
      case 32:
        common_hz_2t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                             &filt_hor[3], h);
        break;
      case 64:
        common_hz_2t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                             &filt_hor[3], h);
        break;
      default:
        aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
                              x_step_q4, filter_y, y_step_q4, w, h);
        break;
    }
  } else {
    switch (w) {
      case 4:
        common_hz_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                            filt_hor, h);
        break;
      case 8:
        common_hz_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                            filt_hor, h);
        break;
      case 16:
        common_hz_8t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                             filt_hor, h);
        break;
      case 32:
        common_hz_8t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                             filt_hor, h);
        break;
      case 64:
        common_hz_8t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                             filt_hor, h);
        break;
      default:
        aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
                              x_step_q4, filter_y, y_step_q4, w, h);
        break;
    }
  }
}