blob: a4d594931a93989b32c63ee4b7d8c757c12149b2 [file] [log] [blame]
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11
12#include <assert.h>
Yaowu Xuf883b422016-08-30 14:01:10 -070013#include "./aom_dsp_rtcd.h"
14#include "aom_dsp/mips/aom_convolve_msa.h"
Yaowu Xuc27fc142016-08-22 16:08:15 -070015
/* Byte-index tables consumed as shuffle masks by the convolve kernels in this
 * file.  Three rows of 16 bytes; every row is a list of adjacent index pairs
 * (i, i + 1).  The symbol has external linkage.
 *
 *   offset  0: loaded by the 8-wide and 16-wide kernels
 *   offset 16: loaded by the 4-wide kernels
 *   offset 32: not referenced in this file
 *
 * NOTE(review): indices >= 16 presumably select bytes from the second source
 * vector of the shuffle, which is how two 4-wide rows share one register --
 * confirm against the shuffle macro definitions. */
const uint8_t mc_filt_mask_arr[16 * 3] = {
  /* offset 0: 8 width cases */
  0, 1,  1, 2,  2, 3,  3, 4,
  4, 5,  5, 6,  6, 7,  7, 8,
  /* offset 16: 4 width cases */
  0, 1,  1, 2,  2, 3,  3, 4,
  16, 17,  17, 18,  18, 19,  19, 20,
  /* offset 32: 4 width cases, starting eight bytes further in */
  8, 9,  9, 10,  10, 11,  11, 12,
  24, 25,  25, 26,  26, 27,  27, 28
};
24
25static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride,
26 uint8_t *dst, int32_t dst_stride,
27 int8_t *filter_horiz, int8_t *filter_vert,
28 int32_t height) {
29 uint32_t loop_cnt;
30 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
31 v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
32 v16u8 mask0, mask1, mask2, mask3, out;
33 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
34 v8i16 hz_out7, hz_out8, hz_out9, tmp0, tmp1, out0, out1, out2, out3, out4;
35 v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
36
37 mask0 = LD_UB(&mc_filt_mask_arr[16]);
38 src -= (3 + 3 * src_stride);
39
40 /* rearranging filter */
41 filt = LD_SH(filter_horiz);
42 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
43
44 mask1 = mask0 + 2;
45 mask2 = mask0 + 4;
46 mask3 = mask0 + 6;
47
48 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
49 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
50 src += (7 * src_stride);
51
52 hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
53 filt_hz1, filt_hz2, filt_hz3);
54 hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
55 filt_hz1, filt_hz2, filt_hz3);
56 hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
57 filt_hz1, filt_hz2, filt_hz3);
58 hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
59 filt_hz1, filt_hz2, filt_hz3);
60 SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);
61
62 filt = LD_SH(filter_vert);
63 SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
64
65 ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
66 out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
67
68 for (loop_cnt = (height >> 2); loop_cnt--;) {
69 LD_SB4(src, src_stride, src7, src8, src9, src10);
70 XORI_B4_128_SB(src7, src8, src9, src10);
71 src += (4 * src_stride);
72
73 hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
74 filt_hz1, filt_hz2, filt_hz3);
75 hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
76 out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
77 tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
78 filt_vt2, filt_vt3);
79
80 hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0,
81 filt_hz1, filt_hz2, filt_hz3);
82 hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
83 out4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
84 tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vt0, filt_vt1,
85 filt_vt2, filt_vt3);
86 SRARI_H2_SH(tmp0, tmp1, FILTER_BITS);
87 SAT_SH2_SH(tmp0, tmp1, 7);
88 out = PCKEV_XORI128_UB(tmp0, tmp1);
89 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
90 dst += (4 * dst_stride);
91
92 hz_out5 = hz_out9;
93 out0 = out2;
94 out1 = out3;
95 out2 = out4;
96 }
97}
98
99static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride,
100 uint8_t *dst, int32_t dst_stride,
101 int8_t *filter_horiz, int8_t *filter_vert,
102 int32_t height) {
103 uint32_t loop_cnt;
104 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
105 v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
106 v16u8 mask0, mask1, mask2, mask3, vec0, vec1;
107 v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
108 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
109 v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
110 v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
111
112 mask0 = LD_UB(&mc_filt_mask_arr[0]);
113 src -= (3 + 3 * src_stride);
114
115 /* rearranging filter */
116 filt = LD_SH(filter_horiz);
117 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
118
119 mask1 = mask0 + 2;
120 mask2 = mask0 + 4;
121 mask3 = mask0 + 6;
122
123 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
124 src += (7 * src_stride);
125
126 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
127 hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
128 filt_hz1, filt_hz2, filt_hz3);
129 hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
130 filt_hz1, filt_hz2, filt_hz3);
131 hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
132 filt_hz1, filt_hz2, filt_hz3);
133 hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
134 filt_hz1, filt_hz2, filt_hz3);
135 hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
136 filt_hz1, filt_hz2, filt_hz3);
137 hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
138 filt_hz1, filt_hz2, filt_hz3);
139 hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
140 filt_hz1, filt_hz2, filt_hz3);
141
142 filt = LD_SH(filter_vert);
143 SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
144
145 ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
146 ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
147 ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
148
149 for (loop_cnt = (height >> 2); loop_cnt--;) {
150 LD_SB4(src, src_stride, src7, src8, src9, src10);
151 src += (4 * src_stride);
152
153 XORI_B4_128_SB(src7, src8, src9, src10);
154
155 hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
156 filt_hz1, filt_hz2, filt_hz3);
157 out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
158 tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
159 filt_vt2, filt_vt3);
160
161 hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
162 filt_hz1, filt_hz2, filt_hz3);
163 out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
164 tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
165 filt_vt2, filt_vt3);
166
167 hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
168 filt_hz1, filt_hz2, filt_hz3);
169 out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
170 tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
171 filt_vt2, filt_vt3);
172
173 hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
174 filt_hz0, filt_hz1, filt_hz2, filt_hz3);
175 out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9);
176 tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
177 filt_vt2, filt_vt3);
178 SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
179 SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
180 vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
181 vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
182 ST8x4_UB(vec0, vec1, dst, dst_stride);
183 dst += (4 * dst_stride);
184
185 hz_out6 = hz_out10;
186 out0 = out2;
187 out1 = out3;
188 out2 = out8;
189 out4 = out6;
190 out5 = out7;
191 out6 = out9;
192 }
193}
194
/* 16-wide 8-tap H+V filter: runs the 8-wide kernel on each of the two
 * side-by-side 8-column strips, left to right. */
static void common_hv_8ht_8vt_16w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz, int8_t *filter_vert,
                                      int32_t height) {
  int32_t col; /* column offset of the current strip */

  for (col = 0; col < 16; col += 8) {
    common_hv_8ht_8vt_8w_msa(src + col, src_stride, dst + col, dst_stride,
                             filter_horiz, filter_vert, height);
  }
}
207
/* 32-wide 8-tap H+V filter: runs the 8-wide kernel on each of the four
 * side-by-side 8-column strips, left to right. */
static void common_hv_8ht_8vt_32w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz, int8_t *filter_vert,
                                      int32_t height) {
  int32_t col; /* column offset of the current strip */

  for (col = 0; col < 32; col += 8) {
    common_hv_8ht_8vt_8w_msa(src + col, src_stride, dst + col, dst_stride,
                             filter_horiz, filter_vert, height);
  }
}
220
/* 64-wide 8-tap H+V filter: runs the 8-wide kernel on each of the eight
 * side-by-side 8-column strips, left to right. */
static void common_hv_8ht_8vt_64w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz, int8_t *filter_vert,
                                      int32_t height) {
  int32_t col; /* column offset of the current strip */

  for (col = 0; col < 64; col += 8) {
    common_hv_8ht_8vt_8w_msa(src + col, src_stride, dst + col, dst_stride,
                             filter_horiz, filter_vert, height);
  }
}
233
234static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride,
235 uint8_t *dst, int32_t dst_stride,
236 int8_t *filter_horiz,
237 int8_t *filter_vert) {
238 v16i8 src0, src1, src2, src3, src4, mask;
239 v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
240 v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;
241
242 mask = LD_SB(&mc_filt_mask_arr[16]);
243
244 /* rearranging filter */
245 filt = LD_UH(filter_horiz);
246 filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
247
248 filt = LD_UH(filter_vert);
249 filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
250
251 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
252 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
253 hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
254 hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
255 hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
256 hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
257
258 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
259 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
260 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
261 PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
262 ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
263}
264
265static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride,
266 uint8_t *dst, int32_t dst_stride,
267 int8_t *filter_horiz,
268 int8_t *filter_vert) {
269 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
270 v16i8 res0, res1, res2, res3;
271 v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
272 v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
273 v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;
274
275 mask = LD_SB(&mc_filt_mask_arr[16]);
276
277 /* rearranging filter */
278 filt = LD_UH(filter_horiz);
279 filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
280
281 filt = LD_UH(filter_vert);
282 filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
283
284 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
285 src += (8 * src_stride);
286 src8 = LD_SB(src);
287
288 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
289 hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
290 hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS);
291 hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS);
292 hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS);
293 SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
294 hz_out3, hz_out5, 8);
295 hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);
296
297 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
298 ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
299 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, vec4,
300 vec5, vec6, vec7);
301 SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
302 PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
303 res3);
304 ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
305 dst += (4 * dst_stride);
306 ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
307}
308
/* 2-tap H+V filter for 4-pixel-wide blocks of arbitrary height.
 *
 * The height is tiled with the fixed-size kernels: as many 4x8 tiles as fit,
 * followed by one 4x4 tile if at least four rows remain.  Heights 4 and 8
 * therefore take exactly the same single-kernel path as before, while taller
 * blocks (12, 16, ...) are now filtered instead of being silently skipped.
 *
 * Tiling is valid because each kernel reads only its own output rows plus the
 * one row directly below them, so consecutive tiles are independent.
 *
 * Rows beyond the last multiple of four are not written, matching the
 * four-row granularity of the other kernels in this file. */
static void common_hv_2ht_2vt_4w_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     int8_t *filter_horiz, int8_t *filter_vert,
                                     int32_t height) {
  while (height >= 8) {
    common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride, filter_horiz,
                              filter_vert);
    src += 8 * src_stride;
    dst += 8 * dst_stride;
    height -= 8;
  }

  if (height >= 4) {
    common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
                              filter_vert);
  }
}
321
322static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride,
323 uint8_t *dst, int32_t dst_stride,
324 int8_t *filter_horiz,
325 int8_t *filter_vert) {
326 v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
327 v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
328 v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
329 v8i16 filt;
330
331 mask = LD_SB(&mc_filt_mask_arr[0]);
332
333 /* rearranging filter */
334 filt = LD_SH(filter_horiz);
335 filt_hz = (v16u8)__msa_splati_h(filt, 0);
336
337 filt = LD_SH(filter_vert);
338 filt_vt = (v16u8)__msa_splati_h(filt, 0);
339
340 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
341
342 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
343 hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
344 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
345 tmp0 = __msa_dotp_u_h(vec0, filt_vt);
346
347 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
348 vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
349 tmp1 = __msa_dotp_u_h(vec1, filt_vt);
350
351 hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
352 vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
353 tmp2 = __msa_dotp_u_h(vec2, filt_vt);
354
355 hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
356 vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
357 tmp3 = __msa_dotp_u_h(vec3, filt_vt);
358
359 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
360 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
361 ST8x4_UB(out0, out1, dst, dst_stride);
362}
363
364static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src,
365 int32_t src_stride, uint8_t *dst,
366 int32_t dst_stride,
367 int8_t *filter_horiz,
368 int8_t *filter_vert, int32_t height) {
369 uint32_t loop_cnt;
370 v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
371 v16u8 filt_hz, filt_vt, vec0;
372 v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
373 v8i16 filt;
374
375 mask = LD_SB(&mc_filt_mask_arr[0]);
376
377 /* rearranging filter */
378 filt = LD_SH(filter_horiz);
379 filt_hz = (v16u8)__msa_splati_h(filt, 0);
380
381 filt = LD_SH(filter_vert);
382 filt_vt = (v16u8)__msa_splati_h(filt, 0);
383
384 src0 = LD_SB(src);
385 src += src_stride;
386
387 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
388
389 for (loop_cnt = (height >> 3); loop_cnt--;) {
390 LD_SB4(src, src_stride, src1, src2, src3, src4);
391 src += (4 * src_stride);
392
393 hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
394 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
395 tmp1 = __msa_dotp_u_h(vec0, filt_vt);
396
397 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
398 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
399 tmp2 = __msa_dotp_u_h(vec0, filt_vt);
400
401 SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
402
403 hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
404 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
405 tmp3 = __msa_dotp_u_h(vec0, filt_vt);
406
407 hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
408 LD_SB4(src, src_stride, src1, src2, src3, src4);
409 src += (4 * src_stride);
410 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
411 tmp4 = __msa_dotp_u_h(vec0, filt_vt);
412
413 SRARI_H2_UH(tmp3, tmp4, FILTER_BITS);
414 PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
415 ST8x4_UB(out0, out1, dst, dst_stride);
416 dst += (4 * dst_stride);
417
418 hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
419 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
420 tmp5 = __msa_dotp_u_h(vec0, filt_vt);
421
422 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
423 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
424 tmp6 = __msa_dotp_u_h(vec0, filt_vt);
425
426 hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
427 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
428 tmp7 = __msa_dotp_u_h(vec0, filt_vt);
429
430 hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
431 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
432 tmp8 = __msa_dotp_u_h(vec0, filt_vt);
433
434 SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, FILTER_BITS);
435 PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
436 ST8x4_UB(out0, out1, dst, dst_stride);
437 dst += (4 * dst_stride);
438 }
439}
440
/* 2-tap H+V filter for 8-pixel-wide blocks: every height other than 4 goes to
 * the eight-rows-per-iteration kernel; height 4 uses the dedicated 8x4
 * kernel. */
static void common_hv_2ht_2vt_8w_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     int8_t *filter_horiz, int8_t *filter_vert,
                                     int32_t height) {
  if (4 != height) {
    common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
                                  filter_horiz, filter_vert, height);
    return;
  }

  common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
                            filter_vert);
}
453
454static void common_hv_2ht_2vt_16w_msa(const uint8_t *src, int32_t src_stride,
455 uint8_t *dst, int32_t dst_stride,
456 int8_t *filter_horiz, int8_t *filter_vert,
457 int32_t height) {
458 uint32_t loop_cnt;
459 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
460 v16u8 filt_hz, filt_vt, vec0, vec1;
461 v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
462 v8i16 filt;
463
464 mask = LD_SB(&mc_filt_mask_arr[0]);
465
466 /* rearranging filter */
467 filt = LD_SH(filter_horiz);
468 filt_hz = (v16u8)__msa_splati_h(filt, 0);
469
470 filt = LD_SH(filter_vert);
471 filt_vt = (v16u8)__msa_splati_h(filt, 0);
472
473 LD_SB2(src, 8, src0, src1);
474 src += src_stride;
475
476 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
477 hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
478
479 for (loop_cnt = (height >> 2); loop_cnt--;) {
480 LD_SB4(src, src_stride, src0, src2, src4, src6);
481 LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
482 src += (4 * src_stride);
483
484 hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
485 hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
486 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
487 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
488 SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
489 PCKEV_ST_SB(tmp1, tmp2, dst);
490 dst += dst_stride;
491
492 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
493 hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
494 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
495 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
496 SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
497 PCKEV_ST_SB(tmp1, tmp2, dst);
498 dst += dst_stride;
499
500 hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
501 hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
502 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
503 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
504 SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
505 PCKEV_ST_SB(tmp1, tmp2, dst);
506 dst += dst_stride;
507
508 hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
509 hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
510 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
511 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
512 SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
513 PCKEV_ST_SB(tmp1, tmp2, dst);
514 dst += dst_stride;
515 }
516}
517
/* 32-wide 2-tap H+V filter: runs the 16-wide kernel on each of the two
 * side-by-side 16-column strips, left to right. */
static void common_hv_2ht_2vt_32w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz, int8_t *filter_vert,
                                      int32_t height) {
  int32_t col; /* column offset of the current strip */

  for (col = 0; col < 32; col += 16) {
    common_hv_2ht_2vt_16w_msa(src + col, src_stride, dst + col, dst_stride,
                              filter_horiz, filter_vert, height);
  }
}
530
/* 64-wide 2-tap H+V filter: runs the 16-wide kernel on each of the four
 * side-by-side 16-column strips, left to right. */
static void common_hv_2ht_2vt_64w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz, int8_t *filter_vert,
                                      int32_t height) {
  int32_t col; /* column offset of the current strip */

  for (col = 0; col < 64; col += 16) {
    common_hv_2ht_2vt_16w_msa(src + col, src_stride, dst + col, dst_stride,
                              filter_horiz, filter_vert, height);
  }
}
543
/* MSA entry point for 2-D (horizontal + vertical) 8-tap convolution.
 *
 * src, src_stride : source pixels and row pitch.
 * dst, dst_stride : destination pixels and row pitch.
 * filter_x/_y     : eight 16-bit taps per direction.
 * x_step_q4/_q4   : must both be 16 (asserted).
 * w, h            : block width and height.
 *
 * Dispatch:
 *   - taps 0 and 1 zero in BOTH filters: 2-tap kernels, handed a pointer to
 *     tap 3 of the narrowed filters;
 *   - taps 0 and 1 zero in exactly ONE filter: C reference implementation;
 *   - otherwise: 8-tap kernels.
 * Widths other than 4/8/16/32/64 always go to the C reference implementation.
 *
 * A filter with taps[2] == 0 and taps[3] == 128 is rejected by assertion.
 *
 * The taps are inspected as int16_t values.  Reading them through an
 * int32_t pointer, as was done previously, breaks the C effective-type
 * (strict aliasing) rule, and the comparison with 0x800000 depended on byte
 * order.  The explicit comparisons below encode the little-endian meaning of
 * those checks. */
void aom_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                       ptrdiff_t dst_stride, const int16_t *filter_x,
                       int32_t x_step_q4, const int16_t *filter_y,
                       int32_t y_step_q4, int32_t w, int32_t h) {
  /* Eight narrowed taps followed by zero padding.
   * NOTE(review): the kernels fetch the taps with LD_SH / LD_UH, which appear
   * to be whole-vector (16-byte) loads, and the 2-tap path starts at index 3.
   * The padding keeps such a load inside the array -- confirm the load width
   * in the macro header. */
  int8_t filt_hor[8 + 16] = { 0 };
  int8_t filt_ver[8 + 16] = { 0 };
  int x_is_2tap, y_is_2tap;
  int cnt;

  assert(x_step_q4 == 16);
  assert(y_step_q4 == 16);
  assert(!(filter_x[2] == 0 && filter_x[3] == 128));
  assert(!(filter_y[2] == 0 && filter_y[3] == 128));

  /* Narrow the 16-bit taps to the 8-bit form the kernels consume. */
  for (cnt = 0; cnt < 8; ++cnt) {
    filt_hor[cnt] = (int8_t)filter_x[cnt];
    filt_ver[cnt] = (int8_t)filter_y[cnt];
  }

  x_is_2tap = (filter_x[0] == 0 && filter_x[1] == 0);
  y_is_2tap = (filter_y[0] == 0 && filter_y[1] == 0);

  if (x_is_2tap && y_is_2tap) {
    switch (w) {
      case 4:
        common_hv_2ht_2vt_4w_msa(src, (int32_t)src_stride, dst,
                                 (int32_t)dst_stride, &filt_hor[3],
                                 &filt_ver[3], h);
        break;
      case 8:
        common_hv_2ht_2vt_8w_msa(src, (int32_t)src_stride, dst,
                                 (int32_t)dst_stride, &filt_hor[3],
                                 &filt_ver[3], h);
        break;
      case 16:
        common_hv_2ht_2vt_16w_msa(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, &filt_hor[3],
                                  &filt_ver[3], h);
        break;
      case 32:
        common_hv_2ht_2vt_32w_msa(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, &filt_hor[3],
                                  &filt_ver[3], h);
        break;
      case 64:
        common_hv_2ht_2vt_64w_msa(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, &filt_hor[3],
                                  &filt_ver[3], h);
        break;
      default:
        aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
                        filter_y, y_step_q4, w, h);
        break;
    }
  } else if (x_is_2tap || y_is_2tap) {
    /* Mixed 2-tap / 8-tap has no MSA kernel here. */
    aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
                    filter_y, y_step_q4, w, h);
  } else {
    switch (w) {
      case 4:
        common_hv_8ht_8vt_4w_msa(src, (int32_t)src_stride, dst,
                                 (int32_t)dst_stride, filt_hor, filt_ver, h);
        break;
      case 8:
        common_hv_8ht_8vt_8w_msa(src, (int32_t)src_stride, dst,
                                 (int32_t)dst_stride, filt_hor, filt_ver, h);
        break;
      case 16:
        common_hv_8ht_8vt_16w_msa(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, filt_hor, filt_ver, h);
        break;
      case 32:
        common_hv_8ht_8vt_32w_msa(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, filt_hor, filt_ver, h);
        break;
      case 64:
        common_hv_8ht_8vt_64w_msa(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, filt_hor, filt_ver, h);
        break;
      default:
        aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
                        filter_y, y_step_q4, w, h);
        break;
    }
  }
}