blob: c60557617922bb0104aad89fae6fec76a0c58b9c [file] [log] [blame]
Yaowu Xuc27fc142016-08-22 16:08:15 -07001/*
Yaowu Xu9c01aa12016-09-01 14:32:49 -07002 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
Yaowu Xuc27fc142016-08-22 16:08:15 -07003 *
Yaowu Xu9c01aa12016-09-01 14:32:49 -07004 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
Yaowu Xuc27fc142016-08-22 16:08:15 -070010 */
11
12#include <assert.h>
13#include <stdio.h>
14
Yaowu Xuf883b422016-08-30 14:01:10 -070015#include "./aom_dsp_rtcd.h"
Yaowu Xuc27fc142016-08-22 16:08:15 -070016#include "aom_dsp/mips/convolve_common_dspr2.h"
Yaowu Xuf883b422016-08-30 14:01:10 -070017#include "aom_dsp/aom_dsp_common.h"
18#include "aom_dsp/aom_filter.h"
Yaowu Xuc27fc142016-08-22 16:08:15 -070019#include "aom_ports/mem.h"
20
21#if HAVE_DSPR2
/*
 * Horizontal 8-tap convolution producing 4 output pixels per row (MIPS
 * DSPR2 inline assembly).
 *
 * src/dst: first pixel of the source/destination row; src_stride/dst_stride
 * in bytes; h: number of rows. filter_x0: eight signed 16-bit taps, read
 * below as four 32-bit words so each word carries a packed pair of taps for
 * the dpa.w.ph dual multiply-accumulate.
 *
 * vector4a (64) preloads each accumulator as the rounding term; the bit
 * position consumed by "extp" was configured by the caller via "wrdsp"
 * (see aom_convolve8_horiz_dspr2). Results are clamped through the
 * aom_ff_cropTbl lookup (lbux) before being stored.
 *
 * NOTE(review): each row issues unaligned loads (ulw) at src offsets 0, 4
 * and 8, i.e. reads up to 11 bytes past the row start — callers must
 * guarantee that horizontal headroom; confirm against upstream callers.
 */
22static void convolve_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
23 uint8_t *dst, int32_t dst_stride,
24 const int16_t *filter_x0, int32_t h) {
25 int32_t y;
Yaowu Xuf883b422016-08-30 14:01:10 -070026 uint8_t *cm = aom_ff_cropTbl;
Yaowu Xuc27fc142016-08-22 16:08:15 -070027 int32_t vector1b, vector2b, vector3b, vector4b;
28 int32_t Temp1, Temp2, Temp3, Temp4;
29 uint32_t vector4a = 64;
30 uint32_t tp1, tp2;
31 uint32_t p1, p2, p3, p4;
32 uint32_t n1, n2, n3, n4;
33 uint32_t tn1, tn2;
34
/* Each 32-bit read packs two adjacent 16-bit taps for dpa.w.ph. */
35 vector1b = ((const int32_t *)filter_x0)[0];
36 vector2b = ((const int32_t *)filter_x0)[1];
37 vector3b = ((const int32_t *)filter_x0)[2];
38 vector4b = ((const int32_t *)filter_x0)[3];
39
40 for (y = h; y--;) {
41 /* prefetch data to cache memory */
42 prefetch_load(src + src_stride);
43 prefetch_load(src + src_stride + 32);
44 prefetch_store(dst + dst_stride);
45
46 __asm__ __volatile__(
47 "ulw %[tp1], 0(%[src]) \n\t"
48 "ulw %[tp2], 4(%[src]) \n\t"
49
50 /* even 1. pixel */
51 "mtlo %[vector4a], $ac3 \n\t"
52 "mthi $zero, $ac3 \n\t"
53 "preceu.ph.qbr %[p1], %[tp1] \n\t"
54 "preceu.ph.qbl %[p2], %[tp1] \n\t"
55 "preceu.ph.qbr %[p3], %[tp2] \n\t"
56 "preceu.ph.qbl %[p4], %[tp2] \n\t"
57 "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
58 "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
59 "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
60 "ulw %[tn2], 8(%[src]) \n\t"
61 "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
62 "extp %[Temp1], $ac3, 31 \n\t"
63
64 /* even 2. pixel */
65 "mtlo %[vector4a], $ac2 \n\t"
66 "mthi $zero, $ac2 \n\t"
67 "preceu.ph.qbr %[p1], %[tn2] \n\t"
68 "balign %[tn1], %[tn2], 3 \n\t"
69 "balign %[tn2], %[tp2], 3 \n\t"
70 "balign %[tp2], %[tp1], 3 \n\t"
71 "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
72 "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
73 "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
74 "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
75 "extp %[Temp3], $ac2, 31 \n\t"
76
77 /* odd 1. pixel */
78 "lbux %[tp1], %[Temp1](%[cm]) \n\t"
79 "mtlo %[vector4a], $ac3 \n\t"
80 "mthi $zero, $ac3 \n\t"
81 "preceu.ph.qbr %[n1], %[tp2] \n\t"
82 "preceu.ph.qbl %[n2], %[tp2] \n\t"
83 "preceu.ph.qbr %[n3], %[tn2] \n\t"
84 "preceu.ph.qbl %[n4], %[tn2] \n\t"
85 "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
86 "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
87 "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t"
88 "dpa.w.ph $ac3, %[n4], %[vector4b] \n\t"
89 "extp %[Temp2], $ac3, 31 \n\t"
90
91 /* odd 2. pixel */
92 "lbux %[tp2], %[Temp3](%[cm]) \n\t"
93 "mtlo %[vector4a], $ac2 \n\t"
94 "mthi $zero, $ac2 \n\t"
95 "preceu.ph.qbr %[n1], %[tn1] \n\t"
96 "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
97 "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
98 "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t"
99 "dpa.w.ph $ac2, %[n1], %[vector4b] \n\t"
100 "extp %[Temp4], $ac2, 31 \n\t"
101
102 /* clamp */
103 "lbux %[tn1], %[Temp2](%[cm]) \n\t"
104 "lbux %[n2], %[Temp4](%[cm]) \n\t"
105
106 /* store bytes */
107 "sb %[tp1], 0(%[dst]) \n\t"
108 "sb %[tn1], 1(%[dst]) \n\t"
109 "sb %[tp2], 2(%[dst]) \n\t"
110 "sb %[n2], 3(%[dst]) \n\t"
111
112 : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
113 [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
114 [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3),
115 [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
116 [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
117 : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
118 [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
119 [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
120 [src] "r"(src));
121
122 /* Next row... */
123 src += src_stride;
124 dst += dst_stride;
125 }
126}
127
/*
 * Horizontal 8-tap convolution producing 8 output pixels per row (MIPS
 * DSPR2 inline assembly).
 *
 * Same contract as convolve_horiz_4_dspr2 but twice the width: even and
 * odd output pixels are interleaved across three DSP accumulators so the
 * multiply-accumulate, table-clamp (lbux via aom_ff_cropTbl) and byte
 * stores overlap. vector4a (64) is the rounding term preloaded into each
 * accumulator; the "extp" extract position is configured by the caller's
 * "wrdsp" (see aom_convolve8_horiz_dspr2).
 *
 * NOTE(review): unaligned loads (ulw) reach src offsets 0..12, i.e. up to
 * 15 bytes per row — confirm callers provide that headroom.
 */
128static void convolve_horiz_8_dspr2(const uint8_t *src, int32_t src_stride,
129 uint8_t *dst, int32_t dst_stride,
130 const int16_t *filter_x0, int32_t h) {
131 int32_t y;
Yaowu Xuf883b422016-08-30 14:01:10 -0700132 uint8_t *cm = aom_ff_cropTbl;
Yaowu Xuc27fc142016-08-22 16:08:15 -0700133 uint32_t vector4a = 64;
134 int32_t vector1b, vector2b, vector3b, vector4b;
135 int32_t Temp1, Temp2, Temp3;
136 uint32_t tp1, tp2;
137 uint32_t p1, p2, p3, p4, n1;
138 uint32_t tn1, tn2, tn3;
139 uint32_t st0, st1;
140
/* Each 32-bit read packs two adjacent 16-bit taps for dpa.w.ph. */
141 vector1b = ((const int32_t *)filter_x0)[0];
142 vector2b = ((const int32_t *)filter_x0)[1];
143 vector3b = ((const int32_t *)filter_x0)[2];
144 vector4b = ((const int32_t *)filter_x0)[3];
145
146 for (y = h; y--;) {
147 /* prefetch data to cache memory */
148 prefetch_load(src + src_stride);
149 prefetch_load(src + src_stride + 32);
150 prefetch_store(dst + dst_stride);
151
152 __asm__ __volatile__(
153 "ulw %[tp1], 0(%[src]) \n\t"
154 "ulw %[tp2], 4(%[src]) \n\t"
155
156 /* even 1. pixel */
157 "mtlo %[vector4a], $ac3 \n\t"
158 "mthi $zero, $ac3 \n\t"
159 "mtlo %[vector4a], $ac2 \n\t"
160 "mthi $zero, $ac2 \n\t"
161 "preceu.ph.qbr %[p1], %[tp1] \n\t"
162 "preceu.ph.qbl %[p2], %[tp1] \n\t"
163 "preceu.ph.qbr %[p3], %[tp2] \n\t"
164 "preceu.ph.qbl %[p4], %[tp2] \n\t"
165 "ulw %[tn2], 8(%[src]) \n\t"
166 "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
167 "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
168 "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
169 "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
170 "extp %[Temp1], $ac3, 31 \n\t"
171
172 /* even 2. pixel */
173 "preceu.ph.qbr %[p1], %[tn2] \n\t"
174 "preceu.ph.qbl %[n1], %[tn2] \n\t"
175 "ulw %[tn1], 12(%[src]) \n\t"
176 "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
177 "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
178 "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
179 "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
180 "extp %[Temp3], $ac2, 31 \n\t"
181
182 /* even 3. pixel */
183 "lbux %[st0], %[Temp1](%[cm]) \n\t"
184 "mtlo %[vector4a], $ac1 \n\t"
185 "mthi $zero, $ac1 \n\t"
186 "preceu.ph.qbr %[p2], %[tn1] \n\t"
187 "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t"
188 "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t"
189 "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t"
190 "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t"
191 "extp %[Temp1], $ac1, 31 \n\t"
192
193 /* even 4. pixel */
194 "mtlo %[vector4a], $ac2 \n\t"
195 "mthi $zero, $ac2 \n\t"
196 "mtlo %[vector4a], $ac3 \n\t"
197 "mthi $zero, $ac3 \n\t"
198 "sb %[st0], 0(%[dst]) \n\t"
199 "lbux %[st1], %[Temp3](%[cm]) \n\t"
200
201 "balign %[tn3], %[tn1], 3 \n\t"
202 "balign %[tn1], %[tn2], 3 \n\t"
203 "balign %[tn2], %[tp2], 3 \n\t"
204 "balign %[tp2], %[tp1], 3 \n\t"
205
206 "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
207 "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
208 "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
209 "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
210 "extp %[Temp3], $ac2, 31 \n\t"
211
212 "lbux %[st0], %[Temp1](%[cm]) \n\t"
213
214 /* odd 1. pixel */
215 "mtlo %[vector4a], $ac1 \n\t"
216 "mthi $zero, $ac1 \n\t"
217 "sb %[st1], 2(%[dst]) \n\t"
218 "preceu.ph.qbr %[p1], %[tp2] \n\t"
219 "preceu.ph.qbl %[p2], %[tp2] \n\t"
220 "preceu.ph.qbr %[p3], %[tn2] \n\t"
221 "preceu.ph.qbl %[p4], %[tn2] \n\t"
222 "sb %[st0], 4(%[dst]) \n\t"
223 "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
224 "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
225 "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
226 "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
227 "extp %[Temp2], $ac3, 31 \n\t"
228
229 /* odd 2. pixel */
230 "mtlo %[vector4a], $ac3 \n\t"
231 "mthi $zero, $ac3 \n\t"
232 "mtlo %[vector4a], $ac2 \n\t"
233 "mthi $zero, $ac2 \n\t"
234 "preceu.ph.qbr %[p1], %[tn1] \n\t"
235 "preceu.ph.qbl %[n1], %[tn1] \n\t"
236 "lbux %[st0], %[Temp3](%[cm]) \n\t"
237 "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t"
238 "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t"
239 "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t"
240 "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t"
241 "extp %[Temp3], $ac1, 31 \n\t"
242
243 /* odd 3. pixel */
244 "lbux %[st1], %[Temp2](%[cm]) \n\t"
245 "preceu.ph.qbr %[p2], %[tn3] \n\t"
246 "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t"
247 "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t"
248 "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t"
249 "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t"
250 "extp %[Temp2], $ac3, 31 \n\t"
251
252 /* odd 4. pixel */
253 "sb %[st1], 1(%[dst]) \n\t"
254 "sb %[st0], 6(%[dst]) \n\t"
255 "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
256 "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
257 "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
258 "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
259 "extp %[Temp1], $ac2, 31 \n\t"
260
261 /* clamp */
262 "lbux %[p4], %[Temp3](%[cm]) \n\t"
263 "lbux %[p2], %[Temp2](%[cm]) \n\t"
264 "lbux %[n1], %[Temp1](%[cm]) \n\t"
265
266 /* store bytes */
267 "sb %[p4], 3(%[dst]) \n\t"
268 "sb %[p2], 5(%[dst]) \n\t"
269 "sb %[n1], 7(%[dst]) \n\t"
270
271 : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
272 [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0),
273 [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
274 [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1),
275 [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
276 : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
277 [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
278 [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
279 [src] "r"(src));
280
281 /* Next row... */
282 src += src_stride;
283 dst += dst_stride;
284 }
285}
286
/*
 * Horizontal 8-tap convolution producing 16 * count output pixels per row
 * (MIPS DSPR2 inline assembly). Used for w == 16 (count = 1) and w == 32
 * (count = 2) by the dispatcher.
 *
 * src_ptr/dst_ptr: row starts; strides in bytes; h: row count. filter_x0:
 * eight signed 16-bit taps read as four packed 32-bit words (filter12 =
 * taps 0-1, filter34 = taps 2-3, etc.) for dpa.w.ph.
 *
 * The inner asm is software-pipelined: eight even pixels are computed from
 * word loads at src offsets 0..20, then eight odd pixels from the same data
 * re-loaded at offsets 1..21; three DSP accumulators rotate so each
 * extract/clamp/store overlaps the next multiply-accumulate. vector_64 is
 * the rounding term; the "extp" position comes from the caller's "wrdsp".
 *
 * NOTE(review): reads up to 25 bytes per 16-pixel chunk (ulw at offset 21)
 * — confirm callers provide that horizontal headroom.
 */
287static void convolve_horiz_16_dspr2(const uint8_t *src_ptr, int32_t src_stride,
288 uint8_t *dst_ptr, int32_t dst_stride,
289 const int16_t *filter_x0, int32_t h,
290 int32_t count) {
291 int32_t y, c;
292 const uint8_t *src;
293 uint8_t *dst;
Yaowu Xuf883b422016-08-30 14:01:10 -0700294 uint8_t *cm = aom_ff_cropTbl;
Yaowu Xuc27fc142016-08-22 16:08:15 -0700295 uint32_t vector_64 = 64;
296 int32_t filter12, filter34, filter56, filter78;
297 int32_t Temp1, Temp2, Temp3;
298 uint32_t qload1, qload2, qload3;
299 uint32_t p1, p2, p3, p4, p5;
300 uint32_t st1, st2, st3;
301
302 filter12 = ((const int32_t *)filter_x0)[0];
303 filter34 = ((const int32_t *)filter_x0)[1];
304 filter56 = ((const int32_t *)filter_x0)[2];
305 filter78 = ((const int32_t *)filter_x0)[3];
306
307 for (y = h; y--;) {
308 src = src_ptr;
309 dst = dst_ptr;
310
311 /* prefetch data to cache memory */
312 prefetch_load(src_ptr + src_stride);
313 prefetch_load(src_ptr + src_stride + 32);
314 prefetch_store(dst_ptr + dst_stride);
315
316 for (c = 0; c < count; c++) {
317 __asm__ __volatile__(
318 "ulw %[qload1], 0(%[src]) \n\t"
319 "ulw %[qload2], 4(%[src]) \n\t"
320
321 /* even 1. pixel */
322 "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
323 "mthi $zero, $ac1 \n\t"
324 "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
325 "mthi $zero, $ac2 \n\t"
326 "preceu.ph.qbr %[p1], %[qload1] \n\t"
327 "preceu.ph.qbl %[p2], %[qload1] \n\t"
328 "preceu.ph.qbr %[p3], %[qload2] \n\t"
329 "preceu.ph.qbl %[p4], %[qload2] \n\t"
330 "ulw %[qload3], 8(%[src]) \n\t"
331 "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */
332 "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */
333 "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */
334 "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */
335 "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
336
337 /* even 2. pixel */
338 "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
339 "mthi $zero, $ac3 \n\t"
340 "preceu.ph.qbr %[p1], %[qload3] \n\t"
341 "preceu.ph.qbl %[p5], %[qload3] \n\t"
342 "ulw %[qload1], 12(%[src]) \n\t"
343 "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */
344 "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */
345 "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */
346 "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */
347 "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
348 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
349
350 /* even 3. pixel */
351 "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
352 "mthi $zero, $ac1 \n\t"
353 "preceu.ph.qbr %[p2], %[qload1] \n\t"
354 "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
355 "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */
356 "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */
357 "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */
358 "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */
359 "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
360 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
361
362 /* even 4. pixel */
363 "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
364 "mthi $zero, $ac2 \n\t"
365 "preceu.ph.qbl %[p3], %[qload1] \n\t"
366 "sb %[st2], 2(%[dst]) \n\t" /* even 1 */
367 "ulw %[qload2], 16(%[src]) \n\t"
368 "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */
369 "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */
370 "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */
371 "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */
372 "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
373 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
374
375 /* even 5. pixel */
376 "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
377 "mthi $zero, $ac3 \n\t"
378 "preceu.ph.qbr %[p4], %[qload2] \n\t"
379 "sb %[st3], 4(%[dst]) \n\t" /* even 3 */
380 "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */
381 "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */
382 "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */
383 "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */
384 "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
385 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
386
387 /* even 6. pixel */
388 "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
389 "mthi $zero, $ac1 \n\t"
390 "preceu.ph.qbl %[p1], %[qload2] \n\t"
391 "sb %[st1], 6(%[dst]) \n\t" /* even 4 */
392 "ulw %[qload3], 20(%[src]) \n\t"
393 "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */
394 "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */
395 "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */
396 "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */
397 "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
398 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
399
400 /* even 7. pixel */
401 "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
402 "mthi $zero, $ac2 \n\t"
403 "preceu.ph.qbr %[p5], %[qload3] \n\t"
404 "sb %[st2], 8(%[dst]) \n\t" /* even 5 */
405 "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */
406 "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */
407 "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */
408 "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */
409 "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
410 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
411
412 /* even 8. pixel */
413 "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
414 "mthi $zero, $ac3 \n\t"
415 "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */
416 "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */
417 "sb %[st3], 10(%[dst]) \n\t" /* even 6 */
418 "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */
419 "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */
420 "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
421 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
422
423 /* ODD pixels */
424 "ulw %[qload1], 1(%[src]) \n\t"
425 "ulw %[qload2], 5(%[src]) \n\t"
426
427 /* odd 1. pixel */
428 "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
429 "mthi $zero, $ac1 \n\t"
430 "preceu.ph.qbr %[p1], %[qload1] \n\t"
431 "preceu.ph.qbl %[p2], %[qload1] \n\t"
432 "preceu.ph.qbr %[p3], %[qload2] \n\t"
433 "preceu.ph.qbl %[p4], %[qload2] \n\t"
434 "sb %[st1], 12(%[dst]) \n\t" /* even 7 */
435 "ulw %[qload3], 9(%[src]) \n\t"
436 "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */
437 "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */
438 "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */
439 "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */
440 "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
441 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
442
443 /* odd 2. pixel */
444 "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
445 "mthi $zero, $ac2 \n\t"
446 "preceu.ph.qbr %[p1], %[qload3] \n\t"
447 "preceu.ph.qbl %[p5], %[qload3] \n\t"
448 "sb %[st2], 14(%[dst]) \n\t" /* even 8 */
449 "ulw %[qload1], 13(%[src]) \n\t"
450 "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */
451 "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */
452 "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */
453 "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */
454 "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
455 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
456
457 /* odd 3. pixel */
458 "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
459 "mthi $zero, $ac3 \n\t"
460 "preceu.ph.qbr %[p2], %[qload1] \n\t"
461 "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */
462 "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */
463 "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */
464 "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */
465 "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */
466 "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
467 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
468
469 /* odd 4. pixel */
470 "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
471 "mthi $zero, $ac1 \n\t"
472 "preceu.ph.qbl %[p3], %[qload1] \n\t"
473 "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */
474 "ulw %[qload2], 17(%[src]) \n\t"
475 "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */
476 "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */
477 "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */
478 "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */
479 "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
480 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
481
482 /* odd 5. pixel */
483 "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
484 "mthi $zero, $ac2 \n\t"
485 "preceu.ph.qbr %[p4], %[qload2] \n\t"
486 "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */
487 "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */
488 "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */
489 "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */
490 "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */
491 "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
492 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
493
494 /* odd 6. pixel */
495 "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
496 "mthi $zero, $ac3 \n\t"
497 "preceu.ph.qbl %[p1], %[qload2] \n\t"
498 "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */
499 "ulw %[qload3], 21(%[src]) \n\t"
500 "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */
501 "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */
502 "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */
503 "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */
504 "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
505 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
506
507 /* odd 7. pixel */
508 "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
509 "mthi $zero, $ac1 \n\t"
510 "preceu.ph.qbr %[p5], %[qload3] \n\t"
511 "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */
512 "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */
513 "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */
514 "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */
515 "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */
516 "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
517
518 /* odd 8. pixel */
519 "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */
520 "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */
521 "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */
522 "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */
523 "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
524
525 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
526 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
527 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
528
529 "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */
530 "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
531 "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
532
533 : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
534 [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
535 [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
536 [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
537 [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
538 : [filter12] "r"(filter12), [filter34] "r"(filter34),
539 [filter56] "r"(filter56), [filter78] "r"(filter78),
540 [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
541 [src] "r"(src));
542
543 src += 16;
544 dst += 16;
545 }
546
547 /* Next row... */
548 src_ptr += src_stride;
549 dst_ptr += dst_stride;
550 }
551}
552
/*
 * Horizontal 8-tap convolution producing 64 output pixels per row (MIPS
 * DSPR2 inline assembly).
 *
 * Structurally identical to convolve_horiz_16_dspr2 with the chunk count
 * fixed at 4 (4 x 16 pixels) and wider prefetching (two 32-byte prefetch
 * lines per row for both source and destination). See that function for
 * the pipeline description: eight even pixels from word loads at offsets
 * 0..20, eight odd pixels from offsets 1..21, results clamped through
 * aom_ff_cropTbl; the "extp" extract position is set by the caller's
 * "wrdsp".
 *
 * NOTE(review): kept as a separate copy rather than calling
 * convolve_horiz_16_dspr2(..., count = 4), presumably for the extra
 * prefetches — consider unifying if profiling allows.
 */
553static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, int32_t src_stride,
554 uint8_t *dst_ptr, int32_t dst_stride,
555 const int16_t *filter_x0, int32_t h) {
556 int32_t y, c;
557 const uint8_t *src;
558 uint8_t *dst;
Yaowu Xuf883b422016-08-30 14:01:10 -0700559 uint8_t *cm = aom_ff_cropTbl;
Yaowu Xuc27fc142016-08-22 16:08:15 -0700560 uint32_t vector_64 = 64;
561 int32_t filter12, filter34, filter56, filter78;
562 int32_t Temp1, Temp2, Temp3;
563 uint32_t qload1, qload2, qload3;
564 uint32_t p1, p2, p3, p4, p5;
565 uint32_t st1, st2, st3;
566
567 filter12 = ((const int32_t *)filter_x0)[0];
568 filter34 = ((const int32_t *)filter_x0)[1];
569 filter56 = ((const int32_t *)filter_x0)[2];
570 filter78 = ((const int32_t *)filter_x0)[3];
571
572 for (y = h; y--;) {
573 src = src_ptr;
574 dst = dst_ptr;
575
576 /* prefetch data to cache memory */
577 prefetch_load(src_ptr + src_stride);
578 prefetch_load(src_ptr + src_stride + 32);
579 prefetch_load(src_ptr + src_stride + 64);
580 prefetch_store(dst_ptr + dst_stride);
581 prefetch_store(dst_ptr + dst_stride + 32);
582
583 for (c = 0; c < 4; c++) {
584 __asm__ __volatile__(
585 "ulw %[qload1], 0(%[src]) \n\t"
586 "ulw %[qload2], 4(%[src]) \n\t"
587
588 /* even 1. pixel */
589 "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
590 "mthi $zero, $ac1 \n\t"
591 "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
592 "mthi $zero, $ac2 \n\t"
593 "preceu.ph.qbr %[p1], %[qload1] \n\t"
594 "preceu.ph.qbl %[p2], %[qload1] \n\t"
595 "preceu.ph.qbr %[p3], %[qload2] \n\t"
596 "preceu.ph.qbl %[p4], %[qload2] \n\t"
597 "ulw %[qload3], 8(%[src]) \n\t"
598 "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */
599 "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */
600 "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */
601 "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */
602 "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
603
604 /* even 2. pixel */
605 "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
606 "mthi $zero, $ac3 \n\t"
607 "preceu.ph.qbr %[p1], %[qload3] \n\t"
608 "preceu.ph.qbl %[p5], %[qload3] \n\t"
609 "ulw %[qload1], 12(%[src]) \n\t"
610 "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */
611 "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */
612 "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */
613 "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */
614 "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
615 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
616
617 /* even 3. pixel */
618 "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
619 "mthi $zero, $ac1 \n\t"
620 "preceu.ph.qbr %[p2], %[qload1] \n\t"
621 "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
622 "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */
623 "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */
624 "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */
625 "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */
626 "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
627 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
628
629 /* even 4. pixel */
630 "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
631 "mthi $zero, $ac2 \n\t"
632 "preceu.ph.qbl %[p3], %[qload1] \n\t"
633 "sb %[st2], 2(%[dst]) \n\t" /* even 1 */
634 "ulw %[qload2], 16(%[src]) \n\t"
635 "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */
636 "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */
637 "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */
638 "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */
639 "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
640 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
641
642 /* even 5. pixel */
643 "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
644 "mthi $zero, $ac3 \n\t"
645 "preceu.ph.qbr %[p4], %[qload2] \n\t"
646 "sb %[st3], 4(%[dst]) \n\t" /* even 3 */
647 "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */
648 "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */
649 "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */
650 "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */
651 "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
652 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
653
654 /* even 6. pixel */
655 "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
656 "mthi $zero, $ac1 \n\t"
657 "preceu.ph.qbl %[p1], %[qload2] \n\t"
658 "sb %[st1], 6(%[dst]) \n\t" /* even 4 */
659 "ulw %[qload3], 20(%[src]) \n\t"
660 "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */
661 "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */
662 "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */
663 "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */
664 "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
665 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
666
667 /* even 7. pixel */
668 "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
669 "mthi $zero, $ac2 \n\t"
670 "preceu.ph.qbr %[p5], %[qload3] \n\t"
671 "sb %[st2], 8(%[dst]) \n\t" /* even 5 */
672 "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */
673 "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */
674 "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */
675 "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */
676 "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
677 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
678
679 /* even 8. pixel */
680 "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
681 "mthi $zero, $ac3 \n\t"
682 "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */
683 "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */
684 "sb %[st3], 10(%[dst]) \n\t" /* even 6 */
685 "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */
686 "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */
687 "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
688 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
689
690 /* ODD pixels */
691 "ulw %[qload1], 1(%[src]) \n\t"
692 "ulw %[qload2], 5(%[src]) \n\t"
693
694 /* odd 1. pixel */
695 "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
696 "mthi $zero, $ac1 \n\t"
697 "preceu.ph.qbr %[p1], %[qload1] \n\t"
698 "preceu.ph.qbl %[p2], %[qload1] \n\t"
699 "preceu.ph.qbr %[p3], %[qload2] \n\t"
700 "preceu.ph.qbl %[p4], %[qload2] \n\t"
701 "sb %[st1], 12(%[dst]) \n\t" /* even 7 */
702 "ulw %[qload3], 9(%[src]) \n\t"
703 "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */
704 "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */
705 "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */
706 "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */
707 "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
708 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
709
710 /* odd 2. pixel */
711 "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
712 "mthi $zero, $ac2 \n\t"
713 "preceu.ph.qbr %[p1], %[qload3] \n\t"
714 "preceu.ph.qbl %[p5], %[qload3] \n\t"
715 "sb %[st2], 14(%[dst]) \n\t" /* even 8 */
716 "ulw %[qload1], 13(%[src]) \n\t"
717 "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */
718 "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */
719 "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */
720 "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */
721 "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
722 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
723
724 /* odd 3. pixel */
725 "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
726 "mthi $zero, $ac3 \n\t"
727 "preceu.ph.qbr %[p2], %[qload1] \n\t"
728 "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */
729 "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */
730 "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */
731 "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */
732 "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */
733 "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
734 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
735
736 /* odd 4. pixel */
737 "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
738 "mthi $zero, $ac1 \n\t"
739 "preceu.ph.qbl %[p3], %[qload1] \n\t"
740 "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */
741 "ulw %[qload2], 17(%[src]) \n\t"
742 "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */
743 "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */
744 "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */
745 "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */
746 "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
747 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
748
749 /* odd 5. pixel */
750 "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
751 "mthi $zero, $ac2 \n\t"
752 "preceu.ph.qbr %[p4], %[qload2] \n\t"
753 "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */
754 "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */
755 "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */
756 "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */
757 "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */
758 "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
759 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
760
761 /* odd 6. pixel */
762 "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
763 "mthi $zero, $ac3 \n\t"
764 "preceu.ph.qbl %[p1], %[qload2] \n\t"
765 "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */
766 "ulw %[qload3], 21(%[src]) \n\t"
767 "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */
768 "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */
769 "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */
770 "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */
771 "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
772 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
773
774 /* odd 7. pixel */
775 "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
776 "mthi $zero, $ac1 \n\t"
777 "preceu.ph.qbr %[p5], %[qload3] \n\t"
778 "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */
779 "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */
780 "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */
781 "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */
782 "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */
783 "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
784
785 /* odd 8. pixel */
786 "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */
787 "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */
788 "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */
789 "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */
790 "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
791
792 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
793 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
794 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
795
796 "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */
797 "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
798 "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
799
800 : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
801 [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
802 [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
803 [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
804 [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
805 : [filter12] "r"(filter12), [filter34] "r"(filter34),
806 [filter56] "r"(filter56), [filter78] "r"(filter78),
807 [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
808 [src] "r"(src));
809
810 src += 16;
811 dst += 16;
812 }
813
814 /* Next row... */
815 src_ptr += src_stride;
816 dst_ptr += dst_stride;
817 }
818}
819
/*
 * Public entry point: horizontal 8-tap convolution (DSPR2 dispatcher).
 *
 * Requires x_step_q4 == 16 (no sub-pixel stepping) and a filter whose
 * taps 2-3 are not the packed value 0x800000 (asserted below). If the
 * first two taps are zero, the work is routed to the 2-tap fast path;
 * otherwise the width-specialized DSPR2 kernels above are used, with a
 * C fallback for unsupported widths.
 */
void aom_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int x_step_q4,
                               const int16_t *filter_y, int y_step_q4, int w,
                               int h) {
  uint32_t extract_pos = 38;

  assert(x_step_q4 == 16);
  assert(((const int32_t *)filter_x)[1] != 0x800000);

  if (((const int32_t *)filter_x)[0] == 0) {
    /* First two taps are zero: delegate to the 2-tap kernel. */
    aom_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x,
                              x_step_q4, filter_y, y_step_q4, w, h);
    return;
  }

  prefetch_load((const uint8_t *)filter_x);
  src -= 3; /* back up to the first tap of the 8-tap window */

  /* Program the bit position the DSPR2 "extp" instruction extracts from. */
  __asm__ __volatile__("wrdsp %[extract_pos], 1 \n\t"
                       :
                       : [extract_pos] "r"(extract_pos));

  /* Warm the cache for the first row. */
  prefetch_load(src);
  prefetch_load(src + 32);
  prefetch_store(dst);

  if (w == 4) {
    convolve_horiz_4_dspr2(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                           filter_x, (int32_t)h);
  } else if (w == 8) {
    convolve_horiz_8_dspr2(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                           filter_x, (int32_t)h);
  } else if (w == 16 || w == 32) {
    /* The 16-wide kernel handles w == 32 by running two 16-pixel chunks. */
    convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                            filter_x, (int32_t)h, (w == 16) ? 1 : 2);
  } else if (w == 64) {
    prefetch_load(src + 64);
    prefetch_store(dst + 32);

    convolve_horiz_64_dspr2(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                            filter_x, (int32_t)h);
  } else {
    /* Unsupported width: undo the src adjustment and use the C version. */
    aom_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter_x,
                          x_step_q4, filter_y, y_step_q4, w, h);
  }
}
878#endif