/*
 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */
10
#include <assert.h>
#include <stdio.h>

#include "./aom_dsp_rtcd.h"
#include "aom_dsp/mips/convolve_common_dspr2.h"
#include "aom_dsp/aom_convolve.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_ports/mem.h"
19
20#if HAVE_DSPR2
21static void convolve_bi_vert_4_dspr2(const uint8_t *src, int32_t src_stride,
22 uint8_t *dst, int32_t dst_stride,
23 const int16_t *filter_y, int32_t w,
24 int32_t h) {
25 int32_t x, y;
26 const uint8_t *src_ptr;
27 uint8_t *dst_ptr;
Yaowu Xuf883b422016-08-30 14:01:10 -070028 uint8_t *cm = aom_ff_cropTbl;
Yaowu Xuc27fc142016-08-22 16:08:15 -070029 uint32_t vector4a = 64;
30 uint32_t load1, load2;
31 uint32_t p1, p2;
32 uint32_t scratch1;
33 uint32_t store1, store2;
34 int32_t Temp1, Temp2;
35 const int16_t *filter = &filter_y[3];
36 uint32_t filter45;
37
38 filter45 = ((const int32_t *)filter)[0];
39
40 for (y = h; y--;) {
41 /* prefetch data to cache memory */
42 prefetch_store(dst + dst_stride);
43
44 for (x = 0; x < w; x += 4) {
45 src_ptr = src + x;
46 dst_ptr = dst + x;
47
48 __asm__ __volatile__(
49 "ulw %[load1], 0(%[src_ptr]) \n\t"
50 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
51 "ulw %[load2], 0(%[src_ptr]) \n\t"
52
53 "mtlo %[vector4a], $ac0 \n\t"
54 "mtlo %[vector4a], $ac1 \n\t"
55 "mtlo %[vector4a], $ac2 \n\t"
56 "mtlo %[vector4a], $ac3 \n\t"
57 "mthi $zero, $ac0 \n\t"
58 "mthi $zero, $ac1 \n\t"
59 "mthi $zero, $ac2 \n\t"
60 "mthi $zero, $ac3 \n\t"
61
62 "preceu.ph.qbr %[scratch1], %[load1] \n\t"
63 "preceu.ph.qbr %[p1], %[load2] \n\t"
64
65 "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
66 "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
67
68 "dpa.w.ph $ac0, %[p1], %[filter45] \n\t"
69 "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
70
71 "preceu.ph.qbl %[scratch1], %[load1] \n\t"
72 "preceu.ph.qbl %[p1], %[load2] \n\t"
73
74 "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
75 "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
76
77 "dpa.w.ph $ac2, %[p1], %[filter45] \n\t"
78 "dpa.w.ph $ac3, %[p2], %[filter45] \n\t"
79
80 "extp %[Temp1], $ac0, 31 \n\t"
81 "extp %[Temp2], $ac1, 31 \n\t"
82
83 "lbux %[store1], %[Temp1](%[cm]) \n\t"
84 "extp %[Temp1], $ac2, 31 \n\t"
85
86 "lbux %[store2], %[Temp2](%[cm]) \n\t"
87 "extp %[Temp2], $ac3, 31 \n\t"
88
89 "sb %[store1], 0(%[dst_ptr]) \n\t"
90 "sb %[store2], 1(%[dst_ptr]) \n\t"
91
92 "lbux %[store1], %[Temp1](%[cm]) \n\t"
93 "lbux %[store2], %[Temp2](%[cm]) \n\t"
94
95 "sb %[store1], 2(%[dst_ptr]) \n\t"
96 "sb %[store2], 3(%[dst_ptr]) \n\t"
97
98 : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1),
99 [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), [Temp1] "=&r"(Temp1),
100 [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
101 [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
102 : [filter45] "r"(filter45), [vector4a] "r"(vector4a),
103 [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
104 }
105
106 /* Next row... */
107 src += src_stride;
108 dst += dst_stride;
109 }
110}
111
112static void convolve_bi_vert_64_dspr2(const uint8_t *src, int32_t src_stride,
113 uint8_t *dst, int32_t dst_stride,
114 const int16_t *filter_y, int32_t h) {
115 int32_t x, y;
116 const uint8_t *src_ptr;
117 uint8_t *dst_ptr;
Yaowu Xuf883b422016-08-30 14:01:10 -0700118 uint8_t *cm = aom_ff_cropTbl;
Yaowu Xuc27fc142016-08-22 16:08:15 -0700119 uint32_t vector4a = 64;
120 uint32_t load1, load2;
121 uint32_t p1, p2;
122 uint32_t scratch1;
123 uint32_t store1, store2;
124 int32_t Temp1, Temp2;
125 const int16_t *filter = &filter_y[3];
126 uint32_t filter45;
127
128 filter45 = ((const int32_t *)filter)[0];
129
130 for (y = h; y--;) {
131 /* prefetch data to cache memory */
132 prefetch_store(dst + dst_stride);
133
134 for (x = 0; x < 64; x += 4) {
135 src_ptr = src + x;
136 dst_ptr = dst + x;
137
138 __asm__ __volatile__(
139 "ulw %[load1], 0(%[src_ptr]) \n\t"
140 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
141 "ulw %[load2], 0(%[src_ptr]) \n\t"
142
143 "mtlo %[vector4a], $ac0 \n\t"
144 "mtlo %[vector4a], $ac1 \n\t"
145 "mtlo %[vector4a], $ac2 \n\t"
146 "mtlo %[vector4a], $ac3 \n\t"
147 "mthi $zero, $ac0 \n\t"
148 "mthi $zero, $ac1 \n\t"
149 "mthi $zero, $ac2 \n\t"
150 "mthi $zero, $ac3 \n\t"
151
152 "preceu.ph.qbr %[scratch1], %[load1] \n\t"
153 "preceu.ph.qbr %[p1], %[load2] \n\t"
154
155 "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
156 "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
157
158 "dpa.w.ph $ac0, %[p1], %[filter45] \n\t"
159 "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
160
161 "preceu.ph.qbl %[scratch1], %[load1] \n\t"
162 "preceu.ph.qbl %[p1], %[load2] \n\t"
163
164 "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
165 "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
166
167 "dpa.w.ph $ac2, %[p1], %[filter45] \n\t"
168 "dpa.w.ph $ac3, %[p2], %[filter45] \n\t"
169
170 "extp %[Temp1], $ac0, 31 \n\t"
171 "extp %[Temp2], $ac1, 31 \n\t"
172
173 "lbux %[store1], %[Temp1](%[cm]) \n\t"
174 "extp %[Temp1], $ac2, 31 \n\t"
175
176 "lbux %[store2], %[Temp2](%[cm]) \n\t"
177 "extp %[Temp2], $ac3, 31 \n\t"
178
179 "sb %[store1], 0(%[dst_ptr]) \n\t"
180 "sb %[store2], 1(%[dst_ptr]) \n\t"
181
182 "lbux %[store1], %[Temp1](%[cm]) \n\t"
183 "lbux %[store2], %[Temp2](%[cm]) \n\t"
184
185 "sb %[store1], 2(%[dst_ptr]) \n\t"
186 "sb %[store2], 3(%[dst_ptr]) \n\t"
187
188 : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1),
189 [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), [Temp1] "=&r"(Temp1),
190 [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
191 [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
192 : [filter45] "r"(filter45), [vector4a] "r"(vector4a),
193 [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
194 }
195
196 /* Next row... */
197 src += src_stride;
198 dst += dst_stride;
199 }
200}
201
Yaowu Xuf883b422016-08-30 14:01:10 -0700202void aom_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
Yaowu Xuc27fc142016-08-22 16:08:15 -0700203 uint8_t *dst, ptrdiff_t dst_stride,
204 const int16_t *filter_x, int x_step_q4,
205 const int16_t *filter_y, int y_step_q4, int w,
206 int h) {
207 uint32_t pos = 38;
208
209 assert(y_step_q4 == 16);
210
211 /* bit positon for extract from acc */
212 __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
213 :
214 : [pos] "r"(pos));
215
216 prefetch_store(dst);
217
218 switch (w) {
219 case 4:
220 case 8:
221 case 16:
222 case 32:
223 convolve_bi_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w,
224 h);
225 break;
226 case 64:
227 prefetch_store(dst + 32);
228 convolve_bi_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h);
229 break;
230 default:
Yaowu Xuf883b422016-08-30 14:01:10 -0700231 aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
Yaowu Xuc27fc142016-08-22 16:08:15 -0700232 x_step_q4, filter_y, y_step_q4, w, h);
233 break;
234 }
235}
236#endif