blob: 80c5df757f7f5ef02d54faaf38b6f415b4561925 [file] [log] [blame]
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11
12#include <assert.h>
13#include <stdio.h>
14
Yaowu Xuf883b422016-08-30 14:01:10 -070015#include "./aom_dsp_rtcd.h"
Yaowu Xuc27fc142016-08-22 16:08:15 -070016#include "aom_dsp/mips/convolve_common_dspr2.h"
Yaowu Xuf883b422016-08-30 14:01:10 -070017#include "aom_dsp/aom_dsp_common.h"
18#include "aom_dsp/aom_filter.h"
Yaowu Xuc27fc142016-08-22 16:08:15 -070019#include "aom_ports/mem.h"
20
#if HAVE_DSPR2
/*
 * Copy a w x h block of 8-bit pixels from src to dst (MIPS DSPr2 path).
 *
 * This is the "copy" variant of the aom_convolve_* API: the filter
 * arguments are present only to match the common convolve prototype and
 * are ignored, since a pure copy applies no filtering.
 *
 * src, src_stride: source pixels and byte stride between source rows.
 * dst, dst_stride: destination pixels and byte stride between dest rows.
 * filter_x/filter_y (+ strides): unused, see above.
 * w, h:            block width and height in pixels.
 *
 * Widths 4/8/16/32/64 use inline assembly that moves whole 32-bit words:
 * "ulw" is an unaligned word load, so src may have any alignment.
 * NOTE(review): "sw" requires a word-aligned address, so dst is presumably
 * 4-byte aligned in the specialized cases -- confirm with callers.
 * NOTE(review): the asm blocks write *dst but declare no "memory" clobber;
 * ordering relies on __volatile__ -- this matches upstream libaom style.
 * Any other width falls through to a plain per-byte copy loop.
 */
void aom_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int filter_x_stride,
                             const int16_t *filter_y, int filter_y_stride,
                             int w, int h) {
  int x, y;

  /* Unused: present only to match the convolve function signature. */
  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;

  /* prefetch data to cache memory */
  prefetch_load(src);
  prefetch_load(src + 32);
  prefetch_store(dst);

  switch (w) {
    case 4: {
      uint32_t tp1;

      /* 1 word storage */
      for (y = h; y--;) {
        /* Prefetch the next row while the current one is copied. */
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw %[tp1], (%[src]) \n\t"
            "sw %[tp1], (%[dst]) \n\t" /* store */

            : [tp1] "=&r"(tp1)
            : [src] "r"(src), [dst] "r"(dst));

        src += src_stride;
        dst += dst_stride;
      }
    } break;
    case 8: {
      uint32_t tp1, tp2;

      /* 2 word storage */
      for (y = h; y--;) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw %[tp1], 0(%[src]) \n\t"
            "ulw %[tp2], 4(%[src]) \n\t"
            "sw %[tp1], 0(%[dst]) \n\t" /* store */
            "sw %[tp2], 4(%[dst]) \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2)
            : [src] "r"(src), [dst] "r"(dst));

        src += src_stride;
        dst += dst_stride;
      }
    } break;
    case 16: {
      uint32_t tp1, tp2, tp3, tp4;

      /* 4 word storage */
      for (y = h; y--;) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw %[tp1], 0(%[src]) \n\t"
            "ulw %[tp2], 4(%[src]) \n\t"
            "ulw %[tp3], 8(%[src]) \n\t"
            "ulw %[tp4], 12(%[src]) \n\t"

            "sw %[tp1], 0(%[dst]) \n\t" /* store */
            "sw %[tp2], 4(%[dst]) \n\t" /* store */
            "sw %[tp3], 8(%[dst]) \n\t" /* store */
            "sw %[tp4], 12(%[dst]) \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
              [tp4] "=&r"(tp4)
            : [src] "r"(src), [dst] "r"(dst));

        src += src_stride;
        dst += dst_stride;
      }
    } break;
    case 32: {
      uint32_t tp1, tp2, tp3, tp4;
      uint32_t tp5, tp6, tp7, tp8;

      /* 8 word storage */
      for (y = h; y--;) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw %[tp1], 0(%[src]) \n\t"
            "ulw %[tp2], 4(%[src]) \n\t"
            "ulw %[tp3], 8(%[src]) \n\t"
            "ulw %[tp4], 12(%[src]) \n\t"
            "ulw %[tp5], 16(%[src]) \n\t"
            "ulw %[tp6], 20(%[src]) \n\t"
            "ulw %[tp7], 24(%[src]) \n\t"
            "ulw %[tp8], 28(%[src]) \n\t"

            "sw %[tp1], 0(%[dst]) \n\t" /* store */
            "sw %[tp2], 4(%[dst]) \n\t" /* store */
            "sw %[tp3], 8(%[dst]) \n\t" /* store */
            "sw %[tp4], 12(%[dst]) \n\t" /* store */
            "sw %[tp5], 16(%[dst]) \n\t" /* store */
            "sw %[tp6], 20(%[dst]) \n\t" /* store */
            "sw %[tp7], 24(%[dst]) \n\t" /* store */
            "sw %[tp8], 28(%[dst]) \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
              [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6),
              [tp7] "=&r"(tp7), [tp8] "=&r"(tp8)
            : [src] "r"(src), [dst] "r"(dst));

        src += src_stride;
        dst += dst_stride;
      }
    } break;
    case 64: {
      uint32_t tp1, tp2, tp3, tp4;
      uint32_t tp5, tp6, tp7, tp8;

      /* Rows are 64 bytes wide here, so prefetch a second cache line. */
      prefetch_load(src + 64);
      prefetch_store(dst + 32);

      /* 16 word storage */
      for (y = h; y--;) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_load(src + src_stride + 64);
        prefetch_store(dst + dst_stride);
        prefetch_store(dst + dst_stride + 32);

        /* The eight temporaries are reused for the second 32 bytes. */
        __asm__ __volatile__(
            "ulw %[tp1], 0(%[src]) \n\t"
            "ulw %[tp2], 4(%[src]) \n\t"
            "ulw %[tp3], 8(%[src]) \n\t"
            "ulw %[tp4], 12(%[src]) \n\t"
            "ulw %[tp5], 16(%[src]) \n\t"
            "ulw %[tp6], 20(%[src]) \n\t"
            "ulw %[tp7], 24(%[src]) \n\t"
            "ulw %[tp8], 28(%[src]) \n\t"

            "sw %[tp1], 0(%[dst]) \n\t" /* store */
            "sw %[tp2], 4(%[dst]) \n\t" /* store */
            "sw %[tp3], 8(%[dst]) \n\t" /* store */
            "sw %[tp4], 12(%[dst]) \n\t" /* store */
            "sw %[tp5], 16(%[dst]) \n\t" /* store */
            "sw %[tp6], 20(%[dst]) \n\t" /* store */
            "sw %[tp7], 24(%[dst]) \n\t" /* store */
            "sw %[tp8], 28(%[dst]) \n\t" /* store */

            "ulw %[tp1], 32(%[src]) \n\t"
            "ulw %[tp2], 36(%[src]) \n\t"
            "ulw %[tp3], 40(%[src]) \n\t"
            "ulw %[tp4], 44(%[src]) \n\t"
            "ulw %[tp5], 48(%[src]) \n\t"
            "ulw %[tp6], 52(%[src]) \n\t"
            "ulw %[tp7], 56(%[src]) \n\t"
            "ulw %[tp8], 60(%[src]) \n\t"

            "sw %[tp1], 32(%[dst]) \n\t" /* store */
            "sw %[tp2], 36(%[dst]) \n\t" /* store */
            "sw %[tp3], 40(%[dst]) \n\t" /* store */
            "sw %[tp4], 44(%[dst]) \n\t" /* store */
            "sw %[tp5], 48(%[dst]) \n\t" /* store */
            "sw %[tp6], 52(%[dst]) \n\t" /* store */
            "sw %[tp7], 56(%[dst]) \n\t" /* store */
            "sw %[tp8], 60(%[dst]) \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
              [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6),
              [tp7] "=&r"(tp7), [tp8] "=&r"(tp8)
            : [src] "r"(src), [dst] "r"(dst));

        src += src_stride;
        dst += dst_stride;
      }
    } break;
    default:
      /* Generic width: plain byte-by-byte copy, no alignment assumptions. */
      for (y = h; y--;) {
        for (x = 0; x < w; ++x) {
          dst[x] = src[x];
        }

        src += src_stride;
        dst += dst_stride;
      }
      break;
  }
}
#endif