;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;
; x86inc provides the cglobal / INIT_XMM / RET macros and the register aliases
; (m0.., r4.., srcq, wm, ...) that the rest of this file relies on.
%include "third_party/x86inc/x86inc.asm"

SECTION .text
;-----------------------------------------------------------------------------
; convolve_fn op [, highbd]
;
; Expands to one function declared through cglobal:
;   convolve_<op>          when the second macro argument is omitted
;   highbd_convolve_<op>   when the second macro argument is "highbd"
;
; op = copy : each row is copied from src to dst.
; op = avg  : each row is combined with the current dst contents using pavg
;             (pavgb, or pavgw for highbd) and the result is stored to dst.
;
; Named arguments: src, src_stride, dst, dst_stride, fx, fxs, fy, fys, w, h
; (plus bd for highbd).  Only src, src_stride, dst, dst_stride, w and h are
; read here; fx, fxs, fy, fys and bd are never referenced.
;
; For highbd, src, dst, both strides and w are shifted left by one before
; use, so every loop below counts bytes.
; NOTE(review): presumably pointers/strides/width arrive in units of 16-bit
; samples - confirm against the C callers.
;
; Register use inside the function:
;   r4d  width (bytes) during dispatch, then the remaining row count
;   r5q  3 * src_stride  (16-, 8- and 4-byte paths only)
;   r6q  3 * dst_stride  (16-, 8- and 4-byte paths only)
;
; Constraints that follow from the code:
;   * Width is matched by exact compare.  A width that matches none of the
;     compares falls through into the widest loop assembled for the current
;     configuration.
;   * Paths of 16 bytes and wider read src with movu but store with mova and
;     (for avg) use dst as a pavg memory operand, so dst rows need 16-byte
;     alignment there.
;   * Rows handled per iteration: 1 (64 bytes and wider), 2 (32 bytes),
;     4 (16, 8 and 4 bytes).  The loops exit on jnz after subtracting that
;     amount, so h must be a positive multiple of it.
;-----------------------------------------------------------------------------
%macro convolve_fn 1-2
; avg needs four extra XMM registers (m4-m7) for the dst rows that the
; 8- and 4-byte-wide paths load before averaging.
%ifidn %1, avg
%define AUX_XMM_REGS 4
%else
%define AUX_XMM_REGS 0
%endif
; Select the averaging instruction and declare the function.
; cglobal numeric arguments: 4 args loaded into registers, 7 GPRs,
; 4 (+AUX_XMM_REGS) XMM registers - see the x86inc cglobal convention.
%ifidn %2, highbd
%define pavg pavgw
cglobal %2_convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
                                              dst, dst_stride, \
                                              fx, fxs, fy, fys, w, h, bd
%else
%define pavg pavgb
cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
                                           dst, dst_stride, \
                                           fx, fxs, fy, fys, w, h
%endif
  ; r4d = w, fetched through the wm argument accessor.
  mov r4d, dword wm
%ifidn %2, highbd
  ; Scale width, pointers and strides by two so that everything is in bytes.
  shl r4d, 1
  shl srcq, 1
  shl src_strideq, 1
  shl dstq, 1
  shl dst_strideq, 1
%else
  ; The 4-byte path only exists for the non-highbd build of this macro.
  cmp r4d, 4
  je .w4
%endif
  cmp r4d, 8
  je .w8
  cmp r4d, 16
  je .w16
  cmp r4d, 32
  je .w32

%if CONFIG_AV1 && CONFIG_EXT_PARTITION
  ; Extended partitions: 64 is an explicit case; non-highbd falls through to
  ; .w128, highbd additionally tests 128 and falls through to .w256.
  cmp r4d, 64
  je .w64
%ifidn %2, highbd
  cmp r4d, 128
  je .w128

.w256:
  ; 256 bytes per row, one row per iteration, in four 64-byte groups.
  mov r4d, dword hm
.loop256:
  movu m0, [srcq]
  movu m1, [srcq+16]
  movu m2, [srcq+32]
  movu m3, [srcq+48]
%ifidn %1, avg
  pavg m0, [dstq]
  pavg m1, [dstq+16]
  pavg m2, [dstq+32]
  pavg m3, [dstq+48]
%endif
  mova [dstq    ], m0
  mova [dstq+16], m1
  mova [dstq+32], m2
  mova [dstq+48], m3
  movu m0, [srcq+64]
  movu m1, [srcq+80]
  movu m2, [srcq+96]
  movu m3, [srcq+112]
%ifidn %1, avg
  pavg m0, [dstq+64]
  pavg m1, [dstq+80]
  pavg m2, [dstq+96]
  pavg m3, [dstq+112]
%endif
  mova [dstq+64], m0
  mova [dstq+80], m1
  mova [dstq+96], m2
  mova [dstq+112], m3
  movu m0, [srcq+128]
  movu m1, [srcq+128+16]
  movu m2, [srcq+128+32]
  movu m3, [srcq+128+48]
%ifidn %1, avg
  pavg m0, [dstq+128]
  pavg m1, [dstq+128+16]
  pavg m2, [dstq+128+32]
  pavg m3, [dstq+128+48]
%endif
  mova [dstq+128   ], m0
  mova [dstq+128+16], m1
  mova [dstq+128+32], m2
  mova [dstq+128+48], m3
  movu m0, [srcq+128+64]
  movu m1, [srcq+128+80]
  movu m2, [srcq+128+96]
  movu m3, [srcq+128+112]
  ; All loads for this row are done; src may advance before the stores.
  add srcq, src_strideq
%ifidn %1, avg
  pavg m0, [dstq+128+64]
  pavg m1, [dstq+128+80]
  pavg m2, [dstq+128+96]
  pavg m3, [dstq+128+112]
%endif
  mova [dstq+128+64], m0
  mova [dstq+128+80], m1
  mova [dstq+128+96], m2
  mova [dstq+128+112], m3
  add dstq, dst_strideq
  sub r4d, 1
  jnz .loop256
  RET
%endif

.w128:
  ; 128 bytes per row, one row per iteration, in two 64-byte groups.
  mov r4d, dword hm
.loop128:
  movu m0, [srcq]
  movu m1, [srcq+16]
  movu m2, [srcq+32]
  movu m3, [srcq+48]
%ifidn %1, avg
  pavg m0, [dstq]
  pavg m1, [dstq+16]
  pavg m2, [dstq+32]
  pavg m3, [dstq+48]
%endif
  mova [dstq    ], m0
  mova [dstq+16], m1
  mova [dstq+32], m2
  mova [dstq+48], m3
  movu m0, [srcq+64]
  movu m1, [srcq+80]
  movu m2, [srcq+96]
  movu m3, [srcq+112]
  add srcq, src_strideq
%ifidn %1, avg
  pavg m0, [dstq+64]
  pavg m1, [dstq+80]
  pavg m2, [dstq+96]
  pavg m3, [dstq+112]
%endif
  mova [dstq+64], m0
  mova [dstq+80], m1
  mova [dstq+96], m2
  mova [dstq+112], m3
  add dstq, dst_strideq
  sub r4d, 1
  jnz .loop128
  RET

%else ; CONFIG_AV1 && CONFIG_EXT_PARTITION

%ifidn %2, highbd
  ; Without extended partitions only highbd has a 128-byte row; it is the
  ; fall-through case once 64 has been ruled out.  Non-highbd falls straight
  ; through to .w64 below.
  cmp r4d, 64
  je .w64

  mov r4d, dword hm
.loop128:
  movu m0, [srcq]
  movu m1, [srcq+16]
  movu m2, [srcq+32]
  movu m3, [srcq+48]
%ifidn %1, avg
  pavg m0, [dstq]
  pavg m1, [dstq+16]
  pavg m2, [dstq+32]
  pavg m3, [dstq+48]
%endif
  mova [dstq    ], m0
  mova [dstq+16], m1
  mova [dstq+32], m2
  mova [dstq+48], m3
  movu m0, [srcq+64]
  movu m1, [srcq+80]
  movu m2, [srcq+96]
  movu m3, [srcq+112]
  add srcq, src_strideq
%ifidn %1, avg
  pavg m0, [dstq+64]
  pavg m1, [dstq+80]
  pavg m2, [dstq+96]
  pavg m3, [dstq+112]
%endif
  mova [dstq+64], m0
  mova [dstq+80], m1
  mova [dstq+96], m2
  mova [dstq+112], m3
  add dstq, dst_strideq
  sub r4d, 1
  jnz .loop128
  RET
%endif
%endif ; CONFIG_AV1 && CONFIG_EXT_PARTITION

.w64:
  ; 64 bytes per row, one row per iteration.
  mov r4d, dword hm
.loop64:
  movu m0, [srcq]
  movu m1, [srcq+16]
  movu m2, [srcq+32]
  movu m3, [srcq+48]
  add srcq, src_strideq
%ifidn %1, avg
  pavg m0, [dstq]
  pavg m1, [dstq+16]
  pavg m2, [dstq+32]
  pavg m3, [dstq+48]
%endif
  mova [dstq    ], m0
  mova [dstq+16], m1
  mova [dstq+32], m2
  mova [dstq+48], m3
  add dstq, dst_strideq
  sub r4d, 1
  jnz .loop64
  RET

.w32:
  ; 32 bytes per row, two rows per iteration.
  mov r4d, dword hm
.loop32:
  movu m0, [srcq]
  movu m1, [srcq+16]
  movu m2, [srcq+src_strideq]
  movu m3, [srcq+src_strideq+16]
  lea srcq, [srcq+src_strideq*2]
%ifidn %1, avg
  pavg m0, [dstq]
  pavg m1, [dstq            +16]
  pavg m2, [dstq+dst_strideq]
  pavg m3, [dstq+dst_strideq+16]
%endif
  mova [dstq               ], m0
  mova [dstq            +16], m1
  mova [dstq+dst_strideq   ], m2
  mova [dstq+dst_strideq+16], m3
  lea dstq, [dstq+dst_strideq*2]
  sub r4d, 2
  jnz .loop32
  RET

.w16:
  ; 16 bytes per row, four rows per iteration.
  ; r5q / r6q hold three times the stride to address the fourth row.
  mov r4d, dword hm
  lea r5q, [src_strideq*3]
  lea r6q, [dst_strideq*3]
.loop16:
  movu m0, [srcq]
  movu m1, [srcq+src_strideq]
  movu m2, [srcq+src_strideq*2]
  movu m3, [srcq+r5q]
  lea srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  pavg m0, [dstq]
  pavg m1, [dstq+dst_strideq]
  pavg m2, [dstq+dst_strideq*2]
  pavg m3, [dstq+r6q]
%endif
  mova [dstq              ], m0
  mova [dstq+dst_strideq  ], m1
  mova [dstq+dst_strideq*2], m2
  mova [dstq+r6q          ], m3
  lea dstq, [dstq+dst_strideq*4]
  sub r4d, 4
  jnz .loop16
  RET

.w8:
  ; 8 bytes per row (movh), four rows per iteration.  For avg the dst rows
  ; are loaded into m4-m7 first instead of being used as memory operands.
  mov r4d, dword hm
  lea r5q, [src_strideq*3]
  lea r6q, [dst_strideq*3]
.loop8:
  movh m0, [srcq]
  movh m1, [srcq+src_strideq]
  movh m2, [srcq+src_strideq*2]
  movh m3, [srcq+r5q]
  lea srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  movh m4, [dstq]
  movh m5, [dstq+dst_strideq]
  movh m6, [dstq+dst_strideq*2]
  movh m7, [dstq+r6q]
  pavg m0, m4
  pavg m1, m5
  pavg m2, m6
  pavg m3, m7
%endif
  movh [dstq              ], m0
  movh [dstq+dst_strideq  ], m1
  movh [dstq+dst_strideq*2], m2
  movh [dstq+r6q          ], m3
  lea dstq, [dstq+dst_strideq*4]
  sub r4d, 4
  jnz .loop8
  RET

%ifnidn %2, highbd
.w4:
  ; 4 bytes per row (movd), four rows per iteration; not assembled for highbd.
  mov r4d, dword hm
  lea r5q, [src_strideq*3]
  lea r6q, [dst_strideq*3]
.loop4:
  movd m0, [srcq]
  movd m1, [srcq+src_strideq]
  movd m2, [srcq+src_strideq*2]
  movd m3, [srcq+r5q]
  lea srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  movd m4, [dstq]
  movd m5, [dstq+dst_strideq]
  movd m6, [dstq+dst_strideq*2]
  movd m7, [dstq+r6q]
  pavg m0, m4
  pavg m1, m5
  pavg m2, m6
  pavg m3, m7
%endif
  movd [dstq              ], m0
  movd [dstq+dst_strideq  ], m1
  movd [dstq+dst_strideq*2], m2
  movd [dstq+r6q          ], m3
  lea dstq, [dstq+dst_strideq*4]
  sub r4d, 4
  jnz .loop4
  RET
%endif
%endmacro
; ---------------------------------------------------------------------------
; SSE2 instantiations: copy and avg are always assembled.
INIT_XMM sse2
convolve_fn copy
convolve_fn avg
; The highbd variants are assembled only when CONFIG_HIGHBITDEPTH is non-zero.
%if CONFIG_HIGHBITDEPTH
convolve_fn copy, highbd
convolve_fn avg, highbd
%endif