;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "aom_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; GET_PARAM_4
; Loads the arguments and constants used by the 4-pixel-wide filters.
; On exit:
;   rsi  = src_ptr          (arg 0)
;   rdi  = output_ptr       (arg 2)
;   rax  = pixels_per_line  (arg 1, read as a dword)
;   rdx  = out_pitch        (arg 3, read as a dword)
;   rcx  = output_height    (arg 4, read as a dword)
;   xmm2 = all zero (used to widen bytes to words)
;   xmm3 = rounding value 0x0040 (64) in every word
;   xmm4 = filter word 3 in words 0-3, filter word 4 in words 4-7
; The filter table is read with movdqa, so it must be 16-byte aligned.
;-----------------------------------------------------------------------
%macro GET_PARAM_4 0
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x0400040              ;two words of 64 (rounding)

    movdqa      xmm3, [rdx]                 ;load filters
    pshuflw     xmm4, xmm3, 11111111b       ;k3 (word 3 -> words 0-3)
    psrldq      xmm3, 8                     ;bring words 4-7 down to 0-3
    pshuflw     xmm3, xmm3, 0b              ;k4 (old word 4 -> words 0-3)
    punpcklqdq  xmm4, xmm3                  ;k3k4

    movq        xmm3, rcx                   ;rounding
    pshufd      xmm3, xmm3, 0               ;replicate low dword to all four

    pxor        xmm2, xmm2

    ; rdx and rcx are reused here; the filter pointer and the rounding
    ; constant are no longer needed in general registers.
    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
%endm

;-----------------------------------------------------------------------
; APPLY_FILTER_4 avg
; Filters one row of 4 pixels and steps to the next row.
; In:   xmm0 = first-tap pixels in bytes 0-3
;       xmm1 = second-tap pixels in bytes 0-3
;       xmm2/xmm3/xmm4, rsi, rdi, rax, rdx, rcx as left by GET_PARAM_4
; %1:   non-zero -> the result is averaged (pavgb) with the 4 bytes
;       already stored at [rdi] before being written back
; Out:  4 bytes written at [rdi]; rsi += rax; rdi += rdx; rcx -= 1.
;       ZF reflects the decremented rcx (the lea instructions do not
;       modify flags), so the invoker can follow this macro with jnz.
; Clobbers xmm0, xmm1.
;-----------------------------------------------------------------------
%macro APPLY_FILTER_4 1

    punpckldq   xmm0, xmm1                  ;two row in one register
    punpcklbw   xmm0, xmm2                  ;unpack to word
    pmullw      xmm0, xmm4                  ;multiply the filter factors

    movdqa      xmm1, xmm0
    psrldq      xmm1, 8                     ;second-tap products -> words 0-3
    paddsw      xmm0, xmm1                  ;sum of both taps in words 0-3

    paddsw      xmm0, xmm3                  ;rounding
    psraw       xmm0, 7                     ;shift
    packuswb    xmm0, xmm0                  ;pack to byte

%if %1
    movd        xmm1, [rdi]                 ;current output pixels
    pavgb       xmm0, xmm1
%endif

    movd        [rdi], xmm0
    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx                         ;sets ZF for the invoker's jnz
%endm

;-----------------------------------------------------------------------
; GET_PARAM
; Loads the arguments and constants used by the 8- and 16-pixel-wide
; filters.
; On exit:
;   rsi  = src_ptr          (arg 0)
;   rdi  = output_ptr       (arg 2)
;   rax  = pixels_per_line  (arg 1, read as a dword)
;   rdx  = out_pitch        (arg 3, read as a dword)
;   rcx  = output_height    (arg 4, read as a dword)
;   xmm4 = rounding value 0x0040 (64) in every word
;   xmm5 = all zero (used to widen bytes to words)
;   xmm6 = filter word 3 replicated to all eight words
;   xmm7 = filter word 4 replicated to all eight words
; The filter table is read with movdqa, so it must be 16-byte aligned.
;-----------------------------------------------------------------------
%macro GET_PARAM 0
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x0400040              ;two words of 64 (rounding)

    movdqa      xmm7, [rdx]                 ;load filters

    pshuflw     xmm6, xmm7, 11111111b       ;k3 (word 3 -> words 0-3)
    pshufhw     xmm7, xmm7, 0b              ;k4 (word 4 -> words 4-7)
    punpcklwd   xmm6, xmm6                  ;k3 in all eight words
    punpckhwd   xmm7, xmm7                  ;k4 in all eight words

    movq        xmm4, rcx                   ;rounding
    pshufd      xmm4, xmm4, 0               ;replicate low dword to all four

    pxor        xmm5, xmm5

    ; rdx and rcx are reused here; the filter pointer and the rounding
    ; constant are no longer needed in general registers.
    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
%endm

;-----------------------------------------------------------------------
; APPLY_FILTER_8 avg
; Filters one row of 8 pixels and steps to the next row.
; In:   xmm0 = first-tap pixels in bytes 0-7
;       xmm1 = second-tap pixels in bytes 0-7
;       xmm4-xmm7, rsi, rdi, rax, rdx, rcx as left by GET_PARAM
; %1:   non-zero -> the result is averaged (pavgb) with the 8 bytes
;       already stored at [rdi] before being written back
; Out:  8 bytes written at [rdi]; rsi += rax; rdi += rdx; rcx -= 1.
;       ZF reflects the decremented rcx (the lea instructions do not
;       modify flags), so the invoker can follow this macro with jnz.
; Clobbers xmm0, xmm1.
;-----------------------------------------------------------------------
%macro APPLY_FILTER_8 1
    punpcklbw   xmm0, xmm5                  ;widen first-tap pixels
    punpcklbw   xmm1, xmm5                  ;widen second-tap pixels

    pmullw      xmm0, xmm6                  ;* k3
    pmullw      xmm1, xmm7                  ;* k4
    paddsw      xmm0, xmm1
    paddsw      xmm0, xmm4                  ;rounding
    psraw       xmm0, 7                     ;shift
    packuswb    xmm0, xmm0                  ;pack back to byte
%if %1
    movq        xmm1, [rdi]                 ;current output pixels
    pavgb       xmm0, xmm1
%endif
    movq        [rdi], xmm0                 ;store the result

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx                         ;sets ZF for the invoker's jnz
%endm

;-----------------------------------------------------------------------
; APPLY_FILTER_16 avg
; Filters one row of 16 pixels and steps to the next row.
; In:   xmm0 and xmm2 = the same 16 first-tap pixels
;       xmm1 and xmm3 = the same 16 second-tap pixels
;       (xmm0/xmm1 supply the low 8 pixels, xmm2/xmm3 the high 8)
;       xmm4-xmm7, rsi, rdi, rax, rdx, rcx as left by GET_PARAM
; %1:   non-zero -> the result is averaged (pavgb) with the 16 bytes
;       already stored at [rdi] before being written back
; Out:  16 bytes written at [rdi] (unaligned store); rsi += rax;
;       rdi += rdx; rcx -= 1. ZF reflects the decremented rcx (the lea
;       instructions do not modify flags), so the invoker can follow
;       this macro with jnz.
; Clobbers xmm0-xmm3.
;-----------------------------------------------------------------------
%macro APPLY_FILTER_16 1
    punpcklbw   xmm0, xmm5                  ;low 8 first-tap pixels
    punpcklbw   xmm1, xmm5                  ;low 8 second-tap pixels
    punpckhbw   xmm2, xmm5                  ;high 8 first-tap pixels
    punpckhbw   xmm3, xmm5                  ;high 8 second-tap pixels

    pmullw      xmm0, xmm6                  ;* k3
    pmullw      xmm1, xmm7                  ;* k4
    pmullw      xmm2, xmm6                  ;* k3
    pmullw      xmm3, xmm7                  ;* k4

    paddsw      xmm0, xmm1
    paddsw      xmm2, xmm3

    paddsw      xmm0, xmm4                  ;rounding
    paddsw      xmm2, xmm4
    psraw       xmm0, 7                     ;shift
    psraw       xmm2, 7
    packuswb    xmm0, xmm2                  ;pack back to byte
%if %1
    movdqu      xmm1, [rdi]                 ;current output pixels
    pavgb       xmm0, xmm1
%endif
    movdqu      [rdi], xmm0                 ;store the result

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx                         ;sets ZF for the invoker's jnz
%endm

;-----------------------------------------------------------------------
; aom_filter_block1d4_v2_sse2
; Two-tap vertical filter over rows of 4 pixels: each output pixel is
; built from a source pixel and the pixel one source pitch after it,
; weighted by filter words 3 and 4, rounded and shifted right by 7.
; Arguments, by arg() index:
;   0 src_ptr, 1 pixels_per_line, 2 output_ptr,
;   3 out_pitch, 4 output_height, 5 filter ptr
; output_height must be at least 1: the row counter is only tested
; after a row has been written.
;-----------------------------------------------------------------------
global sym(aom_filter_block1d4_v2_sse2) PRIVATE
sym(aom_filter_block1d4_v2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
.loop:
    movd        xmm0, [rsi]                 ;load src
    movd        xmm1, [rsi + rax]           ;same columns, next row

    APPLY_FILTER_4 0
    jnz         .loop                       ;ZF comes from dec rcx in the macro

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
; aom_filter_block1d8_v2_sse2
; Two-tap vertical filter over rows of 8 pixels: each output pixel is
; built from a source pixel and the pixel one source pitch after it,
; weighted by filter words 3 and 4, rounded and shifted right by 7.
; Arguments, by arg() index:
;   0 src_ptr, 1 pixels_per_line, 2 output_ptr,
;   3 out_pitch, 4 output_height, 5 filter ptr
; output_height must be at least 1: the row counter is only tested
; after a row has been written.
; xmm6/xmm7 are used, hence the SAVE_XMM / RESTORE_XMM pair.
;-----------------------------------------------------------------------
global sym(aom_filter_block1d8_v2_sse2) PRIVATE
sym(aom_filter_block1d8_v2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movq        xmm0, [rsi]                 ;0
    movq        xmm1, [rsi + rax]           ;1

    APPLY_FILTER_8 0
    jnz         .loop                       ;ZF comes from dec rcx in the macro

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
; aom_filter_block1d16_v2_sse2
; Two-tap vertical filter over rows of 16 pixels: each output pixel is
; built from a source pixel and the pixel one source pitch after it,
; weighted by filter words 3 and 4, rounded and shifted right by 7.
; Arguments, by arg() index:
;   0 src_ptr, 1 pixels_per_line, 2 output_ptr,
;   3 out_pitch, 4 output_height, 5 filter ptr
; output_height must be at least 1: the row counter is only tested
; after a row has been written.
; Source and output rows are accessed with unaligned loads/stores.
; xmm6/xmm7 are used, hence the SAVE_XMM / RESTORE_XMM pair.
;-----------------------------------------------------------------------
global sym(aom_filter_block1d16_v2_sse2) PRIVATE
sym(aom_filter_block1d16_v2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movdqu      xmm0, [rsi]                 ;0
    movdqu      xmm1, [rsi + rax]           ;1
    movdqa      xmm2, xmm0                  ;copies for the high 8 pixels
    movdqa      xmm3, xmm1

    APPLY_FILTER_16 0
    jnz         .loop                       ;ZF comes from dec rcx in the macro

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
; aom_filter_block1d4_v2_avg_sse2
; Two-tap vertical filter over rows of 4 pixels, with the filtered row
; averaged (pavgb) against the bytes already present in the output.
; Each filtered pixel is built from a source pixel and the pixel one
; source pitch after it, weighted by filter words 3 and 4.
; Arguments, by arg() index:
;   0 src_ptr, 1 pixels_per_line, 2 output_ptr,
;   3 out_pitch, 4 output_height, 5 filter ptr
; output_height must be at least 1: the row counter is only tested
; after a row has been written.
;-----------------------------------------------------------------------
global sym(aom_filter_block1d4_v2_avg_sse2) PRIVATE
sym(aom_filter_block1d4_v2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
.loop:
    movd        xmm0, [rsi]                 ;load src
    movd        xmm1, [rsi + rax]           ;same columns, next row

    APPLY_FILTER_4 1
    jnz         .loop                       ;ZF comes from dec rcx in the macro

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
; aom_filter_block1d8_v2_avg_sse2
; Two-tap vertical filter over rows of 8 pixels, with the filtered row
; averaged (pavgb) against the bytes already present in the output.
; Each filtered pixel is built from a source pixel and the pixel one
; source pitch after it, weighted by filter words 3 and 4.
; Arguments, by arg() index:
;   0 src_ptr, 1 pixels_per_line, 2 output_ptr,
;   3 out_pitch, 4 output_height, 5 filter ptr
; output_height must be at least 1: the row counter is only tested
; after a row has been written.
; xmm6/xmm7 are used, hence the SAVE_XMM / RESTORE_XMM pair.
;-----------------------------------------------------------------------
global sym(aom_filter_block1d8_v2_avg_sse2) PRIVATE
sym(aom_filter_block1d8_v2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movq        xmm0, [rsi]                 ;0
    movq        xmm1, [rsi + rax]           ;1

    APPLY_FILTER_8 1
    jnz         .loop                       ;ZF comes from dec rcx in the macro

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
; aom_filter_block1d16_v2_avg_sse2
; Two-tap vertical filter over rows of 16 pixels, with the filtered row
; averaged (pavgb) against the bytes already present in the output.
; Each filtered pixel is built from a source pixel and the pixel one
; source pitch after it, weighted by filter words 3 and 4.
; Arguments, by arg() index:
;   0 src_ptr, 1 pixels_per_line, 2 output_ptr,
;   3 out_pitch, 4 output_height, 5 filter ptr
; output_height must be at least 1: the row counter is only tested
; after a row has been written.
; Source and output rows are accessed with unaligned loads/stores.
; xmm6/xmm7 are used, hence the SAVE_XMM / RESTORE_XMM pair.
;-----------------------------------------------------------------------
global sym(aom_filter_block1d16_v2_avg_sse2) PRIVATE
sym(aom_filter_block1d16_v2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movdqu      xmm0, [rsi]                 ;0
    movdqu      xmm1, [rsi + rax]           ;1
    movdqa      xmm2, xmm0                  ;copies for the high 8 pixels
    movdqa      xmm3, xmm1

    APPLY_FILTER_16 1
    jnz         .loop                       ;ZF comes from dec rcx in the macro

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
; aom_filter_block1d4_h2_sse2
; Two-tap horizontal filter over rows of 4 pixels: each output pixel is
; built from a source pixel and the pixel one byte after it, weighted
; by filter words 3 and 4, rounded and shifted right by 7.
; Arguments, by arg() index:
;   0 src_ptr, 1 pixels_per_line, 2 output_ptr,
;   3 out_pitch, 4 output_height, 5 filter ptr
; output_height must be at least 1: the row counter is only tested
; after a row has been written.
; Note: every row is fetched with a 16-byte unaligned load even though
; only its first 5 bytes feed the result, so 16 bytes must be readable
; at each row start.
;-----------------------------------------------------------------------
global sym(aom_filter_block1d4_h2_sse2) PRIVATE
sym(aom_filter_block1d4_h2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
.loop:
    movdqu      xmm0, [rsi]                 ;load src
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1                     ;pixels shifted by one column

    APPLY_FILTER_4 0
    jnz         .loop                       ;ZF comes from dec rcx in the macro

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
; aom_filter_block1d8_h2_sse2
; Two-tap horizontal filter over rows of 8 pixels: each output pixel is
; built from a source pixel and the pixel one byte after it, weighted
; by filter words 3 and 4, rounded and shifted right by 7.
; Arguments, by arg() index:
;   0 src_ptr, 1 pixels_per_line, 2 output_ptr,
;   3 out_pitch, 4 output_height, 5 filter ptr
; output_height must be at least 1: the row counter is only tested
; after a row has been written.
; Note: every row is fetched with a 16-byte unaligned load even though
; only its first 9 bytes feed the result, so 16 bytes must be readable
; at each row start.
; xmm6/xmm7 are used, hence the SAVE_XMM / RESTORE_XMM pair.
;-----------------------------------------------------------------------
global sym(aom_filter_block1d8_h2_sse2) PRIVATE
sym(aom_filter_block1d8_h2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movdqu      xmm0, [rsi]                 ;load src
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1                     ;pixels shifted by one column

    APPLY_FILTER_8 0
    jnz         .loop                       ;ZF comes from dec rcx in the macro

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
; aom_filter_block1d16_h2_sse2
; Two-tap horizontal filter over rows of 16 pixels: each output pixel
; is built from a source pixel and the pixel one byte after it,
; weighted by filter words 3 and 4, rounded and shifted right by 7.
; Arguments, by arg() index:
;   0 src_ptr, 1 pixels_per_line, 2 output_ptr,
;   3 out_pitch, 4 output_height, 5 filter ptr
; output_height must be at least 1: the row counter is only tested
; after a row has been written.
; Each row reads 17 source bytes (two overlapping unaligned 16-byte
; loads at [rsi] and [rsi + 1]).
; xmm6/xmm7 are used, hence the SAVE_XMM / RESTORE_XMM pair.
;-----------------------------------------------------------------------
global sym(aom_filter_block1d16_h2_sse2) PRIVATE
sym(aom_filter_block1d16_h2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movdqu      xmm0, [rsi]                 ;load src
    movdqu      xmm1, [rsi + 1]             ;same row, one column on
    movdqa      xmm2, xmm0                  ;copies for the high 8 pixels
    movdqa      xmm3, xmm1

    APPLY_FILTER_16 0
    jnz         .loop                       ;ZF comes from dec rcx in the macro

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
; aom_filter_block1d4_h2_avg_sse2
; Two-tap horizontal filter over rows of 4 pixels, with the filtered
; row averaged (pavgb) against the bytes already present in the output.
; Each filtered pixel is built from a source pixel and the pixel one
; byte after it, weighted by filter words 3 and 4.
; Arguments, by arg() index:
;   0 src_ptr, 1 pixels_per_line, 2 output_ptr,
;   3 out_pitch, 4 output_height, 5 filter ptr
; output_height must be at least 1: the row counter is only tested
; after a row has been written.
; Note: every row is fetched with a 16-byte unaligned load even though
; only its first 5 bytes feed the result, so 16 bytes must be readable
; at each row start.
;-----------------------------------------------------------------------
global sym(aom_filter_block1d4_h2_avg_sse2) PRIVATE
sym(aom_filter_block1d4_h2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
.loop:
    movdqu      xmm0, [rsi]                 ;load src
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1                     ;pixels shifted by one column

    APPLY_FILTER_4 1
    jnz         .loop                       ;ZF comes from dec rcx in the macro

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
; aom_filter_block1d8_h2_avg_sse2
; Two-tap horizontal filter over rows of 8 pixels, with the filtered
; row averaged (pavgb) against the bytes already present in the output.
; Each filtered pixel is built from a source pixel and the pixel one
; byte after it, weighted by filter words 3 and 4.
; Arguments, by arg() index:
;   0 src_ptr, 1 pixels_per_line, 2 output_ptr,
;   3 out_pitch, 4 output_height, 5 filter ptr
; output_height must be at least 1: the row counter is only tested
; after a row has been written.
; Note: every row is fetched with a 16-byte unaligned load even though
; only its first 9 bytes feed the result, so 16 bytes must be readable
; at each row start.
; xmm6/xmm7 are used, hence the SAVE_XMM / RESTORE_XMM pair.
;-----------------------------------------------------------------------
global sym(aom_filter_block1d8_h2_avg_sse2) PRIVATE
sym(aom_filter_block1d8_h2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movdqu      xmm0, [rsi]                 ;load src
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1                     ;pixels shifted by one column

    APPLY_FILTER_8 1
    jnz         .loop                       ;ZF comes from dec rcx in the macro

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
; aom_filter_block1d16_h2_avg_sse2
; Two-tap horizontal filter over rows of 16 pixels, with the filtered
; row averaged (pavgb) against the bytes already present in the output.
; Each filtered pixel is built from a source pixel and the pixel one
; byte after it, weighted by filter words 3 and 4.
; Arguments, by arg() index:
;   0 src_ptr, 1 pixels_per_line, 2 output_ptr,
;   3 out_pitch, 4 output_height, 5 filter ptr
; output_height must be at least 1: the row counter is only tested
; after a row has been written.
; Each row reads 17 source bytes (two overlapping unaligned 16-byte
; loads at [rsi] and [rsi + 1]).
; xmm6/xmm7 are used, hence the SAVE_XMM / RESTORE_XMM pair.
;-----------------------------------------------------------------------
global sym(aom_filter_block1d16_h2_avg_sse2) PRIVATE
sym(aom_filter_block1d16_h2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movdqu      xmm0, [rsi]                 ;load src
    movdqu      xmm1, [rsi + 1]             ;same row, one column on
    movdqa      xmm2, xmm0                  ;copies for the high 8 pixels
    movdqa      xmm3, xmm1

    APPLY_FILTER_16 1
    jnz         .loop                       ;ZF comes from dec rcx in the macro

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret