blob: 9e2ec748c7fa19e082cc2fd986de6954679bd12a [file] [log] [blame]
;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

14%include "aom_ports/x86_abi_support.asm"
15
;-----------------------------------------------------------------------------
; HIGH_GET_PARAM_4
;   Common setup for the 4-pixel-wide 2-tap kernels. Reads the seven shadowed
;   arguments and leaves:
;     rsi  = arg(0)  source pointer
;     rdi  = arg(2)  destination pointer
;     rax  = arg(1)  source pitch, sign-extended from 32 bits
;     rdx  = arg(3)  destination pitch, sign-extended from 32 bits
;     rcx  = arg(4)  number of rows
;     xmm4 = filter words 3 and 4 interleaved (k3 k4 k3 k4 k3 k4 k3 k4)
;     xmm3 = 0x40 in every dword (rounding term added before the >>7)
;     xmm5 = (1 << bps) - 1 in every word, bps = arg(6)  (upper clamp)
;     xmm2 = 0                                           (lower clamp)
;   Clobbers xmm1. The filter table at arg(5) is fetched with movdqa, so it
;   has to be 16-byte aligned.
;-----------------------------------------------------------------------------
%macro HIGH_GET_PARAM_4 0
    ; ---- buffer pointers ----
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

    ; ---- filter taps: build k3k4 pairs in xmm4 ----
    mov         rdx, arg(5)                 ;filter ptr
    movdqa      xmm3, [rdx]                 ;all eight filter words
    pshuflw     xmm4, xmm3, 0xff            ;low four words = k3
    psrldq      xmm3, 8                     ;bring word 4 down to word 0
    pshuflw     xmm3, xmm3, 0               ;low four words = k4
    punpcklwd   xmm4, xmm3                  ;k3k4 repeated four times

    ; ---- rounding constant (xmm3 is reused once the taps are built) ----
    mov         rcx, 0x40
    movq        xmm3, rcx
    pshufd      xmm3, xmm3, 0               ;0x40 in each dword

    ; ---- clamp bounds ----
    movsxd      rcx, DWORD PTR arg(6)       ;bps
    movq        xmm2, rcx                   ;shift count for psllw
    mov         rdx, 0x00010001
    movq        xmm5, rdx
    pshufd      xmm5, xmm5, 0               ;0x0001 in each word
    movdqa      xmm1, xmm5
    psllw       xmm5, xmm2                  ;1 << bps
    psubw       xmm5, xmm1                  ;(1 << bps) - 1 = max value
    pxor        xmm2, xmm2                  ;min value

    ; ---- strides and row counter ----
    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
%endm
45
;-----------------------------------------------------------------------------
; HIGH_APPLY_FILTER_4 avg
;   Produces one row of four 16-bit outputs and advances to the next row.
;   In : xmm0 = first-tap samples (low four words)
;        xmm1 = second-tap samples (low four words)
;        xmm4 = k3k4 pairs, xmm3 = rounding, xmm5 = max, xmm2 = 0
;        rsi/rdi = current source/destination row, rax/rdx = pitches,
;        rcx = rows left
;   Out: four words stored at [rdi]; rsi and rdi stepped by 2*pitch bytes;
;        rcx decremented, so ZF is set after the last row.
;   %1 : non-zero -> result is averaged (pavgw) with the words already
;        present at [rdi] before being stored.
;   Clobbers xmm0, and xmm1 when %1 is non-zero.
;-----------------------------------------------------------------------------
%macro HIGH_APPLY_FILTER_4 1

    punpcklwd   xmm0, xmm1                  ;pair up the two taps' samples
%if %1
    movq        xmm1, [rdi]                 ;xmm1 is free now: fetch old dst
%endif

    pmaddwd     xmm0, xmm4                  ;a*k3 + b*k4 per dword
    paddd       xmm0, xmm3                  ;+ rounding
    psrad       xmm0, 7                     ;>> 7
    packssdw    xmm0, xmm0                  ;back to words

    ;clamp to [0, max]
    pminsw      xmm0, xmm5
    pmaxsw      xmm0, xmm2

%if %1
    pavgw       xmm0, xmm1                  ;average with old dst
%endif
    movq        [rdi], xmm0

    ;next row (lea leaves the flags alone; dec sets ZF for the caller's jnz)
    lea         rdi, [rdi + 2*rdx]
    lea         rsi, [rsi + 2*rax]
    dec         rcx
%endm
69
70%if ARCH_X86_64
;-----------------------------------------------------------------------------
; HIGH_GET_PARAM
;   Common setup for the 8- and 16-pixel-wide 2-tap kernels (x86-64 only,
;   uses xmm8). Reads the seven shadowed arguments and leaves:
;     rsi  = arg(0)  source pointer
;     rdi  = arg(2)  destination pointer
;     rax  = arg(1)  source pitch, sign-extended from 32 bits
;     rdx  = arg(3)  destination pitch, sign-extended from 32 bits
;     rcx  = arg(4)  number of rows
;     xmm7 = filter words 3 and 4 interleaved (k3 k4 k3 k4 k3 k4 k3 k4)
;     xmm4 = 0x40 in every dword (rounding term added before the >>7)
;     xmm8 = (1 << bps) - 1 in every word, bps = arg(6)  (upper clamp)
;     xmm5 = 0                                           (lower clamp)
;   Clobbers xmm1 and xmm6. The filter table at arg(5) is fetched with
;   movdqa, so it has to be 16-byte aligned.
;-----------------------------------------------------------------------------
%macro HIGH_GET_PARAM 0
    ; ---- buffer pointers ----
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

    ; ---- filter taps: build k3k4 pairs in xmm7 ----
    mov         rdx, arg(5)                 ;filter ptr
    movdqa      xmm6, [rdx]                 ;all eight filter words
    pshuflw     xmm7, xmm6, 0xff            ;low four words = k3
    psrldq      xmm6, 8                     ;bring word 4 down to word 0
    pshuflw     xmm6, xmm6, 0               ;low four words = k4, high = 0
    punpcklwd   xmm7, xmm6                  ;k3k4k3k4k3k4k3k4

    ; ---- rounding constant ----
    mov         rcx, 0x40
    movq        xmm4, rcx
    pshufd      xmm4, xmm4, 0               ;0x40 in each dword

    ; ---- clamp bounds ----
    movsxd      rcx, DWORD PTR arg(6)       ;bps
    movq        xmm5, rcx                   ;shift count for psllw
    mov         rdx, 0x00010001
    movq        xmm8, rdx
    pshufd      xmm8, xmm8, 0               ;0x0001 in each word
    movdqa      xmm1, xmm8
    psllw       xmm8, xmm5                  ;1 << bps
    psubw       xmm8, xmm1                  ;(1 << bps) - 1 = max value
    pxor        xmm5, xmm5                  ;min value

    ; ---- strides and row counter ----
    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
%endm
101
;-----------------------------------------------------------------------------
; HIGH_APPLY_FILTER_8 avg
;   Produces one row of eight 16-bit outputs and advances to the next row.
;   In : xmm0 = eight first-tap samples, xmm1 = eight second-tap samples
;        xmm7 = k3k4 pairs, xmm4 = rounding, xmm8 = max, xmm5 = 0
;        rsi/rdi = current source/destination row, rax/rdx = pitches,
;        rcx = rows left
;   Out: eight words stored (unaligned) at [rdi]; rsi and rdi stepped by
;        2*pitch bytes; rcx decremented, so ZF is set after the last row.
;   %1 : non-zero -> result is averaged (pavgw) with the words already
;        present at [rdi] before being stored.
;   Clobbers xmm0 and xmm6, and xmm1 when %1 is non-zero.
;-----------------------------------------------------------------------------
%macro HIGH_APPLY_FILTER_8 1
    ;interleave the taps: xmm0 = pixels 0-3, xmm6 = pixels 4-7
    movdqa      xmm6, xmm0
    punpcklwd   xmm0, xmm1
    punpckhwd   xmm6, xmm1
%if %1
    movdqu      xmm1, [rdi]                 ;xmm1 is free now: fetch old dst
%endif

    pmaddwd     xmm0, xmm7                  ;a*k3 + b*k4 per dword
    pmaddwd     xmm6, xmm7
    paddd       xmm0, xmm4                  ;+ rounding
    paddd       xmm6, xmm4
    psrad       xmm0, 7                     ;>> 7
    psrad       xmm6, 7
    packssdw    xmm0, xmm6                  ;back to eight words

    ;clamp to [0, max]
    pminsw      xmm0, xmm8
    pmaxsw      xmm0, xmm5

%if %1
    pavgw       xmm0, xmm1                  ;average with old dst
%endif
    movdqu      [rdi], xmm0                 ;store the row

    ;next row (lea leaves the flags alone; dec sets ZF for the caller's jnz)
    lea         rdi, [rdi + 2*rdx]
    lea         rsi, [rsi + 2*rax]
    dec         rcx
%endm
129
;-----------------------------------------------------------------------------
; HIGH_APPLY_FILTER_16 avg
;   Produces one row of sixteen 16-bit outputs and advances to the next row.
;   In : xmm0/xmm2 = first-tap samples for pixels 0-7 / 8-15
;        xmm1/xmm3 = second-tap samples for pixels 0-7 / 8-15
;        xmm7 = k3k4 pairs, xmm4 = rounding, xmm8 = max, xmm5 = 0
;        rsi/rdi = current source/destination row, rax/rdx = pitches,
;        rcx = rows left
;   Out: sixteen words stored (unaligned) at [rdi] and [rdi + 16]; rsi and
;        rdi stepped by 2*pitch bytes; rcx decremented, so ZF is set after
;        the last row.
;   %1 : non-zero -> result is averaged (pavgw) with the words already
;        present at the destination before being stored.
;   Clobbers xmm0, xmm2, xmm6, xmm9, and xmm1/xmm3 when %1 is non-zero.
;-----------------------------------------------------------------------------
%macro HIGH_APPLY_FILTER_16 1
    ;interleave the taps for pixels 0-7: low half in xmm0, high half in xmm9
    movdqa      xmm9, xmm0
    punpcklwd   xmm0, xmm1
    punpckhwd   xmm9, xmm1

    ;interleave the taps for pixels 8-15: low half in xmm2, high half in xmm6
    movdqa      xmm6, xmm2
    punpcklwd   xmm2, xmm3
    punpckhwd   xmm6, xmm3

%if %1
    ;xmm1/xmm3 are free now: fetch the old destination row
    movdqu      xmm1, [rdi]
    movdqu      xmm3, [rdi + 16]
%endif

    ;pixels 0-7: multiply-accumulate, round, shift, pack, clamp
    pmaddwd     xmm0, xmm7
    pmaddwd     xmm9, xmm7
    paddd       xmm0, xmm4
    paddd       xmm9, xmm4
    psrad       xmm0, 7
    psrad       xmm9, 7
    packssdw    xmm0, xmm9
    pminsw      xmm0, xmm8
    pmaxsw      xmm0, xmm5

    ;pixels 8-15: same steps
    pmaddwd     xmm2, xmm7
    pmaddwd     xmm6, xmm7
    paddd       xmm2, xmm4
    paddd       xmm6, xmm4
    psrad       xmm2, 7
    psrad       xmm6, 7
    packssdw    xmm2, xmm6
    pminsw      xmm2, xmm8
    pmaxsw      xmm2, xmm5

%if %1
    pavgw       xmm0, xmm1                  ;average with old dst
    pavgw       xmm2, xmm3
%endif
    movdqu      [rdi], xmm0                 ;store the row
    movdqu      [rdi + 16], xmm2

    ;next row (lea leaves the flags alone; dec sets ZF for the caller's jnz)
    lea         rdi, [rdi + 2*rdx]
    lea         rsi, [rsi + 2*rax]
    dec         rcx
%endm
175%endif
176
;-----------------------------------------------------------------------------
; aom_highbd_filter_block1d4_v2_sse2
;   Vertical 2-tap filter, 4 samples per row, 16-bit samples.
;   Arguments (see HIGH_GET_PARAM_4): arg(0) src_ptr, arg(1) pixels_per_line,
;   arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height,
;   arg(5) filter ptr, arg(6) bps.
;   Per row: dst[x] = clamp((k3*src[x] + k4*src[x + pitch] + 64) >> 7).
;   Runs until the row counter reaches zero after a decrement.
;-----------------------------------------------------------------------------
global sym(aom_highbd_filter_block1d4_v2_sse2) PRIVATE
sym(aom_highbd_filter_block1d4_v2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM_4
.next_row:
    movq        xmm1, [rsi + 2*rax]         ;row below
    movq        xmm0, [rsi]                 ;current row

    HIGH_APPLY_FILTER_4 0
    jnz         .next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
200
201%if ARCH_X86_64
;-----------------------------------------------------------------------------
; aom_highbd_filter_block1d8_v2_sse2
;   Vertical 2-tap filter, 8 samples per row, 16-bit samples.
;   Arguments (see HIGH_GET_PARAM): arg(0) src_ptr, arg(1) pixels_per_line,
;   arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height,
;   arg(5) filter ptr, arg(6) bps.
;   Per row: dst[x] = clamp((k3*src[x] + k4*src[x + pitch] + 64) >> 7).
;   Uses xmm6-xmm8, hence SAVE_XMM 8.
;-----------------------------------------------------------------------------
global sym(aom_highbd_filter_block1d8_v2_sse2) PRIVATE
sym(aom_highbd_filter_block1d8_v2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 8
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM
.next_row:
    movdqu      xmm1, [rsi + 2*rax]         ;row below
    movdqu      xmm0, [rsi]                 ;current row

    HIGH_APPLY_FILTER_8 0
    jnz         .next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
227
;-----------------------------------------------------------------------------
; aom_highbd_filter_block1d16_v2_sse2
;   Vertical 2-tap filter, 16 samples per row, 16-bit samples.
;   Arguments (see HIGH_GET_PARAM): arg(0) src_ptr, arg(1) pixels_per_line,
;   arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height,
;   arg(5) filter ptr, arg(6) bps.
;   Per row: dst[x] = clamp((k3*src[x] + k4*src[x + pitch] + 64) >> 7).
;   Uses xmm6-xmm9, hence SAVE_XMM 9.
;-----------------------------------------------------------------------------
global sym(aom_highbd_filter_block1d16_v2_sse2) PRIVATE
sym(aom_highbd_filter_block1d16_v2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 9
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM
.next_row:
    ;current row, pixels 0-7 and 8-15
    movdqu      xmm0, [rsi]
    movdqu      xmm2, [rsi + 16]
    ;row below, pixels 0-7 and 8-15
    movdqu      xmm1, [rsi + 2*rax]
    movdqu      xmm3, [rsi + 2*rax + 16]

    HIGH_APPLY_FILTER_16 0
    jnz         .next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
255%endif
256
;-----------------------------------------------------------------------------
; aom_highbd_filter_block1d4_v2_avg_sse2
;   Vertical 2-tap filter, 4 samples per row, 16-bit samples; the filtered
;   row is averaged (pavgw) with what is already in the destination.
;   Arguments (see HIGH_GET_PARAM_4): arg(0) src_ptr, arg(1) pixels_per_line,
;   arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height,
;   arg(5) filter ptr, arg(6) bps.
;-----------------------------------------------------------------------------
global sym(aom_highbd_filter_block1d4_v2_avg_sse2) PRIVATE
sym(aom_highbd_filter_block1d4_v2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM_4
.next_row:
    movq        xmm1, [rsi + 2*rax]         ;row below
    movq        xmm0, [rsi]                 ;current row

    HIGH_APPLY_FILTER_4 1
    jnz         .next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
280
281%if ARCH_X86_64
;-----------------------------------------------------------------------------
; aom_highbd_filter_block1d8_v2_avg_sse2
;   Vertical 2-tap filter, 8 samples per row, 16-bit samples; the filtered
;   row is averaged (pavgw) with what is already in the destination.
;   Arguments (see HIGH_GET_PARAM): arg(0) src_ptr, arg(1) pixels_per_line,
;   arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height,
;   arg(5) filter ptr, arg(6) bps.
;   Uses xmm6-xmm8, hence SAVE_XMM 8.
;-----------------------------------------------------------------------------
global sym(aom_highbd_filter_block1d8_v2_avg_sse2) PRIVATE
sym(aom_highbd_filter_block1d8_v2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 8
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM
.next_row:
    movdqu      xmm1, [rsi + 2*rax]         ;row below
    movdqu      xmm0, [rsi]                 ;current row

    HIGH_APPLY_FILTER_8 1
    jnz         .next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
307
;-----------------------------------------------------------------------------
; aom_highbd_filter_block1d16_v2_avg_sse2
;   Vertical 2-tap filter, 16 samples per row, 16-bit samples; the filtered
;   row is averaged (pavgw) with what is already in the destination.
;   Arguments (see HIGH_GET_PARAM): arg(0) src_ptr, arg(1) pixels_per_line,
;   arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height,
;   arg(5) filter ptr, arg(6) bps.
;   Uses xmm6-xmm9, hence SAVE_XMM 9.
;-----------------------------------------------------------------------------
global sym(aom_highbd_filter_block1d16_v2_avg_sse2) PRIVATE
sym(aom_highbd_filter_block1d16_v2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 9
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM
.next_row:
    ;current row, pixels 0-7 and 8-15
    movdqu      xmm0, [rsi]
    movdqu      xmm2, [rsi + 16]
    ;row below, pixels 0-7 and 8-15
    movdqu      xmm1, [rsi + 2*rax]
    movdqu      xmm3, [rsi + 2*rax + 16]

    HIGH_APPLY_FILTER_16 1
    jnz         .next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
335%endif
336
;-----------------------------------------------------------------------------
; aom_highbd_filter_block1d4_h2_sse2
;   Horizontal 2-tap filter, 4 samples per row, 16-bit samples.
;   Arguments (see HIGH_GET_PARAM_4): arg(0) src_ptr, arg(1) pixels_per_line,
;   arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height,
;   arg(5) filter ptr, arg(6) bps.
;   Per row: dst[x] = clamp((k3*src[x] + k4*src[x + 1] + 64) >> 7), x = 0..3.
;
;   A row needs samples 0..4, i.e. 10 bytes. They are fetched with two
;   8-byte loads at [rsi] and [rsi + 2] (the same offset pattern the 8- and
;   16-wide h2 kernels use) so that no byte past sample 4 is touched.
;   HIGH_APPLY_FILTER_4 only consumes the low four words of xmm0/xmm1, so
;   the stored result is the same as with a single 16-byte load.
;-----------------------------------------------------------------------------
global sym(aom_highbd_filter_block1d4_h2_sse2) PRIVATE
sym(aom_highbd_filter_block1d4_h2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM_4
.loop:
    movq        xmm0, [rsi]                 ;samples 0..3
    movq        xmm1, [rsi + 2]             ;samples 1..4

    HIGH_APPLY_FILTER_4 0
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
361
362%if ARCH_X86_64
;-----------------------------------------------------------------------------
; aom_highbd_filter_block1d8_h2_sse2
;   Horizontal 2-tap filter, 8 samples per row, 16-bit samples.
;   Arguments (see HIGH_GET_PARAM): arg(0) src_ptr, arg(1) pixels_per_line,
;   arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height,
;   arg(5) filter ptr, arg(6) bps.
;   Per row: dst[x] = clamp((k3*src[x] + k4*src[x + 1] + 64) >> 7), x = 0..7.
;   Uses xmm6-xmm8, hence SAVE_XMM 8.
;-----------------------------------------------------------------------------
global sym(aom_highbd_filter_block1d8_h2_sse2) PRIVATE
sym(aom_highbd_filter_block1d8_h2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 8
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM
.next_row:
    movdqu      xmm1, [rsi + 2]             ;samples 1..8
    movdqu      xmm0, [rsi]                 ;samples 0..7

    HIGH_APPLY_FILTER_8 0
    jnz         .next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
388
;-----------------------------------------------------------------------------
; aom_highbd_filter_block1d16_h2_sse2
;   Horizontal 2-tap filter, 16 samples per row, 16-bit samples.
;   Arguments (see HIGH_GET_PARAM): arg(0) src_ptr, arg(1) pixels_per_line,
;   arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height,
;   arg(5) filter ptr, arg(6) bps.
;   Per row: dst[x] = clamp((k3*src[x] + k4*src[x + 1] + 64) >> 7), x = 0..15.
;   Uses xmm6-xmm9, hence SAVE_XMM 9.
;-----------------------------------------------------------------------------
global sym(aom_highbd_filter_block1d16_h2_sse2) PRIVATE
sym(aom_highbd_filter_block1d16_h2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 9
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM
.next_row:
    ;first tap: samples 0..7 and 8..15
    movdqu      xmm0, [rsi]
    movdqu      xmm2, [rsi + 16]
    ;second tap: the same windows shifted by one sample (2 bytes)
    movdqu      xmm1, [rsi + 2]
    movdqu      xmm3, [rsi + 18]

    HIGH_APPLY_FILTER_16 0
    jnz         .next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
416%endif
417
;-----------------------------------------------------------------------------
; aom_highbd_filter_block1d4_h2_avg_sse2
;   Horizontal 2-tap filter, 4 samples per row, 16-bit samples; the filtered
;   row is averaged (pavgw) with what is already in the destination.
;   Arguments (see HIGH_GET_PARAM_4): arg(0) src_ptr, arg(1) pixels_per_line,
;   arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height,
;   arg(5) filter ptr, arg(6) bps.
;
;   A row needs samples 0..4, i.e. 10 bytes. They are fetched with two
;   8-byte loads at [rsi] and [rsi + 2] (the same offset pattern the 8- and
;   16-wide h2 kernels use) so that no byte past sample 4 is touched.
;   HIGH_APPLY_FILTER_4 only consumes the low four words of xmm0/xmm1, so
;   the stored result is the same as with a single 16-byte load.
;-----------------------------------------------------------------------------
global sym(aom_highbd_filter_block1d4_h2_avg_sse2) PRIVATE
sym(aom_highbd_filter_block1d4_h2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM_4
.loop:
    movq        xmm0, [rsi]                 ;samples 0..3
    movq        xmm1, [rsi + 2]             ;samples 1..4

    HIGH_APPLY_FILTER_4 1
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
442
443%if ARCH_X86_64
;-----------------------------------------------------------------------------
; aom_highbd_filter_block1d8_h2_avg_sse2
;   Horizontal 2-tap filter, 8 samples per row, 16-bit samples; the filtered
;   row is averaged (pavgw) with what is already in the destination.
;   Arguments (see HIGH_GET_PARAM): arg(0) src_ptr, arg(1) pixels_per_line,
;   arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height,
;   arg(5) filter ptr, arg(6) bps.
;   Uses xmm6-xmm8, hence SAVE_XMM 8.
;-----------------------------------------------------------------------------
global sym(aom_highbd_filter_block1d8_h2_avg_sse2) PRIVATE
sym(aom_highbd_filter_block1d8_h2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 8
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM
.next_row:
    movdqu      xmm1, [rsi + 2]             ;samples 1..8
    movdqu      xmm0, [rsi]                 ;samples 0..7

    HIGH_APPLY_FILTER_8 1
    jnz         .next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
469
;-----------------------------------------------------------------------------
; aom_highbd_filter_block1d16_h2_avg_sse2
;   Horizontal 2-tap filter, 16 samples per row, 16-bit samples; the filtered
;   row is averaged (pavgw) with what is already in the destination.
;   Arguments (see HIGH_GET_PARAM): arg(0) src_ptr, arg(1) pixels_per_line,
;   arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height,
;   arg(5) filter ptr, arg(6) bps.
;   Uses xmm6-xmm9, hence SAVE_XMM 9.
;-----------------------------------------------------------------------------
global sym(aom_highbd_filter_block1d16_h2_avg_sse2) PRIVATE
sym(aom_highbd_filter_block1d16_h2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 9
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM
.next_row:
    ;first tap: samples 0..7 and 8..15
    movdqu      xmm0, [rsi]
    movdqu      xmm2, [rsi + 16]
    ;second tap: the same windows shifted by one sample (2 bytes)
    movdqu      xmm1, [rsi + 2]
    movdqu      xmm3, [rsi + 18]

    HIGH_APPLY_FILTER_16 1
    jnz         .next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
497%endif