;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

; ABI helper macros. NOTE(review): arg(), sym(), SHADOW_ARGS_TO_STACK,
; SAVE_XMM, ALIGN_STACK etc. are used below but not defined in this file;
; presumably they all come from this include - confirm.
%include "aom_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; HIGH_GET_PARAM_4
; Loads the arguments and builds the constants used by the 4-pixel-wide
; filter loops (16-bit samples).
;
; Reads:  arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
;         arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr,
;         arg(6) bps
; Leaves: rsi  = src_ptr            rdi  = output_ptr
;         rax  = pixels_per_line    rdx  = out_pitch
;         rcx  = output_height
;         xmm4 = filter words 3 and 4 interleaved (k3,k4,k3,k4,...)
;         xmm3 = 0x40 in every dword (rounding term for the >>7 shift)
;         xmm5 = (1 << bps) - 1 in every word (upper clamp)
;         xmm2 = 0 (lower clamp)
; Clobbers: xmm1
; The filter table is read with movdqa, so it must be 16-byte aligned.
;-----------------------------------------------------------------------
%macro HIGH_GET_PARAM_4 0
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x00000040             ;rounding = 64 (half of 1 << 7)

    movdqa      xmm3, [rdx]                 ;load filters
    pshuflw     xmm4, xmm3, 11111111b       ;k3 (word 3 in the low 4 words)
    psrldq      xmm3, 8                     ;bring words 4..7 down to 0..3
    pshuflw     xmm3, xmm3, 0b              ;k4 (word 4 in the low 4 words)
    punpcklwd   xmm4, xmm3                  ;k3k4

    movq        xmm3, rcx                   ;rounding
    pshufd      xmm3, xmm3, 0               ;broadcast to all 4 dwords

    mov         rdx, 0x00010001             ;the value 1 in two adjacent words
    movsxd      rcx, DWORD PTR arg(6)       ;bps
    movq        xmm5, rdx
    movq        xmm2, rcx                   ;shift count for psllw
    pshufd      xmm5, xmm5, 0b              ;1 in every word
    movdqa      xmm1, xmm5                  ;keep the 1s for the subtraction
    psllw       xmm5, xmm2                  ;1 << bps in every word
    psubw       xmm5, xmm1                  ;max value (for clamping)
    pxor        xmm2, xmm2                  ;min value (for clamping)

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
%endm

;-----------------------------------------------------------------------
; HIGH_APPLY_FILTER_4 avg
; Filters one row of 4 pixels and advances to the next row.
;
; %1:     non-zero -> average the result with the 4 words already at
;         [rdi] (pavgw) before storing; zero -> plain store.
; In:     xmm0, xmm1 = the two tap inputs in their low 4 words
;         xmm4 = k3k4 pairs, xmm3 = rounding, xmm5 = max, xmm2 = zero
;         (as set up by HIGH_GET_PARAM_4)
;         rax / rdx = source / output pitch, scaled by 2 when applied
;         rcx = rows remaining
; Out:    4 words stored at [rdi]; rsi, rdi advanced by one row; rcx
;         decremented. ZF from "dec rcx" is the macro's implicit result:
;         the caller branches on it with jnz directly after the macro.
; Clobbers: xmm0, xmm1 (xmm1 only when %1 is non-zero)
;-----------------------------------------------------------------------
%macro HIGH_APPLY_FILTER_4 1

    punpcklwd   xmm0, xmm1                  ;two row in one register
    pmaddwd     xmm0, xmm4                  ;multiply the filter factors

    paddd       xmm0, xmm3                  ;rounding
    psrad       xmm0, 7                     ;shift
    packssdw    xmm0, xmm0                  ;pack to word

    ;clamp the values
    pminsw      xmm0, xmm5
    pmaxsw      xmm0, xmm2

%if %1
    movq        xmm1, [rdi]
    pavgw       xmm0, xmm1
%endif

    movq        [rdi], xmm0
    lea         rsi, [rsi + 2*rax]          ;lea leaves the flags untouched
    lea         rdi, [rdi + 2*rdx]
    dec         rcx                         ;sets ZF for the caller's jnz
%endm

;-----------------------------------------------------------------------
; HIGH_GET_PARAM
; Loads the arguments and builds the constants used by the 8- and
; 16-pixel-wide filter loops (16-bit samples).
;
; Requires: the single-line macros "max" and "min" to be defined by the
;         invoking function as two 16-byte memory operands; both are
;         written with movdqa, so the slots must be 16-byte aligned.
; Reads:  arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
;         arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr,
;         arg(6) bps
; Leaves: rsi  = src_ptr            rdi  = output_ptr
;         rax  = pixels_per_line    rdx  = out_pitch
;         rcx  = output_height
;         xmm7 = filter words 3 and 4 interleaved (k3,k4 x 4)
;         xmm4 = 0x40 in every dword (rounding term for the >>7 shift)
;         max  = (1 << bps) - 1 in every word (upper clamp)
;         min  = 0 (lower clamp)
; Clobbers: xmm1, xmm3, xmm5, xmm6
; The filter table is read with movdqa, so it must be 16-byte aligned.
;-----------------------------------------------------------------------
%macro HIGH_GET_PARAM 0
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x00000040             ;rounding = 64 (half of 1 << 7)

    movdqa      xmm6, [rdx]                 ;load filters

    pshuflw     xmm7, xmm6, 11111111b       ;k3 (word 3 in the low 4 words)
    pshufhw     xmm6, xmm6, 0b              ;k4 (word 4 in the high 4 words)
    psrldq      xmm6, 8                     ;move the k4 copies to the low half
    punpcklwd   xmm7, xmm6                  ;k3k4k3k4k3k4k3k4

    movq        xmm4, rcx                   ;rounding
    pshufd      xmm4, xmm4, 0               ;broadcast to all 4 dwords

    mov         rdx, 0x00010001             ;the value 1 in two adjacent words
    movsxd      rcx, DWORD PTR arg(6)       ;bps
    movq        xmm3, rdx
    movq        xmm5, rcx                   ;shift count for psllw
    pshufd      xmm3, xmm3, 0b              ;1 in every word
    movdqa      xmm1, xmm3                  ;keep the 1s for the subtraction
    psllw       xmm3, xmm5                  ;1 << bps in every word
    psubw       xmm3, xmm1                  ;max value (for clamping)
    pxor        xmm5, xmm5                  ;min value (for clamping)

    movdqa      max, xmm3                   ;spill clamps to the caller's slots
    movdqa      min, xmm5

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
%endm

;-----------------------------------------------------------------------
; HIGH_APPLY_FILTER_8 avg
; Filters one row of 8 pixels and advances to the next row.
;
; %1:     non-zero -> average the result with the 8 words already at
;         [rdi] (pavgw) before storing; zero -> plain store.
; In:     xmm0, xmm1 = the two tap inputs, 8 words each
;         xmm7 = k3k4 pairs, xmm4 = rounding, max / min = clamp bounds
;         (as set up by HIGH_GET_PARAM)
;         rax / rdx = source / output pitch, scaled by 2 when applied
;         rcx = rows remaining
; Out:    8 words stored at [rdi] (unaligned store); rsi, rdi advanced by
;         one row; rcx decremented. ZF from "dec rcx" is the macro's
;         implicit result, consumed by the caller's jnz.
; Clobbers: xmm0, xmm6 (and xmm1 when %1 is non-zero)
;-----------------------------------------------------------------------
%macro HIGH_APPLY_FILTER_8 1
    movdqa      xmm6, xmm0
    punpckhwd   xmm6, xmm1                  ;pairs for pixels 4..7
    punpcklwd   xmm0, xmm1                  ;pairs for pixels 0..3
    pmaddwd     xmm6, xmm7
    pmaddwd     xmm0, xmm7

    paddd       xmm6, xmm4                  ;rounding
    paddd       xmm0, xmm4                  ;rounding
    psrad       xmm6, 7                     ;shift
    psrad       xmm0, 7                     ;shift
    packssdw    xmm0, xmm6                  ;pack back to word

    ;clamp the values
    pminsw      xmm0, max
    pmaxsw      xmm0, min

%if %1
    movdqu      xmm1, [rdi]
    pavgw       xmm0, xmm1
%endif
    movdqu      [rdi], xmm0                 ;store the result

    lea         rsi, [rsi + 2*rax]          ;lea leaves the flags untouched
    lea         rdi, [rdi + 2*rdx]
    dec         rcx                         ;sets ZF for the caller's jnz
%endm

;-----------------------------------------------------------------------
; HIGH_APPLY_FILTER_16 avg
; Filters one row of 16 pixels and advances to the next row.
;
; %1:     non-zero -> average the result with the 16 words already at
;         [rdi] (pavgw) before storing; zero -> plain store.
; In:     xmm0, xmm1 = the two tap inputs for pixels 0..7
;         xmm2, xmm3 = the two tap inputs for pixels 8..15
;         xmm7 = k3k4 pairs, xmm4 = rounding, max / min = clamp bounds
;         (as set up by HIGH_GET_PARAM)
;         rax / rdx = source / output pitch, scaled by 2 when applied
;         rcx = rows remaining
; Out:    16 words stored at [rdi] (unaligned stores); rsi, rdi advanced
;         by one row; rcx decremented. ZF from "dec rcx" is the macro's
;         implicit result, consumed by the caller's jnz.
; Clobbers: xmm0, xmm2, xmm5, xmm6 (and xmm1, xmm3 when %1 is non-zero)
;-----------------------------------------------------------------------
%macro HIGH_APPLY_FILTER_16 1
    movdqa      xmm5, xmm0
    movdqa      xmm6, xmm2
    punpckhwd   xmm5, xmm1                  ;pairs for pixels 4..7
    punpckhwd   xmm6, xmm3                  ;pairs for pixels 12..15
    punpcklwd   xmm0, xmm1                  ;pairs for pixels 0..3
    punpcklwd   xmm2, xmm3                  ;pairs for pixels 8..11

    pmaddwd     xmm5, xmm7
    pmaddwd     xmm6, xmm7
    pmaddwd     xmm0, xmm7
    pmaddwd     xmm2, xmm7

    paddd       xmm5, xmm4                  ;rounding
    paddd       xmm6, xmm4
    paddd       xmm0, xmm4
    paddd       xmm2, xmm4

    psrad       xmm5, 7                     ;shift
    psrad       xmm6, 7
    psrad       xmm0, 7
    psrad       xmm2, 7

    packssdw    xmm0, xmm5                  ;pack back to word
    packssdw    xmm2, xmm6                  ;pack back to word

    ;clamp the values
    pminsw      xmm0, max
    pmaxsw      xmm0, min
    pminsw      xmm2, max
    pmaxsw      xmm2, min

%if %1
    movdqu      xmm1, [rdi]
    movdqu      xmm3, [rdi + 16]
    pavgw       xmm0, xmm1
    pavgw       xmm2, xmm3
%endif
    movdqu      [rdi], xmm0                 ;store the result
    movdqu      [rdi + 16], xmm2            ;store the result

    lea         rsi, [rsi + 2*rax]          ;lea leaves the flags untouched
    lea         rdi, [rdi + 2*rdx]
    dec         rcx                         ;sets ZF for the caller's jnz
%endm

; Everything above is macro definitions only; emitted code starts here.
SECTION .text

;-----------------------------------------------------------------------
; aom_highbd_filter_block1d4_v2_sse2
; Vertical two-coefficient filter, 4 pixels per row, 16-bit samples.
;
; Arguments (read through arg(n)):
;   0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
;   4 output_height, 5 filter ptr, 6 bps
; Each iteration combines the row at rsi with the row one source pitch
; further on and overwrites 4 output pixels (no averaging).
; The loop tests its counter after the body, so output_height is
; expected to be >= 1.
; Uses xmm0-xmm5 only; rsi and rdi are saved and restored here.
;-----------------------------------------------------------------------
global sym(aom_highbd_filter_block1d4_v2_sse2) PRIVATE
sym(aom_highbd_filter_block1d4_v2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM_4
.loop:
    movq        xmm0, [rsi]                 ;load src
    movq        xmm1, [rsi + 2*rax]         ;same columns, next row

    HIGH_APPLY_FILTER_4 0                   ;ends with dec rcx
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
; aom_highbd_filter_block1d8_v2_sse2
; Vertical two-coefficient filter, 8 pixels per row, 16-bit samples.
;
; Arguments (read through arg(n)):
;   0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
;   4 output_height, 5 filter ptr, 6 bps
; Each iteration combines the row at rsi with the row one source pitch
; further on and overwrites 8 output pixels (no averaging).
; The loop tests its counter after the body, so output_height is
; expected to be >= 1.
; Reserves 32 bytes of stack for the clamp bounds "max" and "min".
;-----------------------------------------------------------------------
global sym(aom_highbd_filter_block1d8_v2_sse2) PRIVATE
sym(aom_highbd_filter_block1d8_v2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 8
    push        rsi
    push        rdi
    ; end prolog

    ; NOTE(review): ALIGN_STACK is defined in the include; presumably it
    ; aligns rsp to 16 and pushes the old rsp (rax as scratch) - confirm.
    ALIGN_STACK 16, rax
    sub         rsp, 16 * 2                 ;two 16-byte slots
    %define max [rsp + 16 * 0]
    %define min [rsp + 16 * 1]

    HIGH_GET_PARAM
.loop:
    movdqu      xmm0, [rsi]                 ;0
    movdqu      xmm1, [rsi + 2*rax]         ;1

    HIGH_APPLY_FILTER_8 0                   ;ends with dec rcx
    jnz         .loop

    add         rsp, 16 * 2                 ;release the max/min slots
    pop         rsp                         ;undo ALIGN_STACK

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
; aom_highbd_filter_block1d16_v2_sse2
; Vertical two-coefficient filter, 16 pixels per row, 16-bit samples.
;
; Arguments (read through arg(n)):
;   0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
;   4 output_height, 5 filter ptr, 6 bps
; Each iteration combines the row at rsi with the row one source pitch
; further on and overwrites 16 output pixels (no averaging).
; The loop tests its counter after the body, so output_height is
; expected to be >= 1.
; Reserves 32 bytes of stack for the clamp bounds "max" and "min".
;-----------------------------------------------------------------------
global sym(aom_highbd_filter_block1d16_v2_sse2) PRIVATE
sym(aom_highbd_filter_block1d16_v2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 9
    push        rsi
    push        rdi
    ; end prolog

    ; NOTE(review): ALIGN_STACK is defined in the include; presumably it
    ; aligns rsp to 16 and pushes the old rsp (rax as scratch) - confirm.
    ALIGN_STACK 16, rax
    sub         rsp, 16 * 2                 ;two 16-byte slots
    %define max [rsp + 16 * 0]
    %define min [rsp + 16 * 1]

    HIGH_GET_PARAM
.loop:
    movdqu      xmm0, [rsi]                 ;0
    movdqu      xmm2, [rsi + 16]            ;row 0, pixels 8..15
    movdqu      xmm1, [rsi + 2*rax]         ;1
    movdqu      xmm3, [rsi + 2*rax + 16]    ;row 1, pixels 8..15

    HIGH_APPLY_FILTER_16 0                  ;ends with dec rcx
    jnz         .loop

    add         rsp, 16 * 2                 ;release the max/min slots
    pop         rsp                         ;undo ALIGN_STACK

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
; aom_highbd_filter_block1d4_h2_sse2
; Horizontal two-coefficient filter, 4 pixels per row, 16-bit samples.
;
; Arguments (read through arg(n)):
;   0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
;   4 output_height, 5 filter ptr, 6 bps
; Each iteration combines every pixel with its right-hand neighbour and
; overwrites 4 output pixels (no averaging).
; The loop tests its counter after the body, so output_height is
; expected to be >= 1.
; Uses xmm0-xmm5 only; rsi and rdi are saved and restored here.
;-----------------------------------------------------------------------
global sym(aom_highbd_filter_block1d4_h2_sse2) PRIVATE
sym(aom_highbd_filter_block1d4_h2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM_4
.loop:
    ; NOTE(review): this reads 16 bytes per row although only the first
    ; 5 words (10 bytes) reach the result - the source must be readable
    ; that far.
    movdqu      xmm0, [rsi]                 ;load src
    movdqa      xmm1, xmm0
    psrldq      xmm1, 2                     ;shift down by one 16-bit pixel

    HIGH_APPLY_FILTER_4 0                   ;ends with dec rcx
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
; aom_highbd_filter_block1d8_h2_sse2
; Horizontal two-coefficient filter, 8 pixels per row, 16-bit samples.
;
; Arguments (read through arg(n)):
;   0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
;   4 output_height, 5 filter ptr, 6 bps
; Each iteration combines every pixel with its right-hand neighbour and
; overwrites 8 output pixels (no averaging).
; The loop tests its counter after the body, so output_height is
; expected to be >= 1.
; Reserves 32 bytes of stack for the clamp bounds "max" and "min".
;-----------------------------------------------------------------------
global sym(aom_highbd_filter_block1d8_h2_sse2) PRIVATE
sym(aom_highbd_filter_block1d8_h2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 8
    push        rsi
    push        rdi
    ; end prolog

    ; NOTE(review): ALIGN_STACK is defined in the include; presumably it
    ; aligns rsp to 16 and pushes the old rsp (rax as scratch) - confirm.
    ALIGN_STACK 16, rax
    sub         rsp, 16 * 2                 ;two 16-byte slots
    %define max [rsp + 16 * 0]
    %define min [rsp + 16 * 1]

    HIGH_GET_PARAM
.loop:
    movdqu      xmm0, [rsi]                 ;load src
    movdqu      xmm1, [rsi + 2]             ;same row, one pixel to the right

    HIGH_APPLY_FILTER_8 0                   ;ends with dec rcx
    jnz         .loop

    add         rsp, 16 * 2                 ;release the max/min slots
    pop         rsp                         ;undo ALIGN_STACK

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
; aom_highbd_filter_block1d16_h2_sse2
; Horizontal two-coefficient filter, 16 pixels per row, 16-bit samples.
;
; Arguments (read through arg(n)):
;   0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
;   4 output_height, 5 filter ptr, 6 bps
; Each iteration combines every pixel with its right-hand neighbour and
; overwrites 16 output pixels (no averaging).
; The loop tests its counter after the body, so output_height is
; expected to be >= 1.
; Reserves 32 bytes of stack for the clamp bounds "max" and "min".
;-----------------------------------------------------------------------
global sym(aom_highbd_filter_block1d16_h2_sse2) PRIVATE
sym(aom_highbd_filter_block1d16_h2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 9
    push        rsi
    push        rdi
    ; end prolog

    ; NOTE(review): ALIGN_STACK is defined in the include; presumably it
    ; aligns rsp to 16 and pushes the old rsp (rax as scratch) - confirm.
    ALIGN_STACK 16, rax
    sub         rsp, 16 * 2                 ;two 16-byte slots
    %define max [rsp + 16 * 0]
    %define min [rsp + 16 * 1]

    HIGH_GET_PARAM
.loop:
    movdqu      xmm0, [rsi]                 ;load src
    movdqu      xmm1, [rsi + 2]             ;pixels 1..8
    movdqu      xmm2, [rsi + 16]            ;pixels 8..15
    movdqu      xmm3, [rsi + 18]            ;pixels 9..16

    HIGH_APPLY_FILTER_16 0                  ;ends with dec rcx
    jnz         .loop

    add         rsp, 16 * 2                 ;release the max/min slots
    pop         rsp                         ;undo ALIGN_STACK

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret