;
; Copyright (c) 2021, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 3-Clause Clear License and the
; Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear License was
; not distributed with this source code in the LICENSE file, you can obtain it
; at aomedia.org/license/software-license/bsd-3-c-c/. If the Alliance for Open Media Patent
; License 1.0 was not distributed with this source code in the PATENTS file, you
; can obtain it at aomedia.org/license/patent-license/.
;

;

; NOTE(review): arg(), sym(), globalsym(), SHADOW_ARGS_TO_STACK, SAVE_XMM and
; friends are not defined in this file; presumably they come from this include.
%include "aom_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; GET_PARAM_4
; Loads the function arguments and filter constants for the 4-pixel-wide
; kernels (used together with APPLY_FILTER_4).
;
; Out:
;   rsi  = arg(0), source pointer
;   rdi  = arg(2), output pointer
;   rax  = arg(1), pixels_per_line (source stride), sign-extended dword
;   rdx  = arg(3), out_pitch (output stride), sign-extended dword
;   rcx  = arg(4), output_height (row counter), sign-extended dword
;   xmm2 = 0, used to widen bytes to words
;   xmm3 = 64 (0x0040) in every 16-bit lane, rounding term for the >> 7
;   xmm4 = filter word 3 (k3) in lanes 0-3, filter word 4 (k4) in lanes 4-7
;
; The filter table at arg(5) is read with movdqa and therefore has to be
; 16-byte aligned. rdx and rcx are used as scratch before receiving their
; final values, and xmm3 temporarily holds the raw filter table.
;-----------------------------------------------------------------------
%macro GET_PARAM_4 0
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x0400040              ;two words of 64 (0x0040)

    movdqa      xmm3, [rdx]                 ;load filters (8 x 16-bit)
    pshuflw     xmm4, xmm3, 11111111b       ;k3 in the low four words
    psrldq      xmm3, 8                     ;bring filter words 4-7 down
    pshuflw     xmm3, xmm3, 0b              ;k4 in the low four words
    punpcklqdq  xmm4, xmm3                  ;k3k4: low qword k3, high qword k4

    movq        xmm3, rcx                   ;rounding
    pshufd      xmm3, xmm3, 0               ;replicate to all eight words

    pxor        xmm2, xmm2                  ;zero for byte->word unpack

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
%endm

;-----------------------------------------------------------------------
; APPLY_FILTER_4 avg
; Filters one row of 4 pixels and advances to the next row.
;
; In:
;   xmm0 = 4 source bytes (low dword) to be weighted by k3
;   xmm1 = 4 source bytes (low dword) to be weighted by k4
;   xmm2 = 0, xmm3 = rounding, xmm4 = k3|k4 (as set up by GET_PARAM_4)
;   rsi/rdi = source/output pointers, rax/rdx = their strides, rcx = rows left
; %1 (avg): when non-zero the result is averaged (pavgb) with the 4 bytes
;           already stored at [rdi] before being written back.
;
; Per pixel: out = pack_u8((a*k3 + b*k4 + 64) >> 7), using 16-bit lanes
; with saturating adds (paddsw).
;
; Out: 4 bytes written at [rdi]; rsi += rax; rdi += rdx; rcx -= 1.
; The trailing dec leaves ZF set when the last row has been processed, so
; the caller can follow the macro directly with jnz. Clobbers xmm0, xmm1.
;-----------------------------------------------------------------------
%macro APPLY_FILTER_4 1

    punpckldq   xmm0, xmm1                  ;two row in one register
    punpcklbw   xmm0, xmm2                  ;unpack to word
    pmullw      xmm0, xmm4                  ;multiply the filter factors

    movdqa      xmm1, xmm0
    psrldq      xmm1, 8                     ;k4 products down to lanes 0-3
    paddsw      xmm0, xmm1                  ;a*k3 + b*k4

    paddsw      xmm0, xmm3                  ;rounding
    psraw       xmm0, 7                     ;shift
    packuswb    xmm0, xmm0                  ;pack to byte

%if %1
    movd        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif

    movd        [rdi], xmm0
    lea         rsi, [rsi + rax]            ;next source row (lea keeps flags)
    lea         rdi, [rdi + rdx]            ;next output row
    dec         rcx                         ;sets ZF for the caller's jnz
%endm

;-----------------------------------------------------------------------
; GET_PARAM
; Loads the function arguments and filter constants for the 8- and
; 16-pixel-wide kernels (used with APPLY_FILTER_8 / APPLY_FILTER_16).
;
; Out:
;   rsi  = arg(0), source pointer
;   rdi  = arg(2), output pointer
;   rax  = arg(1), pixels_per_line (source stride), sign-extended dword
;   rdx  = arg(3), out_pitch (output stride), sign-extended dword
;   rcx  = arg(4), output_height (row counter), sign-extended dword
;   xmm4 = 64 (0x0040) in every 16-bit lane, rounding term for the >> 7
;   xmm5 = 0, used to widen bytes to words
;   xmm6 = filter word 3 (k3) in all eight lanes
;   xmm7 = filter word 4 (k4) in all eight lanes
;
; The filter table at arg(5) is read with movdqa and therefore has to be
; 16-byte aligned. xmm6/xmm7 are written here; the callers in this file
; invoke SAVE_XMM 7 before expanding this macro.
;-----------------------------------------------------------------------
%macro GET_PARAM 0
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x0400040              ;two words of 64 (0x0040)

    movdqa      xmm7, [rdx]                 ;load filters (8 x 16-bit)

    pshuflw     xmm6, xmm7, 11111111b       ;k3 in the low four words
    pshufhw     xmm7, xmm7, 0b              ;k4 in the high four words
    punpcklwd   xmm6, xmm6                  ;k3 in all eight words
    punpckhwd   xmm7, xmm7                  ;k4 in all eight words

    movq        xmm4, rcx                   ;rounding
    pshufd      xmm4, xmm4, 0               ;replicate to all eight words

    pxor        xmm5, xmm5                  ;zero for byte->word unpack

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
%endm

;-----------------------------------------------------------------------
; APPLY_FILTER_8 avg
; Filters one row of 8 pixels and advances to the next row.
;
; In:
;   xmm0 = 8 source bytes (low qword) to be weighted by k3
;   xmm1 = 8 source bytes (low qword) to be weighted by k4
;   xmm4 = rounding, xmm5 = 0, xmm6 = k3, xmm7 = k4 (as set up by GET_PARAM)
;   rsi/rdi = source/output pointers, rax/rdx = their strides, rcx = rows left
; %1 (avg): when non-zero the result is averaged (pavgb) with the 8 bytes
;           already stored at [rdi] before being written back.
;
; Per pixel: out = pack_u8((a*k3 + b*k4 + 64) >> 7), using 16-bit lanes
; with saturating adds (paddsw).
;
; Out: 8 bytes written at [rdi]; rsi += rax; rdi += rdx; rcx -= 1.
; The trailing dec leaves ZF set when the last row has been processed, so
; the caller can follow the macro directly with jnz. Clobbers xmm0, xmm1.
;-----------------------------------------------------------------------
%macro APPLY_FILTER_8 1
    punpcklbw   xmm0, xmm5                  ;widen k3 operand to words
    punpcklbw   xmm1, xmm5                  ;widen k4 operand to words

    pmullw      xmm0, xmm6
    pmullw      xmm1, xmm7
    paddsw      xmm0, xmm1                  ;a*k3 + b*k4
    paddsw      xmm0, xmm4                  ;rounding
    psraw       xmm0, 7                     ;shift
    packuswb    xmm0, xmm0                  ;pack back to byte
%if %1
    movq        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movq        [rdi], xmm0                 ;store the result

    lea         rsi, [rsi + rax]            ;next source row (lea keeps flags)
    lea         rdi, [rdi + rdx]            ;next output row
    dec         rcx                         ;sets ZF for the caller's jnz
%endm

;-----------------------------------------------------------------------
; APPLY_FILTER_16 avg
; Filters one row of 16 pixels and advances to the next row.
;
; In:
;   xmm0, xmm2 = the same 16 source bytes, to be weighted by k3
;                (xmm0 supplies bytes 0-7, xmm2 supplies bytes 8-15)
;   xmm1, xmm3 = the same 16 source bytes, to be weighted by k4
;                (xmm1 supplies bytes 0-7, xmm3 supplies bytes 8-15)
;   xmm4 = rounding, xmm5 = 0, xmm6 = k3, xmm7 = k4 (as set up by GET_PARAM)
;   rsi/rdi = source/output pointers, rax/rdx = their strides, rcx = rows left
; %1 (avg): when non-zero the result is averaged (pavgb) with the 16 bytes
;           already stored at [rdi] before being written back.
;
; Per pixel: out = pack_u8((a*k3 + b*k4 + 64) >> 7), using 16-bit lanes
; with saturating adds (paddsw).
;
; Out: 16 bytes written at [rdi] with an unaligned store; rsi += rax;
; rdi += rdx; rcx -= 1. The trailing dec leaves ZF set when the last row
; has been processed, so the caller can follow the macro directly with
; jnz. Clobbers xmm0-xmm3.
;-----------------------------------------------------------------------
%macro APPLY_FILTER_16 1
    punpcklbw   xmm0, xmm5                  ;k3 operand, pixels 0-7
    punpcklbw   xmm1, xmm5                  ;k4 operand, pixels 0-7
    punpckhbw   xmm2, xmm5                  ;k3 operand, pixels 8-15
    punpckhbw   xmm3, xmm5                  ;k4 operand, pixels 8-15

    pmullw      xmm0, xmm6
    pmullw      xmm1, xmm7
    pmullw      xmm2, xmm6
    pmullw      xmm3, xmm7

    paddsw      xmm0, xmm1                  ;a*k3 + b*k4, pixels 0-7
    paddsw      xmm2, xmm3                  ;a*k3 + b*k4, pixels 8-15

    paddsw      xmm0, xmm4                  ;rounding
    paddsw      xmm2, xmm4
    psraw       xmm0, 7                     ;shift
    psraw       xmm2, 7
    packuswb    xmm0, xmm2                  ;pack back to byte
%if %1
    movdqu      xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movdqu      [rdi], xmm0                 ;store the result

    lea         rsi, [rsi + rax]            ;next source row (lea keeps flags)
    lea         rdi, [rdi + rdx]            ;next output row
    dec         rcx                         ;sets ZF for the caller's jnz
%endm

SECTION .text

;-----------------------------------------------------------------------
; aom_filter_block1d4_v2_sse2
; 2-tap filter across rows, 4 pixels per row: every output pixel combines
; the source pixel with the pixel one source stride below it.
;
; Args (as read through arg(n)):
;   arg(0) src_ptr          arg(1) pixels_per_line (dword)
;   arg(2) output_ptr       arg(3) out_pitch (dword)
;   arg(4) output_height    arg(5) filter (16-byte aligned, 16-bit taps)
;
; For each row:
;   out[i] = pack_u8((src[i]*k3 + src[i + pixels_per_line]*k4 + 64) >> 7)
; where k3/k4 are filter words 3 and 4.
;
; NOTE: the loop body runs before the row counter is tested, so
; output_height must be at least 1.
; Uses xmm0-xmm4 only, so no SAVE_XMM is issued here.
;-----------------------------------------------------------------------
globalsym(aom_filter_block1d4_v2_sse2)
sym(aom_filter_block1d4_v2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
.loop:
    movd        xmm0, [rsi]                 ;load src
    movd        xmm1, [rsi + rax]           ;same columns, next row

    APPLY_FILTER_4 0
    jnz         .loop                       ;ZF comes from dec rcx in the macro

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
; aom_filter_block1d8_v2_sse2
; 2-tap filter across rows, 8 pixels per row: every output pixel combines
; the source pixel with the pixel one source stride below it.
;
; Args (as read through arg(n)):
;   arg(0) src_ptr          arg(1) pixels_per_line (dword)
;   arg(2) output_ptr       arg(3) out_pitch (dword)
;   arg(4) output_height    arg(5) filter (16-byte aligned, 16-bit taps)
;
; For each row:
;   out[i] = pack_u8((src[i]*k3 + src[i + pixels_per_line]*k4 + 64) >> 7)
; where k3/k4 are filter words 3 and 4.
;
; NOTE: the loop body runs before the row counter is tested, so
; output_height must be at least 1.
; xmm6/xmm7 are written by GET_PARAM; SAVE_XMM 7 / RESTORE_XMM bracket the
; body (presumably to preserve them where the ABI requires it).
;-----------------------------------------------------------------------
globalsym(aom_filter_block1d8_v2_sse2)
sym(aom_filter_block1d8_v2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movq        xmm0, [rsi]                 ;0
    movq        xmm1, [rsi + rax]           ;1

    APPLY_FILTER_8 0
    jnz         .loop                       ;ZF comes from dec rcx in the macro

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
; aom_filter_block1d16_v2_sse2
; 2-tap filter across rows, 16 pixels per row: every output pixel combines
; the source pixel with the pixel one source stride below it.
;
; Args (as read through arg(n)):
;   arg(0) src_ptr          arg(1) pixels_per_line (dword)
;   arg(2) output_ptr       arg(3) out_pitch (dword)
;   arg(4) output_height    arg(5) filter (16-byte aligned, 16-bit taps)
;
; For each row:
;   out[i] = pack_u8((src[i]*k3 + src[i + pixels_per_line]*k4 + 64) >> 7)
; where k3/k4 are filter words 3 and 4.
;
; Source and output rows are accessed with unaligned 16-byte moves.
; NOTE: the loop body runs before the row counter is tested, so
; output_height must be at least 1.
; xmm6/xmm7 are written by GET_PARAM; SAVE_XMM 7 / RESTORE_XMM bracket the
; body (presumably to preserve them where the ABI requires it).
;-----------------------------------------------------------------------
globalsym(aom_filter_block1d16_v2_sse2)
sym(aom_filter_block1d16_v2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movdqu      xmm0, [rsi]                 ;0
    movdqu      xmm1, [rsi + rax]           ;1
    movdqa      xmm2, xmm0                  ;copy for the high 8 pixels
    movdqa      xmm3, xmm1

    APPLY_FILTER_16 0
    jnz         .loop                       ;ZF comes from dec rcx in the macro

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
; aom_filter_block1d4_h2_sse2
; 2-tap filter along a row, 4 pixels per row: every output pixel combines
; the source pixel with its right-hand neighbour (next byte).
;
; Args (as read through arg(n)):
;   arg(0) src_ptr          arg(1) pixels_per_line (dword)
;   arg(2) output_ptr       arg(3) out_pitch (dword)
;   arg(4) output_height    arg(5) filter (16-byte aligned, 16-bit taps)
;
; For each row:
;   out[i] = pack_u8((src[i]*k3 + src[i + 1]*k4 + 64) >> 7)
; where k3/k4 are filter words 3 and 4.
;
; NOTE: each row is fetched with a 16-byte movdqu although only bytes 0-4
; contribute to the result, so 16 bytes must be readable at every row start.
; NOTE: the loop body runs before the row counter is tested, so
; output_height must be at least 1.
; Uses xmm0-xmm4 only, so no SAVE_XMM is issued here.
;-----------------------------------------------------------------------
globalsym(aom_filter_block1d4_h2_sse2)
sym(aom_filter_block1d4_h2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
.loop:
    movdqu      xmm0, [rsi]                 ;load src
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1                     ;src shifted by one pixel

    APPLY_FILTER_4 0
    jnz         .loop                       ;ZF comes from dec rcx in the macro

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
; aom_filter_block1d8_h2_sse2
; 2-tap filter along a row, 8 pixels per row: every output pixel combines
; the source pixel with its right-hand neighbour (next byte).
;
; Args (as read through arg(n)):
;   arg(0) src_ptr          arg(1) pixels_per_line (dword)
;   arg(2) output_ptr       arg(3) out_pitch (dword)
;   arg(4) output_height    arg(5) filter (16-byte aligned, 16-bit taps)
;
; For each row:
;   out[i] = pack_u8((src[i]*k3 + src[i + 1]*k4 + 64) >> 7)
; where k3/k4 are filter words 3 and 4.
;
; NOTE: each row is fetched with a 16-byte movdqu although only bytes 0-8
; contribute to the result, so 16 bytes must be readable at every row start.
; NOTE: the loop body runs before the row counter is tested, so
; output_height must be at least 1.
; xmm6/xmm7 are written by GET_PARAM; SAVE_XMM 7 / RESTORE_XMM bracket the
; body (presumably to preserve them where the ABI requires it).
;-----------------------------------------------------------------------
globalsym(aom_filter_block1d8_h2_sse2)
sym(aom_filter_block1d8_h2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movdqu      xmm0, [rsi]                 ;load src
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1                     ;src shifted by one pixel

    APPLY_FILTER_8 0
    jnz         .loop                       ;ZF comes from dec rcx in the macro

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
; aom_filter_block1d16_h2_sse2
; 2-tap filter along a row, 16 pixels per row: every output pixel combines
; the source pixel with its right-hand neighbour (next byte).
;
; Args (as read through arg(n)):
;   arg(0) src_ptr          arg(1) pixels_per_line (dword)
;   arg(2) output_ptr       arg(3) out_pitch (dword)
;   arg(4) output_height    arg(5) filter (16-byte aligned, 16-bit taps)
;
; For each row:
;   out[i] = pack_u8((src[i]*k3 + src[i + 1]*k4 + 64) >> 7)
; where k3/k4 are filter words 3 and 4.
;
; Each row is fetched with two unaligned 16-byte loads, at [rsi] and
; [rsi + 1], so 17 bytes are read per row.
; NOTE: the loop body runs before the row counter is tested, so
; output_height must be at least 1.
; xmm6/xmm7 are written by GET_PARAM; SAVE_XMM 7 / RESTORE_XMM bracket the
; body (presumably to preserve them where the ABI requires it).
;-----------------------------------------------------------------------
globalsym(aom_filter_block1d16_h2_sse2)
sym(aom_filter_block1d16_h2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movdqu      xmm0, [rsi]                 ;load src
    movdqu      xmm1, [rsi + 1]             ;src shifted by one pixel
    movdqa      xmm2, xmm0                  ;copy for the high 8 pixels
    movdqa      xmm3, xmm1

    APPLY_FILTER_16 0
    jnz         .loop                       ;ZF comes from dec rcx in the macro

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret