; Provenance (repository web-viewer residue, kept for reference):
;   blob 1bf3abbf34f6ae0dd5d18b2df8ef235090803950
;   blame: Yaowu Xu, commit c27fc14, 2016-08-22 16:08:15 -0700
;
;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

; ABI helper macros used below (sym, arg, SHADOW_ARGS_TO_STACK, SAVE_XMM,
; RESTORE_XMM, UNSHADOW_ARGS, PRIVATE) come from this include.
%include "aom_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
;unsigned int vpx_highbd_calc16x16var_sse2
;(
;    unsigned char *  src_ptr,          arg(0)
;    int              source_stride,    arg(1)
;    unsigned char *  ref_ptr,          arg(2)
;    int              recon_stride,     arg(3)
;    unsigned int *   SSE,              arg(4)
;    int *            Sum               arg(5)
;)
;
; Walks 16 rows of 16 samples in both buffers and stores
;     *SSE = sum of (src - ref)^2      (32-bit store)
;     *Sum = sum of (src - ref)        (32-bit store)
;
; NOTE: the prototype spells the pointers as unsigned char *, but the
; code reads 16-bit words (psubw/pmaddwd on 32 bytes per row) and
; doubles both strides to obtain byte offsets, so the strides are
; counted in 16-bit elements.
;
; Arithmetic limits:
;   - psubw/paddw wrap at 16 bits. Each word lane of xmm5 collects four
;     differences per loop iteration before being widened to 32 bits.
;     NOTE(review): presumably callers pass samples narrow enough
;     (e.g. <= 12 bits) for this never to wrap -- confirm.
;   - The dword accumulators (paddd) also wrap silently.
;
; Return value: the routine never writes a result to rax. After the
; final stores rax still holds the Sum pointer unless one of the
; epilogue macros (defined in the included ABI file, not visible here)
; changes it. Callers should presumably use *SSE and *Sum only.
;
; Register roles:
;   rsi / rdi   current src / ref row pointer
;   rax / rdx   src / ref stride in bytes
;   rbx         scratch address for prefetching
;   rcx         rows remaining
;   xmm0        constant zero (used by the final reduction)
;   xmm5        per-iteration word sums of differences
;   xmm6        four dword partial sums of squared differences
;   xmm7        four dword partial sums of differences
;-----------------------------------------------------------------------
global sym(vpx_highbd_calc16x16var_sse2) PRIVATE
sym(vpx_highbd_calc16x16var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7                                  ; xmm6/xmm7 are used below
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                     ; [src_ptr]
    mov         rdi, arg(2)                     ; [ref_ptr]

    movsxd      rax, DWORD PTR arg(1)           ; [source_stride]
    movsxd      rdx, DWORD PTR arg(3)           ; [recon_stride]
    add         rax, rax                        ; source stride in bytes
    add         rdx, rdx                        ; recon stride in bytes

    ; Prefetch the first four rows of each buffer (both 16-byte halves).
    prefetcht0  [rsi]
    prefetcht0  [rsi+16]
    prefetcht0  [rsi+rax]
    prefetcht0  [rsi+rax+16]
    lea         rbx, [rsi+rax*2]
    prefetcht0  [rbx]
    prefetcht0  [rbx+16]
    prefetcht0  [rbx+rax]
    prefetcht0  [rbx+rax+16]

    prefetcht0  [rdi]
    prefetcht0  [rdi+16]
    prefetcht0  [rdi+rdx]
    prefetcht0  [rdi+rdx+16]
    lea         rbx, [rdi+rdx*2]
    prefetcht0  [rbx]
    prefetcht0  [rbx+16]
    prefetcht0  [rbx+rdx]
    prefetcht0  [rbx+rdx+16]

    pxor        xmm0, xmm0                      ; constant zero
    pxor        xmm7, xmm7                      ; sum of differences
    pxor        xmm6, xmm6                      ; sum of squared differences
    mov         rcx, 16                         ; rows to process

    ; Each iteration consumes two rows (4 x 8 samples) from each buffer.
.next_row_pair:
    movdqu      xmm1, XMMWORD PTR [rsi]         ; src row 0, samples 0-7
    movdqu      xmm2, XMMWORD PTR [rdi]         ; ref row 0, samples 0-7

    ; Prefetch the two rows the next iteration will read.
    lea         rbx, [rsi+rax*2]
    prefetcht0  [rbx]
    prefetcht0  [rbx+16]
    prefetcht0  [rbx+rax]
    prefetcht0  [rbx+rax+16]
    lea         rbx, [rdi+rdx*2]
    prefetcht0  [rbx]
    prefetcht0  [rbx+16]
    prefetcht0  [rbx+rdx]
    prefetcht0  [rbx+rdx+16]

    pxor        xmm5, xmm5                      ; restart the word sums

    ; row 0, samples 0-7
    psubw       xmm1, xmm2                      ; diff = src - ref
    movdqu      xmm3, XMMWORD PTR [rsi+16]
    paddw       xmm5, xmm1
    pmaddwd     xmm1, xmm1                      ; pairwise diff^2 -> dwords
    movdqu      xmm2, XMMWORD PTR [rdi+16]
    paddd       xmm6, xmm1

    ; row 0, samples 8-15
    psubw       xmm3, xmm2
    movdqu      xmm1, XMMWORD PTR [rsi+rax]
    paddw       xmm5, xmm3
    pmaddwd     xmm3, xmm3
    movdqu      xmm2, XMMWORD PTR [rdi+rdx]
    paddd       xmm6, xmm3

    ; row 1, samples 0-7
    psubw       xmm1, xmm2
    movdqu      xmm3, XMMWORD PTR [rsi+rax+16]
    paddw       xmm5, xmm1
    pmaddwd     xmm1, xmm1
    movdqu      xmm2, XMMWORD PTR [rdi+rdx+16]
    paddd       xmm6, xmm1

    ; row 1, samples 8-15
    psubw       xmm3, xmm2
    paddw       xmm5, xmm3
    pmaddwd     xmm3, xmm3
    paddd       xmm6, xmm3

    ; Sign-extend the eight word sums to dwords and fold them into xmm7.
    ; An arithmetic shift by 15 yields 0xFFFF for negative lanes and 0
    ; otherwise, i.e. exactly the high half needed by the unpacks.
    movdqa      xmm1, xmm5
    psraw       xmm1, 15                        ; per-lane sign mask
    movdqa      xmm2, xmm5
    punpcklwd   xmm5, xmm1                      ; lanes 0-3 as signed dwords
    punpckhwd   xmm2, xmm1                      ; lanes 4-7 as signed dwords
    paddd       xmm7, xmm5
    paddd       xmm7, xmm2

    lea         rsi, [rsi + 2*rax]              ; advance two rows
    lea         rdi, [rdi + 2*rdx]
    sub         rcx, 2
    jnz         .next_row_pair

    ; Horizontal reduction: fold the four dwords of each accumulator
    ; into its low dword.
    movdqa      xmm4, xmm6
    punpckldq   xmm6, xmm0                      ; [s0, 0, s1, 0]
    punpckhdq   xmm4, xmm0                      ; [s2, 0, s3, 0]
    paddd       xmm6, xmm4                      ; [s0+s2, 0, s1+s3, 0]
    movdqa      xmm4, xmm6
    psrldq      xmm4, 8
    paddd       xmm6, xmm4                      ; low dword = total SSE

    movdqa      xmm5, xmm7
    punpckldq   xmm7, xmm0
    punpckhdq   xmm5, xmm0
    paddd       xmm7, xmm5
    movdqa      xmm5, xmm7
    psrldq      xmm5, 8
    paddd       xmm7, xmm5                      ; low dword = total sum

    mov         rdi, arg(4)                     ; [SSE]
    mov         rax, arg(5)                     ; [Sum]

    movd        DWORD PTR [rdi], xmm6
    movd        DWORD PTR [rax], xmm7

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
;unsigned int vpx_highbd_calc8x8var_sse2
;(
;    unsigned char *  src_ptr,          arg(0)
;    int              source_stride,    arg(1)
;    unsigned char *  ref_ptr,          arg(2)
;    int              recon_stride,     arg(3)
;    unsigned int *   SSE,              arg(4)
;    int *            Sum               arg(5)
;)
;
; Walks 8 rows of 8 samples in both buffers and stores
;     *SSE = sum of (src - ref)^2      (32-bit store)
;     *Sum = sum of (src - ref)        (32-bit store)
;
; NOTE: the prototype spells the pointers as unsigned char *, but the
; code reads 16-bit words (psubw/pmaddwd on 16 bytes per row) and
; doubles both strides to obtain byte offsets, so the strides are
; counted in 16-bit elements.
;
; Arithmetic limits:
;   - psubw/paddw wrap at 16 bits. Each word lane of xmm5 collects four
;     differences per loop iteration before being widened to 32 bits.
;     NOTE(review): presumably callers pass samples narrow enough
;     (e.g. <= 12 bits) for this never to wrap -- confirm.
;   - The dword accumulators (paddd) also wrap silently.
;
; Return value: the routine never writes a result to rax. After the
; final stores rax still holds the Sum pointer unless one of the
; epilogue macros (defined in the included ABI file, not visible here)
; changes it. Callers should presumably use *SSE and *Sum only.
;
; Register roles:
;   rsi / rdi   current src / ref row pointer
;   rax / rdx   src / ref stride in bytes
;   rbx         scratch address for prefetching
;   rcx         rows remaining
;   xmm0        constant zero (used by the final reduction)
;   xmm5        per-iteration word sums of differences
;   xmm6        four dword partial sums of squared differences
;   xmm7        four dword partial sums of differences
;-----------------------------------------------------------------------
global sym(vpx_highbd_calc8x8var_sse2) PRIVATE
sym(vpx_highbd_calc8x8var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7                                  ; xmm6/xmm7 are used below
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                     ; [src_ptr]
    mov         rdi, arg(2)                     ; [ref_ptr]

    movsxd      rax, DWORD PTR arg(1)           ; [source_stride]
    movsxd      rdx, DWORD PTR arg(3)           ; [recon_stride]
    add         rax, rax                        ; source stride in bytes
    add         rdx, rdx                        ; recon stride in bytes

    ; Prefetch the first four rows of each buffer.
    prefetcht0  [rsi]
    prefetcht0  [rsi+rax]
    lea         rbx, [rsi+rax*2]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rax]

    prefetcht0  [rdi]
    prefetcht0  [rdi+rdx]
    lea         rbx, [rdi+rdx*2]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rdx]

    pxor        xmm0, xmm0                      ; constant zero
    pxor        xmm7, xmm7                      ; sum of differences
    pxor        xmm6, xmm6                      ; sum of squared differences
    mov         rcx, 8                          ; rows to process

    ; Each iteration consumes four rows (4 x 8 samples) from each buffer.
.next_four_rows:
    movdqu      xmm1, XMMWORD PTR [rsi]         ; src row 0
    movdqu      xmm2, XMMWORD PTR [rdi]         ; ref row 0

    ; Prefetch the four rows the next iteration will read
    ; (rows 4..7 relative to the current row pointers).
    lea         rbx, [rsi+rax*4]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rax]
    lea         rbx, [rbx+rax*2]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rax]
    lea         rbx, [rdi+rdx*4]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rdx]
    lea         rbx, [rbx+rdx*2]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rdx]

    pxor        xmm5, xmm5                      ; restart the word sums

    ; row 0
    psubw       xmm1, xmm2                      ; diff = src - ref
    movdqu      xmm3, XMMWORD PTR [rsi+rax]
    paddw       xmm5, xmm1
    pmaddwd     xmm1, xmm1                      ; pairwise diff^2 -> dwords
    movdqu      xmm2, XMMWORD PTR [rdi+rdx]
    paddd       xmm6, xmm1

    ; Step the row pointers to rows 2/3 while row 1 is still in flight.
    lea         rsi, [rsi + 2*rax]
    lea         rdi, [rdi + 2*rdx]

    ; row 1
    psubw       xmm3, xmm2
    movdqu      xmm1, XMMWORD PTR [rsi]
    paddw       xmm5, xmm3
    pmaddwd     xmm3, xmm3
    movdqu      xmm2, XMMWORD PTR [rdi]
    paddd       xmm6, xmm3

    ; row 2
    psubw       xmm1, xmm2
    movdqu      xmm3, XMMWORD PTR [rsi+rax]
    paddw       xmm5, xmm1
    pmaddwd     xmm1, xmm1
    movdqu      xmm2, XMMWORD PTR [rdi+rdx]
    paddd       xmm6, xmm1

    ; row 3
    psubw       xmm3, xmm2
    paddw       xmm5, xmm3
    pmaddwd     xmm3, xmm3
    paddd       xmm6, xmm3

    ; Sign-extend the eight word sums to dwords and fold them into xmm7.
    ; An arithmetic shift by 15 yields 0xFFFF for negative lanes and 0
    ; otherwise, i.e. exactly the high half needed by the unpacks.
    movdqa      xmm1, xmm5
    psraw       xmm1, 15                        ; per-lane sign mask
    movdqa      xmm2, xmm5
    punpcklwd   xmm5, xmm1                      ; lanes 0-3 as signed dwords
    punpckhwd   xmm2, xmm1                      ; lanes 4-7 as signed dwords
    paddd       xmm7, xmm5
    paddd       xmm7, xmm2

    lea         rsi, [rsi + 2*rax]              ; advance past rows 2/3
    lea         rdi, [rdi + 2*rdx]
    sub         rcx, 4
    jnz         .next_four_rows

    ; Horizontal reduction: fold the four dwords of each accumulator
    ; into its low dword.
    movdqa      xmm4, xmm6
    punpckldq   xmm6, xmm0                      ; [s0, 0, s1, 0]
    punpckhdq   xmm4, xmm0                      ; [s2, 0, s3, 0]
    paddd       xmm6, xmm4                      ; [s0+s2, 0, s1+s3, 0]
    movdqa      xmm4, xmm6
    psrldq      xmm4, 8
    paddd       xmm6, xmm4                      ; low dword = total SSE

    movdqa      xmm5, xmm7
    punpckldq   xmm7, xmm0
    punpckhdq   xmm5, xmm0
    paddd       xmm7, xmm5
    movdqa      xmm5, xmm7
    psrldq      xmm5, 8
    paddd       xmm7, xmm5                      ; low dword = total sum

    mov         rdi, arg(4)                     ; [SSE]
    mov         rax, arg(5)                     ; [Sum]

    movd        DWORD PTR [rdi], xmm6
    movd        DWORD PTR [rax], xmm7

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret