;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;


; The routines in this file use sym(), arg(), SHADOW_ARGS_TO_STACK, SAVE_XMM,
; RESTORE_XMM, UNSHADOW_ARGS, PRIVATE and the *PTR size keywords; presumably
; all of them are supplied by this include (it is not visible from here).
%include "aom_ports/x86_abi_support.asm"

SECTION .text

;unsigned int aom_highbd_calc16x16var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;
; Accumulates, over 16 rows of 16 samples each, the differences
; (src - ref) and their squares, then stores
;     *SSE = sum of squared differences (low 32 bits)
;     *Sum = sum of differences         (signed 32 bits)
;
; Notes:
;   * Although the prototype above spells the pixel pointers as
;     unsigned char *, every sample is handled as a 16-bit word: both
;     strides are doubled to turn them into byte offsets and all the
;     per-sample arithmetic is done with word instructions.
;   * Differences are formed with psubw and four of them are summed per
;     16-bit lane before widening, so the inputs are assumed to be small
;     enough (e.g. <= 12-bit samples) for that not to wrap.
;     NOTE(review): confirm against the callers.
;   * rax is not loaded with a result; on exit it still holds the Sum
;     pointer unless the epilog macros change it. Callers should rely on
;     *SSE and *Sum only.
;
; Register roles:
;   rsi / rdi   current src / ref row pointer
;   rax / rdx   src / ref stride in bytes
;   rbx         scratch address for prefetching
;   rcx         rows remaining
;   xmm0        constant zero
;   xmm5        per-iteration word sums of the differences
;   xmm6        dword accumulators for the squared differences
;   xmm7        dword accumulators for the differences
global sym(aom_highbd_calc16x16var_sse2) PRIVATE
sym(aom_highbd_calc16x16var_sse2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)                  ;[src_ptr]
    mov         rdi,        arg(2)                  ;[ref_ptr]

    movsxd      rax,        DWORD PTR arg(1)        ;[source_stride]
    movsxd      rdx,        DWORD PTR arg(3)        ;[recon_stride]
    add         rax,        rax                     ; source stride in bytes
    add         rdx,        rdx                     ; recon stride in bytes

    ; Prefetch the first four rows of both blocks (32 bytes per row).
    prefetcht0  [rsi]
    prefetcht0  [rsi+16]
    prefetcht0  [rsi+rax]
    prefetcht0  [rsi+rax+16]
    lea         rbx,        [rsi+rax*2]
    prefetcht0  [rbx]
    prefetcht0  [rbx+16]
    prefetcht0  [rbx+rax]
    prefetcht0  [rbx+rax+16]

    prefetcht0  [rdi]
    prefetcht0  [rdi+16]
    prefetcht0  [rdi+rdx]
    prefetcht0  [rdi+rdx+16]
    lea         rbx,        [rdi+rdx*2]
    prefetcht0  [rbx]
    prefetcht0  [rbx+16]
    prefetcht0  [rbx+rdx]
    prefetcht0  [rbx+rdx+16]

    pxor        xmm0,       xmm0                    ; constant zero for the unpacks
    pxor        xmm7,       xmm7                    ; accumulated differences
    pxor        xmm6,       xmm6                    ; accumulated squared differences
    mov         rcx,        16                      ; rows left to process

    ; Each pass consumes two rows (four 8-word vectors per block).
.var16loop:
    movdqu      xmm1,       XMMWORD PTR [rsi]
    movdqu      xmm2,       XMMWORD PTR [rdi]

    ; Prefetch the two rows that the next pass will read. These are only
    ; hints, so touching addresses past the block on the last pass is
    ; harmless.
    lea         rbx,        [rsi+rax*2]
    prefetcht0  [rbx]
    prefetcht0  [rbx+16]
    prefetcht0  [rbx+rax]
    prefetcht0  [rbx+rax+16]
    lea         rbx,        [rdi+rdx*2]
    prefetcht0  [rbx]
    prefetcht0  [rbx+16]
    prefetcht0  [rbx+rdx]
    prefetcht0  [rbx+rdx+16]

    pxor        xmm5,       xmm5                    ; word sums for this pass

    ; row 0, samples 0..7
    psubw       xmm1,       xmm2                    ; src - ref
    movdqu      xmm3,       XMMWORD PTR [rsi+16]
    paddw       xmm5,       xmm1
    pmaddwd     xmm1,       xmm1                    ; squares, pairwise summed to dwords
    movdqu      xmm2,       XMMWORD PTR [rdi+16]
    paddd       xmm6,       xmm1

    ; row 0, samples 8..15
    psubw       xmm3,       xmm2
    movdqu      xmm1,       XMMWORD PTR [rsi+rax]
    paddw       xmm5,       xmm3
    pmaddwd     xmm3,       xmm3
    movdqu      xmm2,       XMMWORD PTR [rdi+rdx]
    paddd       xmm6,       xmm3

    ; row 1, samples 0..7
    psubw       xmm1,       xmm2
    movdqu      xmm3,       XMMWORD PTR [rsi+rax+16]
    paddw       xmm5,       xmm1
    pmaddwd     xmm1,       xmm1
    movdqu      xmm2,       XMMWORD PTR [rdi+rdx+16]
    paddd       xmm6,       xmm1

    ; row 1, samples 8..15
    psubw       xmm3,       xmm2
    paddw       xmm5,       xmm3
    pmaddwd     xmm3,       xmm3
    paddd       xmm6,       xmm3

    ; Sign-extend the eight word sums to dwords and add them to xmm7.
    ; psraw by 15 yields 0xFFFF for negative words and 0 otherwise, which
    ; is exactly the upper half needed when interleaving.
    movdqa      xmm1,       xmm5
    psraw       xmm1,       15                      ; per-word sign mask
    movdqa      xmm2,       xmm5
    punpcklwd   xmm5,       xmm1                    ; low four sums  -> dwords
    punpckhwd   xmm2,       xmm1                    ; high four sums -> dwords
    paddd       xmm7,       xmm5
    paddd       xmm7,       xmm2

    lea         rsi,        [rsi + 2*rax]           ; advance two rows
    lea         rdi,        [rdi + 2*rdx]
    sub         rcx,        2
    jnz         .var16loop

    ; Horizontal reduction: fold the four dword lanes of xmm6 and xmm7 so
    ; that the low dword of each holds the total.
    movdqa      xmm4,       xmm6
    punpckldq   xmm6,       xmm0                    ; [a0, 0, a1, 0]

    punpckhdq   xmm4,       xmm0                    ; [a2, 0, a3, 0]
    movdqa      xmm5,       xmm7

    paddd       xmm6,       xmm4                    ; [a0+a2, 0, a1+a3, 0]
    punpckldq   xmm7,       xmm0

    punpckhdq   xmm5,       xmm0
    paddd       xmm7,       xmm5

    movdqa      xmm4,       xmm6
    movdqa      xmm5,       xmm7

    psrldq      xmm4,       8                       ; bring the upper pair down
    psrldq      xmm5,       8

    paddd       xmm6,       xmm4                    ; low dword = total SSE
    paddd       xmm7,       xmm5                    ; low dword = total Sum

    mov         rdi,        arg(4)                  ;[SSE]
    mov         rax,        arg(5)                  ;[Sum]

    movd        DWORD PTR [rdi], xmm6
    movd        DWORD PTR [rax], xmm7

    ; begin epilog (restore in reverse order of the prolog)
    pop         rdi
    pop         rsi
    pop         rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;unsigned int aom_highbd_calc8x8var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;
; Accumulates, over 8 rows of 8 samples each, the differences
; (src - ref) and their squares, then stores
;     *SSE = sum of squared differences (low 32 bits)
;     *Sum = sum of differences         (signed 32 bits)
;
; Notes:
;   * Although the prototype above spells the pixel pointers as
;     unsigned char *, every sample is handled as a 16-bit word: both
;     strides are doubled to turn them into byte offsets and all the
;     per-sample arithmetic is done with word instructions.
;   * Differences are formed with psubw and four of them are summed per
;     16-bit lane before widening, so the inputs are assumed to be small
;     enough (e.g. <= 12-bit samples) for that not to wrap.
;     NOTE(review): confirm against the callers.
;   * rax is not loaded with a result; on exit it still holds the Sum
;     pointer unless the epilog macros change it. Callers should rely on
;     *SSE and *Sum only.
;
; Register roles:
;   rsi / rdi   current src / ref row pointer
;   rax / rdx   src / ref stride in bytes
;   rbx         scratch address for prefetching
;   rcx         rows remaining
;   xmm0        constant zero
;   xmm5        per-iteration word sums of the differences
;   xmm6        dword accumulators for the squared differences
;   xmm7        dword accumulators for the differences
global sym(aom_highbd_calc8x8var_sse2) PRIVATE
sym(aom_highbd_calc8x8var_sse2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)                  ;[src_ptr]
    mov         rdi,        arg(2)                  ;[ref_ptr]

    movsxd      rax,        DWORD PTR arg(1)        ;[source_stride]
    movsxd      rdx,        DWORD PTR arg(3)        ;[recon_stride]
    add         rax,        rax                     ; source stride in bytes
    add         rdx,        rdx                     ; recon stride in bytes

    ; Prefetch the first four rows of both blocks (16 bytes per row).
    prefetcht0  [rsi]
    prefetcht0  [rsi+rax]
    lea         rbx,        [rsi+rax*2]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rax]

    prefetcht0  [rdi]
    prefetcht0  [rdi+rdx]
    lea         rbx,        [rdi+rdx*2]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rdx]

    pxor        xmm0,       xmm0                    ; constant zero for the unpacks
    pxor        xmm7,       xmm7                    ; accumulated differences
    pxor        xmm6,       xmm6                    ; accumulated squared differences
    mov         rcx,        8                       ; rows left to process

    ; Each pass consumes four rows (one 8-word vector per row and block).
.var8loop:
    movdqu      xmm1,       XMMWORD PTR [rsi]
    movdqu      xmm2,       XMMWORD PTR [rdi]

    ; Prefetch the four rows that the next pass will read (rows +4..+7
    ; from the current pointers). These are only hints, so touching
    ; addresses past the block on the last pass is harmless.
    lea         rbx,        [rsi+rax*4]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rax]
    lea         rbx,        [rbx+rax*2]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rax]
    lea         rbx,        [rdi+rdx*4]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rdx]
    lea         rbx,        [rbx+rdx*2]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rdx]

    pxor        xmm5,       xmm5                    ; word sums for this pass

    ; row 0
    psubw       xmm1,       xmm2                    ; src - ref
    movdqu      xmm3,       XMMWORD PTR [rsi+rax]
    paddw       xmm5,       xmm1
    pmaddwd     xmm1,       xmm1                    ; squares, pairwise summed to dwords
    movdqu      xmm2,       XMMWORD PTR [rdi+rdx]
    paddd       xmm6,       xmm1

    lea         rsi,        [rsi + 2*rax]           ; step to rows 2 and 3
    lea         rdi,        [rdi + 2*rdx]

    ; row 1 (loaded before the pointers moved)
    psubw       xmm3,       xmm2
    movdqu      xmm1,       XMMWORD PTR [rsi]
    paddw       xmm5,       xmm3
    pmaddwd     xmm3,       xmm3
    movdqu      xmm2,       XMMWORD PTR [rdi]
    paddd       xmm6,       xmm3

    ; row 2
    psubw       xmm1,       xmm2
    movdqu      xmm3,       XMMWORD PTR [rsi+rax]
    paddw       xmm5,       xmm1
    pmaddwd     xmm1,       xmm1
    movdqu      xmm2,       XMMWORD PTR [rdi+rdx]
    paddd       xmm6,       xmm1

    ; row 3
    psubw       xmm3,       xmm2
    paddw       xmm5,       xmm3
    pmaddwd     xmm3,       xmm3
    paddd       xmm6,       xmm3

    ; Sign-extend the eight word sums to dwords and add them to xmm7.
    ; psraw by 15 yields 0xFFFF for negative words and 0 otherwise, which
    ; is exactly the upper half needed when interleaving.
    movdqa      xmm1,       xmm5
    psraw       xmm1,       15                      ; per-word sign mask
    movdqa      xmm2,       xmm5
    punpcklwd   xmm5,       xmm1                    ; low four sums  -> dwords
    punpckhwd   xmm2,       xmm1                    ; high four sums -> dwords
    paddd       xmm7,       xmm5
    paddd       xmm7,       xmm2

    lea         rsi,        [rsi + 2*rax]           ; step past rows 2 and 3
    lea         rdi,        [rdi + 2*rdx]
    sub         rcx,        4
    jnz         .var8loop

    ; Horizontal reduction: fold the four dword lanes of xmm6 and xmm7 so
    ; that the low dword of each holds the total.
    movdqa      xmm4,       xmm6
    punpckldq   xmm6,       xmm0                    ; [a0, 0, a1, 0]

    punpckhdq   xmm4,       xmm0                    ; [a2, 0, a3, 0]
    movdqa      xmm5,       xmm7

    paddd       xmm6,       xmm4                    ; [a0+a2, 0, a1+a3, 0]
    punpckldq   xmm7,       xmm0

    punpckhdq   xmm5,       xmm0
    paddd       xmm7,       xmm5

    movdqa      xmm4,       xmm6
    movdqa      xmm5,       xmm7

    psrldq      xmm4,       8                       ; bring the upper pair down
    psrldq      xmm5,       8

    paddd       xmm6,       xmm4                    ; low dword = total SSE
    paddd       xmm7,       xmm5                    ; low dword = total Sum

    mov         rdi,        arg(4)                  ;[SSE]
    mov         rax,        arg(5)                  ;[Sum]

    movd        DWORD PTR [rdi], xmm6
    movd        DWORD PTR [rax], xmm7

    ; begin epilog (restore in reverse order of the prolog)
    pop         rdi
    pop         rsi
    pop         rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret