blob: 0d90cfa86ea7d6dea565fc4874ee1aadeea9aaab [file] [log] [blame]
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
;void int vp8_makemask_sse3(
;    unsigned char *y,
;    unsigned char *u,
;    unsigned char *v,
;    unsigned char *ym,
;    unsigned char *uvm,
;    int yp,
;    int uvp,
;    int ys,
;    int us,
;    int vs,
;    int yt,
;    int ut,
;    int vt)
;
; Builds a 16x16 byte mask at ym (16 bytes per row, rows contiguous).
; A mask byte is set when the y sample is within the y tolerance of the
; y center AND the co-located u and v samples are within their tolerances
; of the u and v centers.  Each loop iteration handles two y rows
; (16 bytes each) that share one u row and one v row (8 bytes each).
;
; ym is written with movdqa and must therefore be 16-byte aligned.
; pshufb is an SSSE3 instruction despite the _sse3 suffix.
;
; NOTE(review): the prototype above lists uvm as the 5th parameter, but the
; code reads arg(4) as yp, arg(5) as uvp, arg(6..8) as the y/u/v centers and
; arg(9..11) as the y/u/v tolerances, and never touches uvm.  Confirm the
; real C prototype before relying on this comment.
; NOTE(review): pcmpgtb is a signed byte compare, so an absolute difference
; of 128 or more compares as negative and is treated as "within tolerance".
; NOTE(review): xmm6-xmm11 are used and not preserved; they are callee-saved
; in the Microsoft x64 ABI.
global sym(vp8_makemask_sse3)
sym(vp8_makemask_sse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 14
    push        rsi
    push        rdi
    push        rbx                             ; callee-saved; used for the y pitch
    ; end prolog

    mov         rsi,        arg(0)              ;y
    mov         rdi,        arg(1)              ;u
    mov         rcx,        arg(2)              ;v
    mov         rax,        arg(3)              ;ym
    movsxd      rbx,        dword arg(4)        ;yp
    movsxd      rdx,        dword arg(5)        ;uvp

    ; an all-zero pshufb control copies byte 0 into all 16 lanes
    pxor        xmm0,       xmm0

    ;make 16 copies of the center y value
    movd        xmm1,       arg(6)
    pshufb      xmm1,       xmm0

    ; make 16 copies of the center u value
    movd        xmm2,       arg(7)
    pshufb      xmm2,       xmm0

    ; make 16 copies of the center v value
    movd        xmm3,       arg(8)
    pshufb      xmm3,       xmm0
    unpcklpd    xmm2,       xmm3                ; xmm2 = 8 x u center | 8 x v center

    ;make 16 copies of the y tolerance
    movd        xmm3,       arg(9)
    pshufb      xmm3,       xmm0

    ;make 16 copies of the u tolerance
    movd        xmm4,       arg(10)
    pshufb      xmm4,       xmm0

    ;make 16 copies of the v tolerance
    movd        xmm5,       arg(11)
    pshufb      xmm5,       xmm0
    unpckhpd    xmm4,       xmm5                ; xmm4 = 8 x u tol | 8 x v tol

    mov         r8,         8                   ; 8 pairs of y rows

.next_pair_of_rows:

    ;grab the y source values (row 0 of the pair)
    movdqu      xmm0,       [rsi]

    ;compute abs difference between source and y target
    ; (two saturating subtracts OR-ed together)
    movdqa      xmm6,       xmm1
    movdqa      xmm7,       xmm0
    psubusb     xmm0,       xmm1
    psubusb     xmm6,       xmm7
    por         xmm0,       xmm6

    ;check whether the abs difference is < y tolerance (xmm6 = row 0 y mask)
    movdqa      xmm6,       xmm3
    pcmpgtb     xmm6,       xmm0

    ;grab the y source values (row 1 of the pair)
    add         rsi,        rbx
    movdqu      xmm0,       [rsi]

    ;compute abs difference between source and y target
    movdqa      xmm11,      xmm1
    movdqa      xmm7,       xmm0
    psubusb     xmm0,       xmm1
    psubusb     xmm11,      xmm7
    por         xmm0,       xmm11

    ;check whether the abs difference is < y tolerance (xmm11 = row 1 y mask)
    movdqa      xmm11,      xmm3
    pcmpgtb     xmm11,      xmm0


    ;grab the u and v source values; only 8 bytes of each row are used,
    ; so load exactly 8 bytes instead of reading past the row
    movq        xmm7,       [rdi]
    movq        xmm8,       [rcx]
    unpcklpd    xmm7,       xmm8                ; xmm7 = 8 x u | 8 x v

    ;compute abs difference between source and uv targets
    movdqa      xmm9,       xmm2
    movdqa      xmm10,      xmm7
    psubusb     xmm7,       xmm2
    psubusb     xmm9,       xmm10
    por         xmm7,       xmm9

    ;check whether the number is < tolerance
    movdqa      xmm0,       xmm4
    pcmpgtb     xmm0,       xmm7

    ;double u and v masks: each of the 8 mask bytes is repeated twice so
    ; it lines up with two horizontally adjacent y samples
    movdqa      xmm8,       xmm0
    punpckhbw   xmm0,       xmm0                ; xmm0 = doubled v mask
    punpcklbw   xmm8,       xmm8                ; xmm8 = doubled u mask

    ;mask row 0 and output
    pand        xmm6,       xmm8
    pand        xmm6,       xmm0
    movdqa      [rax],      xmm6

    ;mask row 1 and output
    pand        xmm11,      xmm8
    pand        xmm11,      xmm0
    movdqa      [rax+16],   xmm11


    ; to the next row or set of rows
    add         rsi,        rbx
    add         rdi,        rdx
    add         rcx,        rdx
    add         rax,        32
    dec         r8
    jnz         .next_pair_of_rows


    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
153
;GROW_HORIZ (register for result, source register or mem local)
; takes source and shifts left and ors with source
; then shifts right and ors with source
;
; Net effect: every byte of %1 becomes the OR of the source byte and its two
; horizontal neighbours (zeros are shifted in at both ends of the register).
;
; %1 - destination XMM register (must not be xmm4 or xmm5)
; %2 - source XMM register or 16-byte aligned memory operand (movdqa is used)
; Clobbers xmm4 and xmm5.  These are volatile in both the System V and the
; Microsoft x64 calling conventions, so callers need no save/restore for them.
%macro GROW_HORIZ 2
    movdqa      %1,         %2
    movdqa      xmm4,       %1
    movdqa      xmm5,       %1
    pslldq      xmm4,       1
    psrldq      xmm5,       1
    por         %1,         xmm4
    por         %1,         xmm5
%endmacro
;GROW_VERT (result, center row, above row, below row)
; Writes into %1 the bytewise OR of the three row registers %2, %3 and %4.
%macro GROW_VERT 4
    movdqa      %1,         %2                  ; start from the center row
    por         %1,         %4                  ; merge the row below
    por         %1,         %3                  ; merge the row above
%endmacro
172
;GROW_NEXTLINE (new line to grow, new source, line to write)
; %1 - register that receives the horizontally grown copy of %2
; %2 - next source row (register or memory)
; %3 - destination for the finished output row
;
; The vertical step is hard-wired to xmm0, xmm1 and xmm2, so %1 is expected
; to be one of those three registers; xmm3 is overwritten with the result
; before it is stored to %3 with movdqa.
%macro GROW_NEXTLINE 3
    GROW_HORIZ  %1,         %2
    GROW_VERT   xmm3,       xmm0,       xmm1,       xmm2
    movdqa      %3,         xmm3
%endmacro
179
180
;void int vp8_growmaskmb_sse3(
;    unsigned char *om,
;    unsigned char *nm)
;
; Grows a 16x16 byte mask by one position in every direction: each output
; byte is the OR of the 3x3 neighbourhood of the matching input byte, with
; positions outside the 16x16 block contributing nothing.
;
; om (arg 0) is read and nm (arg 1) is written as 16 rows of 16 bytes, all
; through movdqa, so both must be 16-byte aligned.
;
; xmm0, xmm1 and xmm2 act as a rotating window over the three most recent
; horizontally grown rows; xmm3 holds each finished output row.  The
; GROW_HORIZ macro additionally clobbers its own scratch registers.
global sym(vp8_growmaskmb_sse3)
sym(vp8_growmaskmb_sse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push        rsi
    push        rdi
    ; end prolog

    mov         rdi,        arg(1)              ;rst
    mov         rsi,        arg(0)              ;src

    ; prime the window with input rows 0..2
    GROW_HORIZ  xmm0,       [rsi]
    GROW_HORIZ  xmm1,       [rsi+16]
    GROW_HORIZ  xmm2,       [rsi+32]

    ; output row 1 has a full three-row neighbourhood
    GROW_VERT   xmm3,       xmm0,       xmm1,       xmm2
    movdqa      [rdi+16],   xmm3

    ; output row 0 has no row above it: rows 0 and 1 only
    por         xmm0,       xmm1
    movdqa      [rdi],      xmm0

    ; each step loads input row r into the oldest window slot and emits
    ; output row r-1
    GROW_NEXTLINE xmm0,     [rsi+48],   [rdi+32]    ; in  3 -> out  2
    GROW_NEXTLINE xmm1,     [rsi+64],   [rdi+48]    ; in  4 -> out  3
    GROW_NEXTLINE xmm2,     [rsi+80],   [rdi+64]    ; in  5 -> out  4
    GROW_NEXTLINE xmm0,     [rsi+96],   [rdi+80]    ; in  6 -> out  5
    GROW_NEXTLINE xmm1,     [rsi+112],  [rdi+96]    ; in  7 -> out  6
    GROW_NEXTLINE xmm2,     [rsi+128],  [rdi+112]   ; in  8 -> out  7
    GROW_NEXTLINE xmm0,     [rsi+144],  [rdi+128]   ; in  9 -> out  8
    GROW_NEXTLINE xmm1,     [rsi+160],  [rdi+144]   ; in 10 -> out  9
    GROW_NEXTLINE xmm2,     [rsi+176],  [rdi+160]   ; in 11 -> out 10
    GROW_NEXTLINE xmm0,     [rsi+192],  [rdi+176]   ; in 12 -> out 11
    GROW_NEXTLINE xmm1,     [rsi+208],  [rdi+192]   ; in 13 -> out 12
    GROW_NEXTLINE xmm2,     [rsi+224],  [rdi+208]   ; in 14 -> out 13
    GROW_NEXTLINE xmm0,     [rsi+240],  [rdi+224]   ; in 15 -> out 14

    ; output row 15 has no row below it: rows 15 (xmm0) and 14 (xmm2) only
    por         xmm0,       xmm2
    movdqa      [rdi+240],  xmm0

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
228
229
230
;unsigned int vp8_sad16x16_masked_wmt(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned char *mask)
;
; Sum of absolute differences over a 16x16 block, computed after AND-ing
; both the source and the reference bytes with the mask.  The mask is read
; as 16 contiguous rows of 16 bytes.  The total is returned in rax.
;
; The running total is kept with 16-bit adds (paddw); the largest possible
; value, 16*16*255 = 65280, still fits in 16 bits.
global sym(vp8_sad16x16_masked_wmt)
sym(vp8_sad16x16_masked_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    push        rbx                             ; callee-saved; used for the mask pointer
    ; end prolog
    mov         rsi,        arg(0)              ;src_ptr
    mov         rdi,        arg(2)              ;ref_ptr

    mov         rbx,        arg(4)              ;mask
    movsxd      rax,        dword ptr arg(1)    ;src_stride
    movsxd      rdx,        dword ptr arg(3)    ;ref_stride

    mov         rcx,        16                  ; row counter

    pxor        xmm3,       xmm3                ; SAD accumulator

.next_sad_row:
    movdqu      xmm0,       [rsi]
    movdqu      xmm1,       [rdi]
    movdqu      xmm2,       [rbx]
    pand        xmm0,       xmm2
    pand        xmm1,       xmm2

    psadbw      xmm0,       xmm1                ; two partial sums, one per 8 bytes
    paddw       xmm3,       xmm0

    add         rsi,        rax
    add         rdi,        rdx
    add         rbx,        16

    dec         rcx
    jnz         .next_sad_row

    ; fold the high partial sum onto the low one and return it
    movdqa      xmm4,       xmm3
    psrldq      xmm4,       8
    paddw       xmm3,       xmm4
    movq        rax,        xmm3
    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
283
284
;unsigned int vp8_sad16x16_unmasked_wmt(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned char *mask)
;
; Sum of absolute differences over a 16x16 block, computed after OR-ing both
; the source and the reference bytes with the mask.  A mask byte of 0xFF
; forces both operands to 0xFF, so that position contributes nothing; a mask
; byte of 0x00 leaves the position untouched.  (Presumably mask bytes are
; only ever 0x00 or 0xFF - confirm against the mask producers.)
; The mask is read as 16 contiguous rows of 16 bytes.  The total is returned
; in rax.
;
; The running total is kept with 16-bit adds (paddw); the largest possible
; value, 16*16*255 = 65280, still fits in 16 bits.
global sym(vp8_sad16x16_unmasked_wmt)
sym(vp8_sad16x16_unmasked_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    push        rbx                             ; callee-saved; used for the mask pointer
    ; end prolog
    mov         rsi,        arg(0)              ;src_ptr
    mov         rdi,        arg(2)              ;ref_ptr

    mov         rbx,        arg(4)              ;mask
    movsxd      rax,        dword ptr arg(1)    ;src_stride
    movsxd      rdx,        dword ptr arg(3)    ;ref_stride

    mov         rcx,        16                  ; row counter

    pxor        xmm3,       xmm3                ; SAD accumulator

.next_sad_row:
    movdqu      xmm0,       [rsi]
    movdqu      xmm1,       [rdi]
    movdqu      xmm2,       [rbx]
    por         xmm0,       xmm2
    por         xmm1,       xmm2

    psadbw      xmm0,       xmm1                ; two partial sums, one per 8 bytes
    paddw       xmm3,       xmm0

    add         rsi,        rax
    add         rdi,        rdx
    add         rbx,        16

    dec         rcx
    jnz         .next_sad_row

    ; fold the high partial sum onto the low one and return it
    movdqa      xmm4,       xmm3
    psrldq      xmm4,       8
    paddw       xmm3,       xmm4
    movq        rax,        xmm3
    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
337
338
;unsigned int vp8_masked_predictor_wmt(
;    unsigned char *masked,
;    unsigned char *unmasked,
;    int  src_stride,
;    unsigned char *dst_ptr,
;    int  dst_stride,
;    unsigned char *mask)
;
; Blends two 16x16 predictors under a byte mask:
;     dst = (masked & mask) | (unmasked & ~mask)
; The mask is read as 16 contiguous rows of 16 bytes.  Both source blocks
; are stepped by src_stride and the destination by dst_stride.
;
; NOTE(review): the prototype says unsigned int, but no return value is
; computed; rax still holds src_stride on exit.
global sym(vp8_masked_predictor_wmt)
sym(vp8_masked_predictor_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    push        rbx                             ; callee-saved; used for the mask pointer
    ; end prolog
    mov         rsi,        arg(0)              ;masked source
    mov         rdi,        arg(1)              ;unmasked source

    mov         rbx,        arg(5)              ;mask
    movsxd      rax,        dword ptr arg(2)    ;src_stride
    mov         r11,        arg(3)              ;destination
    movsxd      rdx,        dword ptr arg(4)    ;dst_stride

    mov         rcx,        16                  ; row counter

.next_row:
    movdqu      xmm0,       [rsi]
    movdqu      xmm1,       [rdi]
    movdqu      xmm2,       [rbx]

    pand        xmm0,       xmm2                ; masked & mask
    pandn       xmm2,       xmm1                ; unmasked & ~mask
    por         xmm0,       xmm2
    movdqu      [r11],      xmm0

    add         r11,        rdx
    add         rsi,        rax
    ; the unmasked source shares src_stride with the masked source, matching
    ; vp8_masked_predictor_uv_wmt (it was previously stepped by dst_stride)
    add         rdi,        rax
    add         rbx,        16

    dec         rcx
    jnz         .next_row

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
390
;unsigned int vp8_masked_predictor_uv_wmt(
;    unsigned char *masked,
;    unsigned char *unmasked,
;    int  src_stride,
;    unsigned char *dst_ptr,
;    int  dst_stride,
;    unsigned char *mask)
;
; Blends two 8x8 predictors under a byte mask:
;     dst = (masked & mask) | (unmasked & ~mask)
; The mask is read as 8 contiguous rows of 8 bytes.  Both source blocks are
; stepped by src_stride and the destination by dst_stride.
;
; NOTE(review): the prototype says unsigned int, but no return value is
; computed; rax still holds src_stride on exit.
global sym(vp8_masked_predictor_uv_wmt)
sym(vp8_masked_predictor_uv_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    push        rbx                             ; callee-saved; used for the mask pointer
    ; end prolog
    mov         rsi,        arg(0)              ;masked source
    mov         rdi,        arg(1)              ;unmasked source

    mov         rbx,        arg(5)              ;mask
    movsxd      rax,        dword ptr arg(2)    ;src_stride
    mov         r11,        arg(3)              ;destination
    movsxd      rdx,        dword ptr arg(4)    ;dst_stride

    mov         rcx,        8                   ; row counter

.next_row:
    movq        xmm0,       [rsi]
    movq        xmm1,       [rdi]
    movq        xmm2,       [rbx]

    pand        xmm0,       xmm2                ; masked & mask
    pandn       xmm2,       xmm1                ; unmasked & ~mask
    por         xmm0,       xmm2
    movq        [r11],      xmm0

    add         r11,        rdx
    add         rsi,        rax
    add         rdi,        rax
    add         rbx,        8

    dec         rcx
    jnz         .next_row

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
442
443
;unsigned int vp8_uv_from_y_mask(
;    unsigned char *ymask,
;    unsigned char *uvmask)
;
; Derives an 8x8 byte mask from a 16x16 byte mask by keeping every other
; byte of every other row: 16 bytes are read from ymask at a 32-byte step
; (rows 0, 2, 4, ...), the even-indexed bytes are gathered with pshufb, and
; the resulting 8 bytes are written contiguously to uvmask.
;
; NOTE(review): the prototype says unsigned int, but no return value is
; computed.
global sym(vp8_uv_from_y_mask)
sym(vp8_uv_from_y_mask):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2                      ; only two arguments are read
    push        rsi
    push        rdi
    ; end prolog
    mov         rsi,        arg(0)              ;src_ptr
    mov         rdi,        arg(1)              ;dst_ptr

    mov         rcx,        8                   ; output row counter

    ; load the shuffle control once; rel keeps the reference RIP-relative so
    ; the code stays position-independent
    movdqa      xmm1,       [rel shuf1b]

.next_row:
    movdqu      xmm0,       [rsi]
    pshufb      xmm0,       xmm1                ; even bytes -> low 8 bytes
    movq        [rdi],      xmm0
    add         rdi,        8
    add         rsi,        32                  ; skip the odd y-mask row

    dec         rcx
    jnz         .next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
479
SECTION_RODATA
align 16
; pshufb control used by vp8_uv_from_y_mask: the low 8 result bytes take
; source bytes 0, 2, 4, ..., 14; the high 8 result bytes all take source
; byte 0 and are not stored by that routine.
shuf1b:
    db 0, 2, 4, 6, 8, 10, 12, 14
    times 8 db 0