;*****************************************************************************
;* x86inc.asm: x264asm abstraction layer
;*****************************************************************************
;* Copyright (C) 2005-2019 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Henrik Gramner <henrik@gramner.com>
;*          Anton Mitrofanov <BugMaster@narod.ru>
;*          Fiona Glaser <fiona@x264.com>
;*
;* Permission to use, copy, modify, and/or distribute this software for any
;* purpose with or without fee is hereby granted, provided that the above
;* copyright notice and this permission notice appear in all copies.
;*
;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;*****************************************************************************

; This is a header file for the x264ASM assembly language, which uses
; NASM/YASM syntax combined with a large number of macros to provide easy
; abstraction between different calling conventions (x86_32, win64, linux64).
; It also has various other useful features to simplify writing the kind of
; DSP functions that are most often used in x264.

; Unlike the rest of x264, this file is available under an ISC license, as it
; has significant usefulness outside of x264 and we want it to be available
; to the largest audience possible. Of course, if you modify it for your own
; purposes to add a new feature, we strongly encourage contributing a patch
; as this feature might be useful for others as well. Send patches or ideas
; to x264-devel@videolan.org .

%include "config/aom_config.asm"

%ifndef private_prefix
    %define private_prefix aom
%endif

%ifndef public_prefix
    %define public_prefix private_prefix
%endif

%ifndef STACK_ALIGNMENT
    %if ARCH_X86_64
        %define STACK_ALIGNMENT 16
    %else
        %define STACK_ALIGNMENT 4
    %endif
%endif

%define WIN64  0
%define UNIX64 0
%if ARCH_X86_64
    %ifidn __OUTPUT_FORMAT__,win32
        %define WIN64  1
    %elifidn __OUTPUT_FORMAT__,win64
        %define WIN64  1
    %elifidn __OUTPUT_FORMAT__,x64
        %define WIN64  1
    %else
        %define UNIX64 1
    %endif
%endif

%define FORMAT_ELF 0
%define FORMAT_MACHO 0
%ifidn __OUTPUT_FORMAT__,elf
    %define FORMAT_ELF 1
%elifidn __OUTPUT_FORMAT__,elf32
    %define FORMAT_ELF 1
%elifidn __OUTPUT_FORMAT__,elf64
    %define FORMAT_ELF 1
%elifidn __OUTPUT_FORMAT__,macho
    %define FORMAT_MACHO 1
%elifidn __OUTPUT_FORMAT__,macho32
    %define FORMAT_MACHO 1
%elifidn __OUTPUT_FORMAT__,macho64
    %define FORMAT_MACHO 1
%endif

; Set PREFIX for libaom builds.
%if FORMAT_ELF
    %undef PREFIX
%elif WIN64
    %undef PREFIX
%else
    %define PREFIX
%endif

%ifdef PREFIX
    %define mangle(x) _ %+ x
%else
    %define mangle(x) x
%endif

; In some instances macho32 tables get misaligned when using .rodata.
; When looking at the disassembly it appears that the offset is either
; correct or consistently off by 90. Placing them in the .text section
; works around the issue. It appears to be specific to the way libaom
; handles the tables.
%macro SECTION_RODATA 0-1 16
    %ifidn __OUTPUT_FORMAT__,win32
        SECTION .rdata align=%1
    %elif WIN64
        SECTION .rdata align=%1
    %elifidn __OUTPUT_FORMAT__,macho32
        SECTION .text align=%1
        fakegot:
    %elifidn __OUTPUT_FORMAT__,aout
        SECTION .text
    %else
        SECTION .rodata align=%1
    %endif
%endmacro

; PIC macros from aom_ports/x86_abi_support.asm.
%ifidn __OUTPUT_FORMAT__,elf32
%define ABI_IS_32BIT 1
%elifidn __OUTPUT_FORMAT__,macho32
%define ABI_IS_32BIT 1
%elifidn __OUTPUT_FORMAT__,win32
%define ABI_IS_32BIT 1
%elifidn __OUTPUT_FORMAT__,aout
%define ABI_IS_32BIT 1
%else
%define ABI_IS_32BIT 0
%endif

%if ABI_IS_32BIT
    %if CONFIG_PIC=1
        %ifidn __OUTPUT_FORMAT__,elf32
            %define GET_GOT_DEFINED 1
            %define WRT_PLT wrt ..plt
            %macro GET_GOT 1
                extern _GLOBAL_OFFSET_TABLE_
                push %1
                call %%get_got
                %%sub_offset:
                jmp %%exitGG
                %%get_got:
                mov %1, [esp]
                add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%sub_offset wrt ..gotpc
                ret
                %%exitGG:
                %undef GLOBAL
                %define GLOBAL(x) x + %1 wrt ..gotoff
                %undef RESTORE_GOT
                %define RESTORE_GOT pop %1
            %endmacro
        %elifidn __OUTPUT_FORMAT__,macho32
            %define GET_GOT_DEFINED 1
            %macro GET_GOT 1
                push %1
                call %%get_got
                %%get_got:
                pop %1
                %undef GLOBAL
                %define GLOBAL(x) x + %1 - %%get_got
                %undef RESTORE_GOT
                %define RESTORE_GOT pop %1
            %endmacro
        %else
            %define GET_GOT_DEFINED 0
        %endif
    %endif

    %if ARCH_X86_64 == 0
        %undef PIC
    %endif

%else
    %macro GET_GOT 1
    %endmacro
    %define GLOBAL(x) rel x
    %define WRT_PLT wrt ..plt

    %if WIN64
        %define PIC
    %elifidn __OUTPUT_FORMAT__,macho64
        %define PIC
    %elif CONFIG_PIC
        %define PIC
    %endif
%endif

%ifnmacro GET_GOT
    %macro GET_GOT 1
    %endmacro
    %define GLOBAL(x) x
%endif
%ifndef RESTORE_GOT
    %define RESTORE_GOT
%endif
%ifndef WRT_PLT
    %define WRT_PLT
%endif

%ifdef PIC
    default rel
%endif

%ifndef GET_GOT_DEFINED
    %define GET_GOT_DEFINED 0
%endif
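
; As a usage sketch, a PIC-safe table load with these macros might look like
; the following ("my_table" being a hypothetical SECTION_RODATA symbol):
;     GET_GOT r1                    ; no-op on 64-bit; loads the GOT base on 32-bit PIC
;     movq m0, [GLOBAL(my_table)]   ; position-independent reference to my_table
;     RESTORE_GOT                   ; pops the register saved by GET_GOT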
; End PIC macros from aom_ports/x86_abi_support.asm.

; libaom explicitly sets visibility in shared object builds. Avoid setting
; visibility to hidden as it may break builds that split sources on, e.g.,
; directory boundaries.
%ifdef CHROMIUM
    %define VISIBILITY hidden
    %define HAVE_PRIVATE_EXTERN 1
%else
    %define VISIBILITY
    %define HAVE_PRIVATE_EXTERN 0
%endif

%ifdef __NASM_VER__
    %use smartalign
    %if __NASM_VERSION_ID__ < 0x020e0000 ; 2.14
        %define HAVE_PRIVATE_EXTERN 0
    %endif
%endif

; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments
; into registers at the start, and make no other use of the stack. Luckily that
; covers most of x264's asm.

; PROLOGUE:
; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used. pushes callee-saved regs if needed.
; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
; %4 = (optional) stack size to be allocated. The stack will be aligned before
;      allocating the specified stack size. If the required stack alignment is
;      larger than the known stack alignment the stack will be manually aligned
;      and an extra register will be allocated to hold the original stack
;      pointer (to not invalidate r0m etc.). To prevent the use of an extra
;      register as stack pointer, request a negative stack size.
; %4+/%5+ = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal

; e.g.
; cglobal foo, 2,3,7,0x40, dst, src, tmp
; declares a function (foo) that automatically loads two arguments (dst and
; src) into registers, uses one additional register (tmp) plus 7 vector
; registers (m0-m6) and allocates 0x40 bytes of stack space.

; TODO Some functions can use some args directly from the stack. If they're the
; last args then you can just not declare them, but if they're in the middle
; we need a more flexible macro.

; RET:
; Pops anything that was pushed by PROLOGUE, and returns.

; REP_RET:
; Use this instead of RET if it's a branch target.

; registers:
; rN and rNq are the native-size register holding function argument N
; rNd, rNw, rNb are dword, word, and byte size
; rNh is the high 8 bits of the word size
; rNm is the original location of arg N (a register or on the stack), dword
; rNmp is native size
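
; Putting it together, a minimal complete function might look like this
; (a sketch; "copy_16" and its argument names are hypothetical):
;     INIT_XMM sse2
;     cglobal copy_16, 2,2,1, dst, src
;         mova m0, [srcq]
;         mova [dstq], m0
;         RET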

%macro DECLARE_REG 2-3
    %define r%1q %2
    %define r%1d %2d
    %define r%1w %2w
    %define r%1b %2b
    %define r%1h %2h
    %define %2q %2
    %if %0 == 2
        %define r%1m  %2d
        %define r%1mp %2
    %elif ARCH_X86_64 ; memory
        %define r%1m [rstk + stack_offset + %3]
        %define r%1mp qword r %+ %1 %+ m
    %else
        %define r%1m [rstk + stack_offset + %3]
        %define r%1mp dword r %+ %1 %+ m
    %endif
    %define r%1  %2
%endmacro

%macro DECLARE_REG_SIZE 3
    %define r%1q r%1
    %define e%1q r%1
    %define r%1d e%1
    %define e%1d e%1
    %define r%1w %1
    %define e%1w %1
    %define r%1h %3
    %define e%1h %3
    %define r%1b %2
    %define e%1b %2
    %if ARCH_X86_64 == 0
        %define r%1 e%1
    %endif
%endmacro

DECLARE_REG_SIZE ax, al, ah
DECLARE_REG_SIZE bx, bl, bh
DECLARE_REG_SIZE cx, cl, ch
DECLARE_REG_SIZE dx, dl, dh
DECLARE_REG_SIZE si, sil, null
DECLARE_REG_SIZE di, dil, null
DECLARE_REG_SIZE bp, bpl, null

; t# defines for when per-arch register allocation is more complex than just function arguments

%macro DECLARE_REG_TMP 1-*
    %assign %%i 0
    %rep %0
        CAT_XDEFINE t, %%i, r%1
        %assign %%i %%i+1
        %rotate 1
    %endrep
%endmacro

%macro DECLARE_REG_TMP_SIZE 0-*
    %rep %0
        %define t%1q t%1 %+ q
        %define t%1d t%1 %+ d
        %define t%1w t%1 %+ w
        %define t%1h t%1 %+ h
        %define t%1b t%1 %+ b
        %rotate 1
    %endrep
%endmacro

DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14

%if ARCH_X86_64
    %define gprsize 8
%else
    %define gprsize 4
%endif

%macro LEA 2
%if ARCH_X86_64
    lea %1, [%2]
%elif PIC
    call $+5 ; special-cased to not affect the RSB on most CPUs
    pop %1
    add %1, (%2)-$+1
%else
    mov %1, %2
%endif
%endmacro

%macro PUSH 1
    push %1
    %ifidn rstk, rsp
        %assign stack_offset stack_offset+gprsize
    %endif
%endmacro

%macro POP 1
    pop %1
    %ifidn rstk, rsp
        %assign stack_offset stack_offset-gprsize
    %endif
%endmacro

%macro PUSH_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            PUSH r%1
        %endif
        %rotate 1
    %endrep
%endmacro

%macro POP_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            pop r%1
        %endif
        %rotate 1
    %endrep
%endmacro

%macro LOAD_IF_USED 1-*
    %rep %0
        %if %1 < num_args
            mov r%1, r %+ %1 %+ mp
        %endif
        %rotate 1
    %endrep
%endmacro

%macro SUB 2
    sub %1, %2
    %ifidn %1, rstk
        %assign stack_offset stack_offset+(%2)
    %endif
%endmacro

%macro ADD 2
    add %1, %2
    %ifidn %1, rstk
        %assign stack_offset stack_offset-(%2)
    %endif
%endmacro

%macro movifnidn 2
    %ifnidn %1, %2
        mov %1, %2
    %endif
%endmacro

%if ARCH_X86_64 == 0
    %define movsxd movifnidn
%endif

%macro movsxdifnidn 2
    %ifnidn %1, %2
        movsxd %1, %2
    %endif
%endmacro

%macro ASSERT 1
    %if (%1) == 0
        %error assertion ``%1'' failed
    %endif
%endmacro

%macro DEFINE_ARGS 0-*
    %ifdef n_arg_names
        %assign %%i 0
        %rep n_arg_names
            CAT_UNDEF arg_name %+ %%i, q
            CAT_UNDEF arg_name %+ %%i, d
            CAT_UNDEF arg_name %+ %%i, w
            CAT_UNDEF arg_name %+ %%i, h
            CAT_UNDEF arg_name %+ %%i, b
            CAT_UNDEF arg_name %+ %%i, m
            CAT_UNDEF arg_name %+ %%i, mp
            CAT_UNDEF arg_name, %%i
            %assign %%i %%i+1
        %endrep
    %endif

    %xdefine %%stack_offset stack_offset
    %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
    %assign %%i 0
    %rep %0
        %xdefine %1q r %+ %%i %+ q
        %xdefine %1d r %+ %%i %+ d
        %xdefine %1w r %+ %%i %+ w
        %xdefine %1h r %+ %%i %+ h
        %xdefine %1b r %+ %%i %+ b
        %xdefine %1m r %+ %%i %+ m
        %xdefine %1mp r %+ %%i %+ mp
        CAT_XDEFINE arg_name, %%i, %1
        %assign %%i %%i+1
        %rotate 1
    %endrep
    %xdefine stack_offset %%stack_offset
    %assign n_arg_names %0
%endmacro

%define required_stack_alignment ((mmsize + 15) & ~15)
%define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512)))
%define high_mm_regs (16*cpuflag(avx512))

%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
    %ifnum %1
        %if %1 != 0
            %assign %%pad 0
            %assign stack_size %1
            %if stack_size < 0
                %assign stack_size -stack_size
            %endif
            %if WIN64
                %assign %%pad %%pad + 32 ; shadow space
                %if mmsize != 8
                    %assign xmm_regs_used %2
                    %if xmm_regs_used > 8
                        %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers
                    %endif
                %endif
            %endif
            %if required_stack_alignment <= STACK_ALIGNMENT
                ; maintain the current stack alignment
                %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
                SUB rsp, stack_size_padded
            %else
                %assign %%reg_num (regs_used - 1)
                %xdefine rstk r %+ %%reg_num
                ; align stack, and save original stack location directly above
                ; it, i.e. in [rsp+stack_size_padded], so we can restore the
                ; stack in a single instruction (i.e. mov rsp, rstk or mov
                ; rsp, [rsp+stack_size_padded])
                %if %1 < 0 ; need to store rsp on stack
                    %xdefine rstkm [rsp + stack_size + %%pad]
                    %assign %%pad %%pad + gprsize
                %else ; can keep rsp in rstk during whole function
                    %xdefine rstkm rstk
                %endif
                %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1))
                mov rstk, rsp
                and rsp, ~(required_stack_alignment-1)
                sub rsp, stack_size_padded
                movifnidn rstkm, rstk
            %endif
            WIN64_PUSH_XMM
        %endif
    %endif
%endmacro

%macro SETUP_STACK_POINTER 1
    %ifnum %1
        %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT
            %if %1 > 0
                ; Reserve an additional register for storing the original stack pointer, but avoid using
                ; eax/rax for this purpose since it can potentially get overwritten as a return value.
                %assign regs_used (regs_used + 1)
                %if ARCH_X86_64 && regs_used == 7
                    %assign regs_used 8
                %elif ARCH_X86_64 == 0 && regs_used == 1
                    %assign regs_used 2
                %endif
            %endif
            %if ARCH_X86_64 && regs_used < 5 + UNIX64 * 3
                ; Ensure that we don't clobber any registers containing arguments. For UNIX64 we also preserve r6 (rax)
                ; since it's used as a hidden argument in vararg functions to specify the number of vector registers used.
                %assign regs_used 5 + UNIX64 * 3
            %endif
        %endif
    %endif
%endmacro

%macro DEFINE_ARGS_INTERNAL 3+
    %ifnum %2
        DEFINE_ARGS %3
    %elif %1 == 4
        DEFINE_ARGS %2
    %elif %1 > 4
        DEFINE_ARGS %2, %3
    %endif
%endmacro

%if WIN64 ; Windows x64 ;=================================================

DECLARE_REG 0,  rcx
DECLARE_REG 1,  rdx
DECLARE_REG 2,  R8
DECLARE_REG 3,  R9
DECLARE_REG 4,  R10, 40
DECLARE_REG 5,  R11, 48
DECLARE_REG 6,  rax, 56
DECLARE_REG 7,  rdi, 64
DECLARE_REG 8,  rsi, 72
DECLARE_REG 9,  rbx, 80
DECLARE_REG 10, rbp, 88
DECLARE_REG 11, R14, 96
DECLARE_REG 12, R15, 104
DECLARE_REG 13, R12, 112
DECLARE_REG 14, R13, 120

%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 15
    PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
    ALLOC_STACK %4, %3
    %if mmsize != 8 && stack_size == 0
        WIN64_SPILL_XMM %3
    %endif
    LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro

%macro WIN64_PUSH_XMM 0
    ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
    %if xmm_regs_used > 6 + high_mm_regs
        movaps [rstk + stack_offset +  8], xmm6
    %endif
    %if xmm_regs_used > 7 + high_mm_regs
        movaps [rstk + stack_offset + 24], xmm7
    %endif
    %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
    %if %%xmm_regs_on_stack > 0
        %assign %%i 8
        %rep %%xmm_regs_on_stack
            movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
            %assign %%i %%i+1
        %endrep
    %endif
%endmacro

%macro WIN64_SPILL_XMM 1
    %assign xmm_regs_used %1
    ASSERT xmm_regs_used <= 16 + high_mm_regs
    %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
    %if %%xmm_regs_on_stack > 0
        ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
        %assign %%pad %%xmm_regs_on_stack*16 + 32
        %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
        SUB rsp, stack_size_padded
    %endif
    WIN64_PUSH_XMM
%endmacro

%macro WIN64_RESTORE_XMM_INTERNAL 0
    %assign %%pad_size 0
    %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
    %if %%xmm_regs_on_stack > 0
        %assign %%i xmm_regs_used - high_mm_regs
        %rep %%xmm_regs_on_stack
            %assign %%i %%i-1
            movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32]
        %endrep
    %endif
    %if stack_size_padded > 0
        %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT
            mov rsp, rstkm
        %else
            add rsp, stack_size_padded
            %assign %%pad_size stack_size_padded
        %endif
    %endif
    %if xmm_regs_used > 7 + high_mm_regs
        movaps xmm7, [rsp + stack_offset - %%pad_size + 24]
    %endif
    %if xmm_regs_used > 6 + high_mm_regs
        movaps xmm6, [rsp + stack_offset - %%pad_size +  8]
    %endif
%endmacro

%macro WIN64_RESTORE_XMM 0
    WIN64_RESTORE_XMM_INTERNAL
    %assign stack_offset (stack_offset-stack_size_padded)
    %assign stack_size_padded 0
    %assign xmm_regs_used 0
%endmacro

%define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs

%macro RET 0
    WIN64_RESTORE_XMM_INTERNAL
    POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
    %if vzeroupper_required
        vzeroupper
    %endif
    AUTO_REP_RET
%endmacro

%elif ARCH_X86_64 ; *nix x64 ;=============================================

DECLARE_REG 0,  rdi
DECLARE_REG 1,  rsi
DECLARE_REG 2,  rdx
DECLARE_REG 3,  rcx
DECLARE_REG 4,  R8
DECLARE_REG 5,  R9
DECLARE_REG 6,  rax, 8
DECLARE_REG 7,  R10, 16
DECLARE_REG 8,  R11, 24
DECLARE_REG 9,  rbx, 32
DECLARE_REG 10, rbp, 40
DECLARE_REG 11, R14, 48
DECLARE_REG 12, R15, 56
DECLARE_REG 13, R12, 64
DECLARE_REG 14, R13, 72

%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    %assign xmm_regs_used %3
    ASSERT regs_used >= num_args
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 15
    PUSH_IF_USED 9, 10, 11, 12, 13, 14
    ALLOC_STACK %4
    LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro

%define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required

%macro RET 0
    %if stack_size_padded > 0
        %if required_stack_alignment > STACK_ALIGNMENT
            mov rsp, rstkm
        %else
            add rsp, stack_size_padded
        %endif
    %endif
    POP_IF_USED 14, 13, 12, 11, 10, 9
    %if vzeroupper_required
        vzeroupper
    %endif
    AUTO_REP_RET
%endmacro

%else ; X86_32 ;==============================================================

DECLARE_REG 0, eax, 4
DECLARE_REG 1, ecx, 8
DECLARE_REG 2, edx, 12
DECLARE_REG 3, ebx, 16
DECLARE_REG 4, esi, 20
DECLARE_REG 5, edi, 24
DECLARE_REG 6, ebp, 28
%define rsp esp

%macro DECLARE_ARG 1-*
    %rep %0
        %define r%1m [rstk + stack_offset + 4*%1 + 4]
        %define r%1mp dword r%1m
        %rotate 1
    %endrep
%endmacro

DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14

%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    %if num_args > 7
        %assign num_args 7
    %endif
    %if regs_used > 7
        %assign regs_used 7
    %endif
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 7
    PUSH_IF_USED 3, 4, 5, 6
    ALLOC_STACK %4
    LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
    DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro

%define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required

%macro RET 0
    %if stack_size_padded > 0
        %if required_stack_alignment > STACK_ALIGNMENT
            mov rsp, rstkm
        %else
            add rsp, stack_size_padded
        %endif
    %endif
    POP_IF_USED 6, 5, 4, 3
    %if vzeroupper_required
        vzeroupper
    %endif
    AUTO_REP_RET
%endmacro

%endif ;======================================================================

%if WIN64 == 0
    %macro WIN64_SPILL_XMM 1
    %endmacro
    %macro WIN64_RESTORE_XMM 0
    %endmacro
    %macro WIN64_PUSH_XMM 0
    %endmacro
%endif

; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
; a branch or a branch target. So switch to a 2-byte form of ret in that case.
; We can automatically detect "follows a branch", but not a branch target.
; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
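;
; For example, REP_RET is needed when a branch can jump directly to the return
; point (a sketch; ".skip" is a hypothetical label):
;     test r2d, r2d
;     jz .skip
;     ...
; .skip:
;     REP_RET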
%macro REP_RET 0
    %if has_epilogue || cpuflag(ssse3)
        RET
    %else
        rep ret
    %endif
    annotate_function_size
%endmacro

%define last_branch_adr $$
%macro AUTO_REP_RET 0
    %if notcpuflag(ssse3)
        times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr.
    %endif
    ret
    annotate_function_size
%endmacro

%macro BRANCH_INSTR 0-*
    %rep %0
        %macro %1 1-2 %1
            %2 %1
            %if notcpuflag(ssse3)
                %%branch_instr equ $
                %xdefine last_branch_adr %%branch_instr
            %endif
        %endmacro
        %rotate 1
    %endrep
%endmacro

BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp

%macro TAIL_CALL 1-2 1 ; callee, is_nonadjacent
    %if has_epilogue
        call %1
        RET
    %elif %2
        jmp %1
    %endif
    annotate_function_size
%endmacro

;=============================================================================
; arch-independent part
;=============================================================================

%assign function_align 16

; Begin a function.
; Applies any symbol mangling needed for C linkage, and sets up a define such that
; subsequent uses of the function name automatically refer to the mangled version.
; Appends cpuflags to the function name if cpuflags has been specified.
; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX
; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2).
%macro cglobal 1-2+ "" ; name, [PROLOGUE args]
    cglobal_internal 1, %1 %+ SUFFIX, %2
%endmacro
%macro cvisible 1-2+ "" ; name, [PROLOGUE args]
    cglobal_internal 0, %1 %+ SUFFIX, %2
%endmacro
%macro cglobal_internal 2-3+
    annotate_function_size
    %ifndef cglobaled_%2
        %if %1
            %xdefine %2 mangle(private_prefix %+ _ %+ %2)
        %else
            %xdefine %2 mangle(public_prefix %+ _ %+ %2)
        %endif
        %xdefine %2.skip_prologue %2 %+ .skip_prologue
        CAT_XDEFINE cglobaled_, %2, 1
    %endif
    %xdefine current_function %2
    %xdefine current_function_section __SECT__
    %if FORMAT_ELF
        %if %1
            global %2:function VISIBILITY
        %else
            global %2:function
        %endif
    %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN && %1
        global %2:private_extern
    %else
        global %2
    %endif
    align function_align
    %2:
    RESET_MM_PERMUTATION        ; needed for x86-64, also makes disassembly somewhat nicer
    %xdefine rstk rsp           ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required
    %assign stack_offset 0      ; stack pointer offset relative to the return address
    %assign stack_size 0        ; amount of stack space that can be freely used inside a function
    %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
    %assign xmm_regs_used 0     ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper
    %ifnidn %3, ""
        PROLOGUE %3
    %endif
%endmacro

; Create a global symbol from a local label with the correct name mangling and type
%macro cglobal_label 1
    %if FORMAT_ELF
        global current_function %+ %1:function VISIBILITY
    %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN
        global current_function %+ %1:private_extern
    %else
        global current_function %+ %1
    %endif
    %1:
%endmacro

%macro cextern 1
    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

; like cextern, but without the prefix
%macro cextern_naked 1
    %ifdef PREFIX
        %xdefine %1 mangle(%1)
    %endif
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

%macro const 1-2+
    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
    %if FORMAT_ELF
        global %1:data VISIBILITY
    %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN
        global %1:private_extern
    %else
        global %1
    %endif
    %1: %2
%endmacro

; This is needed for ELF, otherwise the GNU linker assumes the stack is executable by default.
%if FORMAT_ELF
    [SECTION .note.GNU-stack noalloc noexec nowrite progbits]
%endif

; Tell debuggers how large the function was.
; This may be invoked multiple times per function; we rely on later instances overriding earlier ones.
; This is invoked by RET and similar macros, and also cglobal does it for the previous function,
; but if the last function in a source file doesn't use any of the standard macros for its epilogue,
; then its size might be unspecified.
%macro annotate_function_size 0
    %ifdef __YASM_VER__
        %ifdef current_function
            %if FORMAT_ELF
                current_function_section
                %%ecf equ $
                size current_function %%ecf - current_function
                __SECT__
            %endif
        %endif
    %endif
%endmacro

; cpuflags

%assign cpuflags_mmx      (1<<0)
%assign cpuflags_mmx2     (1<<1) | cpuflags_mmx
%assign cpuflags_3dnow    (1<<2) | cpuflags_mmx
%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
%assign cpuflags_sse      (1<<4) | cpuflags_mmx2
%assign cpuflags_sse2     (1<<5) | cpuflags_sse
%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
%assign cpuflags_lzcnt    (1<<7) | cpuflags_sse2
%assign cpuflags_sse3     (1<<8) | cpuflags_sse2
%assign cpuflags_ssse3    (1<<9) | cpuflags_sse3
%assign cpuflags_sse4     (1<<10)| cpuflags_ssse3
%assign cpuflags_sse42    (1<<11)| cpuflags_sse4
%assign cpuflags_aesni    (1<<12)| cpuflags_sse42
%assign cpuflags_gfni     (1<<13)| cpuflags_sse42
%assign cpuflags_avx      (1<<14)| cpuflags_sse42
%assign cpuflags_xop      (1<<15)| cpuflags_avx
%assign cpuflags_fma4     (1<<16)| cpuflags_avx
%assign cpuflags_fma3     (1<<17)| cpuflags_avx
%assign cpuflags_bmi1     (1<<18)| cpuflags_avx|cpuflags_lzcnt
%assign cpuflags_bmi2     (1<<19)| cpuflags_bmi1
%assign cpuflags_avx2     (1<<20)| cpuflags_fma3|cpuflags_bmi2
%assign cpuflags_avx512   (1<<21)| cpuflags_avx2 ; F, CD, BW, DQ, VL

%assign cpuflags_cache32  (1<<22)
%assign cpuflags_cache64  (1<<23)
%assign cpuflags_aligned  (1<<24) ; not a cpu feature, but a function variant
%assign cpuflags_atom     (1<<25)

; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
%define    cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
%define notcpuflag(x) (cpuflag(x) ^ 1)
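
; For example, in a function declared after "INIT_XMM ssse3", cpuflag(sse2)
; evaluates to 1 (implied by ssse3) and cpuflag(sse4) to 0, so code can be
; gated like:
;     %if cpuflag(ssse3)
;         pshufb m0, m1
;     %else
;         ; ssse3-free fallback
;     %endif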

; Takes an arbitrary number of cpuflags from the above list.
Daniel Kang7a000712012-06-07 17:25:54 -0700971; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu.
; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
%macro INIT_CPUFLAGS 0-*
    %xdefine SUFFIX
    %undef cpuname
    %assign cpuflags 0

    %if %0 >= 1
        %rep %0
            %ifdef cpuname
                %xdefine cpuname cpuname %+ _%1
            %else
                %xdefine cpuname %1
            %endif
            %assign cpuflags cpuflags | cpuflags_%1
            %rotate 1
        %endrep
        %xdefine SUFFIX _ %+ cpuname

        %if cpuflag(avx)
            %assign avx_enabled 1
        %endif
        %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2))
            %define mova movaps
            %define movu movups
            %define movnta movntps
        %endif
        %if cpuflag(aligned)
            %define movu mova
        %elif cpuflag(sse3) && notcpuflag(ssse3)
            %define movu lddqu
        %endif
    %endif

    %if ARCH_X86_64 || cpuflag(sse2)
        %ifdef __NASM_VER__
            ALIGNMODE p6
        %else
            CPU amdnop
        %endif
    %else
        %ifdef __NASM_VER__
            ALIGNMODE nop
        %else
            CPU basicnop
        %endif
    %endif
%endmacro

; Merge mmx, sse*, and avx*
; m# is a simd register of the currently selected size
; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
; zm# is the corresponding zmm register if mmsize >= 64, otherwise the same as m#
; (All 4 remain in sync through SWAP.)
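;
; For example, after INIT_YMM avx2, "m0" assembles as ymm0 while "xm0" still
; names xmm0, which is convenient for 128-bit heads and tails (a sketch):
;     INIT_YMM avx2
;     paddw m0, m1     ; 256-bit op, emitted as vpaddw ymm0, ymm0, ymm1
;     movq [r0], xm0   ; store only the low 8 bytes via xmm0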

%macro CAT_XDEFINE 3
    %xdefine %1%2 %3
%endmacro

%macro CAT_UNDEF 2
    %undef %1%2
%endmacro

%macro DEFINE_MMREGS 1 ; mmtype
    %assign %%prev_mmregs 0
    %ifdef num_mmregs
        %assign %%prev_mmregs num_mmregs
    %endif

    %assign num_mmregs 8
    %if ARCH_X86_64 && mmsize >= 16
        %assign num_mmregs 16
        %if cpuflag(avx512) || mmsize == 64
            %assign num_mmregs 32
        %endif
    %endif

    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, %1 %+ %%i
        CAT_XDEFINE nn%1, %%i, %%i
        %assign %%i %%i+1
    %endrep
    %if %%prev_mmregs > num_mmregs
        %rep %%prev_mmregs - num_mmregs
            CAT_UNDEF m, %%i
            CAT_UNDEF nn %+ mmtype, %%i
            %assign %%i %%i+1
        %endrep
    %endif
    %xdefine mmtype %1
%endmacro

; Prefer registers 16-31 over 0-15 to avoid having to use vzeroupper
%macro AVX512_MM_PERMUTATION 0-1 0 ; start_reg
    %if ARCH_X86_64 && cpuflag(avx512)
        %assign %%i %1
        %rep 16-%1
            %assign %%i_high %%i+16
            SWAP %%i, %%i_high
            %assign %%i %%i+1
        %endrep
    %endif
%endmacro

%macro INIT_MMX 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_MMX %1
    %define mmsize 8
    %define mova movq
    %define movu movq
    %define movh movd
    %define movnta movntq
    INIT_CPUFLAGS %1
    DEFINE_MMREGS mm
%endmacro

%macro INIT_XMM 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_XMM %1
    %define mmsize 16
    %define mova movdqa
    %define movu movdqu
    %define movh movq
    %define movnta movntdq
    INIT_CPUFLAGS %1
    DEFINE_MMREGS xmm
    %if WIN64
        AVX512_MM_PERMUTATION 6 ; Swap callee-saved registers with volatile registers
    %endif
%endmacro

%macro INIT_YMM 0-1+
    %assign avx_enabled 1
    %define RESET_MM_PERMUTATION INIT_YMM %1
    %define mmsize 32
    %define mova movdqa
    %define movu movdqu
    %undef movh
    %define movnta movntdq
    INIT_CPUFLAGS %1
    DEFINE_MMREGS ymm
    AVX512_MM_PERMUTATION
%endmacro

%macro INIT_ZMM 0-1+
    %assign avx_enabled 1
    %define RESET_MM_PERMUTATION INIT_ZMM %1
    %define mmsize 64
    %define mova movdqa
    %define movu movdqu
    %undef movh
    %define movnta movntdq
    INIT_CPUFLAGS %1
    DEFINE_MMREGS zmm
    AVX512_MM_PERMUTATION
%endmacro

INIT_XMM

%macro DECLARE_MMCAST 1
    %define  mmmm%1   mm%1
    %define  mmxmm%1  mm%1
    %define  mmymm%1  mm%1
    %define  mmzmm%1  mm%1
    %define xmmmm%1   mm%1
    %define xmmxmm%1 xmm%1
    %define xmmymm%1 xmm%1
    %define xmmzmm%1 xmm%1
    %define ymmmm%1   mm%1
    %define ymmxmm%1 xmm%1
    %define ymmymm%1 ymm%1
    %define ymmzmm%1 ymm%1
    %define zmmmm%1   mm%1
    %define zmmxmm%1 xmm%1
    %define zmmymm%1 ymm%1
    %define zmmzmm%1 zmm%1
    %define xm%1 xmm %+ m%1
    %define ym%1 ymm %+ m%1
    %define zm%1 zmm %+ m%1
%endmacro

%assign i 0
%rep 32
    DECLARE_MMCAST i
    %assign i i+1
%endrep

; I often want to use macros that permute their arguments. e.g. there's no
; efficient way to implement butterfly or transpose or dct without swapping some
; arguments.
;
; I would like to not have to manually keep track of the permutations:
; If I insert a permutation in the middle of a function, it should automatically
; change everything that follows. For more complex macros I may also have multiple
; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
;
; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
; permutes its arguments. It's equivalent to exchanging the contents of the
; registers, except that this way you exchange the register names instead, so it
; doesn't cost any cycles.
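;
; For example, after
;     SWAP 0, 1
; any later reference to m0 assembles as the register previously named m1 and
; vice versa; no mov/xchg instructions are emitted.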

%macro PERMUTE 2-* ; takes a list of pairs to swap
    %rep %0/2
        %xdefine %%tmp%2 m%2
        %rotate 2
    %endrep
    %rep %0/2
        %xdefine m%1 %%tmp%2
        CAT_XDEFINE nn, m%1, %1
        %rotate 2
    %endrep
%endmacro

%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs)
    %ifnum %1 ; SWAP 0, 1, ...
        SWAP_INTERNAL_NUM %1, %2
    %else ; SWAP m0, m1, ...
        SWAP_INTERNAL_NAME %1, %2
    %endif
%endmacro

%macro SWAP_INTERNAL_NUM 2-*
    %rep %0-1
        %xdefine %%tmp m%1
        %xdefine m%1 m%2
        %xdefine m%2 %%tmp
        CAT_XDEFINE nn, m%1, %1
        CAT_XDEFINE nn, m%2, %2
        %rotate 1
    %endrep
%endmacro

%macro SWAP_INTERNAL_NAME 2-*
    %xdefine %%args nn %+ %1
    %rep %0-1
        %xdefine %%args %%args, nn %+ %2
        %rotate 1
    %endrep
    SWAP_INTERNAL_NUM %%args
%endmacro

; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
; calls to that function will automatically load the permutation, so values can
; be returned in mmregs.
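;
; A sketch of the pattern ("helper" being a hypothetical function assembled
; with the same SUFFIX as its caller):
;     cglobal helper, 0,0
;         ...
;         SWAP 0, 2               ; leave the result in the register named m0
;         SAVE_MM_PERMUTATION     ; record helper's name->register mapping
;         RET
; A subsequent "call helper" then reloads that permutation, so the caller's m0
; refers to the same physical register the callee ended with.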
%macro SAVE_MM_PERMUTATION 0-1
    %if %0
        %xdefine %%f %1_m
    %else
        %xdefine %%f current_function %+ _m
    %endif
    %assign %%i 0
    %rep num_mmregs
        %xdefine %%tmp m %+ %%i
        CAT_XDEFINE %%f, %%i, regnumof %+ %%tmp
        %assign %%i %%i+1
    %endrep
%endmacro

%macro LOAD_MM_PERMUTATION 0-1 ; name to load from
    %if %0
        %xdefine %%f %1_m
    %else
        %xdefine %%f current_function %+ _m
    %endif
    %xdefine %%tmp %%f %+ 0
    %ifnum %%tmp
        RESET_MM_PERMUTATION
        %assign %%i 0
        %rep num_mmregs
            %xdefine %%tmp %%f %+ %%i
            CAT_XDEFINE %%m, %%i, m %+ %%tmp
            %assign %%i %%i+1
        %endrep
        %rep num_mmregs
            %assign %%i %%i-1
            CAT_XDEFINE m, %%i, %%m %+ %%i
            CAT_XDEFINE nn, m %+ %%i, %%i
        %endrep
    %endif
%endmacro

; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
%macro call 1
    %ifid %1
        call_internal %1 %+ SUFFIX, %1
    %else
        call %1
    %endif
%endmacro
%macro call_internal 2
    %xdefine %%i %2
    %ifndef cglobaled_%2
        %ifdef cglobaled_%1
            %xdefine %%i %1
        %endif
    %endif
    call %%i
    LOAD_MM_PERMUTATION %%i
%endmacro

; Substitutions that reduce instruction size but are functionally equivalent
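; For example, "add r0, 128" needs a 4-byte immediate because 128 doesn't fit
; in a signed byte, whereas the equivalent "sub r0, -128" encodes -128 as a
; 1-byte sign-extended immediate.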
1274%macro add 2
1275 %ifnum %2
1276 %if %2==128
1277 sub %1, -128
1278 %else
1279 add %1, %2
1280 %endif
1281 %else
1282 add %1, %2
1283 %endif
1284%endmacro
1285
1286%macro sub 2
1287 %ifnum %2
1288 %if %2==128
1289 add %1, -128
1290 %else
1291 sub %1, %2
1292 %endif
1293 %else
1294 sub %1, %2
1295 %endif
1296%endmacro
1297
1298;=============================================================================
1299; AVX abstraction layer
1300;=============================================================================
1301
1302%assign i 0
Johannc8100662020-04-26 21:00:00 +09001303%rep 32
Daniel Kang7a000712012-06-07 17:25:54 -07001304 %if i < 8
1305 CAT_XDEFINE sizeofmm, i, 8
Johannc8100662020-04-26 21:00:00 +09001306 CAT_XDEFINE regnumofmm, i, i
Daniel Kang7a000712012-06-07 17:25:54 -07001307 %endif
1308 CAT_XDEFINE sizeofxmm, i, 16
1309 CAT_XDEFINE sizeofymm, i, 32
Johannc8100662020-04-26 21:00:00 +09001310 CAT_XDEFINE sizeofzmm, i, 64
1311 CAT_XDEFINE regnumofxmm, i, i
1312 CAT_XDEFINE regnumofymm, i, i
1313 CAT_XDEFINE regnumofzmm, i, i
Johann24973562016-02-17 17:37:24 -08001314 %assign i i+1
Daniel Kang7a000712012-06-07 17:25:54 -07001315%endrep
1316%undef i
1317
Johann41a0a0c2015-07-30 09:19:43 -07001318%macro CHECK_AVX_INSTR_EMU 3-*
1319 %xdefine %%opcode %1
1320 %xdefine %%dst %2
1321 %rep %0-2
1322 %ifidn %%dst, %3
1323 %error non-avx emulation of ``%%opcode'' is not supported
Daniel Kang7a000712012-06-07 17:25:54 -07001324 %endif
Johann41a0a0c2015-07-30 09:19:43 -07001325 %rotate 1
1326 %endrep
1327%endmacro
Daniel Kang7a000712012-06-07 17:25:54 -07001328
Johann41a0a0c2015-07-30 09:19:43 -07001329;%1 == instruction
1330;%2 == minimal instruction set
1331;%3 == 1 if float, 0 if int
Johannc8100662020-04-26 21:00:00 +09001332;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation)
Johann41a0a0c2015-07-30 09:19:43 -07001333;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
1334;%6+: operands
1335%macro RUN_AVX_INSTR 6-9+
1336 %ifnum sizeof%7
1337 %assign __sizeofreg sizeof%7
1338 %elifnum sizeof%6
1339 %assign __sizeofreg sizeof%6
1340 %else
1341 %assign __sizeofreg mmsize
1342 %endif
1343 %assign __emulate_avx 0
1344 %if avx_enabled && __sizeofreg >= 16
1345 %xdefine __instr v%1
1346 %else
1347 %xdefine __instr %1
1348 %if %0 >= 8+%4
1349 %assign __emulate_avx 1
1350 %endif
1351 %endif
1352 %ifnidn %2, fnord
1353 %ifdef cpuname
1354 %if notcpuflag(%2)
1355 %error use of ``%1'' %2 instruction in cpuname function: current_function
Johannc8100662020-04-26 21:00:00 +09001356 %elif %3 == 0 && __sizeofreg == 16 && notcpuflag(sse2)
Johann41a0a0c2015-07-30 09:19:43 -07001357 %error use of ``%1'' sse2 instruction in cpuname function: current_function
Johannc8100662020-04-26 21:00:00 +09001358 %elif %3 == 0 && __sizeofreg == 32 && notcpuflag(avx2)
1359 %error use of ``%1'' avx2 instruction in cpuname function: current_function
1360 %elif __sizeofreg == 16 && notcpuflag(sse)
1361 %error use of ``%1'' sse instruction in cpuname function: current_function
1362 %elif __sizeofreg == 32 && notcpuflag(avx)
1363 %error use of ``%1'' avx instruction in cpuname function: current_function
1364 %elif __sizeofreg == 64 && notcpuflag(avx512)
1365 %error use of ``%1'' avx512 instruction in cpuname function: current_function
1366 %elifidn %1, pextrw ; special case because the base instruction is mmx2,
1367 %ifnid %6 ; but sse4 is required for memory operands
1368 %if notcpuflag(sse4)
1369 %error use of ``%1'' sse4 instruction in cpuname function: current_function
1370 %endif
1371 %endif
Daniel Kang7a000712012-06-07 17:25:54 -07001372 %endif
Daniel Kang7a000712012-06-07 17:25:54 -07001373 %endif
1374 %endif
Daniel Kang7a000712012-06-07 17:25:54 -07001375
Johann41a0a0c2015-07-30 09:19:43 -07001376 %if __emulate_avx
1377 %xdefine __src1 %7
1378 %xdefine __src2 %8
Johannc8100662020-04-26 21:00:00 +09001379 %if %5 && %4 == 0
1380 %ifnidn %6, %7
1381 %ifidn %6, %8
1382 %xdefine __src1 %8
1383 %xdefine __src2 %7
1384 %elifnnum sizeof%8
Johann41a0a0c2015-07-30 09:19:43 -07001385 ; 3-operand AVX instructions with a memory arg can only have it in src2,
1386 ; whereas SSE emulation prefers to have it in src1 (i.e. the mov).
1387 ; So, if the instruction is commutative with a memory arg, swap them.
1388 %xdefine __src1 %8
1389 %xdefine __src2 %7
1390 %endif
1391 %endif
Johannc8100662020-04-26 21:00:00 +09001392 %endif
1393 %ifnidn %6, __src1
1394 %if %0 >= 9
1395 CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, __src2, %9
1396 %else
1397 CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, __src2
1398 %endif
Johann41a0a0c2015-07-30 09:19:43 -07001399 %if __sizeofreg == 8
1400 MOVQ %6, __src1
1401 %elif %3
1402 MOVAPS %6, __src1
1403 %else
1404 MOVDQA %6, __src1
1405 %endif
Daniel Kang7a000712012-06-07 17:25:54 -07001406 %endif
Johann41a0a0c2015-07-30 09:19:43 -07001407 %if %0 >= 9
1408 %1 %6, __src2, %9
1409 %else
1410 %1 %6, __src2
Daniel Kang7a000712012-06-07 17:25:54 -07001411 %endif
Johann41a0a0c2015-07-30 09:19:43 -07001412 %elif %0 >= 9
1413 __instr %6, %7, %8, %9
1414 %elif %0 == 8
Johannc8100662020-04-26 21:00:00 +09001415 %if avx_enabled && %5
1416 %xdefine __src1 %7
1417 %xdefine __src2 %8
1418 %ifnum regnumof%7
1419 %ifnum regnumof%8
1420 %if regnumof%7 < 8 && regnumof%8 >= 8 && regnumof%8 < 16 && sizeof%8 <= 32
1421 ; Most VEX-encoded instructions require an additional byte to encode when
1422 ; src2 is a high register (e.g. m8..15). If the instruction is commutative
1423 ; we can swap src1 and src2 when doing so reduces the instruction length.
1424 %xdefine __src1 %8
1425 %xdefine __src2 %7
1426 %endif
1427 %endif
1428 %endif
1429 __instr %6, __src1, __src2
1430 %else
1431 __instr %6, %7, %8
1432 %endif
Johann41a0a0c2015-07-30 09:19:43 -07001433 %elif %0 == 7
Johannc8100662020-04-26 21:00:00 +09001434 %if avx_enabled && %5
1435 %xdefine __src1 %6
1436 %xdefine __src2 %7
1437 %ifnum regnumof%6
1438 %ifnum regnumof%7
1439 %if regnumof%6 < 8 && regnumof%7 >= 8 && regnumof%7 < 16 && sizeof%7 <= 32
1440 %xdefine __src1 %7
1441 %xdefine __src2 %6
1442 %endif
1443 %endif
1444 %endif
1445 __instr %6, __src1, __src2
1446 %else
1447 __instr %6, %7
1448 %endif
Daniel Kang7a000712012-06-07 17:25:54 -07001449 %else
Johann41a0a0c2015-07-30 09:19:43 -07001450 __instr %6
Daniel Kang7a000712012-06-07 17:25:54 -07001451 %endif
1452%endmacro
1453
;%1 == instruction
;%2 == minimal instruction set
;%3 == 1 if float, 0 if int
;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation)
;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
%macro AVX_INSTR 1-5 fnord, 0, 255, 0
    %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5
        %ifidn %2, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1
        %elifidn %3, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2
        %elifidn %4, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3
        %elifidn %5, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4
        %else
            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4, %5
        %endif
    %endmacro
%endmacro
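
; Example (illustrative): after "AVX_INSTR addps, sse, 1, 0, 1" below, writing
;     addps m0, m1, m2
; assembles to "vaddps m0, m1, m2" in AVX functions and is emulated in
; pre-AVX functions as
;     movaps m0, m1
;     addps  m0, m2
; (the mov is skipped when dst and src1 are already the same register)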

; Instructions with both VEX/EVEX and legacy encodings
; Non-destructive instructions are written without parameters
AVX_INSTR addpd, sse2, 1, 0, 1
AVX_INSTR addps, sse, 1, 0, 1
AVX_INSTR addsd, sse2, 1, 0, 0
AVX_INSTR addss, sse, 1, 0, 0
AVX_INSTR addsubpd, sse3, 1, 0, 0
AVX_INSTR addsubps, sse3, 1, 0, 0
AVX_INSTR aesdec, aesni, 0, 0, 0
AVX_INSTR aesdeclast, aesni, 0, 0, 0
AVX_INSTR aesenc, aesni, 0, 0, 0
AVX_INSTR aesenclast, aesni, 0, 0, 0
AVX_INSTR aesimc, aesni
AVX_INSTR aeskeygenassist, aesni
AVX_INSTR andnpd, sse2, 1, 0, 0
AVX_INSTR andnps, sse, 1, 0, 0
AVX_INSTR andpd, sse2, 1, 0, 1
AVX_INSTR andps, sse, 1, 0, 1
AVX_INSTR blendpd, sse4, 1, 1, 0
AVX_INSTR blendps, sse4, 1, 1, 0
AVX_INSTR blendvpd, sse4 ; can't be emulated (SSE4 form uses xmm0 as implicit mask)
AVX_INSTR blendvps, sse4 ; can't be emulated (SSE4 form uses xmm0 as implicit mask)
AVX_INSTR cmpeqpd, sse2, 1, 0, 1
AVX_INSTR cmpeqps, sse, 1, 0, 1
AVX_INSTR cmpeqsd, sse2, 1, 0, 0
AVX_INSTR cmpeqss, sse, 1, 0, 0
AVX_INSTR cmplepd, sse2, 1, 0, 0
AVX_INSTR cmpleps, sse, 1, 0, 0
AVX_INSTR cmplesd, sse2, 1, 0, 0
AVX_INSTR cmpless, sse, 1, 0, 0
AVX_INSTR cmpltpd, sse2, 1, 0, 0
AVX_INSTR cmpltps, sse, 1, 0, 0
AVX_INSTR cmpltsd, sse2, 1, 0, 0
AVX_INSTR cmpltss, sse, 1, 0, 0
AVX_INSTR cmpneqpd, sse2, 1, 0, 1
AVX_INSTR cmpneqps, sse, 1, 0, 1
AVX_INSTR cmpneqsd, sse2, 1, 0, 0
AVX_INSTR cmpneqss, sse, 1, 0, 0
AVX_INSTR cmpnlepd, sse2, 1, 0, 0
AVX_INSTR cmpnleps, sse, 1, 0, 0
AVX_INSTR cmpnlesd, sse2, 1, 0, 0
AVX_INSTR cmpnless, sse, 1, 0, 0
AVX_INSTR cmpnltpd, sse2, 1, 0, 0
AVX_INSTR cmpnltps, sse, 1, 0, 0
AVX_INSTR cmpnltsd, sse2, 1, 0, 0
AVX_INSTR cmpnltss, sse, 1, 0, 0
AVX_INSTR cmpordpd, sse2, 1, 0, 1
AVX_INSTR cmpordps, sse, 1, 0, 1
AVX_INSTR cmpordsd, sse2, 1, 0, 0
AVX_INSTR cmpordss, sse, 1, 0, 0
AVX_INSTR cmppd, sse2, 1, 1, 0
AVX_INSTR cmpps, sse, 1, 1, 0
AVX_INSTR cmpsd, sse2, 1, 1, 0
AVX_INSTR cmpss, sse, 1, 1, 0
AVX_INSTR cmpunordpd, sse2, 1, 0, 1
AVX_INSTR cmpunordps, sse, 1, 0, 1
AVX_INSTR cmpunordsd, sse2, 1, 0, 0
AVX_INSTR cmpunordss, sse, 1, 0, 0
AVX_INSTR comisd, sse2, 1
AVX_INSTR comiss, sse, 1
AVX_INSTR cvtdq2pd, sse2, 1
AVX_INSTR cvtdq2ps, sse2, 1
AVX_INSTR cvtpd2dq, sse2, 1
AVX_INSTR cvtpd2ps, sse2, 1
AVX_INSTR cvtps2dq, sse2, 1
AVX_INSTR cvtps2pd, sse2, 1
AVX_INSTR cvtsd2si, sse2, 1
AVX_INSTR cvtsd2ss, sse2, 1, 0, 0
AVX_INSTR cvtsi2sd, sse2, 1, 0, 0
AVX_INSTR cvtsi2ss, sse, 1, 0, 0
AVX_INSTR cvtss2sd, sse2, 1, 0, 0
AVX_INSTR cvtss2si, sse, 1
AVX_INSTR cvttpd2dq, sse2, 1
AVX_INSTR cvttps2dq, sse2, 1
AVX_INSTR cvttsd2si, sse2, 1
AVX_INSTR cvttss2si, sse, 1
AVX_INSTR divpd, sse2, 1, 0, 0
AVX_INSTR divps, sse, 1, 0, 0
AVX_INSTR divsd, sse2, 1, 0, 0
AVX_INSTR divss, sse, 1, 0, 0
AVX_INSTR dppd, sse4, 1, 1, 0
AVX_INSTR dpps, sse4, 1, 1, 0
AVX_INSTR extractps, sse4, 1
AVX_INSTR gf2p8affineinvqb, gfni, 0, 1, 0
AVX_INSTR gf2p8affineqb, gfni, 0, 1, 0
AVX_INSTR gf2p8mulb, gfni, 0, 0, 0
AVX_INSTR haddpd, sse3, 1, 0, 0
AVX_INSTR haddps, sse3, 1, 0, 0
AVX_INSTR hsubpd, sse3, 1, 0, 0
AVX_INSTR hsubps, sse3, 1, 0, 0
AVX_INSTR insertps, sse4, 1, 1, 0
AVX_INSTR lddqu, sse3
AVX_INSTR ldmxcsr, sse, 1
AVX_INSTR maskmovdqu, sse2
AVX_INSTR maxpd, sse2, 1, 0, 1
AVX_INSTR maxps, sse, 1, 0, 1
AVX_INSTR maxsd, sse2, 1, 0, 0
AVX_INSTR maxss, sse, 1, 0, 0
AVX_INSTR minpd, sse2, 1, 0, 1
AVX_INSTR minps, sse, 1, 0, 1
AVX_INSTR minsd, sse2, 1, 0, 0
AVX_INSTR minss, sse, 1, 0, 0
AVX_INSTR movapd, sse2, 1
AVX_INSTR movaps, sse, 1
AVX_INSTR movd, mmx
AVX_INSTR movddup, sse3, 1
AVX_INSTR movdqa, sse2
AVX_INSTR movdqu, sse2
AVX_INSTR movhlps, sse, 1, 0, 0
AVX_INSTR movhpd, sse2, 1, 0, 0
AVX_INSTR movhps, sse, 1, 0, 0
AVX_INSTR movlhps, sse, 1, 0, 0
AVX_INSTR movlpd, sse2, 1, 0, 0
AVX_INSTR movlps, sse, 1, 0, 0
AVX_INSTR movmskpd, sse2, 1
AVX_INSTR movmskps, sse, 1
AVX_INSTR movntdq, sse2
AVX_INSTR movntdqa, sse4
AVX_INSTR movntpd, sse2, 1
AVX_INSTR movntps, sse, 1
AVX_INSTR movq, mmx
AVX_INSTR movsd, sse2, 1, 0, 0
AVX_INSTR movshdup, sse3, 1
AVX_INSTR movsldup, sse3, 1
AVX_INSTR movss, sse, 1, 0, 0
AVX_INSTR movupd, sse2, 1
AVX_INSTR movups, sse, 1
AVX_INSTR mpsadbw, sse4, 0, 1, 0
AVX_INSTR mulpd, sse2, 1, 0, 1
AVX_INSTR mulps, sse, 1, 0, 1
AVX_INSTR mulsd, sse2, 1, 0, 0
AVX_INSTR mulss, sse, 1, 0, 0
AVX_INSTR orpd, sse2, 1, 0, 1
AVX_INSTR orps, sse, 1, 0, 1
AVX_INSTR pabsb, ssse3
AVX_INSTR pabsd, ssse3
AVX_INSTR pabsw, ssse3
AVX_INSTR packsswb, mmx, 0, 0, 0
AVX_INSTR packssdw, mmx, 0, 0, 0
AVX_INSTR packuswb, mmx, 0, 0, 0
AVX_INSTR packusdw, sse4, 0, 0, 0
AVX_INSTR paddb, mmx, 0, 0, 1
AVX_INSTR paddw, mmx, 0, 0, 1
AVX_INSTR paddd, mmx, 0, 0, 1
AVX_INSTR paddq, sse2, 0, 0, 1
AVX_INSTR paddsb, mmx, 0, 0, 1
AVX_INSTR paddsw, mmx, 0, 0, 1
AVX_INSTR paddusb, mmx, 0, 0, 1
AVX_INSTR paddusw, mmx, 0, 0, 1
AVX_INSTR palignr, ssse3, 0, 1, 0
AVX_INSTR pand, mmx, 0, 0, 1
AVX_INSTR pandn, mmx, 0, 0, 0
AVX_INSTR pavgb, mmx2, 0, 0, 1
AVX_INSTR pavgw, mmx2, 0, 0, 1
AVX_INSTR pblendvb, sse4 ; can't be emulated (SSE4 form uses xmm0 as implicit mask)
AVX_INSTR pblendw, sse4, 0, 1, 0
AVX_INSTR pclmulqdq, fnord, 0, 1, 0
AVX_INSTR pclmulhqhqdq, fnord, 0, 0, 0
AVX_INSTR pclmulhqlqdq, fnord, 0, 0, 0
AVX_INSTR pclmullqhqdq, fnord, 0, 0, 0
AVX_INSTR pclmullqlqdq, fnord, 0, 0, 0
AVX_INSTR pcmpestri, sse42
AVX_INSTR pcmpestrm, sse42
AVX_INSTR pcmpistri, sse42
AVX_INSTR pcmpistrm, sse42
AVX_INSTR pcmpeqb, mmx, 0, 0, 1
AVX_INSTR pcmpeqw, mmx, 0, 0, 1
AVX_INSTR pcmpeqd, mmx, 0, 0, 1
AVX_INSTR pcmpeqq, sse4, 0, 0, 1
AVX_INSTR pcmpgtb, mmx, 0, 0, 0
AVX_INSTR pcmpgtw, mmx, 0, 0, 0
AVX_INSTR pcmpgtd, mmx, 0, 0, 0
AVX_INSTR pcmpgtq, sse42, 0, 0, 0
AVX_INSTR pextrb, sse4
AVX_INSTR pextrd, sse4
AVX_INSTR pextrq, sse4
AVX_INSTR pextrw, mmx2
AVX_INSTR phaddw, ssse3, 0, 0, 0
AVX_INSTR phaddd, ssse3, 0, 0, 0
AVX_INSTR phaddsw, ssse3, 0, 0, 0
AVX_INSTR phminposuw, sse4
AVX_INSTR phsubw, ssse3, 0, 0, 0
AVX_INSTR phsubd, ssse3, 0, 0, 0
AVX_INSTR phsubsw, ssse3, 0, 0, 0
AVX_INSTR pinsrb, sse4, 0, 1, 0
AVX_INSTR pinsrd, sse4, 0, 1, 0
AVX_INSTR pinsrq, sse4, 0, 1, 0
AVX_INSTR pinsrw, mmx2, 0, 1, 0
AVX_INSTR pmaddwd, mmx, 0, 0, 1
AVX_INSTR pmaddubsw, ssse3, 0, 0, 0
AVX_INSTR pmaxsb, sse4, 0, 0, 1
AVX_INSTR pmaxsw, mmx2, 0, 0, 1
AVX_INSTR pmaxsd, sse4, 0, 0, 1
AVX_INSTR pmaxub, mmx2, 0, 0, 1
AVX_INSTR pmaxuw, sse4, 0, 0, 1
AVX_INSTR pmaxud, sse4, 0, 0, 1
AVX_INSTR pminsb, sse4, 0, 0, 1
AVX_INSTR pminsw, mmx2, 0, 0, 1
AVX_INSTR pminsd, sse4, 0, 0, 1
AVX_INSTR pminub, mmx2, 0, 0, 1
AVX_INSTR pminuw, sse4, 0, 0, 1
AVX_INSTR pminud, sse4, 0, 0, 1
AVX_INSTR pmovmskb, mmx2
AVX_INSTR pmovsxbw, sse4
AVX_INSTR pmovsxbd, sse4
AVX_INSTR pmovsxbq, sse4
AVX_INSTR pmovsxwd, sse4
AVX_INSTR pmovsxwq, sse4
AVX_INSTR pmovsxdq, sse4
AVX_INSTR pmovzxbw, sse4
AVX_INSTR pmovzxbd, sse4
AVX_INSTR pmovzxbq, sse4
AVX_INSTR pmovzxwd, sse4
AVX_INSTR pmovzxwq, sse4
AVX_INSTR pmovzxdq, sse4
AVX_INSTR pmuldq, sse4, 0, 0, 1
AVX_INSTR pmulhrsw, ssse3, 0, 0, 1
AVX_INSTR pmulhuw, mmx2, 0, 0, 1
AVX_INSTR pmulhw, mmx, 0, 0, 1
AVX_INSTR pmullw, mmx, 0, 0, 1
AVX_INSTR pmulld, sse4, 0, 0, 1
AVX_INSTR pmuludq, sse2, 0, 0, 1
AVX_INSTR por, mmx, 0, 0, 1
AVX_INSTR psadbw, mmx2, 0, 0, 1
AVX_INSTR pshufb, ssse3, 0, 0, 0
AVX_INSTR pshufd, sse2
AVX_INSTR pshufhw, sse2
AVX_INSTR pshuflw, sse2
AVX_INSTR psignb, ssse3, 0, 0, 0
AVX_INSTR psignw, ssse3, 0, 0, 0
AVX_INSTR psignd, ssse3, 0, 0, 0
AVX_INSTR psllw, mmx, 0, 0, 0
AVX_INSTR pslld, mmx, 0, 0, 0
AVX_INSTR psllq, mmx, 0, 0, 0
AVX_INSTR pslldq, sse2, 0, 0, 0
AVX_INSTR psraw, mmx, 0, 0, 0
AVX_INSTR psrad, mmx, 0, 0, 0
AVX_INSTR psrlw, mmx, 0, 0, 0
AVX_INSTR psrld, mmx, 0, 0, 0
AVX_INSTR psrlq, mmx, 0, 0, 0
AVX_INSTR psrldq, sse2, 0, 0, 0
AVX_INSTR psubb, mmx, 0, 0, 0
AVX_INSTR psubw, mmx, 0, 0, 0
AVX_INSTR psubd, mmx, 0, 0, 0
AVX_INSTR psubq, sse2, 0, 0, 0
AVX_INSTR psubsb, mmx, 0, 0, 0
AVX_INSTR psubsw, mmx, 0, 0, 0
AVX_INSTR psubusb, mmx, 0, 0, 0
AVX_INSTR psubusw, mmx, 0, 0, 0
AVX_INSTR ptest, sse4
AVX_INSTR punpckhbw, mmx, 0, 0, 0
AVX_INSTR punpckhwd, mmx, 0, 0, 0
AVX_INSTR punpckhdq, mmx, 0, 0, 0
AVX_INSTR punpckhqdq, sse2, 0, 0, 0
AVX_INSTR punpcklbw, mmx, 0, 0, 0
AVX_INSTR punpcklwd, mmx, 0, 0, 0
AVX_INSTR punpckldq, mmx, 0, 0, 0
AVX_INSTR punpcklqdq, sse2, 0, 0, 0
AVX_INSTR pxor, mmx, 0, 0, 1
AVX_INSTR rcpps, sse, 1
AVX_INSTR rcpss, sse, 1, 0, 0
AVX_INSTR roundpd, sse4, 1
AVX_INSTR roundps, sse4, 1
AVX_INSTR roundsd, sse4, 1, 1, 0
AVX_INSTR roundss, sse4, 1, 1, 0
AVX_INSTR rsqrtps, sse, 1
AVX_INSTR rsqrtss, sse, 1, 0, 0
AVX_INSTR shufpd, sse2, 1, 1, 0
AVX_INSTR shufps, sse, 1, 1, 0
AVX_INSTR sqrtpd, sse2, 1
AVX_INSTR sqrtps, sse, 1
AVX_INSTR sqrtsd, sse2, 1, 0, 0
AVX_INSTR sqrtss, sse, 1, 0, 0
AVX_INSTR stmxcsr, sse, 1
AVX_INSTR subpd, sse2, 1, 0, 0
AVX_INSTR subps, sse, 1, 0, 0
AVX_INSTR subsd, sse2, 1, 0, 0
AVX_INSTR subss, sse, 1, 0, 0
AVX_INSTR ucomisd, sse2, 1
AVX_INSTR ucomiss, sse, 1
AVX_INSTR unpckhpd, sse2, 1, 0, 0
AVX_INSTR unpckhps, sse, 1, 0, 0
AVX_INSTR unpcklpd, sse2, 1, 0, 0
AVX_INSTR unpcklps, sse, 1, 0, 0
AVX_INSTR xorpd, sse2, 1, 0, 1
AVX_INSTR xorps, sse, 1, 0, 1

; 3DNow! instructions, for sharing code between the AVX, SSE and 3DNow! versions
AVX_INSTR pfadd, 3dnow, 1, 0, 1
AVX_INSTR pfsub, 3dnow, 1, 0, 0
AVX_INSTR pfmul, 3dnow, 1, 0, 1

;%1 == instruction
;%2 == minimal instruction set
%macro GPR_INSTR 2
    %macro %1 2-5 fnord, %1, %2
        %ifdef cpuname
            %if notcpuflag(%5)
                %error use of ``%4'' %5 instruction in cpuname function: current_function
            %endif
        %endif
        %ifidn %3, fnord
            %4 %1, %2
        %else
            %4 %1, %2, %3
        %endif
    %endmacro
%endmacro
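
; Example (illustrative): after "GPR_INSTR andn, bmi1" below, writing
;     andn eax, ebx, ecx
; inside a function whose declared cpuflags lack bmi1 triggers a compile-time
; %error instead of silently emitting an instruction the target may not support.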

GPR_INSTR andn, bmi1
GPR_INSTR bextr, bmi1
GPR_INSTR blsi, bmi1
GPR_INSTR blsr, bmi1
GPR_INSTR blsmsk, bmi1
GPR_INSTR bzhi, bmi2
GPR_INSTR mulx, bmi2
GPR_INSTR pdep, bmi2
GPR_INSTR pext, bmi2
GPR_INSTR popcnt, sse42
GPR_INSTR rorx, bmi2
GPR_INSTR sarx, bmi2
GPR_INSTR shlx, bmi2
GPR_INSTR shrx, bmi2

; base-4 constants for shuffles
%assign i 0
%rep 256
    %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
    %if j < 10
        CAT_XDEFINE q000, j, i
    %elif j < 100
        CAT_XDEFINE q00, j, i
    %elif j < 1000
        CAT_XDEFINE q0, j, i
    %else
        CAT_XDEFINE q, j, i
    %endif
    %assign i i+1
%endrep
%undef i
%undef j
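
; Example (illustrative): q3210 expands to 0xE4 (the identity shuffle) and
; q0123 to 0x1B, so "pshufd m0, m1, q0123" reverses the order of the four
; dwords in m1.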

%macro FMA_INSTR 3
    %macro %1 4-7 %1, %2, %3
        %if cpuflag(xop)
            v%5 %1, %2, %3, %4
        %elifnidn %1, %4
            %6 %1, %2, %3
            %7 %1, %4
        %else
            %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported
        %endif
    %endmacro
%endmacro
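
; Example (illustrative): after "FMA_INSTR pmacsww, pmullw, paddw" below,
;     pmacsww m0, m1, m2, m3
; emits the single XOP instruction on XOP-capable CPUs and otherwise expands to
;     pmullw m0, m1, m2
;     paddw  m0, m3
; so the destination must not alias the accumulator (%4) when emulating.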

FMA_INSTR pmacsww, pmullw, paddw
FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation
FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation
FMA_INSTR pmadcswd, pmaddwd, paddd

; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, src3) syntax.
; FMA3 is only possible if dst is the same as one of the src registers.
; Either src2 or src3 can be a memory operand.
%macro FMA4_INSTR 2-*
    %push fma4_instr
    %xdefine %$prefix %1
    %rep %0 - 1
        %macro %$prefix%2 4-6 %$prefix, %2
            %if notcpuflag(fma3) && notcpuflag(fma4)
                %error use of ``%5%6'' fma instruction in cpuname function: current_function
            %elif cpuflag(fma4)
                v%5%6 %1, %2, %3, %4
            %elifidn %1, %2
                ; If %3 or %4 is a memory operand it needs to be encoded as the last operand.
                %ifnum sizeof%3
                    v%{5}213%6 %2, %3, %4
                %else
                    v%{5}132%6 %2, %4, %3
                %endif
            %elifidn %1, %3
                v%{5}213%6 %3, %2, %4
            %elifidn %1, %4
                v%{5}231%6 %4, %2, %3
            %else
                %error fma3 emulation of ``%5%6 %1, %2, %3, %4'' is not supported
            %endif
        %endmacro
        %rotate 1
    %endrep
    %pop
%endmacro
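
; Example (illustrative): "fmaddps m0, m0, m1, m2" (dst == src1) selects the
; FMA3 form "vfmadd213ps m0, m1, m2" when fma4 is unavailable, computing
; m0 = m0*m1 + m2 without needing the 4-operand FMA4 encoding.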

FMA4_INSTR fmadd, pd, ps, sd, ss
FMA4_INSTR fmaddsub, pd, ps
FMA4_INSTR fmsub, pd, ps, sd, ss
FMA4_INSTR fmsubadd, pd, ps
FMA4_INSTR fnmadd, pd, ps, sd, ss
FMA4_INSTR fnmsub, pd, ps, sd, ss

; Macros for converting VEX instructions to equivalent EVEX ones.
%macro EVEX_INSTR 2-3 0 ; vex, evex, prefer_evex
    %macro %1 2-7 fnord, fnord, %1, %2, %3
        %ifidn %3, fnord
            %define %%args %1, %2
        %elifidn %4, fnord
            %define %%args %1, %2, %3
        %else
            %define %%args %1, %2, %3, %4
        %endif
        %assign %%evex_required cpuflag(avx512) & %7
        %ifnum regnumof%1
            %if regnumof%1 >= 16 || sizeof%1 > 32
                %assign %%evex_required 1
            %endif
        %endif
        %ifnum regnumof%2
            %if regnumof%2 >= 16 || sizeof%2 > 32
                %assign %%evex_required 1
            %endif
        %endif
        %ifnum regnumof%3
            %if regnumof%3 >= 16 || sizeof%3 > 32
                %assign %%evex_required 1
            %endif
        %endif
        %if %%evex_required
            %6 %%args
        %else
            %5 %%args ; Prefer VEX over EVEX due to shorter instruction length
        %endif
    %endmacro
%endmacro

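; Example (illustrative): after "EVEX_INSTR vmovdqa, vmovdqa32" below, writing
; "vmovdqa m20, m21" selects the EVEX form vmovdqa32 (required for zmm
; registers and for xmm/ymm registers 16-31), while "vmovdqa m0, m1" keeps
; the shorter VEX encoding.
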
EVEX_INSTR vbroadcastf128, vbroadcastf32x4
EVEX_INSTR vbroadcasti128, vbroadcasti32x4
EVEX_INSTR vextractf128, vextractf32x4
EVEX_INSTR vextracti128, vextracti32x4
EVEX_INSTR vinsertf128, vinsertf32x4
EVEX_INSTR vinserti128, vinserti32x4
EVEX_INSTR vmovdqa, vmovdqa32
EVEX_INSTR vmovdqu, vmovdqu32
EVEX_INSTR vpand, vpandd
EVEX_INSTR vpandn, vpandnd
EVEX_INSTR vpor, vpord
EVEX_INSTR vpxor, vpxord
EVEX_INSTR vrcpps, vrcp14ps, 1 ; EVEX versions have higher precision
EVEX_INSTR vrcpss, vrcp14ss, 1
EVEX_INSTR vrsqrtps, vrsqrt14ps, 1
EVEX_INSTR vrsqrtss, vrsqrt14ss, 1