Modify SAVE_XMM for potential 64-bit use
The Win64 ABI requires saving and restoring xmm6:xmm15. Currently
SAVE_XMM and RESTORE_XMM only allow for saving xmm6:xmm7. Allow
specifying the highest register used and whether the stack is unaligned.
Change-Id: Ica5699622ffe3346d3a486f48eef0206c51cf867
diff --git a/vp8/common/x86/idctllm_sse2.asm b/vp8/common/x86/idctllm_sse2.asm
index c873869..34a7e18 100644
--- a/vp8/common/x86/idctllm_sse2.asm
+++ b/vp8/common/x86/idctllm_sse2.asm
@@ -102,7 +102,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -443,7 +443,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
diff --git a/vp8/common/x86/iwalsh_sse2.asm b/vp8/common/x86/iwalsh_sse2.asm
index 83c97df..1da4fd8 100644
--- a/vp8/common/x86/iwalsh_sse2.asm
+++ b/vp8/common/x86/iwalsh_sse2.asm
@@ -17,7 +17,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 2
- SAVE_XMM
+ SAVE_XMM 6
push rsi
push rdi
; end prolog
@@ -41,7 +41,7 @@
movdqa xmm4, xmm0
punpcklqdq xmm0, xmm3 ;d1 a1
punpckhqdq xmm4, xmm3 ;c1 b1
- movd xmm7, eax
+ movd xmm6, eax
movdqa xmm1, xmm4 ;c1 b1
paddw xmm4, xmm0 ;dl+cl a1+b1 aka op[4] op[0]
@@ -66,7 +66,7 @@
pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
movdqa xmm3, xmm4 ;ip[4] ip[0]
- pshufd xmm7, xmm7, 0 ;03 03 03 03 03 03 03 03
+ pshufd xmm6, xmm6, 0 ;03 03 03 03 03 03 03 03
paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
@@ -90,8 +90,8 @@
punpcklwd xmm5, xmm0 ; 31 21 11 01 30 20 10 00
punpckhwd xmm1, xmm0 ; 33 23 13 03 32 22 12 02
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- paddw xmm5, xmm7
- paddw xmm1, xmm7
+ paddw xmm5, xmm6
+ paddw xmm1, xmm6
psraw xmm5, 3
psraw xmm1, 3
diff --git a/vp8/common/x86/loopfilter_sse2.asm b/vp8/common/x86/loopfilter_sse2.asm
index 849133d..c2ce1a1 100644
--- a/vp8/common/x86/loopfilter_sse2.asm
+++ b/vp8/common/x86/loopfilter_sse2.asm
@@ -288,7 +288,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -338,7 +338,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -584,7 +584,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -634,7 +634,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -1024,7 +1024,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -1091,7 +1091,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -1249,7 +1249,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -1318,7 +1318,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -1386,7 +1386,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -1503,7 +1503,7 @@
push rbp ; save old base pointer value.
mov rbp, rsp ; set new base pointer value.
SHADOW_ARGS_TO_STACK 6
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx ; save callee-saved reg
push rsi
push rdi
diff --git a/vp8/common/x86/postproc_sse2.asm b/vp8/common/x86/postproc_sse2.asm
index 30b4bf5..06d51ec 100644
--- a/vp8/common/x86/postproc_sse2.asm
+++ b/vp8/common/x86/postproc_sse2.asm
@@ -26,7 +26,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -256,7 +256,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -456,7 +456,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
diff --git a/vp8/common/x86/recon_sse2.asm b/vp8/common/x86/recon_sse2.asm
index 4ad3973..67b6420 100644
--- a/vp8/common/x86/recon_sse2.asm
+++ b/vp8/common/x86/recon_sse2.asm
@@ -67,7 +67,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
- SAVE_XMM
+ SAVE_XMM 7
push rsi
push rdi
; end prolog
diff --git a/vp8/common/x86/subpixel_sse2.asm b/vp8/common/x86/subpixel_sse2.asm
index b87cad2..83e3b14 100644
--- a/vp8/common/x86/subpixel_sse2.asm
+++ b/vp8/common/x86/subpixel_sse2.asm
@@ -37,7 +37,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -157,7 +157,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -333,7 +333,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 8
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -428,7 +428,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 8
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -538,7 +538,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -651,7 +651,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -816,7 +816,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -908,7 +908,6 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
- ;SAVE_XMM ;xmm6, xmm7 are not used here.
GET_GOT rbx
push rsi
push rdi
@@ -948,7 +947,6 @@
pop rdi
pop rsi
RESTORE_GOT
- ;RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -969,7 +967,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -1238,7 +1236,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
diff --git a/vp8/common/x86/subpixel_ssse3.asm b/vp8/common/x86/subpixel_ssse3.asm
index 1db3d62..1ddbc54 100644
--- a/vp8/common/x86/subpixel_ssse3.asm
+++ b/vp8/common/x86/subpixel_ssse3.asm
@@ -39,7 +39,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -182,7 +182,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -289,7 +289,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -418,7 +418,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -606,7 +606,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -819,7 +819,6 @@
pop rdi
pop rsi
RESTORE_GOT
- RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -886,7 +885,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -1149,7 +1148,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
diff --git a/vp8/encoder/x86/dct_sse2.asm b/vp8/encoder/x86/dct_sse2.asm
index 287ad48..3d52a5d 100644
--- a/vp8/encoder/x86/dct_sse2.asm
+++ b/vp8/encoder/x86/dct_sse2.asm
@@ -33,7 +33,7 @@
%define input rcx
%define output rdx
%define pitch r8
- SAVE_XMM
+ SAVE_XMM 7, u
%else
%define input rdi
%define output rsi
diff --git a/vp8/encoder/x86/encodeopt.asm b/vp8/encoder/x86/encodeopt.asm
index e142a75..9946294 100644
--- a/vp8/encoder/x86/encodeopt.asm
+++ b/vp8/encoder/x86/encodeopt.asm
@@ -208,7 +208,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 3
- SAVE_XMM ; 6
+ SAVE_XMM 6
push rsi
push rdi
; end prolog
diff --git a/vp8/encoder/x86/fwalsh_sse2.asm b/vp8/encoder/x86/fwalsh_sse2.asm
index 39439f0..71efd56 100644
--- a/vp8/encoder/x86/fwalsh_sse2.asm
+++ b/vp8/encoder/x86/fwalsh_sse2.asm
@@ -17,7 +17,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 3
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
diff --git a/vp8/encoder/x86/quantize_sse2.asm b/vp8/encoder/x86/quantize_sse2.asm
index 7b7ae70..056b64c 100644
--- a/vp8/encoder/x86/quantize_sse2.asm
+++ b/vp8/encoder/x86/quantize_sse2.asm
@@ -20,7 +20,7 @@
sym(vp8_regular_quantize_b_sse2):
push rbp
mov rbp, rsp
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
%if ABI_IS_32BIT
diff --git a/vp8/encoder/x86/sad_sse2.asm b/vp8/encoder/x86/sad_sse2.asm
index d9ac3ff..04ee72f 100644
--- a/vp8/encoder/x86/sad_sse2.asm
+++ b/vp8/encoder/x86/sad_sse2.asm
@@ -21,7 +21,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
- SAVE_XMM ; 6
+ SAVE_XMM 6
push rsi
push rdi
; end prolog
diff --git a/vp8/encoder/x86/sad_sse3.asm b/vp8/encoder/x86/sad_sse3.asm
index 6668792..2dbcc7d 100644
--- a/vp8/encoder/x86/sad_sse3.asm
+++ b/vp8/encoder/x86/sad_sse3.asm
@@ -33,15 +33,15 @@
movsxd rdx, dword ptr arg(3) ; ref_stride
%else
%ifidn __OUTPUT_FORMAT__,x64
+ SAVE_XMM 7, u
%define src_ptr rcx
%define src_stride rdx
%define ref_ptr r8
%define ref_stride r9
%define end_ptr r10
%define ret_var r11
- %define result_ptr [rsp+40+4*8]
- %define max_err [rsp+40+4*8]
- SAVE_XMM
+ %define result_ptr [rsp+xmm_stack_space+8+4*8]
+ %define max_err [rsp+xmm_stack_space+8+4*8]
%else
%define src_ptr rdi
%define src_stride rsi
@@ -108,6 +108,7 @@
xchg rbx, rax
%else
%ifidn __OUTPUT_FORMAT__,x64
+ SAVE_XMM 7, u
%define src_ptr rcx
%define src_stride rdx
%define r0_ptr rsi
@@ -115,8 +116,7 @@
%define r2_ptr r11
%define r3_ptr r8
%define ref_stride r9
- %define result_ptr [rsp+48+4*8]
- SAVE_XMM
+ %define result_ptr [rsp+xmm_stack_space+16+4*8]
push rsi
LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr
diff --git a/vp8/encoder/x86/sad_ssse3.asm b/vp8/encoder/x86/sad_ssse3.asm
index 7c7cd0a..6ecf081 100644
--- a/vp8/encoder/x86/sad_ssse3.asm
+++ b/vp8/encoder/x86/sad_ssse3.asm
@@ -157,7 +157,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
- SAVE_XMM
+ SAVE_XMM 7
push rsi
push rdi
push rcx
@@ -270,7 +270,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
- SAVE_XMM
+ SAVE_XMM 7
push rsi
push rdi
push rcx
diff --git a/vp8/encoder/x86/ssim_opt.asm b/vp8/encoder/x86/ssim_opt.asm
index d6cebf3..d5d267a 100644
--- a/vp8/encoder/x86/ssim_opt.asm
+++ b/vp8/encoder/x86/ssim_opt.asm
@@ -66,7 +66,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 9
- SAVE_XMM
+ SAVE_XMM 15
push rsi
push rdi
; end prolog
@@ -156,7 +156,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 9
- SAVE_XMM
+ SAVE_XMM 15
push rsi
push rdi
; end prolog
diff --git a/vp8/encoder/x86/subtract_sse2.asm b/vp8/encoder/x86/subtract_sse2.asm
index 3fb23d0..95888f6 100644
--- a/vp8/encoder/x86/subtract_sse2.asm
+++ b/vp8/encoder/x86/subtract_sse2.asm
@@ -77,7 +77,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
diff --git a/vp8/encoder/x86/temporal_filter_apply_sse2.asm b/vp8/encoder/x86/temporal_filter_apply_sse2.asm
index 0127b01..30674c8 100644
--- a/vp8/encoder/x86/temporal_filter_apply_sse2.asm
+++ b/vp8/encoder/x86/temporal_filter_apply_sse2.asm
@@ -26,7 +26,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 8
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
diff --git a/vp8/encoder/x86/variance_impl_sse2.asm b/vp8/encoder/x86/variance_impl_sse2.asm
index 2c0e170..5becc73 100644
--- a/vp8/encoder/x86/variance_impl_sse2.asm
+++ b/vp8/encoder/x86/variance_impl_sse2.asm
@@ -85,7 +85,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
- SAVE_XMM
+ SAVE_XMM 7
push rbx
push rsi
push rdi
@@ -225,7 +225,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -345,7 +345,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -534,7 +534,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 9
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -811,7 +811,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -933,7 +933,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -1049,7 +1049,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -1156,7 +1156,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -1264,7 +1264,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
@@ -1369,7 +1369,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
diff --git a/vp8/encoder/x86/variance_impl_ssse3.asm b/vp8/encoder/x86/variance_impl_ssse3.asm
index 3c0fef9..a582f8d 100644
--- a/vp8/encoder/x86/variance_impl_ssse3.asm
+++ b/vp8/encoder/x86/variance_impl_ssse3.asm
@@ -34,7 +34,7 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 9
- SAVE_XMM
+ SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
diff --git a/vpx_ports/x86_abi_support.asm b/vpx_ports/x86_abi_support.asm
index 60dff49..37a3205 100644
--- a/vpx_ports/x86_abi_support.asm
+++ b/vpx_ports/x86_abi_support.asm
@@ -255,21 +255,48 @@
%define UNSHADOW_ARGS mov rsp, rbp
%endif
-; must keep XMM6:XMM15 (libvpx uses XMM6 and XMM7) on Win64 ABI
-; rsp register has to be aligned
+; Win64 ABI requires that XMM6:XMM15 are callee saved
+; SAVE_XMM n, [u]
+; store registers 6-n on the stack
+; if u is specified, use unaligned movs.
+; Win64 ABI requires 16 byte stack alignment, but then pushes an 8 byte return
+; address. Typically we follow this up with 'push rbp' - re-aligning the stack -
+; but in some cases this is not done and unaligned movs must be used.
%ifidn __OUTPUT_FORMAT__,x64
-%macro SAVE_XMM 0
- sub rsp, 32
- movdqu XMMWORD PTR [rsp], xmm6
- movdqu XMMWORD PTR [rsp+16], xmm7
+%macro SAVE_XMM 1-2 a
+ %if %1 < 6
+ %error Only xmm registers 6-15 must be preserved
+ %else
+ %assign last_xmm %1
+ %define movxmm movdq %+ %2
+ %assign xmm_stack_space ((last_xmm - 5) * 16)
+ sub rsp, xmm_stack_space
+ %assign i 6
+ %rep (last_xmm - 5)
+ movxmm [rsp + ((i - 6) * 16)], xmm %+ i
+ %assign i i+1
+ %endrep
+ %endif
%endmacro
%macro RESTORE_XMM 0
- movdqu xmm6, XMMWORD PTR [rsp]
- movdqu xmm7, XMMWORD PTR [rsp+16]
- add rsp, 32
+ %ifndef last_xmm
+ %error RESTORE_XMM must be paired with SAVE_XMM n
+ %else
+ %assign i last_xmm
+ %rep (last_xmm - 5)
+ movxmm xmm %+ i, [rsp +((i - 6) * 16)]
+ %assign i i-1
+ %endrep
+ add rsp, xmm_stack_space
+ ; there are a couple functions which return from multiple places.
+ ; otherwise, we could uncomment these:
+ ; %undef last_xmm
+ ; %undef xmm_stack_space
+ ; %undef movxmm
+ %endif
%endmacro
%else
-%macro SAVE_XMM 0
+%macro SAVE_XMM 1-2
%endmacro
%macro RESTORE_XMM 0
%endmacro