blob: 6d58321e035170dd65e42e23bad0f5be3e9a1656 [file] [log] [blame]
Jim Bankoski3f6f7282011-03-08 09:05:18 -05001;
2; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4; Use of this source code is governed by a BSD-style license
5; that can be found in the LICENSE file in the root of the source
6; tree. An additional intellectual property rights grant can be found
7; in the file PATENTS. All contributing project authors may
8; be found in the AUTHORS file in the root of the source tree.
9;
10
11%include "vpx_ports/x86_abi_support.asm"
12
; TABULATE_SSIM - fold one batch of 16-bit samples into the five running sums.
;   In:       xmm3 = source samples (s), one per 16-bit lane
;             xmm4 = reference samples (r), one per 16-bit lane
;   Updates:  xmm15 += s        (per-word, unsigned saturating)   sum_s
;             xmm14 += r        (per-word, unsigned saturating)   sum_r
;             xmm13 += s*s      (pmaddwd pair sums, dword lanes)  sum_sq_s
;             xmm12 += r*r      (pmaddwd pair sums, dword lanes)  sum_sq_r
;             xmm11 += s*r      (pmaddwd pair sums, dword lanes)  sum_sxr
;   Clobbers: xmm1, xmm2, xmm3 (xmm4 is left untouched)
%macro TABULATE_SSIM 0
    ; working copies, so the squares do not destroy s before s*r is formed
    movdqa      xmm1, xmm3
    movdqa      xmm2, xmm4

    ; plain sums
    paddusw     xmm15, xmm3             ; sum_s
    paddusw     xmm14, xmm4             ; sum_r

    ; products: each dword lane receives the sum of two adjacent word products
    pmaddwd     xmm1, xmm1              ; s * s
    pmaddwd     xmm2, xmm2              ; r * r
    pmaddwd     xmm3, xmm4              ; s * r

    ; accumulate products
    paddd       xmm13, xmm1             ; sum_sq_s
    paddd       xmm12, xmm2             ; sum_sq_r
    paddd       xmm11, xmm3             ; sum_sxr
%endmacro
26
; SUM_ACROSS_Q - horizontally add the four 32-bit lanes of register %1.
; The dwords are interleaved with xmm0 to widen them to qwords, the two
; halves are added, and then the two resulting qwords are added together.
; The total ends up in the low qword of %1.
;   Requires: xmm0 is all zeros (the routines in this file clear it with
;             pxor before using this macro), so the interleave acts as a
;             zero-extension.
;   Clobbers: xmm2
%macro SUM_ACROSS_Q 1
    ; stage 1: four dwords -> two qword partial sums
    movdqa      xmm2, %1
    punpckhdq   xmm2, xmm0              ; xmm2 = upper two dwords as qwords
    punpckldq   %1, xmm0                ; %1   = lower two dwords as qwords
    paddq       %1, xmm2

    ; stage 2: two qwords -> one qword total
    movdqa      xmm2, %1
    punpckhqdq  xmm2, xmm0              ; xmm2 = high qword moved to the low half
    punpcklqdq  %1, xmm0                ; %1   = low qword, upper half from xmm0
    paddq       %1, xmm2
%endmacro
38
; SUM_ACROSS_W - horizontally add the eight 16-bit lanes of register %1.
; The words are interleaved with xmm0 to widen them to dwords, the two
; halves are added, and SUM_ACROSS_Q then reduces the four dwords.
; The total ends up in the low qword of %1.
;   Requires: xmm0 is all zeros (the routines in this file clear it with
;             pxor before using this macro), so the interleave acts as a
;             zero-extension.
;   Clobbers: xmm1, plus xmm2 via SUM_ACROSS_Q
%macro SUM_ACROSS_W 1
    movdqa      xmm1, %1
    punpckhwd   xmm1, xmm0              ; xmm1 = upper four words as dwords
    punpcklwd   %1, xmm0                ; %1   = lower four words as dwords
    paddd       %1, xmm1                ; four dword partial sums
    SUM_ACROSS_Q %1
%endmacro
;void vpx_ssim_parms_16x16_sse2(
;    unsigned char *s,
;    int sp,
;    unsigned char *r,
;    int rp,
;    uint32_t *sum_s,
;    uint32_t *sum_r,
;    uint32_t *sum_sq_s,
;    uint32_t *sum_sq_r,
;    uint32_t *sum_sxr);
;
; Walks 16 rows of 16 bytes from s (stride sp) and r (stride rp) and stores
; five 32-bit totals through the output pointers:
;   *sum_s    = sum of s[i]          *sum_r    = sum of r[i]
;   *sum_sq_s = sum of s[i]*s[i]     *sum_sq_r = sum of r[i]*r[i]
;   *sum_sxr  = sum of s[i]*r[i]
;
; Register roles inside the loop:
;   rsi = current s row        rcx = sp (sign-extended)
;   rdi = current r row        rax = rp (sign-extended)
;   rdx = rows remaining       xmm0 = zero (used for byte->word unpacking)
;   xmm15..xmm11 = sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr accumulators
;
; The per-word sums cannot saturate: each word lane receives 2 bytes per
; row over 16 rows, i.e. at most 32 * 255 = 8160.
;
; TODO: Use parm passing through structure, probably don't need the pxors
; ( calling app will initialize to 0 ) could easily fit everything in sse2
; without too much hassle, and can probably do better estimates with psadbw
; or pavgb At this point this is just meant to be first pass for calculating
; all the parms needed for 16x16 ssim so we can play with dssim as distortion
; in mode selection code.
global sym(vpx_ssim_parms_16x16_sse2) PRIVATE
sym(vpx_ssim_parms_16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM 15
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)          ;s
    ; sp and rp are C ints: only the low 32 bits of their argument slots are
    ; defined, so load the dword and sign-extend rather than reading 64 bits.
    movsxd      rcx,        dword arg(1)    ;sp
    mov         rdi,        arg(2)          ;r
    movsxd      rax,        dword arg(3)    ;rp

    pxor        xmm0,  xmm0                 ;zero, for unpacking
    pxor        xmm15, xmm15                ;sum_s
    pxor        xmm14, xmm14                ;sum_r
    pxor        xmm13, xmm13                ;sum_sq_s
    pxor        xmm12, xmm12                ;sum_sq_r
    pxor        xmm11, xmm11                ;sum_sxr

    mov         rdx, 16                     ;row counter
.NextRow:

    ;grab source and reference pixels (rows may be unaligned)
    movdqu      xmm5, [rsi]
    movdqu      xmm6, [rdi]

    ;upper 8 pixels of the row, widened to words
    movdqa      xmm3, xmm5
    movdqa      xmm4, xmm6
    punpckhbw   xmm3, xmm0                  ; high_s
    punpckhbw   xmm4, xmm0                  ; high_r

    TABULATE_SSIM

    ;lower 8 pixels of the row, widened to words
    movdqa      xmm3, xmm5
    movdqa      xmm4, xmm6
    punpcklbw   xmm3, xmm0                  ; low_s
    punpcklbw   xmm4, xmm0                  ; low_r

    TABULATE_SSIM

    add         rsi, rcx                    ; next s row
    add         rdi, rax                    ; next r row

    dec         rdx                         ; counter
    jnz         .NextRow

    ;reduce each accumulator to a single total in its low lane
    SUM_ACROSS_W xmm15
    SUM_ACROSS_W xmm14
    SUM_ACROSS_Q xmm13
    SUM_ACROSS_Q xmm12
    SUM_ACROSS_Q xmm11

    ;store the low 32 bits of each total through the output pointers
    mov         rdi, arg(4)
    movd        [rdi], xmm15                ; *sum_s
    mov         rdi, arg(5)
    movd        [rdi], xmm14                ; *sum_r
    mov         rdi, arg(6)
    movd        [rdi], xmm13                ; *sum_sq_s
    mov         rdi, arg(7)
    movd        [rdi], xmm12                ; *sum_sq_r
    mov         rdi, arg(8)
    movd        [rdi], xmm11                ; *sum_sxr

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
136
;void vpx_ssim_parms_8x8_sse2(
;    unsigned char *s,
;    int sp,
;    unsigned char *r,
;    int rp,
;    uint32_t *sum_s,
;    uint32_t *sum_r,
;    uint32_t *sum_sq_s,
;    uint32_t *sum_sq_r,
;    uint32_t *sum_sxr);
;
; Walks 8 rows of 8 bytes from s (stride sp) and r (stride rp) and stores
; five 32-bit totals through the output pointers:
;   *sum_s    = sum of s[i]          *sum_r    = sum of r[i]
;   *sum_sq_s = sum of s[i]*s[i]     *sum_sq_r = sum of r[i]*r[i]
;   *sum_sxr  = sum of s[i]*r[i]
;
; Register roles inside the loop:
;   rsi = current s row        rcx = sp (sign-extended)
;   rdi = current r row        rax = rp (sign-extended)
;   rdx = rows remaining       xmm0 = zero (used for byte->word unpacking)
;   xmm15..xmm11 = sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr accumulators
;
; The per-word sums cannot saturate: each word lane receives 1 byte per
; row over 8 rows, i.e. at most 8 * 255 = 2040.
;
; TODO: Use parm passing through structure, probably don't need the pxors
; ( calling app will initialize to 0 ) could easily fit everything in sse2
; without too much hassle, and can probably do better estimates with psadbw
; or pavgb At this point this is just meant to be first pass for calculating
; all the parms needed for 8x8 ssim so we can play with dssim as distortion
; in mode selection code.
global sym(vpx_ssim_parms_8x8_sse2) PRIVATE
sym(vpx_ssim_parms_8x8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM 15
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)          ;s
    ; sp and rp are C ints: only the low 32 bits of their argument slots are
    ; defined, so load the dword and sign-extend rather than reading 64 bits.
    movsxd      rcx,        dword arg(1)    ;sp
    mov         rdi,        arg(2)          ;r
    movsxd      rax,        dword arg(3)    ;rp

    pxor        xmm0,  xmm0                 ;zero, for unpacking
    pxor        xmm15, xmm15                ;sum_s
    pxor        xmm14, xmm14                ;sum_r
    pxor        xmm13, xmm13                ;sum_sq_s
    pxor        xmm12, xmm12                ;sum_sq_r
    pxor        xmm11, xmm11                ;sum_sxr

    mov         rdx, 8                      ;row counter
.NextRow:

    ;grab 8 source and 8 reference pixels and widen them to words
    movq        xmm3, [rsi]
    movq        xmm4, [rdi]
    punpcklbw   xmm3, xmm0                  ; low_s
    punpcklbw   xmm4, xmm0                  ; low_r

    TABULATE_SSIM

    add         rsi, rcx                    ; next s row
    add         rdi, rax                    ; next r row

    dec         rdx                         ; counter
    jnz         .NextRow

    ;reduce each accumulator to a single total in its low lane
    SUM_ACROSS_W xmm15
    SUM_ACROSS_W xmm14
    SUM_ACROSS_Q xmm13
    SUM_ACROSS_Q xmm12
    SUM_ACROSS_Q xmm11

    ;store the low 32 bits of each total through the output pointers
    mov         rdi, arg(4)
    movd        [rdi], xmm15                ; *sum_s
    mov         rdi, arg(5)
    movd        [rdi], xmm14                ; *sum_r
    mov         rdi, arg(6)
    movd        [rdi], xmm13                ; *sum_sq_s
    mov         rdi, arg(7)
    movd        [rdi], xmm12                ; *sum_sq_r
    mov         rdi, arg(8)
    movd        [rdi], xmm11                ; *sum_sxr

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret