blob: aa70106c84589d75930ca79f1e9c278a761a6e44 [file] [log] [blame]
Yaowu Xuc27fc142016-08-22 16:08:15 -07001;
Yaowu Xu9c01aa12016-09-01 14:32:49 -07002; Copyright (c) 2016, Alliance for Open Media. All rights reserved
Yaowu Xuc27fc142016-08-22 16:08:15 -07003;
Yaowu Xu9c01aa12016-09-01 14:32:49 -07004; This source code is subject to the terms of the BSD 2 Clause License and
5; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6; was not distributed with this source code in the LICENSE file, you can
7; obtain it at www.aomedia.org/license/software. If the Alliance for Open
8; Media Patent License 1.0 was not distributed with this source code in the
9; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10;
11
Yaowu Xuc27fc142016-08-22 16:08:15 -070012;
13
14%include "aom_ports/x86_abi_support.asm"
15
; TABULATE_SSIM - fold one group of eight 16-bit samples into the accumulators.
;   in:       xmm3 = eight source words (s), xmm4 = eight reference words (r)
;   updates:  xmm15 += s     per word, unsigned saturating   (sum_s)
;             xmm14 += r     per word, unsigned saturating   (sum_r)
;             xmm13 += s*s   pmaddwd pair sums, per dword    (sum_sq_s)
;             xmm12 += r*r   pmaddwd pair sums, per dword    (sum_sq_r)
;             xmm11 += s*r   pmaddwd pair sums, per dword    (sum_sxr)
;   clobbers: xmm1, xmm2, xmm3 (xmm3 leaves holding the s*r pair sums)
%macro TABULATE_SSIM 0
        ; Take working copies first: xmm3 is overwritten by the cross product.
        movdqa          xmm1,  xmm3
        movdqa          xmm2,  xmm4

        ; Plain sums.
        paddusw         xmm15, xmm3             ; sum_s
        paddusw         xmm14, xmm4             ; sum_r

        ; Products: each pmaddwd adds adjacent word products into one dword.
        pmaddwd         xmm1,  xmm1             ; s * s
        pmaddwd         xmm2,  xmm2             ; r * r
        pmaddwd         xmm3,  xmm4             ; s * r

        paddd           xmm13, xmm1             ; sum_sq_s
        paddd           xmm12, xmm2             ; sum_sq_r
        paddd           xmm11, xmm3             ; sum_sxr
%endmacro
29
; SUM_ACROSS_Q reg - horizontal sum of the four dword lanes of %1.
; The dwords are zero-extended to qwords (interleaved with xmm0) and added,
; so the total ends up in the low qword of %1 and the high qword is zero.
;   requires: xmm0 == 0; %1 must not be xmm0 or xmm2
;   clobbers: xmm2
%macro SUM_ACROSS_Q 1
        ; 4 x dword -> 2 x qword partial sums
        movdqa          xmm2, %1
        punpckhdq       xmm2, xmm0              ; upper two dwords as qwords
        punpckldq       %1,   xmm0              ; lower two dwords as qwords
        paddq           %1,   xmm2

        ; 2 x qword -> single qword total
        movdqa          xmm2, %1
        punpckhqdq      xmm2, xmm0              ; high qword moved to the low lane
        punpcklqdq      %1,   xmm0              ; keep low qword, zero the high lane
        paddq           %1,   xmm2
%endmacro
41
; SUM_ACROSS_W reg - horizontal sum of the eight word lanes of %1.
; The words are zero-extended to dwords (interleaved with xmm0), added
; pairwise, and the four resulting dwords are reduced by SUM_ACROSS_Q.
;   requires: xmm0 == 0; %1 must not be xmm0, xmm1 or xmm2
;   clobbers: xmm1, xmm2
%macro SUM_ACROSS_W 1
        movdqa          xmm1, %1
        punpckhwd       xmm1, xmm0              ; upper four words as dwords
        punpcklwd       %1,   xmm0              ; lower four words as dwords
        paddd           %1,   xmm1
        SUM_ACROSS_Q    %1
%endmacro
;void aom_ssim_parms_16x16_sse2(
;    unsigned char *s,
;    int sp,
;    unsigned char *r,
;    int rp,
;    uint32_t *sum_s,
;    uint32_t *sum_r,
;    uint32_t *sum_sq_s,
;    uint32_t *sum_sq_r,
;    uint32_t *sum_sxr);
;
; Reads 16 rows of 16 bytes from s and from r, stepping sp / rp bytes per row
; (unaligned loads), and stores five 32-bit totals through the output
; pointers: sum of s, sum of r, sum of s*s, sum of r*r and sum of s*r.
;
; Register roles inside the loop:
;   rsi / rdi   current row of s / r
;   rcx / rax   sign-extended strides sp / rp
;   rdx         rows remaining
;   xmm0        constant zero, used for byte->word unpacking and by SUM_ACROSS_*
;   xmm11-15    accumulators, see TABULATE_SSIM
;
; TODO: Use parm passing through structure, probably don't need the pxors
; ( calling app will initialize to 0 ) could easily fit everything in sse2
; without too much hassle, and can probably do better estimates with psadbw
; or pavgb. At this point this is just meant to be first pass for calculating
; all the parms needed for 16x16 ssim so we can play with dssim as distortion
; in mode selection code.
global sym(aom_ssim_parms_16x16_sse2) PRIVATE
sym(aom_ssim_parms_16x16_sse2):
        push            rbp
        mov             rbp, rsp
        SHADOW_ARGS_TO_STACK 9
        SAVE_XMM 15
        push            rsi
        push            rdi
        ; end prolog

        mov             rsi, arg(0)             ; s
        ; sp and rp are C ints: only the low 32 bits of their argument slots
        ; are defined by the calling convention, so sign-extend them instead
        ; of loading the whole qword.
        movsxd          rcx, dword arg(1)       ; sp
        mov             rdi, arg(2)             ; r
        movsxd          rax, dword arg(3)       ; rp

        pxor            xmm0,  xmm0             ; zero constant
        pxor            xmm15, xmm15            ; sum_s
        pxor            xmm14, xmm14            ; sum_r
        pxor            xmm13, xmm13            ; sum_sq_s
        pxor            xmm12, xmm12            ; sum_sq_r
        pxor            xmm11, xmm11            ; sum_sxr

        mov             rdx, 16                 ; row counter
.NextRow:

        ; grab one row of source and reference pixels
        movdqu          xmm5, [rsi]
        movdqu          xmm6, [rdi]

        ; upper eight pixels, widened to words
        movdqa          xmm3, xmm5
        movdqa          xmm4, xmm6
        punpckhbw       xmm3, xmm0              ; high_s
        punpckhbw       xmm4, xmm0              ; high_r

        TABULATE_SSIM

        ; lower eight pixels, widened to words
        movdqa          xmm3, xmm5
        movdqa          xmm4, xmm6
        punpcklbw       xmm3, xmm0              ; low_s
        punpcklbw       xmm4, xmm0              ; low_r

        TABULATE_SSIM

        add             rsi, rcx                ; next s row
        add             rdi, rax                ; next r row

        dec             rdx                     ; counter
        jnz             .NextRow

        ; reduce every accumulator to a single value in its low lane
        SUM_ACROSS_W    xmm15
        SUM_ACROSS_W    xmm14
        SUM_ACROSS_Q    xmm13
        SUM_ACROSS_Q    xmm12
        SUM_ACROSS_Q    xmm11

        ; store the low 32 bits of each total
        mov             rdi, arg(4)
        movd            [rdi], xmm15            ; *sum_s
        mov             rdi, arg(5)
        movd            [rdi], xmm14            ; *sum_r
        mov             rdi, arg(6)
        movd            [rdi], xmm13            ; *sum_sq_s
        mov             rdi, arg(7)
        movd            [rdi], xmm12            ; *sum_sq_r
        mov             rdi, arg(8)
        movd            [rdi], xmm11            ; *sum_sxr

        ; begin epilog
        pop             rdi
        pop             rsi
        RESTORE_XMM
        UNSHADOW_ARGS
        pop             rbp
        ret
139
;void aom_ssim_parms_8x8_sse2(
;    unsigned char *s,
;    int sp,
;    unsigned char *r,
;    int rp,
;    uint32_t *sum_s,
;    uint32_t *sum_r,
;    uint32_t *sum_sq_s,
;    uint32_t *sum_sq_r,
;    uint32_t *sum_sxr);
;
; Reads 8 rows of 8 bytes from s and from r, stepping sp / rp bytes per row,
; and stores five 32-bit totals through the output pointers: sum of s,
; sum of r, sum of s*s, sum of r*r and sum of s*r.
;
; Register roles inside the loop:
;   rsi / rdi   current row of s / r
;   rcx / rax   sign-extended strides sp / rp
;   rdx         rows remaining
;   xmm0        constant zero, used for byte->word unpacking and by SUM_ACROSS_*
;   xmm11-15    accumulators, see TABULATE_SSIM
;
; TODO: Use parm passing through structure, probably don't need the pxors
; ( calling app will initialize to 0 ) could easily fit everything in sse2
; without too much hassle, and can probably do better estimates with psadbw
; or pavgb. At this point this is just meant to be first pass for calculating
; all the parms needed for 8x8 ssim so we can play with dssim as distortion
; in mode selection code.
global sym(aom_ssim_parms_8x8_sse2) PRIVATE
sym(aom_ssim_parms_8x8_sse2):
        push            rbp
        mov             rbp, rsp
        SHADOW_ARGS_TO_STACK 9
        SAVE_XMM 15
        push            rsi
        push            rdi
        ; end prolog

        mov             rsi, arg(0)             ; s
        ; sp and rp are C ints: only the low 32 bits of their argument slots
        ; are defined by the calling convention, so sign-extend them instead
        ; of loading the whole qword.
        movsxd          rcx, dword arg(1)       ; sp
        mov             rdi, arg(2)             ; r
        movsxd          rax, dword arg(3)       ; rp

        pxor            xmm0,  xmm0             ; zero constant
        pxor            xmm15, xmm15            ; sum_s
        pxor            xmm14, xmm14            ; sum_r
        pxor            xmm13, xmm13            ; sum_sq_s
        pxor            xmm12, xmm12            ; sum_sq_r
        pxor            xmm11, xmm11            ; sum_sxr

        mov             rdx, 8                  ; row counter
.NextRow:

        ; grab one row (8 bytes) of source and reference pixels
        movq            xmm3, [rsi]
        movq            xmm4, [rdi]
        punpcklbw       xmm3, xmm0              ; low_s, widened to words
        punpcklbw       xmm4, xmm0              ; low_r, widened to words

        TABULATE_SSIM

        add             rsi, rcx                ; next s row
        add             rdi, rax                ; next r row

        dec             rdx                     ; counter
        jnz             .NextRow

        ; reduce every accumulator to a single value in its low lane
        SUM_ACROSS_W    xmm15
        SUM_ACROSS_W    xmm14
        SUM_ACROSS_Q    xmm13
        SUM_ACROSS_Q    xmm12
        SUM_ACROSS_Q    xmm11

        ; store the low 32 bits of each total
        mov             rdi, arg(4)
        movd            [rdi], xmm15            ; *sum_s
        mov             rdi, arg(5)
        movd            [rdi], xmm14            ; *sum_r
        mov             rdi, arg(6)
        movd            [rdi], xmm13            ; *sum_sq_s
        mov             rdi, arg(7)
        movd            [rdi], xmm12            ; *sum_sq_r
        mov             rdi, arg(8)
        movd            [rdi], xmm11            ; *sum_sxr

        ; begin epilog
        pop             rdi
        pop             rsi
        RESTORE_XMM
        UNSHADOW_ARGS
        pop             rbp
        ret
219 ret