;
; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
10
11%include "third_party/x86inc/x86inc.asm"
12
13SECTION .text
14
; HIGH_PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
;
; Adds the sum of absolute differences of two 4-sample rows of 16-bit data
; between src and each of the four references into m4..m7 (one register per
; reference, four dword partial sums each).
;   %1     first: 1 = initialise m4..m7 from these rows,
;                 otherwise accumulate into the existing m4..m7
;   %2, %3 offset of the first row in src / ref, in 16-bit samples
;   %4, %5 offset of the second row in src / ref, in 16-bit samples
;   %6     advance_at_end (default 0): 1 = add src_strideq*4 / ref_strideq*4
;          bytes to srcq / ref1q..ref4q after the rows are consumed
; Requires m1 to hold 0x0001 in every word (HIGH_SADNXN4D sets this up) so
; that pmaddwd adds adjacent word differences into dwords.
; Clobbers m0, m2 and m3.
%macro HIGH_PROCESS_4x2x4 5-6 0
  ; Each row is only 8 bytes wide, so both rows share one register:
  ; movh fills the low half (row one), movhps the high half (row two).
  movh                  m0, [srcq +(%2)*2]
%if %1 == 1
  movh                  m4, [ref1q+(%3)*2]
  movh                  m5, [ref2q+(%3)*2]
  movh                  m6, [ref3q+(%3)*2]
  movh                  m7, [ref4q+(%3)*2]
  movhps                m0, [srcq +(%4)*2]
  movhps                m4, [ref1q+(%5)*2]
  movhps                m5, [ref2q+(%5)*2]
  movhps                m6, [ref3q+(%5)*2]
  movhps                m7, [ref4q+(%5)*2]
  ; |src - ref| for unsigned words: one of the two saturating differences
  ; is zero, so OR-ing them yields the absolute difference.
  mova                  m3, m0
  mova                  m2, m0
  psubusw               m3, m4
  psubusw               m2, m5
  psubusw               m4, m0
  psubusw               m5, m0
  por                   m4, m3
  por                   m5, m2
  pmaddwd               m4, m1
  pmaddwd               m5, m1
  mova                  m3, m0
  mova                  m2, m0
  psubusw               m3, m6
  psubusw               m2, m7
  psubusw               m6, m0
  psubusw               m7, m0
  por                   m6, m3
  por                   m7, m2
  pmaddwd               m6, m1
  pmaddwd               m7, m1
%else
  ; ref1 -> m4
  movh                  m2, [ref1q+(%3)*2]
  movhps                m0, [srcq +(%4)*2]
  movhps                m2, [ref1q+(%5)*2]
  mova                  m3, m0
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m4, m2

  ; ref2 -> m5
  movh                  m2, [ref2q+(%3)*2]
  mova                  m3, m0
  movhps                m2, [ref2q+(%5)*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m5, m2

  ; ref3 -> m6
  movh                  m2, [ref3q+(%3)*2]
  mova                  m3, m0
  movhps                m2, [ref3q+(%5)*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m6, m2

  ; ref4 -> m7
  movh                  m2, [ref4q+(%3)*2]
  mova                  m3, m0
  movhps                m2, [ref4q+(%5)*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m7, m2
%endif
%if %6 == 1
  lea                   srcq, [srcq +src_strideq*4]
  lea                   ref1q, [ref1q+ref_strideq*4]
  lea                   ref2q, [ref2q+ref_strideq*4]
  lea                   ref3q, [ref3q+ref_strideq*4]
  lea                   ref4q, [ref4q+ref_strideq*4]
%endif
%endmacro
94
; HIGH_PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
;
; Adds the sum of absolute differences of two groups of eight 16-bit samples
; between src and each of the four references into m4..m7 (one register per
; reference, four dword partial sums each).
;   %1     first: 1 = initialise m4..m7 from the first group,
;                 otherwise accumulate into the existing m4..m7
;   %2, %3 offset of the first group in src / ref, in 16-bit samples
;   %4, %5 offset of the second group in src / ref, in 16-bit samples; this
;          is the next row when invoked for 8-wide blocks and the adjacent
;          eight samples of the same row when invoked by HIGH_PROCESS_16x2x4
;   %6     advance_at_end (default 0): 1 = add src_strideq*4 / ref_strideq*4
;          bytes to srcq / ref1q..ref4q
; src is read with mova, so every src address formed here must be 16-byte
; aligned; the references are read with movu and may be unaligned.
; Requires m1 to hold 0x0001 in every word (HIGH_SADNXN4D sets this up).
; Clobbers m0, m2 and m3.
%macro HIGH_PROCESS_8x2x4 5-6 0
  ; 1st 8 px
  mova                  m0, [srcq +(%2)*2]
%if %1 == 1
  movu                  m4, [ref1q+(%3)*2]
  movu                  m5, [ref2q+(%3)*2]
  movu                  m6, [ref3q+(%3)*2]
  movu                  m7, [ref4q+(%3)*2]
  ; |src - ref| for unsigned words: one of the two saturating differences
  ; is zero, so OR-ing them yields the absolute difference.
  mova                  m3, m0
  mova                  m2, m0
  psubusw               m3, m4
  psubusw               m2, m5
  psubusw               m4, m0
  psubusw               m5, m0
  por                   m4, m3
  por                   m5, m2
  pmaddwd               m4, m1
  pmaddwd               m5, m1
  mova                  m3, m0
  mova                  m2, m0
  psubusw               m3, m6
  psubusw               m2, m7
  psubusw               m6, m0
  psubusw               m7, m0
  por                   m6, m3
  por                   m7, m2
  pmaddwd               m6, m1
  pmaddwd               m7, m1
%else
  ; m3 is re-primed with src (mova m3, m0) ahead of each reference so the
  ; copy overlaps with the previous reference's pmaddwd/paddd.
  mova                  m3, m0
  movu                  m2, [ref1q+(%3)*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m4, m2
  movu                  m2, [ref2q+(%3)*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m5, m2
  movu                  m2, [ref3q+(%3)*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m6, m2
  movu                  m2, [ref4q+(%3)*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m7, m2
%endif

  ; 2nd 8 px
  mova                  m0, [srcq +(%4)*2]
  mova                  m3, m0
  movu                  m2, [ref1q+(%5)*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m4, m2
  movu                  m2, [ref2q+(%5)*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m5, m2
  movu                  m2, [ref3q+(%5)*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m6, m2
  movu                  m2, [ref4q+(%5)*2]
  psubusw               m3, m2
  psubusw               m2, m0
  ; All loads are done, so the pointers can move on while the last
  ; reference's arithmetic is still in flight.
%if %6 == 1
  lea                   srcq, [srcq +src_strideq*4]
  lea                   ref1q, [ref1q+ref_strideq*4]
  lea                   ref2q, [ref2q+ref_strideq*4]
  lea                   ref3q, [ref3q+ref_strideq*4]
  lea                   ref4q, [ref4q+ref_strideq*4]
%endif
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m7, m2
%endmacro
193
; HIGH_PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
;
; Two rows of 16 samples, built from two HIGH_PROCESS_8x2x4 expansions:
; the first covers both 8-sample halves of the row at %2/%3, the second
; both halves of the row at %4/%5. Only the second expansion receives
; advance_at_end (%6), so the pointers move once, after all loads.
%macro HIGH_PROCESS_16x2x4 5-6 0
  HIGH_PROCESS_8x2x4 %1, %2, %3, (%2 + 8), (%3 + 8)
  HIGH_PROCESS_8x2x4  0, %4, %5, (%4 + 8), (%5 + 8), %6
%endmacro
199
; HIGH_PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
;
; Two rows of 32 samples, built from two HIGH_PROCESS_16x2x4 expansions:
; the first covers both 16-sample halves of the row at %2/%3, the second
; both halves of the row at %4/%5. Only the second expansion receives
; advance_at_end (%6).
%macro HIGH_PROCESS_32x2x4 5-6 0
  HIGH_PROCESS_16x2x4 %1, %2, %3, (%2 + 16), (%3 + 16)
  HIGH_PROCESS_16x2x4  0, %4, %5, (%4 + 16), (%5 + 16), %6
%endmacro
205
; HIGH_PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
;
; Two rows of 64 samples, built from two HIGH_PROCESS_32x2x4 expansions:
; the first covers both 32-sample halves of the row at %2/%3, the second
; both halves of the row at %4/%5. Only the second expansion receives
; advance_at_end (%6).
%macro HIGH_PROCESS_64x2x4 5-6 0
  HIGH_PROCESS_32x2x4 %1, %2, %3, (%2 + 32), (%3 + 32)
  HIGH_PROCESS_32x2x4  0, %4, %5, (%4 + 32), (%5 + 32), %6
%endmacro
211
; void aom_highbd_sadNxNx4d_sse2(uint8_t *src, int src_stride,
;                                uint8_t *ref[4], int ref_stride,
;                                uint32_t res[4]);
; where NxN = 64x64, 64x32, 32x64, 32x32, 32x16, 16x32, 16x16, 16x8, 8x16,
;             8x8, 8x4, 4x8 or 4x4
;
; HIGH_SADNXN4D width, height
;   %1 block width in samples; selects HIGH_PROCESS_<width>x2x4
;   %2 block height in rows; consumed two rows per HIGH_PROCESS expansion
; Writes four dword sums to res[0..3], one per entry of ref[].
%macro HIGH_SADNXN4D 2
%if UNIX64
cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
                              res, ref2, ref3, ref4
%else
cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
                              ref2, ref3, ref4
%endif

  ; m1 = 0x0001 in every word: all-ones shifted right by 15 in each lane.
  ; Used as the pmaddwd multiplier in the HIGH_PROCESS macros.
  pcmpeqw               m1, m1
  psrlw                 m1, 15

  movsxdifnidn          src_strideq, src_strided
  movsxdifnidn          ref_strideq, ref_strided
  ; ref1q arrives holding the ref[] array; fetch ref[1..3] before ref[0]
  ; overwrites the array pointer.
  mov                   ref2q, [ref1q+gprsize*1]
  mov                   ref3q, [ref1q+gprsize*2]
  mov                   ref4q, [ref1q+gprsize*3]
  mov                   ref1q, [ref1q+gprsize*0]

; convert byte pointers to short pointers
  shl                   srcq, 1
  shl                   ref2q, 1
  shl                   ref3q, 1
  shl                   ref4q, 1
  shl                   ref1q, 1

  ; First pair of rows initialises m4..m7, the middle pairs accumulate and
  ; advance, the last pair accumulates without advancing the pointers.
  HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
%rep (%2-4)/2
  HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
%endrep
  HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
  ; N.B. HIGH_PROCESS outputs dwords (32 bits)
  ; so in high bit depth even the smallest width (4) needs 128bits i.e. XMM

  ; Fold each accumulator's four dwords down to two ...
  movhlps               m0, m4
  movhlps               m1, m5
  movhlps               m2, m6
  movhlps               m3, m7
  paddd                 m4, m0
  paddd                 m5, m1
  paddd                 m6, m2
  paddd                 m7, m3
  ; ... interleave the pairs (ref1 with ref2, ref3 with ref4) and fold once
  ; more, leaving one total per reference ...
  punpckldq             m4, m5
  punpckldq             m6, m7
  movhlps               m0, m4
  movhlps               m1, m6
  paddd                 m4, m0
  paddd                 m6, m1
  ; ... then pack the four totals in ref1..ref4 order and store them.
  punpcklqdq            m4, m6
  movifnidn             r4, r4mp
  movu                  [r4], m4
  RET
%endmacro
272
273
; Emit one SSE2 function per supported block size.
; Arguments to HIGH_SADNXN4D are width, height.
INIT_XMM sse2
; 64 wide
HIGH_SADNXN4D 64, 64
HIGH_SADNXN4D 64, 32
; 32 wide
HIGH_SADNXN4D 32, 64
HIGH_SADNXN4D 32, 32
HIGH_SADNXN4D 32, 16
; 16 wide
HIGH_SADNXN4D 16, 32
HIGH_SADNXN4D 16, 16
HIGH_SADNXN4D 16,  8
; 8 wide
HIGH_SADNXN4D  8, 16
HIGH_SADNXN4D  8,  8
HIGH_SADNXN4D  8,  4
; 4 wide
HIGH_SADNXN4D  4,  8
HIGH_SADNXN4D  4,  4