; blob: b4cc6abf1a09d5032c7e9968f4f203a5f84cd27c
;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

; SSE2 sum-of-absolute-differences (SAD) kernels. Everything below is written
; against the x86inc abstraction layer (cglobal, INIT_XMM, RET, mN registers).
%include "third_party/x86inc/x86inc.asm"

SECTION .text

; SAD_FN width, height, num_gprs, avg
;
; Shared prologue used by every SAD kernel macro in this file. It declares the
; function through x86inc's cglobal and names its arguments/registers.
;
;   %1  block width  (used here only to form the symbol name)
;   %2  block height (used here only to form the symbol name)
;   %3  general-purpose register budget, 5 or 7. The 7-register form also
;       names src_stride3/ref_stride3 and loads them with 3 * stride.
;   %4  0 declares sad<W>x<H>(src, src_stride, ref, ref_stride);
;       any other value declares sad<W>x<H>_avg, which takes second_pred as a
;       fifth argument.
;
; After expansion srcq, src_strideq, refq, ref_strideq and n_rowsd are usable,
; plus second_predq (avg) and src_stride3q/ref_stride3q (7-register form).
; n_rowsd is only named here; the invoking macro is expected to initialise it.
%macro SAD_FN 4
%if %4 == 0
%if %3 == 5
cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7
%else ; avg
%if %3 == 5
cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
                                    second_pred, n_rows
%else ; %3 == 7
cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 6, src, src_stride, \
                                              ref, ref_stride, \
                                              second_pred, \
                                              src_stride3, ref_stride3
; The seven names above use up the 7-register budget, so the row counter is
; mapped by hand: the extra register requested on x86-64, a stack slot on
; x86-32.
; NOTE(review): r0m looks like x86inc's memory home of argument 0 (src), which
; is already held in a register at this point - confirm against x86inc.
%if ARCH_X86_64
%define n_rowsd r7d
%else ; x86-32
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
%endif ; avg/sad
  ; x86inc helper; presumably sign-extends the int strides to pointer width
  ; when the d and q names are different registers.
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
%if %3 == 7
  lea           src_stride3q, [src_strideq*3]  ; offset of the 4th row
  lea           ref_stride3q, [ref_strideq*3]
%endif ; %3 == 7
%endmacro

%if CONFIG_EXT_PARTITION
; unsigned int aom_sad128xN_sse2(uint8_t *src, int src_stride,
;                                uint8_t *ref, int ref_stride);
; The _avg variants take an additional second_pred pointer (see SAD_FN).
;
; SAD128XN height [, avg]
;   %1  number of rows
;   %2  optional, default 0. When 1, each ref row is first averaged (pavgb)
;       with the next 128 bytes of second_pred before the SAD is taken.
;
; One 128-byte row is handled per iteration, as two 64-byte halves. m0 holds
; the running total as two partial sums (one per 64-bit half, as produced by
; psadbw); they are folded together after the loop and the low 32 bits are
; returned in eax.
;
; ref is loaded with movu, while src and second_pred are used as direct memory
; operands of psadbw/pavgb.
; NOTE(review): the non-VEX forms of those instructions require 16-byte
; aligned memory operands, so src/second_pred are presumably aligned by the
; caller - confirm.
%macro SAD128XN 1-2 0
  SAD_FN 128, %1, 5, %2
  mov           n_rowsd, %1
  pxor          m0, m0

.loop:
  ; bytes 0..63 of the row
  movu          m1, [refq]
  movu          m2, [refq+16]
  movu          m3, [refq+32]
  movu          m4, [refq+48]
%if %2 == 1
  pavgb         m1, [second_predq+mmsize*0]
  pavgb         m2, [second_predq+mmsize*1]
  pavgb         m3, [second_predq+mmsize*2]
  pavgb         m4, [second_predq+mmsize*3]
%endif
  psadbw        m1, [srcq]
  psadbw        m2, [srcq+16]
  psadbw        m3, [srcq+32]
  psadbw        m4, [srcq+48]

  paddd         m1, m2
  paddd         m3, m4
  paddd         m0, m1
  paddd         m0, m3

  ; bytes 64..127 of the row
  movu          m1, [refq+64]
  movu          m2, [refq+80]
  movu          m3, [refq+96]
  movu          m4, [refq+112]
%if %2 == 1
  pavgb         m1, [second_predq+mmsize*4]
  pavgb         m2, [second_predq+mmsize*5]
  pavgb         m3, [second_predq+mmsize*6]
  pavgb         m4, [second_predq+mmsize*7]
  lea           second_predq, [second_predq+mmsize*8]  ; next row of second_pred
%endif
  psadbw        m1, [srcq+64]
  psadbw        m2, [srcq+80]
  psadbw        m3, [srcq+96]
  psadbw        m4, [srcq+112]

  add           refq, ref_strideq
  add           srcq, src_strideq

  paddd         m1, m2
  paddd         m3, m4
  paddd         m0, m1
  paddd         m0, m3

  sub           n_rowsd, 1
  jg .loop

  ; fold the upper partial sum into the lower one and return it
  movhlps       m1, m0
  paddd         m0, m1
  movd          eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD128XN 128     ; sad128x128_sse2
SAD128XN 128, 1  ; sad128x128_avg_sse2
SAD128XN 64      ; sad128x64_sse2
SAD128XN 64, 1   ; sad128x64_avg_sse2
%endif


; unsigned int aom_sad64xN_sse2(uint8_t *src, int src_stride,
;                               uint8_t *ref, int ref_stride);
; The _avg variants take an additional second_pred pointer (see SAD_FN).
;
; SAD64XN height [, avg]
;   %1  number of rows
;   %2  optional, default 0. When 1, each ref row is first averaged (pavgb)
;       with the next 64 bytes of second_pred before the SAD is taken.
;
; One 64-byte row is handled per iteration. m0 accumulates two partial sums
; (one per 64-bit half, as produced by psadbw); they are folded after the loop
; and the low 32 bits are returned in eax.
;
; NOTE(review): src and second_pred are used as direct memory operands of
; psadbw/pavgb, which in their non-VEX form require 16-byte alignment -
; presumably guaranteed by the caller; confirm.
%macro SAD64XN 1-2 0
  SAD_FN 64, %1, 5, %2
  mov           n_rowsd, %1
  pxor          m0, m0
.loop:
  movu          m1, [refq]
  movu          m2, [refq+16]
  movu          m3, [refq+32]
  movu          m4, [refq+48]
%if %2 == 1
  pavgb         m1, [second_predq+mmsize*0]
  pavgb         m2, [second_predq+mmsize*1]
  pavgb         m3, [second_predq+mmsize*2]
  pavgb         m4, [second_predq+mmsize*3]
  lea           second_predq, [second_predq+mmsize*4]  ; next row of second_pred
%endif
  psadbw        m1, [srcq]
  psadbw        m2, [srcq+16]
  psadbw        m3, [srcq+32]
  psadbw        m4, [srcq+48]
  ; pointer bumps are interleaved with the accumulation
  paddd         m1, m2
  paddd         m3, m4
  add           refq, ref_strideq
  paddd         m0, m1
  add           srcq, src_strideq
  paddd         m0, m3
  dec           n_rowsd
  jg .loop

  ; fold the upper partial sum into the lower one and return it
  movhlps       m1, m0
  paddd         m0, m1
  movd          eax, m0
  RET
%endmacro

; Instantiate the 64-wide kernels for SSE2; the taller/shorter block sizes are
; only built when the corresponding experiment flag is enabled.
INIT_XMM sse2
%if CONFIG_EXT_PARTITION
SAD64XN 128     ; sad64x128_sse2
SAD64XN 128, 1  ; sad64x128_avg_sse2
%endif
SAD64XN 64      ; sad64x64_sse2
SAD64XN 32      ; sad64x32_sse2
SAD64XN 64, 1   ; sad64x64_avg_sse2
SAD64XN 32, 1   ; sad64x32_avg_sse2
%if CONFIG_EXT_PARTITION_TYPES
SAD64XN 16      ; sad64x16_sse2
SAD64XN 16, 1   ; sad64x16_avg_sse2
%endif

; unsigned int aom_sad32xN_sse2(uint8_t *src, int src_stride,
;                               uint8_t *ref, int ref_stride);
; The _avg variants take an additional second_pred pointer (see SAD_FN).
;
; SAD32XN height [, avg]
;   %1  number of rows; two rows are consumed per iteration, so the loop runs
;       height/2 times
;   %2  optional, default 0. When 1, the two ref rows are first averaged
;       (pavgb) with the next 64 bytes of second_pred, i.e. second_pred is
;       read as contiguous 32-byte rows.
;
; m0 accumulates two partial sums (one per 64-bit half, as produced by
; psadbw); they are folded after the loop and the low 32 bits are returned in
; eax.
;
; NOTE(review): src and second_pred are used as direct memory operands of
; psadbw/pavgb, which in their non-VEX form require 16-byte alignment -
; presumably guaranteed by the caller; confirm.
%macro SAD32XN 1-2 0
  SAD_FN 32, %1, 5, %2
  mov           n_rowsd, %1/2
  pxor          m0, m0
.loop:
  movu          m1, [refq]                  ; row 0, bytes  0..15
  movu          m2, [refq+16]               ; row 0, bytes 16..31
  movu          m3, [refq+ref_strideq]      ; row 1, bytes  0..15
  movu          m4, [refq+ref_strideq+16]   ; row 1, bytes 16..31
%if %2 == 1
  pavgb         m1, [second_predq+mmsize*0]
  pavgb         m2, [second_predq+mmsize*1]
  pavgb         m3, [second_predq+mmsize*2]
  pavgb         m4, [second_predq+mmsize*3]
  lea           second_predq, [second_predq+mmsize*4]
%endif
  psadbw        m1, [srcq]
  psadbw        m2, [srcq+16]
  psadbw        m3, [srcq+src_strideq]
  psadbw        m4, [srcq+src_strideq+16]
  paddd         m1, m2
  paddd         m3, m4
  lea           refq, [refq+ref_strideq*2]  ; advance two rows
  paddd         m0, m1
  lea           srcq, [srcq+src_strideq*2]
  paddd         m0, m3
  dec           n_rowsd
  jg .loop

  ; fold the upper partial sum into the lower one and return it
  movhlps       m1, m0
  paddd         m0, m1
  movd          eax, m0
  RET
%endmacro

; Instantiate the 32-wide kernels for SSE2; 32x8 is only built when
; CONFIG_EXT_PARTITION_TYPES is enabled.
INIT_XMM sse2
SAD32XN 64     ; sad32x64_sse2
SAD32XN 32     ; sad32x32_sse2
SAD32XN 16     ; sad32x16_sse2
SAD32XN 64, 1  ; sad32x64_avg_sse2
SAD32XN 32, 1  ; sad32x32_avg_sse2
SAD32XN 16, 1  ; sad32x16_avg_sse2
%if CONFIG_EXT_PARTITION_TYPES
SAD32XN 8      ; sad32x8_sse2
SAD32XN 8, 1   ; sad32x8_avg_sse2
%endif

; unsigned int aom_sad16xN_sse2(uint8_t *src, int src_stride,
;                               uint8_t *ref, int ref_stride);
; The _avg variants take an additional second_pred pointer (see SAD_FN).
;
; SAD16XN height [, avg]
;   %1  number of rows; four rows are consumed per iteration, so the loop runs
;       height/4 times
;   %2  optional, default 0. When 1, the four ref rows are first averaged
;       (pavgb) with the next 64 bytes of second_pred, i.e. second_pred is
;       read as contiguous 16-byte rows.
;
; Uses the 7-register form of SAD_FN so that 3 * stride is available for the
; fourth row. m0 accumulates two partial sums (one per 64-bit half, as
; produced by psadbw); they are folded after the loop and the low 32 bits are
; returned in eax.
;
; NOTE(review): src and second_pred are used as direct memory operands of
; psadbw/pavgb, which in their non-VEX form require 16-byte alignment -
; presumably guaranteed by the caller; confirm.
%macro SAD16XN 1-2 0
  SAD_FN 16, %1, 7, %2
  mov           n_rowsd, %1/4
  pxor          m0, m0

.loop:
  movu          m1, [refq]                  ; row 0
  movu          m2, [refq+ref_strideq]      ; row 1
  movu          m3, [refq+ref_strideq*2]    ; row 2
  movu          m4, [refq+ref_stride3q]     ; row 3
%if %2 == 1
  pavgb         m1, [second_predq+mmsize*0]
  pavgb         m2, [second_predq+mmsize*1]
  pavgb         m3, [second_predq+mmsize*2]
  pavgb         m4, [second_predq+mmsize*3]
  lea           second_predq, [second_predq+mmsize*4]
%endif
  psadbw        m1, [srcq]
  psadbw        m2, [srcq+src_strideq]
  psadbw        m3, [srcq+src_strideq*2]
  psadbw        m4, [srcq+src_stride3q]
  paddd         m1, m2
  paddd         m3, m4
  lea           refq, [refq+ref_strideq*4]  ; advance four rows
  paddd         m0, m1
  lea           srcq, [srcq+src_strideq*4]
  paddd         m0, m3
  dec           n_rowsd
  jg .loop

  ; fold the upper partial sum into the lower one and return it
  movhlps       m1, m0
  paddd         m0, m1
  movd          eax, m0
  RET
%endmacro

; Instantiate the 16-wide kernels for SSE2; 16x4 and 16x64 are only built when
; CONFIG_EXT_PARTITION_TYPES is enabled.
INIT_XMM sse2
SAD16XN 32     ; sad16x32_sse2
SAD16XN 16     ; sad16x16_sse2
SAD16XN 8      ; sad16x8_sse2
SAD16XN 32, 1  ; sad16x32_avg_sse2
SAD16XN 16, 1  ; sad16x16_avg_sse2
SAD16XN 8, 1   ; sad16x8_avg_sse2
%if CONFIG_EXT_PARTITION_TYPES
SAD16XN 4      ; sad16x4_sse2
SAD16XN 4, 1   ; sad16x4_avg_sse2
SAD16XN 64     ; sad16x64_sse2
SAD16XN 64, 1  ; sad16x64_avg_sse2
%endif

; unsigned int aom_sad8xN_sse2(uint8_t *src, int src_stride,
;                              uint8_t *ref, int ref_stride);
; The _avg variants take an additional second_pred pointer (see SAD_FN).
;
; SAD8XN height [, avg]
;   %1  number of rows; four rows are consumed per iteration, so the loop runs
;       height/4 times
;   %2  optional, default 0. When 1, the packed ref rows are first averaged
;       (pavgb) with the next 32 bytes of second_pred, i.e. second_pred is
;       read as contiguous 8-byte rows.
;
; Two 8-byte rows are packed into each xmm register (movh fills the low half,
; movhps the high half), for both ref and src. m0 accumulates two partial sums
; (one per 64-bit half, as produced by psadbw); they are folded after the loop
; and the low 32 bits are returned in eax.
;
; NOTE(review): second_pred is used as a direct memory operand of pavgb, which
; in its non-VEX form requires 16-byte alignment - presumably guaranteed by
; the caller; confirm.
%macro SAD8XN 1-2 0
  SAD_FN 8, %1, 7, %2
  mov           n_rowsd, %1/4
  pxor          m0, m0

.loop:
  movh          m1, [refq]                  ; m1 = ref rows 0 | 1
  movhps        m1, [refq+ref_strideq]
  movh          m2, [refq+ref_strideq*2]    ; m2 = ref rows 2 | 3
  movhps        m2, [refq+ref_stride3q]
%if %2 == 1
  pavgb         m1, [second_predq+mmsize*0]
  pavgb         m2, [second_predq+mmsize*1]
  lea           second_predq, [second_predq+mmsize*2]
%endif
  movh          m3, [srcq]                  ; m3 = src rows 0 | 1
  movhps        m3, [srcq+src_strideq]
  movh          m4, [srcq+src_strideq*2]    ; m4 = src rows 2 | 3
  movhps        m4, [srcq+src_stride3q]
  psadbw        m1, m3
  psadbw        m2, m4
  lea           refq, [refq+ref_strideq*4]  ; advance four rows
  paddd         m0, m1
  lea           srcq, [srcq+src_strideq*4]
  paddd         m0, m2
  dec           n_rowsd
  jg .loop

  ; fold the upper partial sum into the lower one and return it
  movhlps       m1, m0
  paddd         m0, m1
  movd          eax, m0
  RET
%endmacro

; Instantiate the 8-wide kernels for SSE2; 8x32 is only built when
; CONFIG_EXT_PARTITION_TYPES is enabled.
INIT_XMM sse2
SAD8XN 16     ; sad8x16_sse2
SAD8XN 8      ; sad8x8_sse2
SAD8XN 4      ; sad8x4_sse2
SAD8XN 16, 1  ; sad8x16_avg_sse2
SAD8XN 8, 1   ; sad8x8_avg_sse2
SAD8XN 4, 1   ; sad8x4_avg_sse2
%if CONFIG_EXT_PARTITION_TYPES
SAD8XN 32     ; sad8x32_sse2
SAD8XN 32, 1  ; sad8x32_avg_sse2
%endif

; unsigned int aom_sad4xN_sse2(uint8_t *src, int src_stride,
;                              uint8_t *ref, int ref_stride);
; The _avg variants take an additional second_pred pointer (see SAD_FN).
;
; SAD4XN height [, avg]
;   %1  number of rows; four rows are consumed per iteration, so the loop runs
;       height/4 times
;   %2  optional, default 0. When 1, the packed ref rows are first averaged
;       (pavgb) with the next 16 bytes of second_pred, i.e. second_pred is
;       read as contiguous 4-byte rows.
;
; Four 4-byte rows are gathered into a single xmm register: punpckldq pairs
; rows 0/1 and 2/3, then movlhps joins the two pairs. m0 accumulates two
; partial sums (one per 64-bit half, as produced by psadbw); they are folded
; after the loop and the low 32 bits are returned in eax.
;
; NOTE(review): second_pred is used as a direct memory operand of pavgb, which
; in its non-VEX form requires 16-byte alignment - presumably guaranteed by
; the caller; confirm.
%macro SAD4XN 1-2 0
  SAD_FN 4, %1, 7, %2
  mov           n_rowsd, %1/4
  pxor          m0, m0

.loop:
  movd          m1, [refq]
  movd          m2, [refq+ref_strideq]
  movd          m3, [refq+ref_strideq*2]
  movd          m4, [refq+ref_stride3q]
  punpckldq     m1, m2                      ; m1 = ref rows 0 | 1
  punpckldq     m3, m4                      ; m3 = ref rows 2 | 3
  movlhps       m1, m3                      ; m1 = ref rows 0 | 1 | 2 | 3
%if %2 == 1
  pavgb         m1, [second_predq+mmsize*0]
  lea           second_predq, [second_predq+mmsize*1]
%endif
  movd          m2, [srcq]
  movd          m5, [srcq+src_strideq]
  movd          m4, [srcq+src_strideq*2]
  movd          m3, [srcq+src_stride3q]
  punpckldq     m2, m5                      ; m2 = src rows 0 | 1
  punpckldq     m4, m3                      ; m4 = src rows 2 | 3
  movlhps       m2, m4                      ; m2 = src rows 0 | 1 | 2 | 3
  psadbw        m1, m2
  lea           refq, [refq+ref_strideq*4]  ; advance four rows
  paddd         m0, m1
  lea           srcq, [srcq+src_strideq*4]
  dec           n_rowsd
  jg .loop

  ; fold the upper partial sum into the lower one and return it
  movhlps       m1, m0
  paddd         m0, m1
  movd          eax, m0
  RET
%endmacro

; Instantiate the 4-wide kernels for SSE2; 4x16 is only built when
; CONFIG_EXT_PARTITION_TYPES is enabled.
INIT_XMM sse2
SAD4XN 8      ; sad4x8_sse2
SAD4XN 4      ; sad4x4_sse2
SAD4XN 8, 1   ; sad4x8_avg_sse2
SAD4XN 4, 1   ; sad4x4_avg_sse2
%if CONFIG_EXT_PARTITION_TYPES
SAD4XN 16     ; sad4x16_sse2
SAD4XN 16, 1  ; sad4x16_avg_sse2
%endif