;
; Copyright (c) 2021, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 3-Clause Clear License and the
; Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear License was
; not distributed with this source code in the LICENSE file, you can obtain it
; at aomedia.org/license/software-license/bsd-3-c-c/. If the Alliance for Open Media Patent
; License 1.0 was not distributed with this source code in the PATENTS file, you
; can obtain it at aomedia.org/license/patent-license/.
;

;
14%include "third_party/x86inc/x86inc.asm"
15
16SECTION_RODATA
17pw_8: times 8 dw 8
18bilin_filter_m_sse2: times 8 dw 16
19 times 8 dw 0
Peter de Rivaz48032bf2014-10-16 14:00:54 +010020 times 8 dw 14
21 times 8 dw 2
Peter de Rivaz48032bf2014-10-16 14:00:54 +010022 times 8 dw 12
23 times 8 dw 4
Peter de Rivaz48032bf2014-10-16 14:00:54 +010024 times 8 dw 10
25 times 8 dw 6
Peter de Rivaz48032bf2014-10-16 14:00:54 +010026 times 16 dw 8
Peter de Rivaz48032bf2014-10-16 14:00:54 +010027 times 8 dw 6
28 times 8 dw 10
Peter de Rivaz48032bf2014-10-16 14:00:54 +010029 times 8 dw 4
30 times 8 dw 12
Peter de Rivaz48032bf2014-10-16 14:00:54 +010031 times 8 dw 2
32 times 8 dw 14
Peter de Rivaz48032bf2014-10-16 14:00:54 +010033
34SECTION .text
35
; int aom_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
;                               int x_offset, int y_offset,
;                               const uint8_t *dst, ptrdiff_t dst_stride,
;                               int height, unsigned int *sse);
;
; This function returns the SE and stores SSE in the given pointer.
; NOTE(review): the generated symbols are the highbd_ variants, so the sample
; pointers presumably address 16-bit data — strides below are scaled by 2.
43%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
44 psubw %3, %4
45 psubw %1, %2
46 mova %4, %3 ; make copies to manipulate to calc sum
47 mova %2, %1 ; use originals for calc sse
48 pmaddwd %3, %3
49 paddw %4, %2
50 pmaddwd %1, %1
51 movhlps %2, %4
52 paddd %6, %3
53 paddw %4, %2
54 pxor %2, %2
55 pcmpgtw %2, %4 ; mask for 0 > %4 (sum)
56 punpcklwd %4, %2 ; sign-extend word to dword
57 paddd %6, %1
58 paddd %5, %4
59
60%endmacro
61
62%macro STORE_AND_RET 0
63%if mmsize == 16
64 ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
65 ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
66 ; We have to sign-extend it before adding the words within the register
67 ; and outputing to a dword.
68 movhlps m3, m7
69 movhlps m4, m6
70 paddd m7, m3
71 paddd m6, m4
72 pshufd m3, m7, 0x1
73 pshufd m4, m6, 0x1
74 paddd m7, m3
75 paddd m6, m4
76 mov r1, ssem ; r1 = unsigned int *sse
77 movd [r1], m7 ; store sse
Tom Finegan0a2dab22017-07-10 13:34:35 -070078 movd eax, m6 ; store sum as return value
Peter de Rivaz48032bf2014-10-16 14:00:54 +010079%endif
80 RET
81%endmacro
82
83%macro INC_SRC_BY_SRC_STRIDE 0
84%if ARCH_X86=1 && CONFIG_PIC=1
Peter de Rivaz7361ef72015-12-14 16:35:29 +000085 add srcq, src_stridemp
86 add srcq, src_stridemp
Peter de Rivaz48032bf2014-10-16 14:00:54 +010087%else
88 lea srcq, [srcq + src_strideq*2]
89%endif
90%endmacro
91
Peter de Rivaz48032bf2014-10-16 14:00:54 +010092%macro SUBPEL_VARIANCE 1-2 0 ; W
93%define bilin_filter_m bilin_filter_m_sse2
94%define filter_idx_shift 5
95
96
Johann0cf864f2018-01-31 09:52:28 -080097%if ARCH_X86_64
Peter de Rivaz48032bf2014-10-16 14:00:54 +010098 %if %2 == 1 ; avg
99 cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
100 x_offset, y_offset, \
101 dst, dst_stride, \
102 sec, sec_stride, height, sse
103 %define sec_str sec_strideq
104 %else
Johann0cf864f2018-01-31 09:52:28 -0800105 cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
106 x_offset, y_offset, \
107 dst, dst_stride, height, sse
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100108 %endif
Johann124ada52015-07-28 14:00:32 -0700109 %define block_height heightd
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100110 %define bilin_filter sseq
111%else
Johann0cf864f2018-01-31 09:52:28 -0800112 %if CONFIG_PIC=1
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100113 %if %2 == 1 ; avg
114 cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
Johann0cf864f2018-01-31 09:52:28 -0800115 x_offset, y_offset, \
116 dst, dst_stride, \
Johannae356d12018-10-08 08:36:35 -0700117 sec, sec_stride, height, sse
Johann124ada52015-07-28 14:00:32 -0700118 %define block_height dword heightm
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100119 %define sec_str sec_stridemp
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100120 %else
121 cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
Johann0cf864f2018-01-31 09:52:28 -0800122 x_offset, y_offset, \
Johannae356d12018-10-08 08:36:35 -0700123 dst, dst_stride, height, sse
Johann124ada52015-07-28 14:00:32 -0700124 %define block_height heightd
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100125 %endif
Johannae356d12018-10-08 08:36:35 -0700126
127 ; reuse argument stack space
128 %define g_bilin_filterm x_offsetm
129 %define g_pw_8m y_offsetm
130
131 ; Store bilin_filter and pw_8 location in stack
132 %if GET_GOT_DEFINED == 1
133 GET_GOT eax
134 add esp, 4 ; restore esp
135 %endif
136
137 lea ecx, [GLOBAL(bilin_filter_m)]
138 mov g_bilin_filterm, ecx
139
140 lea ecx, [GLOBAL(pw_8)]
141 mov g_pw_8m, ecx
142
143 LOAD_IF_USED 0, 1 ; load eax, ecx back
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100144 %else
145 %if %2 == 1 ; avg
Johann0cf864f2018-01-31 09:52:28 -0800146 cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
147 x_offset, y_offset, \
148 dst, dst_stride, \
149 sec, sec_stride, height, sse
Johann124ada52015-07-28 14:00:32 -0700150 %define block_height dword heightm
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100151 %define sec_str sec_stridemp
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100152 %else
153 cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
Johann0cf864f2018-01-31 09:52:28 -0800154 x_offset, y_offset, \
155 dst, dst_stride, height, sse
Johann124ada52015-07-28 14:00:32 -0700156 %define block_height heightd
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100157 %endif
158
159 %define bilin_filter bilin_filter_m
160 %endif
161%endif
162
163 ASSERT %1 <= 16 ; m6 overflows if w > 16
164 pxor m6, m6 ; sum
165 pxor m7, m7 ; sse
166
167%if %1 < 16
Johann124ada52015-07-28 14:00:32 -0700168 sar block_height, 1
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100169%endif
Yunqing Wang789ae442015-02-04 12:02:06 -0800170%if %2 == 1 ; avg
171 shl sec_str, 1
172%endif
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100173
174 ; FIXME(rbultje) replace by jumptable?
175 test x_offsetd, x_offsetd
176 jnz .x_nonzero
177 ; x_offset == 0
178 test y_offsetd, y_offsetd
179 jnz .x_zero_y_nonzero
180
181 ; x_offset == 0 && y_offset == 0
182.x_zero_y_zero_loop:
183%if %1 == 16
184 movu m0, [srcq]
185 movu m2, [srcq + 16]
186 mova m1, [dstq]
187 mova m3, [dstq + 16]
188%if %2 == 1 ; avg
189 pavgw m0, [secq]
190 pavgw m2, [secq+16]
191%endif
192 SUM_SSE m0, m1, m2, m3, m6, m7
193
194 lea srcq, [srcq + src_strideq*2]
195 lea dstq, [dstq + dst_strideq*2]
196%if %2 == 1 ; avg
Yunqing Wang789ae442015-02-04 12:02:06 -0800197 add secq, sec_str
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100198%endif
199%else ; %1 < 16
200 movu m0, [srcq]
201 movu m2, [srcq + src_strideq*2]
202 mova m1, [dstq]
203 mova m3, [dstq + dst_strideq*2]
204%if %2 == 1 ; avg
205 pavgw m0, [secq]
Yunqing Wang789ae442015-02-04 12:02:06 -0800206 add secq, sec_str
207 pavgw m2, [secq]
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100208%endif
209 SUM_SSE m0, m1, m2, m3, m6, m7
210
211 lea srcq, [srcq + src_strideq*4]
212 lea dstq, [dstq + dst_strideq*4]
213%if %2 == 1 ; avg
Yunqing Wang789ae442015-02-04 12:02:06 -0800214 add secq, sec_str
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100215%endif
216%endif
Johann124ada52015-07-28 14:00:32 -0700217 dec block_height
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100218 jg .x_zero_y_zero_loop
219 STORE_AND_RET
220
221.x_zero_y_nonzero:
222 cmp y_offsetd, 8
223 jne .x_zero_y_nonhalf
224
225 ; x_offset == 0 && y_offset == 0.5
226.x_zero_y_half_loop:
227%if %1 == 16
228 movu m0, [srcq]
229 movu m1, [srcq+16]
230 movu m4, [srcq+src_strideq*2]
231 movu m5, [srcq+src_strideq*2+16]
232 mova m2, [dstq]
233 mova m3, [dstq+16]
234 pavgw m0, m4
235 pavgw m1, m5
236%if %2 == 1 ; avg
237 pavgw m0, [secq]
238 pavgw m1, [secq+16]
239%endif
240 SUM_SSE m0, m2, m1, m3, m6, m7
241
242 lea srcq, [srcq + src_strideq*2]
243 lea dstq, [dstq + dst_strideq*2]
244%if %2 == 1 ; avg
Yunqing Wang789ae442015-02-04 12:02:06 -0800245 add secq, sec_str
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100246%endif
247%else ; %1 < 16
248 movu m0, [srcq]
249 movu m1, [srcq+src_strideq*2]
250 movu m5, [srcq+src_strideq*4]
251 mova m2, [dstq]
252 mova m3, [dstq+dst_strideq*2]
253 pavgw m0, m1
254 pavgw m1, m5
255%if %2 == 1 ; avg
256 pavgw m0, [secq]
Yunqing Wang789ae442015-02-04 12:02:06 -0800257 add secq, sec_str
258 pavgw m1, [secq]
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100259%endif
260 SUM_SSE m0, m2, m1, m3, m6, m7
261
262 lea srcq, [srcq + src_strideq*4]
263 lea dstq, [dstq + dst_strideq*4]
264%if %2 == 1 ; avg
Yunqing Wang789ae442015-02-04 12:02:06 -0800265 add secq, sec_str
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100266%endif
267%endif
Johann124ada52015-07-28 14:00:32 -0700268 dec block_height
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100269 jg .x_zero_y_half_loop
270 STORE_AND_RET
271
272.x_zero_y_nonhalf:
273 ; x_offset == 0 && y_offset == bilin interpolation
Johann0cf864f2018-01-31 09:52:28 -0800274%if ARCH_X86_64
275 lea bilin_filter, [GLOBAL(bilin_filter_m)]
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100276%endif
277 shl y_offsetd, filter_idx_shift
278%if ARCH_X86_64 && mmsize == 16
279 mova m8, [bilin_filter+y_offsetq]
280 mova m9, [bilin_filter+y_offsetq+16]
Johann0cf864f2018-01-31 09:52:28 -0800281 mova m10, [GLOBAL(pw_8)]
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100282%define filter_y_a m8
283%define filter_y_b m9
284%define filter_rnd m10
285%else ; x86-32 or mmx
286%if ARCH_X86=1 && CONFIG_PIC=1
287; x_offset == 0, reuse x_offset reg
288%define tempq x_offsetq
289 add y_offsetq, g_bilin_filterm
290%define filter_y_a [y_offsetq]
291%define filter_y_b [y_offsetq+16]
292 mov tempq, g_pw_8m
293%define filter_rnd [tempq]
294%else
295 add y_offsetq, bilin_filter
296%define filter_y_a [y_offsetq]
297%define filter_y_b [y_offsetq+16]
Johann0cf864f2018-01-31 09:52:28 -0800298%define filter_rnd [GLOBAL(pw_8)]
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100299%endif
300%endif
301
302.x_zero_y_other_loop:
303%if %1 == 16
304 movu m0, [srcq]
305 movu m1, [srcq + 16]
306 movu m4, [srcq+src_strideq*2]
307 movu m5, [srcq+src_strideq*2+16]
308 mova m2, [dstq]
309 mova m3, [dstq+16]
310 ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
311 ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
312 ; instructions is the same (5), but it is 1 mul instead of 2, so might be
313 ; slightly faster because of pmullw latency. It would also cut our rodata
314 ; tables in half for this function, and save 1-2 registers on x86-64.
315 pmullw m1, filter_y_a
316 pmullw m5, filter_y_b
317 paddw m1, filter_rnd
318 pmullw m0, filter_y_a
319 pmullw m4, filter_y_b
320 paddw m0, filter_rnd
321 paddw m1, m5
322 paddw m0, m4
323 psrlw m1, 4
324 psrlw m0, 4
325%if %2 == 1 ; avg
326 pavgw m0, [secq]
327 pavgw m1, [secq+16]
328%endif
329 SUM_SSE m0, m2, m1, m3, m6, m7
330
331 lea srcq, [srcq + src_strideq*2]
332 lea dstq, [dstq + dst_strideq*2]
333%if %2 == 1 ; avg
Yunqing Wang789ae442015-02-04 12:02:06 -0800334 add secq, sec_str
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100335%endif
336%else ; %1 < 16
337 movu m0, [srcq]
338 movu m1, [srcq+src_strideq*2]
339 movu m5, [srcq+src_strideq*4]
340 mova m4, m1
341 mova m2, [dstq]
342 mova m3, [dstq+dst_strideq*2]
343 pmullw m1, filter_y_a
344 pmullw m5, filter_y_b
345 paddw m1, filter_rnd
346 pmullw m0, filter_y_a
347 pmullw m4, filter_y_b
348 paddw m0, filter_rnd
349 paddw m1, m5
350 paddw m0, m4
351 psrlw m1, 4
352 psrlw m0, 4
353%if %2 == 1 ; avg
354 pavgw m0, [secq]
Yunqing Wang789ae442015-02-04 12:02:06 -0800355 add secq, sec_str
356 pavgw m1, [secq]
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100357%endif
358 SUM_SSE m0, m2, m1, m3, m6, m7
359
360 lea srcq, [srcq + src_strideq*4]
361 lea dstq, [dstq + dst_strideq*4]
362%if %2 == 1 ; avg
Yunqing Wang789ae442015-02-04 12:02:06 -0800363 add secq, sec_str
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100364%endif
365%endif
Johann124ada52015-07-28 14:00:32 -0700366 dec block_height
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100367 jg .x_zero_y_other_loop
368%undef filter_y_a
369%undef filter_y_b
370%undef filter_rnd
371 STORE_AND_RET
372
373.x_nonzero:
374 cmp x_offsetd, 8
375 jne .x_nonhalf
376 ; x_offset == 0.5
377 test y_offsetd, y_offsetd
378 jnz .x_half_y_nonzero
379
380 ; x_offset == 0.5 && y_offset == 0
381.x_half_y_zero_loop:
382%if %1 == 16
383 movu m0, [srcq]
384 movu m1, [srcq + 16]
385 movu m4, [srcq + 2]
386 movu m5, [srcq + 18]
387 mova m2, [dstq]
388 mova m3, [dstq + 16]
389 pavgw m0, m4
390 pavgw m1, m5
391%if %2 == 1 ; avg
392 pavgw m0, [secq]
393 pavgw m1, [secq+16]
394%endif
395 SUM_SSE m0, m2, m1, m3, m6, m7
396
397 lea srcq, [srcq + src_strideq*2]
398 lea dstq, [dstq + dst_strideq*2]
399%if %2 == 1 ; avg
Yunqing Wang789ae442015-02-04 12:02:06 -0800400 add secq, sec_str
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100401%endif
402%else ; %1 < 16
403 movu m0, [srcq]
404 movu m1, [srcq + src_strideq*2]
405 movu m4, [srcq + 2]
406 movu m5, [srcq + src_strideq*2 + 2]
407 mova m2, [dstq]
408 mova m3, [dstq + dst_strideq*2]
409 pavgw m0, m4
410 pavgw m1, m5
411%if %2 == 1 ; avg
412 pavgw m0, [secq]
Yunqing Wang789ae442015-02-04 12:02:06 -0800413 add secq, sec_str
414 pavgw m1, [secq]
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100415%endif
416 SUM_SSE m0, m2, m1, m3, m6, m7
417
418 lea srcq, [srcq + src_strideq*4]
419 lea dstq, [dstq + dst_strideq*4]
420%if %2 == 1 ; avg
Yunqing Wang789ae442015-02-04 12:02:06 -0800421 add secq, sec_str
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100422%endif
423%endif
Johann124ada52015-07-28 14:00:32 -0700424 dec block_height
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100425 jg .x_half_y_zero_loop
426 STORE_AND_RET
427
428.x_half_y_nonzero:
429 cmp y_offsetd, 8
430 jne .x_half_y_nonhalf
431
432 ; x_offset == 0.5 && y_offset == 0.5
433%if %1 == 16
434 movu m0, [srcq]
435 movu m1, [srcq+16]
436 movu m2, [srcq+2]
437 movu m3, [srcq+18]
438 lea srcq, [srcq + src_strideq*2]
439 pavgw m0, m2
440 pavgw m1, m3
441.x_half_y_half_loop:
442 movu m2, [srcq]
443 movu m3, [srcq + 16]
444 movu m4, [srcq + 2]
445 movu m5, [srcq + 18]
446 pavgw m2, m4
447 pavgw m3, m5
448 pavgw m0, m2
449 pavgw m1, m3
450 mova m4, [dstq]
451 mova m5, [dstq + 16]
452%if %2 == 1 ; avg
453 pavgw m0, [secq]
454 pavgw m1, [secq+16]
455%endif
456 SUM_SSE m0, m4, m1, m5, m6, m7
457 mova m0, m2
458 mova m1, m3
459
460 lea srcq, [srcq + src_strideq*2]
461 lea dstq, [dstq + dst_strideq*2]
462%if %2 == 1 ; avg
Yunqing Wang789ae442015-02-04 12:02:06 -0800463 add secq, sec_str
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100464%endif
465%else ; %1 < 16
466 movu m0, [srcq]
467 movu m2, [srcq+2]
468 lea srcq, [srcq + src_strideq*2]
469 pavgw m0, m2
470.x_half_y_half_loop:
471 movu m2, [srcq]
472 movu m3, [srcq + src_strideq*2]
473 movu m4, [srcq + 2]
474 movu m5, [srcq + src_strideq*2 + 2]
475 pavgw m2, m4
476 pavgw m3, m5
477 pavgw m0, m2
478 pavgw m2, m3
479 mova m4, [dstq]
480 mova m5, [dstq + dst_strideq*2]
481%if %2 == 1 ; avg
482 pavgw m0, [secq]
Yunqing Wang789ae442015-02-04 12:02:06 -0800483 add secq, sec_str
484 pavgw m2, [secq]
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100485%endif
486 SUM_SSE m0, m4, m2, m5, m6, m7
487 mova m0, m3
488
489 lea srcq, [srcq + src_strideq*4]
490 lea dstq, [dstq + dst_strideq*4]
491%if %2 == 1 ; avg
Yunqing Wang789ae442015-02-04 12:02:06 -0800492 add secq, sec_str
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100493%endif
494%endif
Johann124ada52015-07-28 14:00:32 -0700495 dec block_height
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100496 jg .x_half_y_half_loop
497 STORE_AND_RET
498
499.x_half_y_nonhalf:
500 ; x_offset == 0.5 && y_offset == bilin interpolation
Johann0cf864f2018-01-31 09:52:28 -0800501%if ARCH_X86_64
502 lea bilin_filter, [GLOBAL(bilin_filter_m)]
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100503%endif
504 shl y_offsetd, filter_idx_shift
505%if ARCH_X86_64 && mmsize == 16
506 mova m8, [bilin_filter+y_offsetq]
507 mova m9, [bilin_filter+y_offsetq+16]
Johann0cf864f2018-01-31 09:52:28 -0800508 mova m10, [GLOBAL(pw_8)]
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100509%define filter_y_a m8
510%define filter_y_b m9
511%define filter_rnd m10
512%else ; x86_32
513%if ARCH_X86=1 && CONFIG_PIC=1
514; x_offset == 0.5. We can reuse x_offset reg
515%define tempq x_offsetq
516 add y_offsetq, g_bilin_filterm
517%define filter_y_a [y_offsetq]
518%define filter_y_b [y_offsetq+16]
519 mov tempq, g_pw_8m
520%define filter_rnd [tempq]
521%else
522 add y_offsetq, bilin_filter
523%define filter_y_a [y_offsetq]
524%define filter_y_b [y_offsetq+16]
Johann0cf864f2018-01-31 09:52:28 -0800525%define filter_rnd [GLOBAL(pw_8)]
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100526%endif
527%endif
528
529%if %1 == 16
530 movu m0, [srcq]
531 movu m1, [srcq+16]
532 movu m2, [srcq+2]
533 movu m3, [srcq+18]
534 lea srcq, [srcq + src_strideq*2]
535 pavgw m0, m2
536 pavgw m1, m3
537.x_half_y_other_loop:
538 movu m2, [srcq]
539 movu m3, [srcq+16]
540 movu m4, [srcq+2]
541 movu m5, [srcq+18]
542 pavgw m2, m4
543 pavgw m3, m5
544 mova m4, m2
545 mova m5, m3
546 pmullw m1, filter_y_a
547 pmullw m3, filter_y_b
548 paddw m1, filter_rnd
549 paddw m1, m3
550 pmullw m0, filter_y_a
551 pmullw m2, filter_y_b
552 paddw m0, filter_rnd
553 psrlw m1, 4
554 paddw m0, m2
555 mova m2, [dstq]
556 psrlw m0, 4
557 mova m3, [dstq+16]
558%if %2 == 1 ; avg
559 pavgw m0, [secq]
560 pavgw m1, [secq+16]
561%endif
562 SUM_SSE m0, m2, m1, m3, m6, m7
563 mova m0, m4
564 mova m1, m5
565
566 lea srcq, [srcq + src_strideq*2]
567 lea dstq, [dstq + dst_strideq*2]
568%if %2 == 1 ; avg
Yunqing Wang789ae442015-02-04 12:02:06 -0800569 add secq, sec_str
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100570%endif
571%else ; %1 < 16
572 movu m0, [srcq]
573 movu m2, [srcq+2]
574 lea srcq, [srcq + src_strideq*2]
575 pavgw m0, m2
576.x_half_y_other_loop:
577 movu m2, [srcq]
578 movu m3, [srcq+src_strideq*2]
579 movu m4, [srcq+2]
580 movu m5, [srcq+src_strideq*2+2]
581 pavgw m2, m4
582 pavgw m3, m5
583 mova m4, m2
584 mova m5, m3
585 pmullw m4, filter_y_a
586 pmullw m3, filter_y_b
587 paddw m4, filter_rnd
588 paddw m4, m3
589 pmullw m0, filter_y_a
590 pmullw m2, filter_y_b
591 paddw m0, filter_rnd
592 psrlw m4, 4
593 paddw m0, m2
594 mova m2, [dstq]
595 psrlw m0, 4
596 mova m3, [dstq+dst_strideq*2]
597%if %2 == 1 ; avg
598 pavgw m0, [secq]
Yunqing Wang789ae442015-02-04 12:02:06 -0800599 add secq, sec_str
600 pavgw m4, [secq]
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100601%endif
602 SUM_SSE m0, m2, m4, m3, m6, m7
603 mova m0, m5
604
605 lea srcq, [srcq + src_strideq*4]
606 lea dstq, [dstq + dst_strideq*4]
607%if %2 == 1 ; avg
Yunqing Wang789ae442015-02-04 12:02:06 -0800608 add secq, sec_str
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100609%endif
610%endif
Johann124ada52015-07-28 14:00:32 -0700611 dec block_height
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100612 jg .x_half_y_other_loop
613%undef filter_y_a
614%undef filter_y_b
615%undef filter_rnd
616 STORE_AND_RET
617
618.x_nonhalf:
619 test y_offsetd, y_offsetd
620 jnz .x_nonhalf_y_nonzero
621
622 ; x_offset == bilin interpolation && y_offset == 0
Johann0cf864f2018-01-31 09:52:28 -0800623%if ARCH_X86_64
624 lea bilin_filter, [GLOBAL(bilin_filter_m)]
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100625%endif
626 shl x_offsetd, filter_idx_shift
627%if ARCH_X86_64 && mmsize == 16
628 mova m8, [bilin_filter+x_offsetq]
629 mova m9, [bilin_filter+x_offsetq+16]
Johann0cf864f2018-01-31 09:52:28 -0800630 mova m10, [GLOBAL(pw_8)]
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100631%define filter_x_a m8
632%define filter_x_b m9
633%define filter_rnd m10
634%else ; x86-32
635%if ARCH_X86=1 && CONFIG_PIC=1
636; y_offset == 0. We can reuse y_offset reg.
637%define tempq y_offsetq
638 add x_offsetq, g_bilin_filterm
639%define filter_x_a [x_offsetq]
640%define filter_x_b [x_offsetq+16]
641 mov tempq, g_pw_8m
642%define filter_rnd [tempq]
643%else
644 add x_offsetq, bilin_filter
645%define filter_x_a [x_offsetq]
646%define filter_x_b [x_offsetq+16]
Johann0cf864f2018-01-31 09:52:28 -0800647%define filter_rnd [GLOBAL(pw_8)]
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100648%endif
649%endif
650
651.x_other_y_zero_loop:
652%if %1 == 16
653 movu m0, [srcq]
654 movu m1, [srcq+16]
655 movu m2, [srcq+2]
656 movu m3, [srcq+18]
657 mova m4, [dstq]
658 mova m5, [dstq+16]
659 pmullw m1, filter_x_a
660 pmullw m3, filter_x_b
661 paddw m1, filter_rnd
662 pmullw m0, filter_x_a
663 pmullw m2, filter_x_b
664 paddw m0, filter_rnd
665 paddw m1, m3
666 paddw m0, m2
667 psrlw m1, 4
668 psrlw m0, 4
669%if %2 == 1 ; avg
670 pavgw m0, [secq]
671 pavgw m1, [secq+16]
672%endif
673 SUM_SSE m0, m4, m1, m5, m6, m7
674
675 lea srcq, [srcq+src_strideq*2]
676 lea dstq, [dstq+dst_strideq*2]
677%if %2 == 1 ; avg
Yunqing Wang789ae442015-02-04 12:02:06 -0800678 add secq, sec_str
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100679%endif
680%else ; %1 < 16
681 movu m0, [srcq]
682 movu m1, [srcq+src_strideq*2]
683 movu m2, [srcq+2]
684 movu m3, [srcq+src_strideq*2+2]
685 mova m4, [dstq]
686 mova m5, [dstq+dst_strideq*2]
687 pmullw m1, filter_x_a
688 pmullw m3, filter_x_b
689 paddw m1, filter_rnd
690 pmullw m0, filter_x_a
691 pmullw m2, filter_x_b
692 paddw m0, filter_rnd
693 paddw m1, m3
694 paddw m0, m2
695 psrlw m1, 4
696 psrlw m0, 4
697%if %2 == 1 ; avg
698 pavgw m0, [secq]
Yunqing Wang789ae442015-02-04 12:02:06 -0800699 add secq, sec_str
700 pavgw m1, [secq]
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100701%endif
702 SUM_SSE m0, m4, m1, m5, m6, m7
703
704 lea srcq, [srcq+src_strideq*4]
705 lea dstq, [dstq+dst_strideq*4]
706%if %2 == 1 ; avg
Yunqing Wang789ae442015-02-04 12:02:06 -0800707 add secq, sec_str
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100708%endif
709%endif
Johann124ada52015-07-28 14:00:32 -0700710 dec block_height
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100711 jg .x_other_y_zero_loop
712%undef filter_x_a
713%undef filter_x_b
714%undef filter_rnd
715 STORE_AND_RET
716
717.x_nonhalf_y_nonzero:
718 cmp y_offsetd, 8
719 jne .x_nonhalf_y_nonhalf
720
721 ; x_offset == bilin interpolation && y_offset == 0.5
Johann0cf864f2018-01-31 09:52:28 -0800722%if ARCH_X86_64
723 lea bilin_filter, [GLOBAL(bilin_filter_m)]
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100724%endif
725 shl x_offsetd, filter_idx_shift
726%if ARCH_X86_64 && mmsize == 16
727 mova m8, [bilin_filter+x_offsetq]
728 mova m9, [bilin_filter+x_offsetq+16]
Johann0cf864f2018-01-31 09:52:28 -0800729 mova m10, [GLOBAL(pw_8)]
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100730%define filter_x_a m8
731%define filter_x_b m9
732%define filter_rnd m10
733%else ; x86-32
734%if ARCH_X86=1 && CONFIG_PIC=1
735; y_offset == 0.5. We can reuse y_offset reg.
736%define tempq y_offsetq
737 add x_offsetq, g_bilin_filterm
738%define filter_x_a [x_offsetq]
739%define filter_x_b [x_offsetq+16]
740 mov tempq, g_pw_8m
741%define filter_rnd [tempq]
742%else
743 add x_offsetq, bilin_filter
744%define filter_x_a [x_offsetq]
745%define filter_x_b [x_offsetq+16]
Johann0cf864f2018-01-31 09:52:28 -0800746%define filter_rnd [GLOBAL(pw_8)]
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100747%endif
748%endif
749
750%if %1 == 16
751 movu m0, [srcq]
752 movu m1, [srcq+16]
753 movu m2, [srcq+2]
754 movu m3, [srcq+18]
755 pmullw m0, filter_x_a
756 pmullw m2, filter_x_b
757 paddw m0, filter_rnd
758 pmullw m1, filter_x_a
759 pmullw m3, filter_x_b
760 paddw m1, filter_rnd
761 paddw m0, m2
762 paddw m1, m3
763 psrlw m0, 4
764 psrlw m1, 4
765 lea srcq, [srcq+src_strideq*2]
766.x_other_y_half_loop:
767 movu m2, [srcq]
768 movu m3, [srcq+16]
769 movu m4, [srcq+2]
770 movu m5, [srcq+18]
771 pmullw m2, filter_x_a
772 pmullw m4, filter_x_b
773 paddw m2, filter_rnd
774 pmullw m3, filter_x_a
775 pmullw m5, filter_x_b
776 paddw m3, filter_rnd
777 paddw m2, m4
778 paddw m3, m5
779 mova m4, [dstq]
780 mova m5, [dstq+16]
781 psrlw m2, 4
782 psrlw m3, 4
783 pavgw m0, m2
784 pavgw m1, m3
785%if %2 == 1 ; avg
786 pavgw m0, [secq]
787 pavgw m1, [secq+16]
788%endif
789 SUM_SSE m0, m4, m1, m5, m6, m7
790 mova m0, m2
791 mova m1, m3
792
793 lea srcq, [srcq+src_strideq*2]
794 lea dstq, [dstq+dst_strideq*2]
795%if %2 == 1 ; avg
Yunqing Wang789ae442015-02-04 12:02:06 -0800796 add secq, sec_str
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100797%endif
798%else ; %1 < 16
799 movu m0, [srcq]
800 movu m2, [srcq+2]
801 pmullw m0, filter_x_a
802 pmullw m2, filter_x_b
803 paddw m0, filter_rnd
804 paddw m0, m2
805 psrlw m0, 4
806 lea srcq, [srcq+src_strideq*2]
807.x_other_y_half_loop:
808 movu m2, [srcq]
809 movu m3, [srcq+src_strideq*2]
810 movu m4, [srcq+2]
811 movu m5, [srcq+src_strideq*2+2]
812 pmullw m2, filter_x_a
813 pmullw m4, filter_x_b
814 paddw m2, filter_rnd
815 pmullw m3, filter_x_a
816 pmullw m5, filter_x_b
817 paddw m3, filter_rnd
818 paddw m2, m4
819 paddw m3, m5
820 mova m4, [dstq]
821 mova m5, [dstq+dst_strideq*2]
822 psrlw m2, 4
823 psrlw m3, 4
824 pavgw m0, m2
825 pavgw m2, m3
826%if %2 == 1 ; avg
827 pavgw m0, [secq]
Yunqing Wang789ae442015-02-04 12:02:06 -0800828 add secq, sec_str
829 pavgw m2, [secq]
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100830%endif
831 SUM_SSE m0, m4, m2, m5, m6, m7
832 mova m0, m3
833
834 lea srcq, [srcq+src_strideq*4]
835 lea dstq, [dstq+dst_strideq*4]
836%if %2 == 1 ; avg
Yunqing Wang789ae442015-02-04 12:02:06 -0800837 add secq, sec_str
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100838%endif
839%endif
Johann124ada52015-07-28 14:00:32 -0700840 dec block_height
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100841 jg .x_other_y_half_loop
842%undef filter_x_a
843%undef filter_x_b
844%undef filter_rnd
845 STORE_AND_RET
846
847.x_nonhalf_y_nonhalf:
848; loading filter - this is same as in 8-bit depth
Johann0cf864f2018-01-31 09:52:28 -0800849%if ARCH_X86_64
850 lea bilin_filter, [GLOBAL(bilin_filter_m)]
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100851%endif
852 shl x_offsetd, filter_idx_shift ; filter_idx_shift = 5
853 shl y_offsetd, filter_idx_shift
854%if ARCH_X86_64 && mmsize == 16
855 mova m8, [bilin_filter+x_offsetq]
856 mova m9, [bilin_filter+x_offsetq+16]
857 mova m10, [bilin_filter+y_offsetq]
858 mova m11, [bilin_filter+y_offsetq+16]
Johann0cf864f2018-01-31 09:52:28 -0800859 mova m12, [GLOBAL(pw_8)]
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100860%define filter_x_a m8
861%define filter_x_b m9
862%define filter_y_a m10
863%define filter_y_b m11
864%define filter_rnd m12
865%else ; x86-32
866%if ARCH_X86=1 && CONFIG_PIC=1
867; In this case, there is NO unused register. Used src_stride register. Later,
868; src_stride has to be loaded from stack when it is needed.
869%define tempq src_strideq
870 mov tempq, g_bilin_filterm
871 add x_offsetq, tempq
872 add y_offsetq, tempq
873%define filter_x_a [x_offsetq]
874%define filter_x_b [x_offsetq+16]
875%define filter_y_a [y_offsetq]
876%define filter_y_b [y_offsetq+16]
877
878 mov tempq, g_pw_8m
879%define filter_rnd [tempq]
880%else
881 add x_offsetq, bilin_filter
882 add y_offsetq, bilin_filter
883%define filter_x_a [x_offsetq]
884%define filter_x_b [x_offsetq+16]
885%define filter_y_a [y_offsetq]
886%define filter_y_b [y_offsetq+16]
Johann0cf864f2018-01-31 09:52:28 -0800887%define filter_rnd [GLOBAL(pw_8)]
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100888%endif
889%endif
890; end of load filter
891
892 ; x_offset == bilin interpolation && y_offset == bilin interpolation
893%if %1 == 16
894 movu m0, [srcq]
895 movu m2, [srcq+2]
896 movu m1, [srcq+16]
897 movu m3, [srcq+18]
898 pmullw m0, filter_x_a
899 pmullw m2, filter_x_b
900 paddw m0, filter_rnd
901 pmullw m1, filter_x_a
902 pmullw m3, filter_x_b
903 paddw m1, filter_rnd
904 paddw m0, m2
905 paddw m1, m3
906 psrlw m0, 4
907 psrlw m1, 4
908
909 INC_SRC_BY_SRC_STRIDE
910
911.x_other_y_other_loop:
912 movu m2, [srcq]
913 movu m4, [srcq+2]
914 movu m3, [srcq+16]
915 movu m5, [srcq+18]
916 pmullw m2, filter_x_a
917 pmullw m4, filter_x_b
918 paddw m2, filter_rnd
919 pmullw m3, filter_x_a
920 pmullw m5, filter_x_b
921 paddw m3, filter_rnd
922 paddw m2, m4
923 paddw m3, m5
924 psrlw m2, 4
925 psrlw m3, 4
926 mova m4, m2
927 mova m5, m3
928 pmullw m0, filter_y_a
929 pmullw m2, filter_y_b
930 paddw m0, filter_rnd
931 pmullw m1, filter_y_a
932 pmullw m3, filter_y_b
933 paddw m0, m2
934 paddw m1, filter_rnd
935 mova m2, [dstq]
936 paddw m1, m3
937 psrlw m0, 4
938 psrlw m1, 4
939 mova m3, [dstq+16]
940%if %2 == 1 ; avg
941 pavgw m0, [secq]
942 pavgw m1, [secq+16]
943%endif
944 SUM_SSE m0, m2, m1, m3, m6, m7
945 mova m0, m4
946 mova m1, m5
947
948 INC_SRC_BY_SRC_STRIDE
949 lea dstq, [dstq + dst_strideq * 2]
950%if %2 == 1 ; avg
Yunqing Wang789ae442015-02-04 12:02:06 -0800951 add secq, sec_str
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100952%endif
953%else ; %1 < 16
954 movu m0, [srcq]
955 movu m2, [srcq+2]
956 pmullw m0, filter_x_a
957 pmullw m2, filter_x_b
958 paddw m0, filter_rnd
959 paddw m0, m2
960 psrlw m0, 4
961
962 INC_SRC_BY_SRC_STRIDE
963
964.x_other_y_other_loop:
965 movu m2, [srcq]
966 movu m4, [srcq+2]
Peter de Rivaz7361ef72015-12-14 16:35:29 +0000967 INC_SRC_BY_SRC_STRIDE
968 movu m3, [srcq]
969 movu m5, [srcq+2]
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100970 pmullw m2, filter_x_a
971 pmullw m4, filter_x_b
972 paddw m2, filter_rnd
973 pmullw m3, filter_x_a
974 pmullw m5, filter_x_b
975 paddw m3, filter_rnd
976 paddw m2, m4
977 paddw m3, m5
978 psrlw m2, 4
979 psrlw m3, 4
980 mova m4, m2
981 mova m5, m3
982 pmullw m0, filter_y_a
983 pmullw m2, filter_y_b
984 paddw m0, filter_rnd
985 pmullw m4, filter_y_a
986 pmullw m3, filter_y_b
987 paddw m0, m2
988 paddw m4, filter_rnd
989 mova m2, [dstq]
990 paddw m4, m3
991 psrlw m0, 4
992 psrlw m4, 4
993 mova m3, [dstq+dst_strideq*2]
994%if %2 == 1 ; avg
995 pavgw m0, [secq]
Yunqing Wang789ae442015-02-04 12:02:06 -0800996 add secq, sec_str
997 pavgw m4, [secq]
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100998%endif
999 SUM_SSE m0, m2, m4, m3, m6, m7
1000 mova m0, m5
1001
Peter de Rivaz7361ef72015-12-14 16:35:29 +00001002 INC_SRC_BY_SRC_STRIDE
Peter de Rivaz48032bf2014-10-16 14:00:54 +01001003 lea dstq, [dstq + dst_strideq * 4]
1004%if %2 == 1 ; avg
Yunqing Wang789ae442015-02-04 12:02:06 -08001005 add secq, sec_str
Peter de Rivaz48032bf2014-10-16 14:00:54 +01001006%endif
1007%endif
Johann124ada52015-07-28 14:00:32 -07001008 dec block_height
Peter de Rivaz48032bf2014-10-16 14:00:54 +01001009 jg .x_other_y_other_loop
1010%undef filter_x_a
1011%undef filter_x_b
1012%undef filter_y_a
1013%undef filter_y_b
1014%undef filter_rnd
1015 STORE_AND_RET
1016%endmacro
1017
1018INIT_XMM sse2
1019SUBPEL_VARIANCE 8
1020SUBPEL_VARIANCE 16
1021
1022INIT_XMM sse2
1023SUBPEL_VARIANCE 8, 1
1024SUBPEL_VARIANCE 16, 1