; blob: 797e9c1d4423022ad5960e74265efb2a7ac74df6
;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;
14%include "third_party/x86inc/x86inc.asm"
15
16SECTION_RODATA
17pw_8: times 8 dw 8
18bilin_filter_m_sse2: times 8 dw 16
19 times 8 dw 0
Peter de Rivaz48032bf2014-10-16 14:00:54 +010020 times 8 dw 14
21 times 8 dw 2
Peter de Rivaz48032bf2014-10-16 14:00:54 +010022 times 8 dw 12
23 times 8 dw 4
Peter de Rivaz48032bf2014-10-16 14:00:54 +010024 times 8 dw 10
25 times 8 dw 6
Peter de Rivaz48032bf2014-10-16 14:00:54 +010026 times 16 dw 8
Peter de Rivaz48032bf2014-10-16 14:00:54 +010027 times 8 dw 6
28 times 8 dw 10
Peter de Rivaz48032bf2014-10-16 14:00:54 +010029 times 8 dw 4
30 times 8 dw 12
Peter de Rivaz48032bf2014-10-16 14:00:54 +010031 times 8 dw 2
32 times 8 dw 14
Peter de Rivaz48032bf2014-10-16 14:00:54 +010033
34SECTION .text
35
; int aom_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
;                               int x_offset, int y_offset,
;                               const uint8_t *dst, ptrdiff_t dst_stride,
;                               int height, unsigned int *sse);
; (NOTE(review): this is the high-bitdepth build of the routine, so the
; pixel pointers actually reference 16-bit samples — confirm against the
; C-side prototypes.)
;
; This function returns the SE (sum of errors) and stores SSE in the given
; pointer.
42
43%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
44 psubw %3, %4
45 psubw %1, %2
46 mova %4, %3 ; make copies to manipulate to calc sum
47 mova %2, %1 ; use originals for calc sse
48 pmaddwd %3, %3
49 paddw %4, %2
50 pmaddwd %1, %1
51 movhlps %2, %4
52 paddd %6, %3
53 paddw %4, %2
54 pxor %2, %2
55 pcmpgtw %2, %4 ; mask for 0 > %4 (sum)
56 punpcklwd %4, %2 ; sign-extend word to dword
57 paddd %6, %1
58 paddd %5, %4
59
60%endmacro
61
62%macro STORE_AND_RET 0
63%if mmsize == 16
64 ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
65 ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
66 ; We have to sign-extend it before adding the words within the register
67 ; and outputing to a dword.
68 movhlps m3, m7
69 movhlps m4, m6
70 paddd m7, m3
71 paddd m6, m4
72 pshufd m3, m7, 0x1
73 pshufd m4, m6, 0x1
74 paddd m7, m3
75 paddd m6, m4
76 mov r1, ssem ; r1 = unsigned int *sse
77 movd [r1], m7 ; store sse
78 movd rax, m6 ; store sum as return value
79%endif
80 RET
81%endmacro
82
83%macro INC_SRC_BY_SRC_STRIDE 0
84%if ARCH_X86=1 && CONFIG_PIC=1
Peter de Rivaz7361ef72015-12-14 16:35:29 +000085 add srcq, src_stridemp
86 add srcq, src_stridemp
Peter de Rivaz48032bf2014-10-16 14:00:54 +010087%else
88 lea srcq, [srcq + src_strideq*2]
89%endif
90%endmacro
91
Peter de Rivaz48032bf2014-10-16 14:00:54 +010092%macro SUBPEL_VARIANCE 1-2 0 ; W
93%define bilin_filter_m bilin_filter_m_sse2
94%define filter_idx_shift 5
95
96
97%ifdef PIC ; 64bit PIC
98 %if %2 == 1 ; avg
99 cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
100 x_offset, y_offset, \
101 dst, dst_stride, \
102 sec, sec_stride, height, sse
103 %define sec_str sec_strideq
104 %else
105 cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
106 y_offset, dst, dst_stride, height, sse
107 %endif
Johann124ada52015-07-28 14:00:32 -0700108 %define block_height heightd
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100109 %define bilin_filter sseq
110%else
111 %if ARCH_X86=1 && CONFIG_PIC=1
112 %if %2 == 1 ; avg
113 cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
114 x_offset, y_offset, \
115 dst, dst_stride, \
116 sec, sec_stride, \
117 height, sse, g_bilin_filter, g_pw_8
Johann124ada52015-07-28 14:00:32 -0700118 %define block_height dword heightm
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100119 %define sec_str sec_stridemp
120
121 ; Store bilin_filter and pw_8 location in stack
Yunqing Wang322ea7f2015-12-10 14:25:01 -0800122 %if GET_GOT_DEFINED == 1
123 GET_GOT eax
124 add esp, 4 ; restore esp
125 %endif
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100126
127 lea ecx, [GLOBAL(bilin_filter_m)]
128 mov g_bilin_filterm, ecx
129
130 lea ecx, [GLOBAL(pw_8)]
131 mov g_pw_8m, ecx
132
133 LOAD_IF_USED 0, 1 ; load eax, ecx back
134 %else
135 cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
136 x_offset, y_offset, dst, dst_stride, height, \
137 sse, g_bilin_filter, g_pw_8
Johann124ada52015-07-28 14:00:32 -0700138 %define block_height heightd
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100139
140 ; Store bilin_filter and pw_8 location in stack
Yunqing Wang322ea7f2015-12-10 14:25:01 -0800141 %if GET_GOT_DEFINED == 1
142 GET_GOT eax
143 add esp, 4 ; restore esp
144 %endif
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100145
146 lea ecx, [GLOBAL(bilin_filter_m)]
147 mov g_bilin_filterm, ecx
148
149 lea ecx, [GLOBAL(pw_8)]
150 mov g_pw_8m, ecx
151
152 LOAD_IF_USED 0, 1 ; load eax, ecx back
153 %endif
154 %else
155 %if %2 == 1 ; avg
156 cglobal highbd_sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
157 7 + 2 * ARCH_X86_64, 13, src, src_stride, \
158 x_offset, y_offset, \
159 dst, dst_stride, \
160 sec, sec_stride, \
161 height, sse
162 %if ARCH_X86_64
Johann124ada52015-07-28 14:00:32 -0700163 %define block_height heightd
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100164 %define sec_str sec_strideq
165 %else
Johann124ada52015-07-28 14:00:32 -0700166 %define block_height dword heightm
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100167 %define sec_str sec_stridemp
168 %endif
169 %else
170 cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
171 x_offset, y_offset, dst, dst_stride, height, sse
Johann124ada52015-07-28 14:00:32 -0700172 %define block_height heightd
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100173 %endif
174
175 %define bilin_filter bilin_filter_m
176 %endif
177%endif
178
179 ASSERT %1 <= 16 ; m6 overflows if w > 16
180 pxor m6, m6 ; sum
181 pxor m7, m7 ; sse
182
183%if %1 < 16
Johann124ada52015-07-28 14:00:32 -0700184 sar block_height, 1
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100185%endif
Yunqing Wang789ae442015-02-04 12:02:06 -0800186%if %2 == 1 ; avg
187 shl sec_str, 1
188%endif
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100189
190 ; FIXME(rbultje) replace by jumptable?
191 test x_offsetd, x_offsetd
192 jnz .x_nonzero
193 ; x_offset == 0
194 test y_offsetd, y_offsetd
195 jnz .x_zero_y_nonzero
196
197 ; x_offset == 0 && y_offset == 0
198.x_zero_y_zero_loop:
199%if %1 == 16
200 movu m0, [srcq]
201 movu m2, [srcq + 16]
202 mova m1, [dstq]
203 mova m3, [dstq + 16]
204%if %2 == 1 ; avg
205 pavgw m0, [secq]
206 pavgw m2, [secq+16]
207%endif
208 SUM_SSE m0, m1, m2, m3, m6, m7
209
210 lea srcq, [srcq + src_strideq*2]
211 lea dstq, [dstq + dst_strideq*2]
212%if %2 == 1 ; avg
Yunqing Wang789ae442015-02-04 12:02:06 -0800213 add secq, sec_str
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100214%endif
215%else ; %1 < 16
216 movu m0, [srcq]
217 movu m2, [srcq + src_strideq*2]
218 mova m1, [dstq]
219 mova m3, [dstq + dst_strideq*2]
220%if %2 == 1 ; avg
221 pavgw m0, [secq]
Yunqing Wang789ae442015-02-04 12:02:06 -0800222 add secq, sec_str
223 pavgw m2, [secq]
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100224%endif
225 SUM_SSE m0, m1, m2, m3, m6, m7
226
227 lea srcq, [srcq + src_strideq*4]
228 lea dstq, [dstq + dst_strideq*4]
229%if %2 == 1 ; avg
Yunqing Wang789ae442015-02-04 12:02:06 -0800230 add secq, sec_str
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100231%endif
232%endif
Johann124ada52015-07-28 14:00:32 -0700233 dec block_height
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100234 jg .x_zero_y_zero_loop
235 STORE_AND_RET
236
237.x_zero_y_nonzero:
238 cmp y_offsetd, 8
239 jne .x_zero_y_nonhalf
240
241 ; x_offset == 0 && y_offset == 0.5
242.x_zero_y_half_loop:
243%if %1 == 16
244 movu m0, [srcq]
245 movu m1, [srcq+16]
246 movu m4, [srcq+src_strideq*2]
247 movu m5, [srcq+src_strideq*2+16]
248 mova m2, [dstq]
249 mova m3, [dstq+16]
250 pavgw m0, m4
251 pavgw m1, m5
252%if %2 == 1 ; avg
253 pavgw m0, [secq]
254 pavgw m1, [secq+16]
255%endif
256 SUM_SSE m0, m2, m1, m3, m6, m7
257
258 lea srcq, [srcq + src_strideq*2]
259 lea dstq, [dstq + dst_strideq*2]
260%if %2 == 1 ; avg
Yunqing Wang789ae442015-02-04 12:02:06 -0800261 add secq, sec_str
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100262%endif
263%else ; %1 < 16
264 movu m0, [srcq]
265 movu m1, [srcq+src_strideq*2]
266 movu m5, [srcq+src_strideq*4]
267 mova m2, [dstq]
268 mova m3, [dstq+dst_strideq*2]
269 pavgw m0, m1
270 pavgw m1, m5
271%if %2 == 1 ; avg
272 pavgw m0, [secq]
Yunqing Wang789ae442015-02-04 12:02:06 -0800273 add secq, sec_str
274 pavgw m1, [secq]
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100275%endif
276 SUM_SSE m0, m2, m1, m3, m6, m7
277
278 lea srcq, [srcq + src_strideq*4]
279 lea dstq, [dstq + dst_strideq*4]
280%if %2 == 1 ; avg
Yunqing Wang789ae442015-02-04 12:02:06 -0800281 add secq, sec_str
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100282%endif
283%endif
Johann124ada52015-07-28 14:00:32 -0700284 dec block_height
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100285 jg .x_zero_y_half_loop
286 STORE_AND_RET
287
288.x_zero_y_nonhalf:
289 ; x_offset == 0 && y_offset == bilin interpolation
290%ifdef PIC
291 lea bilin_filter, [bilin_filter_m]
292%endif
293 shl y_offsetd, filter_idx_shift
294%if ARCH_X86_64 && mmsize == 16
295 mova m8, [bilin_filter+y_offsetq]
296 mova m9, [bilin_filter+y_offsetq+16]
297 mova m10, [pw_8]
298%define filter_y_a m8
299%define filter_y_b m9
300%define filter_rnd m10
301%else ; x86-32 or mmx
302%if ARCH_X86=1 && CONFIG_PIC=1
303; x_offset == 0, reuse x_offset reg
304%define tempq x_offsetq
305 add y_offsetq, g_bilin_filterm
306%define filter_y_a [y_offsetq]
307%define filter_y_b [y_offsetq+16]
308 mov tempq, g_pw_8m
309%define filter_rnd [tempq]
310%else
311 add y_offsetq, bilin_filter
312%define filter_y_a [y_offsetq]
313%define filter_y_b [y_offsetq+16]
314%define filter_rnd [pw_8]
315%endif
316%endif
317
318.x_zero_y_other_loop:
319%if %1 == 16
320 movu m0, [srcq]
321 movu m1, [srcq + 16]
322 movu m4, [srcq+src_strideq*2]
323 movu m5, [srcq+src_strideq*2+16]
324 mova m2, [dstq]
325 mova m3, [dstq+16]
326 ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
327 ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
328 ; instructions is the same (5), but it is 1 mul instead of 2, so might be
329 ; slightly faster because of pmullw latency. It would also cut our rodata
330 ; tables in half for this function, and save 1-2 registers on x86-64.
331 pmullw m1, filter_y_a
332 pmullw m5, filter_y_b
333 paddw m1, filter_rnd
334 pmullw m0, filter_y_a
335 pmullw m4, filter_y_b
336 paddw m0, filter_rnd
337 paddw m1, m5
338 paddw m0, m4
339 psrlw m1, 4
340 psrlw m0, 4
341%if %2 == 1 ; avg
342 pavgw m0, [secq]
343 pavgw m1, [secq+16]
344%endif
345 SUM_SSE m0, m2, m1, m3, m6, m7
346
347 lea srcq, [srcq + src_strideq*2]
348 lea dstq, [dstq + dst_strideq*2]
349%if %2 == 1 ; avg
Yunqing Wang789ae442015-02-04 12:02:06 -0800350 add secq, sec_str
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100351%endif
352%else ; %1 < 16
353 movu m0, [srcq]
354 movu m1, [srcq+src_strideq*2]
355 movu m5, [srcq+src_strideq*4]
356 mova m4, m1
357 mova m2, [dstq]
358 mova m3, [dstq+dst_strideq*2]
359 pmullw m1, filter_y_a
360 pmullw m5, filter_y_b
361 paddw m1, filter_rnd
362 pmullw m0, filter_y_a
363 pmullw m4, filter_y_b
364 paddw m0, filter_rnd
365 paddw m1, m5
366 paddw m0, m4
367 psrlw m1, 4
368 psrlw m0, 4
369%if %2 == 1 ; avg
370 pavgw m0, [secq]
Yunqing Wang789ae442015-02-04 12:02:06 -0800371 add secq, sec_str
372 pavgw m1, [secq]
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100373%endif
374 SUM_SSE m0, m2, m1, m3, m6, m7
375
376 lea srcq, [srcq + src_strideq*4]
377 lea dstq, [dstq + dst_strideq*4]
378%if %2 == 1 ; avg
Yunqing Wang789ae442015-02-04 12:02:06 -0800379 add secq, sec_str
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100380%endif
381%endif
Johann124ada52015-07-28 14:00:32 -0700382 dec block_height
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100383 jg .x_zero_y_other_loop
384%undef filter_y_a
385%undef filter_y_b
386%undef filter_rnd
387 STORE_AND_RET
388
389.x_nonzero:
390 cmp x_offsetd, 8
391 jne .x_nonhalf
392 ; x_offset == 0.5
393 test y_offsetd, y_offsetd
394 jnz .x_half_y_nonzero
395
396 ; x_offset == 0.5 && y_offset == 0
397.x_half_y_zero_loop:
398%if %1 == 16
399 movu m0, [srcq]
400 movu m1, [srcq + 16]
401 movu m4, [srcq + 2]
402 movu m5, [srcq + 18]
403 mova m2, [dstq]
404 mova m3, [dstq + 16]
405 pavgw m0, m4
406 pavgw m1, m5
407%if %2 == 1 ; avg
408 pavgw m0, [secq]
409 pavgw m1, [secq+16]
410%endif
411 SUM_SSE m0, m2, m1, m3, m6, m7
412
413 lea srcq, [srcq + src_strideq*2]
414 lea dstq, [dstq + dst_strideq*2]
415%if %2 == 1 ; avg
Yunqing Wang789ae442015-02-04 12:02:06 -0800416 add secq, sec_str
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100417%endif
418%else ; %1 < 16
419 movu m0, [srcq]
420 movu m1, [srcq + src_strideq*2]
421 movu m4, [srcq + 2]
422 movu m5, [srcq + src_strideq*2 + 2]
423 mova m2, [dstq]
424 mova m3, [dstq + dst_strideq*2]
425 pavgw m0, m4
426 pavgw m1, m5
427%if %2 == 1 ; avg
428 pavgw m0, [secq]
Yunqing Wang789ae442015-02-04 12:02:06 -0800429 add secq, sec_str
430 pavgw m1, [secq]
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100431%endif
432 SUM_SSE m0, m2, m1, m3, m6, m7
433
434 lea srcq, [srcq + src_strideq*4]
435 lea dstq, [dstq + dst_strideq*4]
436%if %2 == 1 ; avg
Yunqing Wang789ae442015-02-04 12:02:06 -0800437 add secq, sec_str
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100438%endif
439%endif
Johann124ada52015-07-28 14:00:32 -0700440 dec block_height
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100441 jg .x_half_y_zero_loop
442 STORE_AND_RET
443
444.x_half_y_nonzero:
445 cmp y_offsetd, 8
446 jne .x_half_y_nonhalf
447
448 ; x_offset == 0.5 && y_offset == 0.5
449%if %1 == 16
450 movu m0, [srcq]
451 movu m1, [srcq+16]
452 movu m2, [srcq+2]
453 movu m3, [srcq+18]
454 lea srcq, [srcq + src_strideq*2]
455 pavgw m0, m2
456 pavgw m1, m3
457.x_half_y_half_loop:
458 movu m2, [srcq]
459 movu m3, [srcq + 16]
460 movu m4, [srcq + 2]
461 movu m5, [srcq + 18]
462 pavgw m2, m4
463 pavgw m3, m5
464 pavgw m0, m2
465 pavgw m1, m3
466 mova m4, [dstq]
467 mova m5, [dstq + 16]
468%if %2 == 1 ; avg
469 pavgw m0, [secq]
470 pavgw m1, [secq+16]
471%endif
472 SUM_SSE m0, m4, m1, m5, m6, m7
473 mova m0, m2
474 mova m1, m3
475
476 lea srcq, [srcq + src_strideq*2]
477 lea dstq, [dstq + dst_strideq*2]
478%if %2 == 1 ; avg
Yunqing Wang789ae442015-02-04 12:02:06 -0800479 add secq, sec_str
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100480%endif
481%else ; %1 < 16
482 movu m0, [srcq]
483 movu m2, [srcq+2]
484 lea srcq, [srcq + src_strideq*2]
485 pavgw m0, m2
486.x_half_y_half_loop:
487 movu m2, [srcq]
488 movu m3, [srcq + src_strideq*2]
489 movu m4, [srcq + 2]
490 movu m5, [srcq + src_strideq*2 + 2]
491 pavgw m2, m4
492 pavgw m3, m5
493 pavgw m0, m2
494 pavgw m2, m3
495 mova m4, [dstq]
496 mova m5, [dstq + dst_strideq*2]
497%if %2 == 1 ; avg
498 pavgw m0, [secq]
Yunqing Wang789ae442015-02-04 12:02:06 -0800499 add secq, sec_str
500 pavgw m2, [secq]
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100501%endif
502 SUM_SSE m0, m4, m2, m5, m6, m7
503 mova m0, m3
504
505 lea srcq, [srcq + src_strideq*4]
506 lea dstq, [dstq + dst_strideq*4]
507%if %2 == 1 ; avg
Yunqing Wang789ae442015-02-04 12:02:06 -0800508 add secq, sec_str
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100509%endif
510%endif
Johann124ada52015-07-28 14:00:32 -0700511 dec block_height
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100512 jg .x_half_y_half_loop
513 STORE_AND_RET
514
515.x_half_y_nonhalf:
516 ; x_offset == 0.5 && y_offset == bilin interpolation
517%ifdef PIC
518 lea bilin_filter, [bilin_filter_m]
519%endif
520 shl y_offsetd, filter_idx_shift
521%if ARCH_X86_64 && mmsize == 16
522 mova m8, [bilin_filter+y_offsetq]
523 mova m9, [bilin_filter+y_offsetq+16]
524 mova m10, [pw_8]
525%define filter_y_a m8
526%define filter_y_b m9
527%define filter_rnd m10
528%else ; x86_32
529%if ARCH_X86=1 && CONFIG_PIC=1
530; x_offset == 0.5. We can reuse x_offset reg
531%define tempq x_offsetq
532 add y_offsetq, g_bilin_filterm
533%define filter_y_a [y_offsetq]
534%define filter_y_b [y_offsetq+16]
535 mov tempq, g_pw_8m
536%define filter_rnd [tempq]
537%else
538 add y_offsetq, bilin_filter
539%define filter_y_a [y_offsetq]
540%define filter_y_b [y_offsetq+16]
541%define filter_rnd [pw_8]
542%endif
543%endif
544
545%if %1 == 16
546 movu m0, [srcq]
547 movu m1, [srcq+16]
548 movu m2, [srcq+2]
549 movu m3, [srcq+18]
550 lea srcq, [srcq + src_strideq*2]
551 pavgw m0, m2
552 pavgw m1, m3
553.x_half_y_other_loop:
554 movu m2, [srcq]
555 movu m3, [srcq+16]
556 movu m4, [srcq+2]
557 movu m5, [srcq+18]
558 pavgw m2, m4
559 pavgw m3, m5
560 mova m4, m2
561 mova m5, m3
562 pmullw m1, filter_y_a
563 pmullw m3, filter_y_b
564 paddw m1, filter_rnd
565 paddw m1, m3
566 pmullw m0, filter_y_a
567 pmullw m2, filter_y_b
568 paddw m0, filter_rnd
569 psrlw m1, 4
570 paddw m0, m2
571 mova m2, [dstq]
572 psrlw m0, 4
573 mova m3, [dstq+16]
574%if %2 == 1 ; avg
575 pavgw m0, [secq]
576 pavgw m1, [secq+16]
577%endif
578 SUM_SSE m0, m2, m1, m3, m6, m7
579 mova m0, m4
580 mova m1, m5
581
582 lea srcq, [srcq + src_strideq*2]
583 lea dstq, [dstq + dst_strideq*2]
584%if %2 == 1 ; avg
Yunqing Wang789ae442015-02-04 12:02:06 -0800585 add secq, sec_str
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100586%endif
587%else ; %1 < 16
588 movu m0, [srcq]
589 movu m2, [srcq+2]
590 lea srcq, [srcq + src_strideq*2]
591 pavgw m0, m2
592.x_half_y_other_loop:
593 movu m2, [srcq]
594 movu m3, [srcq+src_strideq*2]
595 movu m4, [srcq+2]
596 movu m5, [srcq+src_strideq*2+2]
597 pavgw m2, m4
598 pavgw m3, m5
599 mova m4, m2
600 mova m5, m3
601 pmullw m4, filter_y_a
602 pmullw m3, filter_y_b
603 paddw m4, filter_rnd
604 paddw m4, m3
605 pmullw m0, filter_y_a
606 pmullw m2, filter_y_b
607 paddw m0, filter_rnd
608 psrlw m4, 4
609 paddw m0, m2
610 mova m2, [dstq]
611 psrlw m0, 4
612 mova m3, [dstq+dst_strideq*2]
613%if %2 == 1 ; avg
614 pavgw m0, [secq]
Yunqing Wang789ae442015-02-04 12:02:06 -0800615 add secq, sec_str
616 pavgw m4, [secq]
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100617%endif
618 SUM_SSE m0, m2, m4, m3, m6, m7
619 mova m0, m5
620
621 lea srcq, [srcq + src_strideq*4]
622 lea dstq, [dstq + dst_strideq*4]
623%if %2 == 1 ; avg
Yunqing Wang789ae442015-02-04 12:02:06 -0800624 add secq, sec_str
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100625%endif
626%endif
Johann124ada52015-07-28 14:00:32 -0700627 dec block_height
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100628 jg .x_half_y_other_loop
629%undef filter_y_a
630%undef filter_y_b
631%undef filter_rnd
632 STORE_AND_RET
633
634.x_nonhalf:
635 test y_offsetd, y_offsetd
636 jnz .x_nonhalf_y_nonzero
637
638 ; x_offset == bilin interpolation && y_offset == 0
639%ifdef PIC
640 lea bilin_filter, [bilin_filter_m]
641%endif
642 shl x_offsetd, filter_idx_shift
643%if ARCH_X86_64 && mmsize == 16
644 mova m8, [bilin_filter+x_offsetq]
645 mova m9, [bilin_filter+x_offsetq+16]
646 mova m10, [pw_8]
647%define filter_x_a m8
648%define filter_x_b m9
649%define filter_rnd m10
650%else ; x86-32
651%if ARCH_X86=1 && CONFIG_PIC=1
652; y_offset == 0. We can reuse y_offset reg.
653%define tempq y_offsetq
654 add x_offsetq, g_bilin_filterm
655%define filter_x_a [x_offsetq]
656%define filter_x_b [x_offsetq+16]
657 mov tempq, g_pw_8m
658%define filter_rnd [tempq]
659%else
660 add x_offsetq, bilin_filter
661%define filter_x_a [x_offsetq]
662%define filter_x_b [x_offsetq+16]
663%define filter_rnd [pw_8]
664%endif
665%endif
666
667.x_other_y_zero_loop:
668%if %1 == 16
669 movu m0, [srcq]
670 movu m1, [srcq+16]
671 movu m2, [srcq+2]
672 movu m3, [srcq+18]
673 mova m4, [dstq]
674 mova m5, [dstq+16]
675 pmullw m1, filter_x_a
676 pmullw m3, filter_x_b
677 paddw m1, filter_rnd
678 pmullw m0, filter_x_a
679 pmullw m2, filter_x_b
680 paddw m0, filter_rnd
681 paddw m1, m3
682 paddw m0, m2
683 psrlw m1, 4
684 psrlw m0, 4
685%if %2 == 1 ; avg
686 pavgw m0, [secq]
687 pavgw m1, [secq+16]
688%endif
689 SUM_SSE m0, m4, m1, m5, m6, m7
690
691 lea srcq, [srcq+src_strideq*2]
692 lea dstq, [dstq+dst_strideq*2]
693%if %2 == 1 ; avg
Yunqing Wang789ae442015-02-04 12:02:06 -0800694 add secq, sec_str
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100695%endif
696%else ; %1 < 16
697 movu m0, [srcq]
698 movu m1, [srcq+src_strideq*2]
699 movu m2, [srcq+2]
700 movu m3, [srcq+src_strideq*2+2]
701 mova m4, [dstq]
702 mova m5, [dstq+dst_strideq*2]
703 pmullw m1, filter_x_a
704 pmullw m3, filter_x_b
705 paddw m1, filter_rnd
706 pmullw m0, filter_x_a
707 pmullw m2, filter_x_b
708 paddw m0, filter_rnd
709 paddw m1, m3
710 paddw m0, m2
711 psrlw m1, 4
712 psrlw m0, 4
713%if %2 == 1 ; avg
714 pavgw m0, [secq]
Yunqing Wang789ae442015-02-04 12:02:06 -0800715 add secq, sec_str
716 pavgw m1, [secq]
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100717%endif
718 SUM_SSE m0, m4, m1, m5, m6, m7
719
720 lea srcq, [srcq+src_strideq*4]
721 lea dstq, [dstq+dst_strideq*4]
722%if %2 == 1 ; avg
Yunqing Wang789ae442015-02-04 12:02:06 -0800723 add secq, sec_str
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100724%endif
725%endif
Johann124ada52015-07-28 14:00:32 -0700726 dec block_height
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100727 jg .x_other_y_zero_loop
728%undef filter_x_a
729%undef filter_x_b
730%undef filter_rnd
731 STORE_AND_RET
732
733.x_nonhalf_y_nonzero:
734 cmp y_offsetd, 8
735 jne .x_nonhalf_y_nonhalf
736
737 ; x_offset == bilin interpolation && y_offset == 0.5
738%ifdef PIC
739 lea bilin_filter, [bilin_filter_m]
740%endif
741 shl x_offsetd, filter_idx_shift
742%if ARCH_X86_64 && mmsize == 16
743 mova m8, [bilin_filter+x_offsetq]
744 mova m9, [bilin_filter+x_offsetq+16]
745 mova m10, [pw_8]
746%define filter_x_a m8
747%define filter_x_b m9
748%define filter_rnd m10
749%else ; x86-32
750%if ARCH_X86=1 && CONFIG_PIC=1
751; y_offset == 0.5. We can reuse y_offset reg.
752%define tempq y_offsetq
753 add x_offsetq, g_bilin_filterm
754%define filter_x_a [x_offsetq]
755%define filter_x_b [x_offsetq+16]
756 mov tempq, g_pw_8m
757%define filter_rnd [tempq]
758%else
759 add x_offsetq, bilin_filter
760%define filter_x_a [x_offsetq]
761%define filter_x_b [x_offsetq+16]
762%define filter_rnd [pw_8]
763%endif
764%endif
765
766%if %1 == 16
767 movu m0, [srcq]
768 movu m1, [srcq+16]
769 movu m2, [srcq+2]
770 movu m3, [srcq+18]
771 pmullw m0, filter_x_a
772 pmullw m2, filter_x_b
773 paddw m0, filter_rnd
774 pmullw m1, filter_x_a
775 pmullw m3, filter_x_b
776 paddw m1, filter_rnd
777 paddw m0, m2
778 paddw m1, m3
779 psrlw m0, 4
780 psrlw m1, 4
781 lea srcq, [srcq+src_strideq*2]
782.x_other_y_half_loop:
783 movu m2, [srcq]
784 movu m3, [srcq+16]
785 movu m4, [srcq+2]
786 movu m5, [srcq+18]
787 pmullw m2, filter_x_a
788 pmullw m4, filter_x_b
789 paddw m2, filter_rnd
790 pmullw m3, filter_x_a
791 pmullw m5, filter_x_b
792 paddw m3, filter_rnd
793 paddw m2, m4
794 paddw m3, m5
795 mova m4, [dstq]
796 mova m5, [dstq+16]
797 psrlw m2, 4
798 psrlw m3, 4
799 pavgw m0, m2
800 pavgw m1, m3
801%if %2 == 1 ; avg
802 pavgw m0, [secq]
803 pavgw m1, [secq+16]
804%endif
805 SUM_SSE m0, m4, m1, m5, m6, m7
806 mova m0, m2
807 mova m1, m3
808
809 lea srcq, [srcq+src_strideq*2]
810 lea dstq, [dstq+dst_strideq*2]
811%if %2 == 1 ; avg
Yunqing Wang789ae442015-02-04 12:02:06 -0800812 add secq, sec_str
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100813%endif
814%else ; %1 < 16
815 movu m0, [srcq]
816 movu m2, [srcq+2]
817 pmullw m0, filter_x_a
818 pmullw m2, filter_x_b
819 paddw m0, filter_rnd
820 paddw m0, m2
821 psrlw m0, 4
822 lea srcq, [srcq+src_strideq*2]
823.x_other_y_half_loop:
824 movu m2, [srcq]
825 movu m3, [srcq+src_strideq*2]
826 movu m4, [srcq+2]
827 movu m5, [srcq+src_strideq*2+2]
828 pmullw m2, filter_x_a
829 pmullw m4, filter_x_b
830 paddw m2, filter_rnd
831 pmullw m3, filter_x_a
832 pmullw m5, filter_x_b
833 paddw m3, filter_rnd
834 paddw m2, m4
835 paddw m3, m5
836 mova m4, [dstq]
837 mova m5, [dstq+dst_strideq*2]
838 psrlw m2, 4
839 psrlw m3, 4
840 pavgw m0, m2
841 pavgw m2, m3
842%if %2 == 1 ; avg
843 pavgw m0, [secq]
Yunqing Wang789ae442015-02-04 12:02:06 -0800844 add secq, sec_str
845 pavgw m2, [secq]
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100846%endif
847 SUM_SSE m0, m4, m2, m5, m6, m7
848 mova m0, m3
849
850 lea srcq, [srcq+src_strideq*4]
851 lea dstq, [dstq+dst_strideq*4]
852%if %2 == 1 ; avg
Yunqing Wang789ae442015-02-04 12:02:06 -0800853 add secq, sec_str
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100854%endif
855%endif
Johann124ada52015-07-28 14:00:32 -0700856 dec block_height
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100857 jg .x_other_y_half_loop
858%undef filter_x_a
859%undef filter_x_b
860%undef filter_rnd
861 STORE_AND_RET
862
863.x_nonhalf_y_nonhalf:
864; loading filter - this is same as in 8-bit depth
865%ifdef PIC
866 lea bilin_filter, [bilin_filter_m]
867%endif
868 shl x_offsetd, filter_idx_shift ; filter_idx_shift = 5
869 shl y_offsetd, filter_idx_shift
870%if ARCH_X86_64 && mmsize == 16
871 mova m8, [bilin_filter+x_offsetq]
872 mova m9, [bilin_filter+x_offsetq+16]
873 mova m10, [bilin_filter+y_offsetq]
874 mova m11, [bilin_filter+y_offsetq+16]
875 mova m12, [pw_8]
876%define filter_x_a m8
877%define filter_x_b m9
878%define filter_y_a m10
879%define filter_y_b m11
880%define filter_rnd m12
881%else ; x86-32
882%if ARCH_X86=1 && CONFIG_PIC=1
883; In this case, there is NO unused register. Used src_stride register. Later,
884; src_stride has to be loaded from stack when it is needed.
885%define tempq src_strideq
886 mov tempq, g_bilin_filterm
887 add x_offsetq, tempq
888 add y_offsetq, tempq
889%define filter_x_a [x_offsetq]
890%define filter_x_b [x_offsetq+16]
891%define filter_y_a [y_offsetq]
892%define filter_y_b [y_offsetq+16]
893
894 mov tempq, g_pw_8m
895%define filter_rnd [tempq]
896%else
897 add x_offsetq, bilin_filter
898 add y_offsetq, bilin_filter
899%define filter_x_a [x_offsetq]
900%define filter_x_b [x_offsetq+16]
901%define filter_y_a [y_offsetq]
902%define filter_y_b [y_offsetq+16]
903%define filter_rnd [pw_8]
904%endif
905%endif
906; end of load filter
907
908 ; x_offset == bilin interpolation && y_offset == bilin interpolation
909%if %1 == 16
910 movu m0, [srcq]
911 movu m2, [srcq+2]
912 movu m1, [srcq+16]
913 movu m3, [srcq+18]
914 pmullw m0, filter_x_a
915 pmullw m2, filter_x_b
916 paddw m0, filter_rnd
917 pmullw m1, filter_x_a
918 pmullw m3, filter_x_b
919 paddw m1, filter_rnd
920 paddw m0, m2
921 paddw m1, m3
922 psrlw m0, 4
923 psrlw m1, 4
924
925 INC_SRC_BY_SRC_STRIDE
926
927.x_other_y_other_loop:
928 movu m2, [srcq]
929 movu m4, [srcq+2]
930 movu m3, [srcq+16]
931 movu m5, [srcq+18]
932 pmullw m2, filter_x_a
933 pmullw m4, filter_x_b
934 paddw m2, filter_rnd
935 pmullw m3, filter_x_a
936 pmullw m5, filter_x_b
937 paddw m3, filter_rnd
938 paddw m2, m4
939 paddw m3, m5
940 psrlw m2, 4
941 psrlw m3, 4
942 mova m4, m2
943 mova m5, m3
944 pmullw m0, filter_y_a
945 pmullw m2, filter_y_b
946 paddw m0, filter_rnd
947 pmullw m1, filter_y_a
948 pmullw m3, filter_y_b
949 paddw m0, m2
950 paddw m1, filter_rnd
951 mova m2, [dstq]
952 paddw m1, m3
953 psrlw m0, 4
954 psrlw m1, 4
955 mova m3, [dstq+16]
956%if %2 == 1 ; avg
957 pavgw m0, [secq]
958 pavgw m1, [secq+16]
959%endif
960 SUM_SSE m0, m2, m1, m3, m6, m7
961 mova m0, m4
962 mova m1, m5
963
964 INC_SRC_BY_SRC_STRIDE
965 lea dstq, [dstq + dst_strideq * 2]
966%if %2 == 1 ; avg
Yunqing Wang789ae442015-02-04 12:02:06 -0800967 add secq, sec_str
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100968%endif
969%else ; %1 < 16
970 movu m0, [srcq]
971 movu m2, [srcq+2]
972 pmullw m0, filter_x_a
973 pmullw m2, filter_x_b
974 paddw m0, filter_rnd
975 paddw m0, m2
976 psrlw m0, 4
977
978 INC_SRC_BY_SRC_STRIDE
979
980.x_other_y_other_loop:
981 movu m2, [srcq]
982 movu m4, [srcq+2]
Peter de Rivaz7361ef72015-12-14 16:35:29 +0000983 INC_SRC_BY_SRC_STRIDE
984 movu m3, [srcq]
985 movu m5, [srcq+2]
Peter de Rivaz48032bf2014-10-16 14:00:54 +0100986 pmullw m2, filter_x_a
987 pmullw m4, filter_x_b
988 paddw m2, filter_rnd
989 pmullw m3, filter_x_a
990 pmullw m5, filter_x_b
991 paddw m3, filter_rnd
992 paddw m2, m4
993 paddw m3, m5
994 psrlw m2, 4
995 psrlw m3, 4
996 mova m4, m2
997 mova m5, m3
998 pmullw m0, filter_y_a
999 pmullw m2, filter_y_b
1000 paddw m0, filter_rnd
1001 pmullw m4, filter_y_a
1002 pmullw m3, filter_y_b
1003 paddw m0, m2
1004 paddw m4, filter_rnd
1005 mova m2, [dstq]
1006 paddw m4, m3
1007 psrlw m0, 4
1008 psrlw m4, 4
1009 mova m3, [dstq+dst_strideq*2]
1010%if %2 == 1 ; avg
1011 pavgw m0, [secq]
Yunqing Wang789ae442015-02-04 12:02:06 -08001012 add secq, sec_str
1013 pavgw m4, [secq]
Peter de Rivaz48032bf2014-10-16 14:00:54 +01001014%endif
1015 SUM_SSE m0, m2, m4, m3, m6, m7
1016 mova m0, m5
1017
Peter de Rivaz7361ef72015-12-14 16:35:29 +00001018 INC_SRC_BY_SRC_STRIDE
Peter de Rivaz48032bf2014-10-16 14:00:54 +01001019 lea dstq, [dstq + dst_strideq * 4]
1020%if %2 == 1 ; avg
Yunqing Wang789ae442015-02-04 12:02:06 -08001021 add secq, sec_str
Peter de Rivaz48032bf2014-10-16 14:00:54 +01001022%endif
1023%endif
Johann124ada52015-07-28 14:00:32 -07001024 dec block_height
Peter de Rivaz48032bf2014-10-16 14:00:54 +01001025 jg .x_other_y_other_loop
1026%undef filter_x_a
1027%undef filter_x_b
1028%undef filter_y_a
1029%undef filter_y_b
1030%undef filter_rnd
1031 STORE_AND_RET
1032%endmacro
1033
1034INIT_XMM sse2
1035SUBPEL_VARIANCE 8
1036SUBPEL_VARIANCE 16
1037
1038INIT_XMM sse2
1039SUBPEL_VARIANCE 8, 1
1040SUBPEL_VARIANCE 16, 1