| ; |
| ; Copyright (c) 2012 The WebM project authors. All Rights Reserved. |
| ; |
| ; Use of this source code is governed by a BSD-style license |
| ; that can be found in the LICENSE file in the root of the source |
| ; tree. An additional intellectual property rights grant can be found |
| ; in the file PATENTS. All contributing project authors may |
| ; be found in the AUTHORS file in the root of the source tree. |
| ; |
| |
| |
| %include "vpx_ports/x86_abi_support.asm" |
| |
| ;void vp8_filter_by_weight16x16_sse2 |
| ;( |
| ; unsigned char *src, |
| ; int src_stride, |
| ; unsigned char *dst, |
| ; int dst_stride, |
| ; int src_weight |
| ;) |
| global sym(vp8_filter_by_weight16x16_sse2) PRIVATE |
| sym(vp8_filter_by_weight16x16_sse2): |
| push rbp |
| mov rbp, rsp |
| SHADOW_ARGS_TO_STACK 5 |
| SAVE_XMM 6 |
| GET_GOT rbx |
| push rsi |
| push rdi |
| ; end prolog |
| |
| movd xmm0, arg(4) ; src_weight |
| pshuflw xmm0, xmm0, 0x0 ; replicate to all low words |
| punpcklqdq xmm0, xmm0 ; replicate to all hi words |
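; xmm0 now holds src_weight broadcast to all eight words, e.g. w -> [w w w w w w w w]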
| |
| movdqa xmm1, [GLOBAL(tMFQE)] |
| psubw xmm1, xmm0 ; dst_weight |
| |
| mov rax, arg(0) ; src |
| mov rsi, arg(1) ; src_stride |
| mov rdx, arg(2) ; dst |
| mov rdi, arg(3) ; dst_stride |
| |
| mov rcx, 16 ; loop count |
pxor xmm6, xmm6 ; zero register for unpacking bytes to words
| |
| .combine: |
| movdqa xmm2, [rax] |
| movdqa xmm4, [rdx] |
| add rax, rsi |
| |
| ; src * src_weight |
| movdqa xmm3, xmm2 |
| punpcklbw xmm2, xmm6 |
| punpckhbw xmm3, xmm6 |
| pmullw xmm2, xmm0 |
| pmullw xmm3, xmm0 |
| |
| ; dst * dst_weight |
| movdqa xmm5, xmm4 |
| punpcklbw xmm4, xmm6 |
| punpckhbw xmm5, xmm6 |
| pmullw xmm4, xmm1 |
| pmullw xmm5, xmm1 |
| |
| ; sum, round and shift |
| paddw xmm2, xmm4 |
| paddw xmm3, xmm5 |
| paddw xmm2, [GLOBAL(tMFQE_round)] |
| paddw xmm3, [GLOBAL(tMFQE_round)] |
| psrlw xmm2, 4 |
| psrlw xmm3, 4 |
| |
| packuswb xmm2, xmm3 |
| movdqa [rdx], xmm2 |
| add rdx, rdi |
| |
| dec rcx |
| jnz .combine |
| |
| ; begin epilog |
| pop rdi |
| pop rsi |
| RESTORE_GOT |
| RESTORE_XMM |
| UNSHADOW_ARGS |
| pop rbp |
| |
| ret |
| |
| ;void vp8_filter_by_weight8x8_sse2 |
| ;( |
| ; unsigned char *src, |
| ; int src_stride, |
| ; unsigned char *dst, |
| ; int dst_stride, |
| ; int src_weight |
| ;) |
| global sym(vp8_filter_by_weight8x8_sse2) PRIVATE |
| sym(vp8_filter_by_weight8x8_sse2): |
| push rbp |
| mov rbp, rsp |
| SHADOW_ARGS_TO_STACK 5 |
| GET_GOT rbx |
| push rsi |
| push rdi |
| ; end prolog |
| |
| movd xmm0, arg(4) ; src_weight |
| pshuflw xmm0, xmm0, 0x0 ; replicate to all low words |
| punpcklqdq xmm0, xmm0 ; replicate to all hi words |
| |
| movdqa xmm1, [GLOBAL(tMFQE)] |
| psubw xmm1, xmm0 ; dst_weight |
| |
| mov rax, arg(0) ; src |
| mov rsi, arg(1) ; src_stride |
| mov rdx, arg(2) ; dst |
| mov rdi, arg(3) ; dst_stride |
| |
| mov rcx, 8 ; loop count |
pxor xmm4, xmm4 ; zero register for byte/word unpacking and packing
| |
| .combine: |
| movq xmm2, [rax] |
| movq xmm3, [rdx] |
| add rax, rsi |
| |
| ; src * src_weight |
| punpcklbw xmm2, xmm4 |
| pmullw xmm2, xmm0 |
| |
| ; dst * dst_weight |
| punpcklbw xmm3, xmm4 |
| pmullw xmm3, xmm1 |
| |
| ; sum, round and shift |
| paddw xmm2, xmm3 |
| paddw xmm2, [GLOBAL(tMFQE_round)] |
| psrlw xmm2, 4 |
| |
| packuswb xmm2, xmm4 |
| movq [rdx], xmm2 |
| add rdx, rdi |
| |
| dec rcx |
| jnz .combine |
| |
| ; begin epilog |
| pop rdi |
| pop rsi |
| RESTORE_GOT |
| UNSHADOW_ARGS |
| pop rbp |
| |
| ret |
| |
| ;void vp8_variance_and_sad_16x16_sse2 | arg |
| ;( |
| ; unsigned char *src1, 0 |
| ; int stride1, 1 |
| ; unsigned char *src2, 2 |
| ; int stride2, 3 |
| ; unsigned int *variance, 4 |
| ; unsigned int *sad, 5 |
| ;) |
| global sym(vp8_variance_and_sad_16x16_sse2) PRIVATE |
| sym(vp8_variance_and_sad_16x16_sse2): |
| push rbp |
| mov rbp, rsp |
| SHADOW_ARGS_TO_STACK 6 |
| GET_GOT rbx |
| push rsi |
| push rdi |
| ; end prolog |
| |
| mov rax, arg(0) ; src1 |
| mov rcx, arg(1) ; stride1 |
| mov rdx, arg(2) ; src2 |
| mov rdi, arg(3) ; stride2 |
| |
| mov rsi, 16 ; block height |
| |
| ; Prep accumulator registers |
| pxor xmm3, xmm3 ; SAD |
| pxor xmm4, xmm4 ; sum of src2 |
| pxor xmm5, xmm5 ; sum of src2^2 |
| |
| ; Because we're working with the actual output frames |
| ; we can't depend on any kind of data alignment. |
| .accumulate: |
movdqu xmm0, [rax] ; src1
movdqu xmm1, [rdx] ; src2
| add rax, rcx ; src1 + stride1 |
| add rdx, rdi ; src2 + stride2 |
| |
| ; SAD(src1, src2) |
| psadbw xmm0, xmm1 |
| paddusw xmm3, xmm0 |
| |
| ; SUM(src2) |
| pxor xmm2, xmm2 |
| psadbw xmm2, xmm1 ; sum src2 by misusing SAD against 0 |
| paddusw xmm4, xmm2 |
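; (psadbw against a zeroed register leaves the sum of each group of eight
; bytes in the low word of the corresponding quad word lane.)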
| |
; pmaddubsw would be ideal if it took two unsigned values. Instead it
; expects one unsigned and one signed operand, so zero-extend the bytes
; to words and use pmaddwd.
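; With xmm2 zeroed, the unpack + pmaddwd sequence squares each byte of src2
; and sums adjacent pairs, giving four double-word partial sums per register.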
| pxor xmm2, xmm2 |
| movdqa xmm0, xmm1 |
| punpcklbw xmm0, xmm2 |
| punpckhbw xmm1, xmm2 |
| pmaddwd xmm0, xmm0 |
| pmaddwd xmm1, xmm1 |
| paddd xmm5, xmm0 |
| paddd xmm5, xmm1 |
| |
| sub rsi, 1 |
| jnz .accumulate |
| |
; phaddd (SSSE3) only sums adjacent double words, so add the two halves of
; the accumulator manually.
; Finalize SAD: average over the 256 pixels with (sad + 128) >> 8, then store.
| movdqa xmm0, xmm3 |
| psrldq xmm0, 8 |
| paddusw xmm0, xmm3 |
| paddd xmm0, [GLOBAL(t128)] |
| psrld xmm0, 8 |
| |
| mov rax, arg(5) |
| movd [rax], xmm0 |
| |
| ; Accumulate sum of src2 |
| movdqa xmm0, xmm4 |
| psrldq xmm0, 8 |
| paddusw xmm0, xmm4 |
; Square the sum of src2 and divide by 256. pmuludq uses only the low
; double word of each quad word, and the product fits in 32 bits, so the
; high double word can be ignored.
| pmuludq xmm0, xmm0 |
| psrld xmm0, 8 |
| |
; phaddd could sum adjacent double words, but all four need to be summed.
; Widen the double words to quad words, accumulate, then add the two halves.
| pxor xmm2, xmm2 |
| movdqa xmm1, xmm5 |
| punpckldq xmm1, xmm2 |
| punpckhdq xmm5, xmm2 |
| paddd xmm1, xmm5 |
| movdqa xmm2, xmm1 |
| psrldq xmm1, 8 |
| paddd xmm1, xmm2 |
| |
| psubd xmm1, xmm0 |
| |
| ; (variance + 128) >> 8 |
| paddd xmm1, [GLOBAL(t128)] |
| psrld xmm1, 8 |
| mov rax, arg(4) |
| |
| movd [rax], xmm1 |
| |
| |
| ; begin epilog |
| pop rdi |
| pop rsi |
| RESTORE_GOT |
| UNSHADOW_ARGS |
| pop rbp |
| ret |
| |
| SECTION_RODATA |
| align 16 |
| t128: |
| %ifndef __NASM_VER__ |
| ddq 128 |
| %elif CONFIG_BIG_ENDIAN |
| dq 0, 128 |
| %else |
| dq 128, 0 |
| %endif |
| align 16 |
| tMFQE: ; 1 << MFQE_PRECISION |
| times 8 dw 0x10 |
| align 16 |
| tMFQE_round: ; 1 << (MFQE_PRECISION - 1) |
| times 8 dw 0x08 |
| |