| ; |
| ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
| ; |
| ; Use of this source code is governed by a BSD-style license |
| ; that can be found in the LICENSE file in the root of the source |
| ; tree. An additional intellectual property rights grant can be found |
| ; in the file PATENTS. All contributing project authors may |
| ; be found in the AUTHORS file in the root of the source tree. |
| ; |
| |
| |
| %include "vpx_ports/x86_abi_support.asm" |
| |
;-----------------------------------------------------------------------
; vp8_makemask_sse3 - build a 16x16 byte mask of pixels whose y, u and v
; samples all lie within a tolerance of a centre colour.
;
; Arguments, in the order the code reads them:
;     unsigned char *y      arg(0)   luma source, 16 bytes read per row
;     unsigned char *u      arg(1)   u source, 8 bytes read per row
;     unsigned char *v      arg(2)   v source, 8 bytes read per row
;     unsigned char *ym     arg(3)   output mask, 16 rows x 16 bytes, packed;
;                                    written with movdqa (16-byte aligned)
;     int yp                arg(4)   luma row pitch in bytes
;     int uvp               arg(5)   chroma row pitch in bytes
;     int ys, us, vs        arg(6..8)   centre y/u/v value (low byte is used)
;     int yt, ut, vt        arg(9..11)  y/u/v tolerance    (low byte is used)
;
; NOTE(review): the previous header comment read "void int" and listed an
; extra "unsigned char *uvm" between ym and yp.  The code never reads such
; an argument and takes yp from arg(4) - confirm against the C prototype.
;
; Output: each mask byte is 0xFF when |y-ys| < yt, |u-us| < ut and
;         |v-vs| < vt (unsigned compares), otherwise 0x00.  One chroma
;         sample covers two horizontally adjacent luma pixels and two
;         consecutive luma rows.  No result is placed in rax.
;
; Clobbers: rax, rcx, rdx, r8, r10, xmm0-xmm11, flags.
; NOTE(review): xmm6-xmm11 are callee-saved under the Microsoft x64 ABI and
; are not preserved here - add the project's XMM save/restore if this
; routine is ever built for Win64.
;-----------------------------------------------------------------------
global sym(vp8_makemask_sse3)
sym(vp8_makemask_sse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 14
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,  arg(0)                ; y
    mov         rdi,  arg(1)                ; u
    mov         rcx,  arg(2)                ; v
    mov         rax,  arg(3)                ; ym
    ; r10 (volatile) holds the luma pitch; the original used rbx, which is
    ; callee-saved and was never restored.
    movsxd      r10,  dword arg(4)          ; yp
    movsxd      rdx,  dword arg(5)          ; uvp

    pxor        xmm0, xmm0                  ; zero shuffle control: broadcast byte 0

    ; xmm1 = 16 copies of the centre y value
    movd        xmm1, arg(6)
    pshufb      xmm1, xmm0

    ; xmm2 = [ 8 copies of centre u | 8 copies of centre v ]
    movd        xmm2, arg(7)
    pshufb      xmm2, xmm0
    movd        xmm3, arg(8)
    pshufb      xmm3, xmm0
    unpcklpd    xmm2, xmm3

    ; xmm3 = 16 copies of the y tolerance
    movd        xmm3, arg(9)
    pshufb      xmm3, xmm0

    ; xmm4 = [ 8 copies of u tolerance | 8 copies of v tolerance ]
    movd        xmm4, arg(10)
    pshufb      xmm4, xmm0
    movd        xmm5, arg(11)
    pshufb      xmm5, xmm0
    unpcklpd    xmm4, xmm5

    ; pcmpgtb compares signed bytes, but the absolute differences are
    ; unsigned 0..255.  Flipping the top bit of both operands turns the
    ; signed compare into an unsigned one; the tolerances are biased once
    ; here, the differences inside the loop.
    mov         r8d,  0x80808080
    movd        xmm5, r8d
    pshufd      xmm5, xmm5, 0               ; xmm5 = 16 x 0x80
    pxor        xmm3, xmm5
    pxor        xmm4, xmm5

    mov         r8, 8                       ; 8 iterations x 2 luma rows

.next_pair_of_rows:

    ; ---- luma row 0: xmm6 = (|y - ys| < yt) ----
    movdqu      xmm0, [rsi]
    movdqa      xmm6, xmm1
    movdqa      xmm7, xmm0
    psubusb     xmm0, xmm1                  ; max(y - ys, 0)
    psubusb     xmm6, xmm7                  ; max(ys - y, 0)
    por         xmm0, xmm6                  ; |y - ys|
    pxor        xmm0, xmm5                  ; bias for unsigned compare
    movdqa      xmm6, xmm3
    pcmpgtb     xmm6, xmm0

    ; ---- luma row 1: xmm11 = (|y - ys| < yt) ----
    add         rsi, r10
    movdqu      xmm0, [rsi]
    movdqa      xmm11, xmm1
    movdqa      xmm7, xmm0
    psubusb     xmm0, xmm1
    psubusb     xmm11, xmm7
    por         xmm0, xmm11
    pxor        xmm0, xmm5
    movdqa      xmm11, xmm3
    pcmpgtb     xmm11, xmm0

    ; ---- chroma row: only 8 samples per plane are consumed, so load
    ;      exactly 8 bytes from each instead of over-reading 16 ----
    movq        xmm7, [rdi]
    movq        xmm8, [rcx]
    punpcklqdq  xmm7, xmm8                  ; [ u0..u7 | v0..v7 ]

    movdqa      xmm9, xmm2
    movdqa      xmm10, xmm7
    psubusb     xmm7, xmm2
    psubusb     xmm9, xmm10
    por         xmm7, xmm9                  ; |u - us| , |v - vs|
    pxor        xmm7, xmm5

    movdqa      xmm0, xmm4
    pcmpgtb     xmm0, xmm7                  ; [ u mask | v mask ]

    ; duplicate every chroma mask byte so it spans two luma columns
    movdqa      xmm8, xmm0
    punpckhbw   xmm0, xmm0                  ; v mask, 16 bytes
    punpcklbw   xmm8, xmm8                  ; u mask, 16 bytes

    ; row 0 = y0 & u & v
    pand        xmm6, xmm8
    pand        xmm6, xmm0
    movdqa      [rax], xmm6

    ; row 1 = y1 & u & v
    pand        xmm11, xmm8
    pand        xmm11, xmm0
    movdqa      [rax + 16], xmm11

    ; advance to the next pair of luma rows / next chroma row
    add         rsi, r10
    add         rdi, rdx
    add         rcx, rdx
    add         rax, 32
    dec         r8
    jnz         .next_pair_of_rows

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
| |
;GROW_HORIZ (register for result, source register or mem local)
; Loads the 16-byte source into the result register, then ORs it with a
; copy shifted one byte towards higher indices and a copy shifted one byte
; towards lower indices, so every non-zero byte spreads to both neighbours.
; A memory source is read with movdqa and must be 16-byte aligned.
; Clobbers xmm4 and xmm5 (scratch).  These replace the original xmm14 and
; xmm15, which are callee-saved under the Microsoft x64 ABI and were never
; preserved; xmm4/xmm5 are volatile under both x86-64 ABIs.  The result
; register must therefore not be xmm4 or xmm5.
%macro GROW_HORIZ 2
    movdqa      %1, %2
    movdqa      xmm4, %1
    movdqa      xmm5, %1
    pslldq      xmm4, 1                     ; byte i <- byte i-1
    psrldq      xmm5, 1                     ; byte i <- byte i+1
    por         %1, xmm4
    por         %1, xmm5
%endmacro
;GROW_VERT (result, center row, above row, below row)
; result = center | above | below  (bytewise OR of the three rows)
%macro GROW_VERT 4
    movdqa      %1, %2                      ; start from the centre row
    por         %1, %4                      ; fold in the row below
    por         %1, %3                      ; fold in the row above
%endmacro
| |
;GROW_NEXTLINE (new line to grow, new source, line to write)
; Horizontally grows the new source row into %1, then writes
; xmm0 | xmm1 | xmm2 to the destination %3.  xmm0-xmm2 are expected to hold
; the three most recent horizontally grown rows (%1 is one of them);
; xmm3 is overwritten with the value that is stored.
%macro GROW_NEXTLINE 3
    GROW_HORIZ  %1, %2
    movdqa      xmm3, xmm0
    por         xmm3, xmm1
    por         xmm3, xmm2
    movdqa      %3, xmm3
%endmacro
| |
| |
;-----------------------------------------------------------------------
; vp8_growmaskmb_sse3(
;     unsigned char *om,    arg(0)  source mask, 16 rows x 16 bytes
;     unsigned char *nm)    arg(1)  destination mask, 16 rows x 16 bytes
;
; Grows a 16x16 byte mask by one position in every direction: each output
; row is the OR of the horizontally grown source row and its vertical
; neighbours.  The first and last rows have a single vertical neighbour.
; Both buffers are accessed with movdqa, so they must be 16-byte aligned.
;
; xmm0/xmm1/xmm2 rotate as a three-row window of horizontally grown rows;
; xmm3 and the scratch registers used by GROW_HORIZ are overwritten.
;-----------------------------------------------------------------------
global sym(vp8_growmaskmb_sse3)
sym(vp8_growmaskmb_sse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ; source mask
    mov         rdi, arg(1)                 ; result mask

    ; prime the window with source rows 0, 1 and 2
    GROW_HORIZ  xmm0, [rsi + 16*0]
    GROW_HORIZ  xmm1, [rsi + 16*1]
    GROW_HORIZ  xmm2, [rsi + 16*2]

    ; output row 1 = rows 0 | 1 | 2
    GROW_VERT   xmm3, xmm0, xmm1, xmm2
    movdqa      [rdi + 16*1], xmm3

    ; output row 0 = rows 0 | 1 (nothing above the top row)
    por         xmm0, xmm1
    movdqa      [rdi + 16*0], xmm0

    ; each step loads source row n into the oldest window slot and
    ; writes output row n-1
    GROW_NEXTLINE xmm0, [rsi + 16*3],  [rdi + 16*2]
    GROW_NEXTLINE xmm1, [rsi + 16*4],  [rdi + 16*3]
    GROW_NEXTLINE xmm2, [rsi + 16*5],  [rdi + 16*4]
    GROW_NEXTLINE xmm0, [rsi + 16*6],  [rdi + 16*5]
    GROW_NEXTLINE xmm1, [rsi + 16*7],  [rdi + 16*6]
    GROW_NEXTLINE xmm2, [rsi + 16*8],  [rdi + 16*7]
    GROW_NEXTLINE xmm0, [rsi + 16*9],  [rdi + 16*8]
    GROW_NEXTLINE xmm1, [rsi + 16*10], [rdi + 16*9]
    GROW_NEXTLINE xmm2, [rsi + 16*11], [rdi + 16*10]
    GROW_NEXTLINE xmm0, [rsi + 16*12], [rdi + 16*11]
    GROW_NEXTLINE xmm1, [rsi + 16*13], [rdi + 16*12]
    GROW_NEXTLINE xmm2, [rsi + 16*14], [rdi + 16*13]
    GROW_NEXTLINE xmm0, [rsi + 16*15], [rdi + 16*14]

    ; output row 15 = rows 15 | 14 (nothing below the bottom row)
    por         xmm0, xmm2
    movdqa      [rdi + 16*15], xmm0

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
| |
| |
| |
;-----------------------------------------------------------------------
; unsigned int vp8_sad16x16_masked_wmt(
;     unsigned char *src_ptr,     arg(0)
;     int  src_stride,            arg(1)
;     unsigned char *ref_ptr,     arg(2)
;     int  ref_stride,            arg(3)
;     unsigned char *mask)        arg(4)  16 rows x 16 bytes, packed
;
; Sum of absolute differences over a 16x16 block after ANDing both the
; source and the reference row with the mask row, so positions whose mask
; byte is 0x00 contribute nothing.
;
; Out:      rax = SAD
; Clobbers: rcx, rdx, r10, xmm0-xmm4, flags
;-----------------------------------------------------------------------
global sym(vp8_sad16x16_masked_wmt)
sym(vp8_sad16x16_masked_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ; src_ptr
    mov         rdi, arg(2)                 ; ref_ptr
    ; The mask pointer lives in volatile r10; the original used rbx, which
    ; is callee-saved and was never restored.
    mov         r10, arg(4)                 ; mask
    movsxd      rax, dword arg(1)           ; src_stride
    movsxd      rdx, dword arg(3)           ; ref_stride

    mov         rcx, 16                     ; rows remaining

    pxor        xmm3, xmm3                  ; running SAD (two partial sums)

.next_row:
    movdqu      xmm0, [rsi]
    movdqu      xmm1, [rdi]
    movdqu      xmm2, [r10]
    pand        xmm0, xmm2
    pand        xmm1, xmm2

    psadbw      xmm0, xmm1                  ; one sum per 8-byte half
    paddw       xmm3, xmm0

    add         rsi, rax
    add         rdi, rdx
    add         r10, 16

    dec         rcx
    jnz         .next_row

    ; fold the high-half partial sum into the low half
    movdqa      xmm4, xmm3
    psrldq      xmm4, 8
    paddw       xmm3, xmm4
    movq        rax, xmm3

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
| |
| |
;-----------------------------------------------------------------------
; unsigned int vp8_sad16x16_unmasked_wmt(
;     unsigned char *src_ptr,     arg(0)
;     int  src_stride,            arg(1)
;     unsigned char *ref_ptr,     arg(2)
;     int  ref_stride,            arg(3)
;     unsigned char *mask)        arg(4)  16 rows x 16 bytes, packed
;
; Sum of absolute differences over a 16x16 block after ORing both the
; source and the reference row with the mask row.  Where a mask byte is
; 0xFF both operands become 0xFF and the position contributes nothing, so
; only positions outside the mask are summed.
;
; Out:      rax = SAD
; Clobbers: rcx, rdx, r10, xmm0-xmm4, flags
;-----------------------------------------------------------------------
global sym(vp8_sad16x16_unmasked_wmt)
sym(vp8_sad16x16_unmasked_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ; src_ptr
    mov         rdi, arg(2)                 ; ref_ptr
    ; The mask pointer lives in volatile r10; the original used rbx, which
    ; is callee-saved and was never restored.
    mov         r10, arg(4)                 ; mask
    movsxd      rax, dword arg(1)           ; src_stride
    movsxd      rdx, dword arg(3)           ; ref_stride

    mov         rcx, 16                     ; rows remaining

    pxor        xmm3, xmm3                  ; running SAD (two partial sums)

.next_row:
    movdqu      xmm0, [rsi]
    movdqu      xmm1, [rdi]
    movdqu      xmm2, [r10]
    por         xmm0, xmm2
    por         xmm1, xmm2

    psadbw      xmm0, xmm1                  ; one sum per 8-byte half
    paddw       xmm3, xmm0

    add         rsi, rax
    add         rdi, rdx
    add         r10, 16

    dec         rcx
    jnz         .next_row

    ; fold the high-half partial sum into the low half
    movdqa      xmm4, xmm3
    psrldq      xmm4, 8
    paddw       xmm3, xmm4
    movq        rax, xmm3

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
| |
| |
;-----------------------------------------------------------------------
; unsigned int vp8_masked_predictor_wmt(
;     unsigned char *masked,      arg(0)  source used where mask is set
;     unsigned char *unmasked,    arg(1)  source used where mask is clear
;     int  src_stride,            arg(2)  row pitch of both sources
;     unsigned char *dst_ptr,     arg(3)
;     int  dst_stride,            arg(4)
;     unsigned char *mask)        arg(5)  16 rows x 16 bytes, packed
;
; Writes a 16x16 block:  dst = (masked & mask) | (unmasked & ~mask).
;
; No result is placed in rax even though the comment above declares an
; unsigned int return - NOTE(review): confirm callers ignore the value.
;
; Clobbers: rax, rcx, rdx, r10, r11, xmm0-xmm2, flags
;-----------------------------------------------------------------------
global sym(vp8_masked_predictor_wmt)
sym(vp8_masked_predictor_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ; masked source
    mov         rdi, arg(1)                 ; unmasked source
    ; The mask pointer lives in volatile r10; the original used rbx, which
    ; is callee-saved and was never restored.
    mov         r10, arg(5)                 ; mask
    movsxd      rax, dword arg(2)           ; src_stride
    mov         r11, arg(3)                 ; destination
    movsxd      rdx, dword arg(4)           ; dst_stride

    mov         rcx, 16                     ; rows remaining

.next_row:
    movdqu      xmm0, [rsi]
    movdqu      xmm1, [rdi]
    movdqu      xmm2, [r10]

    pand        xmm0, xmm2                  ; masked   &  mask
    pandn       xmm2, xmm1                  ; unmasked & ~mask
    por         xmm0, xmm2
    movdqu      [r11], xmm0

    add         r11, rdx
    add         rsi, rax
    ; Both sources share src_stride.  The original stepped this pointer by
    ; dst_stride, unlike vp8_masked_predictor_uv_wmt, which reads the wrong
    ; rows whenever the two strides differ.
    add         rdi, rax
    add         r10, 16

    dec         rcx
    jnz         .next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
| |
;-----------------------------------------------------------------------
; unsigned int vp8_masked_predictor_uv_wmt(
;     unsigned char *masked,      arg(0)  source used where mask is set
;     unsigned char *unmasked,    arg(1)  source used where mask is clear
;     int  src_stride,            arg(2)  row pitch of both sources
;     unsigned char *dst_ptr,     arg(3)
;     int  dst_stride,            arg(4)
;     unsigned char *mask)        arg(5)  8 rows x 8 bytes, packed
;
; Writes an 8x8 block:  dst = (masked & mask) | (unmasked & ~mask).
;
; No result is placed in rax even though the comment above declares an
; unsigned int return - NOTE(review): confirm callers ignore the value.
;
; Clobbers: rax, rcx, rdx, r10, r11, xmm0-xmm2, flags
;-----------------------------------------------------------------------
global sym(vp8_masked_predictor_uv_wmt)
sym(vp8_masked_predictor_uv_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ; masked source
    mov         rdi, arg(1)                 ; unmasked source
    ; The mask pointer lives in volatile r10; the original used rbx, which
    ; is callee-saved and was never restored.
    mov         r10, arg(5)                 ; mask
    movsxd      rax, dword arg(2)           ; src_stride
    mov         r11, arg(3)                 ; destination
    movsxd      rdx, dword arg(4)           ; dst_stride

    mov         rcx, 8                      ; rows remaining

.next_row:
    movq        xmm0, [rsi]
    movq        xmm1, [rdi]
    movq        xmm2, [r10]

    pand        xmm0, xmm2                  ; masked   &  mask
    pandn       xmm2, xmm1                  ; unmasked & ~mask
    por         xmm0, xmm2
    movq        [r11], xmm0

    add         r11, rdx
    add         rsi, rax
    add         rdi, rax
    add         r10, 8

    dec         rcx
    jnz         .next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
| |
| |
;-----------------------------------------------------------------------
; unsigned int vp8_uv_from_y_mask(
;     unsigned char *ymask,       arg(0)  16 bytes per row, packed
;     unsigned char *uvmask)      arg(1)  8 bytes per row, packed
;
; Produces an 8x8 mask from a 16x16 one by keeping the even-indexed bytes
; of every second source row (the source pointer advances 32 bytes per
; output row).
;
; No result is placed in rax even though the comment above declares an
; unsigned int return - NOTE(review): confirm callers ignore the value.
;
; Clobbers: rcx, xmm0, xmm1, flags
;-----------------------------------------------------------------------
global sym(vp8_uv_from_y_mask)
sym(vp8_uv_from_y_mask):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2                  ; two arguments, as in vp8_growmaskmb_sse3
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ; ymask
    mov         rdi, arg(1)                 ; uvmask

    ; The shuffle control is loop-invariant: load it once, RIP-relative so
    ; the reference also works in position-independent builds.
    movdqa      xmm1, [rel shuf1b]

    mov         rcx, 8                      ; output rows remaining

.next_row:
    movdqu      xmm0, [rsi]
    pshufb      xmm0, xmm1                  ; bytes 0,2,..,14 -> low 8 bytes
    movq        [rdi], xmm0
    add         rdi, 8
    add         rsi, 32                     ; skip the odd source row

    dec         rcx
    jnz         .next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
| |
SECTION_RODATA
align 16
; pshufb control used by vp8_uv_from_y_mask: gathers the even-indexed
; source bytes into the low 8 bytes; the high 8 control bytes are zero.
shuf1b:
    db 0, 2, 4, 6, 8, 10, 12, 14
    times 8 db 0
| |