; vp9/common/x86/vp9_recon_mmx.asm - aom - Git at Google

 ;
 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ;
 ;  Use of this source code is governed by a BSD-style license
 ;  that can be found in the LICENSE file in the root of the source
 ;  tree. An additional intellectual property rights grant can be found
 ;  in the file PATENTS.  All contributing project authors may
 ;  be found in the AUTHORS file in the root of the source tree.
 ;


 %include "vpx_ports/x86_abi_support.asm"
 ;-----------------------------------------------------------------------
 ; void vp9_recon_b_mmx(unsigned char *s, short *q, unsigned char *d,
 ;                      int stride)
 ;
 ; For each of 4 rows: widens 4 bytes read from s to 16-bit words, adds
 ; 4 words read from q with signed saturation, packs the sums back to
 ; bytes with unsigned saturation and writes 4 bytes to d.
 ;   s row pitch: 16 bytes    q row pitch: 32 bytes    d row pitch: stride
 ; Scratch: rax, rdx, mm0-mm4 (rsi/rdi are pushed and popped here).
 ;-----------------------------------------------------------------------
 global sym(vp9_recon_b_mmx) PRIVATE
 sym(vp9_recon_b_mmx):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 4
     push        rsi
     push        rdi
     ; end prolog

     pxor        mm0, mm0                   ; mm0 = 0, used for widening/packing
     mov         rdx, arg(1)                ; rdx = q
     mov         rsi, arg(0)                ; rsi = s
     movsxd      rax, dword ptr arg(3)      ; rax = stride (sign-extended)
     mov         rdi, arg(2)                ; rdi = d

     ; row 0
     movd        mm1, [rsi]                 ; 4 bytes of s
     punpcklbw   mm1, mm0                   ; bytes -> words
     paddsw      mm1, [rdx]                 ; + 4 words of q
     packuswb    mm1, mm0                   ; words -> bytes, clamped to 0..255
     movd        [rdi], mm1

     ; row 1
     movd        mm2, [rsi+16]
     punpcklbw   mm2, mm0
     paddsw      mm2, [rdx+32]
     packuswb    mm2, mm0
     movd        [rdi+rax], mm2

     ; row 2
     movd        mm3, [rsi+32]
     punpcklbw   mm3, mm0
     paddsw      mm3, [rdx+64]
     packuswb    mm3, mm0
     movd        [rdi+rax*2], mm3

     ; row 3
     movd        mm4, [rsi+48]
     punpcklbw   mm4, mm0
     paddsw      mm4, [rdx+96]
     packuswb    mm4, mm0
     add         rdi, rax                   ; rdi = d + stride, so +2*rax is row 3
     movd        [rdi+rax*2], mm4

     ; begin epilog
     pop         rdi
     pop         rsi
     UNSHADOW_ARGS
     pop         rbp
     ret


 ;-----------------------------------------------------------------------
 ; void vp9_copy_mem8x8_mmx(
 ;     unsigned char *src,
 ;     int src_stride,
 ;     unsigned char *dst,
 ;     int dst_stride
 ;     )
 ;
 ; Copies 8 rows of 8 bytes from src to dst, one movq per row.
 ; Rows are moved in groups: 0-2, 3-5, then 6-7; each group is fully
 ; loaded before any of its rows is stored.
 ; Scratch: rax, rcx, mm0-mm5 (rsi/rdi are pushed and popped here).
 ;-----------------------------------------------------------------------
 global sym(vp9_copy_mem8x8_mmx) PRIVATE
 sym(vp9_copy_mem8x8_mmx):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 4
     push        rsi
     push        rdi
     ; end prolog

     mov         rsi, arg(0)                ; rsi = src
     movsxd      rax, dword ptr arg(1)      ; rax = src_stride (sign-extended)
     mov         rdi, arg(2)                ; rdi = dst
     movsxd      rcx, dword ptr arg(3)      ; rcx = dst_stride (sign-extended)

     ; ---- rows 0-2 ----
     movq        mm0, [rsi]
     movq        mm1, [rsi+rax]
     movq        mm2, [rsi+rax*2]
     lea         rsi, [rsi+rax*2]
     add         rsi, rax                   ; rsi -> src row 3

     movq        [rdi], mm0
     movq        [rdi+rcx], mm1
     movq        [rdi+rcx*2], mm2
     lea         rdi, [rdi+rcx*2]
     add         rdi, rcx                   ; rdi -> dst row 3

     ; ---- rows 3-5 ----
     movq        mm3, [rsi]
     movq        mm4, [rsi+rax]
     movq        mm5, [rsi+rax*2]
     lea         rsi, [rsi+rax*2]           ; rsi -> src row 5

     movq        [rdi], mm3
     movq        [rdi+rcx], mm4
     movq        [rdi+rcx*2], mm5
     lea         rdi, [rdi+rcx*2]           ; rdi -> dst row 5

     ; ---- rows 6-7 (addressed relative to row 5) ----
     movq        mm0, [rsi+rax]
     movq        mm1, [rsi+rax*2]

     movq        [rdi+rcx], mm0
     movq        [rdi+rcx*2], mm1

     ; begin epilog
     pop         rdi
     pop         rsi
     UNSHADOW_ARGS
     pop         rbp
     ret


 ;-----------------------------------------------------------------------
 ; void vp9_copy_mem8x4_mmx(
 ;     unsigned char *src,
 ;     int src_stride,
 ;     unsigned char *dst,
 ;     int dst_stride
 ;     )
 ;
 ; Copies 4 rows of 8 bytes from src to dst, one movq per row.
 ; Rows 0-2 are loaded, then stored; row 3 is loaded and stored last.
 ; Scratch: rax, rcx, mm0-mm3 (rsi/rdi are pushed and popped here).
 ;-----------------------------------------------------------------------
 global sym(vp9_copy_mem8x4_mmx) PRIVATE
 sym(vp9_copy_mem8x4_mmx):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 4
     push        rsi
     push        rdi
     ; end prolog

     mov         rsi, arg(0)                ; rsi = src
     movsxd      rax, dword ptr arg(1)      ; rax = src_stride (sign-extended)
     mov         rdi, arg(2)                ; rdi = dst
     movsxd      rcx, dword ptr arg(3)      ; rcx = dst_stride (sign-extended)

     ; ---- rows 0-2 ----
     movq        mm0, [rsi]
     movq        mm1, [rsi+rax]
     movq        mm2, [rsi+rax*2]

     movq        [rdi], mm0
     movq        [rdi+rcx], mm1
     movq        [rdi+rcx*2], mm2

     ; ---- row 3 (addressed relative to row 2) ----
     lea         rsi, [rsi+rax*2]           ; rsi -> src row 2
     lea         rdi, [rdi+rcx*2]           ; rdi -> dst row 2

     movq        mm3, [rsi+rax]
     movq        [rdi+rcx], mm3

     ; begin epilog
     pop         rdi
     pop         rsi
     UNSHADOW_ARGS
     pop         rbp
     ret


 ;-----------------------------------------------------------------------
 ; void vp9_copy_mem16x16_mmx(
 ;     unsigned char *src,
 ;     int src_stride,
 ;     unsigned char *dst,
 ;     int dst_stride
 ;     )
 ;
 ; Copies 16 rows of 16 bytes from src to dst. Each row is moved as two
 ; 8-byte movq halves. Rows are handled as five groups of three rows
 ; (expanded at assembly time by %rep) followed by the single last row.
 ; Scratch: rax, rcx, mm0-mm5 (rsi/rdi are pushed and popped here).
 ;-----------------------------------------------------------------------
 global sym(vp9_copy_mem16x16_mmx) PRIVATE
 sym(vp9_copy_mem16x16_mmx):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 4
     push        rsi
     push        rdi
     ; end prolog

     mov         rsi, arg(0)                ; rsi = src
     movsxd      rax, dword ptr arg(1)      ; rax = src_stride (sign-extended)
     mov         rdi, arg(2)                ; rdi = dst
     movsxd      rcx, dword ptr arg(3)      ; rcx = dst_stride (sign-extended)

     ; ---- rows 0-14: five unrolled groups of three rows ----
 %rep 5
     ; load three rows; mm0-mm2 = left halves, mm3-mm5 = right halves
     movq        mm0, [rsi]
     movq        mm3, [rsi+8]
     movq        mm1, [rsi+rax]
     movq        mm4, [rsi+rax+8]
     movq        mm2, [rsi+rax*2]
     movq        mm5, [rsi+rax*2+8]

     lea         rsi, [rsi+rax*2]
     add         rsi, rax                   ; src += 3 * src_stride

     ; store the three rows
     movq        [rdi], mm0
     movq        [rdi+8], mm3
     movq        [rdi+rcx], mm1
     movq        [rdi+rcx+8], mm4
     movq        [rdi+rcx*2], mm2
     movq        [rdi+rcx*2+8], mm5

     lea         rdi, [rdi+rcx*2]
     add         rdi, rcx                   ; dst += 3 * dst_stride
 %endrep

     ; ---- row 15 ----
     movq        mm0, [rsi]
     movq        mm3, [rsi+8]
     movq        [rdi], mm0
     movq        [rdi+8], mm3

     ; begin epilog
     pop         rdi
     pop         rsi
     UNSHADOW_ARGS
     pop         rbp
     ret
	;
	; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
	;
	; Use of this source code is governed by a BSD-style license
	; that can be found in the LICENSE file in the root of the source
	; tree. An additional intellectual property rights grant can be found
	; in the file PATENTS. All contributing project authors may
	; be found in the AUTHORS file in the root of the source tree.
	;


	%include "vpx_ports/x86_abi_support.asm"
	;-----------------------------------------------------------------------
	; void vp9_recon_b_mmx(unsigned char *s, short *q, unsigned char *d,
	;                      int stride)
	;
	; For each of 4 rows: widens 4 bytes read from s to 16-bit words, adds
	; 4 words read from q with signed saturation, packs the sums back to
	; bytes with unsigned saturation and writes 4 bytes to d.
	;   s row pitch: 16 bytes    q row pitch: 32 bytes    d row pitch: stride
	; Scratch: rax, rdx, mm0-mm4 (rsi/rdi are pushed and popped here).
	;-----------------------------------------------------------------------
	global sym(vp9_recon_b_mmx) PRIVATE
	sym(vp9_recon_b_mmx):
		push        rbp
		mov         rbp, rsp
		SHADOW_ARGS_TO_STACK 4
		push        rsi
		push        rdi
		; end prolog

		pxor        mm0, mm0                ; mm0 = 0, used for widening/packing
		mov         rdx, arg(1)             ; rdx = q
		mov         rsi, arg(0)             ; rsi = s
		movsxd      rax, dword ptr arg(3)   ; rax = stride (sign-extended)
		mov         rdi, arg(2)             ; rdi = d

		; row 0
		movd        mm1, [rsi]              ; 4 bytes of s
		punpcklbw   mm1, mm0                ; bytes -> words
		paddsw      mm1, [rdx]              ; + 4 words of q
		packuswb    mm1, mm0                ; words -> bytes, clamped to 0..255
		movd        [rdi], mm1

		; row 1
		movd        mm2, [rsi+16]
		punpcklbw   mm2, mm0
		paddsw      mm2, [rdx+32]
		packuswb    mm2, mm0
		movd        [rdi+rax], mm2

		; row 2
		movd        mm3, [rsi+32]
		punpcklbw   mm3, mm0
		paddsw      mm3, [rdx+64]
		packuswb    mm3, mm0
		movd        [rdi+rax*2], mm3

		; row 3
		movd        mm4, [rsi+48]
		punpcklbw   mm4, mm0
		paddsw      mm4, [rdx+96]
		packuswb    mm4, mm0
		add         rdi, rax                ; rdi = d + stride, so +2*rax is row 3
		movd        [rdi+rax*2], mm4

		; begin epilog
		pop         rdi
		pop         rsi
		UNSHADOW_ARGS
		pop         rbp
		ret


	;-----------------------------------------------------------------------
	; void vp9_copy_mem8x8_mmx(
	;     unsigned char *src,
	;     int src_stride,
	;     unsigned char *dst,
	;     int dst_stride
	;     )
	;
	; Copies 8 rows of 8 bytes from src to dst, one movq per row.
	; Rows are moved in groups: 0-2, 3-5, then 6-7; each group is fully
	; loaded before any of its rows is stored.
	; Scratch: rax, rcx, mm0-mm5 (rsi/rdi are pushed and popped here).
	;-----------------------------------------------------------------------
	global sym(vp9_copy_mem8x8_mmx) PRIVATE
	sym(vp9_copy_mem8x8_mmx):
		push        rbp
		mov         rbp, rsp
		SHADOW_ARGS_TO_STACK 4
		push        rsi
		push        rdi
		; end prolog

		mov         rsi, arg(0)             ; rsi = src
		movsxd      rax, dword ptr arg(1)   ; rax = src_stride (sign-extended)
		mov         rdi, arg(2)             ; rdi = dst
		movsxd      rcx, dword ptr arg(3)   ; rcx = dst_stride (sign-extended)

		; ---- rows 0-2 ----
		movq        mm0, [rsi]
		movq        mm1, [rsi+rax]
		movq        mm2, [rsi+rax*2]
		lea         rsi, [rsi+rax*2]
		add         rsi, rax                ; rsi -> src row 3

		movq        [rdi], mm0
		movq        [rdi+rcx], mm1
		movq        [rdi+rcx*2], mm2
		lea         rdi, [rdi+rcx*2]
		add         rdi, rcx                ; rdi -> dst row 3

		; ---- rows 3-5 ----
		movq        mm3, [rsi]
		movq        mm4, [rsi+rax]
		movq        mm5, [rsi+rax*2]
		lea         rsi, [rsi+rax*2]        ; rsi -> src row 5

		movq        [rdi], mm3
		movq        [rdi+rcx], mm4
		movq        [rdi+rcx*2], mm5
		lea         rdi, [rdi+rcx*2]        ; rdi -> dst row 5

		; ---- rows 6-7 (addressed relative to row 5) ----
		movq        mm0, [rsi+rax]
		movq        mm1, [rsi+rax*2]

		movq        [rdi+rcx], mm0
		movq        [rdi+rcx*2], mm1

		; begin epilog
		pop         rdi
		pop         rsi
		UNSHADOW_ARGS
		pop         rbp
		ret


	;-----------------------------------------------------------------------
	; void vp9_copy_mem8x4_mmx(
	;     unsigned char *src,
	;     int src_stride,
	;     unsigned char *dst,
	;     int dst_stride
	;     )
	;
	; Copies 4 rows of 8 bytes from src to dst, one movq per row.
	; Rows 0-2 are loaded, then stored; row 3 is loaded and stored last.
	; Scratch: rax, rcx, mm0-mm3 (rsi/rdi are pushed and popped here).
	;-----------------------------------------------------------------------
	global sym(vp9_copy_mem8x4_mmx) PRIVATE
	sym(vp9_copy_mem8x4_mmx):
		push        rbp
		mov         rbp, rsp
		SHADOW_ARGS_TO_STACK 4
		push        rsi
		push        rdi
		; end prolog

		mov         rsi, arg(0)             ; rsi = src
		movsxd      rax, dword ptr arg(1)   ; rax = src_stride (sign-extended)
		mov         rdi, arg(2)             ; rdi = dst
		movsxd      rcx, dword ptr arg(3)   ; rcx = dst_stride (sign-extended)

		; ---- rows 0-2 ----
		movq        mm0, [rsi]
		movq        mm1, [rsi+rax]
		movq        mm2, [rsi+rax*2]

		movq        [rdi], mm0
		movq        [rdi+rcx], mm1
		movq        [rdi+rcx*2], mm2

		; ---- row 3 (addressed relative to row 2) ----
		lea         rsi, [rsi+rax*2]        ; rsi -> src row 2
		lea         rdi, [rdi+rcx*2]        ; rdi -> dst row 2

		movq        mm3, [rsi+rax]
		movq        [rdi+rcx], mm3

		; begin epilog
		pop         rdi
		pop         rsi
		UNSHADOW_ARGS
		pop         rbp
		ret


	;-----------------------------------------------------------------------
	; void vp9_copy_mem16x16_mmx(
	;     unsigned char *src,
	;     int src_stride,
	;     unsigned char *dst,
	;     int dst_stride
	;     )
	;
	; Copies 16 rows of 16 bytes from src to dst. Each row is moved as two
	; 8-byte movq halves. Rows are handled as five groups of three rows
	; (expanded at assembly time by %rep) followed by the single last row.
	; Scratch: rax, rcx, mm0-mm5 (rsi/rdi are pushed and popped here).
	;-----------------------------------------------------------------------
	global sym(vp9_copy_mem16x16_mmx) PRIVATE
	sym(vp9_copy_mem16x16_mmx):
		push        rbp
		mov         rbp, rsp
		SHADOW_ARGS_TO_STACK 4
		push        rsi
		push        rdi
		; end prolog

		mov         rsi, arg(0)             ; rsi = src
		movsxd      rax, dword ptr arg(1)   ; rax = src_stride (sign-extended)
		mov         rdi, arg(2)             ; rdi = dst
		movsxd      rcx, dword ptr arg(3)   ; rcx = dst_stride (sign-extended)

		; ---- rows 0-14: five unrolled groups of three rows ----
	%rep 5
		; load three rows; mm0-mm2 = left halves, mm3-mm5 = right halves
		movq        mm0, [rsi]
		movq        mm3, [rsi+8]
		movq        mm1, [rsi+rax]
		movq        mm4, [rsi+rax+8]
		movq        mm2, [rsi+rax*2]
		movq        mm5, [rsi+rax*2+8]

		lea         rsi, [rsi+rax*2]
		add         rsi, rax                ; src += 3 * src_stride

		; store the three rows
		movq        [rdi], mm0
		movq        [rdi+8], mm3
		movq        [rdi+rcx], mm1
		movq        [rdi+rcx+8], mm4
		movq        [rdi+rcx*2], mm2
		movq        [rdi+rcx*2+8], mm5

		lea         rdi, [rdi+rcx*2]
		add         rdi, rcx                ; dst += 3 * dst_stride
	%endrep

		; ---- row 15 ----
		movq        mm0, [rsi]
		movq        mm3, [rsi+8]
		movq        [rdi], mm0
		movq        [rdi+8], mm3

		; begin epilog
		pop         rdi
		pop         rsi
		UNSHADOW_ARGS
		pop         rbp
		ret