|  | ; | 
|  | ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 
|  | ; | 
|  | ;  Use of this source code is governed by a BSD-style license | 
|  | ;  that can be found in the LICENSE file in the root of the source | 
|  | ;  tree. An additional intellectual property rights grant can be found | 
|  | ;  in the file PATENTS.  All contributing project authors may | 
|  | ;  be found in the AUTHORS file in the root of the source tree. | 
|  | ; | 
|  |  | 
|  |  | 
|  | %include "vpx_ports/x86_abi_support.asm" | 
|  |  | 
|  | ;unsigned int vp9_get_mb_ss_mmx( short *src_ptr ) | 
|  | ; | 
|  | ; Returns the sum of the squares of the 256 signed 16-bit values that start | 
|  | ; at src_ptr (16 loop passes, 32 bytes = 16 words per pass). | 
|  | ; | 
|  | ; Register roles: | 
|  | ;   rax - read pointer while looping, return value afterwards | 
|  | ;   rcx - loop passes still to run | 
|  | ;   mm4 - two 32-bit partial sums | 
|  | global sym(vp9_get_mb_ss_mmx) PRIVATE | 
|  | sym(vp9_get_mb_ss_mmx): | 
|  |     push        rbp | 
|  |     mov         rbp, rsp | 
|  |     SHADOW_ARGS_TO_STACK 7 | 
|  |     GET_GOT     rbx | 
|  |     push rsi | 
|  |     push rdi | 
|  |     sub         rsp, 8                  ; 8-byte scratch slot for the final reduction | 
|  |     ; end prolog | 
|  |  | 
|  |     mov         rax, arg(0) ;src_ptr | 
|  |     mov         rcx, 16 | 
|  |     pxor        mm4, mm4                ; clear both partial sums | 
|  |  | 
|  | .NEXTROW: | 
|  |     movq        mm0, [rax]              ; 16 words per pass, four at a time | 
|  |     movq        mm1, [rax+8] | 
|  |     movq        mm2, [rax+16] | 
|  |     movq        mm3, [rax+24] | 
|  |     pmaddwd     mm0, mm0                ; square each word, add neighbouring pairs | 
|  |     pmaddwd     mm1, mm1 | 
|  |     pmaddwd     mm2, mm2 | 
|  |     pmaddwd     mm3, mm3 | 
|  |  | 
|  |     paddd       mm4, mm0 | 
|  |     paddd       mm4, mm1 | 
|  |     paddd       mm4, mm2 | 
|  |     paddd       mm4, mm3 | 
|  |  | 
|  |     add         rax, 32 | 
|  |     dec         rcx | 
|  |     ; dec leaves CF untouched, so the loop test must depend on ZF alone. | 
|  |     ; ja would additionally depend on the carry left by the add above. | 
|  |     jnz         .NEXTROW | 
|  |     movq        QWORD PTR [rsp], mm4 | 
|  |  | 
|  |     ;return sum[0]+sum[1]; | 
|  |     movsxd      rax, dword ptr [rsp] | 
|  |     movsxd      rcx, dword ptr [rsp+4] | 
|  |     add         rax, rcx | 
|  |  | 
|  |  | 
|  |     ; begin epilog | 
|  |     add rsp, 8 | 
|  |     pop rdi | 
|  |     pop rsi | 
|  |     RESTORE_GOT | 
|  |     UNSHADOW_ARGS | 
|  |     pop         rbp | 
|  |     ret | 
|  |  | 
|  |  | 
|  | ;unsigned int vp9_get8x8var_mmx | 
|  | ;( | 
|  | ;    unsigned char *src_ptr, | 
|  | ;    int  source_stride, | 
|  | ;    unsigned char *ref_ptr, | 
|  | ;    int  recon_stride, | 
|  | ;    unsigned int *SSE, | 
|  | ;    int *Sum | 
|  | ;) | 
|  | ; | 
|  | ; Walks eight rows of eight pixels in src_ptr and ref_ptr and stores | 
|  | ;   *Sum = sum of (src - ref)    over the 64 pixel pairs | 
|  | ;   *SSE = sum of (src - ref)^2  over the 64 pixel pairs | 
|  | ; The function itself always returns 0. | 
|  | ; | 
|  | ; Register roles: | 
|  | ;   rax / rbx - current row in src / ref | 
|  | ;   rcx / rdx - source_stride / recon_stride, sign-extended | 
|  | ;   mm5       - four 16-bit running sums of the differences | 
|  | ;   mm6       - zero, used to widen bytes to words | 
|  | ;   mm7       - two 32-bit running sums of the squared differences | 
|  | global sym(vp9_get8x8var_mmx) PRIVATE | 
|  | sym(vp9_get8x8var_mmx): | 
|  |     push        rbp | 
|  |     mov         rbp, rsp | 
|  |     SHADOW_ARGS_TO_STACK 6 | 
|  |     push rsi | 
|  |     push rdi | 
|  |     push rbx | 
|  |     sub         rsp, 16                     ; scratch: [rsp] squares, [rsp+8] sums | 
|  |     ; end prolog | 
|  |  | 
|  |     mov         rax, arg(0) ;[src_ptr] | 
|  |     movsxd      rcx, dword ptr arg(1) ;[source_stride] | 
|  |     mov         rbx, arg(2) ;[ref_ptr] | 
|  |     movsxd      rdx, dword ptr arg(3) ;[recon_stride] | 
|  |  | 
|  |     pxor        mm5, mm5                    ; difference accumulator | 
|  |     pxor        mm6, mm6                    ; constant zero | 
|  |     pxor        mm7, mm7                    ; squared-difference accumulator | 
|  |  | 
|  | ; One expansion per row; all eight rows are processed identically. | 
|  | %rep 8 | 
|  |     movq        mm0, [rax]                  ; eight src pixels | 
|  |     movq        mm1, [rbx]                  ; eight ref pixels | 
|  |     movq        mm2, mm0 | 
|  |     movq        mm3, mm1 | 
|  |  | 
|  |     punpcklbw   mm0, mm6                    ; low four pixels -> words | 
|  |     punpcklbw   mm1, mm6 | 
|  |     punpckhbw   mm2, mm6                    ; high four pixels -> words | 
|  |     punpckhbw   mm3, mm6 | 
|  |  | 
|  |     psubsw      mm0, mm1                    ; src - ref, low half | 
|  |     psubsw      mm2, mm3                    ; src - ref, high half | 
|  |  | 
|  |     paddw       mm5, mm0 | 
|  |     paddw       mm5, mm2 | 
|  |  | 
|  |     pmaddwd     mm0, mm0                    ; squares, pairwise summed to dwords | 
|  |     pmaddwd     mm2, mm2 | 
|  |     paddd       mm7, mm0 | 
|  |     paddd       mm7, mm2 | 
|  |  | 
|  |     add         rax, rcx                    ; next src row | 
|  |     add         rbx, rdx                    ; next ref row | 
|  | %endrep | 
|  |  | 
|  |     ; Spill both accumulators and reduce them with scalar code. | 
|  |     movq        QWORD PTR [rsp], mm7 | 
|  |     movq        QWORD PTR [rsp+8], mm5 | 
|  |  | 
|  |     movsxd      rax, DWORD PTR [rsp] | 
|  |     movsxd      rcx, DWORD PTR [rsp+4] | 
|  |     add         rax, rcx    ;XXSum | 
|  |  | 
|  |     movsx       rdx, WORD PTR [rsp+8] | 
|  |     movsx       rcx, WORD PTR [rsp+10] | 
|  |     add         rdx, rcx | 
|  |     movsx       rcx, WORD PTR [rsp+12] | 
|  |     add         rdx, rcx | 
|  |     movsx       rcx, WORD PTR [rsp+14] | 
|  |     add         rdx, rcx    ;XSum | 
|  |  | 
|  |     mov         rsi, arg(4) ;SSE | 
|  |     mov         rdi, arg(5) ;Sum | 
|  |     mov         dword ptr [rsi], eax | 
|  |     mov         dword ptr [rdi], edx | 
|  |     xor         rax, rax    ; return 0 | 
|  |  | 
|  |  | 
|  |     ; begin epilog | 
|  |     add rsp, 16 | 
|  |     pop rbx | 
|  |     pop rdi | 
|  |     pop rsi | 
|  |     UNSHADOW_ARGS | 
|  |     pop         rbp | 
|  |     ret | 
|  |  | 
|  |  | 
|  |  | 
|  | ;unsigned int | 
|  | ;vp9_get4x4var_mmx | 
|  | ;( | 
|  | ;    unsigned char *src_ptr, | 
|  | ;    int  source_stride, | 
|  | ;    unsigned char *ref_ptr, | 
|  | ;    int  recon_stride, | 
|  | ;    unsigned int *SSE, | 
|  | ;    int *Sum | 
|  | ;) | 
|  | ; | 
|  | ; Walks four rows of four pixels in src_ptr and ref_ptr and stores | 
|  | ;   *Sum = sum of (src - ref)    over the 16 pixel pairs | 
|  | ;   *SSE = sum of (src - ref)^2  over the 16 pixel pairs | 
|  | ; The function itself always returns 0. | 
|  | ; | 
|  | ; Rows are fetched with movd so that exactly four bytes per row are read; | 
|  | ; only the low four bytes ever reach the arithmetic (punpcklbw), so a wider | 
|  | ; load would just touch memory beyond the block. | 
|  | ; | 
|  | ; Register roles: | 
|  | ;   rax / rbx - current row in src / ref | 
|  | ;   rcx / rdx - source_stride / recon_stride, sign-extended | 
|  | ;   mm5       - four 16-bit running sums of the differences | 
|  | ;   mm6       - zero, used to widen bytes to words | 
|  | ;   mm7       - two 32-bit running sums of the squared differences | 
|  | global sym(vp9_get4x4var_mmx) PRIVATE | 
|  | sym(vp9_get4x4var_mmx): | 
|  |     push        rbp | 
|  |     mov         rbp, rsp | 
|  |     SHADOW_ARGS_TO_STACK 6 | 
|  |     push rsi | 
|  |     push rdi | 
|  |     push rbx | 
|  |     sub         rsp, 16                     ; scratch: [rsp] squares, [rsp+8] sums | 
|  |     ; end prolog | 
|  |  | 
|  |  | 
|  |     pxor        mm5, mm5                    ; difference accumulator | 
|  |     pxor        mm6, mm6                    ; constant zero | 
|  |     pxor        mm7, mm7                    ; squared-difference accumulator | 
|  |  | 
|  |     mov         rax, arg(0) ;[src_ptr]  ; Load base addresses | 
|  |     mov         rbx, arg(2) ;[ref_ptr] | 
|  |     movsxd      rcx, dword ptr arg(1) ;[source_stride] | 
|  |     movsxd      rdx, dword ptr arg(3) ;[recon_stride] | 
|  |  | 
|  |     ; Row 1 | 
|  |     movd        mm0, [rax]                  ; four src pixels | 
|  |     movd        mm1, [rbx]                  ; four ref pixels | 
|  |     punpcklbw   mm0, mm6                    ; widen to words | 
|  |     punpcklbw   mm1, mm6 | 
|  |     psubsw      mm0, mm1                    ; src - ref | 
|  |     paddw       mm5, mm0                    ; accumulate differences in mm5 | 
|  |     pmaddwd     mm0, mm0                    ; square and pair-sum | 
|  |     add         rbx,rdx                     ; next ref row | 
|  |     add         rax,rcx                     ; next src row | 
|  |     movd        mm1, [rbx]                  ; four ref pixels for row 2 | 
|  |     paddd       mm7, mm0                    ; accumulate in mm7 | 
|  |  | 
|  |  | 
|  |     ; Row 2 | 
|  |     movd        mm0, [rax]                  ; four src pixels | 
|  |     punpcklbw   mm0, mm6                    ; widen to words | 
|  |     punpcklbw   mm1, mm6 | 
|  |     psubsw      mm0, mm1                    ; src - ref | 
|  |     paddw       mm5, mm0                    ; accumulate differences in mm5 | 
|  |  | 
|  |     pmaddwd     mm0, mm0                    ; square and pair-sum | 
|  |     add         rbx,rdx                     ; next ref row | 
|  |     add         rax,rcx                     ; next src row | 
|  |     movd        mm1, [rbx]                  ; four ref pixels for row 3 | 
|  |     paddd       mm7, mm0                    ; accumulate in mm7 | 
|  |  | 
|  |     ; Row 3 | 
|  |     movd        mm0, [rax]                  ; four src pixels | 
|  |     punpcklbw   mm0, mm6                    ; widen to words | 
|  |     punpcklbw   mm1, mm6 | 
|  |     psubsw      mm0, mm1                    ; src - ref | 
|  |     paddw       mm5, mm0                    ; accumulate differences in mm5 | 
|  |  | 
|  |     pmaddwd     mm0, mm0                    ; square and pair-sum | 
|  |     add         rbx,rdx                     ; next ref row | 
|  |     add         rax,rcx                     ; next src row | 
|  |     movd        mm1, [rbx]                  ; four ref pixels for row 4 | 
|  |     paddd       mm7, mm0                    ; accumulate in mm7 | 
|  |  | 
|  |     ; Row 4 | 
|  |     movd        mm0, [rax]                  ; four src pixels | 
|  |  | 
|  |     punpcklbw   mm0, mm6                    ; widen to words | 
|  |     punpcklbw   mm1, mm6 | 
|  |     psubsw      mm0, mm1                    ; src - ref | 
|  |  | 
|  |     paddw       mm5, mm0                    ; accumulate differences in mm5 | 
|  |  | 
|  |     pmaddwd     mm0, mm0                    ; square and pair-sum | 
|  |     paddd       mm7, mm0                    ; accumulate in mm7 | 
|  |  | 
|  |  | 
|  |     ; Spill both accumulators and reduce them with scalar code. | 
|  |     movq        QWORD PTR [rsp+8], mm5      ; four 16-bit difference sums | 
|  |     movq        QWORD PTR [rsp], mm7        ; two 32-bit squared sums | 
|  |     movsx       rdx, WORD PTR [rsp+8] | 
|  |     movsx       rcx, WORD PTR [rsp+10] | 
|  |     movsx       rbx, WORD PTR [rsp+12] | 
|  |     movsx       rax, WORD PTR [rsp+14] | 
|  |     add         rdx, rcx | 
|  |     add         rbx, rax | 
|  |     add         rdx, rbx    ;XSum | 
|  |     movsxd      rax, DWORD PTR [rsp] | 
|  |     movsxd      rcx, DWORD PTR [rsp+4] | 
|  |     add         rax, rcx    ;XXSum | 
|  |     mov         rsi, arg(4) ;SSE | 
|  |     mov         rdi, arg(5) ;Sum | 
|  |     mov         dword ptr [rsi], eax | 
|  |     mov         dword ptr [rdi], edx | 
|  |     xor         rax, rax    ; return 0 | 
|  |  | 
|  |  | 
|  |     ; begin epilog | 
|  |     add rsp, 16 | 
|  |     pop rbx | 
|  |     pop rdi | 
|  |     pop rsi | 
|  |     UNSHADOW_ARGS | 
|  |     pop         rbp | 
|  |     ret | 
|  |  | 
|  |  | 
|  |  | 
|  | ;unsigned int | 
|  | ;vp9_get4x4sse_cs_mmx | 
|  | ;( | 
|  | ;    unsigned char *src_ptr, | 
|  | ;    int  source_stride, | 
|  | ;    unsigned char *ref_ptr, | 
|  | ;    int  recon_stride | 
|  | ;) | 
|  | ; | 
|  | ; Returns, in the low 32 bits of rax, the sum of (src - ref)^2 over four | 
|  | ; rows of four pixels. | 
|  | ; | 
|  | ; Register roles: | 
|  | ;   rax / rbx - current row in src / ref | 
|  | ;   rcx / rdx - source_stride / recon_stride, sign-extended | 
|  | ;   mm6       - zero, used to widen bytes to words | 
|  | ;   mm7       - two 32-bit running sums of the squared differences | 
|  | global sym(vp9_get4x4sse_cs_mmx) PRIVATE | 
|  | sym(vp9_get4x4sse_cs_mmx): | 
|  |     push        rbp | 
|  |     mov         rbp, rsp | 
|  |     SHADOW_ARGS_TO_STACK 4 | 
|  |     push rsi | 
|  |     push rdi | 
|  |     push rbx | 
|  |     ; end prolog | 
|  |  | 
|  |     mov         rax, arg(0) ;[src_ptr] | 
|  |     movsxd      rcx, dword ptr arg(1) ;[source_stride] | 
|  |     mov         rbx, arg(2) ;[ref_ptr] | 
|  |     movsxd      rdx, dword ptr arg(3) ;[recon_stride] | 
|  |  | 
|  |     pxor        mm6, mm6                    ; constant zero | 
|  |     pxor        mm7, mm7                    ; squared-difference accumulator | 
|  |  | 
|  | ; One expansion per row. The pointer bump after the last row is unused: | 
|  | ; rax is overwritten with the result and rbx is restored in the epilog. | 
|  | %rep 4 | 
|  |     movd        mm0, [rax]                  ; four src pixels | 
|  |     movd        mm1, [rbx]                  ; four ref pixels | 
|  |     punpcklbw   mm0, mm6                    ; widen to words | 
|  |     punpcklbw   mm1, mm6 | 
|  |     psubsw      mm0, mm1                    ; src - ref | 
|  |     pmaddwd     mm0, mm0                    ; squares, pairwise summed to dwords | 
|  |     paddd       mm7, mm0 | 
|  |     add         rax, rcx                    ; next src row | 
|  |     add         rbx, rdx                    ; next ref row | 
|  | %endrep | 
|  |  | 
|  |     ; Fold the high partial sum onto the low one. | 
|  |     movq        mm0,    mm7 | 
|  |     psrlq       mm0,    32 | 
|  |     paddd       mm0,    mm7 | 
|  |     movq        rax,    mm0 | 
|  |  | 
|  |  | 
|  |     ; begin epilog | 
|  |     pop rbx | 
|  |     pop rdi | 
|  |     pop rsi | 
|  |     UNSHADOW_ARGS | 
|  |     pop         rbp | 
|  |     ret | 
|  |  | 
|  | ; Right shift applied after each bilinear filter pass. | 
|  | %define mmx_filter_shift            7 | 
|  |  | 
|  | ; Horizontal pass for one row of vp9_filter_block2d_bil4x4_var_mmx. | 
|  | ; In:  rsi = row start, rax = HFilter, mm0 = zero | 
|  | ; Out: mm1 = four filtered words;  clobbers mm3 | 
|  | %macro BIL4X4_HFILTER_ROW 0 | 
|  |     movd            mm1,            [rsi]               ; pixels x .. x+3 | 
|  |     movd            mm3,            [rsi+1]             ; pixels x+1 .. x+4 | 
|  |     punpcklbw       mm1,            mm0                 ; widen to words | 
|  |     punpcklbw       mm3,            mm0                 ; | 
|  |     pmullw          mm1,            [rax]               ; first four HFilter words | 
|  |     pmullw          mm3,            [rax+8]             ; next four HFilter words | 
|  |     paddw           mm1,            mm3                 ; | 
|  |     paddw           mm1,            [GLOBAL(mmx_bi_rd)] ; constant defined outside this section (presumably the rounding term) | 
|  |     psraw           mm1,            mmx_filter_shift    ; | 
|  | %endmacro | 
|  |  | 
|  | ;void vp9_filter_block2d_bil4x4_var_mmx | 
|  | ;( | 
|  | ;    unsigned char *ref_ptr, | 
|  | ;    int ref_pixels_per_line, | 
|  | ;    unsigned char *src_ptr, | 
|  | ;    int src_pixels_per_line, | 
|  | ;    unsigned short *HFilter, | 
|  | ;    unsigned short *VFilter, | 
|  | ;    int *sum, | 
|  | ;    unsigned int *sumsquared | 
|  | ;) | 
|  | ; | 
|  | ; Filters five rows of ref_ptr horizontally with HFilter, blends each pair of | 
|  | ; consecutive filtered rows with VFilter, and compares the four resulting | 
|  | ; rows of four values against src_ptr.  Stores | 
|  | ;   *sum        = sum of (filtered - src) | 
|  | ;   *sumsquared = sum of (filtered - src)^2 | 
|  | ; | 
|  | ; Register roles: | 
|  | ;   rsi / rdi - current row in ref / src | 
|  | ;   rax / rdx - HFilter / VFilter | 
|  | ;   rcx       - rows still to produce | 
|  | ;   mm0       - zero, used to widen bytes to words | 
|  | ;   mm5       - horizontally filtered previous row | 
|  | ;   mm6       - four 16-bit running sums of the differences | 
|  | ;   mm7       - two 32-bit running sums of the squared differences | 
|  | global sym(vp9_filter_block2d_bil4x4_var_mmx) PRIVATE | 
|  | sym(vp9_filter_block2d_bil4x4_var_mmx): | 
|  |     push        rbp | 
|  |     mov         rbp, rsp | 
|  |     SHADOW_ARGS_TO_STACK 8 | 
|  |     GET_GOT     rbx | 
|  |     push rsi | 
|  |     push rdi | 
|  |     sub         rsp, 16 | 
|  |     ; end prolog | 
|  |  | 
|  |     mov             rsi,            arg(0) ;ref_ptr | 
|  |     mov             rdi,            arg(2) ;src_ptr | 
|  |     mov             rax,            arg(4) ;HFilter | 
|  |     mov             rdx,            arg(5) ;VFilter | 
|  |     mov             rcx,            4                   ; output rows | 
|  |  | 
|  |     pxor            mm0,            mm0                 ; constant zero | 
|  |     pxor            mm6,            mm6                 ; difference accumulator | 
|  |     pxor            mm7,            mm7                 ; squared-difference accumulator | 
|  |  | 
|  |     ; Prime mm5 with the horizontally filtered first row. | 
|  |     BIL4X4_HFILTER_ROW | 
|  |     movq            mm5,            mm1 | 
|  |  | 
|  | %if ABI_IS_32BIT | 
|  |     add             rsi, dword ptr  arg(1) ;ref_pixels_per_line    ; | 
|  | %else | 
|  |     movsxd          r8, dword ptr  arg(1) ;ref_pixels_per_line    ; | 
|  |     add             rsi, r8 | 
|  | %endif | 
|  |  | 
|  | .next_row: | 
|  |     BIL4X4_HFILTER_ROW                                  ; mm1 = filtered current row | 
|  |  | 
|  |     ; Vertical pass: blend the previous row (mm5) with the current one. | 
|  |     movq            mm3,            mm5                 ; | 
|  |     movq            mm5,            mm1                 ; current row becomes "previous" | 
|  |     pmullw          mm3,            [rdx]               ; | 
|  |     pmullw          mm1,            [rdx+8]             ; | 
|  |     paddw           mm1,            mm3                 ; | 
|  |     paddw           mm1,            [GLOBAL(mmx_bi_rd)] ; | 
|  |     psraw           mm1,            mmx_filter_shift    ; | 
|  |  | 
|  |     ; Difference against the source row, then accumulate. | 
|  |     movd            mm3,            [rdi]               ; | 
|  |     punpcklbw       mm3,            mm0                 ; | 
|  |     psubw           mm1,            mm3                 ; | 
|  |     paddw           mm6,            mm1                 ; | 
|  |     pmaddwd         mm1,            mm1                 ; | 
|  |     paddd           mm7,            mm1                 ; | 
|  |  | 
|  | %if ABI_IS_32BIT | 
|  |     add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ; | 
|  |     add             rdi,            dword ptr arg(3) ;src_pixels_per_line    ; | 
|  | %else | 
|  |     movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line | 
|  |     movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line | 
|  |     add             rsi,            r8 | 
|  |     add             rdi,            r9 | 
|  | %endif | 
|  |     dec             rcx                                 ; | 
|  |     jnz             .next_row                           ; | 
|  |  | 
|  |     ; Total of the squared differences: high dword + low dword of mm7. | 
|  |     movq            mm4,            mm7                 ; | 
|  |     psrlq           mm4,            32                  ; | 
|  |     paddd           mm4,            mm7                 ; | 
|  |  | 
|  |     ; Total of the differences: place each word of mm6 in the upper half of | 
|  |     ; a dword, add the four dwords, then shift back down arithmetically so | 
|  |     ; the result is sign-extended. | 
|  |     pxor            mm2,            mm2                 ; | 
|  |     pxor            mm3,            mm3                 ; | 
|  |     punpcklwd       mm2,            mm6                 ; | 
|  |     punpckhwd       mm3,            mm6                 ; | 
|  |     paddd           mm2,            mm3                 ; | 
|  |     movq            mm6,            mm2                 ; | 
|  |     psrlq           mm6,            32                  ; | 
|  |     paddd           mm2,            mm6                 ; | 
|  |     psrad           mm2,            16                  ; | 
|  |  | 
|  |     mov             rdi,            arg(6) ;sum | 
|  |     mov             rsi,            arg(7) ;sumsquared | 
|  |  | 
|  |     movd            dword ptr [rdi],          mm2                 ; | 
|  |     movd            dword ptr [rsi],          mm4                 ; | 
|  |  | 
|  |  | 
|  |  | 
|  |     ; begin epilog | 
|  |     add rsp, 16 | 
|  |     pop rdi | 
|  |     pop rsi | 
|  |     RESTORE_GOT | 
|  |     UNSHADOW_ARGS | 
|  |     pop         rbp | 
|  |     ret | 
|  |  | 
|  |  | 
|  |  | 
|  |  | 
|  | ;void vp9_filter_block2d_bil_var_mmx | 
|  | ;( | 
|  | ;    unsigned char *ref_ptr, | 
|  | ;    int ref_pixels_per_line, | 
|  | ;    unsigned char *src_ptr, | 
|  | ;    int src_pixels_per_line, | 
|  | ;    unsigned int Height, | 
|  | ;    unsigned short *HFilter, | 
|  | ;    unsigned short *VFilter, | 
|  | ;    int *sum, | 
|  | ;    unsigned int *sumsquared | 
|  | ;) | 
|  | global sym(vp9_filter_block2d_bil_var_mmx) PRIVATE | 
|  | sym(vp9_filter_block2d_bil_var_mmx): | 
|  | push        rbp | 
|  | mov         rbp, rsp | 
|  | SHADOW_ARGS_TO_STACK 9 | 
|  | GET_GOT     rbx | 
|  | push rsi | 
|  | push rdi | 
|  | sub         rsp, 16 | 
|  | ; end prolog | 
|  |  | 
|  | pxor            mm6,            mm6                 ; | 
|  | pxor            mm7,            mm7                 ; | 
|  | mov             rax,            arg(5) ;HFilter             ; | 
|  |  | 
|  | mov             rdx,            arg(6) ;VFilter             ; | 
|  | mov             rsi,            arg(0) ;ref_ptr              ; | 
|  |  | 
|  | mov             rdi,            arg(2) ;src_ptr              ; | 
|  | movsxd          rcx,            dword ptr arg(4) ;Height              ; | 
|  |  | 
|  | pxor            mm0,            mm0                 ; | 
|  | movq            mm1,            [rsi]               ; | 
|  |  | 
|  | movq            mm3,            [rsi+1]             ; | 
|  | movq            mm2,            mm1                 ; | 
|  |  | 
|  | movq            mm4,            mm3                 ; | 
|  | punpcklbw       mm1,            mm0                 ; | 
|  |  | 
|  | punpckhbw       mm2,            mm0                 ; | 
|  | pmullw          mm1,            [rax]               ; | 
|  |  | 
|  | pmullw          mm2,            [rax]               ; | 
|  | punpcklbw       mm3,            mm0                 ; | 
|  |  | 
|  | punpckhbw       mm4,            mm0                 ; | 
|  | pmullw          mm3,            [rax+8]             ; | 
|  |  | 
|  | pmullw          mm4,            [rax+8]             ; | 
|  | paddw           mm1,            mm3                 ; | 
|  |  | 
|  | paddw           mm2,            mm4                 ; | 
|  | paddw           mm1,            [GLOBAL(mmx_bi_rd)] ; | 
|  |  | 
|  | psraw           mm1,            mmx_filter_shift    ; | 
|  | paddw           mm2,            [GLOBAL(mmx_bi_rd)] ; | 
|  |  | 
|  | psraw           mm2,            mmx_filter_shift    ; | 
|  | movq            mm5,            mm1 | 
|  |  | 
|  | packuswb        mm5,            mm2                 ; | 
|  | %if ABI_IS_32BIT | 
|  | add             rsi,            dword ptr arg(1) ;ref_pixels_per_line | 
|  | %else | 
|  | movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line | 
|  | add             rsi,            r8 | 
|  | %endif | 
|  |  | 
|  | .filter_block2d_bil_var_mmx_loop: | 
|  |  | 
|  | movq            mm1,            [rsi]               ; | 
|  | movq            mm3,            [rsi+1]             ; | 
|  |  | 
|  | movq            mm2,            mm1                 ; | 
|  | movq            mm4,            mm3                 ; | 
|  |  | 
|  | punpcklbw       mm1,            mm0                 ; | 
|  | punpckhbw       mm2,            mm0                 ; | 
|  |  | 
|  | pmullw          mm1,            [rax]               ; | 
|  | pmullw          mm2,            [rax]               ; | 
|  |  | 
|  | punpcklbw       mm3,            mm0                 ; | 
|  | punpckhbw       mm4,            mm0                 ; | 
|  |  | 
|  | pmullw          mm3,            [rax+8]             ; | 
|  | pmullw          mm4,            [rax+8]             ; | 
|  |  | 
|  | paddw           mm1,            mm3                 ; | 
|  | paddw           mm2,            mm4                 ; | 
|  |  | 
|  | paddw           mm1,            [GLOBAL(mmx_bi_rd)] ; | 
|  | psraw           mm1,            mmx_filter_shift    ; | 
|  |  | 
|  | paddw           mm2,            [GLOBAL(mmx_bi_rd)] ; | 
|  | psraw           mm2,            mmx_filter_shift    ; | 
|  |  | 
|  | movq            mm3,            mm5                 ; | 
|  | movq            mm4,            mm5                 ; | 
|  |  | 
|  | punpcklbw       mm3,            mm0                 ; | 
|  | punpckhbw       mm4,            mm0                 ; | 
|  |  | 
|  | movq            mm5,            mm1                 ; | 
|  | packuswb        mm5,            mm2                 ; | 
|  |  | 
|  | pmullw          mm3,            [rdx]               ; | 
|  | pmullw          mm4,            [rdx]               ; | 
|  |  | 
|  | pmullw          mm1,            [rdx+8]             ; | 
|  | pmullw          mm2,            [rdx+8]             ; | 
|  |  | 
|  | paddw           mm1,            mm3                 ; | 
|  | paddw           mm2,            mm4                 ; | 
|  |  | 
|  | paddw           mm1,            [GLOBAL(mmx_bi_rd)] ; | 
|  | paddw           mm2,            [GLOBAL(mmx_bi_rd)] ; | 
|  |  | 
|  | psraw           mm1,            mmx_filter_shift    ; | 
|  | psraw           mm2,            mmx_filter_shift    ; | 
|  |  | 
|  | movq            mm3,            [rdi]               ; | 
|  | movq            mm4,            mm3                 ; | 
|  |  | 
|  | punpcklbw       mm3,            mm0                 ; | 
|  | punpckhbw       mm4,            mm0                 ; | 
|  |  | 
|  | psubw           mm1,            mm3                 ; | 
|  | psubw           mm2,            mm4                 ; | 
|  |  | 
|  | paddw           mm6,            mm1                 ; | 
|  | pmaddwd         mm1,            mm1                 ; | 
|  |  | 
|  | paddw           mm6,            mm2                 ; | 
|  | pmaddwd         mm2,            mm2                 ; | 
|  |  | 
|  | paddd           mm7,            mm1                 ; | 
|  | paddd           mm7,            mm2                 ; | 
|  |  | 
|  | %if ABI_IS_32BIT | 
|  | add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ; | 
|  | add             rdi,            dword ptr arg(3) ;src_pixels_per_line    ; | 
|  | %else | 
|  | movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line    ; | 
|  | movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line    ; | 
|  | add             rsi,            r8 | 
|  | add             rdi,            r9 | 
|  | %endif | 
|  | sub             rcx,            1                   ; | 
|  | jnz             .filter_block2d_bil_var_mmx_loop       ; | 
|  |  | 
|  |  | 
|  | pxor            mm3,            mm3                 ; | 
|  | pxor            mm2,            mm2                 ; | 
|  |  | 
|  | punpcklwd       mm2,            mm6                 ; | 
|  | punpckhwd       mm3,            mm6                 ; | 
|  |  | 
|  | paddd           mm2,            mm3                 ; | 
|  | movq            mm6,            mm2                 ; | 
|  |  | 
|  | psrlq           mm6,            32                  ; | 
|  | paddd           mm2,            mm6                 ; | 
|  |  | 
|  | psrad           mm2,            16                  ; | 
|  | movq            mm4,            mm7                 ; | 
|  |  | 
|  | psrlq           mm4,            32                  ; | 
|  | paddd           mm4,            mm7                 ; | 
|  |  | 
|  | mov             rdi,            arg(7) ;sum | 
|  | mov             rsi,            arg(8) ;sumsquared | 
|  |  | 
|  | movd            dword ptr [rdi],          mm2                 ; | 
|  | movd            dword ptr [rsi],          mm4                 ; | 
|  |  | 
|  | ; begin epilog | 
|  | add rsp, 16 | 
|  | pop rdi | 
|  | pop rsi | 
|  | RESTORE_GOT | 
|  | UNSHADOW_ARGS | 
|  | pop         rbp | 
|  | ret | 
|  |  | 
|  |  | 
|  | SECTION_RODATA | 
|  | ; Rounding bias table: short mmx_bi_rd[4] = { 64, 64, 64, 64 }. | 
|  | ; The filter code adds it with paddw to all four 16-bit lanes right before | 
|  | ; the psraw by mmx_filter_shift. | 
|  | ; NOTE(review): presumably 64 == 1 << (mmx_filter_shift - 1); mmx_filter_shift | 
|  | ; is defined outside this view -- confirm. | 
|  | align 16 | 
|  | mmx_bi_rd: | 
|  | dw 64, 64, 64, 64 |