;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
|  |  | 
|  |  | 
%include "vpx_ports/x86_abi_support.asm"

; NOTE(review): not referenced by any routine visible in this file.
; Presumably the right-shift that pairs with the 128-sum bilinear taps
; (1 << 7 == 128) -- confirm before removing.
%define xmm_filter_shift            7
|  |  | 
;unsigned int vp9_get_mb_ss_sse2
;(
;    short *src_ptr
;)
;
; Returns the sum of squares of the 256 signed 16-bit values that start at
; src_ptr (8 loop iterations, 64 bytes each).
;
; src_ptr must be 16-byte aligned: every load below is a movdqa.
; The 32-bit result is in the low half of rax; the upper half of rax is not
; meaningful (see the final movq).
global sym(vp9_get_mb_ss_sse2) PRIVATE
sym(vp9_get_mb_ss_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 1
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16
    ; end prolog

    mov         rax, arg(0)             ; rax = src_ptr
    mov         rcx, 8                  ; 8 iterations x 32 shorts
    pxor        xmm4, xmm4              ; xmm4 = four dword partial sums

.NEXTROW:
    movdqa      xmm0, [rax]
    movdqa      xmm1, [rax+16]
    movdqa      xmm2, [rax+32]
    movdqa      xmm3, [rax+48]

    ; pmaddwd x,x squares each word and adds adjacent pairs into dwords
    pmaddwd     xmm0, xmm0
    pmaddwd     xmm1, xmm1
    pmaddwd     xmm2, xmm2
    pmaddwd     xmm3, xmm3

    paddd       xmm0, xmm1
    paddd       xmm2, xmm3
    paddd       xmm4, xmm0
    paddd       xmm4, xmm2

    add         rax, 0x40               ; advance 64 bytes
    dec         rcx
    ; dec does not write CF, so branch on ZF alone.  (A ja here would
    ; depend on the carry left behind by the add above.)
    jnz         .NEXTROW

    ; horizontal add of the four dword lanes into lane 0
    movdqa      xmm3, xmm4
    psrldq      xmm4, 8
    paddd       xmm4, xmm3
    movdqa      xmm3, xmm4
    psrldq      xmm4, 4
    paddd       xmm4, xmm3

    ; lane 0 is the total; lane 1 holds a partial sum, so only the low
    ; 32 bits of rax carry the result (the declared return is unsigned int).
    movq        rax, xmm4

    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
|  |  | 
|  |  | 
;unsigned int vp9_get16x16var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;
; Walks 16 rows of 16 bytes from src_ptr and ref_ptr and stores
;     *Sum = sum of (src - ref)
;     *SSE = sum of (src - ref)^2
; Rows are read with unaligned loads, so neither pointer needs alignment.
;
; Register roles inside the loop:
;     rsi / rdi   current source / reference row
;     rax / rdx   source / reference stride (sign-extended)
;     xmm0        zero, used to widen bytes to words
;     xmm7        eight word lanes of running differences
;     xmm6        four dword lanes of running squared differences
global sym(vp9_get16x16var_sse2) PRIVATE
sym(vp9_get16x16var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ; src_ptr
    mov         rdi, arg(2)                 ; ref_ptr

    movsxd      rax, DWORD PTR arg(1)       ; source_stride
    movsxd      rdx, DWORD PTR arg(3)       ; recon_stride

    ; Prefetch the first eight source rows ...
    lea         rcx, [rax+rax*2]            ; rcx = 3 * source_stride
    prefetcht0  [rsi]
    prefetcht0  [rsi+rax]
    prefetcht0  [rsi+rax*2]
    prefetcht0  [rsi+rcx]
    lea         rbx, [rsi+rax*4]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rax]
    prefetcht0  [rbx+rax*2]
    prefetcht0  [rbx+rcx]

    ; ... and the first eight reference rows.
    lea         rcx, [rdx+rdx*2]            ; rcx = 3 * recon_stride
    prefetcht0  [rdi]
    prefetcht0  [rdi+rdx]
    prefetcht0  [rdi+rdx*2]
    prefetcht0  [rdi+rcx]
    lea         rbx, [rdi+rdx*4]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rdx]
    prefetcht0  [rbx+rdx*2]
    prefetcht0  [rbx+rcx]

    pxor        xmm0, xmm0                  ; zero for byte->word unpack
    pxor        xmm7, xmm7                  ; difference accumulator
    pxor        xmm6, xmm6                  ; squared-difference accumulator
    mov         rcx, 16                     ; rows remaining

.next_row:
    movdqu      xmm1, XMMWORD PTR [rsi]
    movdqu      xmm2, XMMWORD PTR [rdi]

    prefetcht0  [rsi+rax*8]                 ; stay eight rows ahead
    prefetcht0  [rdi+rdx*8]

    movdqa      xmm3, xmm1
    movdqa      xmm4, xmm2

    ; low eight pixels: xmm1 = src - ref as words
    punpcklbw   xmm1, xmm0
    punpcklbw   xmm2, xmm0
    psubw       xmm1, xmm2

    ; high eight pixels: xmm3 = src - ref as words
    punpckhbw   xmm3, xmm0
    punpckhbw   xmm4, xmm0
    psubw       xmm3, xmm4

    ; Each word lane of xmm7 receives two differences per row, 32 over the
    ; whole block, each within [-255, 255], so the lanes cannot overflow.
    paddw       xmm7, xmm1
    paddw       xmm7, xmm3

    pmaddwd     xmm1, xmm1                  ; squares, pairwise-added to dwords
    pmaddwd     xmm3, xmm3
    paddd       xmm6, xmm1
    paddd       xmm6, xmm3

    add         rsi, rax
    add         rdi, rdx

    sub         rcx, 1
    jnz         .next_row

    ; ---- SSE: fold the four dword lanes of xmm6 into lane 0 of xmm1 ----
    movdqa      xmm1, xmm6
    movdqa      xmm2, xmm1
    punpckldq   xmm1, xmm0                  ; [d0, 0, d1, 0]
    punpckhdq   xmm2, xmm0                  ; [d2, 0, d3, 0]
    paddd       xmm1, xmm2                  ; [d0+d2, 0, d1+d3, 0]
    movdqa      xmm2, xmm1
    psrldq      xmm1, 8
    paddd       xmm1, xmm2                  ; lane 0 = d0+d1+d2+d3

    ; ---- Sum: sign-extend the eight word lanes of xmm7, then fold ----
    pxor        xmm6, xmm6
    pxor        xmm5, xmm5
    punpcklwd   xmm6, xmm7                  ; words 0..3 in the dword high halves
    punpckhwd   xmm5, xmm7                  ; words 4..7 in the dword high halves
    psrad       xmm6, 16                    ; arithmetic shift = sign extension
    psrad       xmm5, 16
    paddd       xmm6, xmm5                  ; four dword partial sums
    movdqa      xmm7, xmm6
    punpckldq   xmm6, xmm0
    punpckhdq   xmm7, xmm0
    paddd       xmm6, xmm7
    movdqa      xmm7, xmm6
    psrldq      xmm6, 8
    paddd       xmm7, xmm6                  ; lane 0 = total difference

    mov         rax, arg(5)                 ; Sum
    mov         rdi, arg(4)                 ; SSE

    movd DWORD PTR [rax], xmm7
    movd DWORD PTR [rdi], xmm1

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
|  |  | 
|  |  | 
|  |  | 
|  |  | 
;unsigned int vp9_get8x8var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;
; Reads 8 rows of 8 bytes from src_ptr and ref_ptr (fully unrolled) and stores
;     *Sum = sum of (src - ref)
;     *SSE = sum of (src - ref)^2

; Folds one 8-pixel row into the running totals.
;     %1 = address expression of the source row
;     %2 = address expression of the reference row
; Expects xmm0 == 0.  Adds the differences to xmm7 (word lanes) and their
; squares to xmm1 (dword lanes).  Clobbers xmm2 and xmm3.
%macro GET8X8VAR_ROW 2
    movq        xmm2, QWORD PTR [%1]
    movq        xmm3, QWORD PTR [%2]
    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0
    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2
    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2
%endmacro

global sym(vp9_get8x8var_sse2) PRIVATE
sym(vp9_get8x8var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16
    ; end prolog

    mov         rsi, arg(0)                 ; src_ptr
    mov         rdi, arg(2)                 ; ref_ptr

    movsxd      rax, DWORD PTR arg(1)       ; source_stride
    movsxd      rdx, DWORD PTR arg(3)       ; recon_stride

    pxor        xmm0, xmm0                  ; zero for byte->word unpack
    pxor        xmm7, xmm7                  ; difference accumulator (words)

    ; Row 0 is written out because it initialises xmm1, the squared-error
    ; accumulator, instead of adding to it.
    movq        xmm1, QWORD PTR [rsi]
    movq        xmm2, QWORD PTR [rdi]
    punpcklbw   xmm1, xmm0
    punpcklbw   xmm2, xmm0
    psubsw      xmm1, xmm2
    paddw       xmm7, xmm1
    pmaddwd     xmm1, xmm1

    GET8X8VAR_ROW rsi + rax,     rdi + rdx          ; row 1
    GET8X8VAR_ROW rsi + rax * 2, rdi + rdx * 2      ; row 2

    lea         rsi, [rsi + rax * 2]                ; now at row 2
    lea         rdi, [rdi + rdx * 2]
    GET8X8VAR_ROW rsi + rax,     rdi + rdx          ; row 3
    GET8X8VAR_ROW rsi + rax * 2, rdi + rdx * 2      ; row 4

    lea         rsi, [rsi + rax * 2]                ; now at row 4
    lea         rdi, [rdi + rdx * 2]
    GET8X8VAR_ROW rsi + rax,     rdi + rdx          ; row 5
    GET8X8VAR_ROW rsi + rax * 2, rdi + rdx * 2      ; row 6

    lea         rsi, [rsi + rax * 2]                ; now at row 6
    lea         rdi, [rdi + rdx * 2]
    GET8X8VAR_ROW rsi + rax,     rdi + rdx          ; row 7

    ; ---- Sum: fold the eight word lanes of xmm7 with 16-bit adds ----
    ; 64 differences, each within [-255, 255], so the total fits a signed
    ; word and is recovered below by sign-extending the low 16 bits.
    movdqa      xmm6, xmm7
    punpcklwd   xmm6, xmm0
    punpckhwd   xmm7, xmm0
    paddw       xmm6, xmm7
    movdqa      xmm7, xmm6
    punpckldq   xmm6, xmm0
    punpckhdq   xmm7, xmm0
    paddw       xmm6, xmm7
    movdqa      xmm7, xmm6
    psrldq      xmm6, 8
    paddw       xmm7, xmm6                  ; low word = total difference

    ; ---- SSE: fold the four dword lanes of xmm1 ----
    movdqa      xmm2, xmm1
    punpckldq   xmm1, xmm0
    punpckhdq   xmm2, xmm0
    paddd       xmm1, xmm2
    movdqa      xmm2, xmm1
    psrldq      xmm1, 8
    paddd       xmm1, xmm2                  ; low dword = total squared error

    mov         rax, arg(5)                 ; Sum
    mov         rdi, arg(4)                 ; SSE

    movq        rdx, xmm7
    movsx       rcx, dx                     ; sign-extend the 16-bit total

    mov  dword ptr [rax], ecx
    movd DWORD PTR [rdi], xmm1

    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
|  |  | 
;void vp9_half_horiz_vert_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; For each of Height rows, forms an 8-pixel prediction by averaging every
; reference pixel with its right neighbour (pavgb) and then averaging that
; row with the same result from the next reference row.  The prediction is
; compared with the 8 pixels at src_ptr and the routine stores
;     *sum        = sum of (prediction - src)
;     *sumsquared = sum of (prediction - src)^2
;
; Reads Height + 1 reference rows, 9 bytes from each.
; Height must be non-zero (the loop tests its counter after the first row).
;
; The final reduction stays in XMM registers, so no MMX state is touched and
; no emms is needed by the caller.
global sym(vp9_half_horiz_vert_variance8x_h_sse2) PRIVATE
sym(vp9_half_horiz_vert_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ; ref_ptr
    mov         rdi, arg(2)                 ; src_ptr
    movsxd      rax, dword ptr arg(1)       ; ref_pixels_per_line
    movsxd      rdx, dword ptr arg(3)       ; src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)       ; Height = rows remaining

    pxor        xmm0, xmm0                  ; zero for byte->word unpack
    pxor        xmm6, xmm6                  ; difference accumulator (8 words)
    pxor        xmm7, xmm7                  ; squared-difference accumulator (4 dwords)

    ; Horizontal average of the first reference row; it is carried in xmm5
    ; as "previous row" into the loop.
    movq        xmm5, QWORD PTR [rsi]       ; r0 .. r7
    movq        xmm3, QWORD PTR [rsi+1]     ; r1 .. r8
    pavgb       xmm5, xmm3
    add         rsi, rax

.half_horiz_vert_variance8x_h_1:
    movq        xmm1, QWORD PTR [rsi]
    movq        xmm2, QWORD PTR [rsi+1]
    pavgb       xmm1, xmm2                  ; horizontal average of this row

    pavgb       xmm5, xmm1                  ; vertical average with previous row
    punpcklbw   xmm5, xmm0                  ; prediction as words

    movq        xmm3, QWORD PTR [rdi]
    punpcklbw   xmm3, xmm0                  ; source pixels as words

    psubw       xmm5, xmm3                  ; prediction - src
    paddw       xmm6, xmm5
    pmaddwd     xmm5, xmm5                  ; squares, pairwise-added to dwords
    paddd       xmm7, xmm5

    movdqa      xmm5, xmm1                  ; this row becomes the previous row

    add         rsi, rax
    add         rdi, rdx

    sub         rcx, 1
    jnz         .half_horiz_vert_variance8x_h_1

    ; ---- sum: sign-extend the eight word lanes to dwords, then fold ----
    ; Each word lane holds one difference per row, so the lanes themselves
    ; are exact for Height <= 128; widening before the horizontal add keeps
    ; the total exact as well.
    pxor        xmm1, xmm1
    pxor        xmm2, xmm2
    punpcklwd   xmm1, xmm6                  ; words 0..3 in the dword high halves
    punpckhwd   xmm2, xmm6                  ; words 4..7 in the dword high halves
    psrad       xmm1, 16                    ; arithmetic shift = sign extension
    psrad       xmm2, 16
    paddd       xmm1, xmm2                  ; four dword partial sums
    movdqa      xmm2, xmm1
    psrldq      xmm2, 8
    paddd       xmm1, xmm2
    movdqa      xmm2, xmm1
    psrldq      xmm2, 4
    paddd       xmm1, xmm2                  ; lane 0 = total difference

    ; ---- sumsquared: fold the four dword lanes ----
    movdqa      xmm2, xmm7
    psrldq      xmm2, 8
    paddd       xmm7, xmm2
    movdqa      xmm2, xmm7
    psrldq      xmm2, 4
    paddd       xmm7, xmm2                  ; lane 0 = total squared difference

    mov         rsi, arg(5)                 ; sum
    mov         rdi, arg(6)                 ; sumsquared

    movd DWORD PTR [rsi], xmm1
    movd DWORD PTR [rdi], xmm7

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
|  |  | 
;void vp9_half_vert_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; For each of Height rows, forms an 8-pixel prediction by averaging (pavgb)
; a reference row with the reference row directly below it, compares it with
; the 8 pixels at src_ptr and stores
;     *sum        = sum of (prediction - src)
;     *sumsquared = sum of (prediction - src)^2
;
; Reads Height + 1 reference rows, 8 bytes from each.
; Height must be non-zero (the loop tests its counter after the first row).
;
; The final reduction stays in XMM registers, so no MMX state is touched and
; no emms is needed by the caller.
global sym(vp9_half_vert_variance8x_h_sse2) PRIVATE
sym(vp9_half_vert_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ; ref_ptr
    mov         rdi, arg(2)                 ; src_ptr
    movsxd      rax, dword ptr arg(1)       ; ref_pixels_per_line
    movsxd      rdx, dword ptr arg(3)       ; src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)       ; Height = rows remaining

    pxor        xmm0, xmm0                  ; zero for byte->word unpack
    pxor        xmm6, xmm6                  ; difference accumulator (8 words)
    pxor        xmm7, xmm7                  ; squared-difference accumulator (4 dwords)

.half_vert_variance8x_h_1:
    movq        xmm5, QWORD PTR [rsi]       ; reference row i
    movq        xmm3, QWORD PTR [rsi+rax]   ; reference row i + 1

    pavgb       xmm5, xmm3                  ; vertical average
    punpcklbw   xmm5, xmm0                  ; prediction as words

    movq        xmm3, QWORD PTR [rdi]
    punpcklbw   xmm3, xmm0                  ; source pixels as words

    psubw       xmm5, xmm3                  ; prediction - src
    paddw       xmm6, xmm5
    pmaddwd     xmm5, xmm5                  ; squares, pairwise-added to dwords
    paddd       xmm7, xmm5

    add         rsi, rax
    add         rdi, rdx

    sub         rcx, 1
    jnz         .half_vert_variance8x_h_1

    ; ---- sum: sign-extend the eight word lanes to dwords, then fold ----
    ; Each word lane holds one difference per row, so the lanes themselves
    ; are exact for Height <= 128; widening before the horizontal add keeps
    ; the total exact as well.
    pxor        xmm1, xmm1
    pxor        xmm2, xmm2
    punpcklwd   xmm1, xmm6                  ; words 0..3 in the dword high halves
    punpckhwd   xmm2, xmm6                  ; words 4..7 in the dword high halves
    psrad       xmm1, 16                    ; arithmetic shift = sign extension
    psrad       xmm2, 16
    paddd       xmm1, xmm2                  ; four dword partial sums
    movdqa      xmm2, xmm1
    psrldq      xmm2, 8
    paddd       xmm1, xmm2
    movdqa      xmm2, xmm1
    psrldq      xmm2, 4
    paddd       xmm1, xmm2                  ; lane 0 = total difference

    ; ---- sumsquared: fold the four dword lanes ----
    movdqa      xmm2, xmm7
    psrldq      xmm2, 8
    paddd       xmm7, xmm2
    movdqa      xmm2, xmm7
    psrldq      xmm2, 4
    paddd       xmm7, xmm2                  ; lane 0 = total squared difference

    mov         rsi, arg(5)                 ; sum
    mov         rdi, arg(6)                 ; sumsquared

    movd DWORD PTR [rsi], xmm1
    movd DWORD PTR [rdi], xmm7

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
|  |  | 
|  |  | 
;void vp9_half_horiz_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; For each of Height rows, forms an 8-pixel prediction by averaging (pavgb)
; every reference pixel with its right neighbour, compares it with the
; 8 pixels at src_ptr and stores
;     *sum        = sum of (prediction - src)
;     *sumsquared = sum of (prediction - src)^2
;
; Reads Height reference rows, 9 bytes from each.
; Height must be non-zero (the loop tests its counter after the first row).
;
; The final reduction stays in XMM registers, so no MMX state is touched and
; no emms is needed by the caller.
global sym(vp9_half_horiz_variance8x_h_sse2) PRIVATE
sym(vp9_half_horiz_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ; ref_ptr
    mov         rdi, arg(2)                 ; src_ptr
    movsxd      rax, dword ptr arg(1)       ; ref_pixels_per_line
    movsxd      rdx, dword ptr arg(3)       ; src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)       ; Height = rows remaining

    pxor        xmm0, xmm0                  ; zero for byte->word unpack
    pxor        xmm6, xmm6                  ; difference accumulator (8 words)
    pxor        xmm7, xmm7                  ; squared-difference accumulator (4 dwords)

.half_horiz_variance8x_h_1:
    movq        xmm5, QWORD PTR [rsi]       ; r0 .. r7
    movq        xmm3, QWORD PTR [rsi+1]     ; r1 .. r8

    pavgb       xmm5, xmm3                  ; horizontal average
    punpcklbw   xmm5, xmm0                  ; prediction as words

    movq        xmm3, QWORD PTR [rdi]
    punpcklbw   xmm3, xmm0                  ; source pixels as words

    psubw       xmm5, xmm3                  ; prediction - src
    paddw       xmm6, xmm5
    pmaddwd     xmm5, xmm5                  ; squares, pairwise-added to dwords
    paddd       xmm7, xmm5

    add         rsi, rax
    add         rdi, rdx

    sub         rcx, 1
    jnz         .half_horiz_variance8x_h_1

    ; ---- sum: sign-extend the eight word lanes to dwords, then fold ----
    ; Each word lane holds one difference per row, so the lanes themselves
    ; are exact for Height <= 128; widening before the horizontal add keeps
    ; the total exact as well.
    pxor        xmm1, xmm1
    pxor        xmm2, xmm2
    punpcklwd   xmm1, xmm6                  ; words 0..3 in the dword high halves
    punpckhwd   xmm2, xmm6                  ; words 4..7 in the dword high halves
    psrad       xmm1, 16                    ; arithmetic shift = sign extension
    psrad       xmm2, 16
    paddd       xmm1, xmm2                  ; four dword partial sums
    movdqa      xmm2, xmm1
    psrldq      xmm2, 8
    paddd       xmm1, xmm2
    movdqa      xmm2, xmm1
    psrldq      xmm2, 4
    paddd       xmm1, xmm2                  ; lane 0 = total difference

    ; ---- sumsquared: fold the four dword lanes ----
    movdqa      xmm2, xmm7
    psrldq      xmm2, 8
    paddd       xmm7, xmm2
    movdqa      xmm2, xmm7
    psrldq      xmm2, 4
    paddd       xmm7, xmm2                  ; lane 0 = total squared difference

    mov         rsi, arg(5)                 ; sum
    mov         rdi, arg(6)                 ; sumsquared

    movd DWORD PTR [rsi], xmm1
    movd DWORD PTR [rdi], xmm7

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
|  |  | 
|  |  | 
SECTION_RODATA
;    short xmm_bi_rd[8] = { 64, 64, 64, 64, 64, 64, 64, 64 };
align 16
xmm_bi_rd:
    times 8 dw 64

; Sixteen filter rows of sixteen words each.  Row k holds eight copies of the
; first tap (128 - 8*k) followed by eight copies of the second tap (8*k), so
; the two taps of every row add up to 128.
align 16
bilinear_filters_sse2:
%assign BILINEAR_SECOND_TAP 0
%rep 16
    times 8 dw 128 - BILINEAR_SECOND_TAP
    times 8 dw BILINEAR_SECOND_TAP
%assign BILINEAR_SECOND_TAP BILINEAR_SECOND_TAP + 8
%endrep