| ; | 
 | ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 
 | ; | 
 | ;  Use of this source code is governed by a BSD-style license | 
 | ;  that can be found in the LICENSE file in the root of the source | 
 | ;  tree. An additional intellectual property rights grant can be found | 
 | ;  in the file PATENTS.  All contributing project authors may | 
 | ;  be found in the AUTHORS file in the root of the source tree. | 
 | ; | 
 |  | 
 |  | 
 | %include "vpx_ports/x86_abi_support.asm" | 
 |  | 
;------------------------------------------------------------------------
; PROCESS_16X2X3 %1
;
; Accumulate 16-wide SADs for TWO source rows against three reference
; candidates at byte offsets +0, +1 and +2 from the current ref pointer.
; Unaligned reference rows are fetched with lddqu.
;
;   %1 = 1 : first invocation -- psadbw results initialize the
;            accumulators xmm5/xmm6/xmm7 directly
;   %1 = 0 : later invocation -- psadbw into temporaries, then add
;            into the accumulators
;
; In:       rsi = src_ptr (rows loaded with movdqa; assumed 16-aligned),
;           rdi = ref_ptr, rax = src_stride, rdx = ref_stride
; Out:      xmm5/xmm6/xmm7 = running SAD sums for offsets +0/+1/+2;
;           rsi and rdi advanced by two rows
; Clobbers: xmm0-xmm3, flags untouched (loads/lea/SIMD only)
;------------------------------------------------------------------------
%macro PROCESS_16X2X3 1
%if %1
        movdqa          xmm0,       XMMWORD PTR [rsi]        ; src row 0
        lddqu           xmm5,       XMMWORD PTR [rdi]        ; ref row 0, offset +0
        lddqu           xmm6,       XMMWORD PTR [rdi+1]      ; ref row 0, offset +1
        lddqu           xmm7,       XMMWORD PTR [rdi+2]      ; ref row 0, offset +2

        psadbw          xmm5,       xmm0                     ; SADs seed the accumulators
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        movdqa          xmm0,       XMMWORD PTR [rsi]        ; src row 0
        lddqu           xmm1,       XMMWORD PTR [rdi]        ; ref row 0, offset +0
        lddqu           xmm2,       XMMWORD PTR [rdi+1]      ; ref row 0, offset +1
        lddqu           xmm3,       XMMWORD PTR [rdi+2]      ; ref row 0, offset +2

        psadbw          xmm1,       xmm0                     ; per-row SADs in temporaries
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1                     ; fold into accumulators
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endif
        ; second row (common to both variants)
        movdqa          xmm0,       XMMWORD PTR [rsi+rax]    ; src row 1
        lddqu           xmm1,       XMMWORD PTR [rdi+rdx]    ; ref row 1, offset +0
        lddqu           xmm2,       XMMWORD PTR [rdi+rdx+1]  ; ref row 1, offset +1
        lddqu           xmm3,       XMMWORD PTR [rdi+rdx+2]  ; ref row 1, offset +2

        lea             rsi,        [rsi+rax*2]              ; advance src by 2 rows
        lea             rdi,        [rdi+rdx*2]              ; advance ref by 2 rows

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1                     ; fold row 1 into accumulators
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endmacro
 |  | 
;------------------------------------------------------------------------
; PROCESS_16X2X3_OFFSET %1, %2
;
; Same two-row, three-candidate SAD accumulation as PROCESS_16X2X3, but
; for a reference pointer known to be misaligned by %2 bytes.  The caller
; has already rounded rdi DOWN by %2 (see PROCESS_16X16X3_OFFSET /
; PROCESS_16X8X3_OFFSET), so this macro loads two aligned 16-byte chunks
; ([rdi] and [rdi+16]) with movdqa and reconstructs the three unaligned
; reference rows with palignr at shifts %2, %2+1 and %2+2 (SSSE3).
;
;   %1 = 1/0 : initialize vs. accumulate into xmm5/xmm6/xmm7 (as above)
;   %2       : original misalignment of ref_ptr, 0..13 here
;              (%2+2 must stay <= 15 for palignr; the misaligned-by-14
;              path passes 14 giving shift 16?  No -- only 0..14 callers
;              exist and 14+2 = 16 selects the high chunk whole, which
;              palignr still encodes; misaligned-by-15 uses the lddqu
;              variant instead)
;
; In:       rsi = src_ptr, rdi = ref_ptr - %2 (16-aligned),
;           rax = src_stride, rdx = ref_stride
; Out:      xmm5/xmm6/xmm7 = running SAD sums for offsets +0/+1/+2;
;           rsi and rdi advanced by two rows
; Clobbers: xmm0-xmm4
;------------------------------------------------------------------------
%macro PROCESS_16X2X3_OFFSET 2
%if %1
        movdqa          xmm0,       XMMWORD PTR [rsi]       ; src row 0
        movdqa          xmm4,       XMMWORD PTR [rdi]       ; ref chunk, low 16 aligned bytes
        movdqa          xmm7,       XMMWORD PTR [rdi+16]    ; ref chunk, high 16 aligned bytes

        movdqa          xmm5,       xmm7
        palignr         xmm5,       xmm4,       %2          ; ref row 0, offset +0

        movdqa          xmm6,       xmm7
        palignr         xmm6,       xmm4,       (%2+1)      ; ref row 0, offset +1

        palignr         xmm7,       xmm4,       (%2+2)      ; ref row 0, offset +2

        psadbw          xmm5,       xmm0                    ; SADs seed the accumulators
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        movdqa          xmm0,       XMMWORD PTR [rsi]       ; src row 0
        movdqa          xmm4,       XMMWORD PTR [rdi]       ; ref chunk, low half
        movdqa          xmm3,       XMMWORD PTR [rdi+16]    ; ref chunk, high half

        movdqa          xmm1,       xmm3
        palignr         xmm1,       xmm4,       %2          ; ref row 0, offset +0

        movdqa          xmm2,       xmm3
        palignr         xmm2,       xmm4,       (%2+1)      ; ref row 0, offset +1

        palignr         xmm3,       xmm4,       (%2+2)      ; ref row 0, offset +2

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1                    ; fold into accumulators
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endif
        ; second row (common to both variants)
        movdqa          xmm0,       XMMWORD PTR [rsi+rax]   ; src row 1
        movdqa          xmm4,       XMMWORD PTR [rdi+rdx]   ; ref chunk, low half
        movdqa          xmm3,       XMMWORD PTR [rdi+rdx+16]; ref chunk, high half

        movdqa          xmm1,       xmm3
        palignr         xmm1,       xmm4,       %2          ; ref row 1, offset +0

        movdqa          xmm2,       xmm3
        palignr         xmm2,       xmm4,       (%2+1)      ; ref row 1, offset +1

        palignr         xmm3,       xmm4,       (%2+2)      ; ref row 1, offset +2

        lea             rsi,        [rsi+rax*2]             ; advance src by 2 rows
        lea             rdi,        [rdi+rdx*2]             ; advance ref by 2 rows

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1                    ; fold row 1 into accumulators
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endmacro
 |  | 
;------------------------------------------------------------------------
; PROCESS_16X16X3_OFFSET %1, %2
;
; Emit one alignment-specialized 16x16 SAD loop body plus its entry
; label "%2_aligned_by_%1" (the target of the jump table in the function
; body).  %1 is the misalignment of ref_ptr (ref_ptr & 15), %2 is the
; function's local-label prefix.  rdi is rounded down by %1 so all
; PROCESS_16X2X3_OFFSET loads are 16-aligned; 8 invocations cover the
; 16 rows, then control jumps to the common store code at %2_store_off.
;------------------------------------------------------------------------
%macro PROCESS_16X16X3_OFFSET 2
%2_aligned_by_%1:

        sub             rdi,        %1          ; align rdi; palignr by %1 restores the true offset

        PROCESS_16X2X3_OFFSET 1, %1             ; rows 0-1: initialize accumulators
        PROCESS_16X2X3_OFFSET 0, %1             ; rows 2-15: accumulate
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1

        jmp             %2_store_off            ; shared result-store epilog

%endmacro
 |  | 
;------------------------------------------------------------------------
; PROCESS_16X8X3_OFFSET %1, %2
;
; 16x8 counterpart of PROCESS_16X16X3_OFFSET: emits the entry label
; "%2_aligned_by_%1" for misalignment %1, aligns rdi down by %1, runs
; 4 two-row passes (8 rows total), then jumps to the common store code
; at %2_store_off.
;------------------------------------------------------------------------
%macro PROCESS_16X8X3_OFFSET 2
%2_aligned_by_%1:

        sub             rdi,        %1          ; align rdi; palignr by %1 restores the true offset

        PROCESS_16X2X3_OFFSET 1, %1             ; rows 0-1: initialize accumulators
        PROCESS_16X2X3_OFFSET 0, %1             ; rows 2-7: accumulate
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1

        jmp             %2_store_off            ; shared result-store epilog

%endmacro
 |  | 
;void vp9_sad16x16x3_ssse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Computes three 16x16 SADs at once: results[0..2] = SAD of the source
; block against ref_ptr, ref_ptr+1 and ref_ptr+2.  Dispatches on
; (ref_ptr & 15) through a position-independent jump table to an
; alignment-specialized loop (aligned movdqa + palignr); the fully
; misaligned by-15 case falls back to lddqu loads.
global sym(vp9_sad16x16x3_ssse3) PRIVATE
sym(vp9_sad16x16x3_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rcx
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        mov             rdx,        0xf
        and             rdx,        rdi                      ; rdx = ref_ptr & 15 (table index)

        jmp .vp9_sad16x16x3_ssse3_skiptable                  ; skip over the inline data
.vp9_sad16x16x3_ssse3_jumptable:
        ; 16 signed 32-bit offsets, each relative to do_jump, one per
        ; possible ref_ptr misalignment 0..15
        dd .vp9_sad16x16x3_ssse3_aligned_by_0  - .vp9_sad16x16x3_ssse3_do_jump
        dd .vp9_sad16x16x3_ssse3_aligned_by_1  - .vp9_sad16x16x3_ssse3_do_jump
        dd .vp9_sad16x16x3_ssse3_aligned_by_2  - .vp9_sad16x16x3_ssse3_do_jump
        dd .vp9_sad16x16x3_ssse3_aligned_by_3  - .vp9_sad16x16x3_ssse3_do_jump
        dd .vp9_sad16x16x3_ssse3_aligned_by_4  - .vp9_sad16x16x3_ssse3_do_jump
        dd .vp9_sad16x16x3_ssse3_aligned_by_5  - .vp9_sad16x16x3_ssse3_do_jump
        dd .vp9_sad16x16x3_ssse3_aligned_by_6  - .vp9_sad16x16x3_ssse3_do_jump
        dd .vp9_sad16x16x3_ssse3_aligned_by_7  - .vp9_sad16x16x3_ssse3_do_jump
        dd .vp9_sad16x16x3_ssse3_aligned_by_8  - .vp9_sad16x16x3_ssse3_do_jump
        dd .vp9_sad16x16x3_ssse3_aligned_by_9  - .vp9_sad16x16x3_ssse3_do_jump
        dd .vp9_sad16x16x3_ssse3_aligned_by_10 - .vp9_sad16x16x3_ssse3_do_jump
        dd .vp9_sad16x16x3_ssse3_aligned_by_11 - .vp9_sad16x16x3_ssse3_do_jump
        dd .vp9_sad16x16x3_ssse3_aligned_by_12 - .vp9_sad16x16x3_ssse3_do_jump
        dd .vp9_sad16x16x3_ssse3_aligned_by_13 - .vp9_sad16x16x3_ssse3_do_jump
        dd .vp9_sad16x16x3_ssse3_aligned_by_14 - .vp9_sad16x16x3_ssse3_do_jump
        dd .vp9_sad16x16x3_ssse3_aligned_by_15 - .vp9_sad16x16x3_ssse3_do_jump
.vp9_sad16x16x3_ssse3_skiptable:

        ; call/pop trick: materialize the runtime address of do_jump so
        ; the table lookup is position independent
        call .vp9_sad16x16x3_ssse3_do_jump
.vp9_sad16x16x3_ssse3_do_jump:
        pop             rcx                         ; get the address of do_jump
        mov             rax,  .vp9_sad16x16x3_ssse3_jumptable - .vp9_sad16x16x3_ssse3_do_jump
        add             rax,  rcx  ; get the absolute address of vp9_sad16x16x3_ssse3_jumptable

        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
        add             rcx,        rax             ; rcx = absolute address of aligned_by_<rdx>

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        jmp             rcx                         ; dispatch to the specialized loop

        ; bodies + entry labels aligned_by_0 .. aligned_by_14
        PROCESS_16X16X3_OFFSET 0,  .vp9_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 1,  .vp9_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 2,  .vp9_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 3,  .vp9_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 4,  .vp9_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 5,  .vp9_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 6,  .vp9_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 7,  .vp9_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 8,  .vp9_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 9,  .vp9_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 10, .vp9_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 11, .vp9_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 12, .vp9_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 13, .vp9_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 14, .vp9_sad16x16x3_ssse3

.vp9_sad16x16x3_ssse3_aligned_by_15:
        ; palignr with shift 15+2 is not usable; use unaligned lddqu loads
        PROCESS_16X2X3 1
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0

.vp9_sad16x16x3_ssse3_store_off:
        mov             rdi,        arg(4) ;Results

        ; each accumulator holds two 64-bit partial sums (psadbw output);
        ; add low and high qwords and store the 32-bit total
        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rdi],      xmm0        ; results[0]: offset +0
;-
        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rdi+4],    xmm0        ; results[1]: offset +1
;-
        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rdi+8],    xmm0        ; results[2]: offset +2

    ; begin epilog
    pop         rcx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
 |  | 
;void vp9_sad16x8x3_ssse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; 16x8 variant of vp9_sad16x16x3_ssse3: results[0..2] = SAD of the
; source block against ref_ptr, ref_ptr+1 and ref_ptr+2.  Same
; position-independent jump-table dispatch on (ref_ptr & 15), with
; half as many row passes per specialized loop.
global sym(vp9_sad16x8x3_ssse3) PRIVATE
sym(vp9_sad16x8x3_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rcx
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        mov             rdx,        0xf
        and             rdx,        rdi                     ; rdx = ref_ptr & 15 (table index)

        jmp .vp9_sad16x8x3_ssse3_skiptable                  ; skip over the inline data
.vp9_sad16x8x3_ssse3_jumptable:
        ; 16 signed 32-bit offsets, each relative to do_jump, one per
        ; possible ref_ptr misalignment 0..15
        dd .vp9_sad16x8x3_ssse3_aligned_by_0  - .vp9_sad16x8x3_ssse3_do_jump
        dd .vp9_sad16x8x3_ssse3_aligned_by_1  - .vp9_sad16x8x3_ssse3_do_jump
        dd .vp9_sad16x8x3_ssse3_aligned_by_2  - .vp9_sad16x8x3_ssse3_do_jump
        dd .vp9_sad16x8x3_ssse3_aligned_by_3  - .vp9_sad16x8x3_ssse3_do_jump
        dd .vp9_sad16x8x3_ssse3_aligned_by_4  - .vp9_sad16x8x3_ssse3_do_jump
        dd .vp9_sad16x8x3_ssse3_aligned_by_5  - .vp9_sad16x8x3_ssse3_do_jump
        dd .vp9_sad16x8x3_ssse3_aligned_by_6  - .vp9_sad16x8x3_ssse3_do_jump
        dd .vp9_sad16x8x3_ssse3_aligned_by_7  - .vp9_sad16x8x3_ssse3_do_jump
        dd .vp9_sad16x8x3_ssse3_aligned_by_8  - .vp9_sad16x8x3_ssse3_do_jump
        dd .vp9_sad16x8x3_ssse3_aligned_by_9  - .vp9_sad16x8x3_ssse3_do_jump
        dd .vp9_sad16x8x3_ssse3_aligned_by_10 - .vp9_sad16x8x3_ssse3_do_jump
        dd .vp9_sad16x8x3_ssse3_aligned_by_11 - .vp9_sad16x8x3_ssse3_do_jump
        dd .vp9_sad16x8x3_ssse3_aligned_by_12 - .vp9_sad16x8x3_ssse3_do_jump
        dd .vp9_sad16x8x3_ssse3_aligned_by_13 - .vp9_sad16x8x3_ssse3_do_jump
        dd .vp9_sad16x8x3_ssse3_aligned_by_14 - .vp9_sad16x8x3_ssse3_do_jump
        dd .vp9_sad16x8x3_ssse3_aligned_by_15 - .vp9_sad16x8x3_ssse3_do_jump
.vp9_sad16x8x3_ssse3_skiptable:

        ; call/pop trick: materialize the runtime address of do_jump so
        ; the table lookup is position independent
        call .vp9_sad16x8x3_ssse3_do_jump
.vp9_sad16x8x3_ssse3_do_jump:
        pop             rcx                         ; get the address of do_jump
        mov             rax,  .vp9_sad16x8x3_ssse3_jumptable - .vp9_sad16x8x3_ssse3_do_jump
        add             rax,  rcx  ; get the absolute address of vp9_sad16x8x3_ssse3_jumptable

        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
        add             rcx,        rax             ; rcx = absolute address of aligned_by_<rdx>

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        jmp             rcx                         ; dispatch to the specialized loop

        ; bodies + entry labels aligned_by_0 .. aligned_by_14
        PROCESS_16X8X3_OFFSET 0,  .vp9_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 1,  .vp9_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 2,  .vp9_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 3,  .vp9_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 4,  .vp9_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 5,  .vp9_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 6,  .vp9_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 7,  .vp9_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 8,  .vp9_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 9,  .vp9_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 10, .vp9_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 11, .vp9_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 12, .vp9_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 13, .vp9_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 14, .vp9_sad16x8x3_ssse3

.vp9_sad16x8x3_ssse3_aligned_by_15:
        ; palignr with shift 15+2 is not usable; use unaligned lddqu loads

        PROCESS_16X2X3 1
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0

.vp9_sad16x8x3_ssse3_store_off:
        mov             rdi,        arg(4) ;Results

        ; each accumulator holds two 64-bit partial sums (psadbw output);
        ; add low and high qwords and store the 32-bit total
        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rdi],      xmm0        ; results[0]: offset +0
;-
        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rdi+4],    xmm0        ; results[1]: offset +1
;-
        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rdi+8],    xmm0        ; results[2]: offset +2

    ; begin epilog
    pop         rcx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret