| ; | 
 | ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 
 | ; | 
 | ;  Use of this source code is governed by a BSD-style license | 
 | ;  that can be found in the LICENSE file in the root of the source | 
 | ;  tree. An additional intellectual property rights grant can be found | 
 | ;  in the file PATENTS.  All contributing project authors may | 
 | ;  be found in the AUTHORS file in the root of the source tree. | 
 | ; | 
 |  | 
 |  | 
 | %include "vpx_ports/x86_abi_support.asm" | 
 |  | 
 | ; Arithmetic right shift applied to each 16-bit filter result after xmm_bi_rd is added. | 
 | %define xmm_filter_shift            7 | 
 |  | 
 |  | 
 | ;void vp9_filter_block2d_bil_var_ssse3 | 
 | ;( | 
 | ;    unsigned char *ref_ptr, | 
 | ;    int ref_pixels_per_line, | 
 | ;    unsigned char *src_ptr, | 
 | ;    int src_pixels_per_line, | 
 | ;    unsigned int Height, | 
 | ;    int  xoffset, | 
 | ;    int  yoffset, | 
 | ;    int *sum, | 
 | ;    unsigned int *sumsquared | 
 | ;) | 
 | ; | 
 | ;For a block 16 pixels wide and Height rows tall, bilinearly filters ref_ptr | 
 | ;with the taps selected by xoffset/yoffset, subtracts the matching src_ptr | 
 | ;pixels, and stores the sum of the differences in *sum and the sum of the | 
 | ;squared differences in *sumsquared. | 
 | ; | 
 | ;xoffset/yoffset each select one 16-byte row of bilinear_filters_ssse3. They | 
 | ;are not range checked (the table has 16 rows, so 0..15 is presumably expected). | 
 | ; | 
 | ;Note: The filter coefficient at offset=0 is 128. Since the second operand | 
 | ;of pmaddubsw is signed bytes, we must calculate zero offset separately. | 
 | ;That gives four code paths: | 
 | ;    xoffset != 0, yoffset != 0 : horizontal + vertical filter | 
 | ;    xoffset == 0, yoffset != 0 : vertical filter only   (sp_only) | 
 | ;    xoffset != 0, yoffset == 0 : horizontal filter only (fp_only) | 
 | ;    xoffset == 0, yoffset == 0 : no filter              (full_pixel) | 
 | ; | 
 | ;Every loop runs its body before testing the row counter, so Height must be | 
 | ;at least 1. The horizontal filter reads 17 bytes per ref row and the vertical | 
 | ;filter reads one ref row beyond Height. | 
 | ; | 
 | ;Register use: rsi = ref row, rdi = src row, rcx = rows left, | 
 | ;rax = horizontal taps (or ref stride), rdx = vertical taps (or a stride), | 
 | ;xmm6 = 8 x 16-bit sums of differences, xmm7 = 4 x 32-bit sums of squares. | 
 | global sym(vp9_filter_block2d_bil_var_ssse3) PRIVATE | 
 | sym(vp9_filter_block2d_bil_var_ssse3): | 
 |     push        rbp | 
 |     mov         rbp, rsp | 
 |     SHADOW_ARGS_TO_STACK 9 | 
 |     SAVE_XMM 7 | 
 |     GET_GOT     rbx | 
 |     push rsi | 
 |     push rdi | 
 |     ; end prolog | 
 |  | 
 |         pxor            xmm6,           xmm6                 ; running sums of differences (16-bit lanes) | 
 |         pxor            xmm7,           xmm7                 ; running sums of squares (32-bit lanes) | 
 |  | 
 |         lea             rcx,            [GLOBAL(bilinear_filters_ssse3)] | 
 |         movsxd          rax,            dword ptr arg(5)     ; xoffset | 
 |  | 
 |         test            rax,            rax                  ; skip first_pass filter if xoffset=0 | 
 |         jz              .filter_block2d_bil_var_ssse3_sp_only | 
 |  | 
 |         shl             rax,            4                    ; point to filter coeff with xoffset | 
 |         lea             rax,            [rax + rcx]          ; HFilter | 
 |  | 
 |         movsxd          rdx,            dword ptr arg(6)     ; yoffset | 
 |  | 
 |         test            rdx,            rdx                  ; skip second_pass filter if yoffset=0 | 
 |         jz              .filter_block2d_bil_var_ssse3_fp_only | 
 |  | 
 |         shl             rdx,            4 | 
 |         lea             rdx,            [rdx + rcx]          ; VFilter | 
 |  | 
 |         mov             rsi,            arg(0)               ;ref_ptr | 
 |         mov             rdi,            arg(2)               ;src_ptr | 
 |         movsxd          rcx,            dword ptr arg(4)     ;Height | 
 |  | 
 |         ; horizontally filter the first ref row; xmm0 always carries the | 
 |         ; previous horizontally filtered row into the vertical filter | 
 |         movdqu          xmm0,           XMMWORD PTR [rsi] | 
 |         movdqu          xmm1,           XMMWORD PTR [rsi+1] | 
 |         movdqa          xmm2,           xmm0 | 
 |  | 
 |         punpcklbw       xmm0,           xmm1                 ; interleave pixel[i], pixel[i+1] | 
 |         punpckhbw       xmm2,           xmm1 | 
 |         pmaddubsw       xmm0,           [rax] | 
 |         pmaddubsw       xmm2,           [rax] | 
 |  | 
 |         paddw           xmm0,           [GLOBAL(xmm_bi_rd)] | 
 |         paddw           xmm2,           [GLOBAL(xmm_bi_rd)] | 
 |         psraw           xmm0,           xmm_filter_shift | 
 |         psraw           xmm2,           xmm_filter_shift | 
 |  | 
 |         packuswb        xmm0,           xmm2 | 
 |  | 
 |         pxor            xmm4,           xmm4                 ; zero for byte->word unpack, constant across the loop | 
 |  | 
 | %if ABI_IS_32BIT | 
 |         add             rsi,            dword ptr arg(1) ;ref_pixels_per_line | 
 | %else | 
 |         movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line | 
 |         movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line | 
 |         lea             rsi,            [rsi + r8] | 
 | %endif | 
 |  | 
 | .filter_block2d_bil_var_ssse3_loop: | 
 |         ; first pass: horizontal filter of the next ref row into xmm1 | 
 |         movdqu          xmm1,           XMMWORD PTR [rsi] | 
 |         movdqu          xmm2,           XMMWORD PTR [rsi+1] | 
 |         movdqa          xmm3,           xmm1 | 
 |  | 
 |         punpcklbw       xmm1,           xmm2 | 
 |         punpckhbw       xmm3,           xmm2 | 
 |         pmaddubsw       xmm1,           [rax] | 
 |         pmaddubsw       xmm3,           [rax] | 
 |  | 
 |         paddw           xmm1,           [GLOBAL(xmm_bi_rd)] | 
 |         paddw           xmm3,           [GLOBAL(xmm_bi_rd)] | 
 |         psraw           xmm1,           xmm_filter_shift | 
 |         psraw           xmm3,           xmm_filter_shift | 
 |         packuswb        xmm1,           xmm3 | 
 |  | 
 |         ; second pass: vertical filter of previous row (xmm0) and this row (xmm1) | 
 |         movdqa          xmm2,           xmm0 | 
 |         movdqa          xmm0,           xmm1                 ; this row becomes the previous row | 
 |         movdqa          xmm3,           xmm2 | 
 |  | 
 |         punpcklbw       xmm2,           xmm1 | 
 |         punpckhbw       xmm3,           xmm1 | 
 |         pmaddubsw       xmm2,           [rdx] | 
 |         pmaddubsw       xmm3,           [rdx] | 
 |  | 
 |         paddw           xmm2,           [GLOBAL(xmm_bi_rd)] | 
 |         paddw           xmm3,           [GLOBAL(xmm_bi_rd)] | 
 |         psraw           xmm2,           xmm_filter_shift | 
 |         psraw           xmm3,           xmm_filter_shift | 
 |  | 
 |         ; widen the 16 src pixels to words | 
 |         movq            xmm1,           QWORD PTR [rdi] | 
 |         punpcklbw       xmm1,           xmm4 | 
 |         movq            xmm5,           QWORD PTR [rdi+8] | 
 |         punpcklbw       xmm5,           xmm4 | 
 |  | 
 |         ; accumulate difference and squared difference | 
 |         psubw           xmm2,           xmm1 | 
 |         psubw           xmm3,           xmm5 | 
 |         paddw           xmm6,           xmm2 | 
 |         paddw           xmm6,           xmm3 | 
 |         pmaddwd         xmm2,           xmm2 | 
 |         pmaddwd         xmm3,           xmm3 | 
 |         paddd           xmm7,           xmm2 | 
 |         paddd           xmm7,           xmm3 | 
 |  | 
 | %if ABI_IS_32BIT | 
 |         add             rsi,            dword ptr arg(1)     ;ref_pixels_per_line | 
 |         add             rdi,            dword ptr arg(3)     ;src_pixels_per_line | 
 | %else | 
 |         lea             rsi,            [rsi + r8] | 
 |         lea             rdi,            [rdi + r9] | 
 | %endif | 
 |  | 
 |         sub             rcx,            1 | 
 |         jnz             .filter_block2d_bil_var_ssse3_loop | 
 |  | 
 |         jmp             .filter_block2d_bil_variance | 
 |  | 
 | .filter_block2d_bil_var_ssse3_sp_only: | 
 |         ; xoffset == 0: vertical filter only. rcx still holds the filter table base. | 
 |         movsxd          rdx,            dword ptr arg(6)     ; yoffset | 
 |  | 
 |         test            rdx,            rdx                  ; Both xoffset =0 and yoffset=0 | 
 |         jz              .filter_block2d_bil_var_ssse3_full_pixel | 
 |  | 
 |         shl             rdx,            4 | 
 |         lea             rdx,            [rdx + rcx]          ; VFilter | 
 |  | 
 |         mov             rsi,            arg(0)               ;ref_ptr | 
 |         mov             rdi,            arg(2)               ;src_ptr | 
 |         movsxd          rcx,            dword ptr arg(4)     ;Height | 
 |         movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line | 
 |  | 
 |         movdqu          xmm1,           XMMWORD PTR [rsi]    ; first ref row = initial previous row | 
 |         pxor            xmm4,           xmm4                 ; zero for byte->word unpack, constant across the loop | 
 |  | 
 | %if ABI_IS_32BIT=0 | 
 |         movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line | 
 | %endif | 
 |  | 
 |         lea             rsi,            [rsi + rax] | 
 |  | 
 | .filter_block2d_bil_sp_only_loop: | 
 |         movdqu          xmm3,           XMMWORD PTR [rsi] | 
 |         movdqa          xmm2,           xmm1 | 
 |         movdqa          xmm0,           xmm3                 ; keep this row for the next iteration | 
 |  | 
 |         punpcklbw       xmm1,           xmm3 | 
 |         punpckhbw       xmm2,           xmm3 | 
 |         pmaddubsw       xmm1,           [rdx] | 
 |         pmaddubsw       xmm2,           [rdx] | 
 |  | 
 |         paddw           xmm1,           [GLOBAL(xmm_bi_rd)] | 
 |         paddw           xmm2,           [GLOBAL(xmm_bi_rd)] | 
 |         psraw           xmm1,           xmm_filter_shift | 
 |         psraw           xmm2,           xmm_filter_shift | 
 |  | 
 |         movq            xmm3,           QWORD PTR [rdi] | 
 |         punpcklbw       xmm3,           xmm4 | 
 |         movq            xmm5,           QWORD PTR [rdi+8] | 
 |         punpcklbw       xmm5,           xmm4 | 
 |  | 
 |         psubw           xmm1,           xmm3 | 
 |         psubw           xmm2,           xmm5 | 
 |         paddw           xmm6,           xmm1 | 
 |         paddw           xmm6,           xmm2 | 
 |         pmaddwd         xmm1,           xmm1 | 
 |         pmaddwd         xmm2,           xmm2 | 
 |         paddd           xmm7,           xmm1 | 
 |         paddd           xmm7,           xmm2 | 
 |  | 
 |         movdqa          xmm1,           xmm0                 ; this row becomes the previous row | 
 |         lea             rsi,            [rsi + rax]          ;ref_pixels_per_line | 
 |  | 
 | %if ABI_IS_32BIT | 
 |         add             rdi,            dword ptr arg(3)     ;src_pixels_per_line | 
 | %else | 
 |         lea             rdi,            [rdi + r9] | 
 | %endif | 
 |  | 
 |         sub             rcx,            1 | 
 |         jnz             .filter_block2d_bil_sp_only_loop | 
 |  | 
 |         jmp             .filter_block2d_bil_variance | 
 |  | 
 | .filter_block2d_bil_var_ssse3_full_pixel: | 
 |         ; xoffset == 0 and yoffset == 0: plain difference, no filtering | 
 |         mov             rsi,            arg(0)               ;ref_ptr | 
 |         mov             rdi,            arg(2)               ;src_ptr | 
 |         movsxd          rcx,            dword ptr arg(4)     ;Height | 
 |         movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line | 
 |         movsxd          rdx,            dword ptr arg(3)     ;src_pixels_per_line | 
 |         pxor            xmm0,           xmm0                 ; zero for byte->word unpack | 
 |  | 
 | .filter_block2d_bil_full_pixel_loop: | 
 |         movq            xmm1,           QWORD PTR [rsi] | 
 |         punpcklbw       xmm1,           xmm0 | 
 |         movq            xmm2,           QWORD PTR [rsi+8] | 
 |         punpcklbw       xmm2,           xmm0 | 
 |  | 
 |         movq            xmm3,           QWORD PTR [rdi] | 
 |         punpcklbw       xmm3,           xmm0 | 
 |         movq            xmm4,           QWORD PTR [rdi+8] | 
 |         punpcklbw       xmm4,           xmm0 | 
 |  | 
 |         psubw           xmm1,           xmm3 | 
 |         psubw           xmm2,           xmm4 | 
 |         paddw           xmm6,           xmm1 | 
 |         paddw           xmm6,           xmm2 | 
 |         pmaddwd         xmm1,           xmm1 | 
 |         pmaddwd         xmm2,           xmm2 | 
 |         paddd           xmm7,           xmm1 | 
 |         paddd           xmm7,           xmm2 | 
 |  | 
 |         lea             rsi,            [rsi + rax]          ;ref_pixels_per_line | 
 |         lea             rdi,            [rdi + rdx]          ;src_pixels_per_line | 
 |         sub             rcx,            1 | 
 |         jnz             .filter_block2d_bil_full_pixel_loop | 
 |  | 
 |         jmp             .filter_block2d_bil_variance | 
 |  | 
 | .filter_block2d_bil_var_ssse3_fp_only: | 
 |         ; yoffset == 0: horizontal filter only. rax still holds HFilter. | 
 |         mov             rsi,            arg(0)               ;ref_ptr | 
 |         mov             rdi,            arg(2)               ;src_ptr | 
 |         movsxd          rcx,            dword ptr arg(4)     ;Height | 
 |         movsxd          rdx,            dword ptr arg(1)     ;ref_pixels_per_line | 
 |  | 
 |         pxor            xmm4,           xmm4                 ; zero for byte->word unpack, constant across the loop | 
 |  | 
 | %if ABI_IS_32BIT=0 | 
 |         movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line | 
 | %endif | 
 |  | 
 | .filter_block2d_bil_fp_only_loop: | 
 |         movdqu          xmm1,           XMMWORD PTR [rsi] | 
 |         movdqu          xmm2,           XMMWORD PTR [rsi+1] | 
 |         movdqa          xmm3,           xmm1 | 
 |  | 
 |         punpcklbw       xmm1,           xmm2 | 
 |         punpckhbw       xmm3,           xmm2 | 
 |         pmaddubsw       xmm1,           [rax] | 
 |         pmaddubsw       xmm3,           [rax] | 
 |  | 
 |         paddw           xmm1,           [GLOBAL(xmm_bi_rd)] | 
 |         paddw           xmm3,           [GLOBAL(xmm_bi_rd)] | 
 |         psraw           xmm1,           xmm_filter_shift | 
 |         psraw           xmm3,           xmm_filter_shift | 
 |  | 
 |         movq            xmm2,           QWORD PTR [rdi] | 
 |         punpcklbw       xmm2,           xmm4 | 
 |         movq            xmm5,           QWORD PTR [rdi+8] | 
 |         punpcklbw       xmm5,           xmm4 | 
 |  | 
 |         psubw           xmm1,           xmm2 | 
 |         psubw           xmm3,           xmm5 | 
 |         paddw           xmm6,           xmm1 | 
 |         paddw           xmm6,           xmm3 | 
 |         pmaddwd         xmm1,           xmm1 | 
 |         pmaddwd         xmm3,           xmm3 | 
 |         paddd           xmm7,           xmm1 | 
 |         paddd           xmm7,           xmm3 | 
 |  | 
 |         lea             rsi,            [rsi + rdx] | 
 | %if ABI_IS_32BIT | 
 |         add             rdi,            dword ptr arg(3)     ;src_pixels_per_line | 
 | %else | 
 |         lea             rdi,            [rdi + r9] | 
 | %endif | 
 |  | 
 |         sub             rcx,            1 | 
 |         jnz             .filter_block2d_bil_fp_only_loop | 
 |  | 
 |         ; fall through to the reduction below | 
 |  | 
 | .filter_block2d_bil_variance: | 
 |         ; reduce xmm6 (8 signed words) and xmm7 (4 dwords) to one dword each | 
 |         pxor        xmm0,           xmm0 | 
 |         pxor        xmm1,           xmm1 | 
 |         pxor        xmm5,           xmm5 | 
 |  | 
 |         ; place each sum word in the high half of a dword, then shift it | 
 |         ; back down arithmetically to sign-extend it to 32 bits | 
 |         punpcklwd   xmm0,           xmm6 | 
 |         punpckhwd   xmm1,           xmm6 | 
 |         psrad       xmm0,           16 | 
 |         psrad       xmm1,           16 | 
 |         paddd       xmm0,           xmm1 | 
 |         movdqa      xmm1,           xmm0 | 
 |  | 
 |         ; fold dwords 0/1 with dwords 2/3 of the squared sums | 
 |         movdqa      xmm6,           xmm7 | 
 |         punpckldq   xmm6,           xmm5 | 
 |         punpckhdq   xmm7,           xmm5 | 
 |         paddd       xmm6,           xmm7 | 
 |  | 
 |         ; same fold for the sums of differences | 
 |         punpckldq   xmm0,           xmm5 | 
 |         punpckhdq   xmm1,           xmm5 | 
 |         paddd       xmm0,           xmm1 | 
 |  | 
 |         movdqa      xmm7,           xmm6 | 
 |         movdqa      xmm1,           xmm0 | 
 |  | 
 |         ; add the upper 8 bytes onto the lower 8; totals end up in the low dword | 
 |         psrldq      xmm7,           8 | 
 |         psrldq      xmm1,           8 | 
 |  | 
 |         paddd       xmm6,           xmm7 | 
 |         paddd       xmm0,           xmm1 | 
 |  | 
 |         mov         rsi,            arg(7) ;[Sum] | 
 |         mov         rdi,            arg(8) ;[SSE] | 
 |  | 
 |         movd        [rsi],       xmm0 | 
 |         movd        [rdi],       xmm6 | 
 |  | 
 |     ; begin epilog | 
 |     pop rdi | 
 |     pop rsi | 
 |     RESTORE_GOT | 
 |     RESTORE_XMM | 
 |     UNSHADOW_ARGS | 
 |     pop         rbp | 
 |     ret | 
 |  | 
 |  | 
 | SECTION_RODATA | 
 | align 16 | 
 | ; Rounding term added to every 16-bit filter result before the arithmetic | 
 | ; right shift by xmm_filter_shift: half the divisor, 1 << (shift - 1) = 64. | 
 | xmm_bi_rd: | 
 |     times 8 dw (1 << (xmm_filter_shift - 1)) | 
 | align 16 | 
 | ; Bilinear tap table: 16 rows of 16 bytes, one row per sub-pixel offset. | 
 | ; Row n holds the byte pair (128 - 8*n, 8*n) repeated eight times, so a row | 
 | ; can be used directly as the memory operand of pmaddubsw on interleaved pixels. | 
 | ; Row 0 (128, 0) is never selected by the code: a zero offset takes a | 
 | ; separate path because 128 does not fit in a signed byte. | 
 | bilinear_filters_ssse3: | 
 | %assign bil_tap 0 | 
 | %rep 16 | 
 |     times 8 db (128 - bil_tap), bil_tap | 
 | %assign bil_tap bil_tap+8 | 
 | %endrep | 