| ; | 
 | ; Copyright (c) 2016, Alliance for Open Media. All rights reserved | 
 | ; | 
 | ; This source code is subject to the terms of the BSD 2 Clause License and | 
 | ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License | 
 | ; was not distributed with this source code in the LICENSE file, you can | 
 | ; obtain it at www.aomedia.org/license/software. If the Alliance for Open | 
 | ; Media Patent License 1.0 was not distributed with this source code in the | 
 | ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. | 
 | ; | 
 |  | 
 | ; | 
 |  | 
 | %include "aom_ports/x86_abi_support.asm" | 
 |  | 
 | ;----------------------------------------------------------------------------- | 
 | ; GET_PARAM_4 | 
 | ; Register setup shared by the 4-pixel-wide kernels. | 
 | ; Out:  rsi  = arg(0) src_ptr | 
 | ;       rdi  = arg(2) output_ptr | 
 | ;       rax  = arg(1) pixels_per_line (dword, sign-extended) | 
 | ;       rdx  = arg(3) out_pitch       (dword, sign-extended) | 
 | ;       rcx  = arg(4) output_height   (dword, sign-extended) | 
 | ;       xmm3 = low 8 bytes: filter words 3 and 4 (k3,k4) saturated to signed | 
 | ;              bytes, the pair repeated four times | 
 | ;       xmm2 = 0x0100 in every 16-bit lane (multiplier for pmulhrsw) | 
 | ; Note: ecx and rdx are scratch for the constant and the filter pointer | 
 | ;       before they receive their final values, so those uses must come | 
 | ;       first. arg(5) is read with movdqa and must be 16-byte aligned. | 
 | ;----------------------------------------------------------------------------- | 
 | %macro GET_PARAM_4 0 | 
 |     ; rounding/shift multiplier: 0x0100 replicated into every word | 
 |     mov         ecx, (0x0100 << 16) + 0x0100 | 
 |     movd        xmm2, ecx | 
 |     pshufd      xmm2, xmm2, 0 | 
 |  | 
 |     ; filter taps: keep only the k3,k4 pair as signed bytes | 
 |     mov         rdx, arg(5)                 ; rdx = filter ptr (temporary) | 
 |     movdqa      xmm3, [rdx] | 
 |     psrldq      xmm3, 6                     ; drop the first three words | 
 |     packsswb    xmm3, xmm3                  ; words -> saturated signed bytes | 
 |     pshuflw     xmm3, xmm3, 0               ; broadcast word 0 = (k3,k4) | 
 |  | 
 |     ; pointers, strides and row count | 
 |     mov         rsi, arg(0)                 ; rsi = src_ptr | 
 |     mov         rdi, arg(2)                 ; rdi = output_ptr | 
 |     movsxd      rax, DWORD PTR arg(1)       ; rax = pixels_per_line | 
 |     movsxd      rdx, DWORD PTR arg(3)       ; rdx = out_pitch | 
 |     movsxd      rcx, DWORD PTR arg(4)       ; rcx = output_height | 
 | %endm | 
 |  | 
 | ;----------------------------------------------------------------------------- | 
 | ; APPLY_FILTER_4 avg | 
 | ; Filters one row of 4 pixels and steps to the next row. | 
 | ; In:   xmm0 = first-tap source bytes, xmm1 = second-tap source bytes | 
 | ;       xmm3 = (k3,k4) byte pairs, xmm2 = 0x0100 words (see GET_PARAM_4) | 
 | ;       rsi/rdi = source/destination row, rax/rdx = their strides | 
 | ;       rcx = rows remaining | 
 | ; Arg:  avg - when non-zero, the result is averaged (pavgb) with the 4 bytes | 
 | ;             already stored at [rdi] before being written back | 
 | ; Out:  4 bytes stored at [rdi]; rsi += rax; rdi += rdx; rcx -= 1. | 
 | ;       The final dec leaves ZF for the caller's loop branch. | 
 | ; Clobbers: xmm0, xmm1 (only when avg is non-zero), flags. | 
 | ;----------------------------------------------------------------------------- | 
 | %macro APPLY_FILTER_4 1 | 
 |     punpcklbw   xmm0, xmm1                  ; interleave as (p0,p1) byte pairs | 
 |     pmaddubsw   xmm0, xmm3                  ; word = p0*k3 + p1*k4 | 
 |     pmulhrsw    xmm0, xmm2                  ; (word + 64) >> 7 | 
 |     packuswb    xmm0, xmm0                  ; saturate words to unsigned bytes | 
 |  | 
 | %if %1 | 
 |     movd        xmm1, [rdi]                 ; existing destination pixels | 
 |     pavgb       xmm0, xmm1 | 
 | %endif | 
 |     movd        [rdi], xmm0                 ; write 4 output pixels | 
 |  | 
 |     lea         rdi, [rdi + rdx]            ; next destination row | 
 |     lea         rsi, [rsi + rax]            ; next source row | 
 |     dec         rcx                         ; sets ZF when no rows remain | 
 | %endm | 
 |  | 
 | ;----------------------------------------------------------------------------- | 
 | ; GET_PARAM | 
 | ; Register setup shared by the 8- and 16-pixel-wide kernels. | 
 | ; Out:  rsi  = arg(0) src_ptr | 
 | ;       rdi  = arg(2) output_ptr | 
 | ;       rax  = arg(1) pixels_per_line (dword, sign-extended) | 
 | ;       rdx  = arg(3) out_pitch       (dword, sign-extended) | 
 | ;       rcx  = arg(4) output_height   (dword, sign-extended) | 
 | ;       xmm7 = filter words 3 and 4 (k3,k4) saturated to signed bytes, the | 
 | ;              pair repeated across all 16 bytes | 
 | ;       xmm6 = 0x0100 in every 16-bit lane (multiplier for pmulhrsw) | 
 | ; Note: rdx and ecx are scratch for the filter pointer and the constant | 
 | ;       before they receive their final values, so those uses must come | 
 | ;       first. arg(5) is read with movdqa and must be 16-byte aligned. | 
 | ;----------------------------------------------------------------------------- | 
 | %macro GET_PARAM 0 | 
 |     mov         rsi, arg(0)                 ; rsi = src_ptr | 
 |     mov         rdi, arg(2)                 ; rdi = output_ptr | 
 |  | 
 |     ; filter taps: keep only the k3,k4 pair as signed bytes, in every lane | 
 |     mov         rdx, arg(5)                 ; rdx = filter ptr (temporary) | 
 |     movdqa      xmm7, [rdx] | 
 |     psrldq      xmm7, 6                     ; drop the first three words | 
 |     packsswb    xmm7, xmm7                  ; words -> saturated signed bytes | 
 |     pshuflw     xmm7, xmm7, 0               ; low 4 words = (k3,k4) | 
 |     punpcklwd   xmm7, xmm7                  ; spread to all 8 words | 
 |  | 
 |     ; rounding/shift multiplier: 0x0100 replicated into every word | 
 |     mov         ecx, (0x0100 << 16) + 0x0100 | 
 |     movd        xmm6, ecx | 
 |     pshufd      xmm6, xmm6, 0 | 
 |  | 
 |     movsxd      rax, DWORD PTR arg(1)       ; rax = pixels_per_line | 
 |     movsxd      rdx, DWORD PTR arg(3)       ; rdx = out_pitch | 
 |     movsxd      rcx, DWORD PTR arg(4)       ; rcx = output_height | 
 | %endm | 
 |  | 
 | ;----------------------------------------------------------------------------- | 
 | ; APPLY_FILTER_8 avg | 
 | ; Filters one row of 8 pixels and steps to the next row. | 
 | ; In:   xmm0 = first-tap source bytes, xmm1 = second-tap source bytes | 
 | ;       xmm7 = (k3,k4) byte pairs, xmm6 = 0x0100 words (see GET_PARAM) | 
 | ;       rsi/rdi = source/destination row, rax/rdx = their strides | 
 | ;       rcx = rows remaining | 
 | ; Arg:  avg - when non-zero, the result is averaged (pavgb) with the 8 bytes | 
 | ;             already stored at [rdi] before being written back | 
 | ; Out:  8 bytes stored at [rdi]; rsi += rax; rdi += rdx; rcx -= 1. | 
 | ;       The final dec leaves ZF for the caller's loop branch. | 
 | ; Clobbers: xmm0, xmm1 (only when avg is non-zero), flags. | 
 | ;----------------------------------------------------------------------------- | 
 | %macro APPLY_FILTER_8 1 | 
 |     punpcklbw   xmm0, xmm1                  ; interleave as (p0,p1) byte pairs | 
 |     pmaddubsw   xmm0, xmm7                  ; word = p0*k3 + p1*k4 | 
 |     pmulhrsw    xmm0, xmm6                  ; (word + 64) >> 7 | 
 |     packuswb    xmm0, xmm0                  ; saturate words to unsigned bytes | 
 |  | 
 | %if %1 | 
 |     movq        xmm1, [rdi]                 ; existing destination pixels | 
 |     pavgb       xmm0, xmm1 | 
 | %endif | 
 |     movq        [rdi], xmm0                 ; write 8 output pixels | 
 |  | 
 |     lea         rdi, [rdi + rdx]            ; next destination row | 
 |     lea         rsi, [rsi + rax]            ; next source row | 
 |     dec         rcx                         ; sets ZF when no rows remain | 
 | %endm | 
 |  | 
 | ;----------------------------------------------------------------------------- | 
 | ; APPLY_FILTER_16 avg | 
 | ; Filters one row of 16 pixels and steps to the next row. | 
 | ; In:   xmm0 = first-tap source bytes, xmm2 = a copy of xmm0 | 
 | ;       xmm1 = second-tap source bytes | 
 | ;       xmm7 = (k3,k4) byte pairs, xmm6 = 0x0100 words (see GET_PARAM) | 
 | ;       rsi/rdi = source/destination row, rax/rdx = their strides | 
 | ;       rcx = rows remaining | 
 | ; Arg:  avg - when non-zero, the result is averaged (pavgb) with the 16 bytes | 
 | ;             already stored at [rdi] before being written back | 
 | ; Out:  16 bytes stored at [rdi]; rsi += rax; rdi += rdx; rcx -= 1. | 
 | ;       The final dec leaves ZF for the caller's loop branch. | 
 | ; Clobbers: xmm0, xmm2, xmm1 (only when avg is non-zero), flags. | 
 | ;----------------------------------------------------------------------------- | 
 | %macro APPLY_FILTER_16 1 | 
 |     ; pixels 0..7 | 
 |     punpcklbw   xmm0, xmm1                  ; low halves as (p0,p1) byte pairs | 
 |     pmaddubsw   xmm0, xmm7                  ; word = p0*k3 + p1*k4 | 
 |     pmulhrsw    xmm0, xmm6                  ; (word + 64) >> 7 | 
 |  | 
 |     ; pixels 8..15 | 
 |     punpckhbw   xmm2, xmm1                  ; high halves as (p0,p1) byte pairs | 
 |     pmaddubsw   xmm2, xmm7 | 
 |     pmulhrsw    xmm2, xmm6 | 
 |  | 
 |     packuswb    xmm0, xmm2                  ; 16 words -> 16 unsigned bytes | 
 |  | 
 | %if %1 | 
 |     movdqu      xmm1, [rdi]                 ; existing destination pixels | 
 |     pavgb       xmm0, xmm1 | 
 | %endif | 
 |     movdqu      [rdi], xmm0                 ; write 16 output pixels | 
 |  | 
 |     lea         rdi, [rdi + rdx]            ; next destination row | 
 |     lea         rsi, [rsi + rax]            ; next source row | 
 |     dec         rcx                         ; sets ZF when no rows remain | 
 | %endm | 
 |  | 
 | SECTION .text | 
 |  | 
 | ;----------------------------------------------------------------------------- | 
 | ; aom_filter_block1d4_v2_ssse3 | 
 | ; 2-tap filter combining each source row with the row at | 
 | ; [src + pixels_per_line]; writes 4 bytes per output row. | 
 | ; Args (as read by GET_PARAM_4): arg(0) src_ptr, arg(1) pixels_per_line, | 
 | ;       arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height, | 
 | ;       arg(5) filter ptr. | 
 | ; The row counter is tested only after the loop body, so output_height must | 
 | ; be at least 1. | 
 | ; Only xmm0-xmm3 are used, so there is no SAVE_XMM/RESTORE_XMM pair here. | 
 | ;----------------------------------------------------------------------------- | 
 | global sym(aom_filter_block1d4_v2_ssse3) PRIVATE | 
 | sym(aom_filter_block1d4_v2_ssse3): | 
 |     ; prolog | 
 |     push        rbp | 
 |     mov         rbp, rsp | 
 |     SHADOW_ARGS_TO_STACK 6 | 
 |     push        rsi | 
 |     push        rdi | 
 |  | 
 |     GET_PARAM_4 | 
 | .next_row: | 
 |     movd        xmm1, [rsi + rax]           ; 4 pixels of the following row | 
 |     movd        xmm0, [rsi]                 ; 4 pixels of the current row | 
 |  | 
 |     APPLY_FILTER_4 0                        ; store, advance, dec rcx | 
 |     jnz         .next_row                   ; ZF comes from the macro's dec | 
 |  | 
 |     ; epilog | 
 |     pop         rdi | 
 |     pop         rsi | 
 |     UNSHADOW_ARGS | 
 |     pop         rbp | 
 |     ret | 
 |  | 
 | ;----------------------------------------------------------------------------- | 
 | ; aom_filter_block1d8_v2_ssse3 | 
 | ; 2-tap filter combining each source row with the row at | 
 | ; [src + pixels_per_line]; writes 8 bytes per output row. | 
 | ; Args (as read by GET_PARAM): arg(0) src_ptr, arg(1) pixels_per_line, | 
 | ;       arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height, | 
 | ;       arg(5) filter ptr. | 
 | ; The row counter is tested only after the loop body, so output_height must | 
 | ; be at least 1. | 
 | ; GET_PARAM writes xmm6/xmm7; SAVE_XMM 7 / RESTORE_XMM (from | 
 | ; x86_abi_support.asm) presumably preserve them where the ABI requires it. | 
 | ;----------------------------------------------------------------------------- | 
 | global sym(aom_filter_block1d8_v2_ssse3) PRIVATE | 
 | sym(aom_filter_block1d8_v2_ssse3): | 
 |     ; prolog | 
 |     push        rbp | 
 |     mov         rbp, rsp | 
 |     SHADOW_ARGS_TO_STACK 6 | 
 |     SAVE_XMM 7 | 
 |     push        rsi | 
 |     push        rdi | 
 |  | 
 |     GET_PARAM | 
 | .next_row: | 
 |     movq        xmm1, [rsi + rax]           ; 8 pixels of the following row | 
 |     movq        xmm0, [rsi]                 ; 8 pixels of the current row | 
 |  | 
 |     APPLY_FILTER_8 0                        ; store, advance, dec rcx | 
 |     jnz         .next_row                   ; ZF comes from the macro's dec | 
 |  | 
 |     ; epilog | 
 |     pop         rdi | 
 |     pop         rsi | 
 |     RESTORE_XMM | 
 |     UNSHADOW_ARGS | 
 |     pop         rbp | 
 |     ret | 
 |  | 
 | ;----------------------------------------------------------------------------- | 
 | ; aom_filter_block1d16_v2_ssse3 | 
 | ; 2-tap filter combining each source row with the row at | 
 | ; [src + pixels_per_line]; writes 16 bytes per output row. | 
 | ; Args (as read by GET_PARAM): arg(0) src_ptr, arg(1) pixels_per_line, | 
 | ;       arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height, | 
 | ;       arg(5) filter ptr. | 
 | ; The row counter is tested only after the loop body, so output_height must | 
 | ; be at least 1. | 
 | ; GET_PARAM writes xmm6/xmm7; SAVE_XMM 7 / RESTORE_XMM (from | 
 | ; x86_abi_support.asm) presumably preserve them where the ABI requires it. | 
 | ;----------------------------------------------------------------------------- | 
 | global sym(aom_filter_block1d16_v2_ssse3) PRIVATE | 
 | sym(aom_filter_block1d16_v2_ssse3): | 
 |     ; prolog | 
 |     push        rbp | 
 |     mov         rbp, rsp | 
 |     SHADOW_ARGS_TO_STACK 6 | 
 |     SAVE_XMM 7 | 
 |     push        rsi | 
 |     push        rdi | 
 |  | 
 |     GET_PARAM | 
 | .next_row: | 
 |     movdqu      xmm0, [rsi]                 ; 16 pixels of the current row | 
 |     movdqa      xmm2, xmm0                  ; copy for the high-half unpack | 
 |     movdqu      xmm1, [rsi + rax]           ; 16 pixels of the following row | 
 |  | 
 |     APPLY_FILTER_16 0                       ; store, advance, dec rcx | 
 |     jnz         .next_row                   ; ZF comes from the macro's dec | 
 |  | 
 |     ; epilog | 
 |     pop         rdi | 
 |     pop         rsi | 
 |     RESTORE_XMM | 
 |     UNSHADOW_ARGS | 
 |     pop         rbp | 
 |     ret | 
 |  | 
 | ;----------------------------------------------------------------------------- | 
 | ; aom_filter_block1d4_h2_ssse3 | 
 | ; 2-tap filter combining each source byte with its right-hand neighbour | 
 | ; ([src + 1]); writes 4 bytes per output row. | 
 | ; Args (as read by GET_PARAM_4): arg(0) src_ptr, arg(1) pixels_per_line, | 
 | ;       arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height, | 
 | ;       arg(5) filter ptr. | 
 | ; The row counter is tested only after the loop body, so output_height must | 
 | ; be at least 1. | 
 | ; Reads exactly source bytes 0..4 of each row: two 4-byte loads one byte | 
 | ; apart (the same two-load pattern the 16-wide horizontal kernel uses), | 
 | ; instead of a 16-byte load followed by a byte shift. The 4 stored bytes | 
 | ; depend only on those 5 source bytes, so the output is unchanged. | 
 | ; Only xmm0-xmm3 are used, so there is no SAVE_XMM/RESTORE_XMM pair here. | 
 | ;----------------------------------------------------------------------------- | 
 | global sym(aom_filter_block1d4_h2_ssse3) PRIVATE | 
 | sym(aom_filter_block1d4_h2_ssse3): | 
 |     push        rbp | 
 |     mov         rbp, rsp | 
 |     SHADOW_ARGS_TO_STACK 6 | 
 |     push        rsi | 
 |     push        rdi | 
 |     ; end prolog | 
 |  | 
 |     GET_PARAM_4 | 
 | .loop: | 
 |     movd        xmm0, [rsi]                 ;pixels x+0 .. x+3 | 
 |     movd        xmm1, [rsi + 1]             ;pixels x+1 .. x+4 | 
 |  | 
 |     APPLY_FILTER_4 0 | 
 |     jnz         .loop                       ;ZF from dec rcx in the macro | 
 |  | 
 |     ; begin epilog | 
 |     pop         rdi | 
 |     pop         rsi | 
 |     UNSHADOW_ARGS | 
 |     pop         rbp | 
 |     ret | 
 |  | 
 | ;----------------------------------------------------------------------------- | 
 | ; aom_filter_block1d8_h2_ssse3 | 
 | ; 2-tap filter combining each source byte with its right-hand neighbour | 
 | ; ([src + 1]); writes 8 bytes per output row. | 
 | ; Args (as read by GET_PARAM): arg(0) src_ptr, arg(1) pixels_per_line, | 
 | ;       arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height, | 
 | ;       arg(5) filter ptr. | 
 | ; The row counter is tested only after the loop body, so output_height must | 
 | ; be at least 1. | 
 | ; Reads exactly source bytes 0..8 of each row: two 8-byte loads one byte | 
 | ; apart (the same two-load pattern the 16-wide horizontal kernel uses), | 
 | ; instead of a 16-byte load followed by a byte shift. APPLY_FILTER_8 only | 
 | ; consumes the low 8 bytes of xmm0/xmm1, so the output is unchanged. | 
 | ; GET_PARAM writes xmm6/xmm7; SAVE_XMM 7 / RESTORE_XMM (from | 
 | ; x86_abi_support.asm) presumably preserve them where the ABI requires it. | 
 | ;----------------------------------------------------------------------------- | 
 | global sym(aom_filter_block1d8_h2_ssse3) PRIVATE | 
 | sym(aom_filter_block1d8_h2_ssse3): | 
 |     push        rbp | 
 |     mov         rbp, rsp | 
 |     SHADOW_ARGS_TO_STACK 6 | 
 |     SAVE_XMM 7 | 
 |     push        rsi | 
 |     push        rdi | 
 |     ; end prolog | 
 |  | 
 |     GET_PARAM | 
 | .loop: | 
 |     movq        xmm0, [rsi]                 ;pixels x+0 .. x+7 | 
 |     movq        xmm1, [rsi + 1]             ;pixels x+1 .. x+8 | 
 |  | 
 |     APPLY_FILTER_8 0 | 
 |     jnz         .loop                       ;ZF from dec rcx in the macro | 
 |  | 
 |     ; begin epilog | 
 |     pop         rdi | 
 |     pop         rsi | 
 |     RESTORE_XMM | 
 |     UNSHADOW_ARGS | 
 |     pop         rbp | 
 |     ret | 
 |  | 
 | ;----------------------------------------------------------------------------- | 
 | ; aom_filter_block1d16_h2_ssse3 | 
 | ; 2-tap filter combining each source byte with its right-hand neighbour | 
 | ; ([src + 1]); writes 16 bytes per output row and reads source bytes 0..16. | 
 | ; Args (as read by GET_PARAM): arg(0) src_ptr, arg(1) pixels_per_line, | 
 | ;       arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height, | 
 | ;       arg(5) filter ptr. | 
 | ; The row counter is tested only after the loop body, so output_height must | 
 | ; be at least 1. | 
 | ; GET_PARAM writes xmm6/xmm7; SAVE_XMM 7 / RESTORE_XMM (from | 
 | ; x86_abi_support.asm) presumably preserve them where the ABI requires it. | 
 | ;----------------------------------------------------------------------------- | 
 | global sym(aom_filter_block1d16_h2_ssse3) PRIVATE | 
 | sym(aom_filter_block1d16_h2_ssse3): | 
 |     ; prolog | 
 |     push        rbp | 
 |     mov         rbp, rsp | 
 |     SHADOW_ARGS_TO_STACK 6 | 
 |     SAVE_XMM 7 | 
 |     push        rsi | 
 |     push        rdi | 
 |  | 
 |     GET_PARAM | 
 | .next_row: | 
 |     movdqu      xmm0, [rsi]                 ; pixels x+0 .. x+15 | 
 |     movdqa      xmm2, xmm0                  ; copy for the high-half unpack | 
 |     movdqu      xmm1, [rsi + 1]             ; pixels x+1 .. x+16 | 
 |  | 
 |     APPLY_FILTER_16 0                       ; store, advance, dec rcx | 
 |     jnz         .next_row                   ; ZF comes from the macro's dec | 
 |  | 
 |     ; epilog | 
 |     pop         rdi | 
 |     pop         rsi | 
 |     RESTORE_XMM | 
 |     UNSHADOW_ARGS | 
 |     pop         rbp | 
 |     ret | 