| ; | 
 | ; Copyright (c) 2021, Alliance for Open Media. All rights reserved | 
 | ; | 
 | ; This source code is subject to the terms of the BSD 3-Clause Clear License and the | 
 | ; Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear License was | 
 | ; not distributed with this source code in the LICENSE file, you can obtain it | 
 | ; at aomedia.org/license/software-license/bsd-3-c-c/.  If the Alliance for Open Media Patent | 
 | ; License 1.0 was not distributed with this source code in the PATENTS file, you | 
 | ; can obtain it at aomedia.org/license/patent-license/. | 
 | ; | 
 |  | 
 | ; | 
 |  | 
 | %include "aom_ports/x86_abi_support.asm" | 
 |  | 
 | ; HIGH_GET_PARAM_4 | 
 | ; Fetches the seven stack-shadowed arguments and prepares the constants | 
 | ; used by the 4-pixel kernels. State on exit: | 
 | ;   rsi  = src_ptr            rdi  = output_ptr | 
 | ;   rax  = pixels_per_line    rdx  = out_pitch     rcx = output_height | 
 | ;   xmm4 = taps k3,k4 interleaved as word pairs | 
 | ;   xmm3 = 0x40 in every dword (rounding term) | 
 | ;   xmm5 = (1 << bps) - 1 in every word (upper clamp bound) | 
 | ;   xmm2 = all zero (lower clamp bound) | 
 | ; xmm1 is overwritten as scratch. | 
 | %macro HIGH_GET_PARAM_4 0 | 
 |     mov         rsi, arg(0)                 ;src_ptr | 
 |     mov         rdi, arg(2)                 ;output_ptr | 
 |     mov         rdx, arg(5)                 ;filter ptr | 
 |  | 
 |     ;pick words 3 and 4 of the filter and interleave them | 
 |     movdqa      xmm3, [rdx]                 ;all eight filter words | 
 |     pshuflw     xmm4, xmm3, 0xff            ;low words <- k3 | 
 |     psrldq      xmm3, 8                     ;k4 moves down to word 0 | 
 |     pshuflw     xmm3, xmm3, 0               ;low words <- k4 | 
 |     punpcklwd   xmm4, xmm3                  ;k3,k4,k3,k4,... | 
 |  | 
 |     ;rounding term replicated to all four dwords | 
 |     mov         rcx, 0x40 | 
 |     movq        xmm3, rcx | 
 |     pshufd      xmm3, xmm3, 0 | 
 |  | 
 |     ;clamp bounds derived from the bit depth | 
 |     movsxd      rcx, DWORD PTR arg(6)       ;bps | 
 |     mov         rdx, 0x00010001 | 
 |     movq        xmm2, rcx                   ;shift count = bps | 
 |     movq        xmm5, rdx | 
 |     pshufd      xmm5, xmm5, 0               ;1 in every word | 
 |     movdqa      xmm1, xmm5                  ;keep a copy of the ones | 
 |     psllw       xmm5, xmm2                  ;1 << bps | 
 |     psubw       xmm5, xmm1                  ;(1 << bps) - 1 | 
 |     pxor        xmm2, xmm2                  ;zero | 
 |  | 
 |     ;32-bit integer arguments, sign-extended | 
 |     movsxd      rcx, DWORD PTR arg(4)       ;output_height | 
 |     movsxd      rdx, DWORD PTR arg(3)       ;out_pitch | 
 |     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line | 
 | %endm | 
 |  | 
 | ; HIGH_APPLY_FILTER_4 avg | 
 | ; Filters four 16-bit pixels and stores them at [rdi]. | 
 | ;   in : xmm0, xmm1 = the two input vectors (low four words are used) | 
 | ;        xmm4 = k3,k4 pairs, xmm3 = rounding, xmm5/xmm2 = clamp max/min | 
 | ;   %1 : when non-zero, the result is averaged (pavgw) with the words | 
 | ;        already stored at [rdi] before being written back | 
 | ; Afterwards rsi and rdi are advanced by rax and rdx words respectively | 
 | ; and rcx is decremented; the flags from that dec are left for the caller. | 
 | %macro HIGH_APPLY_FILTER_4 1 | 
 |  | 
 |     punpcklwd   xmm0, xmm1                  ;pair the two inputs per pixel | 
 |     pmaddwd     xmm0, xmm4                  ;a*k3 + b*k4 as dwords | 
 |  | 
 |     paddd       xmm0, xmm3                  ;add rounding term | 
 |     psrad       xmm0, 7                     ;arithmetic shift right by 7 | 
 |     packssdw    xmm0, xmm0                  ;dwords -> words, signed saturation | 
 |  | 
 |     ;limit each word to the range [xmm2, xmm5] | 
 |     pminsw      xmm0, xmm5 | 
 |     pmaxsw      xmm0, xmm2 | 
 |  | 
 | %if %1 | 
 |     movq        xmm1, [rdi]                 ;current output words | 
 |     pavgw       xmm0, xmm1                  ;rounded average with them | 
 | %endif | 
 |  | 
 |     movq        [rdi], xmm0                 ;write four pixels | 
 |     lea         rdi, [rdi + 2*rdx]          ;output += out_pitch words | 
 |     lea         rsi, [rsi + 2*rax]          ;source += pixels_per_line words | 
 |     dec         rcx                         ;row counter; ZF consumed by caller | 
 | %endm | 
 |  | 
 | ; HIGH_GET_PARAM | 
 | ; Argument/constant setup for the 8- and 16-pixel kernels. The expanding | 
 | ; function must have %define'd `max` and `min` as 16-byte memory operands | 
 | ; (they are written with movdqa). State on exit: | 
 | ;   rsi  = src_ptr            rdi  = output_ptr | 
 | ;   rax  = pixels_per_line    rdx  = out_pitch     rcx = output_height | 
 | ;   xmm7 = taps k3,k4 interleaved as word pairs | 
 | ;   xmm4 = 0x40 in every dword (rounding term) | 
 | ;   max  = (1 << bps) - 1 in every word, min = all zero | 
 | ; xmm1, xmm3, xmm5 and xmm6 are overwritten along the way. | 
 | %macro HIGH_GET_PARAM 0 | 
 |     mov         rsi, arg(0)                 ;src_ptr | 
 |     mov         rdi, arg(2)                 ;output_ptr | 
 |     mov         rdx, arg(5)                 ;filter ptr | 
 |  | 
 |     movdqa      xmm6, [rdx]                 ;all eight filter words | 
 |  | 
 |     pshuflw     xmm7, xmm6, 0xff            ;low words <- k3 | 
 |     pshufhw     xmm6, xmm6, 0               ;high words <- k4 | 
 |     psrldq      xmm6, 8                     ;k4 copies move to the low half | 
 |     punpcklwd   xmm7, xmm6                  ;k3,k4 repeated four times | 
 |  | 
 |     ;rounding term replicated to all four dwords | 
 |     mov         rcx, 0x40 | 
 |     movq        xmm4, rcx | 
 |     pshufd      xmm4, xmm4, 0 | 
 |  | 
 |     ;clamp bounds derived from the bit depth | 
 |     movsxd      rcx, DWORD PTR arg(6)       ;bps | 
 |     mov         rdx, 0x00010001 | 
 |     movq        xmm5, rcx                   ;shift count = bps | 
 |     movq        xmm3, rdx | 
 |     pshufd      xmm3, xmm3, 0               ;1 in every word | 
 |     movdqa      xmm1, xmm3                  ;keep a copy of the ones | 
 |     psllw       xmm3, xmm5                  ;1 << bps | 
 |     psubw       xmm3, xmm1                  ;(1 << bps) - 1 | 
 |     pxor        xmm5, xmm5                  ;zero | 
 |  | 
 |     ;park both bounds in memory so xmm3/xmm5 are free for the kernels | 
 |     movdqa      min, xmm5 | 
 |     movdqa      max, xmm3 | 
 |  | 
 |     ;32-bit integer arguments, sign-extended | 
 |     movsxd      rcx, DWORD PTR arg(4)       ;output_height | 
 |     movsxd      rdx, DWORD PTR arg(3)       ;out_pitch | 
 |     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line | 
 | %endm | 
 |  | 
 | ; HIGH_APPLY_FILTER_8 avg | 
 | ; Filters eight 16-bit pixels and stores them at [rdi]. | 
 | ;   in : xmm0, xmm1 = the two input vectors | 
 | ;        xmm7 = k3,k4 pairs, xmm4 = rounding, max/min = clamp bounds | 
 | ;   %1 : when non-zero, the result is averaged (pavgw) with the words | 
 | ;        already stored at [rdi] before being written back | 
 | ; xmm6 is scratch. rsi/rdi advance by rax/rdx words, rcx is decremented | 
 | ; and the flags from that dec are left for the caller. | 
 | %macro HIGH_APPLY_FILTER_8 1 | 
 |     movdqa      xmm6, xmm0                  ;copy for the upper four pixels | 
 |     punpcklwd   xmm0, xmm1                  ;pixels 0-3: interleave inputs | 
 |     punpckhwd   xmm6, xmm1                  ;pixels 4-7: interleave inputs | 
 |     pmaddwd     xmm0, xmm7                  ;a*k3 + b*k4 | 
 |     pmaddwd     xmm6, xmm7 | 
 |  | 
 |     paddd       xmm0, xmm4                  ;add rounding term | 
 |     paddd       xmm6, xmm4 | 
 |     psrad       xmm0, 7                     ;arithmetic shift right by 7 | 
 |     psrad       xmm6, 7 | 
 |     packssdw    xmm0, xmm6                  ;eight words, signed saturation | 
 |  | 
 |     ;limit each word to the range [min, max] | 
 |     pminsw      xmm0, max | 
 |     pmaxsw      xmm0, min | 
 |  | 
 | %if %1 | 
 |     movdqu      xmm1, [rdi]                 ;current output words | 
 |     pavgw       xmm0, xmm1                  ;rounded average with them | 
 | %endif | 
 |     movdqu      [rdi], xmm0                 ;write eight pixels | 
 |  | 
 |     lea         rdi, [rdi + 2*rdx]          ;output += out_pitch words | 
 |     lea         rsi, [rsi + 2*rax]          ;source += pixels_per_line words | 
 |     dec         rcx                         ;row counter; ZF consumed by caller | 
 | %endm | 
 |  | 
 | ; HIGH_APPLY_FILTER_16 avg | 
 | ; Filters sixteen 16-bit pixels and stores them at [rdi] .. [rdi + 31]. | 
 | ;   in : xmm0, xmm1 = the two input vectors for pixels 0-7 | 
 | ;        xmm2, xmm3 = the two input vectors for pixels 8-15 | 
 | ;        xmm7 = k3,k4 pairs, xmm4 = rounding, max/min = clamp bounds | 
 | ;   %1 : when non-zero, the result is averaged (pavgw) with the words | 
 | ;        already stored at the destination before being written back | 
 | ; xmm5 and xmm6 are scratch. rsi/rdi advance by rax/rdx words, rcx is | 
 | ; decremented and the flags from that dec are left for the caller. | 
 | %macro HIGH_APPLY_FILTER_16 1 | 
 |     ;---- pixels 0-7 ---- | 
 |     movdqa      xmm5, xmm0                  ;copy for pixels 4-7 | 
 |     punpcklwd   xmm0, xmm1                  ;interleave inputs, pixels 0-3 | 
 |     punpckhwd   xmm5, xmm1                  ;interleave inputs, pixels 4-7 | 
 |     pmaddwd     xmm0, xmm7                  ;a*k3 + b*k4 | 
 |     pmaddwd     xmm5, xmm7 | 
 |     paddd       xmm0, xmm4                  ;add rounding term | 
 |     paddd       xmm5, xmm4 | 
 |     psrad       xmm0, 7                     ;arithmetic shift right by 7 | 
 |     psrad       xmm5, 7 | 
 |     packssdw    xmm0, xmm5                  ;eight words, signed saturation | 
 |     pminsw      xmm0, max                   ;limit to [min, max] | 
 |     pmaxsw      xmm0, min | 
 |  | 
 |     ;---- pixels 8-15 ---- | 
 |     movdqa      xmm6, xmm2                  ;copy for pixels 12-15 | 
 |     punpcklwd   xmm2, xmm3                  ;interleave inputs, pixels 8-11 | 
 |     punpckhwd   xmm6, xmm3                  ;interleave inputs, pixels 12-15 | 
 |     pmaddwd     xmm2, xmm7 | 
 |     pmaddwd     xmm6, xmm7 | 
 |     paddd       xmm2, xmm4 | 
 |     paddd       xmm6, xmm4 | 
 |     psrad       xmm2, 7 | 
 |     psrad       xmm6, 7 | 
 |     packssdw    xmm2, xmm6 | 
 |     pminsw      xmm2, max | 
 |     pmaxsw      xmm2, min | 
 |  | 
 | %if %1 | 
 |     movdqu      xmm1, [rdi]                 ;current output, pixels 0-7 | 
 |     movdqu      xmm3, [rdi + 16]            ;current output, pixels 8-15 | 
 |     pavgw       xmm0, xmm1                  ;rounded average with them | 
 |     pavgw       xmm2, xmm3 | 
 | %endif | 
 |     movdqu      [rdi], xmm0                 ;write pixels 0-7 | 
 |     movdqu      [rdi + 16], xmm2            ;write pixels 8-15 | 
 |  | 
 |     lea         rdi, [rdi + 2*rdx]          ;output += out_pitch words | 
 |     lea         rsi, [rsi + 2*rax]          ;source += pixels_per_line words | 
 |     dec         rcx                         ;row counter; ZF consumed by caller | 
 | %endm | 
 |  | 
 | SECTION .text | 
 |  | 
 | ; aom_highbd_filter_block1d4_v2_sse2 | 
 | ; Two-tap filter between each source line and the line pixels_per_line | 
 | ; words after it, four 16-bit pixels per row. | 
 | ;   arg(0) src_ptr         arg(1) pixels_per_line   arg(2) output_ptr | 
 | ;   arg(3) out_pitch       arg(4) output_height     arg(5) filter ptr | 
 | ;   arg(6) bps | 
 | ; Per pixel: ((a*k3 + b*k4 + 64) >> 7), limited to [0, (1 << bps) - 1]. | 
 | ; The loop body always runs at least once (count is tested after the row). | 
 | globalsym(aom_highbd_filter_block1d4_v2_sse2) | 
 | sym(aom_highbd_filter_block1d4_v2_sse2): | 
 |     push        rbp | 
 |     mov         rbp, rsp | 
 |     SHADOW_ARGS_TO_STACK 7 | 
 |     push        rsi | 
 |     push        rdi | 
 |     ; prolog complete | 
 |  | 
 |     HIGH_GET_PARAM_4 | 
 | .next_row: | 
 |     movq        xmm1, [rsi + 2*rax]         ;four pixels, one line further on | 
 |     movq        xmm0, [rsi]                 ;four pixels of the current line | 
 |  | 
 |     HIGH_APPLY_FILTER_4 0 | 
 |     jnz         .next_row                   ;until the row counter hits zero | 
 |  | 
 |     ; epilog | 
 |     pop         rdi | 
 |     pop         rsi | 
 |     UNSHADOW_ARGS | 
 |     pop         rbp | 
 |     ret | 
 |  | 
 | ; aom_highbd_filter_block1d8_v2_sse2 | 
 | ; Two-tap filter between each source line and the line pixels_per_line | 
 | ; words after it, eight 16-bit pixels per row. | 
 | ;   arg(0) src_ptr         arg(1) pixels_per_line   arg(2) output_ptr | 
 | ;   arg(3) out_pitch       arg(4) output_height     arg(5) filter ptr | 
 | ;   arg(6) bps | 
 | ; Per pixel: ((a*k3 + b*k4 + 64) >> 7), limited to [0, (1 << bps) - 1]. | 
 | ; The loop body always runs at least once (count is tested after the row). | 
 | globalsym(aom_highbd_filter_block1d8_v2_sse2) | 
 | sym(aom_highbd_filter_block1d8_v2_sse2): | 
 |     push        rbp | 
 |     mov         rbp, rsp | 
 |     SHADOW_ARGS_TO_STACK 7 | 
 |     SAVE_XMM 8 | 
 |     push        rsi | 
 |     push        rdi | 
 |     ; prolog complete | 
 |  | 
 |     ; Reserve two 16-byte stack slots for the clamp bounds. ALIGN_STACK is | 
 |     ; defined elsewhere; presumably it aligns rsp to 16 bytes and pushes the | 
 |     ; previous rsp, which `pop rsp` below reloads - confirm in the include. | 
 |     ALIGN_STACK 16, rax | 
 |     sub         rsp, 16 * 2 | 
 |     %define max [rsp + 16 * 0] | 
 |     %define min [rsp + 16 * 1] | 
 |  | 
 |     HIGH_GET_PARAM | 
 | .next_row: | 
 |     movdqu      xmm1, [rsi + 2*rax]         ;eight pixels, one line further on | 
 |     movdqu      xmm0, [rsi]                 ;eight pixels of the current line | 
 |  | 
 |     HIGH_APPLY_FILTER_8 0 | 
 |     jnz         .next_row                   ;until the row counter hits zero | 
 |  | 
 |     ; release the two slots, then take back the saved stack pointer | 
 |     add         rsp, 16 * 2 | 
 |     pop         rsp | 
 |  | 
 |     ; epilog | 
 |     pop         rdi | 
 |     pop         rsi | 
 |     RESTORE_XMM | 
 |     UNSHADOW_ARGS | 
 |     pop         rbp | 
 |     ret | 
 |  | 
 | ; aom_highbd_filter_block1d16_v2_sse2 | 
 | ; Two-tap filter between each source line and the line pixels_per_line | 
 | ; words after it, sixteen 16-bit pixels per row. | 
 | ;   arg(0) src_ptr         arg(1) pixels_per_line   arg(2) output_ptr | 
 | ;   arg(3) out_pitch       arg(4) output_height     arg(5) filter ptr | 
 | ;   arg(6) bps | 
 | ; Per pixel: ((a*k3 + b*k4 + 64) >> 7), limited to [0, (1 << bps) - 1]. | 
 | ; The loop body always runs at least once (count is tested after the row). | 
 | globalsym(aom_highbd_filter_block1d16_v2_sse2) | 
 | sym(aom_highbd_filter_block1d16_v2_sse2): | 
 |     push        rbp | 
 |     mov         rbp, rsp | 
 |     SHADOW_ARGS_TO_STACK 7 | 
 |     SAVE_XMM 9 | 
 |     push        rsi | 
 |     push        rdi | 
 |     ; prolog complete | 
 |  | 
 |     ; Reserve two 16-byte stack slots for the clamp bounds. ALIGN_STACK is | 
 |     ; defined elsewhere; presumably it aligns rsp to 16 bytes and pushes the | 
 |     ; previous rsp, which `pop rsp` below reloads - confirm in the include. | 
 |     ALIGN_STACK 16, rax | 
 |     sub         rsp, 16 * 2 | 
 |     %define max [rsp + 16 * 0] | 
 |     %define min [rsp + 16 * 1] | 
 |  | 
 |     HIGH_GET_PARAM | 
 | .next_row: | 
 |     movdqu      xmm1, [rsi + 2*rax]         ;next line, pixels 0-7 | 
 |     movdqu      xmm3, [rsi + 2*rax + 16]    ;next line, pixels 8-15 | 
 |     movdqu      xmm0, [rsi]                 ;current line, pixels 0-7 | 
 |     movdqu      xmm2, [rsi + 16]            ;current line, pixels 8-15 | 
 |  | 
 |     HIGH_APPLY_FILTER_16 0 | 
 |     jnz         .next_row                   ;until the row counter hits zero | 
 |  | 
 |     ; release the two slots, then take back the saved stack pointer | 
 |     add         rsp, 16 * 2 | 
 |     pop         rsp | 
 |  | 
 |     ; epilog | 
 |     pop         rdi | 
 |     pop         rsi | 
 |     RESTORE_XMM | 
 |     UNSHADOW_ARGS | 
 |     pop         rbp | 
 |     ret | 
 |  | 
 | ; aom_highbd_filter_block1d4_h2_sse2 | 
 | ; Two-tap filter between each pixel and its right-hand neighbour, four | 
 | ; 16-bit pixels per row. | 
 | ;   arg(0) src_ptr         arg(1) pixels_per_line   arg(2) output_ptr | 
 | ;   arg(3) out_pitch       arg(4) output_height     arg(5) filter ptr | 
 | ;   arg(6) bps | 
 | ; Per pixel: ((a*k3 + b*k4 + 64) >> 7), limited to [0, (1 << bps) - 1]. | 
 | ; Note that a full 16 bytes are read from the source on every row. | 
 | ; The loop body always runs at least once (count is tested after the row). | 
 | globalsym(aom_highbd_filter_block1d4_h2_sse2) | 
 | sym(aom_highbd_filter_block1d4_h2_sse2): | 
 |     push        rbp | 
 |     mov         rbp, rsp | 
 |     SHADOW_ARGS_TO_STACK 7 | 
 |     push        rsi | 
 |     push        rdi | 
 |     ; prolog complete | 
 |  | 
 |     HIGH_GET_PARAM_4 | 
 | .next_row: | 
 |     movdqu      xmm0, [rsi]                 ;pixels x .. x+7 | 
 |     movdqa      xmm1, xmm0 | 
 |     psrldq      xmm1, 2                     ;same data moved down one pixel | 
 |  | 
 |     HIGH_APPLY_FILTER_4 0 | 
 |     jnz         .next_row                   ;until the row counter hits zero | 
 |  | 
 |     ; epilog | 
 |     pop         rdi | 
 |     pop         rsi | 
 |     UNSHADOW_ARGS | 
 |     pop         rbp | 
 |     ret | 
 |  | 
 | ; aom_highbd_filter_block1d8_h2_sse2 | 
 | ; Two-tap filter between each pixel and its right-hand neighbour, eight | 
 | ; 16-bit pixels per row. | 
 | ;   arg(0) src_ptr         arg(1) pixels_per_line   arg(2) output_ptr | 
 | ;   arg(3) out_pitch       arg(4) output_height     arg(5) filter ptr | 
 | ;   arg(6) bps | 
 | ; Per pixel: ((a*k3 + b*k4 + 64) >> 7), limited to [0, (1 << bps) - 1]. | 
 | ; The loop body always runs at least once (count is tested after the row). | 
 | globalsym(aom_highbd_filter_block1d8_h2_sse2) | 
 | sym(aom_highbd_filter_block1d8_h2_sse2): | 
 |     push        rbp | 
 |     mov         rbp, rsp | 
 |     SHADOW_ARGS_TO_STACK 7 | 
 |     SAVE_XMM 8 | 
 |     push        rsi | 
 |     push        rdi | 
 |     ; prolog complete | 
 |  | 
 |     ; Reserve two 16-byte stack slots for the clamp bounds. ALIGN_STACK is | 
 |     ; defined elsewhere; presumably it aligns rsp to 16 bytes and pushes the | 
 |     ; previous rsp, which `pop rsp` below reloads - confirm in the include. | 
 |     ALIGN_STACK 16, rax | 
 |     sub         rsp, 16 * 2 | 
 |     %define max [rsp + 16 * 0] | 
 |     %define min [rsp + 16 * 1] | 
 |  | 
 |     HIGH_GET_PARAM | 
 | .next_row: | 
 |     movdqu      xmm1, [rsi + 2]             ;pixels x+1 .. x+8 | 
 |     movdqu      xmm0, [rsi]                 ;pixels x .. x+7 | 
 |  | 
 |     HIGH_APPLY_FILTER_8 0 | 
 |     jnz         .next_row                   ;until the row counter hits zero | 
 |  | 
 |     ; release the two slots, then take back the saved stack pointer | 
 |     add         rsp, 16 * 2 | 
 |     pop         rsp | 
 |  | 
 |     ; epilog | 
 |     pop         rdi | 
 |     pop         rsi | 
 |     RESTORE_XMM | 
 |     UNSHADOW_ARGS | 
 |     pop         rbp | 
 |     ret | 
 |  | 
 | ; aom_highbd_filter_block1d16_h2_sse2 | 
 | ; Two-tap filter between each pixel and its right-hand neighbour, sixteen | 
 | ; 16-bit pixels per row. | 
 | ;   arg(0) src_ptr         arg(1) pixels_per_line   arg(2) output_ptr | 
 | ;   arg(3) out_pitch       arg(4) output_height     arg(5) filter ptr | 
 | ;   arg(6) bps | 
 | ; Per pixel: ((a*k3 + b*k4 + 64) >> 7), limited to [0, (1 << bps) - 1]. | 
 | ; The loop body always runs at least once (count is tested after the row). | 
 | globalsym(aom_highbd_filter_block1d16_h2_sse2) | 
 | sym(aom_highbd_filter_block1d16_h2_sse2): | 
 |     push        rbp | 
 |     mov         rbp, rsp | 
 |     SHADOW_ARGS_TO_STACK 7 | 
 |     SAVE_XMM 9 | 
 |     push        rsi | 
 |     push        rdi | 
 |     ; prolog complete | 
 |  | 
 |     ; Reserve two 16-byte stack slots for the clamp bounds. ALIGN_STACK is | 
 |     ; defined elsewhere; presumably it aligns rsp to 16 bytes and pushes the | 
 |     ; previous rsp, which `pop rsp` below reloads - confirm in the include. | 
 |     ALIGN_STACK 16, rax | 
 |     sub         rsp, 16 * 2 | 
 |     %define max [rsp + 16 * 0] | 
 |     %define min [rsp + 16 * 1] | 
 |  | 
 |     HIGH_GET_PARAM | 
 | .next_row: | 
 |     movdqu      xmm0, [rsi]                 ;pixels x .. x+7 | 
 |     movdqu      xmm2, [rsi + 16]            ;pixels x+8 .. x+15 | 
 |     movdqu      xmm1, [rsi + 2]             ;pixels x+1 .. x+8 | 
 |     movdqu      xmm3, [rsi + 18]            ;pixels x+9 .. x+16 | 
 |  | 
 |     HIGH_APPLY_FILTER_16 0 | 
 |     jnz         .next_row                   ;until the row counter hits zero | 
 |  | 
 |     ; release the two slots, then take back the saved stack pointer | 
 |     add         rsp, 16 * 2 | 
 |     pop         rsp | 
 |  | 
 |     ; epilog | 
 |     pop         rdi | 
 |     pop         rsi | 
 |     RESTORE_XMM | 
 |     UNSHADOW_ARGS | 
 |     pop         rbp | 
 |     ret | 