| ; |
| ; Copyright (c) 2016, Alliance for Open Media. All rights reserved |
| ; |
| ; This source code is subject to the terms of the BSD 2 Clause License and |
| ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
| ; was not distributed with this source code in the LICENSE file, you can |
| ; obtain it at www.aomedia.org/license/software. If the Alliance for Open |
| ; Media Patent License 1.0 was not distributed with this source code in the |
| ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
| ; |
| |
| ; |
| |
| |
| %include "aom_ports/x86_abi_support.asm" |
| |
;-----------------------------------------------------------------------
; PROCESS_16X2X3 first
;
; Processes two 16-byte rows of a "SAD x3" computation: each source row
; is compared against the reference row at byte offsets +0, +1 and +2.
;
;   %1   non-zero: the first row initialises the accumulators
;        zero:     the first row is added to the existing accumulators
;
; In:    rsi = source row,    rax = source stride in bytes
;        rdi = reference row, rdx = reference stride in bytes
; Out:   xmm5 / xmm6 / xmm7 accumulate the SAD against ref+0 / +1 / +2
;        rsi and rdi are advanced by two rows
; Clobb: xmm0-xmm3
;
; Source rows are read with movdqa, so [rsi] and [rsi+rax] must be
; 16-byte aligned. Reference rows are read with lddqu and may be
; unaligned.
;-----------------------------------------------------------------------
| %macro PROCESS_16X2X3 1 |
| %if %1 |
; First row, first invocation: load straight into the accumulators.
| movdqa xmm0, XMMWORD PTR [rsi] |
| lddqu xmm5, XMMWORD PTR [rdi] |
| lddqu xmm6, XMMWORD PTR [rdi+1] |
| lddqu xmm7, XMMWORD PTR [rdi+2] |
| |
; psadbw leaves one 16-bit partial sum per 8-byte half, in the low word
; of each qword; the two halves are only combined when results are stored.
| psadbw xmm5, xmm0 |
| psadbw xmm6, xmm0 |
| psadbw xmm7, xmm0 |
| %else |
; First row, later invocations: compute into temporaries, then accumulate.
| movdqa xmm0, XMMWORD PTR [rsi] |
| lddqu xmm1, XMMWORD PTR [rdi] |
| lddqu xmm2, XMMWORD PTR [rdi+1] |
| lddqu xmm3, XMMWORD PTR [rdi+2] |
| |
| psadbw xmm1, xmm0 |
| psadbw xmm2, xmm0 |
| psadbw xmm3, xmm0 |
| |
| paddw xmm5, xmm1 |
| paddw xmm6, xmm2 |
| paddw xmm7, xmm3 |
| %endif |
; Second row (one stride further down); always accumulated.
| movdqa xmm0, XMMWORD PTR [rsi+rax] |
| lddqu xmm1, XMMWORD PTR [rdi+rdx] |
| lddqu xmm2, XMMWORD PTR [rdi+rdx+1] |
| lddqu xmm3, XMMWORD PTR [rdi+rdx+2] |
| |
; Step both pointers two rows down; the loads above are already issued,
; so the psadbw/paddw below work on register contents only.
| lea rsi, [rsi+rax*2] |
| lea rdi, [rdi+rdx*2] |
| |
| psadbw xmm1, xmm0 |
| psadbw xmm2, xmm0 |
| psadbw xmm3, xmm0 |
| |
| paddw xmm5, xmm1 |
| paddw xmm6, xmm2 |
| paddw xmm7, xmm3 |
| %endmacro |
| |
;-----------------------------------------------------------------------
; PROCESS_16X2X3_OFFSET first, offset
;
; Same job as PROCESS_16X2X3 (two rows, SAD against ref+0/+1/+2), but the
; reference bytes are fetched with two aligned 16-byte loads and the three
; candidate rows are extracted with palignr instead of unaligned loads.
;
;   %1   non-zero: the first row initialises the accumulators
;        zero:     the first row is added to the existing accumulators
;   %2   byte offset of the wanted reference data inside the 32-byte
;        window [rdi, rdi+32); used as the palignr shift count. The
;        callers in this file pass 0..14, so %2+2 never exceeds 16.
;
; In:    rsi = source row,    rax = source stride in bytes
;        rdi = reference row rounded DOWN to a 16-byte boundary
;              (the *_OFFSET dispatch macros do "sub rdi, offset" first)
;        rdx = reference stride in bytes
; Out:   xmm5 / xmm6 / xmm7 accumulate the SAD against ref+0 / +1 / +2
;        rsi and rdi are advanced by two rows
; Clobb: xmm0-xmm4
;
; NOTE(review): every load here is movdqa, which faults on unaligned
; addresses. That requires [rsi], [rsi+rax], [rdi] and [rdi+rdx] to be
; 16-byte aligned, i.e. presumably both strides are multiples of 16 --
; confirm against the callers.
;-----------------------------------------------------------------------
| %macro PROCESS_16X2X3_OFFSET 2 |
| %if %1 |
; First row, first invocation: build the three candidates directly in
; the accumulators. xmm4 = low chunk, xmm7 = high chunk of the window.
| movdqa xmm0, XMMWORD PTR [rsi] |
| movdqa xmm4, XMMWORD PTR [rdi] |
| movdqa xmm7, XMMWORD PTR [rdi+16] |
| |
; palignr dst, src, n: treat dst:src as a 32-byte value (dst on top),
; shift right by n bytes and keep the low 16 -> window bytes n..n+15.
| movdqa xmm5, xmm7 |
| palignr xmm5, xmm4, %2 |
| |
| movdqa xmm6, xmm7 |
| palignr xmm6, xmm4, (%2+1) |
| |
; xmm7 itself is the last use of the high chunk, so no copy is needed.
| palignr xmm7, xmm4, (%2+2) |
| |
| psadbw xmm5, xmm0 |
| psadbw xmm6, xmm0 |
| psadbw xmm7, xmm0 |
| %else |
; First row, later invocations: same extraction into xmm1-xmm3, then
; add to the running accumulators.
| movdqa xmm0, XMMWORD PTR [rsi] |
| movdqa xmm4, XMMWORD PTR [rdi] |
| movdqa xmm3, XMMWORD PTR [rdi+16] |
| |
| movdqa xmm1, xmm3 |
| palignr xmm1, xmm4, %2 |
| |
| movdqa xmm2, xmm3 |
| palignr xmm2, xmm4, (%2+1) |
| |
| palignr xmm3, xmm4, (%2+2) |
| |
| psadbw xmm1, xmm0 |
| psadbw xmm2, xmm0 |
| psadbw xmm3, xmm0 |
| |
| paddw xmm5, xmm1 |
| paddw xmm6, xmm2 |
| paddw xmm7, xmm3 |
| %endif |
; Second row (one stride further down); always accumulated.
| movdqa xmm0, XMMWORD PTR [rsi+rax] |
| movdqa xmm4, XMMWORD PTR [rdi+rdx] |
| movdqa xmm3, XMMWORD PTR [rdi+rdx+16] |
| |
| movdqa xmm1, xmm3 |
| palignr xmm1, xmm4, %2 |
| |
| movdqa xmm2, xmm3 |
| palignr xmm2, xmm4, (%2+1) |
| |
| palignr xmm3, xmm4, (%2+2) |
| |
; Step both pointers two rows down; all loads for this pair are done.
| lea rsi, [rsi+rax*2] |
| lea rdi, [rdi+rdx*2] |
| |
| psadbw xmm1, xmm0 |
| psadbw xmm2, xmm0 |
| psadbw xmm3, xmm0 |
| |
| paddw xmm5, xmm1 |
| paddw xmm6, xmm2 |
| paddw xmm7, xmm3 |
| %endmacro |
| |
;-----------------------------------------------------------------------
; PROCESS_16X16X3_OFFSET offset, label_prefix
;
; Emits the jump-table target "<label_prefix>_aligned_by_<offset>" for a
; 16-row block: rdi is moved back by <offset> bytes, eight two-row steps
; (PROCESS_16X2X3_OFFSET) are run, and control continues at
; "<label_prefix>_store_off".
;
;   %1   byte offset subtracted from rdi and forwarded as palignr shift
;   %2   label prefix of the enclosing function
;-----------------------------------------------------------------------
| %macro PROCESS_16X16X3_OFFSET 2 |
| %2_aligned_by_%1: |
| sub rdi, %1 |
| |
; Rows 0-1 seed the accumulators; the remaining 7 row pairs add to them.
| PROCESS_16X2X3_OFFSET 1, %1 |
| %rep 7 |
| PROCESS_16X2X3_OFFSET 0, %1 |
| %endrep |
| |
| jmp %2_store_off |
| %endmacro |
| |
;-----------------------------------------------------------------------
; PROCESS_16X8X3_OFFSET offset, label_prefix
;
; Emits the jump-table target "<label_prefix>_aligned_by_<offset>" for an
; 8-row block: rdi is moved back by <offset> bytes, four two-row steps
; (PROCESS_16X2X3_OFFSET) are run, and control continues at
; "<label_prefix>_store_off".
;
;   %1   byte offset subtracted from rdi and forwarded as palignr shift
;   %2   label prefix of the enclosing function
;-----------------------------------------------------------------------
| %macro PROCESS_16X8X3_OFFSET 2 |
| %2_aligned_by_%1: |
| sub rdi, %1 |
| |
; Rows 0-1 seed the accumulators; the remaining 3 row pairs add to them.
| PROCESS_16X2X3_OFFSET 1, %1 |
| %rep 3 |
| PROCESS_16X2X3_OFFSET 0, %1 |
| %endrep |
| |
| jmp %2_store_off |
| %endmacro |
| |
;-----------------------------------------------------------------------
; aom_sad16x16x3_ssse3
;
; Computes three sums of absolute differences between a 16x16 source
; block and the 16x16 reference blocks starting at ref_ptr, ref_ptr+1 and
; ref_ptr+2, and writes them as 32-bit values to results[0..2].
; No return value is set up; output is delivered through results only.
;
; The low four bits of ref_ptr select one of 16 code paths through a jump
; table: paths 0..14 use aligned loads plus palignr, path 15 uses
; unaligned loads.
;
; Arguments are fetched through arg(n); sym(), PRIVATE, arg() and the
; prolog/epilog macros are not defined in this file (presumably they come
; from x86_abi_support.asm -- confirm there for the exact ABI handling).
;
; Source rows are read with movdqa in every path, so src_ptr and
; src_ptr+src_stride must be 16-byte aligned.
;-----------------------------------------------------------------------
| ;void aom_sad16x16x3_ssse3( |
| ; unsigned char *src_ptr, |
| ; int src_stride, |
| ; unsigned char *ref_ptr, |
| ; int ref_stride, |
| ; int *results) |
| global sym(aom_sad16x16x3_ssse3) PRIVATE |
| sym(aom_sad16x16x3_ssse3): |
| push rbp |
| mov rbp, rsp |
| SHADOW_ARGS_TO_STACK 5 |
| SAVE_XMM 7 |
| push rsi |
| push rdi |
| push rcx |
| ; end prolog |
| |
| mov rsi, arg(0) ;src_ptr |
| mov rdi, arg(2) ;ref_ptr |
| |
; rdx = ref_ptr & 15: misalignment of the reference pointer, used below
; as the jump-table index.
| mov rdx, 0xf |
| and rdx, rdi |
| |
; The table is stored in the instruction stream, so execution hops over it.
; Each entry is the 32-bit distance from do_jump to one aligned_by_N path.
| jmp .aom_sad16x16x3_ssse3_skiptable |
| .aom_sad16x16x3_ssse3_jumptable: |
| dd .aom_sad16x16x3_ssse3_aligned_by_0 - .aom_sad16x16x3_ssse3_do_jump |
| dd .aom_sad16x16x3_ssse3_aligned_by_1 - .aom_sad16x16x3_ssse3_do_jump |
| dd .aom_sad16x16x3_ssse3_aligned_by_2 - .aom_sad16x16x3_ssse3_do_jump |
| dd .aom_sad16x16x3_ssse3_aligned_by_3 - .aom_sad16x16x3_ssse3_do_jump |
| dd .aom_sad16x16x3_ssse3_aligned_by_4 - .aom_sad16x16x3_ssse3_do_jump |
| dd .aom_sad16x16x3_ssse3_aligned_by_5 - .aom_sad16x16x3_ssse3_do_jump |
| dd .aom_sad16x16x3_ssse3_aligned_by_6 - .aom_sad16x16x3_ssse3_do_jump |
| dd .aom_sad16x16x3_ssse3_aligned_by_7 - .aom_sad16x16x3_ssse3_do_jump |
| dd .aom_sad16x16x3_ssse3_aligned_by_8 - .aom_sad16x16x3_ssse3_do_jump |
| dd .aom_sad16x16x3_ssse3_aligned_by_9 - .aom_sad16x16x3_ssse3_do_jump |
| dd .aom_sad16x16x3_ssse3_aligned_by_10 - .aom_sad16x16x3_ssse3_do_jump |
| dd .aom_sad16x16x3_ssse3_aligned_by_11 - .aom_sad16x16x3_ssse3_do_jump |
| dd .aom_sad16x16x3_ssse3_aligned_by_12 - .aom_sad16x16x3_ssse3_do_jump |
| dd .aom_sad16x16x3_ssse3_aligned_by_13 - .aom_sad16x16x3_ssse3_do_jump |
| dd .aom_sad16x16x3_ssse3_aligned_by_14 - .aom_sad16x16x3_ssse3_do_jump |
| dd .aom_sad16x16x3_ssse3_aligned_by_15 - .aom_sad16x16x3_ssse3_do_jump |
| .aom_sad16x16x3_ssse3_skiptable: |
| |
; call pushes the address of the following instruction (do_jump); popping
; it yields that address at run time, so the table can hold relative
; offsets only (presumably to keep the code position-independent).
| call .aom_sad16x16x3_ssse3_do_jump |
| .aom_sad16x16x3_ssse3_do_jump: |
| pop rcx ; get the address of do_jump |
| mov rax, .aom_sad16x16x3_ssse3_jumptable - .aom_sad16x16x3_ssse3_do_jump |
| add rax, rcx ; get the absolute address of aom_sad16x16x3_ssse3_jumptable |
| |
| movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable |
| add rcx, rax ; rcx = absolute address of the selected aligned_by_N path |
| |
; The table index in rdx is no longer needed; from here on rax/rdx carry
; the sign-extended strides expected by the PROCESS_* macros.
| movsxd rax, dword ptr arg(1) ;src_stride |
| movsxd rdx, dword ptr arg(3) ;ref_stride |
| |
| jmp rcx |
| |
; Paths for reference misalignment 0..14. Each expansion defines its own
; aligned_by_N label and ends with a jump to store_off.
| PROCESS_16X16X3_OFFSET 0, .aom_sad16x16x3_ssse3 |
| PROCESS_16X16X3_OFFSET 1, .aom_sad16x16x3_ssse3 |
| PROCESS_16X16X3_OFFSET 2, .aom_sad16x16x3_ssse3 |
| PROCESS_16X16X3_OFFSET 3, .aom_sad16x16x3_ssse3 |
| PROCESS_16X16X3_OFFSET 4, .aom_sad16x16x3_ssse3 |
| PROCESS_16X16X3_OFFSET 5, .aom_sad16x16x3_ssse3 |
| PROCESS_16X16X3_OFFSET 6, .aom_sad16x16x3_ssse3 |
| PROCESS_16X16X3_OFFSET 7, .aom_sad16x16x3_ssse3 |
| PROCESS_16X16X3_OFFSET 8, .aom_sad16x16x3_ssse3 |
| PROCESS_16X16X3_OFFSET 9, .aom_sad16x16x3_ssse3 |
| PROCESS_16X16X3_OFFSET 10, .aom_sad16x16x3_ssse3 |
| PROCESS_16X16X3_OFFSET 11, .aom_sad16x16x3_ssse3 |
| PROCESS_16X16X3_OFFSET 12, .aom_sad16x16x3_ssse3 |
| PROCESS_16X16X3_OFFSET 13, .aom_sad16x16x3_ssse3 |
| PROCESS_16X16X3_OFFSET 14, .aom_sad16x16x3_ssse3 |
| |
; Misalignment 15: the ref+2 candidate would need a byte from a third
; 16-byte chunk (15+2 > 16), which the two-chunk palignr window cannot
; supply, so this path uses unaligned loads. 8 steps x 2 rows = 16 rows.
; It falls through into store_off.
| .aom_sad16x16x3_ssse3_aligned_by_15: |
| PROCESS_16X2X3 1 |
| PROCESS_16X2X3 0 |
| PROCESS_16X2X3 0 |
| PROCESS_16X2X3 0 |
| PROCESS_16X2X3 0 |
| PROCESS_16X2X3 0 |
| PROCESS_16X2X3 0 |
| PROCESS_16X2X3 0 |
| |
; Each accumulator holds two partial sums (low word of each qword). Fold
; the high qword onto the low one and store the low 32 bits. The 16-bit
; add cannot wrap: the total is at most 16*16*255 = 65280.
| .aom_sad16x16x3_ssse3_store_off: |
| mov rdi, arg(4) ;Results |
| |
| movq xmm0, xmm5 |
| psrldq xmm5, 8 |
| |
| paddw xmm0, xmm5 |
| movd [rdi], xmm0 |
| ;- results[1]: SAD against ref_ptr + 1 |
| movq xmm0, xmm6 |
| psrldq xmm6, 8 |
| |
| paddw xmm0, xmm6 |
| movd [rdi+4], xmm0 |
| ;- results[2]: SAD against ref_ptr + 2 |
| movq xmm0, xmm7 |
| psrldq xmm7, 8 |
| |
| paddw xmm0, xmm7 |
| movd [rdi+8], xmm0 |
| |
| ; begin epilog |
| pop rcx |
| pop rdi |
| pop rsi |
| RESTORE_XMM |
| UNSHADOW_ARGS |
| pop rbp |
| ret |
| |
;-----------------------------------------------------------------------
; aom_sad16x8x3_ssse3
;
; Computes three sums of absolute differences between a 16-wide, 8-row
; source block and the reference blocks starting at ref_ptr, ref_ptr+1
; and ref_ptr+2, and writes them as 32-bit values to results[0..2].
; No return value is set up; output is delivered through results only.
;
; The low four bits of ref_ptr select one of 16 code paths through a jump
; table: paths 0..14 use aligned loads plus palignr, path 15 uses
; unaligned loads.
;
; Arguments are fetched through arg(n); sym(), PRIVATE, arg() and the
; prolog/epilog macros are not defined in this file (presumably they come
; from x86_abi_support.asm -- confirm there for the exact ABI handling).
;
; Source rows are read with movdqa in every path, so src_ptr and
; src_ptr+src_stride must be 16-byte aligned.
;-----------------------------------------------------------------------
| ;void aom_sad16x8x3_ssse3( |
| ; unsigned char *src_ptr, |
| ; int src_stride, |
| ; unsigned char *ref_ptr, |
| ; int ref_stride, |
| ; int *results) |
| global sym(aom_sad16x8x3_ssse3) PRIVATE |
| sym(aom_sad16x8x3_ssse3): |
| push rbp |
| mov rbp, rsp |
| SHADOW_ARGS_TO_STACK 5 |
| SAVE_XMM 7 |
| push rsi |
| push rdi |
| push rcx |
| ; end prolog |
| |
| mov rsi, arg(0) ;src_ptr |
| mov rdi, arg(2) ;ref_ptr |
| |
; rdx = ref_ptr & 15: misalignment of the reference pointer, used below
; as the jump-table index.
| mov rdx, 0xf |
| and rdx, rdi |
| |
; The table is stored in the instruction stream, so execution hops over it.
; Each entry is the 32-bit distance from do_jump to one aligned_by_N path.
| jmp .aom_sad16x8x3_ssse3_skiptable |
| .aom_sad16x8x3_ssse3_jumptable: |
| dd .aom_sad16x8x3_ssse3_aligned_by_0 - .aom_sad16x8x3_ssse3_do_jump |
| dd .aom_sad16x8x3_ssse3_aligned_by_1 - .aom_sad16x8x3_ssse3_do_jump |
| dd .aom_sad16x8x3_ssse3_aligned_by_2 - .aom_sad16x8x3_ssse3_do_jump |
| dd .aom_sad16x8x3_ssse3_aligned_by_3 - .aom_sad16x8x3_ssse3_do_jump |
| dd .aom_sad16x8x3_ssse3_aligned_by_4 - .aom_sad16x8x3_ssse3_do_jump |
| dd .aom_sad16x8x3_ssse3_aligned_by_5 - .aom_sad16x8x3_ssse3_do_jump |
| dd .aom_sad16x8x3_ssse3_aligned_by_6 - .aom_sad16x8x3_ssse3_do_jump |
| dd .aom_sad16x8x3_ssse3_aligned_by_7 - .aom_sad16x8x3_ssse3_do_jump |
| dd .aom_sad16x8x3_ssse3_aligned_by_8 - .aom_sad16x8x3_ssse3_do_jump |
| dd .aom_sad16x8x3_ssse3_aligned_by_9 - .aom_sad16x8x3_ssse3_do_jump |
| dd .aom_sad16x8x3_ssse3_aligned_by_10 - .aom_sad16x8x3_ssse3_do_jump |
| dd .aom_sad16x8x3_ssse3_aligned_by_11 - .aom_sad16x8x3_ssse3_do_jump |
| dd .aom_sad16x8x3_ssse3_aligned_by_12 - .aom_sad16x8x3_ssse3_do_jump |
| dd .aom_sad16x8x3_ssse3_aligned_by_13 - .aom_sad16x8x3_ssse3_do_jump |
| dd .aom_sad16x8x3_ssse3_aligned_by_14 - .aom_sad16x8x3_ssse3_do_jump |
| dd .aom_sad16x8x3_ssse3_aligned_by_15 - .aom_sad16x8x3_ssse3_do_jump |
| .aom_sad16x8x3_ssse3_skiptable: |
| |
; call pushes the address of the following instruction (do_jump); popping
; it yields that address at run time, so the table can hold relative
; offsets only (presumably to keep the code position-independent).
| call .aom_sad16x8x3_ssse3_do_jump |
| .aom_sad16x8x3_ssse3_do_jump: |
| pop rcx ; get the address of do_jump |
| mov rax, .aom_sad16x8x3_ssse3_jumptable - .aom_sad16x8x3_ssse3_do_jump |
| add rax, rcx ; get the absolute address of aom_sad16x8x3_ssse3_jumptable |
| |
| movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable |
| add rcx, rax ; rcx = absolute address of the selected aligned_by_N path |
| |
; The table index in rdx is no longer needed; from here on rax/rdx carry
; the sign-extended strides expected by the PROCESS_* macros.
| movsxd rax, dword ptr arg(1) ;src_stride |
| movsxd rdx, dword ptr arg(3) ;ref_stride |
| |
| jmp rcx |
| |
; Paths for reference misalignment 0..14. Each expansion defines its own
; aligned_by_N label and ends with a jump to store_off.
| PROCESS_16X8X3_OFFSET 0, .aom_sad16x8x3_ssse3 |
| PROCESS_16X8X3_OFFSET 1, .aom_sad16x8x3_ssse3 |
| PROCESS_16X8X3_OFFSET 2, .aom_sad16x8x3_ssse3 |
| PROCESS_16X8X3_OFFSET 3, .aom_sad16x8x3_ssse3 |
| PROCESS_16X8X3_OFFSET 4, .aom_sad16x8x3_ssse3 |
| PROCESS_16X8X3_OFFSET 5, .aom_sad16x8x3_ssse3 |
| PROCESS_16X8X3_OFFSET 6, .aom_sad16x8x3_ssse3 |
| PROCESS_16X8X3_OFFSET 7, .aom_sad16x8x3_ssse3 |
| PROCESS_16X8X3_OFFSET 8, .aom_sad16x8x3_ssse3 |
| PROCESS_16X8X3_OFFSET 9, .aom_sad16x8x3_ssse3 |
| PROCESS_16X8X3_OFFSET 10, .aom_sad16x8x3_ssse3 |
| PROCESS_16X8X3_OFFSET 11, .aom_sad16x8x3_ssse3 |
| PROCESS_16X8X3_OFFSET 12, .aom_sad16x8x3_ssse3 |
| PROCESS_16X8X3_OFFSET 13, .aom_sad16x8x3_ssse3 |
| PROCESS_16X8X3_OFFSET 14, .aom_sad16x8x3_ssse3 |
| |
; Misalignment 15: the ref+2 candidate would need a byte from a third
; 16-byte chunk (15+2 > 16), which the two-chunk palignr window cannot
; supply, so this path uses unaligned loads. 4 steps x 2 rows = 8 rows.
; It falls through into store_off.
| .aom_sad16x8x3_ssse3_aligned_by_15: |
| |
| PROCESS_16X2X3 1 |
| PROCESS_16X2X3 0 |
| PROCESS_16X2X3 0 |
| PROCESS_16X2X3 0 |
| |
; Each accumulator holds two partial sums (low word of each qword). Fold
; the high qword onto the low one and store the low 32 bits. The 16-bit
; add cannot wrap: the total is at most 16*8*255 = 32640.
| .aom_sad16x8x3_ssse3_store_off: |
| mov rdi, arg(4) ;Results |
| |
| movq xmm0, xmm5 |
| psrldq xmm5, 8 |
| |
| paddw xmm0, xmm5 |
| movd [rdi], xmm0 |
| ;- results[1]: SAD against ref_ptr + 1 |
| movq xmm0, xmm6 |
| psrldq xmm6, 8 |
| |
| paddw xmm0, xmm6 |
| movd [rdi+4], xmm0 |
| ;- results[2]: SAD against ref_ptr + 2 |
| movq xmm0, xmm7 |
| psrldq xmm7, 8 |
| |
| paddw xmm0, xmm7 |
| movd [rdi+8], xmm0 |
| |
| ; begin epilog |
| pop rcx |
| pop rdi |
| pop rsi |
| RESTORE_XMM |
| UNSHADOW_ARGS |
| pop rbp |
| ret |