| ; |
| ; Copyright (c) 2016, Alliance for Open Media. All rights reserved |
| ; |
| ; This source code is subject to the terms of the BSD 2 Clause License and |
| ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
| ; was not distributed with this source code in the LICENSE file, you can |
| ; obtain it at www.aomedia.org/license/software. If the Alliance for Open |
| ; Media Patent License 1.0 was not distributed with this source code in the |
| ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
| ; |
| |
| ; |
| |
| %include "aom_ports/x86_abi_support.asm" |
| |
| %macro STACK_FRAME_CREATE_X3 0 |
| %if ABI_IS_32BIT |
| %define src_ptr rsi |
| %define src_stride rax |
| %define ref_ptr rdi |
| %define ref_stride rdx |
| %define end_ptr rcx |
| %define ret_var rbx |
| %define result_ptr arg(4) |
| %define height dword ptr arg(4) |
| push rbp |
| mov rbp, rsp |
| push rsi |
| push rdi |
| push rbx |
| |
| mov rsi, arg(0) ; src_ptr |
| mov rdi, arg(2) ; ref_ptr |
| |
| movsxd rax, dword ptr arg(1) ; src_stride |
| movsxd rdx, dword ptr arg(3) ; ref_stride |
| %else |
| %if LIBAOM_YASM_WIN64 |
| SAVE_XMM 7, u |
| %define src_ptr rcx |
| %define src_stride rdx |
| %define ref_ptr r8 |
| %define ref_stride r9 |
| %define end_ptr r10 |
| %define ret_var r11 |
| %define result_ptr [rsp+xmm_stack_space+8+4*8] |
| %define height dword ptr [rsp+xmm_stack_space+8+4*8] |
| %else |
| %define src_ptr rdi |
| %define src_stride rsi |
| %define ref_ptr rdx |
| %define ref_stride rcx |
| %define end_ptr r9 |
| %define ret_var r10 |
| %define result_ptr r8 |
| %define height r8 |
| %endif |
| %endif |
| |
| %endmacro |
| |
| %macro STACK_FRAME_DESTROY_X3 0 |
| %define src_ptr |
| %define src_stride |
| %define ref_ptr |
| %define ref_stride |
| %define end_ptr |
| %define ret_var |
| %define result_ptr |
| %define height |
| |
| %if ABI_IS_32BIT |
| pop rbx |
| pop rdi |
| pop rsi |
| pop rbp |
| %else |
| %if LIBAOM_YASM_WIN64 |
| RESTORE_XMM |
| %endif |
| %endif |
| ret |
| %endmacro |
| |
| %macro PROCESS_16X2X3 5 |
| %if %1==0 |
| movdqa xmm0, XMMWORD PTR [%2] |
| lddqu xmm5, XMMWORD PTR [%3] |
| lddqu xmm6, XMMWORD PTR [%3+1] |
| lddqu xmm7, XMMWORD PTR [%3+2] |
| |
| psadbw xmm5, xmm0 |
| psadbw xmm6, xmm0 |
| psadbw xmm7, xmm0 |
| %else |
| movdqa xmm0, XMMWORD PTR [%2] |
| lddqu xmm1, XMMWORD PTR [%3] |
| lddqu xmm2, XMMWORD PTR [%3+1] |
| lddqu xmm3, XMMWORD PTR [%3+2] |
| |
| psadbw xmm1, xmm0 |
| psadbw xmm2, xmm0 |
| psadbw xmm3, xmm0 |
| |
| paddw xmm5, xmm1 |
| paddw xmm6, xmm2 |
| paddw xmm7, xmm3 |
| %endif |
| movdqa xmm0, XMMWORD PTR [%2+%4] |
| lddqu xmm1, XMMWORD PTR [%3+%5] |
| lddqu xmm2, XMMWORD PTR [%3+%5+1] |
| lddqu xmm3, XMMWORD PTR [%3+%5+2] |
| |
| %if %1==0 || %1==1 |
| lea %2, [%2+%4*2] |
| lea %3, [%3+%5*2] |
| %endif |
| |
| psadbw xmm1, xmm0 |
| psadbw xmm2, xmm0 |
| psadbw xmm3, xmm0 |
| |
| paddw xmm5, xmm1 |
| paddw xmm6, xmm2 |
| paddw xmm7, xmm3 |
| %endmacro |
| |
| %macro PROCESS_8X2X3 5 |
| %if %1==0 |
| movq mm0, QWORD PTR [%2] |
| movq mm5, QWORD PTR [%3] |
| movq mm6, QWORD PTR [%3+1] |
| movq mm7, QWORD PTR [%3+2] |
| |
| psadbw mm5, mm0 |
| psadbw mm6, mm0 |
| psadbw mm7, mm0 |
| %else |
| movq mm0, QWORD PTR [%2] |
| movq mm1, QWORD PTR [%3] |
| movq mm2, QWORD PTR [%3+1] |
| movq mm3, QWORD PTR [%3+2] |
| |
| psadbw mm1, mm0 |
| psadbw mm2, mm0 |
| psadbw mm3, mm0 |
| |
| paddw mm5, mm1 |
| paddw mm6, mm2 |
| paddw mm7, mm3 |
| %endif |
| movq mm0, QWORD PTR [%2+%4] |
| movq mm1, QWORD PTR [%3+%5] |
| movq mm2, QWORD PTR [%3+%5+1] |
| movq mm3, QWORD PTR [%3+%5+2] |
| |
| %if %1==0 || %1==1 |
| lea %2, [%2+%4*2] |
| lea %3, [%3+%5*2] |
| %endif |
| |
| psadbw mm1, mm0 |
| psadbw mm2, mm0 |
| psadbw mm3, mm0 |
| |
| paddw mm5, mm1 |
| paddw mm6, mm2 |
| paddw mm7, mm3 |
| %endmacro |
| |
| ;void int aom_sad16x16x3_sse3( |
| ; unsigned char *src_ptr, |
| ; int src_stride, |
| ; unsigned char *ref_ptr, |
| ; int ref_stride, |
| ; int *results) |
| global sym(aom_sad16x16x3_sse3) PRIVATE |
| sym(aom_sad16x16x3_sse3): |
| |
| STACK_FRAME_CREATE_X3 |
| |
| PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride |
| PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
| PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
| PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
| PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
| PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
| PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
| PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride |
| |
| mov rcx, result_ptr |
| |
| movq xmm0, xmm5 |
| psrldq xmm5, 8 |
| |
| paddw xmm0, xmm5 |
| movd [rcx], xmm0 |
| ;- |
| movq xmm0, xmm6 |
| psrldq xmm6, 8 |
| |
| paddw xmm0, xmm6 |
| movd [rcx+4], xmm0 |
| ;- |
| movq xmm0, xmm7 |
| psrldq xmm7, 8 |
| |
| paddw xmm0, xmm7 |
| movd [rcx+8], xmm0 |
| |
| STACK_FRAME_DESTROY_X3 |
| |
| ;void int aom_sad16x8x3_sse3( |
| ; unsigned char *src_ptr, |
| ; int src_stride, |
| ; unsigned char *ref_ptr, |
| ; int ref_stride, |
| ; int *results) |
| global sym(aom_sad16x8x3_sse3) PRIVATE |
| sym(aom_sad16x8x3_sse3): |
| |
| STACK_FRAME_CREATE_X3 |
| |
| PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride |
| PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
| PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
| PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride |
| |
| mov rcx, result_ptr |
| |
| movq xmm0, xmm5 |
| psrldq xmm5, 8 |
| |
| paddw xmm0, xmm5 |
| movd [rcx], xmm0 |
| ;- |
| movq xmm0, xmm6 |
| psrldq xmm6, 8 |
| |
| paddw xmm0, xmm6 |
| movd [rcx+4], xmm0 |
| ;- |
| movq xmm0, xmm7 |
| psrldq xmm7, 8 |
| |
| paddw xmm0, xmm7 |
| movd [rcx+8], xmm0 |
| |
| STACK_FRAME_DESTROY_X3 |
| |
| ;void int aom_sad8x16x3_sse3( |
| ; unsigned char *src_ptr, |
| ; int src_stride, |
| ; unsigned char *ref_ptr, |
| ; int ref_stride, |
| ; int *results) |
| global sym(aom_sad8x16x3_sse3) PRIVATE |
| sym(aom_sad8x16x3_sse3): |
| |
| STACK_FRAME_CREATE_X3 |
| |
| PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride |
| PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
| PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
| PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
| PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
| PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
| PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
| PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride |
| |
| mov rcx, result_ptr |
| |
| punpckldq mm5, mm6 |
| |
| movq [rcx], mm5 |
| movd [rcx+8], mm7 |
| |
| STACK_FRAME_DESTROY_X3 |
| |
| ;void int aom_sad8x8x3_sse3( |
| ; unsigned char *src_ptr, |
| ; int src_stride, |
| ; unsigned char *ref_ptr, |
| ; int ref_stride, |
| ; int *results) |
| global sym(aom_sad8x8x3_sse3) PRIVATE |
| sym(aom_sad8x8x3_sse3): |
| |
| STACK_FRAME_CREATE_X3 |
| |
| PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride |
| PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
| PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
| PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride |
| |
| mov rcx, result_ptr |
| |
| punpckldq mm5, mm6 |
| |
| movq [rcx], mm5 |
| movd [rcx+8], mm7 |
| |
| STACK_FRAME_DESTROY_X3 |
| |
| ;void int aom_sad4x4x3_sse3( |
| ; unsigned char *src_ptr, |
| ; int src_stride, |
| ; unsigned char *ref_ptr, |
| ; int ref_stride, |
| ; int *results) |
| global sym(aom_sad4x4x3_sse3) PRIVATE |
| sym(aom_sad4x4x3_sse3): |
| |
| STACK_FRAME_CREATE_X3 |
| |
| movd mm0, DWORD PTR [src_ptr] |
| movd mm1, DWORD PTR [ref_ptr] |
| |
| movd mm2, DWORD PTR [src_ptr+src_stride] |
| movd mm3, DWORD PTR [ref_ptr+ref_stride] |
| |
| punpcklbw mm0, mm2 |
| punpcklbw mm1, mm3 |
| |
| movd mm4, DWORD PTR [ref_ptr+1] |
| movd mm5, DWORD PTR [ref_ptr+2] |
| |
| movd mm2, DWORD PTR [ref_ptr+ref_stride+1] |
| movd mm3, DWORD PTR [ref_ptr+ref_stride+2] |
| |
| psadbw mm1, mm0 |
| |
| punpcklbw mm4, mm2 |
| punpcklbw mm5, mm3 |
| |
| psadbw mm4, mm0 |
| psadbw mm5, mm0 |
| |
| lea src_ptr, [src_ptr+src_stride*2] |
| lea ref_ptr, [ref_ptr+ref_stride*2] |
| |
| movd mm0, DWORD PTR [src_ptr] |
| movd mm2, DWORD PTR [ref_ptr] |
| |
| movd mm3, DWORD PTR [src_ptr+src_stride] |
| movd mm6, DWORD PTR [ref_ptr+ref_stride] |
| |
| punpcklbw mm0, mm3 |
| punpcklbw mm2, mm6 |
| |
| movd mm3, DWORD PTR [ref_ptr+1] |
| movd mm7, DWORD PTR [ref_ptr+2] |
| |
| psadbw mm2, mm0 |
| |
| paddw mm1, mm2 |
| |
| movd mm2, DWORD PTR [ref_ptr+ref_stride+1] |
| movd mm6, DWORD PTR [ref_ptr+ref_stride+2] |
| |
| punpcklbw mm3, mm2 |
| punpcklbw mm7, mm6 |
| |
| psadbw mm3, mm0 |
| psadbw mm7, mm0 |
| |
| paddw mm3, mm4 |
| paddw mm7, mm5 |
| |
| mov rcx, result_ptr |
| |
| punpckldq mm1, mm3 |
| |
| movq [rcx], mm1 |
| movd [rcx+8], mm7 |
| |
| STACK_FRAME_DESTROY_X3 |