| ; |
| ; Copyright (c) 2016, Alliance for Open Media. All rights reserved |
| ; |
| ; This source code is subject to the terms of the BSD 2 Clause License and |
| ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
| ; was not distributed with this source code in the LICENSE file, you can |
| ; obtain it at www.aomedia.org/license/software. If the Alliance for Open |
| ; Media Patent License 1.0 was not distributed with this source code in the |
| ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
| ; |
| |
| ; |
| |
| %include "third_party/x86inc/x86inc.asm" |
| |
| SECTION .text |
| |
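; convolve_fn generates the SSE2 copy/avg "convolve" kernels. %1 selects the
; operation (copy or avg); the optional second argument (highbd) selects the
; high bit depth variant, which works on 16-bit pixels and averages with
; pavgw instead of pavgb. The filter arguments (fx, fxs, fy, fys) exist only
; to match the common convolve signature and are ignored by these kernels.
; The C-side prototypes are not defined in this file; assuming the usual
; aom_dsp convention they look roughly like:
;
;   void aom_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride,
;                               uint8_t *dst, ptrdiff_t dst_stride,
;                               const int16_t *filter_x, int filter_x_stride,
;                               const int16_t *filter_y, int filter_y_stride,
;                               int w, int h);
;
; with uint16_t pixel pointers and a trailing bit depth argument (bd) for
; the highbd variants.
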
| %macro convolve_fn 1-2 |
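; The avg variant needs four scratch XMM registers (m4-m7) for the narrow
; row paths (.w8 and .w4); the copy variant needs none.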
| %ifidn %1, avg |
| %define AUX_XMM_REGS 4 |
| %else |
| %define AUX_XMM_REGS 0 |
| %endif |
| %ifidn %2, highbd |
| %define pavg pavgw |
| cglobal %2_convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \ |
| dst, dst_stride, \ |
| fx, fxs, fy, fys, w, h, bd |
| %else |
| %define pavg pavgb |
| cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \ |
| dst, dst_stride, \ |
| fx, fxs, fy, fys, w, h |
| %endif |
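; Dispatch on the row width in bytes. The high bit depth variant doubles
; the width and both strides up front so every path below works in bytes.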
| mov r4d, dword wm |
| %ifidn %2, highbd |
| shl r4d, 1 |
| shl srcq, 1 |
| shl src_strideq, 1 |
| shl dstq, 1 |
| shl dst_strideq, 1 |
| %else |
| cmp r4d, 4 |
| je .w4 |
| %endif |
| cmp r4d, 8 |
| je .w8 |
| cmp r4d, 16 |
| je .w16 |
| cmp r4d, 32 |
| je .w32 |
| |
| %if CONFIG_AV1 && CONFIG_EXT_PARTITION |
| cmp r4d, 64 |
| je .w64 |
| %ifidn %2, highbd |
| cmp r4d, 128 |
| je .w128 |
| |
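; 256 bytes per row: 128-pixel high bit depth rows (reached by falling
; through the width checks above); four 64-byte chunks per row.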
| .w256: |
| mov r4d, dword hm |
| .loop256: |
| movu m0, [srcq] |
| movu m1, [srcq+16] |
| movu m2, [srcq+32] |
| movu m3, [srcq+48] |
| %ifidn %1, avg |
| pavg m0, [dstq] |
| pavg m1, [dstq+16] |
| pavg m2, [dstq+32] |
| pavg m3, [dstq+48] |
| %endif |
| mova [dstq ], m0 |
| mova [dstq+16], m1 |
| mova [dstq+32], m2 |
| mova [dstq+48], m3 |
| movu m0, [srcq+64] |
| movu m1, [srcq+80] |
| movu m2, [srcq+96] |
| movu m3, [srcq+112] |
| %ifidn %1, avg |
| pavg m0, [dstq+64] |
| pavg m1, [dstq+80] |
| pavg m2, [dstq+96] |
| pavg m3, [dstq+112] |
| %endif |
| mova [dstq+64], m0 |
| mova [dstq+80], m1 |
| mova [dstq+96], m2 |
| mova [dstq+112], m3 |
| movu m0, [srcq+128] |
| movu m1, [srcq+128+16] |
| movu m2, [srcq+128+32] |
| movu m3, [srcq+128+48] |
| %ifidn %1, avg |
| pavg m0, [dstq+128] |
| pavg m1, [dstq+128+16] |
| pavg m2, [dstq+128+32] |
| pavg m3, [dstq+128+48] |
| %endif |
| mova [dstq+128 ], m0 |
| mova [dstq+128+16], m1 |
| mova [dstq+128+32], m2 |
| mova [dstq+128+48], m3 |
| movu m0, [srcq+128+64] |
| movu m1, [srcq+128+80] |
| movu m2, [srcq+128+96] |
| movu m3, [srcq+128+112] |
| add srcq, src_strideq |
| %ifidn %1, avg |
| pavg m0, [dstq+128+64] |
| pavg m1, [dstq+128+80] |
| pavg m2, [dstq+128+96] |
| pavg m3, [dstq+128+112] |
| %endif |
| mova [dstq+128+64], m0 |
| mova [dstq+128+80], m1 |
| mova [dstq+128+96], m2 |
| mova [dstq+128+112], m3 |
| add dstq, dst_strideq |
| sub r4d, 1 |
| jnz .loop256 |
| RET |
%endif ; highbd
| |
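; 128 bytes per row: 128-pixel 8-bit rows (fall-through) or 64-pixel high
; bit depth rows (je .w128 above).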
| .w128: |
| mov r4d, dword hm |
| .loop128: |
| movu m0, [srcq] |
| movu m1, [srcq+16] |
| movu m2, [srcq+32] |
| movu m3, [srcq+48] |
| %ifidn %1, avg |
| pavg m0, [dstq] |
| pavg m1, [dstq+16] |
| pavg m2, [dstq+32] |
| pavg m3, [dstq+48] |
| %endif |
| mova [dstq ], m0 |
| mova [dstq+16], m1 |
| mova [dstq+32], m2 |
| mova [dstq+48], m3 |
| movu m0, [srcq+64] |
| movu m1, [srcq+80] |
| movu m2, [srcq+96] |
| movu m3, [srcq+112] |
| add srcq, src_strideq |
| %ifidn %1, avg |
| pavg m0, [dstq+64] |
| pavg m1, [dstq+80] |
| pavg m2, [dstq+96] |
| pavg m3, [dstq+112] |
| %endif |
| mova [dstq+64], m0 |
| mova [dstq+80], m1 |
| mova [dstq+96], m2 |
| mova [dstq+112], m3 |
| add dstq, dst_strideq |
| sub r4d, 1 |
| jnz .loop128 |
| RET |
| |
| %else ; CONFIG_AV1 && CONFIG_EXT_PARTITION |
| |
| %ifidn %2, highbd |
| cmp r4d, 64 |
| je .w64 |
| |
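; Without CONFIG_EXT_PARTITION the widest block is 64 pixels, so 128-byte
; rows can only be 64-pixel high bit depth rows. They are handled here by
; fall-through, so the loop needs no .w128 label.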
| mov r4d, dword hm |
| .loop128: |
| movu m0, [srcq] |
| movu m1, [srcq+16] |
| movu m2, [srcq+32] |
| movu m3, [srcq+48] |
| %ifidn %1, avg |
| pavg m0, [dstq] |
| pavg m1, [dstq+16] |
| pavg m2, [dstq+32] |
| pavg m3, [dstq+48] |
| %endif |
| mova [dstq ], m0 |
| mova [dstq+16], m1 |
| mova [dstq+32], m2 |
| mova [dstq+48], m3 |
| movu m0, [srcq+64] |
| movu m1, [srcq+80] |
| movu m2, [srcq+96] |
| movu m3, [srcq+112] |
| add srcq, src_strideq |
| %ifidn %1, avg |
| pavg m0, [dstq+64] |
| pavg m1, [dstq+80] |
| pavg m2, [dstq+96] |
| pavg m3, [dstq+112] |
| %endif |
| mova [dstq+64], m0 |
| mova [dstq+80], m1 |
| mova [dstq+96], m2 |
| mova [dstq+112], m3 |
| add dstq, dst_strideq |
| sub r4d, 1 |
| jnz .loop128 |
| RET |
%endif ; highbd
| %endif ; CONFIG_AV1 && CONFIG_EXT_PARTITION |
| |
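; 64 bytes per row: 64-pixel 8-bit rows or 32-pixel high bit depth rows,
; one row per iteration.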
| .w64: |
| mov r4d, dword hm |
| .loop64: |
| movu m0, [srcq] |
| movu m1, [srcq+16] |
| movu m2, [srcq+32] |
| movu m3, [srcq+48] |
| add srcq, src_strideq |
| %ifidn %1, avg |
| pavg m0, [dstq] |
| pavg m1, [dstq+16] |
| pavg m2, [dstq+32] |
| pavg m3, [dstq+48] |
| %endif |
| mova [dstq ], m0 |
| mova [dstq+16], m1 |
| mova [dstq+32], m2 |
| mova [dstq+48], m3 |
| add dstq, dst_strideq |
| sub r4d, 1 |
| jnz .loop64 |
| RET |
| |
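; 32 bytes per row: 32-pixel 8-bit rows or 16-pixel high bit depth rows,
; two rows per iteration.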
| .w32: |
| mov r4d, dword hm |
| .loop32: |
| movu m0, [srcq] |
| movu m1, [srcq+16] |
| movu m2, [srcq+src_strideq] |
| movu m3, [srcq+src_strideq+16] |
| lea srcq, [srcq+src_strideq*2] |
| %ifidn %1, avg |
| pavg m0, [dstq] |
| pavg m1, [dstq +16] |
| pavg m2, [dstq+dst_strideq] |
| pavg m3, [dstq+dst_strideq+16] |
| %endif |
| mova [dstq ], m0 |
| mova [dstq +16], m1 |
| mova [dstq+dst_strideq ], m2 |
| mova [dstq+dst_strideq+16], m3 |
| lea dstq, [dstq+dst_strideq*2] |
| sub r4d, 2 |
| jnz .loop32 |
| RET |
| |
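; 16 bytes per row: 16-pixel 8-bit rows or 8-pixel high bit depth rows,
; four rows per iteration; r5q/r6q hold 3*stride so the fourth row can be
; addressed without advancing the pointers.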
| .w16: |
| mov r4d, dword hm |
| lea r5q, [src_strideq*3] |
| lea r6q, [dst_strideq*3] |
| .loop16: |
| movu m0, [srcq] |
| movu m1, [srcq+src_strideq] |
| movu m2, [srcq+src_strideq*2] |
| movu m3, [srcq+r5q] |
| lea srcq, [srcq+src_strideq*4] |
| %ifidn %1, avg |
| pavg m0, [dstq] |
| pavg m1, [dstq+dst_strideq] |
| pavg m2, [dstq+dst_strideq*2] |
| pavg m3, [dstq+r6q] |
| %endif |
| mova [dstq ], m0 |
| mova [dstq+dst_strideq ], m1 |
| mova [dstq+dst_strideq*2], m2 |
| mova [dstq+r6q ], m3 |
| lea dstq, [dstq+dst_strideq*4] |
| sub r4d, 4 |
| jnz .loop16 |
| RET |
| |
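; 8 bytes per row: 8-pixel 8-bit rows or 4-pixel high bit depth rows, four
; rows per iteration. For avg the dst rows are first loaded into m4-m7 with
; movh: pavg takes a full 16-byte (aligned) memory operand, which cannot be
; used for 8-byte rows.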
| .w8: |
| mov r4d, dword hm |
| lea r5q, [src_strideq*3] |
| lea r6q, [dst_strideq*3] |
| .loop8: |
| movh m0, [srcq] |
| movh m1, [srcq+src_strideq] |
| movh m2, [srcq+src_strideq*2] |
| movh m3, [srcq+r5q] |
| lea srcq, [srcq+src_strideq*4] |
| %ifidn %1, avg |
| movh m4, [dstq] |
| movh m5, [dstq+dst_strideq] |
| movh m6, [dstq+dst_strideq*2] |
| movh m7, [dstq+r6q] |
| pavg m0, m4 |
| pavg m1, m5 |
| pavg m2, m6 |
| pavg m3, m7 |
| %endif |
| movh [dstq ], m0 |
| movh [dstq+dst_strideq ], m1 |
| movh [dstq+dst_strideq*2], m2 |
| movh [dstq+r6q ], m3 |
| lea dstq, [dstq+dst_strideq*4] |
| sub r4d, 4 |
| jnz .loop8 |
| RET |
| |
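; 4 bytes per row: 4-pixel rows occur only in the 8-bit path (a 4-pixel
; high bit depth row is 8 bytes and takes the .w8 path above).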
| %ifnidn %2, highbd |
| .w4: |
| mov r4d, dword hm |
| lea r5q, [src_strideq*3] |
| lea r6q, [dst_strideq*3] |
| .loop4: |
| movd m0, [srcq] |
| movd m1, [srcq+src_strideq] |
| movd m2, [srcq+src_strideq*2] |
| movd m3, [srcq+r5q] |
| lea srcq, [srcq+src_strideq*4] |
| %ifidn %1, avg |
| movd m4, [dstq] |
| movd m5, [dstq+dst_strideq] |
| movd m6, [dstq+dst_strideq*2] |
| movd m7, [dstq+r6q] |
| pavg m0, m4 |
| pavg m1, m5 |
| pavg m2, m6 |
| pavg m3, m7 |
| %endif |
| movd [dstq ], m0 |
| movd [dstq+dst_strideq ], m1 |
| movd [dstq+dst_strideq*2], m2 |
| movd [dstq+r6q ], m3 |
| lea dstq, [dstq+dst_strideq*4] |
| sub r4d, 4 |
| jnz .loop4 |
| RET |
%endif ; !highbd
| %endmacro |
| |
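; Instantiate the SSE2 copy and avg kernels; the high bit depth variants
; are built only when CONFIG_HIGHBITDEPTH is enabled.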
| INIT_XMM sse2 |
| convolve_fn copy |
| convolve_fn avg |
| %if CONFIG_HIGHBITDEPTH |
| convolve_fn copy, highbd |
| convolve_fn avg, highbd |
| %endif |