| ; |
| ; Copyright (c) 2016, Alliance for Open Media. All rights reserved. |
| ; |
| ; This source code is subject to the terms of the BSD 2 Clause License and |
| ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
| ; was not distributed with this source code in the LICENSE file, you can |
| ; obtain it at www.aomedia.org/license/software. If the Alliance for Open |
| ; Media Patent License 1.0 was not distributed with this source code in the |
| ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
| ; |
| |
| ; |
| |
| %include "third_party/x86inc/x86inc.asm" |
| |
| SECTION_RODATA |
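; pw_64 is the rounding bias added before the final >> 7; even_byte_mask
; selects the even bytes of a row (used by the h8_add_src variant of the
; 16-wide horizontal filter).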
| pw_64: times 8 dw 64 |
| even_byte_mask: times 8 dw 0x00ff |
| |
| ; %define USE_PMULHRSW |
| ; NOTE: pmulhrsw has a latency of 5 cycles. Tests showed a performance loss |
| ; when using this instruction. |
| ; |
; The add order below (based on ffav1) must be followed to keep the
; intermediate sums inside the signed 16-bit range; paddsw would otherwise
; clip them.
| ; x = k0k1 + k4k5 |
| ; y = k2k3 + k6k7 |
| ; z = signed SAT(x + y) |
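;
; Rationale (informal): with a typical 8-tap filter the two large center taps
; land in the k2k3 and k4k5 pairs, so each sum above combines one large
; partial result with one small one; adding k2k3 + k4k5 first could push the
; intermediate outside the int16 range before the negative outer taps bring
; it back.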
| |
| SECTION .text |
| %define LOCAL_VARS_SIZE 16*6 |
| |
| %macro SETUP_LOCAL_VARS 0 |
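; Packs the eight 16-bit filter taps in m4 to signed bytes and broadcasts
; them pairwise into k0k1..k6k7 for pmaddubsw, then loads (or, in PIC x86-32
; builds, synthesizes) the rounding constant krd = pw_64.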
; TODO(slavarnway): use xmm registers for these on AOM_ARCH_X86_64.
; pmaddubsw has a higher latency on some platforms; this might be eased by
; interleaving the instructions.
| %define k0k1 [rsp + 16*0] |
| %define k2k3 [rsp + 16*1] |
| %define k4k5 [rsp + 16*2] |
| %define k6k7 [rsp + 16*3] |
| packsswb m4, m4 |
| ; TODO(slavarnway): multiple pshufb instructions had a higher latency on |
| ; some platforms. |
| pshuflw m0, m4, 0b ;k0_k1 |
| pshuflw m1, m4, 01010101b ;k2_k3 |
| pshuflw m2, m4, 10101010b ;k4_k5 |
| pshuflw m3, m4, 11111111b ;k6_k7 |
| punpcklqdq m0, m0 |
| punpcklqdq m1, m1 |
| punpcklqdq m2, m2 |
| punpcklqdq m3, m3 |
| mova k0k1, m0 |
| mova k2k3, m1 |
| mova k4k5, m2 |
| mova k6k7, m3 |
| %if AOM_ARCH_X86_64 |
| %define krd m12 |
| %define tmp0 [rsp + 16*4] |
| %define tmp1 [rsp + 16*5] |
| mova krd, [GLOBAL(pw_64)] |
| %else |
| %define krd [rsp + 16*4] |
| %if CONFIG_PIC=0 |
| mova m6, [GLOBAL(pw_64)] |
| %else |
| ; build constants without accessing global memory |
| pcmpeqb m6, m6 ;all ones |
| psrlw m6, 15 |
| psllw m6, 6 ;aka pw_64 |
| %endif |
| mova krd, m6 |
| %endif |
| %endm |
| |
| ;------------------------------------------------------------------------------- |
| %if AOM_ARCH_X86_64 |
| %define LOCAL_VARS_SIZE_H4 0 |
| %else |
| %define LOCAL_VARS_SIZE_H4 16*4 |
| %endif |
| |
| %macro SUBPIX_HFILTER4 1 |
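; Horizontal 8-tap filter for 4-wide blocks. The taps are packed as
; k0k1k4k5 / k2k3k6k7 so each 4-pixel row needs only two pmaddubsw;
; two rows are filtered per loop iteration.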
| cglobal filter_block1d4_%1, 6, 6, 11, LOCAL_VARS_SIZE_H4, \ |
| src, sstride, dst, dstride, height, filter |
| mova m4, [filterq] |
| packsswb m4, m4 |
| %if AOM_ARCH_X86_64 |
| %define k0k1k4k5 m8 |
| %define k2k3k6k7 m9 |
| %define krd m10 |
| mova krd, [GLOBAL(pw_64)] |
| pshuflw k0k1k4k5, m4, 0b ;k0_k1 |
| pshufhw k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5 |
| pshuflw k2k3k6k7, m4, 01010101b ;k2_k3 |
| pshufhw k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7 |
| %else |
| %define k0k1k4k5 [rsp + 16*0] |
| %define k2k3k6k7 [rsp + 16*1] |
| %define krd [rsp + 16*2] |
| pshuflw m6, m4, 0b ;k0_k1 |
| pshufhw m6, m6, 10101010b ;k0_k1_k4_k5 |
| pshuflw m7, m4, 01010101b ;k2_k3 |
| pshufhw m7, m7, 11111111b ;k2_k3_k6_k7 |
| %if CONFIG_PIC=0 |
| mova m1, [GLOBAL(pw_64)] |
| %else |
| ; build constants without accessing global memory |
| pcmpeqb m1, m1 ;all ones |
| psrlw m1, 15 |
| psllw m1, 6 ;aka pw_64 |
| %endif |
| mova k0k1k4k5, m6 |
| mova k2k3k6k7, m7 |
| mova krd, m1 |
| %endif |
| dec heightd |
| |
| .loop: |
| ;Do two rows at once |
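; punpck{l,h}bw x,x duplicates each source byte; palignr by 1 and 5 then
; yields the (s[i], s[i+1]) byte pairs that pmaddubsw dots with each tap pair.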
| movu m4, [srcq - 3] |
| movu m5, [srcq + sstrideq - 3] |
| punpckhbw m1, m4, m4 |
| punpcklbw m4, m4 |
| punpckhbw m3, m5, m5 |
| punpcklbw m5, m5 |
| palignr m0, m1, m4, 1 |
| pmaddubsw m0, k0k1k4k5 |
| palignr m1, m4, 5 |
| pmaddubsw m1, k2k3k6k7 |
| palignr m2, m3, m5, 1 |
| pmaddubsw m2, k0k1k4k5 |
| palignr m3, m5, 5 |
| pmaddubsw m3, k2k3k6k7 |
| punpckhqdq m4, m0, m2 |
| punpcklqdq m0, m2 |
| punpckhqdq m5, m1, m3 |
| punpcklqdq m1, m3 |
| paddsw m0, m4 |
| paddsw m1, m5 |
| %ifidn %1, h8_avg |
| movd m4, [dstq] |
| movd m5, [dstq + dstrideq] |
| %endif |
| paddsw m0, m1 |
| paddsw m0, krd |
| psraw m0, 7 |
| %ifidn %1, h8_add_src |
| pxor m3, m3 |
| movu m4, [srcq] |
| movu m5, [srcq + sstrideq] |
punpckldq m4, m5 ; Bytes 0,1,2,3 from row 0, then 0,1,2,3 from row 1
| punpcklbw m4, m3 |
| paddsw m0, m4 |
| %endif |
| packuswb m0, m0 |
| psrldq m1, m0, 4 |
| |
| %ifidn %1, h8_avg |
| pavgb m0, m4 |
| pavgb m1, m5 |
| %endif |
| movd [dstq], m0 |
| movd [dstq + dstrideq], m1 |
| |
| lea srcq, [srcq + sstrideq ] |
| prefetcht0 [srcq + 4 * sstrideq - 3] |
| lea srcq, [srcq + sstrideq ] |
| lea dstq, [dstq + 2 * dstrideq ] |
| prefetcht0 [srcq + 2 * sstrideq - 3] |
| |
| sub heightd, 2 |
| jg .loop |
| |
| ; Do last row if output_height is odd |
| jne .done |
| |
| movu m4, [srcq - 3] |
| punpckhbw m1, m4, m4 |
| punpcklbw m4, m4 |
| palignr m0, m1, m4, 1 |
| palignr m1, m4, 5 |
| pmaddubsw m0, k0k1k4k5 |
| pmaddubsw m1, k2k3k6k7 |
| psrldq m2, m0, 8 |
| psrldq m3, m1, 8 |
| paddsw m0, m2 |
| paddsw m1, m3 |
| paddsw m0, m1 |
| paddsw m0, krd |
| psraw m0, 7 |
| %ifidn %1, h8_add_src |
| pxor m3, m3 |
| movu m4, [srcq] |
| punpcklbw m4, m3 |
| paddsw m0, m4 |
| %endif |
| packuswb m0, m0 |
| %ifidn %1, h8_avg |
| movd m4, [dstq] |
| pavgb m0, m4 |
| %endif |
| movd [dstq], m0 |
| .done: |
| REP_RET |
| %endm |
| |
| ;------------------------------------------------------------------------------- |
| %macro SUBPIX_HFILTER8 1 |
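; Horizontal 8-tap filter for 8-wide blocks: each duplicated source row is
; shifted with palignr by 1, 5, 9, and 13 to form the byte-pair streams for
; k0k1, k2k3, k4k5, and k6k7; two rows are filtered per loop iteration.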
| cglobal filter_block1d8_%1, 6, 6, 14, LOCAL_VARS_SIZE, \ |
| src, sstride, dst, dstride, height, filter |
| mova m4, [filterq] |
| SETUP_LOCAL_VARS |
| dec heightd |
| |
| .loop: |
| ;Do two rows at once |
| movu m0, [srcq - 3] |
| movu m4, [srcq + sstrideq - 3] |
| punpckhbw m1, m0, m0 |
| punpcklbw m0, m0 |
| palignr m5, m1, m0, 13 |
| pmaddubsw m5, k6k7 |
| palignr m2, m1, m0, 5 |
| palignr m3, m1, m0, 9 |
| palignr m1, m0, 1 |
| pmaddubsw m1, k0k1 |
| punpckhbw m6, m4, m4 |
| punpcklbw m4, m4 |
| pmaddubsw m2, k2k3 |
| pmaddubsw m3, k4k5 |
| |
| palignr m7, m6, m4, 13 |
| palignr m0, m6, m4, 5 |
| pmaddubsw m7, k6k7 |
| paddsw m1, m3 |
| paddsw m2, m5 |
| paddsw m1, m2 |
| %ifidn %1, h8_avg |
| movh m2, [dstq] |
| movhps m2, [dstq + dstrideq] |
| %endif |
| palignr m5, m6, m4, 9 |
| palignr m6, m4, 1 |
| pmaddubsw m0, k2k3 |
| pmaddubsw m6, k0k1 |
| paddsw m1, krd |
| pmaddubsw m5, k4k5 |
| psraw m1, 7 |
| paddsw m0, m7 |
| paddsw m6, m5 |
| paddsw m6, m0 |
| paddsw m6, krd |
| psraw m6, 7 |
| %ifidn %1, h8_add_src |
| pxor m3, m3 |
| movu m4, [srcq] |
| movu m5, [srcq + sstrideq] |
| punpcklbw m4, m3 |
| punpcklbw m5, m3 |
| paddsw m1, m4 |
| paddsw m6, m5 |
| %endif |
| packuswb m1, m6 |
| %ifidn %1, h8_avg |
| pavgb m1, m2 |
| %endif |
| movh [dstq], m1 |
| movhps [dstq + dstrideq], m1 |
| |
| lea srcq, [srcq + sstrideq ] |
| prefetcht0 [srcq + 4 * sstrideq - 3] |
| lea srcq, [srcq + sstrideq ] |
| lea dstq, [dstq + 2 * dstrideq ] |
| prefetcht0 [srcq + 2 * sstrideq - 3] |
| sub heightd, 2 |
| jg .loop |
| |
| ; Do last row if output_height is odd |
| jne .done |
| |
| movu m0, [srcq - 3] |
| punpckhbw m3, m0, m0 |
| punpcklbw m0, m0 |
| palignr m1, m3, m0, 1 |
| palignr m2, m3, m0, 5 |
| palignr m4, m3, m0, 13 |
| palignr m3, m0, 9 |
| pmaddubsw m1, k0k1 |
| pmaddubsw m2, k2k3 |
| pmaddubsw m3, k4k5 |
| pmaddubsw m4, k6k7 |
| paddsw m1, m3 |
| paddsw m4, m2 |
| paddsw m1, m4 |
| paddsw m1, krd |
| psraw m1, 7 |
| %ifidn %1, h8_add_src |
| pxor m6, m6 |
| movu m5, [srcq] |
| punpcklbw m5, m6 |
| paddsw m1, m5 |
| %endif |
| packuswb m1, m1 |
| %ifidn %1, h8_avg |
| movh m0, [dstq] |
| pavgb m1, m0 |
| %endif |
| movh [dstq], m1 |
| .done: |
| REP_RET |
| %endm |
| |
| ;------------------------------------------------------------------------------- |
| %macro SUBPIX_HFILTER16 1 |
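; Horizontal 8-tap filter for 16-wide blocks: eight unaligned loads at
; offsets -3 .. +4 feed one pmaddubsw per tap pair; m0..m3 accumulate the
; even output pixels and m4..m7 the odd ones, re-interleaved by punpcklbw
; after packing.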
| cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \ |
| src, sstride, dst, dstride, height, filter |
| mova m4, [filterq] |
| SETUP_LOCAL_VARS |
| |
| .loop: |
| prefetcht0 [srcq + 2 * sstrideq -3] |
| |
| movu m0, [srcq - 3] |
| movu m4, [srcq - 2] |
| pmaddubsw m0, k0k1 |
| pmaddubsw m4, k0k1 |
| movu m1, [srcq - 1] |
| movu m5, [srcq + 0] |
| pmaddubsw m1, k2k3 |
| pmaddubsw m5, k2k3 |
| movu m2, [srcq + 1] |
| movu m6, [srcq + 2] |
| pmaddubsw m2, k4k5 |
| pmaddubsw m6, k4k5 |
| movu m3, [srcq + 3] |
| movu m7, [srcq + 4] |
| pmaddubsw m3, k6k7 |
| pmaddubsw m7, k6k7 |
| paddsw m0, m2 |
| paddsw m1, m3 |
| paddsw m0, m1 |
| paddsw m4, m6 |
| paddsw m5, m7 |
| paddsw m4, m5 |
| paddsw m0, krd |
| paddsw m4, krd |
| psraw m0, 7 |
| psraw m4, 7 |
| %ifidn %1, h8_add_src |
| %if AOM_ARCH_X86=1 && CONFIG_PIC=1 |
| pcmpeqb m2, m2 ;all ones |
| psrlw m2, 8 ;even_byte_mask |
| %else |
| mova m2, [GLOBAL(even_byte_mask)] |
| %endif |
| movu m5, [srcq] |
| mova m7, m5 |
| pand m5, m2 |
| psrlw m7, 8 |
| paddsw m0, m5 |
| paddsw m4, m7 |
| %endif |
| packuswb m0, m0 |
| packuswb m4, m4 |
| punpcklbw m0, m4 |
| %ifidn %1, h8_avg |
| pavgb m0, [dstq] |
| %endif |
| lea srcq, [srcq + sstrideq] |
| mova [dstq], m0 |
| lea dstq, [dstq + dstrideq] |
| dec heightd |
| jnz .loop |
| REP_RET |
| %endm |
| |
| INIT_XMM ssse3 |
| SUBPIX_HFILTER16 h8 |
| SUBPIX_HFILTER8 h8 |
| SUBPIX_HFILTER4 h8 |
| |
| ;------------------------------------------------------------------------------- |
| |
; TODO(Linfeng): Detect CPU type and choose the code with better performance.
| %define X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON 1 |
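; When this is set, 64-bit builds also take the register-light vertical loop
; that 32-bit builds must use, reloading source rows from memory on every
; iteration; the name suggests this measured faster on slow Celeron-class
; cores. The alternative x86-64-only loop instead keeps the interleaved row
; pairs live in xmm registers across iterations.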
| |
| %if AOM_ARCH_X86_64 && X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON |
| %define NUM_GENERAL_REG_USED 9 |
| %else |
| %define NUM_GENERAL_REG_USED 6 |
| %endif |
| |
| %macro SUBPIX_VFILTER 2 |
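; Vertical 8-tap filter for 4- or 8-wide blocks (%2 is the width). Pairs of
; source rows are interleaved with punpcklbw so each pmaddubsw applies one
; tap pair down the columns; two output rows are produced per iteration.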
| cglobal filter_block1d%2_%1, 6, NUM_GENERAL_REG_USED, 15, LOCAL_VARS_SIZE, \ |
| src, sstride, dst, dstride, height, filter |
| mova m4, [filterq] |
| SETUP_LOCAL_VARS |
| |
| %ifidn %2, 8 |
| %define movx movh |
| %else |
| %define movx movd |
| %endif |
| |
| dec heightd |
| |
| %if AOM_ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON |
| |
| %if AOM_ARCH_X86_64 |
| %define src1q r7 |
| %define sstride6q r8 |
| %define dst_stride dstrideq |
| %else |
| %define src1q filterq |
| %define sstride6q dstrideq |
| %define dst_stride dstridemp |
| %endif |
| mov src1q, srcq |
| add src1q, sstrideq |
| lea sstride6q, [sstrideq + sstrideq * 4] |
| add sstride6q, sstrideq ;pitch * 6 |
| |
| .loop: |
| ;Do two rows at once |
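; The second output row reuses rows B..H as its A..G, so only one extra
; row ("H next iter") has to be loaded.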
| movx m0, [srcq ] ;A |
| movx m1, [src1q ] ;B |
| punpcklbw m0, m1 ;A B |
| movx m2, [srcq + sstrideq * 2 ] ;C |
| pmaddubsw m0, k0k1 |
| mova m6, m2 |
| movx m3, [src1q + sstrideq * 2] ;D |
| punpcklbw m2, m3 ;C D |
| pmaddubsw m2, k2k3 |
| movx m4, [srcq + sstrideq * 4 ] ;E |
| mova m7, m4 |
| movx m5, [src1q + sstrideq * 4] ;F |
| punpcklbw m4, m5 ;E F |
| pmaddubsw m4, k4k5 |
| punpcklbw m1, m6 ;A B next iter |
| movx m6, [srcq + sstride6q ] ;G |
| punpcklbw m5, m6 ;E F next iter |
| punpcklbw m3, m7 ;C D next iter |
| pmaddubsw m5, k4k5 |
| movx m7, [src1q + sstride6q ] ;H |
| punpcklbw m6, m7 ;G H |
| pmaddubsw m6, k6k7 |
| pmaddubsw m3, k2k3 |
| pmaddubsw m1, k0k1 |
| paddsw m0, m4 |
| paddsw m2, m6 |
| movx m6, [srcq + sstrideq * 8 ] ;H next iter |
| punpcklbw m7, m6 |
| pmaddubsw m7, k6k7 |
| paddsw m0, m2 |
| paddsw m0, krd |
| psraw m0, 7 |
| paddsw m1, m5 |
| %ifidn %1, v8_add_src |
| pxor m6, m6 |
| movu m4, [srcq] |
| punpcklbw m4, m6 |
| paddsw m0, m4 |
| %endif |
| packuswb m0, m0 |
| |
| paddsw m3, m7 |
| paddsw m1, m3 |
| paddsw m1, krd |
| psraw m1, 7 |
| %ifidn %1, v8_add_src |
| movu m4, [src1q] |
| punpcklbw m4, m6 |
| paddsw m1, m4 |
| %endif |
| lea srcq, [srcq + sstrideq * 2 ] |
| lea src1q, [src1q + sstrideq * 2] |
| packuswb m1, m1 |
| |
| %ifidn %1, v8_avg |
| movx m2, [dstq] |
| pavgb m0, m2 |
| %endif |
| movx [dstq], m0 |
| add dstq, dst_stride |
| %ifidn %1, v8_avg |
| movx m3, [dstq] |
| pavgb m1, m3 |
| %endif |
| movx [dstq], m1 |
| add dstq, dst_stride |
| sub heightd, 2 |
| jg .loop |
| |
| ; Do last row if output_height is odd |
| jne .done |
| |
| movx m0, [srcq ] ;A |
| movx m1, [srcq + sstrideq ] ;B |
| movx m6, [srcq + sstride6q ] ;G |
| punpcklbw m0, m1 ;A B |
| movx m7, [src1q + sstride6q ] ;H |
| pmaddubsw m0, k0k1 |
| movx m2, [srcq + sstrideq * 2 ] ;C |
| punpcklbw m6, m7 ;G H |
| movx m3, [src1q + sstrideq * 2] ;D |
| pmaddubsw m6, k6k7 |
| movx m4, [srcq + sstrideq * 4 ] ;E |
| punpcklbw m2, m3 ;C D |
| movx m5, [src1q + sstrideq * 4] ;F |
| punpcklbw m4, m5 ;E F |
| pmaddubsw m2, k2k3 |
| pmaddubsw m4, k4k5 |
| paddsw m2, m6 |
| paddsw m0, m4 |
| paddsw m0, m2 |
| paddsw m0, krd |
| psraw m0, 7 |
| %ifidn %1, v8_add_src |
| pxor m6, m6 |
| movu m4, [srcq] |
| punpcklbw m4, m6 |
| paddsw m0, m4 |
| %endif |
| packuswb m0, m0 |
| %ifidn %1, v8_avg |
| movx m1, [dstq] |
| pavgb m0, m1 |
| %endif |
| movx [dstq], m0 |
| |
| %else |
| ; AOM_ARCH_X86_64 |
| |
| movx m0, [srcq ] ;A |
| movx m1, [srcq + sstrideq ] ;B |
| lea srcq, [srcq + sstrideq * 2 ] |
| movx m2, [srcq] ;C |
| movx m3, [srcq + sstrideq] ;D |
| lea srcq, [srcq + sstrideq * 2 ] |
| movx m4, [srcq] ;E |
| movx m5, [srcq + sstrideq] ;F |
| lea srcq, [srcq + sstrideq * 2 ] |
| movx m6, [srcq] ;G |
| punpcklbw m0, m1 ;A B |
| punpcklbw m1, m2 ;A B next iter |
| punpcklbw m2, m3 ;C D |
| punpcklbw m3, m4 ;C D next iter |
| punpcklbw m4, m5 ;E F |
| punpcklbw m5, m6 ;E F next iter |
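; m0..m5 now hold the six interleaved row pairs; each iteration below
; rotates them down one pair and loads just the two new bottom rows.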
| |
| .loop: |
| ;Do two rows at once |
| movx m7, [srcq + sstrideq] ;H |
| lea srcq, [srcq + sstrideq * 2 ] |
| movx m14, [srcq] ;H next iter |
| punpcklbw m6, m7 ;G H |
| punpcklbw m7, m14 ;G H next iter |
| pmaddubsw m8, m0, k0k1 |
| pmaddubsw m9, m1, k0k1 |
| mova m0, m2 |
| mova m1, m3 |
| pmaddubsw m10, m2, k2k3 |
| pmaddubsw m11, m3, k2k3 |
| mova m2, m4 |
| mova m3, m5 |
| pmaddubsw m4, k4k5 |
| pmaddubsw m5, k4k5 |
| paddsw m8, m4 |
| paddsw m9, m5 |
| mova m4, m6 |
| mova m5, m7 |
| pmaddubsw m6, k6k7 |
| pmaddubsw m7, k6k7 |
| paddsw m10, m6 |
| paddsw m11, m7 |
| paddsw m8, m10 |
| paddsw m9, m11 |
| mova m6, m14 |
| paddsw m8, krd |
| paddsw m9, krd |
| psraw m8, 7 |
| psraw m9, 7 |
| %ifidn %2, 4 |
| packuswb m8, m8 |
| packuswb m9, m9 |
| %else |
| packuswb m8, m9 |
| %endif |
| |
| %ifidn %1, v8_avg |
| movx m7, [dstq] |
| %ifidn %2, 4 |
| movx m10, [dstq + dstrideq] |
| pavgb m9, m10 |
| %else |
| movhpd m7, [dstq + dstrideq] |
| %endif |
| pavgb m8, m7 |
| %endif |
| movx [dstq], m8 |
| %ifidn %2, 4 |
| movx [dstq + dstrideq], m9 |
| %else |
| movhpd [dstq + dstrideq], m8 |
| %endif |
| |
| lea dstq, [dstq + dstrideq * 2 ] |
| sub heightd, 2 |
| jg .loop |
| |
| ; Do last row if output_height is odd |
| jne .done |
| |
| movx m7, [srcq + sstrideq] ;H |
| punpcklbw m6, m7 ;G H |
| pmaddubsw m0, k0k1 |
| pmaddubsw m2, k2k3 |
| pmaddubsw m4, k4k5 |
| pmaddubsw m6, k6k7 |
| paddsw m0, m4 |
| paddsw m2, m6 |
| paddsw m0, m2 |
| paddsw m0, krd |
| psraw m0, 7 |
| packuswb m0, m0 |
| %ifidn %1, v8_avg |
| movx m1, [dstq] |
| pavgb m0, m1 |
| %endif |
| movx [dstq], m0 |
| |
| %endif ; AOM_ARCH_X86_64 |
| |
| .done: |
| REP_RET |
| |
| %endm |
| |
| ;------------------------------------------------------------------------------- |
| %macro SUBPIX_VFILTER16 1 |
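; Vertical 8-tap filter for 16-wide blocks. The low and high 8-byte halves
; of each row pair are interleaved and filtered separately, then packed back
; together with packuswb.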
| cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \ |
| src, sstride, dst, dstride, height, filter |
| mova m4, [filterq] |
| SETUP_LOCAL_VARS |
| |
| %if AOM_ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON |
| |
| %if AOM_ARCH_X86_64 |
| %define src1q r7 |
| %define sstride6q r8 |
| %define dst_stride dstrideq |
| %else |
| %define src1q filterq |
| %define sstride6q dstrideq |
| %define dst_stride dstridemp |
| %endif |
| lea src1q, [srcq + sstrideq] |
| lea sstride6q, [sstrideq + sstrideq * 4] |
| add sstride6q, sstrideq ;pitch * 6 |
| |
| .loop: |
| movh m0, [srcq ] ;A |
| movh m1, [src1q ] ;B |
| movh m2, [srcq + sstrideq * 2 ] ;C |
| movh m3, [src1q + sstrideq * 2] ;D |
| movh m4, [srcq + sstrideq * 4 ] ;E |
| movh m5, [src1q + sstrideq * 4] ;F |
| |
| punpcklbw m0, m1 ;A B |
| movh m6, [srcq + sstride6q] ;G |
| punpcklbw m2, m3 ;C D |
| movh m7, [src1q + sstride6q] ;H |
| punpcklbw m4, m5 ;E F |
| pmaddubsw m0, k0k1 |
| movh m3, [srcq + 8] ;A |
| pmaddubsw m2, k2k3 |
| punpcklbw m6, m7 ;G H |
| movh m5, [srcq + sstrideq + 8] ;B |
| pmaddubsw m4, k4k5 |
| punpcklbw m3, m5 ;A B |
| movh m7, [srcq + sstrideq * 2 + 8] ;C |
| pmaddubsw m6, k6k7 |
| movh m5, [src1q + sstrideq * 2 + 8] ;D |
| punpcklbw m7, m5 ;C D |
| paddsw m2, m6 |
| pmaddubsw m3, k0k1 |
| movh m1, [srcq + sstrideq * 4 + 8] ;E |
| paddsw m0, m4 |
| pmaddubsw m7, k2k3 |
| movh m6, [src1q + sstrideq * 4 + 8] ;F |
| punpcklbw m1, m6 ;E F |
| paddsw m0, m2 |
| paddsw m0, krd |
| movh m2, [srcq + sstride6q + 8] ;G |
| pmaddubsw m1, k4k5 |
| movh m5, [src1q + sstride6q + 8] ;H |
| psraw m0, 7 |
| punpcklbw m2, m5 ;G H |
| pmaddubsw m2, k6k7 |
| paddsw m7, m2 |
| paddsw m3, m1 |
| paddsw m3, m7 |
| paddsw m3, krd |
| psraw m3, 7 |
| %ifidn %1, v8_add_src |
| pxor m6, m6 |
| movu m4, [src1q + 2 * sstrideq] ; Fetch from 3 rows down |
| mova m5, m4 |
| punpcklbw m4, m6 |
| punpckhbw m5, m6 |
| paddsw m0, m4 |
| paddsw m3, m5 |
| %endif |
| packuswb m0, m3 |
| |
| add srcq, sstrideq |
| add src1q, sstrideq |
| %ifidn %1, v8_avg |
| pavgb m0, [dstq] |
| %endif |
| mova [dstq], m0 |
| add dstq, dst_stride |
| dec heightd |
| jnz .loop |
| REP_RET |
| |
| %else |
| ; AOM_ARCH_X86_64 |
| dec heightd |
| |
| movu m1, [srcq ] ;A |
| movu m3, [srcq + sstrideq ] ;B |
| lea srcq, [srcq + sstrideq * 2] |
| punpcklbw m0, m1, m3 ;A B |
| punpckhbw m1, m3 ;A B |
| movu m5, [srcq] ;C |
| punpcklbw m2, m3, m5 ;A B next iter |
| punpckhbw m3, m5 ;A B next iter |
| mova tmp0, m2 ;store to stack |
| mova tmp1, m3 ;store to stack |
| movu m7, [srcq + sstrideq] ;D |
| lea srcq, [srcq + sstrideq * 2] |
| punpcklbw m4, m5, m7 ;C D |
| punpckhbw m5, m7 ;C D |
| movu m9, [srcq] ;E |
| punpcklbw m6, m7, m9 ;C D next iter |
| punpckhbw m7, m9 ;C D next iter |
| movu m11, [srcq + sstrideq] ;F |
| lea srcq, [srcq + sstrideq * 2] |
| punpcklbw m8, m9, m11 ;E F |
| punpckhbw m9, m11 ;E F |
| movu m2, [srcq] ;G |
| punpcklbw m10, m11, m2 ;E F next iter |
| punpckhbw m11, m2 ;E F next iter |
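; Twelve interleaved half-row pairs are live here (six low, six high): ten
; in xmm registers and two spilled to tmp0/tmp1 on the stack.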
| |
| .loop: |
| ;Do two rows at once |
| pmaddubsw m13, m0, k0k1 |
| mova m0, m4 |
| pmaddubsw m14, m8, k4k5 |
| pmaddubsw m15, m4, k2k3 |
| mova m4, m8 |
| paddsw m13, m14 |
| movu m3, [srcq + sstrideq] ;H |
| lea srcq, [srcq + sstrideq * 2] |
| punpcklbw m14, m2, m3 ;G H |
| mova m8, m14 |
| pmaddubsw m14, k6k7 |
| paddsw m15, m14 |
| paddsw m13, m15 |
| paddsw m13, krd |
| psraw m13, 7 |
| |
| pmaddubsw m14, m1, k0k1 |
| pmaddubsw m1, m9, k4k5 |
| pmaddubsw m15, m5, k2k3 |
| paddsw m14, m1 |
| mova m1, m5 |
| mova m5, m9 |
| punpckhbw m2, m3 ;G H |
| mova m9, m2 |
| pmaddubsw m2, k6k7 |
| paddsw m15, m2 |
| paddsw m14, m15 |
| paddsw m14, krd |
| psraw m14, 7 |
| packuswb m13, m14 |
| %ifidn %1, v8_avg |
| pavgb m13, [dstq] |
| %endif |
| mova [dstq], m13 |
| |
| ; next iter |
| pmaddubsw m15, tmp0, k0k1 |
| pmaddubsw m14, m10, k4k5 |
| pmaddubsw m13, m6, k2k3 |
| paddsw m15, m14 |
| mova tmp0, m6 |
| mova m6, m10 |
| movu m2, [srcq] ;G next iter |
| punpcklbw m14, m3, m2 ;G H next iter |
| mova m10, m14 |
| pmaddubsw m14, k6k7 |
| paddsw m13, m14 |
| paddsw m15, m13 |
| paddsw m15, krd |
| psraw m15, 7 |
| |
| pmaddubsw m14, tmp1, k0k1 |
| mova tmp1, m7 |
| pmaddubsw m13, m7, k2k3 |
| mova m7, m11 |
| pmaddubsw m11, k4k5 |
| paddsw m14, m11 |
| punpckhbw m3, m2 ;G H next iter |
| mova m11, m3 |
| pmaddubsw m3, k6k7 |
| paddsw m13, m3 |
| paddsw m14, m13 |
| paddsw m14, krd |
| psraw m14, 7 |
| packuswb m15, m14 |
| %ifidn %1, v8_avg |
| pavgb m15, [dstq + dstrideq] |
| %endif |
| mova [dstq + dstrideq], m15 |
| lea dstq, [dstq + dstrideq * 2] |
| sub heightd, 2 |
| jg .loop |
| |
| ; Do last row if output_height is odd |
| jne .done |
| |
| movu m3, [srcq + sstrideq] ;H |
| punpcklbw m6, m2, m3 ;G H |
| punpckhbw m2, m3 ;G H |
| pmaddubsw m0, k0k1 |
| pmaddubsw m1, k0k1 |
| pmaddubsw m4, k2k3 |
| pmaddubsw m5, k2k3 |
| pmaddubsw m8, k4k5 |
| pmaddubsw m9, k4k5 |
| pmaddubsw m6, k6k7 |
| pmaddubsw m2, k6k7 |
| paddsw m0, m8 |
| paddsw m1, m9 |
| paddsw m4, m6 |
| paddsw m5, m2 |
| paddsw m0, m4 |
| paddsw m1, m5 |
| paddsw m0, krd |
| paddsw m1, krd |
| psraw m0, 7 |
| psraw m1, 7 |
| packuswb m0, m1 |
| %ifidn %1, v8_avg |
| pavgb m0, [dstq] |
| %endif |
| mova [dstq], m0 |
| |
| .done: |
| REP_RET |
| |
| %endif ; AOM_ARCH_X86_64 |
| |
| %endm |
| |
| INIT_XMM ssse3 |
| SUBPIX_VFILTER16 v8 |
| SUBPIX_VFILTER v8, 8 |
| SUBPIX_VFILTER v8, 4 |