| ; | 
 | ; Copyright (c) 2016, Alliance for Open Media. All rights reserved | 
 | ; | 
 | ; This source code is subject to the terms of the BSD 2 Clause License and | 
 | ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License | 
 | ; was not distributed with this source code in the LICENSE file, you can | 
 | ; obtain it at www.aomedia.org/license/software. If the Alliance for Open | 
 | ; Media Patent License 1.0 was not distributed with this source code in the | 
 | ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. | 
 | ; | 
 |  | 
 | %include "third_party/x86inc/x86inc.asm" | 
 |  | 
 | SECTION_RODATA | 
 | pw_64:    times 8 dw 64 | 
 | even_byte_mask: times 8 dw 0x00ff | 
 |  | 
 | ; %define USE_PMULHRSW | 
 | ; NOTE: pmulhrsw has a latency of 5 cycles.  Tests showed a performance loss | 
 | ; when using this instruction. | 
 | ; | 
; The add order below (based on ffav1) must be followed to prevent the
; intermediate sums from overflowing the signed 16-bit range.
 | ; x = k0k1 + k4k5 | 
 | ; y = k2k3 + k6k7 | 
 | ; z = signed SAT(x + y) | 
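;
; Rough scalar sketch of one horizontal output pixel (illustration only, not
; part of the build; sat16() and clip8() are hypothetical helpers for signed
; 16-bit saturation and byte clamping, k[] holds the eight signed taps, and
; the output pixel sits under k[3]):
;
;   a = sat16(src[-3]*k[0] + src[-2]*k[1]);   // pmaddubsw with k0k1
;   b = sat16(src[-1]*k[2] + src[ 0]*k[3]);   // pmaddubsw with k2k3
;   c = sat16(src[ 1]*k[4] + src[ 2]*k[5]);   // pmaddubsw with k4k5
;   d = sat16(src[ 3]*k[6] + src[ 4]*k[7]);   // pmaddubsw with k6k7
;   x = sat16(a + c);                         // paddsw
;   y = sat16(b + d);                         // paddsw
;   z = sat16(x + y);                         // paddsw
;   out = clip8((z + 64) >> 7);               // paddsw krd, psraw 7, packuswb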
 |  | 
 | SECTION .text | 
 | %define LOCAL_VARS_SIZE 16*6 | 
 |  | 
 | %macro SETUP_LOCAL_VARS 0 | 
    ; TODO(slavarnway): on ARCH_X86_64 these could be kept in xmm registers
    ; instead of on the stack.  pmaddubsw has a higher latency on some
    ; platforms; this might be eased by interleaving the instructions.
 |     %define    k0k1  [rsp + 16*0] | 
 |     %define    k2k3  [rsp + 16*1] | 
 |     %define    k4k5  [rsp + 16*2] | 
 |     %define    k6k7  [rsp + 16*3] | 
 |     packsswb     m4, m4 | 
    ; TODO(slavarnway): using multiple pshufb instructions here had higher
    ; latency on some platforms, hence the pshuflw + punpcklqdq splat below.
 |     pshuflw      m0, m4, 0b              ;k0_k1 | 
 |     pshuflw      m1, m4, 01010101b       ;k2_k3 | 
 |     pshuflw      m2, m4, 10101010b       ;k4_k5 | 
 |     pshuflw      m3, m4, 11111111b       ;k6_k7 | 
 |     punpcklqdq   m0, m0 | 
 |     punpcklqdq   m1, m1 | 
 |     punpcklqdq   m2, m2 | 
 |     punpcklqdq   m3, m3 | 
 |     mova       k0k1, m0 | 
 |     mova       k2k3, m1 | 
 |     mova       k4k5, m2 | 
 |     mova       k6k7, m3 | 
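    ; For illustration: after packsswb the eight signed word taps sit as bytes
    ; k0..k7 in the low qword of m4; each pshuflw + punpcklqdq pair above then
    ; broadcasts one coefficient pair across a register, e.g.
    ; k0k1 = {k0,k1, k0,k1, ..., k0,k1} (eight byte pairs), which is the
    ; operand layout pmaddubsw expects (unsigned source byte times signed
    ; coefficient byte, summed per pair).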
 | %if ARCH_X86_64 | 
 |     %define     krd  m12 | 
 |     %define    tmp0  [rsp + 16*4] | 
 |     %define    tmp1  [rsp + 16*5] | 
 |     mova        krd, [GLOBAL(pw_64)] | 
 | %else | 
 |     %define     krd  [rsp + 16*4] | 
 | %if CONFIG_PIC=0 | 
 |     mova         m6, [GLOBAL(pw_64)] | 
 | %else | 
 |     ; build constants without accessing global memory | 
    pcmpeqb      m6, m6                  ;all ones (0xffff per word)
    psrlw        m6, 15                  ;0x0001 per word
    psllw        m6, 6                   ;0x0040 = 64 per word, aka pw_64
 | %endif | 
 |     mova        krd, m6 | 
 | %endif | 
 | %endm | 
 |  | 
 | ;------------------------------------------------------------------------------- | 
 | %if ARCH_X86_64 | 
 |   %define LOCAL_VARS_SIZE_H4 0 | 
 | %else | 
 |   %define LOCAL_VARS_SIZE_H4 16*4 | 
 | %endif | 
 |  | 
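; SUBPIX_HFILTER4: 4-wide horizontal 8-tap filter, two output rows per loop
; iteration.  Each source row is byte-duplicated against itself
; (punpcklbw/punpckhbw x,x) so that palignr by 1 and 5 yields consecutive
; (s[j], s[j+1]) byte pairs; pmaddubsw against k0k1|k4k5 and k2k3|k6k7 then
; forms the partial sums, which are combined in the add order noted at the
; top of the file.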
 | %macro SUBPIX_HFILTER4 1 | 
 | cglobal filter_block1d4_%1, 6, 6, 11, LOCAL_VARS_SIZE_H4, \ | 
 |                             src, sstride, dst, dstride, height, filter | 
 |     mova                m4, [filterq] | 
 |     packsswb            m4, m4 | 
 | %if ARCH_X86_64 | 
 |     %define       k0k1k4k5  m8 | 
 |     %define       k2k3k6k7  m9 | 
 |     %define            krd  m10 | 
 |     mova               krd, [GLOBAL(pw_64)] | 
 |     pshuflw       k0k1k4k5, m4, 0b              ;k0_k1 | 
 |     pshufhw       k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5 | 
 |     pshuflw       k2k3k6k7, m4, 01010101b       ;k2_k3 | 
 |     pshufhw       k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7 | 
 | %else | 
 |     %define       k0k1k4k5  [rsp + 16*0] | 
 |     %define       k2k3k6k7  [rsp + 16*1] | 
 |     %define            krd  [rsp + 16*2] | 
 |     pshuflw             m6, m4, 0b              ;k0_k1 | 
 |     pshufhw             m6, m6, 10101010b       ;k0_k1_k4_k5 | 
 |     pshuflw             m7, m4, 01010101b       ;k2_k3 | 
 |     pshufhw             m7, m7, 11111111b       ;k2_k3_k6_k7 | 
 | %if CONFIG_PIC=0 | 
 |     mova                m1, [GLOBAL(pw_64)] | 
 | %else | 
 |     ; build constants without accessing global memory | 
    pcmpeqb             m1, m1                  ;all ones (0xffff per word)
    psrlw               m1, 15                  ;0x0001 per word
    psllw               m1, 6                   ;0x0040 = 64 per word, aka pw_64
 | %endif | 
 |     mova          k0k1k4k5, m6 | 
 |     mova          k2k3k6k7, m7 | 
 |     mova               krd, m1 | 
 | %endif | 
 |     dec            heightd | 
 |  | 
 | .loop: | 
 |     ;Do two rows at once | 
 |     movu                m4, [srcq - 3] | 
 |     movu                m5, [srcq + sstrideq - 3] | 
 |     punpckhbw           m1, m4, m4 | 
 |     punpcklbw           m4, m4 | 
 |     punpckhbw           m3, m5, m5 | 
 |     punpcklbw           m5, m5 | 
 |     palignr             m0, m1, m4, 1 | 
 |     pmaddubsw           m0, k0k1k4k5 | 
 |     palignr             m1, m4, 5 | 
 |     pmaddubsw           m1, k2k3k6k7 | 
 |     palignr             m2, m3, m5, 1 | 
 |     pmaddubsw           m2, k0k1k4k5 | 
 |     palignr             m3, m5, 5 | 
 |     pmaddubsw           m3, k2k3k6k7 | 
 |     punpckhqdq          m4, m0, m2 | 
 |     punpcklqdq          m0, m2 | 
 |     punpckhqdq          m5, m1, m3 | 
 |     punpcklqdq          m1, m3 | 
 |     paddsw              m0, m4 | 
 |     paddsw              m1, m5 | 
 | %ifidn %1, h8_avg | 
 |     movd                m4, [dstq] | 
 |     movd                m5, [dstq + dstrideq] | 
 | %endif | 
 |     paddsw              m0, m1 | 
 |     paddsw              m0, krd | 
 |     psraw               m0, 7 | 
 | %ifidn %1, h8_add_src | 
    pxor                m3, m3
    movu                m4, [srcq]
    movu                m5, [srcq + sstrideq]
    punpckldq           m4, m5 ; Bytes 0,1,2,3 from row 0, then 0,1,2,3 from row 1
    punpcklbw           m4, m3
    paddsw              m0, m4
 | %endif | 
 |     packuswb            m0, m0 | 
 |     psrldq              m1, m0, 4 | 
 |  | 
 | %ifidn %1, h8_avg | 
 |     pavgb               m0, m4 | 
 |     pavgb               m1, m5 | 
 | %endif | 
 |     movd            [dstq], m0 | 
 |     movd [dstq + dstrideq], m1 | 
 |  | 
 |     lea               srcq, [srcq + sstrideq        ] | 
 |     prefetcht0              [srcq + 4 * sstrideq - 3] | 
 |     lea               srcq, [srcq + sstrideq        ] | 
 |     lea               dstq, [dstq + 2 * dstrideq    ] | 
 |     prefetcht0              [srcq + 2 * sstrideq - 3] | 
 |  | 
 |     sub            heightd, 2 | 
 |     jg               .loop | 
 |  | 
 |     ; Do last row if output_height is odd | 
 |     jne              .done | 
 |  | 
 |     movu                m4, [srcq - 3] | 
 |     punpckhbw           m1, m4, m4 | 
 |     punpcklbw           m4, m4 | 
 |     palignr             m0, m1, m4, 1 | 
 |     palignr             m1, m4, 5 | 
 |     pmaddubsw           m0, k0k1k4k5 | 
 |     pmaddubsw           m1, k2k3k6k7 | 
 |     psrldq              m2, m0, 8 | 
 |     psrldq              m3, m1, 8 | 
 |     paddsw              m0, m2 | 
 |     paddsw              m1, m3 | 
 |     paddsw              m0, m1 | 
 |     paddsw              m0, krd | 
 |     psraw               m0, 7 | 
 | %ifidn %1, h8_add_src | 
 |     pxor                m3, m3 | 
 |     movu                m4, [srcq] | 
 |     punpcklbw           m4, m3 | 
 |     paddsw              m0, m4 | 
 | %endif | 
 |     packuswb            m0, m0 | 
 | %ifidn %1, h8_avg | 
 |     movd                m4, [dstq] | 
 |     pavgb               m0, m4 | 
 | %endif | 
 |     movd            [dstq], m0 | 
 | .done: | 
 |     REP_RET | 
 | %endm | 
 |  | 
 | ;------------------------------------------------------------------------------- | 
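; SUBPIX_HFILTER8: 8-wide horizontal 8-tap filter, two output rows per loop
; iteration.  It uses the same byte-duplication trick as SUBPIX_HFILTER4, but
; palignr by 1/5/9/13 selects the (s[j], s[j+1]) pairs feeding k0k1, k2k3,
; k4k5 and k6k7 respectively; the partial sums are combined in the documented
; add order.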
 | %macro SUBPIX_HFILTER8 1 | 
 | cglobal filter_block1d8_%1, 6, 6, 14, LOCAL_VARS_SIZE, \ | 
 |                             src, sstride, dst, dstride, height, filter | 
 |     mova                 m4, [filterq] | 
 |     SETUP_LOCAL_VARS | 
 |     dec             heightd | 
 |  | 
 | .loop: | 
 |     ;Do two rows at once | 
 |     movu                 m0, [srcq - 3] | 
 |     movu                 m4, [srcq + sstrideq - 3] | 
 |     punpckhbw            m1, m0, m0 | 
 |     punpcklbw            m0, m0 | 
 |     palignr              m5, m1, m0, 13 | 
 |     pmaddubsw            m5, k6k7 | 
 |     palignr              m2, m1, m0, 5 | 
 |     palignr              m3, m1, m0, 9 | 
 |     palignr              m1, m0, 1 | 
 |     pmaddubsw            m1, k0k1 | 
 |     punpckhbw            m6, m4, m4 | 
 |     punpcklbw            m4, m4 | 
 |     pmaddubsw            m2, k2k3 | 
 |     pmaddubsw            m3, k4k5 | 
 |  | 
 |     palignr              m7, m6, m4, 13 | 
 |     palignr              m0, m6, m4, 5 | 
 |     pmaddubsw            m7, k6k7 | 
 |     paddsw               m1, m3 | 
 |     paddsw               m2, m5 | 
 |     paddsw               m1, m2 | 
 | %ifidn %1, h8_avg | 
 |     movh                 m2, [dstq] | 
 |     movhps               m2, [dstq + dstrideq] | 
 | %endif | 
 |     palignr              m5, m6, m4, 9 | 
 |     palignr              m6, m4, 1 | 
 |     pmaddubsw            m0, k2k3 | 
 |     pmaddubsw            m6, k0k1 | 
 |     paddsw               m1, krd | 
 |     pmaddubsw            m5, k4k5 | 
 |     psraw                m1, 7 | 
 |     paddsw               m0, m7 | 
 |     paddsw               m6, m5 | 
 |     paddsw               m6, m0 | 
 |     paddsw               m6, krd | 
 |     psraw                m6, 7 | 
 | %ifidn %1, h8_add_src | 
 |     pxor                 m3, m3 | 
 |     movu                 m4, [srcq] | 
 |     movu                 m5, [srcq + sstrideq] | 
 |     punpcklbw            m4, m3 | 
 |     punpcklbw            m5, m3 | 
 |     paddsw               m1, m4 | 
 |     paddsw               m6, m5 | 
 | %endif | 
 |     packuswb             m1, m6 | 
 | %ifidn %1, h8_avg | 
 |     pavgb                m1, m2 | 
 | %endif | 
 |     movh              [dstq], m1 | 
 |     movhps [dstq + dstrideq], m1 | 
 |  | 
 |     lea                srcq, [srcq + sstrideq        ] | 
 |     prefetcht0               [srcq + 4 * sstrideq - 3] | 
 |     lea                srcq, [srcq + sstrideq        ] | 
 |     lea                dstq, [dstq + 2 * dstrideq    ] | 
 |     prefetcht0               [srcq + 2 * sstrideq - 3] | 
 |     sub             heightd, 2 | 
 |     jg                .loop | 
 |  | 
 |     ; Do last row if output_height is odd | 
 |     jne               .done | 
 |  | 
 |     movu                 m0, [srcq - 3] | 
 |     punpckhbw            m3, m0, m0 | 
 |     punpcklbw            m0, m0 | 
 |     palignr              m1, m3, m0, 1 | 
 |     palignr              m2, m3, m0, 5 | 
 |     palignr              m4, m3, m0, 13 | 
 |     palignr              m3, m0, 9 | 
 |     pmaddubsw            m1, k0k1 | 
 |     pmaddubsw            m2, k2k3 | 
 |     pmaddubsw            m3, k4k5 | 
 |     pmaddubsw            m4, k6k7 | 
 |     paddsw               m1, m3 | 
 |     paddsw               m4, m2 | 
 |     paddsw               m1, m4 | 
 |     paddsw               m1, krd | 
 |     psraw                m1, 7 | 
 | %ifidn %1, h8_add_src | 
 |     pxor                 m6, m6 | 
 |     movu                 m5, [srcq] | 
 |     punpcklbw            m5, m6 | 
 |     paddsw               m1, m5 | 
 | %endif | 
 |     packuswb             m1, m1 | 
 | %ifidn %1, h8_avg | 
 |     movh                 m0, [dstq] | 
 |     pavgb                m1, m0 | 
 | %endif | 
 |     movh             [dstq], m1 | 
 | .done: | 
 |     REP_RET | 
 | %endm | 
 |  | 
 | ;------------------------------------------------------------------------------- | 
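; SUBPIX_HFILTER16: 16-wide horizontal 8-tap filter, one output row per loop
; iteration.  The unaligned loads at srcq-3/-1/+1/+3 accumulate the
; even-numbered output pixels in m0 and the loads at srcq-2/0/+2/+4 the
; odd-numbered ones in m4; packuswb plus punpcklbw then re-interleaves them
; into the 16 output bytes.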
 | %macro SUBPIX_HFILTER16 1 | 
 | cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \ | 
 |                              src, sstride, dst, dstride, height, filter | 
 |     mova          m4, [filterq] | 
 |     SETUP_LOCAL_VARS | 
 |  | 
 | .loop: | 
 |     prefetcht0        [srcq + 2 * sstrideq -3] | 
 |  | 
 |     movu          m0, [srcq - 3] | 
 |     movu          m4, [srcq - 2] | 
 |     pmaddubsw     m0, k0k1 | 
 |     pmaddubsw     m4, k0k1 | 
 |     movu          m1, [srcq - 1] | 
 |     movu          m5, [srcq + 0] | 
 |     pmaddubsw     m1, k2k3 | 
 |     pmaddubsw     m5, k2k3 | 
 |     movu          m2, [srcq + 1] | 
 |     movu          m6, [srcq + 2] | 
 |     pmaddubsw     m2, k4k5 | 
 |     pmaddubsw     m6, k4k5 | 
 |     movu          m3, [srcq + 3] | 
 |     movu          m7, [srcq + 4] | 
 |     pmaddubsw     m3, k6k7 | 
 |     pmaddubsw     m7, k6k7 | 
 |     paddsw        m0, m2 | 
 |     paddsw        m1, m3 | 
 |     paddsw        m0, m1 | 
 |     paddsw        m4, m6 | 
 |     paddsw        m5, m7 | 
 |     paddsw        m4, m5 | 
 |     paddsw        m0, krd | 
 |     paddsw        m4, krd | 
 |     psraw         m0, 7 | 
 |     psraw         m4, 7 | 
 | %ifidn %1, h8_add_src | 
 | %if ARCH_X86=1 && CONFIG_PIC=1 | 
    pcmpeqb       m2, m2                  ;all ones (0xffff per word)
    psrlw         m2, 8                   ;0x00ff per word, aka even_byte_mask
 | %else | 
 |     mova          m2, [GLOBAL(even_byte_mask)] | 
 | %endif | 
    movu          m5, [srcq]
    mova          m7, m5
    pand          m5, m2                  ;even source bytes (for the even outputs in m0)
    psrlw         m7, 8                   ;odd source bytes (for the odd outputs in m4)
    paddsw        m0, m5
    paddsw        m4, m7
 | %endif | 
 |     packuswb      m0, m0 | 
 |     packuswb      m4, m4 | 
 |     punpcklbw     m0, m4 | 
 | %ifidn %1, h8_avg | 
 |     pavgb         m0, [dstq] | 
 | %endif | 
 |     lea         srcq, [srcq + sstrideq] | 
 |     mova      [dstq], m0 | 
 |     lea         dstq, [dstq + dstrideq] | 
 |     dec      heightd | 
 |     jnz        .loop | 
 |     REP_RET | 
 | %endm | 
 |  | 
 | INIT_XMM ssse3 | 
 | SUBPIX_HFILTER16 h8 | 
 | SUBPIX_HFILTER8  h8 | 
 | SUBPIX_HFILTER4  h8 | 
 |  | 
 | ;------------------------------------------------------------------------------- | 
 |  | 
 | ; TODO(Linfeng): Detect cpu type and choose the code with better performance. | 
 | %define X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON 1 | 
 |  | 
 | %if ARCH_X86_64 && X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON | 
 |     %define NUM_GENERAL_REG_USED 9 | 
 | %else | 
 |     %define NUM_GENERAL_REG_USED 6 | 
 | %endif | 
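
; Rough overview of the vertical filters below: with
; X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON set (or on 32-bit builds) they walk
; the source rows through src1q/sstride6q on every iteration; otherwise the
; 64-bit path keeps the interleaved row pairs in registers and rotates them
; between iterations.  In both cases two adjacent rows are interleaved
; bytewise (punpcklbw) so that pmaddubsw applies one coefficient pair per
; (row_n, row_n+1) byte pair, and the four partial sums are combined in the
; documented add order.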
 |  | 
 | %macro SUBPIX_VFILTER 2 | 
 | cglobal filter_block1d%2_%1, 6, NUM_GENERAL_REG_USED, 15, LOCAL_VARS_SIZE, \ | 
 |                              src, sstride, dst, dstride, height, filter | 
 |     mova          m4, [filterq] | 
 |     SETUP_LOCAL_VARS | 
 |  | 
 | %ifidn %2, 8 | 
 |     %define                movx  movh | 
 | %else | 
 |     %define                movx  movd | 
 | %endif | 
 |  | 
 |     dec                 heightd | 
 |  | 
 | %if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON | 
 |  | 
 | %if ARCH_X86_64 | 
 |     %define               src1q  r7 | 
 |     %define           sstride6q  r8 | 
 |     %define          dst_stride  dstrideq | 
 | %else | 
 |     %define               src1q  filterq | 
 |     %define           sstride6q  dstrideq | 
 |     %define          dst_stride  dstridemp | 
 | %endif | 
 |     mov                   src1q, srcq | 
 |     add                   src1q, sstrideq | 
 |     lea               sstride6q, [sstrideq + sstrideq * 4] | 
 |     add               sstride6q, sstrideq                   ;pitch * 6 | 
 |  | 
 | .loop: | 
 |     ;Do two rows at once | 
 |     movx                     m0, [srcq                ]     ;A | 
 |     movx                     m1, [src1q               ]     ;B | 
 |     punpcklbw                m0, m1                         ;A B | 
 |     movx                     m2, [srcq + sstrideq * 2 ]     ;C | 
 |     pmaddubsw                m0, k0k1 | 
 |     mova                     m6, m2 | 
 |     movx                     m3, [src1q + sstrideq * 2]     ;D | 
 |     punpcklbw                m2, m3                         ;C D | 
 |     pmaddubsw                m2, k2k3 | 
 |     movx                     m4, [srcq + sstrideq * 4 ]     ;E | 
 |     mova                     m7, m4 | 
 |     movx                     m5, [src1q + sstrideq * 4]     ;F | 
 |     punpcklbw                m4, m5                         ;E F | 
 |     pmaddubsw                m4, k4k5 | 
 |     punpcklbw                m1, m6                         ;A B next iter | 
 |     movx                     m6, [srcq + sstride6q    ]     ;G | 
 |     punpcklbw                m5, m6                         ;E F next iter | 
 |     punpcklbw                m3, m7                         ;C D next iter | 
 |     pmaddubsw                m5, k4k5 | 
 |     movx                     m7, [src1q + sstride6q   ]     ;H | 
 |     punpcklbw                m6, m7                         ;G H | 
 |     pmaddubsw                m6, k6k7 | 
 |     pmaddubsw                m3, k2k3 | 
 |     pmaddubsw                m1, k0k1 | 
 |     paddsw                   m0, m4 | 
 |     paddsw                   m2, m6 | 
 |     movx                     m6, [srcq + sstrideq * 8 ]     ;H next iter | 
 |     punpcklbw                m7, m6 | 
 |     pmaddubsw                m7, k6k7 | 
 |     paddsw                   m0, m2 | 
 |     paddsw                   m0, krd | 
 |     psraw                    m0, 7 | 
 |     paddsw                   m1, m5 | 
 | %ifidn %1, v8_add_src | 
 |     pxor                     m6, m6 | 
 |     movu                     m4, [srcq] | 
 |     punpcklbw                m4, m6 | 
 |     paddsw                   m0, m4 | 
 | %endif | 
 |     packuswb                 m0, m0 | 
 |  | 
 |     paddsw                   m3, m7 | 
 |     paddsw                   m1, m3 | 
 |     paddsw                   m1, krd | 
 |     psraw                    m1, 7 | 
 | %ifidn %1, v8_add_src | 
 |     movu                     m4, [src1q] | 
 |     punpcklbw                m4, m6 | 
 |     paddsw                   m1, m4 | 
 | %endif | 
 |     lea                    srcq, [srcq + sstrideq * 2 ] | 
 |     lea                   src1q, [src1q + sstrideq * 2] | 
 |     packuswb                 m1, m1 | 
 |  | 
 | %ifidn %1, v8_avg | 
 |     movx                     m2, [dstq] | 
 |     pavgb                    m0, m2 | 
 | %endif | 
 |     movx                 [dstq], m0 | 
 |     add                    dstq, dst_stride | 
 | %ifidn %1, v8_avg | 
 |     movx                     m3, [dstq] | 
 |     pavgb                    m1, m3 | 
 | %endif | 
 |     movx                 [dstq], m1 | 
 |     add                    dstq, dst_stride | 
 |     sub                 heightd, 2 | 
 |     jg                    .loop | 
 |  | 
 |     ; Do last row if output_height is odd | 
 |     jne                   .done | 
 |  | 
 |     movx                     m0, [srcq                ]     ;A | 
 |     movx                     m1, [srcq + sstrideq     ]     ;B | 
 |     movx                     m6, [srcq + sstride6q    ]     ;G | 
 |     punpcklbw                m0, m1                         ;A B | 
 |     movx                     m7, [src1q + sstride6q   ]     ;H | 
 |     pmaddubsw                m0, k0k1 | 
 |     movx                     m2, [srcq + sstrideq * 2 ]     ;C | 
 |     punpcklbw                m6, m7                         ;G H | 
 |     movx                     m3, [src1q + sstrideq * 2]     ;D | 
 |     pmaddubsw                m6, k6k7 | 
 |     movx                     m4, [srcq + sstrideq * 4 ]     ;E | 
 |     punpcklbw                m2, m3                         ;C D | 
 |     movx                     m5, [src1q + sstrideq * 4]     ;F | 
 |     punpcklbw                m4, m5                         ;E F | 
 |     pmaddubsw                m2, k2k3 | 
 |     pmaddubsw                m4, k4k5 | 
 |     paddsw                   m2, m6 | 
 |     paddsw                   m0, m4 | 
 |     paddsw                   m0, m2 | 
 |     paddsw                   m0, krd | 
 |     psraw                    m0, 7 | 
 | %ifidn %1, v8_add_src | 
 |     pxor                     m6, m6 | 
 |     movu                     m4, [srcq] | 
 |     punpcklbw                m4, m6 | 
 |     paddsw                   m0, m4 | 
 | %endif | 
 |     packuswb                 m0, m0 | 
 | %ifidn %1, v8_avg | 
 |     movx                     m1, [dstq] | 
 |     pavgb                    m0, m1 | 
 | %endif | 
 |     movx                 [dstq], m0 | 
 |  | 
 | %else | 
    ; ARCH_X86_64 fast path (X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON disabled)
 |  | 
 |     movx                     m0, [srcq                ]     ;A | 
 |     movx                     m1, [srcq + sstrideq     ]     ;B | 
 |     lea                    srcq, [srcq + sstrideq * 2 ] | 
 |     movx                     m2, [srcq]                     ;C | 
 |     movx                     m3, [srcq + sstrideq]          ;D | 
 |     lea                    srcq, [srcq + sstrideq * 2 ] | 
 |     movx                     m4, [srcq]                     ;E | 
 |     movx                     m5, [srcq + sstrideq]          ;F | 
 |     lea                    srcq, [srcq + sstrideq * 2 ] | 
 |     movx                     m6, [srcq]                     ;G | 
 |     punpcklbw                m0, m1                         ;A B | 
 |     punpcklbw                m1, m2                         ;A B next iter | 
 |     punpcklbw                m2, m3                         ;C D | 
 |     punpcklbw                m3, m4                         ;C D next iter | 
 |     punpcklbw                m4, m5                         ;E F | 
 |     punpcklbw                m5, m6                         ;E F next iter | 
 |  | 
 | .loop: | 
 |     ;Do two rows at once | 
 |     movx                     m7, [srcq + sstrideq]          ;H | 
 |     lea                    srcq, [srcq + sstrideq * 2 ] | 
 |     movx                    m14, [srcq]                     ;H next iter | 
 |     punpcklbw                m6, m7                         ;G H | 
 |     punpcklbw                m7, m14                        ;G H next iter | 
 |     pmaddubsw                m8, m0, k0k1 | 
 |     pmaddubsw                m9, m1, k0k1 | 
 |     mova                     m0, m2 | 
 |     mova                     m1, m3 | 
 |     pmaddubsw               m10, m2, k2k3 | 
 |     pmaddubsw               m11, m3, k2k3 | 
 |     mova                     m2, m4 | 
 |     mova                     m3, m5 | 
 |     pmaddubsw                m4, k4k5 | 
 |     pmaddubsw                m5, k4k5 | 
 |     paddsw                   m8, m4 | 
 |     paddsw                   m9, m5 | 
 |     mova                     m4, m6 | 
 |     mova                     m5, m7 | 
 |     pmaddubsw                m6, k6k7 | 
 |     pmaddubsw                m7, k6k7 | 
 |     paddsw                  m10, m6 | 
 |     paddsw                  m11, m7 | 
 |     paddsw                   m8, m10 | 
 |     paddsw                   m9, m11 | 
 |     mova                     m6, m14 | 
 |     paddsw                   m8, krd | 
 |     paddsw                   m9, krd | 
 |     psraw                    m8, 7 | 
 |     psraw                    m9, 7 | 
 | %ifidn %2, 4 | 
 |     packuswb                 m8, m8 | 
 |     packuswb                 m9, m9 | 
 | %else | 
 |     packuswb                 m8, m9 | 
 | %endif | 
 |  | 
 | %ifidn %1, v8_avg | 
 |     movx                     m7, [dstq] | 
 | %ifidn %2, 4 | 
 |     movx                    m10, [dstq + dstrideq] | 
 |     pavgb                    m9, m10 | 
 | %else | 
 |     movhpd                   m7, [dstq + dstrideq] | 
 | %endif | 
 |     pavgb                    m8, m7 | 
 | %endif | 
 |     movx                 [dstq], m8 | 
 | %ifidn %2, 4 | 
 |     movx      [dstq + dstrideq], m9 | 
 | %else | 
 |     movhpd    [dstq + dstrideq], m8 | 
 | %endif | 
 |  | 
 |     lea                    dstq, [dstq + dstrideq * 2 ] | 
 |     sub                 heightd, 2 | 
 |     jg                    .loop | 
 |  | 
 |     ; Do last row if output_height is odd | 
 |     jne                   .done | 
 |  | 
 |     movx                     m7, [srcq + sstrideq]          ;H | 
 |     punpcklbw                m6, m7                         ;G H | 
 |     pmaddubsw                m0, k0k1 | 
 |     pmaddubsw                m2, k2k3 | 
 |     pmaddubsw                m4, k4k5 | 
 |     pmaddubsw                m6, k6k7 | 
 |     paddsw                   m0, m4 | 
 |     paddsw                   m2, m6 | 
 |     paddsw                   m0, m2 | 
 |     paddsw                   m0, krd | 
 |     psraw                    m0, 7 | 
 |     packuswb                 m0, m0 | 
 | %ifidn %1, v8_avg | 
 |     movx                     m1, [dstq] | 
 |     pavgb                    m0, m1 | 
 | %endif | 
 |     movx                 [dstq], m0 | 
 |  | 
 | %endif ; ARCH_X86_64 | 
 |  | 
 | .done: | 
 |     REP_RET | 
 |  | 
 | %endm | 
 |  | 
 | ;------------------------------------------------------------------------------- | 
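; SUBPIX_VFILTER16: the same vertical scheme as SUBPIX_VFILTER, but 16 columns
; wide.  In the slow-Celeron/32-bit path the low and high eight columns (srcq
; and srcq + 8) are filtered back to back within one iteration; the 64-bit
; path keeps both halves of every interleaved row pair live in registers,
; spilling the next output row's A B interleave to tmp0/tmp1 when they run
; out.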
 | %macro SUBPIX_VFILTER16 1 | 
 | cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \ | 
 |                              src, sstride, dst, dstride, height, filter | 
 |     mova                     m4, [filterq] | 
 |     SETUP_LOCAL_VARS | 
 |  | 
 | %if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON | 
 |  | 
 | %if ARCH_X86_64 | 
 |     %define               src1q  r7 | 
 |     %define           sstride6q  r8 | 
 |     %define          dst_stride  dstrideq | 
 | %else | 
 |     %define               src1q  filterq | 
 |     %define           sstride6q  dstrideq | 
 |     %define          dst_stride  dstridemp | 
 | %endif | 
 |     lea                   src1q, [srcq + sstrideq] | 
 |     lea               sstride6q, [sstrideq + sstrideq * 4] | 
 |     add               sstride6q, sstrideq                   ;pitch * 6 | 
 |  | 
 | .loop: | 
 |     movh                     m0, [srcq                ]     ;A | 
 |     movh                     m1, [src1q               ]     ;B | 
 |     movh                     m2, [srcq + sstrideq * 2 ]     ;C | 
 |     movh                     m3, [src1q + sstrideq * 2]     ;D | 
 |     movh                     m4, [srcq + sstrideq * 4 ]     ;E | 
 |     movh                     m5, [src1q + sstrideq * 4]     ;F | 
 |  | 
 |     punpcklbw                m0, m1                         ;A B | 
 |     movh                     m6, [srcq + sstride6q]         ;G | 
 |     punpcklbw                m2, m3                         ;C D | 
 |     movh                     m7, [src1q + sstride6q]        ;H | 
 |     punpcklbw                m4, m5                         ;E F | 
 |     pmaddubsw                m0, k0k1 | 
 |     movh                     m3, [srcq + 8]                 ;A | 
 |     pmaddubsw                m2, k2k3 | 
 |     punpcklbw                m6, m7                         ;G H | 
 |     movh                     m5, [srcq + sstrideq + 8]      ;B | 
 |     pmaddubsw                m4, k4k5 | 
 |     punpcklbw                m3, m5                         ;A B | 
 |     movh                     m7, [srcq + sstrideq * 2 + 8]  ;C | 
 |     pmaddubsw                m6, k6k7 | 
 |     movh                     m5, [src1q + sstrideq * 2 + 8] ;D | 
 |     punpcklbw                m7, m5                         ;C D | 
 |     paddsw                   m2, m6 | 
 |     pmaddubsw                m3, k0k1 | 
 |     movh                     m1, [srcq + sstrideq * 4 + 8]  ;E | 
 |     paddsw                   m0, m4 | 
 |     pmaddubsw                m7, k2k3 | 
 |     movh                     m6, [src1q + sstrideq * 4 + 8] ;F | 
 |     punpcklbw                m1, m6                         ;E F | 
 |     paddsw                   m0, m2 | 
 |     paddsw                   m0, krd | 
 |     movh                     m2, [srcq + sstride6q + 8]     ;G | 
 |     pmaddubsw                m1, k4k5 | 
 |     movh                     m5, [src1q + sstride6q + 8]    ;H | 
 |     psraw                    m0, 7 | 
 |     punpcklbw                m2, m5                         ;G H | 
 |     pmaddubsw                m2, k6k7 | 
 |     paddsw                   m7, m2 | 
 |     paddsw                   m3, m1 | 
 |     paddsw                   m3, m7 | 
 |     paddsw                   m3, krd | 
 |     psraw                    m3, 7 | 
 | %ifidn %1, v8_add_src | 
 |     pxor                     m6, m6 | 
 |     movu                     m4, [src1q + 2 * sstrideq] ; Fetch from 3 rows down | 
 |     mova                     m5, m4 | 
 |     punpcklbw                m4, m6 | 
 |     punpckhbw                m5, m6 | 
 |     paddsw                   m0, m4 | 
 |     paddsw                   m3, m5 | 
 | %endif | 
 |     packuswb                 m0, m3 | 
 |  | 
 |     add                    srcq, sstrideq | 
 |     add                   src1q, sstrideq | 
 | %ifidn %1, v8_avg | 
 |     pavgb                    m0, [dstq] | 
 | %endif | 
 |     mova                 [dstq], m0 | 
 |     add                    dstq, dst_stride | 
 |     dec                 heightd | 
 |     jnz                   .loop | 
 |     REP_RET | 
 |  | 
 | %else | 
    ; ARCH_X86_64 fast path (X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON disabled)
 |     dec                 heightd | 
 |  | 
 |     movu                     m1, [srcq                ]     ;A | 
 |     movu                     m3, [srcq + sstrideq     ]     ;B | 
 |     lea                    srcq, [srcq + sstrideq * 2] | 
 |     punpcklbw                m0, m1, m3                     ;A B | 
 |     punpckhbw                m1, m3                         ;A B | 
 |     movu                     m5, [srcq]                     ;C | 
 |     punpcklbw                m2, m3, m5                     ;A B next iter | 
 |     punpckhbw                m3, m5                         ;A B next iter | 
 |     mova                   tmp0, m2                         ;store to stack | 
 |     mova                   tmp1, m3                         ;store to stack | 
 |     movu                     m7, [srcq + sstrideq]          ;D | 
 |     lea                    srcq, [srcq + sstrideq * 2] | 
 |     punpcklbw                m4, m5, m7                     ;C D | 
 |     punpckhbw                m5, m7                         ;C D | 
 |     movu                     m9, [srcq]                     ;E | 
 |     punpcklbw                m6, m7, m9                     ;C D next iter | 
 |     punpckhbw                m7, m9                         ;C D next iter | 
 |     movu                    m11, [srcq + sstrideq]          ;F | 
 |     lea                    srcq, [srcq + sstrideq * 2] | 
 |     punpcklbw                m8, m9, m11                    ;E F | 
 |     punpckhbw                m9, m11                        ;E F | 
 |     movu                     m2, [srcq]                     ;G | 
 |     punpcklbw               m10, m11, m2                    ;E F next iter | 
 |     punpckhbw               m11, m2                         ;E F next iter | 
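    ; At .loop entry: m0/m1, m4/m5 and m8/m9 hold the low/high interleaves of
    ; the A B, C D and E F row pairs for the current output row, m6/m7 and
    ; m10/m11 the C D and E F interleaves for the next one, and m2 holds row
    ; G; the next row's A B interleave did not fit in registers and was
    ; spilled to tmp0/tmp1 above, to be picked up inside the loop.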
 |  | 
 | .loop: | 
 |     ;Do two rows at once | 
 |     pmaddubsw               m13, m0, k0k1 | 
 |     mova                     m0, m4 | 
 |     pmaddubsw               m14, m8, k4k5 | 
 |     pmaddubsw               m15, m4, k2k3 | 
 |     mova                     m4, m8 | 
 |     paddsw                  m13, m14 | 
 |     movu                     m3, [srcq + sstrideq]          ;H | 
 |     lea                    srcq, [srcq + sstrideq * 2] | 
 |     punpcklbw               m14, m2, m3                     ;G H | 
 |     mova                     m8, m14 | 
 |     pmaddubsw               m14, k6k7 | 
 |     paddsw                  m15, m14 | 
 |     paddsw                  m13, m15 | 
 |     paddsw                  m13, krd | 
 |     psraw                   m13, 7 | 
 |  | 
 |     pmaddubsw               m14, m1, k0k1 | 
 |     pmaddubsw                m1, m9, k4k5 | 
 |     pmaddubsw               m15, m5, k2k3 | 
 |     paddsw                  m14, m1 | 
 |     mova                     m1, m5 | 
 |     mova                     m5, m9 | 
 |     punpckhbw                m2, m3                         ;G H | 
 |     mova                     m9, m2 | 
 |     pmaddubsw                m2, k6k7 | 
 |     paddsw                  m15, m2 | 
 |     paddsw                  m14, m15 | 
 |     paddsw                  m14, krd | 
 |     psraw                   m14, 7 | 
 |     packuswb                m13, m14 | 
 | %ifidn %1, v8_avg | 
 |     pavgb                   m13, [dstq] | 
 | %endif | 
 |     mova                 [dstq], m13 | 
 |  | 
 |     ; next iter | 
 |     pmaddubsw               m15, tmp0, k0k1 | 
 |     pmaddubsw               m14, m10, k4k5 | 
 |     pmaddubsw               m13, m6, k2k3 | 
 |     paddsw                  m15, m14 | 
 |     mova                   tmp0, m6 | 
 |     mova                     m6, m10 | 
 |     movu                     m2, [srcq]                     ;G next iter | 
 |     punpcklbw               m14, m3, m2                     ;G H next iter | 
 |     mova                    m10, m14 | 
 |     pmaddubsw               m14, k6k7 | 
 |     paddsw                  m13, m14 | 
 |     paddsw                  m15, m13 | 
 |     paddsw                  m15, krd | 
 |     psraw                   m15, 7 | 
 |  | 
 |     pmaddubsw               m14, tmp1, k0k1 | 
 |     mova                   tmp1, m7 | 
 |     pmaddubsw               m13, m7, k2k3 | 
 |     mova                     m7, m11 | 
 |     pmaddubsw               m11, k4k5 | 
 |     paddsw                  m14, m11 | 
 |     punpckhbw                m3, m2                         ;G H next iter | 
 |     mova                    m11, m3 | 
 |     pmaddubsw                m3, k6k7 | 
 |     paddsw                  m13, m3 | 
 |     paddsw                  m14, m13 | 
 |     paddsw                  m14, krd | 
 |     psraw                   m14, 7 | 
 |     packuswb                m15, m14 | 
 | %ifidn %1, v8_avg | 
 |     pavgb                   m15, [dstq + dstrideq] | 
 | %endif | 
 |     mova      [dstq + dstrideq], m15 | 
 |     lea                    dstq, [dstq + dstrideq * 2] | 
 |     sub                 heightd, 2 | 
 |     jg                    .loop | 
 |  | 
 |     ; Do last row if output_height is odd | 
 |     jne                   .done | 
 |  | 
 |     movu                     m3, [srcq + sstrideq]          ;H | 
 |     punpcklbw                m6, m2, m3                     ;G H | 
 |     punpckhbw                m2, m3                         ;G H | 
 |     pmaddubsw                m0, k0k1 | 
 |     pmaddubsw                m1, k0k1 | 
 |     pmaddubsw                m4, k2k3 | 
 |     pmaddubsw                m5, k2k3 | 
 |     pmaddubsw                m8, k4k5 | 
 |     pmaddubsw                m9, k4k5 | 
 |     pmaddubsw                m6, k6k7 | 
 |     pmaddubsw                m2, k6k7 | 
 |     paddsw                   m0, m8 | 
 |     paddsw                   m1, m9 | 
 |     paddsw                   m4, m6 | 
 |     paddsw                   m5, m2 | 
 |     paddsw                   m0, m4 | 
 |     paddsw                   m1, m5 | 
 |     paddsw                   m0, krd | 
 |     paddsw                   m1, krd | 
 |     psraw                    m0, 7 | 
 |     psraw                    m1, 7 | 
 |     packuswb                 m0, m1 | 
 | %ifidn %1, v8_avg | 
 |     pavgb                    m0, [dstq] | 
 | %endif | 
 |     mova                 [dstq], m0 | 
 |  | 
 | .done: | 
 |     REP_RET | 
 |  | 
 | %endif ; ARCH_X86_64 | 
 |  | 
 | %endm | 
 |  | 
 | INIT_XMM ssse3 | 
 | SUBPIX_VFILTER16     v8 | 
 | SUBPIX_VFILTER       v8, 8 | 
 | SUBPIX_VFILTER       v8, 4 |