|  | ; | 
|  | ; Copyright (c) 2021, Alliance for Open Media. All rights reserved | 
|  | ; | 
|  | ; This source code is subject to the terms of the BSD 3-Clause Clear License and the | 
|  | ; Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear License was | 
|  | ; not distributed with this source code in the LICENSE file, you can obtain it | 
|  | ; at aomedia.org/license/software-license/bsd-3-c-c/.  If the Alliance for Open Media Patent | 
|  | ; License 1.0 was not distributed with this source code in the PATENTS file, you | 
|  | ; can obtain it at aomedia.org/license/patent-license/. | 
|  | ; | 
|  |  | 
|  | %define private_prefix av1 | 
|  |  | 
|  | %include "third_party/x86inc/x86inc.asm" | 
|  |  | 
|  | SECTION .text | 
|  |  | 
|  | %macro TRANSFORM_COLS 0 | 
|  | paddw           m0,        m1 | 
|  | movq            m4,        m0 | 
|  | psubw           m3,        m2 | 
|  | psubw           m4,        m3 | 
|  | psraw           m4,        1 | 
|  | movq            m5,        m4 | 
|  | psubw           m5,        m1 ;b1 | 
|  | psubw           m4,        m2 ;c1 | 
|  | psubw           m0,        m4 | 
|  | paddw           m3,        m5 | 
|  | ; m0 a0 | 
|  | SWAP            1,         4  ; m1 c1 | 
|  | SWAP            2,         3  ; m2 d1 | 
|  | SWAP            3,         5  ; m3 b1 | 
|  | %endmacro | 
|  |  | 
|  | %macro TRANSPOSE_4X4 0 | 
|  | ; 00 01 02 03 | 
|  | ; 10 11 12 13 | 
|  | ; 20 21 22 23 | 
|  | ; 30 31 32 33 | 
|  | punpcklwd       m0,        m1 ; 00 10 01 11  02 12 03 13 | 
|  | punpcklwd       m2,        m3 ; 20 30 21 31  22 32 23 33 | 
|  | mova            m1,        m0 | 
|  | punpckldq       m0,        m2 ; 00 10 20 30  01 11 21 31 | 
|  | punpckhdq       m1,        m2 ; 02 12 22 32  03 13 23 33 | 
|  | %endmacro | 
|  |  | 
|  | INIT_XMM sse2 | 
|  | cglobal fwht4x4, 3, 4, 8, input, output, stride | 
|  | lea             r3q,       [inputq + strideq*4] | 
|  | movq            m0,        [inputq] ;a1 | 
|  | movq            m1,        [inputq + strideq*2] ;b1 | 
|  | movq            m2,        [r3q] ;c1 | 
|  | movq            m3,        [r3q + strideq*2] ;d1 | 
|  |  | 
|  | TRANSFORM_COLS | 
|  | TRANSPOSE_4X4 | 
|  | SWAP            1,         2 | 
|  | psrldq          m1,        m0, 8 | 
|  | psrldq          m3,        m2, 8 | 
|  | TRANSFORM_COLS | 
|  | TRANSPOSE_4X4 | 
|  |  | 
|  | psllw           m0,        2 | 
|  | psllw           m1,        2 | 
|  |  | 
|  | ; sign extension | 
|  | mova            m2,             m0 | 
|  | mova            m3,             m1 | 
|  | punpcklwd       m0,             m0 | 
|  | punpcklwd       m1,             m1 | 
|  | punpckhwd       m2,             m2 | 
|  | punpckhwd       m3,             m3 | 
|  | psrad           m0,             16 | 
|  | psrad           m1,             16 | 
|  | psrad           m2,             16 | 
|  | psrad           m3,             16 | 
|  | mova            [outputq],      m0 | 
|  | mova            [outputq + 16], m2 | 
|  | mova            [outputq + 32], m1 | 
|  | mova            [outputq + 48], m3 | 
|  |  | 
|  | RET |