|  | ; | 
|  | ; Copyright (c) 2016, Alliance for Open Media. All rights reserved | 
|  | ; | 
|  | ; This source code is subject to the terms of the BSD 2 Clause License and | 
|  | ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License | 
|  | ; was not distributed with this source code in the LICENSE file, you can | 
|  | ; obtain it at www.aomedia.org/license/software. If the Alliance for Open | 
|  | ; Media Patent License 1.0 was not distributed with this source code in the | 
|  | ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. | 
|  | ; | 
|  |  | 
|  | ; | 
|  |  | 
|  | %include "third_party/x86inc/x86inc.asm" | 
|  |  | 
|  | SECTION .text | 
|  |  | 
|  | ; REORDER_INPUTS: registers arrive in the order a c d b (m0..m3); rename | 
|  | ; m1-m3 so that they are afterwards addressed as a b c d. Uses x86inc's SWAP, | 
|  | ; which renumbers registers rather than moving data. | 
|  | %macro REORDER_INPUTS 0 | 
|  | SWAP 1, 3  ; m1 = b, m3 = c | 
|  | SWAP 2, 3  ; m2 = c, m3 = d | 
|  | %endmacro | 
|  |  | 
|  | ; TRANSFORM_COLS: one 1-D transform pass over four word lanes in parallel. | 
|  | ; in:       m0 = a, m1 = b, m2 = c, m3 = d | 
|  | ; out:      m0 = a, m1 = b, m2 = c, m3 = d (updated in place via renaming) | 
|  | ; scratch:  m4, m5 | 
|  | ; The intermediate e is built from the low four words only (punpcklwd), so | 
|  | ; only the low four word lanes of the outputs carry meaningful results. | 
|  | %macro TRANSFORM_COLS 0 | 
|  | paddw           m0,        m2             ; a += c | 
|  |  | 
|  | ; Widen a to 32 bits: interleaving places each word of a in the upper half of | 
|  | ; a dword, and the arithmetic shift by 16 sign-extends it while discarding | 
|  | ; whatever was previously in m4. | 
|  | punpcklwd       m4,        m0 | 
|  | psrad           m4,        16 | 
|  |  | 
|  | psubw           m3,        m1             ; d -= b | 
|  |  | 
|  | ; Widen d the same way. | 
|  | punpcklwd       m5,        m3 | 
|  | psrad           m5,        16 | 
|  |  | 
|  | ; e = (a - d) >> 1, with the difference formed in 32 bits, then packed back | 
|  | ; to words (both halves of m4 receive the same four values). | 
|  | psubd           m4,        m5 | 
|  | psrad           m4,        1 | 
|  | packssdw        m4,        m4             ; e | 
|  |  | 
|  | psubw           m5,        m4,        m1  ; new b = e - b | 
|  | psubw           m0,        m5             ; a -= new b | 
|  | psubw           m4,        m2             ; new c = e - c | 
|  | paddw           m3,        m4             ; d += new c | 
|  |  | 
|  | ; Rename so the results sit in m0..m3 again; m0 (a) and m3 (d) are already | 
|  | ; in place. | 
|  | SWAP            1,         5  ; m1 b | 
|  | SWAP            2,         4  ; m2 c | 
|  | %endmacro | 
|  |  | 
|  | ; TRANSPOSE_4X4: transpose a 4x4 matrix of words whose rows sit in the low | 
|  | ; halves of m0..m3. The transposed rows are returned in the low halves of | 
|  | ; m0..m3; the upper halves are not meaningful afterwards. | 
|  | %macro TRANSPOSE_4X4 0 | 
|  | ; Interleave rows 1/3 and rows 0/2 word by word. | 
|  | punpcklwd       m1,        m3 | 
|  | punpcklwd       m0,        m2 | 
|  | ; Keep a copy of the 0/2 interleave: it is needed for both halves below. | 
|  | mova            m2,        m0 | 
|  | punpckhwd       m2,        m1             ; new rows 2 (low) and 3 (high) | 
|  | punpcklwd       m0,        m1             ; new rows 0 (low) and 1 (high) | 
|  | ; 0x0e copies the upper 64 bits of the source into the low 64 bits. | 
|  | pshufd          m3,        m2, 0x0e       ; new row 3 | 
|  | pshufd          m1,        m0, 0x0e       ; new row 1 | 
|  | %endmacro | 
|  |  | 
|  | ; TRANSPOSE_4X4_WIDE: transpose a 4x4 int16 matrix stored as two full | 
|  | ; registers (m0 = rows 0 and 1, m1 = rows 2 and 3). The four transposed rows | 
|  | ; are returned in the low halves of m0..m3. | 
|  | %macro TRANSPOSE_4X4_WIDE 0 | 
|  | ; Interleave the two input registers word by word; m3 takes the upper half. | 
|  | mova            m3, m0 | 
|  | punpckhwd       m3, m1 | 
|  | punpcklwd       m0, m1 | 
|  | ; Second interleave step; m2 takes the upper half. | 
|  | mova            m2, m0 | 
|  | punpckhwd       m2, m3                    ; new rows 2 (low) and 3 (high) | 
|  | punpcklwd       m0, m3                    ; new rows 0 (low) and 1 (high) | 
|  | ; 0x0e copies the upper 64 bits of the source into the low 64 bits. | 
|  | pshufd          m3, m2, 0x0e              ; new row 3 | 
|  | pshufd          m1, m0, 0x0e              ; new row 1 | 
|  | %endmacro | 
|  |  | 
|  | ; ADD_STORE_4P_2X src1, src2, tmp1, tmp2, zero | 
|  | ; Adds the word values in m<src1> and m<src2> to the four bytes at [outputq] | 
|  | ; and [outputq + strideq] respectively, and writes the sums back as bytes | 
|  | ; with unsigned saturation. | 
|  | ;   src1, src2 - registers holding the values to add (overwritten) | 
|  | ;   tmp1, tmp2 - scratch registers | 
|  | ;   zero       - register that must already contain all zeros | 
|  | %macro ADD_STORE_4P_2X 5  ; src1, src2, tmp1, tmp2, zero | 
|  | ; Read both rows before either is written. | 
|  | movd            m%3,       [outputq] | 
|  | movd            m%4,       [outputq + strideq] | 
|  |  | 
|  | ; First row: widen bytes to words, add, narrow back with saturation. | 
|  | punpcklbw       m%3,       m%5 | 
|  | paddw           m%1,       m%3 | 
|  | packuswb        m%1,       m%5 | 
|  |  | 
|  | ; Second row: same steps. | 
|  | punpcklbw       m%4,       m%5 | 
|  | paddw           m%2,       m%4 | 
|  | packuswb        m%2,       m%5 | 
|  |  | 
|  | movd            [outputq], m%1 | 
|  | movd            [outputq + strideq], m%2 | 
|  | %endmacro | 
|  |  | 
|  | ; iwht4x4_16_add(input, output, stride), SSE2 version | 
|  | ; | 
|  | ; Reads sixteen 32-bit coefficients from input, narrows them to 16 bits with | 
|  | ; signed saturation and shifts each right by 2. It then applies | 
|  | ; TRANSFORM_COLS along the rows and along the columns, and adds the result | 
|  | ; to the 4x4 block of bytes at output, saturating each byte to 0..255. | 
|  | ; | 
|  | ; in:   inputq  - coefficient buffer, 64 bytes; accessed with mova and | 
|  | ;                 memory-operand packssdw, so it must be 16-byte aligned | 
|  | ;       outputq - first byte of the destination block (read and written) | 
|  | ;       strideq - byte distance between consecutive destination rows | 
|  | ; uses: m0-m6 (seven XMM registers are requested from cglobal) | 
|  | INIT_XMM sse2 | 
|  | cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride | 
|  | ; NOTE(review): stride is presumably a 32-bit C int in the caller's prototype | 
|  | ; (not visible in this file; confirm). On x86-64 the upper half of the | 
|  | ; register holding such an argument is undefined, and strideq is used as a | 
|  | ; full-width index below, so sign-extend it first. This expands to nothing | 
|  | ; when strideq and strided name the same register (x86-32). | 
|  | movsxdifnidn    strideq,   strided | 
|  |  | 
|  | ; Load the coefficients and pack them: m0 = rows 0 and 1, m1 = rows 2 and 3. | 
|  | mova            m0,        [inputq +  0] | 
|  | packssdw        m0,        [inputq + 16] | 
|  | mova            m1,        [inputq + 32] | 
|  | packssdw        m1,        [inputq + 48] | 
|  | psraw           m0,        2 | 
|  | psraw           m1,        2 | 
|  |  | 
|  | ; First pass: put one input column in each register, then transform. | 
|  | TRANSPOSE_4X4_WIDE | 
|  | REORDER_INPUTS | 
|  | TRANSFORM_COLS | 
|  | ; Second pass: transpose and transform along the other dimension. | 
|  | TRANSPOSE_4X4 | 
|  | REORDER_INPUTS | 
|  | TRANSFORM_COLS | 
|  |  | 
|  | ; m0..m3 now hold the four residual rows; add them to the destination two | 
|  | ; rows at a time. m4 is the zero register, m5 and m6 are scratch. | 
|  | pxor            m4, m4 | 
|  | ADD_STORE_4P_2X  0, 1, 5, 6, 4 | 
|  | lea             outputq, [outputq + 2 * strideq] | 
|  | ADD_STORE_4P_2X  2, 3, 5, 6, 4 | 
|  |  | 
|  | RET | 