| ; | 
 | ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 
 | ; | 
 | ;  Use of this source code is governed by a BSD-style license | 
 | ;  that can be found in the LICENSE file in the root of the source | 
 | ;  tree. An additional intellectual property rights grant can be found | 
 | ;  in the file PATENTS.  All contributing project authors may | 
 | ;  be found in the AUTHORS file in the root of the source tree. | 
 | ; | 
 |  | 
 |  | 
 | %include "vpx_ports/x86_abi_support.asm" | 
 |  | 
 | ;----------------------------------------------------------------------------- | 
 | ;void vp9_short_fdct4x4_mmx(short *input, short *output, int pitch) | 
 | ; | 
 | ; 4x4 forward transform (fdct, per the symbol name) on 16-bit samples using | 
 | ; MMX. It is computed as two 1-D butterfly passes, each preceded by a 4x4 | 
 | ; word transpose. | 
 | ; | 
 | ;   input  - four rows of four words are loaded from byte offsets 0, pitch, | 
 | ;            2*pitch and 3*pitch, i.e. pitch is used as a byte stride. | 
 | ;   output - 16 words (32 bytes) are stored at byte offsets 0, 8, 16 and 24. | 
 | ;   pitch  - read as a 32-bit value and sign-extended into rax. | 
 | ; | 
 | ; Registers written: rax, rcx, mm0-mm6. rsi and rdi are pushed/popped here. | 
 | ; The prolog/epilog macros come from vpx_ports/x86_abi_support.asm. | 
 | ; No emms is executed in this routine, so MMX state is still live on return | 
 | ; (presumably the caller clears it - confirm against callers). | 
 | ;----------------------------------------------------------------------------- | 
 | global sym(vp9_short_fdct4x4_mmx) PRIVATE | 
 | sym(vp9_short_fdct4x4_mmx): | 
 |     push        rbp | 
 |     mov         rbp,        rsp | 
 |     SHADOW_ARGS_TO_STACK 3 | 
 |     GET_GOT     rbx | 
 |     push        rsi | 
 |     push        rdi | 
 |     ; end prolog | 
 |  | 
 |         mov         rsi,        arg(0)      ; input | 
 |         mov         rdi,        arg(1)      ; output | 
 |  | 
 |         movsxd      rax,        dword ptr arg(2) ;pitch (byte stride between rows) | 
 |  | 
 |         lea         rcx,        [rsi + rax*2] ; rcx = address of row 2 | 
 |         ; read the input data | 
 |         movq        mm0,        [rsi]           ; row 0 | 
 |         movq        mm1,        [rsi + rax]     ; row 1 | 
 |  | 
 |         movq        mm2,        [rcx]           ; row 2 | 
 |         movq        mm4,        [rcx + rax]     ; row 3 | 
 |  | 
 |         ; transpose for the first stage | 
 |         ; (lane comments list words from lowest to highest) | 
 |         movq        mm3,        mm0         ; 00 01 02 03 | 
 |         movq        mm5,        mm2         ; 20 21 22 23 | 
 |  | 
 |         punpcklwd   mm0,        mm1         ; 00 10 01 11 | 
 |         punpckhwd   mm3,        mm1         ; 02 12 03 13 | 
 |  | 
 |         punpcklwd   mm2,        mm4         ; 20 30 21 31 | 
 |         punpckhwd   mm5,        mm4         ; 22 32 23 33 | 
 |  | 
 |         movq        mm1,        mm0         ; 00 10 01 11 | 
 |         punpckldq   mm0,        mm2         ; 00 10 20 30 | 
 |  | 
 |         punpckhdq   mm1,        mm2         ; 01 11 21 31 | 
 |  | 
 |         movq        mm2,        mm3         ; 02 12 03 13 | 
 |         punpckldq   mm2,        mm5         ; 02 12 22 32 | 
 |  | 
 |         punpckhdq   mm3,        mm5         ; 03 13 23 33 | 
 |  | 
 |         ; mm0 0 | 
 |         ; mm1 1 | 
 |         ; mm2 2 | 
 |         ; mm3 3 | 
 |  | 
 |         ; first stage | 
 |         movq        mm5,        mm0 | 
 |         movq        mm4,        mm1 | 
 |  | 
 |         paddw       mm0,        mm3         ; a1 = 0 + 3 | 
 |         paddw       mm1,        mm2         ; b1 = 1 + 2 | 
 |  | 
 |         psubw       mm4,        mm2         ; c1 = 1 - 2 | 
 |         psubw       mm5,        mm3         ; d1 = 0 - 3 | 
 |  | 
 |         ; scale a1, b1, c1, d1 by 8 (<< 3) | 
 |         psllw       mm5,        3 | 
 |         psllw       mm4,        3 | 
 |  | 
 |         psllw       mm0,        3 | 
 |         psllw       mm1,        3 | 
 |  | 
 |         ; output 0 and 2 | 
 |         movq        mm2,        mm0         ; a1 | 
 |  | 
 |         paddw       mm0,        mm1         ; op[0] = a1 + b1 | 
 |         psubw       mm2,        mm1         ; op[2] = a1 - b1 | 
 |  | 
 |         ; output 1 and 3 | 
 |         ; interleave d1, c1 so pmaddwd can form both products in one step | 
 |         movq        mm1,        mm5         ; d1 | 
 |         punpcklwd   mm1,        mm4         ; d1 c1 d1 c1 (lanes 0-1) | 
 |         punpckhwd   mm5,        mm4         ; d1 c1 d1 c1 (lanes 2-3) | 
 |  | 
 |         movq        mm3,        mm1 | 
 |         movq        mm4,        mm5 | 
 |  | 
 |         pmaddwd     mm1,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352 | 
 |         pmaddwd     mm4,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352 | 
 |  | 
 |         pmaddwd     mm3,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352 | 
 |         pmaddwd     mm5,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352 | 
 |  | 
 |         ; add the first-pass rounding terms before the >>12 | 
 |         paddd       mm1,        MMWORD PTR[GLOBAL(_14500)] | 
 |         paddd       mm4,        MMWORD PTR[GLOBAL(_14500)] | 
 |         paddd       mm3,        MMWORD PTR[GLOBAL(_7500)] | 
 |         paddd       mm5,        MMWORD PTR[GLOBAL(_7500)] | 
 |  | 
 |         psrad       mm1,        12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12 | 
 |         psrad       mm4,        12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12 | 
 |         psrad       mm3,        12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12 | 
 |         psrad       mm5,        12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12 | 
 |  | 
 |         ; pack the dword results back to words (signed saturation) | 
 |         packssdw    mm1,        mm4         ; op[1] | 
 |         packssdw    mm3,        mm5         ; op[3] | 
 |  | 
 |         ; done with vertical | 
 |         ; transpose for the second stage | 
 |         movq        mm4,        mm0         ; 00 10 20 30 | 
 |         movq        mm5,        mm2         ; 02 12 22 32 | 
 |  | 
 |         punpcklwd   mm0,        mm1         ; 00 01 10 11 | 
 |         punpckhwd   mm4,        mm1         ; 20 21 30 31 | 
 |  | 
 |         punpcklwd   mm2,        mm3         ; 02 03 12 13 | 
 |         punpckhwd   mm5,        mm3         ; 22 23 32 33 | 
 |  | 
 |         movq        mm1,        mm0         ; 00 01 10 11 | 
 |         punpckldq   mm0,        mm2         ; 00 01 02 03 | 
 |  | 
 |         punpckhdq   mm1,        mm2         ; 10 11 12 13 | 
 |  | 
 |         movq        mm2,        mm4         ; 20 21 30 31 | 
 |         punpckldq   mm2,        mm5         ; 20 21 22 23 | 
 |  | 
 |         punpckhdq   mm4,        mm5         ; 30 31 32 33 | 
 |  | 
 |         ; mm0 0 | 
 |         ; mm1 1 | 
 |         ; mm2 2 | 
 |         ; mm4 3 | 
 |  | 
 |         ; second stage | 
 |         movq        mm5,        mm0 | 
 |         movq        mm3,        mm1 | 
 |  | 
 |         paddw       mm0,        mm4         ; a1 = 0 + 3 | 
 |         paddw       mm1,        mm2         ; b1 = 1 + 2 | 
 |  | 
 |         psubw       mm3,        mm2         ; c1 = 1 - 2 | 
 |         psubw       mm5,        mm4         ; d1 = 0 - 3 | 
 |  | 
 |         pxor        mm6,        mm6         ; zero out for compare | 
 |  | 
 |         pcmpeqw     mm6,        mm5         ; 0xFFFF in each word where d1 == 0 | 
 |  | 
 |         pandn       mm6,        MMWORD PTR[GLOBAL(_cmp_mask)]   ; mm6 = ~mm6 & 1 per word, | 
 |                                                                 ; i.e. 1 where d1 != 0, else 0 | 
 |  | 
 |         ; output 0 and 2 | 
 |         movq        mm2,        mm0         ; a1 | 
 |  | 
 |         paddw       mm0,        mm1         ; a1 + b1 | 
 |         psubw       mm2,        mm1         ; a1 - b1 | 
 |  | 
 |         paddw       mm0,        MMWORD PTR[GLOBAL(_7w)] | 
 |         paddw       mm2,        MMWORD PTR[GLOBAL(_7w)] | 
 |  | 
 |         psraw       mm0,        4           ; op[0] = (a1 + b1 + 7)>>4 | 
 |         psraw       mm2,        4           ; op[8] = (a1 - b1 + 7)>>4 | 
 |  | 
 |         movq        MMWORD PTR[rdi + 0 ],  mm0 | 
 |         movq        MMWORD PTR[rdi + 16],  mm2 | 
 |  | 
 |         ; output 1 and 3 | 
 |         ; interleave d1, c1 so pmaddwd can form both products in one step | 
 |         movq        mm1,        mm5         ; d1 | 
 |         punpcklwd   mm1,        mm3         ; d1 c1 d1 c1 (lanes 0-1) | 
 |         punpckhwd   mm5,        mm3         ; d1 c1 d1 c1 (lanes 2-3) | 
 |  | 
 |         movq        mm3,        mm1 | 
 |         movq        mm4,        mm5 | 
 |  | 
 |         pmaddwd     mm1,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352 | 
 |         pmaddwd     mm4,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352 | 
 |  | 
 |         pmaddwd     mm3,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352 | 
 |         pmaddwd     mm5,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352 | 
 |  | 
 |         ; add the second-pass rounding terms before the >>16 | 
 |         paddd       mm1,        MMWORD PTR[GLOBAL(_12000)] | 
 |         paddd       mm4,        MMWORD PTR[GLOBAL(_12000)] | 
 |         paddd       mm3,        MMWORD PTR[GLOBAL(_51000)] | 
 |         paddd       mm5,        MMWORD PTR[GLOBAL(_51000)] | 
 |  | 
 |         psrad       mm1,        16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16 | 
 |         psrad       mm4,        16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16 | 
 |         psrad       mm3,        16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16 | 
 |         psrad       mm5,        16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16 | 
 |  | 
 |         packssdw    mm1,        mm4         ; op[4] | 
 |         packssdw    mm3,        mm5         ; op[12] | 
 |  | 
 |         paddw       mm1,        mm6         ; op[4] += (d1!=0) | 
 |  | 
 |         movq        MMWORD PTR[rdi + 8 ],  mm1 | 
 |         movq        MMWORD PTR[rdi + 24],  mm3 | 
 |  | 
 |      ; begin epilog | 
 |     pop         rdi | 
 |     pop         rsi | 
 |     RESTORE_GOT | 
 |     UNSHADOW_ARGS | 
 |     pop         rbp | 
 |     ret | 
 |  | 
 | SECTION_RODATA | 
 | ; Constant operands for vp9_short_fdct4x4_mmx. Each table is a single | 
 | ; 8-byte, 8-byte-aligned MMX memory operand. | 
 |  | 
 | ; pmaddwd weights for interleaved (d1, c1) words: d1*5352 + c1*2217 | 
 | align 8 | 
 | _5352_2217: | 
 |     dw 5352, 2217, 5352, 2217 | 
 |  | 
 | ; pmaddwd weights for interleaved (d1, c1) words: d1*2217 - c1*5352 | 
 | align 8 | 
 | _2217_neg5352: | 
 |     dw 2217, -5352, 2217, -5352 | 
 |  | 
 | ; per-word value 1; pandn against the d1 == 0 compare result yields (d1 != 0) | 
 | align 8 | 
 | _cmp_mask: | 
 |     dw 1, 1, 1, 1 | 
 |  | 
 | ; per-word bias added before the arithmetic >>4 in the second pass | 
 | align 8 | 
 | _7w: | 
 |     dw 7, 7, 7, 7 | 
 |  | 
 | ; first-pass dword rounding terms, added before the >>12 | 
 | align 8 | 
 | _14500: | 
 |     dd 14500, 14500 | 
 | align 8 | 
 | _7500: | 
 |     dd 7500, 7500 | 
 |  | 
 | ; second-pass dword rounding terms, added before the >>16 | 
 | align 8 | 
 | _12000: | 
 |     dd 12000, 12000 | 
 | align 8 | 
 | _51000: | 
 |     dd 51000, 51000 | 