; vp9/common/x86/idctllm_mmx.asm - aom - Git at Google

 ;
 ;  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 ;
 ;  Use of this source code is governed by a BSD-style license
 ;  that can be found in the LICENSE file in the root of the source
 ;  tree. An additional intellectual property rights grant can be found
 ;  in the file PATENTS.  All contributing project authors may
 ;  be found in the AUTHORS file in the root of the source tree.
 ;

 %include "third_party/x86inc/x86inc.asm"

 SECTION_RODATA
 align 16
 x_s1sqr2:      times 4 dw 0x8A8C
 align 16
 x_c1sqr2less1: times 4 dw 0x4E7B
 align 16
 pw_16:         times 4 dw 16

 SECTION .text


 ; /****************************************************************************
 ; * Notes:
 ; *
 ; * This implementation makes use of 16 bit fixed point version of two multiply
 ; * constants:
 ; *        1.   sqrt(2) * cos (pi/8)
 ; *        2.   sqrt(2) * sin (pi/8)
 ; * Because the first constant is bigger than 1, to maintain the same 16 bit
 ; * fixed point precision as the second one, we use a trick of
 ; *        x * a = x + x*(a-1)
 ; * so
 ; *        x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
 ; *
 ; * For the second constant, the 16-bit version is 35468, which is bigger
 ; * than 32768, so in a signed 16-bit multiply it is treated as a negative
 ; * number. The result is corrected by adding x back:
 ; *        (x * (unsigned)35468 >> 16) = x * (signed)35468 >> 16 + x
 ; *
 ; **************************************************************************/

 INIT_MMX

 ;-----------------------------------------------------------------------------
 ; void short_idct4x4llm_mmx(short *input, short *output, int pitch)
 ;
 ; 4x4 inverse transform on 16-bit coefficients, performed as two identical
 ; butterfly passes with a 4x4 word transpose after each pass. The second
 ; pass adds 16 and arithmetic-shifts right by 5 before the final transpose.
 ;
 ; In:    inpq = input;  four 8-byte rows are read from inpq+0/8/16/24
 ;        outq = output; four 8-byte rows are written
 ;        pitq = pitch;  added unscaled to outq, i.e. a byte distance
 ; Clobb: m0-m7 (this routine contains no explicit emms)
 ;-----------------------------------------------------------------------------
 cglobal short_idct4x4llm_mmx, 3,3,0, inp, out, pit
 %if ARCH_X86_64
     ; pitch is a 32-bit int but pitq is used for 64-bit addressing below;
     ; the caller is not required to have extended it, so do it here.
     movsxd          pitq,    pitd
 %endif
     mova            m0,     [inpq +0]       ; ip0
     mova            m1,     [inpq +8]       ; ip1

     mova            m2,     [inpq+16]       ; ip2
     mova            m3,     [inpq+24]       ; ip3

     ; ---- first pass ----
     psubw           m0,      m2             ; b1 = ip0 - ip2
     paddw           m2,      m2             ; 2 * ip2

     mova            m5,      m1
     paddw           m2,      m0             ; a1 = ip0 + ip2 (b1 + 2*ip2)

     pmulhw          m5,     [x_s1sqr2]      ; signed high half; +x correction next
     paddw           m5,      m1             ; ip1 * sin(pi/8) * sqrt(2)

     mova            m7,      m3
     pmulhw          m7,     [x_c1sqr2less1] ; x * (a - 1), see notes

     paddw           m7,      m3             ; ip3 * cos(pi/8) * sqrt(2)
     psubw           m7,      m5             ; c1 = (ip3 cos term) - (ip1 sin term)

     mova            m5,      m1
     mova            m4,      m3

     pmulhw          m5,     [x_c1sqr2less1]
     paddw           m5,      m1             ; ip1 * cos(pi/8) * sqrt(2)

     pmulhw          m3,     [x_s1sqr2]
     paddw           m3,      m4             ; ip3 * sin(pi/8) * sqrt(2)

     paddw           m3,      m5             ; d1 = sum of the two terms above
     mova            m6,      m2             ; a1

     mova            m4,      m0             ; b1
     paddw           m2,      m3             ; m2 = a1 + d1

     paddw           m4,      m7             ; m4 = b1 + c1
     psubw           m0,      m7             ; m0 = b1 - c1

     psubw           m6,      m3             ; m6 = a1 - d1

     ; ---- transpose: m2, m0, m4, m6 are interleaved as rows 0, 1, 2, 3 ----
     mova            m1,      m2             ; 03 02 01 00
     mova            m3,      m4             ; 23 22 21 20

     punpcklwd       m1,      m0             ; 11 01 10 00
     punpckhwd       m2,      m0             ; 13 03 12 02

     punpcklwd       m3,      m6             ; 31 21 30 20
     punpckhwd       m4,      m6             ; 33 23 32 22

     mova            m0,      m1             ; 11 01 10 00
     mova            m5,      m2             ; 13 03 12 02

     punpckldq       m0,      m3             ; 30 20 10 00
     punpckhdq       m1,      m3             ; 31 21 11 01

     punpckldq       m2,      m4             ; 32 22 12 02
     punpckhdq       m5,      m4             ; 33 23 13 03

     mova            m3,      m5             ; 33 23 13 03

     ; ---- second pass: same butterfly on the transposed data ----
     psubw           m0,      m2             ; b1 = ip0 - ip2
     paddw           m2,      m2             ; 2 * ip2

     mova            m5,      m1
     paddw           m2,      m0             ; a1 = ip0 + ip2

     pmulhw          m5,     [x_s1sqr2]
     paddw           m5,      m1             ; ip1 * sin(pi/8) * sqrt(2)

     mova            m7,      m3
     pmulhw          m7,     [x_c1sqr2less1]

     paddw           m7,      m3             ; ip3 * cos(pi/8) * sqrt(2)
     psubw           m7,      m5             ; c1 = (ip3 cos term) - (ip1 sin term)

     mova            m5,      m1
     mova            m4,      m3

     pmulhw          m5,     [x_c1sqr2less1]
     paddw           m5,      m1             ; ip1 * cos(pi/8) * sqrt(2)

     pmulhw          m3,     [x_s1sqr2]
     paddw           m3,      m4             ; ip3 * sin(pi/8) * sqrt(2)

     paddw           m3,      m5             ; d1
     ; Adding 16 to a1 and b1 puts the rounding term into all four outputs
     ; (a1 +/- d1 and b1 +/- c1) ahead of the >> 5 below.
     paddw           m0,     [pw_16]

     paddw           m2,     [pw_16]
     mova            m6,      m2             ; a1 + 16

     mova            m4,      m0             ; b1 + 16
     paddw           m2,      m3             ; m2 = a1 + d1 + 16

     paddw           m4,      m7             ; m4 = b1 + c1 + 16
     psubw           m0,      m7             ; m0 = b1 - c1 + 16

     psubw           m6,      m3             ; m6 = a1 - d1 + 16
     psraw           m2,      5

     psraw           m0,      5
     psraw           m4,      5

     psraw           m6,      5

     ; ---- transpose back: same interleave order (m2, m0, m4, m6) ----
     mova            m1,      m2             ; 03 02 01 00
     mova            m3,      m4             ; 23 22 21 20

     punpcklwd       m1,      m0             ; 11 01 10 00
     punpckhwd       m2,      m0             ; 13 03 12 02

     punpcklwd       m3,      m6             ; 31 21 30 20
     punpckhwd       m4,      m6             ; 33 23 32 22

     mova            m0,      m1             ; 11 01 10 00
     mova            m5,      m2             ; 13 03 12 02

     punpckldq       m0,      m3             ; 30 20 10 00
     punpckhdq       m1,      m3             ; 31 21 11 01

     punpckldq       m2,      m4             ; 32 22 12 02
     punpckhdq       m5,      m4             ; 33 23 13 03

     ; ---- store four output rows ----
     mova        [outq],      m0

     mova   [outq+pitq],      m1
     mova [outq+pitq*2],      m2

     add           outq,      pitq           ; outq+pitq*2 now addresses row 3
     mova [outq+pitq*2],      m5
     RET

 ;-----------------------------------------------------------------------------
 ; void short_idct4x4llm_1_mmx(short *input, short *output, int pitch)
 ;
 ; DC-only variant: computes (input[0] + 16) >> 5, replicates it into all
 ; four words of m0 and writes m0 to four output rows.
 ;
 ; In:    inpq = input;  only the first word influences the result
 ;        outq = output; four 8-byte rows are written
 ;        pitq = pitch;  added unscaled to outq, i.e. a byte distance
 ; Clobb: m0 (this routine contains no explicit emms)
 ;-----------------------------------------------------------------------------
 cglobal short_idct4x4llm_1_mmx,3,3,0,inp,out,pit
 %if ARCH_X86_64
     ; pitch is a 32-bit int but pitq is used for 64-bit addressing below;
     ; the caller is not required to have extended it, so do it here.
     movsxd          pitq,    pitd
 %endif
     movh            m0,     [inpq]
     paddw           m0,     [pw_16]         ; rounding term for the >> 5
     psraw           m0,      5
     punpcklwd       m0,      m0             ; w1 w1 w0 w0
     punpckldq       m0,      m0             ; w0 w0 w0 w0

     mova        [outq],      m0             ; row 0
     mova   [outq+pitq],      m0             ; row 1

     mova [outq+pitq*2],      m0             ; row 2
     add           outq,      pitq           ; outq+pitq*2 now addresses row 3

     mova [outq+pitq*2],      m0             ; row 3
     RET


 ;-----------------------------------------------------------------------------
 ; void dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr,
 ;                           unsigned char *dst_ptr, int pitch, int stride)
 ;
 ; Computes dc = (input_dc + 16) >> 5 and, for each of four rows, widens the
 ; low four predictor bytes to words, adds dc with signed saturation, packs
 ; back to bytes with unsigned saturation and stores the row to dst.
 ;
 ; In:    in_dcq  = input_dc (only the low word influences the result)
 ;        predq   = predictor; rows are pitq bytes apart
 ;        dstq    = destination; rows are strideq bytes apart
 ;        pitq    = pitch  (32-bit int, sign-extended below on x86-64)
 ;        strideq = stride (32-bit int, fetched and sign-extended below)
 ; Clobb: m0-m5 (this routine contains no explicit emms)
 ;-----------------------------------------------------------------------------
 cglobal dc_only_idct_add_mmx, 4,5,0,in_dc,pred,dst,pit,stride
 %if ARCH_X86_64
     movsxd         strideq,      dword stridem
     ; pitch is an int just like stride and is also used for 64-bit
     ; addressing, so it needs the same sign-extension.
     movsxd            pitq,      pitd
 %else
     mov            strideq,      stridem
 %endif
     pxor                m0,      m0              ; zero, for unpack/pack

     movh                m5,      in_dcq          ; dc
     paddw               m5,     [pw_16]          ; rounding term for the >> 5

     psraw               m5,      5

     punpcklwd           m5,      m5              ; w1 w1 w0 w0
     punpckldq           m5,      m5              ; dc in all four words

     ; row 0
     movh                m1,     [predq]
     punpcklbw           m1,      m0              ; bytes -> words (zero-extended)
     paddsw              m1,      m5
     packuswb            m1,      m0              ; saturate to 0..255 while packing
     movh            [dstq],      m1

     ; row 1
     movh                m2,     [predq+pitq]
     punpcklbw           m2,      m0
     paddsw              m2,      m5
     packuswb            m2,      m0              ; saturate to 0..255 while packing
     movh    [dstq+strideq],      m2

     ; row 2
     movh                m3,     [predq+2*pitq]
     punpcklbw           m3,      m0
     paddsw              m3,      m5
     packuswb            m3,      m0              ; saturate to 0..255 while packing
     movh  [dstq+2*strideq],      m3

     ; row 3: advance both pointers one row so the *2 forms reach row 3
     add               dstq,      strideq
     add              predq,      pitq
     movh                m4,     [predq+2*pitq]
     punpcklbw           m4,      m0
     paddsw              m4,      m5
     packuswb            m4,      m0              ; saturate to 0..255 while packing
     movh  [dstq+2*strideq],      m4
     RET
	;
	; Copyright (c) 2012 The WebM project authors. All Rights Reserved.
	;
	; Use of this source code is governed by a BSD-style license
	; that can be found in the LICENSE file in the root of the source
	; tree. An additional intellectual property rights grant can be found
	; in the file PATENTS. All contributing project authors may
	; be found in the AUTHORS file in the root of the source tree.
	;

	%include "third_party/x86inc/x86inc.asm"

	SECTION_RODATA
	align 16
	x_s1sqr2: times 4 dw 0x8A8C
	align 16
	x_c1sqr2less1: times 4 dw 0x4E7B
	align 16
	pw_16: times 4 dw 16

	SECTION .text


	; /****************************************************************************
	; * Notes:
	; *
	; * This implementation makes use of 16 bit fixed point version of two multiply
	; * constants:
	; * 1. sqrt(2) * cos (pi/8)
	; * 2. sqrt(2) * sin (pi/8)
	; * Because the first constant is bigger than 1, to maintain the same 16 bit
	; * fixed point precision as the second one, we use a trick of
	; * x * a = x + x*(a-1)
	; * so
	; * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
	; *
	; * For the second constant, the 16-bit version is 35468, which is bigger
	; * than 32768, so in a signed 16-bit multiply it is treated as a negative
	; * number. The result is corrected by adding x back:
	; * (x * (unsigned)35468 >> 16) = x * (signed)35468 >> 16 + x
	; *
	; **************************************************************************/

	INIT_MMX

	;-----------------------------------------------------------------------------
	; void short_idct4x4llm_mmx(short *input, short *output, int pitch)
	;
	; 4x4 inverse transform on 16-bit coefficients, performed as two identical
	; butterfly passes with a 4x4 word transpose after each pass. The second
	; pass adds 16 and arithmetic-shifts right by 5 before the final transpose.
	;
	; In:    inpq = input;  four 8-byte rows are read from inpq+0/8/16/24
	;        outq = output; four 8-byte rows are written
	;        pitq = pitch;  added unscaled to outq, i.e. a byte distance
	; Clobb: m0-m7 (this routine contains no explicit emms)
	;-----------------------------------------------------------------------------
	cglobal short_idct4x4llm_mmx, 3,3,0, inp, out, pit
	%if ARCH_X86_64
	    ; pitch is a 32-bit int but pitq is used for 64-bit addressing below;
	    ; the caller is not required to have extended it, so do it here.
	    movsxd pitq, pitd
	%endif
	    mova m0, [inpq +0] ; ip0
	    mova m1, [inpq +8] ; ip1

	    mova m2, [inpq+16] ; ip2
	    mova m3, [inpq+24] ; ip3

	    ; ---- first pass ----
	    psubw m0, m2 ; b1 = ip0 - ip2
	    paddw m2, m2 ; 2 * ip2

	    mova m5, m1
	    paddw m2, m0 ; a1 = ip0 + ip2 (b1 + 2*ip2)

	    pmulhw m5, [x_s1sqr2] ; signed high half; +x correction next
	    paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2)

	    mova m7, m3
	    pmulhw m7, [x_c1sqr2less1] ; x * (a - 1), see notes

	    paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2)
	    psubw m7, m5 ; c1 = (ip3 cos term) - (ip1 sin term)

	    mova m5, m1
	    mova m4, m3

	    pmulhw m5, [x_c1sqr2less1]
	    paddw m5, m1 ; ip1 * cos(pi/8) * sqrt(2)

	    pmulhw m3, [x_s1sqr2]
	    paddw m3, m4 ; ip3 * sin(pi/8) * sqrt(2)

	    paddw m3, m5 ; d1 = sum of the two terms above
	    mova m6, m2 ; a1

	    mova m4, m0 ; b1
	    paddw m2, m3 ; m2 = a1 + d1

	    paddw m4, m7 ; m4 = b1 + c1
	    psubw m0, m7 ; m0 = b1 - c1

	    psubw m6, m3 ; m6 = a1 - d1

	    ; ---- transpose: m2, m0, m4, m6 are interleaved as rows 0, 1, 2, 3 ----
	    mova m1, m2 ; 03 02 01 00
	    mova m3, m4 ; 23 22 21 20

	    punpcklwd m1, m0 ; 11 01 10 00
	    punpckhwd m2, m0 ; 13 03 12 02

	    punpcklwd m3, m6 ; 31 21 30 20
	    punpckhwd m4, m6 ; 33 23 32 22

	    mova m0, m1 ; 11 01 10 00
	    mova m5, m2 ; 13 03 12 02

	    punpckldq m0, m3 ; 30 20 10 00
	    punpckhdq m1, m3 ; 31 21 11 01

	    punpckldq m2, m4 ; 32 22 12 02
	    punpckhdq m5, m4 ; 33 23 13 03

	    mova m3, m5 ; 33 23 13 03

	    ; ---- second pass: same butterfly on the transposed data ----
	    psubw m0, m2 ; b1 = ip0 - ip2
	    paddw m2, m2 ; 2 * ip2

	    mova m5, m1
	    paddw m2, m0 ; a1 = ip0 + ip2

	    pmulhw m5, [x_s1sqr2]
	    paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2)

	    mova m7, m3
	    pmulhw m7, [x_c1sqr2less1]

	    paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2)
	    psubw m7, m5 ; c1 = (ip3 cos term) - (ip1 sin term)

	    mova m5, m1
	    mova m4, m3

	    pmulhw m5, [x_c1sqr2less1]
	    paddw m5, m1 ; ip1 * cos(pi/8) * sqrt(2)

	    pmulhw m3, [x_s1sqr2]
	    paddw m3, m4 ; ip3 * sin(pi/8) * sqrt(2)

	    paddw m3, m5 ; d1
	    ; Adding 16 to a1 and b1 puts the rounding term into all four outputs
	    ; (a1 +/- d1 and b1 +/- c1) ahead of the >> 5 below.
	    paddw m0, [pw_16]

	    paddw m2, [pw_16]
	    mova m6, m2 ; a1 + 16

	    mova m4, m0 ; b1 + 16
	    paddw m2, m3 ; m2 = a1 + d1 + 16

	    paddw m4, m7 ; m4 = b1 + c1 + 16
	    psubw m0, m7 ; m0 = b1 - c1 + 16

	    psubw m6, m3 ; m6 = a1 - d1 + 16
	    psraw m2, 5

	    psraw m0, 5
	    psraw m4, 5

	    psraw m6, 5

	    ; ---- transpose back: same interleave order (m2, m0, m4, m6) ----
	    mova m1, m2 ; 03 02 01 00
	    mova m3, m4 ; 23 22 21 20

	    punpcklwd m1, m0 ; 11 01 10 00
	    punpckhwd m2, m0 ; 13 03 12 02

	    punpcklwd m3, m6 ; 31 21 30 20
	    punpckhwd m4, m6 ; 33 23 32 22

	    mova m0, m1 ; 11 01 10 00
	    mova m5, m2 ; 13 03 12 02

	    punpckldq m0, m3 ; 30 20 10 00
	    punpckhdq m1, m3 ; 31 21 11 01

	    punpckldq m2, m4 ; 32 22 12 02
	    punpckhdq m5, m4 ; 33 23 13 03

	    ; ---- store four output rows ----
	    mova [outq], m0

	    mova [outq+pitq], m1
	    mova [outq+pitq*2], m2

	    add outq, pitq ; outq+pitq*2 now addresses row 3
	    mova [outq+pitq*2], m5
	    RET

	;-----------------------------------------------------------------------------
	; void short_idct4x4llm_1_mmx(short *input, short *output, int pitch)
	;
	; DC-only variant: computes (input[0] + 16) >> 5, replicates it into all
	; four words of m0 and writes m0 to four output rows.
	;
	; In:    inpq = input;  only the first word influences the result
	;        outq = output; four 8-byte rows are written
	;        pitq = pitch;  added unscaled to outq, i.e. a byte distance
	; Clobb: m0 (this routine contains no explicit emms)
	;-----------------------------------------------------------------------------
	cglobal short_idct4x4llm_1_mmx,3,3,0,inp,out,pit
	%if ARCH_X86_64
	    ; pitch is a 32-bit int but pitq is used for 64-bit addressing below;
	    ; the caller is not required to have extended it, so do it here.
	    movsxd pitq, pitd
	%endif
	    movh m0, [inpq]
	    paddw m0, [pw_16] ; rounding term for the >> 5
	    psraw m0, 5
	    punpcklwd m0, m0 ; w1 w1 w0 w0
	    punpckldq m0, m0 ; w0 w0 w0 w0

	    mova [outq], m0 ; row 0
	    mova [outq+pitq], m0 ; row 1

	    mova [outq+pitq*2], m0 ; row 2
	    add outq, pitq ; outq+pitq*2 now addresses row 3

	    mova [outq+pitq*2], m0 ; row 3
	    RET


	;-----------------------------------------------------------------------------
	; void dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr,
	;                           unsigned char *dst_ptr, int pitch, int stride)
	;
	; Computes dc = (input_dc + 16) >> 5 and, for each of four rows, widens the
	; low four predictor bytes to words, adds dc with signed saturation, packs
	; back to bytes with unsigned saturation and stores the row to dst.
	;
	; In:    in_dcq  = input_dc (only the low word influences the result)
	;        predq   = predictor; rows are pitq bytes apart
	;        dstq    = destination; rows are strideq bytes apart
	;        pitq    = pitch  (32-bit int, sign-extended below on x86-64)
	;        strideq = stride (32-bit int, fetched and sign-extended below)
	; Clobb: m0-m5 (this routine contains no explicit emms)
	;-----------------------------------------------------------------------------
	cglobal dc_only_idct_add_mmx, 4,5,0,in_dc,pred,dst,pit,stride
	%if ARCH_X86_64
	    movsxd strideq, dword stridem
	    ; pitch is an int just like stride and is also used for 64-bit
	    ; addressing, so it needs the same sign-extension.
	    movsxd pitq, pitd
	%else
	    mov strideq, stridem
	%endif
	    pxor m0, m0 ; zero, for unpack/pack

	    movh m5, in_dcq ; dc
	    paddw m5, [pw_16] ; rounding term for the >> 5

	    psraw m5, 5

	    punpcklwd m5, m5 ; w1 w1 w0 w0
	    punpckldq m5, m5 ; dc in all four words

	    ; row 0
	    movh m1, [predq]
	    punpcklbw m1, m0 ; bytes -> words (zero-extended)
	    paddsw m1, m5
	    packuswb m1, m0 ; saturate to 0..255 while packing
	    movh [dstq], m1

	    ; row 1
	    movh m2, [predq+pitq]
	    punpcklbw m2, m0
	    paddsw m2, m5
	    packuswb m2, m0 ; saturate to 0..255 while packing
	    movh [dstq+strideq], m2

	    ; row 2
	    movh m3, [predq+2*pitq]
	    punpcklbw m3, m0
	    paddsw m3, m5
	    packuswb m3, m0 ; saturate to 0..255 while packing
	    movh [dstq+2*strideq], m3

	    ; row 3: advance both pointers one row so the *2 forms reach row 3
	    add dstq, strideq
	    add predq, pitq
	    movh m4, [predq+2*pitq]
	    punpcklbw m4, m0
	    paddsw m4, m5
	    packuswb m4, m0 ; saturate to 0..255 while packing
	    movh [dstq+2*strideq], m4
	    RET