| ; | 
 | ; Copyright (c) 2016, Alliance for Open Media. All rights reserved | 
 | ; | 
 | ; This source code is subject to the terms of the BSD 2 Clause License and | 
 | ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License | 
 | ; was not distributed with this source code in the LICENSE file, you can | 
 | ; obtain it at www.aomedia.org/license/software. If the Alliance for Open | 
 | ; Media Patent License 1.0 was not distributed with this source code in the | 
 | ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. | 
 | ; | 
 |  | 
 | ; | 
 |  | 
 | %include "third_party/x86inc/x86inc.asm" | 
 |  | 
 | SECTION_RODATA | 
 |  | 
 | ; pmulhrsw multiplier: 2 * 11585, replicated into all eight word lanes. | 
 | pw_11585x2: times 8 dw 2 * 11585 | 
 | ; Rounding term (half of 1 << 14) added to each dword before the psrad-by-14. | 
 | pd_8192:    times 4 dd 1 << 13 | 
 |  | 
 | ; TRANSFORM_COEFFS a, b | 
 | ;   Emits two 16-byte tables of interleaved word pairs for use with pmaddwd: | 
 | ;     pw_a_b  = { a,  b } repeated four times | 
 | ;     pw_b_ma = { b, -a } repeated four times | 
 | %macro TRANSFORM_COEFFS 2 | 
 | pw_%1_%2:   times 4 dw  %1,  %2 | 
 | pw_%2_m%1:  times 4 dw  %2, -%1 | 
 | %endmacro | 
 |  | 
 | ; The constants are trigonometric values scaled by 2^14 and rounded. | 
 | TRANSFORM_COEFFS 11585,  11585   ; cos(pi / 4) | 
 | TRANSFORM_COEFFS 15137,   6270   ; cos(pi / 8),      sin(pi / 8) | 
 | TRANSFORM_COEFFS 16069,   3196   ; cos(pi / 16),     sin(pi / 16) | 
 | TRANSFORM_COEFFS  9102,  13623   ; sin(3 * pi / 16), cos(3 * pi / 16) | 
 |  | 
 | ; STORE_OUTPUT index, result | 
 | ;   Sign-extends the eight 16-bit lanes of m<result> to 32 bits and writes | 
 | ;   the resulting 32 bytes to outputq + 4 * index using aligned stores. | 
 | ;   Clobbers m11, m12 and m<result>. | 
 | %macro STORE_OUTPUT 2 | 
 |   pxor               m11, m11 | 
 |   pcmpgtw            m11, m%2            ; m11 = 0xFFFF in each negative lane, else 0 | 
 |   punpckhwd          m12, m%2, m11       ; upper four lanes widened (copy, then unpack) | 
 |   punpcklwd          m%2, m11            ; lower four lanes widened in place | 
 |   mova               [outputq + 4*%1], m%2 | 
 |   mova               [outputq + 4*%1 + 16], m12 | 
 | %endmacro | 
 |  | 
 | SECTION .text | 
 |  | 
 | ;----------------------------------------------------------------------- | 
 | ; fdct8x8(input, output, stride) -- 8x8 forward DCT, SSSE3, x86-64 only. | 
 | ; | 
 | ;   input  : eight rows of eight 16-bit values, read with aligned 16-byte | 
 | ;            loads; consecutive rows are 2 * stride bytes apart. | 
 | ;   output : 64 coefficients, sign-extended to 32 bits and written with | 
 | ;            aligned stores as eight consecutive 32-byte rows. | 
 | ;   stride : row pitch of input, in 16-bit elements. | 
 | ; | 
 | ; Declared through cglobal with 3 arguments, 5 GPRs (r3/r4 are scratch) and | 
 | ; 13 XMM registers (m0-m12). m8 holds pd_8192 and m12 holds pw_11585x2 for | 
 | ; both transform passes; the final rounding and STORE_OUTPUT reuse m8, m11 | 
 | ; and m12 as scratch once those constants are no longer needed. | 
 | ;----------------------------------------------------------------------- | 
 | %if ARCH_X86_64 | 
 | INIT_XMM ssse3 | 
 | cglobal fdct8x8, 3, 5, 13, input, output, stride | 
 |  | 
 |   mova               m8, [GLOBAL(pd_8192)] | 
 |   mova              m12, [GLOBAL(pw_11585x2)] | 
 |  | 
 |   ; stride is used below as a full 64-bit register, so widen it first. | 
 |   ; NOTE(review): assumes the C prototype passes stride as a 32-bit int, | 
 |   ; whose upper register half is unspecified by the x86-64 calling | 
 |   ; conventions -- confirm against the prototype. | 
 |   movsxd             strideq, strided | 
 |  | 
 |   ; r3 = byte offset of one row, r4 = byte offset of two rows | 
 |   lea                r3, [2 * strideq] | 
 |   lea                r4, [4 * strideq] | 
 |   mova               m0, [inputq] | 
 |   mova               m1, [inputq + r3] | 
 |   lea                inputq, [inputq + r4] | 
 |   mova               m2, [inputq] | 
 |   mova               m3, [inputq + r3] | 
 |   lea                inputq, [inputq + r4] | 
 |   mova               m4, [inputq] | 
 |   mova               m5, [inputq + r3] | 
 |   lea                inputq, [inputq + r4] | 
 |   mova               m6, [inputq] | 
 |   mova               m7, [inputq + r3] | 
 |  | 
 |   ; left shift by 2 to increase forward transformation precision | 
 |   psllw              m0, 2 | 
 |   psllw              m1, 2 | 
 |   psllw              m2, 2 | 
 |   psllw              m3, 2 | 
 |   psllw              m4, 2 | 
 |   psllw              m5, 2 | 
 |   psllw              m6, 2 | 
 |   psllw              m7, 2 | 
 |  | 
 |   ; column transform | 
 |   ; stage 1: sums and differences of mirrored rows (0/7, 1/6, 2/5, 3/4) | 
 |   paddw m10, m0, m7 | 
 |   psubw m0, m7 | 
 |  | 
 |   paddw m9, m1, m6 | 
 |   psubw m1, m6 | 
 |  | 
 |   paddw m7, m2, m5 | 
 |   psubw m2, m5 | 
 |  | 
 |   paddw m6, m3, m4 | 
 |   psubw m3, m4 | 
 |  | 
 |   ; stage 2 | 
 |   paddw m5, m9, m7 | 
 |   psubw m9, m7 | 
 |  | 
 |   paddw m4, m10, m6 | 
 |   psubw m10, m6 | 
 |  | 
 |   paddw m7, m1, m2 | 
 |   psubw m1, m2 | 
 |  | 
 |   ; stage 3 | 
 |   paddw m6, m4, m5 | 
 |   psubw m4, m5 | 
 |  | 
 |   ; pmulhrsw by 2 * 11585 yields (x * 11585 + (1 << 13)) >> 14 per lane | 
 |   pmulhrsw m1, m12 | 
 |   pmulhrsw m7, m12 | 
 |  | 
 |   ; sin(pi / 8), cos(pi / 8) | 
 |   ; interleave the two inputs so pmaddwd computes a * c0 + b * c1 per dword, | 
 |   ; then round (+8192), shift right by 14 and pack back to words | 
 |   punpcklwd m2, m10, m9 | 
 |   punpckhwd m10, m9 | 
 |   pmaddwd m5, m2, [GLOBAL(pw_15137_6270)] | 
 |   pmaddwd m2, [GLOBAL(pw_6270_m15137)] | 
 |   pmaddwd m9, m10, [GLOBAL(pw_15137_6270)] | 
 |   pmaddwd m10, [GLOBAL(pw_6270_m15137)] | 
 |   paddd m5, m8 | 
 |   paddd m2, m8 | 
 |   paddd m9, m8 | 
 |   paddd m10, m8 | 
 |   psrad m5, 14 | 
 |   psrad m2, 14 | 
 |   psrad m9, 14 | 
 |   psrad m10, 14 | 
 |   packssdw m5, m9 | 
 |   packssdw m2, m10 | 
 |  | 
 |   pmulhrsw m6, m12 | 
 |   pmulhrsw m4, m12 | 
 |  | 
 |   paddw m9, m3, m1 | 
 |   psubw m3, m1 | 
 |  | 
 |   paddw m10, m0, m7 | 
 |   psubw m0, m7 | 
 |  | 
 |   ; stage 4 | 
 |   ; sin(pi / 16), cos(pi / 16) | 
 |   punpcklwd m1, m10, m9 | 
 |   punpckhwd m10, m9 | 
 |   pmaddwd m7, m1, [GLOBAL(pw_16069_3196)] | 
 |   pmaddwd m1, [GLOBAL(pw_3196_m16069)] | 
 |   pmaddwd m9, m10, [GLOBAL(pw_16069_3196)] | 
 |   pmaddwd m10, [GLOBAL(pw_3196_m16069)] | 
 |   paddd m7, m8 | 
 |   paddd m1, m8 | 
 |   paddd m9, m8 | 
 |   paddd m10, m8 | 
 |   psrad m7, 14 | 
 |   psrad m1, 14 | 
 |   psrad m9, 14 | 
 |   psrad m10, 14 | 
 |   packssdw m7, m9 | 
 |   packssdw m1, m10 | 
 |  | 
 |   ; sin(3 * pi / 16), cos(3 * pi / 16) | 
 |   punpcklwd m11, m0, m3 | 
 |   punpckhwd m0, m3 | 
 |   pmaddwd m9, m11, [GLOBAL(pw_9102_13623)] | 
 |   pmaddwd m11, [GLOBAL(pw_13623_m9102)] | 
 |   pmaddwd m3, m0, [GLOBAL(pw_9102_13623)] | 
 |   pmaddwd m0, [GLOBAL(pw_13623_m9102)] | 
 |   paddd m9, m8 | 
 |   paddd m11, m8 | 
 |   paddd m3, m8 | 
 |   paddd m0, m8 | 
 |   psrad m9, 14 | 
 |   psrad m11, 14 | 
 |   psrad m3, 14 | 
 |   psrad m0, 14 | 
 |   packssdw m9, m3 | 
 |   packssdw m11, m0 | 
 |  | 
 |   ; transpose | 
 |   ; the eight result vectors, in row order, are m6, m7, m5, m11, m4, m9, m2, m1 | 
 |   ; stage 1 | 
 |   punpcklwd m0, m6, m7 | 
 |   punpcklwd m3, m5, m11 | 
 |   punpckhwd m6, m7 | 
 |   punpckhwd m5, m11 | 
 |   punpcklwd m7, m4, m9 | 
 |   punpcklwd m10, m2, m1 | 
 |   punpckhwd m4, m9 | 
 |   punpckhwd m2, m1 | 
 |  | 
 |   ; stage 2 | 
 |   punpckldq m9, m0, m3 | 
 |   punpckldq m1, m6, m5 | 
 |   punpckhdq m0, m3 | 
 |   punpckhdq m6, m5 | 
 |   punpckldq m3, m7, m10 | 
 |   punpckldq m5, m4, m2 | 
 |   punpckhdq m7, m10 | 
 |   punpckhdq m4, m2 | 
 |  | 
 |   ; stage 3 | 
 |   punpcklqdq m10, m9, m3 | 
 |   punpckhqdq m9, m3 | 
 |   punpcklqdq m2, m0, m7 | 
 |   punpckhqdq m0, m7 | 
 |   punpcklqdq m3, m1, m5 | 
 |   punpckhqdq m1, m5 | 
 |   punpcklqdq m7, m6, m4 | 
 |   punpckhqdq m6, m4 | 
 |  | 
 |   ; row transform | 
 |   ; transposed vectors, in order: m10, m9, m2, m0, m3, m1, m7, m6 | 
 |   ; stage 1 | 
 |   paddw m5, m10, m6 | 
 |   psubw m10, m6 | 
 |  | 
 |   paddw m4, m9, m7 | 
 |   psubw m9, m7 | 
 |  | 
 |   paddw m6, m2, m1 | 
 |   psubw m2, m1 | 
 |  | 
 |   paddw m7, m0, m3 | 
 |   psubw m0, m3 | 
 |  | 
 |   ;stage 2 | 
 |   paddw m1, m5, m7 | 
 |   psubw m5, m7 | 
 |  | 
 |   paddw m3, m4, m6 | 
 |   psubw m4, m6 | 
 |  | 
 |   paddw m7, m9, m2 | 
 |   psubw m9, m2 | 
 |  | 
 |   ; stage 3 | 
 |   ; this pass forms the sum/difference scaled by 11585 with pmaddwd in | 
 |   ; 32-bit precision instead of adding in 16 bits and using pmulhrsw | 
 |   punpcklwd m6, m1, m3 | 
 |   punpckhwd m1, m3 | 
 |   pmaddwd m2, m6, [GLOBAL(pw_11585_11585)] | 
 |   pmaddwd m6, [GLOBAL(pw_11585_m11585)] | 
 |   pmaddwd m3, m1, [GLOBAL(pw_11585_11585)] | 
 |   pmaddwd m1, [GLOBAL(pw_11585_m11585)] | 
 |   paddd m2, m8 | 
 |   paddd m6, m8 | 
 |   paddd m3, m8 | 
 |   paddd m1, m8 | 
 |   psrad m2, 14 | 
 |   psrad m6, 14 | 
 |   psrad m3, 14 | 
 |   psrad m1, 14 | 
 |   packssdw m2, m3 | 
 |   packssdw m6, m1 | 
 |  | 
 |   pmulhrsw m7, m12 | 
 |   pmulhrsw m9, m12 | 
 |  | 
 |   ; sin(pi / 8), cos(pi / 8) | 
 |   punpcklwd m3, m5, m4 | 
 |   punpckhwd m5, m4 | 
 |   pmaddwd m1, m3, [GLOBAL(pw_15137_6270)] | 
 |   pmaddwd m3, [GLOBAL(pw_6270_m15137)] | 
 |   pmaddwd m4, m5, [GLOBAL(pw_15137_6270)] | 
 |   pmaddwd m5, [GLOBAL(pw_6270_m15137)] | 
 |   paddd m1, m8 | 
 |   paddd m3, m8 | 
 |   paddd m4, m8 | 
 |   paddd m5, m8 | 
 |   psrad m1, 14 | 
 |   psrad m3, 14 | 
 |   psrad m4, 14 | 
 |   psrad m5, 14 | 
 |   packssdw m1, m4 | 
 |   packssdw m3, m5 | 
 |  | 
 |   paddw m4, m0, m9 | 
 |   psubw m0, m9 | 
 |  | 
 |   paddw m5, m10, m7 | 
 |   psubw m10, m7 | 
 |  | 
 |   ; stage 4 | 
 |   ; sin(pi / 16), cos(pi / 16) | 
 |   punpcklwd m9, m5, m4 | 
 |   punpckhwd m5, m4 | 
 |   pmaddwd m7, m9, [GLOBAL(pw_16069_3196)] | 
 |   pmaddwd m9, [GLOBAL(pw_3196_m16069)] | 
 |   pmaddwd m4, m5, [GLOBAL(pw_16069_3196)] | 
 |   pmaddwd m5, [GLOBAL(pw_3196_m16069)] | 
 |   paddd m7, m8 | 
 |   paddd m9, m8 | 
 |   paddd m4, m8 | 
 |   paddd m5, m8 | 
 |   psrad m7, 14 | 
 |   psrad m9, 14 | 
 |   psrad m4, 14 | 
 |   psrad m5, 14 | 
 |   packssdw m7, m4 | 
 |   packssdw m9, m5 | 
 |  | 
 |   ; sin(3 * pi / 16), cos(3 * pi / 16) | 
 |   punpcklwd m4, m10, m0 | 
 |   punpckhwd m10, m0 | 
 |   pmaddwd m5, m4, [GLOBAL(pw_9102_13623)] | 
 |   pmaddwd m4, [GLOBAL(pw_13623_m9102)] | 
 |   pmaddwd m0, m10, [GLOBAL(pw_9102_13623)] | 
 |   pmaddwd m10, [GLOBAL(pw_13623_m9102)] | 
 |   paddd m5, m8 | 
 |   paddd m4, m8 | 
 |   paddd m0, m8 | 
 |   paddd m10, m8 | 
 |   psrad m5, 14 | 
 |   psrad m4, 14 | 
 |   psrad m0, 14 | 
 |   psrad m10, 14 | 
 |   packssdw m5, m0 | 
 |   packssdw m4, m10 | 
 |  | 
 |   ; transpose | 
 |   ; the eight result vectors, in row order, are m2, m7, m1, m4, m6, m5, m3, m9 | 
 |   ; stage 1 | 
 |   punpcklwd m0, m2, m7 | 
 |   punpcklwd m10, m1, m4 | 
 |   punpckhwd m2, m7 | 
 |   punpckhwd m1, m4 | 
 |   punpcklwd m7, m6, m5 | 
 |   punpcklwd m4, m3, m9 | 
 |   punpckhwd m6, m5 | 
 |   punpckhwd m3, m9 | 
 |  | 
 |   ; stage 2 | 
 |   punpckldq m5, m0, m10 | 
 |   punpckldq m9, m2, m1 | 
 |   punpckhdq m0, m10 | 
 |   punpckhdq m2, m1 | 
 |   punpckldq m10, m7, m4 | 
 |   punpckldq m1, m6, m3 | 
 |   punpckhdq m7, m4 | 
 |   punpckhdq m6, m3 | 
 |  | 
 |   ; stage 3 | 
 |   punpcklqdq m4, m5, m10 | 
 |   punpckhqdq m5, m10 | 
 |   punpcklqdq m3, m0, m7 | 
 |   punpckhqdq m0, m7 | 
 |   punpcklqdq m10, m9, m1 | 
 |   punpckhqdq m9, m1 | 
 |   punpcklqdq m7, m2, m6 | 
 |   punpckhqdq m2, m6 | 
 |  | 
 |   ; final scaling: out = (x + (x < 0)) >> 1 | 
 |   ; psraw by 15 gives -1 in negative lanes, so the subtraction adds 1 there; | 
 |   ; m8 (pd_8192) is no longer needed and serves as scratch from here on | 
 |   psraw m1, m4, 15 | 
 |   psraw m6, m5, 15 | 
 |   psraw m8, m3, 15 | 
 |   psraw m11, m0, 15 | 
 |  | 
 |   psubw m4, m1 | 
 |   psubw m5, m6 | 
 |   psubw m3, m8 | 
 |   psubw m0, m11 | 
 |  | 
 |   psraw m4, 1 | 
 |   psraw m5, 1 | 
 |   psraw m3, 1 | 
 |   psraw m0, 1 | 
 |  | 
 |   psraw m1, m10, 15 | 
 |   psraw m6, m9, 15 | 
 |   psraw m8, m7, 15 | 
 |   psraw m11, m2, 15 | 
 |  | 
 |   psubw m10, m1 | 
 |   psubw m9, m6 | 
 |   psubw m7, m8 | 
 |   psubw m2, m11 | 
 |  | 
 |   psraw m10, 1 | 
 |   psraw m9, 1 | 
 |   psraw m7, 1 | 
 |   psraw m2, 1 | 
 |  | 
 |   ; write the eight rows; the index is in coefficients (8 per row). | 
 |   ; STORE_OUTPUT clobbers m11 and m12, which are free at this point. | 
 |   STORE_OUTPUT  0,  4 | 
 |   STORE_OUTPUT  8,  5 | 
 |   STORE_OUTPUT 16,  3 | 
 |   STORE_OUTPUT 24,  0 | 
 |   STORE_OUTPUT 32, 10 | 
 |   STORE_OUTPUT 40,  9 | 
 |   STORE_OUTPUT 48,  7 | 
 |   STORE_OUTPUT 56,  2 | 
 |  | 
 |   RET | 
 | %endif | 