| ; |
| ; Copyright (c) 2016, Alliance for Open Media. All rights reserved |
| ; |
| ; This source code is subject to the terms of the BSD 2 Clause License and |
| ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
| ; was not distributed with this source code in the LICENSE file, you can |
| ; obtain it at www.aomedia.org/license/software. If the Alliance for Open |
| ; Media Patent License 1.0 was not distributed with this source code in the |
| ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
| ; |
| |
| ; |
| |
| %include "third_party/x86inc/x86inc.asm" |
| |
| SECTION_RODATA |
| |
| pw_11585x2: times 8 dw 23170 |
| pd_8192: times 4 dd 8192 |
| |
| %macro TRANSFORM_COEFFS 2 |
| pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2 |
| pw_%2_m%1: dw %2, -%1, %2, -%1, %2, -%1, %2, -%1 |
| %endmacro |
| |
| TRANSFORM_COEFFS 11585, 11585 |
| TRANSFORM_COEFFS 15137, 6270 |
| TRANSFORM_COEFFS 16069, 3196 |
| TRANSFORM_COEFFS 9102, 13623 |
| |
| %macro STORE_OUTPUT 2 ; index, result |
| ; const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); |
| ; __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); |
| ; __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); |
| ; _mm_store_si128((__m128i *)(dst_ptr), out0); |
| ; _mm_store_si128((__m128i *)(dst_ptr + 4), out1); |
| pxor m11, m11 |
| pcmpgtw m11, m%2 |
| movdqa m12, m%2 |
| punpcklwd m%2, m11 |
| punpckhwd m12, m11 |
| mova [outputq + 4*%1 + 0], m%2 |
| mova [outputq + 4*%1 + 16], m12 |
| %endmacro |
| |
| SECTION .text |
| |
| %if ARCH_X86_64 |
| INIT_XMM ssse3 |
| cglobal fdct8x8, 3, 5, 13, input, output, stride |
| |
| mova m8, [GLOBAL(pd_8192)] |
| mova m12, [GLOBAL(pw_11585x2)] |
| |
| lea r3, [2 * strideq] |
| lea r4, [4 * strideq] |
| mova m0, [inputq] |
| mova m1, [inputq + r3] |
| lea inputq, [inputq + r4] |
| mova m2, [inputq] |
| mova m3, [inputq + r3] |
| lea inputq, [inputq + r4] |
| mova m4, [inputq] |
| mova m5, [inputq + r3] |
| lea inputq, [inputq + r4] |
| mova m6, [inputq] |
| mova m7, [inputq + r3] |
| |
| ; left shift by 2 to increase forward transformation precision |
| psllw m0, 2 |
| psllw m1, 2 |
| psllw m2, 2 |
| psllw m3, 2 |
| psllw m4, 2 |
| psllw m5, 2 |
| psllw m6, 2 |
| psllw m7, 2 |
| |
| ; column transform |
| ; stage 1 |
| paddw m10, m0, m7 |
| psubw m0, m7 |
| |
| paddw m9, m1, m6 |
| psubw m1, m6 |
| |
| paddw m7, m2, m5 |
| psubw m2, m5 |
| |
| paddw m6, m3, m4 |
| psubw m3, m4 |
| |
| ; stage 2 |
| paddw m5, m9, m7 |
| psubw m9, m7 |
| |
| paddw m4, m10, m6 |
| psubw m10, m6 |
| |
| paddw m7, m1, m2 |
| psubw m1, m2 |
| |
| ; stage 3 |
| paddw m6, m4, m5 |
| psubw m4, m5 |
| |
| pmulhrsw m1, m12 |
| pmulhrsw m7, m12 |
| |
| ; sin(pi / 8), cos(pi / 8) |
| punpcklwd m2, m10, m9 |
| punpckhwd m10, m9 |
| pmaddwd m5, m2, [GLOBAL(pw_15137_6270)] |
| pmaddwd m2, [GLOBAL(pw_6270_m15137)] |
| pmaddwd m9, m10, [GLOBAL(pw_15137_6270)] |
| pmaddwd m10, [GLOBAL(pw_6270_m15137)] |
| paddd m5, m8 |
| paddd m2, m8 |
| paddd m9, m8 |
| paddd m10, m8 |
| psrad m5, 14 |
| psrad m2, 14 |
| psrad m9, 14 |
| psrad m10, 14 |
| packssdw m5, m9 |
| packssdw m2, m10 |
| |
| pmulhrsw m6, m12 |
| pmulhrsw m4, m12 |
| |
| paddw m9, m3, m1 |
| psubw m3, m1 |
| |
| paddw m10, m0, m7 |
| psubw m0, m7 |
| |
| ; stage 4 |
| ; sin(pi / 16), cos(pi / 16) |
| punpcklwd m1, m10, m9 |
| punpckhwd m10, m9 |
| pmaddwd m7, m1, [GLOBAL(pw_16069_3196)] |
| pmaddwd m1, [GLOBAL(pw_3196_m16069)] |
| pmaddwd m9, m10, [GLOBAL(pw_16069_3196)] |
| pmaddwd m10, [GLOBAL(pw_3196_m16069)] |
| paddd m7, m8 |
| paddd m1, m8 |
| paddd m9, m8 |
| paddd m10, m8 |
| psrad m7, 14 |
| psrad m1, 14 |
| psrad m9, 14 |
| psrad m10, 14 |
| packssdw m7, m9 |
| packssdw m1, m10 |
| |
| ; sin(3 * pi / 16), cos(3 * pi / 16) |
| punpcklwd m11, m0, m3 |
| punpckhwd m0, m3 |
| pmaddwd m9, m11, [GLOBAL(pw_9102_13623)] |
| pmaddwd m11, [GLOBAL(pw_13623_m9102)] |
| pmaddwd m3, m0, [GLOBAL(pw_9102_13623)] |
| pmaddwd m0, [GLOBAL(pw_13623_m9102)] |
| paddd m9, m8 |
| paddd m11, m8 |
| paddd m3, m8 |
| paddd m0, m8 |
| psrad m9, 14 |
| psrad m11, 14 |
| psrad m3, 14 |
| psrad m0, 14 |
| packssdw m9, m3 |
| packssdw m11, m0 |
| |
| ; transpose |
| ; stage 1 |
| punpcklwd m0, m6, m7 |
| punpcklwd m3, m5, m11 |
| punpckhwd m6, m7 |
| punpckhwd m5, m11 |
| punpcklwd m7, m4, m9 |
| punpcklwd m10, m2, m1 |
| punpckhwd m4, m9 |
| punpckhwd m2, m1 |
| |
| ; stage 2 |
| punpckldq m9, m0, m3 |
| punpckldq m1, m6, m5 |
| punpckhdq m0, m3 |
| punpckhdq m6, m5 |
| punpckldq m3, m7, m10 |
| punpckldq m5, m4, m2 |
| punpckhdq m7, m10 |
| punpckhdq m4, m2 |
| |
| ; stage 3 |
| punpcklqdq m10, m9, m3 |
| punpckhqdq m9, m3 |
| punpcklqdq m2, m0, m7 |
| punpckhqdq m0, m7 |
| punpcklqdq m3, m1, m5 |
| punpckhqdq m1, m5 |
| punpcklqdq m7, m6, m4 |
| punpckhqdq m6, m4 |
| |
| ; row transform |
| ; stage 1 |
| paddw m5, m10, m6 |
| psubw m10, m6 |
| |
| paddw m4, m9, m7 |
| psubw m9, m7 |
| |
| paddw m6, m2, m1 |
| psubw m2, m1 |
| |
| paddw m7, m0, m3 |
| psubw m0, m3 |
| |
| ;stage 2 |
| paddw m1, m5, m7 |
| psubw m5, m7 |
| |
| paddw m3, m4, m6 |
| psubw m4, m6 |
| |
| paddw m7, m9, m2 |
| psubw m9, m2 |
| |
| ; stage 3 |
| punpcklwd m6, m1, m3 |
| punpckhwd m1, m3 |
| pmaddwd m2, m6, [GLOBAL(pw_11585_11585)] |
| pmaddwd m6, [GLOBAL(pw_11585_m11585)] |
| pmaddwd m3, m1, [GLOBAL(pw_11585_11585)] |
| pmaddwd m1, [GLOBAL(pw_11585_m11585)] |
| paddd m2, m8 |
| paddd m6, m8 |
| paddd m3, m8 |
| paddd m1, m8 |
| psrad m2, 14 |
| psrad m6, 14 |
| psrad m3, 14 |
| psrad m1, 14 |
| packssdw m2, m3 |
| packssdw m6, m1 |
| |
| pmulhrsw m7, m12 |
| pmulhrsw m9, m12 |
| |
| punpcklwd m3, m5, m4 |
| punpckhwd m5, m4 |
| pmaddwd m1, m3, [GLOBAL(pw_15137_6270)] |
| pmaddwd m3, [GLOBAL(pw_6270_m15137)] |
| pmaddwd m4, m5, [GLOBAL(pw_15137_6270)] |
| pmaddwd m5, [GLOBAL(pw_6270_m15137)] |
| paddd m1, m8 |
| paddd m3, m8 |
| paddd m4, m8 |
| paddd m5, m8 |
| psrad m1, 14 |
| psrad m3, 14 |
| psrad m4, 14 |
| psrad m5, 14 |
| packssdw m1, m4 |
| packssdw m3, m5 |
| |
| paddw m4, m0, m9 |
| psubw m0, m9 |
| |
| paddw m5, m10, m7 |
| psubw m10, m7 |
| |
| ; stage 4 |
| punpcklwd m9, m5, m4 |
| punpckhwd m5, m4 |
| pmaddwd m7, m9, [GLOBAL(pw_16069_3196)] |
| pmaddwd m9, [GLOBAL(pw_3196_m16069)] |
| pmaddwd m4, m5, [GLOBAL(pw_16069_3196)] |
| pmaddwd m5, [GLOBAL(pw_3196_m16069)] |
| paddd m7, m8 |
| paddd m9, m8 |
| paddd m4, m8 |
| paddd m5, m8 |
| psrad m7, 14 |
| psrad m9, 14 |
| psrad m4, 14 |
| psrad m5, 14 |
| packssdw m7, m4 |
| packssdw m9, m5 |
| |
| punpcklwd m4, m10, m0 |
| punpckhwd m10, m0 |
| pmaddwd m5, m4, [GLOBAL(pw_9102_13623)] |
| pmaddwd m4, [GLOBAL(pw_13623_m9102)] |
| pmaddwd m0, m10, [GLOBAL(pw_9102_13623)] |
| pmaddwd m10, [GLOBAL(pw_13623_m9102)] |
| paddd m5, m8 |
| paddd m4, m8 |
| paddd m0, m8 |
| paddd m10, m8 |
| psrad m5, 14 |
| psrad m4, 14 |
| psrad m0, 14 |
| psrad m10, 14 |
| packssdw m5, m0 |
| packssdw m4, m10 |
| |
| ; transpose |
| ; stage 1 |
| punpcklwd m0, m2, m7 |
| punpcklwd m10, m1, m4 |
| punpckhwd m2, m7 |
| punpckhwd m1, m4 |
| punpcklwd m7, m6, m5 |
| punpcklwd m4, m3, m9 |
| punpckhwd m6, m5 |
| punpckhwd m3, m9 |
| |
| ; stage 2 |
| punpckldq m5, m0, m10 |
| punpckldq m9, m2, m1 |
| punpckhdq m0, m10 |
| punpckhdq m2, m1 |
| punpckldq m10, m7, m4 |
| punpckldq m1, m6, m3 |
| punpckhdq m7, m4 |
| punpckhdq m6, m3 |
| |
| ; stage 3 |
| punpcklqdq m4, m5, m10 |
| punpckhqdq m5, m10 |
| punpcklqdq m3, m0, m7 |
| punpckhqdq m0, m7 |
| punpcklqdq m10, m9, m1 |
| punpckhqdq m9, m1 |
| punpcklqdq m7, m2, m6 |
| punpckhqdq m2, m6 |
| |
| psraw m1, m4, 15 |
| psraw m6, m5, 15 |
| psraw m8, m3, 15 |
| psraw m11, m0, 15 |
| |
| psubw m4, m1 |
| psubw m5, m6 |
| psubw m3, m8 |
| psubw m0, m11 |
| |
| psraw m4, 1 |
| psraw m5, 1 |
| psraw m3, 1 |
| psraw m0, 1 |
| |
| psraw m1, m10, 15 |
| psraw m6, m9, 15 |
| psraw m8, m7, 15 |
| psraw m11, m2, 15 |
| |
| psubw m10, m1 |
| psubw m9, m6 |
| psubw m7, m8 |
| psubw m2, m11 |
| |
| psraw m10, 1 |
| psraw m9, 1 |
| psraw m7, 1 |
| psraw m2, 1 |
| |
| STORE_OUTPUT 0, 4 |
| STORE_OUTPUT 8, 5 |
| STORE_OUTPUT 16, 3 |
| STORE_OUTPUT 24, 0 |
| STORE_OUTPUT 32, 10 |
| STORE_OUTPUT 40, 9 |
| STORE_OUTPUT 48, 7 |
| STORE_OUTPUT 56, 2 |
| |
| RET |
| %endif |