| /* |
| * Copyright 2013 The LibYuv Project Authors. All rights reserved. |
| * |
| * Use of this source code is governed by a BSD-style license |
| * that can be found in the LICENSE file in the root of the source |
| * tree. An additional intellectual property rights grant can be found |
| * in the file PATENTS. All contributing project authors may |
| * be found in the AUTHORS file in the root of the source tree. |
| */ |
| |
| #include "libyuv/row.h" |
| #include "libyuv/rotate_row.h" |
| |
| #ifdef __cplusplus |
| namespace libyuv { |
| extern "C" { |
| #endif |
| |
| // This module is for 32-bit x86 builds using Visual C++ inline assembly (clang is excluded). |
| #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \ |
| defined(_MSC_VER) && !defined(__clang__) |
| |
// Transposes an 8-row strip of bytes. Every loop iteration reads an 8x8 tile
// (8 bytes from each of 8 consecutive source rows) and stores it as 8
// destination rows of 8 bytes, so source column k becomes destination row k.
//
// src, src_stride: first source row and the byte step between source rows.
// dst, dst_stride: first destination row and the byte step between rows.
// width: number of source columns; consumed 8 per iteration.
//
// NOTE: the loop is bottom-tested (sub ecx, 8 / jg), so one full tile is
// always processed and a width that is not a multiple of 8 is rounded up to
// a whole tile. Presumably callers pass a positive multiple of 8 - confirm.
//
// Register roles: eax = src cursor, edi = src_stride, edx = dst cursor,
// esi = dst_stride, ecx = columns remaining, ebp = start of the next tile.
// palignr is an SSSE3 instruction, hence the function name suffix.
| __declspec(naked) |
| void TransposeWx8_SSSE3(const uint8* src, int src_stride, |
| uint8* dst, int dst_stride, int width) { |
| __asm { |
// Naked function: there is no compiler-generated prologue, so the registers
// are saved by hand and the arguments are addressed relative to esp.
| push edi |
| push esi |
| push ebp |
// 12 = the three registers pushed above; the extra 4 skips the return address.
| mov eax, [esp + 12 + 4] // src |
| mov edi, [esp + 12 + 8] // src_stride |
| mov edx, [esp + 12 + 12] // dst |
| mov esi, [esp + 12 + 16] // dst_stride |
| mov ecx, [esp + 12 + 20] // width |
| |
| // Read 8 bytes from each of the 8 source rows. |
| // First round: interleave the bytes of adjacent row pairs (0/1, 2/3, 4/5, 6/7). |
| align 4 |
| convertloop: |
| movq xmm0, qword ptr [eax] |
// ebp remembers where the next tile starts, because eax walks down the rows.
| lea ebp, [eax + 8] |
| movq xmm1, qword ptr [eax + edi] |
| lea eax, [eax + 2 * edi] |
| punpcklbw xmm0, xmm1 |
| movq xmm2, qword ptr [eax] |
// palignr with identical operands rotates the register by 8 bytes, which
// brings the upper 8 bytes (columns 4..7) into the low half so the following
// punpckl* instructions can reach them.
| movdqa xmm1, xmm0 |
| palignr xmm1, xmm1, 8 |
| movq xmm3, qword ptr [eax + edi] |
| lea eax, [eax + 2 * edi] |
| punpcklbw xmm2, xmm3 |
| movdqa xmm3, xmm2 |
| movq xmm4, qword ptr [eax] |
| palignr xmm3, xmm3, 8 |
| movq xmm5, qword ptr [eax + edi] |
| punpcklbw xmm4, xmm5 |
| lea eax, [eax + 2 * edi] |
| movdqa xmm5, xmm4 |
| movq xmm6, qword ptr [eax] |
| palignr xmm5, xmm5, 8 |
| movq xmm7, qword ptr [eax + edi] |
| punpcklbw xmm6, xmm7 |
// All 8 rows are loaded; move the source cursor to the next tile.
| mov eax, ebp |
| movdqa xmm7, xmm6 |
| palignr xmm7, xmm7, 8 |
| // Second round: interleave 16-bit row pairs, giving 4 rows of one column per dword. |
| punpcklwd xmm0, xmm2 |
| punpcklwd xmm1, xmm3 |
| movdqa xmm2, xmm0 |
| movdqa xmm3, xmm1 |
| palignr xmm2, xmm2, 8 |
| palignr xmm3, xmm3, 8 |
| punpcklwd xmm4, xmm6 |
| punpcklwd xmm5, xmm7 |
| movdqa xmm6, xmm4 |
| movdqa xmm7, xmm5 |
| palignr xmm6, xmm6, 8 |
| palignr xmm7, xmm7, 8 |
| // Third round: interleave dwords so that each qword holds one full column (8 rows). |
| // Write each column to the destination as one 8-byte row. |
| punpckldq xmm0, xmm4 |
| movq qword ptr [edx], xmm0 |
| movdqa xmm4, xmm0 |
| palignr xmm4, xmm4, 8 |
| movq qword ptr [edx + esi], xmm4 |
| lea edx, [edx + 2 * esi] |
| punpckldq xmm2, xmm6 |
| movdqa xmm6, xmm2 |
| palignr xmm6, xmm6, 8 |
| movq qword ptr [edx], xmm2 |
| punpckldq xmm1, xmm5 |
| movq qword ptr [edx + esi], xmm6 |
| lea edx, [edx + 2 * esi] |
| movdqa xmm5, xmm1 |
| movq qword ptr [edx], xmm1 |
| palignr xmm5, xmm5, 8 |
| punpckldq xmm3, xmm7 |
| movq qword ptr [edx + esi], xmm5 |
| lea edx, [edx + 2 * esi] |
| movq qword ptr [edx], xmm3 |
| movdqa xmm7, xmm3 |
| palignr xmm7, xmm7, 8 |
// The movq and lea that follow do not modify EFLAGS, so the jg below still
// tests the result of this subtraction (loop while columns remaining > 0).
| sub ecx, 8 |
| movq qword ptr [edx + esi], xmm7 |
| lea edx, [edx + 2 * esi] |
| jg convertloop |
| |
// Restore the registers saved on entry, in reverse push order.
| pop ebp |
| pop esi |
| pop edi |
| ret |
| } |
| } |
| |
// Transposes an 8-row strip of interleaved byte pairs and splits the two
// channels (U and V, going by the function name). Every loop iteration reads
// 16 bytes (8 pairs) from each of 8 consecutive source rows, then writes
// 8 rows of 8 bytes to dst_a (the first byte of every pair) and 8 rows of
// 8 bytes to dst_b (the second byte of every pair). Source pair column k
// becomes row k of both destinations.
//
// src, src_stride: first source row and the byte step between source rows.
// dst_a, dst_stride_a: first output plane and its byte step between rows.
// dst_b, dst_stride_b: second output plane and its byte step between rows.
// w: number of byte pairs per source row; consumed 8 per iteration.
//
// NOTE: the loop is bottom-tested (sub ecx, 8 / jg), so one full tile is
// always processed and a w that is not a multiple of 8 is rounded up to a
// whole tile. Presumably callers pass a positive multiple of 8 - confirm.
//
// Register roles: eax = src cursor, edi = src_stride, edx = dst_a cursor,
// esi = dst_stride_a, ebx = dst_b cursor, ebp = dst_stride_b,
// ecx = pairs remaining. All eight xmm registers carry row data, so a 16-byte
// stack slot is used to spill one register while another serves as a temp.
| __declspec(naked) |
| void TransposeUVWx8_SSE2(const uint8* src, int src_stride, |
| uint8* dst_a, int dst_stride_a, |
| uint8* dst_b, int dst_stride_b, |
| int w) { |
| __asm { |
// Naked function: registers are saved by hand and arguments are addressed
// relative to esp. 16 = the four registers pushed here; the extra 4 in the
// first argument offset skips the return address.
| push ebx |
| push esi |
| push edi |
| push ebp |
| mov eax, [esp + 16 + 4] // src |
| mov edi, [esp + 16 + 8] // src_stride |
| mov edx, [esp + 16 + 12] // dst_a |
| mov esi, [esp + 16 + 16] // dst_stride_a |
| mov ebx, [esp + 16 + 20] // dst_b |
| mov ebp, [esp + 16 + 24] // dst_stride_b |
// Carve out a 16-byte-aligned spill slot at [esp .. esp + 15] and store the
// pre-alignment esp at [esp + 16] so it can be restored before the pops.
// esp is 16-byte aligned after the and, although the spills below use the
// unaligned movdqu form.
| mov ecx, esp |
| sub esp, 4 + 16 |
| and esp, ~15 |
| mov [esp + 16], ecx |
// ecx still holds the original esp here, so the last argument is read through it.
| mov ecx, [ecx + 16 + 28] // w |
| |
| align 4 |
| convertloop: |
| // Read 16 bytes from each of the 8 source rows. |
| // First round: interleave the bytes of adjacent row pairs (low and high halves). |
| movdqu xmm0, [eax] |
| movdqu xmm1, [eax + edi] |
| lea eax, [eax + 2 * edi] |
| movdqa xmm7, xmm0 // use xmm7 as temp register. |
| punpcklbw xmm0, xmm1 |
| punpckhbw xmm7, xmm1 |
| movdqa xmm1, xmm7 |
| movdqu xmm2, [eax] |
| movdqu xmm3, [eax + edi] |
| lea eax, [eax + 2 * edi] |
| movdqa xmm7, xmm2 |
| punpcklbw xmm2, xmm3 |
| punpckhbw xmm7, xmm3 |
| movdqa xmm3, xmm7 |
| movdqu xmm4, [eax] |
| movdqu xmm5, [eax + edi] |
| lea eax, [eax + 2 * edi] |
| movdqa xmm7, xmm4 |
| punpcklbw xmm4, xmm5 |
| punpckhbw xmm7, xmm5 |
| movdqa xmm5, xmm7 |
| movdqu xmm6, [eax] |
| movdqu xmm7, [eax + edi] |
| lea eax, [eax + 2 * edi] |
// xmm7 now holds row data, so xmm5 is spilled to the stack slot and reused
// as the temp for the last row pair.
| movdqu [esp], xmm5 // backup xmm5 |
// The addressing mode cannot subtract a scaled index, so edi is negated
// around the lea that rewinds eax by the 8 rows just read and advances it
// 16 bytes to the next tile.
| neg edi |
| movdqa xmm5, xmm6 // use xmm5 as temp register. |
| punpcklbw xmm6, xmm7 |
| punpckhbw xmm5, xmm7 |
| movdqa xmm7, xmm5 |
| lea eax, [eax + 8 * edi + 16] |
| neg edi |
| // Second round: interleave 16-bit row pairs, giving 4 rows of one byte column per dword. |
| movdqa xmm5, xmm0 |
| punpcklwd xmm0, xmm2 |
| punpckhwd xmm5, xmm2 |
| movdqa xmm2, xmm5 |
| movdqa xmm5, xmm1 |
| punpcklwd xmm1, xmm3 |
| punpckhwd xmm5, xmm3 |
| movdqa xmm3, xmm5 |
| movdqa xmm5, xmm4 |
| punpcklwd xmm4, xmm6 |
| punpckhwd xmm5, xmm6 |
| movdqa xmm6, xmm5 |
| movdqu xmm5, [esp] // restore xmm5 |
| movdqu [esp], xmm6 // backup xmm6 |
| movdqa xmm6, xmm5 // use xmm6 as temp register. |
| punpcklwd xmm5, xmm7 |
| punpckhwd xmm6, xmm7 |
| movdqa xmm7, xmm6 |
| // Third round: interleave dwords so the low qword is one dst_a row and the high qword one dst_b row. |
| // Write to the destination pointers: movlpd stores the low qword, movhpd the high qword. |
| movdqa xmm6, xmm0 |
| punpckldq xmm0, xmm4 |
| punpckhdq xmm6, xmm4 |
| movdqa xmm4, xmm6 |
| movdqu xmm6, [esp] // restore xmm6 |
| movlpd qword ptr [edx], xmm0 |
| movhpd qword ptr [ebx], xmm0 |
| movlpd qword ptr [edx + esi], xmm4 |
| lea edx, [edx + 2 * esi] |
| movhpd qword ptr [ebx + ebp], xmm4 |
| lea ebx, [ebx + 2 * ebp] |
| movdqa xmm0, xmm2 // use xmm0 as the temp register. |
| punpckldq xmm2, xmm6 |
| movlpd qword ptr [edx], xmm2 |
| movhpd qword ptr [ebx], xmm2 |
| punpckhdq xmm0, xmm6 |
| movlpd qword ptr [edx + esi], xmm0 |
| lea edx, [edx + 2 * esi] |
| movhpd qword ptr [ebx + ebp], xmm0 |
| lea ebx, [ebx + 2 * ebp] |
| movdqa xmm0, xmm1 // use xmm0 as the temp register. |
| punpckldq xmm1, xmm5 |
| movlpd qword ptr [edx], xmm1 |
| movhpd qword ptr [ebx], xmm1 |
| punpckhdq xmm0, xmm5 |
| movlpd qword ptr [edx + esi], xmm0 |
| lea edx, [edx + 2 * esi] |
| movhpd qword ptr [ebx + ebp], xmm0 |
| lea ebx, [ebx + 2 * ebp] |
| movdqa xmm0, xmm3 // use xmm0 as the temp register. |
| punpckldq xmm3, xmm7 |
| movlpd qword ptr [edx], xmm3 |
| movhpd qword ptr [ebx], xmm3 |
| punpckhdq xmm0, xmm7 |
// The stores and lea instructions that follow do not modify EFLAGS, so the
// jg below still tests the result of this subtraction (loop while pairs
// remaining > 0).
| sub ecx, 8 |
| movlpd qword ptr [edx + esi], xmm0 |
| lea edx, [edx + 2 * esi] |
| movhpd qword ptr [ebx + ebp], xmm0 |
| lea ebx, [ebx + 2 * ebp] |
| jg convertloop |
| |
// Drop the aligned scratch area by reloading the esp saved at [esp + 16],
// then restore the registers saved on entry in reverse push order.
| mov esp, [esp + 16] |
| pop ebp |
| pop edi |
| pop esi |
| pop ebx |
| ret |
| } |
| } |
| |
| #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) && !defined(__clang__) |
| |
| #ifdef __cplusplus |
| } // extern "C" |
| } // namespace libyuv |
| #endif |