|  | /* | 
|  | *  Copyright 2013 The LibYuv Project Authors. All rights reserved. | 
|  | * | 
|  | *  Use of this source code is governed by a BSD-style license | 
|  | *  that can be found in the LICENSE file in the root of the source | 
|  | *  tree. An additional intellectual property rights grant can be found | 
|  | *  in the file PATENTS. All contributing project authors may | 
|  | *  be found in the AUTHORS file in the root of the source tree. | 
|  | */ | 
|  |  | 
|  | #include "libyuv/rotate_row.h" | 
|  | #include "libyuv/row.h" | 
|  |  | 
|  | #ifdef __cplusplus | 
|  | namespace libyuv { | 
|  | extern "C" { | 
|  | #endif | 
|  |  | 
|  | // This module is for 32 bit Visual C x86 and clangcl | 
|  | #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) | 
|  |  | 
|  | /* Transposes an 8-row by `width`-column block of bytes: byte j of source | 
|  | *  row i is stored as byte i of destination row j. | 
|  | * | 
|  | *  src / src_stride: first source row and the byte distance between rows. | 
|  | *  dst / dst_stride: first destination row and the byte distance between rows. | 
|  | *  width: number of source columns; every loop pass consumes 8 of them. | 
|  | * | 
|  | *  The count is tested only at the bottom of the loop, so one pass always | 
|  | *  runs, and each pass always reads 8 bytes from 8 rows and writes 8 rows | 
|  | *  of 8 bytes. NOTE(review): callers presumably pass a positive multiple | 
|  | *  of 8 - confirm at the call sites. | 
|  | * | 
|  | *  The function is naked, so the arguments are read straight off the stack | 
|  | *  and edi/esi/ebp are saved and restored by hand. eax, ecx, edx and | 
|  | *  xmm0-xmm7 are overwritten and not restored. | 
|  | */ __declspec(naked) void TransposeWx8_SSSE3(const uint8_t* src, | 
|  | int src_stride, | 
|  | uint8_t* dst, | 
|  | int dst_stride, | 
|  | int width) { | 
|  | __asm { | 
|  | push      edi | 
|  | push      esi | 
|  | push      ebp | 
|  | mov       eax, [esp + 12 + 4]  // src (12 = the three pushes above, 4 = return address) | 
|  | mov       edi, [esp + 12 + 8]  // src_stride | 
|  | mov       edx, [esp + 12 + 12]  // dst | 
|  | mov       esi, [esp + 12 + 16]  // dst_stride | 
|  | mov       ecx, [esp + 12 + 20]  // width (columns still to process) | 
|  |  | 
|  | // Each pass loads 8 bytes from each of 8 consecutive source rows. | 
|  | // First round: interleave the bytes of each pair of adjacent rows. | 
|  | align      4 | 
|  | convertloop: | 
|  | movq      xmm0, qword ptr [eax] | 
|  | lea       ebp, [eax + 8]  /* ebp = source address for the next pass (8 columns on) */ | 
|  | movq      xmm1, qword ptr [eax + edi] | 
|  | lea       eax, [eax + 2 * edi] | 
|  | punpcklbw xmm0, xmm1  /* xmm0 = rows 0 and 1, byte-interleaved */ | 
|  | movq      xmm2, qword ptr [eax] | 
|  | movdqa    xmm1, xmm0 | 
|  | palignr   xmm1, xmm1, 8  /* swap the 64-bit halves so the upper half is reachable by the punpckl* below */ | 
|  | movq      xmm3, qword ptr [eax + edi] | 
|  | lea       eax, [eax + 2 * edi] | 
|  | punpcklbw xmm2, xmm3  /* xmm2 = rows 2 and 3, byte-interleaved */ | 
|  | movdqa    xmm3, xmm2 | 
|  | movq      xmm4, qword ptr [eax] | 
|  | palignr   xmm3, xmm3, 8 | 
|  | movq      xmm5, qword ptr [eax + edi] | 
|  | punpcklbw xmm4, xmm5  /* xmm4 = rows 4 and 5, byte-interleaved */ | 
|  | lea       eax, [eax + 2 * edi] | 
|  | movdqa    xmm5, xmm4 | 
|  | movq      xmm6, qword ptr [eax] | 
|  | palignr   xmm5, xmm5, 8 | 
|  | movq      xmm7, qword ptr [eax + edi] | 
|  | punpcklbw xmm6, xmm7  /* xmm6 = rows 6 and 7, byte-interleaved */ | 
|  | mov       eax, ebp  /* back to the first row, 8 columns further on */ | 
|  | movdqa    xmm7, xmm6 | 
|  | palignr   xmm7, xmm7, 8 | 
|  | // Second round: interleave 16-bit units, so each 32-bit group holds one column of 4 rows. | 
|  | punpcklwd xmm0, xmm2 | 
|  | punpcklwd xmm1, xmm3 | 
|  | movdqa    xmm2, xmm0 | 
|  | movdqa    xmm3, xmm1 | 
|  | palignr   xmm2, xmm2, 8 | 
|  | palignr   xmm3, xmm3, 8 | 
|  | punpcklwd xmm4, xmm6 | 
|  | punpcklwd xmm5, xmm7 | 
|  | movdqa    xmm6, xmm4 | 
|  | movdqa    xmm7, xmm5 | 
|  | palignr   xmm6, xmm6, 8 | 
|  | palignr   xmm7, xmm7, 8 | 
|  | // Third round: interleave 32-bit units, so each 64-bit half holds one column of all 8 rows. | 
|  | // Each half is then written out as one 8-byte destination row. | 
|  | punpckldq xmm0, xmm4  /* xmm0 = output rows 0 and 1 */ | 
|  | movq      qword ptr [edx], xmm0 | 
|  | movdqa    xmm4, xmm0 | 
|  | palignr   xmm4, xmm4, 8  /* bring the upper half down so movq can store it */ | 
|  | movq      qword ptr [edx + esi], xmm4 | 
|  | lea       edx, [edx + 2 * esi] | 
|  | punpckldq xmm2, xmm6  /* xmm2 = output rows 2 and 3 */ | 
|  | movdqa    xmm6, xmm2 | 
|  | palignr   xmm6, xmm6, 8 | 
|  | movq      qword ptr [edx], xmm2 | 
|  | punpckldq xmm1, xmm5  /* xmm1 = output rows 4 and 5 */ | 
|  | movq      qword ptr [edx + esi], xmm6 | 
|  | lea       edx, [edx + 2 * esi] | 
|  | movdqa    xmm5, xmm1 | 
|  | movq      qword ptr [edx], xmm1 | 
|  | palignr   xmm5, xmm5, 8 | 
|  | punpckldq xmm3, xmm7  /* xmm3 = output rows 6 and 7 */ | 
|  | movq      qword ptr [edx + esi], xmm5 | 
|  | lea       edx, [edx + 2 * esi] | 
|  | movq      qword ptr [edx], xmm3 | 
|  | movdqa    xmm7, xmm3 | 
|  | palignr   xmm7, xmm7, 8 | 
|  | sub       ecx, 8  /* 8 columns done; the movq and lea below leave the flags intact for jg */ | 
|  | movq      qword ptr [edx + esi], xmm7 | 
|  | lea       edx, [edx + 2 * esi] | 
|  | jg        convertloop  /* repeat while columns remain */ | 
|  |  | 
|  | pop       ebp | 
|  | pop       esi | 
|  | pop       edi | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | /* Transposes an 8-row block of interleaved byte pairs and de-interleaves it: | 
|  | *  for pair column j of source row i, the first byte is stored as byte i of | 
|  | *  dst_a row j and the second byte as byte i of dst_b row j. | 
|  | *  NOTE(review): judging by the name, the pairs are U/V samples - confirm. | 
|  | * | 
|  | *  src / src_stride: first source row and the byte distance between rows. | 
|  | *  dst_a / dst_stride_a: destination for the first byte of each pair. | 
|  | *  dst_b / dst_stride_b: destination for the second byte of each pair. | 
|  | *  w: number of byte pairs per source row; every loop pass consumes 8 | 
|  | *  (16 source bytes per row). | 
|  | * | 
|  | *  The count is tested only at the bottom of the loop, so one pass always | 
|  | *  runs and every pass handles a full 8 pairs by 8 rows. NOTE(review): | 
|  | *  callers presumably pass a positive multiple of 8 - confirm. | 
|  | * | 
|  | *  The function is naked: arguments are read straight off the stack, | 
|  | *  ebx/esi/edi/ebp are saved and restored by hand, and a 16-byte-aligned | 
|  | *  scratch slot is carved out below the saved registers because the | 
|  | *  transpose needs one more temporary than the eight xmm registers. | 
|  | *  eax, ecx, edx and xmm0-xmm7 are overwritten and not restored. | 
|  | */ __declspec(naked) void TransposeUVWx8_SSE2(const uint8_t* src, | 
|  | int src_stride, | 
|  | uint8_t* dst_a, | 
|  | int dst_stride_a, | 
|  | uint8_t* dst_b, | 
|  | int dst_stride_b, | 
|  | int w) { | 
|  | __asm { | 
|  | push      ebx | 
|  | push      esi | 
|  | push      edi | 
|  | push      ebp | 
|  | mov       eax, [esp + 16 + 4]  // src (16 = the four pushes above, 4 = return address) | 
|  | mov       edi, [esp + 16 + 8]  // src_stride | 
|  | mov       edx, [esp + 16 + 12]  // dst_a | 
|  | mov       esi, [esp + 16 + 16]  // dst_stride_a | 
|  | mov       ebx, [esp + 16 + 20]  // dst_b | 
|  | mov       ebp, [esp + 16 + 24]  // dst_stride_b | 
|  | mov       ecx, esp  /* remember the unaligned esp */ | 
|  | sub       esp, 4 + 16  /* 16-byte spill slot plus 4 bytes for the saved esp */ | 
|  | and       esp, ~15  /* round down so the spill slot at [esp] is 16-byte aligned */ | 
|  | mov       [esp + 16], ecx  /* saved esp sits just above the spill slot */ | 
|  | mov       ecx, [ecx + 16 + 28]  // w (read through the original esp, since esp has moved) | 
|  |  | 
|  | align      4 | 
|  | // Each pass loads 16 bytes (8 byte pairs) from each of 8 consecutive source rows. | 
|  | // First round: interleave the bytes of each pair of adjacent rows (low and high halves). | 
|  | convertloop: | 
|  | movdqu    xmm0, [eax] | 
|  | movdqu    xmm1, [eax + edi] | 
|  | lea       eax, [eax + 2 * edi] | 
|  | movdqa    xmm7, xmm0  // use xmm7 as temp register (copy of row 0 for the high half). | 
|  | punpcklbw xmm0, xmm1  /* xmm0 = rows 0 and 1, low 8 bytes interleaved */ | 
|  | punpckhbw xmm7, xmm1 | 
|  | movdqa    xmm1, xmm7  /* xmm1 = rows 0 and 1, high 8 bytes interleaved */ | 
|  | movdqu    xmm2, [eax] | 
|  | movdqu    xmm3, [eax + edi] | 
|  | lea       eax, [eax + 2 * edi] | 
|  | movdqa    xmm7, xmm2 | 
|  | punpcklbw xmm2, xmm3  /* xmm2 = rows 2 and 3, low 8 bytes interleaved */ | 
|  | punpckhbw xmm7, xmm3 | 
|  | movdqa    xmm3, xmm7  /* xmm3 = rows 2 and 3, high 8 bytes interleaved */ | 
|  | movdqu    xmm4, [eax] | 
|  | movdqu    xmm5, [eax + edi] | 
|  | lea       eax, [eax + 2 * edi] | 
|  | movdqa    xmm7, xmm4 | 
|  | punpcklbw xmm4, xmm5  /* xmm4 = rows 4 and 5, low 8 bytes interleaved */ | 
|  | punpckhbw xmm7, xmm5 | 
|  | movdqa    xmm5, xmm7  /* xmm5 = rows 4 and 5, high 8 bytes interleaved */ | 
|  | movdqu    xmm6, [eax] | 
|  | movdqu    xmm7, [eax + edi] | 
|  | lea       eax, [eax + 2 * edi] | 
|  | movdqu    [esp], xmm5  // backup xmm5 in the stack slot; all eight xmm registers are live here | 
|  | neg       edi  /* negate the stride so the lea below can step back 8 rows */ | 
|  | movdqa    xmm5, xmm6  // use xmm5 as temp register. | 
|  | punpcklbw xmm6, xmm7  /* xmm6 = rows 6 and 7, low 8 bytes interleaved */ | 
|  | punpckhbw xmm5, xmm7 | 
|  | movdqa    xmm7, xmm5  /* xmm7 = rows 6 and 7, high 8 bytes interleaved */ | 
|  | lea       eax, [eax + 8 * edi + 16]  /* back to the first row, 16 bytes (8 pairs) further on */ | 
|  | neg       edi  /* restore the positive stride */ | 
|  | // Second round: interleave 16-bit units, so each 32-bit group holds one byte column of 4 rows. | 
|  | movdqa    xmm5, xmm0 | 
|  | punpcklwd xmm0, xmm2 | 
|  | punpckhwd xmm5, xmm2 | 
|  | movdqa    xmm2, xmm5 | 
|  | movdqa    xmm5, xmm1 | 
|  | punpcklwd xmm1, xmm3 | 
|  | punpckhwd xmm5, xmm3 | 
|  | movdqa    xmm3, xmm5 | 
|  | movdqa    xmm5, xmm4 | 
|  | punpcklwd xmm4, xmm6 | 
|  | punpckhwd xmm5, xmm6 | 
|  | movdqa    xmm6, xmm5 | 
|  | movdqu    xmm5, [esp]  // restore xmm5 (rows 4 and 5, high half, from the first round) | 
|  | movdqu    [esp], xmm6  // backup xmm6 in the same stack slot | 
|  | movdqa    xmm6, xmm5  // use xmm6 as temp register. | 
|  | punpcklwd xmm5, xmm7 | 
|  | punpckhwd xmm6, xmm7 | 
|  | movdqa    xmm7, xmm6 | 
|  |  | 
|  | // Third round: interleave 32-bit units; the low 64 bits then hold a first-byte column of all 8 rows | 
|  | // and the high 64 bits the matching second-byte column. Low halves go to dst_a, high halves to dst_b. | 
|  | movdqa    xmm6, xmm0 | 
|  | punpckldq xmm0, xmm4  /* xmm0 = pair column 0 */ | 
|  | punpckhdq xmm6, xmm4 | 
|  | movdqa    xmm4, xmm6  /* xmm4 = pair column 1 */ | 
|  | movdqu    xmm6, [esp]  // restore xmm6 | 
|  | movlpd    qword ptr [edx], xmm0 | 
|  | movhpd    qword ptr [ebx], xmm0 | 
|  | movlpd    qword ptr [edx + esi], xmm4 | 
|  | lea       edx, [edx + 2 * esi] | 
|  | movhpd    qword ptr [ebx + ebp], xmm4 | 
|  | lea       ebx, [ebx + 2 * ebp] | 
|  | movdqa    xmm0, xmm2  // use xmm0 as the temp register. | 
|  | punpckldq xmm2, xmm6  /* xmm2 = pair column 2 */ | 
|  | movlpd    qword ptr [edx], xmm2 | 
|  | movhpd    qword ptr [ebx], xmm2 | 
|  | punpckhdq xmm0, xmm6  /* xmm0 = pair column 3 */ | 
|  | movlpd    qword ptr [edx + esi], xmm0 | 
|  | lea       edx, [edx + 2 * esi] | 
|  | movhpd    qword ptr [ebx + ebp], xmm0 | 
|  | lea       ebx, [ebx + 2 * ebp] | 
|  | movdqa    xmm0, xmm1  // use xmm0 as the temp register. | 
|  | punpckldq xmm1, xmm5  /* xmm1 = pair column 4 */ | 
|  | movlpd    qword ptr [edx], xmm1 | 
|  | movhpd    qword ptr [ebx], xmm1 | 
|  | punpckhdq xmm0, xmm5  /* xmm0 = pair column 5 */ | 
|  | movlpd    qword ptr [edx + esi], xmm0 | 
|  | lea       edx, [edx + 2 * esi] | 
|  | movhpd    qword ptr [ebx + ebp], xmm0 | 
|  | lea       ebx, [ebx + 2 * ebp] | 
|  | movdqa    xmm0, xmm3  // use xmm0 as the temp register. | 
|  | punpckldq xmm3, xmm7  /* xmm3 = pair column 6 */ | 
|  | movlpd    qword ptr [edx], xmm3 | 
|  | movhpd    qword ptr [ebx], xmm3 | 
|  | punpckhdq xmm0, xmm7  /* xmm0 = pair column 7 */ | 
|  | sub       ecx, 8  /* 8 pairs done; the stores and lea below leave the flags intact for jg */ | 
|  | movlpd    qword ptr [edx + esi], xmm0 | 
|  | lea       edx, [edx + 2 * esi] | 
|  | movhpd    qword ptr [ebx + ebp], xmm0 | 
|  | lea       ebx, [ebx + 2 * ebp] | 
|  | jg        convertloop  /* repeat while pairs remain */ | 
|  |  | 
|  | mov       esp, [esp + 16]  /* drop the scratch area by reloading the esp saved at entry */ | 
|  | pop       ebp | 
|  | pop       edi | 
|  | pop       esi | 
|  | pop       ebx | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | #endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) | 
|  |  | 
|  | #ifdef __cplusplus | 
|  | }  // extern "C" | 
|  | }  // namespace libyuv | 
|  | #endif |