|  | /* | 
|  | *  Copyright 2013 The LibYuv Project Authors. All rights reserved. | 
|  | * | 
|  | *  Use of this source code is governed by a BSD-style license | 
|  | *  that can be found in the LICENSE file in the root of the source | 
|  | *  tree. An additional intellectual property rights grant can be found | 
|  | *  in the file PATENTS. All contributing project authors may | 
|  | *  be found in the AUTHORS file in the root of the source tree. | 
|  | */ | 
|  |  | 
|  | #include "libyuv/row.h" | 
|  | #include "libyuv/scale_row.h" | 
|  |  | 
|  | #ifdef __cplusplus | 
|  | namespace libyuv { | 
|  | extern "C" { | 
|  | #endif | 
|  |  | 
|  | // This module is for Visual C x86. | 
|  | #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \ | 
|  | defined(_MSC_VER) && !defined(__clang__) | 
|  |  | 
|  | // pshufb indices for the 3/4 point sampler: source bytes 0 to 9. Index 128 zeroes that output byte. | 
|  | static uvec8 kShuf0 = | 
|  | { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 }; | 
|  |  | 
|  | // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. | 
|  | static uvec8 kShuf1 = | 
|  | { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 }; | 
|  |  | 
|  | // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15. | 
|  | static uvec8 kShuf2 = | 
|  | { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 }; | 
|  |  | 
|  | // pshufb indices for the 3/4 box filters: adjacent byte pairs from source bytes 0 to 10. | 
|  | static uvec8 kShuf01 = | 
|  | { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 }; | 
|  |  | 
|  | // Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13. | 
|  | static uvec8 kShuf11 = | 
|  | { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 }; | 
|  |  | 
|  | // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15. | 
|  | static uvec8 kShuf21 = | 
|  | { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 }; | 
|  |  | 
|  | // pmaddubsw coefficients for source bytes 0 to 10 (used with kShuf01); every pair of weights sums to 4. | 
|  | static uvec8 kMadd01 = | 
|  | { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 }; | 
|  |  | 
|  | // Coefficients for source bytes 10 to 21 (used with kShuf11) | 
|  | static uvec8 kMadd11 = | 
|  | { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 }; | 
|  |  | 
|  | // Coefficients for source bytes 21 to 31 (used with kShuf21) | 
|  | static uvec8 kMadd21 = | 
|  | { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 }; | 
|  |  | 
|  | // Rounding term added to each 16 bit weighted sum before the shift right by 2 in the 3/4 box filters. | 
|  | static vec16 kRound34 = | 
|  | { 2, 2, 2, 2, 2, 2, 2, 2 }; | 
|  |  | 
|  | static uvec8 kShuf38a = | 
|  | { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; | 
|  |  | 
|  | static uvec8 kShuf38b = | 
|  | { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 }; | 
|  |  | 
|  | // Arrange words 0,3,6 into 0,1,2; all other output bytes are zeroed (index 128). | 
|  | static uvec8 kShufAc = | 
|  | { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; | 
|  |  | 
|  | // Arrange words 0,3,6 into 3,4,5; all other output bytes are zeroed (index 128). | 
|  | static uvec8 kShufAc3 = | 
|  | { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 }; | 
|  |  | 
|  | // Scaling values for boxes of 3x3 and 2x3: reciprocals 65536/9 and 65536/6 applied with pmulhuw. | 
|  | static uvec16 kScaleAc33 = | 
|  | { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 }; | 
|  |  | 
|  | // Arrange first value for pixels 0,1,2,3,4,5 as zero extended 16 bit words. | 
|  | static uvec8 kShufAb0 = | 
|  | { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 }; | 
|  |  | 
|  | // Arrange second value for pixels 0,1,2,3,4,5 as zero extended 16 bit words. | 
|  | static uvec8 kShufAb1 = | 
|  | { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 }; | 
|  |  | 
|  | // Arrange third value for pixels 0,1,3,4; pixels 2 and 5 get zero because their boxes are only 2 wide. | 
|  | static uvec8 kShufAb2 = | 
|  | { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; | 
|  |  | 
|  | // Scaling values for boxes of 3x2 and 2x2: reciprocals 65536/3 and 65536/2 applied with pmulhuw to row-averaged sums. | 
|  | static uvec16 kScaleAb2 = | 
|  | { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; | 
|  |  | 
|  | // Point samples a row 2:1. Each pass reads 32 pixels, keeps the odd-indexed ones and writes 16 pixels. | 
|  | __declspec(naked) | 
|  | void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, | 
|  | uint8* dst_ptr, int dst_width) { | 
|  | __asm { | 
|  | mov        eax, [esp + 4]        // src_ptr (naked function: arguments are read directly from the stack) | 
|  | // src_stride ignored | 
|  | mov        edx, [esp + 12]       // dst_ptr | 
|  | mov        ecx, [esp + 16]       // dst_width; counted down by 16 per pass | 
|  |  | 
|  | wloop: | 
|  | movdqu     xmm0, [eax] | 
|  | movdqu     xmm1, [eax + 16] | 
|  | lea        eax,  [eax + 32] | 
|  | psrlw      xmm0, 8               // isolate odd pixels: the high byte of each 16 bit pair. | 
|  | psrlw      xmm1, 8 | 
|  | packuswb   xmm0, xmm1 | 
|  | movdqu     [edx], xmm0 | 
|  | lea        edx, [edx + 16] | 
|  | sub        ecx, 16 | 
|  | jg         wloop | 
|  |  | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | // Blends 32x1 rectangle to 16x1: each output is the rounded average (pavgw) of two horizontally adjacent pixels. | 
|  | __declspec(naked) | 
|  | void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, | 
|  | uint8* dst_ptr, int dst_width) { | 
|  | __asm { | 
|  | mov        eax, [esp + 4]        // src_ptr | 
|  | // src_stride ignored | 
|  | mov        edx, [esp + 12]       // dst_ptr | 
|  | mov        ecx, [esp + 16]       // dst_width; counted down by 16 per pass | 
|  | pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff (selects the even byte of each pair) | 
|  | psrlw      xmm5, 8 | 
|  |  | 
|  | wloop: | 
|  | movdqu     xmm0, [eax] | 
|  | movdqu     xmm1, [eax + 16] | 
|  | lea        eax,  [eax + 32] | 
|  |  | 
|  | movdqa     xmm2, xmm0            // average columns (32 to 16 pixels): odd bytes in xmm0/1, even bytes in xmm2/3 | 
|  | psrlw      xmm0, 8 | 
|  | movdqa     xmm3, xmm1 | 
|  | psrlw      xmm1, 8 | 
|  | pand       xmm2, xmm5 | 
|  | pand       xmm3, xmm5 | 
|  | pavgw      xmm0, xmm2 | 
|  | pavgw      xmm1, xmm3 | 
|  | packuswb   xmm0, xmm1 | 
|  |  | 
|  | movdqu     [edx], xmm0 | 
|  | lea        edx, [edx + 16] | 
|  | sub        ecx, 16 | 
|  | jg         wloop | 
|  |  | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | // Blends 32x2 rectangle to 16x1: averages the two rows with pavgb, then horizontal pixel pairs with pavgw. | 
|  | __declspec(naked) | 
|  | void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, | 
|  | uint8* dst_ptr, int dst_width) { | 
|  | __asm { | 
|  | push       esi | 
|  | mov        eax, [esp + 4 + 4]    // src_ptr (the extra 4 skips the saved esi) | 
|  | mov        esi, [esp + 4 + 8]    // src_stride: byte offset from src_ptr to the second row | 
|  | mov        edx, [esp + 4 + 12]   // dst_ptr | 
|  | mov        ecx, [esp + 4 + 16]   // dst_width; counted down by 16 per pass | 
|  | pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff (selects the even byte of each pair) | 
|  | psrlw      xmm5, 8 | 
|  |  | 
|  | wloop: | 
|  | movdqu     xmm0, [eax] | 
|  | movdqu     xmm1, [eax + 16] | 
|  | movdqu     xmm2, [eax + esi] | 
|  | movdqu     xmm3, [eax + esi + 16] | 
|  | lea        eax,  [eax + 32] | 
|  | pavgb      xmm0, xmm2            // average rows (row 0 with row 1, per byte, rounded) | 
|  | pavgb      xmm1, xmm3 | 
|  |  | 
|  | movdqa     xmm2, xmm0            // average columns (32 to 16 pixels): odd bytes in xmm0/1, even bytes in xmm2/3 | 
|  | psrlw      xmm0, 8 | 
|  | movdqa     xmm3, xmm1 | 
|  | psrlw      xmm1, 8 | 
|  | pand       xmm2, xmm5 | 
|  | pand       xmm3, xmm5 | 
|  | pavgw      xmm0, xmm2 | 
|  | pavgw      xmm1, xmm3 | 
|  | packuswb   xmm0, xmm1 | 
|  |  | 
|  | movdqu     [edx], xmm0 | 
|  | lea        edx, [edx + 16] | 
|  | sub        ecx, 16 | 
|  | jg         wloop | 
|  |  | 
|  | pop        esi | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | #ifdef HAS_SCALEROWDOWN2_AVX2 | 
|  | // Point samples a row 2:1. Each pass reads 64 pixels, keeps the odd-indexed ones and writes 32 pixels. | 
|  | __declspec(naked) | 
|  | void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, | 
|  | uint8* dst_ptr, int dst_width) { | 
|  | __asm { | 
|  | mov        eax, [esp + 4]        // src_ptr | 
|  | // src_stride ignored | 
|  | mov        edx, [esp + 12]       // dst_ptr | 
|  | mov        ecx, [esp + 16]       // dst_width; counted down by 32 per pass | 
|  |  | 
|  | wloop: | 
|  | vmovdqu     ymm0, [eax] | 
|  | vmovdqu     ymm1, [eax + 32] | 
|  | lea         eax,  [eax + 64] | 
|  | vpsrlw      ymm0, ymm0, 8        // isolate odd pixels: the high byte of each 16 bit pair. | 
|  | vpsrlw      ymm1, ymm1, 8 | 
|  | vpackuswb   ymm0, ymm0, ymm1 | 
|  | vpermq      ymm0, ymm0, 0xd8     // unmutate vpackuswb: restore source order after the per 128 bit lane pack | 
|  | vmovdqu     [edx], ymm0 | 
|  | lea         edx, [edx + 32] | 
|  | sub         ecx, 32 | 
|  | jg          wloop | 
|  |  | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | // Blends 64x1 rectangle to 32x1: each output is the rounded average of two horizontally adjacent pixels. | 
|  | __declspec(naked) | 
|  | void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, | 
|  | uint8* dst_ptr, int dst_width) { | 
|  | __asm { | 
|  | mov         eax, [esp + 4]        // src_ptr | 
|  | // src_stride ignored | 
|  | mov         edx, [esp + 12]       // dst_ptr | 
|  | mov         ecx, [esp + 16]       // dst_width; counted down by 32 per pass | 
|  |  | 
|  | vpcmpeqb    ymm4, ymm4, ymm4      // '1' constant, 8b: all ones -> 1 per word -> packed to 1 per byte | 
|  | vpsrlw      ymm4, ymm4, 15 | 
|  | vpackuswb   ymm4, ymm4, ymm4 | 
|  | vpxor       ymm5, ymm5, ymm5      // constant 0 | 
|  |  | 
|  | wloop: | 
|  | vmovdqu     ymm0, [eax] | 
|  | vmovdqu     ymm1, [eax + 32] | 
|  | lea         eax,  [eax + 64] | 
|  |  | 
|  | vpmaddubsw  ymm0, ymm0, ymm4      // average horizontally: multiply by 1,1 to sum each byte pair into a word | 
|  | vpmaddubsw  ymm1, ymm1, ymm4 | 
|  | vpavgw      ymm0, ymm0, ymm5      // (x + 1) / 2, via a rounded average with zero | 
|  | vpavgw      ymm1, ymm1, ymm5 | 
|  | vpackuswb   ymm0, ymm0, ymm1 | 
|  | vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb: restore source order after the per 128 bit lane pack | 
|  |  | 
|  | vmovdqu     [edx], ymm0 | 
|  | lea         edx, [edx + 32] | 
|  | sub         ecx, 32 | 
|  | jg          wloop | 
|  |  | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | // Blends 64x2 rectangle to 32x1: averages the two rows with vpavgb, then horizontal pixel pairs. | 
|  | __declspec(naked) | 
|  | void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, | 
|  | uint8* dst_ptr, int dst_width) { | 
|  | __asm { | 
|  | push        esi | 
|  | mov         eax, [esp + 4 + 4]    // src_ptr (the extra 4 skips the saved esi) | 
|  | mov         esi, [esp + 4 + 8]    // src_stride: byte offset from src_ptr to the second row | 
|  | mov         edx, [esp + 4 + 12]   // dst_ptr | 
|  | mov         ecx, [esp + 4 + 16]   // dst_width; counted down by 32 per pass | 
|  |  | 
|  | vpcmpeqb    ymm4, ymm4, ymm4      // '1' constant, 8b: all ones -> 1 per word -> packed to 1 per byte | 
|  | vpsrlw      ymm4, ymm4, 15 | 
|  | vpackuswb   ymm4, ymm4, ymm4 | 
|  | vpxor       ymm5, ymm5, ymm5      // constant 0 | 
|  |  | 
|  | wloop: | 
|  | vmovdqu     ymm0, [eax]           // average rows (row 0 loaded here, row 1 folded in by vpavgb below) | 
|  | vmovdqu     ymm1, [eax + 32] | 
|  | vpavgb      ymm0, ymm0, [eax + esi] | 
|  | vpavgb      ymm1, ymm1, [eax + esi + 32] | 
|  | lea         eax,  [eax + 64] | 
|  |  | 
|  | vpmaddubsw  ymm0, ymm0, ymm4      // average horizontally: multiply by 1,1 to sum each byte pair into a word | 
|  | vpmaddubsw  ymm1, ymm1, ymm4 | 
|  | vpavgw      ymm0, ymm0, ymm5      // (x + 1) / 2, via a rounded average with zero | 
|  | vpavgw      ymm1, ymm1, ymm5 | 
|  | vpackuswb   ymm0, ymm0, ymm1 | 
|  | vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb: restore source order after the per 128 bit lane pack | 
|  |  | 
|  | vmovdqu     [edx], ymm0 | 
|  | lea         edx, [edx + 32] | 
|  | sub         ecx, 32 | 
|  | jg          wloop | 
|  |  | 
|  | pop         esi | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_SCALEROWDOWN2_AVX2 | 
|  |  | 
|  | // Point samples 32 pixels to 8 pixels: keeps the pixel at index 2 of every group of 4. | 
|  | __declspec(naked) | 
|  | void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, | 
|  | uint8* dst_ptr, int dst_width) { | 
|  | __asm { | 
|  | mov        eax, [esp + 4]        // src_ptr | 
|  | // src_stride ignored | 
|  | mov        edx, [esp + 12]       // dst_ptr | 
|  | mov        ecx, [esp + 16]       // dst_width; counted down by 8 per pass | 
|  | pcmpeqb    xmm5, xmm5            // generate mask 0x00ff0000 (byte 2 of each 32 bit group) | 
|  | psrld      xmm5, 24 | 
|  | pslld      xmm5, 16 | 
|  |  | 
|  | wloop: | 
|  | movdqu     xmm0, [eax] | 
|  | movdqu     xmm1, [eax + 16] | 
|  | lea        eax,  [eax + 32] | 
|  | pand       xmm0, xmm5 | 
|  | pand       xmm1, xmm5 | 
|  | packuswb   xmm0, xmm1 | 
|  | psrlw      xmm0, 8 | 
|  | packuswb   xmm0, xmm0 | 
|  | movq       qword ptr [edx], xmm0 | 
|  | lea        edx, [edx + 8] | 
|  | sub        ecx, 8 | 
|  | jg         wloop | 
|  |  | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | // Blends 32x4 rectangle to 8x1: averages four rows with chained pavgb, then halves the width twice with pavgw. | 
|  | __declspec(naked) | 
|  | void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, | 
|  | uint8* dst_ptr, int dst_width) { | 
|  | __asm { | 
|  | push       esi | 
|  | push       edi | 
|  | mov        eax, [esp + 8 + 4]    // src_ptr (the extra 8 skips the saved esi and edi) | 
|  | mov        esi, [esp + 8 + 8]    // src_stride: byte offset between consecutive source rows | 
|  | mov        edx, [esp + 8 + 12]   // dst_ptr | 
|  | mov        ecx, [esp + 8 + 16]   // dst_width; counted down by 8 per pass | 
|  | lea        edi, [esi + esi * 2]  // src_stride * 3, offset of the fourth row | 
|  | pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff (selects the even byte of each pair) | 
|  | psrlw      xmm7, 8 | 
|  |  | 
|  | wloop: | 
|  | movdqu     xmm0, [eax]           // average rows: (row0,row1) and (row2,row3), then the two results | 
|  | movdqu     xmm1, [eax + 16] | 
|  | movdqu     xmm2, [eax + esi] | 
|  | movdqu     xmm3, [eax + esi + 16] | 
|  | pavgb      xmm0, xmm2 | 
|  | pavgb      xmm1, xmm3 | 
|  | movdqu     xmm2, [eax + esi * 2] | 
|  | movdqu     xmm3, [eax + esi * 2 + 16] | 
|  | movdqu     xmm4, [eax + edi] | 
|  | movdqu     xmm5, [eax + edi + 16] | 
|  | lea        eax, [eax + 32] | 
|  | pavgb      xmm2, xmm4 | 
|  | pavgb      xmm3, xmm5 | 
|  | pavgb      xmm0, xmm2 | 
|  | pavgb      xmm1, xmm3 | 
|  |  | 
|  | movdqa     xmm2, xmm0            // average columns (32 to 16 pixels) | 
|  | psrlw      xmm0, 8 | 
|  | movdqa     xmm3, xmm1 | 
|  | psrlw      xmm1, 8 | 
|  | pand       xmm2, xmm7 | 
|  | pand       xmm3, xmm7 | 
|  | pavgw      xmm0, xmm2 | 
|  | pavgw      xmm1, xmm3 | 
|  | packuswb   xmm0, xmm1 | 
|  |  | 
|  | movdqa     xmm2, xmm0            // average columns (16 to 8 pixels) | 
|  | psrlw      xmm0, 8 | 
|  | pand       xmm2, xmm7 | 
|  | pavgw      xmm0, xmm2 | 
|  | packuswb   xmm0, xmm0 | 
|  |  | 
|  | movq       qword ptr [edx], xmm0 | 
|  | lea        edx, [edx + 8] | 
|  | sub        ecx, 8 | 
|  | jg         wloop | 
|  |  | 
|  | pop        edi | 
|  | pop        esi | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | #ifdef HAS_SCALEROWDOWN4_AVX2 | 
|  | // Point samples 64 pixels to 16 pixels: keeps the pixel at index 2 of every group of 4. | 
|  | __declspec(naked) | 
|  | void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, | 
|  | uint8* dst_ptr, int dst_width) { | 
|  | __asm { | 
|  | mov         eax, [esp + 4]        // src_ptr | 
|  | // src_stride ignored | 
|  | mov         edx, [esp + 12]       // dst_ptr | 
|  | mov         ecx, [esp + 16]       // dst_width; counted down by 16 per pass | 
|  | vpcmpeqb    ymm5, ymm5, ymm5      // generate mask 0x00ff0000 (byte 2 of each 32 bit group) | 
|  | vpsrld      ymm5, ymm5, 24 | 
|  | vpslld      ymm5, ymm5, 16 | 
|  |  | 
|  | wloop: | 
|  | vmovdqu     ymm0, [eax] | 
|  | vmovdqu     ymm1, [eax + 32] | 
|  | lea         eax,  [eax + 64] | 
|  | vpand       ymm0, ymm0, ymm5 | 
|  | vpand       ymm1, ymm1, ymm5 | 
|  | vpackuswb   ymm0, ymm0, ymm1 | 
|  | vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb: restore source order after the per 128 bit lane pack | 
|  | vpsrlw      ymm0, ymm0, 8 | 
|  | vpackuswb   ymm0, ymm0, ymm0 | 
|  | vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb: gathers the 16 result bytes into the low 128 bits | 
|  | vmovdqu     [edx], xmm0 | 
|  | lea         edx, [edx + 16] | 
|  | sub         ecx, 16 | 
|  | jg          wloop | 
|  |  | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | // Blends 64x4 rectangle to 16x1: averages four rows with chained vpavgb, then halves the width twice with vpavgw. | 
|  | __declspec(naked) | 
|  | void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, | 
|  | uint8* dst_ptr, int dst_width) { | 
|  | __asm { | 
|  | push        esi | 
|  | push        edi | 
|  | mov         eax, [esp + 8 + 4]    // src_ptr (the extra 8 skips the saved esi and edi) | 
|  | mov         esi, [esp + 8 + 8]    // src_stride: byte offset between consecutive source rows | 
|  | mov         edx, [esp + 8 + 12]   // dst_ptr | 
|  | mov         ecx, [esp + 8 + 16]   // dst_width; counted down by 16 per pass | 
|  | lea         edi, [esi + esi * 2]  // src_stride * 3, offset of the fourth row | 
|  | vpcmpeqb    ymm7, ymm7, ymm7      // generate mask 0x00ff00ff (selects the even byte of each pair) | 
|  | vpsrlw      ymm7, ymm7, 8 | 
|  |  | 
|  | wloop: | 
|  | vmovdqu     ymm0, [eax]           // average rows: (row0,row1) and (row2,row3), then the two results | 
|  | vmovdqu     ymm1, [eax + 32] | 
|  | vpavgb      ymm0, ymm0, [eax + esi] | 
|  | vpavgb      ymm1, ymm1, [eax + esi + 32] | 
|  | vmovdqu     ymm2, [eax + esi * 2] | 
|  | vmovdqu     ymm3, [eax + esi * 2 + 32] | 
|  | vpavgb      ymm2, ymm2, [eax + edi] | 
|  | vpavgb      ymm3, ymm3, [eax + edi + 32] | 
|  | lea         eax, [eax + 64] | 
|  | vpavgb      ymm0, ymm0, ymm2 | 
|  | vpavgb      ymm1, ymm1, ymm3 | 
|  |  | 
|  | vpand       ymm2, ymm0, ymm7      // average columns (64 to 32 pixels) | 
|  | vpand       ymm3, ymm1, ymm7 | 
|  | vpsrlw      ymm0, ymm0, 8 | 
|  | vpsrlw      ymm1, ymm1, 8 | 
|  | vpavgw      ymm0, ymm0, ymm2 | 
|  | vpavgw      ymm1, ymm1, ymm3 | 
|  | vpackuswb   ymm0, ymm0, ymm1 | 
|  | vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb: restore source order after the per 128 bit lane pack | 
|  |  | 
|  | vpand       ymm2, ymm0, ymm7      // average columns (32 to 16 pixels) | 
|  | vpsrlw      ymm0, ymm0, 8 | 
|  | vpavgw      ymm0, ymm0, ymm2 | 
|  | vpackuswb   ymm0, ymm0, ymm0 | 
|  | vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb: gathers the 16 result bytes into the low 128 bits | 
|  |  | 
|  | vmovdqu     [edx], xmm0 | 
|  | lea         edx, [edx + 16] | 
|  | sub         ecx, 16 | 
|  | jg          wloop | 
|  |  | 
|  | pop        edi | 
|  | pop        esi | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_SCALEROWDOWN4_AVX2 | 
|  |  | 
|  | // Point samples 32 pixels to 24 pixels (keeps 3 of every 4 source pixels). | 
|  | // Produces three 8 byte values. For each 8 bytes, a 16 byte window of the source is used. | 
|  | // Each window is then shuffled with pshufb (kShuf0, kShuf1, kShuf2) to do the scaling. | 
|  |  | 
|  | __declspec(naked) | 
|  | void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, | 
|  | uint8* dst_ptr, int dst_width) { | 
|  | __asm { | 
|  | mov        eax, [esp + 4]        // src_ptr | 
|  | // src_stride ignored | 
|  | mov        edx, [esp + 12]       // dst_ptr | 
|  | mov        ecx, [esp + 16]       // dst_width; counted down by 24 per pass | 
|  | movdqa     xmm3, kShuf0 | 
|  | movdqa     xmm4, kShuf1 | 
|  | movdqa     xmm5, kShuf2 | 
|  |  | 
|  | wloop: | 
|  | movdqu     xmm0, [eax] | 
|  | movdqu     xmm1, [eax + 16] | 
|  | lea        eax,  [eax + 32] | 
|  | movdqa     xmm2, xmm1 | 
|  | palignr    xmm1, xmm0, 8 | 
|  | pshufb     xmm0, xmm3 | 
|  | pshufb     xmm1, xmm4 | 
|  | pshufb     xmm2, xmm5 | 
|  | movq       qword ptr [edx], xmm0 | 
|  | movq       qword ptr [edx + 8], xmm1 | 
|  | movq       qword ptr [edx + 16], xmm2 | 
|  | lea        edx, [edx + 24] | 
|  | sub        ecx, 24 | 
|  | jg         wloop | 
|  |  | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | // Blends 32x2 rectangle to 24x1. The two rows are averaged equally (one pavgb) before the horizontal filter. | 
|  | // Produces three 8 byte values. For each 8 bytes, 16 bytes are read. | 
|  | // Then shuffled into byte pairs and weighted with pmaddubsw to do the scaling. | 
|  |  | 
|  | // Register usage: | 
|  | // xmm0 src_row 0, then the working value for each group of 8 outputs | 
|  | // xmm1 src_row 1, also reloaded with kMadd21 (madd 2) for the third group | 
|  | // xmm2 shuf 0 | 
|  | // xmm3 shuf 1 | 
|  | // xmm4 shuf 2 | 
|  | // xmm5 madd 0 | 
|  | // xmm6 madd 1 | 
|  | // xmm7 kRound34 | 
|  |  | 
|  | // Note that movdqa+palign may be better than movdqu. | 
|  | __declspec(naked) | 
|  | void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, | 
|  | ptrdiff_t src_stride, | 
|  | uint8* dst_ptr, int dst_width) { | 
|  | __asm { | 
|  | push       esi | 
|  | mov        eax, [esp + 4 + 4]    // src_ptr (the extra 4 skips the saved esi) | 
|  | mov        esi, [esp + 4 + 8]    // src_stride: byte offset from src_ptr to the second row | 
|  | mov        edx, [esp + 4 + 12]   // dst_ptr | 
|  | mov        ecx, [esp + 4 + 16]   // dst_width; counted down by 24 per pass | 
|  | movdqa     xmm2, kShuf01 | 
|  | movdqa     xmm3, kShuf11 | 
|  | movdqa     xmm4, kShuf21 | 
|  | movdqa     xmm5, kMadd01 | 
|  | movdqa     xmm6, kMadd11 | 
|  | movdqa     xmm7, kRound34 | 
|  |  | 
|  | wloop: | 
|  | movdqu     xmm0, [eax]           // output pixels 0..7 | 
|  | movdqu     xmm1, [eax + esi] | 
|  | pavgb      xmm0, xmm1 | 
|  | pshufb     xmm0, xmm2 | 
|  | pmaddubsw  xmm0, xmm5 | 
|  | paddsw     xmm0, xmm7 | 
|  | psrlw      xmm0, 2 | 
|  | packuswb   xmm0, xmm0 | 
|  | movq       qword ptr [edx], xmm0 | 
|  | movdqu     xmm0, [eax + 8]       // output pixels 8..15 | 
|  | movdqu     xmm1, [eax + esi + 8] | 
|  | pavgb      xmm0, xmm1 | 
|  | pshufb     xmm0, xmm3 | 
|  | pmaddubsw  xmm0, xmm6 | 
|  | paddsw     xmm0, xmm7 | 
|  | psrlw      xmm0, 2 | 
|  | packuswb   xmm0, xmm0 | 
|  | movq       qword ptr [edx + 8], xmm0 | 
|  | movdqu     xmm0, [eax + 16]      // output pixels 16..23 | 
|  | movdqu     xmm1, [eax + esi + 16] | 
|  | lea        eax, [eax + 32] | 
|  | pavgb      xmm0, xmm1 | 
|  | pshufb     xmm0, xmm4 | 
|  | movdqa     xmm1, kMadd21 | 
|  | pmaddubsw  xmm0, xmm1 | 
|  | paddsw     xmm0, xmm7 | 
|  | psrlw      xmm0, 2 | 
|  | packuswb   xmm0, xmm0 | 
|  | movq       qword ptr [edx + 16], xmm0 | 
|  | lea        edx, [edx + 24] | 
|  | sub        ecx, 24 | 
|  | jg         wloop | 
|  |  | 
|  | pop        esi | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | // Blends 32x2 rectangle to 24x1 with row 0 weighted roughly 3:1 over row 1 (two chained pavgb). Note that movdqa+palign may be better than movdqu. | 
|  | __declspec(naked) | 
|  | void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, | 
|  | ptrdiff_t src_stride, | 
|  | uint8* dst_ptr, int dst_width) { | 
|  | __asm { | 
|  | push       esi | 
|  | mov        eax, [esp + 4 + 4]    // src_ptr (the extra 4 skips the saved esi) | 
|  | mov        esi, [esp + 4 + 8]    // src_stride: byte offset from src_ptr to the second row | 
|  | mov        edx, [esp + 4 + 12]   // dst_ptr | 
|  | mov        ecx, [esp + 4 + 16]   // dst_width; counted down by 24 per pass | 
|  | movdqa     xmm2, kShuf01 | 
|  | movdqa     xmm3, kShuf11 | 
|  | movdqa     xmm4, kShuf21 | 
|  | movdqa     xmm5, kMadd01 | 
|  | movdqa     xmm6, kMadd11 | 
|  | movdqa     xmm7, kRound34 | 
|  |  | 
|  | wloop: | 
|  | movdqu     xmm0, [eax]           // output pixels 0..7 | 
|  | movdqu     xmm1, [eax + esi] | 
|  | pavgb      xmm1, xmm0 | 
|  | pavgb      xmm0, xmm1 | 
|  | pshufb     xmm0, xmm2 | 
|  | pmaddubsw  xmm0, xmm5 | 
|  | paddsw     xmm0, xmm7 | 
|  | psrlw      xmm0, 2 | 
|  | packuswb   xmm0, xmm0 | 
|  | movq       qword ptr [edx], xmm0 | 
|  | movdqu     xmm0, [eax + 8]       // output pixels 8..15 | 
|  | movdqu     xmm1, [eax + esi + 8] | 
|  | pavgb      xmm1, xmm0 | 
|  | pavgb      xmm0, xmm1 | 
|  | pshufb     xmm0, xmm3 | 
|  | pmaddubsw  xmm0, xmm6 | 
|  | paddsw     xmm0, xmm7 | 
|  | psrlw      xmm0, 2 | 
|  | packuswb   xmm0, xmm0 | 
|  | movq       qword ptr [edx + 8], xmm0 | 
|  | movdqu     xmm0, [eax + 16]      // output pixels 16..23 | 
|  | movdqu     xmm1, [eax + esi + 16] | 
|  | lea        eax, [eax + 32] | 
|  | pavgb      xmm1, xmm0 | 
|  | pavgb      xmm0, xmm1 | 
|  | pshufb     xmm0, xmm4 | 
|  | movdqa     xmm1, kMadd21 | 
|  | pmaddubsw  xmm0, xmm1 | 
|  | paddsw     xmm0, xmm7 | 
|  | psrlw      xmm0, 2 | 
|  | packuswb   xmm0, xmm0 | 
|  | movq       qword ptr [edx + 16], xmm0 | 
|  | lea        edx, [edx+24] | 
|  | sub        ecx, 24 | 
|  | jg         wloop | 
|  |  | 
|  | pop        esi | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | // 3/8 point sampler | 
|  |  | 
|  | // Scale 32 pixels to 12: kShuf38a/kShuf38b pick source bytes 0,3,6,8,11,14 of each 16 into output bytes 0-5 / 6-11. | 
|  | __declspec(naked) | 
|  | void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, | 
|  | uint8* dst_ptr, int dst_width) { | 
|  | __asm { | 
|  | mov        eax, [esp + 4]        // src_ptr | 
|  | // src_stride ignored | 
|  | mov        edx, [esp + 12]       // dst_ptr | 
|  | mov        ecx, [esp + 16]       // dst_width; counted down by 12 per pass | 
|  | movdqa     xmm4, kShuf38a | 
|  | movdqa     xmm5, kShuf38b | 
|  |  | 
|  | xloop: | 
|  | movdqu     xmm0, [eax]           // 16 pixels -> 0,1,2,3,4,5 | 
|  | movdqu     xmm1, [eax + 16]      // 16 pixels -> 6,7,8,9,10,11 | 
|  | lea        eax, [eax + 32] | 
|  | pshufb     xmm0, xmm4 | 
|  | pshufb     xmm1, xmm5 | 
|  | paddusb    xmm0, xmm1 | 
|  |  | 
|  | movq       qword ptr [edx], xmm0  // write 12 pixels: 8 bytes here, 4 more from the high half below | 
|  | movhlps    xmm1, xmm0 | 
|  | movd       [edx + 8], xmm1 | 
|  | lea        edx, [edx + 12] | 
|  | sub        ecx, 12 | 
|  | jg         xloop | 
|  |  | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | // Scale 16x3 pixels to 6x1 with interpolation: sums boxes 3, 3 and 2 pixels wide over 3 rows, then scales by 1/9 or 1/6. | 
|  | __declspec(naked) | 
|  | void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, | 
|  | ptrdiff_t src_stride, | 
|  | uint8* dst_ptr, int dst_width) { | 
|  | __asm { | 
|  | push       esi | 
|  | mov        eax, [esp + 4 + 4]    // src_ptr (the extra 4 skips the saved esi) | 
|  | mov        esi, [esp + 4 + 8]    // src_stride: byte offset between consecutive source rows | 
|  | mov        edx, [esp + 4 + 12]   // dst_ptr | 
|  | mov        ecx, [esp + 4 + 16]   // dst_width; counted down by 6 per pass | 
|  | movdqa     xmm2, kShufAc | 
|  | movdqa     xmm3, kShufAc3 | 
|  | movdqa     xmm4, kScaleAc33 | 
|  | pxor       xmm5, xmm5 | 
|  |  | 
|  | xloop: | 
|  | movdqu     xmm0, [eax]           // sum up 3 rows into xmm0/1 as 16 bit words (low 8 / high 8 pixels) | 
|  | movdqu     xmm6, [eax + esi] | 
|  | movhlps    xmm1, xmm0 | 
|  | movhlps    xmm7, xmm6 | 
|  | punpcklbw  xmm0, xmm5 | 
|  | punpcklbw  xmm1, xmm5 | 
|  | punpcklbw  xmm6, xmm5 | 
|  | punpcklbw  xmm7, xmm5 | 
|  | paddusw    xmm0, xmm6 | 
|  | paddusw    xmm1, xmm7 | 
|  | movdqu     xmm6, [eax + esi * 2] | 
|  | lea        eax, [eax + 16] | 
|  | movhlps    xmm7, xmm6 | 
|  | punpcklbw  xmm6, xmm5 | 
|  | punpcklbw  xmm7, xmm5 | 
|  | paddusw    xmm0, xmm6 | 
|  | paddusw    xmm1, xmm7 | 
|  |  | 
|  | movdqa     xmm6, xmm0            // 8 pixels -> 0,1,2 of xmm6 (sum each word with its next two neighbours) | 
|  | psrldq     xmm0, 2 | 
|  | paddusw    xmm6, xmm0 | 
|  | psrldq     xmm0, 2 | 
|  | paddusw    xmm6, xmm0 | 
|  | pshufb     xmm6, xmm2 | 
|  |  | 
|  | movdqa     xmm7, xmm1            // 8 pixels -> 3,4,5 of xmm6 | 
|  | psrldq     xmm1, 2 | 
|  | paddusw    xmm7, xmm1 | 
|  | psrldq     xmm1, 2 | 
|  | paddusw    xmm7, xmm1 | 
|  | pshufb     xmm7, xmm3 | 
|  | paddusw    xmm6, xmm7 | 
|  |  | 
|  | pmulhuw    xmm6, xmm4            // divide by 9,9,6, 9,9,6 | 
|  | packuswb   xmm6, xmm6 | 
|  |  | 
|  | movd       [edx], xmm6           // write 6 pixels using two overlapping 4 byte stores (bytes 0-3, then 2-5) | 
|  | psrlq      xmm6, 16 | 
|  | movd       [edx + 2], xmm6 | 
|  | lea        edx, [edx + 6] | 
|  | sub        ecx, 6 | 
|  | jg         xloop | 
|  |  | 
|  | pop        esi | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | // Scale 16x2 pixels to 6x1 with interpolation: averages the 2 rows, sums boxes 3, 3 and 2 pixels wide, then scales by 1/3 or 1/2. | 
|  | __declspec(naked) | 
|  | void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, | 
|  | ptrdiff_t src_stride, | 
|  | uint8* dst_ptr, int dst_width) { | 
|  | __asm { | 
|  | push       esi | 
|  | mov        eax, [esp + 4 + 4]    // src_ptr (the extra 4 skips the saved esi) | 
|  | mov        esi, [esp + 4 + 8]    // src_stride: byte offset from src_ptr to the second row | 
|  | mov        edx, [esp + 4 + 12]   // dst_ptr | 
|  | mov        ecx, [esp + 4 + 16]   // dst_width; counted down by 6 per pass | 
|  | movdqa     xmm2, kShufAb0 | 
|  | movdqa     xmm3, kShufAb1 | 
|  | movdqa     xmm4, kShufAb2 | 
|  | movdqa     xmm5, kScaleAb2 | 
|  |  | 
|  | xloop: | 
|  | movdqu     xmm0, [eax]           // average 2 rows into xmm0 | 
|  | movdqu     xmm1, [eax + esi] | 
|  | lea        eax, [eax + 16] | 
|  | pavgb      xmm0, xmm1 | 
|  |  | 
|  | movdqa     xmm1, xmm0            // 16 pixels -> 0,1,2,3,4,5 of xmm1 (sum of first, second and third value per box) | 
|  | pshufb     xmm1, xmm2 | 
|  | movdqa     xmm6, xmm0 | 
|  | pshufb     xmm6, xmm3 | 
|  | paddusw    xmm1, xmm6 | 
|  | pshufb     xmm0, xmm4 | 
|  | paddusw    xmm1, xmm0 | 
|  |  | 
|  | pmulhuw    xmm1, xmm5            // divide by 3,3,2, 3,3,2 | 
|  | packuswb   xmm1, xmm1 | 
|  |  | 
|  | movd       [edx], xmm1           // write 6 pixels using two overlapping 4 byte stores (bytes 0-3, then 2-5) | 
|  | psrlq      xmm1, 16 | 
|  | movd       [edx + 2], xmm1 | 
|  | lea        edx, [edx + 6] | 
|  | sub        ecx, 6 | 
|  | jg         xloop | 
|  |  | 
|  | pop        esi | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | // Reads 16 bytes and accumulates to 16 shorts at a time, adding into dst_ptr with unsigned saturation (paddusw). | 
|  | __declspec(naked) | 
|  | void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { | 
|  | __asm { | 
|  | mov        eax, [esp + 4]   // src_ptr | 
|  | mov        edx, [esp + 8]   // dst_ptr: running 16 bit sums, read and written back | 
|  | mov        ecx, [esp + 12]  // src_width; counted down by 16 per pass | 
|  | pxor       xmm5, xmm5 | 
|  |  | 
|  | // sum rows | 
|  | xloop: | 
|  | movdqu     xmm3, [eax]       // read 16 bytes | 
|  | lea        eax, [eax + 16] | 
|  | movdqu     xmm0, [edx]       // read 16 words from destination | 
|  | movdqu     xmm1, [edx + 16] | 
|  | movdqa     xmm2, xmm3 | 
|  | punpcklbw  xmm2, xmm5 | 
|  | punpckhbw  xmm3, xmm5 | 
|  | paddusw    xmm0, xmm2        // sum 16 words | 
|  | paddusw    xmm1, xmm3 | 
|  | movdqu     [edx], xmm0       // write 16 words to destination | 
|  | movdqu     [edx + 16], xmm1 | 
|  | lea        edx, [edx + 32] | 
|  | sub        ecx, 16 | 
|  | jg         xloop | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | #ifdef HAS_SCALEADDROW_AVX2 | 
|  | // Reads 32 bytes and accumulates to 32 shorts at a time, adding into dst_ptr with unsigned saturation (vpaddusw). | 
|  | __declspec(naked) | 
|  | void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { | 
|  | __asm { | 
|  | mov         eax, [esp + 4]   // src_ptr | 
|  | mov         edx, [esp + 8]   // dst_ptr: running 16 bit sums, read and written back | 
|  | mov         ecx, [esp + 12]  // src_width; counted down by 32 per pass | 
|  | vpxor       ymm5, ymm5, ymm5 | 
|  |  | 
|  | // sum rows | 
|  | xloop: | 
|  | vmovdqu     ymm3, [eax]       // read 32 bytes | 
|  | lea         eax, [eax + 32] | 
|  | vpermq      ymm3, ymm3, 0xd8  // unmutate for vpunpck: pre-swap so the per lane unpacks give bytes 0-15 and 16-31 | 
|  | vpunpcklbw  ymm2, ymm3, ymm5 | 
|  | vpunpckhbw  ymm3, ymm3, ymm5 | 
|  | vpaddusw    ymm0, ymm2, [edx] // sum 32 words (16 per register) | 
|  | vpaddusw    ymm1, ymm3, [edx + 32] | 
|  | vmovdqu     [edx], ymm0       // write 32 words to destination | 
|  | vmovdqu     [edx + 32], ymm1 | 
|  | lea         edx, [edx + 64] | 
|  | sub         ecx, 32 | 
|  | jg          xloop | 
|  |  | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_SCALEADDROW_AVX2 | 
|  |  | 
|  | // Bilinear column filtering. SSSE3 version. x and dx are read as 16.16 fixed point; each output blends the 2 source pixels at the integer part of x using a 7 bit fraction. | 
|  | __declspec(naked) | 
|  | void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, | 
|  | int dst_width, int x, int dx) { | 
|  | __asm { | 
|  | push       ebx | 
|  | push       esi | 
|  | push       edi | 
|  | mov        edi, [esp + 12 + 4]    // dst_ptr (the extra 12 skips the 3 saved registers) | 
|  | mov        esi, [esp + 12 + 8]    // src_ptr | 
|  | mov        ecx, [esp + 12 + 12]   // dst_width | 
|  | movd       xmm2, [esp + 12 + 16]  // x | 
|  | movd       xmm3, [esp + 12 + 20]  // dx | 
|  | mov        eax, 0x04040000      // shuffle to line up fractions with pixel: byte 0 -> bytes 0,1 and byte 4 -> bytes 2,3. | 
|  | movd       xmm5, eax | 
|  | pcmpeqb    xmm6, xmm6           // generate 0x007f for inverting fraction. | 
|  | psrlw      xmm6, 9 | 
|  | pextrw     eax, xmm2, 1         // get x0 integer (high word of x). preroll | 
|  | sub        ecx, 2 | 
|  | jl         xloop29 | 
|  |  | 
|  | movdqa     xmm0, xmm2           // x1 = x0 + dx | 
|  | paddd      xmm0, xmm3 | 
|  | punpckldq  xmm2, xmm0           // x0 x1 | 
|  | punpckldq  xmm3, xmm3           // dx dx | 
|  | paddd      xmm3, xmm3           // dx * 2, dx * 2 | 
|  | pextrw     edx, xmm2, 3         // get x1 integer. preroll | 
|  |  | 
|  | // 2 Pixel loop. | 
|  | xloop2: | 
|  | movdqa     xmm1, xmm2           // x0, x1 fractions. | 
|  | paddd      xmm2, xmm3           // x += dx (both lanes advance by dx * 2) | 
|  | movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels | 
|  | movd       xmm0, ebx | 
|  | psrlw      xmm1, 9              // 7 bit fractions. | 
|  | movzx      ebx, word ptr [esi + edx]  // 2 source x1 pixels | 
|  | movd       xmm4, ebx | 
|  | pshufb     xmm1, xmm5           // 0011 | 
|  | punpcklwd  xmm0, xmm4 | 
|  | pxor       xmm1, xmm6           // 0..7f and 7f..0: weights (0x7f - f, f) for the left and right pixel | 
|  | pmaddubsw  xmm0, xmm1           // 16 bit, 2 pixels. | 
|  | pextrw     eax, xmm2, 1         // get x0 integer. next iteration. | 
|  | pextrw     edx, xmm2, 3         // get x1 integer. next iteration. | 
|  | psrlw      xmm0, 7              // 8.7 fixed point to low 8 bits. | 
|  | packuswb   xmm0, xmm0           // 8 bits, 2 pixels. | 
|  | movd       ebx, xmm0 | 
|  | mov        [edi], bx | 
|  | lea        edi, [edi + 2] | 
|  | sub        ecx, 2               // 2 pixels | 
|  | jge        xloop2 | 
|  |  | 
|  | xloop29: | 
|  |  | 
|  | add        ecx, 2 - 1 | 
|  | jl         xloop99 | 
|  |  | 
|  | // 1 pixel remainder | 
|  | movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels | 
|  | movd       xmm0, ebx | 
|  | psrlw      xmm2, 9              // 7 bit fractions. | 
|  | pshufb     xmm2, xmm5           // 0011 | 
|  | pxor       xmm2, xmm6           // 0..7f and 7f..0 | 
|  | pmaddubsw  xmm0, xmm2           // 16 bit | 
|  | psrlw      xmm0, 7              // 8.7 fixed point to low 8 bits. | 
|  | packuswb   xmm0, xmm0           // 8 bits | 
|  | movd       ebx, xmm0 | 
|  | mov        [edi], bl | 
|  |  | 
|  | xloop99: | 
|  |  | 
|  | pop        edi | 
|  | pop        esi | 
|  | pop        ebx | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | // Doubles a row of 1 byte pixels horizontally: each pass reads 16 pixels, duplicates every byte and writes 32 pixels. x and dx are never read. | 
|  | __declspec(naked) | 
|  | void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, | 
|  | int dst_width, int x, int dx) { | 
|  | __asm { | 
|  | mov        edx, [esp + 4]    // dst_ptr: advanced 32 bytes per pass | 
|  | mov        eax, [esp + 8]    // src_ptr: advanced 16 bytes per pass | 
|  | mov        ecx, [esp + 12]   // dst_width: pixels remaining. Tested only after a pass, so one full 32 byte pass runs even for small widths. | 
|  |  | 
|  | wloop: | 
|  | movdqu     xmm0, [eax] | 
|  | lea        eax,  [eax + 16] | 
|  | movdqa     xmm1, xmm0 | 
|  | punpcklbw  xmm0, xmm0 | 
|  | punpckhbw  xmm1, xmm1 | 
|  | movdqu     [edx], xmm0 | 
|  | movdqu     [edx + 16], xmm1 | 
|  | lea        edx, [edx + 32] | 
|  | sub        ecx, 32 | 
|  | jg         wloop | 
|  |  | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | // Reads 8 ARGB pixels, throws half away and writes the 4 odd pixels (1, 3, 5, 7): shufps 0xdd keeps dwords 1 and 3 of each source register. | 
|  | __declspec(naked) | 
|  | void ScaleARGBRowDown2_SSE2(const uint8* src_argb, | 
|  | ptrdiff_t src_stride, | 
|  | uint8* dst_argb, int dst_width) { | 
|  | __asm { | 
|  | mov        eax, [esp + 4]        // src_argb: advanced 32 bytes (8 pixels) per pass | 
|  | // src_stride ignored: only one source row is read | 
|  | mov        edx, [esp + 12]       // dst_argb: advanced 16 bytes (4 pixels) per pass | 
|  | mov        ecx, [esp + 16]       // dst_width: output pixels remaining, tested only after each pass | 
|  |  | 
|  | wloop: | 
|  | movdqu     xmm0, [eax] | 
|  | movdqu     xmm1, [eax + 16] | 
|  | lea        eax,  [eax + 32] | 
|  | shufps     xmm0, xmm1, 0xdd | 
|  | movdqu     [edx], xmm0 | 
|  | lea        edx, [edx + 16] | 
|  | sub        ecx, 4 | 
|  | jg         wloop | 
|  |  | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | // Blends 8x1 rectangle to 4x1: each output ARGB pixel is the pavgb (rounded byte average) of an even source pixel and the odd pixel after it. | 
|  | __declspec(naked) | 
|  | void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, | 
|  | ptrdiff_t src_stride, | 
|  | uint8* dst_argb, int dst_width) { | 
|  | __asm { | 
|  | mov        eax, [esp + 4]        // src_argb: advanced 32 bytes (8 pixels) per pass | 
|  | // src_stride ignored: only one source row is read | 
|  | mov        edx, [esp + 12]       // dst_argb: advanced 16 bytes (4 pixels) per pass | 
|  | mov        ecx, [esp + 16]       // dst_width: output pixels remaining, tested only after each pass | 
|  |  | 
|  | wloop: | 
|  | movdqu     xmm0, [eax] | 
|  | movdqu     xmm1, [eax + 16] | 
|  | lea        eax,  [eax + 32] | 
|  | movdqa     xmm2, xmm0 | 
|  | shufps     xmm0, xmm1, 0x88      // even pixels (0, 2, 4, 6) | 
|  | shufps     xmm2, xmm1, 0xdd      // odd pixels (1, 3, 5, 7) | 
|  | pavgb      xmm0, xmm2 | 
|  | movdqu     [edx], xmm0 | 
|  | lea        edx, [edx + 16] | 
|  | sub        ecx, 4 | 
|  | jg         wloop | 
|  |  | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | // Blends 8x2 rectangle to 4x1: averages the row at src_argb with the row at src_argb + src_stride, then averages each even/odd pixel pair. Both steps use pavgb. | 
|  | __declspec(naked) | 
|  | void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, | 
|  | ptrdiff_t src_stride, | 
|  | uint8* dst_argb, int dst_width) { | 
|  | __asm { | 
|  | push       esi | 
|  | mov        eax, [esp + 4 + 4]    // src_argb: first source row, advanced 32 bytes per pass | 
|  | mov        esi, [esp + 4 + 8]    // src_stride: byte offset from the first source row to the second | 
|  | mov        edx, [esp + 4 + 12]   // dst_argb: advanced 16 bytes (4 pixels) per pass | 
|  | mov        ecx, [esp + 4 + 16]   // dst_width: output pixels remaining, tested only after each pass | 
|  |  | 
|  | wloop: | 
|  | movdqu     xmm0, [eax] | 
|  | movdqu     xmm1, [eax + 16] | 
|  | movdqu     xmm2, [eax + esi] | 
|  | movdqu     xmm3, [eax + esi + 16] | 
|  | lea        eax,  [eax + 32] | 
|  | pavgb      xmm0, xmm2            // average rows: pixels 0..3 | 
|  | pavgb      xmm1, xmm3 | 
|  | movdqa     xmm2, xmm0            // average columns (8 to 4 pixels) | 
|  | shufps     xmm0, xmm1, 0x88      // even pixels (0, 2, 4, 6) | 
|  | shufps     xmm2, xmm1, 0xdd      // odd pixels (1, 3, 5, 7) | 
|  | pavgb      xmm0, xmm2 | 
|  | movdqu     [edx], xmm0 | 
|  | lea        edx, [edx + 16] | 
|  | sub        ecx, 4 | 
|  | jg         wloop | 
|  |  | 
|  | pop        esi | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | // Point samples every src_stepx-th ARGB pixel. Each pass gathers 4 pixels (at 0, 1, 2 and 3 times src_stepx * 4 bytes from the current source position) and writes them with one 16 byte store. | 
|  | __declspec(naked) | 
|  | void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, | 
|  | int src_stepx, | 
|  | uint8* dst_argb, int dst_width) { | 
|  | __asm { | 
|  | push       ebx | 
|  | push       edi | 
|  | mov        eax, [esp + 8 + 4]    // src_argb: advanced src_stepx * 16 bytes per pass | 
|  | // src_stride ignored: only one source row is read | 
|  | mov        ebx, [esp + 8 + 12]   // src_stepx in pixels. The two lea below turn it into ebx = step in bytes and edi = 3 * step in bytes. | 
|  | mov        edx, [esp + 8 + 16]   // dst_argb: advanced 16 bytes (4 pixels) per pass | 
|  | mov        ecx, [esp + 8 + 20]   // dst_width: output pixels remaining, tested only after each pass | 
|  | lea        ebx, [ebx * 4] | 
|  | lea        edi, [ebx + ebx * 2] | 
|  |  | 
|  | wloop: | 
|  | movd       xmm0, [eax] | 
|  | movd       xmm1, [eax + ebx] | 
|  | punpckldq  xmm0, xmm1 | 
|  | movd       xmm2, [eax + ebx * 2] | 
|  | movd       xmm3, [eax + edi] | 
|  | lea        eax,  [eax + ebx * 4] | 
|  | punpckldq  xmm2, xmm3 | 
|  | punpcklqdq xmm0, xmm2 | 
|  | movdqu     [edx], xmm0 | 
|  | lea        edx, [edx + 16] | 
|  | sub        ecx, 4 | 
|  | jg         wloop | 
|  |  | 
|  | pop        edi | 
|  | pop        ebx | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | // Blends four 2x2 to 4x1: for each of 4 output pixels, reads a pair of ARGB pixels from two rows at a step of src_stepx pixels, averages the rows, then averages the pair. Both steps use pavgb. | 
|  | __declspec(naked) | 
|  | void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, | 
|  | ptrdiff_t src_stride, | 
|  | int src_stepx, | 
|  | uint8* dst_argb, int dst_width) { | 
|  | __asm { | 
|  | push       ebx | 
|  | push       esi | 
|  | push       edi | 
|  | mov        eax, [esp + 12 + 4]    // src_argb: row0 pointer | 
|  | mov        esi, [esp + 12 + 8]    // src_stride: byte offset from row0 to row1 | 
|  | mov        ebx, [esp + 12 + 12]   // src_stepx in pixels. The lea below turn it into ebx = step in bytes and edi = 3 * step in bytes. | 
|  | mov        edx, [esp + 12 + 16]   // dst_argb: advanced 16 bytes (4 pixels) per pass | 
|  | mov        ecx, [esp + 12 + 20]   // dst_width: output pixels remaining, tested only after each pass | 
|  | lea        esi, [eax + esi]       // row1 pointer | 
|  | lea        ebx, [ebx * 4] | 
|  | lea        edi, [ebx + ebx * 2] | 
|  |  | 
|  | wloop: | 
|  | movq       xmm0, qword ptr [eax]  // row0 4 pairs: each 8 byte load is 2 adjacent pixels | 
|  | movhps     xmm0, qword ptr [eax + ebx] | 
|  | movq       xmm1, qword ptr [eax + ebx * 2] | 
|  | movhps     xmm1, qword ptr [eax + edi] | 
|  | lea        eax,  [eax + ebx * 4] | 
|  | movq       xmm2, qword ptr [esi]  // row1 4 pairs | 
|  | movhps     xmm2, qword ptr [esi + ebx] | 
|  | movq       xmm3, qword ptr [esi + ebx * 2] | 
|  | movhps     xmm3, qword ptr [esi + edi] | 
|  | lea        esi,  [esi + ebx * 4] | 
|  | pavgb      xmm0, xmm2            // average rows | 
|  | pavgb      xmm1, xmm3 | 
|  | movdqa     xmm2, xmm0            // average columns (8 to 4 pixels) | 
|  | shufps     xmm0, xmm1, 0x88      // even pixels: first pixel of each pair | 
|  | shufps     xmm2, xmm1, 0xdd      // odd pixels: second pixel of each pair | 
|  | pavgb      xmm0, xmm2 | 
|  | movdqu     [edx], xmm0 | 
|  | lea        edx, [edx + 16] | 
|  | sub        ecx, 4 | 
|  | jg         wloop | 
|  |  | 
|  | pop        edi | 
|  | pop        esi | 
|  | pop        ebx | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | // Column scaling unfiltered (point sampled, no blending). SSE2 version. x and dx are 16.16 fixed point: the upper word of each 32 bit lane is used as the source pixel index. Handles 4 pixels per pass, then a 2 pixel and a 1 pixel remainder. | 
|  | __declspec(naked) | 
|  | void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, | 
|  | int dst_width, int x, int dx) { | 
|  | __asm { | 
|  | push       edi | 
|  | push       esi | 
|  | mov        edi, [esp + 8 + 4]    // dst_argb | 
|  | mov        esi, [esp + 8 + 8]    // src_argb | 
|  | mov        ecx, [esp + 8 + 12]   // dst_width | 
|  | movd       xmm2, [esp + 8 + 16]  // x: starting source position | 
|  | movd       xmm3, [esp + 8 + 20]  // dx: source step per output pixel | 
|  |  | 
|  | pshufd     xmm2, xmm2, 0         // x0 x0 x0 x0 | 
|  | pshufd     xmm0, xmm3, 0x11      // dx  0 dx  0 | 
|  | paddd      xmm2, xmm0 | 
|  | paddd      xmm3, xmm3            // 0, 0, 0,  dx * 2 | 
|  | pshufd     xmm0, xmm3, 0x05      // dx * 2, dx * 2, 0, 0 | 
|  | paddd      xmm2, xmm0            // x3 x2 x1 x0 | 
|  | paddd      xmm3, xmm3            // 0, 0, 0,  dx * 4 | 
|  | pshufd     xmm3, xmm3, 0         // dx * 4, dx * 4, dx * 4, dx * 4 | 
|  |  | 
|  | pextrw     eax, xmm2, 1          // get x0 integer. | 
|  | pextrw     edx, xmm2, 3          // get x1 integer. | 
|  |  | 
|  | cmp        ecx, 0 | 
|  | jle        xloop99 | 
|  | sub        ecx, 4 | 
|  | jl         xloop49 | 
|  |  | 
|  | // 4 Pixel loop. | 
|  | xloop4: | 
|  | movd       xmm0, [esi + eax * 4]  // 1 source x0 pixel | 
|  | movd       xmm1, [esi + edx * 4]  // 1 source x1 pixel | 
|  | pextrw     eax, xmm2, 5           // get x2 integer. | 
|  | pextrw     edx, xmm2, 7           // get x3 integer. | 
|  | paddd      xmm2, xmm3             // x += dx * 4 | 
|  | punpckldq  xmm0, xmm1             // x0 x1 | 
|  |  | 
|  | movd       xmm1, [esi + eax * 4]  // 1 source x2 pixel | 
|  | movd       xmm4, [esi + edx * 4]  // 1 source x3 pixel | 
|  | pextrw     eax, xmm2, 1           // get x0 integer. next iteration. | 
|  | pextrw     edx, xmm2, 3           // get x1 integer. next iteration. | 
|  | punpckldq  xmm1, xmm4             // x2 x3 | 
|  | punpcklqdq xmm0, xmm1             // x0 x1 x2 x3 | 
|  | movdqu     [edi], xmm0 | 
|  | lea        edi, [edi + 16] | 
|  | sub        ecx, 4                 // 4 pixels | 
|  | jge        xloop4 | 
|  |  | 
|  | xloop49: | 
|  | test       ecx, 2 | 
|  | je         xloop29 | 
|  |  | 
|  | // 2 Pixels. | 
|  | movd       xmm0, [esi + eax * 4]  // 1 source x0 pixel | 
|  | movd       xmm1, [esi + edx * 4]  // 1 source x1 pixel | 
|  | pextrw     eax, xmm2, 5           // get x2 integer for the 1 pixel remainder below. | 
|  | punpckldq  xmm0, xmm1             // x0 x1 | 
|  |  | 
|  | movq       qword ptr [edi], xmm0 | 
|  | lea        edi, [edi + 8] | 
|  |  | 
|  | xloop29: | 
|  | test       ecx, 1 | 
|  | je         xloop99 | 
|  |  | 
|  | // 1 Pixel. | 
|  | movd       xmm0, [esi + eax * 4]  // last source pixel: x2 if the 2 pixel step ran, otherwise x0 | 
|  | movd       dword ptr [edi], xmm0 | 
|  | xloop99: | 
|  |  | 
|  | pop        esi | 
|  | pop        edi | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | // Bilinear filtering within a row: blends 2 adjacent source pixels (2x1) into each output pixel (1x1). SSSE3 version. | 
|  | // TODO(fbarchard): Port to Neon | 
|  |  | 
|  | // Shuffle table for arranging 2 pixels into pairs for pmaddubsw: interleaves the 4 bytes of one pixel with the 4 bytes of the pixel after it, for each 8 byte half. | 
|  | static uvec8 kShuffleColARGB = { | 
|  | 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel: bytes 0..7 | 
|  | 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel: bytes 8..15 | 
|  | }; | 
|  |  | 
|  | // Shuffle table for duplicating 2 fractions into 8 bytes each: byte 0 fills bytes 0..7 and byte 4 fills bytes 8..15. | 
|  | static uvec8 kShuffleFractions = { | 
|  | 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, | 
|  | }; | 
|  |  | 
|  | __declspec(naked) | 
|  | void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, | 
|  | int dst_width, int x, int dx) { | 
|  | __asm { | 
|  | push       esi | 
|  | push       edi | 
|  | mov        edi, [esp + 8 + 4]    // dst_argb: output, 4 bytes per pixel | 
|  | mov        esi, [esp + 8 + 8]    // src_argb: source, indexed by the integer part of x times 4 | 
|  | mov        ecx, [esp + 8 + 12]   // dst_width: output pixels; 2 per loop pass plus a 1 pixel remainder | 
|  | movd       xmm2, [esp + 8 + 16]  // x: 16.16 fixed point source position | 
|  | movd       xmm3, [esp + 8 + 20]  // dx: 16.16 fixed point step per output pixel | 
|  | movdqa     xmm4, kShuffleColARGB | 
|  | movdqa     xmm5, kShuffleFractions | 
|  | pcmpeqb    xmm6, xmm6           // generate 0x007f in every word for inverting fraction. | 
|  | psrlw      xmm6, 9 | 
|  | pextrw     eax, xmm2, 1         // get x0 integer. preroll | 
|  | sub        ecx, 2 | 
|  | jl         xloop29 | 
|  |  | 
|  | movdqa     xmm0, xmm2           // x1 = x0 + dx | 
|  | paddd      xmm0, xmm3 | 
|  | punpckldq  xmm2, xmm0           // x0 x1 | 
|  | punpckldq  xmm3, xmm3           // dx dx | 
|  | paddd      xmm3, xmm3           // dx * 2, dx * 2 | 
|  | pextrw     edx, xmm2, 3         // get x1 integer. preroll | 
|  |  | 
|  | // 2 Pixel loop. | 
|  | xloop2: | 
|  | movdqa     xmm1, xmm2           // x0, x1 fractions. | 
|  | paddd      xmm2, xmm3           // x += dx * 2 | 
|  | movq       xmm0, qword ptr [esi + eax * 4]  // 2 adjacent source pixels at x0 | 
|  | psrlw      xmm1, 9              // 7 bit fractions. | 
|  | movhps     xmm0, qword ptr [esi + edx * 4]  // 2 adjacent source pixels at x1 | 
|  | pshufb     xmm1, xmm5           // 0000000011111111: x0 fraction in low 8 bytes, x1 fraction in high 8 | 
|  | pshufb     xmm0, xmm4           // arrange pixels into pairs | 
|  | pxor       xmm1, xmm6           // 0..7f and 7f..0: per byte pair, 0x7f - f for the first pixel and f for the second | 
|  | pmaddubsw  xmm0, xmm1           // argb_argb 16 bit, 2 pixels. | 
|  | pextrw     eax, xmm2, 1         // get x0 integer. next iteration. | 
|  | pextrw     edx, xmm2, 3         // get x1 integer. next iteration. | 
|  | psrlw      xmm0, 7              // argb 8.7 fixed point to low 8 bits. | 
|  | packuswb   xmm0, xmm0           // argb_argb 8 bits, 2 pixels. | 
|  | movq       qword ptr [edi], xmm0 | 
|  | lea        edi, [edi + 8] | 
|  | sub        ecx, 2               // 2 pixels | 
|  | jge        xloop2 | 
|  |  | 
|  | xloop29: | 
|  |  | 
|  | add        ecx, 2 - 1 | 
|  | jl         xloop99 | 
|  |  | 
|  | // 1 pixel remainder | 
|  | psrlw      xmm2, 9              // 7 bit fractions. | 
|  | movq       xmm0, qword ptr [esi + eax * 4]  // 2 adjacent source pixels at x0 | 
|  | pshufb     xmm2, xmm5           // 00000000: x0 fraction in low 8 bytes | 
|  | pshufb     xmm0, xmm4           // arrange pixels into pairs | 
|  | pxor       xmm2, xmm6           // 0..7f and 7f..0 | 
|  | pmaddubsw  xmm0, xmm2           // argb 16 bit, 1 pixel. | 
|  | psrlw      xmm0, 7 | 
|  | packuswb   xmm0, xmm0           // argb 8 bits, 1 pixel. | 
|  | movd       [edi], xmm0 | 
|  |  | 
|  | xloop99: | 
|  |  | 
|  | pop        edi | 
|  | pop        esi | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | // Doubles a row of ARGB pixels horizontally: each pass reads 4 pixels, duplicates every 4 byte pixel and writes 8 pixels. x and dx are never read. | 
|  | __declspec(naked) | 
|  | void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, | 
|  | int dst_width, int x, int dx) { | 
|  | __asm { | 
|  | mov        edx, [esp + 4]    // dst_argb: advanced 32 bytes (8 pixels) per pass | 
|  | mov        eax, [esp + 8]    // src_argb: advanced 16 bytes (4 pixels) per pass | 
|  | mov        ecx, [esp + 12]   // dst_width: pixels remaining. Tested only after a pass, so one full 8 pixel pass runs even for small widths. | 
|  |  | 
|  | wloop: | 
|  | movdqu     xmm0, [eax] | 
|  | lea        eax,  [eax + 16] | 
|  | movdqa     xmm1, xmm0 | 
|  | punpckldq  xmm0, xmm0 | 
|  | punpckhdq  xmm1, xmm1 | 
|  | movdqu     [edx], xmm0 | 
|  | movdqu     [edx + 16], xmm1 | 
|  | lea        edx, [edx + 32] | 
|  | sub        ecx, 8 | 
|  | jg         wloop | 
|  |  | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | // Divide num by div and return as 16.16 fixed point result: computes (num << 16) / div with a signed 64 bit dividend in edx:eax. The quotient is returned in eax; div is not checked before the idiv. | 
|  | __declspec(naked) | 
|  | int FixedDiv_X86(int num, int div) { | 
|  | __asm { | 
|  | mov        eax, [esp + 4]    // num | 
|  | cdq                          // extend num to 64 bits (sign extended into edx) | 
|  | shld       edx, eax, 16      // 32.16: edx:eax = num << 16, high half | 
|  | shl        eax, 16 | 
|  | idiv       dword ptr [esp + 8] | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | // Variant of the 16.16 fixed point divide: computes ((num << 16) - 0x00010001) / (div - 1) with a signed 64 bit dividend in edx:eax. The quotient is returned in eax; div - 1 is not checked before the idiv. | 
|  | __declspec(naked) | 
|  | int FixedDiv1_X86(int num, int div) { | 
|  | __asm { | 
|  | mov        eax, [esp + 4]    // num | 
|  | mov        ecx, [esp + 8]    // denom (the div parameter); decremented before the divide | 
|  | cdq                          // extend num to 64 bits (sign extended into edx) | 
|  | shld       edx, eax, 16      // 32.16: edx:eax = num << 16, high half | 
|  | shl        eax, 16 | 
|  | sub        eax, 0x00010001 | 
|  | sbb        edx, 0 | 
|  | sub        ecx, 1 | 
|  | idiv       ecx | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) | 
|  |  | 
|  | #ifdef __cplusplus | 
|  | }  // extern "C" | 
|  | }  // namespace libyuv | 
|  | #endif |