|  | /* | 
|  | *  Copyright 2011 The LibYuv Project Authors. All rights reserved. | 
|  | * | 
|  | *  Use of this source code is governed by a BSD-style license | 
|  | *  that can be found in the LICENSE file in the root of the source | 
|  | *  tree. An additional intellectual property rights grant can be found | 
|  | *  in the file PATENTS. All contributing project authors may | 
|  | *  be found in the AUTHORS file in the root of the source tree. | 
|  | */ | 
|  |  | 
|  | #include "libyuv/row.h" | 
|  |  | 
|  | #if !defined(LIBYUV_DISABLE_X86) && defined(_M_X64) && \ | 
|  | defined(_MSC_VER) && !defined(__clang__) | 
|  | #include <emmintrin.h> | 
|  | #include <tmmintrin.h>  // For _mm_maddubs_epi16 | 
|  | #endif | 
|  |  | 
|  | #ifdef __cplusplus | 
|  | namespace libyuv { | 
|  | extern "C" { | 
|  | #endif | 
|  |  | 
|  | // This module is for Visual C. | 
|  | #if !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) && \ | 
|  | defined(_MSC_VER) && !defined(__clang__) | 
|  |  | 
|  | struct YuvConstants { | 
|  | lvec8 kUVToB;     // 0 | 
|  | lvec8 kUVToG;     // 32 | 
|  | lvec8 kUVToR;     // 64 | 
|  | lvec16 kUVBiasB;  // 96 | 
|  | lvec16 kUVBiasG;  // 128 | 
|  | lvec16 kUVBiasR;  // 160 | 
|  | lvec16 kYToRgb;   // 192 | 
|  | }; | 
|  |  | 
|  | // BT.601 YUV to RGB reference | 
|  | //  R = (Y - 16) * 1.164              - V * -1.596 | 
|  | //  G = (Y - 16) * 1.164 - U *  0.391 - V *  0.813 | 
|  | //  B = (Y - 16) * 1.164 - U * -2.018 | 
|  |  | 
|  | // Y contribution to R,G,B.  Scale and bias. | 
|  | // TODO(fbarchard): Consider moving constants into a common header. | 
|  | #define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ | 
|  | #define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ | 
|  |  | 
|  | // U and V contributions to R,G,B. | 
|  | #define UB -128 /* max(-128, round(-2.018 * 64)) */ | 
|  | #define UG 25 /* round(0.391 * 64) */ | 
|  | #define VG 52 /* round(0.813 * 64) */ | 
|  | #define VR -102 /* round(-1.596 * 64) */ | 
|  |  | 
|  | // Bias values to subtract 16 from Y and 128 from U and V. | 
|  | #define BB (UB * 128            + YGB) | 
|  | #define BG (UG * 128 + VG * 128 + YGB) | 
|  | #define BR            (VR * 128 + YGB) | 
|  |  | 
|  | // BT601 constants for YUV to RGB. | 
|  | static YuvConstants SIMD_ALIGNED(kYuvConstants) = { | 
|  | { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, | 
|  | UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 }, | 
|  | { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, | 
|  | UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }, | 
|  | { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, | 
|  | 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR }, | 
|  | { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, | 
|  | { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, | 
|  | { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, | 
|  | { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } | 
|  | }; | 
|  |  | 
|  | // BT601 constants for NV21 where chroma plane is VU instead of UV. | 
|  | static YuvConstants SIMD_ALIGNED(kYvuConstants) = { | 
|  | { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, | 
|  | 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB }, | 
|  | { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, | 
|  | VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG }, | 
|  | { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, | 
|  | VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 }, | 
|  | { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, | 
|  | { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, | 
|  | { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, | 
|  | { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } | 
|  | }; | 
|  |  | 
|  | #undef YG | 
|  | #undef YGB | 
|  | #undef UB | 
|  | #undef UG | 
|  | #undef VG | 
|  | #undef VR | 
|  | #undef BB | 
|  | #undef BG | 
|  | #undef BR | 
|  |  | 
|  | // JPEG YUV to RGB reference | 
|  | // *  R = Y                - V * -1.40200 | 
|  | // *  G = Y - U *  0.34414 - V *  0.71414 | 
|  | // *  B = Y - U * -1.77200 | 
|  |  | 
|  | // Y contribution to R,G,B.  Scale and bias. | 
|  | // TODO(fbarchard): Consider moving constants into a common header. | 
|  | #define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ | 
|  | #define YGBJ 32  /* 64 / 2 */ | 
|  |  | 
|  | // U and V contributions to R,G,B. | 
|  | #define UBJ -113 /* round(-1.77200 * 64) */ | 
|  | #define UGJ 22 /* round(0.34414 * 64) */ | 
|  | #define VGJ 46 /* round(0.71414  * 64) */ | 
|  | #define VRJ -90 /* round(-1.40200 * 64) */ | 
|  |  | 
|  | // Bias values to subtract 16 from Y and 128 from U and V. | 
|  | #define BBJ (UBJ * 128             + YGBJ) | 
|  | #define BGJ (UGJ * 128 + VGJ * 128 + YGBJ) | 
|  | #define BRJ             (VRJ * 128 + YGBJ) | 
|  |  | 
|  | // JPEG constants for YUV to RGB. | 
|  | static YuvConstants SIMD_ALIGNED(kYuvJConstants) = { | 
|  | { UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, | 
|  | UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 }, | 
|  | { UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, | 
|  | UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, | 
|  | UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, | 
|  | UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ }, | 
|  | { 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, | 
|  | 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ }, | 
|  | { BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, | 
|  | BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ }, | 
|  | { BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, | 
|  | BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ }, | 
|  | { BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, | 
|  | BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ }, | 
|  | { YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, | 
|  | YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ } | 
|  | }; | 
|  |  | 
|  | #undef YGJ | 
|  | #undef YGBJ | 
|  | #undef UBJ | 
|  | #undef UGJ | 
|  | #undef VGJ | 
|  | #undef VRJ | 
|  | #undef BBJ | 
|  | #undef BGJ | 
|  | #undef BRJ | 
|  |  | 
|  | // 64 bit | 
|  | #if defined(_M_X64) | 
|  | #if defined(HAS_I422TOARGBROW_SSSE3) | 
|  | void I422ToARGBRow_SSSE3(const uint8* y_buf, | 
|  | const uint8* u_buf, | 
|  | const uint8* v_buf, | 
|  | uint8* dst_argb, | 
|  | int width) { | 
|  | __m128i xmm0, xmm1, xmm2, xmm3; | 
|  | const __m128i xmm5 = _mm_set1_epi8(-1); | 
|  | const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf; | 
|  |  | 
|  | while (width > 0) { | 
|  | xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); | 
|  | xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); | 
|  | xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); | 
|  | xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); | 
|  | xmm1 = _mm_loadu_si128(&xmm0); | 
|  | xmm2 = _mm_loadu_si128(&xmm0); | 
|  | xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kYuvConstants.kUVToB); | 
|  | xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kYuvConstants.kUVToG); | 
|  | xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kYuvConstants.kUVToR); | 
|  | xmm0 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasB, xmm0); | 
|  | xmm1 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasG, xmm1); | 
|  | xmm2 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasR, xmm2); | 
|  | xmm3 = _mm_loadl_epi64((__m128i*)y_buf); | 
|  | xmm3 = _mm_unpacklo_epi8(xmm3, xmm3); | 
|  | xmm3 = _mm_mulhi_epu16(xmm3, *(__m128i*)kYuvConstants.kYToRgb); | 
|  | xmm0 = _mm_adds_epi16(xmm0, xmm3); | 
|  | xmm1 = _mm_adds_epi16(xmm1, xmm3); | 
|  | xmm2 = _mm_adds_epi16(xmm2, xmm3); | 
|  | xmm0 = _mm_srai_epi16(xmm0, 6); | 
|  | xmm1 = _mm_srai_epi16(xmm1, 6); | 
|  | xmm2 = _mm_srai_epi16(xmm2, 6); | 
|  | xmm0 = _mm_packus_epi16(xmm0, xmm0); | 
|  | xmm1 = _mm_packus_epi16(xmm1, xmm1); | 
|  | xmm2 = _mm_packus_epi16(xmm2, xmm2); | 
|  | xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); | 
|  | xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); | 
|  | xmm1 = _mm_loadu_si128(&xmm0); | 
|  | xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); | 
|  | xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); | 
|  |  | 
|  | _mm_storeu_si128((__m128i *)dst_argb, xmm0); | 
|  | _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1); | 
|  |  | 
|  | y_buf += 8; | 
|  | u_buf += 4; | 
|  | dst_argb += 32; | 
|  | width -= 8; | 
|  | } | 
|  | } | 
|  | #endif | 
|  | // 32 bit | 
|  | #else  // defined(_M_X64) | 
|  | #ifdef HAS_ARGBTOYROW_SSSE3 | 
|  |  | 
|  | // Constants for ARGB. | 
|  | static const vec8 kARGBToY = { | 
|  | 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 | 
|  | }; | 
|  |  | 
|  | // JPeg full range. | 
|  | static const vec8 kARGBToYJ = { | 
|  | 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0 | 
|  | }; | 
|  |  | 
|  | static const vec8 kARGBToU = { | 
|  | 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 | 
|  | }; | 
|  |  | 
|  | static const vec8 kARGBToUJ = { | 
|  | 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0 | 
|  | }; | 
|  |  | 
|  | static const vec8 kARGBToV = { | 
|  | -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, | 
|  | }; | 
|  |  | 
|  | static const vec8 kARGBToVJ = { | 
|  | -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0 | 
|  | }; | 
|  |  | 
|  | // vpshufb for vphaddw + vpackuswb packed to shorts. | 
|  | static const lvec8 kShufARGBToUV_AVX = { | 
|  | 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, | 
|  | 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 | 
|  | }; | 
|  |  | 
|  | // Constants for BGRA. | 
|  | static const vec8 kBGRAToY = { | 
|  | 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13 | 
|  | }; | 
|  |  | 
|  | static const vec8 kBGRAToU = { | 
|  | 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112 | 
|  | }; | 
|  |  | 
|  | static const vec8 kBGRAToV = { | 
|  | 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18 | 
|  | }; | 
|  |  | 
|  | // Constants for ABGR. | 
|  | static const vec8 kABGRToY = { | 
|  | 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0 | 
|  | }; | 
|  |  | 
|  | static const vec8 kABGRToU = { | 
|  | -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0 | 
|  | }; | 
|  |  | 
|  | static const vec8 kABGRToV = { | 
|  | 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0 | 
|  | }; | 
|  |  | 
|  | // Constants for RGBA. | 
|  | static const vec8 kRGBAToY = { | 
|  | 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33 | 
|  | }; | 
|  |  | 
|  | static const vec8 kRGBAToU = { | 
|  | 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38 | 
|  | }; | 
|  |  | 
|  | static const vec8 kRGBAToV = { | 
|  | 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112 | 
|  | }; | 
|  |  | 
|  | static const uvec8 kAddY16 = { | 
|  | 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u | 
|  | }; | 
|  |  | 
|  | // 7 bit fixed point 0.5. | 
|  | static const vec16 kAddYJ64 = { | 
|  | 64, 64, 64, 64, 64, 64, 64, 64 | 
|  | }; | 
|  |  | 
|  | static const uvec8 kAddUV128 = { | 
|  | 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, | 
|  | 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u | 
|  | }; | 
|  |  | 
|  | static const uvec16 kAddUVJ128 = { | 
|  | 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u | 
|  | }; | 
|  |  | 
|  | // Shuffle table for converting RGB24 to ARGB. | 
|  | static const uvec8 kShuffleMaskRGB24ToARGB = { | 
|  | 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u | 
|  | }; | 
|  |  | 
|  | // Shuffle table for converting RAW to ARGB. | 
|  | static const uvec8 kShuffleMaskRAWToARGB = { | 
|  | 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u | 
|  | }; | 
|  |  | 
|  | // Shuffle table for converting ARGB to RGB24. | 
|  | static const uvec8 kShuffleMaskARGBToRGB24 = { | 
|  | 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u | 
|  | }; | 
|  |  | 
|  | // Shuffle table for converting ARGB to RAW. | 
|  | static const uvec8 kShuffleMaskARGBToRAW = { | 
|  | 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u | 
|  | }; | 
|  |  | 
|  | // Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4 | 
|  | static const uvec8 kShuffleMaskARGBToRGB24_0 = { | 
|  | 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u | 
|  | }; | 
|  |  | 
|  | // Shuffle table for converting ARGB to RAW. | 
|  | static const uvec8 kShuffleMaskARGBToRAW_0 = { | 
|  | 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u | 
|  | }; | 
|  |  | 
|  | // Duplicates gray value 3 times and fills in alpha opaque. | 
|  | __declspec(naked) | 
|  | void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { | 
|  | __asm { | 
|  | mov        eax, [esp + 4]        // src_y | 
|  | mov        edx, [esp + 8]        // dst_argb | 
|  | mov        ecx, [esp + 12]       // pix | 
|  | pcmpeqb    xmm5, xmm5            // generate mask 0xff000000 | 
|  | pslld      xmm5, 24 | 
|  |  | 
|  | convertloop: | 
|  | movq       xmm0, qword ptr [eax] | 
|  | lea        eax,  [eax + 8] | 
|  | punpcklbw  xmm0, xmm0 | 
|  | movdqa     xmm1, xmm0 | 
|  | punpcklwd  xmm0, xmm0 | 
|  | punpckhwd  xmm1, xmm1 | 
|  | por        xmm0, xmm5 | 
|  | por        xmm1, xmm5 | 
|  | movdqu     [edx], xmm0 | 
|  | movdqu     [edx + 16], xmm1 | 
|  | lea        edx, [edx + 32] | 
|  | sub        ecx, 8 | 
|  | jg         convertloop | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | #ifdef HAS_J400TOARGBROW_AVX2 | 
|  | // Duplicates gray value 3 times and fills in alpha opaque. | 
|  | __declspec(naked) | 
|  | void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix) { | 
|  | __asm { | 
|  | mov         eax, [esp + 4]        // src_y | 
|  | mov         edx, [esp + 8]        // dst_argb | 
|  | mov         ecx, [esp + 12]       // pix | 
|  | vpcmpeqb    ymm5, ymm5, ymm5      // generate mask 0xff000000 | 
|  | vpslld      ymm5, ymm5, 24 | 
|  |  | 
|  | convertloop: | 
|  | vmovdqu     xmm0, [eax] | 
|  | lea         eax,  [eax + 16] | 
|  | vpermq      ymm0, ymm0, 0xd8 | 
|  | vpunpcklbw  ymm0, ymm0, ymm0 | 
|  | vpermq      ymm0, ymm0, 0xd8 | 
|  | vpunpckhwd  ymm1, ymm0, ymm0 | 
|  | vpunpcklwd  ymm0, ymm0, ymm0 | 
|  | vpor        ymm0, ymm0, ymm5 | 
|  | vpor        ymm1, ymm1, ymm5 | 
|  | vmovdqu     [edx], ymm0 | 
|  | vmovdqu     [edx + 32], ymm1 | 
|  | lea         edx, [edx + 64] | 
|  | sub         ecx, 16 | 
|  | jg          convertloop | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_J400TOARGBROW_AVX2 | 
|  |  | 
|  | __declspec(naked) | 
|  | void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { | 
|  | __asm { | 
|  | mov       eax, [esp + 4]   // src_rgb24 | 
|  | mov       edx, [esp + 8]   // dst_argb | 
|  | mov       ecx, [esp + 12]  // pix | 
|  | pcmpeqb   xmm5, xmm5       // generate mask 0xff000000 | 
|  | pslld     xmm5, 24 | 
|  | movdqa    xmm4, kShuffleMaskRGB24ToARGB | 
|  |  | 
|  | convertloop: | 
|  | movdqu    xmm0, [eax] | 
|  | movdqu    xmm1, [eax + 16] | 
|  | movdqu    xmm3, [eax + 32] | 
|  | lea       eax, [eax + 48] | 
|  | movdqa    xmm2, xmm3 | 
|  | palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]} | 
|  | pshufb    xmm2, xmm4 | 
|  | por       xmm2, xmm5 | 
|  | palignr   xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]} | 
|  | pshufb    xmm0, xmm4 | 
|  | movdqu    [edx + 32], xmm2 | 
|  | por       xmm0, xmm5 | 
|  | pshufb    xmm1, xmm4 | 
|  | movdqu    [edx], xmm0 | 
|  | por       xmm1, xmm5 | 
|  | palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]} | 
|  | pshufb    xmm3, xmm4 | 
|  | movdqu    [edx + 16], xmm1 | 
|  | por       xmm3, xmm5 | 
|  | movdqu    [edx + 48], xmm3 | 
|  | lea       edx, [edx + 64] | 
|  | sub       ecx, 16 | 
|  | jg        convertloop | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | __declspec(naked) | 
|  | void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, | 
|  | int pix) { | 
|  | __asm { | 
|  | mov       eax, [esp + 4]   // src_raw | 
|  | mov       edx, [esp + 8]   // dst_argb | 
|  | mov       ecx, [esp + 12]  // pix | 
|  | pcmpeqb   xmm5, xmm5       // generate mask 0xff000000 | 
|  | pslld     xmm5, 24 | 
|  | movdqa    xmm4, kShuffleMaskRAWToARGB | 
|  |  | 
|  | convertloop: | 
|  | movdqu    xmm0, [eax] | 
|  | movdqu    xmm1, [eax + 16] | 
|  | movdqu    xmm3, [eax + 32] | 
|  | lea       eax, [eax + 48] | 
|  | movdqa    xmm2, xmm3 | 
|  | palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]} | 
|  | pshufb    xmm2, xmm4 | 
|  | por       xmm2, xmm5 | 
|  | palignr   xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]} | 
|  | pshufb    xmm0, xmm4 | 
|  | movdqu    [edx + 32], xmm2 | 
|  | por       xmm0, xmm5 | 
|  | pshufb    xmm1, xmm4 | 
|  | movdqu    [edx], xmm0 | 
|  | por       xmm1, xmm5 | 
|  | palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]} | 
|  | pshufb    xmm3, xmm4 | 
|  | movdqu    [edx + 16], xmm1 | 
|  | por       xmm3, xmm5 | 
|  | movdqu    [edx + 48], xmm3 | 
|  | lea       edx, [edx + 64] | 
|  | sub       ecx, 16 | 
|  | jg        convertloop | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | // pmul method to replicate bits. | 
|  | // Math to replicate bits: | 
|  | // (v << 8) | (v << 3) | 
|  | // v * 256 + v * 8 | 
|  | // v * (256 + 8) | 
|  | // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 | 
|  | // 20 instructions. | 
|  | __declspec(naked) | 
|  | void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, | 
|  | int pix) { | 
|  | __asm { | 
|  | mov       eax, 0x01080108  // generate multiplier to repeat 5 bits | 
|  | movd      xmm5, eax | 
|  | pshufd    xmm5, xmm5, 0 | 
|  | mov       eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits | 
|  | movd      xmm6, eax | 
|  | pshufd    xmm6, xmm6, 0 | 
|  | pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red | 
|  | psllw     xmm3, 11 | 
|  | pcmpeqb   xmm4, xmm4       // generate mask 0x07e007e0 for Green | 
|  | psllw     xmm4, 10 | 
|  | psrlw     xmm4, 5 | 
|  | pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha | 
|  | psllw     xmm7, 8 | 
|  |  | 
|  | mov       eax, [esp + 4]   // src_rgb565 | 
|  | mov       edx, [esp + 8]   // dst_argb | 
|  | mov       ecx, [esp + 12]  // pix | 
|  | sub       edx, eax | 
|  | sub       edx, eax | 
|  |  | 
|  | convertloop: | 
|  | movdqu    xmm0, [eax]   // fetch 8 pixels of bgr565 | 
|  | movdqa    xmm1, xmm0 | 
|  | movdqa    xmm2, xmm0 | 
|  | pand      xmm1, xmm3    // R in upper 5 bits | 
|  | psllw     xmm2, 11      // B in upper 5 bits | 
|  | pmulhuw   xmm1, xmm5    // * (256 + 8) | 
|  | pmulhuw   xmm2, xmm5    // * (256 + 8) | 
|  | psllw     xmm1, 8 | 
|  | por       xmm1, xmm2    // RB | 
|  | pand      xmm0, xmm4    // G in middle 6 bits | 
|  | pmulhuw   xmm0, xmm6    // << 5 * (256 + 4) | 
|  | por       xmm0, xmm7    // AG | 
|  | movdqa    xmm2, xmm1 | 
|  | punpcklbw xmm1, xmm0 | 
|  | punpckhbw xmm2, xmm0 | 
|  | movdqu    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB | 
|  | movdqu    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB | 
|  | lea       eax, [eax + 16] | 
|  | sub       ecx, 8 | 
|  | jg        convertloop | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | #ifdef HAS_RGB565TOARGBROW_AVX2 | 
|  | // pmul method to replicate bits. | 
|  | // Math to replicate bits: | 
|  | // (v << 8) | (v << 3) | 
|  | // v * 256 + v * 8 | 
|  | // v * (256 + 8) | 
|  | // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 | 
|  | __declspec(naked) | 
|  | void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, | 
|  | int pix) { | 
|  | __asm { | 
|  | mov        eax, 0x01080108  // generate multiplier to repeat 5 bits | 
|  | vmovd      xmm5, eax | 
|  | vbroadcastss ymm5, xmm5 | 
|  | mov        eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits | 
|  | movd       xmm6, eax | 
|  | vbroadcastss ymm6, xmm6 | 
|  | vpcmpeqb   ymm3, ymm3, ymm3       // generate mask 0xf800f800 for Red | 
|  | vpsllw     ymm3, ymm3, 11 | 
|  | vpcmpeqb   ymm4, ymm4, ymm4       // generate mask 0x07e007e0 for Green | 
|  | vpsllw     ymm4, ymm4, 10 | 
|  | vpsrlw     ymm4, ymm4, 5 | 
|  | vpcmpeqb   ymm7, ymm7, ymm7       // generate mask 0xff00ff00 for Alpha | 
|  | vpsllw     ymm7, ymm7, 8 | 
|  |  | 
|  | mov        eax, [esp + 4]   // src_rgb565 | 
|  | mov        edx, [esp + 8]   // dst_argb | 
|  | mov        ecx, [esp + 12]  // pix | 
|  | sub        edx, eax | 
|  | sub        edx, eax | 
|  |  | 
|  | convertloop: | 
|  | vmovdqu    ymm0, [eax]   // fetch 16 pixels of bgr565 | 
|  | vpand      ymm1, ymm0, ymm3    // R in upper 5 bits | 
|  | vpsllw     ymm2, ymm0, 11      // B in upper 5 bits | 
|  | vpmulhuw   ymm1, ymm1, ymm5    // * (256 + 8) | 
|  | vpmulhuw   ymm2, ymm2, ymm5    // * (256 + 8) | 
|  | vpsllw     ymm1, ymm1, 8 | 
|  | vpor       ymm1, ymm1, ymm2    // RB | 
|  | vpand      ymm0, ymm0, ymm4    // G in middle 6 bits | 
|  | vpmulhuw   ymm0, ymm0, ymm6    // << 5 * (256 + 4) | 
|  | vpor       ymm0, ymm0, ymm7    // AG | 
|  | vpermq     ymm0, ymm0, 0xd8    // mutate for unpack | 
|  | vpermq     ymm1, ymm1, 0xd8 | 
|  | vpunpckhbw ymm2, ymm1, ymm0 | 
|  | vpunpcklbw ymm1, ymm1, ymm0 | 
|  | vmovdqu    [eax * 2 + edx], ymm1  // store 4 pixels of ARGB | 
|  | vmovdqu    [eax * 2 + edx + 32], ymm2  // store next 4 pixels of ARGB | 
|  | lea       eax, [eax + 32] | 
|  | sub       ecx, 16 | 
|  | jg        convertloop | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_RGB565TOARGBROW_AVX2 | 
|  |  | 
|  | #ifdef HAS_ARGB1555TOARGBROW_AVX2 | 
|  | __declspec(naked) | 
|  | void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb, | 
|  | int pix) { | 
|  | __asm { | 
|  | mov        eax, 0x01080108  // generate multiplier to repeat 5 bits | 
|  | vmovd      xmm5, eax | 
|  | vbroadcastss ymm5, xmm5 | 
|  | mov        eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits | 
|  | movd       xmm6, eax | 
|  | vbroadcastss ymm6, xmm6 | 
|  | vpcmpeqb   ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red | 
|  | vpsllw     ymm3, ymm3, 11 | 
|  | vpsrlw     ymm4, ymm3, 6    // generate mask 0x03e003e0 for Green | 
|  | vpcmpeqb   ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha | 
|  | vpsllw     ymm7, ymm7, 8 | 
|  |  | 
|  | mov        eax,  [esp + 4]   // src_argb1555 | 
|  | mov        edx,  [esp + 8]   // dst_argb | 
|  | mov        ecx,  [esp + 12]  // pix | 
|  | sub        edx,  eax | 
|  | sub        edx,  eax | 
|  |  | 
|  | convertloop: | 
|  | vmovdqu    ymm0, [eax]         // fetch 16 pixels of 1555 | 
|  | vpsllw     ymm1, ymm0, 1       // R in upper 5 bits | 
|  | vpsllw     ymm2, ymm0, 11      // B in upper 5 bits | 
|  | vpand      ymm1, ymm1, ymm3 | 
|  | vpmulhuw   ymm2, ymm2, ymm5    // * (256 + 8) | 
|  | vpmulhuw   ymm1, ymm1, ymm5    // * (256 + 8) | 
|  | vpsllw     ymm1, ymm1, 8 | 
|  | vpor       ymm1, ymm1, ymm2    // RB | 
|  | vpsraw     ymm2, ymm0, 8       // A | 
|  | vpand      ymm0, ymm0, ymm4    // G in middle 5 bits | 
|  | vpmulhuw   ymm0, ymm0, ymm6    // << 6 * (256 + 8) | 
|  | vpand      ymm2, ymm2, ymm7 | 
|  | vpor       ymm0, ymm0, ymm2    // AG | 
|  | vpermq     ymm0, ymm0, 0xd8    // mutate for unpack | 
|  | vpermq     ymm1, ymm1, 0xd8 | 
|  | vpunpckhbw ymm2, ymm1, ymm0 | 
|  | vpunpcklbw ymm1, ymm1, ymm0 | 
|  | vmovdqu    [eax * 2 + edx], ymm1  // store 8 pixels of ARGB | 
|  | vmovdqu    [eax * 2 + edx + 32], ymm2  // store next 8 pixels of ARGB | 
|  | lea       eax, [eax + 32] | 
|  | sub       ecx, 16 | 
|  | jg        convertloop | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_ARGB1555TOARGBROW_AVX2 | 
|  |  | 
|  | #ifdef HAS_ARGB4444TOARGBROW_AVX2 | 
|  | __declspec(naked) | 
|  | void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb, | 
|  | int pix) { | 
|  | __asm { | 
|  | mov       eax,  0x0f0f0f0f  // generate mask 0x0f0f0f0f | 
|  | vmovd     xmm4, eax | 
|  | vbroadcastss ymm4, xmm4 | 
|  | vpslld    ymm5, ymm4, 4     // 0xf0f0f0f0 for high nibbles | 
|  | mov       eax,  [esp + 4]   // src_argb4444 | 
|  | mov       edx,  [esp + 8]   // dst_argb | 
|  | mov       ecx,  [esp + 12]  // pix | 
|  | sub       edx,  eax | 
|  | sub       edx,  eax | 
|  |  | 
|  | convertloop: | 
|  | vmovdqu    ymm0, [eax]         // fetch 16 pixels of bgra4444 | 
|  | vpand      ymm2, ymm0, ymm5    // mask high nibbles | 
|  | vpand      ymm0, ymm0, ymm4    // mask low nibbles | 
|  | vpsrlw     ymm3, ymm2, 4 | 
|  | vpsllw     ymm1, ymm0, 4 | 
|  | vpor       ymm2, ymm2, ymm3 | 
|  | vpor       ymm0, ymm0, ymm1 | 
|  | vpermq     ymm0, ymm0, 0xd8    // mutate for unpack | 
|  | vpermq     ymm2, ymm2, 0xd8 | 
|  | vpunpckhbw ymm1, ymm0, ymm2 | 
|  | vpunpcklbw ymm0, ymm0, ymm2 | 
|  | vmovdqu    [eax * 2 + edx], ymm0  // store 8 pixels of ARGB | 
|  | vmovdqu    [eax * 2 + edx + 32], ymm1  // store next 8 pixels of ARGB | 
|  | lea       eax, [eax + 32] | 
|  | sub       ecx, 16 | 
|  | jg        convertloop | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_ARGB4444TOARGBROW_AVX2 | 
|  |  | 
|  | // 24 instructions | 
|  | __declspec(naked) | 
|  | void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, | 
|  | int pix) { | 
|  | __asm { | 
|  | mov       eax, 0x01080108  // generate multiplier to repeat 5 bits | 
|  | movd      xmm5, eax | 
|  | pshufd    xmm5, xmm5, 0 | 
|  | mov       eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits | 
|  | movd      xmm6, eax | 
|  | pshufd    xmm6, xmm6, 0 | 
|  | pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red | 
|  | psllw     xmm3, 11 | 
|  | movdqa    xmm4, xmm3       // generate mask 0x03e003e0 for Green | 
|  | psrlw     xmm4, 6 | 
|  | pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha | 
|  | psllw     xmm7, 8 | 
|  |  | 
|  | mov       eax, [esp + 4]   // src_argb1555 | 
|  | mov       edx, [esp + 8]   // dst_argb | 
|  | mov       ecx, [esp + 12]  // pix | 
|  | sub       edx, eax | 
|  | sub       edx, eax | 
|  |  | 
|  | convertloop: | 
|  | movdqu    xmm0, [eax]   // fetch 8 pixels of 1555 | 
|  | movdqa    xmm1, xmm0 | 
|  | movdqa    xmm2, xmm0 | 
|  | psllw     xmm1, 1       // R in upper 5 bits | 
|  | psllw     xmm2, 11      // B in upper 5 bits | 
|  | pand      xmm1, xmm3 | 
|  | pmulhuw   xmm2, xmm5    // * (256 + 8) | 
|  | pmulhuw   xmm1, xmm5    // * (256 + 8) | 
|  | psllw     xmm1, 8 | 
|  | por       xmm1, xmm2    // RB | 
|  | movdqa    xmm2, xmm0 | 
|  | pand      xmm0, xmm4    // G in middle 5 bits | 
|  | psraw     xmm2, 8       // A | 
|  | pmulhuw   xmm0, xmm6    // << 6 * (256 + 8) | 
|  | pand      xmm2, xmm7 | 
|  | por       xmm0, xmm2    // AG | 
|  | movdqa    xmm2, xmm1 | 
|  | punpcklbw xmm1, xmm0 | 
|  | punpckhbw xmm2, xmm0 | 
|  | movdqu    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB | 
|  | movdqu    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB | 
|  | lea       eax, [eax + 16] | 
|  | sub       ecx, 8 | 
|  | jg        convertloop | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | // 18 instructions. | 
|  | __declspec(naked) | 
|  | void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, | 
|  | int pix) { | 
|  | __asm { | 
|  | mov       eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f | 
|  | movd      xmm4, eax | 
|  | pshufd    xmm4, xmm4, 0 | 
|  | movdqa    xmm5, xmm4       // 0xf0f0f0f0 for high nibbles | 
|  | pslld     xmm5, 4 | 
|  | mov       eax, [esp + 4]   // src_argb4444 | 
|  | mov       edx, [esp + 8]   // dst_argb | 
|  | mov       ecx, [esp + 12]  // pix | 
|  | sub       edx, eax | 
|  | sub       edx, eax | 
|  |  | 
|  | convertloop: | 
|  | movdqu    xmm0, [eax]   // fetch 8 pixels of bgra4444 | 
|  | movdqa    xmm2, xmm0 | 
|  | pand      xmm0, xmm4    // mask low nibbles | 
|  | pand      xmm2, xmm5    // mask high nibbles | 
|  | movdqa    xmm1, xmm0 | 
|  | movdqa    xmm3, xmm2 | 
|  | psllw     xmm1, 4 | 
|  | psrlw     xmm3, 4 | 
|  | por       xmm0, xmm1 | 
|  | por       xmm2, xmm3 | 
|  | movdqa    xmm1, xmm0 | 
|  | punpcklbw xmm0, xmm2 | 
|  | punpckhbw xmm1, xmm2 | 
|  | movdqu    [eax * 2 + edx], xmm0  // store 4 pixels of ARGB | 
|  | movdqu    [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB | 
|  | lea       eax, [eax + 16] | 
|  | sub       ecx, 8 | 
|  | jg        convertloop | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | __declspec(naked) | 
|  | void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { | 
|  | __asm { | 
|  | mov       eax, [esp + 4]   // src_argb | 
|  | mov       edx, [esp + 8]   // dst_rgb | 
|  | mov       ecx, [esp + 12]  // pix | 
|  | movdqa    xmm6, kShuffleMaskARGBToRGB24 | 
|  |  | 
|  | convertloop: | 
|  | movdqu    xmm0, [eax]   // fetch 16 pixels of argb | 
|  | movdqu    xmm1, [eax + 16] | 
|  | movdqu    xmm2, [eax + 32] | 
|  | movdqu    xmm3, [eax + 48] | 
|  | lea       eax, [eax + 64] | 
|  | pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB | 
|  | pshufb    xmm1, xmm6 | 
|  | pshufb    xmm2, xmm6 | 
|  | pshufb    xmm3, xmm6 | 
|  | movdqa    xmm4, xmm1   // 4 bytes from 1 for 0 | 
|  | psrldq    xmm1, 4      // 8 bytes from 1 | 
|  | pslldq    xmm4, 12     // 4 bytes from 1 for 0 | 
|  | movdqa    xmm5, xmm2   // 8 bytes from 2 for 1 | 
|  | por       xmm0, xmm4   // 4 bytes from 1 for 0 | 
|  | pslldq    xmm5, 8      // 8 bytes from 2 for 1 | 
|  | movdqu    [edx], xmm0  // store 0 | 
|  | por       xmm1, xmm5   // 8 bytes from 2 for 1 | 
|  | psrldq    xmm2, 8      // 4 bytes from 2 | 
|  | pslldq    xmm3, 4      // 12 bytes from 3 for 2 | 
|  | por       xmm2, xmm3   // 12 bytes from 3 for 2 | 
|  | movdqu    [edx + 16], xmm1   // store 1 | 
|  | movdqu    [edx + 32], xmm2   // store 2 | 
|  | lea       edx, [edx + 48] | 
|  | sub       ecx, 16 | 
|  | jg        convertloop | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | __declspec(naked) | 
|  | void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { | 
|  | __asm { | 
|  | mov       eax, [esp + 4]   // src_argb | 
|  | mov       edx, [esp + 8]   // dst_rgb | 
|  | mov       ecx, [esp + 12]  // pix | 
|  | movdqa    xmm6, kShuffleMaskARGBToRAW | 
|  |  | 
|  | convertloop: | 
|  | movdqu    xmm0, [eax]   // fetch 16 pixels of argb | 
|  | movdqu    xmm1, [eax + 16] | 
|  | movdqu    xmm2, [eax + 32] | 
|  | movdqu    xmm3, [eax + 48] | 
|  | lea       eax, [eax + 64] | 
|  | pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB | 
|  | pshufb    xmm1, xmm6 | 
|  | pshufb    xmm2, xmm6 | 
|  | pshufb    xmm3, xmm6 | 
|  | movdqa    xmm4, xmm1   // 4 bytes from 1 for 0 | 
|  | psrldq    xmm1, 4      // 8 bytes from 1 | 
|  | pslldq    xmm4, 12     // 4 bytes from 1 for 0 | 
|  | movdqa    xmm5, xmm2   // 8 bytes from 2 for 1 | 
|  | por       xmm0, xmm4   // 4 bytes from 1 for 0 | 
|  | pslldq    xmm5, 8      // 8 bytes from 2 for 1 | 
|  | movdqu    [edx], xmm0  // store 0 | 
|  | por       xmm1, xmm5   // 8 bytes from 2 for 1 | 
|  | psrldq    xmm2, 8      // 4 bytes from 2 | 
|  | pslldq    xmm3, 4      // 12 bytes from 3 for 2 | 
|  | por       xmm2, xmm3   // 12 bytes from 3 for 2 | 
|  | movdqu    [edx + 16], xmm1   // store 1 | 
|  | movdqu    [edx + 32], xmm2   // store 2 | 
|  | lea       edx, [edx + 48] | 
|  | sub       ecx, 16 | 
|  | jg        convertloop | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | // 4 pixels | 
|  | __declspec(naked) | 
|  | void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { | 
|  | __asm { | 
|  | mov       eax, [esp + 4]   // src_argb | 
|  | mov       edx, [esp + 8]   // dst_rgb | 
|  | mov       ecx, [esp + 12]  // pix | 
|  | pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f | 
|  | psrld     xmm3, 27 | 
|  | pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0 | 
|  | psrld     xmm4, 26 | 
|  | pslld     xmm4, 5 | 
|  | pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800 | 
|  | pslld     xmm5, 11 | 
|  |  | 
|  | convertloop: | 
|  | movdqu    xmm0, [eax]   // fetch 4 pixels of argb | 
|  | movdqa    xmm1, xmm0    // B | 
|  | movdqa    xmm2, xmm0    // G | 
|  | pslld     xmm0, 8       // R | 
|  | psrld     xmm1, 3       // B | 
|  | psrld     xmm2, 5       // G | 
|  | psrad     xmm0, 16      // R | 
|  | pand      xmm1, xmm3    // B | 
|  | pand      xmm2, xmm4    // G | 
|  | pand      xmm0, xmm5    // R | 
|  | por       xmm1, xmm2    // BG | 
|  | por       xmm0, xmm1    // BGR | 
|  | packssdw  xmm0, xmm0 | 
|  | lea       eax, [eax + 16] | 
|  | movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565 | 
|  | lea       edx, [edx + 8] | 
|  | sub       ecx, 4 | 
|  | jg        convertloop | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | // 8 pixels | 
|  | __declspec(naked) | 
|  | void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb, | 
|  | const uint32 dither4, int pix) { | 
|  | __asm { | 
|  |  | 
|  | mov       eax, [esp + 4]   // src_argb | 
|  | mov       edx, [esp + 8]   // dst_rgb | 
|  | movd      xmm6, [esp + 12] // dither4 | 
|  | mov       ecx, [esp + 16]  // pix | 
|  | punpcklbw xmm6, xmm6       // make dither 16 bytes | 
|  | movdqa    xmm7, xmm6 | 
|  | punpcklwd xmm6, xmm6 | 
|  | punpckhwd xmm7, xmm7 | 
|  | pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f | 
|  | psrld     xmm3, 27 | 
|  | pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0 | 
|  | psrld     xmm4, 26 | 
|  | pslld     xmm4, 5 | 
|  | pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800 | 
|  | pslld     xmm5, 11 | 
|  |  | 
|  | convertloop: | 
|  | movdqu    xmm0, [eax]   // fetch 4 pixels of argb | 
|  | paddusb   xmm0, xmm6    // add dither | 
|  | movdqa    xmm1, xmm0    // B | 
|  | movdqa    xmm2, xmm0    // G | 
|  | pslld     xmm0, 8       // R | 
|  | psrld     xmm1, 3       // B | 
|  | psrld     xmm2, 5       // G | 
|  | psrad     xmm0, 16      // R | 
|  | pand      xmm1, xmm3    // B | 
|  | pand      xmm2, xmm4    // G | 
|  | pand      xmm0, xmm5    // R | 
|  | por       xmm1, xmm2    // BG | 
|  | por       xmm0, xmm1    // BGR | 
|  | packssdw  xmm0, xmm0 | 
|  | lea       eax, [eax + 16] | 
|  | movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565 | 
|  | lea       edx, [edx + 8] | 
|  | sub       ecx, 4 | 
|  | jg        convertloop | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | #ifdef HAS_ARGBTORGB565DITHERROW_AVX2 | 
|  | __declspec(naked) | 
|  | void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb, | 
|  | const uint32 dither4, int pix) { | 
|  | __asm { | 
|  | mov        eax, [esp + 4]      // src_argb | 
|  | mov        edx, [esp + 8]      // dst_rgb | 
|  | vbroadcastss xmm6, [esp + 12]  // dither4 | 
|  | mov        ecx, [esp + 16]     // pix | 
|  | vpunpcklbw xmm6, xmm6, xmm6    // make dither 32 bytes | 
|  | vpermq     ymm6, ymm6, 0xd8 | 
|  | vpunpcklwd ymm6, ymm6, ymm6 | 
|  | vpcmpeqb   ymm3, ymm3, ymm3    // generate mask 0x0000001f | 
|  | vpsrld     ymm3, ymm3, 27 | 
|  | vpcmpeqb   ymm4, ymm4, ymm4    // generate mask 0x000007e0 | 
|  | vpsrld     ymm4, ymm4, 26 | 
|  | vpslld     ymm4, ymm4, 5 | 
|  | vpslld     ymm5, ymm3, 11      // generate mask 0x0000f800 | 
|  |  | 
|  | convertloop: | 
|  | vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb | 
|  | vpaddusb   ymm0, ymm0, ymm6    // add dither | 
|  | vpsrld     ymm2, ymm0, 5       // G | 
|  | vpsrld     ymm1, ymm0, 3       // B | 
|  | vpsrld     ymm0, ymm0, 8       // R | 
|  | vpand      ymm2, ymm2, ymm4    // G | 
|  | vpand      ymm1, ymm1, ymm3    // B | 
|  | vpand      ymm0, ymm0, ymm5    // R | 
|  | vpor       ymm1, ymm1, ymm2    // BG | 
|  | vpor       ymm0, ymm0, ymm1    // BGR | 
|  | vpackusdw  ymm0, ymm0, ymm0 | 
|  | vpermq     ymm0, ymm0, 0xd8 | 
|  | lea        eax, [eax + 32] | 
|  | vmovdqu    [edx], xmm0         // store 8 pixels of RGB565 | 
|  | lea        edx, [edx + 16] | 
|  | sub        ecx, 8 | 
|  | jg         convertloop | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_ARGBTORGB565DITHERROW_AVX2 | 
|  |  | 
|  | // TODO(fbarchard): Improve sign extension/packing. | 
|  | __declspec(naked) | 
|  | void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { | 
|  | __asm { | 
|  | mov       eax, [esp + 4]   // src_argb | 
|  | mov       edx, [esp + 8]   // dst_rgb | 
|  | mov       ecx, [esp + 12]  // pix | 
|  | pcmpeqb   xmm4, xmm4       // generate mask 0x0000001f | 
|  | psrld     xmm4, 27 | 
|  | movdqa    xmm5, xmm4       // generate mask 0x000003e0 | 
|  | pslld     xmm5, 5 | 
|  | movdqa    xmm6, xmm4       // generate mask 0x00007c00 | 
|  | pslld     xmm6, 10 | 
|  | pcmpeqb   xmm7, xmm7       // generate mask 0xffff8000 | 
|  | pslld     xmm7, 15 | 
|  |  | 
|  | convertloop: | 
|  | movdqu    xmm0, [eax]   // fetch 4 pixels of argb | 
|  | movdqa    xmm1, xmm0    // B | 
|  | movdqa    xmm2, xmm0    // G | 
|  | movdqa    xmm3, xmm0    // R | 
|  | psrad     xmm0, 16      // A | 
|  | psrld     xmm1, 3       // B | 
|  | psrld     xmm2, 6       // G | 
|  | psrld     xmm3, 9       // R | 
|  | pand      xmm0, xmm7    // A | 
|  | pand      xmm1, xmm4    // B | 
|  | pand      xmm2, xmm5    // G | 
|  | pand      xmm3, xmm6    // R | 
|  | por       xmm0, xmm1    // BA | 
|  | por       xmm2, xmm3    // GR | 
|  | por       xmm0, xmm2    // BGRA | 
|  | packssdw  xmm0, xmm0 | 
|  | lea       eax, [eax + 16] | 
|  | movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB1555 | 
|  | lea       edx, [edx + 8] | 
|  | sub       ecx, 4 | 
|  | jg        convertloop | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | __declspec(naked) | 
|  | void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { | 
|  | __asm { | 
|  | mov       eax, [esp + 4]   // src_argb | 
|  | mov       edx, [esp + 8]   // dst_rgb | 
|  | mov       ecx, [esp + 12]  // pix | 
|  | pcmpeqb   xmm4, xmm4       // generate mask 0xf000f000 | 
|  | psllw     xmm4, 12 | 
|  | movdqa    xmm3, xmm4       // generate mask 0x00f000f0 | 
|  | psrlw     xmm3, 8 | 
|  |  | 
|  | convertloop: | 
|  | movdqu    xmm0, [eax]   // fetch 4 pixels of argb | 
|  | movdqa    xmm1, xmm0 | 
|  | pand      xmm0, xmm3    // low nibble | 
|  | pand      xmm1, xmm4    // high nibble | 
|  | psrld     xmm0, 4 | 
|  | psrld     xmm1, 8 | 
|  | por       xmm0, xmm1 | 
|  | packuswb  xmm0, xmm0 | 
|  | lea       eax, [eax + 16] | 
|  | movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB4444 | 
|  | lea       edx, [edx + 8] | 
|  | sub       ecx, 4 | 
|  | jg        convertloop | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | #ifdef HAS_ARGBTORGB565ROW_AVX2 | 
|  | __declspec(naked) | 
|  | void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { | 
|  | __asm { | 
|  | mov        eax, [esp + 4]      // src_argb | 
|  | mov        edx, [esp + 8]      // dst_rgb | 
|  | mov        ecx, [esp + 12]     // pix | 
|  | vpcmpeqb   ymm3, ymm3, ymm3    // generate mask 0x0000001f | 
|  | vpsrld     ymm3, ymm3, 27 | 
|  | vpcmpeqb   ymm4, ymm4, ymm4    // generate mask 0x000007e0 | 
|  | vpsrld     ymm4, ymm4, 26 | 
|  | vpslld     ymm4, ymm4, 5 | 
|  | vpslld     ymm5, ymm3, 11      // generate mask 0x0000f800 | 
|  |  | 
|  | convertloop: | 
|  | vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb | 
|  | vpsrld     ymm2, ymm0, 5       // G | 
|  | vpsrld     ymm1, ymm0, 3       // B | 
|  | vpsrld     ymm0, ymm0, 8       // R | 
|  | vpand      ymm2, ymm2, ymm4    // G | 
|  | vpand      ymm1, ymm1, ymm3    // B | 
|  | vpand      ymm0, ymm0, ymm5    // R | 
|  | vpor       ymm1, ymm1, ymm2    // BG | 
|  | vpor       ymm0, ymm0, ymm1    // BGR | 
|  | vpackusdw  ymm0, ymm0, ymm0 | 
|  | vpermq     ymm0, ymm0, 0xd8 | 
|  | lea        eax, [eax + 32] | 
|  | vmovdqu    [edx], xmm0         // store 8 pixels of RGB565 | 
|  | lea        edx, [edx + 16] | 
|  | sub        ecx, 8 | 
|  | jg         convertloop | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_ARGBTORGB565ROW_AVX2 | 
|  |  | 
|  | #ifdef HAS_ARGBTOARGB1555ROW_AVX2 | 
|  | __declspec(naked) | 
|  | void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { | 
|  | __asm { | 
|  | mov        eax, [esp + 4]      // src_argb | 
|  | mov        edx, [esp + 8]      // dst_rgb | 
|  | mov        ecx, [esp + 12]     // pix | 
|  | vpcmpeqb   ymm4, ymm4, ymm4 | 
|  | vpsrld     ymm4, ymm4, 27      // generate mask 0x0000001f | 
|  | vpslld     ymm5, ymm4, 5       // generate mask 0x000003e0 | 
|  | vpslld     ymm6, ymm4, 10      // generate mask 0x00007c00 | 
|  | vpcmpeqb   ymm7, ymm7, ymm7    // generate mask 0xffff8000 | 
|  | vpslld     ymm7, ymm7, 15 | 
|  |  | 
|  | convertloop: | 
|  | vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb | 
|  | vpsrld     ymm3, ymm0, 9       // R | 
|  | vpsrld     ymm2, ymm0, 6       // G | 
|  | vpsrld     ymm1, ymm0, 3       // B | 
|  | vpsrad     ymm0, ymm0, 16      // A | 
|  | vpand      ymm3, ymm3, ymm6    // R | 
|  | vpand      ymm2, ymm2, ymm5    // G | 
|  | vpand      ymm1, ymm1, ymm4    // B | 
|  | vpand      ymm0, ymm0, ymm7    // A | 
|  | vpor       ymm0, ymm0, ymm1    // BA | 
|  | vpor       ymm2, ymm2, ymm3    // GR | 
|  | vpor       ymm0, ymm0, ymm2    // BGRA | 
|  | vpackssdw  ymm0, ymm0, ymm0 | 
|  | vpermq     ymm0, ymm0, 0xd8 | 
|  | lea        eax, [eax + 32] | 
|  | vmovdqu    [edx], xmm0         // store 8 pixels of ARGB1555 | 
|  | lea        edx, [edx + 16] | 
|  | sub        ecx, 8 | 
|  | jg         convertloop | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_ARGBTOARGB1555ROW_AVX2 | 
|  |  | 
|  | #ifdef HAS_ARGBTOARGB4444ROW_AVX2 | 
|  | __declspec(naked) | 
|  | void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { | 
|  | __asm { | 
|  | mov        eax, [esp + 4]   // src_argb | 
|  | mov        edx, [esp + 8]   // dst_rgb | 
|  | mov        ecx, [esp + 12]  // pix | 
|  | vpcmpeqb   ymm4, ymm4, ymm4   // generate mask 0xf000f000 | 
|  | vpsllw     ymm4, ymm4, 12 | 
|  | vpsrlw     ymm3, ymm4, 8      // generate mask 0x00f000f0 | 
|  |  | 
|  | convertloop: | 
|  | vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb | 
|  | vpand      ymm1, ymm0, ymm4    // high nibble | 
|  | vpand      ymm0, ymm0, ymm3    // low nibble | 
|  | vpsrld     ymm1, ymm1, 8 | 
|  | vpsrld     ymm0, ymm0, 4 | 
|  | vpor       ymm0, ymm0, ymm1 | 
|  | vpackuswb  ymm0, ymm0, ymm0 | 
|  | vpermq     ymm0, ymm0, 0xd8 | 
|  | lea        eax, [eax + 32] | 
|  | vmovdqu    [edx], xmm0         // store 8 pixels of ARGB4444 | 
|  | lea        edx, [edx + 16] | 
|  | sub        ecx, 8 | 
|  | jg         convertloop | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_ARGBTOARGB4444ROW_AVX2 | 
|  |  | 
|  | // Convert 16 ARGB pixels (64 bytes) to 16 Y values. | 
|  | __declspec(naked) | 
|  | void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { | 
|  | __asm { | 
|  | mov        eax, [esp + 4]   /* src_argb */ | 
|  | mov        edx, [esp + 8]   /* dst_y */ | 
|  | mov        ecx, [esp + 12]  /* pix */ | 
|  | movdqa     xmm4, kARGBToY | 
|  | movdqa     xmm5, kAddY16 | 
|  |  | 
|  | convertloop: | 
|  | movdqu     xmm0, [eax] | 
|  | movdqu     xmm1, [eax + 16] | 
|  | movdqu     xmm2, [eax + 32] | 
|  | movdqu     xmm3, [eax + 48] | 
|  | pmaddubsw  xmm0, xmm4 | 
|  | pmaddubsw  xmm1, xmm4 | 
|  | pmaddubsw  xmm2, xmm4 | 
|  | pmaddubsw  xmm3, xmm4 | 
|  | lea        eax, [eax + 64] | 
|  | phaddw     xmm0, xmm1 | 
|  | phaddw     xmm2, xmm3 | 
|  | psrlw      xmm0, 7 | 
|  | psrlw      xmm2, 7 | 
|  | packuswb   xmm0, xmm2 | 
|  | paddb      xmm0, xmm5 | 
|  | movdqu     [edx], xmm0 | 
|  | lea        edx, [edx + 16] | 
|  | sub        ecx, 16 | 
|  | jg         convertloop | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. | 
|  | // Same as ARGBToYRow but different coefficients, no add 16, but do rounding. | 
|  | __declspec(naked) | 
|  | void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { | 
|  | __asm { | 
|  | mov        eax, [esp + 4]   /* src_argb */ | 
|  | mov        edx, [esp + 8]   /* dst_y */ | 
|  | mov        ecx, [esp + 12]  /* pix */ | 
|  | movdqa     xmm4, kARGBToYJ | 
|  | movdqa     xmm5, kAddYJ64 | 
|  |  | 
|  | convertloop: | 
|  | movdqu     xmm0, [eax] | 
|  | movdqu     xmm1, [eax + 16] | 
|  | movdqu     xmm2, [eax + 32] | 
|  | movdqu     xmm3, [eax + 48] | 
|  | pmaddubsw  xmm0, xmm4 | 
|  | pmaddubsw  xmm1, xmm4 | 
|  | pmaddubsw  xmm2, xmm4 | 
|  | pmaddubsw  xmm3, xmm4 | 
|  | lea        eax, [eax + 64] | 
|  | phaddw     xmm0, xmm1 | 
|  | phaddw     xmm2, xmm3 | 
|  | paddw      xmm0, xmm5  // Add .5 for rounding. | 
|  | paddw      xmm2, xmm5 | 
|  | psrlw      xmm0, 7 | 
|  | psrlw      xmm2, 7 | 
|  | packuswb   xmm0, xmm2 | 
|  | movdqu     [edx], xmm0 | 
|  | lea        edx, [edx + 16] | 
|  | sub        ecx, 16 | 
|  | jg         convertloop | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | #ifdef HAS_ARGBTOYROW_AVX2 | 
|  | // vpermd for vphaddw + vpackuswb vpermd. | 
|  | static const lvec32 kPermdARGBToY_AVX = { | 
|  | 0, 4, 1, 5, 2, 6, 3, 7 | 
|  | }; | 
|  |  | 
|  | // Convert 32 ARGB pixels (128 bytes) to 32 Y values. | 
|  | __declspec(naked) | 
|  | void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { | 
|  | __asm { | 
|  | mov        eax, [esp + 4]   /* src_argb */ | 
|  | mov        edx, [esp + 8]   /* dst_y */ | 
|  | mov        ecx, [esp + 12]  /* pix */ | 
|  | vbroadcastf128 ymm4, kARGBToY | 
|  | vbroadcastf128 ymm5, kAddY16 | 
|  | vmovdqu    ymm6, kPermdARGBToY_AVX | 
|  |  | 
|  | convertloop: | 
|  | vmovdqu    ymm0, [eax] | 
|  | vmovdqu    ymm1, [eax + 32] | 
|  | vmovdqu    ymm2, [eax + 64] | 
|  | vmovdqu    ymm3, [eax + 96] | 
|  | vpmaddubsw ymm0, ymm0, ymm4 | 
|  | vpmaddubsw ymm1, ymm1, ymm4 | 
|  | vpmaddubsw ymm2, ymm2, ymm4 | 
|  | vpmaddubsw ymm3, ymm3, ymm4 | 
|  | lea        eax, [eax + 128] | 
|  | vphaddw    ymm0, ymm0, ymm1  // mutates. | 
|  | vphaddw    ymm2, ymm2, ymm3 | 
|  | vpsrlw     ymm0, ymm0, 7 | 
|  | vpsrlw     ymm2, ymm2, 7 | 
|  | vpackuswb  ymm0, ymm0, ymm2  // mutates. | 
|  | vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation. | 
|  | vpaddb     ymm0, ymm0, ymm5  // add 16 for Y | 
|  | vmovdqu    [edx], ymm0 | 
|  | lea        edx, [edx + 32] | 
|  | sub        ecx, 32 | 
|  | jg         convertloop | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  //  HAS_ARGBTOYROW_AVX2 | 
|  |  | 
|  | #ifdef HAS_ARGBTOYJROW_AVX2 | 
|  | // Convert 32 ARGB pixels (128 bytes) to 32 Y values. | 
|  | __declspec(naked) | 
|  | void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { | 
|  | __asm { | 
|  | mov        eax, [esp + 4]   /* src_argb */ | 
|  | mov        edx, [esp + 8]   /* dst_y */ | 
|  | mov        ecx, [esp + 12]  /* pix */ | 
|  | vbroadcastf128 ymm4, kARGBToYJ | 
|  | vbroadcastf128 ymm5, kAddYJ64 | 
|  | vmovdqu    ymm6, kPermdARGBToY_AVX | 
|  |  | 
|  | convertloop: | 
|  | vmovdqu    ymm0, [eax] | 
|  | vmovdqu    ymm1, [eax + 32] | 
|  | vmovdqu    ymm2, [eax + 64] | 
|  | vmovdqu    ymm3, [eax + 96] | 
|  | vpmaddubsw ymm0, ymm0, ymm4 | 
|  | vpmaddubsw ymm1, ymm1, ymm4 | 
|  | vpmaddubsw ymm2, ymm2, ymm4 | 
|  | vpmaddubsw ymm3, ymm3, ymm4 | 
|  | lea        eax, [eax + 128] | 
|  | vphaddw    ymm0, ymm0, ymm1  // mutates. | 
|  | vphaddw    ymm2, ymm2, ymm3 | 
|  | vpaddw     ymm0, ymm0, ymm5  // Add .5 for rounding. | 
|  | vpaddw     ymm2, ymm2, ymm5 | 
|  | vpsrlw     ymm0, ymm0, 7 | 
|  | vpsrlw     ymm2, ymm2, 7 | 
|  | vpackuswb  ymm0, ymm0, ymm2  // mutates. | 
|  | vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation. | 
|  | vmovdqu    [edx], ymm0 | 
|  | lea        edx, [edx + 32] | 
|  | sub        ecx, 32 | 
|  | jg         convertloop | 
|  |  | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  //  HAS_ARGBTOYJROW_AVX2 | 
|  |  | 
|  | __declspec(naked) | 
|  | void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { | 
|  | __asm { | 
|  | mov        eax, [esp + 4]   /* src_argb */ | 
|  | mov        edx, [esp + 8]   /* dst_y */ | 
|  | mov        ecx, [esp + 12]  /* pix */ | 
|  | movdqa     xmm4, kBGRAToY | 
|  | movdqa     xmm5, kAddY16 | 
|  |  | 
|  | convertloop: | 
|  | movdqu     xmm0, [eax] | 
|  | movdqu     xmm1, [eax + 16] | 
|  | movdqu     xmm2, [eax + 32] | 
|  | movdqu     xmm3, [eax + 48] | 
|  | pmaddubsw  xmm0, xmm4 | 
|  | pmaddubsw  xmm1, xmm4 | 
|  | pmaddubsw  xmm2, xmm4 | 
|  | pmaddubsw  xmm3, xmm4 | 
|  | lea        eax, [eax + 64] | 
|  | phaddw     xmm0, xmm1 | 
|  | phaddw     xmm2, xmm3 | 
|  | psrlw      xmm0, 7 | 
|  | psrlw      xmm2, 7 | 
|  | packuswb   xmm0, xmm2 | 
|  | paddb      xmm0, xmm5 | 
|  | movdqu     [edx], xmm0 | 
|  | lea        edx, [edx + 16] | 
|  | sub        ecx, 16 | 
|  | jg         convertloop | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | __declspec(naked) | 
|  | void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { | 
|  | __asm { | 
|  | mov        eax, [esp + 4]   /* src_argb */ | 
|  | mov        edx, [esp + 8]   /* dst_y */ | 
|  | mov        ecx, [esp + 12]  /* pix */ | 
|  | movdqa     xmm4, kABGRToY | 
|  | movdqa     xmm5, kAddY16 | 
|  |  | 
|  | convertloop: | 
|  | movdqu     xmm0, [eax] | 
|  | movdqu     xmm1, [eax + 16] | 
|  | movdqu     xmm2, [eax + 32] | 
|  | movdqu     xmm3, [eax + 48] | 
|  | pmaddubsw  xmm0, xmm4 | 
|  | pmaddubsw  xmm1, xmm4 | 
|  | pmaddubsw  xmm2, xmm4 | 
|  | pmaddubsw  xmm3, xmm4 | 
|  | lea        eax, [eax + 64] | 
|  | phaddw     xmm0, xmm1 | 
|  | phaddw     xmm2, xmm3 | 
|  | psrlw      xmm0, 7 | 
|  | psrlw      xmm2, 7 | 
|  | packuswb   xmm0, xmm2 | 
|  | paddb      xmm0, xmm5 | 
|  | movdqu     [edx], xmm0 | 
|  | lea        edx, [edx + 16] | 
|  | sub        ecx, 16 | 
|  | jg         convertloop | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | __declspec(naked) | 
|  | void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { | 
|  | __asm { | 
|  | mov        eax, [esp + 4]   /* src_argb */ | 
|  | mov        edx, [esp + 8]   /* dst_y */ | 
|  | mov        ecx, [esp + 12]  /* pix */ | 
|  | movdqa     xmm4, kRGBAToY | 
|  | movdqa     xmm5, kAddY16 | 
|  |  | 
|  | convertloop: | 
|  | movdqu     xmm0, [eax] | 
|  | movdqu     xmm1, [eax + 16] | 
|  | movdqu     xmm2, [eax + 32] | 
|  | movdqu     xmm3, [eax + 48] | 
|  | pmaddubsw  xmm0, xmm4 | 
|  | pmaddubsw  xmm1, xmm4 | 
|  | pmaddubsw  xmm2, xmm4 | 
|  | pmaddubsw  xmm3, xmm4 | 
|  | lea        eax, [eax + 64] | 
|  | phaddw     xmm0, xmm1 | 
|  | phaddw     xmm2, xmm3 | 
|  | psrlw      xmm0, 7 | 
|  | psrlw      xmm2, 7 | 
|  | packuswb   xmm0, xmm2 | 
|  | paddb      xmm0, xmm5 | 
|  | movdqu     [edx], xmm0 | 
|  | lea        edx, [edx + 16] | 
|  | sub        ecx, 16 | 
|  | jg         convertloop | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | __declspec(naked) | 
|  | void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, | 
|  | uint8* dst_u, uint8* dst_v, int width) { | 
|  | __asm { | 
|  | push       esi | 
|  | push       edi | 
|  | mov        eax, [esp + 8 + 4]   // src_argb | 
|  | mov        esi, [esp + 8 + 8]   // src_stride_argb | 
|  | mov        edx, [esp + 8 + 12]  // dst_u | 
|  | mov        edi, [esp + 8 + 16]  // dst_v | 
|  | mov        ecx, [esp + 8 + 20]  // pix | 
|  | movdqa     xmm5, kAddUV128 | 
|  | movdqa     xmm6, kARGBToV | 
|  | movdqa     xmm7, kARGBToU | 
|  | sub        edi, edx             // stride from u to v | 
|  |  | 
|  | convertloop: | 
|  | /* step 1 - subsample 16x2 argb pixels to 8x1 */ | 
|  | movdqu     xmm0, [eax] | 
|  | movdqu     xmm4, [eax + esi] | 
|  | pavgb      xmm0, xmm4 | 
|  | movdqu     xmm1, [eax + 16] | 
|  | movdqu     xmm4, [eax + esi + 16] | 
|  | pavgb      xmm1, xmm4 | 
|  | movdqu     xmm2, [eax + 32] | 
|  | movdqu     xmm4, [eax + esi + 32] | 
|  | pavgb      xmm2, xmm4 | 
|  | movdqu     xmm3, [eax + 48] | 
|  | movdqu     xmm4, [eax + esi + 48] | 
|  | pavgb      xmm3, xmm4 | 
|  |  | 
|  | lea        eax,  [eax + 64] | 
|  | movdqa     xmm4, xmm0 | 
|  | shufps     xmm0, xmm1, 0x88 | 
|  | shufps     xmm4, xmm1, 0xdd | 
|  | pavgb      xmm0, xmm4 | 
|  | movdqa     xmm4, xmm2 | 
|  | shufps     xmm2, xmm3, 0x88 | 
|  | shufps     xmm4, xmm3, 0xdd | 
|  | pavgb      xmm2, xmm4 | 
|  |  | 
|  | // step 2 - convert to U and V | 
|  | // from here down is very similar to Y code except | 
|  | // instead of 16 different pixels, its 8 pixels of U and 8 of V | 
|  | movdqa     xmm1, xmm0 | 
|  | movdqa     xmm3, xmm2 | 
|  | pmaddubsw  xmm0, xmm7  // U | 
|  | pmaddubsw  xmm2, xmm7 | 
|  | pmaddubsw  xmm1, xmm6  // V | 
|  | pmaddubsw  xmm3, xmm6 | 
|  | phaddw     xmm0, xmm2 | 
|  | phaddw     xmm1, xmm3 | 
|  | psraw      xmm0, 8 | 
|  | psraw      xmm1, 8 | 
|  | packsswb   xmm0, xmm1 | 
|  | paddb      xmm0, xmm5            // -> unsigned | 
|  |  | 
|  | // step 3 - store 8 U and 8 V values | 
|  | movlps     qword ptr [edx], xmm0 // U | 
|  | movhps     qword ptr [edx + edi], xmm0 // V | 
|  | lea        edx, [edx + 8] | 
|  | sub        ecx, 16 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        edi | 
|  | pop        esi | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | __declspec(naked) | 
|  | void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, | 
|  | uint8* dst_u, uint8* dst_v, int width) { | 
|  | __asm { | 
|  | push       esi | 
|  | push       edi | 
|  | mov        eax, [esp + 8 + 4]   // src_argb | 
|  | mov        esi, [esp + 8 + 8]   // src_stride_argb | 
|  | mov        edx, [esp + 8 + 12]  // dst_u | 
|  | mov        edi, [esp + 8 + 16]  // dst_v | 
|  | mov        ecx, [esp + 8 + 20]  // pix | 
|  | movdqa     xmm5, kAddUVJ128 | 
|  | movdqa     xmm6, kARGBToVJ | 
|  | movdqa     xmm7, kARGBToUJ | 
|  | sub        edi, edx             // stride from u to v | 
|  |  | 
|  | convertloop: | 
|  | /* step 1 - subsample 16x2 argb pixels to 8x1 */ | 
|  | movdqu     xmm0, [eax] | 
|  | movdqu     xmm4, [eax + esi] | 
|  | pavgb      xmm0, xmm4 | 
|  | movdqu     xmm1, [eax + 16] | 
|  | movdqu     xmm4, [eax + esi + 16] | 
|  | pavgb      xmm1, xmm4 | 
|  | movdqu     xmm2, [eax + 32] | 
|  | movdqu     xmm4, [eax + esi + 32] | 
|  | pavgb      xmm2, xmm4 | 
|  | movdqu     xmm3, [eax + 48] | 
|  | movdqu     xmm4, [eax + esi + 48] | 
|  | pavgb      xmm3, xmm4 | 
|  |  | 
|  | lea        eax,  [eax + 64] | 
|  | movdqa     xmm4, xmm0 | 
|  | shufps     xmm0, xmm1, 0x88 | 
|  | shufps     xmm4, xmm1, 0xdd | 
|  | pavgb      xmm0, xmm4 | 
|  | movdqa     xmm4, xmm2 | 
|  | shufps     xmm2, xmm3, 0x88 | 
|  | shufps     xmm4, xmm3, 0xdd | 
|  | pavgb      xmm2, xmm4 | 
|  |  | 
|  | // step 2 - convert to U and V | 
|  | // from here down is very similar to Y code except | 
|  | // instead of 16 different pixels, its 8 pixels of U and 8 of V | 
|  | movdqa     xmm1, xmm0 | 
|  | movdqa     xmm3, xmm2 | 
|  | pmaddubsw  xmm0, xmm7  // U | 
|  | pmaddubsw  xmm2, xmm7 | 
|  | pmaddubsw  xmm1, xmm6  // V | 
|  | pmaddubsw  xmm3, xmm6 | 
|  | phaddw     xmm0, xmm2 | 
|  | phaddw     xmm1, xmm3 | 
|  | paddw      xmm0, xmm5            // +.5 rounding -> unsigned | 
|  | paddw      xmm1, xmm5 | 
|  | psraw      xmm0, 8 | 
|  | psraw      xmm1, 8 | 
|  | packsswb   xmm0, xmm1 | 
|  |  | 
|  | // step 3 - store 8 U and 8 V values | 
|  | movlps     qword ptr [edx], xmm0 // U | 
|  | movhps     qword ptr [edx + edi], xmm0 // V | 
|  | lea        edx, [edx + 8] | 
|  | sub        ecx, 16 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        edi | 
|  | pop        esi | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | #ifdef HAS_ARGBTOUVROW_AVX2 | 
|  | __declspec(naked) | 
|  | void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, | 
|  | uint8* dst_u, uint8* dst_v, int width) { | 
|  | __asm { | 
|  | push       esi | 
|  | push       edi | 
|  | mov        eax, [esp + 8 + 4]   // src_argb | 
|  | mov        esi, [esp + 8 + 8]   // src_stride_argb | 
|  | mov        edx, [esp + 8 + 12]  // dst_u | 
|  | mov        edi, [esp + 8 + 16]  // dst_v | 
|  | mov        ecx, [esp + 8 + 20]  // pix | 
|  | vbroadcastf128 ymm5, kAddUV128 | 
|  | vbroadcastf128 ymm6, kARGBToV | 
|  | vbroadcastf128 ymm7, kARGBToU | 
|  | sub        edi, edx             // stride from u to v | 
|  |  | 
|  | convertloop: | 
|  | /* step 1 - subsample 32x2 argb pixels to 16x1 */ | 
|  | vmovdqu    ymm0, [eax] | 
|  | vmovdqu    ymm1, [eax + 32] | 
|  | vmovdqu    ymm2, [eax + 64] | 
|  | vmovdqu    ymm3, [eax + 96] | 
|  | vpavgb     ymm0, ymm0, [eax + esi] | 
|  | vpavgb     ymm1, ymm1, [eax + esi + 32] | 
|  | vpavgb     ymm2, ymm2, [eax + esi + 64] | 
|  | vpavgb     ymm3, ymm3, [eax + esi + 96] | 
|  | lea        eax,  [eax + 128] | 
|  | vshufps    ymm4, ymm0, ymm1, 0x88 | 
|  | vshufps    ymm0, ymm0, ymm1, 0xdd | 
|  | vpavgb     ymm0, ymm0, ymm4  // mutated by vshufps | 
|  | vshufps    ymm4, ymm2, ymm3, 0x88 | 
|  | vshufps    ymm2, ymm2, ymm3, 0xdd | 
|  | vpavgb     ymm2, ymm2, ymm4  // mutated by vshufps | 
|  |  | 
|  | // step 2 - convert to U and V | 
|  | // from here down is very similar to Y code except | 
|  | // instead of 32 different pixels, it's 16 pixels of U and 16 of V | 
|  | vpmaddubsw ymm1, ymm0, ymm7  // U | 
|  | vpmaddubsw ymm3, ymm2, ymm7 | 
|  | vpmaddubsw ymm0, ymm0, ymm6  // V | 
|  | vpmaddubsw ymm2, ymm2, ymm6 | 
|  | vphaddw    ymm1, ymm1, ymm3  // mutates | 
|  | vphaddw    ymm0, ymm0, ymm2 | 
|  | vpsraw     ymm1, ymm1, 8 | 
|  | vpsraw     ymm0, ymm0, 8 | 
|  | vpacksswb  ymm0, ymm1, ymm0  // mutates | 
|  | vpermq     ymm0, ymm0, 0xd8  // For vpacksswb | 
|  | vpshufb    ymm0, ymm0, kShufARGBToUV_AVX  // For vshufps + vphaddw | 
|  | vpaddb     ymm0, ymm0, ymm5  // -> unsigned | 
|  |  | 
|  | // step 3 - store 16 U and 16 V values | 
|  | vextractf128 [edx], ymm0, 0 // U | 
|  | vextractf128 [edx + edi], ymm0, 1 // V | 
|  | lea        edx, [edx + 16] | 
|  | sub        ecx, 32 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        edi | 
|  | pop        esi | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_ARGBTOUVROW_AVX2 | 
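|  |  | 
|  | // A rough scalar sketch of what the ARGBToUV rows above compute per 2x2 | 
|  | // block (the BGRA/ABGR/RGBA variants below differ only in channel order, | 
|  | // and the J variant only in its coefficients and rounding placement). | 
|  | // The coefficients here are an assumption -- the usual BT.601 studio-range | 
|  | // roundings -- and the rounding differs slightly from the SIMD path, which | 
|  | // takes its exact values from kARGBToU/kARGBToV/kAddUV128 defined earlier | 
|  | // in this file.  Hypothetical helper name; not part of the build. | 
|  | static void ARGBToUVRow_SketchC(const uint8* src_argb0, int src_stride_argb, | 
|  |                                 uint8* dst_u, uint8* dst_v, int width) { | 
|  |   const uint8* src_argb1 = src_argb0 + src_stride_argb; | 
|  |   int x; | 
|  |   for (x = 0; x < width; x += 2) {  // width assumed to be a multiple of 2 | 
|  |     // ARGB is stored B,G,R,A in memory; average a 2x2 block of pixels. | 
|  |     int b = (src_argb0[0] + src_argb0[4] + src_argb1[0] + src_argb1[4] + 2) >> 2; | 
|  |     int g = (src_argb0[1] + src_argb0[5] + src_argb1[1] + src_argb1[5] + 2) >> 2; | 
|  |     int r = (src_argb0[2] + src_argb0[6] + src_argb1[2] + src_argb1[6] + 2) >> 2; | 
|  |     // 0x8080 = 128 bias in the high byte plus 0x80 for round-half. | 
|  |     *dst_u++ = (uint8)((112 * b - 74 * g - 38 * r + 0x8080) >> 8); | 
|  |     *dst_v++ = (uint8)((112 * r - 94 * g - 18 * b + 0x8080) >> 8); | 
|  |     src_argb0 += 8; | 
|  |     src_argb1 += 8; | 
|  |   } | 
|  | } | 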
|  |  | 
|  | __declspec(naked) | 
|  | void ARGBToUV444Row_SSSE3(const uint8* src_argb0, | 
|  | uint8* dst_u, uint8* dst_v, int width) { | 
|  | __asm { | 
|  | push       edi | 
|  | mov        eax, [esp + 4 + 4]   // src_argb | 
|  | mov        edx, [esp + 4 + 8]   // dst_u | 
|  | mov        edi, [esp + 4 + 12]  // dst_v | 
|  | mov        ecx, [esp + 4 + 16]  // width | 
|  | movdqa     xmm5, kAddUV128 | 
|  | movdqa     xmm6, kARGBToV | 
|  | movdqa     xmm7, kARGBToU | 
|  | sub        edi, edx             // stride from u to v | 
|  |  | 
|  | convertloop: | 
|  | /* convert to U and V */ | 
|  | movdqu     xmm0, [eax]          // U | 
|  | movdqu     xmm1, [eax + 16] | 
|  | movdqu     xmm2, [eax + 32] | 
|  | movdqu     xmm3, [eax + 48] | 
|  | pmaddubsw  xmm0, xmm7 | 
|  | pmaddubsw  xmm1, xmm7 | 
|  | pmaddubsw  xmm2, xmm7 | 
|  | pmaddubsw  xmm3, xmm7 | 
|  | phaddw     xmm0, xmm1 | 
|  | phaddw     xmm2, xmm3 | 
|  | psraw      xmm0, 8 | 
|  | psraw      xmm2, 8 | 
|  | packsswb   xmm0, xmm2 | 
|  | paddb      xmm0, xmm5 | 
|  | movdqu     [edx], xmm0 | 
|  |  | 
|  | movdqu     xmm0, [eax]          // V | 
|  | movdqu     xmm1, [eax + 16] | 
|  | movdqu     xmm2, [eax + 32] | 
|  | movdqu     xmm3, [eax + 48] | 
|  | pmaddubsw  xmm0, xmm6 | 
|  | pmaddubsw  xmm1, xmm6 | 
|  | pmaddubsw  xmm2, xmm6 | 
|  | pmaddubsw  xmm3, xmm6 | 
|  | phaddw     xmm0, xmm1 | 
|  | phaddw     xmm2, xmm3 | 
|  | psraw      xmm0, 8 | 
|  | psraw      xmm2, 8 | 
|  | packsswb   xmm0, xmm2 | 
|  | paddb      xmm0, xmm5 | 
|  | lea        eax,  [eax + 64] | 
|  | movdqu     [edx + edi], xmm0 | 
|  | lea        edx,  [edx + 16] | 
|  | sub        ecx,  16 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        edi | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | __declspec(naked) | 
|  | void ARGBToUV422Row_SSSE3(const uint8* src_argb0, | 
|  | uint8* dst_u, uint8* dst_v, int width) { | 
|  | __asm { | 
|  | push       edi | 
|  | mov        eax, [esp + 4 + 4]   // src_argb | 
|  | mov        edx, [esp + 4 + 8]   // dst_u | 
|  | mov        edi, [esp + 4 + 12]  // dst_v | 
|  | mov        ecx, [esp + 4 + 16]  // width | 
|  | movdqa     xmm5, kAddUV128 | 
|  | movdqa     xmm6, kARGBToV | 
|  | movdqa     xmm7, kARGBToU | 
|  | sub        edi, edx             // stride from u to v | 
|  |  | 
|  | convertloop: | 
|  | /* step 1 - subsample 16x1 argb pixels to 8x1 */ | 
|  | movdqu     xmm0, [eax] | 
|  | movdqu     xmm1, [eax + 16] | 
|  | movdqu     xmm2, [eax + 32] | 
|  | movdqu     xmm3, [eax + 48] | 
|  | lea        eax,  [eax + 64] | 
|  | movdqa     xmm4, xmm0 | 
|  | shufps     xmm0, xmm1, 0x88 | 
|  | shufps     xmm4, xmm1, 0xdd | 
|  | pavgb      xmm0, xmm4 | 
|  | movdqa     xmm4, xmm2 | 
|  | shufps     xmm2, xmm3, 0x88 | 
|  | shufps     xmm4, xmm3, 0xdd | 
|  | pavgb      xmm2, xmm4 | 
|  |  | 
|  | // step 2 - convert to U and V | 
|  | // from here down is very similar to Y code except | 
|  | // instead of 16 different pixels, it's 8 pixels of U and 8 of V | 
|  | movdqa     xmm1, xmm0 | 
|  | movdqa     xmm3, xmm2 | 
|  | pmaddubsw  xmm0, xmm7  // U | 
|  | pmaddubsw  xmm2, xmm7 | 
|  | pmaddubsw  xmm1, xmm6  // V | 
|  | pmaddubsw  xmm3, xmm6 | 
|  | phaddw     xmm0, xmm2 | 
|  | phaddw     xmm1, xmm3 | 
|  | psraw      xmm0, 8 | 
|  | psraw      xmm1, 8 | 
|  | packsswb   xmm0, xmm1 | 
|  | paddb      xmm0, xmm5            // -> unsigned | 
|  |  | 
|  | // step 3 - store 8 U and 8 V values | 
|  | movlps     qword ptr [edx], xmm0 // U | 
|  | movhps     qword ptr [edx + edi], xmm0 // V | 
|  | lea        edx, [edx + 8] | 
|  | sub        ecx, 16 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        edi | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | __declspec(naked) | 
|  | void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, | 
|  | uint8* dst_u, uint8* dst_v, int width) { | 
|  | __asm { | 
|  | push       esi | 
|  | push       edi | 
|  | mov        eax, [esp + 8 + 4]   // src_argb | 
|  | mov        esi, [esp + 8 + 8]   // src_stride_argb | 
|  | mov        edx, [esp + 8 + 12]  // dst_u | 
|  | mov        edi, [esp + 8 + 16]  // dst_v | 
|  | mov        ecx, [esp + 8 + 20]  // width | 
|  | movdqa     xmm5, kAddUV128 | 
|  | movdqa     xmm6, kBGRAToV | 
|  | movdqa     xmm7, kBGRAToU | 
|  | sub        edi, edx             // stride from u to v | 
|  |  | 
|  | convertloop: | 
|  | /* step 1 - subsample 16x2 argb pixels to 8x1 */ | 
|  | movdqu     xmm0, [eax] | 
|  | movdqu     xmm4, [eax + esi] | 
|  | pavgb      xmm0, xmm4 | 
|  | movdqu     xmm1, [eax + 16] | 
|  | movdqu     xmm4, [eax + esi + 16] | 
|  | pavgb      xmm1, xmm4 | 
|  | movdqu     xmm2, [eax + 32] | 
|  | movdqu     xmm4, [eax + esi + 32] | 
|  | pavgb      xmm2, xmm4 | 
|  | movdqu     xmm3, [eax + 48] | 
|  | movdqu     xmm4, [eax + esi + 48] | 
|  | pavgb      xmm3, xmm4 | 
|  |  | 
|  | lea        eax,  [eax + 64] | 
|  | movdqa     xmm4, xmm0 | 
|  | shufps     xmm0, xmm1, 0x88 | 
|  | shufps     xmm4, xmm1, 0xdd | 
|  | pavgb      xmm0, xmm4 | 
|  | movdqa     xmm4, xmm2 | 
|  | shufps     xmm2, xmm3, 0x88 | 
|  | shufps     xmm4, xmm3, 0xdd | 
|  | pavgb      xmm2, xmm4 | 
|  |  | 
|  | // step 2 - convert to U and V | 
|  | // from here down is very similar to Y code except | 
|  | // instead of 16 different pixels, it's 8 pixels of U and 8 of V | 
|  | movdqa     xmm1, xmm0 | 
|  | movdqa     xmm3, xmm2 | 
|  | pmaddubsw  xmm0, xmm7  // U | 
|  | pmaddubsw  xmm2, xmm7 | 
|  | pmaddubsw  xmm1, xmm6  // V | 
|  | pmaddubsw  xmm3, xmm6 | 
|  | phaddw     xmm0, xmm2 | 
|  | phaddw     xmm1, xmm3 | 
|  | psraw      xmm0, 8 | 
|  | psraw      xmm1, 8 | 
|  | packsswb   xmm0, xmm1 | 
|  | paddb      xmm0, xmm5            // -> unsigned | 
|  |  | 
|  | // step 3 - store 8 U and 8 V values | 
|  | movlps     qword ptr [edx], xmm0 // U | 
|  | movhps     qword ptr [edx + edi], xmm0 // V | 
|  | lea        edx, [edx + 8] | 
|  | sub        ecx, 16 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        edi | 
|  | pop        esi | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | __declspec(naked) | 
|  | void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, | 
|  | uint8* dst_u, uint8* dst_v, int width) { | 
|  | __asm { | 
|  | push       esi | 
|  | push       edi | 
|  | mov        eax, [esp + 8 + 4]   // src_argb | 
|  | mov        esi, [esp + 8 + 8]   // src_stride_argb | 
|  | mov        edx, [esp + 8 + 12]  // dst_u | 
|  | mov        edi, [esp + 8 + 16]  // dst_v | 
|  | mov        ecx, [esp + 8 + 20]  // width | 
|  | movdqa     xmm5, kAddUV128 | 
|  | movdqa     xmm6, kABGRToV | 
|  | movdqa     xmm7, kABGRToU | 
|  | sub        edi, edx             // stride from u to v | 
|  |  | 
|  | convertloop: | 
|  | /* step 1 - subsample 16x2 argb pixels to 8x1 */ | 
|  | movdqu     xmm0, [eax] | 
|  | movdqu     xmm4, [eax + esi] | 
|  | pavgb      xmm0, xmm4 | 
|  | movdqu     xmm1, [eax + 16] | 
|  | movdqu     xmm4, [eax + esi + 16] | 
|  | pavgb      xmm1, xmm4 | 
|  | movdqu     xmm2, [eax + 32] | 
|  | movdqu     xmm4, [eax + esi + 32] | 
|  | pavgb      xmm2, xmm4 | 
|  | movdqu     xmm3, [eax + 48] | 
|  | movdqu     xmm4, [eax + esi + 48] | 
|  | pavgb      xmm3, xmm4 | 
|  |  | 
|  | lea        eax,  [eax + 64] | 
|  | movdqa     xmm4, xmm0 | 
|  | shufps     xmm0, xmm1, 0x88 | 
|  | shufps     xmm4, xmm1, 0xdd | 
|  | pavgb      xmm0, xmm4 | 
|  | movdqa     xmm4, xmm2 | 
|  | shufps     xmm2, xmm3, 0x88 | 
|  | shufps     xmm4, xmm3, 0xdd | 
|  | pavgb      xmm2, xmm4 | 
|  |  | 
|  | // step 2 - convert to U and V | 
|  | // from here down is very similar to Y code except | 
|  | // instead of 16 different pixels, it's 8 pixels of U and 8 of V | 
|  | movdqa     xmm1, xmm0 | 
|  | movdqa     xmm3, xmm2 | 
|  | pmaddubsw  xmm0, xmm7  // U | 
|  | pmaddubsw  xmm2, xmm7 | 
|  | pmaddubsw  xmm1, xmm6  // V | 
|  | pmaddubsw  xmm3, xmm6 | 
|  | phaddw     xmm0, xmm2 | 
|  | phaddw     xmm1, xmm3 | 
|  | psraw      xmm0, 8 | 
|  | psraw      xmm1, 8 | 
|  | packsswb   xmm0, xmm1 | 
|  | paddb      xmm0, xmm5            // -> unsigned | 
|  |  | 
|  | // step 3 - store 8 U and 8 V values | 
|  | movlps     qword ptr [edx], xmm0 // U | 
|  | movhps     qword ptr [edx + edi], xmm0 // V | 
|  | lea        edx, [edx + 8] | 
|  | sub        ecx, 16 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        edi | 
|  | pop        esi | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | __declspec(naked) | 
|  | void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, | 
|  | uint8* dst_u, uint8* dst_v, int width) { | 
|  | __asm { | 
|  | push       esi | 
|  | push       edi | 
|  | mov        eax, [esp + 8 + 4]   // src_argb | 
|  | mov        esi, [esp + 8 + 8]   // src_stride_argb | 
|  | mov        edx, [esp + 8 + 12]  // dst_u | 
|  | mov        edi, [esp + 8 + 16]  // dst_v | 
|  | mov        ecx, [esp + 8 + 20]  // width | 
|  | movdqa     xmm5, kAddUV128 | 
|  | movdqa     xmm6, kRGBAToV | 
|  | movdqa     xmm7, kRGBAToU | 
|  | sub        edi, edx             // stride from u to v | 
|  |  | 
|  | convertloop: | 
|  | /* step 1 - subsample 16x2 argb pixels to 8x1 */ | 
|  | movdqu     xmm0, [eax] | 
|  | movdqu     xmm4, [eax + esi] | 
|  | pavgb      xmm0, xmm4 | 
|  | movdqu     xmm1, [eax + 16] | 
|  | movdqu     xmm4, [eax + esi + 16] | 
|  | pavgb      xmm1, xmm4 | 
|  | movdqu     xmm2, [eax + 32] | 
|  | movdqu     xmm4, [eax + esi + 32] | 
|  | pavgb      xmm2, xmm4 | 
|  | movdqu     xmm3, [eax + 48] | 
|  | movdqu     xmm4, [eax + esi + 48] | 
|  | pavgb      xmm3, xmm4 | 
|  |  | 
|  | lea        eax,  [eax + 64] | 
|  | movdqa     xmm4, xmm0 | 
|  | shufps     xmm0, xmm1, 0x88 | 
|  | shufps     xmm4, xmm1, 0xdd | 
|  | pavgb      xmm0, xmm4 | 
|  | movdqa     xmm4, xmm2 | 
|  | shufps     xmm2, xmm3, 0x88 | 
|  | shufps     xmm4, xmm3, 0xdd | 
|  | pavgb      xmm2, xmm4 | 
|  |  | 
|  | // step 2 - convert to U and V | 
|  | // from here down is very similar to Y code except | 
|  | // instead of 16 different pixels, it's 8 pixels of U and 8 of V | 
|  | movdqa     xmm1, xmm0 | 
|  | movdqa     xmm3, xmm2 | 
|  | pmaddubsw  xmm0, xmm7  // U | 
|  | pmaddubsw  xmm2, xmm7 | 
|  | pmaddubsw  xmm1, xmm6  // V | 
|  | pmaddubsw  xmm3, xmm6 | 
|  | phaddw     xmm0, xmm2 | 
|  | phaddw     xmm1, xmm3 | 
|  | psraw      xmm0, 8 | 
|  | psraw      xmm1, 8 | 
|  | packsswb   xmm0, xmm1 | 
|  | paddb      xmm0, xmm5            // -> unsigned | 
|  |  | 
|  | // step 3 - store 8 U and 8 V values | 
|  | movlps     qword ptr [edx], xmm0 // U | 
|  | movhps     qword ptr [edx + edi], xmm0 // V | 
|  | lea        edx, [edx + 8] | 
|  | sub        ecx, 16 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        edi | 
|  | pop        esi | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_ARGBTOYROW_SSSE3 | 
|  |  | 
|  | // Read 16 UV from 444 | 
|  | #define READYUV444_AVX2 __asm {                                                \ | 
|  | __asm vmovdqu    xmm0, [esi]                  /* U */         /* NOLINT */ \ | 
|  | __asm vmovdqu    xmm1, [esi + edi]            /* V */         /* NOLINT */ \ | 
|  | __asm lea        esi,  [esi + 16]                                          \ | 
|  | __asm vpermq     ymm0, ymm0, 0xd8                                          \ | 
|  | __asm vpermq     ymm1, ymm1, 0xd8                                          \ | 
|  | __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \ | 
|  | } | 
|  |  | 
|  | // Read 8 UV from 422, upsample to 16 UV. | 
|  | #define READYUV422_AVX2 __asm {                                                \ | 
|  | __asm vmovq      xmm0, qword ptr [esi]        /* U */         /* NOLINT */ \ | 
|  | __asm vmovq      xmm1, qword ptr [esi + edi]  /* V */         /* NOLINT */ \ | 
|  | __asm lea        esi,  [esi + 8]                                           \ | 
|  | __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \ | 
|  | __asm vpermq     ymm0, ymm0, 0xd8                                          \ | 
|  | __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \ | 
|  | } | 
|  |  | 
|  | // Read 4 UV from 411, upsample to 16 UV. | 
|  | #define READYUV411_AVX2 __asm {                                                \ | 
|  | __asm vmovd      xmm0, dword ptr [esi]        /* U */         /* NOLINT */ \ | 
|  | __asm vmovd      xmm1, dword ptr [esi + edi]  /* V */         /* NOLINT */ \ | 
|  | __asm lea        esi,  [esi + 4]                                           \ | 
|  | __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \ | 
|  | __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \ | 
|  | __asm vpermq     ymm0, ymm0, 0xd8                                          \ | 
|  | __asm vpunpckldq ymm0, ymm0, ymm0             /* UVUVUVUV (upsample) */    \ | 
|  | } | 
|  |  | 
|  | // Read 8 UV from NV12, upsample to 16 UV. | 
|  | #define READNV12_AVX2 __asm {                                                  \ | 
|  | __asm vmovdqu    xmm0, [esi]                  /* UV */                     \ | 
|  | __asm lea        esi,  [esi + 16]                                          \ | 
|  | __asm vpermq     ymm0, ymm0, 0xd8                                          \ | 
|  | __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \ | 
|  | } | 
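|  |  | 
|  | // The READYUV444/422/411 and READNV12 AVX2 macros above differ only in how | 
|  | // many chroma bytes they fetch per 16 output pixels and how often each | 
|  | // sample is duplicated (444: none, 422 and NV12: 2x, 411: 4x).  A scalar | 
|  | // picture of the 422 case, producing interleaved UV with each sample | 
|  | // shared by two pixels (illustrative only; hypothetical helper name): | 
|  | static void UpsampleUV422_SketchC(const uint8* u_buf, const uint8* v_buf, | 
|  |                                   uint8* dst_uv, int width) { | 
|  |   int x; | 
|  |   for (x = 0; x < width; x += 2) {  // width assumed to be a multiple of 2 | 
|  |     uint8 u = *u_buf++; | 
|  |     uint8 v = *v_buf++; | 
|  |     dst_uv[0] = u;  // pixel x | 
|  |     dst_uv[1] = v; | 
|  |     dst_uv[2] = u;  // pixel x + 1 reuses the same chroma sample | 
|  |     dst_uv[3] = v; | 
|  |     dst_uv += 4; | 
|  |   } | 
|  | } | 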
|  |  | 
|  | // Convert 16 pixels: 16 UV and 16 Y. | 
|  | #define YUVTORGB_AVX2(YuvConstants) __asm {                                    \ | 
|  | /* Step 1: Find 8 UV contributions to 16 R,G,B values */                   \ | 
|  | __asm vpmaddubsw ymm2, ymm0, YuvConstants.kUVToR        /* scale R UV */   \ | 
|  | __asm vpmaddubsw ymm1, ymm0, YuvConstants.kUVToG        /* scale G UV */   \ | 
|  | __asm vpmaddubsw ymm0, ymm0, YuvConstants.kUVToB        /* scale B UV */   \ | 
|  | __asm vmovdqu    ymm3, YuvConstants.kUVBiasR                               \ | 
|  | __asm vpsubw     ymm2, ymm3, ymm2                                          \ | 
|  | __asm vmovdqu    ymm3, YuvConstants.kUVBiasG                               \ | 
|  | __asm vpsubw     ymm1, ymm3, ymm1                                          \ | 
|  | __asm vmovdqu    ymm3, YuvConstants.kUVBiasB                               \ | 
|  | __asm vpsubw     ymm0, ymm3, ymm0                                          \ | 
|  | /* Step 2: Find Y contribution to 16 R,G,B values */                       \ | 
|  | __asm vmovdqu    xmm3, [eax]                  /* NOLINT */                 \ | 
|  | __asm lea        eax, [eax + 16]                                           \ | 
|  | __asm vpermq     ymm3, ymm3, 0xd8                                          \ | 
|  | __asm vpunpcklbw ymm3, ymm3, ymm3                                          \ | 
|  | __asm vpmulhuw   ymm3, ymm3, YuvConstants.kYToRgb                          \ | 
|  | __asm vpaddsw    ymm0, ymm0, ymm3           /* B += Y */                   \ | 
|  | __asm vpaddsw    ymm1, ymm1, ymm3           /* G += Y */                   \ | 
|  | __asm vpaddsw    ymm2, ymm2, ymm3           /* R += Y */                   \ | 
|  | __asm vpsraw     ymm0, ymm0, 6                                             \ | 
|  | __asm vpsraw     ymm1, ymm1, 6                                             \ | 
|  | __asm vpsraw     ymm2, ymm2, 6                                             \ | 
|  | __asm vpackuswb  ymm0, ymm0, ymm0           /* B */                        \ | 
|  | __asm vpackuswb  ymm1, ymm1, ymm1           /* G */                        \ | 
|  | __asm vpackuswb  ymm2, ymm2, ymm2           /* R */                        \ | 
|  | } | 
|  |  | 
|  | // Store 16 ARGB values. | 
|  | #define STOREARGB_AVX2 __asm {                                                 \ | 
|  | /* Step 3: Weave into ARGB */                                              \ | 
|  | __asm vpunpcklbw ymm0, ymm0, ymm1           /* BG */                       \ | 
|  | __asm vpermq     ymm0, ymm0, 0xd8                                          \ | 
|  | __asm vpunpcklbw ymm2, ymm2, ymm5           /* RA */                       \ | 
|  | __asm vpermq     ymm2, ymm2, 0xd8                                          \ | 
|  | __asm vpunpcklwd ymm1, ymm0, ymm2           /* BGRA first 8 pixels */      \ | 
|  | __asm vpunpckhwd ymm0, ymm0, ymm2           /* BGRA next 8 pixels */       \ | 
|  | __asm vmovdqu    0[edx], ymm1                                              \ | 
|  | __asm vmovdqu    32[edx], ymm0                                             \ | 
|  | __asm lea        edx,  [edx + 64]                                          \ | 
|  | } | 
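|  |  | 
|  | // Scalar picture of the fixed-point math in YUVTORGB_AVX2 (and the SSSE3 | 
|  | // YUVTORGB further below): the UV terms are formed against the 128 bias, | 
|  | // the Y term is a pmulhuw-style multiply of the duplicated Y byte, and | 
|  | // everything is shifted down by 6 and clamped.  Illustrative only: it | 
|  | // assumes the YG/YGB/UB/UG/VG/VR macros from the top of this file are | 
|  | // still defined at this point, and the clamp only approximates the | 
|  | // saturating paddsw/packuswb used by the SIMD code.  Hypothetical names. | 
|  | static __inline uint8 ClampToByte_Sketch(int v) { | 
|  |   return (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v)); | 
|  | } | 
|  |  | 
|  | static void YuvPixel_SketchC(uint8 y, uint8 u, uint8 v, | 
|  |                              uint8* b, uint8* g, uint8* r) { | 
|  |   int y1 = (int)(((uint32)y * 0x0101 * YG) >> 16);  // Y scaled by 1.164 * 64 | 
|  |   *b = ClampToByte_Sketch((y1 - UB * (u - 128) + YGB) >> 6); | 
|  |   *g = ClampToByte_Sketch((y1 - UG * (u - 128) - VG * (v - 128) + YGB) >> 6); | 
|  |   *r = ClampToByte_Sketch((y1 - VR * (v - 128) + YGB) >> 6); | 
|  | } | 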
|  |  | 
|  | #ifdef HAS_I422TOARGBROW_AVX2 | 
|  | // 16 pixels | 
|  | // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). | 
|  | __declspec(naked) | 
|  | void I422ToARGBRow_AVX2(const uint8* y_buf, | 
|  | const uint8* u_buf, | 
|  | const uint8* v_buf, | 
|  | uint8* dst_argb, | 
|  | int width) { | 
|  | __asm { | 
|  | push       esi | 
|  | push       edi | 
|  | mov        eax, [esp + 8 + 4]   // Y | 
|  | mov        esi, [esp + 8 + 8]   // U | 
|  | mov        edi, [esp + 8 + 12]  // V | 
|  | mov        edx, [esp + 8 + 16]  // argb | 
|  | mov        ecx, [esp + 8 + 20]  // width | 
|  | sub        edi, esi | 
|  | vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha | 
|  |  | 
|  | convertloop: | 
|  | READYUV422_AVX2 | 
|  | YUVTORGB_AVX2(kYuvConstants) | 
|  | STOREARGB_AVX2 | 
|  |  | 
|  | sub        ecx, 16 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        edi | 
|  | pop        esi | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_I422TOARGBROW_AVX2 | 
|  |  | 
|  | #ifdef HAS_J422TOARGBROW_AVX2 | 
|  | // 16 pixels | 
|  | // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). | 
|  | __declspec(naked) | 
|  | void J422ToARGBRow_AVX2(const uint8* y_buf, | 
|  | const uint8* u_buf, | 
|  | const uint8* v_buf, | 
|  | uint8* dst_argb, | 
|  | int width) { | 
|  | __asm { | 
|  | push       esi | 
|  | push       edi | 
|  | mov        eax, [esp + 8 + 4]   // Y | 
|  | mov        esi, [esp + 8 + 8]   // U | 
|  | mov        edi, [esp + 8 + 12]  // V | 
|  | mov        edx, [esp + 8 + 16]  // argb | 
|  | mov        ecx, [esp + 8 + 20]  // width | 
|  | sub        edi, esi | 
|  | vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha | 
|  |  | 
|  | convertloop: | 
|  | READYUV422_AVX2 | 
|  | YUVTORGB_AVX2(kYuvJConstants) | 
|  | STOREARGB_AVX2 | 
|  |  | 
|  | sub        ecx, 16 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        edi | 
|  | pop        esi | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_J422TOARGBROW_AVX2 | 
|  |  | 
|  | #ifdef HAS_I444TOARGBROW_AVX2 | 
|  | // 16 pixels | 
|  | // 16 UV values with 16 Y producing 16 ARGB (64 bytes). | 
|  | __declspec(naked) | 
|  | void I444ToARGBRow_AVX2(const uint8* y_buf, | 
|  | const uint8* u_buf, | 
|  | const uint8* v_buf, | 
|  | uint8* dst_argb, | 
|  | int width) { | 
|  | __asm { | 
|  | push       esi | 
|  | push       edi | 
|  | mov        eax, [esp + 8 + 4]   // Y | 
|  | mov        esi, [esp + 8 + 8]   // U | 
|  | mov        edi, [esp + 8 + 12]  // V | 
|  | mov        edx, [esp + 8 + 16]  // argb | 
|  | mov        ecx, [esp + 8 + 20]  // width | 
|  | sub        edi, esi | 
|  | vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha | 
|  |  | 
|  | convertloop: | 
|  | READYUV444_AVX2 | 
|  | YUVTORGB_AVX2(kYuvConstants) | 
|  | STOREARGB_AVX2 | 
|  |  | 
|  | sub        ecx, 16 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        edi | 
|  | pop        esi | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_I444TOARGBROW_AVX2 | 
|  |  | 
|  | #ifdef HAS_I411TOARGBROW_AVX2 | 
|  | // 16 pixels | 
|  | // 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). | 
|  | __declspec(naked) | 
|  | void I411ToARGBRow_AVX2(const uint8* y_buf, | 
|  | const uint8* u_buf, | 
|  | const uint8* v_buf, | 
|  | uint8* dst_argb, | 
|  | int width) { | 
|  | __asm { | 
|  | push       esi | 
|  | push       edi | 
|  | mov        eax, [esp + 8 + 4]   // Y | 
|  | mov        esi, [esp + 8 + 8]   // U | 
|  | mov        edi, [esp + 8 + 12]  // V | 
|  | mov        edx, [esp + 8 + 16]  // argb | 
|  | mov        ecx, [esp + 8 + 20]  // width | 
|  | sub        edi, esi | 
|  | vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha | 
|  |  | 
|  | convertloop: | 
|  | READYUV411_AVX2 | 
|  | YUVTORGB_AVX2(kYuvConstants) | 
|  | STOREARGB_AVX2 | 
|  |  | 
|  | sub        ecx, 16 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        edi | 
|  | pop        esi | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_I411TOARGBROW_AVX2 | 
|  |  | 
|  | #ifdef HAS_NV12TOARGBROW_AVX2 | 
|  | // 16 pixels. | 
|  | // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). | 
|  | __declspec(naked) | 
|  | void NV12ToARGBRow_AVX2(const uint8* y_buf, | 
|  | const uint8* uv_buf, | 
|  | uint8* dst_argb, | 
|  | int width) { | 
|  | __asm { | 
|  | push       esi | 
|  | mov        eax, [esp + 4 + 4]   // Y | 
|  | mov        esi, [esp + 4 + 8]   // UV | 
|  | mov        edx, [esp + 4 + 12]  // argb | 
|  | mov        ecx, [esp + 4 + 16]  // width | 
|  | vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha | 
|  |  | 
|  | convertloop: | 
|  | READNV12_AVX2 | 
|  | YUVTORGB_AVX2(kYuvConstants) | 
|  | STOREARGB_AVX2 | 
|  |  | 
|  | sub        ecx, 16 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        esi | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_NV12TOARGBROW_AVX2 | 
|  |  | 
|  | #ifdef HAS_NV21TOARGBROW_AVX2 | 
|  | // 16 pixels. | 
|  | // 8 VU values upsampled to 16 VU, mixed with 16 Y producing 16 ARGB (64 bytes). | 
|  | __declspec(naked) | 
|  | void NV21ToARGBRow_AVX2(const uint8* y_buf, | 
|  | const uint8* uv_buf, | 
|  | uint8* dst_argb, | 
|  | int width) { | 
|  | __asm { | 
|  | push       esi | 
|  | mov        eax, [esp + 4 + 4]   // Y | 
|  | mov        esi, [esp + 4 + 8]   // UV | 
|  | mov        edx, [esp + 4 + 12]  // argb | 
|  | mov        ecx, [esp + 4 + 16]  // width | 
|  | vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha | 
|  |  | 
|  | convertloop: | 
|  | READNV12_AVX2 | 
|  | YUVTORGB_AVX2(kYvuConstants) | 
|  | STOREARGB_AVX2 | 
|  |  | 
|  | sub        ecx, 16 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        esi | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_NV21TOARGBROW_AVX2 | 
|  |  | 
|  | #ifdef HAS_I422TOBGRAROW_AVX2 | 
|  | // 16 pixels | 
|  | // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). | 
|  | // TODO(fbarchard): Use macros to reduce duplicate code.  See SSSE3. | 
|  | __declspec(naked) | 
|  | void I422ToBGRARow_AVX2(const uint8* y_buf, | 
|  | const uint8* u_buf, | 
|  | const uint8* v_buf, | 
|  | uint8* dst_argb, | 
|  | int width) { | 
|  | __asm { | 
|  | push       esi | 
|  | push       edi | 
|  | mov        eax, [esp + 8 + 4]   // Y | 
|  | mov        esi, [esp + 8 + 8]   // U | 
|  | mov        edi, [esp + 8 + 12]  // V | 
|  | mov        edx, [esp + 8 + 16]  // argb | 
|  | mov        ecx, [esp + 8 + 20]  // width | 
|  | sub        edi, esi | 
|  | vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha | 
|  |  | 
|  | convertloop: | 
|  | READYUV422_AVX2 | 
|  | YUVTORGB_AVX2(kYuvConstants) | 
|  |  | 
|  | // Step 3: Weave into BGRA | 
|  | vpunpcklbw ymm1, ymm1, ymm0           // GB | 
|  | vpermq     ymm1, ymm1, 0xd8 | 
|  | vpunpcklbw ymm2, ymm5, ymm2           // AR | 
|  | vpermq     ymm2, ymm2, 0xd8 | 
|  | vpunpcklwd ymm0, ymm2, ymm1           // ARGB first 8 pixels | 
|  | vpunpckhwd ymm2, ymm2, ymm1           // ARGB next 8 pixels | 
|  | vmovdqu    [edx], ymm0 | 
|  | vmovdqu    [edx + 32], ymm2 | 
|  | lea        edx,  [edx + 64] | 
|  | sub        ecx, 16 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        edi | 
|  | pop        esi | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_I422TOBGRAROW_AVX2 | 
|  |  | 
|  | #ifdef HAS_I422TORGBAROW_AVX2 | 
|  | // 16 pixels | 
|  | // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). | 
|  | // TODO(fbarchard): Use macros to reduce duplicate code.  See SSSE3. | 
|  | __declspec(naked) | 
|  | void I422ToRGBARow_AVX2(const uint8* y_buf, | 
|  | const uint8* u_buf, | 
|  | const uint8* v_buf, | 
|  | uint8* dst_argb, | 
|  | int width) { | 
|  | __asm { | 
|  | push       esi | 
|  | push       edi | 
|  | mov        eax, [esp + 8 + 4]   // Y | 
|  | mov        esi, [esp + 8 + 8]   // U | 
|  | mov        edi, [esp + 8 + 12]  // V | 
|  | mov        edx, [esp + 8 + 16]  // argb | 
|  | mov        ecx, [esp + 8 + 20]  // width | 
|  | sub        edi, esi | 
|  | vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha | 
|  |  | 
|  | convertloop: | 
|  | READYUV422_AVX2 | 
|  | YUVTORGB_AVX2(kYuvConstants) | 
|  |  | 
|  | // Step 3: Weave into RGBA | 
|  | vpunpcklbw ymm1, ymm1, ymm2           // GR | 
|  | vpermq     ymm1, ymm1, 0xd8 | 
|  | vpunpcklbw ymm2, ymm5, ymm0           // AB | 
|  | vpermq     ymm2, ymm2, 0xd8 | 
|  | vpunpcklwd ymm0, ymm2, ymm1           // ABGR first 8 pixels | 
|  | vpunpckhwd ymm1, ymm2, ymm1           // ABGR next 8 pixels | 
|  | vmovdqu    [edx], ymm0 | 
|  | vmovdqu    [edx + 32], ymm1 | 
|  | lea        edx,  [edx + 64] | 
|  | sub        ecx, 16 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        edi | 
|  | pop        esi | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_I422TORGBAROW_AVX2 | 
|  |  | 
|  | #ifdef HAS_I422TOABGRROW_AVX2 | 
|  | // 16 pixels | 
|  | // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). | 
|  | // TODO(fbarchard): Use macros to reduce duplicate code.  See SSSE3. | 
|  | __declspec(naked) | 
|  | void I422ToABGRRow_AVX2(const uint8* y_buf, | 
|  | const uint8* u_buf, | 
|  | const uint8* v_buf, | 
|  | uint8* dst_argb, | 
|  | int width) { | 
|  | __asm { | 
|  | push       esi | 
|  | push       edi | 
|  | mov        eax, [esp + 8 + 4]   // Y | 
|  | mov        esi, [esp + 8 + 8]   // U | 
|  | mov        edi, [esp + 8 + 12]  // V | 
|  | mov        edx, [esp + 8 + 16]  // argb | 
|  | mov        ecx, [esp + 8 + 20]  // width | 
|  | sub        edi, esi | 
|  | vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha | 
|  |  | 
|  | convertloop: | 
|  | READYUV422_AVX2 | 
|  | YUVTORGB_AVX2(kYuvConstants) | 
|  |  | 
|  | // Step 3: Weave into ABGR | 
|  | vpunpcklbw ymm1, ymm2, ymm1           // RG | 
|  | vpermq     ymm1, ymm1, 0xd8 | 
|  | vpunpcklbw ymm2, ymm0, ymm5           // BA | 
|  | vpermq     ymm2, ymm2, 0xd8 | 
|  | vpunpcklwd ymm0, ymm1, ymm2           // RGBA first 8 pixels | 
|  | vpunpckhwd ymm1, ymm1, ymm2           // RGBA next 8 pixels | 
|  | vmovdqu    [edx], ymm0 | 
|  | vmovdqu    [edx + 32], ymm1 | 
|  | lea        edx,  [edx + 64] | 
|  | sub        ecx, 16 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        edi | 
|  | pop        esi | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_I422TOABGRROW_AVX2 | 
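|  |  | 
|  | // The AVX2 I422To{ARGB,BGRA,RGBA,ABGR} rows above run the same YUV math | 
|  | // and differ only in how the B, G, R and alpha bytes are interleaved on | 
|  | // store.  Scalar view of the four memory layouts produced by the weaves, | 
|  | // derived from the unpack order above (hypothetical helper; not part of | 
|  | // the build): | 
|  | static void StorePixel_SketchC(uint8 b, uint8 g, uint8 r, uint8 a, | 
|  |                                uint8* dst, int layout) { | 
|  |   switch (layout) { | 
|  |     case 0: dst[0] = b; dst[1] = g; dst[2] = r; dst[3] = a; break;  // ARGB | 
|  |     case 1: dst[0] = a; dst[1] = r; dst[2] = g; dst[3] = b; break;  // BGRA | 
|  |     case 2: dst[0] = a; dst[1] = b; dst[2] = g; dst[3] = r; break;  // RGBA | 
|  |     case 3: dst[0] = r; dst[1] = g; dst[2] = b; dst[3] = a; break;  // ABGR | 
|  |   } | 
|  | } | 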
|  |  | 
|  | #if defined(HAS_I422TOARGBROW_SSSE3) | 
|  | // TODO(fbarchard): Read that does half size on Y and treats 420 as 444. | 
|  |  | 
|  | // Read 8 UV from 444. | 
|  | #define READYUV444 __asm {                                                     \ | 
|  | __asm movq       xmm0, qword ptr [esi] /* U */                /* NOLINT */ \ | 
|  | __asm movq       xmm1, qword ptr [esi + edi] /* V */          /* NOLINT */ \ | 
|  | __asm lea        esi,  [esi + 8]                                           \ | 
|  | __asm punpcklbw  xmm0, xmm1           /* UV */                             \ | 
|  | } | 
|  |  | 
|  | // Read 4 UV from 422, upsample to 8 UV. | 
|  | #define READYUV422 __asm {                                                     \ | 
|  | __asm movd       xmm0, [esi]          /* U */                              \ | 
|  | __asm movd       xmm1, [esi + edi]    /* V */                              \ | 
|  | __asm lea        esi,  [esi + 4]                                           \ | 
|  | __asm punpcklbw  xmm0, xmm1           /* UV */                             \ | 
|  | __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \ | 
|  | } | 
|  |  | 
|  | // Read 2 UV from 411, upsample to 8 UV. | 
|  | #define READYUV411 __asm {                                                     \ | 
|  | __asm movzx      ebx, word ptr [esi]        /* U */           /* NOLINT */ \ | 
|  | __asm movd       xmm0, ebx                                                 \ | 
|  | __asm movzx      ebx, word ptr [esi + edi]  /* V */           /* NOLINT */ \ | 
|  | __asm movd       xmm1, ebx                                                 \ | 
|  | __asm lea        esi,  [esi + 2]                                           \ | 
|  | __asm punpcklbw  xmm0, xmm1           /* UV */                             \ | 
|  | __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \ | 
|  | __asm punpckldq  xmm0, xmm0           /* UVUVUVUV (upsample) */            \ | 
|  | } | 
|  |  | 
|  | // Read 4 UV from NV12, upsample to 8 UV. | 
|  | #define READNV12 __asm {                                                       \ | 
|  | __asm movq       xmm0, qword ptr [esi] /* UV */               /* NOLINT */ \ | 
|  | __asm lea        esi,  [esi + 8]                                           \ | 
|  | __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \ | 
|  | } | 
|  |  | 
|  | // Convert 8 pixels: 8 UV and 8 Y. | 
|  | #define YUVTORGB(YuvConstants) __asm {                                         \ | 
|  | /* Step 1: Find 4 UV contributions to 8 R,G,B values */                    \ | 
|  | __asm movdqa     xmm1, xmm0                                                \ | 
|  | __asm movdqa     xmm2, xmm0                                                \ | 
|  | __asm movdqa     xmm3, xmm0                                                \ | 
|  | __asm movdqa     xmm0, YuvConstants.kUVBiasB /* unbias back to signed */   \ | 
|  | __asm pmaddubsw  xmm1, YuvConstants.kUVToB   /* scale B UV */              \ | 
|  | __asm psubw      xmm0, xmm1                                                \ | 
|  | __asm movdqa     xmm1, YuvConstants.kUVBiasG                               \ | 
|  | __asm pmaddubsw  xmm2, YuvConstants.kUVToG   /* scale G UV */              \ | 
|  | __asm psubw      xmm1, xmm2                                                \ | 
|  | __asm movdqa     xmm2, YuvConstants.kUVBiasR                               \ | 
|  | __asm pmaddubsw  xmm3, YuvConstants.kUVToR   /* scale R UV */              \ | 
|  | __asm psubw      xmm2, xmm3                                                \ | 
|  | /* Step 2: Find Y contribution to 8 R,G,B values */                        \ | 
|  | __asm movq       xmm3, qword ptr [eax]                        /* NOLINT */ \ | 
|  | __asm lea        eax, [eax + 8]                                            \ | 
|  | __asm punpcklbw  xmm3, xmm3                                                \ | 
|  | __asm pmulhuw    xmm3, YuvConstants.kYToRgb                                \ | 
|  | __asm paddsw     xmm0, xmm3           /* B += Y */                         \ | 
|  | __asm paddsw     xmm1, xmm3           /* G += Y */                         \ | 
|  | __asm paddsw     xmm2, xmm3           /* R += Y */                         \ | 
|  | __asm psraw      xmm0, 6                                                   \ | 
|  | __asm psraw      xmm1, 6                                                   \ | 
|  | __asm psraw      xmm2, 6                                                   \ | 
|  | __asm packuswb   xmm0, xmm0           /* B */                              \ | 
|  | __asm packuswb   xmm1, xmm1           /* G */                              \ | 
|  | __asm packuswb   xmm2, xmm2           /* R */                              \ | 
|  | } | 
|  |  | 
|  | // Store 8 ARGB values. | 
|  | #define STOREARGB __asm {                                                      \ | 
|  | /* Step 3: Weave into ARGB */                                              \ | 
|  | __asm punpcklbw  xmm0, xmm1           /* BG */                             \ | 
|  | __asm punpcklbw  xmm2, xmm5           /* RA */                             \ | 
|  | __asm movdqa     xmm1, xmm0                                                \ | 
|  | __asm punpcklwd  xmm0, xmm2           /* BGRA first 4 pixels */            \ | 
|  | __asm punpckhwd  xmm1, xmm2           /* BGRA next 4 pixels */             \ | 
|  | __asm movdqu     0[edx], xmm0                                              \ | 
|  | __asm movdqu     16[edx], xmm1                                             \ | 
|  | __asm lea        edx,  [edx + 32]                                          \ | 
|  | } | 
|  |  | 
|  | // Store 8 BGRA values. | 
|  | #define STOREBGRA __asm {                                                      \ | 
|  | /* Step 3: Weave into BGRA */                                              \ | 
|  | __asm pcmpeqb    xmm5, xmm5           /* generate 0xffffffff for alpha */  \ | 
|  | __asm punpcklbw  xmm1, xmm0           /* GB */                             \ | 
|  | __asm punpcklbw  xmm5, xmm2           /* AR */                             \ | 
|  | __asm movdqa     xmm0, xmm5                                                \ | 
|  | __asm punpcklwd  xmm5, xmm1           /* BGRA first 4 pixels */            \ | 
|  | __asm punpckhwd  xmm0, xmm1           /* BGRA next 4 pixels */             \ | 
|  | __asm movdqu     0[edx], xmm5                                              \ | 
|  | __asm movdqu     16[edx], xmm0                                             \ | 
|  | __asm lea        edx,  [edx + 32]                                          \ | 
|  | } | 
|  |  | 
|  | // Store 8 ABGR values. | 
|  | #define STOREABGR __asm {                                                      \ | 
|  | /* Step 3: Weave into ABGR */                                              \ | 
|  | __asm punpcklbw  xmm2, xmm1           /* RG */                             \ | 
|  | __asm punpcklbw  xmm0, xmm5           /* BA */                             \ | 
|  | __asm movdqa     xmm1, xmm2                                                \ | 
|  | __asm punpcklwd  xmm2, xmm0           /* RGBA first 4 pixels */            \ | 
|  | __asm punpckhwd  xmm1, xmm0           /* RGBA next 4 pixels */             \ | 
|  | __asm movdqu     0[edx], xmm2                                              \ | 
|  | __asm movdqu     16[edx], xmm1                                             \ | 
|  | __asm lea        edx,  [edx + 32]                                          \ | 
|  | } | 
|  |  | 
|  | // Store 8 RGBA values. | 
|  | #define STORERGBA __asm {                                                      \ | 
|  | /* Step 3: Weave into RGBA */                                              \ | 
|  | __asm pcmpeqb    xmm5, xmm5           /* generate 0xffffffff for alpha */  \ | 
|  | __asm punpcklbw  xmm1, xmm2           /* GR */                             \ | 
|  | __asm punpcklbw  xmm5, xmm0           /* AB */                             \ | 
|  | __asm movdqa     xmm0, xmm5                                                \ | 
|  | __asm punpcklwd  xmm5, xmm1           /* RGBA first 4 pixels */            \ | 
|  | __asm punpckhwd  xmm0, xmm1           /* RGBA next 4 pixels */             \ | 
|  | __asm movdqu     0[edx], xmm5                                              \ | 
|  | __asm movdqu     16[edx], xmm0                                             \ | 
|  | __asm lea        edx,  [edx + 32]                                          \ | 
|  | } | 
|  |  | 
|  | // Store 8 RGB24 values. | 
|  | #define STORERGB24 __asm {                                                     \ | 
|  | /* Step 3: Weave into RRGB */                                              \ | 
|  | __asm punpcklbw  xmm0, xmm1           /* BG */                             \ | 
|  | __asm punpcklbw  xmm2, xmm2           /* RR */                             \ | 
|  | __asm movdqa     xmm1, xmm0                                                \ | 
|  | __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \ | 
|  | __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \ | 
|  | /* Step 4: RRGB -> RGB24 */                                                \ | 
|  | __asm pshufb     xmm0, xmm5           /* Pack first 8 and last 4 bytes. */ \ | 
|  | __asm pshufb     xmm1, xmm6           /* Pack first 12 bytes. */           \ | 
|  | __asm palignr    xmm1, xmm0, 12       /* last 4 bytes of xmm0 + 12 xmm1 */ \ | 
|  | __asm movq       qword ptr 0[edx], xmm0  /* First 8 bytes */               \ | 
|  | __asm movdqu     8[edx], xmm1         /* Last 16 bytes */                  \ | 
|  | __asm lea        edx,  [edx + 24]                                          \ | 
|  | } | 
|  |  | 
|  | // Store 8 RAW values. | 
|  | #define STORERAW __asm {                                                       \ | 
|  | /* Step 3: Weave into RRGB */                                              \ | 
|  | __asm punpcklbw  xmm0, xmm1           /* BG */                             \ | 
|  | __asm punpcklbw  xmm2, xmm2           /* RR */                             \ | 
|  | __asm movdqa     xmm1, xmm0                                                \ | 
|  | __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \ | 
|  | __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \ | 
|  | /* Step 4: RRGB -> RAW */                                                  \ | 
|  | __asm pshufb     xmm0, xmm5           /* Pack first 8 and last 4 bytes. */ \ | 
|  | __asm pshufb     xmm1, xmm6           /* Pack first 12 bytes. */           \ | 
|  | __asm palignr    xmm1, xmm0, 12       /* last 4 bytes of xmm0 + 12 xmm1 */ \ | 
|  | __asm movq       qword ptr 0[edx], xmm0  /* First 8 bytes */               \ | 
|  | __asm movdqu     8[edx], xmm1         /* Last 16 bytes */                  \ | 
|  | __asm lea        edx,  [edx + 24]                                          \ | 
|  | } | 
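|  |  | 
|  | // STORERGB24 and STORERAW above drop the duplicated R lane and emit three | 
|  | // bytes per pixel: RGB24 in the usual libyuv B,G,R memory order and RAW | 
|  | // with R and B swapped.  Per-pixel scalar equivalent for the RGB24 case | 
|  | // (illustrative only; hypothetical helper name): | 
|  | static void StoreRGB24Pixel_SketchC(uint8 b, uint8 g, uint8 r, uint8* dst) { | 
|  |   dst[0] = b; | 
|  |   dst[1] = g; | 
|  |   dst[2] = r; | 
|  | } | 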
|  |  | 
|  | // Store 8 RGB565 values. | 
|  | #define STORERGB565 __asm {                                                    \ | 
|  | /* Step 3: Weave into RRGB */                                              \ | 
|  | __asm punpcklbw  xmm0, xmm1           /* BG */                             \ | 
|  | __asm punpcklbw  xmm2, xmm2           /* RR */                             \ | 
|  | __asm movdqa     xmm1, xmm0                                                \ | 
|  | __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \ | 
|  | __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \ | 
|  | /* Step 4: RRGB -> RGB565 */                                               \ | 
|  | __asm movdqa     xmm3, xmm0    /* B  first 4 pixels of argb */             \ | 
|  | __asm movdqa     xmm2, xmm0    /* G */                                     \ | 
|  | __asm pslld      xmm0, 8       /* R */                                     \ | 
|  | __asm psrld      xmm3, 3       /* B */                                     \ | 
|  | __asm psrld      xmm2, 5       /* G */                                     \ | 
|  | __asm psrad      xmm0, 16      /* R */                                     \ | 
|  | __asm pand       xmm3, xmm5    /* B */                                     \ | 
|  | __asm pand       xmm2, xmm6    /* G */                                     \ | 
|  | __asm pand       xmm0, xmm7    /* R */                                     \ | 
|  | __asm por        xmm3, xmm2    /* BG */                                    \ | 
|  | __asm por        xmm0, xmm3    /* BGR */                                   \ | 
|  | __asm movdqa     xmm3, xmm1    /* B  next 4 pixels of argb */              \ | 
|  | __asm movdqa     xmm2, xmm1    /* G */                                     \ | 
|  | __asm pslld      xmm1, 8       /* R */                                     \ | 
|  | __asm psrld      xmm3, 3       /* B */                                     \ | 
|  | __asm psrld      xmm2, 5       /* G */                                     \ | 
|  | __asm psrad      xmm1, 16      /* R */                                     \ | 
|  | __asm pand       xmm3, xmm5    /* B */                                     \ | 
|  | __asm pand       xmm2, xmm6    /* G */                                     \ | 
|  | __asm pand       xmm1, xmm7    /* R */                                     \ | 
|  | __asm por        xmm3, xmm2    /* BG */                                    \ | 
|  | __asm por        xmm1, xmm3    /* BGR */                                   \ | 
|  | __asm packssdw   xmm0, xmm1                                                \ | 
|  | __asm movdqu     0[edx], xmm0  /* store 8 pixels of RGB565 */              \ | 
|  | __asm lea        edx, [edx + 16]                                           \ | 
|  | } | 
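|  |  | 
|  | // STORERGB565 above packs each 8:8:8 pixel down to 5:6:5 by keeping the | 
|  | // top bits of each channel.  Per-pixel scalar equivalent (illustrative | 
|  | // only; hypothetical helper name): | 
|  | static uint16 PackRGB565_SketchC(uint8 b, uint8 g, uint8 r) { | 
|  |   return (uint16)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11)); | 
|  | } | 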
|  |  | 
|  | // 8 pixels. | 
|  | // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). | 
|  | __declspec(naked) | 
|  | void I444ToARGBRow_SSSE3(const uint8* y_buf, | 
|  | const uint8* u_buf, | 
|  | const uint8* v_buf, | 
|  | uint8* dst_argb, | 
|  | int width) { | 
|  | __asm { | 
|  | push       esi | 
|  | push       edi | 
|  | mov        eax, [esp + 8 + 4]   // Y | 
|  | mov        esi, [esp + 8 + 8]   // U | 
|  | mov        edi, [esp + 8 + 12]  // V | 
|  | mov        edx, [esp + 8 + 16]  // argb | 
|  | mov        ecx, [esp + 8 + 20]  // width | 
|  | sub        edi, esi | 
|  | pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha | 
|  |  | 
|  | convertloop: | 
|  | READYUV444 | 
|  | YUVTORGB(kYuvConstants) | 
|  | STOREARGB | 
|  |  | 
|  | sub        ecx, 8 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        edi | 
|  | pop        esi | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | // 8 pixels. | 
|  | // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes). | 
|  | __declspec(naked) | 
|  | void I422ToRGB24Row_SSSE3(const uint8* y_buf, | 
|  | const uint8* u_buf, | 
|  | const uint8* v_buf, | 
|  | uint8* dst_rgb24, | 
|  | int width) { | 
|  | __asm { | 
|  | push       esi | 
|  | push       edi | 
|  | mov        eax, [esp + 8 + 4]   // Y | 
|  | mov        esi, [esp + 8 + 8]   // U | 
|  | mov        edi, [esp + 8 + 12]  // V | 
|  | mov        edx, [esp + 8 + 16]  // rgb24 | 
|  | mov        ecx, [esp + 8 + 20]  // width | 
|  | sub        edi, esi | 
|  | movdqa     xmm5, kShuffleMaskARGBToRGB24_0 | 
|  | movdqa     xmm6, kShuffleMaskARGBToRGB24 | 
|  |  | 
|  | convertloop: | 
|  | READYUV422 | 
|  | YUVTORGB(kYuvConstants) | 
|  | STORERGB24 | 
|  |  | 
|  | sub        ecx, 8 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        edi | 
|  | pop        esi | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | // 8 pixels. | 
|  | // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes). | 
|  | __declspec(naked) | 
|  | void I422ToRAWRow_SSSE3(const uint8* y_buf, | 
|  | const uint8* u_buf, | 
|  | const uint8* v_buf, | 
|  | uint8* dst_raw, | 
|  | int width) { | 
|  | __asm { | 
|  | push       esi | 
|  | push       edi | 
|  | mov        eax, [esp + 8 + 4]   // Y | 
|  | mov        esi, [esp + 8 + 8]   // U | 
|  | mov        edi, [esp + 8 + 12]  // V | 
|  | mov        edx, [esp + 8 + 16]  // raw | 
|  | mov        ecx, [esp + 8 + 20]  // width | 
|  | sub        edi, esi | 
|  | movdqa     xmm5, kShuffleMaskARGBToRAW_0 | 
|  | movdqa     xmm6, kShuffleMaskARGBToRAW | 
|  |  | 
|  | convertloop: | 
|  | READYUV422 | 
|  | YUVTORGB(kYuvConstants) | 
|  | STORERAW | 
|  |  | 
|  | sub        ecx, 8 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        edi | 
|  | pop        esi | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | // 8 pixels | 
|  | // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes). | 
|  | __declspec(naked) | 
|  | void I422ToRGB565Row_SSSE3(const uint8* y_buf, | 
|  | const uint8* u_buf, | 
|  | const uint8* v_buf, | 
|  | uint8* rgb565_buf, | 
|  | int width) { | 
|  | __asm { | 
|  | push       esi | 
|  | push       edi | 
|  | mov        eax, [esp + 8 + 4]   // Y | 
|  | mov        esi, [esp + 8 + 8]   // U | 
|  | mov        edi, [esp + 8 + 12]  // V | 
|  | mov        edx, [esp + 8 + 16]  // rgb565 | 
|  | mov        ecx, [esp + 8 + 20]  // width | 
|  | sub        edi, esi | 
|  | pcmpeqb    xmm5, xmm5       // generate mask 0x0000001f | 
|  | psrld      xmm5, 27 | 
|  | pcmpeqb    xmm6, xmm6       // generate mask 0x000007e0 | 
|  | psrld      xmm6, 26 | 
|  | pslld      xmm6, 5 | 
|  | pcmpeqb    xmm7, xmm7       // generate mask 0xfffff800 | 
|  | pslld      xmm7, 11 | 
|  |  | 
|  | convertloop: | 
|  | READYUV422 | 
|  | YUVTORGB(kYuvConstants) | 
|  | STORERGB565 | 
|  |  | 
|  | sub        ecx, 8 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        edi | 
|  | pop        esi | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | // 8 pixels. | 
|  | // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). | 
|  | __declspec(naked) | 
|  | void I422ToARGBRow_SSSE3(const uint8* y_buf, | 
|  | const uint8* u_buf, | 
|  | const uint8* v_buf, | 
|  | uint8* dst_argb, | 
|  | int width) { | 
|  | __asm { | 
|  | push       esi | 
|  | push       edi | 
|  | mov        eax, [esp + 8 + 4]   // Y | 
|  | mov        esi, [esp + 8 + 8]   // U | 
|  | mov        edi, [esp + 8 + 12]  // V | 
|  | mov        edx, [esp + 8 + 16]  // argb | 
|  | mov        ecx, [esp + 8 + 20]  // width | 
|  | sub        edi, esi | 
|  | pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha | 
|  |  | 
|  | convertloop: | 
|  | READYUV422 | 
|  | YUVTORGB(kYuvConstants) | 
|  | STOREARGB | 
|  |  | 
|  | sub        ecx, 8 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        edi | 
|  | pop        esi | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | // 8 pixels. | 
|  | // JPEG color space version of I422ToARGB. | 
|  | // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). | 
|  | __declspec(naked) | 
|  | void J422ToARGBRow_SSSE3(const uint8* y_buf, | 
|  | const uint8* u_buf, | 
|  | const uint8* v_buf, | 
|  | uint8* dst_argb, | 
|  | int width) { | 
|  | __asm { | 
|  | push       esi | 
|  | push       edi | 
|  | mov        eax, [esp + 8 + 4]   // Y | 
|  | mov        esi, [esp + 8 + 8]   // U | 
|  | mov        edi, [esp + 8 + 12]  // V | 
|  | mov        edx, [esp + 8 + 16]  // argb | 
|  | mov        ecx, [esp + 8 + 20]  // width | 
|  | sub        edi, esi | 
|  | pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha | 
|  |  | 
|  | convertloop: | 
|  | READYUV422 | 
|  | YUVTORGB(kYuvJConstants) | 
|  | STOREARGB | 
|  |  | 
|  | sub        ecx, 8 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        edi | 
|  | pop        esi | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | // 8 pixels. | 
|  | // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). | 
|  | // Similar to I420 but duplicates UV once more. | 
|  | __declspec(naked) | 
|  | void I411ToARGBRow_SSSE3(const uint8* y_buf, | 
|  | const uint8* u_buf, | 
|  | const uint8* v_buf, | 
|  | uint8* dst_argb, | 
|  | int width) { | 
|  | __asm { | 
|  | push       ebx | 
|  | push       esi | 
|  | push       edi | 
|  | mov        eax, [esp + 12 + 4]   // Y | 
|  | mov        esi, [esp + 12 + 8]   // U | 
|  | mov        edi, [esp + 12 + 12]  // V | 
|  | mov        edx, [esp + 12 + 16]  // argb | 
|  | mov        ecx, [esp + 12 + 20]  // width | 
|  | sub        edi, esi | 
|  | pcmpeqb    xmm5, xmm5            // generate 0xffffffff for alpha | 
|  |  | 
|  | convertloop: | 
|  | READYUV411  // modifies EBX | 
|  | YUVTORGB(kYuvConstants) | 
|  | STOREARGB | 
|  |  | 
|  | sub        ecx, 8 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        edi | 
|  | pop        esi | 
|  | pop        ebx | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | // 8 pixels. | 
|  | // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). | 
|  | __declspec(naked) | 
|  | void NV12ToARGBRow_SSSE3(const uint8* y_buf, | 
|  | const uint8* uv_buf, | 
|  | uint8* dst_argb, | 
|  | int width) { | 
|  | __asm { | 
|  | push       esi | 
|  | mov        eax, [esp + 4 + 4]   // Y | 
|  | mov        esi, [esp + 4 + 8]   // UV | 
|  | mov        edx, [esp + 4 + 12]  // argb | 
|  | mov        ecx, [esp + 4 + 16]  // width | 
|  | pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha | 
|  |  | 
|  | convertloop: | 
|  | READNV12 | 
|  | YUVTORGB(kYuvConstants) | 
|  | STOREARGB | 
|  |  | 
|  | sub        ecx, 8 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        esi | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | // 8 pixels. | 
|  | // 4 VU values upsampled to 8 VU, mixed with 8 Y producing 8 ARGB (32 bytes). | 
|  | __declspec(naked) | 
|  | void NV21ToARGBRow_SSSE3(const uint8* y_buf, | 
|  | const uint8* uv_buf, | 
|  | uint8* dst_argb, | 
|  | int width) { | 
|  | __asm { | 
|  | push       esi | 
|  | mov        eax, [esp + 4 + 4]   // Y | 
|  | mov        esi, [esp + 4 + 8]   // UV | 
|  | mov        edx, [esp + 4 + 12]  // argb | 
|  | mov        ecx, [esp + 4 + 16]  // width | 
|  | pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha | 
|  |  | 
|  | convertloop: | 
|  | READNV12 | 
|  | YUVTORGB(kYvuConstants) | 
|  | STOREARGB | 
|  |  | 
|  | sub        ecx, 8 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        esi | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | __declspec(naked) | 
|  | void I422ToBGRARow_SSSE3(const uint8* y_buf, | 
|  | const uint8* u_buf, | 
|  | const uint8* v_buf, | 
|  | uint8* dst_bgra, | 
|  | int width) { | 
|  | __asm { | 
|  | push       esi | 
|  | push       edi | 
|  | mov        eax, [esp + 8 + 4]   // Y | 
|  | mov        esi, [esp + 8 + 8]   // U | 
|  | mov        edi, [esp + 8 + 12]  // V | 
|  | mov        edx, [esp + 8 + 16]  // bgra | 
|  | mov        ecx, [esp + 8 + 20]  // width | 
|  | sub        edi, esi | 
|  |  | 
|  | convertloop: | 
|  | READYUV422 | 
|  | YUVTORGB(kYuvConstants) | 
|  | STOREBGRA | 
|  |  | 
|  | sub        ecx, 8 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        edi | 
|  | pop        esi | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | __declspec(naked) | 
|  | void I422ToABGRRow_SSSE3(const uint8* y_buf, | 
|  | const uint8* u_buf, | 
|  | const uint8* v_buf, | 
|  | uint8* dst_abgr, | 
|  | int width) { | 
|  | __asm { | 
|  | push       esi | 
|  | push       edi | 
|  | mov        eax, [esp + 8 + 4]   // Y | 
|  | mov        esi, [esp + 8 + 8]   // U | 
|  | mov        edi, [esp + 8 + 12]  // V | 
|  | mov        edx, [esp + 8 + 16]  // abgr | 
|  | mov        ecx, [esp + 8 + 20]  // width | 
|  | sub        edi, esi | 
|  | pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha | 
|  |  | 
|  | convertloop: | 
|  | READYUV422 | 
|  | YUVTORGB(kYuvConstants) | 
|  | STOREABGR | 
|  |  | 
|  | sub        ecx, 8 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        edi | 
|  | pop        esi | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | __declspec(naked) | 
|  | void I422ToRGBARow_SSSE3(const uint8* y_buf, | 
|  | const uint8* u_buf, | 
|  | const uint8* v_buf, | 
|  | uint8* dst_rgba, | 
|  | int width) { | 
|  | __asm { | 
|  | push       esi | 
|  | push       edi | 
|  | mov        eax, [esp + 8 + 4]   // Y | 
|  | mov        esi, [esp + 8 + 8]   // U | 
|  | mov        edi, [esp + 8 + 12]  // V | 
|  | mov        edx, [esp + 8 + 16]  // rgba | 
|  | mov        ecx, [esp + 8 + 20]  // width | 
|  | sub        edi, esi | 
|  |  | 
|  | convertloop: | 
|  | READYUV422 | 
|  | YUVTORGB(kYuvConstants) | 
|  | STORERGBA | 
|  |  | 
|  | sub        ecx, 8 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        edi | 
|  | pop        esi | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | #endif  // HAS_I422TOARGBROW_SSSE3 | 
|  |  | 
|  | #ifdef HAS_I400TOARGBROW_SSE2 | 
|  | // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes). | 
|  | __declspec(naked) | 
|  | void I400ToARGBRow_SSE2(const uint8* y_buf, | 
|  | uint8* rgb_buf, | 
|  | int width) { | 
|  | __asm { | 
|  | mov        eax, 0x4a354a35      // 4a35 = 18997 = round(1.164 * 64 * 256 * 256 / 257) | 
|  | movd       xmm2, eax | 
|  | pshufd     xmm2, xmm2,0 | 
|  | mov        eax, 0x04880488      // 0488 = 1160 = round(1.164 * 64 * 16 - 32) | 
|  | movd       xmm3, eax | 
|  | pshufd     xmm3, xmm3, 0 | 
|  | pcmpeqb    xmm4, xmm4           // generate mask 0xff000000 | 
|  | pslld      xmm4, 24 | 
|  |  | 
|  | mov        eax, [esp + 4]       // Y | 
|  | mov        edx, [esp + 8]       // rgb | 
|  | mov        ecx, [esp + 12]      // width | 
|  |  | 
|  | convertloop: | 
|  | // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 | 
|  | movq       xmm0, qword ptr [eax] | 
|  | lea        eax, [eax + 8] | 
|  | punpcklbw  xmm0, xmm0           // Y.Y | 
|  | pmulhuw    xmm0, xmm2 | 
|  | psubusw    xmm0, xmm3 | 
|  | psrlw      xmm0, 6 | 
|  | packuswb   xmm0, xmm0           // G | 
|  |  | 
|  | // Step 2: Weave into ARGB | 
|  | punpcklbw  xmm0, xmm0           // GG | 
|  | movdqa     xmm1, xmm0 | 
|  | punpcklwd  xmm0, xmm0           // BGRA first 4 pixels | 
|  | punpckhwd  xmm1, xmm1           // BGRA next 4 pixels | 
|  | por        xmm0, xmm4 | 
|  | por        xmm1, xmm4 | 
|  | movdqu     [edx], xmm0 | 
|  | movdqu     [edx + 16], xmm1 | 
|  | lea        edx,  [edx + 32] | 
|  | sub        ecx, 8 | 
|  | jg         convertloop | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_I400TOARGBROW_SSE2 | 
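|  |  | 
|  | // Scalar picture of the per-pixel math in I400ToARGBRow_SSE2/AVX2: expand | 
|  | // (y - 16) * 1.164 into B, G and R with opaque alpha, using the same | 
|  | // 0x4a35 scale and 0x0488 bias as the code above.  Illustrative only; | 
|  | // hypothetical helper name. | 
|  | static void I400ToARGBPixel_SketchC(uint8 y, uint8* dst_argb) { | 
|  |   int g = (int)(((uint32)y * 0x0101 * 0x4a35) >> 16);  // pmulhuw of Y.Y | 
|  |   g = g > 0x0488 ? g - 0x0488 : 0;                      // psubusw | 
|  |   g >>= 6; | 
|  |   if (g > 255) g = 255;                                 // packuswb clamp | 
|  |   dst_argb[0] = (uint8)g;  // B | 
|  |   dst_argb[1] = (uint8)g;  // G | 
|  |   dst_argb[2] = (uint8)g;  // R | 
|  |   dst_argb[3] = 255;       // A | 
|  | } | 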
|  |  | 
|  | #ifdef HAS_I400TOARGBROW_AVX2 | 
|  | // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). | 
|  | // note: vpunpcklbw mutates and vpackuswb unmutates. | 
|  | __declspec(naked) | 
|  | void I400ToARGBRow_AVX2(const uint8* y_buf, | 
|  | uint8* rgb_buf, | 
|  | int width) { | 
|  | __asm { | 
|  | mov        eax, 0x4a354a35      // 4a35 = 18997 = round(1.164 * 64 * 256 * 256 / 257) | 
|  | vmovd      xmm2, eax | 
|  | vbroadcastss ymm2, xmm2 | 
|  | mov        eax, 0x04880488      // 0488 = 1160 = round(1.164 * 64 * 16 - 32) | 
|  | vmovd      xmm3, eax | 
|  | vbroadcastss ymm3, xmm3 | 
|  | vpcmpeqb   ymm4, ymm4, ymm4     // generate mask 0xff000000 | 
|  | vpslld     ymm4, ymm4, 24 | 
|  |  | 
|  | mov        eax, [esp + 4]       // Y | 
|  | mov        edx, [esp + 8]       // rgb | 
|  | mov        ecx, [esp + 12]      // width | 
|  |  | 
|  | convertloop: | 
|  | // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164 |
|  | vmovdqu    xmm0, [eax] | 
|  | lea        eax, [eax + 16] | 
|  | vpermq     ymm0, ymm0, 0xd8           // vpunpcklbw mutates | 
|  | vpunpcklbw ymm0, ymm0, ymm0           // Y.Y | 
|  | vpmulhuw   ymm0, ymm0, ymm2 | 
|  | vpsubusw   ymm0, ymm0, ymm3 | 
|  | vpsrlw     ymm0, ymm0, 6 | 
|  | vpackuswb  ymm0, ymm0, ymm0           // G.  still mutated: 3120 | 
|  |  | 
|  | // TODO(fbarchard): Weave alpha with unpack. | 
|  | // Step 2: Weave into ARGB | 
|  | vpunpcklbw ymm1, ymm0, ymm0           // GG - mutates | 
|  | vpermq     ymm1, ymm1, 0xd8 | 
|  | vpunpcklwd ymm0, ymm1, ymm1           // GGGG first 8 pixels | 
|  | vpunpckhwd ymm1, ymm1, ymm1           // GGGG next 8 pixels | 
|  | vpor       ymm0, ymm0, ymm4 | 
|  | vpor       ymm1, ymm1, ymm4 | 
|  | vmovdqu    [edx], ymm0 | 
|  | vmovdqu    [edx + 32], ymm1 | 
|  | lea        edx,  [edx + 64] | 
|  | sub        ecx, 16 | 
|  | jg         convertloop | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_I400TOARGBROW_AVX2 | 
|  |  | 
|  | #ifdef HAS_MIRRORROW_SSSE3 | 
|  | // Shuffle table for reversing the bytes. | 
|  | static const uvec8 kShuffleMirror = { | 
|  | 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u | 
|  | }; | 
|  |  | 
|  | // TODO(fbarchard): Replace lea with -16 offset. | 
|  | __declspec(naked) | 
|  | void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { | 
|  | __asm { | 
|  | mov       eax, [esp + 4]   // src | 
|  | mov       edx, [esp + 8]   // dst | 
|  | mov       ecx, [esp + 12]  // width | 
|  | movdqa    xmm5, kShuffleMirror | 
|  |  | 
|  | convertloop: | 
|  | movdqu    xmm0, [eax - 16 + ecx] | 
|  | pshufb    xmm0, xmm5 | 
|  | movdqu    [edx], xmm0 | 
|  | lea       edx, [edx + 16] | 
|  | sub       ecx, 16 | 
|  | jg        convertloop | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_MIRRORROW_SSSE3 | 
|  |  | 
|  | #ifdef HAS_MIRRORROW_AVX2 | 
|  | __declspec(naked) | 
|  | void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { | 
|  | __asm { | 
|  | mov       eax, [esp + 4]   // src | 
|  | mov       edx, [esp + 8]   // dst | 
|  | mov       ecx, [esp + 12]  // width | 
|  | vbroadcastf128 ymm5, kShuffleMirror | 
|  |  | 
|  | convertloop: | 
|  | vmovdqu   ymm0, [eax - 32 + ecx] | 
|  | vpshufb   ymm0, ymm0, ymm5 | 
|  | vpermq    ymm0, ymm0, 0x4e  // swap high and low halves |
|  | vmovdqu   [edx], ymm0 | 
|  | lea       edx, [edx + 32] | 
|  | sub       ecx, 32 | 
|  | jg        convertloop | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_MIRRORROW_AVX2 | 
|  |  | 
|  | #ifdef HAS_MIRRORROW_SSE2 | 
|  | __declspec(naked) | 
|  | void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { | 
|  | __asm { | 
|  | mov       eax, [esp + 4]   // src | 
|  | mov       edx, [esp + 8]   // dst | 
|  | mov       ecx, [esp + 12]  // width | 
|  |  | 
|  | convertloop: | 
|  | movdqu    xmm0, [eax - 16 + ecx] | 
|  | movdqa    xmm1, xmm0        // swap bytes | 
|  | psllw     xmm0, 8 | 
|  | psrlw     xmm1, 8 | 
|  | por       xmm0, xmm1 | 
|  | pshuflw   xmm0, xmm0, 0x1b  // swap words | 
|  | pshufhw   xmm0, xmm0, 0x1b | 
|  | pshufd    xmm0, xmm0, 0x4e  // swap qwords | 
|  | movdqu    [edx], xmm0 | 
|  | lea       edx, [edx + 16] | 
|  | sub       ecx, 16 | 
|  | jg        convertloop | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_MIRRORROW_SSE2 | 
|  |  | 
|  | #ifdef HAS_MIRRORROW_UV_SSSE3 | 
|  | // Shuffle table for reversing the bytes of UV channels. | 
|  | static const uvec8 kShuffleMirrorUV = { | 
|  | 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u | 
|  | }; | 
|  |  | 
|  | __declspec(naked) | 
|  | void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, | 
|  | int width) { | 
|  | __asm { | 
|  | push      edi | 
|  | mov       eax, [esp + 4 + 4]   // src | 
|  | mov       edx, [esp + 4 + 8]   // dst_u | 
|  | mov       edi, [esp + 4 + 12]  // dst_v | 
|  | mov       ecx, [esp + 4 + 16]  // width | 
|  | movdqa    xmm1, kShuffleMirrorUV | 
|  | lea       eax, [eax + ecx * 2 - 16] | 
|  | sub       edi, edx | 
|  |  | 
|  | convertloop: | 
|  | movdqu    xmm0, [eax] | 
|  | lea       eax, [eax - 16] | 
|  | pshufb    xmm0, xmm1 | 
|  | movlpd    qword ptr [edx], xmm0 | 
|  | movhpd    qword ptr [edx + edi], xmm0 | 
|  | lea       edx, [edx + 8] | 
|  | sub       ecx, 8 | 
|  | jg        convertloop | 
|  |  | 
|  | pop       edi | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_MIRRORROW_UV_SSSE3 | 
|  |  | 
|  | #ifdef HAS_ARGBMIRRORROW_SSE2 | 
|  | __declspec(naked) | 
|  | void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) { | 
|  | __asm { | 
|  | mov       eax, [esp + 4]   // src | 
|  | mov       edx, [esp + 8]   // dst | 
|  | mov       ecx, [esp + 12]  // width | 
|  | lea       eax, [eax - 16 + ecx * 4]  // last 4 pixels. | 
|  |  | 
|  | convertloop: | 
|  | movdqu    xmm0, [eax] | 
|  | lea       eax, [eax - 16] | 
|  | pshufd    xmm0, xmm0, 0x1b | 
|  | movdqu    [edx], xmm0 | 
|  | lea       edx, [edx + 16] | 
|  | sub       ecx, 4 | 
|  | jg        convertloop | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_ARGBMIRRORROW_SSE2 | 
|  |  | 
|  | #ifdef HAS_ARGBMIRRORROW_AVX2 | 
|  | // Shuffle table for reversing whole ARGB pixels (32 bit lanes). |
|  | static const ulvec32 kARGBShuffleMirror_AVX2 = { | 
|  | 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u | 
|  | }; | 
|  |  | 
|  | __declspec(naked) | 
|  | void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { | 
|  | __asm { | 
|  | mov       eax, [esp + 4]   // src | 
|  | mov       edx, [esp + 8]   // dst | 
|  | mov       ecx, [esp + 12]  // width | 
|  | vmovdqu   ymm5, kARGBShuffleMirror_AVX2 | 
|  |  | 
|  | convertloop: | 
|  | vpermd    ymm0, ymm5, [eax - 32 + ecx * 4]  // permute dword order | 
|  | vmovdqu   [edx], ymm0 | 
|  | lea       edx, [edx + 32] | 
|  | sub       ecx, 8 | 
|  | jg        convertloop | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_ARGBMIRRORROW_AVX2 | 
|  |  | 
|  | #ifdef HAS_SPLITUVROW_SSE2 | 
|  | __declspec(naked) | 
|  | void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { | 
|  | __asm { | 
|  | push       edi | 
|  | mov        eax, [esp + 4 + 4]    // src_uv | 
|  | mov        edx, [esp + 4 + 8]    // dst_u | 
|  | mov        edi, [esp + 4 + 12]   // dst_v | 
|  | mov        ecx, [esp + 4 + 16]   // pix | 
|  | pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff | 
|  | psrlw      xmm5, 8 | 
|  | sub        edi, edx | 
|  |  | 
|  | convertloop: | 
|  | movdqu     xmm0, [eax] | 
|  | movdqu     xmm1, [eax + 16] | 
|  | lea        eax,  [eax + 32] | 
|  | movdqa     xmm2, xmm0 | 
|  | movdqa     xmm3, xmm1 | 
|  | pand       xmm0, xmm5   // even bytes | 
|  | pand       xmm1, xmm5 | 
|  | packuswb   xmm0, xmm1 | 
|  | psrlw      xmm2, 8      // odd bytes | 
|  | psrlw      xmm3, 8 | 
|  | packuswb   xmm2, xmm3 | 
|  | movdqu     [edx], xmm0 | 
|  | movdqu     [edx + edi], xmm2 | 
|  | lea        edx, [edx + 16] | 
|  | sub        ecx, 16 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        edi | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | #endif  // HAS_SPLITUVROW_SSE2 | 
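|  |  |
|  | // Illustrative scalar equivalent of the UV deinterleave above (not built; |
|  | // the function name is a sketch, not libyuv API): even bytes go to the U |
|  | // plane and odd bytes go to the V plane. |
|  | #if 0 |
|  | static void SplitUVRow_C_Sketch(const uint8* src_uv, uint8* dst_u, |
|  | uint8* dst_v, int pix) { |
|  | int x; |
|  | for (x = 0; x < pix; ++x) { |
|  | dst_u[x] = src_uv[x * 2 + 0];  // even bytes are U |
|  | dst_v[x] = src_uv[x * 2 + 1];  // odd bytes are V |
|  | } |
|  | } |
|  | #endif |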
|  |  | 
|  | #ifdef HAS_SPLITUVROW_AVX2 | 
|  | __declspec(naked) | 
|  | void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { | 
|  | __asm { | 
|  | push       edi | 
|  | mov        eax, [esp + 4 + 4]    // src_uv | 
|  | mov        edx, [esp + 4 + 8]    // dst_u | 
|  | mov        edi, [esp + 4 + 12]   // dst_v | 
|  | mov        ecx, [esp + 4 + 16]   // pix | 
|  | vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff | 
|  | vpsrlw     ymm5, ymm5, 8 | 
|  | sub        edi, edx | 
|  |  | 
|  | convertloop: | 
|  | vmovdqu    ymm0, [eax] | 
|  | vmovdqu    ymm1, [eax + 32] | 
|  | lea        eax,  [eax + 64] | 
|  | vpsrlw     ymm2, ymm0, 8      // odd bytes | 
|  | vpsrlw     ymm3, ymm1, 8 | 
|  | vpand      ymm0, ymm0, ymm5   // even bytes | 
|  | vpand      ymm1, ymm1, ymm5 | 
|  | vpackuswb  ymm0, ymm0, ymm1 | 
|  | vpackuswb  ymm2, ymm2, ymm3 | 
|  | vpermq     ymm0, ymm0, 0xd8 | 
|  | vpermq     ymm2, ymm2, 0xd8 | 
|  | vmovdqu    [edx], ymm0 | 
|  | vmovdqu    [edx + edi], ymm2 | 
|  | lea        edx, [edx + 32] | 
|  | sub        ecx, 32 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        edi | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_SPLITUVROW_AVX2 | 
|  |  | 
|  | #ifdef HAS_MERGEUVROW_SSE2 | 
|  | __declspec(naked) | 
|  | void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, | 
|  | int width) { | 
|  | __asm { | 
|  | push       edi | 
|  | mov        eax, [esp + 4 + 4]    // src_u | 
|  | mov        edx, [esp + 4 + 8]    // src_v | 
|  | mov        edi, [esp + 4 + 12]   // dst_uv | 
|  | mov        ecx, [esp + 4 + 16]   // width | 
|  | sub        edx, eax | 
|  |  | 
|  | convertloop: | 
|  | movdqu     xmm0, [eax]      // read 16 U's | 
|  | movdqu     xmm1, [eax + edx]  // and 16 V's | 
|  | lea        eax,  [eax + 16] | 
|  | movdqa     xmm2, xmm0 | 
|  | punpcklbw  xmm0, xmm1       // first 8 UV pairs | 
|  | punpckhbw  xmm2, xmm1       // next 8 UV pairs | 
|  | movdqu     [edi], xmm0 | 
|  | movdqu     [edi + 16], xmm2 | 
|  | lea        edi, [edi + 32] | 
|  | sub        ecx, 16 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        edi | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  //  HAS_MERGEUVROW_SSE2 | 
|  |  | 
|  | #ifdef HAS_MERGEUVROW_AVX2 | 
|  | __declspec(naked) | 
|  | void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, | 
|  | int width) { | 
|  | __asm { | 
|  | push       edi | 
|  | mov        eax, [esp + 4 + 4]    // src_u | 
|  | mov        edx, [esp + 4 + 8]    // src_v | 
|  | mov        edi, [esp + 4 + 12]   // dst_uv | 
|  | mov        ecx, [esp + 4 + 16]   // width | 
|  | sub        edx, eax | 
|  |  | 
|  | convertloop: | 
|  | vmovdqu    ymm0, [eax]           // read 32 U's | 
|  | vmovdqu    ymm1, [eax + edx]     // and 32 V's | 
|  | lea        eax,  [eax + 32] | 
|  | vpunpcklbw ymm2, ymm0, ymm1      // low 16 UV pairs. mutated qqword 0,2 | 
|  | vpunpckhbw ymm0, ymm0, ymm1      // high 16 UV pairs. mutated qqword 1,3 | 
|  | vextractf128 [edi], ymm2, 0       // bytes 0..15 | 
|  | vextractf128 [edi + 16], ymm0, 0  // bytes 16..31 | 
|  | vextractf128 [edi + 32], ymm2, 1  // bytes 32..47 | 
|  | vextractf128 [edi + 48], ymm0, 1  // bytes 48..63 |
|  | lea        edi, [edi + 64] | 
|  | sub        ecx, 32 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        edi | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  //  HAS_MERGEUVROW_AVX2 | 
|  |  | 
|  | #ifdef HAS_COPYROW_SSE2 | 
|  | // CopyRow copies 'count' bytes using 16 byte loads/stores, 32 bytes at a time. |
|  | __declspec(naked) | 
|  | void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { | 
|  | __asm { | 
|  | mov        eax, [esp + 4]   // src | 
|  | mov        edx, [esp + 8]   // dst | 
|  | mov        ecx, [esp + 12]  // count | 
|  |  | 
|  | convertloop: | 
|  | movdqu     xmm0, [eax] | 
|  | movdqu     xmm1, [eax + 16] | 
|  | lea        eax, [eax + 32] | 
|  | movdqu     [edx], xmm0 | 
|  | movdqu     [edx + 16], xmm1 | 
|  | lea        edx, [edx + 32] | 
|  | sub        ecx, 32 | 
|  | jg         convertloop | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_COPYROW_SSE2 | 
|  |  | 
|  | #ifdef HAS_COPYROW_AVX | 
|  | // CopyRow copies 'count' bytes using 32 byte loads/stores, 64 bytes at a time. |
|  | __declspec(naked) | 
|  | void CopyRow_AVX(const uint8* src, uint8* dst, int count) { | 
|  | __asm { | 
|  | mov        eax, [esp + 4]   // src | 
|  | mov        edx, [esp + 8]   // dst | 
|  | mov        ecx, [esp + 12]  // count | 
|  |  | 
|  | convertloop: | 
|  | vmovdqu    ymm0, [eax] | 
|  | vmovdqu    ymm1, [eax + 32] | 
|  | lea        eax, [eax + 64] | 
|  | vmovdqu    [edx], ymm0 | 
|  | vmovdqu    [edx + 32], ymm1 | 
|  | lea        edx, [edx + 64] | 
|  | sub        ecx, 64 | 
|  | jg         convertloop | 
|  |  | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_COPYROW_AVX | 
|  |  | 
|  | // CopyRow_ERMS copies any byte count (multiple of 1) using 'rep movsb'. |
|  | __declspec(naked) | 
|  | void CopyRow_ERMS(const uint8* src, uint8* dst, int count) { | 
|  | __asm { | 
|  | mov        eax, esi | 
|  | mov        edx, edi | 
|  | mov        esi, [esp + 4]   // src | 
|  | mov        edi, [esp + 8]   // dst | 
|  | mov        ecx, [esp + 12]  // count | 
|  | rep movsb | 
|  | mov        edi, edx | 
|  | mov        esi, eax | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | #ifdef HAS_ARGBCOPYALPHAROW_SSE2 | 
|  | // width in pixels | 
|  | __declspec(naked) | 
|  | void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { | 
|  | __asm { | 
|  | mov        eax, [esp + 4]   // src | 
|  | mov        edx, [esp + 8]   // dst | 
|  | mov        ecx, [esp + 12]  // count | 
|  | pcmpeqb    xmm0, xmm0       // generate mask 0xff000000 | 
|  | pslld      xmm0, 24 | 
|  | pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff | 
|  | psrld      xmm1, 8 | 
|  |  | 
|  | convertloop: | 
|  | movdqu     xmm2, [eax] | 
|  | movdqu     xmm3, [eax + 16] | 
|  | lea        eax, [eax + 32] | 
|  | movdqu     xmm4, [edx] | 
|  | movdqu     xmm5, [edx + 16] | 
|  | pand       xmm2, xmm0 | 
|  | pand       xmm3, xmm0 | 
|  | pand       xmm4, xmm1 | 
|  | pand       xmm5, xmm1 | 
|  | por        xmm2, xmm4 | 
|  | por        xmm3, xmm5 | 
|  | movdqu     [edx], xmm2 | 
|  | movdqu     [edx + 16], xmm3 | 
|  | lea        edx, [edx + 32] | 
|  | sub        ecx, 8 | 
|  | jg         convertloop | 
|  |  | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_ARGBCOPYALPHAROW_SSE2 | 
|  |  | 
|  | #ifdef HAS_ARGBCOPYALPHAROW_AVX2 | 
|  | // width in pixels | 
|  | __declspec(naked) | 
|  | void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { | 
|  | __asm { | 
|  | mov        eax, [esp + 4]   // src | 
|  | mov        edx, [esp + 8]   // dst | 
|  | mov        ecx, [esp + 12]  // count | 
|  | vpcmpeqb   ymm0, ymm0, ymm0 | 
|  | vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff | 
|  |  | 
|  | convertloop: | 
|  | vmovdqu    ymm1, [eax] | 
|  | vmovdqu    ymm2, [eax + 32] | 
|  | lea        eax, [eax + 64] | 
|  | vpblendvb  ymm1, ymm1, [edx], ymm0 | 
|  | vpblendvb  ymm2, ymm2, [edx + 32], ymm0 | 
|  | vmovdqu    [edx], ymm1 | 
|  | vmovdqu    [edx + 32], ymm2 | 
|  | lea        edx, [edx + 64] | 
|  | sub        ecx, 16 | 
|  | jg         convertloop | 
|  |  | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_ARGBCOPYALPHAROW_AVX2 | 
|  |  | 
|  | #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 | 
|  | // width in pixels | 
|  | __declspec(naked) | 
|  | void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { | 
|  | __asm { | 
|  | mov        eax, [esp + 4]   // src | 
|  | mov        edx, [esp + 8]   // dst | 
|  | mov        ecx, [esp + 12]  // count | 
|  | pcmpeqb    xmm0, xmm0       // generate mask 0xff000000 | 
|  | pslld      xmm0, 24 | 
|  | pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff | 
|  | psrld      xmm1, 8 | 
|  |  | 
|  | convertloop: | 
|  | movq       xmm2, qword ptr [eax]  // 8 Y's | 
|  | lea        eax, [eax + 8] | 
|  | punpcklbw  xmm2, xmm2 | 
|  | punpckhwd  xmm3, xmm2 | 
|  | punpcklwd  xmm2, xmm2 | 
|  | movdqu     xmm4, [edx] | 
|  | movdqu     xmm5, [edx + 16] | 
|  | pand       xmm2, xmm0 | 
|  | pand       xmm3, xmm0 | 
|  | pand       xmm4, xmm1 | 
|  | pand       xmm5, xmm1 | 
|  | por        xmm2, xmm4 | 
|  | por        xmm3, xmm5 | 
|  | movdqu     [edx], xmm2 | 
|  | movdqu     [edx + 16], xmm3 | 
|  | lea        edx, [edx + 32] | 
|  | sub        ecx, 8 | 
|  | jg         convertloop | 
|  |  | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2 | 
|  |  | 
|  | #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 | 
|  | // width in pixels | 
|  | __declspec(naked) | 
|  | void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { | 
|  | __asm { | 
|  | mov        eax, [esp + 4]   // src | 
|  | mov        edx, [esp + 8]   // dst | 
|  | mov        ecx, [esp + 12]  // count | 
|  | vpcmpeqb   ymm0, ymm0, ymm0 | 
|  | vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff | 
|  |  | 
|  | convertloop: | 
|  | vpmovzxbd  ymm1, qword ptr [eax] | 
|  | vpmovzxbd  ymm2, qword ptr [eax + 8] | 
|  | lea        eax, [eax + 16] | 
|  | vpslld     ymm1, ymm1, 24 | 
|  | vpslld     ymm2, ymm2, 24 | 
|  | vpblendvb  ymm1, ymm1, [edx], ymm0 | 
|  | vpblendvb  ymm2, ymm2, [edx + 32], ymm0 | 
|  | vmovdqu    [edx], ymm1 | 
|  | vmovdqu    [edx + 32], ymm2 | 
|  | lea        edx, [edx + 64] | 
|  | sub        ecx, 16 | 
|  | jg         convertloop | 
|  |  | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2 | 
|  |  | 
|  | #ifdef HAS_SETROW_X86 | 
|  | // Write 'count' bytes using an 8 bit value repeated. | 
|  | // Count should be multiple of 4. | 
|  | __declspec(naked) | 
|  | void SetRow_X86(uint8* dst, uint8 v8, int count) { | 
|  | __asm { | 
|  | movzx      eax, byte ptr [esp + 8]    // v8 | 
|  | mov        edx, 0x01010101  // Duplicate byte to all bytes. | 
|  | mul        edx              // overwrites edx with upper part of result. | 
|  | mov        edx, edi | 
|  | mov        edi, [esp + 4]   // dst | 
|  | mov        ecx, [esp + 12]  // count | 
|  | shr        ecx, 2 | 
|  | rep stosd | 
|  | mov        edi, edx | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | // Write 'count' bytes using an 8 bit value repeated. | 
|  | __declspec(naked) | 
|  | void SetRow_ERMS(uint8* dst, uint8 v8, int count) { | 
|  | __asm { | 
|  | mov        edx, edi | 
|  | mov        edi, [esp + 4]   // dst | 
|  | mov        eax, [esp + 8]   // v8 | 
|  | mov        ecx, [esp + 12]  // count | 
|  | rep stosb | 
|  | mov        edi, edx | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | // Write 'count' 32 bit values. | 
|  | __declspec(naked) | 
|  | void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) { | 
|  | __asm { | 
|  | mov        edx, edi | 
|  | mov        edi, [esp + 4]   // dst | 
|  | mov        eax, [esp + 8]   // v32 | 
|  | mov        ecx, [esp + 12]  // count | 
|  | rep stosd | 
|  | mov        edi, edx | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_SETROW_X86 | 
|  |  | 
|  | #ifdef HAS_YUY2TOYROW_AVX2 | 
|  | __declspec(naked) | 
|  | void YUY2ToYRow_AVX2(const uint8* src_yuy2, | 
|  | uint8* dst_y, int pix) { | 
|  | __asm { | 
|  | mov        eax, [esp + 4]    // src_yuy2 | 
|  | mov        edx, [esp + 8]    // dst_y | 
|  | mov        ecx, [esp + 12]   // pix | 
|  | vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff | 
|  | vpsrlw     ymm5, ymm5, 8 | 
|  |  | 
|  | convertloop: | 
|  | vmovdqu    ymm0, [eax] | 
|  | vmovdqu    ymm1, [eax + 32] | 
|  | lea        eax,  [eax + 64] | 
|  | vpand      ymm0, ymm0, ymm5   // even bytes are Y | 
|  | vpand      ymm1, ymm1, ymm5 | 
|  | vpackuswb  ymm0, ymm0, ymm1   // mutates. | 
|  | vpermq     ymm0, ymm0, 0xd8 | 
|  | vmovdqu    [edx], ymm0 | 
|  | lea        edx, [edx + 32] | 
|  | sub        ecx, 32 | 
|  | jg         convertloop | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | __declspec(naked) | 
|  | void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, | 
|  | uint8* dst_u, uint8* dst_v, int pix) { | 
|  | __asm { | 
|  | push       esi | 
|  | push       edi | 
|  | mov        eax, [esp + 8 + 4]    // src_yuy2 | 
|  | mov        esi, [esp + 8 + 8]    // stride_yuy2 | 
|  | mov        edx, [esp + 8 + 12]   // dst_u | 
|  | mov        edi, [esp + 8 + 16]   // dst_v | 
|  | mov        ecx, [esp + 8 + 20]   // pix | 
|  | vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff | 
|  | vpsrlw     ymm5, ymm5, 8 | 
|  | sub        edi, edx | 
|  |  | 
|  | convertloop: | 
|  | vmovdqu    ymm0, [eax] | 
|  | vmovdqu    ymm1, [eax + 32] | 
|  | vpavgb     ymm0, ymm0, [eax + esi] | 
|  | vpavgb     ymm1, ymm1, [eax + esi + 32] | 
|  | lea        eax,  [eax + 64] | 
|  | vpsrlw     ymm0, ymm0, 8      // YUYV -> UVUV | 
|  | vpsrlw     ymm1, ymm1, 8 | 
|  | vpackuswb  ymm0, ymm0, ymm1   // mutates. | 
|  | vpermq     ymm0, ymm0, 0xd8 | 
|  | vpand      ymm1, ymm0, ymm5  // U | 
|  | vpsrlw     ymm0, ymm0, 8     // V | 
|  | vpackuswb  ymm1, ymm1, ymm1  // mutates. | 
|  | vpackuswb  ymm0, ymm0, ymm0  // mutates. | 
|  | vpermq     ymm1, ymm1, 0xd8 | 
|  | vpermq     ymm0, ymm0, 0xd8 | 
|  | vextractf128 [edx], ymm1, 0  // U | 
|  | vextractf128 [edx + edi], ymm0, 0 // V | 
|  | lea        edx, [edx + 16] | 
|  | sub        ecx, 32 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        edi | 
|  | pop        esi | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | __declspec(naked) | 
|  | void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, | 
|  | uint8* dst_u, uint8* dst_v, int pix) { | 
|  | __asm { | 
|  | push       edi | 
|  | mov        eax, [esp + 4 + 4]    // src_yuy2 | 
|  | mov        edx, [esp + 4 + 8]    // dst_u | 
|  | mov        edi, [esp + 4 + 12]   // dst_v | 
|  | mov        ecx, [esp + 4 + 16]   // pix | 
|  | vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff | 
|  | vpsrlw     ymm5, ymm5, 8 | 
|  | sub        edi, edx | 
|  |  | 
|  | convertloop: | 
|  | vmovdqu    ymm0, [eax] | 
|  | vmovdqu    ymm1, [eax + 32] | 
|  | lea        eax,  [eax + 64] | 
|  | vpsrlw     ymm0, ymm0, 8      // YUYV -> UVUV | 
|  | vpsrlw     ymm1, ymm1, 8 | 
|  | vpackuswb  ymm0, ymm0, ymm1   // mutates. | 
|  | vpermq     ymm0, ymm0, 0xd8 | 
|  | vpand      ymm1, ymm0, ymm5  // U | 
|  | vpsrlw     ymm0, ymm0, 8     // V | 
|  | vpackuswb  ymm1, ymm1, ymm1  // mutates. | 
|  | vpackuswb  ymm0, ymm0, ymm0  // mutates. | 
|  | vpermq     ymm1, ymm1, 0xd8 | 
|  | vpermq     ymm0, ymm0, 0xd8 | 
|  | vextractf128 [edx], ymm1, 0  // U | 
|  | vextractf128 [edx + edi], ymm0, 0 // V | 
|  | lea        edx, [edx + 16] | 
|  | sub        ecx, 32 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        edi | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | __declspec(naked) | 
|  | void UYVYToYRow_AVX2(const uint8* src_uyvy, | 
|  | uint8* dst_y, int pix) { | 
|  | __asm { | 
|  | mov        eax, [esp + 4]    // src_uyvy | 
|  | mov        edx, [esp + 8]    // dst_y | 
|  | mov        ecx, [esp + 12]   // pix | 
|  |  | 
|  | convertloop: | 
|  | vmovdqu    ymm0, [eax] | 
|  | vmovdqu    ymm1, [eax + 32] | 
|  | lea        eax,  [eax + 64] | 
|  | vpsrlw     ymm0, ymm0, 8      // odd bytes are Y | 
|  | vpsrlw     ymm1, ymm1, 8 | 
|  | vpackuswb  ymm0, ymm0, ymm1   // mutates. | 
|  | vpermq     ymm0, ymm0, 0xd8 | 
|  | vmovdqu    [edx], ymm0 | 
|  | lea        edx, [edx + 32] | 
|  | sub        ecx, 32 | 
|  | jg         convertloop | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | __declspec(naked) | 
|  | void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, | 
|  | uint8* dst_u, uint8* dst_v, int pix) { | 
|  | __asm { | 
|  | push       esi | 
|  | push       edi | 
|  | mov        eax, [esp + 8 + 4]    // src_uyvy |
|  | mov        esi, [esp + 8 + 8]    // stride_uyvy |
|  | mov        edx, [esp + 8 + 12]   // dst_u | 
|  | mov        edi, [esp + 8 + 16]   // dst_v | 
|  | mov        ecx, [esp + 8 + 20]   // pix | 
|  | vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff | 
|  | vpsrlw     ymm5, ymm5, 8 | 
|  | sub        edi, edx | 
|  |  | 
|  | convertloop: | 
|  | vmovdqu    ymm0, [eax] | 
|  | vmovdqu    ymm1, [eax + 32] | 
|  | vpavgb     ymm0, ymm0, [eax + esi] | 
|  | vpavgb     ymm1, ymm1, [eax + esi + 32] | 
|  | lea        eax,  [eax + 64] | 
|  | vpand      ymm0, ymm0, ymm5   // UYVY -> UVUV | 
|  | vpand      ymm1, ymm1, ymm5 | 
|  | vpackuswb  ymm0, ymm0, ymm1   // mutates. | 
|  | vpermq     ymm0, ymm0, 0xd8 | 
|  | vpand      ymm1, ymm0, ymm5  // U | 
|  | vpsrlw     ymm0, ymm0, 8     // V | 
|  | vpackuswb  ymm1, ymm1, ymm1  // mutates. | 
|  | vpackuswb  ymm0, ymm0, ymm0  // mutates. | 
|  | vpermq     ymm1, ymm1, 0xd8 | 
|  | vpermq     ymm0, ymm0, 0xd8 | 
|  | vextractf128 [edx], ymm1, 0  // U | 
|  | vextractf128 [edx + edi], ymm0, 0 // V | 
|  | lea        edx, [edx + 16] | 
|  | sub        ecx, 32 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        edi | 
|  | pop        esi | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | __declspec(naked) | 
|  | void UYVYToUV422Row_AVX2(const uint8* src_uyvy, | 
|  | uint8* dst_u, uint8* dst_v, int pix) { | 
|  | __asm { | 
|  | push       edi | 
|  | mov        eax, [esp + 4 + 4]    // src_uyvy |
|  | mov        edx, [esp + 4 + 8]    // dst_u | 
|  | mov        edi, [esp + 4 + 12]   // dst_v | 
|  | mov        ecx, [esp + 4 + 16]   // pix | 
|  | vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff | 
|  | vpsrlw     ymm5, ymm5, 8 | 
|  | sub        edi, edx | 
|  |  | 
|  | convertloop: | 
|  | vmovdqu    ymm0, [eax] | 
|  | vmovdqu    ymm1, [eax + 32] | 
|  | lea        eax,  [eax + 64] | 
|  | vpand      ymm0, ymm0, ymm5   // UYVY -> UVUV | 
|  | vpand      ymm1, ymm1, ymm5 | 
|  | vpackuswb  ymm0, ymm0, ymm1   // mutates. | 
|  | vpermq     ymm0, ymm0, 0xd8 | 
|  | vpand      ymm1, ymm0, ymm5  // U | 
|  | vpsrlw     ymm0, ymm0, 8     // V | 
|  | vpackuswb  ymm1, ymm1, ymm1  // mutates. | 
|  | vpackuswb  ymm0, ymm0, ymm0  // mutates. | 
|  | vpermq     ymm1, ymm1, 0xd8 | 
|  | vpermq     ymm0, ymm0, 0xd8 | 
|  | vextractf128 [edx], ymm1, 0  // U | 
|  | vextractf128 [edx + edi], ymm0, 0 // V | 
|  | lea        edx, [edx + 16] | 
|  | sub        ecx, 32 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        edi | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_YUY2TOYROW_AVX2 | 
|  |  | 
|  | #ifdef HAS_YUY2TOYROW_SSE2 | 
|  | __declspec(naked) | 
|  | void YUY2ToYRow_SSE2(const uint8* src_yuy2, | 
|  | uint8* dst_y, int pix) { | 
|  | __asm { | 
|  | mov        eax, [esp + 4]    // src_yuy2 | 
|  | mov        edx, [esp + 8]    // dst_y | 
|  | mov        ecx, [esp + 12]   // pix | 
|  | pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff | 
|  | psrlw      xmm5, 8 | 
|  |  | 
|  | convertloop: | 
|  | movdqu     xmm0, [eax] | 
|  | movdqu     xmm1, [eax + 16] | 
|  | lea        eax,  [eax + 32] | 
|  | pand       xmm0, xmm5   // even bytes are Y | 
|  | pand       xmm1, xmm5 | 
|  | packuswb   xmm0, xmm1 | 
|  | movdqu     [edx], xmm0 | 
|  | lea        edx, [edx + 16] | 
|  | sub        ecx, 16 | 
|  | jg         convertloop | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | __declspec(naked) | 
|  | void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, | 
|  | uint8* dst_u, uint8* dst_v, int pix) { | 
|  | __asm { | 
|  | push       esi | 
|  | push       edi | 
|  | mov        eax, [esp + 8 + 4]    // src_yuy2 | 
|  | mov        esi, [esp + 8 + 8]    // stride_yuy2 | 
|  | mov        edx, [esp + 8 + 12]   // dst_u | 
|  | mov        edi, [esp + 8 + 16]   // dst_v | 
|  | mov        ecx, [esp + 8 + 20]   // pix | 
|  | pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff | 
|  | psrlw      xmm5, 8 | 
|  | sub        edi, edx | 
|  |  | 
|  | convertloop: | 
|  | movdqu     xmm0, [eax] | 
|  | movdqu     xmm1, [eax + 16] | 
|  | movdqu     xmm2, [eax + esi] | 
|  | movdqu     xmm3, [eax + esi + 16] | 
|  | lea        eax,  [eax + 32] | 
|  | pavgb      xmm0, xmm2 | 
|  | pavgb      xmm1, xmm3 | 
|  | psrlw      xmm0, 8      // YUYV -> UVUV | 
|  | psrlw      xmm1, 8 | 
|  | packuswb   xmm0, xmm1 | 
|  | movdqa     xmm1, xmm0 | 
|  | pand       xmm0, xmm5  // U | 
|  | packuswb   xmm0, xmm0 | 
|  | psrlw      xmm1, 8     // V | 
|  | packuswb   xmm1, xmm1 | 
|  | movq       qword ptr [edx], xmm0 | 
|  | movq       qword ptr [edx + edi], xmm1 | 
|  | lea        edx, [edx + 8] | 
|  | sub        ecx, 16 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        edi | 
|  | pop        esi | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | __declspec(naked) | 
|  | void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, | 
|  | uint8* dst_u, uint8* dst_v, int pix) { | 
|  | __asm { | 
|  | push       edi | 
|  | mov        eax, [esp + 4 + 4]    // src_yuy2 | 
|  | mov        edx, [esp + 4 + 8]    // dst_u | 
|  | mov        edi, [esp + 4 + 12]   // dst_v | 
|  | mov        ecx, [esp + 4 + 16]   // pix | 
|  | pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff | 
|  | psrlw      xmm5, 8 | 
|  | sub        edi, edx | 
|  |  | 
|  | convertloop: | 
|  | movdqu     xmm0, [eax] | 
|  | movdqu     xmm1, [eax + 16] | 
|  | lea        eax,  [eax + 32] | 
|  | psrlw      xmm0, 8      // YUYV -> UVUV | 
|  | psrlw      xmm1, 8 | 
|  | packuswb   xmm0, xmm1 | 
|  | movdqa     xmm1, xmm0 | 
|  | pand       xmm0, xmm5  // U | 
|  | packuswb   xmm0, xmm0 | 
|  | psrlw      xmm1, 8     // V | 
|  | packuswb   xmm1, xmm1 | 
|  | movq       qword ptr [edx], xmm0 | 
|  | movq       qword ptr [edx + edi], xmm1 | 
|  | lea        edx, [edx + 8] | 
|  | sub        ecx, 16 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        edi | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | __declspec(naked) | 
|  | void UYVYToYRow_SSE2(const uint8* src_uyvy, | 
|  | uint8* dst_y, int pix) { | 
|  | __asm { | 
|  | mov        eax, [esp + 4]    // src_uyvy | 
|  | mov        edx, [esp + 8]    // dst_y | 
|  | mov        ecx, [esp + 12]   // pix | 
|  |  | 
|  | convertloop: | 
|  | movdqu     xmm0, [eax] | 
|  | movdqu     xmm1, [eax + 16] | 
|  | lea        eax,  [eax + 32] | 
|  | psrlw      xmm0, 8    // odd bytes are Y | 
|  | psrlw      xmm1, 8 | 
|  | packuswb   xmm0, xmm1 | 
|  | movdqu     [edx], xmm0 | 
|  | lea        edx, [edx + 16] | 
|  | sub        ecx, 16 | 
|  | jg         convertloop | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | __declspec(naked) | 
|  | void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, | 
|  | uint8* dst_u, uint8* dst_v, int pix) { | 
|  | __asm { | 
|  | push       esi | 
|  | push       edi | 
|  | mov        eax, [esp + 8 + 4]    // src_uyvy |
|  | mov        esi, [esp + 8 + 8]    // stride_uyvy |
|  | mov        edx, [esp + 8 + 12]   // dst_u | 
|  | mov        edi, [esp + 8 + 16]   // dst_v | 
|  | mov        ecx, [esp + 8 + 20]   // pix | 
|  | pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff | 
|  | psrlw      xmm5, 8 | 
|  | sub        edi, edx | 
|  |  | 
|  | convertloop: | 
|  | movdqu     xmm0, [eax] | 
|  | movdqu     xmm1, [eax + 16] | 
|  | movdqu     xmm2, [eax + esi] | 
|  | movdqu     xmm3, [eax + esi + 16] | 
|  | lea        eax,  [eax + 32] | 
|  | pavgb      xmm0, xmm2 | 
|  | pavgb      xmm1, xmm3 | 
|  | pand       xmm0, xmm5   // UYVY -> UVUV | 
|  | pand       xmm1, xmm5 | 
|  | packuswb   xmm0, xmm1 | 
|  | movdqa     xmm1, xmm0 | 
|  | pand       xmm0, xmm5  // U | 
|  | packuswb   xmm0, xmm0 | 
|  | psrlw      xmm1, 8     // V | 
|  | packuswb   xmm1, xmm1 | 
|  | movq       qword ptr [edx], xmm0 | 
|  | movq       qword ptr [edx + edi], xmm1 | 
|  | lea        edx, [edx + 8] | 
|  | sub        ecx, 16 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        edi | 
|  | pop        esi | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | __declspec(naked) | 
|  | void UYVYToUV422Row_SSE2(const uint8* src_uyvy, | 
|  | uint8* dst_u, uint8* dst_v, int pix) { | 
|  | __asm { | 
|  | push       edi | 
|  | mov        eax, [esp + 4 + 4]    // src_uyvy |
|  | mov        edx, [esp + 4 + 8]    // dst_u | 
|  | mov        edi, [esp + 4 + 12]   // dst_v | 
|  | mov        ecx, [esp + 4 + 16]   // pix | 
|  | pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff | 
|  | psrlw      xmm5, 8 | 
|  | sub        edi, edx | 
|  |  | 
|  | convertloop: | 
|  | movdqu     xmm0, [eax] | 
|  | movdqu     xmm1, [eax + 16] | 
|  | lea        eax,  [eax + 32] | 
|  | pand       xmm0, xmm5   // UYVY -> UVUV | 
|  | pand       xmm1, xmm5 | 
|  | packuswb   xmm0, xmm1 | 
|  | movdqa     xmm1, xmm0 | 
|  | pand       xmm0, xmm5  // U | 
|  | packuswb   xmm0, xmm0 | 
|  | psrlw      xmm1, 8     // V | 
|  | packuswb   xmm1, xmm1 | 
|  | movq       qword ptr [edx], xmm0 | 
|  | movq       qword ptr [edx + edi], xmm1 | 
|  | lea        edx, [edx + 8] | 
|  | sub        ecx, 16 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        edi | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_YUY2TOYROW_SSE2 | 
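|  |  |
|  | // Illustrative scalar equivalent of the YUY2ToUVRow functions above (not |
|  | // built; sketch name only): chroma from two rows is averaged with rounding, |
|  | // as pavgb does, then split into U and V planes at half horizontal width. |
|  | #if 0 |
|  | static void YUY2ToUVRow_C_Sketch(const uint8* src_yuy2, int stride_yuy2, |
|  | uint8* dst_u, uint8* dst_v, int pix) { |
|  | const uint8* next = src_yuy2 + stride_yuy2; |
|  | int x; |
|  | for (x = 0; x < pix; x += 2) { |
|  | dst_u[x / 2] = (src_yuy2[x * 2 + 1] + next[x * 2 + 1] + 1) >> 1;  // pavgb |
|  | dst_v[x / 2] = (src_yuy2[x * 2 + 3] + next[x * 2 + 3] + 1) >> 1; |
|  | } |
|  | } |
|  | #endif |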
|  |  | 
|  | #ifdef HAS_ARGBBLENDROW_SSE2 | 
|  | // Blend 4 pixels at a time. |
|  | __declspec(naked) | 
|  | void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, | 
|  | uint8* dst_argb, int width) { | 
|  | __asm { | 
|  | push       esi | 
|  | mov        eax, [esp + 4 + 4]   // src_argb0 | 
|  | mov        esi, [esp + 4 + 8]   // src_argb1 | 
|  | mov        edx, [esp + 4 + 12]  // dst_argb | 
|  | mov        ecx, [esp + 4 + 16]  // width | 
|  | pcmpeqb    xmm7, xmm7       // generate constant 1 | 
|  | psrlw      xmm7, 15 | 
|  | pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff | 
|  | psrlw      xmm6, 8 | 
|  | pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00 | 
|  | psllw      xmm5, 8 | 
|  | pcmpeqb    xmm4, xmm4       // generate mask 0xff000000 | 
|  | pslld      xmm4, 24 | 
|  | sub        ecx, 4 | 
|  | jl         convertloop4b    // less than 4 pixels? | 
|  |  | 
|  | // 4 pixel loop. | 
|  | convertloop4: | 
|  | movdqu     xmm3, [eax]      // src argb | 
|  | lea        eax, [eax + 16] | 
|  | movdqa     xmm0, xmm3       // src argb | 
|  | pxor       xmm3, xmm4       // ~alpha | 
|  | movdqu     xmm2, [esi]      // _r_b | 
|  | psrlw      xmm3, 8          // alpha | 
|  | pshufhw    xmm3, xmm3, 0F5h // 8 alpha words | 
|  | pshuflw    xmm3, xmm3, 0F5h | 
|  | pand       xmm2, xmm6       // _r_b | 
|  | paddw      xmm3, xmm7       // 256 - alpha | 
|  | pmullw     xmm2, xmm3       // _r_b * alpha | 
|  | movdqu     xmm1, [esi]      // _a_g | 
|  | lea        esi, [esi + 16] | 
|  | psrlw      xmm1, 8          // _a_g | 
|  | por        xmm0, xmm4       // set alpha to 255 | 
|  | pmullw     xmm1, xmm3       // _a_g * alpha | 
|  | psrlw      xmm2, 8          // _r_b convert to 8 bits again | 
|  | paddusb    xmm0, xmm2       // + src argb | 
|  | pand       xmm1, xmm5       // a_g_ convert to 8 bits again | 
|  | paddusb    xmm0, xmm1       // + src argb | 
|  | movdqu     [edx], xmm0 | 
|  | lea        edx, [edx + 16] | 
|  | sub        ecx, 4 | 
|  | jge        convertloop4 | 
|  |  | 
|  | convertloop4b: | 
|  | add        ecx, 4 - 1 | 
|  | jl         convertloop1b | 
|  |  | 
|  | // 1 pixel loop. | 
|  | convertloop1: | 
|  | movd       xmm3, [eax]      // src argb | 
|  | lea        eax, [eax + 4] | 
|  | movdqa     xmm0, xmm3       // src argb | 
|  | pxor       xmm3, xmm4       // ~alpha | 
|  | movd       xmm2, [esi]      // _r_b | 
|  | psrlw      xmm3, 8          // alpha | 
|  | pshufhw    xmm3, xmm3, 0F5h // 8 alpha words | 
|  | pshuflw    xmm3, xmm3, 0F5h | 
|  | pand       xmm2, xmm6       // _r_b | 
|  | paddw      xmm3, xmm7       // 256 - alpha | 
|  | pmullw     xmm2, xmm3       // _r_b * alpha | 
|  | movd       xmm1, [esi]      // _a_g | 
|  | lea        esi, [esi + 4] | 
|  | psrlw      xmm1, 8          // _a_g | 
|  | por        xmm0, xmm4       // set alpha to 255 | 
|  | pmullw     xmm1, xmm3       // _a_g * alpha | 
|  | psrlw      xmm2, 8          // _r_b convert to 8 bits again | 
|  | paddusb    xmm0, xmm2       // + src argb | 
|  | pand       xmm1, xmm5       // a_g_ convert to 8 bits again | 
|  | paddusb    xmm0, xmm1       // + src argb | 
|  | movd       [edx], xmm0 | 
|  | lea        edx, [edx + 4] | 
|  | sub        ecx, 1 | 
|  | jge        convertloop1 | 
|  |  | 
|  | convertloop1b: | 
|  | pop        esi | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_ARGBBLENDROW_SSE2 | 
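|  |  |
|  | // Illustrative scalar form of the blend above (not built; sketch name only): |
|  | // out = src + dst * (256 - src_alpha) / 256 per channel, with the output |
|  | // alpha forced to 255, matching the "256 - alpha" factor and paddusb above. |
|  | #if 0 |
|  | static void ARGBBlendRow_C_Sketch(const uint8* src_argb0, |
|  | const uint8* src_argb1, uint8* dst_argb, |
|  | int width) { |
|  | int x; |
|  | for (x = 0; x < width; ++x) { |
|  | uint32 ia = 256 - src_argb0[x * 4 + 3];  // 256 - alpha |
|  | int i; |
|  | for (i = 0; i < 3; ++i) {  // B, G, R |
|  | uint32 v = src_argb0[x * 4 + i] + ((src_argb1[x * 4 + i] * ia) >> 8); |
|  | dst_argb[x * 4 + i] = (uint8)(v > 255u ? 255u : v);  // paddusb saturates |
|  | } |
|  | dst_argb[x * 4 + 3] = 255u;  // alpha set to 255 |
|  | } |
|  | } |
|  | #endif |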
|  |  | 
|  | #ifdef HAS_ARGBBLENDROW_SSSE3 | 
|  | // Shuffle table for isolating alpha. | 
|  | static const uvec8 kShuffleAlpha = { | 
|  | 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, | 
|  | 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 | 
|  | }; | 
|  | // Same as SSE2, but replaces: | 
|  | //    psrlw      xmm3, 8          // alpha | 
|  | //    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words | 
|  | //    pshuflw    xmm3, xmm3, 0F5h | 
|  | // with.. | 
|  | //    pshufb     xmm3, kShuffleAlpha // alpha | 
|  | // Blend 4 pixels at a time. |
|  |  | 
|  | __declspec(naked) | 
|  | void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, | 
|  | uint8* dst_argb, int width) { | 
|  | __asm { | 
|  | push       esi | 
|  | mov        eax, [esp + 4 + 4]   // src_argb0 | 
|  | mov        esi, [esp + 4 + 8]   // src_argb1 | 
|  | mov        edx, [esp + 4 + 12]  // dst_argb | 
|  | mov        ecx, [esp + 4 + 16]  // width | 
|  | pcmpeqb    xmm7, xmm7       // generate constant 0x0001 | 
|  | psrlw      xmm7, 15 | 
|  | pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff | 
|  | psrlw      xmm6, 8 | 
|  | pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00 | 
|  | psllw      xmm5, 8 | 
|  | pcmpeqb    xmm4, xmm4       // generate mask 0xff000000 | 
|  | pslld      xmm4, 24 | 
|  | sub        ecx, 4 | 
|  | jl         convertloop4b    // less than 4 pixels? | 
|  |  | 
|  | // 4 pixel loop. | 
|  | convertloop4: | 
|  | movdqu     xmm3, [eax]      // src argb | 
|  | lea        eax, [eax + 16] | 
|  | movdqa     xmm0, xmm3       // src argb | 
|  | pxor       xmm3, xmm4       // ~alpha | 
|  | movdqu     xmm2, [esi]      // _r_b | 
|  | pshufb     xmm3, kShuffleAlpha // alpha | 
|  | pand       xmm2, xmm6       // _r_b | 
|  | paddw      xmm3, xmm7       // 256 - alpha | 
|  | pmullw     xmm2, xmm3       // _r_b * alpha | 
|  | movdqu     xmm1, [esi]      // _a_g | 
|  | lea        esi, [esi + 16] | 
|  | psrlw      xmm1, 8          // _a_g | 
|  | por        xmm0, xmm4       // set alpha to 255 | 
|  | pmullw     xmm1, xmm3       // _a_g * alpha | 
|  | psrlw      xmm2, 8          // _r_b convert to 8 bits again | 
|  | paddusb    xmm0, xmm2       // + src argb | 
|  | pand       xmm1, xmm5       // a_g_ convert to 8 bits again | 
|  | paddusb    xmm0, xmm1       // + src argb | 
|  | movdqu     [edx], xmm0 | 
|  | lea        edx, [edx + 16] | 
|  | sub        ecx, 4 | 
|  | jge        convertloop4 | 
|  |  | 
|  | convertloop4b: | 
|  | add        ecx, 4 - 1 | 
|  | jl         convertloop1b | 
|  |  | 
|  | // 1 pixel loop. | 
|  | convertloop1: | 
|  | movd       xmm3, [eax]      // src argb | 
|  | lea        eax, [eax + 4] | 
|  | movdqa     xmm0, xmm3       // src argb | 
|  | pxor       xmm3, xmm4       // ~alpha | 
|  | movd       xmm2, [esi]      // _r_b | 
|  | pshufb     xmm3, kShuffleAlpha // alpha | 
|  | pand       xmm2, xmm6       // _r_b | 
|  | paddw      xmm3, xmm7       // 256 - alpha | 
|  | pmullw     xmm2, xmm3       // _r_b * alpha | 
|  | movd       xmm1, [esi]      // _a_g | 
|  | lea        esi, [esi + 4] | 
|  | psrlw      xmm1, 8          // _a_g | 
|  | por        xmm0, xmm4       // set alpha to 255 | 
|  | pmullw     xmm1, xmm3       // _a_g * alpha | 
|  | psrlw      xmm2, 8          // _r_b convert to 8 bits again | 
|  | paddusb    xmm0, xmm2       // + src argb | 
|  | pand       xmm1, xmm5       // a_g_ convert to 8 bits again | 
|  | paddusb    xmm0, xmm1       // + src argb | 
|  | movd       [edx], xmm0 | 
|  | lea        edx, [edx + 4] | 
|  | sub        ecx, 1 | 
|  | jge        convertloop1 | 
|  |  | 
|  | convertloop1b: | 
|  | pop        esi | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_ARGBBLENDROW_SSSE3 | 
|  |  | 
|  | #ifdef HAS_ARGBATTENUATEROW_SSE2 | 
|  | // Attenuate 4 pixels at a time. | 
|  | __declspec(naked) | 
|  | void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { | 
|  | __asm { | 
|  | mov        eax, [esp + 4]   // src_argb0 | 
|  | mov        edx, [esp + 8]   // dst_argb | 
|  | mov        ecx, [esp + 12]  // width | 
|  | pcmpeqb    xmm4, xmm4       // generate mask 0xff000000 | 
|  | pslld      xmm4, 24 | 
|  | pcmpeqb    xmm5, xmm5       // generate mask 0x00ffffff | 
|  | psrld      xmm5, 8 | 
|  |  | 
|  | convertloop: | 
|  | movdqu     xmm0, [eax]      // read 4 pixels | 
|  | punpcklbw  xmm0, xmm0       // first 2 | 
|  | pshufhw    xmm2, xmm0, 0FFh // 8 alpha words | 
|  | pshuflw    xmm2, xmm2, 0FFh | 
|  | pmulhuw    xmm0, xmm2       // rgb * a | 
|  | movdqu     xmm1, [eax]      // read 4 pixels | 
|  | punpckhbw  xmm1, xmm1       // next 2 pixels | 
|  | pshufhw    xmm2, xmm1, 0FFh // 8 alpha words | 
|  | pshuflw    xmm2, xmm2, 0FFh | 
|  | pmulhuw    xmm1, xmm2       // rgb * a | 
|  | movdqu     xmm2, [eax]      // alphas | 
|  | lea        eax, [eax + 16] | 
|  | psrlw      xmm0, 8 | 
|  | pand       xmm2, xmm4 | 
|  | psrlw      xmm1, 8 | 
|  | packuswb   xmm0, xmm1 | 
|  | pand       xmm0, xmm5       // keep original alphas | 
|  | por        xmm0, xmm2 | 
|  | movdqu     [edx], xmm0 | 
|  | lea        edx, [edx + 16] | 
|  | sub        ecx, 4 | 
|  | jg         convertloop | 
|  |  | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_ARGBATTENUATEROW_SSE2 | 
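|  |  |
|  | // Illustrative scalar form of the attenuation above (not built; sketch name |
|  | // only): each color channel is multiplied by its alpha using the same |
|  | // duplicate-byte fixed point trick (v * 257 times a * 257, keep the high 16 |
|  | // bits, then >> 8), which approximates v * a / 255. Alpha is preserved. |
|  | #if 0 |
|  | static void ARGBAttenuateRow_C_Sketch(const uint8* src_argb, uint8* dst_argb, |
|  | int width) { |
|  | int x; |
|  | for (x = 0; x < width; ++x) { |
|  | uint32 a = src_argb[x * 4 + 3]; |
|  | int i; |
|  | for (i = 0; i < 3; ++i) {  // B, G, R |
|  | uint32 v = src_argb[x * 4 + i]; |
|  | dst_argb[x * 4 + i] = (uint8)((((v * 257u) * (a * 257u)) >> 16) >> 8); |
|  | } |
|  | dst_argb[x * 4 + 3] = (uint8)a;  // original alpha is kept |
|  | } |
|  | } |
|  | #endif |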
|  |  | 
|  | #ifdef HAS_ARGBATTENUATEROW_SSSE3 | 
|  | // Shuffle table duplicating alpha. | 
|  | static const uvec8 kShuffleAlpha0 = { | 
|  | 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, | 
|  | }; | 
|  | static const uvec8 kShuffleAlpha1 = { | 
|  | 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, | 
|  | 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, | 
|  | }; | 
|  | __declspec(naked) | 
|  | void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { | 
|  | __asm { | 
|  | mov        eax, [esp + 4]   // src_argb0 | 
|  | mov        edx, [esp + 8]   // dst_argb | 
|  | mov        ecx, [esp + 12]  // width | 
|  | pcmpeqb    xmm3, xmm3       // generate mask 0xff000000 | 
|  | pslld      xmm3, 24 | 
|  | movdqa     xmm4, kShuffleAlpha0 | 
|  | movdqa     xmm5, kShuffleAlpha1 | 
|  |  | 
|  | convertloop: | 
|  | movdqu     xmm0, [eax]      // read 4 pixels | 
|  | pshufb     xmm0, xmm4       // isolate first 2 alphas | 
|  | movdqu     xmm1, [eax]      // read 4 pixels | 
|  | punpcklbw  xmm1, xmm1       // first 2 pixel rgbs | 
|  | pmulhuw    xmm0, xmm1       // rgb * a | 
|  | movdqu     xmm1, [eax]      // read 4 pixels | 
|  | pshufb     xmm1, xmm5       // isolate next 2 alphas | 
|  | movdqu     xmm2, [eax]      // read 4 pixels | 
|  | punpckhbw  xmm2, xmm2       // next 2 pixel rgbs | 
|  | pmulhuw    xmm1, xmm2       // rgb * a | 
|  | movdqu     xmm2, [eax]      // mask original alpha | 
|  | lea        eax, [eax + 16] | 
|  | pand       xmm2, xmm3 | 
|  | psrlw      xmm0, 8 | 
|  | psrlw      xmm1, 8 | 
|  | packuswb   xmm0, xmm1 | 
|  | por        xmm0, xmm2       // copy original alpha | 
|  | movdqu     [edx], xmm0 | 
|  | lea        edx, [edx + 16] | 
|  | sub        ecx, 4 | 
|  | jg         convertloop | 
|  |  | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_ARGBATTENUATEROW_SSSE3 | 
|  |  | 
|  | #ifdef HAS_ARGBATTENUATEROW_AVX2 | 
|  | // Shuffle table duplicating alpha. | 
|  | static const uvec8 kShuffleAlpha_AVX2 = { | 
|  | 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u | 
|  | }; | 
|  | __declspec(naked) | 
|  | void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { | 
|  | __asm { | 
|  | mov        eax, [esp + 4]   // src_argb0 | 
|  | mov        edx, [esp + 8]   // dst_argb | 
|  | mov        ecx, [esp + 12]  // width | 
|  | sub        edx, eax | 
|  | vbroadcastf128 ymm4, kShuffleAlpha_AVX2 |
|  | vpcmpeqb   ymm5, ymm5, ymm5 // generate mask 0xff000000 | 
|  | vpslld     ymm5, ymm5, 24 | 
|  |  | 
|  | convertloop: | 
|  | vmovdqu    ymm6, [eax]       // read 8 pixels. | 
|  | vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated. | 
|  | vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated. | 
|  | vpshufb    ymm2, ymm0, ymm4  // low 4 alphas | 
|  | vpshufb    ymm3, ymm1, ymm4  // high 4 alphas | 
|  | vpmulhuw   ymm0, ymm0, ymm2  // rgb * a | 
|  | vpmulhuw   ymm1, ymm1, ymm3  // rgb * a | 
|  | vpand      ymm6, ymm6, ymm5  // isolate alpha | 
|  | vpsrlw     ymm0, ymm0, 8 | 
|  | vpsrlw     ymm1, ymm1, 8 | 
|  | vpackuswb  ymm0, ymm0, ymm1  // unmutated. | 
|  | vpor       ymm0, ymm0, ymm6  // copy original alpha | 
|  | vmovdqu    [eax + edx], ymm0 | 
|  | lea        eax, [eax + 32] | 
|  | sub        ecx, 8 | 
|  | jg         convertloop | 
|  |  | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_ARGBATTENUATEROW_AVX2 | 
|  |  | 
|  | #ifdef HAS_ARGBUNATTENUATEROW_SSE2 | 
|  | // Unattenuate 4 pixels at a time. | 
|  | __declspec(naked) | 
|  | void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, | 
|  | int width) { | 
|  | __asm { | 
|  | push       esi | 
|  | push       edi | 
|  | mov        eax, [esp + 8 + 4]   // src_argb0 | 
|  | mov        edx, [esp + 8 + 8]   // dst_argb | 
|  | mov        ecx, [esp + 8 + 12]  // width | 
|  |  | 
|  | convertloop: | 
|  | movdqu     xmm0, [eax]      // read 4 pixels | 
|  | movzx      esi, byte ptr [eax + 3]  // first alpha | 
|  | movzx      edi, byte ptr [eax + 7]  // second alpha | 
|  | punpcklbw  xmm0, xmm0       // first 2 | 
|  | movd       xmm2, dword ptr fixed_invtbl8[esi * 4] | 
|  | movd       xmm3, dword ptr fixed_invtbl8[edi * 4] | 
|  | pshuflw    xmm2, xmm2, 040h // first 4 inv_alpha words.  1, a, a, a | 
|  | pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words | 
|  | movlhps    xmm2, xmm3 | 
|  | pmulhuw    xmm0, xmm2       // rgb * a | 
|  |  | 
|  | movdqu     xmm1, [eax]      // read 4 pixels | 
|  | movzx      esi, byte ptr [eax + 11]  // third alpha | 
|  | movzx      edi, byte ptr [eax + 15]  // fourth alpha |
|  | punpckhbw  xmm1, xmm1       // next 2 | 
|  | movd       xmm2, dword ptr fixed_invtbl8[esi * 4] | 
|  | movd       xmm3, dword ptr fixed_invtbl8[edi * 4] | 
|  | pshuflw    xmm2, xmm2, 040h // first 4 inv_alpha words | 
|  | pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words | 
|  | movlhps    xmm2, xmm3 | 
|  | pmulhuw    xmm1, xmm2       // rgb * a | 
|  | lea        eax, [eax + 16] | 
|  |  | 
|  | packuswb   xmm0, xmm1 | 
|  | movdqu     [edx], xmm0 | 
|  | lea        edx, [edx + 16] | 
|  | sub        ecx, 4 | 
|  | jg         convertloop | 
|  | pop        edi | 
|  | pop        esi | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_ARGBUNATTENUATEROW_SSE2 | 
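|  |  |
|  | // Mathematically, unattenuation divides each color channel by alpha. The |
|  | // sketch below (not built; name is illustrative) shows the intended effect; |
|  | // the SIMD path uses the fixed_invtbl8 reciprocal table instead of a divide. |
|  | #if 0 |
|  | static void ARGBUnattenuateRow_C_Sketch(const uint8* src_argb, |
|  | uint8* dst_argb, int width) { |
|  | int x; |
|  | for (x = 0; x < width; ++x) { |
|  | uint32 a = src_argb[x * 4 + 3]; |
|  | int i; |
|  | for (i = 0; i < 3; ++i) {  // B, G, R |
|  | uint32 v = src_argb[x * 4 + i]; |
|  | if (a) { |
|  | v = v * 255u / a;  // undo v = v * a / 255 |
|  | if (v > 255u) v = 255u; |
|  | } |
|  | dst_argb[x * 4 + i] = (uint8)v; |
|  | } |
|  | dst_argb[x * 4 + 3] = (uint8)a; |
|  | } |
|  | } |
|  | #endif |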
|  |  | 
|  | #ifdef HAS_ARGBUNATTENUATEROW_AVX2 | 
|  | // Shuffle table duplicating alpha. | 
|  | static const uvec8 kUnattenShuffleAlpha_AVX2 = { | 
|  | 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u | 
|  | }; | 
|  | // TODO(fbarchard): Enable USE_GATHER for future hardware if faster. | 
|  | // USE_GATHER is not on by default, due to being a slow instruction. | 
|  | #ifdef USE_GATHER | 
|  | __declspec(naked) | 
|  | void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, | 
|  | int width) { | 
|  | __asm { | 
|  | mov        eax, [esp + 4]   // src_argb0 | 
|  | mov        edx, [esp + 8]   // dst_argb | 
|  | mov        ecx, [esp + 12]  // width | 
|  | sub        edx, eax | 
|  | vbroadcastf128 ymm4, kUnattenShuffleAlpha_AVX2 | 
|  |  | 
|  | convertloop: | 
|  | vmovdqu    ymm6, [eax]       // read 8 pixels. | 
|  | vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0xffffffff for gather. | 
|  | vpsrld     ymm2, ymm6, 24    // alpha in low 8 bits. | 
|  | vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated. | 
|  | vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated. | 
|  | vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5  // ymm5 cleared.  1, a | 
|  | vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a | 
|  | vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated. | 
|  | vpshufb    ymm2, ymm2, ymm4  // replicate low 4 alphas. 1, a, a, a | 
|  | vpshufb    ymm3, ymm3, ymm4  // replicate high 4 alphas | 
|  | vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia | 
|  | vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia | 
|  | vpackuswb  ymm0, ymm0, ymm1  // unmutated. | 
|  | vmovdqu    [eax + edx], ymm0 | 
|  | lea        eax, [eax + 32] | 
|  | sub        ecx, 8 | 
|  | jg         convertloop | 
|  |  | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  | #else  // USE_GATHER | 
|  | __declspec(naked) | 
|  | void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, | 
|  | int width) { | 
|  | __asm { | 
|  |  | 
|  | mov        eax, [esp + 4]   // src_argb0 | 
|  | mov        edx, [esp + 8]   // dst_argb | 
|  | mov        ecx, [esp + 12]  // width | 
|  | sub        edx, eax | 
|  | vbroadcastf128 ymm5, kUnattenShuffleAlpha_AVX2 | 
|  |  | 
|  | push       esi | 
|  | push       edi | 
|  |  | 
|  | convertloop: | 
|  | // replace VPGATHER | 
|  | movzx      esi, byte ptr [eax + 3]                 // alpha0 | 
|  | movzx      edi, byte ptr [eax + 7]                 // alpha1 | 
|  | vmovd      xmm0, dword ptr fixed_invtbl8[esi * 4]  // [1,a0] | 
|  | vmovd      xmm1, dword ptr fixed_invtbl8[edi * 4]  // [1,a1] | 
|  | movzx      esi, byte ptr [eax + 11]                // alpha2 | 
|  | movzx      edi, byte ptr [eax + 15]                // alpha3 | 
|  | vpunpckldq xmm6, xmm0, xmm1                        // [1,a1,1,a0] | 
|  | vmovd      xmm2, dword ptr fixed_invtbl8[esi * 4]  // [1,a2] | 
|  | vmovd      xmm3, dword ptr fixed_invtbl8[edi * 4]  // [1,a3] | 
|  | movzx      esi, byte ptr [eax + 19]                // alpha4 | 
|  | movzx      edi, byte ptr [eax + 23]                // alpha5 | 
|  | vpunpckldq xmm7, xmm2, xmm3                        // [1,a3,1,a2] | 
|  | vmovd      xmm0, dword ptr fixed_invtbl8[esi * 4]  // [1,a4] | 
|  | vmovd      xmm1, dword ptr fixed_invtbl8[edi * 4]  // [1,a5] | 
|  | movzx      esi, byte ptr [eax + 27]                // alpha6 | 
|  | movzx      edi, byte ptr [eax + 31]                // alpha7 | 
|  | vpunpckldq xmm0, xmm0, xmm1                        // [1,a5,1,a4] | 
|  | vmovd      xmm2, dword ptr fixed_invtbl8[esi * 4]  // [1,a6] | 
|  | vmovd      xmm3, dword ptr fixed_invtbl8[edi * 4]  // [1,a7] | 
|  | vpunpckldq xmm2, xmm2, xmm3                        // [1,a7,1,a6] | 
|  | vpunpcklqdq xmm3, xmm6, xmm7                       // [1,a3,1,a2,1,a1,1,a0] | 
|  | vpunpcklqdq xmm0, xmm0, xmm2                       // [1,a7,1,a6,1,a5,1,a4] | 
|  | vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0] | 
|  | // end of VPGATHER | 
|  |  | 
|  | vmovdqu    ymm6, [eax]       // read 8 pixels. | 
|  | vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated. | 
|  | vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated. | 
|  | vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a | 
|  | vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated. | 
|  | vpshufb    ymm2, ymm2, ymm5  // replicate low 4 alphas. 1, a, a, a | 
|  | vpshufb    ymm3, ymm3, ymm5  // replicate high 4 alphas | 
|  | vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia | 
|  | vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia | 
|  | vpackuswb  ymm0, ymm0, ymm1  // unmutated. | 
|  | vmovdqu    [eax + edx], ymm0 | 
|  | lea        eax, [eax + 32] | 
|  | sub        ecx, 8 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        edi | 
|  | pop        esi | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // USE_GATHER | 
|  | #endif  // HAS_ARGBUNATTENUATEROW_AVX2 |
|  |  | 
|  | #ifdef HAS_ARGBGRAYROW_SSSE3 | 
|  | // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels. |
|  | __declspec(naked) | 
|  | void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { | 
|  | __asm { | 
|  | mov        eax, [esp + 4]   /* src_argb */ | 
|  | mov        edx, [esp + 8]   /* dst_argb */ | 
|  | mov        ecx, [esp + 12]  /* width */ | 
|  | movdqa     xmm4, kARGBToYJ | 
|  | movdqa     xmm5, kAddYJ64 | 
|  |  | 
|  | convertloop: | 
|  | movdqu     xmm0, [eax]  // G | 
|  | movdqu     xmm1, [eax + 16] | 
|  | pmaddubsw  xmm0, xmm4 | 
|  | pmaddubsw  xmm1, xmm4 | 
|  | phaddw     xmm0, xmm1 | 
|  | paddw      xmm0, xmm5  // Add .5 for rounding. | 
|  | psrlw      xmm0, 7 | 
|  | packuswb   xmm0, xmm0   // 8 G bytes | 
|  | movdqu     xmm2, [eax]  // A | 
|  | movdqu     xmm3, [eax + 16] | 
|  | lea        eax, [eax + 32] | 
|  | psrld      xmm2, 24 | 
|  | psrld      xmm3, 24 | 
|  | packuswb   xmm2, xmm3 | 
|  | packuswb   xmm2, xmm2   // 8 A bytes | 
|  | movdqa     xmm3, xmm0   // Weave into GG, GA, then GGGA | 
|  | punpcklbw  xmm0, xmm0   // 8 GG words | 
|  | punpcklbw  xmm3, xmm2   // 8 GA words | 
|  | movdqa     xmm1, xmm0 | 
|  | punpcklwd  xmm0, xmm3   // GGGA first 4 | 
|  | punpckhwd  xmm1, xmm3   // GGGA next 4 | 
|  | movdqu     [edx], xmm0 | 
|  | movdqu     [edx + 16], xmm1 | 
|  | lea        edx, [edx + 32] | 
|  | sub        ecx, 8 | 
|  | jg         convertloop | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_ARGBGRAYROW_SSSE3 | 
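|  |  |
|  | // Illustrative scalar form of the gray conversion above (not built; sketch |
|  | // name only). It assumes the full range BT.601 luma weights (15, 75, 38) |
|  | // commonly used for kARGBToYJ, with kAddYJ64 providing the +64 rounding |
|  | // before the >> 7. |
|  | #if 0 |
|  | static void ARGBGrayRow_C_Sketch(const uint8* src_argb, uint8* dst_argb, |
|  | int width) { |
|  | int x; |
|  | for (x = 0; x < width; ++x) { |
|  | uint32 b = src_argb[x * 4 + 0]; |
|  | uint32 g = src_argb[x * 4 + 1]; |
|  | uint32 r = src_argb[x * 4 + 2]; |
|  | uint32 y = (b * 15 + g * 75 + r * 38 + 64) >> 7;  // assumed kARGBToYJ weights |
|  | dst_argb[x * 4 + 0] = (uint8)y;  // B |
|  | dst_argb[x * 4 + 1] = (uint8)y;  // G |
|  | dst_argb[x * 4 + 2] = (uint8)y;  // R |
|  | dst_argb[x * 4 + 3] = src_argb[x * 4 + 3];  // keep alpha |
|  | } |
|  | } |
|  | #endif |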
|  |  | 
|  | #ifdef HAS_ARGBSEPIAROW_SSSE3 | 
|  | //    b = (r * 35 + g * 68 + b * 17) >> 7 | 
|  | //    g = (r * 45 + g * 88 + b * 22) >> 7 | 
|  | //    r = (r * 50 + g * 98 + b * 24) >> 7 | 
|  | // Constant for ARGB color to sepia tone. | 
|  | static const vec8 kARGBToSepiaB = { | 
|  | 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0 | 
|  | }; | 
|  |  | 
|  | static const vec8 kARGBToSepiaG = { | 
|  | 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0 | 
|  | }; | 
|  |  | 
|  | static const vec8 kARGBToSepiaR = { | 
|  | 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0 | 
|  | }; | 
|  |  | 
|  | // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. | 
|  | __declspec(naked) | 
|  | void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { | 
|  | __asm { | 
|  | mov        eax, [esp + 4]   /* dst_argb */ | 
|  | mov        ecx, [esp + 8]   /* width */ | 
|  | movdqa     xmm2, kARGBToSepiaB | 
|  | movdqa     xmm3, kARGBToSepiaG | 
|  | movdqa     xmm4, kARGBToSepiaR | 
|  |  | 
|  | convertloop: | 
|  | movdqu     xmm0, [eax]  // B | 
|  | movdqu     xmm6, [eax + 16] | 
|  | pmaddubsw  xmm0, xmm2 | 
|  | pmaddubsw  xmm6, xmm2 | 
|  | phaddw     xmm0, xmm6 | 
|  | psrlw      xmm0, 7 | 
|  | packuswb   xmm0, xmm0   // 8 B values | 
|  | movdqu     xmm5, [eax]  // G | 
|  | movdqu     xmm1, [eax + 16] | 
|  | pmaddubsw  xmm5, xmm3 | 
|  | pmaddubsw  xmm1, xmm3 | 
|  | phaddw     xmm5, xmm1 | 
|  | psrlw      xmm5, 7 | 
|  | packuswb   xmm5, xmm5   // 8 G values | 
|  | punpcklbw  xmm0, xmm5   // 8 BG values | 
|  | movdqu     xmm5, [eax]  // R | 
|  | movdqu     xmm1, [eax + 16] | 
|  | pmaddubsw  xmm5, xmm4 | 
|  | pmaddubsw  xmm1, xmm4 | 
|  | phaddw     xmm5, xmm1 | 
|  | psrlw      xmm5, 7 | 
|  | packuswb   xmm5, xmm5   // 8 R values | 
|  | movdqu     xmm6, [eax]  // A | 
|  | movdqu     xmm1, [eax + 16] | 
|  | psrld      xmm6, 24 | 
|  | psrld      xmm1, 24 | 
|  | packuswb   xmm6, xmm1 | 
|  | packuswb   xmm6, xmm6   // 8 A values | 
|  | punpcklbw  xmm5, xmm6   // 8 RA values | 
|  | movdqa     xmm1, xmm0   // Weave BG, RA together | 
|  | punpcklwd  xmm0, xmm5   // BGRA first 4 | 
|  | punpckhwd  xmm1, xmm5   // BGRA next 4 | 
|  | movdqu     [eax], xmm0 | 
|  | movdqu     [eax + 16], xmm1 | 
|  | lea        eax, [eax + 32] | 
|  | sub        ecx, 8 | 
|  | jg         convertloop | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_ARGBSEPIAROW_SSSE3 | 
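|  |  |
|  | // Illustrative scalar form of the sepia formulas above (not built; sketch |
|  | // name only). Output is saturated to 255 as packuswb does; alpha is kept. |
|  | #if 0 |
|  | static void ARGBSepiaRow_C_Sketch(uint8* dst_argb, int width) { |
|  | int x; |
|  | for (x = 0; x < width; ++x) { |
|  | uint32 b = dst_argb[x * 4 + 0]; |
|  | uint32 g = dst_argb[x * 4 + 1]; |
|  | uint32 r = dst_argb[x * 4 + 2]; |
|  | uint32 sb = (r * 35 + g * 68 + b * 17) >> 7; |
|  | uint32 sg = (r * 45 + g * 88 + b * 22) >> 7; |
|  | uint32 sr = (r * 50 + g * 98 + b * 24) >> 7; |
|  | dst_argb[x * 4 + 0] = (uint8)(sb > 255u ? 255u : sb); |
|  | dst_argb[x * 4 + 1] = (uint8)(sg > 255u ? 255u : sg); |
|  | dst_argb[x * 4 + 2] = (uint8)(sr > 255u ? 255u : sr); |
|  | } |
|  | } |
|  | #endif |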
|  |  | 
|  | #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 | 
|  | // Transform 8 ARGB pixels (32 bytes) with color matrix. |
|  | // Same as Sepia except matrix is provided. | 
|  | // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R | 
|  | // and B into a high and low, then G/A, punpckl/hbw and then punpckl/hwd. |
|  | __declspec(naked) | 
|  | void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, | 
|  | const int8* matrix_argb, int width) { | 
|  | __asm { | 
|  | mov        eax, [esp + 4]   /* src_argb */ | 
|  | mov        edx, [esp + 8]   /* dst_argb */ | 
|  | mov        ecx, [esp + 12]  /* matrix_argb */ | 
|  | movdqu     xmm5, [ecx] | 
|  | pshufd     xmm2, xmm5, 0x00 | 
|  | pshufd     xmm3, xmm5, 0x55 | 
|  | pshufd     xmm4, xmm5, 0xaa | 
|  | pshufd     xmm5, xmm5, 0xff | 
|  | mov        ecx, [esp + 16]  /* width */ | 
|  |  | 
|  | convertloop: | 
|  | movdqu     xmm0, [eax]  // B | 
|  | movdqu     xmm7, [eax + 16] | 
|  | pmaddubsw  xmm0, xmm2 | 
|  | pmaddubsw  xmm7, xmm2 | 
|  | movdqu     xmm6, [eax]  // G | 
|  | movdqu     xmm1, [eax + 16] | 
|  | pmaddubsw  xmm6, xmm3 | 
|  | pmaddubsw  xmm1, xmm3 | 
|  | phaddsw    xmm0, xmm7   // B | 
|  | phaddsw    xmm6, xmm1   // G | 
|  | psraw      xmm0, 6      // B | 
|  | psraw      xmm6, 6      // G | 
|  | packuswb   xmm0, xmm0   // 8 B values | 
|  | packuswb   xmm6, xmm6   // 8 G values | 
|  | punpcklbw  xmm0, xmm6   // 8 BG values | 
|  | movdqu     xmm1, [eax]  // R | 
|  | movdqu     xmm7, [eax + 16] | 
|  | pmaddubsw  xmm1, xmm4 | 
|  | pmaddubsw  xmm7, xmm4 | 
|  | phaddsw    xmm1, xmm7   // R | 
|  | movdqu     xmm6, [eax]  // A | 
|  | movdqu     xmm7, [eax + 16] | 
|  | pmaddubsw  xmm6, xmm5 | 
|  | pmaddubsw  xmm7, xmm5 | 
|  | phaddsw    xmm6, xmm7   // A | 
|  | psraw      xmm1, 6      // R | 
|  | psraw      xmm6, 6      // A | 
|  | packuswb   xmm1, xmm1   // 8 R values | 
|  | packuswb   xmm6, xmm6   // 8 A values | 
|  | punpcklbw  xmm1, xmm6   // 8 RA values | 
|  | movdqa     xmm6, xmm0   // Weave BG, RA together | 
|  | punpcklwd  xmm0, xmm1   // BGRA first 4 | 
|  | punpckhwd  xmm6, xmm1   // BGRA next 4 | 
|  | movdqu     [edx], xmm0 | 
|  | movdqu     [edx + 16], xmm6 | 
|  | lea        eax, [eax + 32] | 
|  | lea        edx, [edx + 32] | 
|  | sub        ecx, 8 | 
|  | jg         convertloop | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_ARGBCOLORMATRIXROW_SSSE3 | 
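|  |  |
|  | // Editorial sketch (not part of the original source): the scalar form of the |
|  | // color matrix transform above.  Each output channel is a signed dot product |
|  | // of the input BGRA bytes with one row of 4 signed coefficients, shifted down |
|  | // by 6 (psraw is arithmetic, as is MSVC's >> on int) and clamped to [0, 255] |
|  | // by packuswb in the SIMD path.  The function name is an assumption. |
|  | static void ARGBColorMatrixRow_C_Sketch(const uint8* src_argb, uint8* dst_argb, |
|  |                                         const int8* matrix_argb, int width) { |
|  |   int x, i; |
|  |   for (x = 0; x < width; ++x) { |
|  |     for (i = 0; i < 4; ++i) { |
|  |       const int8* m = matrix_argb + i * 4;  // Coefficient row for channel i. |
|  |       int v = (src_argb[0] * m[0] + src_argb[1] * m[1] + |
|  |                src_argb[2] * m[2] + src_argb[3] * m[3]) >> 6; |
|  |       dst_argb[i] = (uint8)(v < 0 ? 0 : v > 255 ? 255 : v); |
|  |     } |
|  |     src_argb += 4; |
|  |     dst_argb += 4; |
|  |   } |
|  | } |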
|  |  | 
|  | #ifdef HAS_ARGBQUANTIZEROW_SSE2 | 
|  | // Quantize 4 ARGB pixels (16 bytes). | 
|  | __declspec(naked) | 
|  | void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, | 
|  | int interval_offset, int width) { | 
|  | __asm { | 
|  | mov        eax, [esp + 4]    /* dst_argb */ | 
|  | movd       xmm2, [esp + 8]   /* scale */ | 
|  | movd       xmm3, [esp + 12]  /* interval_size */ | 
|  | movd       xmm4, [esp + 16]  /* interval_offset */ | 
|  | mov        ecx, [esp + 20]   /* width */ | 
|  | pshuflw    xmm2, xmm2, 040h | 
|  | pshufd     xmm2, xmm2, 044h | 
|  | pshuflw    xmm3, xmm3, 040h | 
|  | pshufd     xmm3, xmm3, 044h | 
|  | pshuflw    xmm4, xmm4, 040h | 
|  | pshufd     xmm4, xmm4, 044h | 
|  | pxor       xmm5, xmm5  // constant 0 | 
|  | pcmpeqb    xmm6, xmm6  // generate mask 0xff000000 | 
|  | pslld      xmm6, 24 | 
|  |  | 
|  | convertloop: | 
|  | movdqu     xmm0, [eax]  // read 4 pixels | 
|  | punpcklbw  xmm0, xmm5   // first 2 pixels | 
|  | pmulhuw    xmm0, xmm2   // pixel * scale >> 16 | 
|  | movdqu     xmm1, [eax]  // read 4 pixels | 
|  | punpckhbw  xmm1, xmm5   // next 2 pixels | 
|  | pmulhuw    xmm1, xmm2 | 
|  | pmullw     xmm0, xmm3   // * interval_size | 
|  | movdqu     xmm7, [eax]  // read 4 pixels | 
|  | pmullw     xmm1, xmm3 | 
|  | pand       xmm7, xmm6   // mask alpha | 
|  | paddw      xmm0, xmm4   // + interval_offset |
|  | paddw      xmm1, xmm4 | 
|  | packuswb   xmm0, xmm1 | 
|  | por        xmm0, xmm7 | 
|  | movdqu     [eax], xmm0 | 
|  | lea        eax, [eax + 16] | 
|  | sub        ecx, 4 | 
|  | jg         convertloop | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_ARGBQUANTIZEROW_SSE2 | 
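|  |  |
|  | // Editorial sketch (not part of the original source): the per-channel math of |
|  | // ARGBQuantizeRow_SSE2 above.  B, G and R are posterized as |
|  | //   v = (v * scale >> 16) * interval_size + interval_offset |
|  | // while alpha is carried through unchanged (the 0xff000000 mask in the SIMD |
|  | // code).  Only the low 16 bits of scale matter in the SIMD path; the name of |
|  | // this function is an assumption of the sketch. |
|  | static void ARGBQuantizeRow_C_Sketch(uint8* dst_argb, int scale, |
|  |                                      int interval_size, int interval_offset, |
|  |                                      int width) { |
|  |   int x, i; |
|  |   for (x = 0; x < width; ++x) { |
|  |     for (i = 0; i < 3; ++i) {  // B, G, R; alpha (index 3) is untouched. |
|  |       int v = (dst_argb[i] * scale >> 16) * interval_size + interval_offset; |
|  |       dst_argb[i] = (uint8)(v > 255 ? 255 : v);  // packuswb saturation. |
|  |     } |
|  |     dst_argb += 4; |
|  |   } |
|  | } |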
|  |  | 
|  | #ifdef HAS_ARGBSHADEROW_SSE2 | 
|  | // Shade 4 pixels at a time by specified value. | 
|  | __declspec(naked) | 
|  | void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, | 
|  | uint32 value) { | 
|  | __asm { | 
|  | mov        eax, [esp + 4]   // src_argb | 
|  | mov        edx, [esp + 8]   // dst_argb | 
|  | mov        ecx, [esp + 12]  // width | 
|  | movd       xmm2, [esp + 16]  // value | 
|  | punpcklbw  xmm2, xmm2 | 
|  | punpcklqdq xmm2, xmm2 | 
|  |  | 
|  | convertloop: | 
|  | movdqu     xmm0, [eax]      // read 4 pixels | 
|  | lea        eax, [eax + 16] | 
|  | movdqa     xmm1, xmm0 | 
|  | punpcklbw  xmm0, xmm0       // first 2 | 
|  | punpckhbw  xmm1, xmm1       // next 2 | 
|  | pmulhuw    xmm0, xmm2       // argb * value | 
|  | pmulhuw    xmm1, xmm2       // argb * value | 
|  | psrlw      xmm0, 8 | 
|  | psrlw      xmm1, 8 | 
|  | packuswb   xmm0, xmm1 | 
|  | movdqu     [edx], xmm0 | 
|  | lea        edx, [edx + 16] | 
|  | sub        ecx, 4 | 
|  | jg         convertloop | 
|  |  | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_ARGBSHADEROW_SSE2 | 
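|  |  |
|  | // Editorial sketch (not part of the original source): what ARGBShadeRow_SSE2 |
|  | // computes per channel.  Both the pixel byte and the matching byte of 'value' |
|  | // are replicated into 16-bit words (b -> b * 257), multiplied, and the top 8 |
|  | // bits kept, i.e. roughly (pixel * value_byte) / 256 per channel.  The name |
|  | // is an assumption of this sketch. |
|  | static void ARGBShadeRow_C_Sketch(const uint8* src_argb, uint8* dst_argb, |
|  |                                   int width, uint32 value) { |
|  |   int x, i; |
|  |   for (x = 0; x < width; ++x) { |
|  |     for (i = 0; i < 4; ++i) { |
|  |       uint32 p = src_argb[i] * 0x0101u;                   // punpcklbw x, x |
|  |       uint32 v = ((value >> (i * 8)) & 0xff) * 0x0101u;   // replicated value byte |
|  |       dst_argb[i] = (uint8)((p * v) >> 24);               // pmulhuw + psrlw 8 |
|  |     } |
|  |     src_argb += 4; |
|  |     dst_argb += 4; |
|  |   } |
|  | } |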
|  |  | 
|  | #ifdef HAS_ARGBMULTIPLYROW_SSE2 | 
|  | // Multiply 2 rows of ARGB pixels together, 4 pixels at a time. | 
|  | __declspec(naked) | 
|  | void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, | 
|  | uint8* dst_argb, int width) { | 
|  | __asm { | 
|  | push       esi | 
|  | mov        eax, [esp + 4 + 4]   // src_argb0 | 
|  | mov        esi, [esp + 4 + 8]   // src_argb1 | 
|  | mov        edx, [esp + 4 + 12]  // dst_argb | 
|  | mov        ecx, [esp + 4 + 16]  // width | 
|  | pxor       xmm5, xmm5  // constant 0 | 
|  |  | 
|  | convertloop: | 
|  | movdqu     xmm0, [eax]        // read 4 pixels from src_argb0 | 
|  | movdqu     xmm2, [esi]        // read 4 pixels from src_argb1 | 
|  | movdqu     xmm1, xmm0 | 
|  | movdqu     xmm3, xmm2 | 
|  | punpcklbw  xmm0, xmm0         // first 2 | 
|  | punpckhbw  xmm1, xmm1         // next 2 | 
|  | punpcklbw  xmm2, xmm5         // first 2 | 
|  | punpckhbw  xmm3, xmm5         // next 2 | 
|  | pmulhuw    xmm0, xmm2         // src_argb0 * src_argb1 first 2 | 
|  | pmulhuw    xmm1, xmm3         // src_argb0 * src_argb1 next 2 | 
|  | lea        eax, [eax + 16] | 
|  | lea        esi, [esi + 16] | 
|  | packuswb   xmm0, xmm1 | 
|  | movdqu     [edx], xmm0 | 
|  | lea        edx, [edx + 16] | 
|  | sub        ecx, 4 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        esi | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_ARGBMULTIPLYROW_SSE2 | 
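|  |  |
|  | // Editorial sketch (not part of the original source): the per-channel product |
|  | // used by ARGBMultiplyRow_SSE2 above (and its AVX2 variant further below). |
|  | // One operand is replicated into a 16-bit word (a -> a * 257) and the other is |
|  | // zero-extended, so the stored byte is (a * 257 * b) >> 16, approximately |
|  | // a * b / 255 without a divide.  The name is an assumption of this sketch. |
|  | static void ARGBMultiplyRow_C_Sketch(const uint8* src_argb0, |
|  |                                      const uint8* src_argb1, |
|  |                                      uint8* dst_argb, int width) { |
|  |   int i; |
|  |   for (i = 0; i < width * 4; ++i) { |
|  |     uint32 a = src_argb0[i] * 0x0101u;  // punpcklbw x, x |
|  |     uint32 b = src_argb1[i];            // punpcklbw x, zero |
|  |     dst_argb[i] = (uint8)((a * b) >> 16); |
|  |   } |
|  | } |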
|  |  | 
|  | #ifdef HAS_ARGBADDROW_SSE2 | 
|  | // Add 2 rows of ARGB pixels together, 4 pixels at a time. | 
|  | // TODO(fbarchard): Port this to posix, neon and other math functions. | 
|  | __declspec(naked) | 
|  | void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, | 
|  | uint8* dst_argb, int width) { | 
|  | __asm { | 
|  | push       esi | 
|  | mov        eax, [esp + 4 + 4]   // src_argb0 | 
|  | mov        esi, [esp + 4 + 8]   // src_argb1 | 
|  | mov        edx, [esp + 4 + 12]  // dst_argb | 
|  | mov        ecx, [esp + 4 + 16]  // width | 
|  |  | 
|  | sub        ecx, 4 | 
|  | jl         convertloop49 | 
|  |  | 
|  | convertloop4: | 
|  | movdqu     xmm0, [eax]        // read 4 pixels from src_argb0 | 
|  | lea        eax, [eax + 16] | 
|  | movdqu     xmm1, [esi]        // read 4 pixels from src_argb1 | 
|  | lea        esi, [esi + 16] | 
|  | paddusb    xmm0, xmm1         // src_argb0 + src_argb1 | 
|  | movdqu     [edx], xmm0 | 
|  | lea        edx, [edx + 16] | 
|  | sub        ecx, 4 | 
|  | jge        convertloop4 | 
|  |  | 
|  | convertloop49: | 
|  | add        ecx, 4 - 1 | 
|  | jl         convertloop19 | 
|  |  | 
|  | convertloop1: | 
|  | movd       xmm0, [eax]        // read 1 pixel from src_argb0 |
|  | lea        eax, [eax + 4] |
|  | movd       xmm1, [esi]        // read 1 pixel from src_argb1 |
|  | lea        esi, [esi + 4] | 
|  | paddusb    xmm0, xmm1         // src_argb0 + src_argb1 | 
|  | movd       [edx], xmm0 | 
|  | lea        edx, [edx + 4] | 
|  | sub        ecx, 1 | 
|  | jge        convertloop1 | 
|  |  | 
|  | convertloop19: | 
|  | pop        esi | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_ARGBADDROW_SSE2 | 
|  |  | 
|  | #ifdef HAS_ARGBSUBTRACTROW_SSE2 | 
|  | // Subtract 2 rows of ARGB pixels together, 4 pixels at a time. | 
|  | __declspec(naked) | 
|  | void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, | 
|  | uint8* dst_argb, int width) { | 
|  | __asm { | 
|  | push       esi | 
|  | mov        eax, [esp + 4 + 4]   // src_argb0 | 
|  | mov        esi, [esp + 4 + 8]   // src_argb1 | 
|  | mov        edx, [esp + 4 + 12]  // dst_argb | 
|  | mov        ecx, [esp + 4 + 16]  // width | 
|  |  | 
|  | convertloop: | 
|  | movdqu     xmm0, [eax]        // read 4 pixels from src_argb0 | 
|  | lea        eax, [eax + 16] | 
|  | movdqu     xmm1, [esi]        // read 4 pixels from src_argb1 | 
|  | lea        esi, [esi + 16] | 
|  | psubusb    xmm0, xmm1         // src_argb0 - src_argb1 | 
|  | movdqu     [edx], xmm0 | 
|  | lea        edx, [edx + 16] | 
|  | sub        ecx, 4 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        esi | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_ARGBSUBTRACTROW_SSE2 | 
|  |  | 
|  | #ifdef HAS_ARGBMULTIPLYROW_AVX2 | 
|  | // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. | 
|  | __declspec(naked) | 
|  | void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, | 
|  | uint8* dst_argb, int width) { | 
|  | __asm { | 
|  | push       esi | 
|  | mov        eax, [esp + 4 + 4]   // src_argb0 | 
|  | mov        esi, [esp + 4 + 8]   // src_argb1 | 
|  | mov        edx, [esp + 4 + 12]  // dst_argb | 
|  | mov        ecx, [esp + 4 + 16]  // width | 
|  | vpxor      ymm5, ymm5, ymm5     // constant 0 | 
|  |  | 
|  | convertloop: | 
|  | vmovdqu    ymm1, [eax]        // read 8 pixels from src_argb0 | 
|  | lea        eax, [eax + 32] | 
|  | vmovdqu    ymm3, [esi]        // read 8 pixels from src_argb1 | 
|  | lea        esi, [esi + 32] | 
|  | vpunpcklbw ymm0, ymm1, ymm1   // low 4 | 
|  | vpunpckhbw ymm1, ymm1, ymm1   // high 4 | 
|  | vpunpcklbw ymm2, ymm3, ymm5   // low 4 | 
|  | vpunpckhbw ymm3, ymm3, ymm5   // high 4 | 
|  | vpmulhuw   ymm0, ymm0, ymm2   // src_argb0 * src_argb1 low 4 | 
|  | vpmulhuw   ymm1, ymm1, ymm3   // src_argb0 * src_argb1 high 4 | 
|  | vpackuswb  ymm0, ymm0, ymm1 | 
|  | vmovdqu    [edx], ymm0 | 
|  | lea        edx, [edx + 32] | 
|  | sub        ecx, 8 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        esi | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_ARGBMULTIPLYROW_AVX2 | 
|  |  | 
|  | #ifdef HAS_ARGBADDROW_AVX2 | 
|  | // Add 2 rows of ARGB pixels together, 8 pixels at a time. | 
|  | __declspec(naked) | 
|  | void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, | 
|  | uint8* dst_argb, int width) { | 
|  | __asm { | 
|  | push       esi | 
|  | mov        eax, [esp + 4 + 4]   // src_argb0 | 
|  | mov        esi, [esp + 4 + 8]   // src_argb1 | 
|  | mov        edx, [esp + 4 + 12]  // dst_argb | 
|  | mov        ecx, [esp + 4 + 16]  // width | 
|  |  | 
|  | convertloop: | 
|  | vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0 | 
|  | lea        eax, [eax + 32] | 
|  | vpaddusb   ymm0, ymm0, [esi]        // add 8 pixels from src_argb1 | 
|  | lea        esi, [esi + 32] | 
|  | vmovdqu    [edx], ymm0 | 
|  | lea        edx, [edx + 32] | 
|  | sub        ecx, 8 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        esi | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_ARGBADDROW_AVX2 | 
|  |  | 
|  | #ifdef HAS_ARGBSUBTRACTROW_AVX2 | 
|  | // Subtract 2 rows of ARGB pixels together, 8 pixels at a time. | 
|  | __declspec(naked) | 
|  | void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, | 
|  | uint8* dst_argb, int width) { | 
|  | __asm { | 
|  | push       esi | 
|  | mov        eax, [esp + 4 + 4]   // src_argb0 | 
|  | mov        esi, [esp + 4 + 8]   // src_argb1 | 
|  | mov        edx, [esp + 4 + 12]  // dst_argb | 
|  | mov        ecx, [esp + 4 + 16]  // width | 
|  |  | 
|  | convertloop: | 
|  | vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0 | 
|  | lea        eax, [eax + 32] | 
|  | vpsubusb   ymm0, ymm0, [esi]        // src_argb0 - src_argb1 | 
|  | lea        esi, [esi + 32] | 
|  | vmovdqu    [edx], ymm0 | 
|  | lea        edx, [edx + 32] | 
|  | sub        ecx, 8 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        esi | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_ARGBSUBTRACTROW_AVX2 | 
|  |  | 
|  | #ifdef HAS_SOBELXROW_SSE2 | 
|  | // SobelX as a matrix is | 
|  | // -1  0  1 | 
|  | // -2  0  2 | 
|  | // -1  0  1 | 
|  | __declspec(naked) | 
|  | void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, | 
|  | const uint8* src_y2, uint8* dst_sobelx, int width) { | 
|  | __asm { | 
|  | push       esi | 
|  | push       edi | 
|  | mov        eax, [esp + 8 + 4]   // src_y0 | 
|  | mov        esi, [esp + 8 + 8]   // src_y1 | 
|  | mov        edi, [esp + 8 + 12]  // src_y2 | 
|  | mov        edx, [esp + 8 + 16]  // dst_sobelx | 
|  | mov        ecx, [esp + 8 + 20]  // width | 
|  | sub        esi, eax | 
|  | sub        edi, eax | 
|  | sub        edx, eax | 
|  | pxor       xmm5, xmm5  // constant 0 | 
|  |  | 
|  | convertloop: | 
|  | movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0] | 
|  | movq       xmm1, qword ptr [eax + 2]        // read 8 pixels from src_y0[2] | 
|  | punpcklbw  xmm0, xmm5 | 
|  | punpcklbw  xmm1, xmm5 | 
|  | psubw      xmm0, xmm1 | 
|  | movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0] | 
|  | movq       xmm2, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2] | 
|  | punpcklbw  xmm1, xmm5 | 
|  | punpcklbw  xmm2, xmm5 | 
|  | psubw      xmm1, xmm2 | 
|  | movq       xmm2, qword ptr [eax + edi]      // read 8 pixels from src_y2[0] | 
|  | movq       xmm3, qword ptr [eax + edi + 2]  // read 8 pixels from src_y2[2] | 
|  | punpcklbw  xmm2, xmm5 | 
|  | punpcklbw  xmm3, xmm5 | 
|  | psubw      xmm2, xmm3 | 
|  | paddw      xmm0, xmm2 | 
|  | paddw      xmm0, xmm1 | 
|  | paddw      xmm0, xmm1 | 
|  | pxor       xmm1, xmm1   // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw | 
|  | psubw      xmm1, xmm0 | 
|  | pmaxsw     xmm0, xmm1 | 
|  | packuswb   xmm0, xmm0 | 
|  | movq       qword ptr [eax + edx], xmm0 | 
|  | lea        eax, [eax + 8] | 
|  | sub        ecx, 8 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        edi | 
|  | pop        esi | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_SOBELXROW_SSE2 | 
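|  |  |
|  | // Editorial sketch (not part of the original source): the scalar Sobel X value |
|  | // per pixel, as computed by SobelXRow_SSE2 above - a weighted horizontal |
|  | // difference across three rows, absolute value, clamped to 255 by packuswb. |
|  | // Like the SIMD version it reads 2 bytes past x; the name is an assumption. |
|  | static void SobelXRow_C_Sketch(const uint8* src_y0, const uint8* src_y1, |
|  |                                const uint8* src_y2, uint8* dst_sobelx, |
|  |                                int width) { |
|  |   int x; |
|  |   for (x = 0; x < width; ++x) { |
|  |     int a = src_y0[x] - src_y0[x + 2]; |
|  |     int b = src_y1[x] - src_y1[x + 2]; |
|  |     int c = src_y2[x] - src_y2[x + 2]; |
|  |     int sobel = a + 2 * b + c; |
|  |     if (sobel < 0) sobel = -sobel; |
|  |     dst_sobelx[x] = (uint8)(sobel > 255 ? 255 : sobel); |
|  |   } |
|  | } |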
|  |  | 
|  | #ifdef HAS_SOBELYROW_SSE2 | 
|  | // SobelY as a matrix is | 
|  | // -1 -2 -1 | 
|  | //  0  0  0 | 
|  | //  1  2  1 | 
|  | __declspec(naked) | 
|  | void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, | 
|  | uint8* dst_sobely, int width) { | 
|  | __asm { | 
|  | push       esi | 
|  | mov        eax, [esp + 4 + 4]   // src_y0 | 
|  | mov        esi, [esp + 4 + 8]   // src_y1 | 
|  | mov        edx, [esp + 4 + 12]  // dst_sobely | 
|  | mov        ecx, [esp + 4 + 16]  // width | 
|  | sub        esi, eax | 
|  | sub        edx, eax | 
|  | pxor       xmm5, xmm5  // constant 0 | 
|  |  | 
|  | convertloop: | 
|  | movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0] | 
|  | movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0] | 
|  | punpcklbw  xmm0, xmm5 | 
|  | punpcklbw  xmm1, xmm5 | 
|  | psubw      xmm0, xmm1 | 
|  | movq       xmm1, qword ptr [eax + 1]        // read 8 pixels from src_y0[1] | 
|  | movq       xmm2, qword ptr [eax + esi + 1]  // read 8 pixels from src_y1[1] | 
|  | punpcklbw  xmm1, xmm5 | 
|  | punpcklbw  xmm2, xmm5 | 
|  | psubw      xmm1, xmm2 | 
|  | movq       xmm2, qword ptr [eax + 2]        // read 8 pixels from src_y0[2] | 
|  | movq       xmm3, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2] | 
|  | punpcklbw  xmm2, xmm5 | 
|  | punpcklbw  xmm3, xmm5 | 
|  | psubw      xmm2, xmm3 | 
|  | paddw      xmm0, xmm2 | 
|  | paddw      xmm0, xmm1 | 
|  | paddw      xmm0, xmm1 | 
|  | pxor       xmm1, xmm1   // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw | 
|  | psubw      xmm1, xmm0 | 
|  | pmaxsw     xmm0, xmm1 | 
|  | packuswb   xmm0, xmm0 | 
|  | movq       qword ptr [eax + edx], xmm0 | 
|  | lea        eax, [eax + 8] | 
|  | sub        ecx, 8 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        esi | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_SOBELYROW_SSE2 | 
|  |  | 
|  | #ifdef HAS_SOBELROW_SSE2 | 
|  | // Adds Sobel X and Sobel Y and stores Sobel into ARGB. | 
|  | // A = 255 | 
|  | // R = Sobel | 
|  | // G = Sobel | 
|  | // B = Sobel | 
|  | __declspec(naked) | 
|  | void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, | 
|  | uint8* dst_argb, int width) { | 
|  | __asm { | 
|  | push       esi | 
|  | mov        eax, [esp + 4 + 4]   // src_sobelx | 
|  | mov        esi, [esp + 4 + 8]   // src_sobely | 
|  | mov        edx, [esp + 4 + 12]  // dst_argb | 
|  | mov        ecx, [esp + 4 + 16]  // width | 
|  | sub        esi, eax | 
|  | pcmpeqb    xmm5, xmm5           // alpha 255 | 
|  | pslld      xmm5, 24             // 0xff000000 | 
|  |  | 
|  | convertloop: | 
|  | movdqu     xmm0, [eax]            // read 16 pixels src_sobelx | 
|  | movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely | 
|  | lea        eax, [eax + 16] | 
|  | paddusb    xmm0, xmm1             // sobel = sobelx + sobely | 
|  | movdqa     xmm2, xmm0             // GG | 
|  | punpcklbw  xmm2, xmm0             // First 8 | 
|  | punpckhbw  xmm0, xmm0             // Next 8 | 
|  | movdqa     xmm1, xmm2             // GGGG | 
|  | punpcklwd  xmm1, xmm2             // First 4 | 
|  | punpckhwd  xmm2, xmm2             // Next 4 | 
|  | por        xmm1, xmm5             // GGGA | 
|  | por        xmm2, xmm5 | 
|  | movdqa     xmm3, xmm0             // GGGG | 
|  | punpcklwd  xmm3, xmm0             // Next 4 | 
|  | punpckhwd  xmm0, xmm0             // Last 4 | 
|  | por        xmm3, xmm5             // GGGA | 
|  | por        xmm0, xmm5 | 
|  | movdqu     [edx], xmm1 | 
|  | movdqu     [edx + 16], xmm2 | 
|  | movdqu     [edx + 32], xmm3 | 
|  | movdqu     [edx + 48], xmm0 | 
|  | lea        edx, [edx + 64] | 
|  | sub        ecx, 16 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        esi | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_SOBELROW_SSE2 | 
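|  |  |
|  | // Editorial sketch (not part of the original source): what SobelRow_SSE2 |
|  | // stores per pixel - the saturating sum of the two gradients replicated into |
|  | // B, G and R, with alpha forced to 255.  The name is an assumption. |
|  | static void SobelRow_C_Sketch(const uint8* src_sobelx, const uint8* src_sobely, |
|  |                               uint8* dst_argb, int width) { |
|  |   int x; |
|  |   for (x = 0; x < width; ++x) { |
|  |     int s = src_sobelx[x] + src_sobely[x];  // paddusb saturates; mirror it. |
|  |     if (s > 255) s = 255; |
|  |     dst_argb[0] = (uint8)s;   // B |
|  |     dst_argb[1] = (uint8)s;   // G |
|  |     dst_argb[2] = (uint8)s;   // R |
|  |     dst_argb[3] = 255;        // A |
|  |     dst_argb += 4; |
|  |   } |
|  | } |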
|  |  | 
|  | #ifdef HAS_SOBELTOPLANEROW_SSE2 | 
|  | // Adds Sobel X and Sobel Y and stores Sobel into a plane. | 
|  | __declspec(naked) | 
|  | void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, | 
|  | uint8* dst_y, int width) { | 
|  | __asm { | 
|  | push       esi | 
|  | mov        eax, [esp + 4 + 4]   // src_sobelx | 
|  | mov        esi, [esp + 4 + 8]   // src_sobely | 
|  | mov        edx, [esp + 4 + 12]  // dst_y |
|  | mov        ecx, [esp + 4 + 16]  // width | 
|  | sub        esi, eax | 
|  |  | 
|  | convertloop: | 
|  | movdqu     xmm0, [eax]            // read 16 pixels src_sobelx | 
|  | movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely | 
|  | lea        eax, [eax + 16] | 
|  | paddusb    xmm0, xmm1             // sobel = sobelx + sobely | 
|  | movdqu     [edx], xmm0 | 
|  | lea        edx, [edx + 16] | 
|  | sub        ecx, 16 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        esi | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_SOBELTOPLANEROW_SSE2 | 
|  |  | 
|  | #ifdef HAS_SOBELXYROW_SSE2 | 
|  | // Mixes Sobel X, Sobel Y and Sobel into ARGB. | 
|  | // A = 255 | 
|  | // R = Sobel X | 
|  | // G = Sobel | 
|  | // B = Sobel Y | 
|  | __declspec(naked) | 
|  | void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, | 
|  | uint8* dst_argb, int width) { | 
|  | __asm { | 
|  | push       esi | 
|  | mov        eax, [esp + 4 + 4]   // src_sobelx | 
|  | mov        esi, [esp + 4 + 8]   // src_sobely | 
|  | mov        edx, [esp + 4 + 12]  // dst_argb | 
|  | mov        ecx, [esp + 4 + 16]  // width | 
|  | sub        esi, eax | 
|  | pcmpeqb    xmm5, xmm5           // alpha 255 | 
|  |  | 
|  | convertloop: | 
|  | movdqu     xmm0, [eax]            // read 16 pixels src_sobelx | 
|  | movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely | 
|  | lea        eax, [eax + 16] | 
|  | movdqa     xmm2, xmm0 | 
|  | paddusb    xmm2, xmm1             // sobel = sobelx + sobely | 
|  | movdqa     xmm3, xmm0             // XA | 
|  | punpcklbw  xmm3, xmm5 | 
|  | punpckhbw  xmm0, xmm5 | 
|  | movdqa     xmm4, xmm1             // YS | 
|  | punpcklbw  xmm4, xmm2 | 
|  | punpckhbw  xmm1, xmm2 | 
|  | movdqa     xmm6, xmm4             // YSXA | 
|  | punpcklwd  xmm6, xmm3             // First 4 | 
|  | punpckhwd  xmm4, xmm3             // Next 4 | 
|  | movdqa     xmm7, xmm1             // YSXA | 
|  | punpcklwd  xmm7, xmm0             // Next 4 | 
|  | punpckhwd  xmm1, xmm0             // Last 4 | 
|  | movdqu     [edx], xmm6 | 
|  | movdqu     [edx + 16], xmm4 | 
|  | movdqu     [edx + 32], xmm7 | 
|  | movdqu     [edx + 48], xmm1 | 
|  | lea        edx, [edx + 64] | 
|  | sub        ecx, 16 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        esi | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_SOBELXYROW_SSE2 | 
|  |  | 
|  | #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 | 
|  | // Consider float CumulativeSum. | 
|  | // Consider calling CumulativeSum one row at a time as needed. |
|  | // Consider circular CumulativeSum buffer of radius * 2 + 1 height. | 
|  | // Convert cumulative sum for an area to an average for 1 pixel. | 
|  | // topleft is pointer to top left of CumulativeSum buffer for area. | 
|  | // botleft is pointer to bottom left of CumulativeSum buffer. | 
|  | // width is offset from left to right of area in CumulativeSum buffer measured | 
|  | //   in number of ints. | 
|  | // area is the number of pixels in the area being averaged. | 
|  | // dst points to pixel to store result to. | 
|  | // count is number of averaged pixels to produce. | 
|  | // Does 4 pixels at a time. | 
|  | void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, | 
|  | int width, int area, uint8* dst, | 
|  | int count) { | 
|  | __asm { | 
|  | mov        eax, topleft  // eax topleft | 
|  | mov        esi, botleft  // esi botleft | 
|  | mov        edx, width | 
|  | movd       xmm5, area | 
|  | mov        edi, dst | 
|  | mov        ecx, count | 
|  | cvtdq2ps   xmm5, xmm5 | 
|  | rcpss      xmm4, xmm5  // 1.0f / area | 
|  | pshufd     xmm4, xmm4, 0 | 
|  | sub        ecx, 4 | 
|  | jl         l4b | 
|  |  | 
|  | cmp        area, 128  // 128 pixels will not overflow 15 bits. | 
|  | ja         l4 | 
|  |  | 
|  | pshufd     xmm5, xmm5, 0        // area | 
|  | pcmpeqb    xmm6, xmm6           // constant of 65536.0 - 1 = 65535.0 | 
|  | psrld      xmm6, 16 | 
|  | cvtdq2ps   xmm6, xmm6 | 
|  | addps      xmm5, xmm6           // (65536.0 + area - 1) | 
|  | mulps      xmm5, xmm4           // (65536.0 + area - 1) * 1 / area | 
|  | cvtps2dq   xmm5, xmm5           // 0.16 fixed point | 
|  | packssdw   xmm5, xmm5           // 16 bit shorts | 
|  |  | 
|  | // 4 pixel loop small blocks. | 
|  | s4: | 
|  | // top left | 
|  | movdqu     xmm0, [eax] | 
|  | movdqu     xmm1, [eax + 16] | 
|  | movdqu     xmm2, [eax + 32] | 
|  | movdqu     xmm3, [eax + 48] | 
|  |  | 
|  | // - top right | 
|  | psubd      xmm0, [eax + edx * 4] | 
|  | psubd      xmm1, [eax + edx * 4 + 16] | 
|  | psubd      xmm2, [eax + edx * 4 + 32] | 
|  | psubd      xmm3, [eax + edx * 4 + 48] | 
|  | lea        eax, [eax + 64] | 
|  |  | 
|  | // - bottom left | 
|  | psubd      xmm0, [esi] | 
|  | psubd      xmm1, [esi + 16] | 
|  | psubd      xmm2, [esi + 32] | 
|  | psubd      xmm3, [esi + 48] | 
|  |  | 
|  | // + bottom right | 
|  | paddd      xmm0, [esi + edx * 4] | 
|  | paddd      xmm1, [esi + edx * 4 + 16] | 
|  | paddd      xmm2, [esi + edx * 4 + 32] | 
|  | paddd      xmm3, [esi + edx * 4 + 48] | 
|  | lea        esi, [esi + 64] | 
|  |  | 
|  | packssdw   xmm0, xmm1  // pack 4 pixels into 2 registers | 
|  | packssdw   xmm2, xmm3 | 
|  |  | 
|  | pmulhuw    xmm0, xmm5 | 
|  | pmulhuw    xmm2, xmm5 | 
|  |  | 
|  | packuswb   xmm0, xmm2 | 
|  | movdqu     [edi], xmm0 | 
|  | lea        edi, [edi + 16] | 
|  | sub        ecx, 4 | 
|  | jge        s4 | 
|  |  | 
|  | jmp        l4b | 
|  |  | 
|  | // 4 pixel loop | 
|  | l4: | 
|  | // top left | 
|  | movdqu     xmm0, [eax] | 
|  | movdqu     xmm1, [eax + 16] | 
|  | movdqu     xmm2, [eax + 32] | 
|  | movdqu     xmm3, [eax + 48] | 
|  |  | 
|  | // - top right | 
|  | psubd      xmm0, [eax + edx * 4] | 
|  | psubd      xmm1, [eax + edx * 4 + 16] | 
|  | psubd      xmm2, [eax + edx * 4 + 32] | 
|  | psubd      xmm3, [eax + edx * 4 + 48] | 
|  | lea        eax, [eax + 64] | 
|  |  | 
|  | // - bottom left | 
|  | psubd      xmm0, [esi] | 
|  | psubd      xmm1, [esi + 16] | 
|  | psubd      xmm2, [esi + 32] | 
|  | psubd      xmm3, [esi + 48] | 
|  |  | 
|  | // + bottom right | 
|  | paddd      xmm0, [esi + edx * 4] | 
|  | paddd      xmm1, [esi + edx * 4 + 16] | 
|  | paddd      xmm2, [esi + edx * 4 + 32] | 
|  | paddd      xmm3, [esi + edx * 4 + 48] | 
|  | lea        esi, [esi + 64] | 
|  |  | 
|  | cvtdq2ps   xmm0, xmm0   // Average = Sum * 1 / Area | 
|  | cvtdq2ps   xmm1, xmm1 | 
|  | mulps      xmm0, xmm4 | 
|  | mulps      xmm1, xmm4 | 
|  | cvtdq2ps   xmm2, xmm2 | 
|  | cvtdq2ps   xmm3, xmm3 | 
|  | mulps      xmm2, xmm4 | 
|  | mulps      xmm3, xmm4 | 
|  | cvtps2dq   xmm0, xmm0 | 
|  | cvtps2dq   xmm1, xmm1 | 
|  | cvtps2dq   xmm2, xmm2 | 
|  | cvtps2dq   xmm3, xmm3 | 
|  | packssdw   xmm0, xmm1 | 
|  | packssdw   xmm2, xmm3 | 
|  | packuswb   xmm0, xmm2 | 
|  | movdqu     [edi], xmm0 | 
|  | lea        edi, [edi + 16] | 
|  | sub        ecx, 4 | 
|  | jge        l4 | 
|  |  | 
|  | l4b: | 
|  | add        ecx, 4 - 1 | 
|  | jl         l1b | 
|  |  | 
|  | // 1 pixel loop | 
|  | l1: | 
|  | movdqu     xmm0, [eax] | 
|  | psubd      xmm0, [eax + edx * 4] | 
|  | lea        eax, [eax + 16] | 
|  | psubd      xmm0, [esi] | 
|  | paddd      xmm0, [esi + edx * 4] | 
|  | lea        esi, [esi + 16] | 
|  | cvtdq2ps   xmm0, xmm0 | 
|  | mulps      xmm0, xmm4 | 
|  | cvtps2dq   xmm0, xmm0 | 
|  | packssdw   xmm0, xmm0 | 
|  | packuswb   xmm0, xmm0 | 
|  | movd       dword ptr [edi], xmm0 | 
|  | lea        edi, [edi + 4] | 
|  | sub        ecx, 1 | 
|  | jge        l1 | 
|  | l1b: | 
|  | } | 
|  | } | 
|  | #endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 | 
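|  |  |
|  | // Editorial sketch (not part of the original source): the integral-image |
|  | // average that CumulativeSumToAverageRow_SSE2 computes, one output pixel |
|  | // (4 int32 channels in the sum buffers) at a time: |
|  | //   sum = topleft - topright - botleft + botright, then sum / area. |
|  | // The SIMD small-block path rounds via fixed point; this sketch truncates. |
|  | // The name is an assumption of this sketch. |
|  | static void CumulativeSumToAverageRow_C_Sketch(const int32* topleft, |
|  |                                                const int32* botleft, int width, |
|  |                                                int area, uint8* dst, |
|  |                                                int count) { |
|  |   float ooa = 1.0f / area; |
|  |   int i, c; |
|  |   for (i = 0; i < count; ++i) { |
|  |     for (c = 0; c < 4; ++c) { |
|  |       int32 sum = topleft[c] - topleft[width + c] - |
|  |                   botleft[c] + botleft[width + c]; |
|  |       dst[c] = (uint8)(sum * ooa);  // Values stay within 0..255. |
|  |     } |
|  |     topleft += 4; |
|  |     botleft += 4; |
|  |     dst += 4; |
|  |   } |
|  | } |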
|  |  | 
|  | #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 | 
|  | // Creates a table of cumulative sums where each value is a sum of all values | 
|  | // above and to the left of the value. | 
|  | void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, | 
|  | const int32* previous_cumsum, int width) { | 
|  | __asm { | 
|  | mov        eax, row | 
|  | mov        edx, cumsum | 
|  | mov        esi, previous_cumsum | 
|  | mov        ecx, width | 
|  | pxor       xmm0, xmm0 | 
|  | pxor       xmm1, xmm1 | 
|  |  | 
|  | sub        ecx, 4 | 
|  | jl         l4b | 
|  | test       edx, 15 | 
|  | jne        l4b | 
|  |  | 
|  | // 4 pixel loop | 
|  | l4: | 
|  | movdqu     xmm2, [eax]  // 4 argb pixels 16 bytes. | 
|  | lea        eax, [eax + 16] | 
|  | movdqa     xmm4, xmm2 | 
|  |  | 
|  | punpcklbw  xmm2, xmm1 | 
|  | movdqa     xmm3, xmm2 | 
|  | punpcklwd  xmm2, xmm1 | 
|  | punpckhwd  xmm3, xmm1 | 
|  |  | 
|  | punpckhbw  xmm4, xmm1 | 
|  | movdqa     xmm5, xmm4 | 
|  | punpcklwd  xmm4, xmm1 | 
|  | punpckhwd  xmm5, xmm1 | 
|  |  | 
|  | paddd      xmm0, xmm2 | 
|  | movdqu     xmm2, [esi]  // previous row above. | 
|  | paddd      xmm2, xmm0 | 
|  |  | 
|  | paddd      xmm0, xmm3 | 
|  | movdqu     xmm3, [esi + 16] | 
|  | paddd      xmm3, xmm0 | 
|  |  | 
|  | paddd      xmm0, xmm4 | 
|  | movdqu     xmm4, [esi + 32] | 
|  | paddd      xmm4, xmm0 | 
|  |  | 
|  | paddd      xmm0, xmm5 | 
|  | movdqu     xmm5, [esi + 48] | 
|  | lea        esi, [esi + 64] | 
|  | paddd      xmm5, xmm0 | 
|  |  | 
|  | movdqu     [edx], xmm2 | 
|  | movdqu     [edx + 16], xmm3 | 
|  | movdqu     [edx + 32], xmm4 | 
|  | movdqu     [edx + 48], xmm5 | 
|  |  | 
|  | lea        edx, [edx + 64] | 
|  | sub        ecx, 4 | 
|  | jge        l4 | 
|  |  | 
|  | l4b: | 
|  | add        ecx, 4 - 1 | 
|  | jl         l1b | 
|  |  | 
|  | // 1 pixel loop | 
|  | l1: | 
|  | movd       xmm2, dword ptr [eax]  // 1 argb pixel 4 bytes. | 
|  | lea        eax, [eax + 4] | 
|  | punpcklbw  xmm2, xmm1 | 
|  | punpcklwd  xmm2, xmm1 | 
|  | paddd      xmm0, xmm2 | 
|  | movdqu     xmm2, [esi] | 
|  | lea        esi, [esi + 16] | 
|  | paddd      xmm2, xmm0 | 
|  | movdqu     [edx], xmm2 | 
|  | lea        edx, [edx + 16] | 
|  | sub        ecx, 1 | 
|  | jge        l1 | 
|  |  | 
|  | l1b: | 
|  | } | 
|  | } | 
|  | #endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2 | 
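|  |  |
|  | // Editorial sketch (not part of the original source): the recurrence that |
|  | // ComputeCumulativeSumRow_SSE2 implements.  Each output int32 is the running |
|  | // sum of the current row (per channel) plus the already-computed cumulative |
|  | // row directly above.  The name is an assumption of this sketch. |
|  | static void ComputeCumulativeSumRow_C_Sketch(const uint8* row, int32* cumsum, |
|  |                                              const int32* previous_cumsum, |
|  |                                              int width) { |
|  |   int32 running[4] = {0, 0, 0, 0}; |
|  |   int x, c; |
|  |   for (x = 0; x < width; ++x) { |
|  |     for (c = 0; c < 4; ++c) { |
|  |       running[c] += row[x * 4 + c]; |
|  |       cumsum[x * 4 + c] = running[c] + previous_cumsum[x * 4 + c]; |
|  |     } |
|  |   } |
|  | } |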
|  |  | 
|  | #ifdef HAS_ARGBAFFINEROW_SSE2 | 
|  | // Copy ARGB pixels from source image with slope to a row of destination. | 
|  | __declspec(naked) | 
|  | LIBYUV_API | 
|  | void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, | 
|  | uint8* dst_argb, const float* uv_dudv, int width) { | 
|  | __asm { | 
|  | push       esi | 
|  | push       edi | 
|  | mov        eax, [esp + 12]  // src_argb | 
|  | mov        esi, [esp + 16]  // stride | 
|  | mov        edx, [esp + 20]  // dst_argb | 
|  | mov        ecx, [esp + 24]  // pointer to uv_dudv | 
|  | movq       xmm2, qword ptr [ecx]  // uv | 
|  | movq       xmm7, qword ptr [ecx + 8]  // dudv | 
|  | mov        ecx, [esp + 28]  // width | 
|  | shl        esi, 16          // 4, stride | 
|  | add        esi, 4 | 
|  | movd       xmm5, esi | 
|  | sub        ecx, 4 | 
|  | jl         l4b | 
|  |  | 
|  | // setup for 4 pixel loop | 
|  | pshufd     xmm7, xmm7, 0x44  // dup dudv | 
|  | pshufd     xmm5, xmm5, 0  // dup 4, stride | 
|  | movdqa     xmm0, xmm2    // x0, y0, x1, y1 | 
|  | addps      xmm0, xmm7 | 
|  | movlhps    xmm2, xmm0 | 
|  | movdqa     xmm4, xmm7 | 
|  | addps      xmm4, xmm4    // dudv *= 2 | 
|  | movdqa     xmm3, xmm2    // x2, y2, x3, y3 | 
|  | addps      xmm3, xmm4 | 
|  | addps      xmm4, xmm4    // dudv *= 4 | 
|  |  | 
|  | // 4 pixel loop | 
|  | l4: | 
|  | cvttps2dq  xmm0, xmm2    // x, y float to int first 2 | 
|  | cvttps2dq  xmm1, xmm3    // x, y float to int next 2 | 
|  | packssdw   xmm0, xmm1    // x, y as 8 shorts | 
|  | pmaddwd    xmm0, xmm5    // offsets = x * 4 + y * stride. | 
|  | movd       esi, xmm0 | 
|  | pshufd     xmm0, xmm0, 0x39  // shift right | 
|  | movd       edi, xmm0 | 
|  | pshufd     xmm0, xmm0, 0x39  // shift right | 
|  | movd       xmm1, [eax + esi]  // read pixel 0 | 
|  | movd       xmm6, [eax + edi]  // read pixel 1 | 
|  | punpckldq  xmm1, xmm6     // combine pixel 0 and 1 | 
|  | addps      xmm2, xmm4    // x, y += dx, dy first 2 | 
|  | movq       qword ptr [edx], xmm1 | 
|  | movd       esi, xmm0 | 
|  | pshufd     xmm0, xmm0, 0x39  // shift right | 
|  | movd       edi, xmm0 | 
|  | movd       xmm6, [eax + esi]  // read pixel 2 | 
|  | movd       xmm0, [eax + edi]  // read pixel 3 | 
|  | punpckldq  xmm6, xmm0     // combine pixel 2 and 3 | 
|  | addps      xmm3, xmm4    // x, y += dx, dy next 2 | 
|  | movq       qword ptr 8[edx], xmm6 | 
|  | lea        edx, [edx + 16] | 
|  | sub        ecx, 4 | 
|  | jge        l4 | 
|  |  | 
|  | l4b: | 
|  | add        ecx, 4 - 1 | 
|  | jl         l1b | 
|  |  | 
|  | // 1 pixel loop | 
|  | l1: | 
|  | cvttps2dq  xmm0, xmm2    // x, y float to int | 
|  | packssdw   xmm0, xmm0    // x, y as shorts | 
|  | pmaddwd    xmm0, xmm5    // offset = x * 4 + y * stride | 
|  | addps      xmm2, xmm7    // x, y += dx, dy | 
|  | movd       esi, xmm0 | 
|  | movd       xmm0, [eax + esi]  // copy a pixel | 
|  | movd       [edx], xmm0 | 
|  | lea        edx, [edx + 4] | 
|  | sub        ecx, 1 | 
|  | jge        l1 | 
|  | l1b: | 
|  | pop        edi | 
|  | pop        esi | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_ARGBAFFINEROW_SSE2 | 
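|  |  |
|  | // Editorial sketch (not part of the original source): the sampling loop that |
|  | // ARGBAffineRow_SSE2 vectorizes.  uv_dudv holds the starting source (u, v) |
|  | // followed by the per-destination-pixel step (du, dv); each destination pixel |
|  | // copies one source pixel at the truncated (u, v).  The name is an assumption |
|  | // of this sketch. |
|  | static void ARGBAffineRow_C_Sketch(const uint8* src_argb, int src_argb_stride, |
|  |                                    uint8* dst_argb, const float* uv_dudv, |
|  |                                    int width) { |
|  |   float u = uv_dudv[0]; |
|  |   float v = uv_dudv[1]; |
|  |   int x; |
|  |   for (x = 0; x < width; ++x) { |
|  |     int xi = (int)u;  // cvttps2dq truncates toward zero. |
|  |     int yi = (int)v; |
|  |     const uint8* src = src_argb + yi * src_argb_stride + xi * 4; |
|  |     dst_argb[x * 4 + 0] = src[0]; |
|  |     dst_argb[x * 4 + 1] = src[1]; |
|  |     dst_argb[x * 4 + 2] = src[2]; |
|  |     dst_argb[x * 4 + 3] = src[3]; |
|  |     u += uv_dudv[2]; |
|  |     v += uv_dudv[3]; |
|  |   } |
|  | } |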
|  |  | 
|  | #ifdef HAS_INTERPOLATEROW_AVX2 | 
|  | // Bilinear filter 32x2 -> 32x1 | 
|  | __declspec(naked) | 
|  | void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, | 
|  | ptrdiff_t src_stride, int dst_width, | 
|  | int source_y_fraction) { | 
|  | __asm { | 
|  | push       esi | 
|  | push       edi | 
|  | mov        edi, [esp + 8 + 4]   // dst_ptr | 
|  | mov        esi, [esp + 8 + 8]   // src_ptr | 
|  | mov        edx, [esp + 8 + 12]  // src_stride | 
|  | mov        ecx, [esp + 8 + 16]  // dst_width | 
|  | mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255) | 
|  | shr        eax, 1 | 
|  | // Dispatch to specialized filters if applicable. | 
|  | cmp        eax, 0 | 
|  | je         xloop100  // 0 / 128.  Blend 100 / 0. | 
|  | sub        edi, esi | 
|  | cmp        eax, 32 | 
|  | je         xloop75   // 32 / 128 is 0.25.  Blend 75 / 25. | 
|  | cmp        eax, 64 | 
|  | je         xloop50   // 64 / 128 is 0.50.  Blend 50 / 50. | 
|  | cmp        eax, 96 | 
|  | je         xloop25   // 96 / 128 is 0.75.  Blend 25 / 75. | 
|  |  | 
|  | vmovd      xmm0, eax  // high fraction 0..127 | 
|  | neg        eax | 
|  | add        eax, 128 | 
|  | vmovd      xmm5, eax  // low fraction 128..1 | 
|  | vpunpcklbw xmm5, xmm5, xmm0 | 
|  | vpunpcklwd xmm5, xmm5, xmm5 | 
|  | vpxor      ymm0, ymm0, ymm0 | 
|  | vpermd     ymm5, ymm0, ymm5 | 
|  |  | 
|  | xloop: | 
|  | vmovdqu    ymm0, [esi] | 
|  | vmovdqu    ymm2, [esi + edx] | 
|  | vpunpckhbw ymm1, ymm0, ymm2  // mutates | 
|  | vpunpcklbw ymm0, ymm0, ymm2  // mutates | 
|  | vpmaddubsw ymm0, ymm0, ymm5 | 
|  | vpmaddubsw ymm1, ymm1, ymm5 | 
|  | vpsrlw     ymm0, ymm0, 7 | 
|  | vpsrlw     ymm1, ymm1, 7 | 
|  | vpackuswb  ymm0, ymm0, ymm1  // unmutates | 
|  | vmovdqu    [esi + edi], ymm0 | 
|  | lea        esi, [esi + 32] | 
|  | sub        ecx, 32 | 
|  | jg         xloop | 
|  | jmp        xloop99 | 
|  |  | 
|  | // Blend 25 / 75. | 
|  | xloop25: | 
|  | vmovdqu    ymm0, [esi] | 
|  | vmovdqu    ymm1, [esi + edx] | 
|  | vpavgb     ymm0, ymm0, ymm1 | 
|  | vpavgb     ymm0, ymm0, ymm1 | 
|  | vmovdqu    [esi + edi], ymm0 | 
|  | lea        esi, [esi + 32] | 
|  | sub        ecx, 32 | 
|  | jg         xloop25 | 
|  | jmp        xloop99 | 
|  |  | 
|  | // Blend 50 / 50. | 
|  | xloop50: | 
|  | vmovdqu    ymm0, [esi] | 
|  | vpavgb     ymm0, ymm0, [esi + edx] | 
|  | vmovdqu    [esi + edi], ymm0 | 
|  | lea        esi, [esi + 32] | 
|  | sub        ecx, 32 | 
|  | jg         xloop50 | 
|  | jmp        xloop99 | 
|  |  | 
|  | // Blend 75 / 25. | 
|  | xloop75: | 
|  | vmovdqu    ymm1, [esi] | 
|  | vmovdqu    ymm0, [esi + edx] | 
|  | vpavgb     ymm0, ymm0, ymm1 | 
|  | vpavgb     ymm0, ymm0, ymm1 | 
|  | vmovdqu    [esi + edi], ymm0 | 
|  | lea        esi, [esi + 32] | 
|  | sub        ecx, 32 | 
|  | jg         xloop75 | 
|  | jmp        xloop99 | 
|  |  | 
|  | // Blend 100 / 0 - Copy row unchanged. | 
|  | xloop100: | 
|  | rep movsb | 
|  |  | 
|  | xloop99: | 
|  | pop        edi | 
|  | pop        esi | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_INTERPOLATEROW_AVX2 | 
|  |  | 
|  | // Bilinear filter 16x2 -> 16x1 | 
|  | __declspec(naked) | 
|  | void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, | 
|  | ptrdiff_t src_stride, int dst_width, | 
|  | int source_y_fraction) { | 
|  | __asm { | 
|  | push       esi | 
|  | push       edi | 
|  | mov        edi, [esp + 8 + 4]   // dst_ptr | 
|  | mov        esi, [esp + 8 + 8]   // src_ptr | 
|  | mov        edx, [esp + 8 + 12]  // src_stride | 
|  | mov        ecx, [esp + 8 + 16]  // dst_width | 
|  | mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255) | 
|  | sub        edi, esi | 
|  | shr        eax, 1 | 
|  | // Dispatch to specialized filters if applicable. | 
|  | cmp        eax, 0 | 
|  | je         xloop100  // 0 / 128.  Blend 100 / 0. | 
|  | cmp        eax, 32 | 
|  | je         xloop75   // 32 / 128 is 0.25.  Blend 75 / 25. | 
|  | cmp        eax, 64 | 
|  | je         xloop50   // 64 / 128 is 0.50.  Blend 50 / 50. | 
|  | cmp        eax, 96 | 
|  | je         xloop25   // 96 / 128 is 0.75.  Blend 25 / 75. | 
|  |  | 
|  | movd       xmm0, eax  // high fraction 0..127 | 
|  | neg        eax | 
|  | add        eax, 128 | 
|  | movd       xmm5, eax  // low fraction 128..1 | 
|  | punpcklbw  xmm5, xmm0 | 
|  | punpcklwd  xmm5, xmm5 | 
|  | pshufd     xmm5, xmm5, 0 | 
|  |  | 
|  | xloop: | 
|  | movdqu     xmm0, [esi] | 
|  | movdqu     xmm2, [esi + edx] | 
|  | movdqu     xmm1, xmm0 | 
|  | punpcklbw  xmm0, xmm2 | 
|  | punpckhbw  xmm1, xmm2 | 
|  | pmaddubsw  xmm0, xmm5 | 
|  | pmaddubsw  xmm1, xmm5 | 
|  | psrlw      xmm0, 7 | 
|  | psrlw      xmm1, 7 | 
|  | packuswb   xmm0, xmm1 | 
|  | movdqu     [esi + edi], xmm0 | 
|  | lea        esi, [esi + 16] | 
|  | sub        ecx, 16 | 
|  | jg         xloop | 
|  | jmp        xloop99 | 
|  |  | 
|  | // Blend 25 / 75. | 
|  | xloop25: | 
|  | movdqu     xmm0, [esi] | 
|  | movdqu     xmm1, [esi + edx] | 
|  | pavgb      xmm0, xmm1 | 
|  | pavgb      xmm0, xmm1 | 
|  | movdqu     [esi + edi], xmm0 | 
|  | lea        esi, [esi + 16] | 
|  | sub        ecx, 16 | 
|  | jg         xloop25 | 
|  | jmp        xloop99 | 
|  |  | 
|  | // Blend 50 / 50. | 
|  | xloop50: | 
|  | movdqu     xmm0, [esi] | 
|  | movdqu     xmm1, [esi + edx] | 
|  | pavgb      xmm0, xmm1 | 
|  | movdqu     [esi + edi], xmm0 | 
|  | lea        esi, [esi + 16] | 
|  | sub        ecx, 16 | 
|  | jg         xloop50 | 
|  | jmp        xloop99 | 
|  |  | 
|  | // Blend 75 / 25. | 
|  | xloop75: | 
|  | movdqu     xmm1, [esi] | 
|  | movdqu     xmm0, [esi + edx] | 
|  | pavgb      xmm0, xmm1 | 
|  | pavgb      xmm0, xmm1 | 
|  | movdqu     [esi + edi], xmm0 | 
|  | lea        esi, [esi + 16] | 
|  | sub        ecx, 16 | 
|  | jg         xloop75 | 
|  | jmp        xloop99 | 
|  |  | 
|  | // Blend 100 / 0 - Copy row unchanged. | 
|  | xloop100: | 
|  | movdqu     xmm0, [esi] | 
|  | movdqu     [esi + edi], xmm0 | 
|  | lea        esi, [esi + 16] | 
|  | sub        ecx, 16 | 
|  | jg         xloop100 | 
|  |  | 
|  | xloop99: | 
|  | pop        edi | 
|  | pop        esi | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | #ifdef HAS_INTERPOLATEROW_SSE2 | 
|  | // Bilinear filter 16x2 -> 16x1 | 
|  | __declspec(naked) | 
|  | void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, | 
|  | ptrdiff_t src_stride, int dst_width, | 
|  | int source_y_fraction) { | 
|  | __asm { | 
|  | push       esi | 
|  | push       edi | 
|  | mov        edi, [esp + 8 + 4]   // dst_ptr | 
|  | mov        esi, [esp + 8 + 8]   // src_ptr | 
|  | mov        edx, [esp + 8 + 12]  // src_stride | 
|  | mov        ecx, [esp + 8 + 16]  // dst_width | 
|  | mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255) | 
|  | sub        edi, esi | 
|  | // Dispatch to specialized filters if applicable. | 
|  | cmp        eax, 0 | 
|  | je         xloop100  // 0 / 256.  Blend 100 / 0. | 
|  | cmp        eax, 64 | 
|  | je         xloop75   // 64 / 256 is 0.25.  Blend 75 / 25. | 
|  | cmp        eax, 128 | 
|  | je         xloop50   // 128 / 256 is 0.50.  Blend 50 / 50. | 
|  | cmp        eax, 192 | 
|  | je         xloop25   // 192 / 256 is 0.75.  Blend 25 / 75. | 
|  |  | 
|  | movd       xmm5, eax            // xmm5 = y fraction | 
|  | punpcklbw  xmm5, xmm5 | 
|  | psrlw      xmm5, 1 | 
|  | punpcklwd  xmm5, xmm5 | 
|  | punpckldq  xmm5, xmm5 | 
|  | punpcklqdq xmm5, xmm5 | 
|  | pxor       xmm4, xmm4 | 
|  |  | 
|  | xloop: | 
|  | movdqu     xmm0, [esi]  // row0 | 
|  | movdqu     xmm2, [esi + edx]  // row1 | 
|  | movdqu     xmm1, xmm0 | 
|  | movdqu     xmm3, xmm2 | 
|  | punpcklbw  xmm2, xmm4 | 
|  | punpckhbw  xmm3, xmm4 | 
|  | punpcklbw  xmm0, xmm4 | 
|  | punpckhbw  xmm1, xmm4 | 
|  | psubw      xmm2, xmm0  // row1 - row0 | 
|  | psubw      xmm3, xmm1 | 
|  | paddw      xmm2, xmm2  // 9 bits * 15 bits = 8.16 | 
|  | paddw      xmm3, xmm3 | 
|  | pmulhw     xmm2, xmm5  // scale diff | 
|  | pmulhw     xmm3, xmm5 | 
|  | paddw      xmm0, xmm2  // sum rows | 
|  | paddw      xmm1, xmm3 | 
|  | packuswb   xmm0, xmm1 | 
|  | movdqu     [esi + edi], xmm0 | 
|  | lea        esi, [esi + 16] | 
|  | sub        ecx, 16 | 
|  | jg         xloop | 
|  | jmp        xloop99 | 
|  |  | 
|  | // Blend 25 / 75. | 
|  | xloop25: | 
|  | movdqu     xmm0, [esi] | 
|  | movdqu     xmm1, [esi + edx] | 
|  | pavgb      xmm0, xmm1 | 
|  | pavgb      xmm0, xmm1 | 
|  | movdqu     [esi + edi], xmm0 | 
|  | lea        esi, [esi + 16] | 
|  | sub        ecx, 16 | 
|  | jg         xloop25 | 
|  | jmp        xloop99 | 
|  |  | 
|  | // Blend 50 / 50. | 
|  | xloop50: | 
|  | movdqu     xmm0, [esi] | 
|  | movdqu     xmm1, [esi + edx] | 
|  | pavgb      xmm0, xmm1 | 
|  | movdqu     [esi + edi], xmm0 | 
|  | lea        esi, [esi + 16] | 
|  | sub        ecx, 16 | 
|  | jg         xloop50 | 
|  | jmp        xloop99 | 
|  |  | 
|  | // Blend 75 / 25. | 
|  | xloop75: | 
|  | movdqu     xmm1, [esi] | 
|  | movdqu     xmm0, [esi + edx] | 
|  | pavgb      xmm0, xmm1 | 
|  | pavgb      xmm0, xmm1 | 
|  | movdqu     [esi + edi], xmm0 | 
|  | lea        esi, [esi + 16] | 
|  | sub        ecx, 16 | 
|  | jg         xloop75 | 
|  | jmp        xloop99 | 
|  |  | 
|  | // Blend 100 / 0 - Copy row unchanged. | 
|  | xloop100: | 
|  | movdqu     xmm0, [esi] | 
|  | movdqu     [esi + edi], xmm0 | 
|  | lea        esi, [esi + 16] | 
|  | sub        ecx, 16 | 
|  | jg         xloop100 | 
|  |  | 
|  | xloop99: | 
|  | pop        edi | 
|  | pop        esi | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_INTERPOLATEROW_SSE2 | 
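|  |  |
|  | // Editorial sketch (not part of the original source): the blend performed by |
|  | // the InterpolateRow variants above.  source_y_fraction selects how much of |
|  | // the next row (src_ptr + src_stride) is mixed in, out of 256; the SIMD paths |
|  | // special-case 0%, 25%, 50% and 75%.  The name is an assumption of this sketch. |
|  | static void InterpolateRow_C_Sketch(uint8* dst_ptr, const uint8* src_ptr, |
|  |                                     ptrdiff_t src_stride, int width, |
|  |                                     int source_y_fraction) { |
|  |   const uint8* src_ptr1 = src_ptr + src_stride; |
|  |   int y1 = source_y_fraction;  // Weight of the lower row, 0..255. |
|  |   int y0 = 256 - y1;           // Weight of the upper row. |
|  |   int x; |
|  |   for (x = 0; x < width; ++x) { |
|  |     dst_ptr[x] = (uint8)((src_ptr[x] * y0 + src_ptr1[x] * y1) >> 8); |
|  |   } |
|  | } |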
|  |  | 
|  | // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. | 
|  | __declspec(naked) | 
|  | void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, | 
|  | const uint8* shuffler, int pix) { | 
|  | __asm { | 
|  | mov        eax, [esp + 4]    // src_argb | 
|  | mov        edx, [esp + 8]    // dst_argb | 
|  | mov        ecx, [esp + 12]   // shuffler | 
|  | movdqu     xmm5, [ecx] | 
|  | mov        ecx, [esp + 16]   // pix | 
|  |  | 
|  | wloop: | 
|  | movdqu     xmm0, [eax] | 
|  | movdqu     xmm1, [eax + 16] | 
|  | lea        eax, [eax + 32] | 
|  | pshufb     xmm0, xmm5 | 
|  | pshufb     xmm1, xmm5 | 
|  | movdqu     [edx], xmm0 | 
|  | movdqu     [edx + 16], xmm1 | 
|  | lea        edx, [edx + 32] | 
|  | sub        ecx, 8 | 
|  | jg         wloop | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | #ifdef HAS_ARGBSHUFFLEROW_AVX2 | 
|  | __declspec(naked) | 
|  | void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, | 
|  | const uint8* shuffler, int pix) { | 
|  | __asm { | 
|  | mov        eax, [esp + 4]     // src_argb | 
|  | mov        edx, [esp + 8]     // dst_argb | 
|  | mov        ecx, [esp + 12]    // shuffler | 
|  | vbroadcastf128 ymm5, [ecx]    // same shuffle in high as low. | 
|  | mov        ecx, [esp + 16]    // pix | 
|  |  | 
|  | wloop: | 
|  | vmovdqu    ymm0, [eax] | 
|  | vmovdqu    ymm1, [eax + 32] | 
|  | lea        eax, [eax + 64] | 
|  | vpshufb    ymm0, ymm0, ymm5 | 
|  | vpshufb    ymm1, ymm1, ymm5 | 
|  | vmovdqu    [edx], ymm0 | 
|  | vmovdqu    [edx + 32], ymm1 | 
|  | lea        edx, [edx + 64] | 
|  | sub        ecx, 16 | 
|  | jg         wloop | 
|  |  | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_ARGBSHUFFLEROW_AVX2 | 
|  |  | 
|  | __declspec(naked) | 
|  | void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, | 
|  | const uint8* shuffler, int pix) { | 
|  | __asm { | 
|  | push       ebx | 
|  | push       esi | 
|  | mov        eax, [esp + 8 + 4]    // src_argb | 
|  | mov        edx, [esp + 8 + 8]    // dst_argb | 
|  | mov        esi, [esp + 8 + 12]   // shuffler | 
|  | mov        ecx, [esp + 8 + 16]   // pix | 
|  | pxor       xmm5, xmm5 | 
|  |  | 
|  | mov        ebx, [esi]   // shuffler | 
|  | cmp        ebx, 0x03000102 | 
|  | je         shuf_3012 | 
|  | cmp        ebx, 0x00010203 | 
|  | je         shuf_0123 | 
|  | cmp        ebx, 0x00030201 | 
|  | je         shuf_0321 | 
|  | cmp        ebx, 0x02010003 | 
|  | je         shuf_2103 | 
|  |  | 
|  | // TODO(fbarchard): Use one source pointer and 3 offsets. | 
|  | shuf_any1: | 
|  | movzx      ebx, byte ptr [esi] | 
|  | movzx      ebx, byte ptr [eax + ebx] | 
|  | mov        [edx], bl | 
|  | movzx      ebx, byte ptr [esi + 1] | 
|  | movzx      ebx, byte ptr [eax + ebx] | 
|  | mov        [edx + 1], bl | 
|  | movzx      ebx, byte ptr [esi + 2] | 
|  | movzx      ebx, byte ptr [eax + ebx] | 
|  | mov        [edx + 2], bl | 
|  | movzx      ebx, byte ptr [esi + 3] | 
|  | movzx      ebx, byte ptr [eax + ebx] | 
|  | mov        [edx + 3], bl | 
|  | lea        eax, [eax + 4] | 
|  | lea        edx, [edx + 4] | 
|  | sub        ecx, 1 | 
|  | jg         shuf_any1 | 
|  | jmp        shuf99 | 
|  |  | 
|  | shuf_0123: | 
|  | movdqu     xmm0, [eax] | 
|  | lea        eax, [eax + 16] | 
|  | movdqa     xmm1, xmm0 | 
|  | punpcklbw  xmm0, xmm5 | 
|  | punpckhbw  xmm1, xmm5 | 
|  | pshufhw    xmm0, xmm0, 01Bh   // 1B = 00011011 = 0x0123 = BGRAToARGB | 
|  | pshuflw    xmm0, xmm0, 01Bh | 
|  | pshufhw    xmm1, xmm1, 01Bh | 
|  | pshuflw    xmm1, xmm1, 01Bh | 
|  | packuswb   xmm0, xmm1 | 
|  | movdqu     [edx], xmm0 | 
|  | lea        edx, [edx + 16] | 
|  | sub        ecx, 4 | 
|  | jg         shuf_0123 | 
|  | jmp        shuf99 | 
|  |  | 
|  | shuf_0321: | 
|  | movdqu     xmm0, [eax] | 
|  | lea        eax, [eax + 16] | 
|  | movdqa     xmm1, xmm0 | 
|  | punpcklbw  xmm0, xmm5 | 
|  | punpckhbw  xmm1, xmm5 | 
|  | pshufhw    xmm0, xmm0, 039h   // 39 = 00111001 = 0x0321 = RGBAToARGB | 
|  | pshuflw    xmm0, xmm0, 039h | 
|  | pshufhw    xmm1, xmm1, 039h | 
|  | pshuflw    xmm1, xmm1, 039h | 
|  | packuswb   xmm0, xmm1 | 
|  | movdqu     [edx], xmm0 | 
|  | lea        edx, [edx + 16] | 
|  | sub        ecx, 4 | 
|  | jg         shuf_0321 | 
|  | jmp        shuf99 | 
|  |  | 
|  | shuf_2103: | 
|  | movdqu     xmm0, [eax] | 
|  | lea        eax, [eax + 16] | 
|  | movdqa     xmm1, xmm0 | 
|  | punpcklbw  xmm0, xmm5 | 
|  | punpckhbw  xmm1, xmm5 | 
|  | pshufhw    xmm0, xmm0, 093h   // 93 = 10010011 = 0x2103 = ARGBToRGBA | 
|  | pshuflw    xmm0, xmm0, 093h | 
|  | pshufhw    xmm1, xmm1, 093h | 
|  | pshuflw    xmm1, xmm1, 093h | 
|  | packuswb   xmm0, xmm1 | 
|  | movdqu     [edx], xmm0 | 
|  | lea        edx, [edx + 16] | 
|  | sub        ecx, 4 | 
|  | jg         shuf_2103 | 
|  | jmp        shuf99 | 
|  |  | 
|  | shuf_3012: | 
|  | movdqu     xmm0, [eax] | 
|  | lea        eax, [eax + 16] | 
|  | movdqa     xmm1, xmm0 | 
|  | punpcklbw  xmm0, xmm5 | 
|  | punpckhbw  xmm1, xmm5 | 
|  | pshufhw    xmm0, xmm0, 0C6h   // C6 = 11000110 = 0x3012 = ABGRToARGB | 
|  | pshuflw    xmm0, xmm0, 0C6h | 
|  | pshufhw    xmm1, xmm1, 0C6h | 
|  | pshuflw    xmm1, xmm1, 0C6h | 
|  | packuswb   xmm0, xmm1 | 
|  | movdqu     [edx], xmm0 | 
|  | lea        edx, [edx + 16] | 
|  | sub        ecx, 4 | 
|  | jg         shuf_3012 | 
|  |  | 
|  | shuf99: | 
|  | pop        esi | 
|  | pop        ebx | 
|  | ret | 
|  | } | 
|  | } | 
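|  |  |
|  | // Editorial sketch (not part of the original source): the byte permutation the |
|  | // ARGBShuffleRow variants perform.  'shuffler' gives, for each of the 4 output |
|  | // bytes of a pixel, the index of the source byte within that pixel; the SSSE3 |
|  | // and AVX2 versions extend the same 16-byte control to whole registers.  The |
|  | // name is an assumption of this sketch. |
|  | static void ARGBShuffleRow_C_Sketch(const uint8* src_argb, uint8* dst_argb, |
|  |                                     const uint8* shuffler, int pix) { |
|  |   int x; |
|  |   for (x = 0; x < pix; ++x) { |
|  |     dst_argb[0] = src_argb[shuffler[0]]; |
|  |     dst_argb[1] = src_argb[shuffler[1]]; |
|  |     dst_argb[2] = src_argb[shuffler[2]]; |
|  |     dst_argb[3] = src_argb[shuffler[3]]; |
|  |     src_argb += 4; |
|  |     dst_argb += 4; |
|  |   } |
|  | } |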
|  |  | 
|  | // YUY2 - Macro-pixel = 2 image pixels | 
|  | // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4.... | 
|  |  | 
|  | // UYVY - Macro-pixel = 2 image pixels | 
|  | // U0Y0V0Y1 | 
|  |  | 
|  | __declspec(naked) | 
|  | void I422ToYUY2Row_SSE2(const uint8* src_y, | 
|  | const uint8* src_u, | 
|  | const uint8* src_v, | 
|  | uint8* dst_frame, int width) { | 
|  | __asm { | 
|  | push       esi | 
|  | push       edi | 
|  | mov        eax, [esp + 8 + 4]    // src_y | 
|  | mov        esi, [esp + 8 + 8]    // src_u | 
|  | mov        edx, [esp + 8 + 12]   // src_v | 
|  | mov        edi, [esp + 8 + 16]   // dst_frame | 
|  | mov        ecx, [esp + 8 + 20]   // width | 
|  | sub        edx, esi | 
|  |  | 
|  | convertloop: | 
|  | movq       xmm2, qword ptr [esi] // U | 
|  | movq       xmm3, qword ptr [esi + edx] // V | 
|  | lea        esi, [esi + 8] | 
|  | punpcklbw  xmm2, xmm3 // UV | 
|  | movdqu     xmm0, [eax] // Y | 
|  | lea        eax, [eax + 16] | 
|  | movdqa     xmm1, xmm0 | 
|  | punpcklbw  xmm0, xmm2 // YUYV | 
|  | punpckhbw  xmm1, xmm2 | 
|  | movdqu     [edi], xmm0 | 
|  | movdqu     [edi + 16], xmm1 | 
|  | lea        edi, [edi + 32] | 
|  | sub        ecx, 16 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        edi | 
|  | pop        esi | 
|  | ret | 
|  | } | 
|  | } | 
|  |  | 
|  | __declspec(naked) | 
|  | void I422ToUYVYRow_SSE2(const uint8* src_y, | 
|  | const uint8* src_u, | 
|  | const uint8* src_v, | 
|  | uint8* dst_frame, int width) { | 
|  | __asm { | 
|  | push       esi | 
|  | push       edi | 
|  | mov        eax, [esp + 8 + 4]    // src_y | 
|  | mov        esi, [esp + 8 + 8]    // src_u | 
|  | mov        edx, [esp + 8 + 12]   // src_v | 
|  | mov        edi, [esp + 8 + 16]   // dst_frame | 
|  | mov        ecx, [esp + 8 + 20]   // width | 
|  | sub        edx, esi | 
|  |  | 
|  | convertloop: | 
|  | movq       xmm2, qword ptr [esi] // U | 
|  | movq       xmm3, qword ptr [esi + edx] // V | 
|  | lea        esi, [esi + 8] | 
|  | punpcklbw  xmm2, xmm3 // UV | 
|  | movdqu     xmm0, [eax] // Y | 
|  | movdqa     xmm1, xmm2 | 
|  | lea        eax, [eax + 16] | 
|  | punpcklbw  xmm1, xmm0 // UYVY | 
|  | punpckhbw  xmm2, xmm0 | 
|  | movdqu     [edi], xmm1 | 
|  | movdqu     [edi + 16], xmm2 | 
|  | lea        edi, [edi + 32] | 
|  | sub        ecx, 16 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        edi | 
|  | pop        esi | 
|  | ret | 
|  | } | 
|  | } | 
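|  |  |
|  | // Editorial sketch (not part of the original source): the interleave performed |
|  | // by I422ToYUY2Row_SSE2 / I422ToUYVYRow_SSE2 above.  Two luma samples share |
|  | // one U and one V sample per 4-byte macro-pixel; only the byte order differs |
|  | // between the two formats.  The name is an assumption of this sketch. |
|  | static void I422ToYUY2Row_C_Sketch(const uint8* src_y, const uint8* src_u, |
|  |                                    const uint8* src_v, uint8* dst_frame, |
|  |                                    int width) { |
|  |   int x; |
|  |   for (x = 0; x < width - 1; x += 2) { |
|  |     dst_frame[0] = src_y[0];  // For UYVY the order would be U, Y0, V, Y1. |
|  |     dst_frame[1] = src_u[0]; |
|  |     dst_frame[2] = src_y[1]; |
|  |     dst_frame[3] = src_v[0]; |
|  |     dst_frame += 4; |
|  |     src_y += 2; |
|  |     src_u += 1; |
|  |     src_v += 1; |
|  |   } |
|  | } |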
|  |  | 
|  | #ifdef HAS_ARGBPOLYNOMIALROW_SSE2 | 
|  | __declspec(naked) | 
|  | void ARGBPolynomialRow_SSE2(const uint8* src_argb, | 
|  | uint8* dst_argb, const float* poly, | 
|  | int width) { | 
|  | __asm { | 
|  | push       esi | 
|  | mov        eax, [esp + 4 + 4]   /* src_argb */ | 
|  | mov        edx, [esp + 4 + 8]   /* dst_argb */ | 
|  | mov        esi, [esp + 4 + 12]  /* poly */ | 
|  | mov        ecx, [esp + 4 + 16]  /* width */ | 
|  | pxor       xmm3, xmm3  // 0 constant for zero extending bytes to ints. | 
|  |  | 
|  | // 2 pixel loop. | 
|  | convertloop: | 
|  | //    pmovzxbd  xmm0, dword ptr [eax]  // BGRA pixel | 
|  | //    pmovzxbd  xmm4, dword ptr [eax + 4]  // BGRA pixel | 
|  | movq       xmm0, qword ptr [eax]  // BGRABGRA | 
|  | lea        eax, [eax + 8] | 
|  | punpcklbw  xmm0, xmm3 | 
|  | movdqa     xmm4, xmm0 | 
|  | punpcklwd  xmm0, xmm3  // pixel 0 | 
|  | punpckhwd  xmm4, xmm3  // pixel 1 | 
|  | cvtdq2ps   xmm0, xmm0  // 4 floats | 
|  | cvtdq2ps   xmm4, xmm4 | 
|  | movdqa     xmm1, xmm0  // X | 
|  | movdqa     xmm5, xmm4 | 
|  | mulps      xmm0, [esi + 16]  // C1 * X | 
|  | mulps      xmm4, [esi + 16] | 
|  | addps      xmm0, [esi]  // result = C0 + C1 * X | 
|  | addps      xmm4, [esi] | 
|  | movdqa     xmm2, xmm1 | 
|  | movdqa     xmm6, xmm5 | 
|  | mulps      xmm2, xmm1  // X * X | 
|  | mulps      xmm6, xmm5 | 
|  | mulps      xmm1, xmm2  // X * X * X | 
|  | mulps      xmm5, xmm6 | 
|  | mulps      xmm2, [esi + 32]  // C2 * X * X | 
|  | mulps      xmm6, [esi + 32] | 
|  | mulps      xmm1, [esi + 48]  // C3 * X * X * X | 
|  | mulps      xmm5, [esi + 48] | 
|  | addps      xmm0, xmm2  // result += C2 * X * X | 
|  | addps      xmm4, xmm6 | 
|  | addps      xmm0, xmm1  // result += C3 * X * X * X | 
|  | addps      xmm4, xmm5 | 
|  | cvttps2dq  xmm0, xmm0 | 
|  | cvttps2dq  xmm4, xmm4 | 
|  | packuswb   xmm0, xmm4 | 
|  | packuswb   xmm0, xmm0 | 
|  | movq       qword ptr [edx], xmm0 | 
|  | lea        edx, [edx + 8] | 
|  | sub        ecx, 2 | 
|  | jg         convertloop | 
|  | pop        esi | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_ARGBPOLYNOMIALROW_SSE2 | 
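|  |  |
|  | // Editorial sketch (not part of the original source): the per-byte cubic that |
|  | // the polynomial rows evaluate.  'poly' holds 4 coefficient sets of 4 floats |
|  | // (C0..C3, one float per channel), and each byte X maps to |
|  | // C0 + C1*X + C2*X*X + C3*X*X*X, then is clamped back to a byte (the SIMD |
|  | // paths truncate via cvttps2dq and saturate via pack).  The name is an |
|  | // assumption of this sketch. |
|  | static void ARGBPolynomialRow_C_Sketch(const uint8* src_argb, uint8* dst_argb, |
|  |                                        const float* poly, int width) { |
|  |   int i; |
|  |   for (i = 0; i < width * 4; ++i) { |
|  |     int c = i & 3;  // Channel index within the pixel. |
|  |     float x = (float)src_argb[i]; |
|  |     float v = poly[c] + poly[4 + c] * x + poly[8 + c] * x * x + |
|  |               poly[12 + c] * x * x * x; |
|  |     if (v < 0.f) v = 0.f; |
|  |     if (v > 255.f) v = 255.f; |
|  |     dst_argb[i] = (uint8)v; |
|  |   } |
|  | } |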
|  |  | 
|  | #ifdef HAS_ARGBPOLYNOMIALROW_AVX2 | 
|  | __declspec(naked) | 
|  | void ARGBPolynomialRow_AVX2(const uint8* src_argb, | 
|  | uint8* dst_argb, const float* poly, | 
|  | int width) { | 
|  | __asm { | 
|  | mov        eax, [esp + 4]   /* src_argb */ | 
|  | mov        edx, [esp + 8]   /* dst_argb */ | 
|  | mov        ecx, [esp + 12]   /* poly */ | 
|  | vbroadcastf128 ymm4, [ecx]       // C0 | 
|  | vbroadcastf128 ymm5, [ecx + 16]  // C1 | 
|  | vbroadcastf128 ymm6, [ecx + 32]  // C2 | 
|  | vbroadcastf128 ymm7, [ecx + 48]  // C3 | 
|  | mov        ecx, [esp + 16]  /* width */ | 
|  |  | 
|  | // 2 pixel loop. | 
|  | convertloop: | 
|  | vpmovzxbd   ymm0, qword ptr [eax]  // 2 BGRA pixels | 
|  | lea         eax, [eax + 8] | 
|  | vcvtdq2ps   ymm0, ymm0        // X 8 floats | 
|  | vmulps      ymm2, ymm0, ymm0  // X * X | 
|  | vmulps      ymm3, ymm0, ymm7  // C3 * X | 
|  | vfmadd132ps ymm0, ymm4, ymm5  // result = C0 + C1 * X | 
|  | vfmadd231ps ymm0, ymm2, ymm6  // result += C2 * X * X | 
|  | vfmadd231ps ymm0, ymm2, ymm3  // result += C3 * X * X * X | 
|  | vcvttps2dq  ymm0, ymm0 | 
|  | vpackusdw   ymm0, ymm0, ymm0  // b0g0r0a0_00000000_b0g0r0a0_00000000 | 
|  | vpermq      ymm0, ymm0, 0xd8  // b0g0r0a0_b0g0r0a0_00000000_00000000 | 
|  | vpackuswb   xmm0, xmm0, xmm0  // bgrabgra_00000000_00000000_00000000 | 
|  | vmovq       qword ptr [edx], xmm0 | 
|  | lea         edx, [edx + 8] | 
|  | sub         ecx, 2 | 
|  | jg          convertloop | 
|  | vzeroupper | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_ARGBPOLYNOMIALROW_AVX2 | 
|  |  | 
|  | #ifdef HAS_ARGBCOLORTABLEROW_X86 | 
|  | // Transform ARGB pixels with color table. |
|  | __declspec(naked) | 
|  | void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, | 
|  | int width) { | 
|  | __asm { | 
|  | push       esi | 
|  | mov        eax, [esp + 4 + 4]   /* dst_argb */ | 
|  | mov        esi, [esp + 4 + 8]   /* table_argb */ | 
|  | mov        ecx, [esp + 4 + 12]  /* width */ | 
|  |  | 
|  | // 1 pixel loop. | 
|  | convertloop: | 
|  | movzx      edx, byte ptr [eax] | 
|  | lea        eax, [eax + 4] | 
|  | movzx      edx, byte ptr [esi + edx * 4] | 
|  | mov        byte ptr [eax - 4], dl | 
|  | movzx      edx, byte ptr [eax - 4 + 1] | 
|  | movzx      edx, byte ptr [esi + edx * 4 + 1] | 
|  | mov        byte ptr [eax - 4 + 1], dl | 
|  | movzx      edx, byte ptr [eax - 4 + 2] | 
|  | movzx      edx, byte ptr [esi + edx * 4 + 2] | 
|  | mov        byte ptr [eax - 4 + 2], dl | 
|  | movzx      edx, byte ptr [eax - 4 + 3] | 
|  | movzx      edx, byte ptr [esi + edx * 4 + 3] | 
|  | mov        byte ptr [eax - 4 + 3], dl | 
|  | dec        ecx | 
|  | jg         convertloop | 
|  | pop        esi | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_ARGBCOLORTABLEROW_X86 | 
|  |  | 
|  | #ifdef HAS_RGBCOLORTABLEROW_X86 | 
|  | // Transform RGB pixels with color table. |
|  | __declspec(naked) | 
|  | void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { | 
|  | __asm { | 
|  | push       esi | 
|  | mov        eax, [esp + 4 + 4]   /* dst_argb */ | 
|  | mov        esi, [esp + 4 + 8]   /* table_argb */ | 
|  | mov        ecx, [esp + 4 + 12]  /* width */ | 
|  |  | 
|  | // 1 pixel loop. | 
|  | convertloop: | 
|  | movzx      edx, byte ptr [eax] | 
|  | lea        eax, [eax + 4] | 
|  | movzx      edx, byte ptr [esi + edx * 4] | 
|  | mov        byte ptr [eax - 4], dl | 
|  | movzx      edx, byte ptr [eax - 4 + 1] | 
|  | movzx      edx, byte ptr [esi + edx * 4 + 1] | 
|  | mov        byte ptr [eax - 4 + 1], dl | 
|  | movzx      edx, byte ptr [eax - 4 + 2] | 
|  | movzx      edx, byte ptr [esi + edx * 4 + 2] | 
|  | mov        byte ptr [eax - 4 + 2], dl | 
|  | dec        ecx | 
|  | jg         convertloop | 
|  |  | 
|  | pop        esi | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_RGBCOLORTABLEROW_X86 | 
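|  |  |
|  | // Editorial sketch (not part of the original source): the lookup done by the |
|  | // color table rows above.  table_argb holds 256 entries of 4 bytes; each |
|  | // channel of each pixel indexes its own byte lane within the table (the RGB |
|  | // variant skips the alpha lane).  The name is an assumption of this sketch. |
|  | static void ARGBColorTableRow_C_Sketch(uint8* dst_argb, |
|  |                                        const uint8* table_argb, int width) { |
|  |   int x; |
|  |   for (x = 0; x < width; ++x) { |
|  |     dst_argb[0] = table_argb[dst_argb[0] * 4 + 0]; |
|  |     dst_argb[1] = table_argb[dst_argb[1] * 4 + 1]; |
|  |     dst_argb[2] = table_argb[dst_argb[2] * 4 + 2]; |
|  |     dst_argb[3] = table_argb[dst_argb[3] * 4 + 3]; |
|  |     dst_argb += 4; |
|  |   } |
|  | } |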
|  |  | 
|  | #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 
|  | // Transform RGB pixels with luma table. |
|  | __declspec(naked) | 
|  | void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, | 
|  | int width, | 
|  | const uint8* luma, uint32 lumacoeff) { | 
|  | __asm { | 
|  | push       esi | 
|  | push       edi | 
|  | mov        eax, [esp + 8 + 4]   /* src_argb */ | 
|  | mov        edi, [esp + 8 + 8]   /* dst_argb */ | 
|  | mov        ecx, [esp + 8 + 12]  /* width */ | 
|  | movd       xmm2, dword ptr [esp + 8 + 16]  // luma table | 
|  | movd       xmm3, dword ptr [esp + 8 + 20]  // lumacoeff | 
|  | pshufd     xmm2, xmm2, 0 | 
|  | pshufd     xmm3, xmm3, 0 | 
|  | pcmpeqb    xmm4, xmm4        // generate mask 0xff00ff00 | 
|  | psllw      xmm4, 8 | 
|  | pxor       xmm5, xmm5 | 
|  |  | 
|  | // 4 pixel loop. | 
|  | convertloop: | 
|  | movdqu     xmm0, qword ptr [eax]      // generate luma ptr | 
|  | pmaddubsw  xmm0, xmm3 | 
|  | phaddw     xmm0, xmm0 | 
|  | pand       xmm0, xmm4  // mask out low bits | 
|  | punpcklwd  xmm0, xmm5 | 
|  | paddd      xmm0, xmm2  // add table base | 
|  | movd       esi, xmm0 | 
|  | pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32 | 
|  |  | 
|  | movzx      edx, byte ptr [eax] | 
|  | movzx      edx, byte ptr [esi + edx] | 
|  | mov        byte ptr [edi], dl | 
|  | movzx      edx, byte ptr [eax + 1] | 
|  | movzx      edx, byte ptr [esi + edx] | 
|  | mov        byte ptr [edi + 1], dl | 
|  | movzx      edx, byte ptr [eax + 2] | 
|  | movzx      edx, byte ptr [esi + edx] | 
|  | mov        byte ptr [edi + 2], dl | 
|  | movzx      edx, byte ptr [eax + 3]  // copy alpha. | 
|  | mov        byte ptr [edi + 3], dl | 
|  |  | 
|  | movd       esi, xmm0 | 
|  | pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32 | 
|  |  | 
|  | movzx      edx, byte ptr [eax + 4] | 
|  | movzx      edx, byte ptr [esi + edx] | 
|  | mov        byte ptr [edi + 4], dl | 
|  | movzx      edx, byte ptr [eax + 5] | 
|  | movzx      edx, byte ptr [esi + edx] | 
|  | mov        byte ptr [edi + 5], dl | 
|  | movzx      edx, byte ptr [eax + 6] | 
|  | movzx      edx, byte ptr [esi + edx] | 
|  | mov        byte ptr [edi + 6], dl | 
|  | movzx      edx, byte ptr [eax + 7]  // copy alpha. | 
|  | mov        byte ptr [edi + 7], dl | 
|  |  | 
|  | movd       esi, xmm0 | 
|  | pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32 | 
|  |  | 
|  | movzx      edx, byte ptr [eax + 8] | 
|  | movzx      edx, byte ptr [esi + edx] | 
|  | mov        byte ptr [edi + 8], dl | 
|  | movzx      edx, byte ptr [eax + 9] | 
|  | movzx      edx, byte ptr [esi + edx] | 
|  | mov        byte ptr [edi + 9], dl | 
|  | movzx      edx, byte ptr [eax + 10] | 
|  | movzx      edx, byte ptr [esi + edx] | 
|  | mov        byte ptr [edi + 10], dl | 
|  | movzx      edx, byte ptr [eax + 11]  // copy alpha. | 
|  | mov        byte ptr [edi + 11], dl | 
|  |  | 
|  | movd       esi, xmm0 | 
|  |  | 
|  | movzx      edx, byte ptr [eax + 12] | 
|  | movzx      edx, byte ptr [esi + edx] | 
|  | mov        byte ptr [edi + 12], dl | 
|  | movzx      edx, byte ptr [eax + 13] | 
|  | movzx      edx, byte ptr [esi + edx] | 
|  | mov        byte ptr [edi + 13], dl | 
|  | movzx      edx, byte ptr [eax + 14] | 
|  | movzx      edx, byte ptr [esi + edx] | 
|  | mov        byte ptr [edi + 14], dl | 
|  | movzx      edx, byte ptr [eax + 15]  // copy alpha. | 
|  | mov        byte ptr [edi + 15], dl | 
|  |  | 
|  | lea        eax, [eax + 16] | 
|  | lea        edi, [edi + 16] | 
|  | sub        ecx, 4 | 
|  | jg         convertloop | 
|  |  | 
|  | pop        edi | 
|  | pop        esi | 
|  | ret | 
|  | } | 
|  | } | 
|  | #endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 
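|  |  |
|  | // Editorial sketch (not part of the original source): the per-pixel work of |
|  | // ARGBLumaColorTableRow_SSSE3 above.  A weighted luma (weights packed in the |
|  | // 4 bytes of lumacoeff, assumed small and non-negative) is masked to its high |
|  | // byte, as the pand with 0xff00 does, and selects a 256-byte sub-table inside |
|  | // 'luma'; B, G and R are remapped through that sub-table and alpha is copied. |
|  | // The name is an assumption of this sketch. |
|  | static void ARGBLumaColorTableRow_C_Sketch(const uint8* src_argb, |
|  |                                            uint8* dst_argb, int width, |
|  |                                            const uint8* luma, |
|  |                                            uint32 lumacoeff) { |
|  |   uint32 bc = lumacoeff & 0xff; |
|  |   uint32 gc = (lumacoeff >> 8) & 0xff; |
|  |   uint32 rc = (lumacoeff >> 16) & 0xff; |
|  |   int x; |
|  |   for (x = 0; x < width; ++x) { |
|  |     const uint8* t = luma + ((src_argb[0] * bc + src_argb[1] * gc + |
|  |                               src_argb[2] * rc) & 0xFF00u); |
|  |     dst_argb[0] = t[src_argb[0]]; |
|  |     dst_argb[1] = t[src_argb[1]]; |
|  |     dst_argb[2] = t[src_argb[2]]; |
|  |     dst_argb[3] = src_argb[3];  // Copy alpha. |
|  |     src_argb += 4; |
|  |     dst_argb += 4; |
|  |   } |
|  | } |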
|  |  | 
|  | #endif  // defined(_M_X64) | 
|  | #endif  // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) | 
|  |  | 
|  | #ifdef __cplusplus | 
|  | }  // extern "C" | 
|  | }  // namespace libyuv | 
|  | #endif |