| /* |
| * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
| * |
| * Use of this source code is governed by a BSD-style license |
| * that can be found in the LICENSE file in the root of the source |
| * tree. An additional intellectual property rights grant can be found |
| * in the file PATENTS. All contributing project authors may |
| * be found in the AUTHORS file in the root of the source tree. |
| */ |
| |
| #include "libyuv/row.h" |
| |
| #if !defined(LIBYUV_DISABLE_X86) && defined(_M_X64) && \ |
| defined(_MSC_VER) && !defined(__clang__) |
| #include <emmintrin.h> |
| #include <tmmintrin.h> // For _mm_maddubs_epi16 |
| #endif |
| |
| #ifdef __cplusplus |
| namespace libyuv { |
| extern "C" { |
| #endif |
| |
| // This module is for Visual C. |
| #if !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) && \ |
| defined(_MSC_VER) && !defined(__clang__) |
| |
// Coefficient tables consumed by the YUV to RGB row functions.
// The number after each member is its byte offset inside the struct; the
// offsets step by 32, so each member occupies 32 bytes.
// NOTE(review): lvec8 / lvec16 are declared elsewhere (presumably row.h).
| struct YuvConstants { |
| lvec8 kUVToB; // 0: signed byte weights for interleaved U,V giving blue |
| lvec8 kUVToG; // 32: signed byte weights for interleaved U,V giving green |
| lvec8 kUVToR; // 64: signed byte weights for interleaved U,V giving red |
| lvec16 kUVBiasB; // 96: 16 bit bias the blue chroma term is subtracted from |
| lvec16 kUVBiasG; // 128: 16 bit bias the green chroma term is subtracted from |
| lvec16 kUVBiasR; // 160: 16 bit bias the red chroma term is subtracted from |
| lvec16 kYToRgb; // 192: 16 bit multiplier applied to Y |
| }; |
| |
| // BT.601 YUV to RGB reference |
| // R = (Y - 16) * 1.164 - V * -1.596 |
| // G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813 |
| // B = (Y - 16) * 1.164 - U * -2.018 |
| |
| // Y contribution to R,G,B. Scale and bias. |
// Coefficients are 6 bit fixed point (scaled by 64); the row code shifts the
// final sum right by 6.
// Y is duplicated into both bytes of a 16 bit word (Y * 257) and multiplied
// with an unsigned high multiply, hence the * 256 * 256 / 257 factor in YG.
| // TODO(fbarchard): Consider moving constants into a common header. |
| #define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ |
| #define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ |
| |
| // U and V contributions to R,G,B. |
// UB is clamped to -128 because the weights are stored as signed bytes.
| #define UB -128 /* max(-128, round(-2.018 * 64)) */ |
| #define UG 25 /* round(0.391 * 64) */ |
| #define VG 52 /* round(0.813 * 64) */ |
| #define VR -102 /* round(-1.596 * 64) */ |
| |
| // Bias values to subtract 16 from Y and 128 from U and V. |
// The row code computes bias - (U * weight + V * weight), so adding
// weight * 128 to the bias is the same as weighting (U - 128) / (V - 128).
// YGB carries the -16 luma offset plus the 0.5 rounding term (64 / 2).
| #define BB (UB * 128 + YGB) |
| #define BG (UG * 128 + VG * 128 + YGB) |
| #define BR (VR * 128 + YGB) |
| |
| // BT601 constants for YUV to RGB. |
// The three chroma rows hold 16 (U weight, V weight) byte pairs, matching
// input where U and V bytes are interleaved as U,V,U,V...
// The bias rows and the Y row each hold 16 identical 16 bit values.
| static YuvConstants SIMD_ALIGNED(kYuvConstants) = { |
| { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, |
| UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 }, |
| { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, |
| UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }, |
| { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, |
| 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR }, |
| { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, |
| { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, |
| { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, |
| { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } |
| }; |
| |
| // BT601 constants for NV21 where chroma plane is VU instead of UV. |
// Same values as kYuvConstants, but every chroma byte pair is swapped to
// (V weight, U weight) so the weights line up with V,U interleaved input.
// The bias rows and the Y row do not depend on the chroma order.
| static YuvConstants SIMD_ALIGNED(kYvuConstants) = { |
| { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, |
| 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB }, |
| { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, |
| VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG }, |
| { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, |
| VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 }, |
| { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, |
| { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, |
| { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, |
| { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } |
| }; |
| |
// The BT.601 coefficient macros are only needed to build the two tables
// above; drop them so they cannot leak into the rest of the file.
| #undef YG |
| #undef YGB |
| #undef UB |
| #undef UG |
| #undef VG |
| #undef VR |
| #undef BB |
| #undef BG |
| #undef BR |
| |
| // JPEG YUV to RGB reference |
| // * R = Y - V * -1.40200 |
| // * G = Y - U * 0.34414 - V * 0.71414 |
| // * B = Y - U * -1.77200 |
| |
| // Y contribution to R,G,B. Scale and bias. |
// Full range (JPEG) variant: Y scale is 1.0 and the only Y bias is the 0.5
// rounding term. Coefficients are 6 bit fixed point (scaled by 64).
| // TODO(fbarchard): Consider moving constants into a common header. |
| #define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ |
| #define YGBJ 32 /* 64 / 2 */ |
| |
| // U and V contributions to R,G,B. |
| #define UBJ -113 /* round(-1.77200 * 64) */ |
| #define UGJ 22 /* round(0.34414 * 64) */ |
| #define VGJ 46 /* round(0.71414 * 64) */ |
| #define VRJ -90 /* round(-1.40200 * 64) */ |
| |
| // Bias values to round, and subtract 128 from U and V. |
// Unlike BT.601 there is no 16 offset on Y here: YGBJ is rounding only.
| #define BBJ (UBJ * 128 + YGBJ) |
| #define BGJ (UGJ * 128 + VGJ * 128 + YGBJ) |
| #define BRJ (VRJ * 128 + YGBJ) |
| |
| // JPEG constants for YUV to RGB. |
// Same layout as the BT.601 table: three rows of 16 (U weight, V weight)
// byte pairs followed by four rows of 16 identical 16 bit values
// (blue bias, green bias, red bias, Y multiplier).
| static YuvConstants SIMD_ALIGNED(kYuvJConstants) = { |
| { UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, |
| UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 }, |
| { UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, |
| UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, |
| UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, |
| UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ }, |
| { 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, |
| 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ }, |
| { BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, |
| BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ }, |
| { BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, |
| BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ }, |
| { BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, |
| BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ }, |
| { YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, |
| YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ } |
| }; |
| |
// The JPEG coefficient macros are only needed to build the table above.
| #undef YGJ |
| #undef YGBJ |
| #undef UBJ |
| #undef UGJ |
| #undef VGJ |
| #undef VRJ |
| #undef BBJ |
| #undef BGJ |
| #undef BRJ |
| |
| // 64 bit |
| #if defined(_M_X64) |
| #if defined(HAS_I422TOARGBROW_SSSE3) |
| void I422ToARGBRow_SSSE3(const uint8* y_buf, |
| const uint8* u_buf, |
| const uint8* v_buf, |
| uint8* dst_argb, |
| int width) { |
| __m128i xmm0, xmm1, xmm2, xmm3; |
| const __m128i xmm5 = _mm_set1_epi8(-1); |
| const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf; |
| |
| while (width > 0) { |
| xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); |
| xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); |
| xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); |
| xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); |
| xmm1 = _mm_loadu_si128(&xmm0); |
| xmm2 = _mm_loadu_si128(&xmm0); |
| xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kYuvConstants.kUVToB); |
| xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kYuvConstants.kUVToG); |
| xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kYuvConstants.kUVToR); |
| xmm0 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasB, xmm0); |
| xmm1 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasG, xmm1); |
| xmm2 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasR, xmm2); |
| xmm3 = _mm_loadl_epi64((__m128i*)y_buf); |
| xmm3 = _mm_unpacklo_epi8(xmm3, xmm3); |
| xmm3 = _mm_mulhi_epu16(xmm3, *(__m128i*)kYuvConstants.kYToRgb); |
| xmm0 = _mm_adds_epi16(xmm0, xmm3); |
| xmm1 = _mm_adds_epi16(xmm1, xmm3); |
| xmm2 = _mm_adds_epi16(xmm2, xmm3); |
| xmm0 = _mm_srai_epi16(xmm0, 6); |
| xmm1 = _mm_srai_epi16(xmm1, 6); |
| xmm2 = _mm_srai_epi16(xmm2, 6); |
| xmm0 = _mm_packus_epi16(xmm0, xmm0); |
| xmm1 = _mm_packus_epi16(xmm1, xmm1); |
| xmm2 = _mm_packus_epi16(xmm2, xmm2); |
| xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); |
| xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); |
| xmm1 = _mm_loadu_si128(&xmm0); |
| xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); |
| xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); |
| |
| _mm_storeu_si128((__m128i *)dst_argb, xmm0); |
| _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1); |
| |
| y_buf += 8; |
| u_buf += 4; |
| dst_argb += 32; |
| width -= 8; |
| } |
| } |
| #endif |
| // 32 bit |
| #else // defined(_M_X64) |
| #ifdef HAS_ARGBTOYROW_SSSE3 |
| |
| // Constants for ARGB. |
// Each table repeats one 4 byte weight pattern four times, one weight per
// byte of a pixel in memory order. The zero entry lines up with alpha.
// NOTE(review): for ARGB the memory order appears to be B, G, R, A - confirm.
| static const vec8 kARGBToY = { |
| 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 |
| }; |
| |
| // JPeg full range. |
| static const vec8 kARGBToYJ = { |
| 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0 |
| }; |
| |
| static const vec8 kARGBToU = { |
| 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 |
| }; |
| |
| static const vec8 kARGBToUJ = { |
| 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0 |
| }; |
| |
| static const vec8 kARGBToV = { |
| -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, |
| }; |
| |
| static const vec8 kARGBToVJ = { |
| -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0 |
| }; |
| |
| // vpshufb for vphaddw + vpackuswb packed to shorts. |
// The same 16 byte pattern is repeated for both 128 bit lanes; within a lane
// it interleaves the 16 bit words of the low and high 8 byte halves.
| static const lvec8 kShufARGBToUV_AVX = { |
| 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, |
| 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 |
| }; |
| |
| // Constants for BGRA. |
// The tables below reuse the ARGB weights (13/65/33 for Y, 112/-74/-38 for U,
// -18/-94/112 for V), reordered to each format's byte layout; the zero entry
// marks the byte that does not contribute (presumably alpha).
| static const vec8 kBGRAToY = { |
| 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13 |
| }; |
| |
| static const vec8 kBGRAToU = { |
| 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112 |
| }; |
| |
| static const vec8 kBGRAToV = { |
| 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18 |
| }; |
| |
| // Constants for ABGR. |
| static const vec8 kABGRToY = { |
| 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0 |
| }; |
| |
| static const vec8 kABGRToU = { |
| -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0 |
| }; |
| |
| static const vec8 kABGRToV = { |
| 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0 |
| }; |
| |
| // Constants for RGBA. |
| static const vec8 kRGBAToY = { |
| 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33 |
| }; |
| |
| static const vec8 kRGBAToU = { |
| 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38 |
| }; |
| |
| static const vec8 kRGBAToV = { |
| 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112 |
| }; |
| |
// 16 in every byte.
// NOTE(review): presumably the limited range luma offset added after the
// ARGB to Y weighting - the consumer is not visible in this part of the file.
| static const uvec8 kAddY16 = { |
| 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u |
| }; |
| |
| // 7 bit fixed point 0.5. |
// 64 / 128 = 0.5, stored in each of the eight 16 bit elements.
| static const vec16 kAddYJ64 = { |
| 64, 64, 64, 64, 64, 64, 64, 64 |
| }; |
| |
// 128 in every byte.
// NOTE(review): presumably the chroma zero point added to U and V - confirm.
| static const uvec8 kAddUV128 = { |
| 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, |
| 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u |
| }; |
| |
// 0x8080 in every 16 bit element, i.e. 128 in the high byte plus 0x80 in the
// low byte.
| static const uvec16 kAddUVJ128 = { |
| 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u |
| }; |
| |
| // Shuffle table for converting RGB24 to ARGB. |
// Expands 12 source bytes (4 pixels x 3 bytes) to 16. Every 4th output byte
// takes a filler byte (index 12..15); the row code ORs alpha over it.
| static const uvec8 kShuffleMaskRGB24ToARGB = { |
| 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u |
| }; |
| |
| // Shuffle table for converting RAW to ARGB. |
// Same as the RGB24 table but with the first and third byte of every pixel
// swapped.
| static const uvec8 kShuffleMaskRAWToARGB = { |
| 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u |
| }; |
| |
| // Shuffle table for converting ARGB to RGB24. |
// Packs 4 pixels to 12 bytes by dropping every 4th byte; index 128 has the
// high bit set, which makes pshufb write zero to the last 4 bytes.
| static const uvec8 kShuffleMaskARGBToRGB24 = { |
| 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u |
| }; |
| |
| // Shuffle table for converting ARGB to RAW. |
// As above, with the first and third byte of every pixel swapped.
| static const uvec8 kShuffleMaskARGBToRAW = { |
| 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u |
| }; |
| |
| // Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4 |
// The first 8 packed bytes land in bytes 0..7, bytes 8..11 are zeroed and the
// remaining 4 packed bytes land in bytes 12..15.
| static const uvec8 kShuffleMaskARGBToRGB24_0 = { |
| 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u |
| }; |
| |
| // Shuffle table for converting ARGB to RAW, split layout. First 8 + next 4 |
// Same split layout as kShuffleMaskARGBToRGB24_0 (bytes 8..11 zeroed), with
// the first and third byte of every pixel swapped.
// NOTE(review): presumably used by an I422ToRAW row function - confirm.
| static const uvec8 kShuffleMaskARGBToRAW_0 = { |
| 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u |
| }; |
| |
| // Duplicates gray value 3 times and fills in alpha opaque. |
// Each loop pass reads 8 gray bytes from src_y and writes 32 bytes (8 ARGB
// pixels) to dst_argb. The loop tests the count after the first pass, so at
// least 8 pixels are always processed and pix is effectively rounded up to a
// multiple of 8.
// Naked function: there is no prologue, arguments are read straight from
// the stack at [esp + 4], [esp + 8] and [esp + 12].
| __declspec(naked) |
| void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { |
| __asm { |
| mov eax, [esp + 4] // src_y |
| mov edx, [esp + 8] // dst_argb |
| mov ecx, [esp + 12] // pix |
| pcmpeqb xmm5, xmm5 // generate mask 0xff000000 |
| pslld xmm5, 24 |
| |
| convertloop: |
// Load 8 gray bytes, double them to words, then to dwords (Y,Y,Y,Y).
| movq xmm0, qword ptr [eax] |
| lea eax, [eax + 8] |
| punpcklbw xmm0, xmm0 |
| movdqa xmm1, xmm0 |
| punpcklwd xmm0, xmm0 |
| punpckhwd xmm1, xmm1 |
// Force the top byte of every pixel (alpha) to 0xff.
| por xmm0, xmm5 |
| por xmm1, xmm5 |
| movdqu [edx], xmm0 |
| movdqu [edx + 16], xmm1 |
| lea edx, [edx + 32] |
| sub ecx, 8 |
| jg convertloop |
| ret |
| } |
| } |
| |
| #ifdef HAS_J400TOARGBROW_AVX2 |
| // Duplicates gray value 3 times and fills in alpha opaque. |
// AVX2 variant: each loop pass reads 16 gray bytes and writes 64 bytes
// (16 ARGB pixels). At least one pass always runs, so pix is effectively
// rounded up to a multiple of 16.
// The unpack instructions work inside each 128 bit lane, so vpermq 0xd8
// (qword order 0,2,1,3) is applied before each unpack step to keep the
// pixels in source order.
| __declspec(naked) |
| void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix) { |
| __asm { |
| mov eax, [esp + 4] // src_y |
| mov edx, [esp + 8] // dst_argb |
| mov ecx, [esp + 12] // pix |
| vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 |
| vpslld ymm5, ymm5, 24 |
| |
| convertloop: |
// 16 gray bytes arrive in the low lane; the upper lane is zeroed by the load.
| vmovdqu xmm0, [eax] |
| lea eax, [eax + 16] |
| vpermq ymm0, ymm0, 0xd8 |
| vpunpcklbw ymm0, ymm0, ymm0 |
| vpermq ymm0, ymm0, 0xd8 |
| vpunpckhwd ymm1, ymm0, ymm0 |
| vpunpcklwd ymm0, ymm0, ymm0 |
// Force the top byte of every pixel (alpha) to 0xff.
| vpor ymm0, ymm0, ymm5 |
| vpor ymm1, ymm1, ymm5 |
| vmovdqu [edx], ymm0 |
| vmovdqu [edx + 32], ymm1 |
| lea edx, [edx + 64] |
| sub ecx, 16 |
| jg convertloop |
// Clear the upper ymm halves before returning to the caller.
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_J400TOARGBROW_AVX2 |
| |
// Expands RGB24 to ARGB, 16 pixels per loop pass: reads 48 source bytes and
// writes 64 destination bytes, setting the top byte of every pixel to 0xff.
// At least one pass always runs, so pix is effectively rounded up to a
// multiple of 16.
// The three 16 byte loads are realigned with palignr so that each register
// starts with 12 bytes (4 whole pixels) before the shuffle expands them.
| __declspec(naked) |
| void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { |
| __asm { |
| mov eax, [esp + 4] // src_rgb24 |
| mov edx, [esp + 8] // dst_argb |
| mov ecx, [esp + 12] // pix |
| pcmpeqb xmm5, xmm5 // generate mask 0xff000000 |
| pslld xmm5, 24 |
| movdqa xmm4, kShuffleMaskRGB24ToARGB |
| |
| convertloop: |
| movdqu xmm0, [eax] |
| movdqu xmm1, [eax + 16] |
| movdqu xmm3, [eax + 32] |
| lea eax, [eax + 48] |
| movdqa xmm2, xmm3 |
| palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} |
| pshufb xmm2, xmm4 |
| por xmm2, xmm5 |
| palignr xmm1, xmm0, 12 // xmm1 = { xmm1[0:7] xmm0[12:15]} |
| pshufb xmm0, xmm4 |
| movdqu [edx + 32], xmm2 |
| por xmm0, xmm5 |
| pshufb xmm1, xmm4 |
| movdqu [edx], xmm0 |
| por xmm1, xmm5 |
| palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} |
| pshufb xmm3, xmm4 |
| movdqu [edx + 16], xmm1 |
| por xmm3, xmm5 |
| movdqu [edx + 48], xmm3 |
| lea edx, [edx + 64] |
| sub ecx, 16 |
| jg convertloop |
| ret |
| } |
| } |
| |
// Expands RAW to ARGB, 16 pixels per loop pass: reads 48 source bytes and
// writes 64 destination bytes, setting the top byte of every pixel to 0xff.
// Identical to the RGB24 version except for the shuffle table, which also
// swaps the first and third byte of every pixel.
// At least one pass always runs, so pix is effectively rounded up to a
// multiple of 16.
| __declspec(naked) |
| void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, |
| int pix) { |
| __asm { |
| mov eax, [esp + 4] // src_raw |
| mov edx, [esp + 8] // dst_argb |
| mov ecx, [esp + 12] // pix |
| pcmpeqb xmm5, xmm5 // generate mask 0xff000000 |
| pslld xmm5, 24 |
| movdqa xmm4, kShuffleMaskRAWToARGB |
| |
| convertloop: |
| movdqu xmm0, [eax] |
| movdqu xmm1, [eax + 16] |
| movdqu xmm3, [eax + 32] |
| lea eax, [eax + 48] |
| movdqa xmm2, xmm3 |
| palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} |
| pshufb xmm2, xmm4 |
| por xmm2, xmm5 |
| palignr xmm1, xmm0, 12 // xmm1 = { xmm1[0:7] xmm0[12:15]} |
| pshufb xmm0, xmm4 |
| movdqu [edx + 32], xmm2 |
| por xmm0, xmm5 |
| pshufb xmm1, xmm4 |
| movdqu [edx], xmm0 |
| por xmm1, xmm5 |
| palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} |
| pshufb xmm3, xmm4 |
| movdqu [edx + 16], xmm1 |
| por xmm3, xmm5 |
| movdqu [edx + 48], xmm3 |
| lea edx, [edx + 64] |
| sub ecx, 16 |
| jg convertloop |
| ret |
| } |
| } |
| |
| // pmul method to replicate bits. |
| // Math to replicate bits: |
| // (v << 8) | (v << 3) |
| // v * 256 + v * 8 |
| // v * (256 + 8) |
| // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 |
| // 20 instructions. |
// Converts 8 RGB565 pixels (16 source bytes) to 8 ARGB pixels (32 bytes) per
// loop pass, with alpha set to 0xff. At least one pass always runs, so pix
// is effectively rounded up to a multiple of 8.
// edx is rewritten as dst - 2 * src so that [eax * 2 + edx] addresses the
// destination while only eax has to be advanced.
| __declspec(naked) |
| void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, |
| int pix) { |
| __asm { |
| mov eax, 0x01080108 // generate multiplier to repeat 5 bits |
| movd xmm5, eax |
| pshufd xmm5, xmm5, 0 |
| mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits |
| movd xmm6, eax |
| pshufd xmm6, xmm6, 0 |
| pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red |
| psllw xmm3, 11 |
| pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green |
| psllw xmm4, 10 |
| psrlw xmm4, 5 |
| pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha |
| psllw xmm7, 8 |
| |
| mov eax, [esp + 4] // src_rgb565 |
| mov edx, [esp + 8] // dst_argb |
| mov ecx, [esp + 12] // pix |
| sub edx, eax |
| sub edx, eax |
| |
| convertloop: |
| movdqu xmm0, [eax] // fetch 8 pixels of bgr565 |
| movdqa xmm1, xmm0 |
| movdqa xmm2, xmm0 |
| pand xmm1, xmm3 // R in upper 5 bits |
| psllw xmm2, 11 // B in upper 5 bits |
| pmulhuw xmm1, xmm5 // * (256 + 8) |
| pmulhuw xmm2, xmm5 // * (256 + 8) |
| psllw xmm1, 8 |
| por xmm1, xmm2 // RB |
| pand xmm0, xmm4 // G in middle 6 bits |
| pmulhuw xmm0, xmm6 // << 5 * (256 + 4) |
| por xmm0, xmm7 // AG |
| movdqa xmm2, xmm1 |
| punpcklbw xmm1, xmm0 |
| punpckhbw xmm2, xmm0 |
| movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB |
| movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB |
| lea eax, [eax + 16] |
| sub ecx, 8 |
| jg convertloop |
| ret |
| } |
| } |
| |
| #ifdef HAS_RGB565TOARGBROW_AVX2 |
| // pmul method to replicate bits. |
| // Math to replicate bits: |
| // (v << 8) | (v << 3) |
| // v * 256 + v * 8 |
| // v * (256 + 8) |
| // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 |
| __declspec(naked) |
| void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, |
| int pix) { |
| __asm { |
| mov eax, 0x01080108 // generate multiplier to repeat 5 bits |
| vmovd xmm5, eax |
| vbroadcastss ymm5, xmm5 |
| mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits |
| movd xmm6, eax |
| vbroadcastss ymm6, xmm6 |
| vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red |
| vpsllw ymm3, ymm3, 11 |
| vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green |
| vpsllw ymm4, ymm4, 10 |
| vpsrlw ymm4, ymm4, 5 |
| vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha |
| vpsllw ymm7, ymm7, 8 |
| |
| mov eax, [esp + 4] // src_rgb565 |
| mov edx, [esp + 8] // dst_argb |
| mov ecx, [esp + 12] // pix |
| sub edx, eax |
| sub edx, eax |
| |
| convertloop: |
| vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565 |
| vpand ymm1, ymm0, ymm3 // R in upper 5 bits |
| vpsllw ymm2, ymm0, 11 // B in upper 5 bits |
| vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) |
| vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) |
| vpsllw ymm1, ymm1, 8 |
| vpor ymm1, ymm1, ymm2 // RB |
| vpand ymm0, ymm0, ymm4 // G in middle 6 bits |
| vpmulhuw ymm0, ymm0, ymm6 // << 5 * (256 + 4) |
| vpor ymm0, ymm0, ymm7 // AG |
| vpermq ymm0, ymm0, 0xd8 // mutate for unpack |
| vpermq ymm1, ymm1, 0xd8 |
| vpunpckhbw ymm2, ymm1, ymm0 |
| vpunpcklbw ymm1, ymm1, ymm0 |
| vmovdqu [eax * 2 + edx], ymm1 // store 4 pixels of ARGB |
| vmovdqu [eax * 2 + edx + 32], ymm2 // store next 4 pixels of ARGB |
| lea eax, [eax + 32] |
| sub ecx, 16 |
| jg convertloop |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_RGB565TOARGBROW_AVX2 |
| |
| #ifdef HAS_ARGB1555TOARGBROW_AVX2 |
| __declspec(naked) |
| void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb, |
| int pix) { |
| __asm { |
| mov eax, 0x01080108 // generate multiplier to repeat 5 bits |
| vmovd xmm5, eax |
| vbroadcastss ymm5, xmm5 |
| mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits |
| movd xmm6, eax |
| vbroadcastss ymm6, xmm6 |
| vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red |
| vpsllw ymm3, ymm3, 11 |
| vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green |
| vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha |
| vpsllw ymm7, ymm7, 8 |
| |
| mov eax, [esp + 4] // src_argb1555 |
| mov edx, [esp + 8] // dst_argb |
| mov ecx, [esp + 12] // pix |
| sub edx, eax |
| sub edx, eax |
| |
| convertloop: |
| vmovdqu ymm0, [eax] // fetch 16 pixels of 1555 |
| vpsllw ymm1, ymm0, 1 // R in upper 5 bits |
| vpsllw ymm2, ymm0, 11 // B in upper 5 bits |
| vpand ymm1, ymm1, ymm3 |
| vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) |
| vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) |
| vpsllw ymm1, ymm1, 8 |
| vpor ymm1, ymm1, ymm2 // RB |
| vpsraw ymm2, ymm0, 8 // A |
| vpand ymm0, ymm0, ymm4 // G in middle 5 bits |
| vpmulhuw ymm0, ymm0, ymm6 // << 6 * (256 + 8) |
| vpand ymm2, ymm2, ymm7 |
| vpor ymm0, ymm0, ymm2 // AG |
| vpermq ymm0, ymm0, 0xd8 // mutate for unpack |
| vpermq ymm1, ymm1, 0xd8 |
| vpunpckhbw ymm2, ymm1, ymm0 |
| vpunpcklbw ymm1, ymm1, ymm0 |
| vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB |
| vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB |
| lea eax, [eax + 32] |
| sub ecx, 16 |
| jg convertloop |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_ARGB1555TOARGBROW_AVX2 |
| |
| #ifdef HAS_ARGB4444TOARGBROW_AVX2 |
// Converts 16 ARGB4444 pixels (32 source bytes) to 16 ARGB pixels (64 bytes)
// per loop pass. Every 4 bit channel is widened to 8 bits by copying the
// nibble into both halves of the output byte. At least one pass always runs,
// so pix is effectively rounded up to a multiple of 16.
// edx is rewritten as dst - 2 * src so that [eax * 2 + edx] addresses the
// destination while only eax has to be advanced.
| __declspec(naked) |
| void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb, |
| int pix) { |
| __asm { |
| mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f |
| vmovd xmm4, eax |
| vbroadcastss ymm4, xmm4 |
| vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles |
| mov eax, [esp + 4] // src_argb4444 |
| mov edx, [esp + 8] // dst_argb |
| mov ecx, [esp + 12] // pix |
| sub edx, eax |
| sub edx, eax |
| |
| convertloop: |
| vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444 |
| vpand ymm2, ymm0, ymm5 // mask high nibbles |
| vpand ymm0, ymm0, ymm4 // mask low nibbles |
| vpsrlw ymm3, ymm2, 4 |
| vpsllw ymm1, ymm0, 4 |
| vpor ymm2, ymm2, ymm3 |
| vpor ymm0, ymm0, ymm1 |
| vpermq ymm0, ymm0, 0xd8 // mutate for unpack |
| vpermq ymm2, ymm2, 0xd8 |
| vpunpckhbw ymm1, ymm0, ymm2 |
| vpunpcklbw ymm0, ymm0, ymm2 |
| vmovdqu [eax * 2 + edx], ymm0 // store 8 pixels of ARGB |
| vmovdqu [eax * 2 + edx + 32], ymm1 // store next 8 pixels of ARGB |
| lea eax, [eax + 32] |
| sub ecx, 16 |
| jg convertloop |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_ARGB4444TOARGBROW_AVX2 |
| |
| // 24 instructions |
// Converts 8 ARGB1555 pixels (16 source bytes) to 8 ARGB pixels (32 bytes)
// per loop pass. The 5 bit channels are widened to 8 bits by bit replication
// (multiply, keep the high half); the single alpha bit becomes 0x00 or 0xff
// through an arithmetic shift. At least one pass always runs, so pix is
// effectively rounded up to a multiple of 8.
// edx is rewritten as dst - 2 * src so that [eax * 2 + edx] addresses the
// destination while only eax has to be advanced.
| __declspec(naked) |
| void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, |
| int pix) { |
| __asm { |
| mov eax, 0x01080108 // generate multiplier to repeat 5 bits |
| movd xmm5, eax |
| pshufd xmm5, xmm5, 0 |
| mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits |
| movd xmm6, eax |
| pshufd xmm6, xmm6, 0 |
| pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red |
| psllw xmm3, 11 |
| movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green |
| psrlw xmm4, 6 |
| pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha |
| psllw xmm7, 8 |
| |
| mov eax, [esp + 4] // src_argb1555 |
| mov edx, [esp + 8] // dst_argb |
| mov ecx, [esp + 12] // pix |
| sub edx, eax |
| sub edx, eax |
| |
| convertloop: |
| movdqu xmm0, [eax] // fetch 8 pixels of 1555 |
| movdqa xmm1, xmm0 |
| movdqa xmm2, xmm0 |
| psllw xmm1, 1 // R in upper 5 bits |
| psllw xmm2, 11 // B in upper 5 bits |
| pand xmm1, xmm3 |
| pmulhuw xmm2, xmm5 // * (256 + 8) |
| pmulhuw xmm1, xmm5 // * (256 + 8) |
| psllw xmm1, 8 |
| por xmm1, xmm2 // RB |
| movdqa xmm2, xmm0 |
| pand xmm0, xmm4 // G in middle 5 bits |
| psraw xmm2, 8 // A: sign extend the alpha bit |
| pmulhuw xmm0, xmm6 // << 6 * (256 + 8) |
| pand xmm2, xmm7 |
| por xmm0, xmm2 // AG |
| movdqa xmm2, xmm1 |
| punpcklbw xmm1, xmm0 |
| punpckhbw xmm2, xmm0 |
| movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB |
| movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB |
| lea eax, [eax + 16] |
| sub ecx, 8 |
| jg convertloop |
| ret |
| } |
| } |
| |
| // 18 instructions. |
// Converts 8 ARGB4444 pixels (16 source bytes) to 8 ARGB pixels (32 bytes)
// per loop pass. Every 4 bit channel is widened to 8 bits by copying the
// nibble into both halves of the output byte. At least one pass always runs,
// so pix is effectively rounded up to a multiple of 8.
// edx is rewritten as dst - 2 * src so that [eax * 2 + edx] addresses the
// destination while only eax has to be advanced.
| __declspec(naked) |
| void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, |
| int pix) { |
| __asm { |
| mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f |
| movd xmm4, eax |
| pshufd xmm4, xmm4, 0 |
| movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles |
| pslld xmm5, 4 |
| mov eax, [esp + 4] // src_argb4444 |
| mov edx, [esp + 8] // dst_argb |
| mov ecx, [esp + 12] // pix |
| sub edx, eax |
| sub edx, eax |
| |
| convertloop: |
| movdqu xmm0, [eax] // fetch 8 pixels of bgra4444 |
| movdqa xmm2, xmm0 |
| pand xmm0, xmm4 // mask low nibbles |
| pand xmm2, xmm5 // mask high nibbles |
| movdqa xmm1, xmm0 |
| movdqa xmm3, xmm2 |
| psllw xmm1, 4 |
| psrlw xmm3, 4 |
| por xmm0, xmm1 |
| por xmm2, xmm3 |
| movdqa xmm1, xmm0 |
| punpcklbw xmm0, xmm2 |
| punpckhbw xmm1, xmm2 |
| movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB |
| movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB |
| lea eax, [eax + 16] |
| sub ecx, 8 |
| jg convertloop |
| ret |
| } |
| } |
| |
// Packs ARGB to RGB24, 16 pixels per loop pass: reads 64 source bytes and
// writes 48 destination bytes, dropping the 4th byte of every pixel.
// Each register is first shuffled to 12 useful bytes + 4 zero bytes; the
// byte shifts and ORs then splice the four 12 byte pieces into three full
// 16 byte stores. At least one pass always runs, so pix is effectively
// rounded up to a multiple of 16.
| __declspec(naked) |
| void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { |
| __asm { |
| mov eax, [esp + 4] // src_argb |
| mov edx, [esp + 8] // dst_rgb |
| mov ecx, [esp + 12] // pix |
| movdqa xmm6, kShuffleMaskARGBToRGB24 |
| |
| convertloop: |
| movdqu xmm0, [eax] // fetch 16 pixels of argb |
| movdqu xmm1, [eax + 16] |
| movdqu xmm2, [eax + 32] |
| movdqu xmm3, [eax + 48] |
| lea eax, [eax + 64] |
| pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB |
| pshufb xmm1, xmm6 |
| pshufb xmm2, xmm6 |
| pshufb xmm3, xmm6 |
| movdqa xmm4, xmm1 // 4 bytes from 1 for 0 |
| psrldq xmm1, 4 // 8 bytes from 1 |
| pslldq xmm4, 12 // 4 bytes from 1 for 0 |
| movdqa xmm5, xmm2 // 8 bytes from 2 for 1 |
| por xmm0, xmm4 // 4 bytes from 1 for 0 |
| pslldq xmm5, 8 // 8 bytes from 2 for 1 |
| movdqu [edx], xmm0 // store 0 |
| por xmm1, xmm5 // 8 bytes from 2 for 1 |
| psrldq xmm2, 8 // 4 bytes from 2 |
| pslldq xmm3, 4 // 12 bytes from 3 for 2 |
| por xmm2, xmm3 // 12 bytes from 3 for 2 |
| movdqu [edx + 16], xmm1 // store 1 |
| movdqu [edx + 32], xmm2 // store 2 |
| lea edx, [edx + 48] |
| sub ecx, 16 |
| jg convertloop |
| ret |
| } |
| } |
| |
// Packs ARGB to RAW, 16 pixels per loop pass: reads 64 source bytes and
// writes 48 destination bytes. Identical to the RGB24 version except for the
// shuffle table, which also swaps the first and third byte of every pixel.
// At least one pass always runs, so pix is effectively rounded up to a
// multiple of 16.
| __declspec(naked) |
| void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { |
| __asm { |
| mov eax, [esp + 4] // src_argb |
| mov edx, [esp + 8] // dst_rgb |
| mov ecx, [esp + 12] // pix |
| movdqa xmm6, kShuffleMaskARGBToRAW |
| |
| convertloop: |
| movdqu xmm0, [eax] // fetch 16 pixels of argb |
| movdqu xmm1, [eax + 16] |
| movdqu xmm2, [eax + 32] |
| movdqu xmm3, [eax + 48] |
| lea eax, [eax + 64] |
| pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB |
| pshufb xmm1, xmm6 |
| pshufb xmm2, xmm6 |
| pshufb xmm3, xmm6 |
| movdqa xmm4, xmm1 // 4 bytes from 1 for 0 |
| psrldq xmm1, 4 // 8 bytes from 1 |
| pslldq xmm4, 12 // 4 bytes from 1 for 0 |
| movdqa xmm5, xmm2 // 8 bytes from 2 for 1 |
| por xmm0, xmm4 // 4 bytes from 1 for 0 |
| pslldq xmm5, 8 // 8 bytes from 2 for 1 |
| movdqu [edx], xmm0 // store 0 |
| por xmm1, xmm5 // 8 bytes from 2 for 1 |
| psrldq xmm2, 8 // 4 bytes from 2 |
| pslldq xmm3, 4 // 12 bytes from 3 for 2 |
| por xmm2, xmm3 // 12 bytes from 3 for 2 |
| movdqu [edx + 16], xmm1 // store 1 |
| movdqu [edx + 32], xmm2 // store 2 |
| lea edx, [edx + 48] |
| sub ecx, 16 |
| jg convertloop |
| ret |
| } |
| } |
| |
| // 4 pixels |
// Converts 4 ARGB pixels (16 source bytes) to 4 RGB565 pixels (8 bytes) per
// loop pass. At least one pass always runs, so pix is effectively rounded up
// to a multiple of 4.
// Red is positioned with an arithmetic shift so each dword holds a sign
// extended 16 bit value; packssdw (signed saturation) then narrows it to 16
// bits without clamping.
| __declspec(naked) |
| void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { |
| __asm { |
| mov eax, [esp + 4] // src_argb |
| mov edx, [esp + 8] // dst_rgb |
| mov ecx, [esp + 12] // pix |
| pcmpeqb xmm3, xmm3 // generate mask 0x0000001f |
| psrld xmm3, 27 |
| pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 |
| psrld xmm4, 26 |
| pslld xmm4, 5 |
| pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 |
| pslld xmm5, 11 |
| |
| convertloop: |
| movdqu xmm0, [eax] // fetch 4 pixels of argb |
| movdqa xmm1, xmm0 // B |
| movdqa xmm2, xmm0 // G |
| pslld xmm0, 8 // R |
| psrld xmm1, 3 // B |
| psrld xmm2, 5 // G |
| psrad xmm0, 16 // R |
| pand xmm1, xmm3 // B |
| pand xmm2, xmm4 // G |
| pand xmm0, xmm5 // R |
| por xmm1, xmm2 // BG |
| por xmm0, xmm1 // BGR |
| packssdw xmm0, xmm0 |
| lea eax, [eax + 16] |
| movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 |
| lea edx, [edx + 8] |
| sub ecx, 4 |
| jg convertloop |
| ret |
| } |
| } |
| |
| // 8 pixels |
| __declspec(naked) |
| void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb, |
| const uint32 dither4, int pix) { |
| __asm { |
| |
| mov eax, [esp + 4] // src_argb |
| mov edx, [esp + 8] // dst_rgb |
| movd xmm6, [esp + 12] // dither4 |
| mov ecx, [esp + 16] // pix |
| punpcklbw xmm6, xmm6 // make dither 16 bytes |
| movdqa xmm7, xmm6 |
| punpcklwd xmm6, xmm6 |
| punpckhwd xmm7, xmm7 |
| pcmpeqb xmm3, xmm3 // generate mask 0x0000001f |
| psrld xmm3, 27 |
| pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 |
| psrld xmm4, 26 |
| pslld xmm4, 5 |
| pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 |
| pslld xmm5, 11 |
| |
| convertloop: |
| movdqu xmm0, [eax] // fetch 4 pixels of argb |
| paddusb xmm0, xmm6 // add dither |
| movdqa xmm1, xmm0 // B |
| movdqa xmm2, xmm0 // G |
| pslld xmm0, 8 // R |
| psrld xmm1, 3 // B |
| psrld xmm2, 5 // G |
| psrad xmm0, 16 // R |
| pand xmm1, xmm3 // B |
| pand xmm2, xmm4 // G |
| pand xmm0, xmm5 // R |
| por xmm1, xmm2 // BG |
| por xmm0, xmm1 // BGR |
| packssdw xmm0, xmm0 |
| lea eax, [eax + 16] |
| movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 |
| lea edx, [edx + 8] |
| sub ecx, 4 |
| jg convertloop |
| ret |
| } |
| } |
| |
| #ifdef HAS_ARGBTORGB565DITHERROW_AVX2 |
// Converts 8 ARGB pixels (32 source bytes) to 8 RGB565 pixels (16 bytes) per
// loop pass after adding a dither value to every byte with unsigned
// saturation. At least one pass always runs, so pix is effectively rounded
// up to a multiple of 8.
// dither4 holds 4 dither bytes; they are expanded so that byte i is applied
// to pixel i of each group of 4 pixels.
| __declspec(naked) |
| void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb, |
| const uint32 dither4, int pix) { |
| __asm { |
| mov eax, [esp + 4] // src_argb |
| mov edx, [esp + 8] // dst_rgb |
| vbroadcastss xmm6, [esp + 12] // dither4 |
| mov ecx, [esp + 16] // pix |
| vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes |
| vpermq ymm6, ymm6, 0xd8 |
| vpunpcklwd ymm6, ymm6, ymm6 |
| vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f |
| vpsrld ymm3, ymm3, 27 |
| vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 |
| vpsrld ymm4, ymm4, 26 |
| vpslld ymm4, ymm4, 5 |
| vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 |
| |
| convertloop: |
| vmovdqu ymm0, [eax] // fetch 8 pixels of argb |
| vpaddusb ymm0, ymm0, ymm6 // add dither |
| vpsrld ymm2, ymm0, 5 // G |
| vpsrld ymm1, ymm0, 3 // B |
| vpsrld ymm0, ymm0, 8 // R |
| vpand ymm2, ymm2, ymm4 // G |
| vpand ymm1, ymm1, ymm3 // B |
| vpand ymm0, ymm0, ymm5 // R |
| vpor ymm1, ymm1, ymm2 // BG |
| vpor ymm0, ymm0, ymm1 // BGR |
| vpackusdw ymm0, ymm0, ymm0 |
| vpermq ymm0, ymm0, 0xd8 // gather both lanes' results into the low 16 bytes |
| lea eax, [eax + 32] |
| vmovdqu [edx], xmm0 // store 8 pixels of RGB565 |
| lea edx, [edx + 16] |
| sub ecx, 8 |
| jg convertloop |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_ARGBTORGB565DITHERROW_AVX2 |
| |
| // TODO(fbarchard): Improve sign extension/packing. |
// Converts 4 ARGB pixels (16 source bytes) to 4 ARGB1555 pixels (8 bytes)
// per loop pass. At least one pass always runs, so pix is effectively
// rounded up to a multiple of 4.
// Alpha is positioned with an arithmetic shift so each dword holds a sign
// extended 16 bit value; packssdw (signed saturation) then narrows it to 16
// bits without clamping.
| __declspec(naked) |
| void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { |
| __asm { |
| mov eax, [esp + 4] // src_argb |
| mov edx, [esp + 8] // dst_rgb |
| mov ecx, [esp + 12] // pix |
| pcmpeqb xmm4, xmm4 // generate mask 0x0000001f |
| psrld xmm4, 27 |
| movdqa xmm5, xmm4 // generate mask 0x000003e0 |
| pslld xmm5, 5 |
| movdqa xmm6, xmm4 // generate mask 0x00007c00 |
| pslld xmm6, 10 |
| pcmpeqb xmm7, xmm7 // generate mask 0xffff8000 |
| pslld xmm7, 15 |
| |
| convertloop: |
| movdqu xmm0, [eax] // fetch 4 pixels of argb |
| movdqa xmm1, xmm0 // B |
| movdqa xmm2, xmm0 // G |
| movdqa xmm3, xmm0 // R |
| psrad xmm0, 16 // A |
| psrld xmm1, 3 // B |
| psrld xmm2, 6 // G |
| psrld xmm3, 9 // R |
| pand xmm0, xmm7 // A |
| pand xmm1, xmm4 // B |
| pand xmm2, xmm5 // G |
| pand xmm3, xmm6 // R |
| por xmm0, xmm1 // BA |
| por xmm2, xmm3 // GR |
| por xmm0, xmm2 // BGRA |
| packssdw xmm0, xmm0 |
| lea eax, [eax + 16] |
| movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555 |
| lea edx, [edx + 8] |
| sub ecx, 4 |
| jg convertloop |
| ret |
| } |
| } |
| |
// Converts 4 ARGB pixels (16 source bytes) to 4 ARGB4444 pixels (8 bytes)
// per loop pass by keeping the upper 4 bits of every channel. At least one
// pass always runs, so pix is effectively rounded up to a multiple of 4.
// After masking and shifting, every 16 bit word holds two nibbles in its low
// byte, so packuswb narrows the words to bytes without clamping.
| __declspec(naked) |
| void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { |
| __asm { |
| mov eax, [esp + 4] // src_argb |
| mov edx, [esp + 8] // dst_rgb |
| mov ecx, [esp + 12] // pix |
| pcmpeqb xmm4, xmm4 // generate mask 0xf000f000 |
| psllw xmm4, 12 |
| movdqa xmm3, xmm4 // generate mask 0x00f000f0 |
| psrlw xmm3, 8 |
| |
| convertloop: |
| movdqu xmm0, [eax] // fetch 4 pixels of argb |
| movdqa xmm1, xmm0 |
| pand xmm0, xmm3 // top nibble of bytes 0 and 2 -> low output nibbles |
| pand xmm1, xmm4 // top nibble of bytes 1 and 3 -> high output nibbles |
| psrld xmm0, 4 |
| psrld xmm1, 8 |
| por xmm0, xmm1 |
| packuswb xmm0, xmm0 |
| lea eax, [eax + 16] |
| movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444 |
| lea edx, [edx + 8] |
| sub ecx, 4 |
| jg convertloop |
| ret |
| } |
| } |
| |
| #ifdef HAS_ARGBTORGB565ROW_AVX2 |
// Converts 8 ARGB pixels (32 source bytes) to 8 RGB565 pixels (16 bytes) per
// loop pass. At least one pass always runs, so pix is effectively rounded up
// to a multiple of 8.
// Every dword is masked down to 16 significant bits, so vpackusdw (unsigned
// saturation) narrows without clamping; vpermq then gathers the results of
// both 128 bit lanes into the low 16 bytes for a single store.
| __declspec(naked) |
| void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { |
| __asm { |
| mov eax, [esp + 4] // src_argb |
| mov edx, [esp + 8] // dst_rgb |
| mov ecx, [esp + 12] // pix |
| vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f |
| vpsrld ymm3, ymm3, 27 |
| vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 |
| vpsrld ymm4, ymm4, 26 |
| vpslld ymm4, ymm4, 5 |
| vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 |
| |
| convertloop: |
| vmovdqu ymm0, [eax] // fetch 8 pixels of argb |
| vpsrld ymm2, ymm0, 5 // G |
| vpsrld ymm1, ymm0, 3 // B |
| vpsrld ymm0, ymm0, 8 // R |
| vpand ymm2, ymm2, ymm4 // G |
| vpand ymm1, ymm1, ymm3 // B |
| vpand ymm0, ymm0, ymm5 // R |
| vpor ymm1, ymm1, ymm2 // BG |
| vpor ymm0, ymm0, ymm1 // BGR |
| vpackusdw ymm0, ymm0, ymm0 |
| vpermq ymm0, ymm0, 0xd8 |
| lea eax, [eax + 32] |
| vmovdqu [edx], xmm0 // store 8 pixels of RGB565 |
| lea edx, [edx + 16] |
| sub ecx, 8 |
| jg convertloop |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_ARGBTORGB565ROW_AVX2 |
| |
| #ifdef HAS_ARGBTOARGB1555ROW_AVX2 |
// Converts a row of 32-bit ARGB pixels to 16-bit ARGB1555: 5 bits for each
// colour channel plus the top bit of alpha (AVX2).
//   src_argb: source pixels, 4 bytes each; read with unaligned loads.
//   dst_rgb:  destination, 2 bytes per pixel.
//   pix:      pixel count. Each pass converts 8 pixels and the count is only
//             tested after the pass, so whole groups of 8 are always read and
//             written and at least one group is processed.
// Naked function: arguments are fetched from the stack by hand. vzeroupper is
// issued before returning.
| __declspec(naked) |
| void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { |
| __asm { |
| mov eax, [esp + 4] // src_argb |
| mov edx, [esp + 8] // dst_rgb |
| mov ecx, [esp + 12] // pix |
| vpcmpeqb ymm4, ymm4, ymm4 |
| vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f |
| vpslld ymm5, ymm4, 5 // generate mask 0x000003e0 |
| vpslld ymm6, ymm4, 10 // generate mask 0x00007c00 |
| vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000 |
| vpslld ymm7, ymm7, 15 |
| |
| convertloop: |
| vmovdqu ymm0, [eax] // fetch 8 pixels of argb |
| vpsrld ymm3, ymm0, 9 // R |
| vpsrld ymm2, ymm0, 6 // G |
| vpsrld ymm1, ymm0, 3 // B |
    // Arithmetic shift: the top alpha bit is replicated through bits 15..31,
    // so after masking with 0xffff8000 the dword is a sign-extended 16-bit
    // value and the signed pack below cannot clamp it.
| vpsrad ymm0, ymm0, 16 // A |
| vpand ymm3, ymm3, ymm6 // R |
| vpand ymm2, ymm2, ymm5 // G |
| vpand ymm1, ymm1, ymm4 // B |
| vpand ymm0, ymm0, ymm7 // A |
| vpor ymm0, ymm0, ymm1 // BA |
| vpor ymm2, ymm2, ymm3 // GR |
| vpor ymm0, ymm0, ymm2 // BGRA |
    // The pack works per 128-bit lane; vpermq 0xd8 gathers the two useful
    // quadwords into the low 128 bits for the 16-byte store.
| vpackssdw ymm0, ymm0, ymm0 |
| vpermq ymm0, ymm0, 0xd8 |
| lea eax, [eax + 32] |
| vmovdqu [edx], xmm0 // store 8 pixels of ARGB1555 |
| lea edx, [edx + 16] |
| sub ecx, 8 |
| jg convertloop |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_ARGBTOARGB1555ROW_AVX2 |
| |
| #ifdef HAS_ARGBTOARGB4444ROW_AVX2 |
// Converts a row of 32-bit ARGB pixels to 16-bit ARGB4444 by keeping the top
// four bits of every channel (AVX2).
//   src_argb: source pixels, 4 bytes each; read with unaligned loads.
//   dst_rgb:  destination, 2 bytes per pixel.
//   pix:      pixel count. Each pass converts 8 pixels and the count is only
//             tested after the pass, so whole groups of 8 are always read and
//             written and at least one group is processed.
// Naked function: arguments are fetched from the stack by hand. vzeroupper is
// issued before returning.
| __declspec(naked) |
| void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { |
| __asm { |
| mov eax, [esp + 4] // src_argb |
| mov edx, [esp + 8] // dst_rgb |
| mov ecx, [esp + 12] // pix |
    // The shifts are per 16-bit word, so every word of ymm4 is 0xf000 and
    // every word of ymm3 is 0x00f0.
| vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000 |
| vpsllw ymm4, ymm4, 12 |
| vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0 |
| |
| convertloop: |
| vmovdqu ymm0, [eax] // fetch 8 pixels of argb |
    // "high"/"low" name the position each nibble takes in the output byte.
| vpand ymm1, ymm0, ymm4 // high nibble |
| vpand ymm0, ymm0, ymm3 // low nibble |
| vpsrld ymm1, ymm1, 8 |
| vpsrld ymm0, ymm0, 4 |
| vpor ymm0, ymm0, ymm1 |
    // Every word is <= 0xff here, so the saturating pack is lossless. It
    // works per 128-bit lane; vpermq 0xd8 gathers the two useful quadwords
    // into the low 128 bits for the 16-byte store.
| vpackuswb ymm0, ymm0, ymm0 |
| vpermq ymm0, ymm0, 0xd8 |
| lea eax, [eax + 32] |
| vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444 |
| lea edx, [edx + 16] |
| sub ecx, 8 |
| jg convertloop |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_ARGBTOARGB4444ROW_AVX2 |
| |
| // Convert 16 ARGB pixels (64 bytes) to 16 Y values. |
//   src_argb: source pixels, 4 bytes each; read with unaligned loads.
//   dst_y:    destination, 1 byte per pixel; written with an unaligned store.
//   pix:      pixel count. Each pass handles 16 pixels and the count is only
//             tested after the pass, so whole groups of 16 are always read
//             and written and at least one group is processed.
// kARGBToY and kAddY16 are defined outside this block; presumably the
// per-channel luma weights and a +16 offset -- confirm at their definitions.
// Both are loaded with movdqa, which requires 16-byte alignment.
| __declspec(naked) |
| void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { |
| __asm { |
| mov eax, [esp + 4] /* src_argb */ |
| mov edx, [esp + 8] /* dst_y */ |
| mov ecx, [esp + 12] /* pix */ |
| movdqa xmm4, kARGBToY |
| movdqa xmm5, kAddY16 |
| |
| convertloop: |
| movdqu xmm0, [eax] |
| movdqu xmm1, [eax + 16] |
| movdqu xmm2, [eax + 32] |
| movdqu xmm3, [eax + 48] |
    // pmaddubsw: unsigned pixel bytes times signed coefficient bytes, with
    // adjacent products summed into 16-bit words (two words per pixel).
| pmaddubsw xmm0, xmm4 |
| pmaddubsw xmm1, xmm4 |
| pmaddubsw xmm2, xmm4 |
| pmaddubsw xmm3, xmm4 |
| lea eax, [eax + 64] |
    // phaddw adds each pixel's two words, leaving one 16-bit sum per pixel.
| phaddw xmm0, xmm1 |
| phaddw xmm2, xmm3 |
    // Scale the sums down by 128, narrow to bytes, then add the offset.
| psrlw xmm0, 7 |
| psrlw xmm2, 7 |
| packuswb xmm0, xmm2 |
| paddb xmm0, xmm5 |
| movdqu [edx], xmm0 |
| lea edx, [edx + 16] |
| sub ecx, 16 |
| jg convertloop |
| ret |
| } |
| } |
| |
| // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. |
| // Same as ARGBToYRow but different coefficients, no add 16, but do rounding. |
//   src_argb: source pixels, 4 bytes each; read with unaligned loads.
//   dst_y:    destination, 1 byte per pixel; written with an unaligned store.
//   pix:      pixel count. Each pass handles 16 pixels and the count is only
//             tested after the pass, so whole groups of 16 are always read
//             and written and at least one group is processed.
// kARGBToYJ and kAddYJ64 are defined outside this block; kAddYJ64 is
// presumably 64 per word (half of the 128 divisor) -- confirm at its
// definition.
| __declspec(naked) |
| void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { |
| __asm { |
| mov eax, [esp + 4] /* src_argb */ |
| mov edx, [esp + 8] /* dst_y */ |
| mov ecx, [esp + 12] /* pix */ |
| movdqa xmm4, kARGBToYJ |
| movdqa xmm5, kAddYJ64 |
| |
| convertloop: |
| movdqu xmm0, [eax] |
| movdqu xmm1, [eax + 16] |
| movdqu xmm2, [eax + 32] |
| movdqu xmm3, [eax + 48] |
    // pmaddubsw: unsigned pixel bytes times signed coefficient bytes, with
    // adjacent products summed into 16-bit words (two words per pixel).
| pmaddubsw xmm0, xmm4 |
| pmaddubsw xmm1, xmm4 |
| pmaddubsw xmm2, xmm4 |
| pmaddubsw xmm3, xmm4 |
| lea eax, [eax + 64] |
    // phaddw adds each pixel's two words, leaving one 16-bit sum per pixel.
| phaddw xmm0, xmm1 |
| phaddw xmm2, xmm3 |
| paddw xmm0, xmm5 // Add .5 for rounding. |
| paddw xmm2, xmm5 |
| psrlw xmm0, 7 |
| psrlw xmm2, 7 |
| packuswb xmm0, xmm2 |
| movdqu [edx], xmm0 |
| lea edx, [edx + 16] |
| sub ecx, 16 |
| jg convertloop |
| ret |
| } |
| } |
| |
| #ifdef HAS_ARGBTOYROW_AVX2 |
| // Dword index table for vpermd: puts results back in pixel order after the per-lane vphaddw + vpackuswb. |
// Output dword i of vpermd is taken from source dword kPermdARGBToY_AVX[i].
| static const lvec32 kPermdARGBToY_AVX = { |
| 0, 4, 1, 5, 2, 6, 3, 7 |
| }; |
| |
| // Convert 32 ARGB pixels (128 bytes) to 32 Y values. |
//   src_argb: source pixels, 4 bytes each; read with unaligned loads.
//   dst_y:    destination, 1 byte per pixel; written with an unaligned store.
//   pix:      pixel count. Each pass handles 32 pixels and the count is only
//             tested after the pass, so whole groups of 32 are always read
//             and written and at least one group is processed.
// kARGBToY and kAddY16 are defined outside this block; each 16-byte constant
// is broadcast to both 128-bit lanes. vzeroupper is issued before returning.
| __declspec(naked) |
| void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { |
| __asm { |
| mov eax, [esp + 4] /* src_argb */ |
| mov edx, [esp + 8] /* dst_y */ |
| mov ecx, [esp + 12] /* pix */ |
| vbroadcastf128 ymm4, kARGBToY |
| vbroadcastf128 ymm5, kAddY16 |
| vmovdqu ymm6, kPermdARGBToY_AVX |
| |
| convertloop: |
| vmovdqu ymm0, [eax] |
| vmovdqu ymm1, [eax + 32] |
| vmovdqu ymm2, [eax + 64] |
| vmovdqu ymm3, [eax + 96] |
    // Unsigned pixel bytes times signed coefficient bytes, with adjacent
    // products summed into 16-bit words (two words per pixel).
| vpmaddubsw ymm0, ymm0, ymm4 |
| vpmaddubsw ymm1, ymm1, ymm4 |
| vpmaddubsw ymm2, ymm2, ymm4 |
| vpmaddubsw ymm3, ymm3, ymm4 |
| lea eax, [eax + 128] |
    // "mutates": vphaddw and vpackuswb work within each 128-bit lane, so the
    // results come out of pixel order; the vpermd below restores the order.
| vphaddw ymm0, ymm0, ymm1 // mutates. |
| vphaddw ymm2, ymm2, ymm3 |
| vpsrlw ymm0, ymm0, 7 |
| vpsrlw ymm2, ymm2, 7 |
| vpackuswb ymm0, ymm0, ymm2 // mutates. |
| vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation. |
| vpaddb ymm0, ymm0, ymm5 // add 16 for Y |
| vmovdqu [edx], ymm0 |
| lea edx, [edx + 32] |
| sub ecx, 32 |
| jg convertloop |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_ARGBTOYROW_AVX2 |
| |
| #ifdef HAS_ARGBTOYJROW_AVX2 |
| // Convert 32 ARGB pixels (128 bytes) to 32 YJ values. |
// Same pipeline as ARGBToYRow_AVX2 but with the kARGBToYJ coefficients, a
// rounding add before the shift, and no offset added afterwards.
//   src_argb: source pixels, 4 bytes each; read with unaligned loads.
//   dst_y:    destination, 1 byte per pixel; written with an unaligned store.
//   pix:      pixel count. Each pass handles 32 pixels and the count is only
//             tested after the pass, so whole groups of 32 are always read
//             and written and at least one group is processed.
// kARGBToYJ and kAddYJ64 are defined outside this block; each 16-byte
// constant is broadcast to both 128-bit lanes.
| __declspec(naked) |
| void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { |
| __asm { |
| mov eax, [esp + 4] /* src_argb */ |
| mov edx, [esp + 8] /* dst_y */ |
| mov ecx, [esp + 12] /* pix */ |
| vbroadcastf128 ymm4, kARGBToYJ |
| vbroadcastf128 ymm5, kAddYJ64 |
| vmovdqu ymm6, kPermdARGBToY_AVX |
| |
| convertloop: |
| vmovdqu ymm0, [eax] |
| vmovdqu ymm1, [eax + 32] |
| vmovdqu ymm2, [eax + 64] |
| vmovdqu ymm3, [eax + 96] |
    // Unsigned pixel bytes times signed coefficient bytes, with adjacent
    // products summed into 16-bit words (two words per pixel).
| vpmaddubsw ymm0, ymm0, ymm4 |
| vpmaddubsw ymm1, ymm1, ymm4 |
| vpmaddubsw ymm2, ymm2, ymm4 |
| vpmaddubsw ymm3, ymm3, ymm4 |
| lea eax, [eax + 128] |
    // "mutates": vphaddw and vpackuswb work within each 128-bit lane, so the
    // results come out of pixel order; the vpermd below restores the order.
| vphaddw ymm0, ymm0, ymm1 // mutates. |
| vphaddw ymm2, ymm2, ymm3 |
| vpaddw ymm0, ymm0, ymm5 // Add .5 for rounding. |
| vpaddw ymm2, ymm2, ymm5 |
| vpsrlw ymm0, ymm0, 7 |
| vpsrlw ymm2, ymm2, 7 |
| vpackuswb ymm0, ymm0, ymm2 // mutates. |
| vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation. |
| vmovdqu [edx], ymm0 |
| lea edx, [edx + 32] |
| sub ecx, 32 |
| jg convertloop |
| |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_ARGBTOYJROW_AVX2 |
| |
// Converts 16 BGRA pixels (64 bytes) per pass to 16 Y values (SSSE3). The
// instruction sequence is the same as ARGBToYRow_SSSE3; only the coefficient
// vector (kBGRAToY, defined outside this block) differs.
//   src_argb: source pixels, 4 bytes each; read with unaligned loads.
//   dst_y:    destination, 1 byte per pixel; written with an unaligned store.
//   pix:      pixel count. The count is only tested after each 16-pixel
//             pass, so whole groups of 16 are always read and written.
| __declspec(naked) |
| void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { |
| __asm { |
| mov eax, [esp + 4] /* src_argb */ |
| mov edx, [esp + 8] /* dst_y */ |
| mov ecx, [esp + 12] /* pix */ |
| movdqa xmm4, kBGRAToY |
| movdqa xmm5, kAddY16 |
| |
| convertloop: |
| movdqu xmm0, [eax] |
| movdqu xmm1, [eax + 16] |
| movdqu xmm2, [eax + 32] |
| movdqu xmm3, [eax + 48] |
    // Unsigned pixel bytes times signed coefficient bytes, with adjacent
    // products summed into 16-bit words (two words per pixel).
| pmaddubsw xmm0, xmm4 |
| pmaddubsw xmm1, xmm4 |
| pmaddubsw xmm2, xmm4 |
| pmaddubsw xmm3, xmm4 |
| lea eax, [eax + 64] |
    // phaddw adds each pixel's two words, leaving one 16-bit sum per pixel.
| phaddw xmm0, xmm1 |
| phaddw xmm2, xmm3 |
    // Scale the sums down by 128, narrow to bytes, then add the offset.
| psrlw xmm0, 7 |
| psrlw xmm2, 7 |
| packuswb xmm0, xmm2 |
| paddb xmm0, xmm5 |
| movdqu [edx], xmm0 |
| lea edx, [edx + 16] |
| sub ecx, 16 |
| jg convertloop |
| ret |
| } |
| } |
| |
// Converts 16 ABGR pixels (64 bytes) per pass to 16 Y values (SSSE3). The
// instruction sequence is the same as ARGBToYRow_SSSE3; only the coefficient
// vector (kABGRToY, defined outside this block) differs.
//   src_argb: source pixels, 4 bytes each; read with unaligned loads.
//   dst_y:    destination, 1 byte per pixel; written with an unaligned store.
//   pix:      pixel count. The count is only tested after each 16-pixel
//             pass, so whole groups of 16 are always read and written.
| __declspec(naked) |
| void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { |
| __asm { |
| mov eax, [esp + 4] /* src_argb */ |
| mov edx, [esp + 8] /* dst_y */ |
| mov ecx, [esp + 12] /* pix */ |
| movdqa xmm4, kABGRToY |
| movdqa xmm5, kAddY16 |
| |
| convertloop: |
| movdqu xmm0, [eax] |
| movdqu xmm1, [eax + 16] |
| movdqu xmm2, [eax + 32] |
| movdqu xmm3, [eax + 48] |
    // Unsigned pixel bytes times signed coefficient bytes, with adjacent
    // products summed into 16-bit words (two words per pixel).
| pmaddubsw xmm0, xmm4 |
| pmaddubsw xmm1, xmm4 |
| pmaddubsw xmm2, xmm4 |
| pmaddubsw xmm3, xmm4 |
| lea eax, [eax + 64] |
    // phaddw adds each pixel's two words, leaving one 16-bit sum per pixel.
| phaddw xmm0, xmm1 |
| phaddw xmm2, xmm3 |
    // Scale the sums down by 128, narrow to bytes, then add the offset.
| psrlw xmm0, 7 |
| psrlw xmm2, 7 |
| packuswb xmm0, xmm2 |
| paddb xmm0, xmm5 |
| movdqu [edx], xmm0 |
| lea edx, [edx + 16] |
| sub ecx, 16 |
| jg convertloop |
| ret |
| } |
| } |
| |
// Converts 16 RGBA pixels (64 bytes) per pass to 16 Y values (SSSE3). The
// instruction sequence is the same as ARGBToYRow_SSSE3; only the coefficient
// vector (kRGBAToY, defined outside this block) differs.
//   src_argb: source pixels, 4 bytes each; read with unaligned loads.
//   dst_y:    destination, 1 byte per pixel; written with an unaligned store.
//   pix:      pixel count. The count is only tested after each 16-pixel
//             pass, so whole groups of 16 are always read and written.
| __declspec(naked) |
| void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { |
| __asm { |
| mov eax, [esp + 4] /* src_argb */ |
| mov edx, [esp + 8] /* dst_y */ |
| mov ecx, [esp + 12] /* pix */ |
| movdqa xmm4, kRGBAToY |
| movdqa xmm5, kAddY16 |
| |
| convertloop: |
| movdqu xmm0, [eax] |
| movdqu xmm1, [eax + 16] |
| movdqu xmm2, [eax + 32] |
| movdqu xmm3, [eax + 48] |
    // Unsigned pixel bytes times signed coefficient bytes, with adjacent
    // products summed into 16-bit words (two words per pixel).
| pmaddubsw xmm0, xmm4 |
| pmaddubsw xmm1, xmm4 |
| pmaddubsw xmm2, xmm4 |
| pmaddubsw xmm3, xmm4 |
| lea eax, [eax + 64] |
    // phaddw adds each pixel's two words, leaving one 16-bit sum per pixel.
| phaddw xmm0, xmm1 |
| phaddw xmm2, xmm3 |
    // Scale the sums down by 128, narrow to bytes, then add the offset.
| psrlw xmm0, 7 |
| psrlw xmm2, 7 |
| packuswb xmm0, xmm2 |
| paddb xmm0, xmm5 |
| movdqu [edx], xmm0 |
| lea edx, [edx + 16] |
| sub ecx, 16 |
| jg convertloop |
| ret |
| } |
| } |
| |
// Produces 2x2-subsampled chroma from two rows of ARGB: every U and V output
// byte is derived from a 2x2 block of source pixels (SSSE3).
//   src_argb0:       first source row; read with unaligned loads.
//   src_stride_argb: byte offset from the first row to the second row.
//   dst_u, dst_v:    chroma outputs, one byte each per two source pixels.
//   width:           source pixel count. Each pass consumes 16 pixels from
//                    each row and writes 8 U and 8 V bytes; the count is
//                    only tested after the pass.
// esi and edi are pushed on entry and popped before ret. The coefficient and
// offset vectors (kARGBToU, kARGBToV, kAddUV128) are defined outside this
// block.
| __declspec(naked) |
| void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, |
| uint8* dst_u, uint8* dst_v, int width) { |
| __asm { |
| push esi |
| push edi |
| mov eax, [esp + 8 + 4] // src_argb0 |
| mov esi, [esp + 8 + 8] // src_stride_argb |
| mov edx, [esp + 8 + 12] // dst_u |
| mov edi, [esp + 8 + 16] // dst_v |
| mov ecx, [esp + 8 + 20] // width |
| movdqa xmm5, kAddUV128 |
| movdqa xmm6, kARGBToV |
| movdqa xmm7, kARGBToU |
    // edi becomes dst_v - dst_u so a single pointer (edx) addresses both.
| sub edi, edx // stride from u to v |
| |
| convertloop: |
| /* step 1 - subsample 16x2 argb pixels to 8x1 */ |
    // Vertical step: byte-wise rounded average of the two rows.
| movdqu xmm0, [eax] |
| movdqu xmm4, [eax + esi] |
| pavgb xmm0, xmm4 |
| movdqu xmm1, [eax + 16] |
| movdqu xmm4, [eax + esi + 16] |
| pavgb xmm1, xmm4 |
| movdqu xmm2, [eax + 32] |
| movdqu xmm4, [eax + esi + 32] |
| pavgb xmm2, xmm4 |
| movdqu xmm3, [eax + 48] |
| movdqu xmm4, [eax + esi + 48] |
| pavgb xmm3, xmm4 |
| |
| lea eax, [eax + 64] |
    // Horizontal step: shufps 0x88 gathers the even pixels and 0xdd the odd
    // pixels of each register pair; pavgb then averages the neighbours.
| movdqa xmm4, xmm0 |
| shufps xmm0, xmm1, 0x88 |
| shufps xmm4, xmm1, 0xdd |
| pavgb xmm0, xmm4 |
| movdqa xmm4, xmm2 |
| shufps xmm2, xmm3, 0x88 |
| shufps xmm4, xmm3, 0xdd |
| pavgb xmm2, xmm4 |
| |
| // step 2 - convert to U and V |
| // from here down is very similar to Y code except |
| // instead of 16 different pixels, it's 8 pixels of U and 8 of V |
| movdqa xmm1, xmm0 |
| movdqa xmm3, xmm2 |
| pmaddubsw xmm0, xmm7 // U |
| pmaddubsw xmm2, xmm7 |
| pmaddubsw xmm1, xmm6 // V |
| pmaddubsw xmm3, xmm6 |
| phaddw xmm0, xmm2 |
| phaddw xmm1, xmm3 |
    // Arithmetic shift and signed pack keep the sign of each chroma sum.
| psraw xmm0, 8 |
| psraw xmm1, 8 |
| packsswb xmm0, xmm1 |
| paddb xmm0, xmm5 // -> unsigned |
| |
| // step 3 - store 8 U and 8 V values |
| movlps qword ptr [edx], xmm0 // U |
| movhps qword ptr [edx + edi], xmm0 // V |
| lea edx, [edx + 8] |
| sub ecx, 16 |
| jg convertloop |
| |
| pop edi |
| pop esi |
| ret |
| } |
| } |
| |
// Produces 2x2-subsampled chroma from two rows of ARGB using the "J"
// coefficient set (SSSE3). It differs from ARGBToUVRow_SSSE3 in the
// constants and in where the bias is applied: kAddUVJ128 is added to the
// 16-bit sums before the shift instead of to the packed bytes afterwards.
//   src_argb0:       first source row; read with unaligned loads.
//   src_stride_argb: byte offset from the first row to the second row.
//   dst_u, dst_v:    chroma outputs, one byte each per two source pixels.
//   width:           source pixel count. Each pass consumes 16 pixels from
//                    each row and writes 8 U and 8 V bytes; the count is
//                    only tested after the pass.
// esi and edi are pushed on entry and popped before ret. kARGBToUJ,
// kARGBToVJ and kAddUVJ128 are defined outside this block.
| __declspec(naked) |
| void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, |
| uint8* dst_u, uint8* dst_v, int width) { |
| __asm { |
| push esi |
| push edi |
| mov eax, [esp + 8 + 4] // src_argb0 |
| mov esi, [esp + 8 + 8] // src_stride_argb |
| mov edx, [esp + 8 + 12] // dst_u |
| mov edi, [esp + 8 + 16] // dst_v |
| mov ecx, [esp + 8 + 20] // width |
| movdqa xmm5, kAddUVJ128 |
| movdqa xmm6, kARGBToVJ |
| movdqa xmm7, kARGBToUJ |
    // edi becomes dst_v - dst_u so a single pointer (edx) addresses both.
| sub edi, edx // stride from u to v |
| |
| convertloop: |
| /* step 1 - subsample 16x2 argb pixels to 8x1 */ |
    // Vertical step: byte-wise rounded average of the two rows.
| movdqu xmm0, [eax] |
| movdqu xmm4, [eax + esi] |
| pavgb xmm0, xmm4 |
| movdqu xmm1, [eax + 16] |
| movdqu xmm4, [eax + esi + 16] |
| pavgb xmm1, xmm4 |
| movdqu xmm2, [eax + 32] |
| movdqu xmm4, [eax + esi + 32] |
| pavgb xmm2, xmm4 |
| movdqu xmm3, [eax + 48] |
| movdqu xmm4, [eax + esi + 48] |
| pavgb xmm3, xmm4 |
| |
| lea eax, [eax + 64] |
    // Horizontal step: shufps 0x88 gathers the even pixels and 0xdd the odd
    // pixels of each register pair; pavgb then averages the neighbours.
| movdqa xmm4, xmm0 |
| shufps xmm0, xmm1, 0x88 |
| shufps xmm4, xmm1, 0xdd |
| pavgb xmm0, xmm4 |
| movdqa xmm4, xmm2 |
| shufps xmm2, xmm3, 0x88 |
| shufps xmm4, xmm3, 0xdd |
| pavgb xmm2, xmm4 |
| |
| // step 2 - convert to U and V |
| // from here down is very similar to Y code except |
| // instead of 16 different pixels, it's 8 pixels of U and 8 of V |
| movdqa xmm1, xmm0 |
| movdqa xmm3, xmm2 |
| pmaddubsw xmm0, xmm7 // U |
| pmaddubsw xmm2, xmm7 |
| pmaddubsw xmm1, xmm6 // V |
| pmaddubsw xmm3, xmm6 |
| phaddw xmm0, xmm2 |
| phaddw xmm1, xmm3 |
| paddw xmm0, xmm5 // +.5 rounding -> unsigned |
| paddw xmm1, xmm5 |
| psraw xmm0, 8 |
| psraw xmm1, 8 |
| packsswb xmm0, xmm1 |
| |
| // step 3 - store 8 U and 8 V values |
| movlps qword ptr [edx], xmm0 // U |
| movhps qword ptr [edx + edi], xmm0 // V |
| lea edx, [edx + 8] |
| sub ecx, 16 |
| jg convertloop |
| |
| pop edi |
| pop esi |
| ret |
| } |
| } |
| |
| #ifdef HAS_ARGBTOUVROW_AVX2 |
// Produces 2x2-subsampled chroma from two rows of ARGB (AVX2).
//   src_argb0:       first source row; read with unaligned loads.
//   src_stride_argb: byte offset from the first row to the second row.
//   dst_u, dst_v:    chroma outputs, one byte each per two source pixels.
//   width:           source pixel count. Each pass consumes 32 pixels from
//                    each row and writes 16 U and 16 V bytes; the count is
//                    only tested after the pass.
// esi and edi are pushed on entry and popped before ret; vzeroupper is issued
// before returning. kAddUV128, kARGBToV, kARGBToU and kShufARGBToUV_AVX are
// defined outside this block.
| __declspec(naked) |
| void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, |
| uint8* dst_u, uint8* dst_v, int width) { |
| __asm { |
| push esi |
| push edi |
| mov eax, [esp + 8 + 4] // src_argb0 |
| mov esi, [esp + 8 + 8] // src_stride_argb |
| mov edx, [esp + 8 + 12] // dst_u |
| mov edi, [esp + 8 + 16] // dst_v |
| mov ecx, [esp + 8 + 20] // width |
| vbroadcastf128 ymm5, kAddUV128 |
| vbroadcastf128 ymm6, kARGBToV |
| vbroadcastf128 ymm7, kARGBToU |
    // edi becomes dst_v - dst_u so a single pointer (edx) addresses both.
| sub edi, edx // stride from u to v |
| |
| convertloop: |
| /* step 1 - subsample 32x2 argb pixels to 16x1 */ |
| vmovdqu ymm0, [eax] |
| vmovdqu ymm1, [eax + 32] |
| vmovdqu ymm2, [eax + 64] |
| vmovdqu ymm3, [eax + 96] |
    // Vertical step: rounded byte average against the second row, taken
    // straight from memory.
| vpavgb ymm0, ymm0, [eax + esi] |
| vpavgb ymm1, ymm1, [eax + esi + 32] |
| vpavgb ymm2, ymm2, [eax + esi + 64] |
| vpavgb ymm3, ymm3, [eax + esi + 96] |
| lea eax, [eax + 128] |
    // Horizontal step: 0x88 gathers even pixels, 0xdd odd pixels, per
    // 128-bit lane; the lane-local ordering is repaired after the pack.
| vshufps ymm4, ymm0, ymm1, 0x88 |
| vshufps ymm0, ymm0, ymm1, 0xdd |
| vpavgb ymm0, ymm0, ymm4 // mutated by vshufps |
| vshufps ymm4, ymm2, ymm3, 0x88 |
| vshufps ymm2, ymm2, ymm3, 0xdd |
| vpavgb ymm2, ymm2, ymm4 // mutated by vshufps |
| |
| // step 2 - convert to U and V |
| // from here down is very similar to Y code except |
| // instead of 32 different pixels, it's 16 pixels of U and 16 of V |
| vpmaddubsw ymm1, ymm0, ymm7 // U |
| vpmaddubsw ymm3, ymm2, ymm7 |
| vpmaddubsw ymm0, ymm0, ymm6 // V |
| vpmaddubsw ymm2, ymm2, ymm6 |
| vphaddw ymm1, ymm1, ymm3 // mutates |
| vphaddw ymm0, ymm0, ymm2 |
| vpsraw ymm1, ymm1, 8 |
| vpsraw ymm0, ymm0, 8 |
| vpacksswb ymm0, ymm1, ymm0 // mutates |
| vpermq ymm0, ymm0, 0xd8 // For vpacksswb |
| vpshufb ymm0, ymm0, kShufARGBToUV_AVX // For vshufps + vphaddw |
| vpaddb ymm0, ymm0, ymm5 // -> unsigned |
| |
| // step 3 - store 16 U and 16 V values |
| vextractf128 [edx], ymm0, 0 // U |
| vextractf128 [edx + edi], ymm0, 1 // V |
| lea edx, [edx + 16] |
| sub ecx, 32 |
| jg convertloop |
| |
| pop edi |
| pop esi |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_ARGBTOUVROW_AVX2 |
| |
// Produces full-resolution chroma from one row of ARGB: one U and one V byte
// per source pixel, with no subsampling (SSSE3).
//   src_argb0:    source pixels, 4 bytes each; read with unaligned loads.
//   dst_u, dst_v: chroma outputs, one byte each per source pixel.
//   width:        source pixel count. Each pass handles 16 pixels and the
//                 count is only tested after the pass.
// The same 64 source bytes are loaded twice per pass, once for U and once for
// V. edi is pushed on entry and popped before ret. kAddUV128, kARGBToV and
// kARGBToU are defined outside this block.
| __declspec(naked) |
| void ARGBToUV444Row_SSSE3(const uint8* src_argb0, |
| uint8* dst_u, uint8* dst_v, int width) { |
| __asm { |
| push edi |
| mov eax, [esp + 4 + 4] // src_argb0 |
| mov edx, [esp + 4 + 8] // dst_u |
| mov edi, [esp + 4 + 12] // dst_v |
| mov ecx, [esp + 4 + 16] // width |
| movdqa xmm5, kAddUV128 |
| movdqa xmm6, kARGBToV |
| movdqa xmm7, kARGBToU |
    // edi becomes dst_v - dst_u so a single pointer (edx) addresses both.
| sub edi, edx // stride from u to v |
| |
| convertloop: |
| /* convert to U and V */ |
| movdqu xmm0, [eax] // U |
| movdqu xmm1, [eax + 16] |
| movdqu xmm2, [eax + 32] |
| movdqu xmm3, [eax + 48] |
| pmaddubsw xmm0, xmm7 |
| pmaddubsw xmm1, xmm7 |
| pmaddubsw xmm2, xmm7 |
| pmaddubsw xmm3, xmm7 |
| phaddw xmm0, xmm1 |
| phaddw xmm2, xmm3 |
    // Arithmetic shift and signed pack keep the sign of each chroma sum;
    // the final add moves the result into the unsigned byte range.
| psraw xmm0, 8 |
| psraw xmm2, 8 |
| packsswb xmm0, xmm2 |
| paddb xmm0, xmm5 |
| movdqu [edx], xmm0 |
| |
| movdqu xmm0, [eax] // V |
| movdqu xmm1, [eax + 16] |
| movdqu xmm2, [eax + 32] |
| movdqu xmm3, [eax + 48] |
| pmaddubsw xmm0, xmm6 |
| pmaddubsw xmm1, xmm6 |
| pmaddubsw xmm2, xmm6 |
| pmaddubsw xmm3, xmm6 |
| phaddw xmm0, xmm1 |
| phaddw xmm2, xmm3 |
| psraw xmm0, 8 |
| psraw xmm2, 8 |
| packsswb xmm0, xmm2 |
| paddb xmm0, xmm5 |
| lea eax, [eax + 64] |
| movdqu [edx + edi], xmm0 |
| lea edx, [edx + 16] |
| sub ecx, 16 |
| jg convertloop |
| |
| pop edi |
| ret |
| } |
| } |
| |
// Produces horizontally subsampled chroma from one row of ARGB: one U and one
// V byte per pair of adjacent source pixels (SSSE3). Only a single source row
// is read; there is no vertical averaging.
//   src_argb0:    source pixels, 4 bytes each; read with unaligned loads.
//   dst_u, dst_v: chroma outputs, one byte each per two source pixels.
//   width:        source pixel count. Each pass consumes 16 pixels and writes
//                 8 U and 8 V bytes; the count is only tested after the pass.
// edi is pushed on entry and popped before ret. kAddUV128, kARGBToV and
// kARGBToU are defined outside this block.
| __declspec(naked) |
| void ARGBToUV422Row_SSSE3(const uint8* src_argb0, |
| uint8* dst_u, uint8* dst_v, int width) { |
| __asm { |
| push edi |
| mov eax, [esp + 4 + 4] // src_argb0 |
| mov edx, [esp + 4 + 8] // dst_u |
| mov edi, [esp + 4 + 12] // dst_v |
| mov ecx, [esp + 4 + 16] // width |
| movdqa xmm5, kAddUV128 |
| movdqa xmm6, kARGBToV |
| movdqa xmm7, kARGBToU |
    // edi becomes dst_v - dst_u so a single pointer (edx) addresses both.
| sub edi, edx // stride from u to v |
| |
| convertloop: |
| /* step 1 - subsample 16x1 argb pixels to 8x1 */ |
| movdqu xmm0, [eax] |
| movdqu xmm1, [eax + 16] |
| movdqu xmm2, [eax + 32] |
| movdqu xmm3, [eax + 48] |
| lea eax, [eax + 64] |
    // shufps 0x88 gathers the even pixels and 0xdd the odd pixels of each
    // register pair; pavgb then averages the horizontal neighbours.
| movdqa xmm4, xmm0 |
| shufps xmm0, xmm1, 0x88 |
| shufps xmm4, xmm1, 0xdd |
| pavgb xmm0, xmm4 |
| movdqa xmm4, xmm2 |
| shufps xmm2, xmm3, 0x88 |
| shufps xmm4, xmm3, 0xdd |
| pavgb xmm2, xmm4 |
| |
| // step 2 - convert to U and V |
| // from here down is very similar to Y code except |
| // instead of 16 different pixels, it's 8 pixels of U and 8 of V |
| movdqa xmm1, xmm0 |
| movdqa xmm3, xmm2 |
| pmaddubsw xmm0, xmm7 // U |
| pmaddubsw xmm2, xmm7 |
| pmaddubsw xmm1, xmm6 // V |
| pmaddubsw xmm3, xmm6 |
| phaddw xmm0, xmm2 |
| phaddw xmm1, xmm3 |
| psraw xmm0, 8 |
| psraw xmm1, 8 |
| packsswb xmm0, xmm1 |
| paddb xmm0, xmm5 // -> unsigned |
| |
| // step 3 - store 8 U and 8 V values |
| movlps qword ptr [edx], xmm0 // U |
| movhps qword ptr [edx + edi], xmm0 // V |
| lea edx, [edx + 8] |
| sub ecx, 16 |
| jg convertloop |
| |
| pop edi |
| ret |
| } |
| } |
| |
// Produces 2x2-subsampled chroma from two rows of BGRA (SSSE3). The
// instruction sequence is the same as ARGBToUVRow_SSSE3; only the coefficient
// vectors (kBGRAToU, kBGRAToV, defined outside this block) differ.
//   src_argb0:       first source row; read with unaligned loads.
//   src_stride_argb: byte offset from the first row to the second row.
//   dst_u, dst_v:    chroma outputs, one byte each per two source pixels.
//   width:           source pixel count. Each pass consumes 16 pixels from
//                    each row and writes 8 U and 8 V bytes; the count is
//                    only tested after the pass.
// esi and edi are pushed on entry and popped before ret.
| __declspec(naked) |
| void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, |
| uint8* dst_u, uint8* dst_v, int width) { |
| __asm { |
| push esi |
| push edi |
| mov eax, [esp + 8 + 4] // src_argb0 |
| mov esi, [esp + 8 + 8] // src_stride_argb |
| mov edx, [esp + 8 + 12] // dst_u |
| mov edi, [esp + 8 + 16] // dst_v |
| mov ecx, [esp + 8 + 20] // width |
| movdqa xmm5, kAddUV128 |
| movdqa xmm6, kBGRAToV |
| movdqa xmm7, kBGRAToU |
    // edi becomes dst_v - dst_u so a single pointer (edx) addresses both.
| sub edi, edx // stride from u to v |
| |
| convertloop: |
| /* step 1 - subsample 16x2 argb pixels to 8x1 */ |
    // Vertical step: byte-wise rounded average of the two rows.
| movdqu xmm0, [eax] |
| movdqu xmm4, [eax + esi] |
| pavgb xmm0, xmm4 |
| movdqu xmm1, [eax + 16] |
| movdqu xmm4, [eax + esi + 16] |
| pavgb xmm1, xmm4 |
| movdqu xmm2, [eax + 32] |
| movdqu xmm4, [eax + esi + 32] |
| pavgb xmm2, xmm4 |
| movdqu xmm3, [eax + 48] |
| movdqu xmm4, [eax + esi + 48] |
| pavgb xmm3, xmm4 |
| |
| lea eax, [eax + 64] |
    // Horizontal step: shufps 0x88 gathers the even pixels and 0xdd the odd
    // pixels of each register pair; pavgb then averages the neighbours.
| movdqa xmm4, xmm0 |
| shufps xmm0, xmm1, 0x88 |
| shufps xmm4, xmm1, 0xdd |
| pavgb xmm0, xmm4 |
| movdqa xmm4, xmm2 |
| shufps xmm2, xmm3, 0x88 |
| shufps xmm4, xmm3, 0xdd |
| pavgb xmm2, xmm4 |
| |
| // step 2 - convert to U and V |
| // from here down is very similar to Y code except |
| // instead of 16 different pixels, it's 8 pixels of U and 8 of V |
| movdqa xmm1, xmm0 |
| movdqa xmm3, xmm2 |
| pmaddubsw xmm0, xmm7 // U |
| pmaddubsw xmm2, xmm7 |
| pmaddubsw xmm1, xmm6 // V |
| pmaddubsw xmm3, xmm6 |
| phaddw xmm0, xmm2 |
| phaddw xmm1, xmm3 |
| psraw xmm0, 8 |
| psraw xmm1, 8 |
| packsswb xmm0, xmm1 |
| paddb xmm0, xmm5 // -> unsigned |
| |
| // step 3 - store 8 U and 8 V values |
| movlps qword ptr [edx], xmm0 // U |
| movhps qword ptr [edx + edi], xmm0 // V |
| lea edx, [edx + 8] |
| sub ecx, 16 |
| jg convertloop |
| |
| pop edi |
| pop esi |
| ret |
| } |
| } |
| |
// Produces 2x2-subsampled chroma from two rows of ABGR (SSSE3). The
// instruction sequence is the same as ARGBToUVRow_SSSE3; only the coefficient
// vectors (kABGRToU, kABGRToV, defined outside this block) differ.
//   src_argb0:       first source row; read with unaligned loads.
//   src_stride_argb: byte offset from the first row to the second row.
//   dst_u, dst_v:    chroma outputs, one byte each per two source pixels.
//   width:           source pixel count. Each pass consumes 16 pixels from
//                    each row and writes 8 U and 8 V bytes; the count is
//                    only tested after the pass.
// esi and edi are pushed on entry and popped before ret.
| __declspec(naked) |
| void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, |
| uint8* dst_u, uint8* dst_v, int width) { |
| __asm { |
| push esi |
| push edi |
| mov eax, [esp + 8 + 4] // src_argb0 |
| mov esi, [esp + 8 + 8] // src_stride_argb |
| mov edx, [esp + 8 + 12] // dst_u |
| mov edi, [esp + 8 + 16] // dst_v |
| mov ecx, [esp + 8 + 20] // width |
| movdqa xmm5, kAddUV128 |
| movdqa xmm6, kABGRToV |
| movdqa xmm7, kABGRToU |
    // edi becomes dst_v - dst_u so a single pointer (edx) addresses both.
| sub edi, edx // stride from u to v |
| |
| convertloop: |
| /* step 1 - subsample 16x2 argb pixels to 8x1 */ |
    // Vertical step: byte-wise rounded average of the two rows.
| movdqu xmm0, [eax] |
| movdqu xmm4, [eax + esi] |
| pavgb xmm0, xmm4 |
| movdqu xmm1, [eax + 16] |
| movdqu xmm4, [eax + esi + 16] |
| pavgb xmm1, xmm4 |
| movdqu xmm2, [eax + 32] |
| movdqu xmm4, [eax + esi + 32] |
| pavgb xmm2, xmm4 |
| movdqu xmm3, [eax + 48] |
| movdqu xmm4, [eax + esi + 48] |
| pavgb xmm3, xmm4 |
| |
| lea eax, [eax + 64] |
    // Horizontal step: shufps 0x88 gathers the even pixels and 0xdd the odd
    // pixels of each register pair; pavgb then averages the neighbours.
| movdqa xmm4, xmm0 |
| shufps xmm0, xmm1, 0x88 |
| shufps xmm4, xmm1, 0xdd |
| pavgb xmm0, xmm4 |
| movdqa xmm4, xmm2 |
| shufps xmm2, xmm3, 0x88 |
| shufps xmm4, xmm3, 0xdd |
| pavgb xmm2, xmm4 |
| |
| // step 2 - convert to U and V |
| // from here down is very similar to Y code except |
| // instead of 16 different pixels, it's 8 pixels of U and 8 of V |
| movdqa xmm1, xmm0 |
| movdqa xmm3, xmm2 |
| pmaddubsw xmm0, xmm7 // U |
| pmaddubsw xmm2, xmm7 |
| pmaddubsw xmm1, xmm6 // V |
| pmaddubsw xmm3, xmm6 |
| phaddw xmm0, xmm2 |
| phaddw xmm1, xmm3 |
| psraw xmm0, 8 |
| psraw xmm1, 8 |
| packsswb xmm0, xmm1 |
| paddb xmm0, xmm5 // -> unsigned |
| |
| // step 3 - store 8 U and 8 V values |
| movlps qword ptr [edx], xmm0 // U |
| movhps qword ptr [edx + edi], xmm0 // V |
| lea edx, [edx + 8] |
| sub ecx, 16 |
| jg convertloop |
| |
| pop edi |
| pop esi |
| ret |
| } |
| } |
| |
// Produces 2x2-subsampled chroma from two rows of RGBA (SSSE3). The
// instruction sequence is the same as ARGBToUVRow_SSSE3; only the coefficient
// vectors (kRGBAToU, kRGBAToV, defined outside this block) differ.
//   src_argb0:       first source row; read with unaligned loads.
//   src_stride_argb: byte offset from the first row to the second row.
//   dst_u, dst_v:    chroma outputs, one byte each per two source pixels.
//   width:           source pixel count. Each pass consumes 16 pixels from
//                    each row and writes 8 U and 8 V bytes; the count is
//                    only tested after the pass.
// esi and edi are pushed on entry and popped before ret.
| __declspec(naked) |
| void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, |
| uint8* dst_u, uint8* dst_v, int width) { |
| __asm { |
| push esi |
| push edi |
| mov eax, [esp + 8 + 4] // src_argb0 |
| mov esi, [esp + 8 + 8] // src_stride_argb |
| mov edx, [esp + 8 + 12] // dst_u |
| mov edi, [esp + 8 + 16] // dst_v |
| mov ecx, [esp + 8 + 20] // width |
| movdqa xmm5, kAddUV128 |
| movdqa xmm6, kRGBAToV |
| movdqa xmm7, kRGBAToU |
    // edi becomes dst_v - dst_u so a single pointer (edx) addresses both.
| sub edi, edx // stride from u to v |
| |
| convertloop: |
| /* step 1 - subsample 16x2 argb pixels to 8x1 */ |
    // Vertical step: byte-wise rounded average of the two rows.
| movdqu xmm0, [eax] |
| movdqu xmm4, [eax + esi] |
| pavgb xmm0, xmm4 |
| movdqu xmm1, [eax + 16] |
| movdqu xmm4, [eax + esi + 16] |
| pavgb xmm1, xmm4 |
| movdqu xmm2, [eax + 32] |
| movdqu xmm4, [eax + esi + 32] |
| pavgb xmm2, xmm4 |
| movdqu xmm3, [eax + 48] |
| movdqu xmm4, [eax + esi + 48] |
| pavgb xmm3, xmm4 |
| |
| lea eax, [eax + 64] |
    // Horizontal step: shufps 0x88 gathers the even pixels and 0xdd the odd
    // pixels of each register pair; pavgb then averages the neighbours.
| movdqa xmm4, xmm0 |
| shufps xmm0, xmm1, 0x88 |
| shufps xmm4, xmm1, 0xdd |
| pavgb xmm0, xmm4 |
| movdqa xmm4, xmm2 |
| shufps xmm2, xmm3, 0x88 |
| shufps xmm4, xmm3, 0xdd |
| pavgb xmm2, xmm4 |
| |
| // step 2 - convert to U and V |
| // from here down is very similar to Y code except |
| // instead of 16 different pixels, it's 8 pixels of U and 8 of V |
| movdqa xmm1, xmm0 |
| movdqa xmm3, xmm2 |
| pmaddubsw xmm0, xmm7 // U |
| pmaddubsw xmm2, xmm7 |
| pmaddubsw xmm1, xmm6 // V |
| pmaddubsw xmm3, xmm6 |
| phaddw xmm0, xmm2 |
| phaddw xmm1, xmm3 |
| psraw xmm0, 8 |
| psraw xmm1, 8 |
| packsswb xmm0, xmm1 |
| paddb xmm0, xmm5 // -> unsigned |
| |
| // step 3 - store 8 U and 8 V values |
| movlps qword ptr [edx], xmm0 // U |
| movhps qword ptr [edx + edi], xmm0 // V |
| lea edx, [edx + 8] |
| sub ecx, 16 |
| jg convertloop |
| |
| pop edi |
| pop esi |
| ret |
| } |
| } |
| #endif // HAS_ARGBTOYROW_SSSE3 |
| |
| // Read 16 UV from 444 |
// Inline-asm fragment for use inside a row function's loop.
//   In:  esi = U pointer; edi = offset from the U pointer to the matching V
//        bytes (the callers in this file compute it with "sub edi, esi").
//   Out: ymm0 = 16 interleaved U,V byte pairs; esi advanced by 16.
//   Clobbers ymm1.
// vpunpcklbw interleaves within each 128-bit lane, so vpermq 0xd8 first
// spreads the 16 loaded bytes across both lanes.
| #define READYUV444_AVX2 __asm { \ |
| __asm vmovdqu xmm0, [esi] /* U */ /* NOLINT */ \ |
| __asm vmovdqu xmm1, [esi + edi] /* V */ /* NOLINT */ \ |
| __asm lea esi, [esi + 16] \ |
| __asm vpermq ymm0, ymm0, 0xd8 \ |
| __asm vpermq ymm1, ymm1, 0xd8 \ |
| __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ |
| } |
| |
| // Read 8 UV from 422, upsample to 16 UV. |
// Inline-asm fragment for use inside a row function's loop.
//   In:  esi = U pointer; edi = offset from the U pointer to the matching V
//        bytes (the callers in this file compute it with "sub edi, esi").
//   Out: ymm0 = 16 U,V byte pairs, each loaded pair appearing twice in a row;
//        esi advanced by 8.
//   Clobbers ymm1.
// vpermq 0xd8 spreads the interleaved pairs across both 128-bit lanes before
// the per-lane vpunpcklwd duplicates each pair.
| #define READYUV422_AVX2 __asm { \ |
| __asm vmovq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \ |
| __asm vmovq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \ |
| __asm lea esi, [esi + 8] \ |
| __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ |
| __asm vpermq ymm0, ymm0, 0xd8 \ |
| __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ |
| } |
| |
| // Read 4 UV from 411, upsample to 16 UV. |
// Inline-asm fragment for use inside a row function's loop.
//   In:  esi = U pointer; edi = offset from the U pointer to the matching V
//        bytes (the callers in this file compute it with "sub edi, esi").
//   Out: ymm0 = 16 U,V byte pairs, each loaded pair appearing four times in
//        a row; esi advanced by 4.
//   Clobbers ymm1.
// The pairs are doubled once with vpunpcklwd, spread across both 128-bit
// lanes with vpermq 0xd8, then doubled again with vpunpckldq.
| #define READYUV411_AVX2 __asm { \ |
| __asm vmovd xmm0, dword ptr [esi] /* U */ /* NOLINT */ \ |
| __asm vmovd xmm1, dword ptr [esi + edi] /* V */ /* NOLINT */ \ |
| __asm lea esi, [esi + 4] \ |
| __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ |
| __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ |
| __asm vpermq ymm0, ymm0, 0xd8 \ |
| __asm vpunpckldq ymm0, ymm0, ymm0 /* UVUVUVUV (upsample) */ \ |
| } |
| |
| // Read 8 UV from NV12, upsample to 16 UV. |
// Inline-asm fragment for use inside a row function's loop.
//   In:  esi = pointer to byte pairs that are already interleaved in memory.
//   Out: ymm0 = 16 byte pairs, each loaded pair appearing twice in a row;
//        esi advanced by 16.
// vpermq 0xd8 spreads the 16 loaded bytes across both 128-bit lanes before
// the per-lane vpunpcklwd duplicates each pair.
| #define READNV12_AVX2 __asm { \ |
| __asm vmovdqu xmm0, [esi] /* UV */ \ |
| __asm lea esi, [esi + 16] \ |
| __asm vpermq ymm0, ymm0, 0xd8 \ |
| __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ |
| } |
| |
| // Convert 16 pixels: 16 UV and 16 Y. |
// Inline-asm fragment for use inside a row function's loop. The macro
// argument names a constants object whose kUVToR/G/B, kUVBiasR/G/B and
// kYToRgb members are used as memory operands.
//   In:  ymm0 = 16 U,V byte pairs (as produced by the READ*_AVX2 macros);
//        eax = Y pointer.
//   Out: ymm0 = B, ymm1 = G, ymm2 = R, as bytes saturated to 0..255 by
//        vpackuswb (each register is packed with itself); eax advanced by 16.
//   Clobbers ymm3.
// The UV products are subtracted from the bias terms (bias - product), the
// scaled Y term is added with signed saturation, and the sums are shifted
// right by 6 before packing.
| #define YUVTORGB_AVX2(YuvConstants) __asm { \ |
| /* Step 1: Find 8 UV contributions to 16 R,G,B values */ \ |
| __asm vpmaddubsw ymm2, ymm0, YuvConstants.kUVToR /* scale R UV */ \ |
| __asm vpmaddubsw ymm1, ymm0, YuvConstants.kUVToG /* scale G UV */ \ |
| __asm vpmaddubsw ymm0, ymm0, YuvConstants.kUVToB /* scale B UV */ \ |
| __asm vmovdqu ymm3, YuvConstants.kUVBiasR \ |
| __asm vpsubw ymm2, ymm3, ymm2 \ |
| __asm vmovdqu ymm3, YuvConstants.kUVBiasG \ |
| __asm vpsubw ymm1, ymm3, ymm1 \ |
| __asm vmovdqu ymm3, YuvConstants.kUVBiasB \ |
| __asm vpsubw ymm0, ymm3, ymm0 \ |
| /* Step 2: Find Y contribution to 16 R,G,B values */ \ |
| __asm vmovdqu xmm3, [eax] /* NOLINT */ \ |
| __asm lea eax, [eax + 16] \ |
| __asm vpermq ymm3, ymm3, 0xd8 \ |
| __asm vpunpcklbw ymm3, ymm3, ymm3 \ |
| __asm vpmulhuw ymm3, ymm3, YuvConstants.kYToRgb \ |
| __asm vpaddsw ymm0, ymm0, ymm3 /* B += Y */ \ |
| __asm vpaddsw ymm1, ymm1, ymm3 /* G += Y */ \ |
| __asm vpaddsw ymm2, ymm2, ymm3 /* R += Y */ \ |
| __asm vpsraw ymm0, ymm0, 6 \ |
| __asm vpsraw ymm1, ymm1, 6 \ |
| __asm vpsraw ymm2, ymm2, 6 \ |
| __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \ |
| __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \ |
| __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \ |
| } |
| |
| // Store 16 ARGB values. |
// Inline-asm fragment for use inside a row function's loop.
//   In:  ymm0 = B, ymm1 = G, ymm2 = R (bytes, as left by YUVTORGB_AVX2);
//        ymm5 = alpha bytes (the callers in this file fill it with all ones);
//        edx = destination pointer.
//   Out: 64 bytes written with unaligned stores, bytes in memory ordered
//        B, G, R, A per pixel; edx advanced by 64.
//   Clobbers ymm0, ymm1, ymm2.
// Each vpermq 0xd8 compensates for vpunpcklbw interleaving per 128-bit lane.
| #define STOREARGB_AVX2 __asm { \ |
| /* Step 3: Weave into ARGB */ \ |
| __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \ |
| __asm vpermq ymm0, ymm0, 0xd8 \ |
| __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \ |
| __asm vpermq ymm2, ymm2, 0xd8 \ |
| __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \ |
| __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \ |
| __asm vmovdqu 0[edx], ymm1 \ |
| __asm vmovdqu 32[edx], ymm0 \ |
| __asm lea edx, [edx + 64] \ |
| } |
| |
| #ifdef HAS_I422TOARGBROW_AVX2 |
| // 16 pixels |
| // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
// Converts a row of planar Y, U and V samples to ARGB using kYuvConstants.
//   y_buf:        luma, one byte per pixel.
//   u_buf, v_buf: chroma, one byte each per two pixels.
//   dst_argb:     destination, 4 bytes per pixel; alpha is written as 0xff.
//   width:        pixel count. Each pass converts 16 pixels and the count is
//                 only tested after the pass, so whole groups of 16 are
//                 always read and written.
// esi and edi are pushed on entry and popped before ret; vzeroupper is issued
// before returning.
| __declspec(naked) |
| void I422ToARGBRow_AVX2(const uint8* y_buf, |
| const uint8* u_buf, |
| const uint8* v_buf, |
| uint8* dst_argb, |
| int width) { |
| __asm { |
| push esi |
| push edi |
| mov eax, [esp + 8 + 4] // Y |
| mov esi, [esp + 8 + 8] // U |
| mov edi, [esp + 8 + 12] // V |
| mov edx, [esp + 8 + 16] // argb |
| mov ecx, [esp + 8 + 20] // width |
    // edi becomes v_buf - u_buf so the read macro can reach V via esi + edi.
| sub edi, esi |
| vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
| |
| convertloop: |
| READYUV422_AVX2 |
| YUVTORGB_AVX2(kYuvConstants) |
| STOREARGB_AVX2 |
| |
| sub ecx, 16 |
| jg convertloop |
| |
| pop edi |
| pop esi |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_I422TOARGBROW_AVX2 |
| |
| #ifdef HAS_J422TOARGBROW_AVX2 |
| // 16 pixels |
| // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
// Same as I422ToARGBRow_AVX2 except that the conversion uses kYuvJConstants
// (defined outside this block; presumably the JPEG full-range coefficients
// -- confirm at its definition).
//   y_buf:        luma, one byte per pixel.
//   u_buf, v_buf: chroma, one byte each per two pixels.
//   dst_argb:     destination, 4 bytes per pixel; alpha is written as 0xff.
//   width:        pixel count. Each pass converts 16 pixels and the count is
//                 only tested after the pass, so whole groups of 16 are
//                 always read and written.
// esi and edi are pushed on entry and popped before ret; vzeroupper is issued
// before returning.
| __declspec(naked) |
| void J422ToARGBRow_AVX2(const uint8* y_buf, |
| const uint8* u_buf, |
| const uint8* v_buf, |
| uint8* dst_argb, |
| int width) { |
| __asm { |
| push esi |
| push edi |
| mov eax, [esp + 8 + 4] // Y |
| mov esi, [esp + 8 + 8] // U |
| mov edi, [esp + 8 + 12] // V |
| mov edx, [esp + 8 + 16] // argb |
| mov ecx, [esp + 8 + 20] // width |
    // edi becomes v_buf - u_buf so the read macro can reach V via esi + edi.
| sub edi, esi |
| vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
| |
| convertloop: |
| READYUV422_AVX2 |
| YUVTORGB_AVX2(kYuvJConstants) |
| STOREARGB_AVX2 |
| |
| sub ecx, 16 |
| jg convertloop |
| |
| pop edi |
| pop esi |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_J422TOARGBROW_AVX2 |
| |
| #ifdef HAS_I444TOARGBROW_AVX2 |
| // 16 pixels |
| // 16 UV values with 16 Y producing 16 ARGB (64 bytes). |
// Converts a row of planar Y, U and V samples with full-resolution chroma to
// ARGB using kYuvConstants.
//   y_buf:        luma, one byte per pixel.
//   u_buf, v_buf: chroma, one byte each per pixel.
//   dst_argb:     destination, 4 bytes per pixel; alpha is written as 0xff.
//   width:        pixel count. Each pass converts 16 pixels and the count is
//                 only tested after the pass, so whole groups of 16 are
//                 always read and written.
// esi and edi are pushed on entry and popped before ret; vzeroupper is issued
// before returning.
| __declspec(naked) |
| void I444ToARGBRow_AVX2(const uint8* y_buf, |
| const uint8* u_buf, |
| const uint8* v_buf, |
| uint8* dst_argb, |
| int width) { |
| __asm { |
| push esi |
| push edi |
| mov eax, [esp + 8 + 4] // Y |
| mov esi, [esp + 8 + 8] // U |
| mov edi, [esp + 8 + 12] // V |
| mov edx, [esp + 8 + 16] // argb |
| mov ecx, [esp + 8 + 20] // width |
    // edi becomes v_buf - u_buf so the read macro can reach V via esi + edi.
| sub edi, esi |
| vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
| |
| convertloop: |
| READYUV444_AVX2 |
| YUVTORGB_AVX2(kYuvConstants) |
| STOREARGB_AVX2 |
| |
| sub ecx, 16 |
| jg convertloop |
| |
| pop edi |
| pop esi |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_I444TOARGBROW_AVX2 |
| |
| #ifdef HAS_I411TOARGBROW_AVX2 |
| // 16 pixels |
| // 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
// Converts a row of planar Y, U and V samples with quarter-width chroma to
// ARGB using kYuvConstants.
//   y_buf:        luma, one byte per pixel.
//   u_buf, v_buf: chroma, one byte each per four pixels.
//   dst_argb:     destination, 4 bytes per pixel; alpha is written as 0xff.
//   width:        pixel count. Each pass converts 16 pixels and the count is
//                 only tested after the pass, so whole groups of 16 are
//                 always read and written.
// esi and edi are pushed on entry and popped before ret; vzeroupper is issued
// before returning.
| __declspec(naked) |
| void I411ToARGBRow_AVX2(const uint8* y_buf, |
| const uint8* u_buf, |
| const uint8* v_buf, |
| uint8* dst_argb, |
| int width) { |
| __asm { |
| push esi |
| push edi |
| mov eax, [esp + 8 + 4] // Y |
| mov esi, [esp + 8 + 8] // U |
| mov edi, [esp + 8 + 12] // V |
| mov edx, [esp + 8 + 16] // argb |
| mov ecx, [esp + 8 + 20] // width |
    // edi becomes v_buf - u_buf so the read macro can reach V via esi + edi.
| sub edi, esi |
| vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
| |
| convertloop: |
| READYUV411_AVX2 |
| YUVTORGB_AVX2(kYuvConstants) |
| STOREARGB_AVX2 |
| |
| sub ecx, 16 |
| jg convertloop |
| |
| pop edi |
| pop esi |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_I411TOARGBROW_AVX2 |
| |
| #ifdef HAS_NV12TOARGBROW_AVX2 |
| // 16 pixels. |
| // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
// Converts a row of Y samples plus a row of interleaved chroma pairs to ARGB
// using kYuvConstants.
//   y_buf:    luma, one byte per pixel.
//   uv_buf:   interleaved chroma, one 2-byte pair per two pixels.
//   dst_argb: destination, 4 bytes per pixel; alpha is written as 0xff.
//   width:    pixel count. Each pass converts 16 pixels and the count is only
//             tested after the pass, so whole groups of 16 are always read
//             and written.
// esi is pushed on entry and popped before ret; vzeroupper is issued before
// returning.
| __declspec(naked) |
| void NV12ToARGBRow_AVX2(const uint8* y_buf, |
| const uint8* uv_buf, |
| uint8* dst_argb, |
| int width) { |
| __asm { |
| push esi |
| mov eax, [esp + 4 + 4] // Y |
| mov esi, [esp + 4 + 8] // UV |
| mov edx, [esp + 4 + 12] // argb |
| mov ecx, [esp + 4 + 16] // width |
| vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
| |
| convertloop: |
| READNV12_AVX2 |
| YUVTORGB_AVX2(kYuvConstants) |
| STOREARGB_AVX2 |
| |
| sub ecx, 16 |
| jg convertloop |
| |
| pop esi |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_NV12TOARGBROW_AVX2 |
| |
| #ifdef HAS_NV21TOARGBROW_AVX2 |
| // 16 pixels. |
| // 8 VU values upsampled to 16 VU, mixed with 16 Y producing 16 ARGB (64 bytes). |
// Same loop as NV12ToARGBRow_AVX2, including the READNV12_AVX2 loader; the
// swapped chroma byte order is handled by using kYvuConstants instead of
// kYuvConstants (kYvuConstants is defined outside this block; presumably its
// U and V coefficients are exchanged -- confirm at its definition).
//   y_buf:    luma, one byte per pixel.
//   uv_buf:   interleaved chroma, one 2-byte pair per two pixels.
//   dst_argb: destination, 4 bytes per pixel; alpha is written as 0xff.
//   width:    pixel count. Each pass converts 16 pixels and the count is only
//             tested after the pass, so whole groups of 16 are always read
//             and written.
// esi is pushed on entry and popped before ret; vzeroupper is issued before
// returning.
| __declspec(naked) |
| void NV21ToARGBRow_AVX2(const uint8* y_buf, |
| const uint8* uv_buf, |
| uint8* dst_argb, |
| int width) { |
| __asm { |
| push esi |
| mov eax, [esp + 4 + 4] // Y |
| mov esi, [esp + 4 + 8] // UV |
| mov edx, [esp + 4 + 12] // argb |
| mov ecx, [esp + 4 + 16] // width |
| vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
| |
| convertloop: |
| READNV12_AVX2 |
| YUVTORGB_AVX2(kYvuConstants) |
| STOREARGB_AVX2 |
| |
| sub ecx, 16 |
| jg convertloop |
| |
| pop esi |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_NV21TOARGBROW_AVX2 |
| |
| #ifdef HAS_I422TOBGRAROW_AVX2 |
| // 16 pixels |
| // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). |
| // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. |
// Converts a row of planar Y, U and V samples to 32-bit pixels whose bytes
// are stored in memory in the order A, R, G, B, using kYuvConstants.
//   y_buf:        luma, one byte per pixel.
//   u_buf, v_buf: chroma, one byte each per two pixels.
//   dst_argb:     destination, 4 bytes per pixel; alpha is written as 0xff.
//   width:        pixel count. Each pass converts 16 pixels and the count is
//                 only tested after the pass, so whole groups of 16 are
//                 always read and written.
// esi and edi are pushed on entry and popped before ret; vzeroupper is issued
// before returning.
| __declspec(naked) |
| void I422ToBGRARow_AVX2(const uint8* y_buf, |
| const uint8* u_buf, |
| const uint8* v_buf, |
| uint8* dst_argb, |
| int width) { |
| __asm { |
| push esi |
| push edi |
| mov eax, [esp + 8 + 4] // Y |
| mov esi, [esp + 8 + 8] // U |
| mov edi, [esp + 8 + 12] // V |
| mov edx, [esp + 8 + 16] // argb |
| mov ecx, [esp + 8 + 20] // width |
    // edi becomes v_buf - u_buf so the read macro can reach V via esi + edi.
| sub edi, esi |
| vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
| |
| convertloop: |
| READYUV422_AVX2 |
| YUVTORGB_AVX2(kYuvConstants) |
| |
| // Step 3: Weave into BGRA |
    // After YUVTORGB_AVX2: ymm0 = B, ymm1 = G, ymm2 = R. Each vpermq 0xd8
    // compensates for vpunpcklbw interleaving per 128-bit lane.
| vpunpcklbw ymm1, ymm1, ymm0 // GB |
| vpermq ymm1, ymm1, 0xd8 |
| vpunpcklbw ymm2, ymm5, ymm2 // AR |
| vpermq ymm2, ymm2, 0xd8 |
| vpunpcklwd ymm0, ymm2, ymm1 // ARGB first 8 pixels |
| vpunpckhwd ymm2, ymm2, ymm1 // ARGB next 8 pixels |
| vmovdqu [edx], ymm0 |
| vmovdqu [edx + 32], ymm2 |
| lea edx, [edx + 64] |
| sub ecx, 16 |
| jg convertloop |
| |
| pop edi |
| pop esi |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_I422TOBGRAROW_AVX2 |
| |
| #ifdef HAS_I422TORGBAROW_AVX2 |
| // I422 row (separate Y, U, V) to RGBA with AVX2; 16 pixels per loop pass. |
| // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). |
| // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. |
| __declspec(naked) /* naked: no compiler prologue/epilogue, so the asm reads its arguments from the stack itself */ |
| void I422ToRGBARow_AVX2(const uint8* y_buf, |
| const uint8* u_buf, |
| const uint8* v_buf, |
| uint8* dst_argb, |
| int width) { |
| __asm { |
| push esi |
| push edi |
| mov eax, [esp + 8 + 4] // Y: eax = y_buf (the + 8 skips the two pushed registers) |
| mov esi, [esp + 8 + 8] // U: esi = u_buf |
| mov edi, [esp + 8 + 12] // V: edi = v_buf |
| mov edx, [esp + 8 + 16] // argb: edx = dst_argb (receives RGBA output here) |
| mov ecx, [esp + 8 + 20] // width: ecx = pixels remaining |
| sub edi, esi /* edi = v_buf - u_buf; presumably READYUV422_AVX2 reads V at [esi + edi] like the SSSE3 reader - confirm */ |
| vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha (every byte of ymm5 becomes 0xff) |
| |
| convertloop: |
| READYUV422_AVX2 /* macro defined outside this section */ |
| YUVTORGB_AVX2(kYuvConstants) /* macro defined outside this section; the labels below imply B, G, R end up in ymm0, ymm1, ymm2 */ |
| |
| // Step 3: Weave into RGBA (bytes are written to memory in A, B, G, R order) |
| vpunpcklbw ymm1, ymm1, ymm2 // GR (G and R bytes interleaved within each 128-bit lane) |
| vpermq ymm1, ymm1, 0xd8 /* 0xd8 selects qwords 0,2,1,3, i.e. swaps the two middle qwords */ |
| vpunpcklbw ymm2, ymm5, ymm0 // AB (alpha and B bytes interleaved) |
| vpermq ymm2, ymm2, 0xd8 /* same qword swap as for GR */ |
| vpunpcklwd ymm0, ymm2, ymm1 // ABGR first 8 pixels (memory byte order A, B, G, R) |
| vpunpckhwd ymm1, ymm2, ymm1 // ABGR next 8 pixels (memory byte order A, B, G, R) |
| vmovdqu [edx], ymm0 |
| vmovdqu [edx + 32], ymm1 |
| lea edx, [edx + 64] /* 16 pixels * 4 bytes written */ |
| sub ecx, 16 |
| jg convertloop /* loop while pixels remain; the body always runs at least once */ |
| |
| pop edi |
| pop esi |
| vzeroupper /* zero the upper halves of all ymm registers before returning */ |
| ret |
| } |
| } |
| #endif // HAS_I422TORGBAROW_AVX2 |
| |
| #ifdef HAS_I422TOABGRROW_AVX2 |
| // I422 row (separate Y, U, V) to ABGR with AVX2; 16 pixels per loop pass. |
| // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). |
| // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. |
| __declspec(naked) /* naked: no compiler prologue/epilogue, so the asm reads its arguments from the stack itself */ |
| void I422ToABGRRow_AVX2(const uint8* y_buf, |
| const uint8* u_buf, |
| const uint8* v_buf, |
| uint8* dst_argb, |
| int width) { |
| __asm { |
| push esi |
| push edi |
| mov eax, [esp + 8 + 4] // Y: eax = y_buf (the + 8 skips the two pushed registers) |
| mov esi, [esp + 8 + 8] // U: esi = u_buf |
| mov edi, [esp + 8 + 12] // V: edi = v_buf |
| mov edx, [esp + 8 + 16] // argb: edx = dst_argb (receives ABGR output here) |
| mov ecx, [esp + 8 + 20] // width: ecx = pixels remaining |
| sub edi, esi /* edi = v_buf - u_buf; presumably READYUV422_AVX2 reads V at [esi + edi] like the SSSE3 reader - confirm */ |
| vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha (every byte of ymm5 becomes 0xff) |
| |
| convertloop: |
| READYUV422_AVX2 /* macro defined outside this section */ |
| YUVTORGB_AVX2(kYuvConstants) /* macro defined outside this section; the labels below imply B, G, R end up in ymm0, ymm1, ymm2 */ |
| |
| // Step 3: Weave into ABGR (bytes are written to memory in R, G, B, A order) |
| vpunpcklbw ymm1, ymm2, ymm1 // RG (R and G bytes interleaved within each 128-bit lane) |
| vpermq ymm1, ymm1, 0xd8 /* 0xd8 selects qwords 0,2,1,3, i.e. swaps the two middle qwords */ |
| vpunpcklbw ymm2, ymm0, ymm5 // BA (B and alpha bytes interleaved) |
| vpermq ymm2, ymm2, 0xd8 /* same qword swap as for RG */ |
| vpunpcklwd ymm0, ymm1, ymm2 // RGBA first 8 pixels (memory byte order R, G, B, A) |
| vpunpckhwd ymm1, ymm1, ymm2 // RGBA next 8 pixels (memory byte order R, G, B, A) |
| vmovdqu [edx], ymm0 |
| vmovdqu [edx + 32], ymm1 |
| lea edx, [edx + 64] /* 16 pixels * 4 bytes written */ |
| sub ecx, 16 |
| jg convertloop /* loop while pixels remain; the body always runs at least once */ |
| |
| pop edi |
| pop esi |
| vzeroupper /* zero the upper halves of all ymm registers before returning */ |
| ret |
| } |
| } |
| #endif // HAS_I422TOABGRROW_AVX2 |
| |
| #if defined(HAS_I422TOARGBROW_SSSE3) |
| // TODO(fbarchard): Read that does half size on Y and treats 420 as 444. |
| |
| // Read 8 UV from 444: 8 U bytes at [esi] and 8 V bytes at [esi + edi] are interleaved into xmm0; esi advances by 8. Clobbers xmm1. |
| #define READYUV444 __asm { \ |
| __asm movq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \ |
| __asm movq xmm1, qword ptr [esi + edi] /* V (edi holds v_buf - u_buf; the row functions set it with sub edi, esi) */ /* NOLINT */ \ |
| __asm lea esi, [esi + 8] /* step the U pointer; V tracks it through the edi offset */ \ |
| __asm punpcklbw xmm0, xmm1 /* UV */ \ |
| } |
| |
| // Read 4 UV from 422, upsample to 8 UV. Leaves 8 UV byte pairs in xmm0, advances esi by 4, clobbers xmm1. |
| #define READYUV422 __asm { \ |
| __asm movd xmm0, [esi] /* U (4 bytes) */ \ |
| __asm movd xmm1, [esi + edi] /* V (4 bytes; edi holds v_buf - u_buf) */ \ |
| __asm lea esi, [esi + 4] /* step the U pointer; V tracks it through the edi offset */ \ |
| __asm punpcklbw xmm0, xmm1 /* UV */ \ |
| __asm punpcklwd xmm0, xmm0 /* UVUV (upsample): each 16-bit UV pair is duplicated */ \ |
| } |
| |
| // Read 2 UV from 411, upsample to 8 UV. Leaves 8 UV byte pairs in xmm0, advances esi by 2, clobbers ebx and xmm1. |
| #define READYUV411 __asm { \ |
| __asm movzx ebx, word ptr [esi] /* U (2 bytes, zero-extended) */ /* NOLINT */ \ |
| __asm movd xmm0, ebx \ |
| __asm movzx ebx, word ptr [esi + edi] /* V (2 bytes; edi holds v_buf - u_buf) */ /* NOLINT */ \ |
| __asm movd xmm1, ebx \ |
| __asm lea esi, [esi + 2] /* step the U pointer; V tracks it through the edi offset */ \ |
| __asm punpcklbw xmm0, xmm1 /* UV */ \ |
| __asm punpcklwd xmm0, xmm0 /* UVUV (upsample): each UV pair now appears twice */ \ |
| __asm punpckldq xmm0, xmm0 /* UVUVUVUV (upsample): each UV pair now appears four times */ \ |
| } |
| |
| // Read 4 UV from NV12, upsample to 8 UV. Chroma is already interleaved at [esi]; leaves 8 byte pairs in xmm0 and advances esi by 8. |
| #define READNV12 __asm { \ |
| __asm movq xmm0, qword ptr [esi] /* UV (4 interleaved pairs) */ /* NOLINT */ \ |
| __asm lea esi, [esi + 8] \ |
| __asm punpcklwd xmm0, xmm0 /* UVUV (upsample): each 16-bit pair is duplicated */ \ |
| } |
| |
| // Convert 8 pixels: 8 UV and 8 Y. In: xmm0 = 8 UV byte pairs, eax = Y pointer. Out: xmm0 = B, xmm1 = G, xmm2 = R bytes; eax += 8. Clobbers xmm3. |
| #define YUVTORGB(YuvConstants) __asm { \ |
| /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \ |
| __asm movdqa xmm1, xmm0 /* three working copies of the UV pairs, one per colour channel */ \ |
| __asm movdqa xmm2, xmm0 \ |
| __asm movdqa xmm3, xmm0 \ |
| __asm movdqa xmm0, YuvConstants.kUVBiasB /* unbias back to signed */ \ |
| __asm pmaddubsw xmm1, YuvConstants.kUVToB /* scale B UV: unsigned UV bytes times signed coefficients, adjacent products summed to 16 bits */ \ |
| __asm psubw xmm0, xmm1 /* B = bias - scaled UV */ \ |
| __asm movdqa xmm1, YuvConstants.kUVBiasG \ |
| __asm pmaddubsw xmm2, YuvConstants.kUVToG /* scale G UV */ \ |
| __asm psubw xmm1, xmm2 /* G = bias - scaled UV */ \ |
| __asm movdqa xmm2, YuvConstants.kUVBiasR \ |
| __asm pmaddubsw xmm3, YuvConstants.kUVToR /* scale R UV */ \ |
| __asm psubw xmm2, xmm3 /* R = bias - scaled UV */ \ |
| /* Step 2: Find Y contribution to 8 R,G,B values */ \ |
| __asm movq xmm3, qword ptr [eax] /* 8 Y bytes */ /* NOLINT */ \ |
| __asm lea eax, [eax + 8] \ |
| __asm punpcklbw xmm3, xmm3 /* duplicate each Y byte into both halves of a word (Y * 257) */ \ |
| __asm pmulhuw xmm3, YuvConstants.kYToRgb /* keep the high 16 bits of the unsigned product */ \ |
| __asm paddsw xmm0, xmm3 /* B += Y */ \ |
| __asm paddsw xmm1, xmm3 /* G += Y */ \ |
| __asm paddsw xmm2, xmm3 /* R += Y */ \ |
| __asm psraw xmm0, 6 /* drop the 6 fraction bits (coefficients are scaled by 64) */ \ |
| __asm psraw xmm1, 6 \ |
| __asm psraw xmm2, 6 \ |
| __asm packuswb xmm0, xmm0 /* B: saturate words to unsigned bytes */ \ |
| __asm packuswb xmm1, xmm1 /* G */ \ |
| __asm packuswb xmm2, xmm2 /* R */ \ |
| } |
| |
| // Store 8 ARGB values. In: xmm0 = B, xmm1 = G, xmm2 = R, xmm5 = alpha bytes (set by the caller). Writes 32 bytes at edx, then edx += 32. |
| #define STOREARGB __asm { \ |
| /* Step 3: Weave into ARGB (bytes are written to memory in B, G, R, A order) */ \ |
| __asm punpcklbw xmm0, xmm1 /* BG */ \ |
| __asm punpcklbw xmm2, xmm5 /* RA */ \ |
| __asm movdqa xmm1, xmm0 /* keep a copy of BG for the upper 4 pixels */ \ |
| __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \ |
| __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \ |
| __asm movdqu 0[edx], xmm0 \ |
| __asm movdqu 16[edx], xmm1 \ |
| __asm lea edx, [edx + 32] \ |
| } |
| |
| // Store 8 BGRA values. In: xmm0 = B, xmm1 = G, xmm2 = R. Alpha is rebuilt in xmm5 on every use. Writes 32 bytes at edx, then edx += 32. |
| #define STOREBGRA __asm { \ |
| /* Step 3: Weave into BGRA (bytes are written to memory in A, R, G, B order) */ \ |
| __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha; redone each pass because xmm5 is overwritten below */ \ |
| __asm punpcklbw xmm1, xmm0 /* GB */ \ |
| __asm punpcklbw xmm5, xmm2 /* AR */ \ |
| __asm movdqa xmm0, xmm5 /* keep a copy of AR for the upper 4 pixels */ \ |
| __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels (format name; memory byte order is A, R, G, B) */ \ |
| __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels (format name; memory byte order is A, R, G, B) */ \ |
| __asm movdqu 0[edx], xmm5 \ |
| __asm movdqu 16[edx], xmm0 \ |
| __asm lea edx, [edx + 32] \ |
| } |
| |
| // Store 8 ABGR values. In: xmm0 = B, xmm1 = G, xmm2 = R, xmm5 = alpha bytes (set by the caller). Writes 32 bytes at edx, then edx += 32. |
| #define STOREABGR __asm { \ |
| /* Step 3: Weave into ABGR (bytes are written to memory in R, G, B, A order) */ \ |
| __asm punpcklbw xmm2, xmm1 /* RG */ \ |
| __asm punpcklbw xmm0, xmm5 /* BA */ \ |
| __asm movdqa xmm1, xmm2 /* keep a copy of RG for the upper 4 pixels */ \ |
| __asm punpcklwd xmm2, xmm0 /* RGBA first 4 pixels (memory byte order) */ \ |
| __asm punpckhwd xmm1, xmm0 /* RGBA next 4 pixels (memory byte order) */ \ |
| __asm movdqu 0[edx], xmm2 \ |
| __asm movdqu 16[edx], xmm1 \ |
| __asm lea edx, [edx + 32] \ |
| } |
| |
| // Store 8 RGBA values. In: xmm0 = B, xmm1 = G, xmm2 = R. Alpha is rebuilt in xmm5 on every use. Writes 32 bytes at edx, then edx += 32. |
| #define STORERGBA __asm { \ |
| /* Step 3: Weave into RGBA (bytes are written to memory in A, B, G, R order) */ \ |
| __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha; redone each pass because xmm5 is overwritten below */ \ |
| __asm punpcklbw xmm1, xmm2 /* GR */ \ |
| __asm punpcklbw xmm5, xmm0 /* AB */ \ |
| __asm movdqa xmm0, xmm5 /* keep a copy of AB for the upper 4 pixels */ \ |
| __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels (format name; memory byte order is A, B, G, R) */ \ |
| __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels (format name; memory byte order is A, B, G, R) */ \ |
| __asm movdqu 0[edx], xmm5 \ |
| __asm movdqu 16[edx], xmm0 \ |
| __asm lea edx, [edx + 32] \ |
| } |
| |
| // Store 8 RGB24 values. In: xmm0 = B, xmm1 = G, xmm2 = R; xmm5 and xmm6 hold pshufb masks loaded by the caller. Writes 24 bytes at edx, then edx += 24. |
| #define STORERGB24 __asm { \ |
| /* Step 3: Weave into RRGB (4 bytes per pixel, with R repeated in the fourth byte as filler) */ \ |
| __asm punpcklbw xmm0, xmm1 /* BG */ \ |
| __asm punpcklbw xmm2, xmm2 /* RR */ \ |
| __asm movdqa xmm1, xmm0 /* keep a copy of BG for the upper 4 pixels */ \ |
| __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ |
| __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ |
| /* Step 4: RRGB -> RGB24 (mask contents are defined outside this section) */ \ |
| __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ |
| __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ |
| __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ |
| __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \ |
| __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \ |
| __asm lea edx, [edx + 24] \ |
| } |
| |
| // Store 8 RAW values. Same instruction sequence as STORERGB24; the output differs only through the pshufb masks the caller loads into xmm5 and xmm6. |
| #define STORERAW __asm { \ |
| /* Step 3: Weave into RRGB (4 bytes per pixel, with R repeated in the fourth byte as filler) */ \ |
| __asm punpcklbw xmm0, xmm1 /* BG */ \ |
| __asm punpcklbw xmm2, xmm2 /* RR */ \ |
| __asm movdqa xmm1, xmm0 /* keep a copy of BG for the upper 4 pixels */ \ |
| __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ |
| __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ |
| /* Step 4: RRGB -> RAW (mask contents are defined outside this section) */ \ |
| __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ |
| __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ |
| __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ |
| __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \ |
| __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \ |
| __asm lea edx, [edx + 24] /* 8 pixels * 3 bytes written */ \ |
| } |
| |
| // Store 8 RGB565 values. In: xmm0 = B, xmm1 = G, xmm2 = R; xmm5, xmm6, xmm7 = B, G, R field masks built by the caller. Writes 16 bytes at edx, then edx += 16. |
| #define STORERGB565 __asm { \ |
| /* Step 3: Weave into RRGB (one dword per pixel: B, G, R, R bytes) */ \ |
| __asm punpcklbw xmm0, xmm1 /* BG */ \ |
| __asm punpcklbw xmm2, xmm2 /* RR */ \ |
| __asm movdqa xmm1, xmm0 /* keep a copy of BG for the upper 4 pixels */ \ |
| __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ |
| __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ |
| /* Step 4: RRGB -> RGB565 */ \ |
| __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \ |
| __asm movdqa xmm2, xmm0 /* G */ \ |
| __asm pslld xmm0, 8 /* R: move the R byte to the top of the dword */ \ |
| __asm psrld xmm3, 3 /* B: top 5 bits of B land in bits 0..4 */ \ |
| __asm psrld xmm2, 5 /* G: top 6 bits of G land in bits 5..10 */ \ |
| __asm psrad xmm0, 16 /* R: arithmetic shift puts R in bits 8..15 and sign-extends above it */ \ |
| __asm pand xmm3, xmm5 /* B */ \ |
| __asm pand xmm2, xmm6 /* G */ \ |
| __asm pand xmm0, xmm7 /* R: mask keeps bits 11 and up, so the sign extension survives */ \ |
| __asm por xmm3, xmm2 /* BG */ \ |
| __asm por xmm0, xmm3 /* BGR */ \ |
| __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \ |
| __asm movdqa xmm2, xmm1 /* G */ \ |
| __asm pslld xmm1, 8 /* R */ \ |
| __asm psrld xmm3, 3 /* B */ \ |
| __asm psrld xmm2, 5 /* G */ \ |
| __asm psrad xmm1, 16 /* R */ \ |
| __asm pand xmm3, xmm5 /* B */ \ |
| __asm pand xmm2, xmm6 /* G */ \ |
| __asm pand xmm1, xmm7 /* R */ \ |
| __asm por xmm3, xmm2 /* BG */ \ |
| __asm por xmm1, xmm3 /* BGR */ \ |
| __asm packssdw xmm0, xmm1 /* signed pack to words; each dword is already a sign-extended 16-bit value, so nothing is clamped */ \ |
| __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \ |
| __asm lea edx, [edx + 16] \ |
| } |
| |
| // I444 row (full-resolution U and V) to ARGB with SSSE3; 8 pixels per loop pass. |
| // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). |
| __declspec(naked) /* naked: no compiler prologue/epilogue, so the asm reads its arguments from the stack itself */ |
| void I444ToARGBRow_SSSE3(const uint8* y_buf, |
| const uint8* u_buf, |
| const uint8* v_buf, |
| uint8* dst_argb, |
| int width) { |
| __asm { |
| push esi |
| push edi |
| mov eax, [esp + 8 + 4] // Y: eax = y_buf (the + 8 skips the two pushed registers) |
| mov esi, [esp + 8 + 8] // U: esi = u_buf |
| mov edi, [esp + 8 + 12] // V: edi = v_buf |
| mov edx, [esp + 8 + 16] // argb: edx = dst_argb |
| mov ecx, [esp + 8 + 20] // width: ecx = pixels remaining |
| sub edi, esi /* edi = v_buf - u_buf, so [esi + edi] addresses V while only esi advances */ |
| pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha (xmm5 is read by STOREARGB) |
| |
| convertloop: |
| READYUV444 /* xmm0 = 8 UV pairs; esi += 8 */ |
| YUVTORGB(kYuvConstants) /* xmm0, xmm1, xmm2 = B, G, R; eax += 8 */ |
| STOREARGB /* writes 32 bytes; edx += 32 */ |
| |
| sub ecx, 8 |
| jg convertloop /* loop while pixels remain; the body always runs at least once and handles a full 8 pixels */ |
| |
| pop edi |
| pop esi |
| ret |
| } |
| } |
| |
| // I422 row to RGB24 with SSSE3; 8 pixels per loop pass. |
| // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes). |
| __declspec(naked) /* naked: no compiler prologue/epilogue, so the asm reads its arguments from the stack itself */ |
| void I422ToRGB24Row_SSSE3(const uint8* y_buf, |
| const uint8* u_buf, |
| const uint8* v_buf, |
| uint8* dst_rgb24, |
| int width) { |
| __asm { |
| push esi |
| push edi |
| mov eax, [esp + 8 + 4] // Y: eax = y_buf (the + 8 skips the two pushed registers) |
| mov esi, [esp + 8 + 8] // U: esi = u_buf |
| mov edi, [esp + 8 + 12] // V: edi = v_buf |
| mov edx, [esp + 8 + 16] // rgb24: edx = dst_rgb24 |
| mov ecx, [esp + 8 + 20] // width: ecx = pixels remaining |
| sub edi, esi /* edi = v_buf - u_buf, so [esi + edi] addresses V while only esi advances */ |
| movdqa xmm5, kShuffleMaskARGBToRGB24_0 /* pshufb mask used by STORERGB24 on the first 4 pixels; defined outside this section */ |
| movdqa xmm6, kShuffleMaskARGBToRGB24 /* pshufb mask used by STORERGB24 on the next 4 pixels; defined outside this section */ |
| |
| convertloop: |
| READYUV422 /* xmm0 = 8 UV pairs; esi += 4 */ |
| YUVTORGB(kYuvConstants) /* xmm0, xmm1, xmm2 = B, G, R; eax += 8 */ |
| STORERGB24 /* writes 24 bytes; edx += 24 */ |
| |
| sub ecx, 8 |
| jg convertloop /* loop while pixels remain; the body always runs at least once and handles a full 8 pixels */ |
| |
| pop edi |
| pop esi |
| ret |
| } |
| } |
| |
| // I422 row to RAW with SSSE3; 8 pixels per loop pass. |
| // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes). |
| __declspec(naked) /* naked: no compiler prologue/epilogue, so the asm reads its arguments from the stack itself */ |
| void I422ToRAWRow_SSSE3(const uint8* y_buf, |
| const uint8* u_buf, |
| const uint8* v_buf, |
| uint8* dst_raw, |
| int width) { |
| __asm { |
| push esi |
| push edi |
| mov eax, [esp + 8 + 4] // Y: eax = y_buf (the + 8 skips the two pushed registers) |
| mov esi, [esp + 8 + 8] // U: esi = u_buf |
| mov edi, [esp + 8 + 12] // V: edi = v_buf |
| mov edx, [esp + 8 + 16] // raw: edx = dst_raw |
| mov ecx, [esp + 8 + 20] // width: ecx = pixels remaining |
| sub edi, esi /* edi = v_buf - u_buf, so [esi + edi] addresses V while only esi advances */ |
| movdqa xmm5, kShuffleMaskARGBToRAW_0 /* pshufb mask used by STORERAW on the first 4 pixels; defined outside this section */ |
| movdqa xmm6, kShuffleMaskARGBToRAW /* pshufb mask used by STORERAW on the next 4 pixels; defined outside this section */ |
| |
| convertloop: |
| READYUV422 /* xmm0 = 8 UV pairs; esi += 4 */ |
| YUVTORGB(kYuvConstants) /* xmm0, xmm1, xmm2 = B, G, R; eax += 8 */ |
| STORERAW /* writes 24 bytes; edx += 24 */ |
| |
| sub ecx, 8 |
| jg convertloop /* loop while pixels remain; the body always runs at least once and handles a full 8 pixels */ |
| |
| pop edi |
| pop esi |
| ret |
| } |
| } |
| |
| // I422 row to RGB565 with SSSE3; 8 pixels per loop pass. |
| // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes). |
| __declspec(naked) /* naked: no compiler prologue/epilogue, so the asm reads its arguments from the stack itself */ |
| void I422ToRGB565Row_SSSE3(const uint8* y_buf, |
| const uint8* u_buf, |
| const uint8* v_buf, |
| uint8* rgb565_buf, |
| int width) { |
| __asm { |
| push esi |
| push edi |
| mov eax, [esp + 8 + 4] // Y: eax = y_buf (the + 8 skips the two pushed registers) |
| mov esi, [esp + 8 + 8] // U: esi = u_buf |
| mov edi, [esp + 8 + 12] // V: edi = v_buf |
| mov edx, [esp + 8 + 16] // rgb565: edx = rgb565_buf |
| mov ecx, [esp + 8 + 20] // width: ecx = pixels remaining |
| sub edi, esi /* edi = v_buf - u_buf, so [esi + edi] addresses V while only esi advances */ |
| pcmpeqb xmm5, xmm5 // generate mask 0x0000001f (blue field, bits 0..4) |
| psrld xmm5, 27 /* 0xffffffff >> 27 = 0x0000001f */ |
| pcmpeqb xmm6, xmm6 // generate mask 0x000007e0 (green field, bits 5..10) |
| psrld xmm6, 26 /* 0xffffffff >> 26 = 0x0000003f */ |
| pslld xmm6, 5 /* 0x0000003f << 5 = 0x000007e0 */ |
| pcmpeqb xmm7, xmm7 // generate mask 0xfffff800 (red field, bits 11 and up) |
| pslld xmm7, 11 /* 0xffffffff << 11 = 0xfffff800 */ |
| |
| convertloop: |
| READYUV422 /* xmm0 = 8 UV pairs; esi += 4 */ |
| YUVTORGB(kYuvConstants) /* xmm0, xmm1, xmm2 = B, G, R; eax += 8 */ |
| STORERGB565 /* uses the xmm5, xmm6, xmm7 masks; writes 16 bytes; edx += 16 */ |
| |
| sub ecx, 8 |
| jg convertloop /* loop while pixels remain; the body always runs at least once and handles a full 8 pixels */ |
| |
| pop edi |
| pop esi |
| ret |
| } |
| } |
| |
| // I422 row to ARGB with SSSE3, using the BT.601 kYuvConstants table; 8 pixels per loop pass. |
| // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). |
| __declspec(naked) /* naked: no compiler prologue/epilogue, so the asm reads its arguments from the stack itself */ |
| void I422ToARGBRow_SSSE3(const uint8* y_buf, |
| const uint8* u_buf, |
| const uint8* v_buf, |
| uint8* dst_argb, |
| int width) { |
| __asm { |
| push esi |
| push edi |
| mov eax, [esp + 8 + 4] // Y: eax = y_buf (the + 8 skips the two pushed registers) |
| mov esi, [esp + 8 + 8] // U: esi = u_buf |
| mov edi, [esp + 8 + 12] // V: edi = v_buf |
| mov edx, [esp + 8 + 16] // argb: edx = dst_argb |
| mov ecx, [esp + 8 + 20] // width: ecx = pixels remaining |
| sub edi, esi /* edi = v_buf - u_buf, so [esi + edi] addresses V while only esi advances */ |
| pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha (xmm5 is read by STOREARGB) |
| |
| convertloop: |
| READYUV422 /* xmm0 = 8 UV pairs; esi += 4 */ |
| YUVTORGB(kYuvConstants) /* xmm0, xmm1, xmm2 = B, G, R; eax += 8 */ |
| STOREARGB /* writes 32 bytes; edx += 32 */ |
| |
| sub ecx, 8 |
| jg convertloop /* loop while pixels remain; the body always runs at least once and handles a full 8 pixels */ |
| |
| pop edi |
| pop esi |
| ret |
| } |
| } |
| |
| // 8 pixels per loop pass. |
| // JPeg color space version of I422ToARGB: identical code path, but converts with the kYuvJConstants table. |
| // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). |
| __declspec(naked) /* naked: no compiler prologue/epilogue, so the asm reads its arguments from the stack itself */ |
| void J422ToARGBRow_SSSE3(const uint8* y_buf, |
| const uint8* u_buf, |
| const uint8* v_buf, |
| uint8* dst_argb, |
| int width) { |
| __asm { |
| push esi |
| push edi |
| mov eax, [esp + 8 + 4] // Y: eax = y_buf (the + 8 skips the two pushed registers) |
| mov esi, [esp + 8 + 8] // U: esi = u_buf |
| mov edi, [esp + 8 + 12] // V: edi = v_buf |
| mov edx, [esp + 8 + 16] // argb: edx = dst_argb |
| mov ecx, [esp + 8 + 20] // width: ecx = pixels remaining |
| sub edi, esi /* edi = v_buf - u_buf, so [esi + edi] addresses V while only esi advances */ |
| pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha (xmm5 is read by STOREARGB) |
| |
| convertloop: |
| READYUV422 /* xmm0 = 8 UV pairs; esi += 4 */ |
| YUVTORGB(kYuvJConstants) /* kYuvJConstants is defined outside this section; xmm0, xmm1, xmm2 = B, G, R; eax += 8 */ |
| STOREARGB /* writes 32 bytes; edx += 32 */ |
| |
| sub ecx, 8 |
| jg convertloop /* loop while pixels remain; the body always runs at least once and handles a full 8 pixels */ |
| |
| pop edi |
| pop esi |
| ret |
| } |
| } |
| |
| // I411 row to ARGB with SSSE3; 8 pixels per loop pass. |
| // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). |
| // Similar to I420 but duplicate UV once more. |
| __declspec(naked) /* naked: no compiler prologue/epilogue, so the asm reads its arguments from the stack itself */ |
| void I411ToARGBRow_SSSE3(const uint8* y_buf, |
| const uint8* u_buf, |
| const uint8* v_buf, |
| uint8* dst_argb, |
| int width) { |
| __asm { |
| push ebx /* saved because READYUV411 uses ebx as scratch */ |
| push esi |
| push edi |
| mov eax, [esp + 12 + 4] // Y: eax = y_buf (the + 12 skips the three pushed registers) |
| mov esi, [esp + 12 + 8] // U: esi = u_buf |
| mov edi, [esp + 12 + 12] // V: edi = v_buf |
| mov edx, [esp + 12 + 16] // argb: edx = dst_argb |
| mov ecx, [esp + 12 + 20] // width: ecx = pixels remaining |
| sub edi, esi /* edi = v_buf - u_buf, so [esi + edi] addresses V while only esi advances */ |
| pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha (xmm5 is read by STOREARGB) |
| |
| convertloop: |
| READYUV411 // modifies EBX; xmm0 = 8 UV pairs; esi += 2 |
| YUVTORGB(kYuvConstants) /* xmm0, xmm1, xmm2 = B, G, R; eax += 8 */ |
| STOREARGB /* writes 32 bytes; edx += 32 */ |
| |
| sub ecx, 8 |
| jg convertloop /* loop while pixels remain; the body always runs at least once and handles a full 8 pixels */ |
| |
| pop edi |
| pop esi |
| pop ebx |
| ret |
| } |
| } |
| |
| // NV12 row (Y plus interleaved UV) to ARGB with SSSE3; 8 pixels per loop pass. |
| // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). |
| __declspec(naked) /* naked: no compiler prologue/epilogue, so the asm reads its arguments from the stack itself */ |
| void NV12ToARGBRow_SSSE3(const uint8* y_buf, |
| const uint8* uv_buf, |
| uint8* dst_argb, |
| int width) { |
| __asm { |
| push esi |
| mov eax, [esp + 4 + 4] // Y: eax = y_buf (the first + 4 skips the pushed esi) |
| mov esi, [esp + 4 + 8] // UV: esi = uv_buf |
| mov edx, [esp + 4 + 12] // argb: edx = dst_argb |
| mov ecx, [esp + 4 + 16] // width: ecx = pixels remaining |
| pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha (xmm5 is read by STOREARGB) |
| |
| convertloop: |
| READNV12 /* xmm0 = 8 UV pairs; esi += 8 */ |
| YUVTORGB(kYuvConstants) /* xmm0, xmm1, xmm2 = B, G, R; eax += 8 */ |
| STOREARGB /* writes 32 bytes; edx += 32 */ |
| |
| sub ecx, 8 |
| jg convertloop /* loop while pixels remain; the body always runs at least once and handles a full 8 pixels */ |
| |
| pop esi |
| ret |
| } |
| } |
| |
| // NV21 row (Y plus interleaved VU) to ARGB with SSSE3; 8 pixels per loop pass. |
| // 4 VU values upsampled to 8 VU, mixed with 8 Y producing 8 ARGB (32 bytes). |
| __declspec(naked) /* naked: no compiler prologue/epilogue, so the asm reads its arguments from the stack itself */ |
| void NV21ToARGBRow_SSSE3(const uint8* y_buf, |
| const uint8* uv_buf, |
| uint8* dst_argb, |
| int width) { |
| __asm { |
| push esi |
| mov eax, [esp + 4 + 4] // Y: eax = y_buf (the first + 4 skips the pushed esi) |
| mov esi, [esp + 4 + 8] // UV: esi = uv_buf (byte pairs are in V, U order for NV21) |
| mov edx, [esp + 4 + 12] // argb: edx = dst_argb |
| mov ecx, [esp + 4 + 16] // width: ecx = pixels remaining |
| pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha (xmm5 is read by STOREARGB) |
| |
| convertloop: |
| READNV12 /* same reader as NV12; xmm0 = 8 chroma pairs; esi += 8 */ |
| YUVTORGB(kYvuConstants) /* kYvuConstants is defined outside this section; presumably its coefficients are swapped to match VU order - confirm */ |
| STOREARGB /* writes 32 bytes; edx += 32 */ |
| |
| sub ecx, 8 |
| jg convertloop /* loop while pixels remain; the body always runs at least once and handles a full 8 pixels */ |
| |
| pop esi |
| ret |
| } |
| } |
| |
| __declspec(naked) /* I422 row to BGRA with SSSE3: each loop pass reads 4 UV (upsampled to 8) and 8 Y and writes 8 pixels (32 bytes). */ |
| void I422ToBGRARow_SSSE3(const uint8* y_buf, |
| const uint8* u_buf, |
| const uint8* v_buf, |
| uint8* dst_bgra, |
| int width) { |
| __asm { |
| push esi |
| push edi |
| mov eax, [esp + 8 + 4] // Y: eax = y_buf (the + 8 skips the two pushed registers) |
| mov esi, [esp + 8 + 8] // U: esi = u_buf |
| mov edi, [esp + 8 + 12] // V: edi = v_buf |
| mov edx, [esp + 8 + 16] // bgra: edx = dst_bgra |
| mov ecx, [esp + 8 + 20] // width: ecx = pixels remaining |
| sub edi, esi /* edi = v_buf - u_buf, so [esi + edi] addresses V while only esi advances */ |
| |
| convertloop: |
| READYUV422 /* xmm0 = 8 UV pairs; esi += 4 */ |
| YUVTORGB(kYuvConstants) /* xmm0, xmm1, xmm2 = B, G, R; eax += 8 */ |
| STOREBGRA /* builds its own alpha in xmm5, so none is set up before the loop; writes 32 bytes; edx += 32 */ |
| |
| sub ecx, 8 |
| jg convertloop /* loop while pixels remain; the body always runs at least once and handles a full 8 pixels */ |
| |
| pop edi |
| pop esi |
| ret |
| } |
| } |
| |
| __declspec(naked) /* I422 row to ABGR with SSSE3: each loop pass reads 4 UV (upsampled to 8) and 8 Y and writes 8 pixels (32 bytes). */ |
| void I422ToABGRRow_SSSE3(const uint8* y_buf, |
| const uint8* u_buf, |
| const uint8* v_buf, |
| uint8* dst_abgr, |
| int width) { |
| __asm { |
| push esi |
| push edi |
| mov eax, [esp + 8 + 4] // Y: eax = y_buf (the + 8 skips the two pushed registers) |
| mov esi, [esp + 8 + 8] // U: esi = u_buf |
| mov edi, [esp + 8 + 12] // V: edi = v_buf |
| mov edx, [esp + 8 + 16] // abgr: edx = dst_abgr |
| mov ecx, [esp + 8 + 20] // width: ecx = pixels remaining |
| sub edi, esi /* edi = v_buf - u_buf, so [esi + edi] addresses V while only esi advances */ |
| pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha (xmm5 is read by STOREABGR) |
| |
| convertloop: |
| READYUV422 /* xmm0 = 8 UV pairs; esi += 4 */ |
| YUVTORGB(kYuvConstants) /* xmm0, xmm1, xmm2 = B, G, R; eax += 8 */ |
| STOREABGR /* writes 32 bytes; edx += 32 */ |
| |
| sub ecx, 8 |
| jg convertloop /* loop while pixels remain; the body always runs at least once and handles a full 8 pixels */ |
| |
| pop edi |
| pop esi |
| ret |
| } |
| } |
| |
| __declspec(naked) |
| void I422ToRGBARow_SSSE3(const uint8* y_buf, |
| const uint8* u_buf, |
| const uint8* v_buf, |
| uint8* dst_rgba, |
| int width) { |
| __asm { |
| push esi |
| push edi |
| mov eax, [esp + 8 + 4] // Y |
| mov esi, [esp + 8 + 8] // U |
| mov edi, [esp + 8 + 12] // V |
| mov edx, [esp + 8 + 16] // rgba |
| mov ecx, [esp + 8 |