| /* |
| * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
| * |
| * Use of this source code is governed by a BSD-style license |
| * that can be found in the LICENSE file in the root of the source |
| * tree. An additional intellectual property rights grant can be found |
| * in the file PATENTS. All contributing project authors may |
| * be found in the AUTHORS file in the root of the source tree. |
| */ |
| |
| #include "libyuv/row.h" |
| |
| // This module is for Visual C 32/64 bit and clangcl 32 bit |
| #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ |
| (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__))) |
| |
| #if defined(_M_X64) |
| #include <emmintrin.h> |
| #include <tmmintrin.h> // For _mm_maddubs_epi16 |
| #endif |
| |
| #ifdef __cplusplus |
| namespace libyuv { |
| extern "C" { |
| #endif |
| |
| // 64 bit |
| #if defined(_M_X64) |
| |
| // Read 4 UV from 422, upsample to 8 UV. |
| #define READYUV422 \ |
| xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \ |
| xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \ |
| xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ |
| xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \ |
| u_buf += 4; \ |
| xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ |
| xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ |
| y_buf += 8; |
| |
| // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. |
| #define READYUVA422 \ |
| xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \ |
| xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \ |
| xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ |
| xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \ |
| u_buf += 4; \ |
| xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ |
| xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ |
| y_buf += 8; \ |
| xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \ |
| a_buf += 8; |
| |
| // Convert 8 pixels: 8 UV and 8 Y. |
| #define YUVTORGB(yuvconstants) \ |
| xmm1 = _mm_loadu_si128(&xmm0); \ |
| xmm2 = _mm_loadu_si128(&xmm0); \ |
| xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \ |
| xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \ |
| xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \ |
| xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0); \ |
| xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1); \ |
| xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2); \ |
| xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \ |
| xmm0 = _mm_adds_epi16(xmm0, xmm4); \ |
| xmm1 = _mm_adds_epi16(xmm1, xmm4); \ |
| xmm2 = _mm_adds_epi16(xmm2, xmm4); \ |
| xmm0 = _mm_srai_epi16(xmm0, 6); \ |
| xmm1 = _mm_srai_epi16(xmm1, 6); \ |
| xmm2 = _mm_srai_epi16(xmm2, 6); \ |
| xmm0 = _mm_packus_epi16(xmm0, xmm0); \ |
| xmm1 = _mm_packus_epi16(xmm1, xmm1); \ |
| xmm2 = _mm_packus_epi16(xmm2, xmm2); |
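| |
| // A rough scalar view of the fixed point math above (constants are the |
| // YuvConstants fields used in this macro): |
| //   y16 = ((Y * 0x0101) * kYToRgb) >> 16 |
| //   B   = clamp8((y16 + kUVBiasB - (U * kUVToB[0] + V * kUVToB[1])) >> 6) |
| // and likewise for G and R with their own coefficients and biases. |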
| |
| // Store 8 ARGB values. |
| #define STOREARGB \ |
| xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ |
| xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); \ |
| xmm1 = _mm_loadu_si128(&xmm0); \ |
| xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); \ |
| xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); \ |
| _mm_storeu_si128((__m128i*)dst_argb, xmm0); \ |
| _mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1); \ |
| dst_argb += 32; |
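| |
| // Note: libyuv "ARGB" is little endian, so STOREARGB writes bytes to memory |
| // in B, G, R, A order. |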
| |
| #if defined(HAS_I422TOARGBROW_SSSE3) |
| void I422ToARGBRow_SSSE3(const uint8_t* y_buf, |
| const uint8_t* u_buf, |
| const uint8_t* v_buf, |
| uint8_t* dst_argb, |
| const struct YuvConstants* yuvconstants, |
| int width) { |
| __m128i xmm0, xmm1, xmm2, xmm4; |
| const __m128i xmm5 = _mm_set1_epi8(-1); |
| const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; |
| while (width > 0) { |
| READYUV422 |
| YUVTORGB(yuvconstants) |
| STOREARGB |
| width -= 8; |
| } |
| } |
| #endif |
| |
| #if defined(HAS_I422ALPHATOARGBROW_SSSE3) |
| void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, |
| const uint8_t* u_buf, |
| const uint8_t* v_buf, |
| const uint8_t* a_buf, |
| uint8_t* dst_argb, |
| const struct YuvConstants* yuvconstants, |
| int width) { |
| __m128i xmm0, xmm1, xmm2, xmm4, xmm5; |
| const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; |
| while (width > 0) { |
| READYUVA422 |
| YUVTORGB(yuvconstants) |
| STOREARGB |
| width -= 8; |
| } |
| } |
| #endif |
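| |
| // Both loops above step 8 pixels per iteration, so they process width |
| // rounded up to a multiple of 8; callers typically use the _Any wrappers |
| // to handle the remaining pixels of other widths. |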
| |
| // 32 bit |
| #else // defined(_M_X64) |
| #ifdef HAS_ARGBTOYROW_SSSE3 |
| |
| // Constants for ARGB. |
| static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0, |
| 13, 65, 33, 0, 13, 65, 33, 0}; |
| |
| // JPeg full range. |
| static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0, |
| 15, 75, 38, 0, 15, 75, 38, 0}; |
| |
| static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0, |
| 112, -74, -38, 0, 112, -74, -38, 0}; |
| |
| static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0, |
| 127, -84, -43, 0, 127, -84, -43, 0}; |
| |
| static const vec8 kARGBToV = { |
| -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, |
| }; |
| |
| static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0, |
| -20, -107, 127, 0, -20, -107, 127, 0}; |
| |
| // vpshufb shuffle to restore byte order after vphaddw + packing to bytes. |
| static const lvec8 kShufARGBToUV_AVX = { |
| 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, |
| 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15}; |
| |
| // Constants for BGRA. |
| static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13, |
| 0, 33, 65, 13, 0, 33, 65, 13}; |
| |
| static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112, |
| 0, -38, -74, 112, 0, -38, -74, 112}; |
| |
| static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18, |
| 0, 112, -94, -18, 0, 112, -94, -18}; |
| |
| // Constants for ABGR. |
| static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0, |
| 33, 65, 13, 0, 33, 65, 13, 0}; |
| |
| static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0, |
| -38, -74, 112, 0, -38, -74, 112, 0}; |
| |
| static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0, |
| 112, -94, -18, 0, 112, -94, -18, 0}; |
| |
| // Constants for RGBA. |
| static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33, |
| 0, 13, 65, 33, 0, 13, 65, 33}; |
| |
| static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38, |
| 0, 112, -74, -38, 0, 112, -74, -38}; |
| |
| static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112, |
| 0, -18, -94, 112, 0, -18, -94, 112}; |
| |
| static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, |
| 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u}; |
| |
| // 7 bit fixed point 0.5. |
| static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64}; |
| |
| static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, |
| 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; |
| |
| static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u, |
| 0x8080u, 0x8080u, 0x8080u, 0x8080u}; |
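| |
| // Scalar summary of how the tables above are used by the row functions |
| // below (Y coefficients are 7 bit fixed point, U/V are 8 bit), e.g. ARGB: |
| //   Y  = ((13 * B + 65 * G + 33 * R) >> 7) + 16 |
| //   YJ = (15 * B + 75 * G + 38 * R + 64) >> 7 |
| //   U  = ((112 * B - 74 * G - 38 * R) >> 8) + 128 |
| //   V  = ((-18 * B - 94 * G + 112 * R) >> 8) + 128 |
| // In the UV row functions these are applied to 2x2 averaged pixels. |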
| |
| // Shuffle table for converting RGB24 to ARGB. |
| static const uvec8 kShuffleMaskRGB24ToARGB = { |
| 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u}; |
| |
| // Shuffle table for converting RAW to ARGB. |
| static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, |
| 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u}; |
| |
| // Shuffle table for converting RAW to RGB24. First 8. |
| static const uvec8 kShuffleMaskRAWToRGB24_0 = { |
| 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u, |
| 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; |
| |
| // Shuffle table for converting RAW to RGB24. Middle 8. |
| static const uvec8 kShuffleMaskRAWToRGB24_1 = { |
| 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u, |
| 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; |
| |
| // Shuffle table for converting RAW to RGB24. Last 8. |
| static const uvec8 kShuffleMaskRAWToRGB24_2 = { |
| 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u, |
| 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; |
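| |
| // In the shuffle tables above and below, an index of 128 has the high bit |
| // set, which makes pshufb write zero to that destination byte. |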
| |
| // Shuffle table for converting ARGB to RGB24. |
| static const uvec8 kShuffleMaskARGBToRGB24 = { |
| 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u}; |
| |
| // Shuffle table for converting ARGB to RAW. |
| static const uvec8 kShuffleMaskARGBToRAW = { |
| 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u}; |
| |
| // Shuffle table for converting ARGB to RGB24 in I422ToRGB24. First 8 + next 4. |
| static const uvec8 kShuffleMaskARGBToRGB24_0 = { |
| 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u}; |
| |
| // YUY2 shuf 16 Y to 32 Y. |
| static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, |
| 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4, |
| 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; |
| |
| // YUY2 shuf 8 UV to 16 UV. |
| static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, |
| 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7, |
| 5, 7, 9, 11, 9, 11, 13, 15, 13, 15}; |
| |
| // UYVY shuf 16 Y to 32 Y. |
| static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, |
| 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5, |
| 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}; |
| |
| // UYVY shuf 8 UV to 16 UV. |
| static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, |
| 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6, |
| 4, 6, 8, 10, 8, 10, 12, 14, 12, 14}; |
| |
| // NV21 shuf 8 VU to 16 UV. |
| static const lvec8 kShuffleNV21 = { |
| 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, |
| 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, |
| }; |
| |
| // Duplicates gray value 3 times and fills in alpha opaque. |
| __declspec(naked) void J400ToARGBRow_SSE2(const uint8_t* src_y, |
| uint8_t* dst_argb, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] // src_y |
| mov edx, [esp + 8] // dst_argb |
| mov ecx, [esp + 12] // width |
| pcmpeqb xmm5, xmm5 // generate mask 0xff000000 |
| pslld xmm5, 24 |
| |
| convertloop: |
| movq xmm0, qword ptr [eax] |
| lea eax, [eax + 8] |
| punpcklbw xmm0, xmm0 |
| movdqa xmm1, xmm0 |
| punpcklwd xmm0, xmm0 |
| punpckhwd xmm1, xmm1 |
| por xmm0, xmm5 |
| por xmm1, xmm5 |
| movdqu [edx], xmm0 |
| movdqu [edx + 16], xmm1 |
| lea edx, [edx + 32] |
| sub ecx, 8 |
| jg convertloop |
| ret |
| } |
| } |
| |
| #ifdef HAS_J400TOARGBROW_AVX2 |
| // Duplicates gray value 3 times and fills in alpha opaque. |
| __declspec(naked) void J400ToARGBRow_AVX2(const uint8_t* src_y, |
| uint8_t* dst_argb, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] // src_y |
| mov edx, [esp + 8] // dst_argb |
| mov ecx, [esp + 12] // width |
| vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 |
| vpslld ymm5, ymm5, 24 |
| |
| convertloop: |
| vmovdqu xmm0, [eax] |
| lea eax, [eax + 16] |
| vpermq ymm0, ymm0, 0xd8 |
| vpunpcklbw ymm0, ymm0, ymm0 |
| vpermq ymm0, ymm0, 0xd8 |
| vpunpckhwd ymm1, ymm0, ymm0 |
| vpunpcklwd ymm0, ymm0, ymm0 |
| vpor ymm0, ymm0, ymm5 |
| vpor ymm1, ymm1, ymm5 |
| vmovdqu [edx], ymm0 |
| vmovdqu [edx + 32], ymm1 |
| lea edx, [edx + 64] |
| sub ecx, 16 |
| jg convertloop |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_J400TOARGBROW_AVX2 |
| |
| __declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, |
| uint8_t* dst_argb, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] // src_rgb24 |
| mov edx, [esp + 8] // dst_argb |
| mov ecx, [esp + 12] // width |
| pcmpeqb xmm5, xmm5 // generate mask 0xff000000 |
| pslld xmm5, 24 |
| movdqa xmm4, xmmword ptr kShuffleMaskRGB24ToARGB |
| |
| convertloop: |
| movdqu xmm0, [eax] |
| movdqu xmm1, [eax + 16] |
| movdqu xmm3, [eax + 32] |
| lea eax, [eax + 48] |
| movdqa xmm2, xmm3 |
| palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:7] xmm1[8:15]} |
| pshufb xmm2, xmm4 |
| por xmm2, xmm5 |
| palignr xmm1, xmm0, 12 // xmm1 = { xmm1[0:11] xmm0[12:15]} |
| pshufb xmm0, xmm4 |
| movdqu [edx + 32], xmm2 |
| por xmm0, xmm5 |
| pshufb xmm1, xmm4 |
| movdqu [edx], xmm0 |
| por xmm1, xmm5 |
| palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} |
| pshufb xmm3, xmm4 |
| movdqu [edx + 16], xmm1 |
| por xmm3, xmm5 |
| movdqu [edx + 48], xmm3 |
| lea edx, [edx + 64] |
| sub ecx, 16 |
| jg convertloop |
| ret |
| } |
| } |
| |
| __declspec(naked) void RAWToARGBRow_SSSE3(const uint8_t* src_raw, |
| uint8_t* dst_argb, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] // src_raw |
| mov edx, [esp + 8] // dst_argb |
| mov ecx, [esp + 12] // width |
| pcmpeqb xmm5, xmm5 // generate mask 0xff000000 |
| pslld xmm5, 24 |
| movdqa xmm4, xmmword ptr kShuffleMaskRAWToARGB |
| |
| convertloop: |
| movdqu xmm0, [eax] |
| movdqu xmm1, [eax + 16] |
| movdqu xmm3, [eax + 32] |
| lea eax, [eax + 48] |
| movdqa xmm2, xmm3 |
| palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:7] xmm1[8:15]} |
| pshufb xmm2, xmm4 |
| por xmm2, xmm5 |
| palignr xmm1, xmm0, 12 // xmm1 = { xmm1[0:11] xmm0[12:15]} |
| pshufb xmm0, xmm4 |
| movdqu [edx + 32], xmm2 |
| por xmm0, xmm5 |
| pshufb xmm1, xmm4 |
| movdqu [edx], xmm0 |
| por xmm1, xmm5 |
| palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} |
| pshufb xmm3, xmm4 |
| movdqu [edx + 16], xmm1 |
| por xmm3, xmm5 |
| movdqu [edx + 48], xmm3 |
| lea edx, [edx + 64] |
| sub ecx, 16 |
| jg convertloop |
| ret |
| } |
| } |
| |
| __declspec(naked) void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, |
| uint8_t* dst_rgb24, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] // src_raw |
| mov edx, [esp + 8] // dst_rgb24 |
| mov ecx, [esp + 12] // width |
| movdqa xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0 |
| movdqa xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1 |
| movdqa xmm5, xmmword ptr kShuffleMaskRAWToRGB24_2 |
| |
| convertloop: |
| movdqu xmm0, [eax] |
| movdqu xmm1, [eax + 4] |
| movdqu xmm2, [eax + 8] |
| lea eax, [eax + 24] |
| pshufb xmm0, xmm3 |
| pshufb xmm1, xmm4 |
| pshufb xmm2, xmm5 |
| movq qword ptr [edx], xmm0 |
| movq qword ptr [edx + 8], xmm1 |
| movq qword ptr [edx + 16], xmm2 |
| lea edx, [edx + 24] |
| sub ecx, 8 |
| jg convertloop |
| ret |
| } |
| } |
| |
| // pmul method to replicate bits. |
| // Math to replicate bits: |
| // (v << 8) | (v << 3) |
| // v * 256 + v * 8 |
| // v * (256 + 8) |
| // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 |
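| // For example, a 5 bit value v placed in the top of a 16 bit lane (v << 11), |
| // multiplied by 0x0108 with pmulhuw, gives ((v << 11) * 0x0108) >> 16 |
| // = v * 8 + v / 4, which is the 5 bit value replicated to 8 bits. |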
| // 20 instructions. |
| __declspec(naked) void RGB565ToARGBRow_SSE2(const uint8_t* src_rgb565, |
| uint8_t* dst_argb, |
| int width) { |
| __asm { |
| mov eax, 0x01080108 // generate multiplier to repeat 5 bits |
| movd xmm5, eax |
| pshufd xmm5, xmm5, 0 |
| mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits |
| movd xmm6, eax |
| pshufd xmm6, xmm6, 0 |
| pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red |
| psllw xmm3, 11 |
| pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green |
| psllw xmm4, 10 |
| psrlw xmm4, 5 |
| pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha |
| psllw xmm7, 8 |
| |
| mov eax, [esp + 4] // src_rgb565 |
| mov edx, [esp + 8] // dst_argb |
| mov ecx, [esp + 12] // width |
| sub edx, eax |
| sub edx, eax |
| |
| convertloop: |
| movdqu xmm0, [eax] // fetch 8 pixels of bgr565 |
| movdqa xmm1, xmm0 |
| movdqa xmm2, xmm0 |
| pand xmm1, xmm3 // R in upper 5 bits |
| psllw xmm2, 11 // B in upper 5 bits |
| pmulhuw xmm1, xmm5 // * (256 + 8) |
| pmulhuw xmm2, xmm5 // * (256 + 8) |
| psllw xmm1, 8 |
| por xmm1, xmm2 // RB |
| pand xmm0, xmm4 // G in middle 6 bits |
| pmulhuw xmm0, xmm6 // << 5 * (256 + 4) |
| por xmm0, xmm7 // AG |
| movdqa xmm2, xmm1 |
| punpcklbw xmm1, xmm0 |
| punpckhbw xmm2, xmm0 |
| movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB |
| movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB |
| lea eax, [eax + 16] |
| sub ecx, 8 |
| jg convertloop |
| ret |
| } |
| } |
| |
| #ifdef HAS_RGB565TOARGBROW_AVX2 |
| // pmul method to replicate bits. |
| // Math to replicate bits: |
| // (v << 8) | (v << 3) |
| // v * 256 + v * 8 |
| // v * (256 + 8) |
| // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 |
| __declspec(naked) void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565, |
| uint8_t* dst_argb, |
| int width) { |
| __asm { |
| mov eax, 0x01080108 // generate multiplier to repeat 5 bits |
| vmovd xmm5, eax |
| vbroadcastss ymm5, xmm5 |
| mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits |
| vmovd xmm6, eax |
| vbroadcastss ymm6, xmm6 |
| vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red |
| vpsllw ymm3, ymm3, 11 |
| vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green |
| vpsllw ymm4, ymm4, 10 |
| vpsrlw ymm4, ymm4, 5 |
| vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha |
| vpsllw ymm7, ymm7, 8 |
| |
| mov eax, [esp + 4] // src_rgb565 |
| mov edx, [esp + 8] // dst_argb |
| mov ecx, [esp + 12] // width |
| sub edx, eax |
| sub edx, eax |
| |
| convertloop: |
| vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565 |
| vpand ymm1, ymm0, ymm3 // R in upper 5 bits |
| vpsllw ymm2, ymm0, 11 // B in upper 5 bits |
| vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) |
| vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) |
| vpsllw ymm1, ymm1, 8 |
| vpor ymm1, ymm1, ymm2 // RB |
| vpand ymm0, ymm0, ymm4 // G in middle 6 bits |
| vpmulhuw ymm0, ymm0, ymm6 // << 5 * (256 + 4) |
| vpor ymm0, ymm0, ymm7 // AG |
| vpermq ymm0, ymm0, 0xd8 // mutate for unpack |
| vpermq ymm1, ymm1, 0xd8 |
| vpunpckhbw ymm2, ymm1, ymm0 |
| vpunpcklbw ymm1, ymm1, ymm0 |
| vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB |
| vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB |
| lea eax, [eax + 32] |
| sub ecx, 16 |
| jg convertloop |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_RGB565TOARGBROW_AVX2 |
| |
| #ifdef HAS_ARGB1555TOARGBROW_AVX2 |
| __declspec(naked) void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555, |
| uint8_t* dst_argb, |
| int width) { |
| __asm { |
| mov eax, 0x01080108 // generate multiplier to repeat 5 bits |
| vmovd xmm5, eax |
| vbroadcastss ymm5, xmm5 |
| mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits |
| vmovd xmm6, eax |
| vbroadcastss ymm6, xmm6 |
| vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red |
| vpsllw ymm3, ymm3, 11 |
| vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green |
| vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha |
| vpsllw ymm7, ymm7, 8 |
| |
| mov eax, [esp + 4] // src_argb1555 |
| mov edx, [esp + 8] // dst_argb |
| mov ecx, [esp + 12] // width |
| sub edx, eax |
| sub edx, eax |
| |
| convertloop: |
| vmovdqu ymm0, [eax] // fetch 16 pixels of 1555 |
| vpsllw ymm1, ymm0, 1 // R in upper 5 bits |
| vpsllw ymm2, ymm0, 11 // B in upper 5 bits |
| vpand ymm1, ymm1, ymm3 |
| vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) |
| vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) |
| vpsllw ymm1, ymm1, 8 |
| vpor ymm1, ymm1, ymm2 // RB |
| vpsraw ymm2, ymm0, 8 // A |
| vpand ymm0, ymm0, ymm4 // G in middle 5 bits |
| vpmulhuw ymm0, ymm0, ymm6 // << 6 * (256 + 8) |
| vpand ymm2, ymm2, ymm7 |
| vpor ymm0, ymm0, ymm2 // AG |
| vpermq ymm0, ymm0, 0xd8 // mutate for unpack |
| vpermq ymm1, ymm1, 0xd8 |
| vpunpckhbw ymm2, ymm1, ymm0 |
| vpunpcklbw ymm1, ymm1, ymm0 |
| vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB |
| vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB |
| lea eax, [eax + 32] |
| sub ecx, 16 |
| jg convertloop |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_ARGB1555TOARGBROW_AVX2 |
| |
| #ifdef HAS_ARGB4444TOARGBROW_AVX2 |
| __declspec(naked) void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444, |
| uint8_t* dst_argb, |
| int width) { |
| __asm { |
| mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f |
| vmovd xmm4, eax |
| vbroadcastss ymm4, xmm4 |
| vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles |
| mov eax, [esp + 4] // src_argb4444 |
| mov edx, [esp + 8] // dst_argb |
| mov ecx, [esp + 12] // width |
| sub edx, eax |
| sub edx, eax |
| |
| convertloop: |
| vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444 |
| vpand ymm2, ymm0, ymm5 // mask high nibbles |
| vpand ymm0, ymm0, ymm4 // mask low nibbles |
| vpsrlw ymm3, ymm2, 4 |
| vpsllw ymm1, ymm0, 4 |
| vpor ymm2, ymm2, ymm3 |
| vpor ymm0, ymm0, ymm1 |
| vpermq ymm0, ymm0, 0xd8 // mutate for unpack |
| vpermq ymm2, ymm2, 0xd8 |
| vpunpckhbw ymm1, ymm0, ymm2 |
| vpunpcklbw ymm0, ymm0, ymm2 |
| vmovdqu [eax * 2 + edx], ymm0 // store 8 pixels of ARGB |
| vmovdqu [eax * 2 + edx + 32], ymm1 // store next 8 pixels of ARGB |
| lea eax, [eax + 32] |
| sub ecx, 16 |
| jg convertloop |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_ARGB4444TOARGBROW_AVX2 |
| |
| // 24 instructions |
| __declspec(naked) void ARGB1555ToARGBRow_SSE2(const uint8_t* src_argb1555, |
| uint8_t* dst_argb, |
| int width) { |
| __asm { |
| mov eax, 0x01080108 // generate multiplier to repeat 5 bits |
| movd xmm5, eax |
| pshufd xmm5, xmm5, 0 |
| mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits |
| movd xmm6, eax |
| pshufd xmm6, xmm6, 0 |
| pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red |
| psllw xmm3, 11 |
| movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green |
| psrlw xmm4, 6 |
| pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha |
| psllw xmm7, 8 |
| |
| mov eax, [esp + 4] // src_argb1555 |
| mov edx, [esp + 8] // dst_argb |
| mov ecx, [esp + 12] // width |
| sub edx, eax |
| sub edx, eax |
| |
| convertloop: |
| movdqu xmm0, [eax] // fetch 8 pixels of 1555 |
| movdqa xmm1, xmm0 |
| movdqa xmm2, xmm0 |
| psllw xmm1, 1 // R in upper 5 bits |
| psllw xmm2, 11 // B in upper 5 bits |
| pand xmm1, xmm3 |
| pmulhuw xmm2, xmm5 // * (256 + 8) |
| pmulhuw xmm1, xmm5 // * (256 + 8) |
| psllw xmm1, 8 |
| por xmm1, xmm2 // RB |
| movdqa xmm2, xmm0 |
| pand xmm0, xmm4 // G in middle 5 bits |
| psraw xmm2, 8 // A |
| pmulhuw xmm0, xmm6 // << 6 * (256 + 8) |
| pand xmm2, xmm7 |
| por xmm0, xmm2 // AG |
| movdqa xmm2, xmm1 |
| punpcklbw xmm1, xmm0 |
| punpckhbw xmm2, xmm0 |
| movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB |
| movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB |
| lea eax, [eax + 16] |
| sub ecx, 8 |
| jg convertloop |
| ret |
| } |
| } |
| |
| // 18 instructions. |
| __declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8_t* src_argb4444, |
| uint8_t* dst_argb, |
| int width) { |
| __asm { |
| mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f |
| movd xmm4, eax |
| pshufd xmm4, xmm4, 0 |
| movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles |
| pslld xmm5, 4 |
| mov eax, [esp + 4] // src_argb4444 |
| mov edx, [esp + 8] // dst_argb |
| mov ecx, [esp + 12] // width |
| sub edx, eax |
| sub edx, eax |
| |
| convertloop: |
| movdqu xmm0, [eax] // fetch 8 pixels of bgra4444 |
| movdqa xmm2, xmm0 |
| pand xmm0, xmm4 // mask low nibbles |
| pand xmm2, xmm5 // mask high nibbles |
| movdqa xmm1, xmm0 |
| movdqa xmm3, xmm2 |
| psllw xmm1, 4 |
| psrlw xmm3, 4 |
| por xmm0, xmm1 |
| por xmm2, xmm3 |
| movdqa xmm1, xmm0 |
| punpcklbw xmm0, xmm2 |
| punpckhbw xmm1, xmm2 |
| movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB |
| movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB |
| lea eax, [eax + 16] |
| sub ecx, 8 |
| jg convertloop |
| ret |
| } |
| } |
| |
| __declspec(naked) void ARGBToRGB24Row_SSSE3(const uint8_t* src_argb, |
| uint8_t* dst_rgb, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] // src_argb |
| mov edx, [esp + 8] // dst_rgb |
| mov ecx, [esp + 12] // width |
| movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24 |
| |
| convertloop: |
| movdqu xmm0, [eax] // fetch 16 pixels of argb |
| movdqu xmm1, [eax + 16] |
| movdqu xmm2, [eax + 32] |
| movdqu xmm3, [eax + 48] |
| lea eax, [eax + 64] |
| pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB |
| pshufb xmm1, xmm6 |
| pshufb xmm2, xmm6 |
| pshufb xmm3, xmm6 |
| movdqa xmm4, xmm1 // 4 bytes from 1 for 0 |
| psrldq xmm1, 4 // 8 bytes from 1 |
| pslldq xmm4, 12 // 4 bytes from 1 for 0 |
| movdqa xmm5, xmm2 // 8 bytes from 2 for 1 |
| por xmm0, xmm4 // 4 bytes from 1 for 0 |
| pslldq xmm5, 8 // 8 bytes from 2 for 1 |
| movdqu [edx], xmm0 // store 0 |
| por xmm1, xmm5 // 8 bytes from 2 for 1 |
| psrldq xmm2, 8 // 4 bytes from 2 |
| pslldq xmm3, 4 // 12 bytes from 3 for 2 |
| por xmm2, xmm3 // 12 bytes from 3 for 2 |
| movdqu [edx + 16], xmm1 // store 1 |
| movdqu [edx + 32], xmm2 // store 2 |
| lea edx, [edx + 48] |
| sub ecx, 16 |
| jg convertloop |
| ret |
| } |
| } |
| |
| __declspec(naked) void ARGBToRAWRow_SSSE3(const uint8_t* src_argb, |
| uint8_t* dst_rgb, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] // src_argb |
| mov edx, [esp + 8] // dst_rgb |
| mov ecx, [esp + 12] // width |
| movdqa xmm6, xmmword ptr kShuffleMaskARGBToRAW |
| |
| convertloop: |
| movdqu xmm0, [eax] // fetch 16 pixels of argb |
| movdqu xmm1, [eax + 16] |
| movdqu xmm2, [eax + 32] |
| movdqu xmm3, [eax + 48] |
| lea eax, [eax + 64] |
| pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB |
| pshufb xmm1, xmm6 |
| pshufb xmm2, xmm6 |
| pshufb xmm3, xmm6 |
| movdqa xmm4, xmm1 // 4 bytes from 1 for 0 |
| psrldq xmm1, 4 // 8 bytes from 1 |
| pslldq xmm4, 12 // 4 bytes from 1 for 0 |
| movdqa xmm5, xmm2 // 8 bytes from 2 for 1 |
| por xmm0, xmm4 // 4 bytes from 1 for 0 |
| pslldq xmm5, 8 // 8 bytes from 2 for 1 |
| movdqu [edx], xmm0 // store 0 |
| por xmm1, xmm5 // 8 bytes from 2 for 1 |
| psrldq xmm2, 8 // 4 bytes from 2 |
| pslldq xmm3, 4 // 12 bytes from 3 for 2 |
| por xmm2, xmm3 // 12 bytes from 3 for 2 |
| movdqu [edx + 16], xmm1 // store 1 |
| movdqu [edx + 32], xmm2 // store 2 |
| lea edx, [edx + 48] |
| sub ecx, 16 |
| jg convertloop |
| ret |
| } |
| } |
| |
| __declspec(naked) void ARGBToRGB565Row_SSE2(const uint8_t* src_argb, |
| uint8_t* dst_rgb, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] // src_argb |
| mov edx, [esp + 8] // dst_rgb |
| mov ecx, [esp + 12] // width |
| pcmpeqb xmm3, xmm3 // generate mask 0x0000001f |
| psrld xmm3, 27 |
| pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 |
| psrld xmm4, 26 |
| pslld xmm4, 5 |
| pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 |
| pslld xmm5, 11 |
| |
| convertloop: |
| movdqu xmm0, [eax] // fetch 4 pixels of argb |
| movdqa xmm1, xmm0 // B |
| movdqa xmm2, xmm0 // G |
| pslld xmm0, 8 // R |
| psrld xmm1, 3 // B |
| psrld xmm2, 5 // G |
| psrad xmm0, 16 // R |
| pand xmm1, xmm3 // B |
| pand xmm2, xmm4 // G |
| pand xmm0, xmm5 // R |
| por xmm1, xmm2 // BG |
| por xmm0, xmm1 // BGR |
| packssdw xmm0, xmm0 |
| lea eax, [eax + 16] |
| movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 |
| lea edx, [edx + 8] |
| sub ecx, 4 |
| jg convertloop |
| ret |
| } |
| } |
| |
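| // dither4 packs one dither byte per pixel for 4 pixels; the unpacks below |
| // replicate each byte across its pixel's 4 channels so the saturating add |
| // applies the same dither to B, G and R before truncation to 565. |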
| __declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8_t* src_argb, |
| uint8_t* dst_rgb, |
| const uint32_t dither4, |
| int width) { |
| __asm { |
| |
| mov eax, [esp + 4] // src_argb |
| mov edx, [esp + 8] // dst_rgb |
| movd xmm6, [esp + 12] // dither4 |
| mov ecx, [esp + 16] // width |
| punpcklbw xmm6, xmm6 // make dither 16 bytes |
| movdqa xmm7, xmm6 |
| punpcklwd xmm6, xmm6 |
| punpckhwd xmm7, xmm7 |
| pcmpeqb xmm3, xmm3 // generate mask 0x0000001f |
| psrld xmm3, 27 |
| pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 |
| psrld xmm4, 26 |
| pslld xmm4, 5 |
| pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 |
| pslld xmm5, 11 |
| |
| convertloop: |
| movdqu xmm0, [eax] // fetch 4 pixels of argb |
| paddusb xmm0, xmm6 // add dither |
| movdqa xmm1, xmm0 // B |
| movdqa xmm2, xmm0 // G |
| pslld xmm0, 8 // R |
| psrld xmm1, 3 // B |
| psrld xmm2, 5 // G |
| psrad xmm0, 16 // R |
| pand xmm1, xmm3 // B |
| pand xmm2, xmm4 // G |
| pand xmm0, xmm5 // R |
| por xmm1, xmm2 // BG |
| por xmm0, xmm1 // BGR |
| packssdw xmm0, xmm0 |
| lea eax, [eax + 16] |
| movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 |
| lea edx, [edx + 8] |
| sub ecx, 4 |
| jg convertloop |
| ret |
| } |
| } |
| |
| #ifdef HAS_ARGBTORGB565DITHERROW_AVX2 |
| __declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8_t* src_argb, |
| uint8_t* dst_rgb, |
| const uint32_t dither4, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] // src_argb |
| mov edx, [esp + 8] // dst_rgb |
| vbroadcastss xmm6, [esp + 12] // dither4 |
| mov ecx, [esp + 16] // width |
| vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes |
| vpermq ymm6, ymm6, 0xd8 |
| vpunpcklwd ymm6, ymm6, ymm6 |
| vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f |
| vpsrld ymm3, ymm3, 27 |
| vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 |
| vpsrld ymm4, ymm4, 26 |
| vpslld ymm4, ymm4, 5 |
| vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 |
| |
| convertloop: |
| vmovdqu ymm0, [eax] // fetch 8 pixels of argb |
| vpaddusb ymm0, ymm0, ymm6 // add dither |
| vpsrld ymm2, ymm0, 5 // G |
| vpsrld ymm1, ymm0, 3 // B |
| vpsrld ymm0, ymm0, 8 // R |
| vpand ymm2, ymm2, ymm4 // G |
| vpand ymm1, ymm1, ymm3 // B |
| vpand ymm0, ymm0, ymm5 // R |
| vpor ymm1, ymm1, ymm2 // BG |
| vpor ymm0, ymm0, ymm1 // BGR |
| vpackusdw ymm0, ymm0, ymm0 |
| vpermq ymm0, ymm0, 0xd8 |
| lea eax, [eax + 32] |
| vmovdqu [edx], xmm0 // store 8 pixels of RGB565 |
| lea edx, [edx + 16] |
| sub ecx, 8 |
| jg convertloop |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_ARGBTORGB565DITHERROW_AVX2 |
| |
| // TODO(fbarchard): Improve sign extension/packing. |
| __declspec(naked) void ARGBToARGB1555Row_SSE2(const uint8_t* src_argb, |
| uint8_t* dst_rgb, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] // src_argb |
| mov edx, [esp + 8] // dst_rgb |
| mov ecx, [esp + 12] // width |
| pcmpeqb xmm4, xmm4 // generate mask 0x0000001f |
| psrld xmm4, 27 |
| movdqa xmm5, xmm4 // generate mask 0x000003e0 |
| pslld xmm5, 5 |
| movdqa xmm6, xmm4 // generate mask 0x00007c00 |
| pslld xmm6, 10 |
| pcmpeqb xmm7, xmm7 // generate mask 0xffff8000 |
| pslld xmm7, 15 |
| |
| convertloop: |
| movdqu xmm0, [eax] // fetch 4 pixels of argb |
| movdqa xmm1, xmm0 // B |
| movdqa xmm2, xmm0 // G |
| movdqa xmm3, xmm0 // R |
| psrad xmm0, 16 // A |
| psrld xmm1, 3 // B |
| psrld xmm2, 6 // G |
| psrld xmm3, 9 // R |
| pand xmm0, xmm7 // A |
| pand xmm1, xmm4 // B |
| pand xmm2, xmm5 // G |
| pand xmm3, xmm6 // R |
| por xmm0, xmm1 // BA |
| por xmm2, xmm3 // GR |
| por xmm0, xmm2 // BGRA |
| packssdw xmm0, xmm0 |
| lea eax, [eax + 16] |
| movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555 |
| lea edx, [edx + 8] |
| sub ecx, 4 |
| jg convertloop |
| ret |
| } |
| } |
| |
| __declspec(naked) void ARGBToARGB4444Row_SSE2(const uint8_t* src_argb, |
| uint8_t* dst_rgb, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] // src_argb |
| mov edx, [esp + 8] // dst_rgb |
| mov ecx, [esp + 12] // width |
| pcmpeqb xmm4, xmm4 // generate mask 0xf000f000 |
| psllw xmm4, 12 |
| movdqa xmm3, xmm4 // generate mask 0x00f000f0 |
| psrlw xmm3, 8 |
| |
| convertloop: |
| movdqu xmm0, [eax] // fetch 4 pixels of argb |
| movdqa xmm1, xmm0 |
| pand xmm0, xmm3 // low nibble |
| pand xmm1, xmm4 // high nibble |
| psrld xmm0, 4 |
| psrld xmm1, 8 |
| por xmm0, xmm1 |
| packuswb xmm0, xmm0 |
| lea eax, [eax + 16] |
| movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444 |
| lea edx, [edx + 8] |
| sub ecx, 4 |
| jg convertloop |
| ret |
| } |
| } |
| |
| #ifdef HAS_ARGBTORGB565ROW_AVX2 |
| __declspec(naked) void ARGBToRGB565Row_AVX2(const uint8_t* src_argb, |
| uint8_t* dst_rgb, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] // src_argb |
| mov edx, [esp + 8] // dst_rgb |
| mov ecx, [esp + 12] // width |
| vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f |
| vpsrld ymm3, ymm3, 27 |
| vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 |
| vpsrld ymm4, ymm4, 26 |
| vpslld ymm4, ymm4, 5 |
| vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 |
| |
| convertloop: |
| vmovdqu ymm0, [eax] // fetch 8 pixels of argb |
| vpsrld ymm2, ymm0, 5 // G |
| vpsrld ymm1, ymm0, 3 // B |
| vpsrld ymm0, ymm0, 8 // R |
| vpand ymm2, ymm2, ymm4 // G |
| vpand ymm1, ymm1, ymm3 // B |
| vpand ymm0, ymm0, ymm5 // R |
| vpor ymm1, ymm1, ymm2 // BG |
| vpor ymm0, ymm0, ymm1 // BGR |
| vpackusdw ymm0, ymm0, ymm0 |
| vpermq ymm0, ymm0, 0xd8 |
| lea eax, [eax + 32] |
| vmovdqu [edx], xmm0 // store 8 pixels of RGB565 |
| lea edx, [edx + 16] |
| sub ecx, 8 |
| jg convertloop |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_ARGBTORGB565ROW_AVX2 |
| |
| #ifdef HAS_ARGBTOARGB1555ROW_AVX2 |
| __declspec(naked) void ARGBToARGB1555Row_AVX2(const uint8_t* src_argb, |
| uint8_t* dst_rgb, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] // src_argb |
| mov edx, [esp + 8] // dst_rgb |
| mov ecx, [esp + 12] // width |
| vpcmpeqb ymm4, ymm4, ymm4 |
| vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f |
| vpslld ymm5, ymm4, 5 // generate mask 0x000003e0 |
| vpslld ymm6, ymm4, 10 // generate mask 0x00007c00 |
| vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000 |
| vpslld ymm7, ymm7, 15 |
| |
| convertloop: |
| vmovdqu ymm0, [eax] // fetch 8 pixels of argb |
| vpsrld ymm3, ymm0, 9 // R |
| vpsrld ymm2, ymm0, 6 // G |
| vpsrld ymm1, ymm0, 3 // B |
| vpsrad ymm0, ymm0, 16 // A |
| vpand ymm3, ymm3, ymm6 // R |
| vpand ymm2, ymm2, ymm5 // G |
| vpand ymm1, ymm1, ymm4 // B |
| vpand ymm0, ymm0, ymm7 // A |
| vpor ymm0, ymm0, ymm1 // BA |
| vpor ymm2, ymm2, ymm3 // GR |
| vpor ymm0, ymm0, ymm2 // BGRA |
| vpackssdw ymm0, ymm0, ymm0 |
| vpermq ymm0, ymm0, 0xd8 |
| lea eax, [eax + 32] |
| vmovdqu [edx], xmm0 // store 8 pixels of ARGB1555 |
| lea edx, [edx + 16] |
| sub ecx, 8 |
| jg convertloop |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_ARGBTOARGB1555ROW_AVX2 |
| |
| #ifdef HAS_ARGBTOARGB4444ROW_AVX2 |
| __declspec(naked) void ARGBToARGB4444Row_AVX2(const uint8_t* src_argb, |
| uint8_t* dst_rgb, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] // src_argb |
| mov edx, [esp + 8] // dst_rgb |
| mov ecx, [esp + 12] // width |
| vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000 |
| vpsllw ymm4, ymm4, 12 |
| vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0 |
| |
| convertloop: |
| vmovdqu ymm0, [eax] // fetch 8 pixels of argb |
| vpand ymm1, ymm0, ymm4 // high nibble |
| vpand ymm0, ymm0, ymm3 // low nibble |
| vpsrld ymm1, ymm1, 8 |
| vpsrld ymm0, ymm0, 4 |
| vpor ymm0, ymm0, ymm1 |
| vpackuswb ymm0, ymm0, ymm0 |
| vpermq ymm0, ymm0, 0xd8 |
| lea eax, [eax + 32] |
| vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444 |
| lea edx, [edx + 16] |
| sub ecx, 8 |
| jg convertloop |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_ARGBTOARGB4444ROW_AVX2 |
| |
| // Convert 16 ARGB pixels (64 bytes) to 16 Y values. |
| __declspec(naked) void ARGBToYRow_SSSE3(const uint8_t* src_argb, |
| uint8_t* dst_y, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] /* src_argb */ |
| mov edx, [esp + 8] /* dst_y */ |
| mov ecx, [esp + 12] /* width */ |
| movdqa xmm4, xmmword ptr kARGBToY |
| movdqa xmm5, xmmword ptr kAddY16 |
| |
| convertloop: |
| movdqu xmm0, [eax] |
| movdqu xmm1, [eax + 16] |
| movdqu xmm2, [eax + 32] |
| movdqu xmm3, [eax + 48] |
| pmaddubsw xmm0, xmm4 |
| pmaddubsw xmm1, xmm4 |
| pmaddubsw xmm2, xmm4 |
| pmaddubsw xmm3, xmm4 |
| lea eax, [eax + 64] |
| phaddw xmm0, xmm1 |
| phaddw xmm2, xmm3 |
| psrlw xmm0, 7 |
| psrlw xmm2, 7 |
| packuswb xmm0, xmm2 |
| paddb xmm0, xmm5 |
| movdqu [edx], xmm0 |
| lea edx, [edx + 16] |
| sub ecx, 16 |
| jg convertloop |
| ret |
| } |
| } |
| |
| // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. |
| // Same as ARGBToYRow but with different coefficients, no +16 bias, and rounding before the shift. |
| __declspec(naked) void ARGBToYJRow_SSSE3(const uint8_t* src_argb, |
| uint8_t* dst_y, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] /* src_argb */ |
| mov edx, [esp + 8] /* dst_y */ |
| mov ecx, [esp + 12] /* width */ |
| movdqa xmm4, xmmword ptr kARGBToYJ |
| movdqa xmm5, xmmword ptr kAddYJ64 |
| |
| convertloop: |
| movdqu xmm0, [eax] |
| movdqu xmm1, [eax + 16] |
| movdqu xmm2, [eax + 32] |
| movdqu xmm3, [eax + 48] |
| pmaddubsw xmm0, xmm4 |
| pmaddubsw xmm1, xmm4 |
| pmaddubsw xmm2, xmm4 |
| pmaddubsw xmm3, xmm4 |
| lea eax, [eax + 64] |
| phaddw xmm0, xmm1 |
| phaddw xmm2, xmm3 |
| paddw xmm0, xmm5 // Add .5 for rounding. |
| paddw xmm2, xmm5 |
| psrlw xmm0, 7 |
| psrlw xmm2, 7 |
| packuswb xmm0, xmm2 |
| movdqu [edx], xmm0 |
| lea edx, [edx + 16] |
| sub ecx, 16 |
| jg convertloop |
| ret |
| } |
| } |
| |
| #ifdef HAS_ARGBTOYROW_AVX2 |
| // vpermd permutation to undo the lane interleaving of vphaddw + vpackuswb. |
| static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; |
| |
| // Convert 32 ARGB pixels (128 bytes) to 32 Y values. |
| __declspec(naked) void ARGBToYRow_AVX2(const uint8_t* src_argb, |
| uint8_t* dst_y, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] /* src_argb */ |
| mov edx, [esp + 8] /* dst_y */ |
| mov ecx, [esp + 12] /* width */ |
| vbroadcastf128 ymm4, xmmword ptr kARGBToY |
| vbroadcastf128 ymm5, xmmword ptr kAddY16 |
| vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX |
| |
| convertloop: |
| vmovdqu ymm0, [eax] |
| vmovdqu ymm1, [eax + 32] |
| vmovdqu ymm2, [eax + 64] |
| vmovdqu ymm3, [eax + 96] |
| vpmaddubsw ymm0, ymm0, ymm4 |
| vpmaddubsw ymm1, ymm1, ymm4 |
| vpmaddubsw ymm2, ymm2, ymm4 |
| vpmaddubsw ymm3, ymm3, ymm4 |
| lea eax, [eax + 128] |
| vphaddw ymm0, ymm0, ymm1 // mutates. |
| vphaddw ymm2, ymm2, ymm3 |
| vpsrlw ymm0, ymm0, 7 |
| vpsrlw ymm2, ymm2, 7 |
| vpackuswb ymm0, ymm0, ymm2 // mutates. |
| vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation. |
| vpaddb ymm0, ymm0, ymm5 // add 16 for Y |
| vmovdqu [edx], ymm0 |
| lea edx, [edx + 32] |
| sub ecx, 32 |
| jg convertloop |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_ARGBTOYROW_AVX2 |
| |
| #ifdef HAS_ARGBTOYJROW_AVX2 |
| // Convert 32 ARGB pixels (128 bytes) to 32 YJ values. |
| __declspec(naked) void ARGBToYJRow_AVX2(const uint8_t* src_argb, |
| uint8_t* dst_y, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] /* src_argb */ |
| mov edx, [esp + 8] /* dst_y */ |
| mov ecx, [esp + 12] /* width */ |
| vbroadcastf128 ymm4, xmmword ptr kARGBToYJ |
| vbroadcastf128 ymm5, xmmword ptr kAddYJ64 |
| vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX |
| |
| convertloop: |
| vmovdqu ymm0, [eax] |
| vmovdqu ymm1, [eax + 32] |
| vmovdqu ymm2, [eax + 64] |
| vmovdqu ymm3, [eax + 96] |
| vpmaddubsw ymm0, ymm0, ymm4 |
| vpmaddubsw ymm1, ymm1, ymm4 |
| vpmaddubsw ymm2, ymm2, ymm4 |
| vpmaddubsw ymm3, ymm3, ymm4 |
| lea eax, [eax + 128] |
| vphaddw ymm0, ymm0, ymm1 // mutates. |
| vphaddw ymm2, ymm2, ymm3 |
| vpaddw ymm0, ymm0, ymm5 // Add .5 for rounding. |
| vpaddw ymm2, ymm2, ymm5 |
| vpsrlw ymm0, ymm0, 7 |
| vpsrlw ymm2, ymm2, 7 |
| vpackuswb ymm0, ymm0, ymm2 // mutates. |
| vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation. |
| vmovdqu [edx], ymm0 |
| lea edx, [edx + 32] |
| sub ecx, 32 |
| jg convertloop |
| |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_ARGBTOYJROW_AVX2 |
| |
| __declspec(naked) void BGRAToYRow_SSSE3(const uint8_t* src_argb, |
| uint8_t* dst_y, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] /* src_argb */ |
| mov edx, [esp + 8] /* dst_y */ |
| mov ecx, [esp + 12] /* width */ |
| movdqa xmm4, xmmword ptr kBGRAToY |
| movdqa xmm5, xmmword ptr kAddY16 |
| |
| convertloop: |
| movdqu xmm0, [eax] |
| movdqu xmm1, [eax + 16] |
| movdqu xmm2, [eax + 32] |
| movdqu xmm3, [eax + 48] |
| pmaddubsw xmm0, xmm4 |
| pmaddubsw xmm1, xmm4 |
| pmaddubsw xmm2, xmm4 |
| pmaddubsw xmm3, xmm4 |
| lea eax, [eax + 64] |
| phaddw xmm0, xmm1 |
| phaddw xmm2, xmm3 |
| psrlw xmm0, 7 |
| psrlw xmm2, 7 |
| packuswb xmm0, xmm2 |
| paddb xmm0, xmm5 |
| movdqu [edx], xmm0 |
| lea edx, [edx + 16] |
| sub ecx, 16 |
| jg convertloop |
| ret |
| } |
| } |
| |
| __declspec(naked) void ABGRToYRow_SSSE3(const uint8_t* src_argb, |
| uint8_t* dst_y, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] /* src_argb */ |
| mov edx, [esp + 8] /* dst_y */ |
| mov ecx, [esp + 12] /* width */ |
| movdqa xmm4, xmmword ptr kABGRToY |
| movdqa xmm5, xmmword ptr kAddY16 |
| |
| convertloop: |
| movdqu xmm0, [eax] |
| movdqu xmm1, [eax + 16] |
| movdqu xmm2, [eax + 32] |
| movdqu xmm3, [eax + 48] |
| pmaddubsw xmm0, xmm4 |
| pmaddubsw xmm1, xmm4 |
| pmaddubsw xmm2, xmm4 |
| pmaddubsw xmm3, xmm4 |
| lea eax, [eax + 64] |
| phaddw xmm0, xmm1 |
| phaddw xmm2, xmm3 |
| psrlw xmm0, 7 |
| psrlw xmm2, 7 |
| packuswb xmm0, xmm2 |
| paddb xmm0, xmm5 |
| movdqu [edx], xmm0 |
| lea edx, [edx + 16] |
| sub ecx, 16 |
| jg convertloop |
| ret |
| } |
| } |
| |
| __declspec(naked) void RGBAToYRow_SSSE3(const uint8_t* src_argb, |
| uint8_t* dst_y, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] /* src_argb */ |
| mov edx, [esp + 8] /* dst_y */ |
| mov ecx, [esp + 12] /* width */ |
| movdqa xmm4, xmmword ptr kRGBAToY |
| movdqa xmm5, xmmword ptr kAddY16 |
| |
| convertloop: |
| movdqu xmm0, [eax] |
| movdqu xmm1, [eax + 16] |
| movdqu xmm2, [eax + 32] |
| movdqu xmm3, [eax + 48] |
| pmaddubsw xmm0, xmm4 |
| pmaddubsw xmm1, xmm4 |
| pmaddubsw xmm2, xmm4 |
| pmaddubsw xmm3, xmm4 |
| lea eax, [eax + 64] |
| phaddw xmm0, xmm1 |
| phaddw xmm2, xmm3 |
| psrlw xmm0, 7 |
| psrlw xmm2, 7 |
| packuswb xmm0, xmm2 |
| paddb xmm0, xmm5 |
| movdqu [edx], xmm0 |
| lea edx, [edx + 16] |
| sub ecx, 16 |
| jg convertloop |
| ret |
| } |
| } |
| |
| __declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb0, |
| int src_stride_argb, |
| uint8_t* dst_u, |
| uint8_t* dst_v, |
| int width) { |
| __asm { |
| push esi |
| push edi |
| mov eax, [esp + 8 + 4] // src_argb |
| mov esi, [esp + 8 + 8] // src_stride_argb |
| mov edx, [esp + 8 + 12] // dst_u |
| mov edi, [esp + 8 + 16] // dst_v |
| mov ecx, [esp + 8 + 20] // width |
| movdqa xmm5, xmmword ptr kAddUV128 |
| movdqa xmm6, xmmword ptr kARGBToV |
| movdqa xmm7, xmmword ptr kARGBToU |
| sub edi, edx // stride from u to v |
| |
| convertloop: |
| /* step 1 - subsample 16x2 argb pixels to 8x1 */ |
| movdqu xmm0, [eax] |
| movdqu xmm4, [eax + esi] |
| pavgb xmm0, xmm4 |
| movdqu xmm1, [eax + 16] |
| movdqu xmm4, [eax + esi + 16] |
| pavgb xmm1, xmm4 |
| movdqu xmm2, [eax + 32] |
| movdqu xmm4, [eax + esi + 32] |
| pavgb xmm2, xmm4 |
| movdqu xmm3, [eax + 48] |
| movdqu xmm4, [eax + esi + 48] |
| pavgb xmm3, xmm4 |
| |
| lea eax, [eax + 64] |
| movdqa xmm4, xmm0 |
| shufps xmm0, xmm1, 0x88 |
| shufps xmm4, xmm1, 0xdd |
| pavgb xmm0, xmm4 |
| movdqa xmm4, xmm2 |
| shufps xmm2, xmm3, 0x88 |
| shufps xmm4, xmm3, 0xdd |
| pavgb xmm2, xmm4 |
| |
| // step 2 - convert to U and V |
| // from here down is very similar to Y code except |
| // instead of 16 different pixels, it's 8 pixels of U and 8 of V |
| movdqa xmm1, xmm0 |
| movdqa xmm3, xmm2 |
| pmaddubsw xmm0, xmm7 // U |
| pmaddubsw xmm2, xmm7 |
| pmaddubsw xmm1, xmm6 // V |
| pmaddubsw xmm3, xmm6 |
| phaddw xmm0, xmm2 |
| phaddw xmm1, xmm3 |
| psraw xmm0, 8 |
| psraw xmm1, 8 |
| packsswb xmm0, xmm1 |
| paddb xmm0, xmm5 // -> unsigned |
| |
| // step 3 - store 8 U and 8 V values |
| movlps qword ptr [edx], xmm0 // U |
| movhps qword ptr [edx + edi], xmm0 // V |
| lea edx, [edx + 8] |
| sub ecx, 16 |
| jg convertloop |
| |
| pop edi |
| pop esi |
| ret |
| } |
| } |
| |
| __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0, |
| int src_stride_argb, |
| uint8_t* dst_u, |
| uint8_t* dst_v, |
| int width) { |
| __asm { |
| push esi |
| push edi |
| mov eax, [esp + 8 + 4] // src_argb |
| mov esi, [esp + 8 + 8] // src_stride_argb |
| mov edx, [esp + 8 + 12] // dst_u |
| mov edi, [esp + 8 + 16] // dst_v |
| mov ecx, [esp + 8 + 20] // width |
| movdqa xmm5, xmmword ptr kAddUVJ128 |
| movdqa xmm6, xmmword ptr kARGBToVJ |
| movdqa xmm7, xmmword ptr kARGBToUJ |
| sub edi, edx // stride from u to v |
| |
| convertloop: |
| /* step 1 - subsample 16x2 argb pixels to 8x1 */ |
| movdqu xmm0, [eax] |
| movdqu xmm4, [eax + esi] |
| pavgb xmm0, xmm4 |
| movdqu xmm1, [eax + 16] |
| movdqu xmm4, [eax + esi + 16] |
| pavgb xmm1, xmm4 |
| movdqu xmm2, [eax + 32] |
| movdqu xmm4, [eax + esi + 32] |
| pavgb xmm2, xmm4 |
| movdqu xmm3, [eax + 48] |
| movdqu xmm4, [eax + esi + 48] |
| pavgb xmm3, xmm4 |
| |
| lea eax, [eax + 64] |
| movdqa xmm4, xmm0 |
| shufps xmm0, xmm1, 0x88 |
| shufps xmm4, xmm1, 0xdd |
| pavgb xmm0, xmm4 |
| movdqa xmm4, xmm2 |
| shufps xmm2, xmm3, 0x88 |
| shufps xmm4, xmm3, 0xdd |
| pavgb xmm2, xmm4 |
| |
| // step 2 - convert to U and V |
| // from here down is very similar to Y code except |
| // instead of 16 different pixels, it's 8 pixels of U and 8 of V |
| movdqa xmm1, xmm0 |
| movdqa xmm3, xmm2 |
| pmaddubsw xmm0, xmm7 // U |
| pmaddubsw xmm2, xmm7 |
| pmaddubsw xmm1, xmm6 // V |
| pmaddubsw xmm3, xmm6 |
| phaddw xmm0, xmm2 |
| phaddw xmm1, xmm3 |
| paddw xmm0, xmm5 // +.5 rounding -> unsigned |
| paddw xmm1, xmm5 |
| psraw xmm0, 8 |
| psraw xmm1, 8 |
| packsswb xmm0, xmm1 |
| |
| // step 3 - store 8 U and 8 V values |
| movlps qword ptr [edx], xmm0 // U |
| movhps qword ptr [edx + edi], xmm0 // V |
| lea edx, [edx + 8] |
| sub ecx, 16 |
| jg convertloop |
| |
| pop edi |
| pop esi |
| ret |
| } |
| } |
| |
| #ifdef HAS_ARGBTOUVROW_AVX2 |
| __declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb0, |
| int src_stride_argb, |
| uint8_t* dst_u, |
| uint8_t* dst_v, |
| int width) { |
| __asm { |
| push esi |
| push edi |
| mov eax, [esp + 8 + 4] // src_argb |
| mov esi, [esp + 8 + 8] // src_stride_argb |
| mov edx, [esp + 8 + 12] // dst_u |
| mov edi, [esp + 8 + 16] // dst_v |
| mov ecx, [esp + 8 + 20] // width |
| vbroadcastf128 ymm5, xmmword ptr kAddUV128 |
| vbroadcastf128 ymm6, xmmword ptr kARGBToV |
| vbroadcastf128 ymm7, xmmword ptr kARGBToU |
| sub edi, edx // stride from u to v |
| |
| convertloop: |
| /* step 1 - subsample 32x2 argb pixels to 16x1 */ |
| vmovdqu ymm0, [eax] |
| vmovdqu ymm1, [eax + 32] |
| vmovdqu ymm2, [eax + 64] |
| vmovdqu ymm3, [eax + 96] |
| vpavgb ymm0, ymm0, [eax + esi] |
| vpavgb ymm1, ymm1, [eax + esi + 32] |
| vpavgb ymm2, ymm2, [eax + esi + 64] |
| vpavgb ymm3, ymm3, [eax + esi + 96] |
| lea eax, [eax + 128] |
| vshufps ymm4, ymm0, ymm1, 0x88 |
| vshufps ymm0, ymm0, ymm1, 0xdd |
| vpavgb ymm0, ymm0, ymm4 // mutated by vshufps |
| vshufps ymm4, ymm2, ymm3, 0x88 |
| vshufps ymm2, ymm2, ymm3, 0xdd |
| vpavgb ymm2, ymm2, ymm4 // mutated by vshufps |
| |
| // step 2 - convert to U and V |
| // from here down is very similar to Y code except |
| // instead of 32 different pixels, it's 16 pixels of U and 16 of V |
| vpmaddubsw ymm1, ymm0, ymm7 // U |
| vpmaddubsw ymm3, ymm2, ymm7 |
| vpmaddubsw ymm0, ymm0, ymm6 // V |
| vpmaddubsw ymm2, ymm2, ymm6 |
| vphaddw ymm1, ymm1, ymm3 // mutates |
| vphaddw ymm0, ymm0, ymm2 |
| vpsraw ymm1, ymm1, 8 |
| vpsraw ymm0, ymm0, 8 |
| vpacksswb ymm0, ymm1, ymm0 // mutates |
| vpermq ymm0, ymm0, 0xd8 // For vpacksswb |
| vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw |
| vpaddb ymm0, ymm0, ymm5 // -> unsigned |
| |
| // step 3 - store 16 U and 16 V values |
| vextractf128 [edx], ymm0, 0 // U |
| vextractf128 [edx + edi], ymm0, 1 // V |
| lea edx, [edx + 16] |
| sub ecx, 32 |
| jg convertloop |
| |
| pop edi |
| pop esi |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_ARGBTOUVROW_AVX2 |
| |
| #ifdef HAS_ARGBTOUVJROW_AVX2 |
| __declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, |
| int src_stride_argb, |
| uint8_t* dst_u, |
| uint8_t* dst_v, |
| int width) { |
| __asm { |
| push esi |
| push edi |
| mov eax, [esp + 8 + 4] // src_argb |
| mov esi, [esp + 8 + 8] // src_stride_argb |
| mov edx, [esp + 8 + 12] // dst_u |
| mov edi, [esp + 8 + 16] // dst_v |
| mov ecx, [esp + 8 + 20] // width |
| vbroadcastf128 ymm5, xmmword ptr kAddUVJ128 |
| vbroadcastf128 ymm6, xmmword ptr kARGBToVJ |
| vbroadcastf128 ymm7, xmmword ptr kARGBToUJ |
| sub edi, edx // stride from u to v |
| |
| convertloop: |
| /* step 1 - subsample 32x2 argb pixels to 16x1 */ |
| vmovdqu ymm0, [eax] |
| vmovdqu ymm1, [eax + 32] |
| vmovdqu ymm2, [eax + 64] |
| vmovdqu ymm3, [eax + 96] |
| vpavgb ymm0, ymm0, [eax + esi] |
| vpavgb ymm1, ymm1, [eax + esi + 32] |
| vpavgb ymm2, ymm2, [eax + esi + 64] |
| vpavgb ymm3, ymm3, [eax + esi + 96] |
| lea eax, [eax + 128] |
| vshufps ymm4, ymm0, ymm1, 0x88 |
| vshufps ymm0, ymm0, ymm1, 0xdd |
| vpavgb ymm0, ymm0, ymm4 // mutated by vshufps |
| vshufps ymm4, ymm2, ymm3, 0x88 |
| vshufps ymm2, ymm2, ymm3, 0xdd |
| vpavgb ymm2, ymm2, ymm4 // mutated by vshufps |
| |
| // step 2 - convert to U and V |
| // from here down is very similar to Y code except |
| // instead of 32 different pixels, it's 16 pixels of U and 16 of V |
| vpmaddubsw ymm1, ymm0, ymm7 // U |
| vpmaddubsw ymm3, ymm2, ymm7 |
| vpmaddubsw ymm0, ymm0, ymm6 // V |
| vpmaddubsw ymm2, ymm2, ymm6 |
| vphaddw ymm1, ymm1, ymm3 // mutates |
| vphaddw ymm0, ymm0, ymm2 |
| vpaddw ymm1, ymm1, ymm5 // +.5 rounding -> unsigned |
| vpaddw ymm0, ymm0, ymm5 |
| vpsraw ymm1, ymm1, 8 |
| vpsraw ymm0, ymm0, 8 |
| vpacksswb ymm0, ymm1, ymm0 // mutates |
| vpermq ymm0, ymm0, 0xd8 // For vpacksswb |
| vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw |
| |
| // step 3 - store 16 U and 16 V values |
| vextractf128 [edx], ymm0, 0 // U |
| vextractf128 [edx + edi], ymm0, 1 // V |
| lea edx, [edx + 16] |
| sub ecx, 32 |
| jg convertloop |
| |
| pop edi |
| pop esi |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_ARGBTOUVJROW_AVX2 |
| |
| __declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb0, |
| uint8_t* dst_u, |
| uint8_t* dst_v, |
| int width) { |
| __asm { |
| push edi |
| mov eax, [esp + 4 + 4] // src_argb |
| mov edx, [esp + 4 + 8] // dst_u |
| mov edi, [esp + 4 + 12] // dst_v |
| mov ecx, [esp + 4 + 16] // width |
| movdqa xmm5, xmmword ptr kAddUV128 |
| movdqa xmm6, xmmword ptr kARGBToV |
| movdqa xmm7, xmmword ptr kARGBToU |
| sub edi, edx // stride from u to v |
| |
| convertloop: |
| /* convert to U and V */ |
| movdqu xmm0, [eax] // U |
| movdqu xmm1, [eax + 16] |
| movdqu xmm2, [eax + 32] |
| movdqu xmm3, [eax + 48] |
| pmaddubsw xmm0, xmm7 |
| pmaddubsw xmm1, xmm7 |
| pmaddubsw xmm2, xmm7 |
| pmaddubsw xmm3, xmm7 |
| phaddw xmm0, xmm1 |
| phaddw xmm2, xmm3 |
| psraw xmm0, 8 |
| psraw xmm2, 8 |
| packsswb xmm0, xmm2 |
| paddb xmm0, xmm5 |
| movdqu [edx], xmm0 |
| |
| movdqu xmm0, [eax] // V |
| movdqu xmm1, [eax + 16] |
| movdqu xmm2, [eax + 32] |
| movdqu xmm3, [eax + 48] |
| pmaddubsw xmm0, xmm6 |
| pmaddubsw xmm1, xmm6 |
| pmaddubsw xmm2, xmm6 |
| pmaddubsw xmm3, xmm6 |
| phaddw xmm0, xmm1 |
| phaddw xmm2, xmm3 |
| psraw xmm0, 8 |
| psraw xmm2, 8 |
| packsswb xmm0, xmm2 |
| paddb xmm0, xmm5 |
| lea eax, [eax + 64] |
| movdqu [edx + edi], xmm0 |
| lea edx, [edx + 16] |
| sub ecx, 16 |
| jg convertloop |
| |
| pop edi |
| ret |
| } |
| } |
| |
| __declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb0, |
| int src_stride_argb, |
| uint8_t* dst_u, |
| uint8_t* dst_v, |
| int width) { |
| __asm { |
| push esi |
| push edi |
| mov eax, [esp + 8 + 4] // src_argb |
| mov esi, [esp + 8 + 8] // src_stride_argb |
| mov edx, [esp + 8 + 12] // dst_u |
| mov edi, [esp + 8 + 16] // dst_v |
| mov ecx, [esp + 8 + 20] // width |
| movdqa xmm5, xmmword ptr kAddUV128 |
| movdqa xmm6, xmmword ptr kBGRAToV |
| movdqa xmm7, xmmword ptr kBGRAToU |
| sub edi, edx // stride from u to v |
| |
| convertloop: |
| /* step 1 - subsample 16x2 argb pixels to 8x1 */ |
| movdqu xmm0, [eax] |
| movdqu xmm4, [eax + esi] |
| pavgb xmm0, xmm4 |
| movdqu xmm1, [eax + 16] |
| movdqu xmm4, [eax + esi + 16] |
| pavgb xmm1, xmm4 |
| movdqu xmm2, [eax + 32] |
| movdqu xmm4, [eax + esi + 32] |
| pavgb xmm2, xmm4 |
| movdqu xmm3, [eax + 48] |
| movdqu xmm4, [eax + esi + 48] |
| pavgb xmm3, xmm4 |
| |
| lea eax, [eax + 64] |
| movdqa xmm4, xmm0 |
| shufps xmm0, xmm1, 0x88 |
| shufps xmm4, xmm1, 0xdd |
| pavgb xmm0, xmm4 |
| movdqa xmm4, xmm2 |
| shufps xmm2, xmm3, 0x88 |
| shufps xmm4, xmm3, 0xdd |
| pavgb xmm2, xmm4 |
| |
| // step 2 - convert to U and V |
| // from here down is very similar to Y code except |
| // instead of 16 different pixels, it's 8 pixels of U and 8 of V |
| movdqa xmm1, xmm0 |
| movdqa xmm3, xmm2 |
| pmaddubsw xmm0, xmm7 // U |
| pmaddubsw xmm2, xmm7 |
| pmaddubsw xmm1, xmm6 // V |
| pmaddubsw xmm3, xmm6 |
| phaddw xmm0, xmm2 |
| phaddw xmm1, xmm3 |
| psraw xmm0, 8 |
| psraw xmm1, 8 |
| packsswb xmm0, xmm1 |
| paddb xmm0, xmm5 // -> unsigned |
| |
| // step 3 - store 8 U and 8 V values |
| movlps qword ptr [edx], xmm0 // U |
| movhps qword ptr [edx + edi], xmm0 // V |
| lea edx, [edx + 8] |
| sub ecx, 16 |
| jg convertloop |
| |
| pop edi |
| pop esi |
| ret |
| } |
| } |
| |
| __declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb0, |
| int src_stride_argb, |
| uint8_t* dst_u, |
| uint8_t* dst_v, |
| int width) { |
| __asm { |
| push esi |
| push edi |
| mov eax, [esp + 8 + 4] // src_argb |
| mov esi, [esp + 8 + 8] // src_stride_argb |
| mov edx, [esp + 8 + 12] // dst_u |
| mov edi, [esp + 8 + 16] // dst_v |
| mov ecx, [esp + 8 + 20] // width |
| movdqa xmm5, xmmword ptr kAddUV128 |
| movdqa xmm6, xmmword ptr kABGRToV |
| movdqa xmm7, xmmword ptr kABGRToU |
| sub edi, edx // stride from u to v |
| |
| convertloop: |
| /* step 1 - subsample 16x2 argb pixels to 8x1 */ |
| movdqu xmm0, [eax] |
| movdqu xmm4, [eax + esi] |
| pavgb xmm0, xmm4 |
| movdqu xmm1, [eax + 16] |
| movdqu xmm4, [eax + esi + 16] |
| pavgb xmm1, xmm4 |
| movdqu xmm2, [eax + 32] |
| movdqu xmm4, [eax + esi + 32] |
| pavgb xmm2, xmm4 |
| movdqu xmm3, [eax + 48] |
| movdqu xmm4, [eax + esi + 48] |
| pavgb xmm3, xmm4 |
| |
| lea eax, [eax + 64] |
| movdqa xmm4, xmm0 |
| shufps xmm0, xmm1, 0x88 |
| shufps xmm4, xmm1, 0xdd |
| pavgb xmm0, xmm4 |
| movdqa xmm4, xmm2 |
| shufps xmm2, xmm3, 0x88 |
| shufps xmm4, xmm3, 0xdd |
| pavgb xmm2, xmm4 |
| |
| // step 2 - convert to U and V |
| // from here down is very similar to Y code except |
| // instead of 16 different pixels, it's 8 pixels of U and 8 of V |
| movdqa xmm1, xmm0 |
| movdqa xmm3, xmm2 |
| pmaddubsw xmm0, xmm7 // U |
| pmaddubsw xmm2, xmm7 |
| pmaddubsw xmm1, xmm6 // V |
| pmaddubsw xmm3, xmm6 |
| phaddw xmm0, xmm2 |
| phaddw xmm1, xmm3 |
| psraw xmm0, 8 |
| psraw xmm1, 8 |
| packsswb xmm0, xmm1 |
| paddb xmm0, xmm5 // -> unsigned |
| |
| // step 3 - store 8 U and 8 V values |
| movlps qword ptr [edx], xmm0 // U |
| movhps qword ptr [edx + edi], xmm0 // V |
| lea edx, [edx + 8] |
| sub ecx, 16 |
| jg convertloop |
| |
| pop edi |
| pop esi |
| ret |
| } |
| } |
| |
| __declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0, |
| int src_stride_argb, |
| uint8_t* dst_u, |
| uint8_t* dst_v, |
| int width) { |
| __asm { |
| push esi |
| push edi |
| mov eax, [esp + 8 + 4] // src_argb |
| mov esi, [esp + 8 + 8] // src_stride_argb |
| mov edx, [esp + 8 + 12] // dst_u |
| mov edi, [esp + 8 + 16] // dst_v |
| mov ecx, [esp + 8 + 20] // width |
| movdqa xmm5, xmmword ptr kAddUV128 |
| movdqa xmm6, xmmword ptr kRGBAToV |
| movdqa xmm7, xmmword ptr kRGBAToU |
| sub edi, edx // stride from u to v |
| |
| convertloop: |
| /* step 1 - subsample 16x2 argb pixels to 8x1 */ |
| movdqu xmm0, [eax] |
| movdqu xmm4, [eax + esi] |
| pavgb xmm0, xmm4 |
| movdqu xmm1, [eax + 16] |
| movdqu xmm4, [eax + esi + 16] |
| pavgb xmm1, xmm4 |
| movdqu xmm2, [eax + 32] |
| movdqu xmm4, [eax + esi + 32] |
| pavgb xmm2, xmm4 |
| movdqu xmm3, [eax + 48] |
| movdqu xmm4, [eax + esi + 48] |
| pavgb xmm3, xmm4 |
| |
| lea eax, [eax + 64] |
| movdqa xmm4, xmm0 |
| shufps xmm0, xmm1, 0x88 |
| shufps xmm4, xmm1, 0xdd |
| pavgb xmm0, xmm4 |
| movdqa xmm4, xmm2 |
| shufps xmm2, xmm3, 0x88 |
| shufps xmm4, xmm3, 0xdd |
| pavgb xmm2, xmm4 |
| |
| // step 2 - convert to U and V |
| // from here down is very similar to Y code except |
| // instead of 16 different pixels, it's 8 pixels of U and 8 of V |
| movdqa xmm1, xmm0 |
| movdqa xmm3, xmm2 |
| pmaddubsw xmm0, xmm7 // U |
| pmaddubsw xmm2, xmm7 |
| pmaddubsw xmm1, xmm6 // V |
| pmaddubsw xmm3, xmm6 |
| phaddw xmm0, xmm2 |
| phaddw xmm1, xmm3 |
| psraw xmm0, 8 |
| psraw xmm1, 8 |
| packsswb xmm0, xmm1 |
| paddb xmm0, xmm5 // -> unsigned |
| |
| // step 3 - store 8 U and 8 V values |
| movlps qword ptr [edx], xmm0 // U |
| movhps qword ptr [edx + edi], xmm0 // V |
| lea edx, [edx + 8] |
| sub ecx, 16 |
| jg convertloop |
| |
| pop edi |
| pop esi |
| ret |
| } |
| } |
| #endif // HAS_ARGBTOYROW_SSSE3 |
| |
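| // The AVX2 helper macros below are written as __asm blocks of __asm |
| // statements so they can be pasted together inside the __declspec(naked) |
| // row functions that use them. |
| |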
| // Read 16 UV from 444 |
| #define READYUV444_AVX2 \ |
| __asm { \ |
| __asm vmovdqu xmm0, [esi] /* U */ \ |
| __asm vmovdqu xmm1, [esi + edi] /* V */ \ |
| __asm lea esi, [esi + 16] \ |
| __asm vpermq ymm0, ymm0, 0xd8 \ |
| __asm vpermq ymm1, ymm1, 0xd8 \ |
| __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ |
| __asm vmovdqu xmm4, [eax] /* Y */ \ |
| __asm vpermq ymm4, ymm4, 0xd8 \ |
| __asm vpunpcklbw ymm4, ymm4, ymm4 \ |
| __asm lea eax, [eax + 16]} |
| |
| // Read 8 UV from 422, upsample to 16 UV. |
| #define READYUV422_AVX2 \ |
| __asm { \ |
| __asm vmovq xmm0, qword ptr [esi] /* U */ \ |
| __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ |
| __asm lea esi, [esi + 8] \ |
| __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ |
| __asm vpermq ymm0, ymm0, 0xd8 \ |
| __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ |
| __asm vmovdqu xmm4, [eax] /* Y */ \ |
| __asm vpermq ymm4, ymm4, 0xd8 \ |
| __asm vpunpcklbw ymm4, ymm4, ymm4 \ |
| __asm lea eax, [eax + 16]} |
| |
| // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha. |
| #define READYUVA422_AVX2 \ |
| __asm { \ |
| __asm vmovq xmm0, qword ptr [esi] /* U */ \ |
| __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ |
| __asm lea esi, [esi + 8] \ |
| __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ |
| __asm vpermq ymm0, ymm0, 0xd8 \ |
| __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ |
| __asm vmovdqu xmm4, [eax] /* Y */ \ |
| __asm vpermq ymm4, ymm4, 0xd8 \ |
| __asm vpunpcklbw ymm4, ymm4, ymm4 \ |
| __asm lea eax, [eax + 16] \ |
| __asm vmovdqu xmm5, [ebp] /* A */ \ |
| __asm vpermq ymm5, ymm5, 0xd8 \ |
| __asm lea ebp, [ebp + 16]} |
| |
| // Read 8 UV from NV12, upsample to 16 UV. |
| #define READNV12_AVX2 \ |
| __asm { \ |
| __asm vmovdqu xmm0, [esi] /* UV */ \ |
| __asm lea esi, [esi + 16] \ |
| __asm vpermq ymm0, ymm0, 0xd8 \ |
| __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ |
| __asm vmovdqu xmm4, [eax] /* Y */ \ |
| __asm vpermq ymm4, ymm4, 0xd8 \ |
| __asm vpunpcklbw ymm4, ymm4, ymm4 \ |
| __asm lea eax, [eax + 16]} |
| |
| // Read 8 UV from NV21, upsample to 16 UV. |
| #define READNV21_AVX2 \ |
| __asm { \ |
| __asm vmovdqu xmm0, [esi] /* UV */ \ |
| __asm lea esi, [esi + 16] \ |
| __asm vpermq ymm0, ymm0, 0xd8 \ |
| __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleNV21 \ |
| __asm vmovdqu xmm4, [eax] /* Y */ \ |
| __asm vpermq ymm4, ymm4, 0xd8 \ |
| __asm vpunpcklbw ymm4, ymm4, ymm4 \ |
| __asm lea eax, [eax + 16]} |
| |
| // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV. |
| #define READYUY2_AVX2 \ |
| __asm { \ |
| __asm vmovdqu ymm4, [eax] /* YUY2 */ \ |
| __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleYUY2Y \ |
| __asm vmovdqu ymm0, [eax] /* UV */ \ |
| __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleYUY2UV \ |
| __asm lea eax, [eax + 32]} |
| |
| // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV. |
| #define READUYVY_AVX2 \ |
| __asm { \ |
| __asm vmovdqu ymm4, [eax] /* UYVY */ \ |
| __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleUYVYY \ |
| __asm vmovdqu ymm0, [eax] /* UV */ \ |
| __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleUYVYUV \ |
| __asm lea eax, [eax + 32]} |
| |
| // Convert 16 pixels: 16 UV and 16 Y. |
| #define YUVTORGB_AVX2(YuvConstants) \ |
| __asm { \ |
| __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\ |
| __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\ |
| __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\ |
| __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASR] \ |
| __asm vpsubw ymm2, ymm3, ymm2 \ |
| __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASG] \ |
| __asm vpsubw ymm1, ymm3, ymm1 \ |
| __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASB] \ |
| __asm vpsubw ymm0, ymm3, ymm0 /* Step 2: Find Y contribution to 16 R,G,B values */ \ |
| __asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \ |
| __asm vpaddsw ymm0, ymm0, ymm4 /* B += Y */ \ |
| __asm vpaddsw ymm1, ymm1, ymm4 /* G += Y */ \ |
| __asm vpaddsw ymm2, ymm2, ymm4 /* R += Y */ \ |
| __asm vpsraw ymm0, ymm0, 6 \ |
| __asm vpsraw ymm1, ymm1, 6 \ |
| __asm vpsraw ymm2, ymm2, 6 \ |
| __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \ |
| __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \ |
| __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \ |
| } |
| |
| // Store 16 ARGB values. |
| #define STOREARGB_AVX2 \ |
| __asm { \ |
| __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \ |
| __asm vpermq ymm0, ymm0, 0xd8 \ |
| __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \ |
| __asm vpermq ymm2, ymm2, 0xd8 \ |
| __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \ |
| __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \ |
| __asm vmovdqu 0[edx], ymm1 \ |
| __asm vmovdqu 32[edx], ymm0 \ |
| __asm lea edx, [edx + 64]} |
| |
| // Store 16 RGBA values. |
| #define STORERGBA_AVX2 \ |
| __asm { \ |
| __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \ |
| __asm vpermq ymm1, ymm1, 0xd8 \ |
| __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \ |
| __asm vpermq ymm2, ymm2, 0xd8 \ |
| __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \ |
| __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \ |
| __asm vmovdqu [edx], ymm0 \ |
| __asm vmovdqu [edx + 32], ymm1 \ |
| __asm lea edx, [edx + 64]} |
| |
| #ifdef HAS_I422TOARGBROW_AVX2 |
| // 16 pixels |
| // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
| __declspec(naked) void I422ToARGBRow_AVX2( |
| const uint8_t* y_buf, |
| const uint8_t* u_buf, |
| const uint8_t* v_buf, |
| uint8_t* dst_argb, |
| const struct YuvConstants* yuvconstants, |
| int width) { |
| __asm { |
| push esi |
| push edi |
| push ebx |
| mov eax, [esp + 12 + 4] // Y |
| mov esi, [esp + 12 + 8] // U |
| mov edi, [esp + 12 + 12] // V |
| mov edx, [esp + 12 + 16] // argb |
| mov ebx, [esp + 12 + 20] // yuvconstants |
| mov ecx, [esp + 12 + 24] // width |
| sub edi, esi |
| vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
| |
| convertloop: |
| READYUV422_AVX2 |
| YUVTORGB_AVX2(ebx) |
| STOREARGB_AVX2 |
| |
| sub ecx, 16 |
| jg convertloop |
| |
| pop ebx |
| pop edi |
| pop esi |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_I422TOARGBROW_AVX2 |
| |
| #ifdef HAS_I422ALPHATOARGBROW_AVX2 |
| // 16 pixels |
| // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB. |
| __declspec(naked) void I422AlphaToARGBRow_AVX2( |
| const uint8_t* y_buf, |
| const uint8_t* u_buf, |
| const uint8_t* v_buf, |
| const uint8_t* a_buf, |
| uint8_t* dst_argb, |
| const struct YuvConstants* yuvconstants, |
| int width) { |
| __asm { |
| push esi |
| push edi |
| push ebx |
| push ebp |
| mov eax, [esp + 16 + 4] // Y |
| mov esi, [esp + 16 + 8] // U |
| mov edi, [esp + 16 + 12] // V |
| mov ebp, [esp + 16 + 16] // A |
| mov edx, [esp + 16 + 20] // argb |
| mov ebx, [esp + 16 + 24] // yuvconstants |
| mov ecx, [esp + 16 + 28] // width |
| sub edi, esi |
| |
| convertloop: |
| READYUVA422_AVX2 |
| YUVTORGB_AVX2(ebx) |
| STOREARGB_AVX2 |
| |
| sub ecx, 16 |
| jg convertloop |
| |
| pop ebp |
| pop ebx |
| pop edi |
| pop esi |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_I422ALPHATOARGBROW_AVX2 |
| |
| #ifdef HAS_I444TOARGBROW_AVX2 |
| // 16 pixels |
| // 16 UV values with 16 Y producing 16 ARGB (64 bytes). |
| __declspec(naked) void I444ToARGBRow_AVX2( |
| const uint8_t* y_buf, |
| const uint8_t* u_buf, |
| const uint8_t* v_buf, |
| uint8_t* dst_argb, |
| const struct YuvConstants* yuvconstants, |
| int width) { |
| __asm { |
| push esi |
| push edi |
| push ebx |
| mov eax, [esp + 12 + 4] // Y |
| mov esi, [esp + 12 + 8] // U |
| mov edi, [esp + 12 + 12] // V |
| mov edx, [esp + 12 + 16] // argb |
| mov ebx, [esp + 12 + 20] // yuvconstants |
| mov ecx, [esp + 12 + 24] // width |
| sub edi, esi |
| vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
| convertloop: |
| READYUV444_AVX2 |
| YUVTORGB_AVX2(ebx) |
| STOREARGB_AVX2 |
| |
| sub ecx, 16 |
| jg convertloop |
| |
| pop ebx |
| pop edi |
| pop esi |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_I444TOARGBROW_AVX2 |
| |
| #ifdef HAS_NV12TOARGBROW_AVX2 |
| // 16 pixels. |
| // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
| __declspec(naked) void NV12ToARGBRow_AVX2( |
| const uint8_t* y_buf, |
| const uint8_t* uv_buf, |
| uint8_t* dst_argb, |
| const struct YuvConstants* yuvconstants, |
| int width) { |
| __asm { |
| push esi |
| push ebx |
| mov eax, [esp + 8 + 4] // Y |
| mov esi, [esp + 8 + 8] // UV |
| mov edx, [esp + 8 + 12] // argb |
| mov ebx, [esp + 8 + 16] // yuvconstants |
| mov ecx, [esp + 8 + 20] // width |
| vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
| |
| convertloop: |
| READNV12_AVX2 |
| YUVTORGB_AVX2(ebx) |
| STOREARGB_AVX2 |
| |
| sub ecx, 16 |
| jg convertloop |
| |
| pop ebx |
| pop esi |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_NV12TOARGBROW_AVX2 |
| |
| #ifdef HAS_NV21TOARGBROW_AVX2 |
| // 16 pixels. |
| // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
| __declspec(naked) void NV21ToARGBRow_AVX2( |
| const uint8_t* y_buf, |
| const uint8_t* vu_buf, |
| uint8_t* dst_argb, |
| const struct YuvConstants* yuvconstants, |
| int width) { |
| __asm { |
| push esi |
| push ebx |
| mov eax, [esp + 8 + 4] // Y |
| mov esi, [esp + 8 + 8] // VU |
| mov edx, [esp + 8 + 12] // argb |
| mov ebx, [esp + 8 + 16] // yuvconstants |
| mov ecx, [esp + 8 + 20] // width |
| vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
| |
| convertloop: |
| READNV21_AVX2 |
| YUVTORGB_AVX2(ebx) |
| STOREARGB_AVX2 |
| |
| sub ecx, 16 |
| jg convertloop |
| |
| pop ebx |
| pop esi |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_NV21TOARGBROW_AVX2 |
| |
| #ifdef HAS_YUY2TOARGBROW_AVX2 |
| // 16 pixels. |
| // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). |
| __declspec(naked) void YUY2ToARGBRow_AVX2( |
| const uint8_t* src_yuy2, |
| uint8_t* dst_argb, |
| const struct YuvConstants* yuvconstants, |
| int width) { |
| __asm { |
| push ebx |
| mov eax, [esp + 4 + 4] // yuy2 |
| mov edx, [esp + 4 + 8] // argb |
| mov ebx, [esp + 4 + 12] // yuvconstants |
| mov ecx, [esp + 4 + 16] // width |
| vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
| |
| convertloop: |
| READYUY2_AVX2 |
| YUVTORGB_AVX2(ebx) |
| STOREARGB_AVX2 |
| |
| sub ecx, 16 |
| jg convertloop |
| |
| pop ebx |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_YUY2TOARGBROW_AVX2 |
| |
| #ifdef HAS_UYVYTOARGBROW_AVX2 |
| // 16 pixels. |
| // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes). |
| __declspec(naked) void UYVYToARGBRow_AVX2( |
| const uint8_t* src_uyvy, |
| uint8_t* dst_argb, |
| const struct YuvConstants* yuvconstants, |
| int width) { |
| __asm { |
| push ebx |
| mov eax, [esp + 4 + 4] // uyvy |
| mov edx, [esp + 4 + 8] // argb |
| mov ebx, [esp + 4 + 12] // yuvconstants |
| mov ecx, [esp + 4 + 16] // width |
| vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
| |
| convertloop: |
| READUYVY_AVX2 |
| YUVTORGB_AVX2(ebx) |
| STOREARGB_AVX2 |
| |
| sub ecx, 16 |
| jg convertloop |
| |
| pop ebx |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_UYVYTOARGBROW_AVX2 |
| |
| #ifdef HAS_I422TORGBAROW_AVX2 |
| // 16 pixels |
| // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). |
| __declspec(naked) void I422ToRGBARow_AVX2( |
| const uint8_t* y_buf, |
| const uint8_t* u_buf, |
| const uint8_t* v_buf, |
| uint8_t* dst_argb, |
| const struct YuvConstants* yuvconstants, |
| int width) { |
| __asm { |
| push esi |
| push edi |
| push ebx |
| mov eax, [esp + 12 + 4] // Y |
| mov esi, [esp + 12 + 8] // U |
| mov edi, [esp + 12 + 12] // V |
| mov edx, [esp + 12 + 16] // abgr |
| mov ebx, [esp + 12 + 20] // yuvconstants |
| mov ecx, [esp + 12 + 24] // width |
| sub edi, esi |
| vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
| |
| convertloop: |
| READYUV422_AVX2 |
| YUVTORGB_AVX2(ebx) |
| STORERGBA_AVX2 |
| |
| sub ecx, 16 |
| jg convertloop |
| |
| pop ebx |
| pop edi |
| pop esi |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_I422TORGBAROW_AVX2 |
| |
| #if defined(HAS_I422TOARGBROW_SSSE3) |
// TODO(fbarchard): Add a read macro that does half size on Y and treats 420
// as 444, which would allow a conversion with half size scaling.
| |
| // Read 8 UV from 444. |
| #define READYUV444 \ |
| __asm { \ |
| __asm movq xmm0, qword ptr [esi] /* U */ \ |
| __asm movq xmm1, qword ptr [esi + edi] /* V */ \ |
| __asm lea esi, [esi + 8] \ |
| __asm punpcklbw xmm0, xmm1 /* UV */ \ |
| __asm movq xmm4, qword ptr [eax] \ |
| __asm punpcklbw xmm4, xmm4 \ |
| __asm lea eax, [eax + 8]} |
| |
| // Read 4 UV from 422, upsample to 8 UV. |
| #define READYUV422 \ |
| __asm { \ |
| __asm movd xmm0, [esi] /* U */ \ |
| __asm movd xmm1, [esi + edi] /* V */ \ |
| __asm lea esi, [esi + 4] \ |
| __asm punpcklbw xmm0, xmm1 /* UV */ \ |
| __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ |
| __asm movq xmm4, qword ptr [eax] \ |
| __asm punpcklbw xmm4, xmm4 \ |
| __asm lea eax, [eax + 8]} |
| |
| // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. |
| #define READYUVA422 \ |
| __asm { \ |
| __asm movd xmm0, [esi] /* U */ \ |
| __asm movd xmm1, [esi + edi] /* V */ \ |
| __asm lea esi, [esi + 4] \ |
| __asm punpcklbw xmm0, xmm1 /* UV */ \ |
| __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ |
| __asm movq xmm4, qword ptr [eax] /* Y */ \ |
| __asm punpcklbw xmm4, xmm4 \ |
| __asm lea eax, [eax + 8] \ |
| __asm movq xmm5, qword ptr [ebp] /* A */ \ |
| __asm lea ebp, [ebp + 8]} |
| |
| // Read 4 UV from NV12, upsample to 8 UV. |
| #define READNV12 \ |
| __asm { \ |
| __asm movq xmm0, qword ptr [esi] /* UV */ \ |
| __asm lea esi, [esi + 8] \ |
| __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ |
| __asm movq xmm4, qword ptr [eax] \ |
| __asm punpcklbw xmm4, xmm4 \ |
| __asm lea eax, [eax + 8]} |
| |
| // Read 4 VU from NV21, upsample to 8 UV. |
| #define READNV21 \ |
| __asm { \ |
| __asm movq xmm0, qword ptr [esi] /* UV */ \ |
| __asm lea esi, [esi + 8] \ |
| __asm pshufb xmm0, xmmword ptr kShuffleNV21 \ |
| __asm movq xmm4, qword ptr [eax] \ |
| __asm punpcklbw xmm4, xmm4 \ |
| __asm lea eax, [eax + 8]} |
| |
| // Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV. |
| #define READYUY2 \ |
| __asm { \ |
| __asm movdqu xmm4, [eax] /* YUY2 */ \ |
| __asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \ |
| __asm movdqu xmm0, [eax] /* UV */ \ |
| __asm pshufb xmm0, xmmword ptr kShuffleYUY2UV \ |
| __asm lea eax, [eax + 16]} |
| |
| // Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV. |
| #define READUYVY \ |
| __asm { \ |
| __asm movdqu xmm4, [eax] /* UYVY */ \ |
| __asm pshufb xmm4, xmmword ptr kShuffleUYVYY \ |
| __asm movdqu xmm0, [eax] /* UV */ \ |
| __asm pshufb xmm0, xmmword ptr kShuffleUYVYUV \ |
| __asm lea eax, [eax + 16]} |
| |
| // Convert 8 pixels: 8 UV and 8 Y. |
| #define YUVTORGB(YuvConstants) \ |
| __asm { \ |
| __asm movdqa xmm1, xmm0 \ |
| __asm movdqa xmm2, xmm0 \ |
| __asm movdqa xmm3, xmm0 \ |
| __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVBIASB] \ |
| __asm pmaddubsw xmm1, xmmword ptr [YuvConstants + KUVTOB] \ |
| __asm psubw xmm0, xmm1 \ |
| __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVBIASG] \ |
| __asm pmaddubsw xmm2, xmmword ptr [YuvConstants + KUVTOG] \ |
| __asm psubw xmm1, xmm2 \ |
| __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVBIASR] \ |
| __asm pmaddubsw xmm3, xmmword ptr [YuvConstants + KUVTOR] \ |
| __asm psubw xmm2, xmm3 \ |
| __asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \ |
| __asm paddsw xmm0, xmm4 /* B += Y */ \ |
| __asm paddsw xmm1, xmm4 /* G += Y */ \ |
| __asm paddsw xmm2, xmm4 /* R += Y */ \ |
| __asm psraw xmm0, 6 \ |
| __asm psraw xmm1, 6 \ |
| __asm psraw xmm2, 6 \ |
| __asm packuswb xmm0, xmm0 /* B */ \ |
| __asm packuswb xmm1, xmm1 /* G */ \ |
| __asm packuswb xmm2, xmm2 /* R */ \ |
| } |
| |
| // Store 8 ARGB values. |
| #define STOREARGB \ |
| __asm { \ |
| __asm punpcklbw xmm0, xmm1 /* BG */ \ |
| __asm punpcklbw xmm2, xmm5 /* RA */ \ |
| __asm movdqa xmm1, xmm0 \ |
| __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \ |
| __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \ |
| __asm movdqu 0[edx], xmm0 \ |
| __asm movdqu 16[edx], xmm1 \ |
| __asm lea edx, [edx + 32]} |
| |
| // Store 8 BGRA values. |
| #define STOREBGRA \ |
| __asm { \ |
| __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ |
| __asm punpcklbw xmm1, xmm0 /* GB */ \ |
| __asm punpcklbw xmm5, xmm2 /* AR */ \ |
| __asm movdqa xmm0, xmm5 \ |
| __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \ |
| __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \ |
| __asm movdqu 0[edx], xmm5 \ |
| __asm movdqu 16[edx], xmm0 \ |
| __asm lea edx, [edx + 32]} |
| |
| // Store 8 RGBA values. |
| #define STORERGBA \ |
| __asm { \ |
| __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ |
| __asm punpcklbw xmm1, xmm2 /* GR */ \ |
| __asm punpcklbw xmm5, xmm0 /* AB */ \ |
| __asm movdqa xmm0, xmm5 \ |
| __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \ |
| __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \ |
| __asm movdqu 0[edx], xmm5 \ |
| __asm movdqu 16[edx], xmm0 \ |
| __asm lea edx, [edx + 32]} |
| |
| // Store 8 RGB24 values. |
| #define STORERGB24 \ |
| __asm {/* Weave into RRGB */ \ |
| __asm punpcklbw xmm0, xmm1 /* BG */ \ |
| __asm punpcklbw xmm2, xmm2 /* RR */ \ |
| __asm movdqa xmm1, xmm0 \ |
| __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ |
| __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */ \ |
| __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ |
| __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ |
| __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ |
| __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \ |
| __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \ |
| __asm lea edx, [edx + 24]} |
| |
| // Store 8 RGB565 values. |
| #define STORERGB565 \ |
| __asm {/* Weave into RRGB */ \ |
| __asm punpcklbw xmm0, xmm1 /* BG */ \ |
| __asm punpcklbw xmm2, xmm2 /* RR */ \ |
| __asm movdqa xmm1, xmm0 \ |
| __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ |
| __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */ \ |
| __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \ |
| __asm movdqa xmm2, xmm0 /* G */ \ |
| __asm pslld xmm0, 8 /* R */ \ |
| __asm psrld xmm3, 3 /* B */ \ |
| __asm psrld xmm2, 5 /* G */ \ |
| __asm psrad xmm0, 16 /* R */ \ |
| __asm pand xmm3, xmm5 /* B */ \ |
| __asm pand xmm2, xmm6 /* G */ \ |
| __asm pand xmm0, xmm7 /* R */ \ |
| __asm por xmm3, xmm2 /* BG */ \ |
| __asm por xmm0, xmm3 /* BGR */ \ |
| __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \ |
| __asm movdqa xmm2, xmm1 /* G */ \ |
| __asm pslld xmm1, 8 /* R */ \ |
| __asm psrld xmm3, 3 /* B */ \ |
| __asm psrld xmm2, 5 /* G */ \ |
| __asm psrad xmm1, 16 /* R */ \ |
| __asm pand xmm3, xmm5 /* B */ \ |
| __asm pand xmm2, xmm6 /* G */ \ |
| __asm pand xmm1, xmm7 /* R */ \ |
| __asm por xmm3, xmm2 /* BG */ \ |
| __asm por xmm1, xmm3 /* BGR */ \ |
| __asm packssdw xmm0, xmm1 \ |
| __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \ |
| __asm lea edx, [edx + 16]} |
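
// STORERGB565 builds the 0x001f / 0x07e0 / 0xf800 masks at run time and packs
// each 8-bit-per-channel pixel down to 16 bits.  Per pixel the packing is
// plain bit slicing, as in this minimal scalar sketch (the helper name is an
// illustrative assumption):
static uint16_t PackRGB565_Sketch(uint8_t b, uint8_t g, uint8_t r) {
  return (uint16_t)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
}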
| |
| // 8 pixels. |
| // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). |
| __declspec(naked) void I444ToARGBRow_SSSE3( |
| const uint8_t* y_buf, |
| const uint8_t* u_buf, |
| const uint8_t* v_buf, |
| uint8_t* dst_argb, |
| const struct YuvConstants* yuvconstants, |
| int width) { |
| __asm { |
| push esi |
| push edi |
| push ebx |
| mov eax, [esp + 12 + 4] // Y |
| mov esi, [esp + 12 + 8] // U |
| mov edi, [esp + 12 + 12] // V |
| mov edx, [esp + 12 + 16] // argb |
| mov ebx, [esp + 12 + 20] // yuvconstants |
| mov ecx, [esp + 12 + 24] // width |
| sub edi, esi |
| pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
| |
| convertloop: |
| READYUV444 |
| YUVTORGB(ebx) |
| STOREARGB |
| |
| sub ecx, 8 |
| jg convertloop |
| |
| pop ebx |
| pop edi |
| pop esi |
| ret |
| } |
| } |
| |
| // 8 pixels. |
| // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes). |
| __declspec(naked) void I422ToRGB24Row_SSSE3( |
| const uint8_t* y_buf, |
| const uint8_t* u_buf, |
| const uint8_t* v_buf, |
| uint8_t* dst_rgb24, |
| const struct YuvConstants* yuvconstants, |
| int width) { |
| __asm { |
| push esi |
| push edi |
| push ebx |
| mov eax, [esp + 12 + 4] // Y |
| mov esi, [esp + 12 + 8] // U |
| mov edi, [esp + 12 + 12] // V |
mov edx, [esp + 12 + 16] // rgb24
| mov ebx, [esp + 12 + 20] // yuvconstants |
| mov ecx, [esp + 12 + 24] // width |
| sub edi, esi |
| movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0 |
| movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24 |
| |
| convertloop: |
| READYUV422 |
| YUVTORGB(ebx) |
| STORERGB24 |
| |
| sub ecx, 8 |
| jg convertloop |
| |
| pop ebx |
| pop edi |
| pop esi |
| ret |
| } |
| } |
| |
| // 8 pixels |
| // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes). |
| __declspec(naked) void I422ToRGB565Row_SSSE3( |
| const uint8_t* y_buf, |
| const uint8_t* u_buf, |
| const uint8_t* v_buf, |
| uint8_t* rgb565_buf, |
| const struct YuvConstants* yuvconstants, |
| int width) { |
| __asm { |
| push esi |
| push edi |
| push ebx |
| mov eax, [esp + 12 + 4] // Y |
| mov esi, [esp + 12 + 8] // U |
| mov edi, [esp + 12 + 12] // V |
mov edx, [esp + 12 + 16] // rgb565
| mov ebx, [esp + 12 + 20] // yuvconstants |
| mov ecx, [esp + 12 + 24] // width |
| sub edi, esi |
| pcmpeqb xmm5, xmm5 // generate mask 0x0000001f |
| psrld xmm5, 27 |
| pcmpeqb xmm6, xmm6 // generate mask 0x000007e0 |
| psrld xmm6, 26 |
| pslld xmm6, 5 |
| pcmpeqb xmm7, xmm7 // generate mask 0xfffff800 |
| pslld xmm7, 11 |
| |
| convertloop: |
| READYUV422 |
| YUVTORGB(ebx) |
| STORERGB565 |
| |
| sub ecx, 8 |
| jg convertloop |
| |
| pop ebx |
| pop edi |
| pop esi |
| ret |
| } |
| } |
| |
| // 8 pixels. |
| // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). |
| __declspec(naked) void I422ToARGBRow_SSSE3( |
| const uint8_t* y_buf, |
| const uint8_t* u_buf, |
| const uint8_t* v_buf, |
| uint8_t* dst_argb, |
| const struct YuvConstants* yuvconstants, |
| int width) { |
| __asm { |
| push esi |
| push edi |
| push ebx |
| mov eax, [esp + 12 + 4] // Y |
| mov esi, [esp + 12 + 8] // U |
| mov edi, [esp + 12 + 12] // V |
| mov edx, [esp + 12 + 16] // argb |
| mov ebx, [esp + 12 + 20] // yuvconstants |
| mov ecx, [esp + 12 + 24] // width |
| sub edi, esi |
| pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
| |
| convertloop: |
| READYUV422 |
| YUVTORGB(ebx) |
| STOREARGB |
| |
| sub ecx, 8 |
| jg convertloop |
| |
| pop ebx |
| pop edi |
| pop esi |
| ret |
| } |
| } |
| |
| // 8 pixels. |
| // 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB. |
| __declspec(naked) void I422AlphaToARGBRow_SSSE3( |
| const uint8_t* y_buf, |
| const uint8_t* u_buf, |
| const uint8_t* v_buf, |
| const uint8_t* a_buf, |
| uint8_t* dst_argb, |
| const struct YuvConstants* yuvconstants, |
| int width) { |
| __asm { |
| push esi |
| push edi |
| push ebx |
| push ebp |
| mov eax, [esp + 16 + 4] // Y |
| mov esi, [esp + 16 + 8] // U |
| mov edi, [esp + 16 + 12] // V |
| mov ebp, [esp + 16 + 16] // A |
| mov edx, [esp + 16 + 20] // argb |
| mov ebx, [esp + 16 + 24] // yuvconstants |
| mov ecx, [esp + 16 + 28] // width |
| sub edi, esi |
| |
| convertloop: |
| READYUVA422 |
| YUVTORGB(ebx) |
| STOREARGB |
| |
| sub ecx, 8 |
| jg convertloop |
| |
| pop ebp |
| pop ebx |
| pop edi |
| pop esi |
| ret |
| } |
| } |
| |
| // 8 pixels. |
| // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). |
| __declspec(naked) void NV12ToARGBRow_SSSE3( |
| const uint8_t* y_buf, |
| const uint8_t* uv_buf, |
| uint8_t* dst_argb, |
| const struct YuvConstants* yuvconstants, |
| int width) { |
| __asm { |
| push esi |
| push ebx |
| mov eax, [esp + 8 + 4] // Y |
| mov esi, [esp + 8 + 8] // UV |
| mov edx, [esp + 8 + 12] // argb |
| mov ebx, [esp + 8 + 16] // yuvconstants |
| mov ecx, [esp + 8 + 20] // width |
| pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
| |
| convertloop: |
| READNV12 |
| YUVTORGB(ebx) |
| STOREARGB |
| |
| sub ecx, 8 |
| jg convertloop |
| |
| pop ebx |
| pop esi |
| ret |
| } |
| } |
| |
| // 8 pixels. |
| // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). |
| __declspec(naked) void NV21ToARGBRow_SSSE3( |
| const uint8_t* y_buf, |
| const uint8_t* vu_buf, |
| uint8_t* dst_argb, |
| const struct YuvConstants* yuvconstants, |
| int width) { |
| __asm { |
| push esi |
| push ebx |
| mov eax, [esp + 8 + 4] // Y |
| mov esi, [esp + 8 + 8] // VU |
| mov edx, [esp + 8 + 12] // argb |
| mov ebx, [esp + 8 + 16] // yuvconstants |
| mov ecx, [esp + 8 + 20] // width |
| pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
| |
| convertloop: |
| READNV21 |
| YUVTORGB(ebx) |
| STOREARGB |
| |
| sub ecx, 8 |
| jg convertloop |
| |
| pop ebx |
| pop esi |
| ret |
| } |
| } |
| |
| // 8 pixels. |
| // 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes). |
| __declspec(naked) void YUY2ToARGBRow_SSSE3( |
| const uint8_t* src_yuy2, |
| uint8_t* dst_argb, |
| const struct YuvConstants* yuvconstants, |
| int width) { |
| __asm { |
| push ebx |
| mov eax, [esp + 4 + 4] // yuy2 |
| mov edx, [esp + 4 + 8] // argb |
| mov ebx, [esp + 4 + 12] // yuvconstants |
| mov ecx, [esp + 4 + 16] // width |
| pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
| |
| convertloop: |
| READYUY2 |
| YUVTORGB(ebx) |
| STOREARGB |
| |
| sub ecx, 8 |
| jg convertloop |
| |
| pop ebx |
| ret |
| } |
| } |
| |
| // 8 pixels. |
| // 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes). |
| __declspec(naked) void UYVYToARGBRow_SSSE3( |
| const uint8_t* src_uyvy, |
| uint8_t* dst_argb, |
| const struct YuvConstants* yuvconstants, |
| int width) { |
| __asm { |
| push ebx |
| mov eax, [esp + 4 + 4] // uyvy |
| mov edx, [esp + 4 + 8] // argb |
| mov ebx, [esp + 4 + 12] // yuvconstants |
| mov ecx, [esp + 4 + 16] // width |
| pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
| |
| convertloop: |
| READUYVY |
| YUVTORGB(ebx) |
| STOREARGB |
| |
| sub ecx, 8 |
| jg convertloop |
| |
| pop ebx |
| ret |
| } |
| } |
| |
| __declspec(naked) void I422ToRGBARow_SSSE3( |
| const uint8_t* y_buf, |
| const uint8_t* u_buf, |
| const uint8_t* v_buf, |
| uint8_t* dst_rgba, |
| const struct YuvConstants* yuvconstants, |
| int width) { |
| __asm { |
| push esi |
| push edi |
| push ebx |
| mov eax, [esp + 12 + 4] // Y |
| mov esi, [esp + 12 + 8] // U |
| mov edi, [esp + 12 + 12] // V |
mov edx, [esp + 12 + 16] // rgba
| mov ebx, [esp + 12 + 20] // yuvconstants |
| mov ecx, [esp + 12 + 24] // width |
| sub edi, esi |
| |
| convertloop: |
| READYUV422 |
| YUVTORGB(ebx) |
| STORERGBA |
| |
| sub ecx, 8 |
| jg convertloop |
| |
| pop ebx |
| pop edi |
| pop esi |
| ret |
| } |
| } |
| #endif // HAS_I422TOARGBROW_SSSE3 |
| |
// I400ToARGBRow_SSE2 is disabled due to the new yuvconstants parameter
| #ifdef HAS_I400TOARGBROW_SSE2 |
| // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes). |
| __declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf, |
| uint8_t* rgb_buf, |
| const struct YuvConstants*, |
| int width) { |
| __asm { |
| mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) |
| movd xmm2, eax |
| pshufd xmm2, xmm2,0 |
| mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) |
| movd xmm3, eax |
| pshufd xmm3, xmm3, 0 |
| pcmpeqb xmm4, xmm4 // generate mask 0xff000000 |
| pslld xmm4, 24 |
| |
| mov eax, [esp + 4] // Y |
| mov edx, [esp + 8] // rgb |
| mov ecx, [esp + 12] // width |
| |
| convertloop: |
| // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 |
| movq xmm0, qword ptr [eax] |
| lea eax, [eax + 8] |
| punpcklbw xmm0, xmm0 // Y.Y |
| pmulhuw xmm0, xmm2 |
| psubusw xmm0, xmm3 |
| psrlw xmm0, 6 |
| packuswb xmm0, xmm0 // G |
| |
| // Step 2: Weave into ARGB |
| punpcklbw xmm0, xmm0 // GG |
| movdqa xmm1, xmm0 |
| punpcklwd xmm0, xmm0 // BGRA first 4 pixels |
| punpckhwd xmm1, xmm1 // BGRA next 4 pixels |
| por xmm0, xmm4 |
| por xmm1, xmm4 |
| movdqu [edx], xmm0 |
| movdqu [edx + 16], xmm1 |
| lea edx, [edx + 32] |
| sub ecx, 8 |
| jg convertloop |
| ret |
| } |
| } |
| #endif // HAS_I400TOARGBROW_SSE2 |
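
// I400ToARGBRow expands luma-only input: each Y is scaled by 1.164 with a -16
// offset (the 0x4a35 / 0x0488 constants above are that math in fixed point)
// and the result is replicated into B, G and R with alpha set to 255.  A
// scalar sketch of the same step; the helper name and the simple rounding are
// assumptions and differ slightly from the asm:
static uint32_t I400Pixel_Sketch(uint8_t y) {
  int g = ((y - 16) * 74) >> 6;  // 74 ~= round(1.164 * 64)
  if (g < 0) g = 0;
  if (g > 255) g = 255;
  return 0xff000000u | ((uint32_t)g << 16) | ((uint32_t)g << 8) | (uint32_t)g;
}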
| |
| #ifdef HAS_I400TOARGBROW_AVX2 |
| // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). |
| // note: vpunpcklbw mutates and vpackuswb unmutates. |
| __declspec(naked) void I400ToARGBRow_AVX2(const uint8_t* y_buf, |
| uint8_t* rgb_buf, |
| const struct YuvConstants*, |
| int width) { |
| __asm { |
| mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) |
| vmovd xmm2, eax |
| vbroadcastss ymm2, xmm2 |
| mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) |
| vmovd xmm3, eax |
| vbroadcastss ymm3, xmm3 |
| vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000 |
| vpslld ymm4, ymm4, 24 |
| |
| mov eax, [esp + 4] // Y |
| mov edx, [esp + 8] // rgb |
| mov ecx, [esp + 12] // width |
| |
| convertloop: |
// Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
| vmovdqu xmm0, [eax] |
| lea eax, [eax + 16] |
| vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates |
| vpunpcklbw ymm0, ymm0, ymm0 // Y.Y |
| vpmulhuw ymm0, ymm0, ymm2 |
| vpsubusw ymm0, ymm0, ymm3 |
| vpsrlw ymm0, ymm0, 6 |
| vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120 |
| |
| // TODO(fbarchard): Weave alpha with unpack. |
| // Step 2: Weave into ARGB |
| vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates |
| vpermq ymm1, ymm1, 0xd8 |
| vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels |
| vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels |
| vpor ymm0, ymm0, ymm4 |
| vpor ymm1, ymm1, ymm4 |
| vmovdqu [edx], ymm0 |
| vmovdqu [edx + 32], ymm1 |
| lea edx, [edx + 64] |
| sub ecx, 16 |
| jg convertloop |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_I400TOARGBROW_AVX2 |
| |
| #ifdef HAS_MIRRORROW_SSSE3 |
| // Shuffle table for reversing the bytes. |
| static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, |
| 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; |
| |
| // TODO(fbarchard): Replace lea with -16 offset. |
| __declspec(naked) void MirrorRow_SSSE3(const uint8_t* src, |
| uint8_t* dst, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] // src |
| mov edx, [esp + 8] // dst |
| mov ecx, [esp + 12] // width |
| movdqa xmm5, xmmword ptr kShuffleMirror |
| |
| convertloop: |
| movdqu xmm0, [eax - 16 + ecx] |
| pshufb xmm0, xmm5 |
| movdqu [edx], xmm0 |
| lea edx, [edx + 16] |
| sub ecx, 16 |
| jg convertloop |
| ret |
| } |
| } |
| #endif // HAS_MIRRORROW_SSSE3 |
| |
| #ifdef HAS_MIRRORROW_AVX2 |
| __declspec(naked) void MirrorRow_AVX2(const uint8_t* src, |
| uint8_t* dst, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] // src |
| mov edx, [esp + 8] // dst |
| mov ecx, [esp + 12] // width |
| vbroadcastf128 ymm5, xmmword ptr kShuffleMirror |
| |
| convertloop: |
| vmovdqu ymm0, [eax - 32 + ecx] |
| vpshufb ymm0, ymm0, ymm5 |
vpermq ymm0, ymm0, 0x4e // swap high and low halves
| vmovdqu [edx], ymm0 |
| lea edx, [edx + 32] |
| sub ecx, 32 |
| jg convertloop |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_MIRRORROW_AVX2 |
| |
| #ifdef HAS_MIRRORSPLITUVROW_SSSE3 |
| // Shuffle table for reversing the bytes of UV channels. |
| static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, |
| 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; |
| |
| __declspec(naked) void MirrorSplitUVRow_SSSE3(const uint8_t* src, |
| uint8_t* dst_u, |
| uint8_t* dst_v, |
| int width) { |
| __asm { |
| push edi |
| mov eax, [esp + 4 + 4] // src |
| mov edx, [esp + 4 + 8] // dst_u |
| mov edi, [esp + 4 + 12] // dst_v |
| mov ecx, [esp + 4 + 16] // width |
| movdqa xmm1, xmmword ptr kShuffleMirrorUV |
| lea eax, [eax + ecx * 2 - 16] |
| sub edi, edx |
| |
| convertloop: |
| movdqu xmm0, [eax] |
| lea eax, [eax - 16] |
| pshufb xmm0, xmm1 |
| movlpd qword ptr [edx], xmm0 |
| movhpd qword ptr [edx + edi], xmm0 |
| lea edx, [edx + 8] |
| sub ecx, 8 |
| jg convertloop |
| |
| pop edi |
| ret |
| } |
| } |
| #endif // HAS_MIRRORSPLITUVROW_SSSE3 |
| |
| #ifdef HAS_ARGBMIRRORROW_SSE2 |
| __declspec(naked) void ARGBMirrorRow_SSE2(const uint8_t* src, |
| uint8_t* dst, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] // src |
| mov edx, [esp + 8] // dst |
| mov ecx, [esp + 12] // width |
| lea eax, [eax - 16 + ecx * 4] // last 4 pixels. |
| |
| convertloop: |
| movdqu xmm0, [eax] |
| lea eax, [eax - 16] |
| pshufd xmm0, xmm0, 0x1b |
| movdqu [edx], xmm0 |
| lea edx, [edx + 16] |
| sub ecx, 4 |
| jg convertloop |
| ret |
| } |
| } |
| #endif // HAS_ARGBMIRRORROW_SSE2 |
| |
| #ifdef HAS_ARGBMIRRORROW_AVX2 |
// Permute table for reversing the ARGB pixels (dword order).
| static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; |
| |
| __declspec(naked) void ARGBMirrorRow_AVX2(const uint8_t* src, |
| uint8_t* dst, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] // src |
| mov edx, [esp + 8] // dst |
| mov ecx, [esp + 12] // width |
| vmovdqu ymm5, ymmword ptr kARGBShuffleMirror_AVX2 |
| |
| convertloop: |
| vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order |
| vmovdqu [edx], ymm0 |
| lea edx, [edx + 32] |
| sub ecx, 8 |
| jg convertloop |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_ARGBMIRRORROW_AVX2 |
| |
| #ifdef HAS_SPLITUVROW_SSE2 |
| __declspec(naked) void SplitUVRow_SSE2(const uint8_t* src_uv, |
| uint8_t* dst_u, |
| uint8_t* dst_v, |
| int width) { |
| __asm { |
| push edi |
| mov eax, [esp + 4 + 4] // src_uv |
| mov edx, [esp + 4 + 8] // dst_u |
| mov edi, [esp + 4 + 12] // dst_v |
| mov ecx, [esp + 4 + 16] // width |
| pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
| psrlw xmm5, 8 |
| sub edi, edx |
| |
| convertloop: |
| movdqu xmm0, [eax] |
| movdqu xmm1, [eax + 16] |
| lea eax, [eax + 32] |
| movdqa xmm2, xmm0 |
| movdqa xmm3, xmm1 |
| pand xmm0, xmm5 // even bytes |
| pand xmm1, xmm5 |
| packuswb xmm0, xmm1 |
| psrlw xmm2, 8 // odd bytes |
| psrlw xmm3, 8 |
| packuswb xmm2, xmm3 |
| movdqu [edx], xmm0 |
| movdqu [edx + edi], xmm2 |
| lea edx, [edx + 16] |
| sub ecx, 16 |
| jg convertloop |
| |
| pop edi |
| ret |
| } |
| } |
| |
| #endif // HAS_SPLITUVROW_SSE2 |
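
// SplitUVRow deinterleaves an interleaved UV plane by masking the even bytes
// (U) and shifting the odd bytes (V) down before repacking.  The scalar
// equivalent is a simple stride-2 copy, sketched here with a hypothetical
// helper name:
static void SplitUVRow_Sketch(const uint8_t* src_uv,
                              uint8_t* dst_u,
                              uint8_t* dst_v,
                              int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_u[x] = src_uv[2 * x + 0];  // even bytes are U
    dst_v[x] = src_uv[2 * x + 1];  // odd bytes are V
  }
}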
| |
| #ifdef HAS_SPLITUVROW_AVX2 |
| __declspec(naked) void SplitUVRow_AVX2(const uint8_t* src_uv, |
| uint8_t* dst_u, |
| uint8_t* dst_v, |
| int width) { |
| __asm { |
| push edi |
| mov eax, [esp + 4 + 4] // src_uv |
| mov edx, [esp + 4 + 8] // dst_u |
| mov edi, [esp + 4 + 12] // dst_v |
| mov ecx, [esp + 4 + 16] // width |
| vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff |
| vpsrlw ymm5, ymm5, 8 |
| sub edi, edx |
| |
| convertloop: |
| vmovdqu ymm0, [eax] |
| vmovdqu ymm1, [eax + 32] |
| lea eax, [eax + 64] |
| vpsrlw ymm2, ymm0, 8 // odd bytes |
| vpsrlw ymm3, ymm1, 8 |
| vpand ymm0, ymm0, ymm5 // even bytes |
| vpand ymm1, ymm1, ymm5 |
| vpackuswb ymm0, ymm0, ymm1 |
| vpackuswb ymm2, ymm2, ymm3 |
| vpermq ymm0, ymm0, 0xd8 |
| vpermq ymm2, ymm2, 0xd8 |
| vmovdqu [edx], ymm0 |
| vmovdqu [edx + edi], ymm2 |
| lea edx, [edx + 32] |
| sub ecx, 32 |
| jg convertloop |
| |
| pop edi |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_SPLITUVROW_AVX2 |
| |
| #ifdef HAS_MERGEUVROW_SSE2 |
| __declspec(naked) void MergeUVRow_SSE2(const uint8_t* src_u, |
| const uint8_t* src_v, |
| uint8_t* dst_uv, |
| int width) { |
| __asm { |
| push edi |
| mov eax, [esp + 4 + 4] // src_u |
| mov edx, [esp + 4 + 8] // src_v |
| mov edi, [esp + 4 + 12] // dst_uv |
| mov ecx, [esp + 4 + 16] // width |
| sub edx, eax |
| |
| convertloop: |
| movdqu xmm0, [eax] // read 16 U's |
| movdqu xmm1, [eax + edx] // and 16 V's |
| lea eax, [eax + 16] |
| movdqa xmm2, xmm0 |
| punpcklbw xmm0, xmm1 // first 8 UV pairs |
| punpckhbw xmm2, xmm1 // next 8 UV pairs |
| movdqu [edi], xmm0 |
| movdqu [edi + 16], xmm2 |
| lea edi, [edi + 32] |
| sub ecx, 16 |
| jg convertloop |
| |
| pop edi |
| ret |
| } |
| } |
| #endif // HAS_MERGEUVROW_SSE2 |
| |
| #ifdef HAS_MERGEUVROW_AVX2 |
| __declspec(naked) void MergeUVRow_AVX2(const uint8_t* src_u, |
| const uint8_t* src_v, |
| uint8_t* dst_uv, |
| int width) { |
| __asm { |
| push edi |
| mov eax, [esp + 4 + 4] // src_u |
| mov edx, [esp + 4 + 8] // src_v |
| mov edi, [esp + 4 + 12] // dst_uv |
| mov ecx, [esp + 4 + 16] // width |
| sub edx, eax |
| |
| convertloop: |
| vmovdqu ymm0, [eax] // read 32 U's |
| vmovdqu ymm1, [eax + edx] // and 32 V's |
| lea eax, [eax + 32] |
| vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2 |
| vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3 |
| vextractf128 [edi], ymm2, 0 // bytes 0..15 |
| vextractf128 [edi + 16], ymm0, 0 // bytes 16..31 |
| vextractf128 [edi + 32], ymm2, 1 // bytes 32..47 |
vextractf128 [edi + 48], ymm0, 1 // bytes 48..63
| lea edi, [edi + 64] |
| sub ecx, 32 |
| jg convertloop |
| |
| pop edi |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_MERGEUVROW_AVX2 |
| |
| #ifdef HAS_COPYROW_SSE2 |
// CopyRow copies 'width' bytes using 16 byte loads/stores, 32 bytes at a time.
| __declspec(naked) void CopyRow_SSE2(const uint8_t* src, |
| uint8_t* dst, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] // src |
| mov edx, [esp + 8] // dst |
| mov ecx, [esp + 12] // width |
| test eax, 15 |
| jne convertloopu |
| test edx, 15 |
| jne convertloopu |
| |
| convertloopa: |
| movdqa xmm0, [eax] |
| movdqa xmm1, [eax + 16] |
| lea eax, [eax + 32] |
| movdqa [edx], xmm0 |
| movdqa [edx + 16], xmm1 |
| lea edx, [edx + 32] |
| sub ecx, 32 |
| jg convertloopa |
| ret |
| |
| convertloopu: |
| movdqu xmm0, [eax] |
| movdqu xmm1, [eax + 16] |
| lea eax, [eax + 32] |
| movdqu [edx], xmm0 |
| movdqu [edx + 16], xmm1 |
| lea edx, [edx + 32] |
| sub ecx, 32 |
| jg convertloopu |
| ret |
| } |
| } |
| #endif // HAS_COPYROW_SSE2 |
| |
| #ifdef HAS_COPYROW_AVX |
// CopyRow copies 'width' bytes using 32 byte loads/stores, 64 bytes at a time.
| __declspec(naked) void CopyRow_AVX(const uint8_t* src, |
| uint8_t* dst, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] // src |
| mov edx, [esp + 8] // dst |
| mov ecx, [esp + 12] // width |
| |
| convertloop: |
| vmovdqu ymm0, [eax] |
| vmovdqu ymm1, [eax + 32] |
| lea eax, [eax + 64] |
| vmovdqu [edx], ymm0 |
| vmovdqu [edx + 32], ymm1 |
| lea edx, [edx + 64] |
| sub ecx, 64 |
| jg convertloop |
| |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_COPYROW_AVX |
| |
// CopyRow_ERMS copies any width (multiple of 1 byte) using 'rep movsb'.
| __declspec(naked) void CopyRow_ERMS(const uint8_t* src, |
| uint8_t* dst, |
| int width) { |
| __asm { |
| mov eax, esi |
| mov edx, edi |
| mov esi, [esp + 4] // src |
| mov edi, [esp + 8] // dst |
| mov ecx, [esp + 12] // width |
| rep movsb |
| mov edi, edx |
| mov esi, eax |
| ret |
| } |
| } |
| |
| #ifdef HAS_ARGBCOPYALPHAROW_SSE2 |
| // width in pixels |
| __declspec(naked) void ARGBCopyAlphaRow_SSE2(const uint8_t* src, |
| uint8_t* dst, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] // src |
| mov edx, [esp + 8] // dst |
| mov ecx, [esp + 12] // width |
| pcmpeqb xmm0, xmm0 // generate mask 0xff000000 |
| pslld xmm0, 24 |
| pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff |
| psrld xmm1, 8 |
| |
| convertloop: |
| movdqu xmm2, [eax] |
| movdqu xmm3, [eax + 16] |
| lea eax, [eax + 32] |
| movdqu xmm4, [edx] |
| movdqu xmm5, [edx + 16] |
| pand xmm2, xmm0 |
| pand xmm3, xmm0 |
| pand xmm4, xmm1 |
| pand xmm5, xmm1 |
| por xmm2, xmm4 |
| por xmm3, xmm5 |
| movdqu [edx], xmm2 |
| movdqu [edx + 16], xmm3 |
| lea edx, [edx + 32] |
| sub ecx, 8 |
| jg convertloop |
| |
| ret |
| } |
| } |
| #endif // HAS_ARGBCOPYALPHAROW_SSE2 |
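
// ARGBCopyAlphaRow merges only the alpha channel of the source into the
// destination using the 0xff000000 / 0x00ffffff masks built above.  Per pixel
// that is a two-mask combine (the helper name is an illustrative assumption):
static uint32_t CopyAlphaPixel_Sketch(uint32_t src_argb, uint32_t dst_argb) {
  return (src_argb & 0xff000000u) | (dst_argb & 0x00ffffffu);
}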
| |
| #ifdef HAS_ARGBCOPYALPHAROW_AVX2 |
| // width in pixels |
| __declspec(naked) void ARGBCopyAlphaRow_AVX2(const uint8_t* src, |
| uint8_t* dst, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] // src |
| mov edx, [esp + 8] // dst |
| mov ecx, [esp + 12] // width |
| vpcmpeqb ymm0, ymm0, ymm0 |
| vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff |
| |
| convertloop: |
| vmovdqu ymm1, [eax] |
| vmovdqu ymm2, [eax + 32] |
| lea eax, [eax + 64] |
| vpblendvb ymm1, ymm1, [edx], ymm0 |
| vpblendvb ymm2, ymm2, [edx + 32], ymm0 |
| vmovdqu [edx], ymm1 |
| vmovdqu [edx + 32], ymm2 |
| lea edx, [edx + 64] |
| sub ecx, 16 |
| jg convertloop |
| |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_ARGBCOPYALPHAROW_AVX2 |
| |
| #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 |
| // width in pixels |
| __declspec(naked) void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, |
| uint8_t* dst_a, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] // src_argb |
| mov edx, [esp + 8] // dst_a |
| mov ecx, [esp + 12] // width |
| |
| extractloop: |
| movdqu xmm0, [eax] |
| movdqu xmm1, [eax + 16] |
| lea eax, [eax + 32] |
| psrld xmm0, 24 |
| psrld xmm1, 24 |
| packssdw xmm0, xmm1 |
| packuswb xmm0, xmm0 |
| movq qword ptr [edx], xmm0 |
| lea edx, [edx + 8] |
| sub ecx, 8 |
| jg extractloop |
| |
| ret |
| } |
| } |
| #endif // HAS_ARGBEXTRACTALPHAROW_SSE2 |
| |
| #ifdef HAS_ARGBEXTRACTALPHAROW_AVX2 |
| // width in pixels |
| __declspec(naked) void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, |
| uint8_t* dst_a, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] // src_argb |
| mov edx, [esp + 8] // dst_a |
| mov ecx, [esp + 12] // width |
| vmovdqa ymm4, ymmword ptr kPermdARGBToY_AVX |
| |
| extractloop: |
| vmovdqu ymm0, [eax] |
| vmovdqu ymm1, [eax + 32] |
| vpsrld ymm0, ymm0, 24 |
| vpsrld ymm1, ymm1, 24 |
| vmovdqu ymm2, [eax + 64] |
| vmovdqu ymm3, [eax + 96] |
| lea eax, [eax + 128] |
| vpackssdw ymm0, ymm0, ymm1 // mutates |
| vpsrld ymm2, ymm2, 24 |
| vpsrld ymm3, ymm3, 24 |
| vpackssdw ymm2, ymm2, ymm3 // mutates |
| vpackuswb ymm0, ymm0, ymm2 // mutates |
| vpermd ymm0, ymm4, ymm0 // unmutate |
| vmovdqu [edx], ymm0 |
| lea edx, [edx + 32] |
| sub ecx, 32 |
| jg extractloop |
| |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_ARGBEXTRACTALPHAROW_AVX2 |
| |
| #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 |
| // width in pixels |
| __declspec(naked) void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, |
| uint8_t* dst, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] // src |
| mov edx, [esp + 8] // dst |
| mov ecx, [esp + 12] // width |
| pcmpeqb xmm0, xmm0 // generate mask 0xff000000 |
| pslld xmm0, 24 |
| pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff |
| psrld xmm1, 8 |
| |
| convertloop: |
| movq xmm2, qword ptr [eax] // 8 Y's |
| lea eax, [eax + 8] |
| punpcklbw xmm2, xmm2 |
| punpckhwd xmm3, xmm2 |
| punpcklwd xmm2, xmm2 |
| movdqu xmm4, [edx] |
| movdqu xmm5, [edx + 16] |
| pand xmm2, xmm0 |
| pand xmm3, xmm0 |
| pand xmm4, xmm1 |
| pand xmm5, xmm1 |
| por xmm2, xmm4 |
| por xmm3, xmm5 |
| movdqu [edx], xmm2 |
| movdqu [edx + 16], xmm3 |
| lea edx, [edx + 32] |
| sub ecx, 8 |
| jg convertloop |
| |
| ret |
| } |
| } |
| #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 |
| |
| #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 |
| // width in pixels |
| __declspec(naked) void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, |
| uint8_t* dst, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] // src |
| mov edx, [esp + 8] // dst |
| mov ecx, [esp + 12] // width |
| vpcmpeqb ymm0, ymm0, ymm0 |
| vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff |
| |
| convertloop: |
| vpmovzxbd ymm1, qword ptr [eax] |
| vpmovzxbd ymm2, qword ptr [eax + 8] |
| lea eax, [eax + 16] |
| vpslld ymm1, ymm1, 24 |
| vpslld ymm2, ymm2, 24 |
| vpblendvb ymm1, ymm1, [edx], ymm0 |
| vpblendvb ymm2, ymm2, [edx + 32], ymm0 |
| vmovdqu [edx], ymm1 |
| vmovdqu [edx + 32], ymm2 |
| lea edx, [edx + 64] |
| sub ecx, 16 |
| jg convertloop |
| |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 |
| |
| #ifdef HAS_SETROW_X86 |
| // Write 'width' bytes using an 8 bit value repeated. |
// width should be a multiple of 4.
| __declspec(naked) void SetRow_X86(uint8_t* dst, uint8_t v8, int width) { |
| __asm { |
| movzx eax, byte ptr [esp + 8] // v8 |
| mov edx, 0x01010101 // Duplicate byte to all bytes. |
| mul edx // overwrites edx with upper part of result. |
| mov edx, edi |
| mov edi, [esp + 4] // dst |
| mov ecx, [esp + 12] // width |
| shr ecx, 2 |
| rep stosd |
| mov edi, edx |
| ret |
| } |
| } |
| |
| // Write 'width' bytes using an 8 bit value repeated. |
| __declspec(naked) void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) { |
| __asm { |
| mov edx, edi |
| mov edi, [esp + 4] // dst |
| mov eax, [esp + 8] // v8 |
| mov ecx, [esp + 12] // width |
| rep stosb |
| mov edi, edx |
| ret |
| } |
| } |
| |
| // Write 'width' 32 bit values. |
| __declspec(naked) void ARGBSetRow_X86(uint8_t* dst_argb, |
| uint32_t v32, |
| int width) { |
| __asm { |
| mov edx, edi |
| mov edi, [esp + 4] // dst |
| mov eax, [esp + 8] // v32 |
| mov ecx, [esp + 12] // width |
| rep stosd |
| mov edi, edx |
| ret |
| } |
| } |
| #endif // HAS_SETROW_X86 |
| |
| #ifdef HAS_YUY2TOYROW_AVX2 |
| __declspec(naked) void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, |
| uint8_t* dst_y, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] // src_yuy2 |
| mov edx, [esp + 8] // dst_y |
| mov ecx, [esp + 12] // width |
| vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff |
| vpsrlw ymm5, ymm5, 8 |
| |
| convertloop: |
| vmovdqu ymm0, [eax] |
| vmovdqu ymm1, [eax + 32] |
| lea eax, [eax + 64] |
| vpand ymm0, ymm0, ymm5 // even bytes are Y |
| vpand ymm1, ymm1, ymm5 |
| vpackuswb ymm0, ymm0, ymm1 // mutates. |
| vpermq ymm0, ymm0, 0xd8 |
| vmovdqu [edx], ymm0 |
| lea edx, [edx + 32] |
| sub ecx, 32 |
| jg convertloop |
| vzeroupper |
| ret |
| } |
| } |
| |
| __declspec(naked) void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, |
| int stride_yuy2, |
| uint8_t* dst_u, |
| uint8_t* dst_v, |
| int width) { |
| __asm { |
| push esi |
| push edi |
| mov eax, [esp + 8 + 4] // src_yuy2 |
| mov esi, [esp + 8 + 8] // stride_yuy2 |
| mov edx, [esp + 8 + 12] // dst_u |
| mov edi, [esp + 8 + 16] // dst_v |
| mov ecx, [esp + 8 + 20] // width |
| vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff |
| vpsrlw ymm5, ymm5, 8 |
| sub edi, edx |
| |
| convertloop: |
| vmovdqu ymm0, [eax] |
| vmovdqu ymm1, [eax + 32] |
| vpavgb ymm0, ymm0, [eax + esi] |
| vpavgb ymm1, ymm1, [eax + esi + 32] |
| lea eax, [eax + 64] |
| vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV |
| vpsrlw ymm1, ymm1, 8 |
| vpackuswb ymm0, ymm0, ymm1 // mutates. |
| vpermq ymm0, ymm0, 0xd8 |
| vpand ymm1, ymm0, ymm5 // U |
| vpsrlw ymm0, ymm0, 8 // V |
| vpackuswb ymm1, ymm1, ymm1 // mutates. |
| vpackuswb ymm0, ymm0, ymm0 // mutates. |
| vpermq ymm1, ymm1, 0xd8 |
| vpermq ymm0, ymm0, 0xd8 |
| vextractf128 [edx], ymm1, 0 // U |
| vextractf128 [edx + edi], ymm0, 0 // V |
| lea edx, [edx + 16] |
| sub ecx, 32 |
| jg convertloop |
| |
| pop edi |
| pop esi |
| vzeroupper |
| ret |
| } |
| } |
| |
| __declspec(naked) void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, |
| uint8_t* dst_u, |
| uint8_t* dst_v, |
| int width) { |
| __asm { |
| push edi |
| mov eax, [esp + 4 + 4] // src_yuy2 |
| mov edx, [esp + 4 + 8] // dst_u |
| mov edi, [esp + 4 + 12] // dst_v |
| mov ecx, [esp + 4 + 16] // width |
| vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff |
| vpsrlw ymm5, ymm5, 8 |
| sub edi, edx |
| |
| convertloop: |
| vmovdqu ymm0, [eax] |
| vmovdqu ymm1, [eax + 32] |
| lea eax, [eax + 64] |
| vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV |
| vpsrlw ymm1, ymm1, 8 |
| vpackuswb ymm0, ymm0, ymm1 // mutates. |
| vpermq ymm0, ymm0, 0xd8 |
| vpand ymm1, ymm0, ymm5 // U |
| vpsrlw ymm0, ymm0, 8 // V |
| vpackuswb ymm1, ymm1, ymm1 // mutates. |
| vpackuswb ymm0, ymm0, ymm0 // mutates. |
| vpermq ymm1, ymm1, 0xd8 |
| vpermq ymm0, ymm0, 0xd8 |
| vextractf128 [edx], ymm1, 0 // U |
| vextractf128 [edx + edi], ymm0, 0 // V |
| lea edx, [edx + 16] |
| sub ecx, 32 |
| jg convertloop |
| |
| pop edi |
| vzeroupper |
| ret |
| } |
| } |
| |
| __declspec(naked) void UYVYToYRow_AVX2(const uint8_t* src_uyvy, |
| uint8_t* dst_y, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] // src_uyvy |
| mov edx, [esp + 8] // dst_y |
| mov ecx, [esp + 12] // width |
| |
| convertloop: |
| vmovdqu ymm0, [eax] |
| vmovdqu ymm1, [eax + 32] |
| lea eax, [eax + 64] |
| vpsrlw ymm0, ymm0, 8 // odd bytes are Y |
| vpsrlw ymm1, ymm1, 8 |
| vpackuswb ymm0, ymm0, ymm1 // mutates. |
| vpermq ymm0, ymm0, 0xd8 |
| vmovdqu [edx], ymm0 |
| lea edx, [edx + 32] |
| sub ecx, 32 |
| jg convertloop |
| vzeroupper |
| ret |
| } |
| } |
| |
| __declspec(naked) void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, |
| int stride_uyvy, |
| uint8_t* dst_u, |
| uint8_t* dst_v, |
| int width) { |
| __asm { |
| push esi |
| push edi |
mov eax, [esp + 8 + 4] // src_uyvy
mov esi, [esp + 8 + 8] // stride_uyvy
| mov edx, [esp + 8 + 12] // dst_u |
| mov edi, [esp + 8 + 16] // dst_v |
| mov ecx, [esp + 8 + 20] // width |
| vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff |
| vpsrlw ymm5, ymm5, 8 |
| sub edi, edx |
| |
| convertloop: |
| vmovdqu ymm0, [eax] |
| vmovdqu ymm1, [eax + 32] |
| vpavgb ymm0, ymm0, [eax + esi] |
| vpavgb ymm1, ymm1, [eax + esi + 32] |
| lea eax, [eax + 64] |
| vpand ymm0, ymm0, ymm5 // UYVY -> UVUV |
| vpand ymm1, ymm1, ymm5 |
| vpackuswb ymm0, ymm0, ymm1 // mutates. |
| vpermq ymm0, ymm0, 0xd8 |
| vpand ymm1, ymm0, ymm5 // U |
| vpsrlw ymm0, ymm0, 8 // V |
| vpackuswb ymm1, ymm1, ymm1 // mutates. |
| vpackuswb ymm0, ymm0, ymm0 // mutates. |
| vpermq ymm1, ymm1, 0xd8 |
| vpermq ymm0, ymm0, 0xd8 |
| vextractf128 [edx], ymm1, 0 // U |
| vextractf128 [edx + edi], ymm0, 0 // V |
| lea edx, [edx + 16] |
| sub ecx, 32 |
| jg convertloop |
| |
| pop edi |
| pop esi |
| vzeroupper |
| ret |
| } |
| } |
| |
| __declspec(naked) void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, |
| uint8_t* dst_u, |
| uint8_t* dst_v, |
| int width) { |
| __asm { |
| push edi |
mov eax, [esp + 4 + 4] // src_uyvy
| mov edx, [esp + 4 + 8] // dst_u |
| mov edi, [esp + 4 + 12] // dst_v |
| mov ecx, [esp + 4 + 16] // width |
| vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff |
| vpsrlw ymm5, ymm5, 8 |
| sub edi, edx |
| |
| convertloop: |
| vmovdqu ymm0, [eax] |
| vmovdqu ymm1, [eax + 32] |
| lea eax, [eax + 64] |
| vpand ymm0, ymm0, ymm5 // UYVY -> UVUV |
| vpand ymm1, ymm1, ymm5 |
| vpackuswb ymm0, ymm0, ymm1 // mutates. |
| vpermq ymm0, ymm0, 0xd8 |
| vpand ymm1, ymm0, ymm5 // U |
| vpsrlw ymm0, ymm0, 8 // V |
| vpackuswb ymm1, ymm1, ymm1 // mutates. |
| vpackuswb ymm0, ymm0, ymm0 // mutates. |
| vpermq ymm1, ymm1, 0xd8 |
| vpermq ymm0, ymm0, 0xd8 |
| vextractf128 [edx], ymm1, 0 // U |
| vextractf128 [edx + edi], ymm0, 0 // V |
| lea edx, [edx + 16] |
| sub ecx, 32 |
| jg convertloop |
| |
| pop edi |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_YUY2TOYROW_AVX2 |
| |
| #ifdef HAS_YUY2TOYROW_SSE2 |
| __declspec(naked) void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, |
| uint8_t* dst_y, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] // src_yuy2 |
| mov edx, [esp + 8] // dst_y |
| mov ecx, [esp + 12] // width |
| pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
| psrlw xmm5, 8 |
| |
| convertloop: |
| movdqu xmm0, [eax] |
| movdqu xmm1, [eax + 16] |
| lea eax, [eax + 32] |
| pand xmm0, xmm5 // even bytes are Y |
| pand xmm1, xmm5 |
| packuswb xmm0, xmm1 |
| movdqu [edx], xmm0 |
| lea edx, [edx + 16] |
| sub ecx, 16 |
| jg convertloop |
| ret |
| } |
| } |
| |
| __declspec(naked) void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, |
| int stride_yuy2, |
| uint8_t* dst_u, |
| uint8_t* dst_v, |
| int width) { |
| __asm { |
| push esi |
| push edi |
| mov eax, [esp + 8 + 4] // src_yuy2 |
| mov esi, [esp + 8 + 8] // stride_yuy2 |
| mov edx, [esp + 8 + 12] // dst_u |
| mov edi, [esp + 8 + 16] // dst_v |
| mov ecx, [esp + 8 + 20] // width |
| pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
| psrlw xmm5, 8 |
| sub edi, edx |
| |
| convertloop: |
| movdqu xmm0, [eax] |
| movdqu xmm1, [eax + 16] |
| movdqu xmm2, [eax + esi] |
| movdqu xmm3, [eax + esi + 16] |
| lea eax, [eax + 32] |
| pavgb xmm0, xmm2 |
| pavgb xmm1, xmm3 |
| psrlw xmm0, 8 // YUYV -> UVUV |
| psrlw xmm1, 8 |
| packuswb xmm0, xmm1 |
| movdqa xmm1, xmm0 |
| pand xmm0, xmm5 // U |
| packuswb xmm0, xmm0 |
| psrlw xmm1, 8 // V |
| packuswb xmm1, xmm1 |
| movq qword ptr [edx], xmm0 |
| movq qword ptr [edx + edi], xmm1 |
| lea edx, [edx + 8] |
| sub ecx, 16 |
| jg convertloop |
| |
| pop edi |
| pop esi |
| ret |
| } |
| } |
| |
| __declspec(naked) void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, |
| uint8_t* dst_u, |
| uint8_t* dst_v, |
| int width) { |
| __asm { |
| push edi |
| mov eax, [esp + 4 + 4] // src_yuy2 |
| mov edx, [esp + 4 + 8] // dst_u |
| mov edi, [esp + 4 + 12] // dst_v |
| mov ecx, [esp + 4 + 16] // width |
| pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
| psrlw xmm5, 8 |
| sub edi, edx |
| |
| convertloop: |
| movdqu xmm0, [eax] |
| movdqu xmm1, [eax + 16] |
| lea eax, [eax + 32] |
| psrlw xmm0, 8 // YUYV -> UVUV |
| psrlw xmm1, 8 |
| packuswb xmm0, xmm1 |
| movdqa xmm1, xmm0 |
| pand xmm0, xmm5 // U |
| packuswb xmm0, xmm0 |
| psrlw xmm1, 8 // V |
| packuswb xmm1, xmm1 |
| movq qword ptr [edx], xmm0 |
| movq qword ptr [edx + edi], xmm1 |
| lea edx, [edx + 8] |
| sub ecx, 16 |
| jg convertloop |
| |
| pop edi |
| ret |
| } |
| } |
| |
| __declspec(naked) void UYVYToYRow_SSE2(const uint8_t* src_uyvy, |
| uint8_t* dst_y, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] // src_uyvy |
| mov edx, [esp + 8] // dst_y |
| mov ecx, [esp + 12] // width |
| |
| convertloop: |
| movdqu xmm0, [eax] |
| movdqu xmm1, [eax + 16] |
| lea eax, [eax + 32] |
| psrlw xmm0, 8 // odd bytes are Y |
| psrlw xmm1, 8 |
| packuswb xmm0, xmm1 |
| movdqu [edx], xmm0 |
| lea edx, [edx + 16] |
| sub ecx, 16 |
| jg convertloop |
| ret |
| } |
| } |
| |
| __declspec(naked) void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, |
| int stride_uyvy, |
| uint8_t* dst_u, |
| uint8_t* dst_v, |
| int width) { |
| __asm { |
| push esi |
| push edi |
mov eax, [esp + 8 + 4] // src_uyvy
mov esi, [esp + 8 + 8] // stride_uyvy
| mov edx, [esp + 8 + 12] // dst_u |
| mov edi, [esp + 8 + 16] // dst_v |
| mov ecx, [esp + 8 + 20] // width |
| pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
| psrlw xmm5, 8 |
| sub edi, edx |
| |
| convertloop: |
| movdqu xmm0, [eax] |
| movdqu xmm1, [eax + 16] |
| movdqu xmm2, [eax + esi] |
| movdqu xmm3, [eax + esi + 16] |
| lea eax, [eax + 32] |
| pavgb xmm0, xmm2 |
| pavgb xmm1, xmm3 |
| pand xmm0, xmm5 // UYVY -> UVUV |
| pand xmm1, xmm5 |
| packuswb xmm0, xmm1 |
| movdqa xmm1, xmm0 |
| pand xmm0, xmm5 // U |
| packuswb xmm0, xmm0 |
| psrlw xmm1, 8 // V |
| packuswb xmm1, xmm1 |
| movq qword ptr [edx], xmm0 |
| movq qword ptr [edx + edi], xmm1 |
| lea edx, [edx + 8] |
| sub ecx, 16 |
| jg convertloop |
| |
| pop edi |
| pop esi |
| ret |
| } |
| } |
| |
| __declspec(naked) void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, |
| uint8_t* dst_u, |
| uint8_t* dst_v, |
| int width) { |
| __asm { |
| push edi |
mov eax, [esp + 4 + 4] // src_uyvy
| mov edx, [esp + 4 + 8] // dst_u |
| mov edi, [esp + 4 + 12] // dst_v |
| mov ecx, [esp + 4 + 16] // width |
| pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
| psrlw xmm5, 8 |
| sub edi, edx |
| |
| convertloop: |
| movdqu xmm0, [eax] |
| movdqu xmm1, [eax + 16] |
| lea eax, [eax + 32] |
| pand xmm0, xmm5 // UYVY -> UVUV |
| pand xmm1, xmm5 |
| packuswb xmm0, xmm1 |
| movdqa xmm1, xmm0 |
| pand xmm0, xmm5 // U |
| packuswb xmm0, xmm0 |
| psrlw xmm1, 8 // V |
| packuswb xmm1, xmm1 |
| movq qword ptr [edx], xmm0 |
| movq qword ptr [edx + edi], xmm1 |
| lea edx, [edx + 8] |
| sub ecx, 16 |
| jg convertloop |
| |
| pop edi |
| ret |
| } |
| } |
| #endif // HAS_YUY2TOYROW_SSE2 |
| |
| #ifdef HAS_BLENDPLANEROW_SSSE3 |
| // Blend 8 pixels at a time. |
// unsigned version of math:
//   dst = (src0 * alpha + src1 * (255 - alpha) + 255) / 256
// signed version of math (used below via pmaddubsw):
//   dst = ((src0 - 128) * alpha + (src1 - 128) * (255 - alpha) + 32768 + 127) / 256
| __declspec(naked) void BlendPlaneRow_SSSE3(const uint8_t* src0, |
| const uint8_t* src1, |
| const uint8_t* alpha, |
| uint8_t* dst, |
| int width) { |
| __asm { |
| push esi |
| push edi |
| pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 |
| psllw xmm5, 8 |
| mov eax, 0x80808080 // 128 for biasing image to signed. |
| movd xmm6, eax |
| pshufd xmm6, xmm6, 0x00 |
| |
| mov eax, 0x807f807f // 32768 + 127 for unbias and round. |
| movd xmm7, eax |
| pshufd xmm7, xmm7, 0x00 |
| mov eax, [esp + 8 + 4] // src0 |
| mov edx, [esp + 8 + 8] // src1 |
| mov esi, [esp + 8 + 12] // alpha |
| mov edi, [esp + 8 + 16] // dst |
| mov ecx, [esp + 8 + 20] // width |
| sub eax, esi |
| sub edx, esi |
| sub edi, esi |
| |
| // 8 pixel loop. |
| convertloop8: |
| movq xmm0, qword ptr [esi] // alpha |
| punpcklbw xmm0, xmm0 |
| pxor xmm0, xmm5 // a, 255-a |
| movq xmm1, qword ptr [eax + esi] // src0 |
| movq xmm2, qword ptr [edx + esi] // src1 |
| punpcklbw xmm1, xmm2 |
| psubb xmm1, xmm6 // bias src0/1 - 128 |
| pmaddubsw xmm0, xmm1 |
| paddw xmm0, xmm7 // unbias result - 32768 and round. |
| psrlw xmm0, 8 |
| packuswb xmm0, xmm0 |
| movq qword ptr [edi + esi], xmm0 |
| lea esi, [esi + 8] |
| sub ecx, 8 |
| jg convertloop8 |
| |
| pop edi |
| pop esi |
| ret |
| } |
| } |
| #endif // HAS_BLENDPLANEROW_SSSE3 |
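
// The BlendPlaneRow kernels (SSSE3 above, AVX2 below) implement the unsigned
// formula quoted in their comments, dst = (src0 * alpha + src1 * (255 - alpha)
// + 255) / 256, via the signed pmaddubsw form.  A direct scalar sketch of the
// unsigned version follows; the helper name is an illustrative assumption:
static void BlendPlaneRow_Sketch(const uint8_t* src0,
                                 const uint8_t* src1,
                                 const uint8_t* alpha,
                                 uint8_t* dst,
                                 int width) {
  int x;
  for (x = 0; x < width; ++x) {
    int a = alpha[x];
    dst[x] = (uint8_t)((src0[x] * a + src1[x] * (255 - a) + 255) >> 8);
  }
}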
| |
| #ifdef HAS_BLENDPLANEROW_AVX2 |
| // Blend 32 pixels at a time. |
// unsigned version of math:
//   dst = (src0 * alpha + src1 * (255 - alpha) + 255) / 256
// signed version of math (used below via pmaddubsw):
//   dst = ((src0 - 128) * alpha + (src1 - 128) * (255 - alpha) + 32768 + 127) / 256
| __declspec(naked) void BlendPlaneRow_AVX2(const uint8_t* src0, |
| const uint8_t* src1, |
| const uint8_t* alpha, |
| uint8_t* dst, |
| int width) { |
| __asm { |
| push esi |
| push edi |
| vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00 |
| vpsllw ymm5, ymm5, 8 |
| mov eax, 0x80808080 // 128 for biasing image to signed. |
| vmovd xmm6, eax |
| vbroadcastss ymm6, xmm6 |
| mov eax, 0x807f807f // 32768 + 127 for unbias and round. |
| vmovd xmm7, eax |
| vbroadcastss ymm7, xmm7 |
| mov eax, [esp + 8 + 4] // src0 |
| mov edx, [esp + 8 + 8] // src1 |
| mov esi, [esp + 8 + 12] // alpha |
| mov edi, [esp + 8 + 16] // dst |
| mov ecx, [esp + 8 + 20] // width |
| sub eax, esi |
| sub edx, esi |
| sub edi, esi |
| |
| // 32 pixel loop. |
| convertloop32: |
| vmovdqu ymm0, [esi] // alpha |
| vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31 |
| vpunpcklbw ymm0, ymm0, ymm0 // 0..7, 16..23 |
| vpxor ymm3, ymm3, ymm5 // a, 255-a |
| vpxor ymm0, ymm0, ymm5 // a, 255-a |
| vmovdqu ymm1, [eax + esi] // src0 |
| vmovdqu ymm2, [edx + esi] // src1 |
| vpunpckhbw ymm4, ymm1, ymm2 |
| vpunpcklbw ymm1, ymm1, ymm2 |
| vpsubb ymm4, ymm4, ymm6 // bias src0/1 - 128 |
| vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128 |
| vpmaddubsw ymm3, ymm3, ymm4 |
| vpmaddubsw ymm0, ymm0, ymm1 |
| vpaddw ymm3, ymm3, ymm7 // unbias result - 32768 and round. |
| vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round. |
| vpsrlw ymm3, ymm3, 8 |
| vpsrlw ymm0, ymm0, 8 |
| vpackuswb ymm0, ymm0, ymm3 |
| vmovdqu [edi + esi], ymm0 |
| lea esi, [esi + 32] |
| sub ecx, 32 |
| jg convertloop32 |
| |
| pop edi |
| pop esi |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_BLENDPLANEROW_AVX2 |
| |
| #ifdef HAS_ARGBBLENDROW_SSSE3 |
| // Shuffle table for isolating alpha. |
| static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, |
| 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80}; |
| |
| // Blend 4 pixels at a time. |
| __declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb0, |
| const uint8_t* src_argb1, |
| uint8_t* dst_argb, |
| int width) { |
| __asm { |
| push esi |
| mov eax, [esp + 4 + 4] // src_argb0 |
| mov esi, [esp + 4 + 8] // src_argb1 |
| mov edx, [esp + 4 + 12] // dst_argb |
| mov ecx, [esp + 4 + 16] // width |
| pcmpeqb xmm7, xmm7 // generate constant 0x0001 |
| psrlw xmm7, 15 |
| pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff |
| psrlw xmm6, 8 |
| pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 |
| psllw xmm5, 8 |
| pcmpeqb xmm4, xmm4 // generate mask 0xff000000 |
| pslld xmm4, 24 |
| sub ecx, 4 |
| jl convertloop4b // less than 4 pixels? |
| |
| // 4 pixel loop. |
| convertloop4: |
| movdqu xmm3, [eax] // src argb |
| lea eax, [eax + 16] |
| movdqa xmm0, xmm3 // src argb |
| pxor xmm3, xmm4 // ~alpha |
| movdqu xmm2, [esi] // _r_b |
| pshufb xmm3, xmmword ptr kShuffleAlpha // alpha |
| pand xmm2, xmm6 // _r_b |
| paddw xmm3, xmm7 // 256 - alpha |
| pmullw xmm2, xmm3 // _r_b * alpha |
| movdqu xmm1, [esi] // _a_g |
| lea esi, [esi + 16] |
| psrlw xmm1, 8 // _a_g |
| por xmm0, xmm4 // set alpha to 255 |
| pmullw xmm1, xmm3 // _a_g * alpha |
| psrlw xmm2, 8 // _r_b convert to 8 bits again |
| paddusb xmm0, xmm2 // + src argb |
| pand xmm1, xmm5 // a_g_ convert to 8 bits again |
| paddusb xmm0, xmm1 // + src argb |
| movdqu [edx], xmm0 |
| lea edx, [edx + 16] |
| sub ecx, 4 |
| jge convertloop4 |
| |
| convertloop4b: |
| add ecx, 4 - 1 |
| jl convertloop1b |
| |
| // 1 pixel loop. |
| convertloop1: |
| movd xmm3, [eax] // src argb |
| lea eax, [eax + 4] |
| movdqa xmm0, xmm3 // src argb |
| pxor xmm3, xmm4 // ~alpha |
| movd xmm2, [esi] // _r_b |
| pshufb xmm3, xmmword ptr kShuffleAlpha // alpha |
| pand xmm2, xmm6 // _r_b |
| paddw xmm3, xmm7 // 256 - alpha |
| pmullw xmm2, xmm3 // _r_b * alpha |
| movd xmm1, [esi] // _a_g |
| lea esi, [esi + 4] |
| psrlw xmm1, 8 // _a_g |
| por xmm0, xmm4 // set alpha to 255 |
| pmullw xmm1, xmm3 // _a_g * alpha |
| psrlw xmm2, 8 // _r_b convert to 8 bits again |
| paddusb xmm0, xmm2 // + src argb |
| pand xmm1, xmm5 // a_g_ convert to 8 bits again |
| paddusb xmm0, xmm1 // + src argb |
| movd [edx], xmm0 |
| lea edx, [edx + 4] |
| sub ecx, 1 |
| jge convertloop1 |
| |
| convertloop1b: |
| pop esi |
| ret |
| } |
| } |
| #endif // HAS_ARGBBLENDROW_SSSE3 |
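| |
| // A scalar sketch of the "over" blend computed above: the source pixel is |
| // composited over the destination using the source alpha, and the result |
| // alpha is forced to 255. ARGBBlendRow_C_Sketch and Clamp255_Sketch are |
| // illustrative names, not part of the row API. |
| static int Clamp255_Sketch(int v) { |
|   return v > 255 ? 255 : v; |
| } |
| static void ARGBBlendRow_C_Sketch(const uint8_t* src_argb0, |
|                                   const uint8_t* src_argb1, |
|                                   uint8_t* dst_argb, |
|                                   int width) { |
|   int x; |
|   for (x = 0; x < width; ++x) { |
|     int a = src_argb0[x * 4 + 3]; |
|     int c; |
|     for (c = 0; c < 3; ++c) { |
|       int f = src_argb0[x * 4 + c];  // foreground channel. |
|       int b = src_argb1[x * 4 + c];  // background channel. |
|       dst_argb[x * 4 + c] = |
|           (uint8_t)Clamp255_Sketch(f + ((b * (256 - a)) >> 8)); |
|     } |
|     dst_argb[x * 4 + 3] = 255;  // set alpha to 255. |
|   } |
| } |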
| |
| #ifdef HAS_ARGBATTENUATEROW_SSSE3 |
| // Shuffle table duplicating alpha. |
| static const uvec8 kShuffleAlpha0 = { |
| 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, |
| }; |
| static const uvec8 kShuffleAlpha1 = { |
| 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, |
| 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, |
| }; |
| __declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, |
| uint8_t* dst_argb, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] // src_argb0 |
| mov edx, [esp + 8] // dst_argb |
| mov ecx, [esp + 12] // width |
| pcmpeqb xmm3, xmm3 // generate mask 0xff000000 |
| pslld xmm3, 24 |
| movdqa xmm4, xmmword ptr kShuffleAlpha0 |
| movdqa xmm5, xmmword ptr kShuffleAlpha1 |
| |
| convertloop: |
| movdqu xmm0, [eax] // read 4 pixels |
| pshufb xmm0, xmm4 // isolate first 2 alphas |
| movdqu xmm1, [eax] // read 4 pixels |
| punpcklbw xmm1, xmm1 // first 2 pixel rgbs |
| pmulhuw xmm0, xmm1 // rgb * a |
| movdqu xmm1, [eax] // read 4 pixels |
| pshufb xmm1, xmm5 // isolate next 2 alphas |
| movdqu xmm2, [eax] // read 4 pixels |
| punpckhbw xmm2, xmm2 // next 2 pixel rgbs |
| pmulhuw xmm1, xmm2 // rgb * a |
| movdqu xmm2, [eax] // mask original alpha |
| lea eax, [eax + 16] |
| pand xmm2, xmm3 |
| psrlw xmm0, 8 |
| psrlw xmm1, 8 |
| packuswb xmm0, xmm1 |
| por xmm0, xmm2 // copy original alpha |
| movdqu [edx], xmm0 |
| lea edx, [edx + 16] |
| sub ecx, 4 |
| jg convertloop |
| |
| ret |
| } |
| } |
| #endif // HAS_ARGBATTENUATEROW_SSSE3 |
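| |
| // A scalar sketch mirroring the fixed point math of the SSSE3 loop above: |
| // each color channel and its alpha are widened to 16 bits as c * 257, the |
| // high word of the product is taken, and the result is reduced to 8 bits, |
| // approximately c * a / 255. ARGBAttenuateRow_C_Sketch is illustrative only. |
| static void ARGBAttenuateRow_C_Sketch(const uint8_t* src_argb, |
|                                       uint8_t* dst_argb, |
|                                       int width) { |
|   int x; |
|   for (x = 0; x < width; ++x) { |
|     uint32_t a = src_argb[x * 4 + 3]; |
|     int c; |
|     for (c = 0; c < 3; ++c) { |
|       uint32_t v = src_argb[x * 4 + c]; |
|       dst_argb[x * 4 + c] = (uint8_t)((((v * 257) * (a * 257)) >> 16) >> 8); |
|     } |
|     dst_argb[x * 4 + 3] = (uint8_t)a;  // original alpha is preserved. |
|   } |
| } |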
| |
| #ifdef HAS_ARGBATTENUATEROW_AVX2 |
| // Shuffle table duplicating alpha. |
| static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u, |
| 128u, 128u, 14u, 15u, 14u, 15u, |
| 14u, 15u, 128u, 128u}; |
| __declspec(naked) void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, |
| uint8_t* dst_argb, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] // src_argb0 |
| mov edx, [esp + 8] // dst_argb |
| mov ecx, [esp + 12] // width |
| sub edx, eax |
| vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2 |
| vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 |
| vpslld ymm5, ymm5, 24 |
| |
| convertloop: |
| vmovdqu ymm6, [eax] // read 8 pixels. |
| vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. |
| vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. |
| vpshufb ymm2, ymm0, ymm4 // low 4 alphas |
| vpshufb ymm3, ymm1, ymm4 // high 4 alphas |
| vpmulhuw ymm0, ymm0, ymm2 // rgb * a |
| vpmulhuw ymm1, ymm1, ymm3 // rgb * a |
| vpand ymm6, ymm6, ymm5 // isolate alpha |
| vpsrlw ymm0, ymm0, 8 |
| vpsrlw ymm1, ymm1, 8 |
| vpackuswb ymm0, ymm0, ymm1 // unmutated. |
| vpor ymm0, ymm0, ymm6 // copy original alpha |
| vmovdqu [eax + edx], ymm0 |
| lea eax, [eax + 32] |
| sub ecx, 8 |
| jg convertloop |
| |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_ARGBATTENUATEROW_AVX2 |
| |
| #ifdef HAS_ARGBUNATTENUATEROW_SSE2 |
| // Unattenuate 4 pixels at a time. |
| __declspec(naked) void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb, |
| uint8_t* dst_argb, |
| int width) { |
| __asm { |
| push ebx |
| push esi |
| push edi |
| mov eax, [esp + 12 + 4] // src_argb |
| mov edx, [esp + 12 + 8] // dst_argb |
| mov ecx, [esp + 12 + 12] // width |
| lea ebx, fixed_invtbl8 |
| |
| convertloop: |
| movdqu xmm0, [eax] // read 4 pixels |
| movzx esi, byte ptr [eax + 3] // first alpha |
| movzx edi, byte ptr [eax + 7] // second alpha |
| punpcklbw xmm0, xmm0 // first 2 |
| movd xmm2, dword ptr [ebx + esi * 4] |
| movd xmm3, dword ptr [ebx + edi * 4] |
| pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a |
| pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words |
| movlhps xmm2, xmm3 |
| pmulhuw xmm0, xmm2 // rgb * a |
| |
| movdqu xmm1, [eax] // read 4 pixels |
| movzx esi, byte ptr [eax + 11] // third alpha |
| movzx edi, byte ptr [eax + 15] // fourth alpha |
| punpckhbw xmm1, xmm1 // next 2 |
| movd xmm2, dword ptr [ebx + esi * 4] |
| movd xmm3, dword ptr [ebx + edi * 4] |
| pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words |
| pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words |
| movlhps xmm2, xmm3 |
| pmulhuw xmm1, xmm2 // rgb * a |
| lea eax, [eax + 16] |
| packuswb xmm0, xmm1 |
| movdqu [edx], xmm0 |
| lea edx, [edx + 16] |
| sub ecx, 4 |
| jg convertloop |
| |
| pop edi |
| pop esi |
| pop ebx |
| ret |
| } |
| } |
| #endif // HAS_ARGBUNATTENUATEROW_SSE2 |
| |
| #ifdef HAS_ARGBUNATTENUATEROW_AVX2 |
| // Shuffle table duplicating alpha. |
| static const uvec8 kUnattenShuffleAlpha_AVX2 = { |
| 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u}; |
| // TODO(fbarchard): Enable USE_GATHER for future hardware if faster. |
| // USE_GATHER is not on by default, due to being a slow instruction. |
| #ifdef USE_GATHER |
| __declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, |
| uint8_t* dst_argb, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] // src_argb0 |
| mov edx, [esp + 8] // dst_argb |
| mov ecx, [esp + 12] // width |
| sub edx, eax |
| vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2 |
| |
| convertloop: |
| vmovdqu ymm6, [eax] // read 8 pixels. |
| vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather. |
| vpsrld ymm2, ymm6, 24 // alpha in low 8 bits. |
| vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. |
| vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. |
| vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a |
| vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a |
| vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated. |
| vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a |
| vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas |
| vpmulhuw ymm0, ymm0, ymm2 // rgb * ia |
| vpmulhuw ymm1, ymm1, ymm3 // rgb * ia |
| vpackuswb ymm0, ymm0, ymm1 // unmutated. |
| vmovdqu [eax + edx], ymm0 |
| lea eax, [eax + 32] |
| sub ecx, 8 |
| jg convertloop |
| |
| vzeroupper |
| ret |
| } |
| } |
| #else // USE_GATHER |
| __declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, |
| uint8_t* dst_argb, |
| int width) { |
| __asm { |
| |
| push ebx |
| push esi |
| push edi |
| mov eax, [esp + 12 + 4] // src_argb |
| mov edx, [esp + 12 + 8] // dst_argb |
| mov ecx, [esp + 12 + 12] // width |
| sub edx, eax |
| lea ebx, fixed_invtbl8 |
| vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2 |
| |
| convertloop: |
| // replace VPGATHER |
| movzx esi, byte ptr [eax + 3] // alpha0 |
| movzx edi, byte ptr [eax + 7] // alpha1 |
| vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a0] |
| vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a1] |
| movzx esi, byte ptr [eax + 11] // alpha2 |
| movzx edi, byte ptr [eax + 15] // alpha3 |
| vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0] |
| vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a2] |
| vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a3] |
| movzx esi, byte ptr [eax + 19] // alpha4 |
| movzx edi, byte ptr [eax + 23] // alpha5 |
| vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2] |
| vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a4] |
| vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a5] |
| movzx esi, byte ptr [eax + 27] // alpha6 |
| movzx edi, byte ptr [eax + 31] // alpha7 |
| vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4] |
| vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a6] |
| vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a7] |
| vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6] |
| vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0] |
| vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4] |
| vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0] |
| // end of VPGATHER |
| |
| vmovdqu ymm6, [eax] // read 8 pixels. |
| vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. |
| vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. |
| vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a |
| vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated. |
| vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a |
| vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas |
| vpmulhuw ymm0, ymm0, ymm2 // rgb * ia |
| vpmulhuw ymm1, ymm1, ymm3 // rgb * ia |
| vpackuswb ymm0, ymm0, ymm1 // unmutated. |
| vmovdqu [eax + edx], ymm0 |
| lea eax, [eax + 32] |
| sub ecx, 8 |
| jg convertloop |
| |
| pop edi |
| pop esi |
| pop ebx |
| vzeroupper |
| ret |
| } |
| } |
| #endif // USE_GATHER |
| #endif // HAS_ARGBUNATTENUATEROW_AVX2 |
| |
| #ifdef HAS_ARGBGRAYROW_SSSE3 |
| // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels. |
| __declspec(naked) void ARGBGrayRow_SSSE3(const uint8_t* src_argb, |
| uint8_t* dst_argb, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] /* src_argb */ |
| mov edx, [esp + 8] /* dst_argb */ |
| mov ecx, [esp + 12] /* width */ |
| movdqa xmm4, xmmword ptr kARGBToYJ |
| movdqa xmm5, xmmword ptr kAddYJ64 |
| |
| convertloop: |
| movdqu xmm0, [eax] // G |
| movdqu xmm1, [eax + 16] |
| pmaddubsw xmm0, xmm4 |
| pmaddubsw xmm1, xmm4 |
| phaddw xmm0, xmm1 |
| paddw xmm0, xmm5 // Add .5 for rounding. |
| psrlw xmm0, 7 |
| packuswb xmm0, xmm0 // 8 G bytes |
| movdqu xmm2, [eax] // A |
| movdqu xmm3, [eax + 16] |
| lea eax, [eax + 32] |
| psrld xmm2, 24 |
| psrld xmm3, 24 |
| packuswb xmm2, xmm3 |
| packuswb xmm2, xmm2 // 8 A bytes |
| movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA |
| punpcklbw xmm0, xmm0 // 8 GG words |
| punpcklbw xmm3, xmm2 // 8 GA words |
| movdqa xmm1, xmm0 |
| punpcklwd xmm0, xmm3 // GGGA first 4 |
| punpckhwd xmm1, xmm3 // GGGA next 4 |
| movdqu [edx], xmm0 |
| movdqu [edx + 16], xmm1 |
| lea edx, [edx + 32] |
| sub ecx, 8 |
| jg convertloop |
| ret |
| } |
| } |
| #endif // HAS_ARGBGRAYROW_SSSE3 |
| |
| #ifdef HAS_ARGBSEPIAROW_SSSE3 |
| // b = (r * 35 + g * 68 + b * 17) >> 7 |
| // g = (r * 45 + g * 88 + b * 22) >> 7 |
| // r = (r * 50 + g * 98 + b * 24) >> 7 |
| // Constant for ARGB color to sepia tone. |
| static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0, |
| 17, 68, 35, 0, 17, 68, 35, 0}; |
| |
| static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0, |
| 22, 88, 45, 0, 22, 88, 45, 0}; |
| |
| static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0, |
| 24, 98, 50, 0, 24, 98, 50, 0}; |
| |
| // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. |
| __declspec(naked) void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) { |
| __asm { |
| mov eax, [esp + 4] /* dst_argb */ |
| mov ecx, [esp + 8] /* width */ |
| movdqa xmm2, xmmword ptr kARGBToSepiaB |
| movdqa xmm3, xmmword ptr kARGBToSepiaG |
| movdqa xmm4, xmmword ptr kARGBToSepiaR |
| |
| convertloop: |
| movdqu xmm0, [eax] // B |
| movdqu xmm6, [eax + 16] |
| pmaddubsw xmm0, xmm2 |
| pmaddubsw xmm6, xmm2 |
| phaddw xmm0, xmm6 |
| psrlw xmm0, 7 |
| packuswb xmm0, xmm0 // 8 B values |
| movdqu xmm5, [eax] // G |
| movdqu xmm1, [eax + 16] |
| pmaddubsw xmm5, xmm3 |
| pmaddubsw xmm1, xmm3 |
| phaddw xmm5, xmm1 |
| psrlw xmm5, 7 |
| packuswb xmm5, xmm5 // 8 G values |
| punpcklbw xmm0, xmm5 // 8 BG values |
| movdqu xmm5, [eax] // R |
| movdqu xmm1, [eax + 16] |
| pmaddubsw xmm5, xmm4 |
| pmaddubsw xmm1, xmm4 |
| phaddw xmm5, xmm1 |
| psrlw xmm5, 7 |
| packuswb xmm5, xmm5 // 8 R values |
| movdqu xmm6, [eax] // A |
| movdqu xmm1, [eax + 16] |
| psrld xmm6, 24 |
| psrld xmm1, 24 |
| packuswb xmm6, xmm1 |
| packuswb xmm6, xmm6 // 8 A values |
| punpcklbw xmm5, xmm6 // 8 RA values |
| movdqa xmm1, xmm0 // Weave BG, RA together |
| punpcklwd xmm0, xmm5 // BGRA first 4 |
| punpckhwd xmm1, xmm5 // BGRA next 4 |
| movdqu [eax], xmm0 |
| movdqu [eax + 16], xmm1 |
| lea eax, [eax + 32] |
| sub ecx, 8 |
| jg convertloop |
| ret |
| } |
| } |
| #endif // HAS_ARGBSEPIAROW_SSSE3 |
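| |
| // A scalar sketch of the sepia formulas above; each channel is clamped to |
| // 255 the same way packuswb saturates, and alpha is preserved. |
| // ARGBSepiaRow_C_Sketch is an illustrative name only. |
| static void ARGBSepiaRow_C_Sketch(uint8_t* dst_argb, int width) { |
|   int x; |
|   for (x = 0; x < width; ++x) { |
|     int b = dst_argb[x * 4 + 0]; |
|     int g = dst_argb[x * 4 + 1]; |
|     int r = dst_argb[x * 4 + 2]; |
|     int sb = (r * 35 + g * 68 + b * 17) >> 7; |
|     int sg = (r * 45 + g * 88 + b * 22) >> 7; |
|     int sr = (r * 50 + g * 98 + b * 24) >> 7; |
|     dst_argb[x * 4 + 0] = (uint8_t)(sb > 255 ? 255 : sb); |
|     dst_argb[x * 4 + 1] = (uint8_t)(sg > 255 ? 255 : sg); |
|     dst_argb[x * 4 + 2] = (uint8_t)(sr > 255 ? 255 : sr); |
|   } |
| } |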
| |
| #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 |
| // Transform 8 ARGB pixels (32 bytes) with color matrix. |
| // Same as Sepia except matrix is provided. |
| // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R |
| // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd. |
| __declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb, |
| uint8_t* dst_argb, |
| const int8_t* matrix_argb, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] /* src_argb */ |
| mov edx, [esp + 8] /* dst_argb */ |
| mov ecx, [esp + 12] /* matrix_argb */ |
| movdqu xmm5, [ecx] |
| pshufd xmm2, xmm5, 0x00 |
| pshufd xmm3, xmm5, 0x55 |
| pshufd xmm4, xmm5, 0xaa |
| pshufd xmm5, xmm5, 0xff |
| mov ecx, [esp + 16] /* width */ |
| |
| convertloop: |
| movdqu xmm0, [eax] // B |
| movdqu xmm7, [eax + 16] |
| pmaddubsw xmm0, xmm2 |
| pmaddubsw xmm7, xmm2 |
| movdqu xmm6, [eax] // G |
| movdqu xmm1, [eax + 16] |
| pmaddubsw xmm6, xmm3 |
| pmaddubsw xmm1, xmm3 |
| phaddsw xmm0, xmm7 // B |
| phaddsw xmm6, xmm1 // G |
| psraw xmm0, 6 // B |
| psraw xmm6, 6 // G |
| packuswb xmm0, xmm0 // 8 B values |
| packuswb xmm6, xmm6 // 8 G values |
| punpcklbw xmm0, xmm6 // 8 BG values |
| movdqu xmm1, [eax] // R |
| movdqu xmm7, [eax + 16] |
| pmaddubsw xmm1, xmm4 |
| pmaddubsw xmm7, xmm4 |
| phaddsw xmm1, xmm7 // R |
| movdqu xmm6, [eax] // A |
| movdqu xmm7, [eax + 16] |
| pmaddubsw xmm6, xmm5 |
| pmaddubsw xmm7, xmm5 |
| phaddsw xmm6, xmm7 // A |
| psraw xmm1, 6 // R |
| psraw xmm6, 6 // A |
| packuswb xmm1, xmm1 // 8 R values |
| packuswb xmm6, xmm6 // 8 A values |
| punpcklbw xmm1, xmm6 // 8 RA values |
| movdqa xmm6, xmm0 // Weave BG, RA together |
| punpcklwd xmm0, xmm1 // BGRA first 4 |
| punpckhwd xmm6, xmm1 // BGRA next 4 |
| movdqu [edx], xmm0 |
| movdqu [edx + 16], xmm6 |
| lea eax, [eax + 32] |
| lea edx, [edx + 32] |
| sub ecx, 8 |
| jg convertloop |
| ret |
| } |
| } |
| #endif // HAS_ARGBCOLORMATRIXROW_SSSE3 |
| |
| #ifdef HAS_ARGBQUANTIZEROW_SSE2 |
| // Quantize 4 ARGB pixels (16 bytes). |
| __declspec(naked) void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, |
| int scale, |
| int interval_size, |
| int interval_offset, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] /* dst_argb */ |
| movd xmm2, [esp + 8] /* scale */ |
| movd xmm3, [esp + 12] /* interval_size */ |
| movd xmm4, [esp + 16] /* interval_offset */ |
| mov ecx, [esp + 20] /* width */ |
| pshuflw xmm2, xmm2, 040h |
| pshufd xmm2, xmm2, 044h |
| pshuflw xmm3, xmm3, 040h |
| pshufd xmm3, xmm3, 044h |
| pshuflw xmm4, xmm4, 040h |
| pshufd xmm4, xmm4, 044h |
| pxor xmm5, xmm5 // constant 0 |
| pcmpeqb xmm6, xmm6 // generate mask 0xff000000 |
| pslld xmm6, 24 |
| |
| convertloop: |
| movdqu xmm0, [eax] // read 4 pixels |
| punpcklbw xmm0, xmm5 // first 2 pixels |
| pmulhuw xmm0, xmm2 // pixel * scale >> 16 |
| movdqu xmm1, [eax] // read 4 pixels |
| punpckhbw xmm1, xmm5 // next 2 pixels |
| pmulhuw xmm1, xmm2 |
| pmullw xmm0, xmm3 // * interval_size |
| movdqu xmm7, [eax] // read 4 pixels |
| pmullw xmm1, xmm3 |
| pand xmm7, xmm6 // mask alpha |
| paddw xmm0, xmm4 // + interval_offset, typically interval_size / 2 |
| paddw xmm1, xmm4 |
| packuswb xmm0, xmm1 |
| por xmm0, xmm7 |
| movdqu [eax], xmm0 |
| lea eax, [eax + 16] |
| sub ecx, 4 |
| jg convertloop |
| ret |
| } |
| } |
| #endif // HAS_ARGBQUANTIZEROW_SSE2 |
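| |
| // A scalar sketch of the quantize step above, following the comments in the |
| // SSE2 loop: v = (v * scale >> 16) * interval_size + interval_offset, with |
| // alpha left untouched. ARGBQuantizeRow_C_Sketch is an illustrative name |
| // only. |
| static void ARGBQuantizeRow_C_Sketch(uint8_t* dst_argb, |
|                                      int scale, |
|                                      int interval_size, |
|                                      int interval_offset, |
|                                      int width) { |
|   int x; |
|   for (x = 0; x < width; ++x) { |
|     int c; |
|     for (c = 0; c < 3; ++c) { |
|       int v = dst_argb[x * 4 + c]; |
|       int q = (v * scale >> 16) * interval_size + interval_offset; |
|       dst_argb[x * 4 + c] = (uint8_t)(q > 255 ? 255 : q); |
|     } |
|   } |
| } |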
| |
| #ifdef HAS_ARGBSHADEROW_SSE2 |
| // Shade 4 pixels at a time by specified value. |
| __declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb, |
| uint8_t* dst_argb, |
| int width, |
| uint32_t value) { |
| __asm { |
| mov eax, [esp + 4] // src_argb |
| mov edx, [esp + 8] // dst_argb |
| mov ecx, [esp + 12] // width |
| movd xmm2, [esp + 16] // value |
| punpcklbw xmm2, xmm2 |
| punpcklqdq xmm2, xmm2 |
| |
| convertloop: |
| movdqu xmm0, [eax] // read 4 pixels |
| lea eax, [eax + 16] |
| movdqa xmm1, xmm0 |
| punpcklbw xmm0, xmm0 // first 2 |
| punpckhbw xmm1, xmm1 // next 2 |
| pmulhuw xmm0, xmm2 // argb * value |
| pmulhuw xmm1, xmm2 // argb * value |
| psrlw xmm0, 8 |
| psrlw xmm1, 8 |
| packuswb xmm0, xmm1 |
| movdqu [edx], xmm0 |
| lea edx, [edx + 16] |
| sub ecx, 4 |
| jg convertloop |
| |
| ret |
| } |
| } |
| #endif // HAS_ARGBSHADEROW_SSE2 |
| |
| #ifdef HAS_ARGBMULTIPLYROW_SSE2 |
| // Multiply 2 rows of ARGB pixels together, 4 pixels at a time. |
| __declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0, |
| const uint8_t* src_argb1, |
| uint8_t* dst_argb, |
| int width) { |
| __asm { |
| push esi |
| mov eax, [esp + 4 + 4] // src_argb0 |
| mov esi, [esp + 4 + 8] // src_argb1 |
| mov edx, [esp + 4 + 12] // dst_argb |
| mov ecx, [esp + 4 + 16] // width |
| pxor xmm5, xmm5 // constant 0 |
| |
| convertloop: |
| movdqu xmm0, [eax] // read 4 pixels from src_argb0 |
| movdqu xmm2, [esi] // read 4 pixels from src_argb1 |
| movdqu xmm1, xmm0 |
| movdqu xmm3, xmm2 |
| punpcklbw xmm0, xmm0 // first 2 |
| punpckhbw xmm1, xmm1 // next 2 |
| punpcklbw xmm2, xmm5 // first 2 |
| punpckhbw xmm3, xmm5 // next 2 |
| pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2 |
| pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2 |
| lea eax, [eax + 16] |
| lea esi, [esi + 16] |
| packuswb xmm0, xmm1 |
| movdqu [edx], xmm0 |
| lea edx, [edx + 16] |
| sub ecx, 4 |
| jg convertloop |
| |
| pop esi |
| ret |
| } |
| } |
| #endif // HAS_ARGBMULTIPLYROW_SSE2 |
| |
| #ifdef HAS_ARGBADDROW_SSE2 |
| // Add 2 rows of ARGB pixels together, 4 pixels at a time. |
| // TODO(fbarchard): Port this to posix, neon and other math functions. |
| __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0, |
| const uint8_t* src_argb1, |
| uint8_t* dst_argb, |
| int width) { |
| __asm { |
| push esi |
| mov eax, [esp + 4 + 4] // src_argb0 |
| mov esi, [esp + 4 + 8] // src_argb1 |
| mov edx, [esp + 4 + 12] // dst_argb |
| mov ecx, [esp + 4 + 16] // width |
| |
| sub ecx, 4 |
| jl convertloop49 |
| |
| convertloop4: |
| movdqu xmm0, [eax] // read 4 pixels from src_argb0 |
| lea eax, [eax + 16] |
| movdqu xmm1, [esi] // read 4 pixels from src_argb1 |
| lea esi, [esi + 16] |
| paddusb xmm0, xmm1 // src_argb0 + src_argb1 |
| movdqu [edx], xmm0 |
| lea edx, [edx + 16] |
| sub ecx, 4 |
| jge convertloop4 |
| |
| convertloop49: |
| add ecx, 4 - 1 |
| jl convertloop19 |
| |
| convertloop1: |
| movd xmm0, [eax] // read 1 pixel from src_argb0 |
| lea eax, [eax + 4] |
| movd xmm1, [esi] // read 1 pixel from src_argb1 |
| lea esi, [esi + 4] |
| paddusb xmm0, xmm1 // src_argb0 + src_argb1 |
| movd [edx], xmm0 |
| lea edx, [edx + 4] |
| sub ecx, 1 |
| jge convertloop1 |
| |
| convertloop19: |
| pop esi |
| ret |
| } |
| } |
| #endif // HAS_ARGBADDROW_SSE2 |
| |
| #ifdef HAS_ARGBSUBTRACTROW_SSE2 |
| // Subtract 2 rows of ARGB pixels, 4 pixels at a time. |
| __declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb0, |
| const uint8_t* src_argb1, |
| uint8_t* dst_argb, |
| int width) { |
| __asm { |
| push esi |
| mov eax, [esp + 4 + 4] // src_argb0 |
| mov esi, [esp + 4 + 8] // src_argb1 |
| mov edx, [esp + 4 + 12] // dst_argb |
| mov ecx, [esp + 4 + 16] // width |
| |
| convertloop: |
| movdqu xmm0, [eax] // read 4 pixels from src_argb0 |
| lea eax, [eax + 16] |
| movdqu xmm1, [esi] // read 4 pixels from src_argb1 |
| lea esi, [esi + 16] |
| psubusb xmm0, xmm1 // src_argb0 - src_argb1 |
| movdqu [edx], xmm0 |
| lea edx, [edx + 16] |
| sub ecx, 4 |
| jg convertloop |
| |
| pop esi |
| ret |
| } |
| } |
| #endif // HAS_ARGBSUBTRACTROW_SSE2 |
| |
| #ifdef HAS_ARGBMULTIPLYROW_AVX2 |
| // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. |
| __declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, |
| const uint8_t* src_argb1, |
| uint8_t* dst_argb, |
| int width) { |
| __asm { |
| push esi |
| mov eax, [esp + 4 + 4] // src_argb0 |
| mov esi, [esp + 4 + 8] // src_argb1 |
| mov edx, [esp + 4 + 12] // dst_argb |
| mov ecx, [esp + 4 + 16] // width |
| vpxor ymm5, ymm5, ymm5 // constant 0 |
| |
| convertloop: |
| vmovdqu ymm1, [eax] // read 8 pixels from src_argb0 |
| lea eax, [eax + 32] |
| vmovdqu ymm3, [esi] // read 8 pixels from src_argb1 |
| lea esi, [esi + 32] |
| vpunpcklbw ymm0, ymm1, ymm1 // low 4 |
| vpunpckhbw ymm1, ymm1, ymm1 // high 4 |
| vpunpcklbw ymm2, ymm3, ymm5 // low 4 |
| vpunpckhbw ymm3, ymm3, ymm5 // high 4 |
| vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4 |
| vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4 |
| vpackuswb ymm0, ymm0, ymm1 |
| vmovdqu [edx], ymm0 |
| lea edx, [edx + 32] |
| sub ecx, 8 |
| jg convertloop |
| |
| pop esi |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_ARGBMULTIPLYROW_AVX2 |
| |
| #ifdef HAS_ARGBADDROW_AVX2 |
| // Add 2 rows of ARGB pixels together, 8 pixels at a time. |
| __declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb0, |
| const uint8_t* src_argb1, |
| uint8_t* dst_argb, |
| int width) { |
| __asm { |
| push esi |
| mov eax, [esp + 4 + 4] // src_argb0 |
| mov esi, [esp + 4 + 8] // src_argb1 |
| mov edx, [esp + 4 + 12] // dst_argb |
| mov ecx, [esp + 4 + 16] // width |
| |
| convertloop: |
| vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 |
| lea eax, [eax + 32] |
| vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1 |
| lea esi, [esi + 32] |
| vmovdqu [edx], ymm0 |
| lea edx, [edx + 32] |
| sub ecx, 8 |
| jg convertloop |
| |
| pop esi |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_ARGBADDROW_AVX2 |
| |
| #ifdef HAS_ARGBSUBTRACTROW_AVX2 |
| // Subtract 2 rows of ARGB pixels, 8 pixels at a time. |
| __declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb0, |
| const uint8_t* src_argb1, |
| uint8_t* dst_argb, |
| int width) { |
| __asm { |
| push esi |
| mov eax, [esp + 4 + 4] // src_argb0 |
| mov esi, [esp + 4 + 8] // src_argb1 |
| mov edx, [esp + 4 + 12] // dst_argb |
| mov ecx, [esp + 4 + 16] // width |
| |
| convertloop: |
| vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 |
| lea eax, [eax + 32] |
| vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1 |
| lea esi, [esi + 32] |
| vmovdqu [edx], ymm0 |
| lea edx, [edx + 32] |
| sub ecx, 8 |
| jg convertloop |
| |
| pop esi |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_ARGBSUBTRACTROW_AVX2 |
| |
| #ifdef HAS_SOBELXROW_SSE2 |
| // SobelX as a matrix is |
| // -1 0 1 |
| // -2 0 2 |
| // -1 0 1 |
| __declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0, |
| const uint8_t* src_y1, |
| const uint8_t* src_y2, |
| uint8_t* dst_sobelx, |
| int width) { |
| __asm { |
| push esi |
| push edi |
| mov eax, [esp + 8 + 4] // src_y0 |
| mov esi, [esp + 8 + 8] // src_y1 |
| mov edi, [esp + 8 + 12] // src_y2 |
| mov edx, [esp + 8 + 16] // dst_sobelx |
| mov ecx, [esp + 8 + 20] // width |
| sub esi, eax |
| sub edi, eax |
| sub edx, eax |
| pxor xmm5, xmm5 // constant 0 |
| |
| convertloop: |
| movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] |
| movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2] |
| punpcklbw xmm0, xmm5 |
| punpcklbw xmm1, xmm5 |
| psubw xmm0, xmm1 |
| movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] |
| movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] |
| punpcklbw xmm1, xmm5 |
| punpcklbw xmm2, xmm5 |
| psubw xmm1, xmm2 |
| movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0] |
| movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2] |
| punpcklbw xmm2, xmm5 |
| punpcklbw xmm3, xmm5 |
| psubw xmm2, xmm3 |
| paddw xmm0, xmm2 |
| paddw xmm0, xmm1 |
| paddw xmm0, xmm1 |
| pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw |
| psubw xmm1, xmm0 |
| pmaxsw xmm0, xmm1 |
| packuswb xmm0, xmm0 |
| movq qword ptr [eax + edx], xmm0 |
| lea eax, [eax + 8] |
| sub ecx, 8 |
| jg convertloop |
| |
| pop edi |
| pop esi |
| ret |
| } |
| } |
| #endif // HAS_SOBELXROW_SSE2 |
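| |
| // A scalar sketch of the SobelX filter above: the 3x3 kernel is applied to |
| // three input rows and the absolute value of the sum is clamped to 255, |
| // matching the pmaxsw/packuswb sequence. SobelXRow_C_Sketch is an |
| // illustrative name only. |
| static void SobelXRow_C_Sketch(const uint8_t* src_y0, |
|                                const uint8_t* src_y1, |
|                                const uint8_t* src_y2, |
|                                uint8_t* dst_sobelx, |
|                                int width) { |
|   int x; |
|   for (x = 0; x < width; ++x) { |
|     int a = src_y0[x] - src_y0[x + 2]; |
|     int b = src_y1[x] - src_y1[x + 2]; |
|     int c = src_y2[x] - src_y2[x + 2]; |
|     int sobel = a + b * 2 + c; |
|     if (sobel < 0) sobel = -sobel; |
|     if (sobel > 255) sobel = 255; |
|     dst_sobelx[x] = (uint8_t)sobel; |
|   } |
| } |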
| |
| #ifdef HAS_SOBELYROW_SSE2 |
| // SobelY as a matrix is |
| // -1 -2 -1 |
| // 0 0 0 |
| // 1 2 1 |
| __declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0, |
| const uint8_t* src_y1, |
| uint8_t* dst_sobely, |
| int width) { |
| __asm { |
| push esi |
| mov eax, [esp + 4 + 4] // src_y0 |
| mov esi, [esp + 4 + 8] // src_y1 |
| mov edx, [esp + 4 + 12] // dst_sobely |
| mov ecx, [esp + 4 + 16] // width |
| sub esi, eax |
| sub edx, eax |
| pxor xmm5, xmm5 // constant 0 |
| |
| convertloop: |
| movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] |
| movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] |
| punpcklbw xmm0, xmm5 |
| punpcklbw xmm1, xmm5 |
| psubw xmm0, xmm1 |
| movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1] |
| movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1] |
| punpcklbw xmm1, xmm5 |
| punpcklbw xmm2, xmm5 |
| psubw xmm1, xmm2 |
| movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2] |
| movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] |
| punpcklbw xmm2, xmm5 |
| punpcklbw xmm3, xmm5 |
| psubw xmm2, xmm3 |
| paddw xmm0, xmm2 |
| paddw xmm0, xmm1 |
| paddw xmm0, xmm1 |
| pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw |
| psubw xmm1, xmm0 |
| pmaxsw xmm0, xmm1 |
| packuswb xmm0, xmm0 |
| movq qword ptr [eax + edx], xmm0 |
| lea eax, [eax + 8] |
| sub ecx, 8 |
| jg convertloop |
| |
| pop esi |
| ret |
| } |
| } |
| #endif // HAS_SOBELYROW_SSE2 |
| |
| #ifdef HAS_SOBELROW_SSE2 |
| // Adds Sobel X and Sobel Y and stores Sobel into ARGB. |
| // A = 255 |
| // R = Sobel |
| // G = Sobel |
| // B = Sobel |
| __declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx, |
| const uint8_t* src_sobely, |
| uint8_t* dst_argb, |
| int width) { |
| __asm { |
| push esi |
| mov eax, [esp + 4 + 4] // src_sobelx |
| mov esi, [esp + 4 + 8] // src_sobely |
| mov edx, [esp + 4 + 12] // dst_argb |
| mov ecx, [esp + 4 + 16] // width |
| sub esi, eax |
| pcmpeqb xmm5, xmm5 // alpha 255 |
| pslld xmm5, 24 // 0xff000000 |
| |
| convertloop: |
| movdqu xmm0, [eax] // read 16 pixels src_sobelx |
| movdqu xmm1, [eax + esi] // read 16 pixels src_sobely |
| lea eax, [eax + 16] |
| paddusb xmm0, xmm1 // sobel = sobelx + sobely |
| movdqa xmm2, xmm0 // GG |
| punpcklbw xmm2, xmm0 // First 8 |
| punpckhbw xmm0, xmm0 // Next 8 |
| movdqa xmm1, xmm2 // GGGG |
| punpcklwd xmm1, xmm2 // First 4 |
| punpckhwd xmm2, xmm2 // Next 4 |
| por xmm1, xmm5 // GGGA |
| por xmm2, xmm5 |
| movdqa xmm3, xmm0 // GGGG |
| punpcklwd xmm3, xmm0 // Next 4 |
| punpckhwd xmm0, xmm0 // Last 4 |
| por xmm3, xmm5 // GGGA |
| por xmm0, xmm5 |
| movdqu [edx], xmm1 |
| movdqu [edx + 16], xmm2 |
| movdqu [edx + 32], xmm3 |
| movdqu [edx + 48], xmm0 |
| lea edx, [edx + 64] |
| sub ecx, 16 |
| jg convertloop |
| |
| pop esi |
| ret |
| } |
| } |
| #endif // HAS_SOBELROW_SSE2 |
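| |
| // A scalar sketch of the packing above: the saturated sum of Sobel X and |
| // Sobel Y is written to B, G and R with alpha forced to 255. |
| // SobelRow_C_Sketch is an illustrative name only. |
| static void SobelRow_C_Sketch(const uint8_t* src_sobelx, |
|                               const uint8_t* src_sobely, |
|                               uint8_t* dst_argb, |
|                               int width) { |
|   int x; |
|   for (x = 0; x < width; ++x) { |
|     int s = src_sobelx[x] + src_sobely[x]; |
|     if (s > 255) s = 255; |
|     dst_argb[x * 4 + 0] = (uint8_t)s; |
|     dst_argb[x * 4 + 1] = (uint8_t)s; |
|     dst_argb[x * 4 + 2] = (uint8_t)s; |
|     dst_argb[x * 4 + 3] = 255; |
|   } |
| } |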
| |
| #ifdef HAS_SOBELTOPLANEROW_SSE2 |
| // Adds Sobel X and Sobel Y and stores Sobel into a plane. |
| __declspec(naked) void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, |
| const uint8_t* src_sobely, |
| uint8_t* dst_y, |
| int width) { |
| __asm { |
| push esi |
| mov eax, [esp + 4 + 4] // src_sobelx |
| mov esi, [esp + 4 + 8] // src_sobely |
| mov edx, [esp + 4 + 12] // dst_y |
| mov ecx, [esp + 4 + 16] // width |
| sub esi, eax |
| |
| convertloop: |
| movdqu xmm0, [eax] // read 16 pixels src_sobelx |
| movdqu xmm1, [eax + esi] // read 16 pixels src_sobely |
| lea eax, [eax + 16] |
| paddusb xmm0, xmm1 // sobel = sobelx + sobely |
| movdqu [edx], xmm0 |
| lea edx, [edx + 16] |
| sub ecx, 16 |
| jg convertloop |
| |
| pop esi |
| ret |
| } |
| } |
| #endif // HAS_SOBELTOPLANEROW_SSE2 |
| |
| #ifdef HAS_SOBELXYROW_SSE2 |
| // Mixes Sobel X, Sobel Y and Sobel into ARGB. |
| // A = 255 |
| // R = Sobel X |
| // G = Sobel |
| // B = Sobel Y |
| __declspec(naked) void SobelXYRow_SSE2(const uint8_t* src_sobelx, |
| const uint8_t* src_sobely, |
| uint8_t* dst_argb, |
| int width) { |
| __asm { |
| push esi |
| mov eax, [esp + 4 + 4] // src_sobelx |
| mov esi, [esp + 4 + 8] // src_sobely |
| mov edx, [esp + 4 + 12] // dst_argb |
| mov ecx, [esp + 4 + 16] // width |
| sub esi, eax |
| pcmpeqb xmm5, xmm5 // alpha 255 |
| |
| convertloop: |
| movdqu xmm0, [eax] // read 16 pixels src_sobelx |
| movdqu xmm1, [eax + esi] // read 16 pixels src_sobely |
| lea eax, [eax + 16] |
| movdqa xmm2, xmm0 |
| paddusb xmm2, xmm1 // sobel = sobelx + sobely |
| movdqa xmm3, xmm0 // XA |
| punpcklbw xmm3, xmm5 |
| punpckhbw xmm0, xmm5 |
| movdqa xmm4, xmm1 // YS |
| punpcklbw xmm4, xmm2 |
| punpckhbw xmm1, xmm2 |
| movdqa xmm6, xmm4 // YSXA |
| punpcklwd xmm6, xmm3 // First 4 |
| punpckhwd xmm4, xmm3 // Next 4 |
| movdqa xmm7, xmm1 // YSXA |
| punpcklwd xmm7, xmm0 // Next 4 |
| punpckhwd xmm1, xmm0 // Last 4 |
| movdqu [edx], xmm6 |
| movdqu [edx + 16], xmm4 |
| movdqu [edx + 32], xmm7 |
| movdqu [edx + 48], xmm1 |
| lea edx, [edx + 64] |
| sub ecx, 16 |
| jg convertloop |
| |
| pop esi |
| ret |
| } |
| } |
| #endif // HAS_SOBELXYROW_SSE2 |
| |
| #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 |
| // Consider float CumulativeSum. |
| // Consider calling CumulativeSum one row at a time as needed. |
| // Consider circular CumulativeSum buffer of radius * 2 + 1 height. |
| // Convert cumulative sum for an area to an average for 1 pixel. |
| // topleft is pointer to top left of CumulativeSum buffer for area. |
| // botleft is pointer to bottom left of CumulativeSum buffer. |
| // width is offset from left to right of area in CumulativeSum buffer measured |
| // in number of ints. |
| // area is the number of pixels in the area being averaged. |
| // dst points to pixel to store result to. |
| // count is number of averaged pixels to produce. |
| // Does 4 pixels at a time. |
| // This function requires alignment on accumulation buffer pointers. |
| void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, |
| const int32_t* botleft, |
| int width, |
| int area, |
| uint8_t* dst, |
| int count) { |
| __asm { |
| mov eax, topleft // eax topleft |
| mov esi, botleft // esi botleft |
| mov edx, width |
| movd xmm5, area |
| mov edi, dst |
| mov ecx, count |
| cvtdq2ps xmm5, xmm5 |
| rcpss xmm4, xmm5 // 1.0f / area |
| pshufd xmm4, xmm4, 0 |
| sub ecx, 4 |
| jl l4b |
| |
| cmp area, 128 // 128 pixels will not overflow 15 bits. |
| ja l4 |
| |
| pshufd xmm5, xmm5, 0 // area |
| pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0 |
| psrld xmm6, 16 |
| cvtdq2ps xmm6, xmm6 |
| addps xmm5, xmm6 // (65536.0 + area - 1) |
| mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area |
| cvtps2dq xmm5, xmm5 // 0.16 fixed point |
| packssdw xmm5, xmm5 // 16 bit shorts |
| |
| // 4 pixel loop small blocks. |
| s4: |
| // top left |
| movdqu xmm0, [eax] |
| movdqu xmm1, [eax + 16] |
| movdqu xmm2, [eax + 32] |
| movdqu xmm3, [eax + 48] |
| |
| // - top right |
| psubd xmm0, [eax + edx * 4] |
| psubd xmm1, [eax + edx * 4 + 16] |
| psubd xmm2, [eax + edx * 4 + 32] |
| psubd xmm3, [eax + edx * 4 + 48] |
| lea eax, [eax + 64] |
| |
| // - bottom left |
| psubd xmm0, [esi] |
| psubd xmm1, [esi + 16] |
| psubd xmm2, [esi + 32] |
| psubd xmm3, [esi + 48] |
| |
| // + bottom right |
| paddd xmm0, [esi + edx * 4] |
| paddd xmm1, [esi + edx * 4 + 16] |
| paddd xmm2, [esi + edx * 4 + 32] |
| paddd xmm3, [esi + edx * 4 + 48] |
| lea esi, [esi + 64] |
| |
| packssdw xmm0, xmm1 // pack 4 pixels into 2 registers |
| packssdw xmm2, xmm3 |
| |
| pmulhuw xmm0, xmm5 |
| pmulhuw xmm2, xmm5 |
| |
| packuswb xmm0, xmm2 |
| movdqu [edi], xmm0 |
| lea edi, [edi + 16] |
| sub ecx, 4 |
| jge s4 |
| |
| jmp l4b |
| |
| // 4 pixel loop |
| l4: |
| // top left |
| movdqu xmm0, [eax] |
| movdqu xmm1, [eax + 16] |
| movdqu xmm2, [eax + 32] |
| movdqu xmm3, [eax + 48] |
| |
| // - top right |
| psubd xmm0, [eax + edx * 4] |
| psubd xmm1, [eax + edx * 4 + 16] |
| psubd xmm2, [eax + edx * 4 + 32] |
| psubd xmm3, [eax + edx * 4 + 48] |
| lea eax, [eax + 64] |
| |
| // - bottom left |
| psubd xmm0, [esi] |
| psubd xmm1, [esi + 16] |
| psubd xmm2, [esi + 32] |
| psubd xmm3, [esi + 48] |
| |
| // + bottom right |
| paddd xmm0, [esi + edx * 4] |
| paddd xmm1, [esi + edx * 4 + 16] |
| paddd xmm2, [esi + edx * 4 + 32] |
| paddd xmm3, [esi + edx * 4 + 48] |
| lea esi, [esi + 64] |
| |
| cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area |
| cvtdq2ps xmm1, xmm1 |
| mulps xmm0, xmm4 |
| mulps xmm1, xmm4 |
| cvtdq2ps xmm2, xmm2 |
| cvtdq2ps xmm3, xmm3 |
| mulps xmm2, xmm4 |
| mulps xmm3, xmm4 |
| cvtps2dq xmm0, xmm0 |
| cvtps2dq xmm1, xmm1 |
| cvtps2dq xmm2, xmm2 |
| cvtps2dq xmm3, xmm3 |
| packssdw xmm0, xmm1 |
| packssdw xmm2, xmm3 |
| packuswb xmm0, xmm2 |
| movdqu [edi], xmm0 |
| lea edi, [edi + 16] |
| sub ecx, 4 |
| jge l4 |
| |
| l4b: |
| add ecx, 4 - 1 |
| jl l1b |
| |
| // 1 pixel loop |
| l1: |
| movdqu xmm0, [eax] |
| psubd xmm0, [eax + edx * 4] |
| lea eax, [eax + 16] |
| psubd xmm0, [esi] |
| paddd xmm0, [esi + edx * 4] |
| lea esi, [esi + 16] |
| cvtdq2ps xmm0, xmm0 |
| mulps xmm0, xmm4 |
| cvtps2dq xmm0, xmm0 |
| packssdw xmm0, xmm0 |
| packuswb xmm0, xmm0 |
| movd dword ptr [edi], xmm0 |
| lea edi, [edi + 4] |
| sub ecx, 1 |
| jge l1 |
| l1b: |
| } |
| } |
| #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 |
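| |
| // A scalar sketch of the integral image average described above: the sum of |
| // the rectangle is topleft - topright - botleft + botright per channel, and |
| // the average is that sum divided by area. The name is illustrative only. |
| static void CumulativeSumToAverageRow_C_Sketch(const int32_t* topleft, |
|                                                const int32_t* botleft, |
|                                                int width, |
|                                                int area, |
|                                                uint8_t* dst, |
|                                                int count) { |
|   float ooa = 1.0f / area;  // reciprocal of the number of pixels averaged. |
|   int x; |
|   for (x = 0; x < count; ++x) { |
|     int c; |
|     for (c = 0; c < 4; ++c) { |
|       int32_t sum = |
|           topleft[c] - topleft[width + c] - botleft[c] + botleft[width + c]; |
|       dst[c] = (uint8_t)(sum * ooa); |
|     } |
|     topleft += 4; |
|     botleft += 4; |
|     dst += 4; |
|   } |
| } |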
| |
| #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 |
| // Creates a table of cumulative sums where each value is a sum of all values |
| // above and to the left of the value. |
| void ComputeCumulativeSumRow_SSE2(const uint8_t* row, |
| int32_t* cumsum, |
| const int32_t* previous_cumsum, |
| int width) { |
| __asm { |
| mov eax, row |
| mov edx, cumsum |
| mov esi, previous_cumsum |
| mov ecx, width |
| pxor xmm0, xmm0 |
| pxor xmm1, xmm1 |
| |
| sub ecx, 4 |
| jl l4b |
| test edx, 15 |
| jne l4b |
| |
| // 4 pixel loop |
| l4: |
| movdqu xmm2, [eax] // 4 argb pixels 16 bytes. |
| lea eax, [eax + 16] |
| movdqa xmm4, xmm2 |
| |
| punpcklbw xmm2, xmm1 |
| movdqa xmm3, xmm2 |
| punpcklwd xmm2, xmm1 |
| punpckhwd xmm3, xmm1 |
| |
| punpckhbw xmm4, xmm1 |
| movdqa xmm5, xmm4 |
| punpcklwd xmm4, xmm1 |
| punpckhwd xmm5, xmm1 |
| |
| paddd xmm0, xmm2 |
| movdqu xmm2, [esi] // previous row above. |
| paddd xmm2, xmm0 |
| |
| paddd xmm0, xmm3 |
| movdqu xmm3, [esi + 16] |
| paddd xmm3, xmm0 |
| |
| paddd xmm0, xmm4 |
| movdqu xmm4, [esi + 32] |
| paddd xmm4, xmm0 |
| |
| paddd xmm0, xmm5 |
| movdqu xmm5, [esi + 48] |
| lea esi, [esi + 64] |
| paddd xmm5, xmm0 |
| |
| movdqu [edx], xmm2 |
| movdqu [edx + 16], xmm3 |
| movdqu [edx + 32], xmm4 |
| movdqu [edx + 48], xmm5 |
| |
| lea edx, [edx + 64] |
| sub ecx, 4 |
| jge l4 |
| |
| l4b: |
| add ecx, 4 - 1 |
| jl l1b |
| |
| // 1 pixel loop |
| l1: |
| movd xmm2, dword ptr [eax] // 1 argb pixel |
| lea eax, [eax + 4] |
| punpcklbw xmm2, xmm1 |
| punpcklwd xmm2, xmm1 |
| paddd xmm0, xmm2 |
| movdqu xmm2, [esi] |
| lea esi, [esi + 16] |
| paddd xmm2, xmm0 |
| movdqu [edx], xmm2 |
| lea edx, [edx + 16] |
| sub ecx, 1 |
| jge l1 |
| |
| l1b: |
| } |
| } |
| #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 |
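| |
| // A scalar sketch of the cumulative sum above: each output is the running |
| // sum of the current row so far plus the cumulative sum directly above it |
| // from the previous row. ComputeCumulativeSumRow_C_Sketch is an illustrative |
| // name only. |
| static void ComputeCumulativeSumRow_C_Sketch(const uint8_t* row, |
|                                              int32_t* cumsum, |
|                                              const int32_t* previous_cumsum, |
|                                              int width) { |
|   int32_t sum[4] = {0, 0, 0, 0}; |
|   int x; |
|   for (x = 0; x < width; ++x) { |
|     int c; |
|     for (c = 0; c < 4; ++c) { |
|       sum[c] += row[x * 4 + c]; |
|       cumsum[x * 4 + c] = sum[c] + previous_cumsum[x * 4 + c]; |
|     } |
|   } |
| } |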
| |
| #ifdef HAS_ARGBAFFINEROW_SSE2 |
| // Copy ARGB pixels from source image with slope to a row of destination. |
| __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb, |
| int src_argb_stride, |
| uint8_t* dst_argb, |
| const float* uv_dudv, |
| int width) { |
| __asm { |
| push esi |
| push edi |
| mov eax, [esp + 12] // src_argb |
| mov esi, [esp + 16] // stride |
| mov edx, [esp + 20] // dst_argb |
| mov ecx, [esp + 24] // pointer to uv_dudv |
| movq xmm2, qword ptr [ecx] // uv |
| movq xmm7, qword ptr [ecx + 8] // dudv |
| mov ecx, [esp + 28] // width |
| shl esi, 16 // 4, stride |
| add esi, 4 |
| movd xmm5, esi |
| sub ecx, 4 |
| jl l4b |
| |
| // setup for 4 pixel loop |
| pshufd xmm7, xmm7, 0x44 // dup dudv |
| pshufd xmm5, xmm5, 0 // dup 4, stride |
| movdqa xmm0, xmm2 // x0, y0, x1, y1 |
| addps xmm0, xmm7 |
| movlhps xmm2, xmm0 |
| movdqa xmm4, xmm7 |
| addps xmm4, xmm4 // dudv *= 2 |
| movdqa xmm3, xmm2 // x2, y2, x3, y3 |
| addps xmm3, xmm4 |
| addps xmm4, xmm4 // dudv *= 4 |
| |
| // 4 pixel loop |
| l4: |
| cvttps2dq xmm0, xmm2 // x, y float to int first 2 |
| cvttps2dq xmm1, xmm3 // x, y float to int next 2 |
| packssdw xmm0, xmm1 // x, y as 8 shorts |
| pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride. |
| movd esi, xmm0 |
| pshufd xmm0, xmm0, 0x39 // shift right |
| movd edi, xmm0 |
| pshufd xmm0, xmm0, 0x39 // shift right |
| movd xmm1, [eax + esi] // read pixel 0 |
| movd xmm6, [eax + edi] // read pixel 1 |
| punpckldq xmm1, xmm6 // combine pixel 0 and 1 |
| addps xmm2, xmm4 // x, y += dx, dy first 2 |
| movq qword ptr [edx], xmm1 |
| movd esi, xmm0 |
| pshufd xmm0, xmm0, 0x39 // shift right |
| movd edi, xmm0 |
| movd xmm6, [eax + esi] // read pixel 2 |
| movd xmm0, [eax + edi] // read pixel 3 |
| punpckldq xmm6, xmm0 // combine pixel 2 and 3 |
| addps xmm3, xmm4 // x, y += dx, dy next 2 |
| movq qword ptr [edx + 8], xmm6 |
| lea edx, [edx + 16] |
| sub ecx, 4 |
| jge l4 |
| |
| l4b: |
| add ecx, 4 - 1 |
| jl l1b |
| |
| // 1 pixel loop |
| l1: |
| cvttps2dq xmm0, xmm2 // x, y float to int |
| packssdw xmm0, xmm0 // x, y as shorts |
| pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride |
| addps xmm2, xmm7 // x, y += dx, dy |
| movd esi, xmm0 |
| movd xmm0, [eax + esi] // copy a pixel |
| movd [edx], xmm0 |
| lea edx, [edx + 4] |
| sub ecx, 1 |
| jge l1 |
| l1b: |
| pop edi |
| pop esi |
| ret |
| } |
| } |
| #endif // HAS_ARGBAFFINEROW_SSE2 |
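| |
| // A scalar sketch of the affine row copy above: (u, v) starts at |
| // uv_dudv[0..1] and advances by uv_dudv[2..3] per pixel; each source pixel |
| // is fetched at the truncated integer coordinate, as with cvttps2dq. |
| // ARGBAffineRow_C_Sketch is an illustrative name only. |
| static void ARGBAffineRow_C_Sketch(const uint8_t* src_argb, |
|                                    int src_argb_stride, |
|                                    uint8_t* dst_argb, |
|                                    const float* uv_dudv, |
|                                    int width) { |
|   float u = uv_dudv[0]; |
|   float v = uv_dudv[1]; |
|   int x; |
|   for (x = 0; x < width; ++x) { |
|     int xi = (int)u; |
|     int yi = (int)v; |
|     *(uint32_t*)(dst_argb + x * 4) = |
|         *(const uint32_t*)(src_argb + yi * src_argb_stride + xi * 4); |
|     u += uv_dudv[2]; |
|     v += uv_dudv[3]; |
|   } |
| } |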
| |
| #ifdef HAS_INTERPOLATEROW_AVX2 |
| // Bilinear filter 32x2 -> 32x1 |
| __declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr, |
| const uint8_t* src_ptr, |
| ptrdiff_t src_stride, |
| int dst_width, |
| int source_y_fraction) { |
| __asm { |
| push esi |
| push edi |
| mov edi, [esp + 8 + 4] // dst_ptr |
| mov esi, [esp + 8 + 8] // src_ptr |
| mov edx, [esp + 8 + 12] // src_stride |
| mov ecx, [esp + 8 + 16] // dst_width |
| mov eax, [esp + 8 + 20] // source_y_fraction (0..255) |
| // Dispatch to specialized filters if applicable. |
| cmp eax, 0 |
| je xloop100 // 0 / 256. Blend 100 / 0. |
| sub edi, esi |
| cmp eax, 128 |
| je xloop50 // 128 /256 is 0.50. Blend 50 / 50. |
| |
| vmovd xmm0, eax // high fraction 0..255 |
| neg eax |
| add eax, 256 |
| vmovd xmm5, eax // low fraction 256..1 |
| vpunpcklbw xmm5, xmm5, xmm0 |
| vpunpcklwd xmm5, xmm5, xmm5 |
| vbroadcastss ymm5, xmm5 |
| |
| mov eax, 0x80808080 // 128b for bias and rounding. |
| vmovd xmm4, eax |
| vbroadcastss ymm4, xmm4 |
| |
| xloop: |
| vmovdqu ymm0, [esi] |
| vmovdqu ymm2, [esi + edx] |
| vpunpckhbw ymm1, ymm0, ymm2 // mutates |
| vpunpcklbw ymm0, ymm0, ymm2 |
| vpsubb ymm1, ymm1, ymm4 // bias to signed image |
| vpsubb ymm0, ymm0, ymm4 |
| vpmaddubsw ymm1, ymm5, ymm1 |
| vpmaddubsw ymm0, ymm5, ymm0 |
| vpaddw ymm1, ymm1, ymm4 // unbias and round |
| vpaddw ymm0, ymm0, ymm4 |
| vpsrlw ymm1, ymm1, 8 |
| vpsrlw ymm0, ymm0, 8 |
| vpackuswb ymm0, ymm0, ymm1 // unmutates |
| vmovdqu [esi + edi], ymm0 |
| lea esi, [esi + 32] |
| sub ecx, 32 |
| jg xloop |
| jmp xloop99 |
| |
| // Blend 50 / 50. |
| xloop50: |
| vmovdqu ymm0, [esi] |
| vpavgb ymm0, ymm0, [esi + edx] |
| vmovdqu [esi + edi], ymm0 |
| lea esi, [esi + 32] |
| sub ecx, 32 |
| jg xloop50 |
| jmp xloop99 |
| |
| // Blend 100 / 0 - Copy row unchanged. |
| xloop100: |
| rep movsb |
| |
| xloop99: |
| pop edi |
| pop esi |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_INTERPOLATEROW_AVX2 |
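| |
| // A scalar sketch of the general blend path above: each output byte is |
| // ((256 - f) * src + f * src_below + 128) >> 8 where f is source_y_fraction; |
| // f == 0 copies the row and f == 128 averages the two rows. |
| // InterpolateRow_C_Sketch is an illustrative name only. |
| static void InterpolateRow_C_Sketch(uint8_t* dst_ptr, |
|                                     const uint8_t* src_ptr, |
|                                     ptrdiff_t src_stride, |
|                                     int dst_width, |
|                                     int source_y_fraction) { |
|   int y1_fraction = source_y_fraction; |
|   int y0_fraction = 256 - y1_fraction; |
|   const uint8_t* src_ptr1 = src_ptr + src_stride; |
|   int x; |
|   for (x = 0; x < dst_width; ++x) { |
|     dst_ptr[x] = (uint8_t)( |
|         (src_ptr[x] * y0_fraction + src_ptr1[x] * y1_fraction + 128) >> 8); |
|   } |
| } |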
| |
| // Bilinear filter 16x2 -> 16x1 |
| // TODO(fbarchard): Consider allowing 256 using memcpy. |
| __declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr, |
| const uint8_t* src_ptr, |
| ptrdiff_t src_stride, |
| int dst_width, |
| int source_y_fraction) { |
| __asm { |
| push esi |
| push edi |
| |
| mov edi, [esp + 8 + 4] // dst_ptr |
| mov esi, [esp + 8 + 8] // src_ptr |
| mov edx, [esp + 8 + 12] // src_stride |
| mov ecx, [esp + 8 + 16] // dst_width |
| mov eax, [esp + 8 + 20] // source_y_fraction (0..255) |
| sub edi, esi |
| // Dispatch to specialized filters if applicable. |
| cmp eax, 0 |
| je xloop100 // 0 /256. Blend 100 / 0. |
| cmp eax, 128 |
| je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. |
| |
| movd xmm0, eax // high fraction 0..255 |
| neg eax |
| add eax, 256 |
| movd xmm5, eax // low fraction 255..1 |
| punpcklbw xmm5, xmm0 |
| punpcklwd xmm5, xmm5 |
| pshufd xmm5, xmm5, 0 |
| mov eax, 0x80808080 // 128 for biasing image to signed. |
| movd xmm4, eax |
| pshufd xmm4, xmm4, 0x00 |
| |
| xloop: |
| movdqu xmm0, [esi] |
| movdqu xmm2, [esi + edx] |
| movdqu xmm1, xmm0 |
| punpcklbw xmm0, xmm2 |
| punpckhbw xmm1, xmm2 |
| psubb xmm0, xmm4 // bias image by -128 |
| psubb xmm1, xmm4 |
| movdqa xmm2, xmm5 |
| movdqa xmm3, xmm5 |
| pmaddubsw xmm2, xmm0 |
| pmaddubsw xmm3, xmm1 |
| paddw xmm2, xmm4 |
| paddw xmm3, xmm4 |
| psrlw xmm2, 8 |
| psrlw xmm3, 8 |
| packuswb xmm2, xmm3 |
| movdqu [esi + edi], xmm2 |
| lea esi, [esi + 16] |
| sub ecx, 16 |
| jg xloop |
| jmp xloop99 |
| |
| // Blend 50 / 50. |
| xloop50: |
| movdqu xmm0, [esi] |
| movdqu xmm1, [esi + edx] |
| pavgb xmm0, xmm1 |
| movdqu [esi + edi], xmm0 |
| lea esi, [esi + 16] |
| sub ecx, 16 |
| jg xloop50 |
| jmp xloop99 |
| |
| // Blend 100 / 0 - Copy row unchanged. |
| xloop100: |
| movdqu xmm0, [esi] |
| movdqu [esi + edi], xmm0 |
| lea esi, [esi + 16] |
| sub ecx, 16 |
| jg xloop100 |
| |
| xloop99: |
| pop edi |
| pop esi |
| ret |
| } |
| } |
| |
| // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. |
| __declspec(naked) void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, |
| uint8_t* dst_argb, |
| const uint8_t* shuffler, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] // src_argb |
| mov edx, [esp + 8] // dst_argb |
| mov ecx, [esp + 12] // shuffler |
| movdqu xmm5, [ecx] |
| mov ecx, [esp + 16] // width |
| |
| wloop: |
| movdqu xmm0, [eax] |
| movdqu xmm1, [eax + 16] |
| lea eax, [eax + 32] |
| pshufb xmm0, xmm5 |
| pshufb xmm1, xmm5 |
| movdqu [edx], xmm0 |
| movdqu [edx + 16], xmm1 |
| lea edx, [edx + 32] |
| sub ecx, 8 |
| jg wloop |
| ret |
| } |
| } |
| |
| #ifdef HAS_ARGBSHUFFLEROW_AVX2 |
| __declspec(naked) void ARGBShuffleRow_AVX2(const uint8_t* src_argb, |
| uint8_t* dst_argb, |
| const uint8_t* shuffler, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] // src_argb |
| mov edx, [esp + 8] // dst_argb |
| mov ecx, [esp + 12] // shuffler |
| vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. |
| mov ecx, [esp + 16] // width |
| |
| wloop: |
| vmovdqu ymm0, [eax] |
| vmovdqu ymm1, [eax + 32] |
| lea eax, [eax + 64] |
| vpshufb ymm0, ymm0, ymm5 |
| vpshufb ymm1, ymm1, ymm5 |
| vmovdqu [edx], ymm0 |
| vmovdqu [edx + 32], ymm1 |
| lea edx, [edx + 64] |
| sub ecx, 16 |
| jg wloop |
| |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_ARGBSHUFFLEROW_AVX2 |
| |
| // YUY2 - Macro-pixel = 2 image pixels |
| // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4.... |
| |
| // UYVY - Macro-pixel = 2 image pixels |
| // U0Y0V0Y1 |
| |
| __declspec(naked) void I422ToYUY2Row_SSE2(const uint8_t* src_y, |
| const uint8_t* src_u, |
| const uint8_t* src_v, |
| uint8_t* dst_frame, |
| int width) { |
| __asm { |
| push esi |
| push edi |
| mov eax, [esp + 8 + 4] // src_y |
| mov esi, [esp + 8 + 8] // src_u |
| mov edx, [esp + 8 + 12] // src_v |
| mov edi, [esp + 8 + 16] // dst_frame |
| mov ecx, [esp + 8 + 20] // width |
| sub edx, esi |
| |
| convertloop: |
| movq xmm2, qword ptr [esi] // U |
| movq xmm3, qword ptr [esi + edx] // V |
| lea esi, [esi + 8] |
| punpcklbw xmm2, xmm3 // UV |
| movdqu xmm0, [eax] // Y |
| lea eax, [eax + 16] |
| movdqa xmm1, xmm0 |
| punpcklbw xmm0, xmm2 // YUYV |
| punpckhbw xmm1, xmm2 |
| movdqu [edi], xmm0 |
| movdqu [edi + 16], xmm1 |
| lea edi, [edi + 32] |
| sub ecx, 16 |
| jg convertloop |
| |
| pop edi |
| pop esi |
| ret |
| } |
| } |
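| |
| // A scalar sketch of the YUY2 packing above: every two Y samples share one |
| // U and one V, emitted as Y0 U Y1 V per macro-pixel. Assumes an even width. |
| // I422ToYUY2Row_C_Sketch is an illustrative name only. |
| static void I422ToYUY2Row_C_Sketch(const uint8_t* src_y, |
|                                    const uint8_t* src_u, |
|                                    const uint8_t* src_v, |
|                                    uint8_t* dst_frame, |
|                                    int width) { |
|   int x; |
|   for (x = 0; x < width; x += 2) { |
|     dst_frame[0] = src_y[0]; |
|     dst_frame[1] = src_u[0]; |
|     dst_frame[2] = src_y[1]; |
|     dst_frame[3] = src_v[0]; |
|     dst_frame += 4; |
|     src_y += 2; |
|     src_u += 1; |
|     src_v += 1; |
|   } |
| } |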
| |
| __declspec(naked) void I422ToUYVYRow_SSE2(const uint8_t* src_y, |
| const uint8_t* src_u, |
| const uint8_t* src_v, |
| uint8_t* dst_frame, |
| int width) { |
| __asm { |
| push esi |
| push edi |
| mov eax, [esp + 8 + 4] // src_y |
| mov esi, [esp + 8 + 8] // src_u |
| mov edx, [esp + 8 + 12] // src_v |
| mov edi, [esp + 8 + 16] // dst_frame |
| mov ecx, [esp + 8 + 20] // width |
| sub edx, esi |
| |
| convertloop: |
| movq xmm2, qword ptr [esi] // U |
| movq xmm3, qword ptr [esi + edx] // V |
| lea esi, [esi + 8] |
| punpcklbw xmm2, xmm3 // UV |
| movdqu xmm0, [eax] // Y |
| movdqa xmm1, xmm2 |
| lea eax, [eax + 16] |
| punpcklbw xmm1, xmm0 // UYVY |
| punpckhbw xmm2, xmm0 |
| movdqu [edi], xmm1 |
| movdqu [edi + 16], xmm2 |
| lea edi, [edi + 32] |
| sub ecx, 16 |
| jg convertloop |
| |
| pop edi |
| pop esi |
| ret |
| } |
| } |
| |
| #ifdef HAS_ARGBPOLYNOMIALROW_SSE2 |
| __declspec(naked) void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, |
| uint8_t* dst_argb, |
| const float* poly, |
| int width) { |
| __asm { |
| push esi |
| mov eax, [esp + 4 + 4] /* src_argb */ |
| mov edx, [esp + 4 + 8] /* dst_argb */ |
| mov esi, [esp + 4 + 12] /* poly */ |
| mov ecx, [esp + 4 + 16] /* width */ |
| pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints. |
| |
| // 2 pixel loop. |
| convertloop: |
| // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel |
| // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel |
| movq xmm0, qword ptr [eax] // BGRABGRA |
| lea eax, [eax + 8] |
| punpcklbw xmm0, xmm3 |
| movdqa xmm4, xmm0 |
| punpcklwd xmm0, xmm3 // pixel 0 |
| punpckhwd xmm4, xmm3 // pixel 1 |
| cvtdq2ps xmm0, xmm0 // 4 floats |
| cvtdq2ps xmm4, xmm4 |
| movdqa xmm1, xmm0 // X |
| movdqa xmm5, xmm4 |
| mulps xmm0, [esi + 16] // C1 * X |
| mulps xmm4, [esi + 16] |
| addps xmm0, [esi] // result = C0 + C1 * X |
| addps xmm4, [esi] |
| movdqa xmm2, xmm1 |
| movdqa xmm6, xmm5 |
| mulps xmm2, xmm1 // X * X |
| mulps xmm6, xmm5 |
| mulps xmm1, xmm2 // X * X * X |
| mulps xmm5, xmm6 |
| mulps xmm2, [esi + 32] // C2 * X * X |
| mulps xmm6, [esi + 32] |
| mulps xmm1, [esi + 48] // C3 * X * X * X |
| mulps xmm5, [esi + 48] |
| addps xmm0, xmm2 // result += C2 * X * X |
| addps xmm4, xmm6 |
| addps xmm0, xmm1 // result += C3 * X * X * X |
| addps xmm4, xmm5 |
| cvttps2dq xmm0, xmm0 |
| cvttps2dq xmm4, xmm4 |
| packuswb xmm0, xmm4 |
| packuswb xmm0, xmm0 |
| movq qword ptr [edx], xmm0 |
| lea edx, [edx + 8] |
| sub ecx, 2 |
| jg convertloop |
| pop esi |
| ret |
| } |
| } |
| #endif // HAS_ARGBPOLYNOMIALROW_SSE2 |
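| |
| // A scalar sketch of the cubic polynomial above, evaluated per channel in |
| // float: result = C0 + C1 * X + C2 * X * X + C3 * X * X * X, then clamped |
| // to the 0..255 output range. ARGBPolynomialRow_C_Sketch is an illustrative |
| // name only. |
| static void ARGBPolynomialRow_C_Sketch(const uint8_t* src_argb, |
|                                        uint8_t* dst_argb, |
|                                        const float* poly, |
|                                        int width) { |
|   int x; |
|   for (x = 0; x < width; ++x) { |
|     int c; |
|     for (c = 0; c < 4; ++c) { |
|       float v = (float)src_argb[x * 4 + c]; |
|       float result = poly[c] + poly[c + 4] * v + poly[c + 8] * v * v + |
|                      poly[c + 12] * v * v * v; |
|       if (result < 0.f) result = 0.f; |
|       if (result > 255.f) result = 255.f; |
|       dst_argb[x * 4 + c] = (uint8_t)result; |
|     } |
|   } |
| } |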
| |
| #ifdef HAS_ARGBPOLYNOMIALROW_AVX2 |
| __declspec(naked) void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, |
| uint8_t* dst_argb, |
| const float* poly, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] /* src_argb */ |
| mov edx, [esp + 8] /* dst_argb */ |
| mov ecx, [esp + 12] /* poly */ |
| vbroadcastf128 ymm4, [ecx] // C0 |
| vbroadcastf128 ymm5, [ecx + 16] // C1 |
| vbroadcastf128 ymm6, [ecx + 32] // C2 |
| vbroadcastf128 ymm7, [ecx + 48] // C3 |
| mov ecx, [esp + 16] /* width */ |
| |
| // 2 pixel loop. |
| convertloop: |
| vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels |
| lea eax, [eax + 8] |
| vcvtdq2ps ymm0, ymm0 // X 8 floats |
| vmulps ymm2, ymm0, ymm0 // X * X |
| vmulps ymm3, ymm0, ymm7 // C3 * X |
| vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X |
| vfmadd231ps ymm0, ymm2, ymm6 // result += C2 * X * X |
| vfmadd231ps ymm0, ymm2, ymm3 // result += C3 * X * X * X |
| vcvttps2dq ymm0, ymm0 |
| vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000 |
| vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000 |
| vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000 |
| vmovq qword ptr [edx], xmm0 |
| lea edx, [edx + 8] |
| sub ecx, 2 |
| jg convertloop |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_ARGBPOLYNOMIALROW_AVX2 |
| |
| #ifdef HAS_HALFFLOATROW_SSE2 |
| static float kExpBias = 1.9259299444e-34f; |
| __declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src, |
| uint16_t* dst, |
| float scale, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] /* src */ |
| mov edx, [esp + 8] /* dst */ |
| movd xmm4, dword ptr [esp + 12] /* scale */ |
| mov ecx, [esp + 16] /* width */ |
| mulss xmm4, kExpBias |
| pshufd xmm4, xmm4, 0 |
| pxor xmm5, xmm5 |
| sub edx, eax |
| |
| // 8 pixel loop. |
| convertloop: |
| movdqu xmm2, xmmword ptr [eax] // 8 shorts |
| add eax, 16 |
| movdqa xmm3, xmm2 |
| punpcklwd xmm2, xmm5 |
| cvtdq2ps xmm2, xmm2 // convert 8 ints to floats |
| punpckhwd xmm3, xmm5 |
| cvtdq2ps xmm3, xmm3 |
| mulps xmm2, xmm4 |
| mulps xmm3, xmm4 |
| psrld xmm2, 13 |
| psrld xmm3, 13 |
| packssdw xmm2, xmm3 |
| movdqu [eax + edx - 16], xmm2 |
| sub ecx, 8 |
| jg convertloop |
| ret |
| } |
| } |
| #endif // HAS_HALFFLOATROW_SSE2 |
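| |
| // A scalar sketch of the half float trick above: multiplying by |
| // kExpBias * scale rebases the float exponent so that bits 13..28 of the |
| // single precision result form a truncated half float. |
| // HalfFloatRow_C_Sketch is an illustrative name only. |
| static void HalfFloatRow_C_Sketch(const uint16_t* src, |
|                                   uint16_t* dst, |
|                                   float scale, |
|                                   int width) { |
|   float mult = 1.9259299444e-34f * scale;  // kExpBias * scale |
|   int x; |
|   for (x = 0; x < width; ++x) { |
|     float value = src[x] * mult; |
|     uint32_t bits = *(const uint32_t*)&value;  // reinterpret the float bits. |
|     dst[x] = (uint16_t)(bits >> 13); |
|   } |
| } |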
| |
| #ifdef HAS_HALFFLOATROW_AVX2 |
| __declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src, |
| uint16_t* dst, |
| float scale, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] /* src */ |
| mov edx, [esp + 8] /* dst */ |
| movd xmm4, dword ptr [esp + 12] /* scale */ |
| mov ecx, [esp + 16] /* width */ |
| |
| vmulss xmm4, xmm4, kExpBias |
| vbroadcastss ymm4, xmm4 |
| vpxor ymm5, ymm5, ymm5 |
| sub edx, eax |
| |
| // 16 pixel loop. |
| convertloop: |
| vmovdqu ymm2, [eax] // 16 shorts |
| add eax, 32 |
| vpunpckhwd ymm3, ymm2, ymm5 // convert 16 shorts to 16 ints |
| vpunpcklwd ymm2, ymm2, ymm5 |
| vcvtdq2ps ymm3, ymm3 // convert 16 ints to floats |
| vcvtdq2ps ymm2, ymm2 |
| vmulps ymm3, ymm3, ymm4 // scale to adjust exponent for 5 bit range. |
| vmulps ymm2, ymm2, ymm4 |
| vpsrld ymm3, ymm3, 13 // float convert to 8 half floats truncate |
| vpsrld ymm2, ymm2, 13 |
| vpackssdw ymm2, ymm2, ymm3 |
| vmovdqu [eax + edx - 32], ymm2 |
| sub ecx, 16 |
| jg convertloop |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_HALFFLOATROW_AVX2 |
| |
| #ifdef HAS_HALFFLOATROW_F16C |
| __declspec(naked) void HalfFloatRow_F16C(const uint16_t* src, |
| uint16_t* dst, |
| float scale, |
| int width) { |
| __asm { |
| mov eax, [esp + 4] /* src */ |
| mov edx, [esp + 8] /* dst */ |
| vbroadcastss ymm4, [esp + 12] /* scale */ |
| mov ecx, [esp + 16] /* width */ |
| sub edx, eax |
| |
| // 16 pixel loop. |
| convertloop: |
| vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints |
| vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts |
| add eax, 32 |
| vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats |
| vcvtdq2ps ymm3, ymm3 |
| vmulps ymm2, ymm2, ymm4 // scale to normalized range 0 to 1 |
| vmulps ymm3, ymm3, ymm4 |
| vcvtps2ph xmm2, ymm2, 3 // float convert to 8 half floats truncate |
| vcvtps2ph xmm3, ymm3, 3 |
| vmovdqu [eax + edx - 32], xmm2 |
| vmovdqu [eax + edx - 32 + 16], xmm3 |
| sub ecx, 16 |
| jg convertloop |
| vzeroupper |
| ret |
| } |
| } |
| #endif // HAS_HALFFLOATROW_F16C |
| |
| #ifdef HAS_ARGBCOLORTABLEROW_X86 |
| // Transform ARGB pixels with color table. |
| __declspec(naked) void ARGBColorTableRow_X86(uint8_t* dst_argb, |
| const uint8_t* table_argb, |
| int width) { |
| __asm { |
| push esi |
| mov eax, [esp + 4 + 4] /* dst_argb */ |
| mov esi, [esp + 4 + 8] /* table_argb */ |
| mov ecx, [esp + 4 + 12] /* width */ |
| |
| // 1 pixel loop. |
| convertloop: |
| movzx edx, byte ptr [eax] |
| lea eax, [eax + 4] |
| movzx edx, byte ptr [esi + edx * 4] |
| mov byte ptr [eax - 4], dl |
| movzx edx, byte ptr [eax - 4 + 1] |
| movzx edx, byte ptr [esi + edx * 4 + 1] |
| mov byte ptr [eax - 4 + 1], dl |
| movzx edx, byte ptr [eax - 4 + 2] |
| movzx edx, byte ptr [esi + edx * 4 + 2] |
| mov byte ptr [eax - 4 + 2], dl |
| movzx edx, byte ptr [eax - 4 + 3] |
| movzx edx, byte ptr [esi + edx * 4 + 3] |
| mov byte ptr [eax - 4 + 3], dl |
| dec ecx |
| jg convertloop |
| pop esi |
| ret |
| } |
| } |
| #endif // HAS_ARGBCOLORTABLEROW_X86 |
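| |
| // A scalar sketch of the lookup above: each of B, G, R and A indexes its |
| // own byte of the interleaved 256 entry table, in place. |
| // ARGBColorTableRow_C_Sketch is an illustrative name only. |
| static void ARGBColorTableRow_C_Sketch(uint8_t* dst_argb, |
|                                        const uint8_t* table_argb, |
|                                        int width) { |
|   int x; |
|   for (x = 0; x < width; ++x) { |
|     int c; |
|     for (c = 0; c < 4; ++c) { |
|       dst_argb[x * 4 + c] = table_argb[dst_argb[x * 4 + c] * 4 + c]; |
|     } |
|   } |
| } |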
| |
| #ifdef HAS_RGBCOLORTABLEROW_X86 |
| // Transform RGB pixels with color table. |
| __declspec(naked) void RGBColorTableRow_X86(uint8_t* dst_argb, |
| const uint8_t* table_argb, |
| int width) { |
| __asm { |
| push esi |
| mov eax, [esp + 4 + 4] /* dst_argb */ |
| mov esi, [esp + 4 + 8] /* table_argb */ |
| mov ecx, [esp + 4 + 12] /* width */ |
| |
| // 1 pixel loop. |
| convertloop: |
| movzx edx, byte ptr [eax] |
| lea eax, [eax + 4] |
| movzx edx, byte ptr [esi + edx * 4] |
| mov byte ptr [eax - 4], dl |
| movzx edx, byte ptr [eax - 4 + 1] |
| movzx edx, byte ptr [esi + edx * 4 + 1] |
| mov byte ptr [eax - 4 + 1], dl |
| movzx edx, byte ptr [eax - 4 + 2] |
| movzx edx, byte ptr [esi + edx * 4 + 2] |
| mov byte ptr [eax - 4 + 2], dl |
| dec ecx |
| jg convertloop |
| |
| pop esi |
| ret |
| } |
| } |
| #endif // HAS_RGBCOLORTABLEROW_X86 |
| |
| #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
| // Transform RGB pixels with luma table. |
| __declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, |
| uint8_t* dst_argb, |
| int width, |
| const uint8_t* luma, |
| uint32_t lumacoeff) { |
| __asm { |
| push esi |
| push edi |
| mov eax, [esp + 8 + 4] /* src_argb */ |
| mov edi, [esp + 8 + 8] /* dst_argb */ |
| mov ecx, [esp + 8 + 12] /* width */ |
| movd xmm2, dword ptr [esp + 8 + 16] // luma table |
| movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff |
| pshufd xmm2, xmm2, 0 |
| pshufd xmm3, xmm3, 0 |
| pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00 |
| psllw xmm4, 8 |
| pxor xmm5, xmm5 |
| |
| // 4 pixel loop. |
| convertloop: |
| movdqu xmm0, xmmword ptr [eax] // generate luma ptr |
| pmaddubsw xmm0, xmm3 |
| phaddw xmm0, xmm0 |
| pand xmm0, xmm4 // mask out low bits |
| punpcklwd xmm0, xmm5 |
| paddd xmm0, xmm2 // add table base |
| movd esi, xmm0 |
| pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 |
| |
| movzx edx, byte ptr [eax] |
| movzx edx, byte ptr [esi + edx] |
| mov byte ptr [edi], dl |
| movzx edx, byte ptr [eax + 1] |
| movzx edx, byte ptr [esi + edx] |
| mov byte ptr [edi + 1], dl |
| movzx edx, byte ptr [eax + 2] |
| movzx edx, byte ptr [esi + edx] |
| mov byte ptr [edi + 2], dl |
| movzx edx, byte ptr [eax + 3] // copy alpha. |
| mov byte ptr [edi + 3], dl |
| |
| movd esi, xmm0 |
| pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 |
| |
| movzx edx, byte ptr [eax + 4] |
| movzx edx, byte ptr [esi + edx] |
| mov byte ptr [edi + 4], dl |
| movzx edx, byte ptr [eax + 5] |
| movzx edx, byte ptr [esi + edx] |
| mov byte ptr [edi + 5], dl |
| movzx edx, byte ptr [eax + 6] |
| movzx edx, byte ptr [esi + edx] |
| mov byte ptr [edi + 6], dl |
| movzx edx, byte ptr [eax + 7] // copy alpha. |
| mov byte ptr [edi + 7], dl |
| |
| movd esi, xmm0 |
| pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 |
| |
| movzx edx, byte ptr [eax + 8] |
| movzx edx, byte ptr [esi + edx] |
| mov byte ptr [edi + 8], dl |
| movzx edx, byte ptr [eax + 9] |
| movzx edx, byte ptr [esi + edx] |
| mov byte ptr [edi + 9], dl |
| movzx edx, byte ptr [eax + 10] |
| movzx edx, byte ptr [esi + edx] |
| mov byte ptr [edi + 10], dl |
| movzx edx, byte ptr [eax + 11] // copy alpha. |
| mov byte ptr [edi + 11], dl |
| |
| movd esi, xmm0 |
| |
| movzx edx, byte ptr [eax + 12] |
| movzx edx, byte ptr [esi + edx] |
| mov byte ptr [edi + 12], dl |
| movzx edx, byte ptr [eax + 13] |
| movzx edx, byte ptr [esi + edx] |
| mov byte ptr [edi + 13], dl |
| movzx edx, byte ptr [eax + 14] |
| movzx edx, byte ptr [esi + edx] |
| mov byte ptr [edi + 14], dl |
| movzx edx, byte ptr [eax + 15] // copy alpha. |
| mov byte ptr [edi + 15], dl |
| |
| lea eax, [eax + 16] |
| lea edi, [edi + 16] |
| sub ecx, 4 |
| jg convertloop |
| |
| pop edi |
| pop esi |
| ret |
| } |
| } |
| #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
| |
| #endif // defined(_M_X64) |
| |
| #ifdef __cplusplus |
| } // extern "C" |
| } // namespace libyuv |
| #endif |
| |
| #endif // !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__))) |