Remove uses of _mm_set_epi64x
Some platforms don't provide this intrinsic. As a workaround, replace
all instances with _mm_set_epi32, loading each 64-bit value as two
32-bit chunks (high half first, matching _mm_set_epi32's argument
order).
BUG=aomedia:2232
Change-Id: I5256688f145cb4f847a2cdca3b290217ec511a22
diff --git a/aom_dsp/x86/blend_a64_mask_avx2.c b/aom_dsp/x86/blend_a64_mask_avx2.c
index 057f615..8adcef2 100644
--- a/aom_dsp/x86/blend_a64_mask_avx2.c
+++ b/aom_dsp/x86/blend_a64_mask_avx2.c
@@ -1100,10 +1100,16 @@
do {
// Load 8x u8 pixels from each of 4 rows in the mask
const __m128i mask0a8 =
- _mm_set_epi64x(*(uint64_t *)mask, *(uint64_t *)(mask + mask_stride));
+ _mm_set_epi32(*(uint32_t *)(mask + 0 * mask_stride + 4),
+ *(uint32_t *)(mask + 0 * mask_stride),
+ *(uint32_t *)(mask + 1 * mask_stride + 4),
+ *(uint32_t *)(mask + 1 * mask_stride));
const __m128i mask0b8 =
- _mm_set_epi64x(*(uint64_t *)(mask + 2 * mask_stride),
- *(uint64_t *)(mask + 3 * mask_stride));
+ _mm_set_epi32(*(uint32_t *)(mask + 2 * mask_stride + 4),
+ *(uint32_t *)(mask + 2 * mask_stride),
+ *(uint32_t *)(mask + 3 * mask_stride + 4),
+ *(uint32_t *)(mask + 3 * mask_stride));
+
const __m256i mask0a = _mm256_cvtepu8_epi16(mask0a8);
const __m256i mask0b = _mm256_cvtepu8_epi16(mask0b8);
diff --git a/aom_dsp/x86/blend_a64_mask_sse4.c b/aom_dsp/x86/blend_a64_mask_sse4.c
index b7a2468..c07d2c3 100644
--- a/aom_dsp/x86/blend_a64_mask_sse4.c
+++ b/aom_dsp/x86/blend_a64_mask_sse4.c
@@ -1119,14 +1119,22 @@
const __m128i *clip_low, const __m128i *clip_high,
const __m128i *mask_max) {
// Load 4 pixels from each of 4 rows from each source
- const __m128i s0a =
- _mm_set_epi64x(*(uint64_t *)src0, *(uint64_t *)(src0 + src0_stride));
- const __m128i s0b = _mm_set_epi64x(*(uint64_t *)(src0 + 2 * src0_stride),
- *(uint64_t *)(src0 + 3 * src0_stride));
- const __m128i s1a =
- _mm_set_epi64x(*(uint64_t *)(src1), *(uint64_t *)(src1 + src1_stride));
- const __m128i s1b = _mm_set_epi64x(*(uint64_t *)(src1 + 2 * src1_stride),
- *(uint64_t *)(src1 + 3 * src1_stride));
+ const __m128i s0a = _mm_set_epi32(*(uint32_t *)(src0 + 0 * src0_stride + 2),
+ *(uint32_t *)(src0 + 0 * src0_stride),
+ *(uint32_t *)(src0 + 1 * src0_stride + 2),
+ *(uint32_t *)(src0 + 1 * src0_stride));
+ const __m128i s0b = _mm_set_epi32(*(uint32_t *)(src0 + 2 * src0_stride + 2),
+ *(uint32_t *)(src0 + 2 * src0_stride),
+ *(uint32_t *)(src0 + 3 * src0_stride + 2),
+ *(uint32_t *)(src0 + 3 * src0_stride));
+ const __m128i s1a = _mm_set_epi32(*(uint32_t *)(src1 + 0 * src1_stride + 2),
+ *(uint32_t *)(src1 + 0 * src1_stride),
+ *(uint32_t *)(src1 + 1 * src1_stride + 2),
+ *(uint32_t *)(src1 + 1 * src1_stride));
+ const __m128i s1b = _mm_set_epi32(*(uint32_t *)(src1 + 2 * src1_stride + 2),
+ *(uint32_t *)(src1 + 2 * src1_stride),
+ *(uint32_t *)(src1 + 3 * src1_stride + 2),
+ *(uint32_t *)(src1 + 3 * src1_stride));
// Generate the inverse masks
const __m128i mask1a = _mm_sub_epi16(*mask_max, *mask0a);
@@ -1217,16 +1225,24 @@
// Load 8 pixels from each of 8 rows of mask,
// (saturating) add together rows then use madd to add adjacent pixels
// Finally, divide each value by 4 (with rounding)
- const __m128i m02 = _mm_set_epi64x(*(uint64_t *)(mask),
- *(uint64_t *)(mask + 2 * mask_stride));
- const __m128i m13 = _mm_set_epi64x(*(uint64_t *)(mask + mask_stride),
- *(uint64_t *)(mask + 3 * mask_stride));
+ const __m128i m02 = _mm_set_epi32(*(uint32_t *)(mask + 0 * mask_stride + 4),
+ *(uint32_t *)(mask + 0 * mask_stride),
+ *(uint32_t *)(mask + 2 * mask_stride + 4),
+ *(uint32_t *)(mask + 2 * mask_stride));
+ const __m128i m13 = _mm_set_epi32(*(uint32_t *)(mask + 1 * mask_stride + 4),
+ *(uint32_t *)(mask + 1 * mask_stride),
+ *(uint32_t *)(mask + 3 * mask_stride + 4),
+ *(uint32_t *)(mask + 3 * mask_stride));
const __m128i m0123 = _mm_maddubs_epi16(_mm_adds_epu8(m02, m13), one_b);
const __m128i mask_0a = _mm_srli_epi16(_mm_add_epi16(m0123, two_w), 2);
- const __m128i m46 = _mm_set_epi64x(*(uint64_t *)(mask + 4 * mask_stride),
- *(uint64_t *)(mask + 6 * mask_stride));
- const __m128i m57 = _mm_set_epi64x(*(uint64_t *)(mask + 5 * mask_stride),
- *(uint64_t *)(mask + 7 * mask_stride));
+ const __m128i m46 = _mm_set_epi32(*(uint32_t *)(mask + 4 * mask_stride + 4),
+ *(uint32_t *)(mask + 4 * mask_stride),
+ *(uint32_t *)(mask + 6 * mask_stride + 4),
+ *(uint32_t *)(mask + 6 * mask_stride));
+ const __m128i m57 = _mm_set_epi32(*(uint32_t *)(mask + 5 * mask_stride + 4),
+ *(uint32_t *)(mask + 5 * mask_stride),
+ *(uint32_t *)(mask + 7 * mask_stride + 4),
+ *(uint32_t *)(mask + 7 * mask_stride));
const __m128i m4567 = _mm_maddubs_epi16(_mm_adds_epu8(m46, m57), one_b);
const __m128i mask_0b = _mm_srli_epi16(_mm_add_epi16(m4567, two_w), 2);
diff --git a/av1/encoder/x86/rdopt_sse4.c b/av1/encoder/x86/rdopt_sse4.c
index 67d94b4..ba48e28 100644
--- a/av1/encoder/x86/rdopt_sse4.c
+++ b/av1/encoder/x86/rdopt_sse4.c
@@ -30,10 +30,12 @@
// [ i j k l ]
// [ m n o p ]
- const __m128i pixelsa = _mm_set_epi64x(*(uint64_t *)&diff[0 * stride],
- *(uint64_t *)&diff[2 * stride]);
- const __m128i pixelsb = _mm_set_epi64x(*(uint64_t *)&diff[1 * stride],
- *(uint64_t *)&diff[3 * stride]);
+ const __m128i pixelsa = _mm_set_epi32(
+ *(uint32_t *)&diff[0 * stride + 2], *(uint32_t *)&diff[0 * stride],
+ *(uint32_t *)&diff[2 * stride + 2], *(uint32_t *)&diff[2 * stride]);
+ const __m128i pixelsb = _mm_set_epi32(
+ *(uint32_t *)&diff[1 * stride + 2], *(uint32_t *)&diff[1 * stride],
+ *(uint32_t *)&diff[3 * stride + 2], *(uint32_t *)&diff[3 * stride]);
// pixelsa = [d c b a l k j i] as i16
// pixelsb = [h g f e p o n m] as i16