Revert "Remove uses of _mm_set_epi64x"
This reverts commit 4ab23f9ee9f9cddc011770cee7135d41dff82a1c.
Visual Studio 12 is no longer supported, so the reverted commit's workaround
is no longer needed; the original code was slightly clearer about what was
happening.
BUG=aomedia:2232
Change-Id: I77710820b0ed4209564e75706b8ad028cfdf8888
diff --git a/aom_dsp/x86/blend_a64_mask_avx2.c b/aom_dsp/x86/blend_a64_mask_avx2.c
index 8adcef2..057f615 100644
--- a/aom_dsp/x86/blend_a64_mask_avx2.c
+++ b/aom_dsp/x86/blend_a64_mask_avx2.c
@@ -1100,16 +1100,10 @@
do {
// Load 8x u8 pixels from each of 4 rows in the mask
const __m128i mask0a8 =
- _mm_set_epi32(*(uint32_t *)(mask + 0 * mask_stride + 4),
- *(uint32_t *)(mask + 0 * mask_stride),
- *(uint32_t *)(mask + 1 * mask_stride + 4),
- *(uint32_t *)(mask + 1 * mask_stride));
+ _mm_set_epi64x(*(uint64_t *)mask, *(uint64_t *)(mask + mask_stride));
const __m128i mask0b8 =
- _mm_set_epi32(*(uint32_t *)(mask + 2 * mask_stride + 4),
- *(uint32_t *)(mask + 2 * mask_stride),
- *(uint32_t *)(mask + 3 * mask_stride + 4),
- *(uint32_t *)(mask + 3 * mask_stride));
-
+ _mm_set_epi64x(*(uint64_t *)(mask + 2 * mask_stride),
+ *(uint64_t *)(mask + 3 * mask_stride));
const __m256i mask0a = _mm256_cvtepu8_epi16(mask0a8);
const __m256i mask0b = _mm256_cvtepu8_epi16(mask0b8);
diff --git a/aom_dsp/x86/blend_a64_mask_sse4.c b/aom_dsp/x86/blend_a64_mask_sse4.c
index c07d2c3..b7a2468 100644
--- a/aom_dsp/x86/blend_a64_mask_sse4.c
+++ b/aom_dsp/x86/blend_a64_mask_sse4.c
@@ -1119,22 +1119,14 @@
const __m128i *clip_low, const __m128i *clip_high,
const __m128i *mask_max) {
// Load 4 pixels from each of 4 rows from each source
- const __m128i s0a = _mm_set_epi32(*(uint32_t *)(src0 + 0 * src0_stride + 2),
- *(uint32_t *)(src0 + 0 * src0_stride),
- *(uint32_t *)(src0 + 1 * src0_stride + 2),
- *(uint32_t *)(src0 + 1 * src0_stride));
- const __m128i s0b = _mm_set_epi32(*(uint32_t *)(src0 + 2 * src0_stride + 2),
- *(uint32_t *)(src0 + 2 * src0_stride),
- *(uint32_t *)(src0 + 3 * src0_stride + 2),
- *(uint32_t *)(src0 + 3 * src0_stride));
- const __m128i s1a = _mm_set_epi32(*(uint32_t *)(src1 + 0 * src1_stride + 2),
- *(uint32_t *)(src1 + 0 * src1_stride),
- *(uint32_t *)(src1 + 1 * src1_stride + 2),
- *(uint32_t *)(src1 + 1 * src1_stride));
- const __m128i s1b = _mm_set_epi32(*(uint32_t *)(src1 + 2 * src1_stride + 2),
- *(uint32_t *)(src1 + 2 * src1_stride),
- *(uint32_t *)(src1 + 3 * src1_stride + 2),
- *(uint32_t *)(src1 + 3 * src1_stride));
+ const __m128i s0a =
+ _mm_set_epi64x(*(uint64_t *)src0, *(uint64_t *)(src0 + src0_stride));
+ const __m128i s0b = _mm_set_epi64x(*(uint64_t *)(src0 + 2 * src0_stride),
+ *(uint64_t *)(src0 + 3 * src0_stride));
+ const __m128i s1a =
+ _mm_set_epi64x(*(uint64_t *)(src1), *(uint64_t *)(src1 + src1_stride));
+ const __m128i s1b = _mm_set_epi64x(*(uint64_t *)(src1 + 2 * src1_stride),
+ *(uint64_t *)(src1 + 3 * src1_stride));
// Generate the inverse masks
const __m128i mask1a = _mm_sub_epi16(*mask_max, *mask0a);
@@ -1225,24 +1217,16 @@
// Load 8 pixels from each of 8 rows of mask,
// (saturating) add together rows then use madd to add adjacent pixels
// Finally, divide each value by 4 (with rounding)
- const __m128i m02 = _mm_set_epi32(*(uint32_t *)(mask + 0 * mask_stride + 4),
- *(uint32_t *)(mask + 0 * mask_stride),
- *(uint32_t *)(mask + 2 * mask_stride + 4),
- *(uint32_t *)(mask + 2 * mask_stride));
- const __m128i m13 = _mm_set_epi32(*(uint32_t *)(mask + 1 * mask_stride + 4),
- *(uint32_t *)(mask + 1 * mask_stride),
- *(uint32_t *)(mask + 3 * mask_stride + 4),
- *(uint32_t *)(mask + 3 * mask_stride));
+ const __m128i m02 = _mm_set_epi64x(*(uint64_t *)(mask),
+ *(uint64_t *)(mask + 2 * mask_stride));
+ const __m128i m13 = _mm_set_epi64x(*(uint64_t *)(mask + mask_stride),
+ *(uint64_t *)(mask + 3 * mask_stride));
const __m128i m0123 = _mm_maddubs_epi16(_mm_adds_epu8(m02, m13), one_b);
const __m128i mask_0a = _mm_srli_epi16(_mm_add_epi16(m0123, two_w), 2);
- const __m128i m46 = _mm_set_epi32(*(uint32_t *)(mask + 4 * mask_stride + 4),
- *(uint32_t *)(mask + 4 * mask_stride),
- *(uint32_t *)(mask + 6 * mask_stride + 4),
- *(uint32_t *)(mask + 6 * mask_stride));
- const __m128i m57 = _mm_set_epi32(*(uint32_t *)(mask + 5 * mask_stride + 4),
- *(uint32_t *)(mask + 5 * mask_stride),
- *(uint32_t *)(mask + 7 * mask_stride + 4),
- *(uint32_t *)(mask + 7 * mask_stride));
+ const __m128i m46 = _mm_set_epi64x(*(uint64_t *)(mask + 4 * mask_stride),
+ *(uint64_t *)(mask + 6 * mask_stride));
+ const __m128i m57 = _mm_set_epi64x(*(uint64_t *)(mask + 5 * mask_stride),
+ *(uint64_t *)(mask + 7 * mask_stride));
const __m128i m4567 = _mm_maddubs_epi16(_mm_adds_epu8(m46, m57), one_b);
const __m128i mask_0b = _mm_srli_epi16(_mm_add_epi16(m4567, two_w), 2);
diff --git a/av1/encoder/x86/rdopt_sse4.c b/av1/encoder/x86/rdopt_sse4.c
index ba48e28..67d94b4 100644
--- a/av1/encoder/x86/rdopt_sse4.c
+++ b/av1/encoder/x86/rdopt_sse4.c
@@ -30,12 +30,10 @@
// [ i j k l ]
// [ m n o p ]
- const __m128i pixelsa = _mm_set_epi32(
- *(uint32_t *)&diff[0 * stride + 2], *(uint32_t *)&diff[0 * stride],
- *(uint32_t *)&diff[2 * stride + 2], *(uint32_t *)&diff[2 * stride]);
- const __m128i pixelsb = _mm_set_epi32(
- *(uint32_t *)&diff[1 * stride + 2], *(uint32_t *)&diff[1 * stride],
- *(uint32_t *)&diff[3 * stride + 2], *(uint32_t *)&diff[3 * stride]);
+ const __m128i pixelsa = _mm_set_epi64x(*(uint64_t *)&diff[0 * stride],
+ *(uint64_t *)&diff[2 * stride]);
+ const __m128i pixelsb = _mm_set_epi64x(*(uint64_t *)&diff[1 * stride],
+ *(uint64_t *)&diff[3 * stride]);
// pixelsa = [d c b a l k j i] as i16
// pixelsb = [h g f e p o n m] as i16