Remove uses of _mm_set_epi64x

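Some platforms don't provide this intrinsic.  As a workaround, replace
all instances with _mm_set_epi32, loading each 64-bit value as two
32-bit chunks (upper half first, since _mm_set_epi32 takes its
arguments from most to least significant).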

BUG=aomedia:2232

Change-Id: I5256688f145cb4f847a2cdca3b290217ec511a22
diff --git a/aom_dsp/x86/blend_a64_mask_avx2.c b/aom_dsp/x86/blend_a64_mask_avx2.c
index 057f615..8adcef2 100644
--- a/aom_dsp/x86/blend_a64_mask_avx2.c
+++ b/aom_dsp/x86/blend_a64_mask_avx2.c
@@ -1100,10 +1100,16 @@
   do {
     // Load 8x u8 pixels from each of 4 rows in the mask
     const __m128i mask0a8 =
-        _mm_set_epi64x(*(uint64_t *)mask, *(uint64_t *)(mask + mask_stride));
+        _mm_set_epi32(*(uint32_t *)(mask + 0 * mask_stride + 4),
+                      *(uint32_t *)(mask + 0 * mask_stride),
+                      *(uint32_t *)(mask + 1 * mask_stride + 4),
+                      *(uint32_t *)(mask + 1 * mask_stride));
     const __m128i mask0b8 =
-        _mm_set_epi64x(*(uint64_t *)(mask + 2 * mask_stride),
-                       *(uint64_t *)(mask + 3 * mask_stride));
+        _mm_set_epi32(*(uint32_t *)(mask + 2 * mask_stride + 4),
+                      *(uint32_t *)(mask + 2 * mask_stride),
+                      *(uint32_t *)(mask + 3 * mask_stride + 4),
+                      *(uint32_t *)(mask + 3 * mask_stride));
+
     const __m256i mask0a = _mm256_cvtepu8_epi16(mask0a8);
     const __m256i mask0b = _mm256_cvtepu8_epi16(mask0b8);
 
diff --git a/aom_dsp/x86/blend_a64_mask_sse4.c b/aom_dsp/x86/blend_a64_mask_sse4.c
index b7a2468..c07d2c3 100644
--- a/aom_dsp/x86/blend_a64_mask_sse4.c
+++ b/aom_dsp/x86/blend_a64_mask_sse4.c
@@ -1119,14 +1119,22 @@
     const __m128i *clip_low, const __m128i *clip_high,
     const __m128i *mask_max) {
   // Load 4 pixels from each of 4 rows from each source
-  const __m128i s0a =
-      _mm_set_epi64x(*(uint64_t *)src0, *(uint64_t *)(src0 + src0_stride));
-  const __m128i s0b = _mm_set_epi64x(*(uint64_t *)(src0 + 2 * src0_stride),
-                                     *(uint64_t *)(src0 + 3 * src0_stride));
-  const __m128i s1a =
-      _mm_set_epi64x(*(uint64_t *)(src1), *(uint64_t *)(src1 + src1_stride));
-  const __m128i s1b = _mm_set_epi64x(*(uint64_t *)(src1 + 2 * src1_stride),
-                                     *(uint64_t *)(src1 + 3 * src1_stride));
+  const __m128i s0a = _mm_set_epi32(*(uint32_t *)(src0 + 0 * src0_stride + 2),
+                                    *(uint32_t *)(src0 + 0 * src0_stride),
+                                    *(uint32_t *)(src0 + 1 * src0_stride + 2),
+                                    *(uint32_t *)(src0 + 1 * src0_stride));
+  const __m128i s0b = _mm_set_epi32(*(uint32_t *)(src0 + 2 * src0_stride + 2),
+                                    *(uint32_t *)(src0 + 2 * src0_stride),
+                                    *(uint32_t *)(src0 + 3 * src0_stride + 2),
+                                    *(uint32_t *)(src0 + 3 * src0_stride));
+  const __m128i s1a = _mm_set_epi32(*(uint32_t *)(src1 + 0 * src1_stride + 2),
+                                    *(uint32_t *)(src1 + 0 * src1_stride),
+                                    *(uint32_t *)(src1 + 1 * src1_stride + 2),
+                                    *(uint32_t *)(src1 + 1 * src1_stride));
+  const __m128i s1b = _mm_set_epi32(*(uint32_t *)(src1 + 2 * src1_stride + 2),
+                                    *(uint32_t *)(src1 + 2 * src1_stride),
+                                    *(uint32_t *)(src1 + 3 * src1_stride + 2),
+                                    *(uint32_t *)(src1 + 3 * src1_stride));
 
   // Generate the inverse masks
   const __m128i mask1a = _mm_sub_epi16(*mask_max, *mask0a);
@@ -1217,16 +1225,24 @@
     // Load 8 pixels from each of 8 rows of mask,
     // (saturating) add together rows then use madd to add adjacent pixels
     // Finally, divide each value by 4 (with rounding)
-    const __m128i m02 = _mm_set_epi64x(*(uint64_t *)(mask),
-                                       *(uint64_t *)(mask + 2 * mask_stride));
-    const __m128i m13 = _mm_set_epi64x(*(uint64_t *)(mask + mask_stride),
-                                       *(uint64_t *)(mask + 3 * mask_stride));
+    const __m128i m02 = _mm_set_epi32(*(uint32_t *)(mask + 0 * mask_stride + 4),
+                                      *(uint32_t *)(mask + 0 * mask_stride),
+                                      *(uint32_t *)(mask + 2 * mask_stride + 4),
+                                      *(uint32_t *)(mask + 2 * mask_stride));
+    const __m128i m13 = _mm_set_epi32(*(uint32_t *)(mask + 1 * mask_stride + 4),
+                                      *(uint32_t *)(mask + 1 * mask_stride),
+                                      *(uint32_t *)(mask + 3 * mask_stride + 4),
+                                      *(uint32_t *)(mask + 3 * mask_stride));
     const __m128i m0123 = _mm_maddubs_epi16(_mm_adds_epu8(m02, m13), one_b);
     const __m128i mask_0a = _mm_srli_epi16(_mm_add_epi16(m0123, two_w), 2);
-    const __m128i m46 = _mm_set_epi64x(*(uint64_t *)(mask + 4 * mask_stride),
-                                       *(uint64_t *)(mask + 6 * mask_stride));
-    const __m128i m57 = _mm_set_epi64x(*(uint64_t *)(mask + 5 * mask_stride),
-                                       *(uint64_t *)(mask + 7 * mask_stride));
+    const __m128i m46 = _mm_set_epi32(*(uint32_t *)(mask + 4 * mask_stride + 4),
+                                      *(uint32_t *)(mask + 4 * mask_stride),
+                                      *(uint32_t *)(mask + 6 * mask_stride + 4),
+                                      *(uint32_t *)(mask + 6 * mask_stride));
+    const __m128i m57 = _mm_set_epi32(*(uint32_t *)(mask + 5 * mask_stride + 4),
+                                      *(uint32_t *)(mask + 5 * mask_stride),
+                                      *(uint32_t *)(mask + 7 * mask_stride + 4),
+                                      *(uint32_t *)(mask + 7 * mask_stride));
     const __m128i m4567 = _mm_maddubs_epi16(_mm_adds_epu8(m46, m57), one_b);
     const __m128i mask_0b = _mm_srli_epi16(_mm_add_epi16(m4567, two_w), 2);
 
diff --git a/av1/encoder/x86/rdopt_sse4.c b/av1/encoder/x86/rdopt_sse4.c
index 67d94b4..ba48e28 100644
--- a/av1/encoder/x86/rdopt_sse4.c
+++ b/av1/encoder/x86/rdopt_sse4.c
@@ -30,10 +30,12 @@
   //                      [ i j k l ]
   //                      [ m n o p ]
 
-  const __m128i pixelsa = _mm_set_epi64x(*(uint64_t *)&diff[0 * stride],
-                                         *(uint64_t *)&diff[2 * stride]);
-  const __m128i pixelsb = _mm_set_epi64x(*(uint64_t *)&diff[1 * stride],
-                                         *(uint64_t *)&diff[3 * stride]);
+  const __m128i pixelsa = _mm_set_epi32(
+      *(uint32_t *)&diff[0 * stride + 2], *(uint32_t *)&diff[0 * stride],
+      *(uint32_t *)&diff[2 * stride + 2], *(uint32_t *)&diff[2 * stride]);
+  const __m128i pixelsb = _mm_set_epi32(
+      *(uint32_t *)&diff[1 * stride + 2], *(uint32_t *)&diff[1 * stride],
+      *(uint32_t *)&diff[3 * stride + 2], *(uint32_t *)&diff[3 * stride]);
   // pixelsa = [d c b a l k j i] as i16
   // pixelsb = [h g f e p o n m] as i16