x86: normalize types used with _mm{,256}_set?_epi64x

w/clang -fsanitize=integer, this fixes warnings of the form:
implicit conversion from type 'uint64_t' (aka 'unsigned long') of value
15120695205995520509 (64-bit, unsigned) to type 'long long' changed the
value to -3326048867714031107 (64-bit, signed)
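
A minimal sketch of the first pattern (illustrative, not code from the
tree): the _mm{,256}_set?_epi64x intrinsics take long long parameters,
so a uint64_t argument with its top bit set changes value in the
implicit conversion; an explicit cast to int64_t keeps the same bits
and satisfies the sanitizer.

  #include <emmintrin.h>
  #include <stdint.h>

  static __m128i load_u64_lo(const uint64_t *p) {
    /* passing *p directly is flagged when *p > INT64_MAX; the cast is
     * bit-preserving and makes the signedness change explicit */
    return _mm_set_epi64x(0, (int64_t)*p);
  }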

Bug: aomedia:3136
Bug: b/229626362
Change-Id: I4cd17cc978264a756cc2fd80bb09323d18622235
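
The ~0 changes follow the same idea: 0xffffffff (UINT32_MAX) is an
unsigned int whose value does not fit in the int32_t parameter of
xx/yy_set1_64_from_32i(), so the conversion is flagged; ~0 is a plain
int with value -1 and the identical bit pattern, leaving the
zero-extension masks unchanged. A sketch of the idea (the helper below
is a stand-in, not the real aom helper):

  #include <emmintrin.h>
  #include <stdint.h>

  static __m128i set1_64_from_32i(int32_t a) {
    /* zero-extend the 32-bit pattern into both 64-bit lanes */
    return _mm_set1_epi64x((int64_t)(uint32_t)a);
  }
  /* set1_64_from_32i(0xffffffff): flagged (4294967295 -> -1)
     set1_64_from_32i(~0):         same bits, no warning */
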
diff --git a/aom_dsp/simd/v128_intrinsics_x86.h b/aom_dsp/simd/v128_intrinsics_x86.h
index baf0528..62a0ee8 100644
--- a/aom_dsp/simd/v128_intrinsics_x86.h
+++ b/aom_dsp/simd/v128_intrinsics_x86.h
@@ -583,8 +583,8 @@
 
 SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) {
   // _mm_sra_epi64 is missing in gcc?
-  return v128_from_64((int64_t)v64_u64(v128_high_v64(a)) >> c,
-                      (int64_t)v64_u64(v128_low_v64(a)) >> c);
+  return v128_from_64((uint64_t)((int64_t)v64_u64(v128_high_v64(a)) >> c),
+                      (uint64_t)((int64_t)v64_u64(v128_low_v64(a)) >> c));
   // return _mm_sra_epi64(a, _mm_cvtsi32_si128((int)c));
 }
 
diff --git a/aom_dsp/simd/v256_intrinsics_x86.h b/aom_dsp/simd/v256_intrinsics_x86.h
index 0b77eae..45b82ee 100644
--- a/aom_dsp/simd/v256_intrinsics_x86.h
+++ b/aom_dsp/simd/v256_intrinsics_x86.h
@@ -57,7 +57,7 @@
 }
 
 SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
-  return _mm256_set_epi64x(a, b, c, d);
+  return _mm256_set_epi64x((int64_t)a, (int64_t)b, (int64_t)c, (int64_t)d);
 }
 
 SIMD_INLINE v256 v256_load_aligned(const void *p) {
@@ -84,7 +84,9 @@
 
 SIMD_INLINE v256 v256_dup_32(uint32_t x) { return _mm256_set1_epi32((int)x); }
 
-SIMD_INLINE v256 v256_dup_64(uint64_t x) { return _mm256_set1_epi64x(x); }
+SIMD_INLINE v256 v256_dup_64(uint64_t x) {
+  return _mm256_set1_epi64x((int64_t)x);
+}
 
 SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { return _mm256_add_epi8(a, b); }
 
diff --git a/aom_dsp/x86/blend_a64_mask_avx2.c b/aom_dsp/x86/blend_a64_mask_avx2.c
index ad27152..09c63a3 100644
--- a/aom_dsp/x86/blend_a64_mask_avx2.c
+++ b/aom_dsp/x86/blend_a64_mask_avx2.c
@@ -910,14 +910,14 @@
     const __m256i *round_offset, int shift, const __m256i *clip_low,
     const __m256i *clip_high, const __m256i *mask_max) {
   // Load 4x u16 pixels from each of 4 rows from each source
-  const __m256i s0 = _mm256_set_epi64x(*(uint64_t *)(src0 + 3 * src0_stride),
-                                       *(uint64_t *)(src0 + 2 * src0_stride),
-                                       *(uint64_t *)(src0 + 1 * src0_stride),
-                                       *(uint64_t *)(src0 + 0 * src0_stride));
-  const __m256i s1 = _mm256_set_epi64x(*(uint64_t *)(src1 + 3 * src1_stride),
-                                       *(uint64_t *)(src1 + 2 * src1_stride),
-                                       *(uint64_t *)(src1 + 1 * src1_stride),
-                                       *(uint64_t *)(src1 + 0 * src1_stride));
+  const __m256i s0 = _mm256_set_epi64x(*(int64_t *)(src0 + 3 * src0_stride),
+                                       *(int64_t *)(src0 + 2 * src0_stride),
+                                       *(int64_t *)(src0 + 1 * src0_stride),
+                                       *(int64_t *)(src0 + 0 * src0_stride));
+  const __m256i s1 = _mm256_set_epi64x(*(int64_t *)(src1 + 3 * src1_stride),
+                                       *(int64_t *)(src1 + 2 * src1_stride),
+                                       *(int64_t *)(src1 + 1 * src1_stride),
+                                       *(int64_t *)(src1 + 0 * src1_stride));
   // Generate the inverse mask
   const __m256i mask1 = _mm256_sub_epi16(*mask_max, *mask0);
 
@@ -994,15 +994,15 @@
     // (saturating) add together rows then use madd to add adjacent pixels
     // Finally, divide each value by 4 (with rounding)
     const __m256i m0246 =
-        _mm256_set_epi64x(*(uint64_t *)(mask + 6 * mask_stride),
-                          *(uint64_t *)(mask + 4 * mask_stride),
-                          *(uint64_t *)(mask + 2 * mask_stride),
-                          *(uint64_t *)(mask + 0 * mask_stride));
+        _mm256_set_epi64x(*(int64_t *)(mask + 6 * mask_stride),
+                          *(int64_t *)(mask + 4 * mask_stride),
+                          *(int64_t *)(mask + 2 * mask_stride),
+                          *(int64_t *)(mask + 0 * mask_stride));
     const __m256i m1357 =
-        _mm256_set_epi64x(*(uint64_t *)(mask + 7 * mask_stride),
-                          *(uint64_t *)(mask + 5 * mask_stride),
-                          *(uint64_t *)(mask + 3 * mask_stride),
-                          *(uint64_t *)(mask + 1 * mask_stride));
+        _mm256_set_epi64x(*(int64_t *)(mask + 7 * mask_stride),
+                          *(int64_t *)(mask + 5 * mask_stride),
+                          *(int64_t *)(mask + 3 * mask_stride),
+                          *(int64_t *)(mask + 1 * mask_stride));
     const __m256i addrows = _mm256_adds_epu8(m0246, m1357);
     const __m256i adjacent = _mm256_maddubs_epi16(addrows, one_b);
     const __m256i mask0 =
@@ -1101,10 +1101,10 @@
   do {
     // Load 8x u8 pixels from each of 4 rows in the mask
     const __m128i mask0a8 =
-        _mm_set_epi64x(*(uint64_t *)mask, *(uint64_t *)(mask + mask_stride));
+        _mm_set_epi64x(*(int64_t *)mask, *(int64_t *)(mask + mask_stride));
     const __m128i mask0b8 =
-        _mm_set_epi64x(*(uint64_t *)(mask + 2 * mask_stride),
-                       *(uint64_t *)(mask + 3 * mask_stride));
+        _mm_set_epi64x(*(int64_t *)(mask + 2 * mask_stride),
+                       *(int64_t *)(mask + 3 * mask_stride));
     const __m256i mask0a = _mm256_cvtepu8_epi16(mask0a8);
     const __m256i mask0b = _mm256_cvtepu8_epi16(mask0b8);
 
diff --git a/aom_dsp/x86/blend_a64_mask_sse4.c b/aom_dsp/x86/blend_a64_mask_sse4.c
index 5c68f41a..270ad26 100644
--- a/aom_dsp/x86/blend_a64_mask_sse4.c
+++ b/aom_dsp/x86/blend_a64_mask_sse4.c
@@ -1121,13 +1121,13 @@
     const __m128i *mask_max) {
   // Load 4 pixels from each of 4 rows from each source
   const __m128i s0a =
-      _mm_set_epi64x(*(uint64_t *)src0, *(uint64_t *)(src0 + src0_stride));
-  const __m128i s0b = _mm_set_epi64x(*(uint64_t *)(src0 + 2 * src0_stride),
-                                     *(uint64_t *)(src0 + 3 * src0_stride));
+      _mm_set_epi64x(*(int64_t *)src0, *(int64_t *)(src0 + src0_stride));
+  const __m128i s0b = _mm_set_epi64x(*(int64_t *)(src0 + 2 * src0_stride),
+                                     *(int64_t *)(src0 + 3 * src0_stride));
   const __m128i s1a =
-      _mm_set_epi64x(*(uint64_t *)(src1), *(uint64_t *)(src1 + src1_stride));
-  const __m128i s1b = _mm_set_epi64x(*(uint64_t *)(src1 + 2 * src1_stride),
-                                     *(uint64_t *)(src1 + 3 * src1_stride));
+      _mm_set_epi64x(*(int64_t *)(src1), *(int64_t *)(src1 + src1_stride));
+  const __m128i s1b = _mm_set_epi64x(*(int64_t *)(src1 + 2 * src1_stride),
+                                     *(int64_t *)(src1 + 3 * src1_stride));
 
   // Generate the inverse masks
   const __m128i mask1a = _mm_sub_epi16(*mask_max, *mask0a);
@@ -1218,16 +1218,16 @@
     // Load 8 pixels from each of 8 rows of mask,
     // (saturating) add together rows then use madd to add adjacent pixels
     // Finally, divide each value by 4 (with rounding)
-    const __m128i m02 = _mm_set_epi64x(*(uint64_t *)(mask),
-                                       *(uint64_t *)(mask + 2 * mask_stride));
-    const __m128i m13 = _mm_set_epi64x(*(uint64_t *)(mask + mask_stride),
-                                       *(uint64_t *)(mask + 3 * mask_stride));
+    const __m128i m02 = _mm_set_epi64x(*(int64_t *)(mask),
+                                       *(int64_t *)(mask + 2 * mask_stride));
+    const __m128i m13 = _mm_set_epi64x(*(int64_t *)(mask + mask_stride),
+                                       *(int64_t *)(mask + 3 * mask_stride));
     const __m128i m0123 = _mm_maddubs_epi16(_mm_adds_epu8(m02, m13), one_b);
     const __m128i mask_0a = _mm_srli_epi16(_mm_add_epi16(m0123, two_w), 2);
-    const __m128i m46 = _mm_set_epi64x(*(uint64_t *)(mask + 4 * mask_stride),
-                                       *(uint64_t *)(mask + 6 * mask_stride));
-    const __m128i m57 = _mm_set_epi64x(*(uint64_t *)(mask + 5 * mask_stride),
-                                       *(uint64_t *)(mask + 7 * mask_stride));
+    const __m128i m46 = _mm_set_epi64x(*(int64_t *)(mask + 4 * mask_stride),
+                                       *(int64_t *)(mask + 6 * mask_stride));
+    const __m128i m57 = _mm_set_epi64x(*(int64_t *)(mask + 5 * mask_stride),
+                                       *(int64_t *)(mask + 7 * mask_stride));
     const __m128i m4567 = _mm_maddubs_epi16(_mm_adds_epu8(m46, m57), one_b);
     const __m128i mask_0b = _mm_srli_epi16(_mm_add_epi16(m4567, two_w), 2);
 
diff --git a/aom_dsp/x86/highbd_sad_avx2.c b/aom_dsp/x86/highbd_sad_avx2.c
index ad4db2f..f583772 100644
--- a/aom_dsp/x86/highbd_sad_avx2.c
+++ b/aom_dsp/x86/highbd_sad_avx2.c
@@ -551,7 +551,7 @@
 static INLINE void get_4d_sad_from_mm256_epi32(const __m256i *v,
                                                uint32_t *res) {
   __m256i u0, u1, u2, u3;
-  const __m256i mask = yy_set1_64_from_32i(UINT32_MAX);
+  const __m256i mask = yy_set1_64_from_32i(~0);
   __m128i sad;
 
   // 8 32-bit summation
diff --git a/aom_dsp/x86/sum_squares_avx2.c b/aom_dsp/x86/sum_squares_avx2.c
index 0d63db2..89b9b82 100644
--- a/aom_dsp/x86/sum_squares_avx2.c
+++ b/aom_dsp/x86/sum_squares_avx2.c
@@ -21,7 +21,7 @@
                                                 int width, int height) {
   uint64_t result;
   __m256i v_acc_q = _mm256_setzero_si256();
-  const __m256i v_zext_mask_q = yy_set1_64_from_32i(0xffffffff);
+  const __m256i v_zext_mask_q = yy_set1_64_from_32i(~0);
   for (int col = 0; col < height; col += 4) {
     __m256i v_acc_d = _mm256_setzero_si256();
     for (int row = 0; row < width; row += 16) {
diff --git a/aom_dsp/x86/sum_squares_sse2.c b/aom_dsp/x86/sum_squares_sse2.c
index 0bdeee9..053d595 100644
--- a/aom_dsp/x86/sum_squares_sse2.c
+++ b/aom_dsp/x86/sum_squares_sse2.c
@@ -84,7 +84,7 @@
     src += stride << 2;
     r += 4;
   } while (r < height);
-  const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff);
+  const __m128i v_zext_mask_q = xx_set1_64_from_32i(~0);
   __m128i v_acc_64 = _mm_add_epi64(_mm_srli_epi64(v_acc_q, 32),
                                    _mm_and_si128(v_acc_q, v_zext_mask_q));
   v_acc_64 = _mm_add_epi64(v_acc_64, _mm_srli_si128(v_acc_64, 8));
@@ -116,7 +116,7 @@
                                 int height) {
   int r = 0;
 
-  const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff);
+  const __m128i v_zext_mask_q = xx_set1_64_from_32i(~0);
   __m128i v_acc_q = _mm_setzero_si128();
 
   do {
@@ -254,7 +254,7 @@
 //////////////////////////////////////////////////////////////////////////////
 
 static uint64_t aom_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) {
-  const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff);
+  const __m128i v_zext_mask_q = xx_set1_64_from_32i(~0);
   __m128i v_acc0_q = _mm_setzero_si128();
   __m128i v_acc1_q = _mm_setzero_si128();
 
diff --git a/av1/encoder/x86/rdopt_sse4.c b/av1/encoder/x86/rdopt_sse4.c
index 4c4ec1f..12ac146 100644
--- a/av1/encoder/x86/rdopt_sse4.c
+++ b/av1/encoder/x86/rdopt_sse4.c
@@ -29,10 +29,10 @@
   //                      [ i j k l ]
   //                      [ m n o p ]
 
-  const __m128i pixelsa = _mm_set_epi64x(*(uint64_t *)&diff[0 * stride],
-                                         *(uint64_t *)&diff[2 * stride]);
-  const __m128i pixelsb = _mm_set_epi64x(*(uint64_t *)&diff[1 * stride],
-                                         *(uint64_t *)&diff[3 * stride]);
+  const __m128i pixelsa = _mm_set_epi64x(*(int64_t *)&diff[0 * stride],
+                                         *(int64_t *)&diff[2 * stride]);
+  const __m128i pixelsb = _mm_set_epi64x(*(int64_t *)&diff[1 * stride],
+                                         *(int64_t *)&diff[3 * stride]);
   // pixelsa = [d c b a l k j i] as i16
   // pixelsb = [h g f e p o n m] as i16
 
diff --git a/av1/encoder/x86/wedge_utils_avx2.c b/av1/encoder/x86/wedge_utils_avx2.c
index 4215f40..665951c 100644
--- a/av1/encoder/x86/wedge_utils_avx2.c
+++ b/av1/encoder/x86/wedge_utils_avx2.c
@@ -31,7 +31,7 @@
   uint64_t csse;
 
   const __m256i v_mask_max_w = _mm256_set1_epi16(MAX_MASK_VALUE);
-  const __m256i v_zext_q = yy_set1_64_from_32i(0xffffffff);
+  const __m256i v_zext_q = yy_set1_64_from_32i(~0);
 
   __m256i v_acc0_q = _mm256_setzero_si256();
 
diff --git a/av1/encoder/x86/wedge_utils_sse2.c b/av1/encoder/x86/wedge_utils_sse2.c
index f3f4b8a..6a3ef00 100644
--- a/av1/encoder/x86/wedge_utils_sse2.c
+++ b/av1/encoder/x86/wedge_utils_sse2.c
@@ -31,7 +31,7 @@
   uint64_t csse;
 
   const __m128i v_mask_max_w = _mm_set1_epi16(MAX_MASK_VALUE);
-  const __m128i v_zext_q = xx_set1_64_from_32i(0xffffffff);
+  const __m128i v_zext_q = xx_set1_64_from_32i(~0);
 
   __m128i v_acc0_q = _mm_setzero_si128();