x86: normalize types used with _mm_cvtsi32_si128

With clang -fsanitize=integer, this fixes warnings of the form:
implicit conversion from type 'uint32_t' (aka 'unsigned int') of value
2846534572 (32-bit, unsigned) to type 'int' changed the value to
-1448432724 (32-bit, signed)

Bug: aomedia:3136
Bug: b/229626362
Change-Id: I738df84c506ac2c42acce4597ebb00306ddeff97
diff --git a/aom_dsp/simd/v128_intrinsics_x86.h b/aom_dsp/simd/v128_intrinsics_x86.h
index c404015..3cc6c02 100644
--- a/aom_dsp/simd/v128_intrinsics_x86.h
+++ b/aom_dsp/simd/v128_intrinsics_x86.h
@@ -535,57 +535,57 @@
 
 SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) {
   return _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << c)),
-                       _mm_sll_epi16(a, _mm_cvtsi32_si128(c)));
+                       _mm_sll_epi16(a, _mm_cvtsi32_si128((int)c)));
 }
 
 SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) {
   return _mm_and_si128(_mm_set1_epi8((char)(0xff >> c)),
-                       _mm_srl_epi16(a, _mm_cvtsi32_si128(c)));
+                       _mm_srl_epi16(a, _mm_cvtsi32_si128((int)c)));
 }
 
 SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) {
-  __m128i x = _mm_cvtsi32_si128(c + 8);
+  __m128i x = _mm_cvtsi32_si128((int)(c + 8));
   return _mm_packs_epi16(_mm_sra_epi16(_mm_unpacklo_epi8(a, a), x),
                          _mm_sra_epi16(_mm_unpackhi_epi8(a, a), x));
 }
 
 SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) {
-  return _mm_sll_epi16(a, _mm_cvtsi32_si128(c));
+  return _mm_sll_epi16(a, _mm_cvtsi32_si128((int)c));
 }
 
 SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) {
-  return _mm_srl_epi16(a, _mm_cvtsi32_si128(c));
+  return _mm_srl_epi16(a, _mm_cvtsi32_si128((int)c));
 }
 
 SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) {
-  return _mm_sra_epi16(a, _mm_cvtsi32_si128(c));
+  return _mm_sra_epi16(a, _mm_cvtsi32_si128((int)c));
 }
 
 SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) {
-  return _mm_sll_epi32(a, _mm_cvtsi32_si128(c));
+  return _mm_sll_epi32(a, _mm_cvtsi32_si128((int)c));
 }
 
 SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) {
-  return _mm_srl_epi32(a, _mm_cvtsi32_si128(c));
+  return _mm_srl_epi32(a, _mm_cvtsi32_si128((int)c));
 }
 
 SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) {
-  return _mm_sra_epi32(a, _mm_cvtsi32_si128(c));
+  return _mm_sra_epi32(a, _mm_cvtsi32_si128((int)c));
 }
 
 SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) {
-  return _mm_sll_epi64(a, _mm_cvtsi32_si128(c));
+  return _mm_sll_epi64(a, _mm_cvtsi32_si128((int)c));
 }
 
 SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) {
-  return _mm_srl_epi64(a, _mm_cvtsi32_si128(c));
+  return _mm_srl_epi64(a, _mm_cvtsi32_si128((int)c));
 }
 
 SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) {
   // _mm_sra_epi64 is missing in gcc?
   return v128_from_64((int64_t)v64_u64(v128_high_v64(a)) >> c,
                       (int64_t)v64_u64(v128_low_v64(a)) >> c);
-  // return _mm_sra_epi64(a, _mm_cvtsi32_si128(c));
+  // return _mm_sra_epi64(a, _mm_cvtsi32_si128((int)c));
 }
 
 /* These intrinsics require immediate values, so we must use #defines
diff --git a/aom_dsp/simd/v256_intrinsics_x86.h b/aom_dsp/simd/v256_intrinsics_x86.h
index eb5eaf0..e10846b 100644
--- a/aom_dsp/simd/v256_intrinsics_x86.h
+++ b/aom_dsp/simd/v256_intrinsics_x86.h
@@ -597,55 +597,55 @@
 
 SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) {
   return _mm256_and_si256(_mm256_set1_epi8((uint8_t)(0xff << c)),
-                          _mm256_sll_epi16(a, _mm_cvtsi32_si128(c)));
+                          _mm256_sll_epi16(a, _mm_cvtsi32_si128((int)c)));
 }
 
 SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) {
   return _mm256_and_si256(_mm256_set1_epi8((char)(0xff >> c)),
-                          _mm256_srl_epi16(a, _mm_cvtsi32_si128(c)));
+                          _mm256_srl_epi16(a, _mm_cvtsi32_si128((int)c)));
 }
 
 SIMD_INLINE v256 v256_shr_s8(v256 a, unsigned int c) {
-  __m128i x = _mm_cvtsi32_si128(c + 8);
+  __m128i x = _mm_cvtsi32_si128((int)(c + 8));
   return _mm256_packs_epi16(_mm256_sra_epi16(_mm256_unpacklo_epi8(a, a), x),
                             _mm256_sra_epi16(_mm256_unpackhi_epi8(a, a), x));
 }
 
 SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) {
-  return _mm256_sll_epi16(a, _mm_cvtsi32_si128(c));
+  return _mm256_sll_epi16(a, _mm_cvtsi32_si128((int)c));
 }
 
 SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) {
-  return _mm256_srl_epi16(a, _mm_cvtsi32_si128(c));
+  return _mm256_srl_epi16(a, _mm_cvtsi32_si128((int)c));
 }
 
 SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) {
-  return _mm256_sra_epi16(a, _mm_cvtsi32_si128(c));
+  return _mm256_sra_epi16(a, _mm_cvtsi32_si128((int)c));
 }
 
 SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) {
-  return _mm256_sll_epi32(a, _mm_cvtsi32_si128(c));
+  return _mm256_sll_epi32(a, _mm_cvtsi32_si128((int)c));
 }
 
 SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) {
-  return _mm256_srl_epi32(a, _mm_cvtsi32_si128(c));
+  return _mm256_srl_epi32(a, _mm_cvtsi32_si128((int)c));
 }
 
 SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) {
-  return _mm256_sra_epi32(a, _mm_cvtsi32_si128(c));
+  return _mm256_sra_epi32(a, _mm_cvtsi32_si128((int)c));
 }
 
 SIMD_INLINE v256 v256_shl_64(v256 a, unsigned int c) {
-  return _mm256_sll_epi64(a, _mm_cvtsi32_si128(c));
+  return _mm256_sll_epi64(a, _mm_cvtsi32_si128((int)c));
 }
 
 SIMD_INLINE v256 v256_shr_u64(v256 a, unsigned int c) {
-  return _mm256_srl_epi64(a, _mm_cvtsi32_si128(c));
+  return _mm256_srl_epi64(a, _mm_cvtsi32_si128((int)c));
 }
 
 SIMD_INLINE v256 v256_shr_s64(v256 a, unsigned int c) {
 #if defined(__AVX512VL__)
-  return _mm256_sra_epi64(a, _mm_cvtsi32_si128(c));
+  return _mm256_sra_epi64(a, _mm_cvtsi32_si128((int)c));
 #else
   return v256_from_v128(v128_shr_s64(v256_high_v128(a), c),
                         v128_shr_s64(v256_low_v128(a), c));
diff --git a/aom_dsp/simd/v64_intrinsics_x86.h b/aom_dsp/simd/v64_intrinsics_x86.h
index 1f273fe..42b602b 100644
--- a/aom_dsp/simd/v64_intrinsics_x86.h
+++ b/aom_dsp/simd/v64_intrinsics_x86.h
@@ -434,41 +434,42 @@
 
 SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) {
   return _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << c)),
-                       _mm_sll_epi16(a, _mm_cvtsi32_si128(c)));
+                       _mm_sll_epi16(a, _mm_cvtsi32_si128((int)c)));
 }
 
 SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int c) {
   return _mm_and_si128(_mm_set1_epi8((char)(0xff >> c)),
-                       _mm_srl_epi16(a, _mm_cvtsi32_si128(c)));
+                       _mm_srl_epi16(a, _mm_cvtsi32_si128((int)c)));
 }
 
 SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int c) {
   return _mm_packs_epi16(
-      _mm_sra_epi16(_mm_unpacklo_epi8(a, a), _mm_cvtsi32_si128(c + 8)), a);
+      _mm_sra_epi16(_mm_unpacklo_epi8(a, a), _mm_cvtsi32_si128((int)(c + 8))),
+      a);
 }
 
 SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int c) {
-  return _mm_sll_epi16(a, _mm_cvtsi32_si128(c));
+  return _mm_sll_epi16(a, _mm_cvtsi32_si128((int)c));
 }
 
 SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int c) {
-  return _mm_srl_epi16(a, _mm_cvtsi32_si128(c));
+  return _mm_srl_epi16(a, _mm_cvtsi32_si128((int)c));
 }
 
 SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int c) {
-  return _mm_sra_epi16(a, _mm_cvtsi32_si128(c));
+  return _mm_sra_epi16(a, _mm_cvtsi32_si128((int)c));
 }
 
 SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int c) {
-  return _mm_sll_epi32(a, _mm_cvtsi32_si128(c));
+  return _mm_sll_epi32(a, _mm_cvtsi32_si128((int)c));
 }
 
 SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int c) {
-  return _mm_srl_epi32(a, _mm_cvtsi32_si128(c));
+  return _mm_srl_epi32(a, _mm_cvtsi32_si128((int)c));
 }
 
 SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) {
-  return _mm_sra_epi32(a, _mm_cvtsi32_si128(c));
+  return _mm_sra_epi32(a, _mm_cvtsi32_si128((int)c));
 }
 
 /* These intrinsics require immediate values, so we must use #defines
diff --git a/aom_dsp/x86/intrapred_sse2.c b/aom_dsp/x86/intrapred_sse2.c
index 4786696..d42f185 100644
--- a/aom_dsp/x86/intrapred_sse2.c
+++ b/aom_dsp/x86/intrapred_sse2.c
@@ -1334,7 +1334,7 @@
                                     const uint8_t *left, int height) {
   int i = height >> 2;
   do {
-    __m128i left4 = _mm_cvtsi32_si128(((uint32_t *)left)[0]);
+    __m128i left4 = _mm_cvtsi32_si128(((int *)left)[0]);
     left4 = _mm_unpacklo_epi8(left4, left4);
     left4 = _mm_unpacklo_epi8(left4, left4);
     const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
@@ -1364,7 +1364,7 @@
                                     const uint8_t *left, int height) {
   int i = height >> 2;
   do {
-    __m128i left4 = _mm_cvtsi32_si128(((uint32_t *)left)[0]);
+    __m128i left4 = _mm_cvtsi32_si128(((int *)left)[0]);
     left4 = _mm_unpacklo_epi8(left4, left4);
     left4 = _mm_unpacklo_epi8(left4, left4);
     const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
diff --git a/aom_dsp/x86/intrapred_ssse3.c b/aom_dsp/x86/intrapred_ssse3.c
index a75616e..6d579d5 100644
--- a/aom_dsp/x86/intrapred_ssse3.c
+++ b/aom_dsp/x86/intrapred_ssse3.c
@@ -86,7 +86,7 @@
 void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above, const uint8_t *left) {
   __m128i l = _mm_load_si128((const __m128i *)left);
-  const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
+  const __m128i t = _mm_cvtsi32_si128(((const int *)above)[0]);
   const __m128i zero = _mm_setzero_si128();
   const __m128i t16 = _mm_unpacklo_epi8(t, zero);
   const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
@@ -199,7 +199,7 @@
 
 void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above, const uint8_t *left) {
-  __m128i l = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
+  __m128i l = _mm_cvtsi32_si128(((const int *)left)[0]);
   const __m128i t = _mm_load_si128((const __m128i *)above);
   const __m128i zero = _mm_setzero_si128();
   const __m128i top0 = _mm_unpacklo_epi8(t, zero);
@@ -586,9 +586,9 @@
 // pixels[2]: right_pred vector
 static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left,
                                  int height, __m128i *pixels) {
-  __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
+  __m128i d = _mm_cvtsi32_si128(((const int *)above)[0]);
   if (height == 4)
-    pixels[1] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
+    pixels[1] = _mm_cvtsi32_si128(((const int *)left)[0]);
   else if (height == 8)
     pixels[1] = _mm_loadl_epi64(((const __m128i *)left));
   else
@@ -611,7 +611,7 @@
                                   __m128i *weight_w) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i d = _mm_set1_epi16((uint16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
-  const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)smooth_weights)[0]);
+  const __m128i t = _mm_cvtsi32_si128(((const int *)smooth_weights)[0]);
   weight_h[0] = _mm_unpacklo_epi8(t, zero);
   weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
   weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
@@ -720,7 +720,7 @@
   pixels[3] = _mm_set1_epi16((uint16_t)above[7]);
 
   if (height == 4) {
-    pixels[2] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
+    pixels[2] = _mm_cvtsi32_si128(((const int *)left)[0]);
   } else if (height == 8) {
     pixels[2] = _mm_loadl_epi64((const __m128i *)left);
   } else if (height == 16) {
@@ -892,17 +892,17 @@
   const __m128i zero = _mm_setzero_si128();
   const __m128i scale_value =
       _mm_set1_epi16((uint16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
-  const __m128i bottom_left = _mm_cvtsi32_si128((uint32_t)left[bh - 1]);
+  const __m128i bottom_left = _mm_cvtsi32_si128(left[bh - 1]);
   const __m128i dup16 = _mm_set1_epi32(0x01000100);
   const __m128i top_right =
-      _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)above[bw - 1]), dup16);
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(above[bw - 1]), dup16);
   const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
   const __m128i round =
       _mm_set1_epi32((uint16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
 
   for (uint32_t y = 0; y < bh; ++y) {
-    const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
-    const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
+    const __m128i weights_y = _mm_cvtsi32_si128(sm_weights_h[y]);
+    const __m128i left_y = _mm_cvtsi32_si128(left[y]);
     const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y);
     __m128i pred_scaled_bl = _mm_mullo_epi16(scale_m_weights_y, bottom_left);
     const __m128i wl_y =
@@ -1023,7 +1023,7 @@
 static INLINE void load_pixel_v_w4(const uint8_t *above, const uint8_t *left,
                                    int height, __m128i *pixels) {
   const __m128i zero = _mm_setzero_si128();
-  __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
+  __m128i d = _mm_cvtsi32_si128(((const int *)above)[0]);
   const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
   d = _mm_unpacklo_epi8(d, zero);
   pixels[0] = _mm_unpacklo_epi16(d, bp);
@@ -1036,8 +1036,7 @@
   const __m128i d = _mm_set1_epi16((uint16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
 
   if (height == 4) {
-    const __m128i weight =
-        _mm_cvtsi32_si128(((const uint32_t *)smooth_weights)[0]);
+    const __m128i weight = _mm_cvtsi32_si128(((const int *)smooth_weights)[0]);
     weights[0] = _mm_unpacklo_epi8(weight, zero);
     weights[1] = _mm_sub_epi16(d, weights[0]);
   } else if (height == 8) {
@@ -1264,13 +1263,13 @@
       _mm_set1_epi16((uint16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
   const __m128i dup16 = _mm_set1_epi32(0x01000100);
   const __m128i bottom_left =
-      _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)left[bh - 1]), dup16);
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(left[bh - 1]), dup16);
   const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
   const __m128i round =
       _mm_set1_epi32((uint16_t)(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)));
 
   for (uint32_t y = 0; y < bh; ++y) {
-    const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
+    const __m128i weights_y = _mm_cvtsi32_si128(sm_weights_h[y]);
     const __m128i scale_m_weights_y =
         _mm_shuffle_epi8(_mm_sub_epi16(scale_value, weights_y), dup16);
     const __m128i wl_y =
@@ -1379,7 +1378,7 @@
 static INLINE void load_pixel_h_w4(const uint8_t *above, const uint8_t *left,
                                    int height, __m128i *pixels) {
   if (height == 4)
-    pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
+    pixels[0] = _mm_cvtsi32_si128(((const int *)left)[0]);
   else if (height == 8)
     pixels[0] = _mm_loadl_epi64(((const __m128i *)left));
   else
@@ -1473,7 +1472,7 @@
   pixels[1] = _mm_set1_epi16((uint16_t)above[7]);
 
   if (height == 4) {
-    pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
+    pixels[0] = _mm_cvtsi32_si128(((const int *)left)[0]);
   } else if (height == 8) {
     pixels[0] = _mm_loadl_epi64((const __m128i *)left);
   } else if (height == 16) {
@@ -1593,13 +1592,13 @@
   const __m128i zero = _mm_setzero_si128();
   const __m128i scale_value =
       _mm_set1_epi16((uint16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
-  const __m128i top_right = _mm_cvtsi32_si128((uint32_t)above[bw - 1]);
+  const __m128i top_right = _mm_cvtsi32_si128(above[bw - 1]);
   const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
   const __m128i pred_round =
       _mm_set1_epi32((1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)));
 
   for (uint32_t y = 0; y < bh; ++y) {
-    const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
+    const __m128i left_y = _mm_cvtsi32_si128(left[y]);
     const __m128i tr_ly =
         _mm_shuffle_epi32(_mm_unpacklo_epi16(top_right, left_y), 0);
 
diff --git a/aom_dsp/x86/masked_sad4d_ssse3.c b/aom_dsp/x86/masked_sad4d_ssse3.c
index 1235f27..799ce9e 100644
--- a/aom_dsp/x86/masked_sad4d_ssse3.c
+++ b/aom_dsp/x86/masked_sad4d_ssse3.c
@@ -153,15 +153,15 @@
   _mm_storeu_si128((__m128i *)sad_array, res0);
 }
 
-#define MASK_SAD4XH_ONE_REF(idx)                                               \
-  a = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)ref##idx),             \
-                         _mm_cvtsi32_si128(*(uint32_t *)&ref##idx[a_stride])); \
-  data = _mm_unpacklo_epi8(a, b);                                              \
-  mask = _mm_unpacklo_epi8(m, m_inv);                                          \
-  pred = _mm_maddubs_epi16(data, mask);                                        \
-  pred = xx_roundn_epu16(pred, AOM_BLEND_A64_ROUND_BITS);                      \
-                                                                               \
-  pred = _mm_packus_epi16(pred, _mm_setzero_si128());                          \
+#define MASK_SAD4XH_ONE_REF(idx)                                          \
+  a = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)ref##idx),             \
+                         _mm_cvtsi32_si128(*(int *)&ref##idx[a_stride])); \
+  data = _mm_unpacklo_epi8(a, b);                                         \
+  mask = _mm_unpacklo_epi8(m, m_inv);                                     \
+  pred = _mm_maddubs_epi16(data, mask);                                   \
+  pred = xx_roundn_epu16(pred, AOM_BLEND_A64_ROUND_BITS);                 \
+                                                                          \
+  pred = _mm_packus_epi16(pred, _mm_setzero_si128());                     \
   res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src));
 
 void aom_masked_sad4xhx4d_ssse3(const uint8_t *src_ptr, int src_stride,
@@ -182,15 +182,15 @@
   const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
 
   for (int y = 0; y < height; y += 2) {
-    const __m128i src = _mm_unpacklo_epi32(
-        _mm_cvtsi32_si128(*(uint32_t *)src_ptr),
-        _mm_cvtsi32_si128(*(uint32_t *)&src_ptr[src_stride]));
+    const __m128i src =
+        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)src_ptr),
+                           _mm_cvtsi32_si128(*(int *)&src_ptr[src_stride]));
     const __m128i b =
-        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)b_ptr),
-                           _mm_cvtsi32_si128(*(uint32_t *)&b_ptr[b_stride]));
+        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)b_ptr),
+                           _mm_cvtsi32_si128(*(int *)&b_ptr[b_stride]));
     const __m128i m_copy =
-        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)m_ptr),
-                           _mm_cvtsi32_si128(*(uint32_t *)&m_ptr[m_stride]));
+        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)m_ptr),
+                           _mm_cvtsi32_si128(*(int *)&m_ptr[m_stride]));
 
     __m128i m_inv = _mm_sub_epi8(mask_max, m_copy);
     __m128i m = inv_mask ? m_inv : m_copy;
diff --git a/aom_dsp/x86/masked_sad_intrin_ssse3.c b/aom_dsp/x86/masked_sad_intrin_ssse3.c
index fd5352c..df3a876 100644
--- a/aom_dsp/x86/masked_sad_intrin_ssse3.c
+++ b/aom_dsp/x86/masked_sad_intrin_ssse3.c
@@ -194,18 +194,18 @@
   for (y = 0; y < height; y += 2) {
     // Load two rows at a time, this seems to be a bit faster
     // than four rows at a time in this case.
-    const __m128i src = _mm_unpacklo_epi32(
-        _mm_cvtsi32_si128(*(uint32_t *)src_ptr),
-        _mm_cvtsi32_si128(*(uint32_t *)&src_ptr[src_stride]));
+    const __m128i src =
+        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)src_ptr),
+                           _mm_cvtsi32_si128(*(int *)&src_ptr[src_stride]));
     const __m128i a =
-        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)a_ptr),
-                           _mm_cvtsi32_si128(*(uint32_t *)&a_ptr[a_stride]));
+        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)a_ptr),
+                           _mm_cvtsi32_si128(*(int *)&a_ptr[a_stride]));
     const __m128i b =
-        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)b_ptr),
-                           _mm_cvtsi32_si128(*(uint32_t *)&b_ptr[b_stride]));
+        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)b_ptr),
+                           _mm_cvtsi32_si128(*(int *)&b_ptr[b_stride]));
     const __m128i m =
-        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)m_ptr),
-                           _mm_cvtsi32_si128(*(uint32_t *)&m_ptr[m_stride]));
+        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)m_ptr),
+                           _mm_cvtsi32_si128(*(int *)&m_ptr[m_stride]));
     const __m128i m_inv = _mm_sub_epi8(mask_max, m);
 
     const __m128i data = _mm_unpacklo_epi8(a, b);
@@ -367,9 +367,8 @@
                            _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]));
     // Zero-extend mask to 16 bits
     const __m128i m = _mm_unpacklo_epi8(
-        _mm_unpacklo_epi32(
-            _mm_cvtsi32_si128(*(const uint32_t *)m_ptr),
-            _mm_cvtsi32_si128(*(const uint32_t *)&m_ptr[m_stride])),
+        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)m_ptr),
+                           _mm_cvtsi32_si128(*(const int *)&m_ptr[m_stride])),
         _mm_setzero_si128());
     const __m128i m_inv = _mm_sub_epi16(mask_max, m);
 
diff --git a/aom_dsp/x86/masked_variance_intrin_ssse3.c b/aom_dsp/x86/masked_variance_intrin_ssse3.c
index ac0e576..fb39a2b 100644
--- a/aom_dsp/x86/masked_variance_intrin_ssse3.c
+++ b/aom_dsp/x86/masked_variance_intrin_ssse3.c
@@ -986,9 +986,8 @@
     const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr);
     const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr);
     const __m128i m = _mm_unpacklo_epi8(
-        _mm_unpacklo_epi32(
-            _mm_cvtsi32_si128(*(const uint32_t *)m_ptr),
-            _mm_cvtsi32_si128(*(const uint32_t *)&m_ptr[m_stride])),
+        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)m_ptr),
+                           _mm_cvtsi32_si128(*(const int *)&m_ptr[m_stride])),
         zero);
     const __m128i m_inv = _mm_sub_epi16(mask_max, m);
 
diff --git a/aom_dsp/x86/obmc_intrinsic_sse4.h b/aom_dsp/x86/obmc_intrinsic_sse4.h
index 5181e44..210f466 100644
--- a/aom_dsp/x86/obmc_intrinsic_sse4.h
+++ b/aom_dsp/x86/obmc_intrinsic_sse4.h
@@ -28,7 +28,7 @@
   assert(IS_POWER_OF_TWO(h));
 
   do {
-    const __m128i v_p_b = _mm_cvtsi32_si128(*(const uint32_t *)(pre + n));
+    const __m128i v_p_b = _mm_cvtsi32_si128(*(const int *)(pre + n));
     const __m128i v_m_d = _mm_load_si128((const __m128i *)(mask + n));
     const __m128i v_w_d = _mm_load_si128((const __m128i *)(wsrc + n));
 
diff --git a/aom_dsp/x86/variance_avx2.c b/aom_dsp/x86/variance_avx2.c
index 7398a73..3ec81af 100644
--- a/aom_dsp/x86/variance_avx2.c
+++ b/aom_dsp/x86/variance_avx2.c
@@ -539,10 +539,10 @@
   const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128());
   __m256i square_result = _mm256_broadcastsi128_si256(_mm_setzero_si128());
   for (int i = 0; i < h; i += 4) {
-    dst0_4x8 = _mm_cvtsi32_si128(*(uint32_t const *)(&dst[(i + 0) * dstride]));
-    dst1_4x8 = _mm_cvtsi32_si128(*(uint32_t const *)(&dst[(i + 1) * dstride]));
-    dst2_4x8 = _mm_cvtsi32_si128(*(uint32_t const *)(&dst[(i + 2) * dstride]));
-    dst3_4x8 = _mm_cvtsi32_si128(*(uint32_t const *)(&dst[(i + 3) * dstride]));
+    dst0_4x8 = _mm_cvtsi32_si128(*(int const *)(&dst[(i + 0) * dstride]));
+    dst1_4x8 = _mm_cvtsi32_si128(*(int const *)(&dst[(i + 1) * dstride]));
+    dst2_4x8 = _mm_cvtsi32_si128(*(int const *)(&dst[(i + 2) * dstride]));
+    dst3_4x8 = _mm_cvtsi32_si128(*(int const *)(&dst[(i + 3) * dstride]));
     dst_16x8 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(dst0_4x8, dst1_4x8),
                                   _mm_unpacklo_epi32(dst2_4x8, dst3_4x8));
     dst_16x16 = _mm256_cvtepu8_epi16(dst_16x8);
diff --git a/aom_dsp/x86/variance_sse2.c b/aom_dsp/x86/variance_sse2.c
index a0223a9..d49d115 100644
--- a/aom_dsp/x86/variance_sse2.c
+++ b/aom_dsp/x86/variance_sse2.c
@@ -678,8 +678,8 @@
   const __m128i zeros = _mm_setzero_si128();
   __m128i square_result = _mm_setzero_si128();
   for (int i = 0; i < h; i += 2) {
-    dst0_8x8 = _mm_cvtsi32_si128(*(uint32_t const *)(&dst[(i + 0) * dstride]));
-    dst1_8x8 = _mm_cvtsi32_si128(*(uint32_t const *)(&dst[(i + 1) * dstride]));
+    dst0_8x8 = _mm_cvtsi32_si128(*(int const *)(&dst[(i + 0) * dstride]));
+    dst1_8x8 = _mm_cvtsi32_si128(*(int const *)(&dst[(i + 1) * dstride]));
     dst_16x8 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(dst0_8x8, dst1_8x8), zeros);
 
     src0_16x4 = _mm_loadl_epi64((__m128i const *)(&src[(i + 0) * sstride]));
diff --git a/av1/common/x86/av1_convolve_scale_sse4.c b/av1/common/x86/av1_convolve_scale_sse4.c
index 0bdf49f..6d27b52 100644
--- a/av1/common/x86/av1_convolve_scale_sse4.c
+++ b/av1/common/x86/av1_convolve_scale_sse4.c
@@ -22,7 +22,7 @@
 // av1_convolve_2d_scale_sse4_1. This version only supports 8 tap filters.
 static void hfilter8(const uint8_t *src, int src_stride, int16_t *dst, int w,
                      int h, int subpel_x_qn, int x_step_qn,
-                     const InterpFilterParams *filter_params, unsigned round) {
+                     const InterpFilterParams *filter_params, int round) {
   const int bd = 8;
   const int ntaps = 8;
 
@@ -260,8 +260,8 @@
 // filters.
 static void highbd_hfilter8(const uint16_t *src, int src_stride, int16_t *dst,
                             int w, int h, int subpel_x_qn, int x_step_qn,
-                            const InterpFilterParams *filter_params,
-                            unsigned round, int bd) {
+                            const InterpFilterParams *filter_params, int round,
+                            int bd) {
   const int ntaps = 8;
 
   src -= ntaps / 2 - 1;
diff --git a/av1/common/x86/av1_inv_txfm_ssse3.c b/av1/common/x86/av1_inv_txfm_ssse3.c
index f9bfb37..738cc98 100644
--- a/av1/common/x86/av1_inv_txfm_ssse3.c
+++ b/av1/common/x86/av1_inv_txfm_ssse3.c
@@ -2246,7 +2246,7 @@
   const int step = flipud ? -1 : 1;
   const __m128i zero = _mm_setzero_si128();
   for (int i = 0; i < height; ++i, j += step) {
-    const __m128i v = _mm_cvtsi32_si128(*((uint32_t *)(output + i * stride)));
+    const __m128i v = _mm_cvtsi32_si128(*((int *)(output + i * stride)));
     __m128i u = _mm_adds_epi16(in[j], _mm_unpacklo_epi8(v, zero));
     u = _mm_packus_epi16(u, zero);
     *((int *)(output + i * stride)) = _mm_cvtsi128_si32(u);
diff --git a/av1/common/x86/convolve_sse2.c b/av1/common/x86/convolve_sse2.c
index 0e77822..012e75c 100644
--- a/av1/common/x86/convolve_sse2.c
+++ b/av1/common/x86/convolve_sse2.c
@@ -200,31 +200,31 @@
     if (w <= 4) {
       __m128i s[8], src6, res, res_round, res16;
       int res_int;
-      src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 6 * src_stride));
+      src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 6 * src_stride));
       s[0] = _mm_unpacklo_epi8(
-          _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 0 * src_stride)),
-          _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)));
+          _mm_cvtsi32_si128(*(int *)(src_ptr + 0 * src_stride)),
+          _mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)));
       s[1] = _mm_unpacklo_epi8(
-          _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)),
-          _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)));
+          _mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)),
+          _mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)));
       s[2] = _mm_unpacklo_epi8(
-          _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)),
-          _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)));
+          _mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)),
+          _mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)));
       s[3] = _mm_unpacklo_epi8(
-          _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)),
-          _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)));
+          _mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)),
+          _mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)));
       s[4] = _mm_unpacklo_epi8(
-          _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)),
-          _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)));
+          _mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)),
+          _mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)));
       s[5] = _mm_unpacklo_epi8(
-          _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)), src6);
+          _mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)), src6);
 
       do {
         s[6] = _mm_unpacklo_epi8(
-            src6, _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)));
-        src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 8 * src_stride));
+            src6, _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)));
+        src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 8 * src_stride));
         s[7] = _mm_unpacklo_epi8(
-            _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)), src6);
+            _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)), src6);
 
         res = convolve_lo_y(s + 0, coeffs);
         res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift);
diff --git a/av1/common/x86/jnt_convolve_sse2.c b/av1/common/x86/jnt_convolve_sse2.c
index 581d150..ab937f9 100644
--- a/av1/common/x86/jnt_convolve_sse2.c
+++ b/av1/common/x86/jnt_convolve_sse2.c
@@ -178,31 +178,31 @@
 
   if (w == 4) {
     __m128i s[8], src6, res, res_shift;
-    src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 6 * src_stride));
+    src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 6 * src_stride));
     s[0] = _mm_unpacklo_epi8(
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 0 * src_stride)),
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)));
+        _mm_cvtsi32_si128(*(int *)(src_ptr + 0 * src_stride)),
+        _mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)));
     s[1] = _mm_unpacklo_epi8(
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)),
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)));
+        _mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)),
+        _mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)));
     s[2] = _mm_unpacklo_epi8(
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)),
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)));
+        _mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)),
+        _mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)));
     s[3] = _mm_unpacklo_epi8(
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)),
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)));
+        _mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)),
+        _mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)));
     s[4] = _mm_unpacklo_epi8(
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)),
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)));
+        _mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)),
+        _mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)));
     s[5] = _mm_unpacklo_epi8(
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)), src6);
+        _mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)), src6);
 
     do {
       s[6] = _mm_unpacklo_epi8(
-          src6, _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)));
-      src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 8 * src_stride));
+          src6, _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)));
+      src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 8 * src_stride));
       s[7] = _mm_unpacklo_epi8(
-          _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)), src6);
+          _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)), src6);
 
       res = convolve_lo_y(s + 0, coeffs);
       res_shift = _mm_sll_epi32(res, left_shift);
diff --git a/av1/common/x86/reconinter_sse4.c b/av1/common/x86/reconinter_sse4.c
index a503532..95814b4 100644
--- a/av1/common/x86/reconinter_sse4.c
+++ b/av1/common/x86/reconinter_sse4.c
@@ -33,13 +33,13 @@
   int i = 0;
   if (4 == w) {
     do {
-      const __m128i s0A = _mm_cvtsi32_si128(*(uint32_t *)src0);
-      const __m128i s0B = _mm_cvtsi32_si128(*(uint32_t *)(src0 + stride0));
+      const __m128i s0A = _mm_cvtsi32_si128(*(int *)src0);
+      const __m128i s0B = _mm_cvtsi32_si128(*(int *)(src0 + stride0));
       const __m128i s0AB = _mm_unpacklo_epi32(s0A, s0B);
       const __m128i s0 = _mm_cvtepu8_epi16(s0AB);
 
-      const __m128i s1A = _mm_cvtsi32_si128(*(uint32_t *)src1);
-      const __m128i s1B = _mm_cvtsi32_si128(*(uint32_t *)(src1 + stride1));
+      const __m128i s1A = _mm_cvtsi32_si128(*(int *)src1);
+      const __m128i s1B = _mm_cvtsi32_si128(*(int *)(src1 + stride1));
       const __m128i s1AB = _mm_unpacklo_epi32(s1A, s1B);
       const __m128i s1 = _mm_cvtepu8_epi16(s1AB);