mem_sse2.h: loadu_uint{16,32,64} -> loadu_int*

This changes the return types to signed, matching how these calls are
used as inputs to _mm_cvtsi32_si128(), _mm_set_epi32(), etc. This fixes
implicit conversion warnings reported by clang-11 with
-fsanitize=undefined.

Bug: aomedia:3136
Bug: b/229626362
Change-Id: I1425f12d4f79155dd5d7af0eb00fbdb9f1940544
diff --git a/aom_dsp/x86/mem_sse2.h b/aom_dsp/x86/mem_sse2.h
index dacb613..085a572 100644
--- a/aom_dsp/x86/mem_sse2.h
+++ b/aom_dsp/x86/mem_sse2.h
@@ -19,20 +19,20 @@
 
 #include "aom/aom_integer.h"
 
-static INLINE uint16_t loadu_uint16(const void *src) {
-  uint16_t v;
+static INLINE int16_t loadu_int16(const void *src) {
+  int16_t v;
   memcpy(&v, src, sizeof(v));
   return v;
 }
 
-static INLINE uint32_t loadu_uint32(const void *src) {
-  uint32_t v;
+static INLINE int32_t loadu_int32(const void *src) {
+  int32_t v;
   memcpy(&v, src, sizeof(v));
   return v;
 }
 
-static INLINE uint64_t loadu_uint64(const void *src) {
-  uint64_t v;
+static INLINE int64_t loadu_int64(const void *src) {
+  int64_t v;
   memcpy(&v, src, sizeof(v));
   return v;
 }
@@ -48,10 +48,10 @@
 
 static INLINE __m128i load_8bit_4x4_to_1_reg_sse2(const void *const src,
                                                   const int byte_stride) {
-  return _mm_setr_epi32(loadu_uint32((int8_t *)src + 0 * byte_stride),
-                        loadu_uint32((int8_t *)src + 1 * byte_stride),
-                        loadu_uint32((int8_t *)src + 2 * byte_stride),
-                        loadu_uint32((int8_t *)src + 3 * byte_stride));
+  return _mm_setr_epi32(loadu_int32((int8_t *)src + 0 * byte_stride),
+                        loadu_int32((int8_t *)src + 1 * byte_stride),
+                        loadu_int32((int8_t *)src + 2 * byte_stride),
+                        loadu_int32((int8_t *)src + 3 * byte_stride));
 }
 
 static INLINE __m128i load_8bit_8x2_to_1_reg_sse2(const void *const src,
diff --git a/aom_dsp/x86/variance_sse2.c b/aom_dsp/x86/variance_sse2.c
index d49d115..a62a089 100644
--- a/aom_dsp/x86/variance_sse2.c
+++ b/aom_dsp/x86/variance_sse2.c
@@ -36,8 +36,8 @@
 }
 
 static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int stride) {
-  const __m128i p0 = _mm_cvtsi32_si128(loadu_uint32(p + 0 * stride));
-  const __m128i p1 = _mm_cvtsi32_si128(loadu_uint32(p + 1 * stride));
+  const __m128i p0 = _mm_cvtsi32_si128(loadu_int32(p + 0 * stride));
+  const __m128i p1 = _mm_cvtsi32_si128(loadu_int32(p + 1 * stride));
   return _mm_unpacklo_epi8(_mm_unpacklo_epi32(p0, p1), _mm_setzero_si128());
 }
 
diff --git a/av1/encoder/x86/pickrst_avx2.c b/av1/encoder/x86/pickrst_avx2.c
index d53b128..86f0734 100644
--- a/av1/encoder/x86/pickrst_avx2.c
+++ b/av1/encoder/x86/pickrst_avx2.c
@@ -59,7 +59,7 @@
         M_int[k][l] += D1 * X1 + D2 * X2;
 
         const __m256i kl =
-            _mm256_cvtepu8_epi16(_mm_set1_epi16(loadu_uint16(dgd_ijk + l)));
+            _mm256_cvtepu8_epi16(_mm_set1_epi16(loadu_int16(dgd_ijk + l)));
         acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
         acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
         acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
@@ -260,7 +260,7 @@
 
         // Load two u16 values from dgd_ijkl combined as a u32,
         // then broadcast to 8x u32 slots of a 256
-        const __m256i dgd_ijkl = _mm256_set1_epi32(loadu_uint32(dgd_ijk + l));
+        const __m256i dgd_ijkl = _mm256_set1_epi32(loadu_int32(dgd_ijk + l));
         // dgd_ijkl = [y x y x y x y x] [y x y x y x y x] where each is a u16
 
         acc_stat_highbd_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle,
@@ -408,7 +408,7 @@
 
         // Load two u16 values from dgd_ijkl combined as a u32,
         // then broadcast to 8x u32 slots of a 256
-        const __m256i dgd_ijkl = _mm256_set1_epi32(loadu_uint32(dgd_ijk + l));
+        const __m256i dgd_ijkl = _mm256_set1_epi32(loadu_int32(dgd_ijk + l));
         // dgd_ijkl = [x y x y x y x y] [x y x y x y x y] where each is a u16
 
         acc_stat_highbd_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle,
@@ -569,7 +569,7 @@
         M_int[k][l] += D1 * X1 + D2 * X2;
 
         const __m256i kl =
-            _mm256_cvtepu8_epi16(_mm_set1_epi16(loadu_uint16(dgd_ijk + l)));
+            _mm256_cvtepu8_epi16(_mm_set1_epi16(loadu_int16(dgd_ijk + l)));
         acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
         acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
         acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
diff --git a/av1/encoder/x86/rdopt_avx2.c b/av1/encoder/x86/rdopt_avx2.c
index 3bc763c..a0ab394 100644
--- a/av1/encoder/x86/rdopt_avx2.c
+++ b/av1/encoder/x86/rdopt_avx2.c
@@ -31,8 +31,8 @@
   //                      [ m n o p ]
 
   const __m256i pixels = _mm256_set_epi64x(
-      loadu_uint64(&diff[0 * stride]), loadu_uint64(&diff[1 * stride]),
-      loadu_uint64(&diff[2 * stride]), loadu_uint64(&diff[3 * stride]));
+      loadu_int64(&diff[0 * stride]), loadu_int64(&diff[1 * stride]),
+      loadu_int64(&diff[2 * stride]), loadu_int64(&diff[3 * stride]));
   // pixels = [d c b a h g f e] [l k j i p o n m] as i16
 
   const __m256i slli = _mm256_slli_epi64(pixels, 16);