x86: normalize types used with _mm{,256}_set?_epi16

w/clang -fsanitize=integer fixes warnings of the form:
implicit conversion from type 'uint16_t' (aka 'unsigned short') of value
49151 (16-bit, unsigned) to type 'short' changed the value to -16385
(16-bit, signed)

Bug: aomedia:3136
Bug: b/229626362
Change-Id: I414091e4701cae297244a1ad1c9c32bc57b95b6b
diff --git a/aom_dsp/simd/v128_intrinsics_x86.h b/aom_dsp/simd/v128_intrinsics_x86.h
index dbd2442..1b87fb7 100644
--- a/aom_dsp/simd/v128_intrinsics_x86.h
+++ b/aom_dsp/simd/v128_intrinsics_x86.h
@@ -83,7 +83,7 @@
 
 SIMD_INLINE v128 v128_dup_8(uint8_t x) { return _mm_set1_epi8((char)x); }
 
-SIMD_INLINE v128 v128_dup_16(uint16_t x) { return _mm_set1_epi16(x); }
+SIMD_INLINE v128 v128_dup_16(uint16_t x) { return _mm_set1_epi16((short)x); }
 
 SIMD_INLINE v128 v128_dup_32(uint32_t x) { return _mm_set1_epi32(x); }
 
diff --git a/aom_dsp/simd/v256_intrinsics_x86.h b/aom_dsp/simd/v256_intrinsics_x86.h
index 10f4f8a..b43a53d 100644
--- a/aom_dsp/simd/v256_intrinsics_x86.h
+++ b/aom_dsp/simd/v256_intrinsics_x86.h
@@ -80,7 +80,7 @@
 
 SIMD_INLINE v256 v256_dup_8(uint8_t x) { return _mm256_set1_epi8((char)x); }
 
-SIMD_INLINE v256 v256_dup_16(uint16_t x) { return _mm256_set1_epi16(x); }
+SIMD_INLINE v256 v256_dup_16(uint16_t x) { return _mm256_set1_epi16((short)x); }
 
 SIMD_INLINE v256 v256_dup_32(uint32_t x) { return _mm256_set1_epi32(x); }
 
diff --git a/aom_dsp/simd/v64_intrinsics_x86.h b/aom_dsp/simd/v64_intrinsics_x86.h
index 95a8765..88766be 100644
--- a/aom_dsp/simd/v64_intrinsics_x86.h
+++ b/aom_dsp/simd/v64_intrinsics_x86.h
@@ -103,7 +103,7 @@
 
 SIMD_INLINE v64 v64_dup_8(uint8_t x) { return _mm_set1_epi8((char)x); }
 
-SIMD_INLINE v64 v64_dup_16(uint16_t x) { return _mm_set1_epi16(x); }
+SIMD_INLINE v64 v64_dup_16(uint16_t x) { return _mm_set1_epi16((short)x); }
 
 SIMD_INLINE v64 v64_dup_32(uint32_t x) { return _mm_set1_epi32(x); }
 
diff --git a/aom_dsp/x86/fwd_txfm_sse2.h b/aom_dsp/x86/fwd_txfm_sse2.h
index ab3cd91..78ea985 100644
--- a/aom_dsp/x86/fwd_txfm_sse2.h
+++ b/aom_dsp/x86/fwd_txfm_sse2.h
@@ -34,7 +34,7 @@
 static INLINE int check_epi16_overflow_x2(const __m128i *preg0,
                                           const __m128i *preg1) {
   const __m128i max_overflow = _mm_set1_epi16(0x7fff);
-  const __m128i min_overflow = _mm_set1_epi16(0x8000);
+  const __m128i min_overflow = _mm_set1_epi16((short)0x8000);
   __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
                               _mm_cmpeq_epi16(*preg0, min_overflow));
   __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
@@ -48,7 +48,7 @@
                                           const __m128i *preg2,
                                           const __m128i *preg3) {
   const __m128i max_overflow = _mm_set1_epi16(0x7fff);
-  const __m128i min_overflow = _mm_set1_epi16(0x8000);
+  const __m128i min_overflow = _mm_set1_epi16((short)0x8000);
   __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
                               _mm_cmpeq_epi16(*preg0, min_overflow));
   __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
diff --git a/aom_dsp/x86/highbd_variance_sse2.c b/aom_dsp/x86/highbd_variance_sse2.c
index 6bd6a5a..d45885c 100644
--- a/aom_dsp/x86/highbd_variance_sse2.c
+++ b/aom_dsp/x86/highbd_variance_sse2.c
@@ -629,13 +629,12 @@
     const uint8_t *ref8, int ref_stride,
     const DIST_WTD_COMP_PARAMS *jcp_param) {
   int i;
-  const uint16_t wt0 = (uint16_t)jcp_param->fwd_offset;
-  const uint16_t wt1 = (uint16_t)jcp_param->bck_offset;
-  const __m128i w0 = _mm_set_epi16(wt0, wt0, wt0, wt0, wt0, wt0, wt0, wt0);
-  const __m128i w1 = _mm_set_epi16(wt1, wt1, wt1, wt1, wt1, wt1, wt1, wt1);
-  const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
-  const __m128i r =
-      _mm_set_epi16(round, round, round, round, round, round, round, round);
+  const int16_t wt0 = (int16_t)jcp_param->fwd_offset;
+  const int16_t wt1 = (int16_t)jcp_param->bck_offset;
+  const __m128i w0 = _mm_set1_epi16(wt0);
+  const __m128i w1 = _mm_set1_epi16(wt1);
+  const int16_t round = (int16_t)((1 << DIST_PRECISION_BITS) >> 1);
+  const __m128i r = _mm_set1_epi16(round);
   uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
   uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
diff --git a/aom_dsp/x86/intrapred_avx2.c b/aom_dsp/x86/intrapred_avx2.c
index 597e3bd..621ef7a 100644
--- a/aom_dsp/x86/intrapred_avx2.c
+++ b/aom_dsp/x86/intrapred_avx2.c
@@ -754,7 +754,7 @@
                                    const uint8_t *above, const uint8_t *left) {
   __m128i x = _mm_loadl_epi64((const __m128i *)left);
   const __m256i l = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
-  const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
+  const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
   __m256i rep = _mm256_set1_epi16((short)0x8000);
   const __m256i one = _mm256_set1_epi16(1);
   const __m256i top = get_top_vector(above);
@@ -778,7 +778,7 @@
 void aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above, const uint8_t *left) {
   const __m256i l = get_left_vector(left);
-  const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
+  const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
   __m256i rep = _mm256_set1_epi16((short)0x8000);
   const __m256i one = _mm256_set1_epi16(1);
   const __m256i top = get_top_vector(above);
@@ -797,7 +797,7 @@
 void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above, const uint8_t *left) {
   __m256i l = get_left_vector(left);
-  const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
+  const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
   __m256i rep = _mm256_set1_epi16((short)0x8000);
   const __m256i one = _mm256_set1_epi16(1);
   const __m256i top = get_top_vector(above);
@@ -826,7 +826,7 @@
 
 void aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above, const uint8_t *left) {
-  const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
+  const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
   const __m256i one = _mm256_set1_epi16(1);
   const __m256i top = get_top_vector(above);
 
@@ -864,7 +864,7 @@
   const __m256i l = get_left_vector(left);
   const __m256i t0 = get_top_vector(above);
   const __m256i t1 = get_top_vector(above + 16);
-  const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
+  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
   __m256i rep = _mm256_set1_epi16((short)0x8000);
   const __m256i one = _mm256_set1_epi16(1);
 
@@ -886,7 +886,7 @@
   __m256i l = get_left_vector(left);
   const __m256i t0 = get_top_vector(above);
   const __m256i t1 = get_top_vector(above + 16);
-  const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
+  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
   __m256i rep = _mm256_set1_epi16((short)0x8000);
   const __m256i one = _mm256_set1_epi16(1);
 
@@ -924,7 +924,7 @@
                                     const uint8_t *above, const uint8_t *left) {
   const __m256i t0 = get_top_vector(above);
   const __m256i t1 = get_top_vector(above + 16);
-  const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
+  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
   const __m256i one = _mm256_set1_epi16(1);
 
   int i, j;
@@ -952,7 +952,7 @@
   const __m256i t1 = get_top_vector(above + 16);
   const __m256i t2 = get_top_vector(above + 32);
   const __m256i t3 = get_top_vector(above + 48);
-  const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
+  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
   const __m256i one = _mm256_set1_epi16(1);
 
   int i, j;
@@ -984,7 +984,7 @@
   const __m256i t1 = get_top_vector(above + 16);
   const __m256i t2 = get_top_vector(above + 32);
   const __m256i t3 = get_top_vector(above + 48);
-  const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
+  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
   const __m256i one = _mm256_set1_epi16(1);
 
   int i, j;
@@ -1016,7 +1016,7 @@
   const __m256i t1 = get_top_vector(above + 16);
   const __m256i t2 = get_top_vector(above + 32);
   const __m256i t3 = get_top_vector(above + 48);
-  const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
+  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
   const __m256i one = _mm256_set1_epi16(1);
 
   int i;
@@ -4092,7 +4092,7 @@
     __m128i resx, resy;
     __m128i resxy;
     int y = r + 1;
-    ydx = _mm256_set1_epi16((uint16_t)(y * dx));
+    ydx = _mm256_set1_epi16((int16_t)(y * dx));
 
     int base_x = (-y * dx) >> frac_bits_x;
     for (int j = 0; j < W; j += 16) {
diff --git a/aom_dsp/x86/intrapred_sse2.c b/aom_dsp/x86/intrapred_sse2.c
index ccbce1e..61e2973 100644
--- a/aom_dsp/x86/intrapred_sse2.c
+++ b/aom_dsp/x86/intrapred_sse2.c
@@ -313,7 +313,7 @@
                                    const uint8_t *above, const uint8_t *left) {
   (void)left;
   __m128i sum_above = dc_sum_4(above);
-  const __m128i two = _mm_set1_epi16((int16_t)2);
+  const __m128i two = _mm_set1_epi16(2);
   sum_above = _mm_add_epi16(sum_above, two);
   sum_above = _mm_srai_epi16(sum_above, 2);
   sum_above = _mm_shufflelo_epi16(sum_above, 0);
@@ -327,7 +327,7 @@
                                     const uint8_t *above, const uint8_t *left) {
   (void)left;
   __m128i sum_above = dc_sum_4(above);
-  const __m128i two = _mm_set1_epi16((int16_t)2);
+  const __m128i two = _mm_set1_epi16(2);
   sum_above = _mm_add_epi16(sum_above, two);
   sum_above = _mm_srai_epi16(sum_above, 2);
   sum_above = _mm_shufflelo_epi16(sum_above, 0);
@@ -341,7 +341,7 @@
                                    const uint8_t *above, const uint8_t *left) {
   (void)left;
   __m128i sum_above = dc_sum_8(above);
-  const __m128i four = _mm_set1_epi16((uint16_t)4);
+  const __m128i four = _mm_set1_epi16(4);
   sum_above = _mm_add_epi16(sum_above, four);
   sum_above = _mm_srai_epi16(sum_above, 3);
   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
@@ -353,7 +353,7 @@
                                     const uint8_t *above, const uint8_t *left) {
   (void)left;
   __m128i sum_above = dc_sum_8(above);
-  const __m128i four = _mm_set1_epi16((uint16_t)4);
+  const __m128i four = _mm_set1_epi16(4);
   sum_above = _mm_add_epi16(sum_above, four);
   sum_above = _mm_srai_epi16(sum_above, 3);
   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
@@ -365,7 +365,7 @@
                                     const uint8_t *above, const uint8_t *left) {
   (void)left;
   __m128i sum_above = dc_sum_8(above);
-  const __m128i four = _mm_set1_epi16((uint16_t)4);
+  const __m128i four = _mm_set1_epi16(4);
   sum_above = _mm_add_epi16(sum_above, four);
   sum_above = _mm_srai_epi16(sum_above, 3);
   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
@@ -377,7 +377,7 @@
                                     const uint8_t *above, const uint8_t *left) {
   (void)left;
   __m128i sum_above = dc_sum_16_sse2(above);
-  const __m128i eight = _mm_set1_epi16((uint16_t)8);
+  const __m128i eight = _mm_set1_epi16(8);
   sum_above = _mm_add_epi16(sum_above, eight);
   sum_above = _mm_srai_epi16(sum_above, 4);
   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
@@ -390,7 +390,7 @@
                                     const uint8_t *above, const uint8_t *left) {
   (void)left;
   __m128i sum_above = dc_sum_16_sse2(above);
-  const __m128i eight = _mm_set1_epi16((uint16_t)8);
+  const __m128i eight = _mm_set1_epi16(8);
   sum_above = _mm_add_epi16(sum_above, eight);
   sum_above = _mm_srai_epi16(sum_above, 4);
   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
@@ -404,7 +404,7 @@
                                      const uint8_t *left) {
   (void)left;
   __m128i sum_above = dc_sum_16_sse2(above);
-  const __m128i eight = _mm_set1_epi16((uint16_t)8);
+  const __m128i eight = _mm_set1_epi16(8);
   sum_above = _mm_add_epi16(sum_above, eight);
   sum_above = _mm_srai_epi16(sum_above, 4);
   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
@@ -418,7 +418,7 @@
                                      const uint8_t *left) {
   (void)left;
   __m128i sum_above = dc_sum_16_sse2(above);
-  const __m128i eight = _mm_set1_epi16((uint16_t)8);
+  const __m128i eight = _mm_set1_epi16(8);
   sum_above = _mm_add_epi16(sum_above, eight);
   sum_above = _mm_srai_epi16(sum_above, 4);
   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
@@ -431,7 +431,7 @@
                                     const uint8_t *above, const uint8_t *left) {
   (void)left;
   __m128i sum_above = dc_sum_32_sse2(above);
-  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
+  const __m128i sixteen = _mm_set1_epi16(16);
   sum_above = _mm_add_epi16(sum_above, sixteen);
   sum_above = _mm_srai_epi16(sum_above, 5);
   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
@@ -445,7 +445,7 @@
                                      const uint8_t *left) {
   (void)left;
   __m128i sum_above = dc_sum_32_sse2(above);
-  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
+  const __m128i sixteen = _mm_set1_epi16(16);
   sum_above = _mm_add_epi16(sum_above, sixteen);
   sum_above = _mm_srai_epi16(sum_above, 5);
   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
@@ -459,7 +459,7 @@
                                      const uint8_t *left) {
   (void)left;
   __m128i sum_above = dc_sum_32_sse2(above);
-  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
+  const __m128i sixteen = _mm_set1_epi16(16);
   sum_above = _mm_add_epi16(sum_above, sixteen);
   sum_above = _mm_srai_epi16(sum_above, 5);
   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
@@ -473,7 +473,7 @@
                                      const uint8_t *left) {
   (void)left;
   __m128i sum_above = dc_sum_64(above);
-  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
+  const __m128i thirtytwo = _mm_set1_epi16(32);
   sum_above = _mm_add_epi16(sum_above, thirtytwo);
   sum_above = _mm_srai_epi16(sum_above, 6);
   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
@@ -487,7 +487,7 @@
                                      const uint8_t *left) {
   (void)left;
   __m128i sum_above = dc_sum_64(above);
-  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
+  const __m128i thirtytwo = _mm_set1_epi16(32);
   sum_above = _mm_add_epi16(sum_above, thirtytwo);
   sum_above = _mm_srai_epi16(sum_above, 6);
   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
@@ -501,7 +501,7 @@
                                      const uint8_t *left) {
   (void)left;
   __m128i sum_above = dc_sum_64(above);
-  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
+  const __m128i thirtytwo = _mm_set1_epi16(32);
   sum_above = _mm_add_epi16(sum_above, thirtytwo);
   sum_above = _mm_srai_epi16(sum_above, 6);
   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
@@ -517,7 +517,7 @@
                                     const uint8_t *above, const uint8_t *left) {
   (void)above;
   __m128i sum_left = dc_sum_8(left);
-  const __m128i four = _mm_set1_epi16((uint16_t)4);
+  const __m128i four = _mm_set1_epi16(4);
   sum_left = _mm_add_epi16(sum_left, four);
   sum_left = _mm_srai_epi16(sum_left, 3);
   sum_left = _mm_shufflelo_epi16(sum_left, 0);
@@ -532,7 +532,7 @@
                                      const uint8_t *left) {
   (void)above;
   __m128i sum_left = dc_sum_16_sse2(left);
-  const __m128i eight = _mm_set1_epi16((uint16_t)8);
+  const __m128i eight = _mm_set1_epi16(8);
   sum_left = _mm_add_epi16(sum_left, eight);
   sum_left = _mm_srai_epi16(sum_left, 4);
   sum_left = _mm_shufflelo_epi16(sum_left, 0);
@@ -546,7 +546,7 @@
                                     const uint8_t *above, const uint8_t *left) {
   (void)above;
   __m128i sum_left = dc_sum_4(left);
-  const __m128i two = _mm_set1_epi16((uint16_t)2);
+  const __m128i two = _mm_set1_epi16(2);
   sum_left = _mm_add_epi16(sum_left, two);
   sum_left = _mm_srai_epi16(sum_left, 2);
   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
@@ -559,7 +559,7 @@
                                      const uint8_t *left) {
   (void)above;
   __m128i sum_left = dc_sum_16_sse2(left);
-  const __m128i eight = _mm_set1_epi16((uint16_t)8);
+  const __m128i eight = _mm_set1_epi16(8);
   sum_left = _mm_add_epi16(sum_left, eight);
   sum_left = _mm_srai_epi16(sum_left, 4);
   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
@@ -572,7 +572,7 @@
                                      const uint8_t *left) {
   (void)above;
   __m128i sum_left = dc_sum_32_sse2(left);
-  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
+  const __m128i sixteen = _mm_set1_epi16(16);
   sum_left = _mm_add_epi16(sum_left, sixteen);
   sum_left = _mm_srai_epi16(sum_left, 5);
   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
@@ -585,7 +585,7 @@
                                      const uint8_t *left) {
   (void)above;
   __m128i sum_left = dc_sum_4(left);
-  const __m128i two = _mm_set1_epi16((uint16_t)2);
+  const __m128i two = _mm_set1_epi16(2);
   sum_left = _mm_add_epi16(sum_left, two);
   sum_left = _mm_srai_epi16(sum_left, 2);
   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
@@ -599,7 +599,7 @@
                                      const uint8_t *left) {
   (void)above;
   __m128i sum_left = dc_sum_8(left);
-  const __m128i four = _mm_set1_epi16((uint16_t)4);
+  const __m128i four = _mm_set1_epi16(4);
   sum_left = _mm_add_epi16(sum_left, four);
   sum_left = _mm_srai_epi16(sum_left, 3);
   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
@@ -613,7 +613,7 @@
                                       const uint8_t *left) {
   (void)above;
   __m128i sum_left = dc_sum_32_sse2(left);
-  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
+  const __m128i sixteen = _mm_set1_epi16(16);
   sum_left = _mm_add_epi16(sum_left, sixteen);
   sum_left = _mm_srai_epi16(sum_left, 5);
   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
@@ -627,7 +627,7 @@
                                       const uint8_t *left) {
   (void)above;
   __m128i sum_left = dc_sum_64(left);
-  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
+  const __m128i thirtytwo = _mm_set1_epi16(32);
   sum_left = _mm_add_epi16(sum_left, thirtytwo);
   sum_left = _mm_srai_epi16(sum_left, 6);
   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
@@ -641,7 +641,7 @@
                                      const uint8_t *left) {
   (void)above;
   __m128i sum_left = dc_sum_8(left);
-  const __m128i four = _mm_set1_epi16((uint16_t)4);
+  const __m128i four = _mm_set1_epi16(4);
   sum_left = _mm_add_epi16(sum_left, four);
   sum_left = _mm_srai_epi16(sum_left, 3);
   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
@@ -655,7 +655,7 @@
                                       const uint8_t *left) {
   (void)above;
   __m128i sum_left = dc_sum_16_sse2(left);
-  const __m128i eight = _mm_set1_epi16((uint16_t)8);
+  const __m128i eight = _mm_set1_epi16(8);
   sum_left = _mm_add_epi16(sum_left, eight);
   sum_left = _mm_srai_epi16(sum_left, 4);
   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
@@ -669,7 +669,7 @@
                                       const uint8_t *left) {
   (void)above;
   __m128i sum_left = dc_sum_64(left);
-  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
+  const __m128i thirtytwo = _mm_set1_epi16(32);
   sum_left = _mm_add_epi16(sum_left, thirtytwo);
   sum_left = _mm_srai_epi16(sum_left, 6);
   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
@@ -683,7 +683,7 @@
                                       const uint8_t *left) {
   (void)above;
   __m128i sum_left = dc_sum_64(left);
-  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
+  const __m128i thirtytwo = _mm_set1_epi16(32);
   sum_left = _mm_add_epi16(sum_left, thirtytwo);
   sum_left = _mm_srai_epi16(sum_left, 6);
   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
@@ -697,7 +697,7 @@
                                       const uint8_t *left) {
   (void)above;
   __m128i sum_left = dc_sum_32_sse2(left);
-  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
+  const __m128i sixteen = _mm_set1_epi16(16);
   sum_left = _mm_add_epi16(sum_left, sixteen);
   sum_left = _mm_srai_epi16(sum_left, 5);
   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
@@ -711,7 +711,7 @@
                                       const uint8_t *left) {
   (void)above;
   __m128i sum_left = dc_sum_16_sse2(left);
-  const __m128i eight = _mm_set1_epi16((uint16_t)8);
+  const __m128i eight = _mm_set1_epi16(8);
   sum_left = _mm_add_epi16(sum_left, eight);
   sum_left = _mm_srai_epi16(sum_left, 4);
   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
diff --git a/aom_dsp/x86/intrapred_sse4.c b/aom_dsp/x86/intrapred_sse4.c
index f2c6188..3f72dc4 100644
--- a/aom_dsp/x86/intrapred_sse4.c
+++ b/aom_dsp/x86/intrapred_sse4.c
@@ -743,7 +743,7 @@
     __m128i resx, resy;
     __m128i resxy;
     int y = r + 1;
-    ydx = _mm_set1_epi16((uint16_t)(y * dx));
+    ydx = _mm_set1_epi16((int16_t)(y * dx));
 
     int base_x = (-y * dx) >> frac_bits_x;
     for (int j = 0; j < W; j += 16) {
diff --git a/aom_dsp/x86/intrapred_ssse3.c b/aom_dsp/x86/intrapred_ssse3.c
index 6d579d5..ab59220 100644
--- a/aom_dsp/x86/intrapred_ssse3.c
+++ b/aom_dsp/x86/intrapred_ssse3.c
@@ -47,7 +47,7 @@
   const __m128i t = _mm_loadl_epi64((const __m128i *)above);
   const __m128i zero = _mm_setzero_si128();
   const __m128i t16 = _mm_unpacklo_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
   __m128i rep = _mm_set1_epi16((short)0x8000);
   const __m128i one = _mm_set1_epi16(1);
 
@@ -68,7 +68,7 @@
   const __m128i t = _mm_loadl_epi64((const __m128i *)above);
   const __m128i zero = _mm_setzero_si128();
   const __m128i t16 = _mm_unpacklo_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
   __m128i rep = _mm_set1_epi16((short)0x8000);
   const __m128i one = _mm_set1_epi16(1);
 
@@ -89,7 +89,7 @@
   const __m128i t = _mm_cvtsi32_si128(((const int *)above)[0]);
   const __m128i zero = _mm_setzero_si128();
   const __m128i t16 = _mm_unpacklo_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
   __m128i rep = _mm_set1_epi16((short)0x8000);
   const __m128i one = _mm_set1_epi16(1);
 
@@ -109,7 +109,7 @@
   const __m128i t = _mm_loadl_epi64((const __m128i *)above);
   const __m128i zero = _mm_setzero_si128();
   const __m128i t16 = _mm_unpacklo_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
   __m128i rep = _mm_set1_epi16((short)0x8000);
   const __m128i one = _mm_set1_epi16(1);
 
@@ -130,7 +130,7 @@
   const __m128i t = _mm_loadl_epi64((const __m128i *)above);
   const __m128i zero = _mm_setzero_si128();
   const __m128i t16 = _mm_unpacklo_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
   __m128i rep = _mm_set1_epi16((short)0x8000);
   const __m128i one = _mm_set1_epi16(1);
 
@@ -151,7 +151,7 @@
   const __m128i t = _mm_loadl_epi64((const __m128i *)above);
   const __m128i zero = _mm_setzero_si128();
   const __m128i t16 = _mm_unpacklo_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
   __m128i rep = _mm_set1_epi16((short)0x8000);
   const __m128i one = _mm_set1_epi16(1);
 
@@ -171,7 +171,7 @@
   const __m128i t = _mm_loadl_epi64((const __m128i *)above);
   const __m128i zero = _mm_setzero_si128();
   const __m128i t16 = _mm_unpacklo_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
   const __m128i one = _mm_set1_epi16(1);
 
   for (int j = 0; j < 2; ++j) {
@@ -204,7 +204,7 @@
   const __m128i zero = _mm_setzero_si128();
   const __m128i top0 = _mm_unpacklo_epi8(t, zero);
   const __m128i top1 = _mm_unpackhi_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
   __m128i rep = _mm_set1_epi16((short)0x8000);
   const __m128i one = _mm_set1_epi16(1);
 
@@ -225,7 +225,7 @@
   const __m128i zero = _mm_setzero_si128();
   const __m128i top0 = _mm_unpacklo_epi8(t, zero);
   const __m128i top1 = _mm_unpackhi_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
   __m128i rep = _mm_set1_epi16((short)0x8000);
   const __m128i one = _mm_set1_epi16(1);
 
@@ -248,7 +248,7 @@
   const __m128i zero = _mm_setzero_si128();
   const __m128i top0 = _mm_unpacklo_epi8(t, zero);
   const __m128i top1 = _mm_unpackhi_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
   __m128i rep = _mm_set1_epi16((short)0x8000);
   const __m128i one = _mm_set1_epi16(1);
 
@@ -271,7 +271,7 @@
   const __m128i zero = _mm_setzero_si128();
   const __m128i top0 = _mm_unpacklo_epi8(t, zero);
   const __m128i top1 = _mm_unpackhi_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
   __m128i rep = _mm_set1_epi16((short)0x8000);
   const __m128i one = _mm_set1_epi16(1);
   __m128i l16;
@@ -305,7 +305,7 @@
   const __m128i zero = _mm_setzero_si128();
   const __m128i top0 = _mm_unpacklo_epi8(t, zero);
   const __m128i top1 = _mm_unpackhi_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
   const __m128i one = _mm_set1_epi16(1);
 
   for (int j = 0; j < 4; ++j) {
@@ -331,7 +331,7 @@
   const __m128i bl = _mm_unpacklo_epi8(b, zero);
   const __m128i bh = _mm_unpackhi_epi8(b, zero);
 
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
   __m128i rep = _mm_set1_epi16((short)0x8000);
   const __m128i one = _mm_set1_epi16(1);
   const __m128i l = _mm_loadl_epi64((const __m128i *)left);
@@ -360,7 +360,7 @@
   const __m128i bl = _mm_unpacklo_epi8(b, zero);
   const __m128i bh = _mm_unpackhi_epi8(b, zero);
 
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
   __m128i rep = _mm_set1_epi16((short)0x8000);
   const __m128i one = _mm_set1_epi16(1);
   __m128i l = _mm_load_si128((const __m128i *)left);
@@ -390,7 +390,7 @@
   const __m128i bl = _mm_unpacklo_epi8(b, zero);
   const __m128i bh = _mm_unpackhi_epi8(b, zero);
 
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
   __m128i rep = _mm_set1_epi16((short)0x8000);
   const __m128i one = _mm_set1_epi16(1);
   __m128i l = _mm_load_si128((const __m128i *)left);
@@ -433,7 +433,7 @@
   const __m128i bl = _mm_unpacklo_epi8(b, zero);
   const __m128i bh = _mm_unpackhi_epi8(b, zero);
 
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
   const __m128i one = _mm_set1_epi16(1);
   __m128i l16;
 
@@ -471,7 +471,7 @@
   const __m128i dl = _mm_unpacklo_epi8(d, zero);
   const __m128i dh = _mm_unpackhi_epi8(d, zero);
 
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
   const __m128i one = _mm_set1_epi16(1);
   __m128i l16;
 
@@ -513,7 +513,7 @@
   const __m128i dl = _mm_unpacklo_epi8(d, zero);
   const __m128i dh = _mm_unpackhi_epi8(d, zero);
 
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
   const __m128i one = _mm_set1_epi16(1);
   __m128i l16;
 
@@ -555,7 +555,7 @@
   const __m128i dl = _mm_unpacklo_epi8(d, zero);
   const __m128i dh = _mm_unpackhi_epi8(d, zero);
 
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
   const __m128i one = _mm_set1_epi16(1);
   __m128i l16;
 
@@ -594,9 +594,9 @@
   else
     pixels[1] = _mm_loadu_si128(((const __m128i *)left));
 
-  pixels[2] = _mm_set1_epi16((uint16_t)above[3]);
+  pixels[2] = _mm_set1_epi16((int16_t)above[3]);
 
-  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
+  const __m128i bp = _mm_set1_epi16((int16_t)left[height - 1]);
   const __m128i zero = _mm_setzero_si128();
   d = _mm_unpacklo_epi8(d, zero);
   pixels[0] = _mm_unpacklo_epi16(d, bp);
@@ -610,7 +610,7 @@
 static INLINE void load_weight_w4(int height, __m128i *weight_h,
                                   __m128i *weight_w) {
   const __m128i zero = _mm_setzero_si128();
-  const __m128i d = _mm_set1_epi16((uint16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
+  const __m128i d = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
   const __m128i t = _mm_cvtsi32_si128(((const int *)smooth_weights)[0]);
   weight_h[0] = _mm_unpacklo_epi8(t, zero);
   weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
@@ -711,13 +711,13 @@
 static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left,
                                  int height, __m128i *pixels) {
   const __m128i zero = _mm_setzero_si128();
-  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
+  const __m128i bp = _mm_set1_epi16((int16_t)left[height - 1]);
   __m128i d = _mm_loadl_epi64((const __m128i *)above);
   d = _mm_unpacklo_epi8(d, zero);
   pixels[0] = _mm_unpacklo_epi16(d, bp);
   pixels[1] = _mm_unpackhi_epi16(d, bp);
 
-  pixels[3] = _mm_set1_epi16((uint16_t)above[7]);
+  pixels[3] = _mm_set1_epi16((int16_t)above[7]);
 
   if (height == 4) {
     pixels[2] = _mm_cvtsi32_si128(((const int *)left)[0]);
@@ -750,7 +750,7 @@
   const int we_offset = height < 8 ? 0 : 4;
   __m128i we = _mm_loadu_si128((const __m128i *)&smooth_weights[we_offset]);
   weight_h[0] = _mm_unpacklo_epi8(we, zero);
-  const __m128i d = _mm_set1_epi16((uint16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
+  const __m128i d = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
   weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
 
   if (height == 4) {
@@ -891,7 +891,7 @@
   const uint8_t *const sm_weights_h = smooth_weights + bh - 4;
   const __m128i zero = _mm_setzero_si128();
   const __m128i scale_value =
-      _mm_set1_epi16((uint16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
+      _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
   const __m128i bottom_left = _mm_cvtsi32_si128(left[bh - 1]);
   const __m128i dup16 = _mm_set1_epi32(0x01000100);
   const __m128i top_right =
@@ -1024,7 +1024,7 @@
                                    int height, __m128i *pixels) {
   const __m128i zero = _mm_setzero_si128();
   __m128i d = _mm_cvtsi32_si128(((const int *)above)[0]);
-  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
+  const __m128i bp = _mm_set1_epi16((int16_t)left[height - 1]);
   d = _mm_unpacklo_epi8(d, zero);
   pixels[0] = _mm_unpacklo_epi16(d, bp);
 }
@@ -1033,7 +1033,7 @@
 // weights[1]: scale - weights_h vector
 static INLINE void load_weight_v_w4(int height, __m128i *weights) {
   const __m128i zero = _mm_setzero_si128();
-  const __m128i d = _mm_set1_epi16((uint16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
+  const __m128i d = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
 
   if (height == 4) {
     const __m128i weight = _mm_cvtsi32_si128(((const int *)smooth_weights)[0]);
@@ -1120,7 +1120,7 @@
                                    int height, __m128i *pixels) {
   const __m128i zero = _mm_setzero_si128();
   __m128i d = _mm_loadl_epi64((const __m128i *)above);
-  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
+  const __m128i bp = _mm_set1_epi16((int16_t)left[height - 1]);
   d = _mm_unpacklo_epi8(d, zero);
   pixels[0] = _mm_unpacklo_epi16(d, bp);
   pixels[1] = _mm_unpackhi_epi16(d, bp);
@@ -1136,7 +1136,7 @@
 // weight_h[7]: same as [1], offset 24
 static INLINE void load_weight_v_w8(int height, __m128i *weight_h) {
   const __m128i zero = _mm_setzero_si128();
-  const __m128i d = _mm_set1_epi16((uint16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
+  const __m128i d = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
 
   if (height < 16) {
     const int offset = height < 8 ? 0 : 4;
@@ -1260,7 +1260,7 @@
   const uint8_t *const sm_weights_h = smooth_weights + bh - 4;
   const __m128i zero = _mm_setzero_si128();
   const __m128i scale_value =
-      _mm_set1_epi16((uint16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
+      _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
   const __m128i dup16 = _mm_set1_epi32(0x01000100);
   const __m128i bottom_left =
       _mm_shuffle_epi8(_mm_cvtsi32_si128(left[bh - 1]), dup16);
@@ -1383,7 +1383,7 @@
     pixels[0] = _mm_loadl_epi64(((const __m128i *)left));
   else
     pixels[0] = _mm_loadu_si128(((const __m128i *)left));
-  pixels[1] = _mm_set1_epi16((uint16_t)above[3]);
+  pixels[1] = _mm_set1_epi16((int16_t)above[3]);
 }
 
 // weights[0]: weights_w and scale - weights_w interleave vector
@@ -1393,7 +1393,7 @@
   const __m128i zero = _mm_setzero_si128();
 
   const __m128i weights_0 = _mm_unpacklo_epi8(t, zero);
-  const __m128i d = _mm_set1_epi16((uint16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
+  const __m128i d = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
   const __m128i weights_1 = _mm_sub_epi16(d, weights_0);
   weights[0] = _mm_unpacklo_epi16(weights_0, weights_1);
 }
@@ -1469,7 +1469,7 @@
 // pixels[3]: right_pred vector
 static INLINE void load_pixel_h_w8(const uint8_t *above, const uint8_t *left,
                                    int height, __m128i *pixels) {
-  pixels[1] = _mm_set1_epi16((uint16_t)above[7]);
+  pixels[1] = _mm_set1_epi16((int16_t)above[7]);
 
   if (height == 4) {
     pixels[0] = _mm_cvtsi32_si128(((const int *)left)[0]);
@@ -1489,7 +1489,7 @@
 static INLINE void load_weight_h_w8(int height, __m128i *weight_w) {
   (void)height;
   const __m128i zero = _mm_setzero_si128();
-  const __m128i d = _mm_set1_epi16((uint16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
+  const __m128i d = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
   const __m128i we = _mm_loadu_si128((const __m128i *)&smooth_weights[4]);
   const __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
   const __m128i tmp2 = _mm_sub_epi16(d, tmp1);
@@ -1591,7 +1591,7 @@
   const uint8_t *const sm_weights_w = smooth_weights + bw - 4;
   const __m128i zero = _mm_setzero_si128();
   const __m128i scale_value =
-      _mm_set1_epi16((uint16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
+      _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
   const __m128i top_right = _mm_cvtsi32_si128(above[bw - 1]);
   const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
   const __m128i pred_round =
diff --git a/aom_dsp/x86/jnt_variance_ssse3.c b/aom_dsp/x86/jnt_variance_ssse3.c
index ee88e1e..dd798ca 100644
--- a/aom_dsp/x86/jnt_variance_ssse3.c
+++ b/aom_dsp/x86/jnt_variance_ssse3.c
@@ -53,9 +53,8 @@
   const int8_t w1 = (int8_t)jcp_param->bck_offset;
   const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0,
                                  w1, w0, w1, w0);
-  const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
-  const __m128i r =
-      _mm_set_epi16(round, round, round, round, round, round, round, round);
+  const int16_t round = (int16_t)((1 << DIST_PRECISION_BITS) >> 1);
+  const __m128i r = _mm_set1_epi16(round);
 
   if (width >= 16) {
     // Read 16 pixels one row at a time
diff --git a/av1/encoder/x86/pickrst_avx2.c b/av1/encoder/x86/pickrst_avx2.c
index 86f0734..0347bca 100644
--- a/av1/encoder/x86/pickrst_avx2.c
+++ b/av1/encoder/x86/pickrst_avx2.c
@@ -88,7 +88,7 @@
         // are (effectively) used as inputs to a multiply-accumulate.
         // So if we set the extra pixel slot to 0, then it is effectively
         // ignored.
-        const __m256i kl = _mm256_cvtepu8_epi16(_mm_set1_epi16((uint16_t)D1));
+        const __m256i kl = _mm256_cvtepu8_epi16(_mm_set1_epi16((int16_t)D1));
         acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
         acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
         acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
@@ -596,7 +596,7 @@
         // are (effectively) used as inputs to a multiply-accumulate.
         // So if we set the extra pixel slot to 0, then it is effectively
         // ignored.
-        const __m256i kl = _mm256_cvtepu8_epi16(_mm_set1_epi16((uint16_t)D1));
+        const __m256i kl = _mm256_cvtepu8_epi16(_mm_set1_epi16((int16_t)D1));
         acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
         acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
         acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
diff --git a/av1/encoder/x86/pickrst_sse4.c b/av1/encoder/x86/pickrst_sse4.c
index 3d496ef..d4921bd 100644
--- a/av1/encoder/x86/pickrst_sse4.c
+++ b/av1/encoder/x86/pickrst_sse4.c
@@ -62,7 +62,7 @@
         M_int[k][l] += D1 * X1 + D2 * X2;
 
         const __m128i kl =
-            _mm_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l))));
+            _mm_cvtepu8_epi16(_mm_set1_epi16(*((int16_t *)(dgd_ijk + l))));
         acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
         acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
         acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
@@ -91,7 +91,7 @@
         // are (effectively) used as inputs to a multiply-accumulate.
         // So if we set the extra pixel slot to 0, then it is effectively
         // ignored.
-        const __m128i kl = _mm_cvtepu8_epi16(_mm_set1_epi16((uint16_t)D1));
+        const __m128i kl = _mm_cvtepu8_epi16(_mm_set1_epi16((int16_t)D1));
         acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
         acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
         acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
@@ -574,7 +574,7 @@
         M_int[k][l] += D1 * X1 + D2 * X2;
 
         const __m128i kl =
-            _mm_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l))));
+            _mm_cvtepu8_epi16(_mm_set1_epi16(*((int16_t *)(dgd_ijk + l))));
         acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
         acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
         acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
@@ -601,7 +601,7 @@
         // are (effectively) used as inputs to a multiply-accumulate.
         // So if we set the extra pixel slot to 0, then it is effectively
         // ignored.
-        const __m128i kl = _mm_cvtepu8_epi16(_mm_set1_epi16((uint16_t)D1));
+        const __m128i kl = _mm_cvtepu8_epi16(_mm_set1_epi16((int16_t)D1));
         acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
         acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
         acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
diff --git a/av1/encoder/x86/reconinter_enc_sse2.c b/av1/encoder/x86/reconinter_enc_sse2.c
index 6455bf3..d33fec7 100644
--- a/av1/encoder/x86/reconinter_enc_sse2.c
+++ b/av1/encoder/x86/reconinter_enc_sse2.c
@@ -305,13 +305,12 @@
   assert(!(width * height & 7));
   n = width * height >> 3;
 
-  const uint16_t wt0 = (uint16_t)jcp_param->fwd_offset;
-  const uint16_t wt1 = (uint16_t)jcp_param->bck_offset;
-  const __m128i w0 = _mm_set_epi16(wt0, wt0, wt0, wt0, wt0, wt0, wt0, wt0);
-  const __m128i w1 = _mm_set_epi16(wt1, wt1, wt1, wt1, wt1, wt1, wt1, wt1);
-  const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
-  const __m128i r =
-      _mm_set_epi16(round, round, round, round, round, round, round, round);
+  const int16_t wt0 = (int16_t)jcp_param->fwd_offset;
+  const int16_t wt1 = (int16_t)jcp_param->bck_offset;
+  const __m128i w0 = _mm_set1_epi16(wt0);
+  const __m128i w1 = _mm_set1_epi16(wt1);
+  const int16_t round = (int16_t)((1 << DIST_PRECISION_BITS) >> 1);
+  const __m128i r = _mm_set1_epi16(round);
 
   uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8);
   for (i = 0; i < n; i++) {
diff --git a/av1/encoder/x86/reconinter_enc_ssse3.c b/av1/encoder/x86/reconinter_enc_ssse3.c
index e3be996..df7aa95 100644
--- a/av1/encoder/x86/reconinter_enc_ssse3.c
+++ b/av1/encoder/x86/reconinter_enc_ssse3.c
@@ -52,9 +52,8 @@
   const int8_t w1 = (int8_t)jcp_param->bck_offset;
   const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0,
                                  w1, w0, w1, w0);
-  const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
-  const __m128i r =
-      _mm_set_epi16(round, round, round, round, round, round, round, round);
+  const int16_t round = (int16_t)((1 << DIST_PRECISION_BITS) >> 1);
+  const __m128i r = _mm_set1_epi16(round);
 
   for (i = 0; i < n; i++) {
     __m128i p0 = xx_loadu_128(comp_pred);