Constrain the range of immediate constants

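This improves on the solution in 8a99b5f. The SSSE3 v128_align now casts
its immediate to uint8_t, v128_shl_n_byte and v128_shr_n_byte mask their
immediate with 127, and v256_shl_n_byte passes 16 - (n) and (n)-16
to those macros instead of masking the values with 15.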

BUG=aomedia:1945

Change-Id: I6c72494544919943dbce799f2fb046b1ef33abb0
diff --git a/aom_dsp/simd/v128_intrinsics_x86.h b/aom_dsp/simd/v128_intrinsics_x86.h
index 72a2261..f9043fe 100644
--- a/aom_dsp/simd/v128_intrinsics_x86.h
+++ b/aom_dsp/simd/v128_intrinsics_x86.h
@@ -12,6 +12,7 @@
 #ifndef _V128_INTRINSICS_H
 #define _V128_INTRINSICS_H
 
+#include <stdint.h>
 #include "aom_dsp/simd/v64_intrinsics_x86.h"
 
 typedef __m128i v128;
@@ -71,7 +72,7 @@
 #endif
 #else
 #if defined(__SSSE3__)
-#define v128_align(a, b, c) ((c) ? _mm_alignr_epi8(a, b, c) : (b))
+#define v128_align(a, b, c) ((c) ? _mm_alignr_epi8(a, b, (uint8_t)(c)) : (b))
 #else
 #define v128_align(a, b, c) \
   ((c) ? _mm_or_si128(_mm_srli_si128(b, c), _mm_slli_si128(a, 16 - (c))) : (b))
@@ -588,8 +589,8 @@
 
 /* These intrinsics require immediate values, so we must use #defines
    to enforce that. */
-#define v128_shl_n_byte(a, c) _mm_slli_si128(a, c)
-#define v128_shr_n_byte(a, c) _mm_srli_si128(a, c)
+#define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c)&127)
+#define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c)&127)
 #define v128_shl_n_8(a, c) \
   _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << (c))), _mm_slli_epi16(a, c))
 #define v128_shr_n_u8(a, c) \
diff --git a/aom_dsp/simd/v256_intrinsics_x86.h b/aom_dsp/simd/v256_intrinsics_x86.h
index 6bfc0b0..05f2051 100644
--- a/aom_dsp/simd/v256_intrinsics_x86.h
+++ b/aom_dsp/simd/v256_intrinsics_x86.h
@@ -658,12 +658,12 @@
    to enforce that. */
 // _mm256_slli_si256 works on 128 bit lanes and can't be used
 #define v256_shl_n_byte(a, n)                                                \
-  ((n) < 16 ? v256_from_v128(v128_align(v256_high_v128(a), v256_low_v128(a), \
-                                        (16 - (n)) & 15),                    \
-                             v128_shl_n_byte(v256_low_v128(a), n))           \
+  ((n) < 16 ? v256_from_v128(                                                \
+                  v128_align(v256_high_v128(a), v256_low_v128(a), 16 - (n)), \
+                  v128_shl_n_byte(v256_low_v128(a), n))                      \
             : _mm256_inserti128_si256(                                       \
                   _mm256_setzero_si256(),                                    \
-                  v128_shl_n_byte(v256_low_v128(a), (n)&15), 1))
+                  v128_shl_n_byte(v256_low_v128(a), (n)-16), 1))
 
 // _mm256_srli_si256 works on 128 bit lanes and can't be used
 #define v256_shr_n_byte(a, n)                                                \