x86: normalize types used with _mm_cvtsi32_si128
When building with clang -fsanitize=integer, this fixes warnings of the form:
implicit conversion from type 'uint32_t' (aka 'unsigned int') of value
2846534572 (32-bit, unsigned) to type 'int' changed the value to
-1448432724 (32-bit, signed)
Bug: aomedia:3136
Bug: b/229626362
Change-Id: I738df84c506ac2c42acce4597ebb00306ddeff97
diff --git a/aom_dsp/simd/v128_intrinsics_x86.h b/aom_dsp/simd/v128_intrinsics_x86.h
index c404015..3cc6c02 100644
--- a/aom_dsp/simd/v128_intrinsics_x86.h
+++ b/aom_dsp/simd/v128_intrinsics_x86.h
@@ -535,57 +535,57 @@
SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) {
return _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << c)),
- _mm_sll_epi16(a, _mm_cvtsi32_si128(c)));
+ _mm_sll_epi16(a, _mm_cvtsi32_si128((int)c)));
}
SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) {
return _mm_and_si128(_mm_set1_epi8((char)(0xff >> c)),
- _mm_srl_epi16(a, _mm_cvtsi32_si128(c)));
+ _mm_srl_epi16(a, _mm_cvtsi32_si128((int)c)));
}
SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) {
- __m128i x = _mm_cvtsi32_si128(c + 8);
+ __m128i x = _mm_cvtsi32_si128((int)(c + 8));
return _mm_packs_epi16(_mm_sra_epi16(_mm_unpacklo_epi8(a, a), x),
_mm_sra_epi16(_mm_unpackhi_epi8(a, a), x));
}
SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) {
- return _mm_sll_epi16(a, _mm_cvtsi32_si128(c));
+ return _mm_sll_epi16(a, _mm_cvtsi32_si128((int)c));
}
SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) {
- return _mm_srl_epi16(a, _mm_cvtsi32_si128(c));
+ return _mm_srl_epi16(a, _mm_cvtsi32_si128((int)c));
}
SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) {
- return _mm_sra_epi16(a, _mm_cvtsi32_si128(c));
+ return _mm_sra_epi16(a, _mm_cvtsi32_si128((int)c));
}
SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) {
- return _mm_sll_epi32(a, _mm_cvtsi32_si128(c));
+ return _mm_sll_epi32(a, _mm_cvtsi32_si128((int)c));
}
SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) {
- return _mm_srl_epi32(a, _mm_cvtsi32_si128(c));
+ return _mm_srl_epi32(a, _mm_cvtsi32_si128((int)c));
}
SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) {
- return _mm_sra_epi32(a, _mm_cvtsi32_si128(c));
+ return _mm_sra_epi32(a, _mm_cvtsi32_si128((int)c));
}
SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) {
- return _mm_sll_epi64(a, _mm_cvtsi32_si128(c));
+ return _mm_sll_epi64(a, _mm_cvtsi32_si128((int)c));
}
SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) {
- return _mm_srl_epi64(a, _mm_cvtsi32_si128(c));
+ return _mm_srl_epi64(a, _mm_cvtsi32_si128((int)c));
}
SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) {
// _mm_sra_epi64 is missing in gcc?
return v128_from_64((int64_t)v64_u64(v128_high_v64(a)) >> c,
(int64_t)v64_u64(v128_low_v64(a)) >> c);
- // return _mm_sra_epi64(a, _mm_cvtsi32_si128(c));
+ // return _mm_sra_epi64(a, _mm_cvtsi32_si128((int)c));
}
/* These intrinsics require immediate values, so we must use #defines
diff --git a/aom_dsp/simd/v256_intrinsics_x86.h b/aom_dsp/simd/v256_intrinsics_x86.h
index eb5eaf0..e10846b 100644
--- a/aom_dsp/simd/v256_intrinsics_x86.h
+++ b/aom_dsp/simd/v256_intrinsics_x86.h
@@ -597,55 +597,55 @@
SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) {
return _mm256_and_si256(_mm256_set1_epi8((uint8_t)(0xff << c)),
- _mm256_sll_epi16(a, _mm_cvtsi32_si128(c)));
+ _mm256_sll_epi16(a, _mm_cvtsi32_si128((int)c)));
}
SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) {
return _mm256_and_si256(_mm256_set1_epi8((char)(0xff >> c)),
- _mm256_srl_epi16(a, _mm_cvtsi32_si128(c)));
+ _mm256_srl_epi16(a, _mm_cvtsi32_si128((int)c)));
}
SIMD_INLINE v256 v256_shr_s8(v256 a, unsigned int c) {
- __m128i x = _mm_cvtsi32_si128(c + 8);
+ __m128i x = _mm_cvtsi32_si128((int)(c + 8));
return _mm256_packs_epi16(_mm256_sra_epi16(_mm256_unpacklo_epi8(a, a), x),
_mm256_sra_epi16(_mm256_unpackhi_epi8(a, a), x));
}
SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) {
- return _mm256_sll_epi16(a, _mm_cvtsi32_si128(c));
+ return _mm256_sll_epi16(a, _mm_cvtsi32_si128((int)c));
}
SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) {
- return _mm256_srl_epi16(a, _mm_cvtsi32_si128(c));
+ return _mm256_srl_epi16(a, _mm_cvtsi32_si128((int)c));
}
SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) {
- return _mm256_sra_epi16(a, _mm_cvtsi32_si128(c));
+ return _mm256_sra_epi16(a, _mm_cvtsi32_si128((int)c));
}
SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) {
- return _mm256_sll_epi32(a, _mm_cvtsi32_si128(c));
+ return _mm256_sll_epi32(a, _mm_cvtsi32_si128((int)c));
}
SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) {
- return _mm256_srl_epi32(a, _mm_cvtsi32_si128(c));
+ return _mm256_srl_epi32(a, _mm_cvtsi32_si128((int)c));
}
SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) {
- return _mm256_sra_epi32(a, _mm_cvtsi32_si128(c));
+ return _mm256_sra_epi32(a, _mm_cvtsi32_si128((int)c));
}
SIMD_INLINE v256 v256_shl_64(v256 a, unsigned int c) {
- return _mm256_sll_epi64(a, _mm_cvtsi32_si128(c));
+ return _mm256_sll_epi64(a, _mm_cvtsi32_si128((int)c));
}
SIMD_INLINE v256 v256_shr_u64(v256 a, unsigned int c) {
- return _mm256_srl_epi64(a, _mm_cvtsi32_si128(c));
+ return _mm256_srl_epi64(a, _mm_cvtsi32_si128((int)c));
}
SIMD_INLINE v256 v256_shr_s64(v256 a, unsigned int c) {
#if defined(__AVX512VL__)
- return _mm256_sra_epi64(a, _mm_cvtsi32_si128(c));
+ return _mm256_sra_epi64(a, _mm_cvtsi32_si128((int)c));
#else
return v256_from_v128(v128_shr_s64(v256_high_v128(a), c),
v128_shr_s64(v256_low_v128(a), c));
diff --git a/aom_dsp/simd/v64_intrinsics_x86.h b/aom_dsp/simd/v64_intrinsics_x86.h
index 1f273fe..42b602b 100644
--- a/aom_dsp/simd/v64_intrinsics_x86.h
+++ b/aom_dsp/simd/v64_intrinsics_x86.h
@@ -434,41 +434,42 @@
SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) {
return _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << c)),
- _mm_sll_epi16(a, _mm_cvtsi32_si128(c)));
+ _mm_sll_epi16(a, _mm_cvtsi32_si128((int)c)));
}
SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int c) {
return _mm_and_si128(_mm_set1_epi8((char)(0xff >> c)),
- _mm_srl_epi16(a, _mm_cvtsi32_si128(c)));
+ _mm_srl_epi16(a, _mm_cvtsi32_si128((int)c)));
}
SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int c) {
return _mm_packs_epi16(
- _mm_sra_epi16(_mm_unpacklo_epi8(a, a), _mm_cvtsi32_si128(c + 8)), a);
+ _mm_sra_epi16(_mm_unpacklo_epi8(a, a), _mm_cvtsi32_si128((int)(c + 8))),
+ a);
}
SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int c) {
- return _mm_sll_epi16(a, _mm_cvtsi32_si128(c));
+ return _mm_sll_epi16(a, _mm_cvtsi32_si128((int)c));
}
SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int c) {
- return _mm_srl_epi16(a, _mm_cvtsi32_si128(c));
+ return _mm_srl_epi16(a, _mm_cvtsi32_si128((int)c));
}
SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int c) {
- return _mm_sra_epi16(a, _mm_cvtsi32_si128(c));
+ return _mm_sra_epi16(a, _mm_cvtsi32_si128((int)c));
}
SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int c) {
- return _mm_sll_epi32(a, _mm_cvtsi32_si128(c));
+ return _mm_sll_epi32(a, _mm_cvtsi32_si128((int)c));
}
SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int c) {
- return _mm_srl_epi32(a, _mm_cvtsi32_si128(c));
+ return _mm_srl_epi32(a, _mm_cvtsi32_si128((int)c));
}
SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) {
- return _mm_sra_epi32(a, _mm_cvtsi32_si128(c));
+ return _mm_sra_epi32(a, _mm_cvtsi32_si128((int)c));
}
/* These intrinsics require immediate values, so we must use #defines
diff --git a/aom_dsp/x86/intrapred_sse2.c b/aom_dsp/x86/intrapred_sse2.c
index 4786696..d42f185 100644
--- a/aom_dsp/x86/intrapred_sse2.c
+++ b/aom_dsp/x86/intrapred_sse2.c
@@ -1334,7 +1334,7 @@
const uint8_t *left, int height) {
int i = height >> 2;
do {
- __m128i left4 = _mm_cvtsi32_si128(((uint32_t *)left)[0]);
+ __m128i left4 = _mm_cvtsi32_si128(((int *)left)[0]);
left4 = _mm_unpacklo_epi8(left4, left4);
left4 = _mm_unpacklo_epi8(left4, left4);
const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
@@ -1364,7 +1364,7 @@
const uint8_t *left, int height) {
int i = height >> 2;
do {
- __m128i left4 = _mm_cvtsi32_si128(((uint32_t *)left)[0]);
+ __m128i left4 = _mm_cvtsi32_si128(((int *)left)[0]);
left4 = _mm_unpacklo_epi8(left4, left4);
left4 = _mm_unpacklo_epi8(left4, left4);
const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
diff --git a/aom_dsp/x86/intrapred_ssse3.c b/aom_dsp/x86/intrapred_ssse3.c
index a75616e..6d579d5 100644
--- a/aom_dsp/x86/intrapred_ssse3.c
+++ b/aom_dsp/x86/intrapred_ssse3.c
@@ -86,7 +86,7 @@
void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
__m128i l = _mm_load_si128((const __m128i *)left);
- const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
+ const __m128i t = _mm_cvtsi32_si128(((const int *)above)[0]);
const __m128i zero = _mm_setzero_si128();
const __m128i t16 = _mm_unpacklo_epi8(t, zero);
const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
@@ -199,7 +199,7 @@
void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- __m128i l = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
+ __m128i l = _mm_cvtsi32_si128(((const int *)left)[0]);
const __m128i t = _mm_load_si128((const __m128i *)above);
const __m128i zero = _mm_setzero_si128();
const __m128i top0 = _mm_unpacklo_epi8(t, zero);
@@ -586,9 +586,9 @@
// pixels[2]: right_pred vector
static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left,
int height, __m128i *pixels) {
- __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
+ __m128i d = _mm_cvtsi32_si128(((const int *)above)[0]);
if (height == 4)
- pixels[1] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
+ pixels[1] = _mm_cvtsi32_si128(((const int *)left)[0]);
else if (height == 8)
pixels[1] = _mm_loadl_epi64(((const __m128i *)left));
else
@@ -611,7 +611,7 @@
__m128i *weight_w) {
const __m128i zero = _mm_setzero_si128();
const __m128i d = _mm_set1_epi16((uint16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
- const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)smooth_weights)[0]);
+ const __m128i t = _mm_cvtsi32_si128(((const int *)smooth_weights)[0]);
weight_h[0] = _mm_unpacklo_epi8(t, zero);
weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
@@ -720,7 +720,7 @@
pixels[3] = _mm_set1_epi16((uint16_t)above[7]);
if (height == 4) {
- pixels[2] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
+ pixels[2] = _mm_cvtsi32_si128(((const int *)left)[0]);
} else if (height == 8) {
pixels[2] = _mm_loadl_epi64((const __m128i *)left);
} else if (height == 16) {
@@ -892,17 +892,17 @@
const __m128i zero = _mm_setzero_si128();
const __m128i scale_value =
_mm_set1_epi16((uint16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
- const __m128i bottom_left = _mm_cvtsi32_si128((uint32_t)left[bh - 1]);
+ const __m128i bottom_left = _mm_cvtsi32_si128(left[bh - 1]);
const __m128i dup16 = _mm_set1_epi32(0x01000100);
const __m128i top_right =
- _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)above[bw - 1]), dup16);
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(above[bw - 1]), dup16);
const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
const __m128i round =
_mm_set1_epi32((uint16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
for (uint32_t y = 0; y < bh; ++y) {
- const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
- const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
+ const __m128i weights_y = _mm_cvtsi32_si128(sm_weights_h[y]);
+ const __m128i left_y = _mm_cvtsi32_si128(left[y]);
const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y);
__m128i pred_scaled_bl = _mm_mullo_epi16(scale_m_weights_y, bottom_left);
const __m128i wl_y =
@@ -1023,7 +1023,7 @@
static INLINE void load_pixel_v_w4(const uint8_t *above, const uint8_t *left,
int height, __m128i *pixels) {
const __m128i zero = _mm_setzero_si128();
- __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
+ __m128i d = _mm_cvtsi32_si128(((const int *)above)[0]);
const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
d = _mm_unpacklo_epi8(d, zero);
pixels[0] = _mm_unpacklo_epi16(d, bp);
@@ -1036,8 +1036,7 @@
const __m128i d = _mm_set1_epi16((uint16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
if (height == 4) {
- const __m128i weight =
- _mm_cvtsi32_si128(((const uint32_t *)smooth_weights)[0]);
+ const __m128i weight = _mm_cvtsi32_si128(((const int *)smooth_weights)[0]);
weights[0] = _mm_unpacklo_epi8(weight, zero);
weights[1] = _mm_sub_epi16(d, weights[0]);
} else if (height == 8) {
@@ -1264,13 +1263,13 @@
_mm_set1_epi16((uint16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
const __m128i dup16 = _mm_set1_epi32(0x01000100);
const __m128i bottom_left =
- _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)left[bh - 1]), dup16);
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(left[bh - 1]), dup16);
const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
const __m128i round =
_mm_set1_epi32((uint16_t)(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)));
for (uint32_t y = 0; y < bh; ++y) {
- const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
+ const __m128i weights_y = _mm_cvtsi32_si128(sm_weights_h[y]);
const __m128i scale_m_weights_y =
_mm_shuffle_epi8(_mm_sub_epi16(scale_value, weights_y), dup16);
const __m128i wl_y =
@@ -1379,7 +1378,7 @@
static INLINE void load_pixel_h_w4(const uint8_t *above, const uint8_t *left,
int height, __m128i *pixels) {
if (height == 4)
- pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
+ pixels[0] = _mm_cvtsi32_si128(((const int *)left)[0]);
else if (height == 8)
pixels[0] = _mm_loadl_epi64(((const __m128i *)left));
else
@@ -1473,7 +1472,7 @@
pixels[1] = _mm_set1_epi16((uint16_t)above[7]);
if (height == 4) {
- pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
+ pixels[0] = _mm_cvtsi32_si128(((const int *)left)[0]);
} else if (height == 8) {
pixels[0] = _mm_loadl_epi64((const __m128i *)left);
} else if (height == 16) {
@@ -1593,13 +1592,13 @@
const __m128i zero = _mm_setzero_si128();
const __m128i scale_value =
_mm_set1_epi16((uint16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
- const __m128i top_right = _mm_cvtsi32_si128((uint32_t)above[bw - 1]);
+ const __m128i top_right = _mm_cvtsi32_si128(above[bw - 1]);
const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
const __m128i pred_round =
_mm_set1_epi32((1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)));
for (uint32_t y = 0; y < bh; ++y) {
- const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
+ const __m128i left_y = _mm_cvtsi32_si128(left[y]);
const __m128i tr_ly =
_mm_shuffle_epi32(_mm_unpacklo_epi16(top_right, left_y), 0);
diff --git a/aom_dsp/x86/masked_sad4d_ssse3.c b/aom_dsp/x86/masked_sad4d_ssse3.c
index 1235f27..799ce9e 100644
--- a/aom_dsp/x86/masked_sad4d_ssse3.c
+++ b/aom_dsp/x86/masked_sad4d_ssse3.c
@@ -153,15 +153,15 @@
_mm_storeu_si128((__m128i *)sad_array, res0);
}
-#define MASK_SAD4XH_ONE_REF(idx) \
- a = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)ref##idx), \
- _mm_cvtsi32_si128(*(uint32_t *)&ref##idx[a_stride])); \
- data = _mm_unpacklo_epi8(a, b); \
- mask = _mm_unpacklo_epi8(m, m_inv); \
- pred = _mm_maddubs_epi16(data, mask); \
- pred = xx_roundn_epu16(pred, AOM_BLEND_A64_ROUND_BITS); \
- \
- pred = _mm_packus_epi16(pred, _mm_setzero_si128()); \
+#define MASK_SAD4XH_ONE_REF(idx) \
+ a = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)ref##idx), \
+ _mm_cvtsi32_si128(*(int *)&ref##idx[a_stride])); \
+ data = _mm_unpacklo_epi8(a, b); \
+ mask = _mm_unpacklo_epi8(m, m_inv); \
+ pred = _mm_maddubs_epi16(data, mask); \
+ pred = xx_roundn_epu16(pred, AOM_BLEND_A64_ROUND_BITS); \
+ \
+ pred = _mm_packus_epi16(pred, _mm_setzero_si128()); \
res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src));
void aom_masked_sad4xhx4d_ssse3(const uint8_t *src_ptr, int src_stride,
@@ -182,15 +182,15 @@
const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
for (int y = 0; y < height; y += 2) {
- const __m128i src = _mm_unpacklo_epi32(
- _mm_cvtsi32_si128(*(uint32_t *)src_ptr),
- _mm_cvtsi32_si128(*(uint32_t *)&src_ptr[src_stride]));
+ const __m128i src =
+ _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)src_ptr),
+ _mm_cvtsi32_si128(*(int *)&src_ptr[src_stride]));
const __m128i b =
- _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)b_ptr),
- _mm_cvtsi32_si128(*(uint32_t *)&b_ptr[b_stride]));
+ _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)b_ptr),
+ _mm_cvtsi32_si128(*(int *)&b_ptr[b_stride]));
const __m128i m_copy =
- _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)m_ptr),
- _mm_cvtsi32_si128(*(uint32_t *)&m_ptr[m_stride]));
+ _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)m_ptr),
+ _mm_cvtsi32_si128(*(int *)&m_ptr[m_stride]));
__m128i m_inv = _mm_sub_epi8(mask_max, m_copy);
__m128i m = inv_mask ? m_inv : m_copy;
diff --git a/aom_dsp/x86/masked_sad_intrin_ssse3.c b/aom_dsp/x86/masked_sad_intrin_ssse3.c
index fd5352c..df3a876 100644
--- a/aom_dsp/x86/masked_sad_intrin_ssse3.c
+++ b/aom_dsp/x86/masked_sad_intrin_ssse3.c
@@ -194,18 +194,18 @@
for (y = 0; y < height; y += 2) {
// Load two rows at a time, this seems to be a bit faster
// than four rows at a time in this case.
- const __m128i src = _mm_unpacklo_epi32(
- _mm_cvtsi32_si128(*(uint32_t *)src_ptr),
- _mm_cvtsi32_si128(*(uint32_t *)&src_ptr[src_stride]));
+ const __m128i src =
+ _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)src_ptr),
+ _mm_cvtsi32_si128(*(int *)&src_ptr[src_stride]));
const __m128i a =
- _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)a_ptr),
- _mm_cvtsi32_si128(*(uint32_t *)&a_ptr[a_stride]));
+ _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)a_ptr),
+ _mm_cvtsi32_si128(*(int *)&a_ptr[a_stride]));
const __m128i b =
- _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)b_ptr),
- _mm_cvtsi32_si128(*(uint32_t *)&b_ptr[b_stride]));
+ _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)b_ptr),
+ _mm_cvtsi32_si128(*(int *)&b_ptr[b_stride]));
const __m128i m =
- _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)m_ptr),
- _mm_cvtsi32_si128(*(uint32_t *)&m_ptr[m_stride]));
+ _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)m_ptr),
+ _mm_cvtsi32_si128(*(int *)&m_ptr[m_stride]));
const __m128i m_inv = _mm_sub_epi8(mask_max, m);
const __m128i data = _mm_unpacklo_epi8(a, b);
@@ -367,9 +367,8 @@
_mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]));
// Zero-extend mask to 16 bits
const __m128i m = _mm_unpacklo_epi8(
- _mm_unpacklo_epi32(
- _mm_cvtsi32_si128(*(const uint32_t *)m_ptr),
- _mm_cvtsi32_si128(*(const uint32_t *)&m_ptr[m_stride])),
+ _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)m_ptr),
+ _mm_cvtsi32_si128(*(const int *)&m_ptr[m_stride])),
_mm_setzero_si128());
const __m128i m_inv = _mm_sub_epi16(mask_max, m);
diff --git a/aom_dsp/x86/masked_variance_intrin_ssse3.c b/aom_dsp/x86/masked_variance_intrin_ssse3.c
index ac0e576..fb39a2b 100644
--- a/aom_dsp/x86/masked_variance_intrin_ssse3.c
+++ b/aom_dsp/x86/masked_variance_intrin_ssse3.c
@@ -986,9 +986,8 @@
const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr);
const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr);
const __m128i m = _mm_unpacklo_epi8(
- _mm_unpacklo_epi32(
- _mm_cvtsi32_si128(*(const uint32_t *)m_ptr),
- _mm_cvtsi32_si128(*(const uint32_t *)&m_ptr[m_stride])),
+ _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)m_ptr),
+ _mm_cvtsi32_si128(*(const int *)&m_ptr[m_stride])),
zero);
const __m128i m_inv = _mm_sub_epi16(mask_max, m);
diff --git a/aom_dsp/x86/obmc_intrinsic_sse4.h b/aom_dsp/x86/obmc_intrinsic_sse4.h
index 5181e44..210f466 100644
--- a/aom_dsp/x86/obmc_intrinsic_sse4.h
+++ b/aom_dsp/x86/obmc_intrinsic_sse4.h
@@ -28,7 +28,7 @@
assert(IS_POWER_OF_TWO(h));
do {
- const __m128i v_p_b = _mm_cvtsi32_si128(*(const uint32_t *)(pre + n));
+ const __m128i v_p_b = _mm_cvtsi32_si128(*(const int *)(pre + n));
const __m128i v_m_d = _mm_load_si128((const __m128i *)(mask + n));
const __m128i v_w_d = _mm_load_si128((const __m128i *)(wsrc + n));
diff --git a/aom_dsp/x86/variance_avx2.c b/aom_dsp/x86/variance_avx2.c
index 7398a73..3ec81af 100644
--- a/aom_dsp/x86/variance_avx2.c
+++ b/aom_dsp/x86/variance_avx2.c
@@ -539,10 +539,10 @@
const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128());
__m256i square_result = _mm256_broadcastsi128_si256(_mm_setzero_si128());
for (int i = 0; i < h; i += 4) {
- dst0_4x8 = _mm_cvtsi32_si128(*(uint32_t const *)(&dst[(i + 0) * dstride]));
- dst1_4x8 = _mm_cvtsi32_si128(*(uint32_t const *)(&dst[(i + 1) * dstride]));
- dst2_4x8 = _mm_cvtsi32_si128(*(uint32_t const *)(&dst[(i + 2) * dstride]));
- dst3_4x8 = _mm_cvtsi32_si128(*(uint32_t const *)(&dst[(i + 3) * dstride]));
+ dst0_4x8 = _mm_cvtsi32_si128(*(int const *)(&dst[(i + 0) * dstride]));
+ dst1_4x8 = _mm_cvtsi32_si128(*(int const *)(&dst[(i + 1) * dstride]));
+ dst2_4x8 = _mm_cvtsi32_si128(*(int const *)(&dst[(i + 2) * dstride]));
+ dst3_4x8 = _mm_cvtsi32_si128(*(int const *)(&dst[(i + 3) * dstride]));
dst_16x8 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(dst0_4x8, dst1_4x8),
_mm_unpacklo_epi32(dst2_4x8, dst3_4x8));
dst_16x16 = _mm256_cvtepu8_epi16(dst_16x8);
diff --git a/aom_dsp/x86/variance_sse2.c b/aom_dsp/x86/variance_sse2.c
index a0223a9..d49d115 100644
--- a/aom_dsp/x86/variance_sse2.c
+++ b/aom_dsp/x86/variance_sse2.c
@@ -678,8 +678,8 @@
const __m128i zeros = _mm_setzero_si128();
__m128i square_result = _mm_setzero_si128();
for (int i = 0; i < h; i += 2) {
- dst0_8x8 = _mm_cvtsi32_si128(*(uint32_t const *)(&dst[(i + 0) * dstride]));
- dst1_8x8 = _mm_cvtsi32_si128(*(uint32_t const *)(&dst[(i + 1) * dstride]));
+ dst0_8x8 = _mm_cvtsi32_si128(*(int const *)(&dst[(i + 0) * dstride]));
+ dst1_8x8 = _mm_cvtsi32_si128(*(int const *)(&dst[(i + 1) * dstride]));
dst_16x8 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(dst0_8x8, dst1_8x8), zeros);
src0_16x4 = _mm_loadl_epi64((__m128i const *)(&src[(i + 0) * sstride]));
diff --git a/av1/common/x86/av1_convolve_scale_sse4.c b/av1/common/x86/av1_convolve_scale_sse4.c
index 0bdf49f..6d27b52 100644
--- a/av1/common/x86/av1_convolve_scale_sse4.c
+++ b/av1/common/x86/av1_convolve_scale_sse4.c
@@ -22,7 +22,7 @@
// av1_convolve_2d_scale_sse4_1. This version only supports 8 tap filters.
static void hfilter8(const uint8_t *src, int src_stride, int16_t *dst, int w,
int h, int subpel_x_qn, int x_step_qn,
- const InterpFilterParams *filter_params, unsigned round) {
+ const InterpFilterParams *filter_params, int round) {
const int bd = 8;
const int ntaps = 8;
@@ -260,8 +260,8 @@
// filters.
static void highbd_hfilter8(const uint16_t *src, int src_stride, int16_t *dst,
int w, int h, int subpel_x_qn, int x_step_qn,
- const InterpFilterParams *filter_params,
- unsigned round, int bd) {
+ const InterpFilterParams *filter_params, int round,
+ int bd) {
const int ntaps = 8;
src -= ntaps / 2 - 1;
diff --git a/av1/common/x86/av1_inv_txfm_ssse3.c b/av1/common/x86/av1_inv_txfm_ssse3.c
index f9bfb37..738cc98 100644
--- a/av1/common/x86/av1_inv_txfm_ssse3.c
+++ b/av1/common/x86/av1_inv_txfm_ssse3.c
@@ -2246,7 +2246,7 @@
const int step = flipud ? -1 : 1;
const __m128i zero = _mm_setzero_si128();
for (int i = 0; i < height; ++i, j += step) {
- const __m128i v = _mm_cvtsi32_si128(*((uint32_t *)(output + i * stride)));
+ const __m128i v = _mm_cvtsi32_si128(*((int *)(output + i * stride)));
__m128i u = _mm_adds_epi16(in[j], _mm_unpacklo_epi8(v, zero));
u = _mm_packus_epi16(u, zero);
*((int *)(output + i * stride)) = _mm_cvtsi128_si32(u);
diff --git a/av1/common/x86/convolve_sse2.c b/av1/common/x86/convolve_sse2.c
index 0e77822..012e75c 100644
--- a/av1/common/x86/convolve_sse2.c
+++ b/av1/common/x86/convolve_sse2.c
@@ -200,31 +200,31 @@
if (w <= 4) {
__m128i s[8], src6, res, res_round, res16;
int res_int;
- src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 6 * src_stride));
+ src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 6 * src_stride));
s[0] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 0 * src_stride)),
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)));
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 0 * src_stride)),
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)));
s[1] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)),
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)));
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)),
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)));
s[2] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)),
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)));
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)),
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)));
s[3] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)),
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)));
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)),
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)));
s[4] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)),
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)));
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)),
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)));
s[5] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)), src6);
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)), src6);
do {
s[6] = _mm_unpacklo_epi8(
- src6, _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)));
- src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 8 * src_stride));
+ src6, _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)));
+ src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 8 * src_stride));
s[7] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)), src6);
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)), src6);
res = convolve_lo_y(s + 0, coeffs);
res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift);
diff --git a/av1/common/x86/jnt_convolve_sse2.c b/av1/common/x86/jnt_convolve_sse2.c
index 581d150..ab937f9 100644
--- a/av1/common/x86/jnt_convolve_sse2.c
+++ b/av1/common/x86/jnt_convolve_sse2.c
@@ -178,31 +178,31 @@
if (w == 4) {
__m128i s[8], src6, res, res_shift;
- src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 6 * src_stride));
+ src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 6 * src_stride));
s[0] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 0 * src_stride)),
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)));
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 0 * src_stride)),
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)));
s[1] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)),
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)));
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)),
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)));
s[2] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)),
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)));
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)),
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)));
s[3] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)),
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)));
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)),
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)));
s[4] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)),
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)));
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)),
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)));
s[5] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)), src6);
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)), src6);
do {
s[6] = _mm_unpacklo_epi8(
- src6, _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)));
- src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 8 * src_stride));
+ src6, _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)));
+ src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 8 * src_stride));
s[7] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)), src6);
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)), src6);
res = convolve_lo_y(s + 0, coeffs);
res_shift = _mm_sll_epi32(res, left_shift);
diff --git a/av1/common/x86/reconinter_sse4.c b/av1/common/x86/reconinter_sse4.c
index a503532..95814b4 100644
--- a/av1/common/x86/reconinter_sse4.c
+++ b/av1/common/x86/reconinter_sse4.c
@@ -33,13 +33,13 @@
int i = 0;
if (4 == w) {
do {
- const __m128i s0A = _mm_cvtsi32_si128(*(uint32_t *)src0);
- const __m128i s0B = _mm_cvtsi32_si128(*(uint32_t *)(src0 + stride0));
+ const __m128i s0A = _mm_cvtsi32_si128(*(int *)src0);
+ const __m128i s0B = _mm_cvtsi32_si128(*(int *)(src0 + stride0));
const __m128i s0AB = _mm_unpacklo_epi32(s0A, s0B);
const __m128i s0 = _mm_cvtepu8_epi16(s0AB);
- const __m128i s1A = _mm_cvtsi32_si128(*(uint32_t *)src1);
- const __m128i s1B = _mm_cvtsi32_si128(*(uint32_t *)(src1 + stride1));
+ const __m128i s1A = _mm_cvtsi32_si128(*(int *)src1);
+ const __m128i s1B = _mm_cvtsi32_si128(*(int *)(src1 + stride1));
const __m128i s1AB = _mm_unpacklo_epi32(s1A, s1B);
const __m128i s1 = _mm_cvtepu8_epi16(s1AB);