Fix all UBSan runtime errors on misaligned loads

Add the loadu_uint16(), loadu_uint32(), and loadu_uint64() functions to
aom_dsp/x86/mem_sse2.h. These are modeled after the loadu_uint32()
function in libvpx/vpx_dsp/x86/mem_sse2.h.

BUG=2642

Change-Id: I4c776cbb13d4f7424621d516cfab89b3de122389
diff --git a/aom_dsp/x86/mem_sse2.h b/aom_dsp/x86/mem_sse2.h index 6c82167..28fc5fe 100644 --- a/aom_dsp/x86/mem_sse2.h +++ b/aom_dsp/x86/mem_sse2.h
@@ -13,11 +13,30 @@ #define AOM_AOM_DSP_X86_MEM_SSE2_H_ #include <emmintrin.h> // SSE2 +#include <string.h> #include "config/aom_config.h" #include "aom/aom_integer.h" +static INLINE uint16_t loadu_uint16(const void *src) { + uint16_t v; + memcpy(&v, src, sizeof(v)); + return v; +} + +static INLINE uint32_t loadu_uint32(const void *src) { + uint32_t v; + memcpy(&v, src, sizeof(v)); + return v; +} + +static INLINE uint64_t loadu_uint64(const void *src) { + uint64_t v; + memcpy(&v, src, sizeof(v)); + return v; +} + static INLINE __m128i loadh_epi64(const void *const src, const __m128i s) { return _mm_castps_si128( _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src)); @@ -25,10 +44,10 @@ static INLINE __m128i load_8bit_4x4_to_1_reg_sse2(const void *const src, const int byte_stride) { - return _mm_setr_epi32(*(const int32_t *)((int8_t *)src + 0 * byte_stride), - *(const int32_t *)((int8_t *)src + 1 * byte_stride), - *(const int32_t *)((int8_t *)src + 2 * byte_stride), - *(const int32_t *)((int8_t *)src + 3 * byte_stride)); + return _mm_setr_epi32(loadu_uint32((int8_t *)src + 0 * byte_stride), + loadu_uint32((int8_t *)src + 1 * byte_stride), + loadu_uint32((int8_t *)src + 2 * byte_stride), + loadu_uint32((int8_t *)src + 3 * byte_stride)); } static INLINE __m128i load_8bit_8x2_to_1_reg_sse2(const void *const src,
diff --git a/aom_dsp/x86/variance_sse2.c b/aom_dsp/x86/variance_sse2.c index 4e2b5a1..97f71fc 100644 --- a/aom_dsp/x86/variance_sse2.c +++ b/aom_dsp/x86/variance_sse2.c
@@ -17,6 +17,7 @@ #include "config/av1_rtcd.h" #include "aom_dsp/blend.h" +#include "aom_dsp/x86/mem_sse2.h" #include "aom_dsp/x86/synonyms.h" #include "aom_ports/mem.h" @@ -42,8 +43,8 @@ } static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int stride) { - const __m128i p0 = _mm_cvtsi32_si128(*(const uint32_t *)(p + 0 * stride)); - const __m128i p1 = _mm_cvtsi32_si128(*(const uint32_t *)(p + 1 * stride)); + const __m128i p0 = _mm_cvtsi32_si128(loadu_uint32(p + 0 * stride)); + const __m128i p1 = _mm_cvtsi32_si128(loadu_uint32(p + 1 * stride)); return _mm_unpacklo_epi8(_mm_unpacklo_epi32(p0, p1), _mm_setzero_si128()); }
diff --git a/av1/encoder/x86/pickrst_avx2.c b/av1/encoder/x86/pickrst_avx2.c index f8703a2..ef70a7b 100644 --- a/av1/encoder/x86/pickrst_avx2.c +++ b/av1/encoder/x86/pickrst_avx2.c
@@ -10,6 +10,7 @@ */ #include <immintrin.h> // AVX2 +#include "aom_dsp/x86/mem_sse2.h" #include "aom_dsp/x86/synonyms.h" #include "aom_dsp/x86/synonyms_avx2.h" #include "aom_dsp/x86/transpose_sse2.h" @@ -49,7 +50,7 @@ M_int[k][l] += D1 * X1 + D2 * X2; const __m256i kl = - _mm256_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l)))); + _mm256_cvtepu8_epi16(_mm_set1_epi16(loadu_uint16(dgd_ijk + l))); acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl); acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl); acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl); @@ -181,8 +182,7 @@ // Load two u16 values from dgd_ijkl combined as a u32, // then broadcast to 8x u32 slots of a 256 - const __m256i dgd_ijkl = - _mm256_set1_epi32(*((uint32_t *)(dgd_ijk + l))); + const __m256i dgd_ijkl = _mm256_set1_epi32(loadu_uint32(dgd_ijk + l)); // dgd_ijkl = [y x y x y x y x] [y x y x y x y x] where each is a u16 acc_stat_highbd_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, @@ -285,8 +285,7 @@ // Load two u16 values from dgd_ijkl combined as a u32, // then broadcast to 8x u32 slots of a 256 - const __m256i dgd_ijkl = - _mm256_set1_epi32(*((uint32_t *)(dgd_ijk + l))); + const __m256i dgd_ijkl = _mm256_set1_epi32(loadu_uint32(dgd_ijk + l)); // dgd_ijkl = [x y x y x y x y] [x y x y x y x y] where each is a u16 acc_stat_highbd_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, @@ -406,7 +405,7 @@ M_int[k][l] += D1 * X1 + D2 * X2; const __m256i kl = - _mm256_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l)))); + _mm256_cvtepu8_epi16(_mm_set1_epi16(loadu_uint16(dgd_ijk + l))); acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl); acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl); acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
diff --git a/av1/encoder/x86/rdopt_avx2.c b/av1/encoder/x86/rdopt_avx2.c index f588bad..fefc036 100644 --- a/av1/encoder/x86/rdopt_avx2.c +++ b/av1/encoder/x86/rdopt_avx2.c
@@ -11,6 +11,7 @@ #include <assert.h> #include <immintrin.h> +#include "aom_dsp/x86/mem_sse2.h" #include "aom_dsp/x86/synonyms_avx2.h" #include "aom_ports/system_state.h" @@ -31,8 +32,8 @@ // [ m n o p ] const __m256i pixels = _mm256_set_epi64x( - *(uint64_t *)&diff[0 * stride], *(uint64_t *)&diff[1 * stride], - *(uint64_t *)&diff[2 * stride], *(uint64_t *)&diff[3 * stride]); + loadu_uint64(&diff[0 * stride]), loadu_uint64(&diff[1 * stride]), + loadu_uint64(&diff[2 * stride]), loadu_uint64(&diff[3 * stride])); // pixels = [d c b a h g f e] [l k j i p o n m] as i16 const __m256i slli = _mm256_slli_epi64(pixels, 16);