Fix all UBSan runtime errors on misaligned loads
Add the loadu_uint16(), loadu_uint32(), and loadu_uint64() functions to
aom_dsp/x86/mem_sse2.h. These are modeled after the loadu_uint32()
function in libvpx/vpx_dsp/x86/mem_sse2.h.
BUG=2642
Change-Id: I4c776cbb13d4f7424621d516cfab89b3de122389
diff --git a/aom_dsp/x86/mem_sse2.h b/aom_dsp/x86/mem_sse2.h
index 6c82167..28fc5fe 100644
--- a/aom_dsp/x86/mem_sse2.h
+++ b/aom_dsp/x86/mem_sse2.h
@@ -13,11 +13,30 @@
#define AOM_AOM_DSP_X86_MEM_SSE2_H_
#include <emmintrin.h> // SSE2
+#include <string.h>
#include "config/aom_config.h"
#include "aom/aom_integer.h"
+static INLINE uint16_t loadu_uint16(const void *src) {
+ uint16_t v;
+ memcpy(&v, src, sizeof(v));
+ return v;
+}
+
+static INLINE uint32_t loadu_uint32(const void *src) {
+ uint32_t v;
+ memcpy(&v, src, sizeof(v));
+ return v;
+}
+
+static INLINE uint64_t loadu_uint64(const void *src) {
+ uint64_t v;
+ memcpy(&v, src, sizeof(v));
+ return v;
+}
+
static INLINE __m128i loadh_epi64(const void *const src, const __m128i s) {
return _mm_castps_si128(
_mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src));
@@ -25,10 +44,10 @@
static INLINE __m128i load_8bit_4x4_to_1_reg_sse2(const void *const src,
const int byte_stride) {
- return _mm_setr_epi32(*(const int32_t *)((int8_t *)src + 0 * byte_stride),
- *(const int32_t *)((int8_t *)src + 1 * byte_stride),
- *(const int32_t *)((int8_t *)src + 2 * byte_stride),
- *(const int32_t *)((int8_t *)src + 3 * byte_stride));
+ return _mm_setr_epi32(loadu_uint32((int8_t *)src + 0 * byte_stride),
+ loadu_uint32((int8_t *)src + 1 * byte_stride),
+ loadu_uint32((int8_t *)src + 2 * byte_stride),
+ loadu_uint32((int8_t *)src + 3 * byte_stride));
}
static INLINE __m128i load_8bit_8x2_to_1_reg_sse2(const void *const src,
diff --git a/aom_dsp/x86/variance_sse2.c b/aom_dsp/x86/variance_sse2.c
index 4e2b5a1..97f71fc 100644
--- a/aom_dsp/x86/variance_sse2.c
+++ b/aom_dsp/x86/variance_sse2.c
@@ -17,6 +17,7 @@
#include "config/av1_rtcd.h"
#include "aom_dsp/blend.h"
+#include "aom_dsp/x86/mem_sse2.h"
#include "aom_dsp/x86/synonyms.h"
#include "aom_ports/mem.h"
@@ -42,8 +43,8 @@
}
static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int stride) {
- const __m128i p0 = _mm_cvtsi32_si128(*(const uint32_t *)(p + 0 * stride));
- const __m128i p1 = _mm_cvtsi32_si128(*(const uint32_t *)(p + 1 * stride));
+ const __m128i p0 = _mm_cvtsi32_si128(loadu_uint32(p + 0 * stride));
+ const __m128i p1 = _mm_cvtsi32_si128(loadu_uint32(p + 1 * stride));
return _mm_unpacklo_epi8(_mm_unpacklo_epi32(p0, p1), _mm_setzero_si128());
}
diff --git a/av1/encoder/x86/pickrst_avx2.c b/av1/encoder/x86/pickrst_avx2.c
index f8703a2..ef70a7b 100644
--- a/av1/encoder/x86/pickrst_avx2.c
+++ b/av1/encoder/x86/pickrst_avx2.c
@@ -10,6 +10,7 @@
*/
#include <immintrin.h> // AVX2
+#include "aom_dsp/x86/mem_sse2.h"
#include "aom_dsp/x86/synonyms.h"
#include "aom_dsp/x86/synonyms_avx2.h"
#include "aom_dsp/x86/transpose_sse2.h"
@@ -49,7 +50,7 @@
M_int[k][l] += D1 * X1 + D2 * X2;
const __m256i kl =
- _mm256_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l))));
+ _mm256_cvtepu8_epi16(_mm_set1_epi16(loadu_uint16(dgd_ijk + l)));
acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
@@ -181,8 +182,7 @@
// Load two u16 values from dgd_ijkl combined as a u32,
// then broadcast to 8x u32 slots of a 256
- const __m256i dgd_ijkl =
- _mm256_set1_epi32(*((uint32_t *)(dgd_ijk + l)));
+ const __m256i dgd_ijkl = _mm256_set1_epi32(loadu_uint32(dgd_ijk + l));
// dgd_ijkl = [y x y x y x y x] [y x y x y x y x] where each is a u16
acc_stat_highbd_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle,
@@ -285,8 +285,7 @@
// Load two u16 values from dgd_ijkl combined as a u32,
// then broadcast to 8x u32 slots of a 256
- const __m256i dgd_ijkl =
- _mm256_set1_epi32(*((uint32_t *)(dgd_ijk + l)));
+ const __m256i dgd_ijkl = _mm256_set1_epi32(loadu_uint32(dgd_ijk + l));
// dgd_ijkl = [x y x y x y x y] [x y x y x y x y] where each is a u16
acc_stat_highbd_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle,
@@ -406,7 +405,7 @@
M_int[k][l] += D1 * X1 + D2 * X2;
const __m256i kl =
- _mm256_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l))));
+ _mm256_cvtepu8_epi16(_mm_set1_epi16(loadu_uint16(dgd_ijk + l)));
acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
diff --git a/av1/encoder/x86/rdopt_avx2.c b/av1/encoder/x86/rdopt_avx2.c
index f588bad..fefc036 100644
--- a/av1/encoder/x86/rdopt_avx2.c
+++ b/av1/encoder/x86/rdopt_avx2.c
@@ -11,6 +11,7 @@
#include <assert.h>
#include <immintrin.h>
+#include "aom_dsp/x86/mem_sse2.h"
#include "aom_dsp/x86/synonyms_avx2.h"
#include "aom_ports/system_state.h"
@@ -31,8 +32,8 @@
// [ m n o p ]
const __m256i pixels = _mm256_set_epi64x(
- *(uint64_t *)&diff[0 * stride], *(uint64_t *)&diff[1 * stride],
- *(uint64_t *)&diff[2 * stride], *(uint64_t *)&diff[3 * stride]);
+ loadu_uint64(&diff[0 * stride]), loadu_uint64(&diff[1 * stride]),
+ loadu_uint64(&diff[2 * stride]), loadu_uint64(&diff[3 * stride]));
// pixels = [d c b a h g f e] [l k j i p o n m] as i16
const __m256i slli = _mm256_slli_epi64(pixels, 16);