convolve_sse2: use xx_loadl_32() for unaligned int loads
This quiets some undefined behavior sanitizer (UBSan) warnings related
to unaligned loads. There are no major changes in the generated
assembly with gcc-13 (only some register changes and instruction
reordering).
Change-Id: I2e8ac7f40caec56f204440a39116745e2a9a1fe2
diff --git a/av1/common/x86/convolve_sse2.c b/av1/common/x86/convolve_sse2.c
index 6383567..4787d3f 100644
--- a/av1/common/x86/convolve_sse2.c
+++ b/av1/common/x86/convolve_sse2.c
@@ -16,6 +16,7 @@
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/convolve_common_intrin.h"
+#include "aom_dsp/x86/synonyms.h"
#include "av1/common/convolve.h"
static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
@@ -200,31 +201,23 @@
if (w <= 4) {
__m128i s[8], src6, res, res_round, res16;
int res_int;
- src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 6 * src_stride));
- s[0] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(int *)(src_ptr + 0 * src_stride)),
- _mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)));
- s[1] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)),
- _mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)));
- s[2] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)),
- _mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)));
- s[3] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)),
- _mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)));
- s[4] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)),
- _mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)));
- s[5] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)), src6);
+ src6 = xx_loadl_32(src_ptr + 6 * src_stride);
+ s[0] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 0 * src_stride),
+ xx_loadl_32(src_ptr + 1 * src_stride));
+ s[1] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 1 * src_stride),
+ xx_loadl_32(src_ptr + 2 * src_stride));
+ s[2] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 2 * src_stride),
+ xx_loadl_32(src_ptr + 3 * src_stride));
+ s[3] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 3 * src_stride),
+ xx_loadl_32(src_ptr + 4 * src_stride));
+ s[4] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 4 * src_stride),
+ xx_loadl_32(src_ptr + 5 * src_stride));
+ s[5] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 5 * src_stride), src6);
do {
- s[6] = _mm_unpacklo_epi8(
- src6, _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)));
- src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 8 * src_stride));
- s[7] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)), src6);
+ s[6] = _mm_unpacklo_epi8(src6, xx_loadl_32(src_ptr + 7 * src_stride));
+ src6 = xx_loadl_32(src_ptr + 8 * src_stride);
+ s[7] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 7 * src_stride), src6);
res = convolve_lo_y(s + 0, coeffs);
res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift);