AVX2: Fix a couple of unaligned load warnings
Change-Id: Ib46fbc3c3620730c59b607181b2763ebca88ba8c
diff --git a/third_party/SVT-AV1/convolve_avx2.h b/third_party/SVT-AV1/convolve_avx2.h
index 452a713..923cabe 100644
--- a/third_party/SVT-AV1/convolve_avx2.h
+++ b/third_party/SVT-AV1/convolve_avx2.h
@@ -18,6 +18,7 @@
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/mem_sse2.h"
static INLINE void populate_coeffs_4tap_avx2(const __m128i coeffs_128,
__m256i coeffs[2]) {
@@ -156,7 +157,7 @@
__m256i *const coeffs /* [1] */) {
const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
filter_params, subpel_q4 & SUBPEL_MASK);
- const __m128i coeffs_8 = _mm_cvtsi32_si128(*(const int32_t *)(filter + 3));
+ const __m128i coeffs_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
// right shift all filter co-efficients by 1 to reduce the bits required.
@@ -1154,9 +1155,9 @@
const __m128i coeffs[2],
__m128i s_16[4],
__m128i ss_128[2]) {
- s_16[3] = _mm_cvtsi32_si128(*(int16_t *)(src + stride));
+ s_16[3] = _mm_cvtsi32_si128(loadu_int16(src + stride));
const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
- s_16[2] = _mm_cvtsi32_si128(*(int16_t *)(src + 2 * stride));
+ s_16[2] = _mm_cvtsi32_si128(loadu_int16(src + 2 * stride));
const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[2]);
ss_128[1] = _mm_unpacklo_epi8(src23, src34);
return convolve_4tap_ssse3(ss_128, coeffs);
@@ -1167,9 +1168,9 @@
const __m128i coeffs[2],
__m128i s_32[4],
__m128i ss_128[2]) {
- s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(src + stride));
+ s_32[3] = _mm_cvtsi32_si128(loadu_int32(src + stride));
const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
- s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(src + 2 * stride));
+ s_32[2] = _mm_cvtsi32_si128(loadu_int32(src + 2 * stride));
const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[2]);
ss_128[1] = _mm_unpacklo_epi8(src23, src34);
return convolve_4tap_ssse3(ss_128, coeffs);
@@ -1208,9 +1209,9 @@
const __m128i coeffs[3],
__m128i s_16[6],
__m128i ss_128[3]) {
- s_16[5] = _mm_cvtsi32_si128(*(int16_t *)(src + 3 * stride));
+ s_16[5] = _mm_cvtsi32_si128(loadu_int16(src + 3 * stride));
const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
- s_16[4] = _mm_cvtsi32_si128(*(int16_t *)(src + 4 * stride));
+ s_16[4] = _mm_cvtsi32_si128(loadu_int16(src + 4 * stride));
const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[4]);
ss_128[2] = _mm_unpacklo_epi8(src45, src56);
return convolve_6tap_ssse3(ss_128, coeffs);
@@ -1236,9 +1237,9 @@
const __m128i coeffs[3],
__m128i s_32[6],
__m128i ss_128[3]) {
- s_32[5] = _mm_cvtsi32_si128(*(int32_t *)(src + 3 * stride));
+ s_32[5] = _mm_cvtsi32_si128(loadu_int32(src + 3 * stride));
const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
- s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(src + 4 * stride));
+ s_32[4] = _mm_cvtsi32_si128(loadu_int32(src + 4 * stride));
const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[4]);
ss_128[2] = _mm_unpacklo_epi8(src45, src56);
return convolve_6tap_ssse3(ss_128, coeffs);
@@ -1292,9 +1293,9 @@
const __m128i coeffs[4],
__m128i s_16[8],
__m128i ss_128[4]) {
- s_16[7] = _mm_cvtsi32_si128(*(int16_t *)(src + 7 * stride));
+ s_16[7] = _mm_cvtsi32_si128(loadu_int16(src + 7 * stride));
const __m128i src67 = _mm_unpacklo_epi16(s_16[6], s_16[7]);
- s_16[6] = _mm_cvtsi32_si128(*(int16_t *)(src + 8 * stride));
+ s_16[6] = _mm_cvtsi32_si128(loadu_int16(src + 8 * stride));
const __m128i src78 = _mm_unpacklo_epi16(s_16[7], s_16[6]);
ss_128[3] = _mm_unpacklo_epi8(src67, src78);
return convolve_8tap_ssse3(ss_128, coeffs);
@@ -1305,9 +1306,9 @@
const __m128i coeffs[4],
__m128i s_32[8],
__m128i ss_128[4]) {
- s_32[7] = _mm_cvtsi32_si128(*(int32_t *)(src + 7 * stride));
+ s_32[7] = _mm_cvtsi32_si128(loadu_int32(src + 7 * stride));
const __m128i src67 = _mm_unpacklo_epi32(s_32[6], s_32[7]);
- s_32[6] = _mm_cvtsi32_si128(*(int32_t *)(src + 8 * stride));
+ s_32[6] = _mm_cvtsi32_si128(loadu_int32(src + 8 * stride));
const __m128i src78 = _mm_unpacklo_epi32(s_32[7], s_32[6]);
ss_128[3] = _mm_unpacklo_epi8(src67, src78);
return convolve_8tap_ssse3(ss_128, coeffs);
@@ -1424,9 +1425,9 @@
const __m128i coeffs[1]) {
__m128i s_128[2];
- s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(src + 2));
+ s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + 2));
s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
- s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(src + 2 * 2));
+ s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * 2));
s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
const __m128i ss = _mm_unpacklo_epi16(s_128[0], s_128[1]);
return convolve16_2tap_sse2(&ss, coeffs);
@@ -1436,9 +1437,9 @@
const int16_t *const src, __m128i s_32[2]) {
__m128i s_128[2];
- s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(src + 2));
+ s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + 2));
s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
- s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(src + 2 * 2));
+ s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * 2));
s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
return _mm_add_epi16(s_128[0], s_128[1]);
}
@@ -2319,9 +2320,9 @@
if (w == 2) {
__m128i s_16[4], ss_128[2];
- s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 0 * src_stride));
- s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 1 * src_stride));
- s_16[2] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
+ s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
+ s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
+ s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
@@ -2344,9 +2345,9 @@
assert(w == 4);
- s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 0 * src_stride));
- s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 1 * src_stride));
- s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 2 * src_stride));
+ s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
+ s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
+ s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
@@ -2498,11 +2499,11 @@
if (w == 2) {
__m128i s_16[6], ss_128[3];
- s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 0 * src_stride));
- s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 1 * src_stride));
- s_16[2] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
- s_16[3] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 3 * src_stride));
- s_16[4] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 4 * src_stride));
+ s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
+ s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
+ s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
+ s_16[3] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 3 * src_stride));
+ s_16[4] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 4 * src_stride));
const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
@@ -2529,11 +2530,11 @@
assert(w == 4);
- s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 0 * src_stride));
- s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 1 * src_stride));
- s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 2 * src_stride));
- s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 3 * src_stride));
- s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 4 * src_stride));
+ s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
+ s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
+ s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
+ s_32[3] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 3 * src_stride));
+ s_32[4] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 4 * src_stride));
const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
@@ -2689,13 +2690,13 @@
if (w == 2) {
__m128i s_16[8], ss_128[4];
- s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 0 * src_stride));
- s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 1 * src_stride));
- s_16[2] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
- s_16[3] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 3 * src_stride));
- s_16[4] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 4 * src_stride));
- s_16[5] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 5 * src_stride));
- s_16[6] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 6 * src_stride));
+ s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
+ s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
+ s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
+ s_16[3] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 3 * src_stride));
+ s_16[4] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 4 * src_stride));
+ s_16[5] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 5 * src_stride));
+ s_16[6] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 6 * src_stride));
const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
@@ -2725,13 +2726,13 @@
assert(w == 4);
- s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 0 * src_stride));
- s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 1 * src_stride));
- s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 2 * src_stride));
- s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 3 * src_stride));
- s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 4 * src_stride));
- s_32[5] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 5 * src_stride));
- s_32[6] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 6 * src_stride));
+ s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
+ s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
+ s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
+ s_32[3] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 3 * src_stride));
+ s_32[4] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 4 * src_stride));
+ s_32[5] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 5 * src_stride));
+ s_32[6] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 6 * src_stride));
const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);