Fix thread sanitizer issue in hbd modules
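Fix a read/write data race on dst.buf for width=4 blocks: each row is now loaded with the 64-bit _mm_loadl_epi64 instead of the 128-bit _mm_loadu_si128, so only the four 16-bit samples belonging to the row are read.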
Change-Id: Iaf6a8ed06f86a179ba331205ea26239ccbee14fe
diff --git a/aom_dsp/x86/highbd_subtract_sse2.c b/aom_dsp/x86/highbd_subtract_sse2.c
index 18eb03d..b72d1cf 100644
--- a/aom_dsp/x86/highbd_subtract_sse2.c
+++ b/aom_dsp/x86/highbd_subtract_sse2.c
@@ -29,15 +29,15 @@
__m128i x0, x1, x2, x3;
int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride);
- u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
- u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
- u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
- u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
+ u0 = _mm_loadl_epi64((__m128i const *)(src + 0 * src_stride));
+ u1 = _mm_loadl_epi64((__m128i const *)(src + 1 * src_stride));
+ u2 = _mm_loadl_epi64((__m128i const *)(src + 2 * src_stride));
+ u3 = _mm_loadl_epi64((__m128i const *)(src + 3 * src_stride));
- v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride));
- v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride));
- v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride));
- v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride));
+ v0 = _mm_loadl_epi64((__m128i const *)(pred + 0 * pred_stride));
+ v1 = _mm_loadl_epi64((__m128i const *)(pred + 1 * pred_stride));
+ v2 = _mm_loadl_epi64((__m128i const *)(pred + 2 * pred_stride));
+ v3 = _mm_loadl_epi64((__m128i const *)(pred + 3 * pred_stride));
x0 = _mm_sub_epi16(u0, v0);
x1 = _mm_sub_epi16(u1, v1);
@@ -61,23 +61,23 @@
__m128i x0, x1, x2, x3, x4, x5, x6, x7;
int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride);
- u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
- u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
- u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
- u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
- u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride));
- u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride));
- u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride));
- u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride));
+ u0 = _mm_loadl_epi64((__m128i const *)(src + 0 * src_stride));
+ u1 = _mm_loadl_epi64((__m128i const *)(src + 1 * src_stride));
+ u2 = _mm_loadl_epi64((__m128i const *)(src + 2 * src_stride));
+ u3 = _mm_loadl_epi64((__m128i const *)(src + 3 * src_stride));
+ u4 = _mm_loadl_epi64((__m128i const *)(src + 4 * src_stride));
+ u5 = _mm_loadl_epi64((__m128i const *)(src + 5 * src_stride));
+ u6 = _mm_loadl_epi64((__m128i const *)(src + 6 * src_stride));
+ u7 = _mm_loadl_epi64((__m128i const *)(src + 7 * src_stride));
- v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride));
- v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride));
- v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride));
- v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride));
- v4 = _mm_loadu_si128((__m128i const *)(pred + 4 * pred_stride));
- v5 = _mm_loadu_si128((__m128i const *)(pred + 5 * pred_stride));
- v6 = _mm_loadu_si128((__m128i const *)(pred + 6 * pred_stride));
- v7 = _mm_loadu_si128((__m128i const *)(pred + 7 * pred_stride));
+ v0 = _mm_loadl_epi64((__m128i const *)(pred + 0 * pred_stride));
+ v1 = _mm_loadl_epi64((__m128i const *)(pred + 1 * pred_stride));
+ v2 = _mm_loadl_epi64((__m128i const *)(pred + 2 * pred_stride));
+ v3 = _mm_loadl_epi64((__m128i const *)(pred + 3 * pred_stride));
+ v4 = _mm_loadl_epi64((__m128i const *)(pred + 4 * pred_stride));
+ v5 = _mm_loadl_epi64((__m128i const *)(pred + 5 * pred_stride));
+ v6 = _mm_loadl_epi64((__m128i const *)(pred + 6 * pred_stride));
+ v7 = _mm_loadl_epi64((__m128i const *)(pred + 7 * pred_stride));
x0 = _mm_sub_epi16(u0, v0);
x1 = _mm_sub_epi16(u1, v1);