Change 64-bit reads to 32-bit reads

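_mm_cvtepu8_epi32 only consumes the low 4 bytes of its operand, so the
8-byte _mm_loadl_epi64 loads fetched 4 bytes that were never used and
could fall outside the source buffer. Loading 32 bits instead avoids
this out-of-range memory access.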

BUG=aomedia:391

Change-Id: I211ba1992d4f51fcf06cf2961d8cb2606b836207
diff --git a/av1/common/x86/selfguided_sse4.c b/av1/common/x86/selfguided_sse4.c
index b61991f..2f0ace0 100644
--- a/av1/common/x86/selfguided_sse4.c
+++ b/av1/common/x86/selfguided_sse4.c
@@ -274,9 +274,9 @@
       _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
 
       x = _mm_cvtepu8_epi32(
-          _mm_loadl_epi64((__m128i *)&src[(i - 2) * src_stride + j]));
+          _mm_cvtsi32_si128(*((int *)&src[(i - 2) * src_stride + j])));
       y = _mm_cvtepu8_epi32(
-          _mm_loadl_epi64((__m128i *)&src[(i + 3) * src_stride + j]));
+          _mm_cvtsi32_si128(*((int *)&src[(i + 3) * src_stride + j])));
 
       sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x));
 
@@ -484,9 +484,9 @@
       _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
 
       x = _mm_cvtepu8_epi32(
-          _mm_loadl_epi64((__m128i *)&src[(i - 3) * src_stride + j]));
+          _mm_cvtsi32_si128(*((int *)&src[(i - 3) * src_stride + j])));
       y = _mm_cvtepu8_epi32(
-          _mm_loadl_epi64((__m128i *)&src[(i + 4) * src_stride + j]));
+          _mm_cvtsi32_si128(*((int *)&src[(i + 4) * src_stride + j])));
 
       sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x));