Avoid UB from misaligned loads in variance_sse2.c

The undefined behaviour came from READ64, whose two 32-bit loads were
written as plain dereferences of cast pointers. Each load happens to
compile to a MOVD, which tolerates misalignment, but dereferencing a
misaligned uint32_t pointer is undefined behaviour in C regardless of
the instruction the compiler emits. This patch rewrites READ64, and
the other loads and stores in the file, to use the xx_* functions
from synonyms.h.
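
For reference, helpers in the style of synonyms.h sidestep the UB by
routing narrow loads through memcpy, or through intrinsics that are
specified for unaligned operands. A minimal sketch of the two shapes
this patch relies on (assumed implementations; the authoritative
definitions are in synonyms.h and may differ):

    #include <emmintrin.h>  // SSE2 intrinsics
    #include <stdint.h>
    #include <string.h>

    // 4-byte load: memcpy is defined for any alignment, and optimizing
    // compilers still collapse it to a single MOVD.
    static inline __m128i xx_loadl_32(const void *a) {
      uint32_t v;
      memcpy(&v, a, sizeof(v));
      return _mm_cvtsi32_si128((int)v);
    }

    // 8-byte load: _mm_loadl_epi64 places no alignment requirement on
    // its operand, so wrapping the intrinsic is enough.
    static inline __m128i xx_loadl_64(const void *a) {
      return _mm_loadl_epi64((const __m128i *)a);
    }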

BUG=aomedia:912

Change-Id: Ic2fae623ef3b609dacd0a830a7cc63653291202f
diff --git a/aom_dsp/x86/variance_sse2.c b/aom_dsp/x86/variance_sse2.c
index d1fac6e..5add56a 100644
--- a/aom_dsp/x86/variance_sse2.c
+++ b/aom_dsp/x86/variance_sse2.c
@@ -15,6 +15,8 @@
 #include "./aom_config.h"
 #include "./aom_dsp_rtcd.h"
 
+#include "aom_dsp/x86/synonyms.h"
+
 #include "aom_ports/mem.h"
 
 #include "./av1_rtcd.h"
@@ -29,7 +31,7 @@
   int i;
 
   for (i = 0; i < 32; ++i) {
-    const __m128i v = _mm_loadu_si128((const __m128i *)src);
+    const __m128i v = xx_loadu_128(src);
     vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
     src += 8;
   }
@@ -39,19 +41,22 @@
   return _mm_cvtsi128_si32(vsum);
 }
 
-#define READ64(p, stride, i)                                  \
-  _mm_unpacklo_epi8(                                          \
-      _mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \
-      _mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride)))
+// Read 4 samples from each of row and row + 1. Interleave the two rows,
+// then zero-extend the 8 interleaved bytes to eight 16-bit samples that
+// fill the SSE register.
+static __m128i read64(const uint8_t *p, int stride, int row) {
+  __m128i row0 = xx_loadl_32(p + (row + 0) * stride);
+  __m128i row1 = xx_loadl_32(p + (row + 1) * stride);
+  return _mm_unpacklo_epi8(_mm_unpacklo_epi8(row0, row1), _mm_setzero_si128());
+}
 
 static void get4x4var_sse2(const uint8_t *src, int src_stride,
                            const uint8_t *ref, int ref_stride,
                            unsigned int *sse, int *sum) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero);
-  const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero);
-  const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero);
-  const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero);
+  const __m128i src0 = read64(src, src_stride, 0);
+  const __m128i src1 = read64(src, src_stride, 2);
+  const __m128i ref0 = read64(ref, ref_stride, 0);
+  const __m128i ref1 = read64(ref, ref_stride, 2);
   const __m128i diff0 = _mm_sub_epi16(src0, ref0);
   const __m128i diff1 = _mm_sub_epi16(src1, ref1);
 
@@ -78,16 +83,16 @@
   int i;
 
   for (i = 0; i < 8; i += 2) {
-    const __m128i src0 = _mm_unpacklo_epi8(
-        _mm_loadl_epi64((const __m128i *)(src + i * src_stride)), zero);
-    const __m128i ref0 = _mm_unpacklo_epi8(
-        _mm_loadl_epi64((const __m128i *)(ref + i * ref_stride)), zero);
+    const __m128i src0 =
+        _mm_unpacklo_epi8(xx_loadl_64(src + i * src_stride), zero);
+    const __m128i ref0 =
+        _mm_unpacklo_epi8(xx_loadl_64(ref + i * ref_stride), zero);
     const __m128i diff0 = _mm_sub_epi16(src0, ref0);
 
-    const __m128i src1 = _mm_unpacklo_epi8(
-        _mm_loadl_epi64((const __m128i *)(src + (i + 1) * src_stride)), zero);
-    const __m128i ref1 = _mm_unpacklo_epi8(
-        _mm_loadl_epi64((const __m128i *)(ref + (i + 1) * ref_stride)), zero);
+    const __m128i src1 =
+        _mm_unpacklo_epi8(xx_loadl_64(src + (i + 1) * src_stride), zero);
+    const __m128i ref1 =
+        _mm_unpacklo_epi8(xx_loadl_64(ref + (i + 1) * ref_stride), zero);
     const __m128i diff1 = _mm_sub_epi16(src1, ref1);
 
     vsum = _mm_add_epi16(vsum, diff0);
@@ -117,8 +122,8 @@
   int i;
 
   for (i = 0; i < 16; ++i) {
-    const __m128i s = _mm_loadu_si128((const __m128i *)src);
-    const __m128i r = _mm_loadu_si128((const __m128i *)ref);
+    const __m128i s = xx_loadu_128(src);
+    const __m128i r = xx_loadu_128(ref);
 
     const __m128i src0 = _mm_unpacklo_epi8(s, zero);
     const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
@@ -604,8 +609,7 @@
       for (i = 0; i < height; i++) {
         int j;
         for (j = 0; j < width; j += 16) {
-          __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
-          _mm_storeu_si128((__m128i *)comp_pred, s0);
+          xx_storeu_128(comp_pred, xx_loadu_128(ref));
           comp_pred += 16;
           ref += 16;
         }
@@ -617,10 +621,9 @@
       assert(!(height & 1));
       /*Read 8 pixels two rows at a time.*/
       for (i = 0; i < height; i += 2) {
-        __m128i s0 = _mm_loadl_epi64((const __m128i *)ref);
-        __m128i s1 = _mm_loadl_epi64((const __m128i *)(ref + ref_stride));
-        __m128i t0 = _mm_unpacklo_epi64(s0, s1);
-        _mm_storeu_si128((__m128i *)comp_pred, t0);
+        __m128i s0 = xx_loadl_64(ref + 0 * ref_stride);
+        __m128i s1 = xx_loadl_64(ref + 1 * ref_stride);
+        xx_storeu_128(comp_pred, _mm_unpacklo_epi64(s0, s1));
         comp_pred += 16;
         ref += 2 * ref_stride;
       }
@@ -630,16 +633,13 @@
       assert(!(height & 3));
       /*Read 4 pixels four rows at a time.*/
        for (i = 0; i < height; i += 4) {
-        __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref);
-        __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + ref_stride));
-        __m128i s2 =
-            _mm_cvtsi32_si128(*(const uint32_t *)(ref + 2 * ref_stride));
-        __m128i s3 =
-            _mm_cvtsi32_si128(*(const uint32_t *)(ref + 3 * ref_stride));
-        __m128i t0 = _mm_unpacklo_epi32(s0, s1);
-        __m128i t1 = _mm_unpacklo_epi32(s2, s3);
-        __m128i u0 = _mm_unpacklo_epi64(t0, t1);
-        _mm_storeu_si128((__m128i *)comp_pred, u0);
+        const __m128i row0 = xx_loadl_32(ref + 0 * ref_stride);
+        const __m128i row1 = xx_loadl_32(ref + 1 * ref_stride);
+        const __m128i row2 = xx_loadl_32(ref + 2 * ref_stride);
+        const __m128i row3 = xx_loadl_32(ref + 3 * ref_stride);
+        const __m128i reg = _mm_unpacklo_epi64(_mm_unpacklo_epi32(row0, row1),
+                                               _mm_unpacklo_epi32(row2, row3));
+        xx_storeu_128(comp_pred, reg);
         comp_pred += 16;
         ref += 4 * ref_stride;
       }
@@ -690,9 +690,9 @@
   assert(!(width * height & 15));
   n = width * height >> 4;
   for (i = 0; i < n; i++) {
-    __m128i s0 = _mm_loadu_si128((const __m128i *)comp_pred);
-    __m128i p0 = _mm_loadu_si128((const __m128i *)pred);
-    _mm_storeu_si128((__m128i *)comp_pred, _mm_avg_epu8(s0, p0));
+    __m128i s0 = xx_loadu_128(comp_pred);
+    __m128i p0 = xx_loadu_128(pred);
+    xx_storeu_128(comp_pred, _mm_avg_epu8(s0, p0));
     comp_pred += 16;
     pred += 16;
   }
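
Not part of the patch: a standalone sketch showing the lane order the
new read64() produces; loadl_32() below is a hypothetical stand-in for
xx_loadl_32.

    #include <emmintrin.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static __m128i loadl_32(const void *a) {  // stand-in for xx_loadl_32
      uint32_t v;
      memcpy(&v, a, sizeof(v));  // alignment-safe
      return _mm_cvtsi32_si128((int)v);
    }

    // Same shape as the patched read64().
    static __m128i read64(const uint8_t *p, int stride, int row) {
      const __m128i row0 = loadl_32(p + (row + 0) * stride);
      const __m128i row1 = loadl_32(p + (row + 1) * stride);
      return _mm_unpacklo_epi8(_mm_unpacklo_epi8(row0, row1),
                               _mm_setzero_si128());
    }

    int main(void) {
      // Two 4-pixel rows with stride 4: row 0 = {1,2,3,4}, row 1 = {5,6,7,8}.
      const uint8_t px[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
      int16_t out[8];
      _mm_storeu_si128((__m128i *)out, read64(px, 4, 0));
      // Prints "1 5 2 6 3 7 4 8": the rows interleaved, widened to 16 bits.
      for (int i = 0; i < 8; ++i) printf("%d ", out[i]);
      printf("\n");
      return 0;
    }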