Fix all UBSan runtime errors on misaligned loads

Add the loadu_uint16(), loadu_uint32(), and loadu_uint64() functions to
aom_dsp/x86/mem_sse2.h, and use them at call sites that previously
dereferenced misaligned casted pointers. The new functions use memcpy()
to perform unaligned loads without undefined behavior. They are modeled
after the loadu_uint32() function in libvpx/vpx_dsp/x86/mem_sse2.h.

BUG=2642

Change-Id: I4c776cbb13d4f7424621d516cfab89b3de122389
diff --git a/aom_dsp/x86/mem_sse2.h b/aom_dsp/x86/mem_sse2.h
index 6c82167..28fc5fe 100644
--- a/aom_dsp/x86/mem_sse2.h
+++ b/aom_dsp/x86/mem_sse2.h
@@ -13,11 +13,30 @@
 #define AOM_AOM_DSP_X86_MEM_SSE2_H_
 
 #include <emmintrin.h>  // SSE2
+#include <string.h>
 
 #include "config/aom_config.h"
 
 #include "aom/aom_integer.h"
 
+static INLINE uint16_t loadu_uint16(const void *src) {
+  uint16_t v;
+  memcpy(&v, src, sizeof(v));
+  return v;
+}
+
+static INLINE uint32_t loadu_uint32(const void *src) {
+  uint32_t v;
+  memcpy(&v, src, sizeof(v));
+  return v;
+}
+
+static INLINE uint64_t loadu_uint64(const void *src) {
+  uint64_t v;
+  memcpy(&v, src, sizeof(v));
+  return v;
+}
+
 static INLINE __m128i loadh_epi64(const void *const src, const __m128i s) {
   return _mm_castps_si128(
       _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src));
@@ -25,10 +44,10 @@
 
 static INLINE __m128i load_8bit_4x4_to_1_reg_sse2(const void *const src,
                                                   const int byte_stride) {
-  return _mm_setr_epi32(*(const int32_t *)((int8_t *)src + 0 * byte_stride),
-                        *(const int32_t *)((int8_t *)src + 1 * byte_stride),
-                        *(const int32_t *)((int8_t *)src + 2 * byte_stride),
-                        *(const int32_t *)((int8_t *)src + 3 * byte_stride));
+  return _mm_setr_epi32(loadu_uint32((int8_t *)src + 0 * byte_stride),
+                        loadu_uint32((int8_t *)src + 1 * byte_stride),
+                        loadu_uint32((int8_t *)src + 2 * byte_stride),
+                        loadu_uint32((int8_t *)src + 3 * byte_stride));
 }
 
 static INLINE __m128i load_8bit_8x2_to_1_reg_sse2(const void *const src,
diff --git a/aom_dsp/x86/variance_sse2.c b/aom_dsp/x86/variance_sse2.c
index 4e2b5a1..97f71fc 100644
--- a/aom_dsp/x86/variance_sse2.c
+++ b/aom_dsp/x86/variance_sse2.c
@@ -17,6 +17,7 @@
 #include "config/av1_rtcd.h"
 
 #include "aom_dsp/blend.h"
+#include "aom_dsp/x86/mem_sse2.h"
 #include "aom_dsp/x86/synonyms.h"
 
 #include "aom_ports/mem.h"
@@ -42,8 +43,8 @@
 }
 
 static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int stride) {
-  const __m128i p0 = _mm_cvtsi32_si128(*(const uint32_t *)(p + 0 * stride));
-  const __m128i p1 = _mm_cvtsi32_si128(*(const uint32_t *)(p + 1 * stride));
+  const __m128i p0 = _mm_cvtsi32_si128(loadu_uint32(p + 0 * stride));
+  const __m128i p1 = _mm_cvtsi32_si128(loadu_uint32(p + 1 * stride));
   return _mm_unpacklo_epi8(_mm_unpacklo_epi32(p0, p1), _mm_setzero_si128());
 }
 
diff --git a/av1/encoder/x86/pickrst_avx2.c b/av1/encoder/x86/pickrst_avx2.c
index f8703a2..ef70a7b 100644
--- a/av1/encoder/x86/pickrst_avx2.c
+++ b/av1/encoder/x86/pickrst_avx2.c
@@ -10,6 +10,7 @@
  */
 
 #include <immintrin.h>  // AVX2
+#include "aom_dsp/x86/mem_sse2.h"
 #include "aom_dsp/x86/synonyms.h"
 #include "aom_dsp/x86/synonyms_avx2.h"
 #include "aom_dsp/x86/transpose_sse2.h"
@@ -49,7 +50,7 @@
         M_int[k][l] += D1 * X1 + D2 * X2;
 
         const __m256i kl =
-            _mm256_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l))));
+            _mm256_cvtepu8_epi16(_mm_set1_epi16(loadu_uint16(dgd_ijk + l)));
         acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
         acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
         acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
@@ -181,8 +182,7 @@
 
         // Load two u16 values from dgd_ijkl combined as a u32,
         // then broadcast to 8x u32 slots of a 256
-        const __m256i dgd_ijkl =
-            _mm256_set1_epi32(*((uint32_t *)(dgd_ijk + l)));
+        const __m256i dgd_ijkl = _mm256_set1_epi32(loadu_uint32(dgd_ijk + l));
         // dgd_ijkl = [y x y x y x y x] [y x y x y x y x] where each is a u16
 
         acc_stat_highbd_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle,
@@ -285,8 +285,7 @@
 
         // Load two u16 values from dgd_ijkl combined as a u32,
         // then broadcast to 8x u32 slots of a 256
-        const __m256i dgd_ijkl =
-            _mm256_set1_epi32(*((uint32_t *)(dgd_ijk + l)));
+        const __m256i dgd_ijkl = _mm256_set1_epi32(loadu_uint32(dgd_ijk + l));
         // dgd_ijkl = [x y x y x y x y] [x y x y x y x y] where each is a u16
 
         acc_stat_highbd_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle,
@@ -406,7 +405,7 @@
         M_int[k][l] += D1 * X1 + D2 * X2;
 
         const __m256i kl =
-            _mm256_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l))));
+            _mm256_cvtepu8_epi16(_mm_set1_epi16(loadu_uint16(dgd_ijk + l)));
         acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
         acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
         acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
diff --git a/av1/encoder/x86/rdopt_avx2.c b/av1/encoder/x86/rdopt_avx2.c
index f588bad..fefc036 100644
--- a/av1/encoder/x86/rdopt_avx2.c
+++ b/av1/encoder/x86/rdopt_avx2.c
@@ -11,6 +11,7 @@
 
 #include <assert.h>
 #include <immintrin.h>
+#include "aom_dsp/x86/mem_sse2.h"
 #include "aom_dsp/x86/synonyms_avx2.h"
 #include "aom_ports/system_state.h"
 
@@ -31,8 +32,8 @@
   //                      [ m n o p ]
 
   const __m256i pixels = _mm256_set_epi64x(
-      *(uint64_t *)&diff[0 * stride], *(uint64_t *)&diff[1 * stride],
-      *(uint64_t *)&diff[2 * stride], *(uint64_t *)&diff[3 * stride]);
+      loadu_uint64(&diff[0 * stride]), loadu_uint64(&diff[1 * stride]),
+      loadu_uint64(&diff[2 * stride]), loadu_uint64(&diff[3 * stride]));
   // pixels = [d c b a h g f e] [l k j i p o n m] as i16
 
   const __m256i slli = _mm256_slli_epi64(pixels, 16);