Fix alignment issue on 32-bit build

The AVX2 implementation uses aligned loads, which require the data to be
properly aligned. As the pointer size on 32-bit builds is 4 bytes, this
assumption does not hold. Force alignment where necessary.

BUG=aomedia:2263

Change-Id: Ib7e75a2b04623b6ba36e38e2ed816e9003d6ebff
diff --git a/aom_dsp/x86/intrapred_avx2.c b/aom_dsp/x86/intrapred_avx2.c
index 5f3e7bb..ccb64a8 100644
--- a/aom_dsp/x86/intrapred_avx2.c
+++ b/aom_dsp/x86/intrapred_avx2.c
@@ -13,6 +13,7 @@
 
 #include "config/aom_dsp_rtcd.h"
 #include "aom_dsp/x86/lpf_common_sse2.h"
+#include "aom_ports/mem.h"
 
 static INLINE __m256i dc_sum_64(const uint8_t *ref) {
   const __m256i x0 = _mm256_loadu_si256((const __m256i *)ref);
@@ -1818,7 +1819,7 @@
 static void highbd_dr_prediction_z3_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *left,
                                                int upsample_left, int dy) {
-  uint16_t dstT[64 * 64];
+  DECLARE_ALIGNED(16, uint16_t, dstT[64 * 64]);
   highbd_dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy);
   transpose(dstT, 64, dst, stride, 64, 64);
 }
@@ -1870,7 +1871,7 @@
 static void highbd_dr_prediction_z3_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *left,
                                                int upsample_left, int dy) {
-  uint16_t dstT[64 * 32];
+  DECLARE_ALIGNED(16, uint16_t, dstT[64 * 32]);
   highbd_dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy);
   transpose(dstT, 64, dst, stride, 32, 64);
 }
@@ -1878,7 +1879,7 @@
 static void highbd_dr_prediction_z3_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *left,
                                                int upsample_left, int dy) {
-  uint16_t dstT[32 * 64];
+  DECLARE_ALIGNED(16, uint16_t, dstT[32 * 64]);
   highbd_dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy);
   transpose(dstT, 32, dst, stride, 64, 32);
   return;
@@ -1887,7 +1888,7 @@
 static void highbd_dr_prediction_z3_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *left,
                                                int upsample_left, int dy) {
-  uint16_t dstT[64 * 16];
+  DECLARE_ALIGNED(16, uint16_t, dstT[64 * 16]);
   highbd_dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy);
   transpose(dstT, 64, dst, stride, 16, 64);
 }