Speed up wiener search with aligned loads/stores

Encoder profiling at speed 4 showed:
3.15%--acc_stat_avx2 (inlined)
        |
        |--1.44%--yy_loadu_256 (inlined)
        |          _mm256_loadu_si256 (inlined)
        |
         --1.18%--yy_storeu_256 (inlined)
                   _mm256_storeu_si256 (inlined)

Forced 32-byte alignment of the H_int32 buffers so that aligned loads/stores can be used. Now, we have:
1.39%--acc_stat_avx2 (inlined)
        |
         --0.58%--yy_store_256 (inlined)
                   _mm256_store_si256 (inlined)

A speed test showed a ~1.3% speedup at speed 4.

Change-Id: Ib7e4ad9b1ff0c602a2cd8bf61a6ffd5f3e26152d
diff --git a/av1/encoder/x86/pickrst_avx2.c b/av1/encoder/x86/pickrst_avx2.c
index d00fca0..5d4097c 100644
--- a/av1/encoder/x86/pickrst_avx2.c
+++ b/av1/encoder/x86/pickrst_avx2.c
@@ -22,9 +22,9 @@
                                  const __m128i *shuffle, const __m256i *kl) {
   const __m128i s = _mm_shuffle_epi8(xx_loadu_128(src), *shuffle);
   const __m256i d0 = _mm256_madd_epi16(*kl, _mm256_cvtepu8_epi16(s));
-  const __m256i dst0 = yy_loadu_256(dst);
+  const __m256i dst0 = yy_load_256(dst);
   const __m256i r0 = _mm256_add_epi32(dst0, d0);
-  yy_storeu_256(dst, r0);
+  yy_store_256(dst, r0);
 }
 
 static INLINE void acc_stat_win7_one_line_avx2(
@@ -74,7 +74,9 @@
 
   int32_t M_int32[WIENER_WIN][WIENER_WIN] = { { 0 } };
   int64_t M_int64[WIENER_WIN][WIENER_WIN] = { { 0 } };
-  int32_t H_int32[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
+
+  DECLARE_ALIGNED(32, int32_t,
+                  H_int32[WIENER_WIN2][WIENER_WIN * 8]) = { { 0 } };
   int64_t H_int64[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
   int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } };
   int32_t sumX = 0;
@@ -423,7 +425,9 @@
 
   int32_t M_int32[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
   int64_t M_int64[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
-  int32_t H_int32[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
+  DECLARE_ALIGNED(
+      32, int32_t,
+      H_int32[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) = { { 0 } };
   int64_t H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
   int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
   int32_t sumX = 0;