Speed up wiener filter search in HBD case

This patch follows the approach of
https://aomedia-review.googlesource.com/c/aom/+/85363
and enforces aligned loads/stores in the HBD Wiener filter search code.

Change-Id: Ie21ee58a6e1abbe78b3ef25659739112b2bd42c9
diff --git a/av1/encoder/x86/pickrst_avx2.c b/av1/encoder/x86/pickrst_avx2.c
index 5d4097c..5571165 100644
--- a/av1/encoder/x86/pickrst_avx2.c
+++ b/av1/encoder/x86/pickrst_avx2.c
@@ -147,14 +147,14 @@
   // Take the lower-half of d0, extend to u64, add it on to dst (H)
   const __m256i d0l = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(d0, 0));
   // d0l = [a b] [c d] as u64
-  const __m256i dst0 = yy_loadu_256(dst);
-  yy_storeu_256(dst, _mm256_add_epi64(d0l, dst0));
+  const __m256i dst0 = yy_load_256(dst);
+  yy_store_256(dst, _mm256_add_epi64(d0l, dst0));
 
   // Take the upper-half of d0, extend to u64, add it on to dst (H)
   const __m256i d0h = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(d0, 1));
   // d0h = [e f] [g h] as u64
-  const __m256i dst1 = yy_loadu_256(dst + 4);
-  yy_storeu_256(dst + 4, _mm256_add_epi64(d0h, dst1));
+  const __m256i dst1 = yy_load_256(dst + 4);
+  yy_store_256(dst + 4, _mm256_add_epi64(d0h, dst1));
 }
 
 static INLINE void acc_stat_highbd_win7_one_line_avx2(
@@ -218,7 +218,7 @@
       find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride);
 
   int64_t M_int[WIENER_WIN][WIENER_WIN] = { { 0 } };
-  int64_t H_int[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
+  DECLARE_ALIGNED(32, int64_t, H_int[WIENER_WIN2][WIENER_WIN * 8]) = { { 0 } };
   int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } };
   int32_t sumX = 0;
   const uint16_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
@@ -318,7 +318,9 @@
       find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride);
 
   int64_t M_int64[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
-  int64_t H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
+  DECLARE_ALIGNED(
+      32, int64_t,
+      H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) = { { 0 } };
   int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
   int32_t sumX = 0;
   const uint16_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;