Align shuffle masks in av1_warp_affine

Align all shuffle-mask tables in the AVX2 and SSE4_1 variants to their
vector width (32 and 16 bytes respectively) via DECLARE_ALIGNED, and
switch the corresponding unaligned loads (_mm256_loadu_si256 /
_mm_loadu_si128) to aligned loads (_mm256_load_si256 / _mm_load_si128).

A 1% speedup was observed at the test-bench level.
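
As a minimal self-contained sketch of the pattern (the table
example_mask, the helper load_example_mask, and the single-compiler
DECLARE_ALIGNED stand-in below are hypothetical; libaom's real macro
also covers MSVC):

  #include <immintrin.h>
  #include <stdint.h>

  /* Stand-in for libaom's DECLARE_ALIGNED (GCC/Clang form only). */
  #define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))

  /* 32-byte alignment matches the AVX2 register width, so the table
   * address is always valid for an aligned load. */
  DECLARE_ALIGNED(32, static const uint8_t, example_mask[32]) = {
    0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
    0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
  };

  static __m256i load_example_mask(void) {
    /* _mm256_load_si256 assumes (and here is guaranteed) a 32-byte
     * aligned address, replacing the unaligned _mm256_loadu_si256. */
    return _mm256_load_si256((const __m256i *)example_mask);
  }

The gain presumably comes from the tables no longer being able to
straddle a cache-line boundary: an unaligned 32-byte load that crosses
a line costs two line accesses, while the aligned table is always
served from a single line.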

Change-Id: I46ea28b90e57e7fca801d9147b08f2f1da9f2ea8
diff --git a/av1/common/x86/warp_plane_avx2.c b/av1/common/x86/warp_plane_avx2.c
index 8303e5e..2ab11a4 100644
--- a/av1/common/x86/warp_plane_avx2.c
+++ b/av1/common/x86/warp_plane_avx2.c
@@ -13,63 +13,65 @@
 #include "config/av1_rtcd.h"
 #include "av1/common/warped_motion.h"
 
-static const uint8_t shuffle_alpha0_mask01_avx2[32] = {
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask01_avx2[32]) = {
   0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
   0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
 };
 
-static const uint8_t shuffle_alpha0_mask23_avx2[32] = {
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask23_avx2[32]) = {
   2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
   2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3
 };
 
-static const uint8_t shuffle_alpha0_mask45_avx2[32] = {
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask45_avx2[32]) = {
   4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5,
   4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5
 };
 
-static const uint8_t shuffle_alpha0_mask67_avx2[32] = {
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask67_avx2[32]) = {
   6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
   6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7
 };
 
-static const uint8_t shuffle_gamma0_mask0_avx2[32] = { 0, 1, 2, 3, 0, 1, 2, 3,
-                                                       0, 1, 2, 3, 0, 1, 2, 3,
-                                                       0, 1, 2, 3, 0, 1, 2, 3,
-                                                       0, 1, 2, 3, 0, 1, 2, 3 };
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask0_avx2[32]) = {
+  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
+  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+};
 
-static const uint8_t shuffle_gamma0_mask1_avx2[32] = { 4, 5, 6, 7, 4, 5, 6, 7,
-                                                       4, 5, 6, 7, 4, 5, 6, 7,
-                                                       4, 5, 6, 7, 4, 5, 6, 7,
-                                                       4, 5, 6, 7, 4, 5, 6, 7 };
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask1_avx2[32]) = {
+  4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7,
+  4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7
+};
 
-static const uint8_t shuffle_gamma0_mask2_avx2[32] = {
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask2_avx2[32]) = {
   8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11,
   8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11
 };
 
-static const uint8_t shuffle_gamma0_mask3_avx2[32] = {
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask3_avx2[32]) = {
   12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15,
   12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15
 };
 
-static const uint8_t shuffle_src0[32] = { 0, 2, 2, 4, 4, 6, 6, 8, 1, 3, 3,
-                                          5, 5, 7, 7, 9, 0, 2, 2, 4, 4, 6,
-                                          6, 8, 1, 3, 3, 5, 5, 7, 7, 9 };
+DECLARE_ALIGNED(32, static const uint8_t,
+                shuffle_src0[32]) = { 0, 2, 2, 4, 4, 6, 6, 8, 1, 3, 3,
+                                      5, 5, 7, 7, 9, 0, 2, 2, 4, 4, 6,
+                                      6, 8, 1, 3, 3, 5, 5, 7, 7, 9 };
 
-static const uint8_t shuffle_src1[32] = { 4, 6, 6, 8, 8, 10, 10, 12,
-                                          5, 7, 7, 9, 9, 11, 11, 13,
-                                          4, 6, 6, 8, 8, 10, 10, 12,
-                                          5, 7, 7, 9, 9, 11, 11, 13 };
+DECLARE_ALIGNED(32, static const uint8_t,
+                shuffle_src1[32]) = { 4,  6,  6,  8,  8,  10, 10, 12, 5,  7, 7,
+                                      9,  9,  11, 11, 13, 4,  6,  6,  8,  8, 10,
+                                      10, 12, 5,  7,  7,  9,  9,  11, 11, 13 };
 
-static const uint8_t shuffle_src2[32] = { 1, 3, 3, 5, 5,  7, 7, 9, 2, 4, 4,
-                                          6, 6, 8, 8, 10, 1, 3, 3, 5, 5, 7,
-                                          7, 9, 2, 4, 4,  6, 6, 8, 8, 10 };
+DECLARE_ALIGNED(32, static const uint8_t,
+                shuffle_src2[32]) = { 1, 3, 3, 5, 5,  7, 7, 9, 2, 4, 4,
+                                      6, 6, 8, 8, 10, 1, 3, 3, 5, 5, 7,
+                                      7, 9, 2, 4, 4,  6, 6, 8, 8, 10 };
 
-static const uint8_t shuffle_src3[32] = { 5, 7, 7, 9,  9,  11, 11, 13,
-                                          6, 8, 8, 10, 10, 12, 12, 14,
-                                          5, 7, 7, 9,  9,  11, 11, 13,
-                                          6, 8, 8, 10, 10, 12, 12, 14 };
+DECLARE_ALIGNED(32, static const uint8_t,
+                shuffle_src3[32]) = { 5,  7,  7,  9,  9,  11, 11, 13, 6,  8, 8,
+                                      10, 10, 12, 12, 14, 5,  7,  7,  9,  9, 11,
+                                      11, 13, 6,  8,  8,  10, 10, 12, 12, 14 };
 
 static INLINE void filter_src_pixels_avx2(const __m256i src, __m256i *horz_out,
                                           __m256i *coeff,
@@ -215,13 +217,13 @@
       _mm256_inserti128_si256(_mm256_castsi128_si256(tmp_0), tmp_1, 0x1);
 
   coeff[0] = _mm256_shuffle_epi8(
-      res_0, _mm256_loadu_si256((__m256i *)shuffle_alpha0_mask01_avx2));
+      res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask01_avx2));
   coeff[1] = _mm256_shuffle_epi8(
-      res_0, _mm256_loadu_si256((__m256i *)shuffle_alpha0_mask23_avx2));
+      res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask23_avx2));
   coeff[2] = _mm256_shuffle_epi8(
-      res_0, _mm256_loadu_si256((__m256i *)shuffle_alpha0_mask45_avx2));
+      res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask45_avx2));
   coeff[3] = _mm256_shuffle_epi8(
-      res_0, _mm256_loadu_si256((__m256i *)shuffle_alpha0_mask67_avx2));
+      res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask67_avx2));
 }
 
 static INLINE void horizontal_filter_avx2(const __m256i src, __m256i *horz_out,
@@ -564,13 +566,13 @@
       _mm256_inserti128_si256(_mm256_castsi128_si256(filt_0), filt_1, 0x1);
 
   coeffs[0] = _mm256_shuffle_epi8(
-      res_0, _mm256_loadu_si256((__m256i *)shuffle_gamma0_mask0_avx2));
+      res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask0_avx2));
   coeffs[1] = _mm256_shuffle_epi8(
-      res_0, _mm256_loadu_si256((__m256i *)shuffle_gamma0_mask1_avx2));
+      res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask1_avx2));
   coeffs[2] = _mm256_shuffle_epi8(
-      res_0, _mm256_loadu_si256((__m256i *)shuffle_gamma0_mask2_avx2));
+      res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask2_avx2));
   coeffs[3] = _mm256_shuffle_epi8(
-      res_0, _mm256_loadu_si256((__m256i *)shuffle_gamma0_mask3_avx2));
+      res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask3_avx2));
 
   coeffs[4] = coeffs[0];
   coeffs[5] = coeffs[1];
@@ -1031,10 +1033,10 @@
   const int16_t const5 = (1 << (FILTER_BITS - reduce_bits_horiz));
 
   __m256i shuffle_src[4];
-  shuffle_src[0] = _mm256_loadu_si256((__m256i *)shuffle_src0);
-  shuffle_src[1] = _mm256_loadu_si256((__m256i *)shuffle_src1);
-  shuffle_src[2] = _mm256_loadu_si256((__m256i *)shuffle_src2);
-  shuffle_src[3] = _mm256_loadu_si256((__m256i *)shuffle_src3);
+  shuffle_src[0] = _mm256_load_si256((__m256i *)shuffle_src0);
+  shuffle_src[1] = _mm256_load_si256((__m256i *)shuffle_src1);
+  shuffle_src[2] = _mm256_load_si256((__m256i *)shuffle_src2);
+  shuffle_src[3] = _mm256_load_si256((__m256i *)shuffle_src3);
 
   for (i = 0; i < p_height; i += 8) {
     for (j = 0; j < p_width; j += 8) {
diff --git a/av1/common/x86/warp_plane_sse4.c b/av1/common/x86/warp_plane_sse4.c
index 9530525..93cf25ca 100644
--- a/av1/common/x86/warp_plane_sse4.c
+++ b/av1/common/x86/warp_plane_sse4.c
@@ -198,40 +198,53 @@
 // in an SSE register into two sequences:
 // 0, 2, 2, 4, ..., 12, 12, 14, <don't care>
 // 1, 3, 3, 5, ..., 13, 13, 15, <don't care>
-static const uint8_t even_mask[16] = { 0, 2,  2,  4,  4,  6,  6,  8,
-                                       8, 10, 10, 12, 12, 14, 14, 0 };
-static const uint8_t odd_mask[16] = { 1, 3,  3,  5,  5,  7,  7,  9,
-                                      9, 11, 11, 13, 13, 15, 15, 0 };
+DECLARE_ALIGNED(16, static const uint8_t,
+                even_mask[16]) = { 0, 2,  2,  4,  4,  6,  6,  8,
+                                   8, 10, 10, 12, 12, 14, 14, 0 };
 
-static const uint8_t shuffle_alpha0_mask01[16] = { 0, 1, 0, 1, 0, 1, 0, 1,
-                                                   0, 1, 0, 1, 0, 1, 0, 1 };
+DECLARE_ALIGNED(16, static const uint8_t,
+                odd_mask[16]) = { 1, 3,  3,  5,  5,  7,  7,  9,
+                                  9, 11, 11, 13, 13, 15, 15, 0 };
 
-static const uint8_t shuffle_alpha0_mask23[16] = { 2, 3, 2, 3, 2, 3, 2, 3,
-                                                   2, 3, 2, 3, 2, 3, 2, 3 };
+DECLARE_ALIGNED(16, static const uint8_t,
+                shuffle_alpha0_mask01[16]) = { 0, 1, 0, 1, 0, 1, 0, 1,
+                                               0, 1, 0, 1, 0, 1, 0, 1 };
 
-static const uint8_t shuffle_alpha0_mask45[16] = { 4, 5, 4, 5, 4, 5, 4, 5,
-                                                   4, 5, 4, 5, 4, 5, 4, 5 };
+DECLARE_ALIGNED(16, static const uint8_t,
+                shuffle_alpha0_mask23[16]) = { 2, 3, 2, 3, 2, 3, 2, 3,
+                                               2, 3, 2, 3, 2, 3, 2, 3 };
 
-static const uint8_t shuffle_alpha0_mask67[16] = { 6, 7, 6, 7, 6, 7, 6, 7,
-                                                   6, 7, 6, 7, 6, 7, 6, 7 };
+DECLARE_ALIGNED(16, static const uint8_t,
+                shuffle_alpha0_mask45[16]) = { 4, 5, 4, 5, 4, 5, 4, 5,
+                                               4, 5, 4, 5, 4, 5, 4, 5 };
 
-static const uint8_t shuffle_gamma0_mask0[16] = { 0, 1, 2, 3, 0, 1, 2, 3,
-                                                  0, 1, 2, 3, 0, 1, 2, 3 };
-static const uint8_t shuffle_gamma0_mask1[16] = { 4, 5, 6, 7, 4, 5, 6, 7,
-                                                  4, 5, 6, 7, 4, 5, 6, 7 };
-static const uint8_t shuffle_gamma0_mask2[16] = { 8, 9, 10, 11, 8, 9, 10, 11,
-                                                  8, 9, 10, 11, 8, 9, 10, 11 };
-static const uint8_t shuffle_gamma0_mask3[16] = { 12, 13, 14, 15, 12, 13,
-                                                  14, 15, 12, 13, 14, 15,
-                                                  12, 13, 14, 15 };
+DECLARE_ALIGNED(16, static const uint8_t,
+                shuffle_alpha0_mask67[16]) = { 6, 7, 6, 7, 6, 7, 6, 7,
+                                               6, 7, 6, 7, 6, 7, 6, 7 };
+
+DECLARE_ALIGNED(16, static const uint8_t,
+                shuffle_gamma0_mask0[16]) = { 0, 1, 2, 3, 0, 1, 2, 3,
+                                              0, 1, 2, 3, 0, 1, 2, 3 };
+
+DECLARE_ALIGNED(16, static const uint8_t,
+                shuffle_gamma0_mask1[16]) = { 4, 5, 6, 7, 4, 5, 6, 7,
+                                              4, 5, 6, 7, 4, 5, 6, 7 };
+
+DECLARE_ALIGNED(16, static const uint8_t,
+                shuffle_gamma0_mask2[16]) = { 8, 9, 10, 11, 8, 9, 10, 11,
+                                              8, 9, 10, 11, 8, 9, 10, 11 };
+
+DECLARE_ALIGNED(16, static const uint8_t,
+                shuffle_gamma0_mask3[16]) = { 12, 13, 14, 15, 12, 13, 14, 15,
+                                              12, 13, 14, 15, 12, 13, 14, 15 };
 
 static INLINE void filter_src_pixels(__m128i src, __m128i *tmp, __m128i *coeff,
                                      const int offset_bits_horiz,
                                      const int reduce_bits_horiz, int k) {
   const __m128i src_even =
-      _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)even_mask));
+      _mm_shuffle_epi8(src, _mm_load_si128((__m128i *)even_mask));
   const __m128i src_odd =
-      _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)odd_mask));
+      _mm_shuffle_epi8(src, _mm_load_si128((__m128i *)odd_mask));
   // The pixel order we need for 'src' is:
   // 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9
   const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd);
@@ -322,17 +335,17 @@
       _mm_loadl_epi64((__m128i *)&filter_8bit[sx >> WARPEDDIFF_PREC_BITS]);
 
   // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7
-  coeff[0] = _mm_shuffle_epi8(
-      tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask01));
+  coeff[0] =
+      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask01));
   // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7
-  coeff[1] = _mm_shuffle_epi8(
-      tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask23));
+  coeff[1] =
+      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask23));
   // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7
-  coeff[2] = _mm_shuffle_epi8(
-      tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask45));
+  coeff[2] =
+      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask45));
   // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7
-  coeff[3] = _mm_shuffle_epi8(
-      tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask67));
+  coeff[3] =
+      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask67));
 }
 
 static INLINE void horizontal_filter(__m128i src, __m128i *tmp, int sx,
@@ -504,13 +517,13 @@
 
   // even coeffs
   coeffs[0] =
-      _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask0));
+      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask0));
   coeffs[1] =
-      _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask1));
+      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask1));
   coeffs[2] =
-      _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask2));
+      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask2));
   coeffs[3] =
-      _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask3));
+      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask3));
 
   // odd coeffs
   coeffs[4] = coeffs[0];