Make shuffle masks aligned in av1_warp_affine

All the shuffle masks in both the AVX2 and SSE4_1 variants are made
aligned, and the respective loads are changed to aligned loads.
At test bench level, a 1% improvement was observed.

Change-Id: I46ea28b90e57e7fca801d9147b08f2f1da9f2ea8
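Note: the change follows one pattern throughout: give each constant shuffle
table an explicit 16- or 32-byte alignment with DECLARE_ALIGNED, then switch
the corresponding _mm_loadu_si128 / _mm256_loadu_si256 calls to the aligned
_mm_load_si128 / _mm256_load_si256 forms, which are valid once the operand is
known to be aligned. A minimal sketch of that pattern follows; the
DECLARE_ALIGNED definition shown is a simplified stand-in (the real macro
comes from aom_ports/mem.h and is compiler-dependent), and example_mask /
broadcast_coeff01 are illustrative names, not part of the patch.

#include <immintrin.h>
#include <stdint.h>

/* Simplified stand-in for libaom's DECLARE_ALIGNED; the real macro selects
 * __attribute__((aligned(n))) or __declspec(align(n)) per compiler. */
#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))

/* 32-byte aligned constant table, analogous to the masks in
 * warp_plane_avx2.c. */
DECLARE_ALIGNED(32, static const uint8_t, example_mask[32]) = {
  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
};

__m256i broadcast_coeff01(__m256i res) {
  /* The aligned load is valid because example_mask is 32-byte aligned. */
  const __m256i mask = _mm256_load_si256((const __m256i *)example_mask);
  return _mm256_shuffle_epi8(res, mask);
}

With gcc or clang this builds with -mavx2; the same pattern with 16-byte
alignment and _mm_load_si128 applies to the SSE4_1 file.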
diff --git a/av1/common/x86/warp_plane_avx2.c b/av1/common/x86/warp_plane_avx2.c
index 8303e5e..2ab11a4 100644
--- a/av1/common/x86/warp_plane_avx2.c
+++ b/av1/common/x86/warp_plane_avx2.c
@@ -13,63 +13,65 @@
 #include "config/av1_rtcd.h"
 #include "av1/common/warped_motion.h"
 
-static const uint8_t shuffle_alpha0_mask01_avx2[32] = {
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask01_avx2[32]) = {
   0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
   0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
 };
 
-static const uint8_t shuffle_alpha0_mask23_avx2[32] = {
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask23_avx2[32]) = {
   2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
   2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3
 };
 
-static const uint8_t shuffle_alpha0_mask45_avx2[32] = {
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask45_avx2[32]) = {
   4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5,
   4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5
 };
 
-static const uint8_t shuffle_alpha0_mask67_avx2[32] = {
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask67_avx2[32]) = {
   6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
   6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7
 };
 
-static const uint8_t shuffle_gamma0_mask0_avx2[32] = { 0, 1, 2, 3, 0, 1, 2, 3,
-                                                       0, 1, 2, 3, 0, 1, 2, 3,
-                                                       0, 1, 2, 3, 0, 1, 2, 3,
-                                                       0, 1, 2, 3, 0, 1, 2, 3 };
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask0_avx2[32]) = {
+  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
+  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+};
 
-static const uint8_t shuffle_gamma0_mask1_avx2[32] = { 4, 5, 6, 7, 4, 5, 6, 7,
-                                                       4, 5, 6, 7, 4, 5, 6, 7,
-                                                       4, 5, 6, 7, 4, 5, 6, 7,
-                                                       4, 5, 6, 7, 4, 5, 6, 7 };
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask1_avx2[32]) = {
+  4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7,
+  4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7
+};
 
-static const uint8_t shuffle_gamma0_mask2_avx2[32] = {
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask2_avx2[32]) = {
   8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11,
   8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11
 };
 
-static const uint8_t shuffle_gamma0_mask3_avx2[32] = {
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask3_avx2[32]) = {
   12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15,
   12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15
 };
 
-static const uint8_t shuffle_src0[32] = { 0, 2, 2, 4, 4, 6, 6, 8, 1, 3, 3,
-                                          5, 5, 7, 7, 9, 0, 2, 2, 4, 4, 6,
-                                          6, 8, 1, 3, 3, 5, 5, 7, 7, 9 };
+DECLARE_ALIGNED(32, static const uint8_t,
+                shuffle_src0[32]) = { 0, 2, 2, 4, 4, 6, 6, 8, 1, 3, 3,
+                                      5, 5, 7, 7, 9, 0, 2, 2, 4, 4, 6,
+                                      6, 8, 1, 3, 3, 5, 5, 7, 7, 9 };
 
-static const uint8_t shuffle_src1[32] = { 4, 6, 6, 8, 8, 10, 10, 12,
-                                          5, 7, 7, 9, 9, 11, 11, 13,
-                                          4, 6, 6, 8, 8, 10, 10, 12,
-                                          5, 7, 7, 9, 9, 11, 11, 13 };
+DECLARE_ALIGNED(32, static const uint8_t,
+                shuffle_src1[32]) = { 4, 6, 6, 8, 8, 10, 10, 12, 5, 7, 7,
+                                      9, 9, 11, 11, 13, 4, 6, 6, 8, 8, 10,
+                                      10, 12, 5, 7, 7, 9, 9, 11, 11, 13 };
 
-static const uint8_t shuffle_src2[32] = { 1, 3, 3, 5, 5, 7, 7, 9, 2, 4, 4,
-                                          6, 6, 8, 8, 10, 1, 3, 3, 5, 5, 7,
-                                          7, 9, 2, 4, 4, 6, 6, 8, 8, 10 };
+DECLARE_ALIGNED(32, static const uint8_t,
+                shuffle_src2[32]) = { 1, 3, 3, 5, 5, 7, 7, 9, 2, 4, 4,
+                                      6, 6, 8, 8, 10, 1, 3, 3, 5, 5, 7,
+                                      7, 9, 2, 4, 4, 6, 6, 8, 8, 10 };
 
-static const uint8_t shuffle_src3[32] = { 5, 7, 7, 9, 9, 11, 11, 13,
-                                          6, 8, 8, 10, 10, 12, 12, 14,
-                                          5, 7, 7, 9, 9, 11, 11, 13,
-                                          6, 8, 8, 10, 10, 12, 12, 14 };
+DECLARE_ALIGNED(32, static const uint8_t,
+                shuffle_src3[32]) = { 5, 7, 7, 9, 9, 11, 11, 13, 6, 8, 8,
+                                      10, 10, 12, 12, 14, 5, 7, 7, 9, 9, 11,
+                                      11, 13, 6, 8, 8, 10, 10, 12, 12, 14 };
 
 static INLINE void filter_src_pixels_avx2(const __m256i src, __m256i *horz_out,
                                           __m256i *coeff,
@@ -215,13 +217,13 @@
       _mm256_inserti128_si256(_mm256_castsi128_si256(tmp_0), tmp_1, 0x1);
 
   coeff[0] = _mm256_shuffle_epi8(
-      res_0, _mm256_loadu_si256((__m256i *)shuffle_alpha0_mask01_avx2));
+      res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask01_avx2));
   coeff[1] = _mm256_shuffle_epi8(
-      res_0, _mm256_loadu_si256((__m256i *)shuffle_alpha0_mask23_avx2));
+      res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask23_avx2));
   coeff[2] = _mm256_shuffle_epi8(
-      res_0, _mm256_loadu_si256((__m256i *)shuffle_alpha0_mask45_avx2));
+      res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask45_avx2));
   coeff[3] = _mm256_shuffle_epi8(
-      res_0, _mm256_loadu_si256((__m256i *)shuffle_alpha0_mask67_avx2));
+      res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask67_avx2));
 }
 
 static INLINE void horizontal_filter_avx2(const __m256i src, __m256i *horz_out,
@@ -564,13 +566,13 @@
       _mm256_inserti128_si256(_mm256_castsi128_si256(filt_0), filt_1, 0x1);
 
   coeffs[0] = _mm256_shuffle_epi8(
-      res_0, _mm256_loadu_si256((__m256i *)shuffle_gamma0_mask0_avx2));
+      res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask0_avx2));
   coeffs[1] = _mm256_shuffle_epi8(
-      res_0, _mm256_loadu_si256((__m256i *)shuffle_gamma0_mask1_avx2));
+      res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask1_avx2));
   coeffs[2] = _mm256_shuffle_epi8(
-      res_0, _mm256_loadu_si256((__m256i *)shuffle_gamma0_mask2_avx2));
+      res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask2_avx2));
   coeffs[3] = _mm256_shuffle_epi8(
-      res_0, _mm256_loadu_si256((__m256i *)shuffle_gamma0_mask3_avx2));
+      res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask3_avx2));
 
   coeffs[4] = coeffs[0];
   coeffs[5] = coeffs[1];
@@ -1031,10 +1033,10 @@
   const int16_t const5 = (1 << (FILTER_BITS - reduce_bits_horiz));
 
   __m256i shuffle_src[4];
-  shuffle_src[0] = _mm256_loadu_si256((__m256i *)shuffle_src0);
-  shuffle_src[1] = _mm256_loadu_si256((__m256i *)shuffle_src1);
-  shuffle_src[2] = _mm256_loadu_si256((__m256i *)shuffle_src2);
-  shuffle_src[3] = _mm256_loadu_si256((__m256i *)shuffle_src3);
+  shuffle_src[0] = _mm256_load_si256((__m256i *)shuffle_src0);
+  shuffle_src[1] = _mm256_load_si256((__m256i *)shuffle_src1);
+  shuffle_src[2] = _mm256_load_si256((__m256i *)shuffle_src2);
+  shuffle_src[3] = _mm256_load_si256((__m256i *)shuffle_src3);
 
   for (i = 0; i < p_height; i += 8) {
     for (j = 0; j < p_width; j += 8) {
diff --git a/av1/common/x86/warp_plane_sse4.c b/av1/common/x86/warp_plane_sse4.c
index 9530525..93cf25ca 100644
--- a/av1/common/x86/warp_plane_sse4.c
+++ b/av1/common/x86/warp_plane_sse4.c
@@ -198,40 +198,53 @@
 // in an SSE register into two sequences:
 // 0, 2, 2, 4, ..., 12, 12, 14, <don't care>
 // 1, 3, 3, 5, ..., 13, 13, 15, <don't care>
-static const uint8_t even_mask[16] = { 0, 2, 2, 4, 4, 6, 6, 8,
-                                       8, 10, 10, 12, 12, 14, 14, 0 };
-static const uint8_t odd_mask[16] = { 1, 3, 3, 5, 5, 7, 7, 9,
-                                      9, 11, 11, 13, 13, 15, 15, 0 };
+DECLARE_ALIGNED(16, static const uint8_t,
+                even_mask[16]) = { 0, 2, 2, 4, 4, 6, 6, 8,
+                                   8, 10, 10, 12, 12, 14, 14, 0 };
 
-static const uint8_t shuffle_alpha0_mask01[16] = { 0, 1, 0, 1, 0, 1, 0, 1,
-                                                   0, 1, 0, 1, 0, 1, 0, 1 };
+DECLARE_ALIGNED(16, static const uint8_t,
+                odd_mask[16]) = { 1, 3, 3, 5, 5, 7, 7, 9,
+                                  9, 11, 11, 13, 13, 15, 15, 0 };
 
-static const uint8_t shuffle_alpha0_mask23[16] = { 2, 3, 2, 3, 2, 3, 2, 3,
-                                                   2, 3, 2, 3, 2, 3, 2, 3 };
+DECLARE_ALIGNED(16, static const uint8_t,
+                shuffle_alpha0_mask01[16]) = { 0, 1, 0, 1, 0, 1, 0, 1,
+                                               0, 1, 0, 1, 0, 1, 0, 1 };
 
-static const uint8_t shuffle_alpha0_mask45[16] = { 4, 5, 4, 5, 4, 5, 4, 5,
-                                                   4, 5, 4, 5, 4, 5, 4, 5 };
+DECLARE_ALIGNED(16, static const uint8_t,
+                shuffle_alpha0_mask23[16]) = { 2, 3, 2, 3, 2, 3, 2, 3,
+                                               2, 3, 2, 3, 2, 3, 2, 3 };
 
-static const uint8_t shuffle_alpha0_mask67[16] = { 6, 7, 6, 7, 6, 7, 6, 7,
-                                                   6, 7, 6, 7, 6, 7, 6, 7 };
+DECLARE_ALIGNED(16, static const uint8_t,
+                shuffle_alpha0_mask45[16]) = { 4, 5, 4, 5, 4, 5, 4, 5,
+                                               4, 5, 4, 5, 4, 5, 4, 5 };
 
-static const uint8_t shuffle_gamma0_mask0[16] = { 0, 1, 2, 3, 0, 1, 2, 3,
-                                                  0, 1, 2, 3, 0, 1, 2, 3 };
-static const uint8_t shuffle_gamma0_mask1[16] = { 4, 5, 6, 7, 4, 5, 6, 7,
-                                                  4, 5, 6, 7, 4, 5, 6, 7 };
-static const uint8_t shuffle_gamma0_mask2[16] = { 8, 9, 10, 11, 8, 9, 10, 11,
-                                                  8, 9, 10, 11, 8, 9, 10, 11 };
-static const uint8_t shuffle_gamma0_mask3[16] = { 12, 13, 14, 15, 12, 13,
-                                                  14, 15, 12, 13, 14, 15,
-                                                  12, 13, 14, 15 };
+DECLARE_ALIGNED(16, static const uint8_t,
+                shuffle_alpha0_mask67[16]) = { 6, 7, 6, 7, 6, 7, 6, 7,
+                                               6, 7, 6, 7, 6, 7, 6, 7 };
+
+DECLARE_ALIGNED(16, static const uint8_t,
+                shuffle_gamma0_mask0[16]) = { 0, 1, 2, 3, 0, 1, 2, 3,
+                                              0, 1, 2, 3, 0, 1, 2, 3 };
+
+DECLARE_ALIGNED(16, static const uint8_t,
+                shuffle_gamma0_mask1[16]) = { 4, 5, 6, 7, 4, 5, 6, 7,
+                                              4, 5, 6, 7, 4, 5, 6, 7 };
+
+DECLARE_ALIGNED(16, static const uint8_t,
+                shuffle_gamma0_mask2[16]) = { 8, 9, 10, 11, 8, 9, 10, 11,
+                                              8, 9, 10, 11, 8, 9, 10, 11 };
+
+DECLARE_ALIGNED(16, static const uint8_t,
+                shuffle_gamma0_mask3[16]) = { 12, 13, 14, 15, 12, 13, 14, 15,
+                                              12, 13, 14, 15, 12, 13, 14, 15 };
 
 static INLINE void filter_src_pixels(__m128i src, __m128i *tmp, __m128i *coeff,
                                      const int offset_bits_horiz,
                                      const int reduce_bits_horiz, int k) {
   const __m128i src_even =
-      _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)even_mask));
+      _mm_shuffle_epi8(src, _mm_load_si128((__m128i *)even_mask));
   const __m128i src_odd =
-      _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)odd_mask));
+      _mm_shuffle_epi8(src, _mm_load_si128((__m128i *)odd_mask));
   // The pixel order we need for 'src' is:
   // 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9
   const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd);
@@ -322,17 +335,17 @@
       _mm_loadl_epi64((__m128i *)&filter_8bit[sx >> WARPEDDIFF_PREC_BITS]);
 
   // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7
-  coeff[0] = _mm_shuffle_epi8(
-      tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask01));
+  coeff[0] =
+      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask01));
   // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7
-  coeff[1] = _mm_shuffle_epi8(
-      tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask23));
+  coeff[1] =
+      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask23));
   // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7
-  coeff[2] = _mm_shuffle_epi8(
-      tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask45));
+  coeff[2] =
+      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask45));
   // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7
-  coeff[3] = _mm_shuffle_epi8(
-      tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask67));
+  coeff[3] =
+      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask67));
 }
 
 static INLINE void horizontal_filter(__m128i src, __m128i *tmp, int sx,
@@ -504,13 +517,13 @@
 
   // even coeffs
   coeffs[0] =
-      _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask0));
+      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask0));
   coeffs[1] =
-      _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask1));
+      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask1));
   coeffs[2] =
-      _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask2));
+      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask2));
   coeffs[3] =
-      _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask3));
+      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask3));
 
   // odd coeffs
   coeffs[4] = coeffs[0];
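For context on why the explicit alignment is needed: the aligned load
intrinsics (_mm_load_si128, _mm256_load_si256) assume a 16- or 32-byte
aligned address, and a misaligned aligned-load can fault on x86, so each
table must carry DECLARE_ALIGNED before its load can be switched; knowing the
alignment can also let the compiler fold the load into the consuming shuffle.
The snippet below is a small, hypothetical sanity check, not part of the
patch; is_aligned and the table names are made up for illustration.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical helper: nonzero if p lies on an n-byte boundary. */
static int is_aligned(const void *p, size_t n) {
  return ((uintptr_t)p % n) == 0;
}

int main(void) {
  /* Stand-ins for the shuffle tables; only the second carries the explicit
   * alignment that the patch adds via DECLARE_ALIGNED. */
  static const uint8_t plain_table[16] = { 0 };
  _Alignas(16) static const uint8_t aligned_table[16] = { 0 };

  /* plain_table may or may not land on a 16-byte boundary; aligned_table is
   * guaranteed to, so it is safe to feed to _mm_load_si128. */
  printf("plain table aligned:   %d\n", is_aligned(plain_table, 16));
  printf("aligned table aligned: %d\n", is_aligned(aligned_table, 16));
  assert(is_aligned(aligned_table, 16));
  return 0;
}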