{jnt,}convolve_sse2: move load closer to first use
Moving the src6 load next to its first use generates mildly better assembly with gcc-13 and clang-16.
Change-Id: I1e8fb2a6407e292c15e44dc7dd2676bad9a69857
diff --git a/av1/common/x86/convolve_sse2.c b/av1/common/x86/convolve_sse2.c
index 4787d3f..9272e91 100644
--- a/av1/common/x86/convolve_sse2.c
+++ b/av1/common/x86/convolve_sse2.c
@@ -201,7 +201,6 @@
if (w <= 4) {
__m128i s[8], src6, res, res_round, res16;
int res_int;
- src6 = xx_loadl_32(src_ptr + 6 * src_stride);
s[0] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 0 * src_stride),
xx_loadl_32(src_ptr + 1 * src_stride));
s[1] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 1 * src_stride),
@@ -212,6 +211,7 @@
xx_loadl_32(src_ptr + 4 * src_stride));
s[4] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 4 * src_stride),
xx_loadl_32(src_ptr + 5 * src_stride));
+ src6 = xx_loadl_32(src_ptr + 6 * src_stride);
s[5] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 5 * src_stride), src6);
do {
diff --git a/av1/common/x86/jnt_convolve_sse2.c b/av1/common/x86/jnt_convolve_sse2.c
index 3386150..6b12278 100644
--- a/av1/common/x86/jnt_convolve_sse2.c
+++ b/av1/common/x86/jnt_convolve_sse2.c
@@ -179,7 +179,6 @@
if (w == 4) {
__m128i s[8], src6, res, res_shift;
- src6 = xx_loadl_32(src_ptr + 6 * src_stride);
s[0] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 0 * src_stride),
xx_loadl_32(src_ptr + 1 * src_stride));
s[1] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 1 * src_stride),
@@ -190,6 +189,7 @@
xx_loadl_32(src_ptr + 4 * src_stride));
s[4] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 4 * src_stride),
xx_loadl_32(src_ptr + 5 * src_stride));
+ src6 = xx_loadl_32(src_ptr + 6 * src_stride);
s[5] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 5 * src_stride), src6);
do {