{jnt,}convolve_sse2: move load closer to first use

Moving the src6 load next to its first use generates mildly better assembly
with gcc-13 and clang-16.

Change-Id: I1e8fb2a6407e292c15e44dc7dd2676bad9a69857
diff --git a/av1/common/x86/convolve_sse2.c b/av1/common/x86/convolve_sse2.c
index 4787d3f..9272e91 100644
--- a/av1/common/x86/convolve_sse2.c
+++ b/av1/common/x86/convolve_sse2.c
@@ -201,7 +201,6 @@
     if (w <= 4) {
       __m128i s[8], src6, res, res_round, res16;
       int res_int;
-      src6 = xx_loadl_32(src_ptr + 6 * src_stride);
       s[0] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 0 * src_stride),
                                xx_loadl_32(src_ptr + 1 * src_stride));
       s[1] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 1 * src_stride),
@@ -212,6 +211,7 @@
                                xx_loadl_32(src_ptr + 4 * src_stride));
       s[4] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 4 * src_stride),
                                xx_loadl_32(src_ptr + 5 * src_stride));
+      src6 = xx_loadl_32(src_ptr + 6 * src_stride);
       s[5] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 5 * src_stride), src6);
 
       do {
diff --git a/av1/common/x86/jnt_convolve_sse2.c b/av1/common/x86/jnt_convolve_sse2.c
index 3386150..6b12278 100644
--- a/av1/common/x86/jnt_convolve_sse2.c
+++ b/av1/common/x86/jnt_convolve_sse2.c
@@ -179,7 +179,6 @@
 
   if (w == 4) {
     __m128i s[8], src6, res, res_shift;
-    src6 = xx_loadl_32(src_ptr + 6 * src_stride);
     s[0] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 0 * src_stride),
                              xx_loadl_32(src_ptr + 1 * src_stride));
     s[1] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 1 * src_stride),
@@ -190,6 +189,7 @@
                              xx_loadl_32(src_ptr + 4 * src_stride));
     s[4] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 4 * src_stride),
                              xx_loadl_32(src_ptr + 5 * src_stride));
+    src6 = xx_loadl_32(src_ptr + 6 * src_stride);
     s[5] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 5 * src_stride), src6);
 
     do {