disflow_avx2.c: Make compatible with gcc <= 9

Per the linked bug report, the _mm256_loadu2_m128i() intrinsic
was not added until gcc 10. Therefore, for compatibility with gcc 9
and earlier, we must instead use our own implementation of this
intrinsic, which is named yy_loadu2_128().

Bug: aomedia:3550
Change-Id: I8a4220acaaddeb6dcdd8fd918cd386c432a56bfc
diff --git a/aom_dsp/flow_estimation/x86/disflow_avx2.c b/aom_dsp/flow_estimation/x86/disflow_avx2.c
index e210042..ad5a1bd 100644
--- a/aom_dsp/flow_estimation/x86/disflow_avx2.c
+++ b/aom_dsp/flow_estimation/x86/disflow_avx2.c
@@ -145,7 +145,7 @@
     // for a total of 11 pixels. Here we load 16 pixels, but only use
     // the first 11.
     __m256i row =
-        _mm256_loadu2_m128i((__m128i *)(ref_row + stride), (__m128i *)ref_row);
+        yy_loadu2_128((__m128i *)(ref_row + stride), (__m128i *)ref_row);
 
     // Expand pixels to int16s
     // We must use unpacks here, as we have one row in each 128-bit lane
@@ -273,8 +273,8 @@
 
   // Loop setup: Load the first two rows (of 10 input rows) and apply
   // the horizontal parts of the two filters
-  __m256i row_m1_0 = _mm256_loadu2_m128i((__m128i *)(src - 1),
-                                         (__m128i *)(src - src_stride - 1));
+  __m256i row_m1_0 =
+      yy_loadu2_128((__m128i *)(src - 1), (__m128i *)(src - src_stride - 1));
   __m256i row_m1_0_a = _mm256_unpacklo_epi8(row_m1_0, zero);
   __m256i row_m1_0_b =
       _mm256_unpacklo_epi8(_mm256_srli_si256(row_m1_0, 1), zero);
@@ -293,8 +293,8 @@
   for (int i = 0; i < DISFLOW_PATCH_SIZE; i += 2) {
     // Load rows (i+1, i+2) and apply both horizontal filters
     const __m256i row_p1_p2 =
-        _mm256_loadu2_m128i((__m128i *)(src + (i + 2) * src_stride - 1),
-                            (__m128i *)(src + (i + 1) * src_stride - 1));
+        yy_loadu2_128((__m128i *)(src + (i + 2) * src_stride - 1),
+                      (__m128i *)(src + (i + 1) * src_stride - 1));
     const __m256i row_p1_p2_a = _mm256_unpacklo_epi8(row_p1_p2, zero);
     const __m256i row_p1_p2_b =
         _mm256_unpacklo_epi8(_mm256_srli_si256(row_p1_p2, 1), zero);