disflow_avx2.c: Make compatible with gcc <= 9

Per the linked bug report, the _mm256_loadu2_m128i() intrinsic was only
added in gcc 10. Therefore, for compatibility with gcc 9 and earlier, we
must instead use our own implementation of this intrinsic, which we call
yy_loadu2_128().

Bug: aomedia:3550
Change-Id: I8a4220acaaddeb6dcdd8fd918cd386c432a56bfc
(cherry picked from commit 874b7ca3adae0c408b06ce3982e8c4ad432bdd98)
diff --git a/aom_dsp/flow_estimation/x86/disflow_avx2.c b/aom_dsp/flow_estimation/x86/disflow_avx2.c index e210042..ad5a1bd 100644 --- a/aom_dsp/flow_estimation/x86/disflow_avx2.c +++ b/aom_dsp/flow_estimation/x86/disflow_avx2.c
@@ -145,7 +145,7 @@ // for a total of 11 pixels. Here we load 16 pixels, but only use // the first 11. __m256i row = - _mm256_loadu2_m128i((__m128i *)(ref_row + stride), (__m128i *)ref_row); + yy_loadu2_128((__m128i *)(ref_row + stride), (__m128i *)ref_row); // Expand pixels to int16s // We must use unpacks here, as we have one row in each 128-bit lane @@ -273,8 +273,8 @@ // Loop setup: Load the first two rows (of 10 input rows) and apply // the horizontal parts of the two filters - __m256i row_m1_0 = _mm256_loadu2_m128i((__m128i *)(src - 1), - (__m128i *)(src - src_stride - 1)); + __m256i row_m1_0 = + yy_loadu2_128((__m128i *)(src - 1), (__m128i *)(src - src_stride - 1)); __m256i row_m1_0_a = _mm256_unpacklo_epi8(row_m1_0, zero); __m256i row_m1_0_b = _mm256_unpacklo_epi8(_mm256_srli_si256(row_m1_0, 1), zero); @@ -293,8 +293,8 @@ for (int i = 0; i < DISFLOW_PATCH_SIZE; i += 2) { // Load rows (i+1, i+2) and apply both horizontal filters const __m256i row_p1_p2 = - _mm256_loadu2_m128i((__m128i *)(src + (i + 2) * src_stride - 1), - (__m128i *)(src + (i + 1) * src_stride - 1)); + yy_loadu2_128((__m128i *)(src + (i + 2) * src_stride - 1), + (__m128i *)(src + (i + 1) * src_stride - 1)); const __m256i row_p1_p2_a = _mm256_unpacklo_epi8(row_p1_p2, zero); const __m256i row_p1_p2_b = _mm256_unpacklo_epi8(_mm256_srli_si256(row_p1_p2, 1), zero);