Minor improvement to warp_affine_avx2
This patch includes two minor improvements:
1.) By inserting the 128-bit data into the high half of a ymm
register, we reduce register pressure; this avoids
register spill/fill with gcc and also saves some unpacking
instructions.
2.) The '(unsigned)' cast tells the compiler that the array
offset is known to be non-negative, so one 'movslq'
instruction can be avoided in the array-offset calculation.
Profiling the command below:
./aomenc 1080p_park_joy.y4m --cpu-used=3 --cq-level=32 --limit=50 --passes=1 -o parkjoy.webm
Before the patch:
5.81% 5.80% aomenc aomenc [.] av1_warp_affine_avx2
|
--5.79%--av1_warp_plane
av1_warp_affine_avx2
With the patch:
5.53% 5.52% aomenc aomenc [.] av1_warp_affine_avx2
|
--5.51%--av1_warp_plane
av1_warp_affine_avx2
Change-Id: Ic245e4da3f0f39f95f69c16fffce01fdd9fd9117
diff --git a/av1/common/x86/warp_plane_avx2.c b/av1/common/x86/warp_plane_avx2.c
index b5fbb4c..53a928d 100644
--- a/av1/common/x86/warp_plane_avx2.c
+++ b/av1/common/x86/warp_plane_avx2.c
@@ -100,65 +100,85 @@
int sx,
__m256i *coeff) {
__m128i tmp_0 = _mm_loadl_epi64(
- (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ (__m128i *)&av1_filter_8bit[((unsigned)(sx + 0 * alpha)) >>
+ WARPEDDIFF_PREC_BITS]);
__m128i tmp_1 = _mm_loadl_epi64(
- (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ (__m128i *)&av1_filter_8bit[((unsigned)(sx + 1 * alpha)) >>
+ WARPEDDIFF_PREC_BITS]);
__m128i tmp_2 = _mm_loadl_epi64(
- (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ (__m128i *)&av1_filter_8bit[((unsigned)(sx + 2 * alpha)) >>
+ WARPEDDIFF_PREC_BITS]);
__m128i tmp_3 = _mm_loadl_epi64(
- (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ (__m128i *)&av1_filter_8bit[((unsigned)(sx + 3 * alpha)) >>
+ WARPEDDIFF_PREC_BITS]);
+
__m128i tmp_4 = _mm_loadl_epi64(
- (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ (__m128i *)&av1_filter_8bit[((unsigned)(sx + 4 * alpha)) >>
+ WARPEDDIFF_PREC_BITS]);
__m128i tmp_5 = _mm_loadl_epi64(
- (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ (__m128i *)&av1_filter_8bit[((unsigned)(sx + 5 * alpha)) >>
+ WARPEDDIFF_PREC_BITS]);
__m128i tmp_6 = _mm_loadl_epi64(
- (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ (__m128i *)&av1_filter_8bit[((unsigned)(sx + 6 * alpha)) >>
+ WARPEDDIFF_PREC_BITS]);
__m128i tmp_7 = _mm_loadl_epi64(
- (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ (__m128i *)&av1_filter_8bit[((unsigned)(sx + 7 * alpha)) >>
+ WARPEDDIFF_PREC_BITS]);
- tmp_0 = _mm_unpacklo_epi16(tmp_0, tmp_2);
- tmp_1 = _mm_unpacklo_epi16(tmp_1, tmp_3);
- tmp_4 = _mm_unpacklo_epi16(tmp_4, tmp_6);
- tmp_5 = _mm_unpacklo_epi16(tmp_5, tmp_7);
+ __m256i tmp0_256 = _mm256_castsi128_si256(tmp_0);
+ __m256i tmp2_256 = _mm256_castsi128_si256(tmp_2);
+ __m256i tmp1_256 = _mm256_castsi128_si256(tmp_1);
+ __m256i tmp3_256 = _mm256_castsi128_si256(tmp_3);
- __m128i tmp_8 =
- _mm_loadl_epi64((__m128i *)&av1_filter_8bit[((sx + beta) + 0 * alpha) >>
- WARPEDDIFF_PREC_BITS]);
- __m128i tmp_9 =
- _mm_loadl_epi64((__m128i *)&av1_filter_8bit[((sx + beta) + 1 * alpha) >>
- WARPEDDIFF_PREC_BITS]);
- __m128i tmp_10 =
- _mm_loadl_epi64((__m128i *)&av1_filter_8bit[((sx + beta) + 2 * alpha) >>
- WARPEDDIFF_PREC_BITS]);
- __m128i tmp_11 =
- _mm_loadl_epi64((__m128i *)&av1_filter_8bit[((sx + beta) + 3 * alpha) >>
- WARPEDDIFF_PREC_BITS]);
- tmp_2 =
- _mm_loadl_epi64((__m128i *)&av1_filter_8bit[((sx + beta) + 4 * alpha) >>
- WARPEDDIFF_PREC_BITS]);
- tmp_3 =
- _mm_loadl_epi64((__m128i *)&av1_filter_8bit[((sx + beta) + 5 * alpha) >>
- WARPEDDIFF_PREC_BITS]);
- tmp_6 =
- _mm_loadl_epi64((__m128i *)&av1_filter_8bit[((sx + beta) + 6 * alpha) >>
- WARPEDDIFF_PREC_BITS]);
- tmp_7 =
- _mm_loadl_epi64((__m128i *)&av1_filter_8bit[((sx + beta) + 7 * alpha) >>
- WARPEDDIFF_PREC_BITS]);
+ __m256i tmp4_256 = _mm256_castsi128_si256(tmp_4);
+ __m256i tmp6_256 = _mm256_castsi128_si256(tmp_6);
+ __m256i tmp5_256 = _mm256_castsi128_si256(tmp_5);
+ __m256i tmp7_256 = _mm256_castsi128_si256(tmp_7);
- tmp_8 = _mm_unpacklo_epi16(tmp_8, tmp_10);
- tmp_2 = _mm_unpacklo_epi16(tmp_2, tmp_6);
- tmp_9 = _mm_unpacklo_epi16(tmp_9, tmp_11);
- tmp_3 = _mm_unpacklo_epi16(tmp_3, tmp_7);
+ __m128i tmp_8 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 0 * alpha) >>
+ WARPEDDIFF_PREC_BITS]);
+ tmp0_256 = _mm256_inserti128_si256(tmp0_256, tmp_8, 1);
- const __m256i tmp_12 =
- _mm256_inserti128_si256(_mm256_castsi128_si256(tmp_0), tmp_8, 0x1);
- const __m256i tmp_13 =
- _mm256_inserti128_si256(_mm256_castsi128_si256(tmp_1), tmp_9, 0x1);
- const __m256i tmp_14 =
- _mm256_inserti128_si256(_mm256_castsi128_si256(tmp_4), tmp_2, 0x1);
- const __m256i tmp_15 =
- _mm256_inserti128_si256(_mm256_castsi128_si256(tmp_5), tmp_3, 0x1);
+ __m128i tmp_9 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 1 * alpha) >>
+ WARPEDDIFF_PREC_BITS]);
+ tmp1_256 = _mm256_inserti128_si256(tmp1_256, tmp_9, 1);
+
+ __m128i tmp_10 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 2 * alpha) >>
+ WARPEDDIFF_PREC_BITS]);
+ tmp2_256 = _mm256_inserti128_si256(tmp2_256, tmp_10, 1);
+
+ __m128i tmp_11 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 3 * alpha) >>
+ WARPEDDIFF_PREC_BITS]);
+ tmp3_256 = _mm256_inserti128_si256(tmp3_256, tmp_11, 1);
+
+ tmp_2 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 4 * alpha) >>
+ WARPEDDIFF_PREC_BITS]);
+ tmp4_256 = _mm256_inserti128_si256(tmp4_256, tmp_2, 1);
+
+ tmp_3 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 5 * alpha) >>
+ WARPEDDIFF_PREC_BITS]);
+ tmp5_256 = _mm256_inserti128_si256(tmp5_256, tmp_3, 1);
+
+ tmp_6 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 6 * alpha) >>
+ WARPEDDIFF_PREC_BITS]);
+ tmp6_256 = _mm256_inserti128_si256(tmp6_256, tmp_6, 1);
+
+ tmp_7 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 7 * alpha) >>
+ WARPEDDIFF_PREC_BITS]);
+ tmp7_256 = _mm256_inserti128_si256(tmp7_256, tmp_7, 1);
+
+ const __m256i tmp_12 = _mm256_unpacklo_epi16(tmp0_256, tmp2_256);
+ const __m256i tmp_13 = _mm256_unpacklo_epi16(tmp1_256, tmp3_256);
+ const __m256i tmp_14 = _mm256_unpacklo_epi16(tmp4_256, tmp6_256);
+ const __m256i tmp_15 = _mm256_unpacklo_epi16(tmp5_256, tmp7_256);
const __m256i res_0 = _mm256_unpacklo_epi32(tmp_12, tmp_14);
const __m256i res_1 = _mm256_unpackhi_epi32(tmp_12, tmp_14);