Optimise self-guided restoration SIMD functions
calc_ab has been improved for both the SSE4.1 and AVX2
versions of the self-guided filter. The edge masks are now
precomputed outside the pixel loops and, rather than being
applied only under CONFIG_DEBUG, are used in all builds
whenever the final group of columns in a row is only
partially valid. This results in a speedup of between 3%
and 5%, depending on the bit depth.
Change-Id: I83a12ba452fcbb61cce5066801ae213e23c609cd
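
The core of the change, sketched below in simplified form for the
SSE4.1 lane width (build_masks, mask_tail and LANES are placeholder
names for illustration, not identifiers from the patch), is to build
the small set of possible tail masks once and index them by the
number of valid columns, instead of deriving a mask from the width
inside every loop iteration:

    #include <assert.h>
    #include <smmintrin.h>  // SSE4.1 intrinsics

    #define LANES 4  // 32-bit lanes per __m128i

    // Build the possible tail masks once: masks[k] has its k lowest
    // 32-bit lanes set to all-ones and the remaining lanes zeroed.
    static void build_masks(__m128i masks[LANES]) {
      const __m128i ones32 = _mm_set_epi64x(0, 0xffffffffULL);
      for (int k = 0; k < LANES; k++) {
        // 32 set bits shifted right by 8 * (LANES - k) bits leave k
        // bytes of 0xff, which sign-extend to k lanes of 0xffffffff.
        const __m128i shift = _mm_set_epi64x(0, 8 * (LANES - k));
        masks[k] = _mm_cvtepi8_epi32(_mm_srl_epi64(ones32, shift));
      }
    }

    // Zero any lanes of v that fall beyond the last valid column.
    static __m128i mask_tail(__m128i v, const __m128i masks[LANES],
                             int valid) {
      assert(valid >= 1 && valid <= LANES);
      return (valid < LANES) ? _mm_and_si128(masks[valid], v) : v;
    }

Relative to the old CONFIG_DEBUG-only path, this replaces a
per-iteration set/shift/convert sequence with a single table lookup
at the end of each row; the AVX2 version does the same with eight
lanes and a 64-bit all-ones source value.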
diff --git a/av1/common/x86/selfguided_avx2.c b/av1/common/x86/selfguided_avx2.c
index f046180..a3f81d6 100644
--- a/av1/common/x86/selfguided_avx2.c
+++ b/av1/common/x86/selfguided_avx2.c
@@ -209,36 +209,32 @@
const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
+ // Set up masks: mask[idx] has its idx lowest 32-bit lanes set
+ const __m128i ones32 = _mm_set_epi64x(0, 0xffffffffffffffffULL);
+ __m256i mask[8];
+ for (int idx = 0; idx < 8; idx++) {
+ const __m128i shift = _mm_set_epi64x(0, 8 * (8 - idx));
+ mask[idx] = _mm256_cvtepi8_epi32(_mm_srl_epi64(ones32, shift));
+ }
+
for (int i = -1; i < height + 1; ++i) {
for (int j = -1; j < width + 1; j += 8) {
const int32_t *Cij = C + i * buf_stride + j;
const int32_t *Dij = D + i * buf_stride + j;
- const __m256i pre_sum1 = boxsum_from_ii(Dij, buf_stride, r);
- const __m256i pre_sum2 = boxsum_from_ii(Cij, buf_stride, r);
+ __m256i sum1 = boxsum_from_ii(Dij, buf_stride, r);
+ __m256i sum2 = boxsum_from_ii(Cij, buf_stride, r);
-#if CONFIG_DEBUG
- // When width + 2 isn't a multiple of eight, z will contain some
- // uninitialised data in its upper words. This isn't really a problem
- // (they will be clamped to safe indices by the min() below, and will be
- // written to memory locations that we don't read again), but Valgrind
- // complains because we're using an uninitialised value as the address
- // for a load operation
- //
- // This mask is reasonably cheap to compute and quiets the warnings. Note
- // that we can't mask p instead of sum1 and sum2 (which would be cheaper)
- // because Valgrind gets the taint propagation in compute_p wrong.
+ // When width + 2 isn't a multiple of 8, sum1 and sum2 will contain
+ // some uninitialised data in their upper words. We use a mask to
+ // ensure that these bits are set to 0.
+ int idx = AOMMIN(8, width + 1 - j);
+ assert(idx >= 1);
- const __m128i ones32 = _mm_set_epi64x(0, 0xffffffffffffffffULL);
- const __m128i shift =
- _mm_set_epi64x(0, AOMMAX(0, 8 * (8 - (width + 1 - j))));
- const __m256i mask = _mm256_cvtepi8_epi32(_mm_srl_epi64(ones32, shift));
- const __m256i sum1 = _mm256_and_si256(mask, pre_sum1);
- const __m256i sum2 = _mm256_and_si256(mask, pre_sum2);
-#else
- const __m256i sum1 = pre_sum1;
- const __m256i sum2 = pre_sum2;
-#endif // CONFIG_DEBUG
+ if (idx < 8) {
+ sum1 = _mm256_and_si256(mask[idx], sum1);
+ sum2 = _mm256_and_si256(mask[idx], sum2);
+ }
const __m256i p = compute_p(sum1, sum2, bit_depth, n);
diff --git a/av1/common/x86/selfguided_sse4.c b/av1/common/x86/selfguided_sse4.c
index d7688bb..1c8025e 100644
--- a/av1/common/x86/selfguided_sse4.c
+++ b/av1/common/x86/selfguided_sse4.c
@@ -166,35 +166,32 @@
const __m128i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
const __m128i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
+ // Set up masks: mask[idx] has its idx lowest 32-bit lanes set
+ const __m128i ones32 = _mm_set_epi64x(0, 0xffffffffULL);
+ __m128i mask[4];
+ for (int idx = 0; idx < 4; idx++) {
+ const __m128i shift = _mm_set_epi64x(0, 8 * (4 - idx));
+ mask[idx] = _mm_cvtepi8_epi32(_mm_srl_epi64(ones32, shift));
+ }
+
for (int i = -1; i < height + 1; ++i) {
- for (int j0 = -1; j0 < width + 1; j0 += 4) {
- const int32_t *Cij = C + i * buf_stride + j0;
- const int32_t *Dij = D + i * buf_stride + j0;
+ for (int j = -1; j < width + 1; j += 4) {
+ const int32_t *Cij = C + i * buf_stride + j;
+ const int32_t *Dij = D + i * buf_stride + j;
- const __m128i pre_sum1 = boxsum_from_ii(Dij, buf_stride, r);
- const __m128i pre_sum2 = boxsum_from_ii(Cij, buf_stride, r);
+ __m128i sum1 = boxsum_from_ii(Dij, buf_stride, r);
+ __m128i sum2 = boxsum_from_ii(Cij, buf_stride, r);
-#if CONFIG_DEBUG
- // When width + 2 isn't a multiple of four, z will contain some
- // uninitialised data in its upper words. This isn't really a problem
- // (they will be clamped to safe indices by the min() below, and will be
- // written to memory locations that we don't read again), but Valgrind
- // complains because we're using an uninitialised value as the address
- // for a load operation
- //
- // This mask is reasonably cheap to compute and quiets the warnings. Note
- // that we can't mask p instead of sum1 and sum2 (which would be cheaper)
- // because Valgrind gets the taint propagation in compute_p wrong.
- const __m128i ones32 = _mm_set_epi64x(0, 0xffffffffULL);
- const __m128i shift =
- _mm_set_epi64x(0, AOMMAX(0, 32 - 8 * (width + 1 - j0)));
- const __m128i mask = _mm_cvtepi8_epi32(_mm_srl_epi32(ones32, shift));
- const __m128i sum1 = _mm_and_si128(mask, pre_sum1);
- const __m128i sum2 = _mm_and_si128(mask, pre_sum2);
-#else
- const __m128i sum1 = pre_sum1;
- const __m128i sum2 = pre_sum2;
-#endif // CONFIG_DEBUG
+ // When width + 2 isn't a multiple of 4, sum1 and sum2 will contain
+ // some uninitialised data in their upper words. We use a mask to
+ // ensure that these bits are set to 0.
+ int idx = AOMMIN(4, width + 1 - j);
+ assert(idx >= 1);
+
+ if (idx < 4) {
+ sum1 = _mm_and_si128(mask[idx], sum1);
+ sum2 = _mm_and_si128(mask[idx], sum2);
+ }
const __m128i p = compute_p(sum1, sum2, bit_depth, n);
@@ -210,7 +207,7 @@
x_by_xplus1[_mm_extract_epi32(z, 1)],
x_by_xplus1[_mm_extract_epi32(z, 0)]);
- xx_storeu_128(A + i * buf_stride + j0, a_res);
+ xx_storeu_128(A + i * buf_stride + j, a_res);
const __m128i a_complement =
_mm_sub_epi32(_mm_set1_epi32(SGRPROJ_SGR), a_res);
@@ -223,7 +220,7 @@
const __m128i b_res =
_mm_srli_epi32(_mm_add_epi32(b_int, rnd_res), SGRPROJ_RECIP_BITS);
- xx_storeu_128(B + i * buf_stride + j0, b_res);
+ xx_storeu_128(B + i * buf_stride + j, b_res);
}
}
}
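
As a concrete illustration of the tail handling (example values only,
not taken from the patch): in the AVX2 version with width = 10, the
inner loop visits j = -1 and j = 7. At j = -1 all eight lanes are
valid (width + 1 - j = 12, so idx = 8) and no mask is applied; at
j = 7 only width + 1 - j = 4 lanes are valid, so idx = 4 and mask[4]
zeroes the four uninitialised upper lanes of sum1 and sum2 before
compute_p reads them.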