Optimise SSE4.1 self-guided filter implementation
The cross_sum function has been replaced by a more optimised
version, increasing the speed of the filter by ~5%.
Change-Id: Ieb0fbe53033591919f719d0a288a55abd74ba2e4
diff --git a/av1/common/x86/selfguided_sse4.c b/av1/common/x86/selfguided_sse4.c
index 1247b92..d7688bb 100644
--- a/av1/common/x86/selfguided_sse4.c
+++ b/av1/common/x86/selfguided_sse4.c
@@ -231,22 +231,34 @@
// Calculate 4 values of the "cross sum" starting at buf. This is a 3x3 filter
// where the outer four corners have weight 3 and all other pixels have weight
// 4.
+//
+// Pixels are indexed like this:
+// xtl xt xtr
+// xl x xr
+// xbl xb xbr
+//
+// buf points to x
+//
+// fours = xl + xt + xr + xb + x
+// threes = xtl + xtr + xbr + xbl
+// cross_sum = 4 * fours + 3 * threes
+// = 4 * (fours + threes) - threes
+// = ((fours + threes) << 2) - threes
static __m128i cross_sum(const int32_t *buf, int stride) {
- const __m128i a0 = xx_loadu_128(buf - 1 - stride);
- const __m128i a1 = xx_loadu_128(buf + 3 - stride);
- const __m128i b0 = xx_loadu_128(buf - 1);
- const __m128i b1 = xx_loadu_128(buf + 3);
- const __m128i c0 = xx_loadu_128(buf - 1 + stride);
- const __m128i c1 = xx_loadu_128(buf + 3 + stride);
+ const __m128i xtl = xx_loadu_128(buf - 1 - stride);
+ const __m128i xt = xx_loadu_128(buf - stride);
+ const __m128i xtr = xx_loadu_128(buf + 1 - stride);
+ const __m128i xl = xx_loadu_128(buf - 1);
+ const __m128i x = xx_loadu_128(buf);
+ const __m128i xr = xx_loadu_128(buf + 1);
+ const __m128i xbl = xx_loadu_128(buf - 1 + stride);
+ const __m128i xb = xx_loadu_128(buf + stride);
+ const __m128i xbr = xx_loadu_128(buf + 1 + stride);
- const __m128i fours =
- _mm_add_epi32(_mm_add_epi32(_mm_add_epi32(_mm_alignr_epi8(b1, b0, 4), b0),
- _mm_add_epi32(_mm_alignr_epi8(b1, b0, 8),
- _mm_alignr_epi8(c1, c0, 4))),
- _mm_alignr_epi8(a1, a0, 4));
- const __m128i threes = _mm_add_epi32(
- _mm_add_epi32(a0, c0),
- _mm_add_epi32(_mm_alignr_epi8(a1, a0, 8), _mm_alignr_epi8(c1, c0, 8)));
+ const __m128i fours = _mm_add_epi32(
+ xl, _mm_add_epi32(xt, _mm_add_epi32(xr, _mm_add_epi32(xb, x))));
+ const __m128i threes =
+ _mm_add_epi32(xtl, _mm_add_epi32(xtr, _mm_add_epi32(xbr, xbl)));
return _mm_sub_epi32(_mm_slli_epi32(_mm_add_epi32(fours, threes), 2), threes);
}