Optimise self-guided restoration SIMD functions

Improvements have been made to calc_ab for both the
SSE4.1 and AVX2 versions of the self-guided filter.
These result in a speed increase of between
3% and 5%, depending on the bit depth.
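
As a rough illustration (a scalar model only, not code from this
patch; build_lane_masks is a hypothetical name), the mask table that
calc_ab now builds once per call amounts to:

  #include <stdint.h>

  // mask[idx] keeps the low `idx` 32-bit lanes of a group and zeroes
  // the rest, e.g. for the 4-lane SSE4.1 case:
  //   mask[1] = { ~0,  0,  0,  0 }
  //   mask[3] = { ~0, ~0, ~0,  0 }
  static void build_lane_masks(uint32_t mask[4][4]) {
    for (int idx = 0; idx < 4; idx++)
      for (int lane = 0; lane < 4; lane++)
        mask[idx][lane] = (lane < idx) ? UINT32_MAX : 0;
  }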

Change-Id: I83a12ba452fcbb61cce5066801ae213e23c609cd
diff --git a/av1/common/x86/selfguided_avx2.c b/av1/common/x86/selfguided_avx2.c
index f046180..a3f81d6 100644
--- a/av1/common/x86/selfguided_avx2.c
+++ b/av1/common/x86/selfguided_avx2.c
@@ -209,36 +209,32 @@
   const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
   const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
 
+  // Set up masks
+  const __m128i ones32 = _mm_set_epi64x(0, 0xffffffffffffffffULL);
+  __m256i mask[8];
+  for (int idx = 0; idx < 8; idx++) {
+    const __m128i shift = _mm_set_epi64x(0, 8 * (8 - idx));
+    mask[idx] = _mm256_cvtepi8_epi32(_mm_srl_epi64(ones32, shift));
+  }
+
   for (int i = -1; i < height + 1; ++i) {
     for (int j = -1; j < width + 1; j += 8) {
       const int32_t *Cij = C + i * buf_stride + j;
       const int32_t *Dij = D + i * buf_stride + j;
 
-      const __m256i pre_sum1 = boxsum_from_ii(Dij, buf_stride, r);
-      const __m256i pre_sum2 = boxsum_from_ii(Cij, buf_stride, r);
+      __m256i sum1 = boxsum_from_ii(Dij, buf_stride, r);
+      __m256i sum2 = boxsum_from_ii(Cij, buf_stride, r);
 
-#if CONFIG_DEBUG
-      // When width + 2 isn't a multiple of eight, z will contain some
-      // uninitialised data in its upper words. This isn't really a problem
-      // (they will be clamped to safe indices by the min() below, and will be
-      // written to memory locations that we don't read again), but Valgrind
-      // complains because we're using an uninitialised value as the address
-      // for a load operation
-      //
-      // This mask is reasonably cheap to compute and quiets the warnings. Note
-      // that we can't mask p instead of sum1 and sum2 (which would be cheaper)
-      // because Valgrind gets the taint propagation in compute_p wrong.
+      // When width + 2 isn't a multiple of 8, sum1 and sum2 will contain
+      // some uninitialised data in their upper words. We use a mask to
+      // ensure that these bits are set to 0.
+      int idx = AOMMIN(8, width + 1 - j);
+      assert(idx >= 1);
 
-      const __m128i ones32 = _mm_set_epi64x(0, 0xffffffffffffffffULL);
-      const __m128i shift =
-          _mm_set_epi64x(0, AOMMAX(0, 8 * (8 - (width + 1 - j))));
-      const __m256i mask = _mm256_cvtepi8_epi32(_mm_srl_epi64(ones32, shift));
-      const __m256i sum1 = _mm256_and_si256(mask, pre_sum1);
-      const __m256i sum2 = _mm256_and_si256(mask, pre_sum2);
-#else
-      const __m256i sum1 = pre_sum1;
-      const __m256i sum2 = pre_sum2;
-#endif  // CONFIG_DEBUG
+      if (idx < 8) {
+        sum1 = _mm256_and_si256(mask[idx], sum1);
+        sum2 = _mm256_and_si256(mask[idx], sum2);
+      }
 
       const __m256i p = compute_p(sum1, sum2, bit_depth, n);
 
diff --git a/av1/common/x86/selfguided_sse4.c b/av1/common/x86/selfguided_sse4.c
index d7688bb..1c8025e 100644
--- a/av1/common/x86/selfguided_sse4.c
+++ b/av1/common/x86/selfguided_sse4.c
@@ -166,35 +166,32 @@
   const __m128i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
   const __m128i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
 
+  // Set up masks
+  const __m128i ones32 = _mm_set_epi64x(0, 0xffffffffULL);
+  __m128i mask[4];
+  for (int idx = 0; idx < 4; idx++) {
+    const __m128i shift = _mm_set_epi64x(0, 8 * (4 - idx));
+    mask[idx] = _mm_cvtepi8_epi32(_mm_srl_epi64(ones32, shift));
+  }
+
   for (int i = -1; i < height + 1; ++i) {
-    for (int j0 = -1; j0 < width + 1; j0 += 4) {
-      const int32_t *Cij = C + i * buf_stride + j0;
-      const int32_t *Dij = D + i * buf_stride + j0;
+    for (int j = -1; j < width + 1; j += 4) {
+      const int32_t *Cij = C + i * buf_stride + j;
+      const int32_t *Dij = D + i * buf_stride + j;
 
-      const __m128i pre_sum1 = boxsum_from_ii(Dij, buf_stride, r);
-      const __m128i pre_sum2 = boxsum_from_ii(Cij, buf_stride, r);
+      __m128i sum1 = boxsum_from_ii(Dij, buf_stride, r);
+      __m128i sum2 = boxsum_from_ii(Cij, buf_stride, r);
 
-#if CONFIG_DEBUG
-      // When width + 2 isn't a multiple of four, z will contain some
-      // uninitialised data in its upper words. This isn't really a problem
-      // (they will be clamped to safe indices by the min() below, and will be
-      // written to memory locations that we don't read again), but Valgrind
-      // complains because we're using an uninitialised value as the address
-      // for a load operation
-      //
-      // This mask is reasonably cheap to compute and quiets the warnings. Note
-      // that we can't mask p instead of sum1 and sum2 (which would be cheaper)
-      // because Valgrind gets the taint propagation in compute_p wrong.
-      const __m128i ones32 = _mm_set_epi64x(0, 0xffffffffULL);
-      const __m128i shift =
-          _mm_set_epi64x(0, AOMMAX(0, 32 - 8 * (width + 1 - j0)));
-      const __m128i mask = _mm_cvtepi8_epi32(_mm_srl_epi32(ones32, shift));
-      const __m128i sum1 = _mm_and_si128(mask, pre_sum1);
-      const __m128i sum2 = _mm_and_si128(mask, pre_sum2);
-#else
-      const __m128i sum1 = pre_sum1;
-      const __m128i sum2 = pre_sum2;
-#endif  // CONFIG_DEBUG
+      // When width + 2 isn't a multiple of 4, sum1 and sum2 will contain
+      // some uninitialised data in their upper words. We use a mask to
+      // ensure that these bits are set to 0.
+      int idx = AOMMIN(4, width + 1 - j);
+      assert(idx >= 1);
+
+      if (idx < 4) {
+        sum1 = _mm_and_si128(mask[idx], sum1);
+        sum2 = _mm_and_si128(mask[idx], sum2);
+      }
 
       const __m128i p = compute_p(sum1, sum2, bit_depth, n);
 
@@ -210,7 +207,7 @@
                                           x_by_xplus1[_mm_extract_epi32(z, 1)],
                                           x_by_xplus1[_mm_extract_epi32(z, 0)]);
 
-      xx_storeu_128(A + i * buf_stride + j0, a_res);
+      xx_storeu_128(A + i * buf_stride + j, a_res);
 
       const __m128i a_complement =
           _mm_sub_epi32(_mm_set1_epi32(SGRPROJ_SGR), a_res);
@@ -223,7 +220,7 @@
       const __m128i b_res =
           _mm_srli_epi32(_mm_add_epi32(b_int, rnd_res), SGRPROJ_RECIP_BITS);
 
-      xx_storeu_128(B + i * buf_stride + j0, b_res);
+      xx_storeu_128(B + i * buf_stride + j, b_res);
     }
   }
 }
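
For reference, the masking in the new inner loop is equivalent to the
following scalar sketch (illustrative only; clear_tail and nlanes are
hypothetical names, not identifiers from the patch). Only the last
group of a row can contain lanes past width + 1, so it is the only
group that pays for the AND.

  #include <stdint.h>

  // Zero any box sums that fall past the last valid column of the row.
  // idx is the number of valid lanes in this group, as in the patch.
  static void clear_tail(int32_t *sum1, int32_t *sum2, int idx, int nlanes) {
    for (int lane = idx; lane < nlanes; lane++) {
      sum1[lane] = 0;
      sum2[lane] = 0;
    }
  }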