Fix and cleanup lowbd SIMD pixel_proj_error

In the low bit-depth pixel_proj_error SSE4.1 and AVX2 implementations
there are two very similar codepaths depending on whether only the first
or only the second SGR filter is enabled.  Combining these paths reduces
repetition with minimal extra complexity.
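
The combined branch simply selects the active filter's coefficient,
buffer and stride up front and then runs a single SIMD loop.  Roughly
(a sketch of the selection logic only; names as in the diff below):

} else if (params->r[0] > 0 || params->r[1] > 0) {
  const int xq_active = (params->r[0] > 0) ? xq[0] : xq[1];
  const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1;
  const int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride;
  /* ... same SIMD loop as before, using xq_active/flt/flt_stride ... */
}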

Also, the SSE4.1 implementation had an apparent bug where a for loop

for (j = 0; j < width - 8; j += 8)

should have been

for (j = 0; j <= width - 8; j += 8)

When the width is a multiple of the vector width, this meant the final
8 pixels of each row (or 16 in the loop that handles 16 pixels per
iteration) were processed by the scalar remainder loop rather than the
optimised SIMD loop, reducing the speed-up.  Fixing this makes the
SSE4.1 implementation of lowbd pixel_proj_error 20-70% faster depending
on filter configuration (and RUI size).
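
For example, with width == 64 (a value chosen purely for illustration):

for (j = 0; j < 64 - 8; j += 8)   /* last SIMD iteration: j == 48 */
for (j = 0; j <= 64 - 8; j += 8)  /* last SIMD iteration: j == 56 */

With the old bound, dat[56..63] fell through to the scalar remainder
loop; with the corrected bound the whole row stays in the SIMD loop.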

Change-Id: Iccdc7ea9da8131e2a4d6a0ee14b0a9924903e59b
diff --git a/av1/encoder/x86/pickrst_avx2.c b/av1/encoder/x86/pickrst_avx2.c
index e87a7b8..579e424 100644
--- a/av1/encoder/x86/pickrst_avx2.c
+++ b/av1/encoder/x86/pickrst_avx2.c
@@ -549,22 +549,25 @@
       sum64 = _mm256_add_epi64(sum64, sum64_0);
       sum64 = _mm256_add_epi64(sum64, sum64_1);
     }
-  } else if (params->r[0] > 0) {
-    __m256i xq_coeff =
-        pair_set_epi16(xq[0], (-xq[0] * (1 << SGRPROJ_RST_BITS)));
+  } else if (params->r[0] > 0 || params->r[1] > 0) {
+    const int xq_active = (params->r[0] > 0) ? xq[0] : xq[1];
+    const __m256i xq_coeff =
+        pair_set_epi16(xq_active, (-xq_active * (1 << SGRPROJ_RST_BITS)));
+    const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1;
+    const int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride;
     for (i = 0; i < height; ++i) {
       __m256i sum32 = _mm256_setzero_si256();
       for (j = 0; j <= width - 16; j += 16) {
         const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
         const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
-        const __m256i flt0_16b = _mm256_permute4x64_epi64(
-            _mm256_packs_epi32(yy_loadu_256(flt0 + j),
-                               yy_loadu_256(flt0 + j + 8)),
+        const __m256i flt_16b = _mm256_permute4x64_epi64(
+            _mm256_packs_epi32(yy_loadu_256(flt + j),
+                               yy_loadu_256(flt + j + 8)),
             0xd8);
         const __m256i v0 =
-            _mm256_madd_epi16(xq_coeff, _mm256_unpacklo_epi16(flt0_16b, d0));
+            _mm256_madd_epi16(xq_coeff, _mm256_unpacklo_epi16(flt_16b, d0));
         const __m256i v1 =
-            _mm256_madd_epi16(xq_coeff, _mm256_unpackhi_epi16(flt0_16b, d0));
+            _mm256_madd_epi16(xq_coeff, _mm256_unpackhi_epi16(flt_16b, d0));
         const __m256i vr0 =
             _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift);
         const __m256i vr1 =
@@ -576,53 +579,13 @@
       }
       for (k = j; k < width; ++k) {
         const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
-        int32_t v = xq[0] * (flt0[k] - u);
+        int32_t v = xq_active * (flt[k] - u);
         const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
         err += e * e;
       }
       dat += dat_stride;
       src += src_stride;
-      flt0 += flt0_stride;
-      const __m256i sum64_0 =
-          _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
-      const __m256i sum64_1 =
-          _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1));
-      sum64 = _mm256_add_epi64(sum64, sum64_0);
-      sum64 = _mm256_add_epi64(sum64, sum64_1);
-    }
-  } else if (params->r[1] > 0) {
-    __m256i xq_coeff = pair_set_epi16(xq[1], -(xq[1] << SGRPROJ_RST_BITS));
-    for (i = 0; i < height; ++i) {
-      __m256i sum32 = _mm256_setzero_si256();
-      for (j = 0; j <= width - 16; j += 16) {
-        const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
-        const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
-        const __m256i flt1_16b = _mm256_permute4x64_epi64(
-            _mm256_packs_epi32(yy_loadu_256(flt1 + j),
-                               yy_loadu_256(flt1 + j + 8)),
-            0xd8);
-        const __m256i v0 =
-            _mm256_madd_epi16(xq_coeff, _mm256_unpacklo_epi16(flt1_16b, d0));
-        const __m256i v1 =
-            _mm256_madd_epi16(xq_coeff, _mm256_unpackhi_epi16(flt1_16b, d0));
-        const __m256i vr0 =
-            _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift);
-        const __m256i vr1 =
-            _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift);
-        const __m256i e0 = _mm256_sub_epi16(
-            _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0);
-        const __m256i err0 = _mm256_madd_epi16(e0, e0);
-        sum32 = _mm256_add_epi32(sum32, err0);
-      }
-      for (k = j; k < width; ++k) {
-        const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
-        int32_t v = xq[1] * (flt1[k] - u);
-        const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
-        err += e * e;
-      }
-      dat += dat_stride;
-      src += src_stride;
-      flt1 += flt1_stride;
+      flt += flt_stride;
       const __m256i sum64_0 =
           _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
       const __m256i sum64_1 =
diff --git a/av1/encoder/x86/pickrst_sse4.c b/av1/encoder/x86/pickrst_sse4.c
index 16394de..a067ab6 100644
--- a/av1/encoder/x86/pickrst_sse4.c
+++ b/av1/encoder/x86/pickrst_sse4.c
@@ -514,7 +514,7 @@
     __m128i xq_coeff = pair_set_epi16(xq[0], xq[1]);
     for (i = 0; i < height; ++i) {
       __m128i sum32 = _mm_setzero_si128();
-      for (j = 0; j < width - 8; j += 8) {
+      for (j = 0; j <= width - 8; j += 8) {
         const __m128i d0 = _mm_cvtepu8_epi16(xx_loadl_64(dat + j));
         const __m128i s0 = _mm_cvtepu8_epi16(xx_loadl_64(src + j));
         const __m128i flt0_16b =
@@ -550,19 +550,23 @@
       sum64 = _mm_add_epi64(sum64, sum64_0);
       sum64 = _mm_add_epi64(sum64, sum64_1);
     }
-  } else if (params->r[0] > 0) {
-    __m128i xq_coeff = pair_set_epi16(xq[0], -(xq[0] << SGRPROJ_RST_BITS));
+  } else if (params->r[0] > 0 || params->r[1] > 0) {
+    const int xq_active = (params->r[0] > 0) ? xq[0] : xq[1];
+    const __m128i xq_coeff =
+        pair_set_epi16(xq_active, -(xq_active << SGRPROJ_RST_BITS));
+    const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1;
+    const int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride;
     for (i = 0; i < height; ++i) {
       __m128i sum32 = _mm_setzero_si128();
-      for (j = 0; j < width - 8; j += 8) {
+      for (j = 0; j <= width - 8; j += 8) {
         const __m128i d0 = _mm_cvtepu8_epi16(xx_loadl_64(dat + j));
         const __m128i s0 = _mm_cvtepu8_epi16(xx_loadl_64(src + j));
-        const __m128i flt0_16b =
-            _mm_packs_epi32(xx_loadu_128(flt0 + j), xx_loadu_128(flt0 + j + 4));
+        const __m128i flt_16b =
+            _mm_packs_epi32(xx_loadu_128(flt + j), xx_loadu_128(flt + j + 4));
         const __m128i v0 =
-            _mm_madd_epi16(xq_coeff, _mm_unpacklo_epi16(flt0_16b, d0));
+            _mm_madd_epi16(xq_coeff, _mm_unpacklo_epi16(flt_16b, d0));
         const __m128i v1 =
-            _mm_madd_epi16(xq_coeff, _mm_unpackhi_epi16(flt0_16b, d0));
+            _mm_madd_epi16(xq_coeff, _mm_unpackhi_epi16(flt_16b, d0));
         const __m128i vr0 = _mm_srai_epi32(_mm_add_epi32(v0, rounding), shift);
         const __m128i vr1 = _mm_srai_epi32(_mm_add_epi32(v1, rounding), shift);
         const __m128i e0 =
@@ -572,47 +576,13 @@
       }
       for (k = j; k < width; ++k) {
         const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
-        int32_t v = xq[0] * (flt0[k] - u);
+        int32_t v = xq_active * (flt[k] - u);
         const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
         err += e * e;
       }
       dat += dat_stride;
       src += src_stride;
-      flt0 += flt0_stride;
-      const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32);
-      const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8));
-      sum64 = _mm_add_epi64(sum64, sum64_0);
-      sum64 = _mm_add_epi64(sum64, sum64_1);
-    }
-  } else if (params->r[1] > 0) {
-    __m128i xq_coeff = pair_set_epi16(xq[1], -(xq[1] << SGRPROJ_RST_BITS));
-    for (i = 0; i < height; ++i) {
-      __m128i sum32 = _mm_setzero_si128();
-      for (j = 0; j < width - 8; j += 8) {
-        const __m128i d0 = _mm_cvtepu8_epi16(xx_loadl_64(dat + j));
-        const __m128i s0 = _mm_cvtepu8_epi16(xx_loadl_64(src + j));
-        const __m128i flt1_16b =
-            _mm_packs_epi32(xx_loadu_128(flt1 + j), xx_loadu_128(flt1 + j + 4));
-        const __m128i v0 =
-            _mm_madd_epi16(xq_coeff, _mm_unpacklo_epi16(flt1_16b, d0));
-        const __m128i v1 =
-            _mm_madd_epi16(xq_coeff, _mm_unpackhi_epi16(flt1_16b, d0));
-        const __m128i vr0 = _mm_srai_epi32(_mm_add_epi32(v0, rounding), shift);
-        const __m128i vr1 = _mm_srai_epi32(_mm_add_epi32(v1, rounding), shift);
-        const __m128i e0 =
-            _mm_sub_epi16(_mm_add_epi16(_mm_packs_epi32(vr0, vr1), d0), s0);
-        const __m128i err0 = _mm_madd_epi16(e0, e0);
-        sum32 = _mm_add_epi32(sum32, err0);
-      }
-      for (k = j; k < width; ++k) {
-        const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
-        int32_t v = xq[1] * (flt1[k] - u);
-        const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
-        err += e * e;
-      }
-      dat += dat_stride;
-      src += src_stride;
-      flt1 += flt1_stride;
+      flt += flt_stride;
       const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32);
       const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8));
       sum64 = _mm_add_epi64(sum64, sum64_0);
@@ -621,7 +591,7 @@
   } else {
     __m128i sum32 = _mm_setzero_si128();
     for (i = 0; i < height; ++i) {
-      for (j = 0; j < width - 16; j += 16) {
+      for (j = 0; j <= width - 16; j += 16) {
         const __m128i d = xx_loadu_128(dat + j);
         const __m128i s = xx_loadu_128(src + j);
         const __m128i d0 = _mm_cvtepu8_epi16(d);