Fix and cleanup lowbd SIMD pixel_proj_error

In the low bit-depth pixel_proj_error SSE4.1 and AVX2 implementations
there are two very similar codepaths depending on whether only the first
or only the second SGR filter is enabled.  Combining these paths reduces
repetition with minimal extra complexity.
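
The combined branch simply selects the active filter's coefficient,
buffer and stride up front and then runs a single SIMD loop.  Roughly
(a sketch of the selection logic only; names as in the diff below):

} else if (params->r[0] > 0 || params->r[1] > 0) {
  const int xq_active = (params->r[0] > 0) ? xq[0] : xq[1];
  const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1;
  const int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride;
  /* ... same SIMD loop as before, using xq_active/flt/flt_stride ... */
}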

Also, the SSE4.1 implementation had an apparent bug where a for loop

for (j = 0; j < width - 8; j += 8)

should have been

for (j = 0; j <= width - 8; j += 8)

When the width is a multiple of the vector width, this meant the final
8 pixels of each row (or 16 in the loop that handles 16 pixels per
iteration) were processed by the scalar remainder loop rather than the
optimised SIMD loop, reducing the speed-up.  Fixing this makes the
SSE4.1 implementation of lowbd pixel_proj_error 20-70% faster depending
on filter configuration (and RUI size).
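
For example, with width == 64 (a value chosen purely for illustration):

for (j = 0; j < 64 - 8; j += 8)   /* last SIMD iteration: j == 48 */
for (j = 0; j <= 64 - 8; j += 8)  /* last SIMD iteration: j == 56 */

With the old bound, dat[56..63] fell through to the scalar remainder
loop; with the corrected bound the whole row stays in the SIMD loop.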

Change-Id: Iccdc7ea9da8131e2a4d6a0ee14b0a9924903e59b
diff --git a/av1/encoder/x86/pickrst_avx2.c b/av1/encoder/x86/pickrst_avx2.c
index e87a7b8..579e424 100644
--- a/av1/encoder/x86/pickrst_avx2.c
+++ b/av1/encoder/x86/pickrst_avx2.c
@@ -549,22 +549,25 @@
       sum64 = _mm256_add_epi64(sum64, sum64_0);
       sum64 = _mm256_add_epi64(sum64, sum64_1);
     }
-  } else if (params->r[0] > 0) {
-    __m256i xq_coeff =
-        pair_set_epi16(xq[0], (-xq[0] * (1 << SGRPROJ_RST_BITS)));
+  } else if (params->r[0] > 0 || params->r[1] > 0) {
+    const int xq_active = (params->r[0] > 0) ? xq[0] : xq[1];
+    const __m256i xq_coeff =
+        pair_set_epi16(xq_active, (-xq_active * (1 << SGRPROJ_RST_BITS)));
+    const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1;
+    const int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride;
     for (i = 0; i < height; ++i) {
       __m256i sum32 = _mm256_setzero_si256();
       for (j = 0; j <= width - 16; j += 16) {
         const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
         const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
-        const __m256i flt0_16b = _mm256_permute4x64_epi64(
-            _mm256_packs_epi32(yy_loadu_256(flt0 + j),
-                               yy_loadu_256(flt0 + j + 8)),
+        const __m256i flt_16b = _mm256_permute4x64_epi64(
+            _mm256_packs_epi32(yy_loadu_256(flt + j),
+                               yy_loadu_256(flt + j + 8)),
             0xd8);
         const __m256i v0 =
-            _mm256_madd_epi16(xq_coeff, _mm256_unpacklo_epi16(flt0_16b, d0));
+            _mm256_madd_epi16(xq_coeff, _mm256_unpacklo_epi16(flt_16b, d0));
         const __m256i v1 =
-            _mm256_madd_epi16(xq_coeff, _mm256_unpackhi_epi16(flt0_16b, d0));
+            _mm256_madd_epi16(xq_coeff, _mm256_unpackhi_epi16(flt_16b, d0));
         const __m256i vr0 =
             _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift);
         const __m256i vr1 =
@@ -576,53 +579,13 @@
       }
       for (k = j; k < width; ++k) {
         const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
-        int32_t v = xq[0] * (flt0[k] - u);
+        int32_t v = xq_active * (flt[k] - u);
         const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
         err += e * e;
       }
       dat += dat_stride;
       src += src_stride;
-      flt0 += flt0_stride;
-      const __m256i sum64_0 =
-          _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
-      const __m256i sum64_1 =
-          _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1));
-      sum64 = _mm256_add_epi64(sum64, sum64_0);
-      sum64 = _mm256_add_epi64(sum64, sum64_1);
-    }
-  } else if (params->r[1] > 0) {
-    __m256i xq_coeff = pair_set_epi16(xq[1], -(xq[1] << SGRPROJ_RST_BITS));
-    for (i = 0; i < height; ++i) {
-      __m256i sum32 = _mm256_setzero_si256();
-      for (j = 0; j <= width - 16; j += 16) {
-        const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
-        const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
-        const __m256i flt1_16b = _mm256_permute4x64_epi64(
-            _mm256_packs_epi32(yy_loadu_256(flt1 + j),
-                               yy_loadu_256(flt1 + j + 8)),
-            0xd8);
-        const __m256i v0 =
-            _mm256_madd_epi16(xq_coeff, _mm256_unpacklo_epi16(flt1_16b, d0));
-        const __m256i v1 =
-            _mm256_madd_epi16(xq_coeff, _mm256_unpackhi_epi16(flt1_16b, d0));
-        const __m256i vr0 =
-            _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift);
-        const __m256i vr1 =
-            _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift);
-        const __m256i e0 = _mm256_sub_epi16(
-            _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0);
-        const __m256i err0 = _mm256_madd_epi16(e0, e0);
-        sum32 = _mm256_add_epi32(sum32, err0);
-      }
-      for (k = j; k < width; ++k) {
-        const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
-        int32_t v = xq[1] * (flt1[k] - u);
-        const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
-        err += e * e;
-      }
-      dat += dat_stride;
-      src += src_stride;
-      flt1 += flt1_stride;
+      flt += flt_stride;
       const __m256i sum64_0 =
           _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
       const __m256i sum64_1 =
diff --git a/av1/encoder/x86/pickrst_sse4.c b/av1/encoder/x86/pickrst_sse4.c
index 16394de..a067ab6 100644
--- a/av1/encoder/x86/pickrst_sse4.c
+++ b/av1/encoder/x86/pickrst_sse4.c
@@ -514,7 +514,7 @@
     __m128i xq_coeff = pair_set_epi16(xq[0], xq[1]);
     for (i = 0; i < height; ++i) {
       __m128i sum32 = _mm_setzero_si128();
-      for (j = 0; j < width - 8; j += 8) {
+      for (j = 0; j <= width - 8; j += 8) {
         const __m128i d0 = _mm_cvtepu8_epi16(xx_loadl_64(dat + j));
         const __m128i s0 = _mm_cvtepu8_epi16(xx_loadl_64(src + j));
         const __m128i flt0_16b =
@@ -550,19 +550,23 @@
       sum64 = _mm_add_epi64(sum64, sum64_0);
       sum64 = _mm_add_epi64(sum64, sum64_1);
     }
-  } else if (params->r[0] > 0) {
-    __m128i xq_coeff = pair_set_epi16(xq[0], -(xq[0] << SGRPROJ_RST_BITS));
+  } else if (params->r[0] > 0 || params->r[1] > 0) {
+    const int xq_active = (params->r[0] > 0) ? xq[0] : xq[1];
+    const __m128i xq_coeff =
+        pair_set_epi16(xq_active, -(xq_active << SGRPROJ_RST_BITS));
+    const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1;
+    const int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride;
     for (i = 0; i < height; ++i) {
       __m128i sum32 = _mm_setzero_si128();
-      for (j = 0; j < width - 8; j += 8) {
+      for (j = 0; j <= width - 8; j += 8) {
         const __m128i d0 = _mm_cvtepu8_epi16(xx_loadl_64(dat + j));
         const __m128i s0 = _mm_cvtepu8_epi16(xx_loadl_64(src + j));
-        const __m128i flt0_16b =
-            _mm_packs_epi32(xx_loadu_128(flt0 + j), xx_loadu_128(flt0 + j + 4));
+        const __m128i flt_16b =
+            _mm_packs_epi32(xx_loadu_128(flt + j), xx_loadu_128(flt + j + 4));
         const __m128i v0 =
-            _mm_madd_epi16(xq_coeff, _mm_unpacklo_epi16(flt0_16b, d0));
+            _mm_madd_epi16(xq_coeff, _mm_unpacklo_epi16(flt_16b, d0));
         const __m128i v1 =
-            _mm_madd_epi16(xq_coeff, _mm_unpackhi_epi16(flt0_16b, d0));
+            _mm_madd_epi16(xq_coeff, _mm_unpackhi_epi16(flt_16b, d0));
         const __m128i vr0 = _mm_srai_epi32(_mm_add_epi32(v0, rounding), shift);
         const __m128i vr1 = _mm_srai_epi32(_mm_add_epi32(v1, rounding), shift);
         const __m128i e0 =
@@ -572,47 +576,13 @@
       }
       for (k = j; k < width; ++k) {
         const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
-        int32_t v = xq[0] * (flt0[k] - u);
+        int32_t v = xq_active * (flt[k] - u);
         const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
         err += e * e;
       }
       dat += dat_stride;
       src += src_stride;
-      flt0 += flt0_stride;
-      const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32);
-      const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8));
-      sum64 = _mm_add_epi64(sum64, sum64_0);
-      sum64 = _mm_add_epi64(sum64, sum64_1);
-    }
-  } else if (params->r[1] > 0) {
-    __m128i xq_coeff = pair_set_epi16(xq[1], -(xq[1] << SGRPROJ_RST_BITS));
-    for (i = 0; i < height; ++i) {
-      __m128i sum32 = _mm_setzero_si128();
-      for (j = 0; j < width - 8; j += 8) {
-        const __m128i d0 = _mm_cvtepu8_epi16(xx_loadl_64(dat + j));
-        const __m128i s0 = _mm_cvtepu8_epi16(xx_loadl_64(src + j));
-        const __m128i flt1_16b =
-            _mm_packs_epi32(xx_loadu_128(flt1 + j), xx_loadu_128(flt1 + j + 4));
-        const __m128i v0 =
-            _mm_madd_epi16(xq_coeff, _mm_unpacklo_epi16(flt1_16b, d0));
-        const __m128i v1 =
-            _mm_madd_epi16(xq_coeff, _mm_unpackhi_epi16(flt1_16b, d0));
-        const __m128i vr0 = _mm_srai_epi32(_mm_add_epi32(v0, rounding), shift);
-        const __m128i vr1 = _mm_srai_epi32(_mm_add_epi32(v1, rounding), shift);
-        const __m128i e0 =
-            _mm_sub_epi16(_mm_add_epi16(_mm_packs_epi32(vr0, vr1), d0), s0);
-        const __m128i err0 = _mm_madd_epi16(e0, e0);
-        sum32 = _mm_add_epi32(sum32, err0);
-      }
-      for (k = j; k < width; ++k) {
-        const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
-        int32_t v = xq[1] * (flt1[k] - u);
-        const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
-        err += e * e;
-      }
-      dat += dat_stride;
-      src += src_stride;
-      flt1 += flt1_stride;
+      flt += flt_stride;
       const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32);
       const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8));
       sum64 = _mm_add_epi64(sum64, sum64_0);
@@ -621,7 +591,7 @@
   } else {
     __m128i sum32 = _mm_setzero_si128();
     for (i = 0; i < height; ++i) {
-      for (j = 0; j < width - 16; j += 16) {
+      for (j = 0; j <= width - 16; j += 16) {
         const __m128i d = xx_loadu_128(dat + j);
         const __m128i s = xx_loadu_128(src + j);
         const __m128i d0 = _mm_cvtepu8_epi16(d);