Cleanup SIMD FAST_SGR code Remove redundant SIMD code for the original FAST_SGR experiment and CONFIG_FAST_SGR == 2, and cleanup. Change-Id: If36e6843543e4678cf92e21b92d24c2941645de7

diff --git a/av1/common/x86/selfguided_avx2.c b/av1/common/x86/selfguided_avx2.c
index 23b651d..e1f50e3 100644
--- a/av1/common/x86/selfguided_avx2.c
+++ b/av1/common/x86/selfguided_avx2.c

@@ -157,8 +157,8 @@
   }
 }
 
-// Compute four values of boxsum from the given integral image. ii should point
-// at the middle of the box (for the first value). r is the box radius
+// Compute 8 values of boxsum from the given integral image. ii should point
+// at the middle of the box (for the first value). r is the box radius.
 static __m256i boxsum_from_ii(const int32_t *ii, int stride, int r) {
   const __m256i tl = yy_loadu_256(ii - (r + 1) - (r + 1) * stride);
   const __m256i tr = yy_loadu_256(ii + (r + 0) - (r + 1) * stride);
@@ -263,7 +263,7 @@
   }
 }
 
-// Calculate 4 values of the "cross sum" starting at buf. This is a 3x3 filter
+// Calculate 8 values of the "cross sum" starting at buf. This is a 3x3 filter
 // where the outer four corners have weight 3 and all other pixels have weight
 // 4.
 //
@@ -300,7 +300,7 @@
 }
 
 // The final filter for self-guided restoration. Computes a weighted average
-// across A, B with "cross sums" (see cross_sum implementation above)
+// across A, B with "cross sums" (see cross_sum implementation above).
 static void final_filter(int32_t *dst, int dst_stride, const int32_t *A,
                          const int32_t *B, int buf_stride, const void *dgd8,
                          int dgd_stride, int width, int height, int highbd) {
@@ -415,7 +415,7 @@
 // cross_sum = 6 * sixes + 5 * fives
 //           = 5 * (fives + sixes) - sixes
 //           = (fives + sixes) << 2 + (fives + sixes) + sixes
-static __m256i cross_sum_fast_even(const int32_t *buf, int stride) {
+static __m256i cross_sum_fast_even_row(const int32_t *buf, int stride) {
   const __m256i xtl = yy_loadu_256(buf - 1 - stride);
   const __m256i xt = yy_loadu_256(buf - stride);
   const __m256i xtr = yy_loadu_256(buf + 1 - stride);
@@ -449,7 +449,7 @@
 // cross_sum = 5 * fives + 6 * sixes
 //           = 4 * (fives + sixes) + (fives + sixes) + sixes
 //           = (fives + sixes) << 2 + (fives + sixes) + sixes
-static __m256i cross_sum_fast_odd(const int32_t *buf) {
+static __m256i cross_sum_fast_odd_row(const int32_t *buf) {
   const __m256i xl = yy_loadu_256(buf - 1);
   const __m256i x = yy_loadu_256(buf);
   const __m256i xr = yy_loadu_256(buf + 1);
@@ -465,197 +465,13 @@
       sixes);
 }
 
-// Calculate 8 values of the "cross sum" starting at buf.
-//
-// Pixels are indexed like this:
-// xtl  xt   xtr
-//  -    -    -
-// xl    x   xr
-//  -    -    -
-// xbl  xb   xbr
-//
-// Pixels are weighted like this:
-//  3    4    3
-//  0    0    0
-//  14   16   14
-//  0    0    0
-//  3    4    3
-//
-// buf points to x
-//
-// threes = xtl + xtr + xbr + xbl
-// fours = xt + xb
-// fourteens = xl + xr
-// sixteens = x
-// cross_sum = 4 * fours + 3 * threes + 14 * fourteens + 16 * sixteens
-//           = 4 * (fours + threes) + 16 * (sixteens + fourteens)
-//              - (threes + fourteens) - fourteens
-//           = (fours + threes) << 2 + (sixteens + fourteens) << 4
-//              - (threes + fourteens) - fourteens
-static __m256i cross_sum_fast_odd_not_last(const int32_t *buf, int stride) {
-  const int two_stride = 2 * stride;
-  const __m256i xtl = yy_loadu_256(buf - 1 - two_stride);
-  const __m256i xt = yy_loadu_256(buf - two_stride);
-  const __m256i xtr = yy_loadu_256(buf + 1 - two_stride);
-  const __m256i xl = yy_loadu_256(buf - 1);
-  const __m256i x = yy_loadu_256(buf);
-  const __m256i xr = yy_loadu_256(buf + 1);
-  const __m256i xbl = yy_loadu_256(buf - 1 + two_stride);
-  const __m256i xb = yy_loadu_256(buf + two_stride);
-  const __m256i xbr = yy_loadu_256(buf + 1 + two_stride);
-
-  const __m256i threes =
-      _mm256_add_epi32(xtl, _mm256_add_epi32(xtr, _mm256_add_epi32(xbr, xbl)));
-  const __m256i fours = _mm256_add_epi32(xt, xb);
-  const __m256i fourteens = _mm256_add_epi32(xl, xr);
-  const __m256i sixteens = x;
-
-  const __m256i fours_plus_threes = _mm256_add_epi32(fours, threes);
-  const __m256i sixteens_plus_fourteens = _mm256_add_epi32(sixteens, fourteens);
-  const __m256i threes_plus_fourteens = _mm256_add_epi32(threes, fourteens);
-
-  return _mm256_sub_epi32(
-      _mm256_sub_epi32(
-          _mm256_add_epi32(_mm256_slli_epi32(fours_plus_threes, 2),
-                           _mm256_slli_epi32(sixteens_plus_fourteens, 4)),
-          threes_plus_fourteens),
-      fourteens);
-}
-
-// Calculate 8 values of the "cross sum" starting at buf.
-//
-// Pixels are indexed like this:
-// xtl  xt   xtr
-//  -    -    -
-// xl    x   xr
-//
-// Pixels are weighted like this:
-//  4    6    4
-//  0    0    0
-//  16   18   16
-//
-// buf points to x
-//
-// fours = xtl + xtr
-// sixes = xt
-// sixteens = xl + xr
-// eighteens = x
-// cross_sum = 4 * fours + 6 * sixes + 16 * sixteens + 18 * eighteens
-//           = 4 * (fours + sixes) + 16 * (sixteens + eighteens)
-//              + 2 * (sixes + eighteens)
-//           = (fours + sixes) << 2 + (sixteens + eighteens) << 4
-//              + (sixes + eighteens) << 1
-static __m256i cross_sum_fast_odd_last(const int32_t *buf, int stride) {
-  const int two_stride = 2 * stride;
-  const __m256i xtl = yy_loadu_256(buf - 1 - two_stride);
-  const __m256i xt = yy_loadu_256(buf - two_stride);
-  const __m256i xtr = yy_loadu_256(buf + 1 - two_stride);
-  const __m256i xl = yy_loadu_256(buf - 1);
-  const __m256i x = yy_loadu_256(buf);
-  const __m256i xr = yy_loadu_256(buf + 1);
-
-  const __m256i fours = _mm256_add_epi32(xtl, xtr);
-  const __m256i sixes = xt;
-  const __m256i sixteens = _mm256_add_epi32(xl, xr);
-  const __m256i eighteens = x;
-
-  const __m256i fours_plus_sixes = _mm256_add_epi32(fours, sixes);
-  const __m256i sixteens_plus_eighteens = _mm256_add_epi32(sixteens, eighteens);
-  const __m256i sixes_plus_eighteens = _mm256_add_epi32(sixes, eighteens);
-
-  return _mm256_add_epi32(
-      _mm256_add_epi32(_mm256_slli_epi32(fours_plus_sixes, 2),
-                       _mm256_slli_epi32(sixteens_plus_eighteens, 4)),
-      _mm256_slli_epi32(sixes_plus_eighteens, 1));
-}
-
-// The final filter for selfguided restoration. Computes a weighted average
-// across A, B with "cross sums" (see cross_sum_... implementations above).
-// Designed for the first vertical sub-sampling version of FAST_SGR.
-static void final_filter_fast1(int32_t *dst, int dst_stride, const int32_t *A,
-                               const int32_t *B, int buf_stride,
-                               const void *dgd8, int dgd_stride, int width,
-                               int height, int highbd) {
-  const int nb0 = 5;
-  const int nb1 = 6;
-
-  const __m256i rounding0 =
-      round_for_shift(SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
-  const __m256i rounding1 =
-      round_for_shift(SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
-
-  const uint8_t *dgd_real =
-      highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;
-
-  for (int i = 0; i < height; ++i) {
-    if (!(i & 1)) {  // even row
-      for (int j = 0; j < width; j += 8) {
-        const __m256i a =
-            cross_sum_fast_even(A + i * buf_stride + j, buf_stride);
-        const __m256i b =
-            cross_sum_fast_even(B + i * buf_stride + j, buf_stride);
-
-        const __m128i raw =
-            xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd));
-        const __m256i src =
-            highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw);
-
-        __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
-        __m256i w =
-            _mm256_srai_epi32(_mm256_add_epi32(v, rounding0),
-                              SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
-
-        yy_storeu_256(dst + i * dst_stride + j, w);
-      }
-    } else if (i != height - 1) {  // odd row and not last
-      for (int j = 0; j < width; j += 8) {
-        const __m256i a =
-            cross_sum_fast_odd_not_last(A + i * buf_stride + j, buf_stride);
-        const __m256i b =
-            cross_sum_fast_odd_not_last(B + i * buf_stride + j, buf_stride);
-
-        const __m128i raw =
-            xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd));
-        const __m256i src =
-            highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw);
-
-        __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
-        __m256i w =
-            _mm256_srai_epi32(_mm256_add_epi32(v, rounding1),
-                              SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
-
-        yy_storeu_256(dst + i * dst_stride + j, w);
-      }
-    } else {  // odd row and last
-      for (int j = 0; j < width; j += 8) {
-        const __m256i a =
-            cross_sum_fast_odd_last(A + i * buf_stride + j, buf_stride);
-        const __m256i b =
-            cross_sum_fast_odd_last(B + i * buf_stride + j, buf_stride);
-
-        const __m128i raw =
-            xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd));
-        const __m256i src =
-            highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw);
-
-        __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
-        __m256i w =
-            _mm256_srai_epi32(_mm256_add_epi32(v, rounding1),
-                              SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
-
-        yy_storeu_256(dst + i * dst_stride + j, w);
-      }
-    }
-  }
-}
-
-// The final filter for selfguided restoration. Computes a weighted average
-// across A, B with "cross sums" (see cross_sum_... implementations above).
-// Designed for the second vertical sub-sampling version of FAST_SGR.
-static void final_filter_fast2(int32_t *dst, int dst_stride, const int32_t *A,
-                               const int32_t *B, int buf_stride,
-                               const void *dgd8, int dgd_stride, int width,
-                               int height, int highbd) {
+// The final filter for the FAST_SGR self-guided restoration. Computes a
+// weighted average across A, B with "cross sums" (see cross_sum_...
+// implementations above).
+static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A,
+                              const int32_t *B, int buf_stride,
+                              const void *dgd8, int dgd_stride, int width,
+                              int height, int highbd) {
   const int nb0 = 5;
   const int nb1 = 4;
 
@@ -671,9 +487,9 @@
     if (!(i & 1)) {  // even row
       for (int j = 0; j < width; j += 8) {
         const __m256i a =
-            cross_sum_fast_even(A + i * buf_stride + j, buf_stride);
+            cross_sum_fast_even_row(A + i * buf_stride + j, buf_stride);
         const __m256i b =
-            cross_sum_fast_even(B + i * buf_stride + j, buf_stride);
+            cross_sum_fast_even_row(B + i * buf_stride + j, buf_stride);
 
         const __m128i raw =
             xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd));
@@ -689,8 +505,8 @@
       }
     } else {  // odd row
       for (int j = 0; j < width; j += 8) {
-        const __m256i a = cross_sum_fast_odd(A + i * buf_stride + j);
-        const __m256i b = cross_sum_fast_odd(B + i * buf_stride + j);
+        const __m256i a = cross_sum_fast_odd_row(A + i * buf_stride + j);
+        const __m256i b = cross_sum_fast_odd_row(B + i * buf_stride + j);
 
         const __m128i raw =
             xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd));
@@ -775,8 +591,8 @@
   assert(params->r1 == 2);
   calc_ab_fast(A, B, C, D, width, height, buf_stride, params->e1, bit_depth,
                params->r1);
-  final_filter_fast2(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride,
-                     width, height, highbd);
+  final_filter_fast(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width,
+                    height, highbd);
 
   // r == 1 filter
   assert(params->r2 == 1);

diff --git a/av1/common/x86/selfguided_sse4.c b/av1/common/x86/selfguided_sse4.c
index 1bb7121..3e40f91 100644
--- a/av1/common/x86/selfguided_sse4.c
+++ b/av1/common/x86/selfguided_sse4.c

@@ -116,8 +116,8 @@
   }
 }
 
-// Compute four values of boxsum from the given integral image. ii should point
-// at the middle of the box (for the first value). r is the box radius
+// Compute 4 values of boxsum from the given integral image. ii should point
+// at the middle of the box (for the first value). r is the box radius.
 static __m128i boxsum_from_ii(const int32_t *ii, int stride, int r) {
   const __m128i tl = xx_loadu_128(ii - (r + 1) - (r + 1) * stride);
   const __m128i tr = xx_loadu_128(ii + (r + 0) - (r + 1) * stride);
@@ -260,8 +260,8 @@
   return _mm_sub_epi32(_mm_slli_epi32(_mm_add_epi32(fours, threes), 2), threes);
 }
 
-// The final filter for selfguided restoration. Computes a weighted average
-// across A, B with "cross sums" (see cross_sum implementation above)
+// The final filter for self-guided restoration. Computes a weighted average
+// across A, B with "cross sums" (see cross_sum implementation above).
 static void final_filter(int32_t *dst, int dst_stride, const int32_t *A,
                          const int32_t *B, int buf_stride, const void *dgd8,
                          int dgd_stride, int width, int height, int highbd) {
@@ -380,7 +380,7 @@
 // cross_sum = 6 * sixes + 5 * fives
 //           = 5 * (fives + sixes) - sixes
 //           = (fives + sixes) << 2 + (fives + sixes) + sixes
-static __m128i cross_sum_fast_even(const int32_t *buf, int stride) {
+static __m128i cross_sum_fast_even_row(const int32_t *buf, int stride) {
   const __m128i xtl = xx_loadu_128(buf - 1 - stride);
   const __m128i xt = xx_loadu_128(buf - stride);
   const __m128i xtr = xx_loadu_128(buf + 1 - stride);
@@ -413,7 +413,7 @@
 // cross_sum = 5 * fives + 6 * sixes
 //           = 4 * (fives + sixes) + (fives + sixes) + sixes
 //           = (fives + sixes) << 2 + (fives + sixes) + sixes
-static __m128i cross_sum_fast_odd(const int32_t *buf) {
+static __m128i cross_sum_fast_odd_row(const int32_t *buf) {
   const __m128i xl = xx_loadu_128(buf - 1);
   const __m128i x = xx_loadu_128(buf);
   const __m128i xr = xx_loadu_128(buf + 1);
@@ -428,186 +428,9 @@
       sixes);
 }
 
-// Calculate 4 values of the "cross sum" starting at buf.
-//
-// Pixels are indexed like this:
-// xtl  xt   xtr
-//  -    -    -
-// xl    x   xr
-//  -    -    -
-// xbl  xb   xbr
-//
-// Pixels are weighted like this:
-//  3    4    3
-//  0    0    0
-//  14   16   14
-//  0    0    0
-//  3    4    3
-//
-// buf points to x
-//
-// threes = xtl + xtr + xbr + xbl
-// fours = xt + xb
-// fourteens = xl + xr
-// sixteens = x
-// cross_sum = 4 * fours + 3 * threes + 14 * fourteens + 16 * sixteens
-//           = 4 * (fours + threes) + 16 * (sixteens + fourteens)
-//              - (threes + fourteens) - fourteens
-//           = (fours + threes) << 2 + (sixteens + fourteens) << 4
-//              - (threes + fourteens) - fourteens
-static __m128i cross_sum_fast_odd_not_last(const int32_t *buf, int stride) {
-  const int two_stride = 2 * stride;
-  const __m128i xtl = xx_loadu_128(buf - 1 - two_stride);
-  const __m128i xt = xx_loadu_128(buf - two_stride);
-  const __m128i xtr = xx_loadu_128(buf + 1 - two_stride);
-  const __m128i xl = xx_loadu_128(buf - 1);
-  const __m128i x = xx_loadu_128(buf);
-  const __m128i xr = xx_loadu_128(buf + 1);
-  const __m128i xbl = xx_loadu_128(buf - 1 + two_stride);
-  const __m128i xb = xx_loadu_128(buf + two_stride);
-  const __m128i xbr = xx_loadu_128(buf + 1 + two_stride);
-
-  const __m128i threes =
-      _mm_add_epi32(xtl, _mm_add_epi32(xtr, _mm_add_epi32(xbr, xbl)));
-  const __m128i fours = _mm_add_epi32(xt, xb);
-  const __m128i fourteens = _mm_add_epi32(xl, xr);
-  const __m128i sixteens = x;
-
-  const __m128i fours_plus_threes = _mm_add_epi32(fours, threes);
-  const __m128i sixteens_plus_fourteens = _mm_add_epi32(sixteens, fourteens);
-  const __m128i threes_plus_fourteens = _mm_add_epi32(threes, fourteens);
-
-  return _mm_sub_epi32(
-      _mm_sub_epi32(_mm_add_epi32(_mm_slli_epi32(fours_plus_threes, 2),
-                                  _mm_slli_epi32(sixteens_plus_fourteens, 4)),
-                    threes_plus_fourteens),
-      fourteens);
-}
-
-// Calculate 4 values of the "cross sum" starting at buf.
-//
-// Pixels are indexed like this:
-// xtl  xt   xtr
-//  -    -    -
-// xl    x   xr
-//
-// Pixels are weighted like this:
-//  4    6    4
-//  0    0    0
-//  16   18   16
-//
-// buf points to x
-//
-// fours = xtl + xtr
-// sixes = xt
-// sixteens = xl + xr
-// eighteens = x
-// cross_sum = 4 * fours + 6 * sixes + 16 * sixteens + 18 * eighteens
-//           = 4 * (fours + sixes) + 16 * (sixteens + eighteens)
-//              + 2 * (sixes + eighteens)
-//           = (fours + sixes) << 2 + (sixteens + eighteens) << 4
-//              + (sixes + eighteens) << 1
-static __m128i cross_sum_fast_odd_last(const int32_t *buf, int stride) {
-  const int two_stride = 2 * stride;
-  const __m128i xtl = xx_loadu_128(buf - 1 - two_stride);
-  const __m128i xt = xx_loadu_128(buf - two_stride);
-  const __m128i xtr = xx_loadu_128(buf + 1 - two_stride);
-  const __m128i xl = xx_loadu_128(buf - 1);
-  const __m128i x = xx_loadu_128(buf);
-  const __m128i xr = xx_loadu_128(buf + 1);
-
-  const __m128i fours = _mm_add_epi32(xtl, xtr);
-  const __m128i sixes = xt;
-  const __m128i sixteens = _mm_add_epi32(xl, xr);
-  const __m128i eighteens = x;
-
-  const __m128i fours_plus_sixes = _mm_add_epi32(fours, sixes);
-  const __m128i sixteens_plus_eighteens = _mm_add_epi32(sixteens, eighteens);
-  const __m128i sixes_plus_eighteens = _mm_add_epi32(sixes, eighteens);
-
-  return _mm_add_epi32(
-      _mm_add_epi32(_mm_slli_epi32(fours_plus_sixes, 2),
-                    _mm_slli_epi32(sixteens_plus_eighteens, 4)),
-      _mm_slli_epi32(sixes_plus_eighteens, 1));
-}
-
-// The final filter for selfguided restoration. Computes a weighted average
-// across A, B with "cross sums" (see cross_sum_... implementations above).
-// Designed for the first vertical sub-sampling version of FAST_SGR.
-static void final_filter_fast1(int32_t *dst, int dst_stride, const int32_t *A,
-                               const int32_t *B, int buf_stride,
-                               const void *dgd8, int dgd_stride, int width,
-                               int height, int highbd) {
-  const int nb0 = 5;
-  const int nb1 = 6;
-
-  const __m128i rounding0 =
-      round_for_shift(SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
-  const __m128i rounding1 =
-      round_for_shift(SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
-
-  const uint8_t *dgd_real =
-      highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;
-
-  for (int i = 0; i < height; ++i) {
-    if (!(i & 1)) {  // even row
-      for (int j = 0; j < width; j += 4) {
-        const __m128i a =
-            cross_sum_fast_even(A + i * buf_stride + j, buf_stride);
-        const __m128i b =
-            cross_sum_fast_even(B + i * buf_stride + j, buf_stride);
-        const __m128i raw =
-            xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd));
-        const __m128i src =
-            highbd ? _mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw);
-
-        __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b);
-        __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding0),
-                                   SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
-
-        xx_storeu_128(dst + i * dst_stride + j, w);
-      }
-    } else if (i != height - 1) {  // odd row and not last
-      for (int j = 0; j < width; j += 4) {
-        const __m128i a =
-            cross_sum_fast_odd_not_last(A + i * buf_stride + j, buf_stride);
-        const __m128i b =
-            cross_sum_fast_odd_not_last(B + i * buf_stride + j, buf_stride);
-        const __m128i raw =
-            xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd));
-        const __m128i src =
-            highbd ? _mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw);
-
-        __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b);
-        __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding1),
-                                   SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
-
-        xx_storeu_128(dst + i * dst_stride + j, w);
-      }
-    } else {  // odd row and last
-      for (int j = 0; j < width; j += 4) {
-        const __m128i a =
-            cross_sum_fast_odd_last(A + i * buf_stride + j, buf_stride);
-        const __m128i b =
-            cross_sum_fast_odd_last(B + i * buf_stride + j, buf_stride);
-        const __m128i raw =
-            xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd));
-        const __m128i src =
-            highbd ? _mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw);
-
-        __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b);
-        __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding1),
-                                   SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
-
-        xx_storeu_128(dst + i * dst_stride + j, w);
-      }
-    }
-  }
-}
-
-// The final filter for selfguided restoration. Computes a weighted average
-// across A, B with "cross sums" (see cross_sum_... implementations above).
-// Designed for the second vertical sub-sampling version of FAST_SGR.
+// The final filter for the FAST_SGR self-guided restoration. Computes a
+// weighted average across A, B with "cross sums" (see cross_sum_...
+// implementations above).
 static void final_filter_fast2(int32_t *dst, int dst_stride, const int32_t *A,
                                const int32_t *B, int buf_stride,
                                const void *dgd8, int dgd_stride, int width,
@@ -627,9 +450,9 @@
     if (!(i & 1)) {  // even row
       for (int j = 0; j < width; j += 4) {
         const __m128i a =
-            cross_sum_fast_even(A + i * buf_stride + j, buf_stride);
+            cross_sum_fast_even_row(A + i * buf_stride + j, buf_stride);
         const __m128i b =
-            cross_sum_fast_even(B + i * buf_stride + j, buf_stride);
+            cross_sum_fast_even_row(B + i * buf_stride + j, buf_stride);
         const __m128i raw =
             xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd));
         const __m128i src =
@@ -643,8 +466,8 @@
       }
     } else {  // odd row
       for (int j = 0; j < width; j += 4) {
-        const __m128i a = cross_sum_fast_odd(A + i * buf_stride + j);
-        const __m128i b = cross_sum_fast_odd(B + i * buf_stride + j);
+        const __m128i a = cross_sum_fast_odd_row(A + i * buf_stride + j);
+        const __m128i b = cross_sum_fast_odd_row(B + i * buf_stride + j);
         const __m128i raw =
             xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd));
         const __m128i src =