SSE4.1 and AVX2 implementations of updated FAST_SGR

The SSE4.1 and AVX2 implementations of the self-guided filter have been updated
to match the updated FAST_SGR C implementation in restoration.c. With this
change, CONFIG_FAST_SGR == 1 dispatches to the SIMD implementations, while
CONFIG_FAST_SGR == 2 still forces the C reference path.

The self-guided filter speed tests now time the SIMD and C implementations of
the relevant functions separately and fail if the SIMD version is slower than
the C version.

Speed Tests (code compiled with Clang)
===========

For LowBD:
- The SSE4.1 implementation is ~220% faster (~69% less time) than the C code
- The AVX2 implementation is ~314% faster (~76% less time) than the C code

For HighBD:
- The SSE4.1 implementation is ~240% faster (~71% less time) than the C code
- The AVX2 implementation is ~343% faster (~77% less time) than the C code

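For reference, "X% faster" and "Y% less time" above are the same measurement
expressed two ways. A minimal sketch of the conversion (report_speedup is a
hypothetical helper, not part of this change):

  #include <stdint.h>
  #include <stdio.h>

  // Converts two timer readings (microseconds) into the percentages quoted
  // above, e.g. (3200, 1000) -> "~220% faster (~69% less time)".
  static void report_speedup(int64_t c_us, int64_t simd_us) {
    const double pct_faster = 100.0 * ((double)c_us / simd_us - 1.0);
    const double pct_less = 100.0 * (1.0 - (double)simd_us / c_us);
    printf("~%.0f%% faster (~%.0f%% less time)\n", pct_faster, pct_less);
  }
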
Change-Id: Ic2734bb89ccd3f66667c68647e5f677a5a496233
diff --git a/av1/common/restoration.c b/av1/common/restoration.c
index b49e248..1e965c0 100644
--- a/av1/common/restoration.c
+++ b/av1/common/restoration.c
@@ -1275,9 +1275,11 @@
                                             flt2, flt_stride, bit_depth,
                                             params->r2, params->e2);
 #elif CONFIG_FAST_SGR == 1
+  // r == 2 filter
   av1_selfguided_restoration_fast_internal(dgd32, width, height, dgd32_stride,
                                            flt1, flt_stride, bit_depth,
                                            params->r1, params->e1);
+  // r == 1 filter
   av1_selfguided_restoration_internal(dgd32, width, height, dgd32_stride, flt2,
                                       flt_stride, bit_depth, params->r2,
                                       params->e2);
@@ -1336,7 +1338,7 @@
 
   for (int j = 0; j < stripe_width; j += procunit_width) {
     int w = AOMMIN(procunit_width, stripe_width - j);
-#if CONFIG_FAST_SGR
+#if CONFIG_FAST_SGR == 2
     apply_selfguided_restoration_c(src + j, w, stripe_height, src_stride,
                                    rui->sgrproj_info.ep, rui->sgrproj_info.xqd,
                                    dst + j, dst_stride, tmpbuf, bit_depth, 0);
@@ -1344,7 +1346,7 @@
     apply_selfguided_restoration(src + j, w, stripe_height, src_stride,
                                  rui->sgrproj_info.ep, rui->sgrproj_info.xqd,
                                  dst + j, dst_stride, tmpbuf, bit_depth, 0);
-#endif  // CONFIG_FAST_SGR
+#endif  // CONFIG_FAST_SGR == 2
   }
 }
 
@@ -1380,7 +1382,7 @@
                                          int32_t *tmpbuf, int bit_depth) {
   for (int j = 0; j < stripe_width; j += procunit_width) {
     int w = AOMMIN(procunit_width, stripe_width - j);
-#if CONFIG_FAST_SGR
+#if CONFIG_FAST_SGR == 2
     apply_selfguided_restoration_c(src8 + j, w, stripe_height, src_stride,
                                    rui->sgrproj_info.ep, rui->sgrproj_info.xqd,
                                    dst8 + j, dst_stride, tmpbuf, bit_depth, 1);
@@ -1388,7 +1390,7 @@
     apply_selfguided_restoration(src8 + j, w, stripe_height, src_stride,
                                  rui->sgrproj_info.ep, rui->sgrproj_info.xqd,
                                  dst8 + j, dst_stride, tmpbuf, bit_depth, 1);
-#endif  // CONFIG_FAST_SGR
+#endif  // CONFIG_FAST_SGR == 2
   }
 }
 
diff --git a/av1/common/x86/selfguided_avx2.c b/av1/common/x86/selfguided_avx2.c
index 061b327..23b651d 100644
--- a/av1/common/x86/selfguided_avx2.c
+++ b/av1/common/x86/selfguided_avx2.c
@@ -311,7 +311,7 @@
       highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;
 
   for (int i = 0; i < height; ++i) {
-    for (int j = 0; j < width; j += 4) {
+    for (int j = 0; j < width; j += 8) {
       const __m256i a = cross_sum(A + i * buf_stride + j, buf_stride);
       const __m256i b = cross_sum(B + i * buf_stride + j, buf_stride);
 
@@ -398,7 +398,7 @@
   }
 }
 
-// Calculate 4 values of the "cross sum" starting at buf.
+// Calculate 8 values of the "cross sum" starting at buf.
 //
 // Pixels are indexed like this:
 // xtl  xt   xtr
@@ -434,7 +434,38 @@
       sixes);
 }
 
-// Calculate 4 values of the "cross sum" starting at buf.
+// Calculate 8 values of the "cross sum" starting at buf.
+//
+// Pixels are indexed like this:
+// xl    x   xr
+//
+// Pixels are weighted like this:
+//  5    6    5
+//
+// buf points to x
+//
+// fives = xl + xr
+// sixes = x
+// cross_sum = 5 * fives + 6 * sixes
+//           = 4 * (fives + sixes) + (fives + sixes) + sixes
+//           = (fives + sixes) << 2 + (fives + sixes) + sixes
+static __m256i cross_sum_fast_odd(const int32_t *buf) {
+  const __m256i xl = yy_loadu_256(buf - 1);
+  const __m256i x = yy_loadu_256(buf);
+  const __m256i xr = yy_loadu_256(buf + 1);
+
+  const __m256i fives = _mm256_add_epi32(xl, xr);
+  const __m256i sixes = x;
+
+  const __m256i fives_plus_sixes = _mm256_add_epi32(fives, sixes);
+
+  return _mm256_add_epi32(
+      _mm256_add_epi32(_mm256_slli_epi32(fives_plus_sixes, 2),
+                       fives_plus_sixes),
+      sixes);
+}
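+
+// (Scalar form of cross_sum_fast_odd, for reference:
+//    out[j] = 5 * (buf[j - 1] + buf[j + 1]) + 6 * buf[j];
+//  the shift-add form above avoids the per-lane multiplies.)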
+
+// Calculate 8 values of the "cross sum" starting at buf.
 //
 // Pixels are indexed like this:
 // xtl  xt   xtr
@@ -491,7 +522,7 @@
       fourteens);
 }
 
-// Calculate 4 values of the "cross sum" starting at buf.
+// Calculate 8 values of the "cross sum" starting at buf.
 //
 // Pixels are indexed like this:
 // xtl  xt   xtr
@@ -539,11 +570,12 @@
 }
 
 // The final filter for selfguided restoration. Computes a weighted average
-// across A, B with "cross sums" (see cross_sum_... implementations above)
-static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A,
-                              const int32_t *B, int buf_stride,
-                              const void *dgd8, int dgd_stride, int width,
-                              int height, int highbd) {
+// across A, B with "cross sums" (see cross_sum_... implementations above).
+// Designed for the first vertical sub-sampling version of FAST_SGR.
+static void final_filter_fast1(int32_t *dst, int dst_stride, const int32_t *A,
+                               const int32_t *B, int buf_stride,
+                               const void *dgd8, int dgd_stride, int width,
+                               int height, int highbd) {
   const int nb0 = 5;
   const int nb1 = 6;
 
@@ -557,7 +589,7 @@
 
   for (int i = 0; i < height; ++i) {
     if (!(i & 1)) {  // even row
-      for (int j = 0; j < width; j += 4) {
+      for (int j = 0; j < width; j += 8) {
         const __m256i a =
             cross_sum_fast_even(A + i * buf_stride + j, buf_stride);
         const __m256i b =
@@ -576,7 +608,7 @@
         yy_storeu_256(dst + i * dst_stride + j, w);
       }
     } else if (i != height - 1) {  // odd row and not last
-      for (int j = 0; j < width; j += 4) {
+      for (int j = 0; j < width; j += 8) {
         const __m256i a =
             cross_sum_fast_odd_not_last(A + i * buf_stride + j, buf_stride);
         const __m256i b =
@@ -595,7 +627,7 @@
         yy_storeu_256(dst + i * dst_stride + j, w);
       }
     } else {  // odd row and last
-      for (int j = 0; j < width; j += 4) {
+      for (int j = 0; j < width; j += 8) {
         const __m256i a =
             cross_sum_fast_odd_last(A + i * buf_stride + j, buf_stride);
         const __m256i b =
@@ -616,6 +648,65 @@
     }
   }
 }
+
+// The final filter for selfguided restoration. Computes a weighted average
+// across A, B with "cross sums" (see cross_sum_... implementations above).
+// Designed for the second vertical sub-sampling version of FAST_SGR.
+static void final_filter_fast2(int32_t *dst, int dst_stride, const int32_t *A,
+                               const int32_t *B, int buf_stride,
+                               const void *dgd8, int dgd_stride, int width,
+                               int height, int highbd) {
+  const int nb0 = 5;
+  const int nb1 = 4;
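+  // nb0/nb1 are log2 of the total cross-sum weight, so the shifts below
+  // divide out that weight: 1 << 5 == 32 on even rows and, for the 5-6-5
+  // odd-row sum above, 5 + 6 + 5 == 16 == 1 << 4.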
+
+  const __m256i rounding0 =
+      round_for_shift(SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
+  const __m256i rounding1 =
+      round_for_shift(SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
+
+  const uint8_t *dgd_real =
+      highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;
+
+  for (int i = 0; i < height; ++i) {
+    if (!(i & 1)) {  // even row
+      for (int j = 0; j < width; j += 8) {
+        const __m256i a =
+            cross_sum_fast_even(A + i * buf_stride + j, buf_stride);
+        const __m256i b =
+            cross_sum_fast_even(B + i * buf_stride + j, buf_stride);
+
+        const __m128i raw =
+            xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd));
+        const __m256i src =
+            highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw);
+
+        __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
+        __m256i w =
+            _mm256_srai_epi32(_mm256_add_epi32(v, rounding0),
+                              SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
+
+        yy_storeu_256(dst + i * dst_stride + j, w);
+      }
+    } else {  // odd row
+      for (int j = 0; j < width; j += 8) {
+        const __m256i a = cross_sum_fast_odd(A + i * buf_stride + j);
+        const __m256i b = cross_sum_fast_odd(B + i * buf_stride + j);
+
+        const __m128i raw =
+            xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd));
+        const __m256i src =
+            highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw);
+
+        __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
+        __m256i w =
+            _mm256_srai_epi32(_mm256_add_epi32(v, rounding1),
+                              SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
+
+        yy_storeu_256(dst + i * dst_stride + j, w);
+      }
+    }
+  }
+}
 #endif
 
 void av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height,
@@ -676,23 +767,36 @@
     integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl,
                     buf_stride);
 
-  // Write to flt1 and flt2
+// Write to flt1 and flt2
+#if CONFIG_FAST_SGR
+  assert(params->r1 < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
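+
+  // Matching the updated FAST_SGR C code: the r == 2 filter runs the
+  // vertically sub-sampled (fast) path, the r == 1 filter the full path.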
+
+  // r == 2 filter
+  assert(params->r1 == 2);
+  calc_ab_fast(A, B, C, D, width, height, buf_stride, params->e1, bit_depth,
+               params->r1);
+  final_filter_fast2(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride,
+                     width, height, highbd);
+
+  // r == 1 filter
+  assert(params->r2 == 1);
+  calc_ab(A, B, C, D, width, height, buf_stride, params->e2, bit_depth,
+          params->r2);
+  final_filter(flt2, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width,
+               height, highbd);
+#else
   for (int i = 0; i < 2; ++i) {
     int r = i ? params->r2 : params->r1;
     int e = i ? params->e2 : params->e1;
     int32_t *flt = i ? flt2 : flt1;
 
     assert(r + 1 <= AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
-#if CONFIG_FAST_SGR
-    calc_ab_fast(A, B, C, D, width, height, buf_stride, e, bit_depth, r);
-    final_filter_fast(flt, flt_stride, A, B, buf_stride, dgd8, dgd_stride,
-                      width, height, highbd);
-#else
+
     calc_ab(A, B, C, D, width, height, buf_stride, e, bit_depth, r);
     final_filter(flt, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width,
                  height, highbd);
-#endif
   }
+#endif
 }
 
 void apply_selfguided_restoration_avx2(const uint8_t *dat8, int width,
diff --git a/av1/common/x86/selfguided_sse4.c b/av1/common/x86/selfguided_sse4.c
index 83bdb8b..1bb7121 100644
--- a/av1/common/x86/selfguided_sse4.c
+++ b/av1/common/x86/selfguided_sse4.c
@@ -401,6 +401,36 @@
 // Calculate 4 values of the "cross sum" starting at buf.
 //
 // Pixels are indexed like this:
+// xl    x   xr
+//
+// Pixels are weighted like this:
+//  5    6    5
+//
+// buf points to x
+//
+// fives = xl + xr
+// sixes = x
+// cross_sum = 5 * fives + 6 * sixes
+//           = 4 * (fives + sixes) + (fives + sixes) + sixes
+//           = (fives + sixes) << 2 + (fives + sixes) + sixes
+static __m128i cross_sum_fast_odd(const int32_t *buf) {
+  const __m128i xl = xx_loadu_128(buf - 1);
+  const __m128i x = xx_loadu_128(buf);
+  const __m128i xr = xx_loadu_128(buf + 1);
+
+  const __m128i fives = _mm_add_epi32(xl, xr);
+  const __m128i sixes = x;
+
+  const __m128i fives_plus_sixes = _mm_add_epi32(fives, sixes);
+
+  return _mm_add_epi32(
+      _mm_add_epi32(_mm_slli_epi32(fives_plus_sixes, 2), fives_plus_sixes),
+      sixes);
+}
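+
+// (Same 5-6-5 weighting as the AVX2 cross_sum_fast_odd above, computed on
+//  4 lanes per call instead of 8.)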
+
+// Calculate 4 values of the "cross sum" starting at buf.
+//
+// Pixels are indexed like this:
 // xtl  xt   xtr
 //  -    -    -
 // xl    x   xr
@@ -502,11 +532,12 @@
 }
 
 // The final filter for selfguided restoration. Computes a weighted average
-// across A, B with "cross sums" (see cross_sum_... implementations above)
-static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A,
-                              const int32_t *B, int buf_stride,
-                              const void *dgd8, int dgd_stride, int width,
-                              int height, int highbd) {
+// across A, B with "cross sums" (see cross_sum_... implementations above).
+// Designed for the first vertical sub-sampling version of FAST_SGR.
+static void final_filter_fast1(int32_t *dst, int dst_stride, const int32_t *A,
+                               const int32_t *B, int buf_stride,
+                               const void *dgd8, int dgd_stride, int width,
+                               int height, int highbd) {
   const int nb0 = 5;
   const int nb1 = 6;
 
@@ -573,6 +604,61 @@
     }
   }
 }
+
+// The final filter for selfguided restoration. Computes a weighted average
+// across A, B with "cross sums" (see cross_sum_... implementations above).
+// Designed for the second vertical sub-sampling version of FAST_SGR.
+static void final_filter_fast2(int32_t *dst, int dst_stride, const int32_t *A,
+                               const int32_t *B, int buf_stride,
+                               const void *dgd8, int dgd_stride, int width,
+                               int height, int highbd) {
+  const int nb0 = 5;
+  const int nb1 = 4;
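+  // nb0/nb1: log2 of the total cross-sum weight (see the matching note in
+  // the AVX2 final_filter_fast2).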
+
+  const __m128i rounding0 =
+      round_for_shift(SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
+  const __m128i rounding1 =
+      round_for_shift(SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
+
+  const uint8_t *dgd_real =
+      highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;
+
+  for (int i = 0; i < height; ++i) {
+    if (!(i & 1)) {  // even row
+      for (int j = 0; j < width; j += 4) {
+        const __m128i a =
+            cross_sum_fast_even(A + i * buf_stride + j, buf_stride);
+        const __m128i b =
+            cross_sum_fast_even(B + i * buf_stride + j, buf_stride);
+        const __m128i raw =
+            xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd));
+        const __m128i src =
+            highbd ? _mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw);
+
+        __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b);
+        __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding0),
+                                   SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
+
+        xx_storeu_128(dst + i * dst_stride + j, w);
+      }
+    } else {  // odd row
+      for (int j = 0; j < width; j += 4) {
+        const __m128i a = cross_sum_fast_odd(A + i * buf_stride + j);
+        const __m128i b = cross_sum_fast_odd(B + i * buf_stride + j);
+        const __m128i raw =
+            xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd));
+        const __m128i src =
+            highbd ? _mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw);
+
+        __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b);
+        __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding1),
+                                   SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
+
+        xx_storeu_128(dst + i * dst_stride + j, w);
+      }
+    }
+  }
+}
 #endif
 
 void av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width,
@@ -629,23 +715,36 @@
     integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl,
                     buf_stride);
 
-  // Write to flt1 and flt2
+// Write to flt1 and flt2
+#if CONFIG_FAST_SGR
+  assert(params->r1 < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
+
+  // r == 2 filter
+  assert(params->r1 == 2);
+  calc_ab_fast(A, B, C, D, width, height, buf_stride, params->e1, bit_depth,
+               params->r1);
+  final_filter_fast2(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride,
+                     width, height, highbd);
+
+  // r == 1 filter
+  assert(params->r2 == 1);
+  calc_ab(A, B, C, D, width, height, buf_stride, params->e2, bit_depth,
+          params->r2);
+  final_filter(flt2, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width,
+               height, highbd);
+#else
   for (int i = 0; i < 2; ++i) {
     int r = i ? params->r2 : params->r1;
     int e = i ? params->e2 : params->e1;
     int32_t *flt = i ? flt2 : flt1;
 
     assert(r + 1 <= AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
-#if CONFIG_FAST_SGR
-    calc_ab_fast(A, B, C, D, width, height, buf_stride, e, bit_depth, r);
-    final_filter_fast(flt, flt_stride, A, B, buf_stride, dgd8, dgd_stride,
-                      width, height, highbd);
-#else
+
     calc_ab(A, B, C, D, width, height, buf_stride, e, bit_depth, r);
     final_filter(flt, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width,
                  height, highbd);
-#endif
   }
+#endif
 }
 
 void apply_selfguided_restoration_sse4_1(const uint8_t *dat8, int width,
diff --git a/av1/encoder/pickrst.c b/av1/encoder/pickrst.c
index 2e18bb6..b590833 100644
--- a/av1/encoder/pickrst.c
+++ b/av1/encoder/pickrst.c
@@ -353,13 +353,13 @@
                              int width, int height, int dat_stride,
                              int use_highbd, int bit_depth, int32_t *flt1,
                              int32_t *flt2, int flt_stride) {
-#if CONFIG_FAST_SGR
+#if CONFIG_FAST_SGR == 2
   av1_selfguided_restoration_c(dat8, width, height, dat_stride, flt1, flt2,
                                flt_stride, params, bit_depth, use_highbd);
 #else
   av1_selfguided_restoration(dat8, width, height, dat_stride, flt1, flt2,
                              flt_stride, params, bit_depth, use_highbd);
-#endif  // CONFIG_FAST_SGR
+#endif  // CONFIG_FAST_SGR == 2
 }
 
 // Apply the self-guided filter across an entire restoration unit.
diff --git a/test/selfguided_filter_test.cc b/test/selfguided_filter_test.cc
index 345baac..0215e60 100644
--- a/test/selfguided_filter_test.cc
+++ b/test/selfguided_filter_test.cc
@@ -80,7 +80,24 @@
 
     av1_loop_restoration_precal();
 
-    std::clock_t start = std::clock();
+    aom_usec_timer ref_timer;
+    aom_usec_timer_start(&ref_timer);
+    for (i = 0; i < NUM_ITERS; ++i) {
+      for (k = 0; k < height; k += pu_height)
+        for (j = 0; j < width; j += pu_width) {
+          int w = AOMMIN(pu_width, width - j);
+          int h = AOMMIN(pu_height, height - k);
+          uint8_t *input_p = input + k * stride + j;
+          uint8_t *output_p = output + k * out_stride + j;
+          apply_selfguided_restoration_c(input_p, w, h, stride, eps, xqd,
+                                         output_p, out_stride, tmpbuf, 8, 0);
+        }
+    }
+    aom_usec_timer_mark(&ref_timer);
+    const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer);
+
+    aom_usec_timer tst_timer;
+    aom_usec_timer_start(&tst_timer);
     for (i = 0; i < NUM_ITERS; ++i) {
       for (k = 0; k < height; k += pu_height)
         for (j = 0; j < width; j += pu_width) {
@@ -92,11 +109,16 @@
                    tmpbuf, 8, 0);
         }
     }
-    std::clock_t end = std::clock();
-    double elapsed = ((end - start) / (double)CLOCKS_PER_SEC);
+    aom_usec_timer_mark(&tst_timer);
+    const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer);
 
-    printf("%5d %dx%d blocks in %7.3fs = %7.3fus/block\n", NUM_ITERS, width,
-           height, elapsed, elapsed * 1000000. / NUM_ITERS);
+    std::cout << "[          ] C time = " << ref_time / 1000
+              << " ms, SIMD time = " << tst_time / 1000 << " ms\n";
+
+    EXPECT_GT(ref_time, tst_time)
+        << "Error: AV1SelfguidedFilterTest.SpeedTest, SIMD slower than C.\n"
+        << "C time: " << ref_time << " us\n"
+        << "SIMD time: " << tst_time << " us\n";
 
     aom_free(input_);
     aom_free(output_);
@@ -238,8 +260,25 @@
 
     av1_loop_restoration_precal();
 
-    aom_usec_timer timer;
-    aom_usec_timer_start(&timer);
+    aom_usec_timer ref_timer;
+    aom_usec_timer_start(&ref_timer);
+    for (i = 0; i < NUM_ITERS; ++i) {
+      for (k = 0; k < height; k += pu_height)
+        for (j = 0; j < width; j += pu_width) {
+          int w = AOMMIN(pu_width, width - j);
+          int h = AOMMIN(pu_height, height - k);
+          uint16_t *input_p = input + k * stride + j;
+          uint16_t *output_p = output + k * out_stride + j;
+          apply_selfguided_restoration_c(
+              CONVERT_TO_BYTEPTR(input_p), w, h, stride, eps, xqd,
+              CONVERT_TO_BYTEPTR(output_p), out_stride, tmpbuf, bit_depth, 1);
+        }
+    }
+    aom_usec_timer_mark(&ref_timer);
+    const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer);
+
+    aom_usec_timer tst_timer;
+    aom_usec_timer_start(&tst_timer);
     for (i = 0; i < NUM_ITERS; ++i) {
       for (k = 0; k < height; k += pu_height)
         for (j = 0; j < width; j += pu_width) {
@@ -252,11 +291,17 @@
                    1);
         }
     }
-    aom_usec_timer_mark(&timer);
-    double elapsed = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    aom_usec_timer_mark(&tst_timer);
+    const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer);
 
-    printf("%5d %dx%d blocks in %7.3fs = %7.3fus/block\n", NUM_ITERS, width,
-           height, elapsed / 1000000, elapsed / NUM_ITERS);
+    std::cout << "[          ] C time = " << ref_time / 1000
+              << " ms, SIMD time = " << tst_time / 1000 << " ms\n";
+
+    EXPECT_GT(ref_time, tst_time)
+        << "Error: AV1HighbdSelfguidedFilterTest.SpeedTest, SIMD slower than "
+           "C.\n"
+        << "C time: " << ref_time << " us\n"
+        << "SIMD time: " << tst_time << " us\n";
 
     aom_free(input_);
     aom_free(output_);