SSE4.1 and AVX2 implementations of updated FAST_SGR
The SSE4.1 and AVX2 implementations of the self-guided filter have been updated
to match the revised FAST_SGR C implementation in restoration.c.
The self-guided filter speed tests have been reworked to compare the speed of
the SIMD and C implementations of the relevant functions directly.
Speed Tests (code compiled with Clang)
======================================
For LowBD:
- The SSE4.1 implementation is ~220% faster (~69% less time) than the C code
- The AVX2 implementation is ~314% faster (~76% less time) than the C code
For HighBD:
- The SSE4.1 implementation is ~240% faster (~71% less time) than the C code
- The AVX2 implementation is ~343% faster (~77% less time) than the C code
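(For reference, the two figures are equivalent: a speed-up of p% means the
SIMD code runs in 100 / (100 + p) percent of the C time. E.g. ~220% faster
corresponds to 100 / 320 ≈ 31% of the C time, i.e. ~69% less time.)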
Change-Id: Ic2734bb89ccd3f66667c68647e5f677a5a496233
diff --git a/av1/common/restoration.c b/av1/common/restoration.c
index b49e248..1e965c0 100644
--- a/av1/common/restoration.c
+++ b/av1/common/restoration.c
@@ -1275,9 +1275,11 @@
flt2, flt_stride, bit_depth,
params->r2, params->e2);
#elif CONFIG_FAST_SGR == 1
+ // r == 2 filter
av1_selfguided_restoration_fast_internal(dgd32, width, height, dgd32_stride,
flt1, flt_stride, bit_depth,
params->r1, params->e1);
+ // r == 1 filter
av1_selfguided_restoration_internal(dgd32, width, height, dgd32_stride, flt2,
flt_stride, bit_depth, params->r2,
params->e2);
@@ -1336,7 +1338,7 @@
for (int j = 0; j < stripe_width; j += procunit_width) {
int w = AOMMIN(procunit_width, stripe_width - j);
-#if CONFIG_FAST_SGR
+#if CONFIG_FAST_SGR == 2
apply_selfguided_restoration_c(src + j, w, stripe_height, src_stride,
rui->sgrproj_info.ep, rui->sgrproj_info.xqd,
dst + j, dst_stride, tmpbuf, bit_depth, 0);
@@ -1344,7 +1346,7 @@
apply_selfguided_restoration(src + j, w, stripe_height, src_stride,
rui->sgrproj_info.ep, rui->sgrproj_info.xqd,
dst + j, dst_stride, tmpbuf, bit_depth, 0);
-#endif // CONFIG_FAST_SGR
+#endif // CONFIG_FAST_SGR == 2
}
}
@@ -1380,7 +1382,7 @@
int32_t *tmpbuf, int bit_depth) {
for (int j = 0; j < stripe_width; j += procunit_width) {
int w = AOMMIN(procunit_width, stripe_width - j);
-#if CONFIG_FAST_SGR
+#if CONFIG_FAST_SGR == 2
apply_selfguided_restoration_c(src8 + j, w, stripe_height, src_stride,
rui->sgrproj_info.ep, rui->sgrproj_info.xqd,
dst8 + j, dst_stride, tmpbuf, bit_depth, 1);
@@ -1388,7 +1390,7 @@
apply_selfguided_restoration(src8 + j, w, stripe_height, src_stride,
rui->sgrproj_info.ep, rui->sgrproj_info.xqd,
dst8 + j, dst_stride, tmpbuf, bit_depth, 1);
-#endif // CONFIG_FAST_SGR
+#endif // CONFIG_FAST_SGR == 2
}
}
diff --git a/av1/common/x86/selfguided_avx2.c b/av1/common/x86/selfguided_avx2.c
index 061b327..23b651d 100644
--- a/av1/common/x86/selfguided_avx2.c
+++ b/av1/common/x86/selfguided_avx2.c
@@ -311,7 +311,7 @@
highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;
for (int i = 0; i < height; ++i) {
- for (int j = 0; j < width; j += 4) {
+ for (int j = 0; j < width; j += 8) {
const __m256i a = cross_sum(A + i * buf_stride + j, buf_stride);
const __m256i b = cross_sum(B + i * buf_stride + j, buf_stride);
@@ -398,7 +398,7 @@
}
}
-// Calculate 4 values of the "cross sum" starting at buf.
+// Calculate 8 values of the "cross sum" starting at buf.
//
// Pixels are indexed like this:
// xtl xt xtr
@@ -434,7 +434,38 @@
sixes);
}
-// Calculate 4 values of the "cross sum" starting at buf.
+// Calculate 8 values of the "cross sum" starting at buf.
+//
+// Pixels are indexed like this:
+// xl x xr
+//
+// Pixels are weighted like this:
+// 5 6 5
+//
+// buf points to x
+//
+// fives = xl + xr
+// sixes = x
+// cross_sum = 5 * fives + 6 * sixes
+// = 4 * (fives + sixes) + (fives + sixes) + sixes
+// = ((fives + sixes) << 2) + (fives + sixes) + sixes
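+//
+// Equivalently, for a single output value:
+//   cross_sum[j] = 5 * (buf[j - 1] + buf[j + 1]) + 6 * buf[j]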
+static __m256i cross_sum_fast_odd(const int32_t *buf) {
+ const __m256i xl = yy_loadu_256(buf - 1);
+ const __m256i x = yy_loadu_256(buf);
+ const __m256i xr = yy_loadu_256(buf + 1);
+
+ const __m256i fives = _mm256_add_epi32(xl, xr);
+ const __m256i sixes = x;
+
+ const __m256i fives_plus_sixes = _mm256_add_epi32(fives, sixes);
+
+ return _mm256_add_epi32(
+ _mm256_add_epi32(_mm256_slli_epi32(fives_plus_sixes, 2),
+ fives_plus_sixes),
+ sixes);
+}
+
+// Calculate 8 values of the "cross sum" starting at buf.
//
// Pixels are indexed like this:
// xtl xt xtr
@@ -491,7 +522,7 @@
fourteens);
}
-// Calculate 4 values of the "cross sum" starting at buf.
+// Calculate 8 values of the "cross sum" starting at buf.
//
// Pixels are indexed like this:
// xtl xt xtr
@@ -539,11 +570,12 @@
}
// The final filter for selfguided restoration. Computes a weighted average
-// across A, B with "cross sums" (see cross_sum_... implementations above)
-static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A,
- const int32_t *B, int buf_stride,
- const void *dgd8, int dgd_stride, int width,
- int height, int highbd) {
+// across A, B with "cross sums" (see cross_sum_... implementations above).
+// Designed for the first vertical sub-sampling version of FAST_SGR.
+static void final_filter_fast1(int32_t *dst, int dst_stride, const int32_t *A,
+ const int32_t *B, int buf_stride,
+ const void *dgd8, int dgd_stride, int width,
+ int height, int highbd) {
const int nb0 = 5;
const int nb1 = 6;
@@ -557,7 +589,7 @@
for (int i = 0; i < height; ++i) {
if (!(i & 1)) { // even row
- for (int j = 0; j < width; j += 4) {
+ for (int j = 0; j < width; j += 8) {
const __m256i a =
cross_sum_fast_even(A + i * buf_stride + j, buf_stride);
const __m256i b =
@@ -576,7 +608,7 @@
yy_storeu_256(dst + i * dst_stride + j, w);
}
} else if (i != height - 1) { // odd row and not last
- for (int j = 0; j < width; j += 4) {
+ for (int j = 0; j < width; j += 8) {
const __m256i a =
cross_sum_fast_odd_not_last(A + i * buf_stride + j, buf_stride);
const __m256i b =
@@ -595,7 +627,7 @@
yy_storeu_256(dst + i * dst_stride + j, w);
}
} else { // odd row and last
- for (int j = 0; j < width; j += 4) {
+ for (int j = 0; j < width; j += 8) {
const __m256i a =
cross_sum_fast_odd_last(A + i * buf_stride + j, buf_stride);
const __m256i b =
@@ -616,6 +648,65 @@
}
}
}
+
+// The final filter for selfguided restoration. Computes a weighted average
+// across A, B with "cross sums" (see cross_sum_... implementations above).
+// Designed for the second vertical sub-sampling version of FAST_SGR.
+static void final_filter_fast2(int32_t *dst, int dst_stride, const int32_t *A,
+ const int32_t *B, int buf_stride,
+ const void *dgd8, int dgd_stride, int width,
+ int height, int highbd) {
+ const int nb0 = 5;
+ const int nb1 = 4;
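+ // nb0 and nb1 are the normalization shifts for the even- and odd-row
+ // cross sums: the odd-row weights (5 + 6 + 5) sum to 16 == 1 << nb1.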
+
+ const __m256i rounding0 =
+ round_for_shift(SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
+ const __m256i rounding1 =
+ round_for_shift(SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
+
+ const uint8_t *dgd_real =
+ highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;
+
+ for (int i = 0; i < height; ++i) {
+ if (!(i & 1)) { // even row
+ for (int j = 0; j < width; j += 8) {
+ const __m256i a =
+ cross_sum_fast_even(A + i * buf_stride + j, buf_stride);
+ const __m256i b =
+ cross_sum_fast_even(B + i * buf_stride + j, buf_stride);
+
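+ // ((i * dgd_stride + j) << highbd) converts the element offset into a
+ // byte offset: dgd_real points at 2-byte pixels when highbd is set.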
+ const __m128i raw =
+ xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd));
+ const __m256i src =
+ highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw);
+
+ __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
+ __m256i w =
+ _mm256_srai_epi32(_mm256_add_epi32(v, rounding0),
+ SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
+
+ yy_storeu_256(dst + i * dst_stride + j, w);
+ }
+ } else { // odd row
+ for (int j = 0; j < width; j += 8) {
+ const __m256i a = cross_sum_fast_odd(A + i * buf_stride + j);
+ const __m256i b = cross_sum_fast_odd(B + i * buf_stride + j);
+
+ const __m128i raw =
+ xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd));
+ const __m256i src =
+ highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw);
+
+ __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
+ __m256i w =
+ _mm256_srai_epi32(_mm256_add_epi32(v, rounding1),
+ SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
+
+ yy_storeu_256(dst + i * dst_stride + j, w);
+ }
+ }
+ }
+}
#endif
void av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height,
@@ -676,23 +767,36 @@
integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl,
buf_stride);
- // Write to flt1 and flt2
+// Write to flt1 and flt2
+#if CONFIG_FAST_SGR
+ assert(params->r1 < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
+
+ // r == 2 filter
+ assert(params->r1 == 2);
+ calc_ab_fast(A, B, C, D, width, height, buf_stride, params->e1, bit_depth,
+ params->r1);
+ final_filter_fast2(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride,
+ width, height, highbd);
+
+ // r == 1 filter
+ assert(params->r2 == 1);
+ calc_ab(A, B, C, D, width, height, buf_stride, params->e2, bit_depth,
+ params->r2);
+ final_filter(flt2, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width,
+ height, highbd);
+#else
for (int i = 0; i < 2; ++i) {
int r = i ? params->r2 : params->r1;
int e = i ? params->e2 : params->e1;
int32_t *flt = i ? flt2 : flt1;
assert(r + 1 <= AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
-#if CONFIG_FAST_SGR
- calc_ab_fast(A, B, C, D, width, height, buf_stride, e, bit_depth, r);
- final_filter_fast(flt, flt_stride, A, B, buf_stride, dgd8, dgd_stride,
- width, height, highbd);
-#else
+
calc_ab(A, B, C, D, width, height, buf_stride, e, bit_depth, r);
final_filter(flt, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width,
height, highbd);
-#endif
}
+#endif
}
void apply_selfguided_restoration_avx2(const uint8_t *dat8, int width,
diff --git a/av1/common/x86/selfguided_sse4.c b/av1/common/x86/selfguided_sse4.c
index 83bdb8b..1bb7121 100644
--- a/av1/common/x86/selfguided_sse4.c
+++ b/av1/common/x86/selfguided_sse4.c
@@ -401,6 +401,36 @@
// Calculate 4 values of the "cross sum" starting at buf.
//
// Pixels are indexed like this:
+// xl x xr
+//
+// Pixels are weighted like this:
+// 5 6 5
+//
+// buf points to x
+//
+// fives = xl + xr
+// sixes = x
+// cross_sum = 5 * fives + 6 * sixes
+// = 4 * (fives + sixes) + (fives + sixes) + sixes
+// = ((fives + sixes) << 2) + (fives + sixes) + sixes
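+//
+// Equivalently, for a single output value:
+//   cross_sum[j] = 5 * (buf[j - 1] + buf[j + 1]) + 6 * buf[j]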
+static __m128i cross_sum_fast_odd(const int32_t *buf) {
+ const __m128i xl = xx_loadu_128(buf - 1);
+ const __m128i x = xx_loadu_128(buf);
+ const __m128i xr = xx_loadu_128(buf + 1);
+
+ const __m128i fives = _mm_add_epi32(xl, xr);
+ const __m128i sixes = x;
+
+ const __m128i fives_plus_sixes = _mm_add_epi32(fives, sixes);
+
+ return _mm_add_epi32(
+ _mm_add_epi32(_mm_slli_epi32(fives_plus_sixes, 2), fives_plus_sixes),
+ sixes);
+}
+
+// Calculate 4 values of the "cross sum" starting at buf.
+//
+// Pixels are indexed like this:
// xtl xt xtr
// - - -
// xl x xr
@@ -502,11 +532,12 @@
}
// The final filter for selfguided restoration. Computes a weighted average
-// across A, B with "cross sums" (see cross_sum_... implementations above)
-static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A,
- const int32_t *B, int buf_stride,
- const void *dgd8, int dgd_stride, int width,
- int height, int highbd) {
+// across A, B with "cross sums" (see cross_sum_... implementations above).
+// Designed for the first vertical sub-sampling version of FAST_SGR.
+static void final_filter_fast1(int32_t *dst, int dst_stride, const int32_t *A,
+ const int32_t *B, int buf_stride,
+ const void *dgd8, int dgd_stride, int width,
+ int height, int highbd) {
const int nb0 = 5;
const int nb1 = 6;
@@ -573,6 +604,61 @@
}
}
}
+
+// The final filter for selfguided restoration. Computes a weighted average
+// across A, B with "cross sums" (see cross_sum_... implementations above).
+// Designed for the second vertical sub-sampling version of FAST_SGR.
+static void final_filter_fast2(int32_t *dst, int dst_stride, const int32_t *A,
+ const int32_t *B, int buf_stride,
+ const void *dgd8, int dgd_stride, int width,
+ int height, int highbd) {
+ const int nb0 = 5;
+ const int nb1 = 4;
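+ // nb0 and nb1 are the normalization shifts for the even- and odd-row
+ // cross sums: the odd-row weights (5 + 6 + 5) sum to 16 == 1 << nb1.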
+
+ const __m128i rounding0 =
+ round_for_shift(SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
+ const __m128i rounding1 =
+ round_for_shift(SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
+
+ const uint8_t *dgd_real =
+ highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;
+
+ for (int i = 0; i < height; ++i) {
+ if (!(i & 1)) { // even row
+ for (int j = 0; j < width; j += 4) {
+ const __m128i a =
+ cross_sum_fast_even(A + i * buf_stride + j, buf_stride);
+ const __m128i b =
+ cross_sum_fast_even(B + i * buf_stride + j, buf_stride);
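+ // ((i * dgd_stride + j) << highbd) converts the element offset into a
+ // byte offset: dgd_real points at 2-byte pixels when highbd is set.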
+ const __m128i raw =
+ xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd));
+ const __m128i src =
+ highbd ? _mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw);
+
+ __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b);
+ __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding0),
+ SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
+
+ xx_storeu_128(dst + i * dst_stride + j, w);
+ }
+ } else { // odd row
+ for (int j = 0; j < width; j += 4) {
+ const __m128i a = cross_sum_fast_odd(A + i * buf_stride + j);
+ const __m128i b = cross_sum_fast_odd(B + i * buf_stride + j);
+ const __m128i raw =
+ xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd));
+ const __m128i src =
+ highbd ? _mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw);
+
+ __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b);
+ __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding1),
+ SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
+
+ xx_storeu_128(dst + i * dst_stride + j, w);
+ }
+ }
+ }
+}
#endif
void av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width,
@@ -629,23 +715,36 @@
integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl,
buf_stride);
- // Write to flt1 and flt2
+// Write to flt1 and flt2
+#if CONFIG_FAST_SGR
+ assert(params->r1 < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
+
+ // r == 2 filter
+ assert(params->r1 == 2);
+ calc_ab_fast(A, B, C, D, width, height, buf_stride, params->e1, bit_depth,
+ params->r1);
+ final_filter_fast2(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride,
+ width, height, highbd);
+
+ // r == 1 filter
+ assert(params->r2 == 1);
+ calc_ab(A, B, C, D, width, height, buf_stride, params->e2, bit_depth,
+ params->r2);
+ final_filter(flt2, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width,
+ height, highbd);
+#else
for (int i = 0; i < 2; ++i) {
int r = i ? params->r2 : params->r1;
int e = i ? params->e2 : params->e1;
int32_t *flt = i ? flt2 : flt1;
assert(r + 1 <= AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
-#if CONFIG_FAST_SGR
- calc_ab_fast(A, B, C, D, width, height, buf_stride, e, bit_depth, r);
- final_filter_fast(flt, flt_stride, A, B, buf_stride, dgd8, dgd_stride,
- width, height, highbd);
-#else
+
calc_ab(A, B, C, D, width, height, buf_stride, e, bit_depth, r);
final_filter(flt, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width,
height, highbd);
-#endif
}
+#endif
}
void apply_selfguided_restoration_sse4_1(const uint8_t *dat8, int width,
diff --git a/av1/encoder/pickrst.c b/av1/encoder/pickrst.c
index 2e18bb6..b590833 100644
--- a/av1/encoder/pickrst.c
+++ b/av1/encoder/pickrst.c
@@ -353,13 +353,13 @@
int width, int height, int dat_stride,
int use_highbd, int bit_depth, int32_t *flt1,
int32_t *flt2, int flt_stride) {
-#if CONFIG_FAST_SGR
+#if CONFIG_FAST_SGR == 2
av1_selfguided_restoration_c(dat8, width, height, dat_stride, flt1, flt2,
flt_stride, params, bit_depth, use_highbd);
#else
av1_selfguided_restoration(dat8, width, height, dat_stride, flt1, flt2,
flt_stride, params, bit_depth, use_highbd);
-#endif // CONFIG_FAST_SGR
+#endif // CONFIG_FAST_SGR == 2
}
// Apply the self-guided filter across an entire restoration unit.
diff --git a/test/selfguided_filter_test.cc b/test/selfguided_filter_test.cc
index 345baac..0215e60 100644
--- a/test/selfguided_filter_test.cc
+++ b/test/selfguided_filter_test.cc
@@ -80,7 +80,24 @@
av1_loop_restoration_precal();
- std::clock_t start = std::clock();
+ aom_usec_timer ref_timer;
+ aom_usec_timer_start(&ref_timer);
+ for (i = 0; i < NUM_ITERS; ++i) {
+ for (k = 0; k < height; k += pu_height)
+ for (j = 0; j < width; j += pu_width) {
+ int w = AOMMIN(pu_width, width - j);
+ int h = AOMMIN(pu_height, height - k);
+ uint8_t *input_p = input + k * stride + j;
+ uint8_t *output_p = output + k * out_stride + j;
+ apply_selfguided_restoration_c(input_p, w, h, stride, eps, xqd,
+ output_p, out_stride, tmpbuf, 8, 0);
+ }
+ }
+ aom_usec_timer_mark(&ref_timer);
+ const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer);
+
+ aom_usec_timer tst_timer;
+ aom_usec_timer_start(&tst_timer);
for (i = 0; i < NUM_ITERS; ++i) {
for (k = 0; k < height; k += pu_height)
for (j = 0; j < width; j += pu_width) {
@@ -92,11 +109,16 @@
tmpbuf, 8, 0);
}
}
- std::clock_t end = std::clock();
- double elapsed = ((end - start) / (double)CLOCKS_PER_SEC);
+ aom_usec_timer_mark(&tst_timer);
+ const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer);
- printf("%5d %dx%d blocks in %7.3fs = %7.3fus/block\n", NUM_ITERS, width,
- height, elapsed, elapsed * 1000000. / NUM_ITERS);
+ std::cout << "[ ] C time = " << ref_time / 1000
+ << " ms, SIMD time = " << tst_time / 1000 << " ms\n";
+
+ EXPECT_GT(ref_time, tst_time)
+ << "Error: AV1SelfguidedFilterTest.SpeedTest, SIMD slower than C.\n"
+ << "C time: " << ref_time << " us\n"
+ << "SIMD time: " << tst_time << " us\n";
aom_free(input_);
aom_free(output_);
@@ -238,8 +260,25 @@
av1_loop_restoration_precal();
- aom_usec_timer timer;
- aom_usec_timer_start(&timer);
+ aom_usec_timer ref_timer;
+ aom_usec_timer_start(&ref_timer);
+ for (i = 0; i < NUM_ITERS; ++i) {
+ for (k = 0; k < height; k += pu_height)
+ for (j = 0; j < width; j += pu_width) {
+ int w = AOMMIN(pu_width, width - j);
+ int h = AOMMIN(pu_height, height - k);
+ uint16_t *input_p = input + k * stride + j;
+ uint16_t *output_p = output + k * out_stride + j;
+ apply_selfguided_restoration_c(
+ CONVERT_TO_BYTEPTR(input_p), w, h, stride, eps, xqd,
+ CONVERT_TO_BYTEPTR(output_p), out_stride, tmpbuf, bit_depth, 1);
+ }
+ }
+ aom_usec_timer_mark(&ref_timer);
+ const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer);
+
+ aom_usec_timer tst_timer;
+ aom_usec_timer_start(&tst_timer);
for (i = 0; i < NUM_ITERS; ++i) {
for (k = 0; k < height; k += pu_height)
for (j = 0; j < width; j += pu_width) {
@@ -252,11 +291,17 @@
1);
}
}
- aom_usec_timer_mark(&timer);
- double elapsed = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ aom_usec_timer_mark(&tst_timer);
+ const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer);
- printf("%5d %dx%d blocks in %7.3fs = %7.3fus/block\n", NUM_ITERS, width,
- height, elapsed / 1000000, elapsed / NUM_ITERS);
+ std::cout << "[ ] C time = " << ref_time / 1000
+ << " ms, SIMD time = " << tst_time / 1000 << " ms\n";
+
+ EXPECT_GT(ref_time, tst_time)
+ << "Error: AV1HighbdSelfguidedFilterTest.SpeedTest, SIMD slower than "
+ "C.\n"
+ << "C time: " << ref_time << " us\n"
+ << "SIMD time: " << tst_time << " us\n";
aom_free(input_);
aom_free(output_);