SSE2 8-tap sub-pixel filter optimization
To ensure fast encoding/decoding on devices without ssse3 support,
SSE2 optimization of sub-pixel filters was done. Test using 1080p
clip showed the decoder speeds were ~70fps with ssse3 filters, ~60fps
with sse2 filters, and ~15fps with c filters.
Change-Id: Ie2088f87d83a889fba80a613e4d0e287aadd785c
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index f0b412d..abeb4bd 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -599,6 +599,28 @@
make_tuple(32, 64, &convolve8_c),
make_tuple(64, 64, &convolve8_c)));
+#if HAVE_SSE2
+const ConvolveFunctions convolve8_sse2(
+ vp9_convolve8_horiz_sse2, vp9_convolve8_avg_horiz_sse2,
+ vp9_convolve8_vert_sse2, vp9_convolve8_avg_vert_sse2,
+ vp9_convolve8_sse2, vp9_convolve8_avg_sse2);
+
+INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest, ::testing::Values(
+ make_tuple(4, 4, &convolve8_sse2),
+ make_tuple(8, 4, &convolve8_sse2),
+ make_tuple(4, 8, &convolve8_sse2),
+ make_tuple(8, 8, &convolve8_sse2),
+ make_tuple(16, 8, &convolve8_sse2),
+ make_tuple(8, 16, &convolve8_sse2),
+ make_tuple(16, 16, &convolve8_sse2),
+ make_tuple(32, 16, &convolve8_sse2),
+ make_tuple(16, 32, &convolve8_sse2),
+ make_tuple(32, 32, &convolve8_sse2),
+ make_tuple(64, 32, &convolve8_sse2),
+ make_tuple(32, 64, &convolve8_sse2),
+ make_tuple(64, 64, &convolve8_sse2)));
+#endif
+
#if HAVE_SSSE3
const ConvolveFunctions convolve8_ssse3(
vp9_convolve8_horiz_ssse3, vp9_convolve8_avg_horiz_ssse3,