Vectorize self-guided filter
Add an SSE4.1 low-bitdepth (lowbd) version of the self-guided filter
used by loop restoration, and apply some optimizations to the C
version.
Approximate times per 128x128 / 256x256 tile on the machine
this was developed on:
Previous C: 620us / 2800us
Optimized C: 500us / 2200us (24% / 27% faster than previous C)
SSE4.1: 147us / 600us (320% / 370% faster than previous C)
Change-Id: I23ff5a5482a191aeb06f9d1f767a9f036bb357fe
diff --git a/test/test.mk b/test/test.mk
index 01014e6..8ffa87a 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -209,6 +209,9 @@
ifneq ($(findstring yes,$(CONFIG_GLOBAL_MOTION) $(CONFIG_WARPED_MOTION)),)
LIBAOM_TEST_SRCS-$(HAVE_SSE2) += warp_filter_test.cc
endif
+ifeq ($(CONFIG_LOOP_RESTORATION),yes)
+LIBAOM_TEST_SRCS-$(HAVE_SSE4_1) += selfguided_filter_test.cc
+endif
TEST_INTRA_PRED_SPEED_SRCS-yes := test_intra_pred_speed.cc
TEST_INTRA_PRED_SPEED_SRCS-yes += ../md5_utils.h ../md5_utils.c