Vectorize self-guided filter
Add an SSE4.1 low-bitdepth (lowbd) version of the self-guided
filter for loop restoration, and apply some optimizations to
the C version.
Approximate times per 128x128 / 256x256 tile on the machine
this was developed on:
Previous C: 620us / 2800us
Optimized C: 500us / 2200us (24% / 27% faster than previous C)
SSE4.1: 147us / 600us (320% / 370% faster than previous C)
Change-Id: I23ff5a5482a191aeb06f9d1f767a9f036bb357fe
diff --git a/av1/av1_common.mk b/av1/av1_common.mk
index db5b2fd..92e78eb 100644
--- a/av1/av1_common.mk
+++ b/av1/av1_common.mk
@@ -75,8 +75,11 @@
endif
AV1_COMMON_SRCS-yes += common/convolve.c
AV1_COMMON_SRCS-yes += common/convolve.h
-AV1_COMMON_SRCS-$(CONFIG_LOOP_RESTORATION) += common/restoration.h
-AV1_COMMON_SRCS-$(CONFIG_LOOP_RESTORATION) += common/restoration.c
+ifeq ($(CONFIG_LOOP_RESTORATION),yes)
+AV1_COMMON_SRCS-yes += common/restoration.h
+AV1_COMMON_SRCS-yes += common/restoration.c
+AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/selfguided_sse4.c
+endif
ifeq (yes,$(filter $(CONFIG_GLOBAL_MOTION) $(CONFIG_WARPED_MOTION),yes))
AV1_COMMON_SRCS-yes += common/warped_motion.h
AV1_COMMON_SRCS-yes += common/warped_motion.c