AVX2 implementation for convolve_2d

Can be up to 40% faster, with bit-exact results.

Change-Id: Ia67ba154222fdfb6173bf8942275649e511abe43
diff --git a/av1/av1_common.mk b/av1/av1_common.mk
index 86351ee..6fcb4d9 100644
--- a/av1/av1_common.mk
+++ b/av1/av1_common.mk
@@ -162,6 +162,7 @@
 ifeq ($(CONFIG_CONVOLVE_ROUND),yes)
 AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/convolve_2d_sse2.c
 AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/convolve_2d_sse4.c
+AV1_COMMON_SRCS-$(HAVE_AVX2) += common/x86/convolve_2d_avx2.c
 ifeq ($(CONFIG_HIGHBITDEPTH),yes)
 AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/highbd_convolve_2d_ssse3.c
 endif