High bit depth 32x32 inverse DCT_DCT transform, AVX2

- Observed the following user-level speedup on the AV1 baseline:
 Encoding time reduction: 4.26%
 Decoding time reduction: 25.35%

Change-Id: Ideaf3cd473ad45ed9256c80d5a5daed0a6e098cf
diff --git a/av1/av1_common.mk b/av1/av1_common.mk
index 4e3f429..04dd817 100644
--- a/av1/av1_common.mk
+++ b/av1/av1_common.mk
@@ -144,6 +144,8 @@
 endif
 ifeq ($(CONFIG_AOM_HIGHBITDEPTH),yes)
 AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/highbd_txfm_utility_sse4.h
+AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/highbd_inv_txfm_sse4.c
+AV1_COMMON_SRCS-$(HAVE_AVX2) += common/x86/highbd_inv_txfm_avx2.c
 endif
 
 ifneq ($(CONFIG_AOM_HIGHBITDEPTH),yes)