SSSE3 implementation of full inverse 8x8 2D-DCT

This commit enables SSSE3 version full inverse 8x8 2D-DCT and
reconstruction. It makes the runtime of vp9_idct8x8_64_add down
from 256 cycles (SSE2) to 246 cycles.

Change-Id: I0600feac894d6a443a3c9d18daf34156d4e225c3
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index b1ba0b1..eaff60a 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -120,6 +120,10 @@
 
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
 
+ifeq ($(ARCH_X86_64), yes)
+VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_idct_ssse3.asm
+endif
+
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve_neon.c
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct16x16_neon.c
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_16_neon.c