Make hadamard_col8_sse2 inline.

Without an explicit inline, this function is resolved as non-inline,
which leads to a bunch of SSE registers being put on the stack in order
to pass them to this function. Overall I see a ~2-3% speedup on RT lowres.

Change-Id: I78fcba22925f459fd17e7f01fffc4265a9b796e6
diff --git a/aom_dsp/x86/avg_intrin_sse2.c b/aom_dsp/x86/avg_intrin_sse2.c
index 1c5f6eb..5f9c651 100644
--- a/aom_dsp/x86/avg_intrin_sse2.c
+++ b/aom_dsp/x86/avg_intrin_sse2.c
@@ -139,7 +139,7 @@
   return (avg + 8) >> 4;
 }
 
-static void hadamard_col8_sse2(__m128i *in, int iter) {
+static INLINE void hadamard_col8_sse2(__m128i *in, int iter) {
   __m128i a0 = in[0];
   __m128i a1 = in[1];
   __m128i a2 = in[2];