Convolution horizontal filter SSSE3 optimization

- Apply signal-direction, 4-pixel vertical, and 8-pixel vertical
  parallelism.
- Add a unit test to verify bit-exact results.
- Overall encoding time improves by ~24% on a Xeon E5-2680 CPU.

Change-Id: I104dcbfd43451476fee1f94cd16ca5f965878e59
diff --git a/test/vp10_convolve_test.cc b/test/vp10_convolve_test.cc
index 42710a5..901f578 100644
--- a/test/vp10_convolve_test.cc
+++ b/test/vp10_convolve_test.cc
@@ -1,5 +1,6 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
+#include "./vp10_rtcd.h"
 #include "./vpx_dsp_rtcd.h"
 #include "test/acm_random.h"
 #include "vp10/common/filter.h"
@@ -40,6 +41,8 @@
   int w = 1;
   int h = 1;
 
+  vp10_rtcd();
+
   for (int i = 0; i < filter_size * filter_size; i++) {
     src[i] = rnd.Rand16() % (1 << 8);
   }
@@ -86,6 +89,8 @@
   int subpel_x_q4;
   int subpel_y_q4;
 
+  vp10_rtcd();
+
   for (int i = 0; i < filter_size * filter_size; i++) {
     src[i] = rnd.Rand16() % (1 << 8);
   }
@@ -150,6 +155,8 @@
   int subpel_x_q4;
   int subpel_y_q4;
 
+  vp10_rtcd();
+
   for (int i = 0; i < filter_size * filter_size; i++) {
     src0[i] = rnd.Rand16() % (1 << 8);
     src1[i] = rnd.Rand16() % (1 << 8);