Convolution horizontal filter SSSE3 optimization

- Apply signal direction/4-pixel vertical/8-pixel vertical parallelism.
- Add unit test to verify the bit-exact result.
- Overall encoding time improves ~24% on Xeon E5-2680 CPU.

Change-Id: I104dcbfd43451476fee1f94cd16ca5f965878e59

commit: 229690a95cf0a3a87bf98b85a1945568689e47b5 [log] [tgz]
author: Yi Luo <luoyi@google.com> Mon Jun 13 17:01:17 2016 -0700
committer: Yi Luo <luoyi@google.com> Mon Jun 20 11:10:30 2016 -0700
tree: 349f40572b21000ebd9c78d4f4492b357499f44c
parent: d10161eafccf5634f3d37972a4519931ca1f2ec7 [diff] [blame]
diff --git a/test/vp10_convolve_test.cc b/test/vp10_convolve_test.cc
index 42710a5..901f578 100644
--- a/test/vp10_convolve_test.cc
+++ b/test/vp10_convolve_test.cc

@@ -1,5 +1,6 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
+#include "./vp10_rtcd.h"
 #include "./vpx_dsp_rtcd.h"
 #include "test/acm_random.h"
 #include "vp10/common/filter.h"
@@ -40,6 +41,8 @@
   int w = 1;
   int h = 1;
 
+  vp10_rtcd();
+
   for (int i = 0; i < filter_size * filter_size; i++) {
     src[i] = rnd.Rand16() % (1 << 8);
   }
@@ -86,6 +89,8 @@
   int subpel_x_q4;
   int subpel_y_q4;
 
+  vp10_rtcd();
+
   for (int i = 0; i < filter_size * filter_size; i++) {
     src[i] = rnd.Rand16() % (1 << 8);
   }
@@ -150,6 +155,8 @@
   int subpel_x_q4;
   int subpel_y_q4;
 
+  vp10_rtcd();
+
   for (int i = 0; i < filter_size * filter_size; i++) {
     src0[i] = rnd.Rand16() % (1 << 8);
     src1[i] = rnd.Rand16() % (1 << 8);
commit	229690a95cf0a3a87bf98b85a1945568689e47b5	[log] [tgz]
author	Yi Luo <luoyi@google.com>	Mon Jun 13 17:01:17 2016 -0700
committer	Yi Luo <luoyi@google.com>	Mon Jun 20 11:10:30 2016 -0700
tree	349f40572b21000ebd9c78d4f4492b357499f44c
parent	d10161eafccf5634f3d37972a4519931ca1f2ec7 [diff] [blame]