Convolution horizontal filter SSSE3 optimization
- Apply signal-direction, 4-pixel vertical, and 8-pixel vertical
parallelism.
- Add unit test to verify the bit-exact result.
- Overall encoding time improves by ~24% on a Xeon E5-2680 CPU.
Change-Id: I104dcbfd43451476fee1f94cd16ca5f965878e59
diff --git a/test/vp10_convolve_test.cc b/test/vp10_convolve_test.cc
index 42710a5..901f578 100644
--- a/test/vp10_convolve_test.cc
+++ b/test/vp10_convolve_test.cc
@@ -1,5 +1,6 @@
#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "./vp10_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "test/acm_random.h"
#include "vp10/common/filter.h"
@@ -40,6 +41,8 @@
int w = 1;
int h = 1;
+ vp10_rtcd();
+
for (int i = 0; i < filter_size * filter_size; i++) {
src[i] = rnd.Rand16() % (1 << 8);
}
@@ -86,6 +89,8 @@
int subpel_x_q4;
int subpel_y_q4;
+ vp10_rtcd();
+
for (int i = 0; i < filter_size * filter_size; i++) {
src[i] = rnd.Rand16() % (1 << 8);
}
@@ -150,6 +155,8 @@
int subpel_x_q4;
int subpel_y_q4;
+ vp10_rtcd();
+
for (int i = 0; i < filter_size * filter_size; i++) {
src0[i] = rnd.Rand16() % (1 << 8);
src1[i] = rnd.Rand16() % (1 << 8);