Convolution vertical filter SSSE3 optimization

- Apply 8-pixel parallelism in the vertical filtering direction.
- Add unit tests to verify bit-exactness against the C reference.
- Encoder speed improves ~29% (with EXT_INTERP enabled) on Xeon E5-2680.
- Combinational cycle count of vp10_convolve() drops from 26.06%
  to 6.73%.

Change-Id: Ic1ae48f8fb1909991577947a8c00d07832737e57
diff --git a/test/vp10_convolve_optimz_test.cc b/test/vp10_convolve_optimz_test.cc
index 1f26d0a..04c4321 100644
--- a/test/vp10_convolve_optimz_test.cc
+++ b/test/vp10_convolve_optimz_test.cc
@@ -21,13 +21,15 @@
 using std::tr1::tuple;
 using libvpx_test::ACMRandom;
 
-typedef void (*conv_horiz_t)(const uint8_t*, int, uint8_t*, int,
-                             int, int, const InterpFilterParams,
-                             const int, int, int);
+typedef void (*conv_filter_t)(const uint8_t*, int, uint8_t*, int,
+                              int, int, const InterpFilterParams,
+                              const int, int, int);
 // Test parameter list:
-//  <convolve_horiz_func, <width, height>, filter_params, subpel_x_q4, avg>
+//  <convolve_horiz_func, convolve_vert_func,
+//  <width, height>, filter_params, subpel_q4, avg>
 typedef tuple<int, int> BlockDimension;
-typedef tuple<conv_horiz_t, BlockDimension, INTERP_FILTER, int, int> ConvParams;
+typedef tuple<conv_filter_t, conv_filter_t, BlockDimension, INTERP_FILTER,
+              int, int> ConvParams;
 
 // Note:
 //  src_ and src_ref_ have special boundary requirement
@@ -44,13 +46,14 @@
  public:
   virtual ~VP10ConvolveOptimzTest() {}
   virtual void SetUp() {
-    conv_ = GET_PARAM(0);
-    BlockDimension block = GET_PARAM(1);
+    conv_horiz_ = GET_PARAM(0);
+    conv_vert_ = GET_PARAM(1);
+    BlockDimension block = GET_PARAM(2);
     width_ = std::tr1::get<0>(block);
     height_ = std::tr1::get<1>(block);
-    filter_ = GET_PARAM(2);
-    subpel_ = GET_PARAM(3);
-    avg_ = GET_PARAM(4);
+    filter_ = GET_PARAM(3);
+    subpel_ = GET_PARAM(4);
+    avg_ = GET_PARAM(5);
 
     alloc_ = new uint8_t[maxBlockSize * 4];
     src_ = alloc_ + (vertiOffset * maxWidth);
@@ -68,6 +71,7 @@
 
  protected:
   void RunHorizFilterBitExactCheck();
+  void RunVertFilterBitExactCheck();
 
  private:
   void PrepFilterBuffer(uint8_t *src, uint8_t *src_ref,
@@ -75,7 +79,8 @@
                         int w, int h);
   void DiffFilterBuffer(const uint8_t *buf, const uint8_t *buf_ref,
                         int w, int h, int fgroup, int findex);
-  conv_horiz_t conv_;
+  conv_filter_t conv_horiz_;
+  conv_filter_t conv_vert_;
   uint8_t *alloc_;
   uint8_t *src_;
   uint8_t *dst_;
@@ -94,7 +99,7 @@
   int r, c;
   ACMRandom rnd(ACMRandom::DeterministicSeed());
 
-  memset(alloc_, 0, 4 * maxBlockSize);
+  memset(alloc_, 0, 4 * maxBlockSize * sizeof(alloc_[0]));
 
   uint8_t *src_ptr = src;
   uint8_t *dst_ptr = dst;
@@ -144,8 +149,8 @@
   vp10_convolve_horiz_c(src_ref_, stride, dst_ref_, stride, width_, height_,
                         filter_params, subpel_, x_step_q4, avg_);
 
-  conv_(src_, stride, dst_, stride, width_, height_,
-        filter_params, subpel_, x_step_q4, avg_);
+  conv_horiz_(src_, stride, dst_, stride, width_, height_,
+              filter_params, subpel_, x_step_q4, avg_);
 
   DiffFilterBuffer(dst_, dst_ref_, width_, height_, filter_, subpel_);
 
@@ -160,21 +165,40 @@
                         intermediate_height, filter_params, subpel_, x_step_q4,
                         avg_);
 
-  conv_(src_, stride, dst_, stride, width_,
-        intermediate_height, filter_params, subpel_, x_step_q4,
-        avg_);
+  conv_horiz_(src_, stride, dst_, stride, width_,
+              intermediate_height, filter_params, subpel_, x_step_q4,
+              avg_);
 
   DiffFilterBuffer(dst_, dst_ref_, width_, intermediate_height, filter_,
                    subpel_);
 }
 
+void VP10ConvolveOptimzTest::RunVertFilterBitExactCheck() {
+  PrepFilterBuffer(src_, src_ref_, dst_, dst_ref_, width_, height_);
+
+  InterpFilterParams filter_params = vp10_get_interp_filter_params(filter_);
+
+  vp10_convolve_vert_c(src_ref_, stride, dst_ref_, stride, width_, height_,
+                       filter_params, subpel_, x_step_q4, avg_);
+
+  conv_vert_(src_, stride, dst_, stride, width_, height_,
+             filter_params, subpel_, x_step_q4, avg_);
+
+  DiffFilterBuffer(dst_, dst_ref_, width_, height_, filter_, subpel_);
+}
+
 TEST_P(VP10ConvolveOptimzTest, HorizBitExactCheck) {
   RunHorizFilterBitExactCheck();
 }
+TEST_P(VP10ConvolveOptimzTest, VerticalBitExactCheck) {
+  RunVertFilterBitExactCheck();
+}
 
 using std::tr1::make_tuple;
 
 const BlockDimension kBlockDim[] = {
+  make_tuple(2, 2),
+  make_tuple(2, 4),
   make_tuple(4, 4),
   make_tuple(4, 8),
   make_tuple(8, 4),
@@ -195,7 +219,7 @@
 // 10/12-tap filters
 const INTERP_FILTER kFilter[] = {6, 4, 2};
 
-const int kSubpelXQ4[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+const int kSubpelQ4[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
 
 const int kAvg[] = {0, 1};
 
@@ -204,9 +228,10 @@
     SSSE3, VP10ConvolveOptimzTest,
     ::testing::Combine(
          ::testing::Values(vp10_convolve_horiz_ssse3),
+         ::testing::Values(vp10_convolve_vert_ssse3),
          ::testing::ValuesIn(kBlockDim),
          ::testing::ValuesIn(kFilter),
-         ::testing::ValuesIn(kSubpelXQ4),
+         ::testing::ValuesIn(kSubpelQ4),
          ::testing::ValuesIn(kAvg)));
 #endif  // HAVE_SSSE3 && CONFIG_EXT_INTERP
 }  // namespace