Restore SSSE3 subpixel filters in new convolve framework

This commit adds the 8-tap SSSE3 subpixel filters back into the code
underneath the convolve API. The C code is still called for 4x4
blocks, as well as for compound prediction modes. This restores
encode performance to within about 8% of the baseline.

Change-Id: Ife0d81477075ae33c05b53c65003951efdc8b09c
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index 3543840..56b1bec 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -12,6 +12,7 @@
 extern "C" {
 #include "./vpx_config.h"
 #include "./vp9_rtcd.h"
+#include "vpx_mem/vpx_mem.h"
 }
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "test/acm_random.h"
@@ -166,7 +167,25 @@
 }
 
 class ConvolveTest : public PARAMS(int, int, const ConvolveFunctions*) {
+ public:
+  static void SetUpTestCase() {
+    // Force input_ to be unaligned, output to be 16 byte aligned.
+    input_ = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, kOuterBlockSize * kOuterBlockSize + 1))
+        + 1;
+    output_ = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, kOuterBlockSize * kOuterBlockSize));
+  }
+
+  static void TearDownTestCase() {
+    vpx_free(input_ - 1);
+    input_ = NULL;
+    vpx_free(output_);
+    output_ = NULL;
+  }
+
   protected:
+    static const int kDataAlignment = 16;
     static const int kOuterBlockSize = 32;
     static const int kInputStride = kOuterBlockSize;
     static const int kOutputStride = kOuterBlockSize;
@@ -174,7 +193,10 @@
 
     int Width() const { return GET_PARAM(0); }
     int Height() const { return GET_PARAM(1); }
-    int BorderLeft() const { return (kOuterBlockSize - Width()) / 2; }
+    int BorderLeft() const {
+      const int center = (kOuterBlockSize - Width()) / 2;
+      return (center + (kDataAlignment - 1)) & ~(kDataAlignment - 1);
+    }
     int BorderTop() const { return (kOuterBlockSize - Height()) / 2; }
 
     bool IsIndexInBorder(int i) {
@@ -216,9 +238,11 @@
     }
 
     const ConvolveFunctions* UUT_;
-    uint8_t input_[kOuterBlockSize * kOuterBlockSize];
-    uint8_t output_[kOuterBlockSize * kOuterBlockSize];
+    static uint8_t* input_;
+    static uint8_t* output_;
 };
+uint8_t* ConvolveTest::input_ = NULL;
+uint8_t* ConvolveTest::output_ = NULL;
 
 TEST_P(ConvolveTest, GuardBlocks) {
   CheckGuardBlocks();
@@ -488,3 +512,16 @@
     make_tuple(8, 8, &convolve8_c),
     make_tuple(16, 16, &convolve8_c)));
 }
+
+#if HAVE_SSSE3
+const ConvolveFunctions convolve8_ssse3(
+    vp9_convolve8_horiz_ssse3, vp9_convolve8_avg_horiz_c,
+    vp9_convolve8_vert_ssse3, vp9_convolve8_avg_vert_c,
+    vp9_convolve8_ssse3, vp9_convolve8_avg_c);
+
+INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, ::testing::Values(
+    make_tuple(4, 4, &convolve8_ssse3),
+    make_tuple(8, 4, &convolve8_ssse3),
+    make_tuple(8, 8, &convolve8_ssse3),
+    make_tuple(16, 16, &convolve8_ssse3)));
+#endif