Add SSE2 optimization for satd_lp

Speedups (SSE2 vs c) at function level:
    47% (satd_size  = 16)
    50% (satd_size  = 64)
    70% (satd_size  = 256)
    46% (satd_size  = 1024)

Change-Id: Ib669dac0dc2c3a17a20f09acc4603519dbbe0ecd
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index a2b5f05..b39bfaa 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -1130,7 +1130,7 @@
   specialize qw/aom_satd neon sse2 avx2/;
 
   add_proto qw/int aom_satd_lp/, "const int16_t *coeff, int length";
-  specialize qw/aom_satd_lp avx2 neon/;
+  specialize qw/aom_satd_lp sse2 avx2 neon/;
 
 
   #
diff --git a/aom_dsp/x86/avg_intrin_sse2.c b/aom_dsp/x86/avg_intrin_sse2.c
index 18c4ca7..67ea85b 100644
--- a/aom_dsp/x86/avg_intrin_sse2.c
+++ b/aom_dsp/x86/avg_intrin_sse2.c
@@ -483,6 +483,35 @@
   return _mm_cvtsi128_si32(accum);
 }
 
+int aom_satd_lp_sse2(const int16_t *coeff, int length) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i one = _mm_set1_epi16(1);
+  __m128i accum = zero;
+
+  for (int i = 0; i < length; i += 16) {
+    const __m128i src_line0 = _mm_loadu_si128((const __m128i *)coeff);
+    const __m128i src_line1 = _mm_loadu_si128((const __m128i *)(coeff + 8));
+    const __m128i inv0 = _mm_sub_epi16(zero, src_line0);
+    const __m128i inv1 = _mm_sub_epi16(zero, src_line1);
+    const __m128i abs0 = _mm_max_epi16(src_line0, inv0);  // abs(src_line)
+    const __m128i abs1 = _mm_max_epi16(src_line1, inv1);  // abs(src_line)
+    const __m128i sum0 = _mm_madd_epi16(abs0, one);
+    const __m128i sum1 = _mm_madd_epi16(abs1, one);
+    accum = _mm_add_epi32(accum, sum0);
+    accum = _mm_add_epi32(accum, sum1);
+    coeff += 16;
+  }
+
+  {  // cascading summation of accum
+    __m128i hi = _mm_srli_si128(accum, 8);
+    accum = _mm_add_epi32(accum, hi);
+    hi = _mm_srli_epi64(accum, 32);
+    accum = _mm_add_epi32(accum, hi);
+  }
+
+  return _mm_cvtsi128_si32(accum);
+}
+
 void aom_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref,
                           const int ref_stride, const int height) {
   int idx = 1;
diff --git a/test/avg_test.cc b/test/avg_test.cc
index 9c20714..11ddafc 100644
--- a/test/avg_test.cc
+++ b/test/avg_test.cc
@@ -521,22 +521,35 @@
 #endif
 
 typedef int (*SatdFunc)(const tran_low_t *coeffs, int length);
-typedef ::testing::tuple<int, SatdFunc, SatdFunc> SatdTestParam;
-class SatdTest : public ::testing::Test,
-                 public ::testing::WithParamInterface<SatdTestParam> {
- protected:
-  virtual void SetUp() {
-    satd_size_ = GET_PARAM(0);
-    satd_func_ref_ = GET_PARAM(1);
-    satd_func_simd_ = GET_PARAM(2);
+typedef int (*SatdLpFunc)(const int16_t *coeffs, int length);
 
+template <typename SatdFuncType>
+struct SatdTestParam {
+  SatdTestParam(int s, SatdFuncType f1, SatdFuncType f2)
+      : satd_size(s), func_ref(f1), func_simd(f2) {}
+  int satd_size;
+  SatdFuncType func_ref;
+  SatdFuncType func_simd;
+};
+
+template <typename CoeffType, typename SatdFuncType>
+class SatdTestBase
+    : public ::testing::Test,
+      public ::testing::WithParamInterface<SatdTestParam<SatdFuncType>> {
+ protected:
+  explicit SatdTestBase(const SatdTestParam<SatdFuncType> &func_param) {
+    satd_size_ = func_param.satd_size;
+    satd_func_ref_ = func_param.func_ref;
+    satd_func_simd_ = func_param.func_simd;
+  }
+  virtual void SetUp() {
     rnd_.Reset(ACMRandom::DeterministicSeed());
-    src_ = reinterpret_cast<tran_low_t *>(
+    src_ = reinterpret_cast<CoeffType *>(
         aom_memalign(32, sizeof(*src_) * satd_size_));
     ASSERT_TRUE(src_ != NULL);
   }
   virtual void TearDown() { aom_free(src_); }
-  void FillConstant(const tran_low_t val) {
+  void FillConstant(const CoeffType val) {
     for (int i = 0; i < satd_size_; ++i) src_[i] = val;
   }
   void FillRandom() {
@@ -597,12 +610,17 @@
   int satd_size_;
 
  private:
-  tran_low_t *src_;
-  SatdFunc satd_func_ref_;
-  SatdFunc satd_func_simd_;
+  CoeffType *src_;
+  SatdFuncType satd_func_ref_;
+  SatdFuncType satd_func_simd_;
   ACMRandom rnd_;
 };
 
+class SatdTest : public SatdTestBase<tran_low_t, SatdFunc> {
+ public:
+  SatdTest() : SatdTestBase(GetParam()) {}
+};
+
 TEST_P(SatdTest, MinValue) {
   const int kMin = -32640;
   const int expected = -kMin * satd_size_;
@@ -639,13 +657,21 @@
 }
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SatdTest);
 
+INSTANTIATE_TEST_SUITE_P(
+    C, SatdTest,
+    ::testing::Values(SatdTestParam<SatdFunc>(16, &aom_satd_c, &aom_satd_c),
+                      SatdTestParam<SatdFunc>(64, &aom_satd_c, &aom_satd_c),
+                      SatdTestParam<SatdFunc>(256, &aom_satd_c, &aom_satd_c),
+                      SatdTestParam<SatdFunc>(1024, &aom_satd_c, &aom_satd_c)));
+
 #if HAVE_NEON
 INSTANTIATE_TEST_SUITE_P(
     NEON, SatdTest,
-    ::testing::Values(make_tuple(16, &aom_satd_c, &aom_satd_neon),
-                      make_tuple(64, &aom_satd_c, &aom_satd_neon),
-                      make_tuple(256, &aom_satd_c, &aom_satd_neon),
-                      make_tuple(1024, &aom_satd_c, &aom_satd_neon)));
+    ::testing::Values(SatdTestParam<SatdFunc>(16, &aom_satd_c, &aom_satd_neon),
+                      SatdTestParam<SatdFunc>(64, &aom_satd_c, &aom_satd_neon),
+                      SatdTestParam<SatdFunc>(256, &aom_satd_c, &aom_satd_neon),
+                      SatdTestParam<SatdFunc>(1024, &aom_satd_c,
+                                              &aom_satd_neon)));
 INSTANTIATE_TEST_SUITE_P(
     NEON, VectorVarTest,
     ::testing::Values(make_tuple(2, &aom_vector_var_c, &aom_vector_var_neon),
@@ -657,19 +683,101 @@
 #if HAVE_AVX2
 INSTANTIATE_TEST_SUITE_P(
     AVX2, SatdTest,
-    ::testing::Values(make_tuple(16, &aom_satd_c, &aom_satd_avx2),
-                      make_tuple(64, &aom_satd_c, &aom_satd_avx2),
-                      make_tuple(256, &aom_satd_c, &aom_satd_avx2),
-                      make_tuple(1024, &aom_satd_c, &aom_satd_avx2)));
+    ::testing::Values(SatdTestParam<SatdFunc>(16, &aom_satd_c, &aom_satd_avx2),
+                      SatdTestParam<SatdFunc>(64, &aom_satd_c, &aom_satd_avx2),
+                      SatdTestParam<SatdFunc>(256, &aom_satd_c, &aom_satd_avx2),
+                      SatdTestParam<SatdFunc>(1024, &aom_satd_c,
+                                              &aom_satd_avx2)));
 #endif
 
 #if HAVE_SSE2
 INSTANTIATE_TEST_SUITE_P(
     SSE2, SatdTest,
-    ::testing::Values(make_tuple(16, &aom_satd_c, &aom_satd_sse2),
-                      make_tuple(64, &aom_satd_c, &aom_satd_sse2),
-                      make_tuple(256, &aom_satd_c, &aom_satd_sse2),
-                      make_tuple(1024, &aom_satd_c, &aom_satd_sse2)));
+    ::testing::Values(SatdTestParam<SatdFunc>(16, &aom_satd_c, &aom_satd_sse2),
+                      SatdTestParam<SatdFunc>(64, &aom_satd_c, &aom_satd_sse2),
+                      SatdTestParam<SatdFunc>(256, &aom_satd_c, &aom_satd_sse2),
+                      SatdTestParam<SatdFunc>(1024, &aom_satd_c,
+                                              &aom_satd_sse2)));
+#endif
+
+class SatdLpTest : public SatdTestBase<int16_t, SatdLpFunc> {
+ public:
+  SatdLpTest() : SatdTestBase(GetParam()) {}
+};
+
+TEST_P(SatdLpTest, MinValue) {
+  const int kMin = -32640;
+  const int expected = -kMin * satd_size_;
+  FillConstant(kMin);
+  Check(expected);
+}
+TEST_P(SatdLpTest, MaxValue) {
+  const int kMax = 32640;
+  const int expected = kMax * satd_size_;
+  FillConstant(kMax);
+  Check(expected);
+}
+TEST_P(SatdLpTest, Random) {
+  int expected;
+  switch (satd_size_) {
+    case 16: expected = 205298; break;
+    case 64: expected = 1113950; break;
+    case 256: expected = 4268415; break;
+    case 1024: expected = 16954082; break;
+    default:
+      FAIL() << "Invalid satd size (" << satd_size_
+             << ") valid: 16/64/256/1024";
+  }
+  FillRandom();
+  Check(expected);
+}
+TEST_P(SatdLpTest, Match) {
+  FillRandom();
+  RunComparison();
+}
+TEST_P(SatdLpTest, DISABLED_Speed) {
+  FillRandom();
+  RunSpeedTest();
+}
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SatdLpTest);
+
+// Add the following c test to avoid gtest uninitialized warning.
+INSTANTIATE_TEST_SUITE_P(
+    C, SatdLpTest,
+    ::testing::Values(
+        SatdTestParam<SatdLpFunc>(16, &aom_satd_lp_c, &aom_satd_lp_c),
+        SatdTestParam<SatdLpFunc>(64, &aom_satd_lp_c, &aom_satd_lp_c),
+        SatdTestParam<SatdLpFunc>(256, &aom_satd_lp_c, &aom_satd_lp_c),
+        SatdTestParam<SatdLpFunc>(1024, &aom_satd_lp_c, &aom_satd_lp_c)));
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, SatdLpTest,
+    ::testing::Values(
+        SatdTestParam<SatdLpFunc>(16, &aom_satd_lp_c, &aom_satd_lp_neon),
+        SatdTestParam<SatdLpFunc>(64, &aom_satd_lp_c, &aom_satd_lp_neon),
+        SatdTestParam<SatdLpFunc>(256, &aom_satd_lp_c, &aom_satd_lp_neon),
+        SatdTestParam<SatdLpFunc>(1024, &aom_satd_lp_c, &aom_satd_lp_neon)));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, SatdLpTest,
+    ::testing::Values(
+        SatdTestParam<SatdLpFunc>(16, &aom_satd_lp_c, &aom_satd_lp_avx2),
+        SatdTestParam<SatdLpFunc>(64, &aom_satd_lp_c, &aom_satd_lp_avx2),
+        SatdTestParam<SatdLpFunc>(256, &aom_satd_lp_c, &aom_satd_lp_avx2),
+        SatdTestParam<SatdLpFunc>(1024, &aom_satd_lp_c, &aom_satd_lp_avx2)));
+#endif
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, SatdLpTest,
+    ::testing::Values(
+        SatdTestParam<SatdLpFunc>(16, &aom_satd_lp_c, &aom_satd_lp_sse2),
+        SatdTestParam<SatdLpFunc>(64, &aom_satd_lp_c, &aom_satd_lp_sse2),
+        SatdTestParam<SatdLpFunc>(256, &aom_satd_lp_c, &aom_satd_lp_sse2),
+        SatdTestParam<SatdLpFunc>(1024, &aom_satd_lp_c, &aom_satd_lp_sse2)));
 #endif
 
 }  // namespace