Add sse2 for aom_int_pro_{row, col} and tests.

Bitexact with C code.

Change-Id: I7c244aa9f85d68a3ad8be56e06e4a7e8399f7062
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 4daadab..7d3e11a 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -872,12 +872,10 @@
   specialize qw/aom_minmax_8x8 sse2/;
 
   add_proto qw/void aom_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height";
-  # TODO(kyslov@) bring back SSE2 by extending it to 128 block size
-  #specialize qw/aom_int_pro_row sse2/;
+  specialize qw/aom_int_pro_row sse2/;
 
   add_proto qw/int16_t aom_int_pro_col/, "const uint8_t *ref, const int width";
-  # TODO(kyslov@) bring back SSE2 by extending it to 128 block size
-  #specialize qw/aom_int_pro_col sse2/;
+  specialize qw/aom_int_pro_col sse2/;
 
   add_proto qw/int aom_vector_var/, "const int16_t *ref, const int16_t *src, const int bwl";
   # TODO(kyslov@) bring back SSE2 by extending it to 128 block size
diff --git a/aom_dsp/x86/avg_intrin_sse2.c b/aom_dsp/x86/avg_intrin_sse2.c
index 0c20261..3057a71 100644
--- a/aom_dsp/x86/avg_intrin_sse2.c
+++ b/aom_dsp/x86/avg_intrin_sse2.c
@@ -406,3 +406,74 @@
 
   return _mm_cvtsi128_si32(accum);
 }
+
+void aom_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref,
+                          const int ref_stride, const int height) {
+  int idx;
+  __m128i zero = _mm_setzero_si128();
+  __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
+  __m128i s0 = _mm_unpacklo_epi8(src_line, zero);
+  __m128i s1 = _mm_unpackhi_epi8(src_line, zero);
+  __m128i t0, t1;
+  int height_1 = height - 1;
+  ref += ref_stride;
+
+  for (idx = 1; idx < height_1; idx += 2) {
+    src_line = _mm_loadu_si128((const __m128i *)ref);
+    t0 = _mm_unpacklo_epi8(src_line, zero);
+    t1 = _mm_unpackhi_epi8(src_line, zero);
+    s0 = _mm_adds_epu16(s0, t0);
+    s1 = _mm_adds_epu16(s1, t1);
+    ref += ref_stride;
+
+    src_line = _mm_loadu_si128((const __m128i *)ref);
+    t0 = _mm_unpacklo_epi8(src_line, zero);
+    t1 = _mm_unpackhi_epi8(src_line, zero);
+    s0 = _mm_adds_epu16(s0, t0);
+    s1 = _mm_adds_epu16(s1, t1);
+    ref += ref_stride;
+  }
+
+  src_line = _mm_loadu_si128((const __m128i *)ref);
+  t0 = _mm_unpacklo_epi8(src_line, zero);
+  t1 = _mm_unpackhi_epi8(src_line, zero);
+  s0 = _mm_adds_epu16(s0, t0);
+  s1 = _mm_adds_epu16(s1, t1);
+  if (height == 128) {
+    s0 = _mm_srai_epi16(s0, 6);
+    s1 = _mm_srai_epi16(s1, 6);
+  } else if (height == 64) {
+    s0 = _mm_srai_epi16(s0, 5);
+    s1 = _mm_srai_epi16(s1, 5);
+  } else if (height == 32) {
+    s0 = _mm_srai_epi16(s0, 4);
+    s1 = _mm_srai_epi16(s1, 4);
+  } else {
+    s0 = _mm_srai_epi16(s0, 3);
+    s1 = _mm_srai_epi16(s1, 3);
+  }
+
+  _mm_storeu_si128((__m128i *)hbuf, s0);
+  hbuf += 8;
+  _mm_storeu_si128((__m128i *)hbuf, s1);
+}
+
+int16_t aom_int_pro_col_sse2(const uint8_t *ref, const int width) {
+  __m128i zero = _mm_setzero_si128();
+  __m128i src_line = _mm_load_si128((const __m128i *)ref);
+  __m128i s0 = _mm_sad_epu8(src_line, zero);
+  __m128i s1;
+  int i;
+
+  for (i = 16; i < width; i += 16) {
+    ref += 16;
+    src_line = _mm_load_si128((const __m128i *)ref);
+    s1 = _mm_sad_epu8(src_line, zero);
+    s0 = _mm_adds_epu16(s0, s1);
+  }
+
+  s1 = _mm_srli_si128(s0, 8);
+  s0 = _mm_adds_epu16(s0, s1);
+
+  return _mm_extract_epi16(s0, 0);
+}
diff --git a/test/avg_test.cc b/test/avg_test.cc
index 4ed0eea..0dc06bd 100644
--- a/test/avg_test.cc
+++ b/test/avg_test.cc
@@ -135,6 +135,111 @@
   }
 }
 
+typedef void (*IntProRowFunc)(int16_t hbuf[16], uint8_t const *ref,
+                              const int ref_stride, const int height);
+
+typedef std::tuple<int, IntProRowFunc, IntProRowFunc> IntProRowParam;
+
+class IntProRowTest : public AverageTestBase<uint8_t>,
+                      public ::testing::WithParamInterface<IntProRowParam> {
+ public:
+  IntProRowTest()
+      : AverageTestBase(16, GET_PARAM(0)), hbuf_asm_(NULL), hbuf_c_(NULL) {
+    asm_func_ = GET_PARAM(1);
+    c_func_ = GET_PARAM(2);
+  }
+
+ protected:
+  virtual void SetUp() {
+    source_data_ = reinterpret_cast<uint8_t *>(
+        aom_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0])));
+    ASSERT_TRUE(source_data_ != NULL);
+
+    hbuf_asm_ = reinterpret_cast<int16_t *>(
+        aom_memalign(kDataAlignment, sizeof(*hbuf_asm_) * 16));
+    hbuf_c_ = reinterpret_cast<int16_t *>(
+        aom_memalign(kDataAlignment, sizeof(*hbuf_c_) * 16));
+  }
+
+  virtual void TearDown() {
+    aom_free(source_data_);
+    source_data_ = NULL;
+    aom_free(hbuf_c_);
+    hbuf_c_ = NULL;
+    aom_free(hbuf_asm_);
+    hbuf_asm_ = NULL;
+  }
+
+  void RunComparison() {
+    ASM_REGISTER_STATE_CHECK(c_func_(hbuf_c_, source_data_, 0, height_));
+    ASM_REGISTER_STATE_CHECK(asm_func_(hbuf_asm_, source_data_, 0, height_));
+    EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * 16))
+        << "Output mismatch";
+  }
+
+ private:
+  IntProRowFunc asm_func_;
+  IntProRowFunc c_func_;
+  int16_t *hbuf_asm_;
+  int16_t *hbuf_c_;
+};
+
+typedef int16_t (*IntProColFunc)(uint8_t const *ref, const int width);
+
+typedef std::tuple<int, IntProColFunc, IntProColFunc> IntProColParam;
+
+class IntProColTest : public AverageTestBase<uint8_t>,
+                      public ::testing::WithParamInterface<IntProColParam> {
+ public:
+  IntProColTest() : AverageTestBase(GET_PARAM(0), 1), sum_asm_(0), sum_c_(0) {
+    asm_func_ = GET_PARAM(1);
+    c_func_ = GET_PARAM(2);
+  }
+
+ protected:
+  void RunComparison() {
+    ASM_REGISTER_STATE_CHECK(sum_c_ = c_func_(source_data_, width_));
+    ASM_REGISTER_STATE_CHECK(sum_asm_ = asm_func_(source_data_, width_));
+    EXPECT_EQ(sum_c_, sum_asm_) << "Output mismatch";
+  }
+
+ private:
+  IntProColFunc asm_func_;
+  IntProColFunc c_func_;
+  int16_t sum_asm_;
+  int16_t sum_c_;
+};
+
+TEST_P(IntProRowTest, MinValue) {
+  FillConstant(0);
+  RunComparison();
+}
+
+TEST_P(IntProRowTest, MaxValue) {
+  FillConstant(255);
+  RunComparison();
+}
+
+TEST_P(IntProRowTest, Random) {
+  FillRandom();
+  RunComparison();
+}
+
+TEST_P(IntProColTest, MinValue) {
+  FillConstant(0);
+  RunComparison();
+}
+
+TEST_P(IntProColTest, MaxValue) {
+  FillConstant(255);
+  RunComparison();
+}
+
+TEST_P(IntProColTest, Random) {
+  FillRandom();
+  RunComparison();
+}
+
 using std::make_tuple;
 
 INSTANTIATE_TEST_CASE_P(
@@ -151,6 +256,22 @@
                       make_tuple(16, 16, 0, 4, &aom_avg_4x4_sse2),
                       make_tuple(16, 16, 5, 4, &aom_avg_4x4_sse2),
                       make_tuple(32, 32, 15, 4, &aom_avg_4x4_sse2)));
+
+INSTANTIATE_TEST_CASE_P(
+    SSE2, IntProRowTest,
+    ::testing::Values(make_tuple(16, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
+                      make_tuple(32, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
+                      make_tuple(64, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
+                      make_tuple(128, &aom_int_pro_row_sse2,
+                                 &aom_int_pro_row_c)));
+
+INSTANTIATE_TEST_CASE_P(
+    SSE2, IntProColTest,
+    ::testing::Values(make_tuple(16, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
+                      make_tuple(32, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
+                      make_tuple(64, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
+                      make_tuple(128, &aom_int_pro_col_sse2,
+                                 &aom_int_pro_col_c)));
 #endif
 
 #if HAVE_NEON