Add sse2 for aom_int_pro_{row, col} and tests.
Bitexact with C code.
Change-Id: I7c244aa9f85d68a3ad8be56e06e4a7e8399f7062
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 4daadab..7d3e11a 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -872,12 +872,10 @@
specialize qw/aom_minmax_8x8 sse2/;
add_proto qw/void aom_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height";
- # TODO(kyslov@) bring back SSE2 by extending it to 128 block size
- #specialize qw/aom_int_pro_row sse2/;
+ specialize qw/aom_int_pro_row sse2/;
add_proto qw/int16_t aom_int_pro_col/, "const uint8_t *ref, const int width";
- # TODO(kyslov@) bring back SSE2 by extending it to 128 block size
- #specialize qw/aom_int_pro_col sse2/;
+ specialize qw/aom_int_pro_col sse2/;
add_proto qw/int aom_vector_var/, "const int16_t *ref, const int16_t *src, const int bwl";
# TODO(kyslov@) bring back SSE2 by extending it to 128 block size
diff --git a/aom_dsp/x86/avg_intrin_sse2.c b/aom_dsp/x86/avg_intrin_sse2.c
index 0c20261..3057a71 100644
--- a/aom_dsp/x86/avg_intrin_sse2.c
+++ b/aom_dsp/x86/avg_intrin_sse2.c
@@ -406,3 +406,74 @@
return _mm_cvtsi128_si32(accum);
}
+
+void aom_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref,
+ const int ref_stride, const int height) {
+ int idx;
+ __m128i zero = _mm_setzero_si128();
+ __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
+ __m128i s0 = _mm_unpacklo_epi8(src_line, zero);
+ __m128i s1 = _mm_unpackhi_epi8(src_line, zero);
+ __m128i t0, t1;
+ int height_1 = height - 1;
+ ref += ref_stride;
+
+ for (idx = 1; idx < height_1; idx += 2) {
+ src_line = _mm_loadu_si128((const __m128i *)ref);
+ t0 = _mm_unpacklo_epi8(src_line, zero);
+ t1 = _mm_unpackhi_epi8(src_line, zero);
+ s0 = _mm_adds_epu16(s0, t0);
+ s1 = _mm_adds_epu16(s1, t1);
+ ref += ref_stride;
+
+ src_line = _mm_loadu_si128((const __m128i *)ref);
+ t0 = _mm_unpacklo_epi8(src_line, zero);
+ t1 = _mm_unpackhi_epi8(src_line, zero);
+ s0 = _mm_adds_epu16(s0, t0);
+ s1 = _mm_adds_epu16(s1, t1);
+ ref += ref_stride;
+ }
+
+ src_line = _mm_loadu_si128((const __m128i *)ref);
+ t0 = _mm_unpacklo_epi8(src_line, zero);
+ t1 = _mm_unpackhi_epi8(src_line, zero);
+ s0 = _mm_adds_epu16(s0, t0);
+ s1 = _mm_adds_epu16(s1, t1);
+ if (height == 128) {
+ s0 = _mm_srai_epi16(s0, 6);
+ s1 = _mm_srai_epi16(s1, 6);
+ } else if (height == 64) {
+ s0 = _mm_srai_epi16(s0, 5);
+ s1 = _mm_srai_epi16(s1, 5);
+ } else if (height == 32) {
+ s0 = _mm_srai_epi16(s0, 4);
+ s1 = _mm_srai_epi16(s1, 4);
+ } else {
+ s0 = _mm_srai_epi16(s0, 3);
+ s1 = _mm_srai_epi16(s1, 3);
+ }
+
+ _mm_storeu_si128((__m128i *)hbuf, s0);
+ hbuf += 8;
+ _mm_storeu_si128((__m128i *)hbuf, s1);
+}
+
+int16_t aom_int_pro_col_sse2(const uint8_t *ref, const int width) {
+ __m128i zero = _mm_setzero_si128();
+ __m128i src_line = _mm_load_si128((const __m128i *)ref);
+ __m128i s0 = _mm_sad_epu8(src_line, zero);
+ __m128i s1;
+ int i;
+
+ for (i = 16; i < width; i += 16) {
+ ref += 16;
+ src_line = _mm_load_si128((const __m128i *)ref);
+ s1 = _mm_sad_epu8(src_line, zero);
+ s0 = _mm_adds_epu16(s0, s1);
+ }
+
+ s1 = _mm_srli_si128(s0, 8);
+ s0 = _mm_adds_epu16(s0, s1);
+
+ return _mm_extract_epi16(s0, 0);
+}
diff --git a/test/avg_test.cc b/test/avg_test.cc
index 4ed0eea..0dc06bd 100644
--- a/test/avg_test.cc
+++ b/test/avg_test.cc
@@ -135,6 +135,111 @@
}
}
+typedef void (*IntProRowFunc)(int16_t hbuf[16], uint8_t const *ref,
+ const int ref_stride, const int height);
+
+typedef std::tuple<int, IntProRowFunc, IntProRowFunc> IntProRowParam;
+
+class IntProRowTest : public AverageTestBase<uint8_t>,
+ public ::testing::WithParamInterface<IntProRowParam> {
+ public:
+ IntProRowTest()
+ : AverageTestBase(16, GET_PARAM(0)), hbuf_asm_(NULL), hbuf_c_(NULL) {
+ asm_func_ = GET_PARAM(1);
+ c_func_ = GET_PARAM(2);
+ }
+
+ protected:
+ virtual void SetUp() {
+ source_data_ = reinterpret_cast<uint8_t *>(
+ aom_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0])));
+ ASSERT_TRUE(source_data_ != NULL);
+
+ hbuf_asm_ = reinterpret_cast<int16_t *>(
+ aom_memalign(kDataAlignment, sizeof(*hbuf_asm_) * 16));
+ hbuf_c_ = reinterpret_cast<int16_t *>(
+ aom_memalign(kDataAlignment, sizeof(*hbuf_c_) * 16));
+ }
+
+ virtual void TearDown() {
+ aom_free(source_data_);
+ source_data_ = NULL;
+ aom_free(hbuf_c_);
+ hbuf_c_ = NULL;
+ aom_free(hbuf_asm_);
+ hbuf_asm_ = NULL;
+ }
+
+ void RunComparison() {
+ ASM_REGISTER_STATE_CHECK(c_func_(hbuf_c_, source_data_, 0, height_));
+ ASM_REGISTER_STATE_CHECK(asm_func_(hbuf_asm_, source_data_, 0, height_));
+ EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * 16))
+ << "Output mismatch";
+ }
+
+ private:
+ IntProRowFunc asm_func_;
+ IntProRowFunc c_func_;
+ int16_t *hbuf_asm_;
+ int16_t *hbuf_c_;
+};
+
+typedef int16_t (*IntProColFunc)(uint8_t const *ref, const int width);
+
+typedef std::tuple<int, IntProColFunc, IntProColFunc> IntProColParam;
+
+class IntProColTest : public AverageTestBase<uint8_t>,
+ public ::testing::WithParamInterface<IntProColParam> {
+ public:
+ IntProColTest() : AverageTestBase(GET_PARAM(0), 1), sum_asm_(0), sum_c_(0) {
+ asm_func_ = GET_PARAM(1);
+ c_func_ = GET_PARAM(2);
+ }
+
+ protected:
+ void RunComparison() {
+ ASM_REGISTER_STATE_CHECK(sum_c_ = c_func_(source_data_, width_));
+ ASM_REGISTER_STATE_CHECK(sum_asm_ = asm_func_(source_data_, width_));
+ EXPECT_EQ(sum_c_, sum_asm_) << "Output mismatch";
+ }
+
+ private:
+ IntProColFunc asm_func_;
+ IntProColFunc c_func_;
+ int16_t sum_asm_;
+ int16_t sum_c_;
+};
+
+TEST_P(IntProRowTest, MinValue) {
+ FillConstant(0);
+ RunComparison();
+}
+
+TEST_P(IntProRowTest, MaxValue) {
+ FillConstant(255);
+ RunComparison();
+}
+
+TEST_P(IntProRowTest, Random) {
+ FillRandom();
+ RunComparison();
+}
+
+TEST_P(IntProColTest, MinValue) {
+ FillConstant(0);
+ RunComparison();
+}
+
+TEST_P(IntProColTest, MaxValue) {
+ FillConstant(255);
+ RunComparison();
+}
+
+TEST_P(IntProColTest, Random) {
+ FillRandom();
+ RunComparison();
+}
+
using std::make_tuple;
INSTANTIATE_TEST_CASE_P(
@@ -151,6 +256,22 @@
make_tuple(16, 16, 0, 4, &aom_avg_4x4_sse2),
make_tuple(16, 16, 5, 4, &aom_avg_4x4_sse2),
make_tuple(32, 32, 15, 4, &aom_avg_4x4_sse2)));
+
+INSTANTIATE_TEST_CASE_P(
+ SSE2, IntProRowTest,
+ ::testing::Values(make_tuple(16, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
+ make_tuple(32, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
+ make_tuple(64, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
+ make_tuple(128, &aom_int_pro_row_sse2,
+ &aom_int_pro_row_c)));
+
+INSTANTIATE_TEST_CASE_P(
+ SSE2, IntProColTest,
+ ::testing::Values(make_tuple(16, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
+ make_tuple(32, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
+ make_tuple(64, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
+ make_tuple(128, &aom_int_pro_col_sse2,
+ &aom_int_pro_col_c)));
#endif
#if HAVE_NEON