Add SSE2 optimization for satd_lp
Speedups (SSE2 vs. C) at the function level:
47% (satd_size = 16)
50% (satd_size = 64)
70% (satd_size = 256)
46% (satd_size = 1024)
Change-Id: Ib669dac0dc2c3a17a20f09acc4603519dbbe0ecd
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index a2b5f05..b39bfaa 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -1130,7 +1130,7 @@
specialize qw/aom_satd neon sse2 avx2/;
add_proto qw/int aom_satd_lp/, "const int16_t *coeff, int length";
- specialize qw/aom_satd_lp avx2 neon/;
+ specialize qw/aom_satd_lp sse2 avx2 neon/;
#
diff --git a/aom_dsp/x86/avg_intrin_sse2.c b/aom_dsp/x86/avg_intrin_sse2.c
index 18c4ca7..67ea85b 100644
--- a/aom_dsp/x86/avg_intrin_sse2.c
+++ b/aom_dsp/x86/avg_intrin_sse2.c
@@ -483,6 +483,35 @@
return _mm_cvtsi128_si32(accum);
}
+int aom_satd_lp_sse2(const int16_t *coeff, int length) {  // returns the sum of |coeff[i]|; consumes 16 values per iteration, so a length that is not a multiple of 16 is rounded up
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);  // multiplier used to widen via madd
+ __m128i accum = zero;  // four 32-bit partial sums
+
+ for (int i = 0; i < length; i += 16) {
+ const __m128i src_line0 = _mm_loadu_si128((const __m128i *)coeff);  // coeff[0..7], unaligned load
+ const __m128i src_line1 = _mm_loadu_si128((const __m128i *)(coeff + 8));  // coeff[8..15]
+ const __m128i inv0 = _mm_sub_epi16(zero, src_line0);  // -x; NOTE(review): wraps for x == -32768, so abs stays -32768 there
+ const __m128i inv1 = _mm_sub_epi16(zero, src_line1);  // confirm callers never pass -32768
+ const __m128i abs0 = _mm_max_epi16(src_line0, inv0); // abs(src_line0) as max(x, -x)
+ const __m128i abs1 = _mm_max_epi16(src_line1, inv1); // abs(src_line1) as max(x, -x)
+ const __m128i sum0 = _mm_madd_epi16(abs0, one);  // x * 1, adjacent pairs added: 8 x 16-bit -> 4 x 32-bit
+ const __m128i sum1 = _mm_madd_epi16(abs1, one);
+ accum = _mm_add_epi32(accum, sum0);
+ accum = _mm_add_epi32(accum, sum1);
+ coeff += 16;
+ }
+
+ { // horizontal add of the four 32-bit lanes of accum
+ __m128i hi = _mm_srli_si128(accum, 8);  // move the upper 64 bits down
+ accum = _mm_add_epi32(accum, hi);
+ hi = _mm_srli_epi64(accum, 32);  // move lane 1 onto lane 0
+ accum = _mm_add_epi32(accum, hi);
+ }
+
+ return _mm_cvtsi128_si32(accum);  // the total is in lane 0
+}
+
void aom_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref,
const int ref_stride, const int height) {
int idx = 1;
diff --git a/test/avg_test.cc b/test/avg_test.cc
index 9c20714..11ddafc 100644
--- a/test/avg_test.cc
+++ b/test/avg_test.cc
@@ -521,22 +521,35 @@
#endif
typedef int (*SatdFunc)(const tran_low_t *coeffs, int length);
-typedef ::testing::tuple<int, SatdFunc, SatdFunc> SatdTestParam;
-class SatdTest : public ::testing::Test,
- public ::testing::WithParamInterface<SatdTestParam> {
- protected:
- virtual void SetUp() {
- satd_size_ = GET_PARAM(0);
- satd_func_ref_ = GET_PARAM(1);
- satd_func_simd_ = GET_PARAM(2);
+typedef int (*SatdLpFunc)(const int16_t *coeffs, int length);
+template <typename SatdFuncType>
+struct SatdTestParam {  // one test parameter: a size plus the two functions to compare
+ SatdTestParam(int s, SatdFuncType f1, SatdFuncType f2)
+ : satd_size(s), func_ref(f1), func_simd(f2) {}
+ int satd_size;  // number of coefficients in the test buffer
+ SatdFuncType func_ref;  // reference implementation
+ SatdFuncType func_simd;  // implementation under test
+};
+
+template <typename CoeffType, typename SatdFuncType>
+class SatdTestBase
+ : public ::testing::Test,
+ public ::testing::WithParamInterface<SatdTestParam<SatdFuncType>> {
+ protected:
+ explicit SatdTestBase(const SatdTestParam<SatdFuncType> &func_param) {
+ satd_size_ = func_param.satd_size;
+ satd_func_ref_ = func_param.func_ref;
+ satd_func_simd_ = func_param.func_simd;
+ }
+ virtual void SetUp() {
rnd_.Reset(ACMRandom::DeterministicSeed());
- src_ = reinterpret_cast<tran_low_t *>(
+ src_ = reinterpret_cast<CoeffType *>(
aom_memalign(32, sizeof(*src_) * satd_size_));
ASSERT_TRUE(src_ != NULL);
}
virtual void TearDown() { aom_free(src_); }
- void FillConstant(const tran_low_t val) {
+ void FillConstant(const CoeffType val) {
for (int i = 0; i < satd_size_; ++i) src_[i] = val;
}
void FillRandom() {
@@ -597,12 +610,17 @@
int satd_size_;
private:
- tran_low_t *src_;
- SatdFunc satd_func_ref_;
- SatdFunc satd_func_simd_;
+ CoeffType *src_;
+ SatdFuncType satd_func_ref_;
+ SatdFuncType satd_func_simd_;
ACMRandom rnd_;
};
+class SatdTest : public SatdTestBase<tran_low_t, SatdFunc> {  // fixture for functions taking tran_low_t coefficients
+ public:
+ SatdTest() : SatdTestBase(GetParam()) {}  // hands the gtest parameter to the base constructor
+};
+
TEST_P(SatdTest, MinValue) {
const int kMin = -32640;
const int expected = -kMin * satd_size_;
@@ -639,13 +657,21 @@
}
GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SatdTest);
+INSTANTIATE_TEST_SUITE_P(
+ C, SatdTest,
+ ::testing::Values(SatdTestParam<SatdFunc>(16, &aom_satd_c, &aom_satd_c),
+ SatdTestParam<SatdFunc>(64, &aom_satd_c, &aom_satd_c),
+ SatdTestParam<SatdFunc>(256, &aom_satd_c, &aom_satd_c),
+ SatdTestParam<SatdFunc>(1024, &aom_satd_c, &aom_satd_c)));
+
#if HAVE_NEON
INSTANTIATE_TEST_SUITE_P(
NEON, SatdTest,
- ::testing::Values(make_tuple(16, &aom_satd_c, &aom_satd_neon),
- make_tuple(64, &aom_satd_c, &aom_satd_neon),
- make_tuple(256, &aom_satd_c, &aom_satd_neon),
- make_tuple(1024, &aom_satd_c, &aom_satd_neon)));
+ ::testing::Values(SatdTestParam<SatdFunc>(16, &aom_satd_c, &aom_satd_neon),
+ SatdTestParam<SatdFunc>(64, &aom_satd_c, &aom_satd_neon),
+ SatdTestParam<SatdFunc>(256, &aom_satd_c, &aom_satd_neon),
+ SatdTestParam<SatdFunc>(1024, &aom_satd_c,
+ &aom_satd_neon)));
INSTANTIATE_TEST_SUITE_P(
NEON, VectorVarTest,
::testing::Values(make_tuple(2, &aom_vector_var_c, &aom_vector_var_neon),
@@ -657,19 +683,101 @@
#if HAVE_AVX2
INSTANTIATE_TEST_SUITE_P(
AVX2, SatdTest,
- ::testing::Values(make_tuple(16, &aom_satd_c, &aom_satd_avx2),
- make_tuple(64, &aom_satd_c, &aom_satd_avx2),
- make_tuple(256, &aom_satd_c, &aom_satd_avx2),
- make_tuple(1024, &aom_satd_c, &aom_satd_avx2)));
+ ::testing::Values(SatdTestParam<SatdFunc>(16, &aom_satd_c, &aom_satd_avx2),
+ SatdTestParam<SatdFunc>(64, &aom_satd_c, &aom_satd_avx2),
+ SatdTestParam<SatdFunc>(256, &aom_satd_c, &aom_satd_avx2),
+ SatdTestParam<SatdFunc>(1024, &aom_satd_c,
+ &aom_satd_avx2)));
#endif
#if HAVE_SSE2
INSTANTIATE_TEST_SUITE_P(
SSE2, SatdTest,
- ::testing::Values(make_tuple(16, &aom_satd_c, &aom_satd_sse2),
- make_tuple(64, &aom_satd_c, &aom_satd_sse2),
- make_tuple(256, &aom_satd_c, &aom_satd_sse2),
- make_tuple(1024, &aom_satd_c, &aom_satd_sse2)));
+ ::testing::Values(SatdTestParam<SatdFunc>(16, &aom_satd_c, &aom_satd_sse2),
+ SatdTestParam<SatdFunc>(64, &aom_satd_c, &aom_satd_sse2),
+ SatdTestParam<SatdFunc>(256, &aom_satd_c, &aom_satd_sse2),
+ SatdTestParam<SatdFunc>(1024, &aom_satd_c,
+ &aom_satd_sse2)));
+#endif
+
+class SatdLpTest : public SatdTestBase<int16_t, SatdLpFunc> {  // fixture for functions taking int16_t coefficients
+ public:
+ SatdLpTest() : SatdTestBase(GetParam()) {}  // hands the gtest parameter to the base constructor
+};
+
+TEST_P(SatdLpTest, MinValue) {  // every coefficient set to the same negative value
+ const int kMin = -32640;
+ const int expected = -kMin * satd_size_;  // |kMin| summed over satd_size_ elements
+ FillConstant(kMin);
+ Check(expected);
+}
+TEST_P(SatdLpTest, MaxValue) {  // every coefficient set to the same positive value
+ const int kMax = 32640;
+ const int expected = kMax * satd_size_;  // kMax summed over satd_size_ elements
+ FillConstant(kMax);
+ Check(expected);
+}
+TEST_P(SatdLpTest, Random) {
+ int expected;  // assigned by every case that does not fail
+ switch (satd_size_) {  // NOTE(review): values look like golden sums for the seeded FillRandom() data; confirm
+ case 16: expected = 205298; break;
+ case 64: expected = 1113950; break;
+ case 256: expected = 4268415; break;
+ case 1024: expected = 16954082; break;
+ default:  // FAIL() is fatal and returns, so `expected` is never read unset
+ FAIL() << "Invalid satd size (" << satd_size_
+ << ") valid: 16/64/256/1024";
+ }
+ FillRandom();
+ Check(expected);
+}
+TEST_P(SatdLpTest, Match) {
+ FillRandom();
+ RunComparison();
+}
+TEST_P(SatdLpTest, DISABLED_Speed) {
+ FillRandom();
+ RunSpeedTest();
+}
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SatdLpTest);
+
+// Add the following C test to avoid the gtest uninstantiated-test warning.
+INSTANTIATE_TEST_SUITE_P(
+ C, SatdLpTest,
+ ::testing::Values(
+ SatdTestParam<SatdLpFunc>(16, &aom_satd_lp_c, &aom_satd_lp_c),
+ SatdTestParam<SatdLpFunc>(64, &aom_satd_lp_c, &aom_satd_lp_c),
+ SatdTestParam<SatdLpFunc>(256, &aom_satd_lp_c, &aom_satd_lp_c),
+ SatdTestParam<SatdLpFunc>(1024, &aom_satd_lp_c, &aom_satd_lp_c)));
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, SatdLpTest,
+ ::testing::Values(
+ SatdTestParam<SatdLpFunc>(16, &aom_satd_lp_c, &aom_satd_lp_neon),
+ SatdTestParam<SatdLpFunc>(64, &aom_satd_lp_c, &aom_satd_lp_neon),
+ SatdTestParam<SatdLpFunc>(256, &aom_satd_lp_c, &aom_satd_lp_neon),
+ SatdTestParam<SatdLpFunc>(1024, &aom_satd_lp_c, &aom_satd_lp_neon)));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, SatdLpTest,
+ ::testing::Values(
+ SatdTestParam<SatdLpFunc>(16, &aom_satd_lp_c, &aom_satd_lp_avx2),
+ SatdTestParam<SatdLpFunc>(64, &aom_satd_lp_c, &aom_satd_lp_avx2),
+ SatdTestParam<SatdLpFunc>(256, &aom_satd_lp_c, &aom_satd_lp_avx2),
+ SatdTestParam<SatdLpFunc>(1024, &aom_satd_lp_c, &aom_satd_lp_avx2)));
+#endif
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+ SSE2, SatdLpTest,
+ ::testing::Values(
+ SatdTestParam<SatdLpFunc>(16, &aom_satd_lp_c, &aom_satd_lp_sse2),
+ SatdTestParam<SatdLpFunc>(64, &aom_satd_lp_c, &aom_satd_lp_sse2),
+ SatdTestParam<SatdLpFunc>(256, &aom_satd_lp_c, &aom_satd_lp_sse2),
+ SatdTestParam<SatdLpFunc>(1024, &aom_satd_lp_c, &aom_satd_lp_sse2)));
#endif
} // namespace