rtc: Add AVX2 variant for functions related to motion search
This CL Adds AVX2 for aom_int_pro_row() and aom_int_pro_col()
functions. Also, refactored the existing code to make it AVX2
friendly.
The overall encode time reduction for RT preset is listed below
Encode_time
cpu Reduction(%)
7 1.150
8 1.331
Change-Id: Idebe6dd72933674148bcc41785f9b42d93dc2f11
diff --git a/test/avg_test.cc b/test/avg_test.cc
index 93f4c34..bcbf1b9 100644
--- a/test/avg_test.cc
+++ b/test/avg_test.cc
@@ -43,7 +43,7 @@
protected:
// Handle blocks up to 4 blocks 64x64 with stride up to 128
static const int kDataAlignment = 16;
- static const int kDataBlockSize = 64 * 128;
+ static const int kDataBlockSize = 128 * 128;
virtual void SetUp() {
const testing::TestInfo *const test_info =
@@ -343,20 +343,32 @@
}
#endif // CONFIG_AV1_HIGHBITDEPTH
-typedef void (*IntProRowFunc)(int16_t hbuf[16], uint8_t const *ref,
- const int ref_stride, const int height);
+typedef void (*IntProRowFunc)(int16_t *hbuf, uint8_t const *ref,
+ const int ref_stride, const int width,
+ const int height, int norm_factor);
-// Params: height, asm function, c function.
-typedef std::tuple<int, IntProRowFunc, IntProRowFunc> IntProRowParam;
+// Params: width, height, asm function, c function.
+typedef std::tuple<int, int, IntProRowFunc, IntProRowFunc> IntProRowParam;
class IntProRowTest : public AverageTestBase<uint8_t>,
public ::testing::WithParamInterface<IntProRowParam> {
public:
IntProRowTest()
- : AverageTestBase(16, GET_PARAM(0)), hbuf_asm_(nullptr),
+ : AverageTestBase(GET_PARAM(0), GET_PARAM(1)), hbuf_asm_(nullptr),
hbuf_c_(nullptr) {
- asm_func_ = GET_PARAM(1);
- c_func_ = GET_PARAM(2);
+ asm_func_ = GET_PARAM(2);
+ c_func_ = GET_PARAM(3);
+ }
+
+ void set_norm_factor() {
+ if (height_ == 128)
+ norm_factor_ = 6;
+ else if (height_ == 64)
+ norm_factor_ = 5;
+ else if (height_ == 32)
+ norm_factor_ = 4;
+ else if (height_ == 16)
+ norm_factor_ = 3;
}
protected:
@@ -366,10 +378,10 @@
ASSERT_NE(source_data_, nullptr);
hbuf_asm_ = static_cast<int16_t *>(
- aom_memalign(kDataAlignment, sizeof(*hbuf_asm_) * 16));
+ aom_memalign(kDataAlignment, sizeof(*hbuf_asm_) * width_));
ASSERT_NE(hbuf_asm_, nullptr);
hbuf_c_ = static_cast<int16_t *>(
- aom_memalign(kDataAlignment, sizeof(*hbuf_c_) * 16));
+ aom_memalign(kDataAlignment, sizeof(*hbuf_c_) * width_));
ASSERT_NE(hbuf_c_, nullptr);
}
@@ -383,19 +395,24 @@
}
void RunComparison() {
- API_REGISTER_STATE_CHECK(c_func_(hbuf_c_, source_data_, 0, height_));
- API_REGISTER_STATE_CHECK(asm_func_(hbuf_asm_, source_data_, 0, height_));
- EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * 16))
+ set_norm_factor();
+ API_REGISTER_STATE_CHECK(
+ c_func_(hbuf_c_, source_data_, width_, width_, height_, norm_factor_));
+ API_REGISTER_STATE_CHECK(asm_func_(hbuf_asm_, source_data_, width_, width_,
+ height_, norm_factor_));
+ EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * width_))
<< "Output mismatch\n";
}
void RunSpeedTest() {
const int numIter = 5000000;
- printf("Height = %d number of iteration is %d \n", height_, numIter);
+ set_norm_factor();
+ printf("Blk_Size=%dx%d: number of iteration is %d \n", width_, height_,
+ numIter);
aom_usec_timer c_timer_;
aom_usec_timer_start(&c_timer_);
for (int i = 0; i < numIter; i++) {
- c_func_(hbuf_c_, source_data_, 0, height_);
+ c_func_(hbuf_c_, source_data_, width_, width_, height_, norm_factor_);
}
aom_usec_timer_mark(&c_timer_);
@@ -403,7 +420,7 @@
aom_usec_timer_start(&asm_timer_);
for (int i = 0; i < numIter; i++) {
- asm_func_(hbuf_asm_, source_data_, 0, height_);
+ asm_func_(hbuf_asm_, source_data_, width_, width_, height_, norm_factor_);
}
aom_usec_timer_mark(&asm_timer_);
@@ -415,7 +432,7 @@
asm_sum_time,
(static_cast<float>(c_sum_time) / static_cast<float>(asm_sum_time)));
- EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * 16))
+ EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * width_))
<< "Output mismatch\n";
}
@@ -424,35 +441,68 @@
IntProRowFunc c_func_;
int16_t *hbuf_asm_;
int16_t *hbuf_c_;
+ int norm_factor_;
};
GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(IntProRowTest);
-typedef int16_t (*IntProColFunc)(uint8_t const *ref, const int width);
+typedef void (*IntProColFunc)(int16_t *vbuf, uint8_t const *ref,
+ const int ref_stride, const int width,
+ const int height, int norm_factor);
-// Params: width, asm function, c function.
-typedef std::tuple<int, IntProColFunc, IntProColFunc> IntProColParam;
+// Params: width, height, asm function, c function.
+typedef std::tuple<int, int, IntProColFunc, IntProColFunc> IntProColParam;
class IntProColTest : public AverageTestBase<uint8_t>,
public ::testing::WithParamInterface<IntProColParam> {
public:
- IntProColTest() : AverageTestBase(GET_PARAM(0), 1), sum_asm_(0), sum_c_(0) {
- asm_func_ = GET_PARAM(1);
- c_func_ = GET_PARAM(2);
+ IntProColTest()
+ : AverageTestBase(GET_PARAM(0), GET_PARAM(1)), vbuf_asm_(nullptr),
+ vbuf_c_(nullptr) {
+ asm_func_ = GET_PARAM(2);
+ c_func_ = GET_PARAM(3);
}
protected:
+ virtual void SetUp() {
+ source_data_ = static_cast<uint8_t *>(
+ aom_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0])));
+ ASSERT_NE(source_data_, nullptr);
+
+ vbuf_asm_ = static_cast<int16_t *>(
+ aom_memalign(kDataAlignment, sizeof(*vbuf_asm_) * width_));
+ ASSERT_NE(vbuf_asm_, nullptr);
+ vbuf_c_ = static_cast<int16_t *>(
+ aom_memalign(kDataAlignment, sizeof(*vbuf_c_) * width_));
+ ASSERT_NE(vbuf_c_, nullptr);
+ }
+
+ virtual void TearDown() {
+ aom_free(source_data_);
+ source_data_ = nullptr;
+ aom_free(vbuf_c_);
+ vbuf_c_ = nullptr;
+ aom_free(vbuf_asm_);
+ vbuf_asm_ = nullptr;
+ }
+
void RunComparison() {
- API_REGISTER_STATE_CHECK(sum_c_ = c_func_(source_data_, width_));
- API_REGISTER_STATE_CHECK(sum_asm_ = asm_func_(source_data_, width_));
- EXPECT_EQ(sum_c_, sum_asm_) << "Output mismatch";
+ int norm_factor_ = 3 + (width_ >> 5);
+ API_REGISTER_STATE_CHECK(
+ c_func_(vbuf_c_, source_data_, width_, width_, height_, norm_factor_));
+ API_REGISTER_STATE_CHECK(asm_func_(vbuf_asm_, source_data_, width_, width_,
+ height_, norm_factor_));
+ EXPECT_EQ(0, memcmp(vbuf_c_, vbuf_asm_, sizeof(*vbuf_c_) * height_))
+ << "Output mismatch\n";
}
void RunSpeedTest() {
const int numIter = 5000000;
- printf("Width = %d number of iteration is %d \n", width_, numIter);
+ printf("Blk_Size=%dx%d: number of iteration is %d \n", width_, height_,
+ numIter);
+ int norm_factor_ = 3 + (width_ >> 5);
aom_usec_timer c_timer_;
aom_usec_timer_start(&c_timer_);
for (int i = 0; i < numIter; i++) {
- sum_c_ = c_func_(source_data_, width_);
+ c_func_(vbuf_c_, source_data_, width_, width_, height_, norm_factor_);
}
aom_usec_timer_mark(&c_timer_);
@@ -460,7 +510,7 @@
aom_usec_timer_start(&asm_timer_);
for (int i = 0; i < numIter; i++) {
- sum_asm_ = asm_func_(source_data_, width_);
+ asm_func_(vbuf_asm_, source_data_, width_, width_, height_, norm_factor_);
}
aom_usec_timer_mark(&asm_timer_);
@@ -472,14 +522,15 @@
asm_sum_time,
(static_cast<float>(c_sum_time) / static_cast<float>(asm_sum_time)));
- EXPECT_EQ(sum_c_, sum_asm_) << "Output mismatch \n";
+ EXPECT_EQ(0, memcmp(vbuf_c_, vbuf_asm_, sizeof(*vbuf_c_) * height_))
+ << "Output mismatch\n";
}
private:
IntProColFunc asm_func_;
IntProColFunc c_func_;
- int16_t sum_asm_;
- int16_t sum_c_;
+ int16_t *vbuf_asm_;
+ int16_t *vbuf_c_;
};
GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(IntProColTest);
@@ -703,19 +754,19 @@
INSTANTIATE_TEST_SUITE_P(
SSE2, IntProRowTest,
- ::testing::Values(make_tuple(16, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
- make_tuple(32, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
- make_tuple(64, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
- make_tuple(128, &aom_int_pro_row_sse2,
- &aom_int_pro_row_c)));
+ ::testing::Values(
+ make_tuple(16, 16, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
+ make_tuple(32, 32, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
+ make_tuple(64, 64, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
+ make_tuple(128, 128, &aom_int_pro_row_sse2, &aom_int_pro_row_c)));
INSTANTIATE_TEST_SUITE_P(
SSE2, IntProColTest,
- ::testing::Values(make_tuple(16, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
- make_tuple(32, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
- make_tuple(64, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
- make_tuple(128, &aom_int_pro_col_sse2,
- &aom_int_pro_col_c)));
+ ::testing::Values(
+ make_tuple(16, 16, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
+ make_tuple(32, 32, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
+ make_tuple(64, 64, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
+ make_tuple(128, 128, &aom_int_pro_col_sse2, &aom_int_pro_col_c)));
#endif
#if HAVE_AVX2
@@ -724,6 +775,22 @@
::testing::Values(make_tuple(16, 16, 8, 0, 16, &aom_avg_8x8_quad_avx2),
make_tuple(32, 32, 8, 16, 16, &aom_avg_8x8_quad_avx2),
make_tuple(32, 32, 8, 8, 16, &aom_avg_8x8_quad_avx2)));
+
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, IntProRowTest,
+ ::testing::Values(
+ make_tuple(16, 16, &aom_int_pro_row_avx2, &aom_int_pro_row_c),
+ make_tuple(32, 32, &aom_int_pro_row_avx2, &aom_int_pro_row_c),
+ make_tuple(64, 64, &aom_int_pro_row_avx2, &aom_int_pro_row_c),
+ make_tuple(128, 128, &aom_int_pro_row_avx2, &aom_int_pro_row_c)));
+
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, IntProColTest,
+ ::testing::Values(
+ make_tuple(16, 16, &aom_int_pro_col_avx2, &aom_int_pro_col_c),
+ make_tuple(32, 32, &aom_int_pro_col_avx2, &aom_int_pro_col_c),
+ make_tuple(64, 64, &aom_int_pro_col_avx2, &aom_int_pro_col_c),
+ make_tuple(128, 128, &aom_int_pro_col_avx2, &aom_int_pro_col_c)));
#endif
#if HAVE_NEON
@@ -737,19 +804,19 @@
make_tuple(32, 32, 8, 15, 4, &aom_avg_4x4_neon)));
INSTANTIATE_TEST_SUITE_P(
NEON, IntProRowTest,
- ::testing::Values(make_tuple(16, &aom_int_pro_row_neon, &aom_int_pro_row_c),
- make_tuple(32, &aom_int_pro_row_neon, &aom_int_pro_row_c),
- make_tuple(64, &aom_int_pro_row_neon, &aom_int_pro_row_c),
- make_tuple(128, &aom_int_pro_row_neon,
- &aom_int_pro_row_c)));
+ ::testing::Values(
+ make_tuple(16, 16, &aom_int_pro_row_neon, &aom_int_pro_row_c),
+ make_tuple(32, 32, &aom_int_pro_row_neon, &aom_int_pro_row_c),
+ make_tuple(64, 64, &aom_int_pro_row_neon, &aom_int_pro_row_c),
+ make_tuple(128, 128, &aom_int_pro_row_neon, &aom_int_pro_row_c)));
INSTANTIATE_TEST_SUITE_P(
NEON, IntProColTest,
- ::testing::Values(make_tuple(16, &aom_int_pro_col_neon, &aom_int_pro_col_c),
- make_tuple(32, &aom_int_pro_col_neon, &aom_int_pro_col_c),
- make_tuple(64, &aom_int_pro_col_neon, &aom_int_pro_col_c),
- make_tuple(128, &aom_int_pro_col_neon,
- &aom_int_pro_col_c)));
+ ::testing::Values(
+ make_tuple(16, 16, &aom_int_pro_col_neon, &aom_int_pro_col_c),
+ make_tuple(32, 32, &aom_int_pro_col_neon, &aom_int_pro_col_c),
+ make_tuple(64, 64, &aom_int_pro_col_neon, &aom_int_pro_col_c),
+ make_tuple(128, 128, &aom_int_pro_col_neon, &aom_int_pro_col_c)));
INSTANTIATE_TEST_SUITE_P(
NEON, AvgTest8bpp_avg_8x8_quad,