Add NEON version aom_int_pro_row/col and aom_satd
Speedup
NEON/Satd*Speed
size = 16 Gain = 1.65
size = 64 Gain = 1.04
size =256 Gain = 1.15
size =1024Gain = 0.98
NEON/IntProRowTest.DISABLED_Speed
Height = 16 Gain = 14.73
Height = 32 Gain = 15.55
Height = 64 Gain = 15.64
Height = 128 Gain = 15.82
NEON/IntProColTest.DISABLED_Speed
Width = 16 Gain = 1.9
Width = 32 Gain = 1.4
Width = 64 Gain = 1.3
Width = 128 Gain = 1.2
Change-Id: If60e88aad5be19a198fae55fe30100a3a88ec309
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index d9eb518..7e7c94d 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -967,10 +967,10 @@
}
add_proto qw/void aom_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height";
- specialize qw/aom_int_pro_row sse2/;
+ specialize qw/aom_int_pro_row sse2 neon/;
add_proto qw/int16_t aom_int_pro_col/, "const uint8_t *ref, const int width";
- specialize qw/aom_int_pro_col sse2/;
+ specialize qw/aom_int_pro_col sse2 neon/;
add_proto qw/int aom_vector_var/, "const int16_t *ref, const int16_t *src, const int bwl";
# TODO(kyslov@) bring back SSE2 by extending it to 128 block size
@@ -1006,7 +1006,7 @@
specialize qw/aom_highbd_hadamard_32x32 avx2/;
}
add_proto qw/int aom_satd/, "const tran_low_t *coeff, int length";
- specialize qw/aom_satd avx2/;
+ specialize qw/aom_satd neon avx2/;
add_proto qw/int aom_satd_lp/, "const int16_t *coeff, int length";
specialize qw/aom_satd_lp avx2 neon/;
diff --git a/aom_dsp/arm/avg_neon.c b/aom_dsp/arm/avg_neon.c
index af3769e..c2a1dc4 100644
--- a/aom_dsp/arm/avg_neon.c
+++ b/aom_dsp/arm/avg_neon.c
@@ -72,3 +72,83 @@
return satd;
}
}
+
+void aom_int_pro_row_neon(int16_t hbuf[16], const uint8_t *ref,
+ const int ref_stride, const int height) {
+ int i;
+ const uint8_t *idx = ref;
+ uint16x8_t vec0 = vdupq_n_u16(0);
+ uint16x8_t vec1 = vec0;
+ uint8x16_t tmp;
+
+ for (i = 0; i < height; ++i) {
+ tmp = vld1q_u8(idx);
+ idx += ref_stride;
+ vec0 = vaddw_u8(vec0, vget_low_u8(tmp));
+ vec1 = vaddw_u8(vec1, vget_high_u8(tmp));
+ }
+
+ if (128 == height) {
+ vec0 = vshrq_n_u16(vec0, 6);
+ vec1 = vshrq_n_u16(vec1, 6);
+ } else if (64 == height) {
+ vec0 = vshrq_n_u16(vec0, 5);
+ vec1 = vshrq_n_u16(vec1, 5);
+ } else if (32 == height) {
+ vec0 = vshrq_n_u16(vec0, 4);
+ vec1 = vshrq_n_u16(vec1, 4);
+ } else if (16 == height) {
+ vec0 = vshrq_n_u16(vec0, 3);
+ vec1 = vshrq_n_u16(vec1, 3);
+ }
+
+ vst1q_s16(hbuf, vreinterpretq_s16_u16(vec0));
+ hbuf += 8;
+ vst1q_s16(hbuf, vreinterpretq_s16_u16(vec1));
+}
+
+int16_t aom_int_pro_col_neon(const uint8_t *ref, const int width) {
+ const uint8_t *idx;
+ uint16x8_t sum = vdupq_n_u16(0);
+
+ for (idx = ref; idx < (ref + width); idx += 16) {
+ uint8x16_t vec = vld1q_u8(idx);
+ sum = vaddq_u16(sum, vpaddlq_u8(vec));
+ }
+
+#if defined(__aarch64__)
+ return (int16_t)vaddvq_u16(sum);
+#else
+ const uint32x4_t a = vpaddlq_u16(sum);
+ const uint64x2_t b = vpaddlq_u32(a);
+ const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+ vreinterpret_u32_u64(vget_high_u64(b)));
+ return (int16_t)vget_lane_u32(c, 0);
+#endif
+}
+
+// coeff: 16 bits, dynamic range [-32640, 32640].
+// length: value range {16, 64, 256, 1024}.
+int aom_satd_neon(const tran_low_t *coeff, int length) {
+ const int32x4_t zero = vdupq_n_s32(0);
+ int32x4_t accum = zero;
+ do {
+ const int32x4_t src0 = vld1q_s32(&coeff[0]);
+ const int32x4_t src8 = vld1q_s32(&coeff[4]);
+ const int32x4_t src16 = vld1q_s32(&coeff[8]);
+ const int32x4_t src24 = vld1q_s32(&coeff[12]);
+ accum = vabaq_s32(accum, src0, zero);
+ accum = vabaq_s32(accum, src8, zero);
+ accum = vabaq_s32(accum, src16, zero);
+ accum = vabaq_s32(accum, src24, zero);
+ length -= 16;
+ coeff += 16;
+ } while (length != 0);
+
+ // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
+#ifdef __aarch64__
+ return vaddvq_s32(accum);
+#else
+ return horizontal_add_s32x4(accum);
+#endif // __aarch64__
+}
diff --git a/test/avg_test.cc b/test/avg_test.cc
index 1742aec..643d21a 100644
--- a/test/avg_test.cc
+++ b/test/avg_test.cc
@@ -176,7 +176,37 @@
ASM_REGISTER_STATE_CHECK(c_func_(hbuf_c_, source_data_, 0, height_));
ASM_REGISTER_STATE_CHECK(asm_func_(hbuf_asm_, source_data_, 0, height_));
EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * 16))
- << "Output mismatch";
+ << "Output mismatch\n";
+ }
+
+ void RunSpeedTest() {
+ const int numIter = 5000000;
+ printf("Height = %d number of iteration is %d \n", height_, numIter);
+ aom_usec_timer c_timer_;
+ aom_usec_timer_start(&c_timer_);
+ for (int i = 0; i < numIter; i++) {
+ c_func_(hbuf_c_, source_data_, 0, height_);
+ }
+ aom_usec_timer_mark(&c_timer_);
+
+ aom_usec_timer asm_timer_;
+ aom_usec_timer_start(&asm_timer_);
+
+ for (int i = 0; i < numIter; i++) {
+ asm_func_(hbuf_asm_, source_data_, 0, height_);
+ }
+ aom_usec_timer_mark(&asm_timer_);
+
+ const int c_sum_time = static_cast<int>(aom_usec_timer_elapsed(&c_timer_));
+ const int asm_sum_time =
+ static_cast<int>(aom_usec_timer_elapsed(&asm_timer_));
+
+ printf("c_time = %d \t simd_time = %d \t Gain = %4.2f \n", c_sum_time,
+ asm_sum_time,
+ (static_cast<float>(c_sum_time) / static_cast<float>(asm_sum_time)));
+
+ EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * 16))
+ << "Output mismatch\n";
}
private:
@@ -205,6 +235,34 @@
ASM_REGISTER_STATE_CHECK(sum_asm_ = asm_func_(source_data_, width_));
EXPECT_EQ(sum_c_, sum_asm_) << "Output mismatch";
}
+ void RunSpeedTest() {
+ const int numIter = 5000000;
+ printf("Width = %d number of iteration is %d \n", width_, numIter);
+ aom_usec_timer c_timer_;
+ aom_usec_timer_start(&c_timer_);
+ for (int i = 0; i < numIter; i++) {
+ sum_c_ = c_func_(source_data_, width_);
+ }
+ aom_usec_timer_mark(&c_timer_);
+
+ aom_usec_timer asm_timer_;
+ aom_usec_timer_start(&asm_timer_);
+
+ for (int i = 0; i < numIter; i++) {
+ sum_asm_ = asm_func_(source_data_, width_);
+ }
+ aom_usec_timer_mark(&asm_timer_);
+
+ const int c_sum_time = static_cast<int>(aom_usec_timer_elapsed(&c_timer_));
+ const int asm_sum_time =
+ static_cast<int>(aom_usec_timer_elapsed(&asm_timer_));
+
+ printf("c_time = %d \t simd_time = %d \t Gain = %4.2f \n", c_sum_time,
+ asm_sum_time,
+ (static_cast<float>(c_sum_time) / static_cast<float>(asm_sum_time)));
+
+ EXPECT_EQ(sum_c_, sum_asm_) << "Output mismatch \n";
+ }
private:
IntProColFunc asm_func_;
@@ -228,6 +286,11 @@
RunComparison();
}
+TEST_P(IntProRowTest, DISABLED_Speed) {
+ FillRandom();
+ RunSpeedTest();
+}
+
TEST_P(IntProColTest, MinValue) {
FillConstant(0);
RunComparison();
@@ -243,6 +306,11 @@
RunComparison();
}
+TEST_P(IntProColTest, DISABLED_Speed) {
+ FillRandom();
+ RunSpeedTest();
+}
+
using std::make_tuple;
INSTANTIATE_TEST_SUITE_P(
@@ -286,6 +354,150 @@
make_tuple(16, 16, 0, 4, &aom_avg_4x4_neon),
make_tuple(16, 16, 5, 4, &aom_avg_4x4_neon),
make_tuple(32, 32, 15, 4, &aom_avg_4x4_neon)));
+INSTANTIATE_TEST_SUITE_P(
+ NEON, IntProRowTest,
+ ::testing::Values(make_tuple(16, &aom_int_pro_row_neon, &aom_int_pro_row_c),
+ make_tuple(32, &aom_int_pro_row_neon, &aom_int_pro_row_c),
+ make_tuple(64, &aom_int_pro_row_neon, &aom_int_pro_row_c),
+ make_tuple(128, &aom_int_pro_row_neon,
+ &aom_int_pro_row_c)));
+
+INSTANTIATE_TEST_SUITE_P(
+ NEON, IntProColTest,
+ ::testing::Values(make_tuple(16, &aom_int_pro_col_neon, &aom_int_pro_col_c),
+ make_tuple(32, &aom_int_pro_col_neon, &aom_int_pro_col_c),
+ make_tuple(64, &aom_int_pro_col_neon, &aom_int_pro_col_c),
+ make_tuple(128, &aom_int_pro_col_neon,
+ &aom_int_pro_col_c)));
+#endif
+
+typedef int (*SatdFunc)(const tran_low_t *coeffs, int length);
+typedef ::testing::tuple<int, SatdFunc, SatdFunc> SatdTestParam;
+class SatdTest : public ::testing::Test,
+ public ::testing::WithParamInterface<SatdTestParam> {
+ protected:
+ virtual void SetUp() {
+ satd_size_ = GET_PARAM(0);
+ satd_func_ref_ = GET_PARAM(1);
+ satd_func_simd_ = GET_PARAM(2);
+
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+ src_ = reinterpret_cast<tran_low_t *>(
+ aom_memalign(32, sizeof(*src_) * satd_size_));
+ ASSERT_TRUE(src_ != NULL);
+ }
+ virtual void TearDown() {
+ libaom_test::ClearSystemState();
+ aom_free(src_);
+ }
+ void FillConstant(const tran_low_t val) {
+ for (int i = 0; i < satd_size_; ++i) src_[i] = val;
+ }
+ void FillRandom() {
+ for (int i = 0; i < satd_size_; ++i) {
+ src_[i] = static_cast<int16_t>(rnd_.Rand16());
+ }
+ }
+ void Check(int expected) {
+ int total_ref;
+ ASM_REGISTER_STATE_CHECK(total_ref = satd_func_ref_(src_, satd_size_));
+ EXPECT_EQ(expected, total_ref);
+
+ int total_simd;
+ ASM_REGISTER_STATE_CHECK(total_simd = satd_func_simd_(src_, satd_size_));
+ EXPECT_EQ(expected, total_simd);
+ }
+ void RunComparison() {
+ int total_ref;
+ ASM_REGISTER_STATE_CHECK(total_ref = satd_func_ref_(src_, satd_size_));
+
+ int total_simd;
+ ASM_REGISTER_STATE_CHECK(total_simd = satd_func_simd_(src_, satd_size_));
+
+ EXPECT_EQ(total_ref, total_simd);
+ }
+ void RunSpeedTest() {
+ const int numIter = 500000;
+ printf("size = %d number of iteration is %d \n", satd_size_, numIter);
+
+ int total_ref;
+ aom_usec_timer c_timer_;
+ aom_usec_timer_start(&c_timer_);
+ for (int i = 0; i < numIter; i++) {
+ total_ref = satd_func_ref_(src_, satd_size_);
+ }
+ aom_usec_timer_mark(&c_timer_);
+
+ int total_simd;
+ aom_usec_timer simd_timer_;
+ aom_usec_timer_start(&simd_timer_);
+
+ for (int i = 0; i < numIter; i++) {
+ total_simd = satd_func_simd_(src_, satd_size_);
+ }
+ aom_usec_timer_mark(&simd_timer_);
+
+ const int c_sum_time = static_cast<int>(aom_usec_timer_elapsed(&c_timer_));
+ const int simd_sum_time =
+ static_cast<int>(aom_usec_timer_elapsed(&simd_timer_));
+
+ printf(
+ "c_time = %d \t simd_time = %d \t Gain = %4.2f \n", c_sum_time,
+ simd_sum_time,
+ (static_cast<float>(c_sum_time) / static_cast<float>(simd_sum_time)));
+
+ EXPECT_EQ(total_ref, total_simd) << "Output mismatch \n";
+ }
+ int satd_size_;
+
+ private:
+ tran_low_t *src_;
+ SatdFunc satd_func_ref_;
+ SatdFunc satd_func_simd_;
+ ACMRandom rnd_;
+};
+
+TEST_P(SatdTest, MinValue) {
+ const int kMin = -32640;
+ const int expected = -kMin * satd_size_;
+ FillConstant(kMin);
+ Check(expected);
+}
+TEST_P(SatdTest, MaxValue) {
+ const int kMax = 32640;
+ const int expected = kMax * satd_size_;
+ FillConstant(kMax);
+ Check(expected);
+}
+TEST_P(SatdTest, Random) {
+ int expected;
+ switch (satd_size_) {
+ case 16: expected = 205298; break;
+ case 64: expected = 1113950; break;
+ case 256: expected = 4268415; break;
+ case 1024: expected = 16954082; break;
+ default:
+ FAIL() << "Invalid satd size (" << satd_size_
+ << ") valid: 16/64/256/1024";
+ }
+ FillRandom();
+ Check(expected);
+}
+TEST_P(SatdTest, Match) {
+ FillRandom();
+ RunComparison();
+}
+TEST_P(SatdTest, DISABLED_Speed) {
+ FillRandom();
+ RunSpeedTest();
+}
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, SatdTest,
+ ::testing::Values(make_tuple(16, &aom_satd_c, &aom_satd_neon),
+ make_tuple(64, &aom_satd_c, &aom_satd_neon),
+ make_tuple(256, &aom_satd_c, &aom_satd_neon),
+ make_tuple(1024, &aom_satd_c, &aom_satd_neon)));
#endif
} // namespace