Add NEON version of aom_vector_var function
SpeedUp
width avg gain
16 1.4
32 1.3
64 1.2
128 1.2
via NEON/VectorVarTest*Speed
Change-Id: I99768ac59ab179bb38f0c853a15684a1aa422f3c
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index ac4a96f..2379b75 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -973,8 +973,9 @@
specialize qw/aom_int_pro_col sse2 neon/;
add_proto qw/int aom_vector_var/, "const int16_t *ref, const int16_t *src, const int bwl";
+ specialize qw/aom_vector_var neon/;
# TODO(kyslov@) bring back SSE2 by extending it to 128 block size
- #specialize qw/aom_vector_var sse2/;
+ #specialize qw/aom_vector_var neon sse2/;
#
# hamadard transform and satd for implmenting temporal dependency model
diff --git a/aom_dsp/arm/avg_neon.c b/aom_dsp/arm/avg_neon.c
index c2a1dc4..c3d4de2 100644
--- a/aom_dsp/arm/avg_neon.c
+++ b/aom_dsp/arm/avg_neon.c
@@ -152,3 +152,37 @@
return horizontal_add_s32x4(accum);
#endif // __aarch64__
}
+
+int aom_vector_var_neon(const int16_t *ref, const int16_t *src, const int bwl) {
+ int32x4_t v_mean = vdupq_n_s32(0);
+ int32x4_t v_sse = v_mean;
+ int16x8_t v_ref, v_src;
+ int16x4_t v_low;
+
+ int i, width = 4 << bwl;
+ for (i = 0; i < width; i += 8) {
+ v_ref = vld1q_s16(&ref[i]);
+ v_src = vld1q_s16(&src[i]);
+ const int16x8_t diff = vsubq_s16(v_ref, v_src);
+ // diff: dynamic range [-510, 510], 10 bits.
+ v_mean = vpadalq_s16(v_mean, diff);
+ v_low = vget_low_s16(diff);
+ v_sse = vmlal_s16(v_sse, v_low, v_low);
+#if defined(__aarch64__)
+ v_sse = vmlal_high_s16(v_sse, diff, diff);
+#else
+ const int16x4_t v_high = vget_high_s16(diff);
+ v_sse = vmlal_s16(v_sse, v_high, v_high);
+#endif
+ }
+#if defined(__aarch64__)
+ int mean = vaddvq_s32(v_mean);
+ int sse = (int)vaddvq_s32(v_sse);
+#else
+ int mean = horizontal_add_s32x4(v_mean);
+ int sse = horizontal_add_s32x4(v_sse);
+#endif
+ // (mean * mean): dynamic range 31 bits.
+ int var = sse - ((mean * mean) >> (bwl + 2));
+ return var;
+}
diff --git a/test/avg_test.cc b/test/avg_test.cc
index 643d21a..4b10182 100644
--- a/test/avg_test.cc
+++ b/test/avg_test.cc
@@ -310,6 +310,155 @@
FillRandom();
RunSpeedTest();
}
+class VectorVarTestBase : public ::testing::Test {
+ public:
+ explicit VectorVarTestBase(int bwl) { m_bwl = bwl; }
+ VectorVarTestBase() {}
+ ~VectorVarTestBase() {}
+
+ protected:
+ static const int kDataAlignment = 16;
+
+ virtual void SetUp() {
+ width = 4 << m_bwl;
+
+ ref_vector = static_cast<int16_t *>(
+ aom_memalign(kDataAlignment, width * sizeof(ref_vector[0])));
+ ASSERT_TRUE(ref_vector != NULL);
+ src_vector = static_cast<int16_t *>(
+ aom_memalign(kDataAlignment, width * sizeof(src_vector[0])));
+ ASSERT_TRUE(src_vector != NULL);
+
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+ }
+ virtual void TearDown() {
+ aom_free(ref_vector);
+ ref_vector = NULL;
+ aom_free(src_vector);
+ src_vector = NULL;
+ libaom_test::ClearSystemState();
+ }
+
+ void FillConstant(int16_t fill_constant_ref, int16_t fill_constant_src) {
+ for (int i = 0; i < width; ++i) {
+ ref_vector[i] = fill_constant_ref;
+ src_vector[i] = fill_constant_src;
+ }
+ }
+
+ void FillRandom() {
+ for (int i = 0; i < width; ++i) {
+ ref_vector[i] =
+ rnd_.Rand16() % max_range; // acc. aom_vector_var_c brief.
+ src_vector[i] = rnd_.Rand16() % max_range;
+ }
+ }
+
+ int width;
+ int m_bwl;
+ int16_t *ref_vector;
+ int16_t *src_vector;
+ ACMRandom rnd_;
+
+ static const int max_range = 510;
+ static const int num_random_cmp = 50;
+};
+
+typedef int (*VectorVarFunc)(const int16_t *ref, const int16_t *src,
+ const int bwl);
+
+typedef std::tuple<int, VectorVarFunc, VectorVarFunc> VecVarFunc;
+
+class VectorVarTest : public VectorVarTestBase,
+ public ::testing::WithParamInterface<VecVarFunc> {
+ public:
+ VectorVarTest()
+ : VectorVarTestBase(GET_PARAM(0)), c_func(GET_PARAM(1)),
+ simd_func(GET_PARAM(2)) {}
+
+ protected:
+ int calcVarC() { return c_func(ref_vector, src_vector, m_bwl); }
+ int calcVarSIMD() { return simd_func(ref_vector, src_vector, m_bwl); }
+
+ VectorVarFunc c_func;
+ VectorVarFunc simd_func;
+};
+
+TEST_P(VectorVarTest, MaxVar) {
+ FillConstant(0, max_range);
+ int c_var = calcVarC();
+ int simd_var = calcVarSIMD();
+ ASSERT_EQ(c_var, simd_var);
+}
+TEST_P(VectorVarTest, MaxVarRev) {
+ FillConstant(max_range, 0);
+ int c_var = calcVarC();
+ int simd_var = calcVarSIMD();
+ ASSERT_EQ(c_var, simd_var);
+}
+TEST_P(VectorVarTest, ZeroDiff) {
+ FillConstant(0, 0);
+ int c_var = calcVarC();
+ int simd_var = calcVarSIMD();
+ ASSERT_EQ(c_var, simd_var);
+}
+TEST_P(VectorVarTest, ZeroDiff2) {
+ FillConstant(max_range, max_range);
+ int c_var = calcVarC();
+ int simd_var = calcVarSIMD();
+ ASSERT_EQ(c_var, simd_var);
+}
+TEST_P(VectorVarTest, Constant) {
+ FillConstant(30, 90);
+ int c_var = calcVarC();
+ int simd_var = calcVarSIMD();
+ ASSERT_EQ(c_var, simd_var);
+}
+TEST_P(VectorVarTest, Random) {
+ for (size_t i = 0; i < num_random_cmp; i++) {
+ FillRandom();
+ int c_var = calcVarC();
+ int simd_var = calcVarSIMD();
+ ASSERT_EQ(c_var, simd_var);
+ }
+}
+TEST_P(VectorVarTest, DISABLED_Speed) {
+ FillRandom();
+ const int numIter = 50000;
+ printf("Width = %d number of iteration is %d \n", width, numIter);
+
+ int sum_c_var = 0;
+ int c_var = 0;
+
+ aom_usec_timer c_timer_;
+ aom_usec_timer_start(&c_timer_);
+ for (size_t i = 0; i < numIter; i++) {
+ c_var = calcVarC();
+ sum_c_var += c_var;
+ }
+ aom_usec_timer_mark(&c_timer_);
+
+ int simd_var = 0;
+ int sum_simd_var = 0;
+ aom_usec_timer simd_timer_;
+ aom_usec_timer_start(&simd_timer_);
+ for (size_t i = 0; i < numIter; i++) {
+ simd_var = calcVarSIMD();
+ sum_simd_var += simd_var;
+ }
+ aom_usec_timer_mark(&simd_timer_);
+
+ const int c_sum_time = static_cast<int>(aom_usec_timer_elapsed(&c_timer_));
+ const int simd_sum_time =
+ static_cast<int>(aom_usec_timer_elapsed(&simd_timer_));
+
+ printf("c_time = %d \t simd_time = %d \t Gain = %4.2f \n", c_sum_time,
+ simd_sum_time,
+ (static_cast<float>(c_sum_time) / static_cast<float>(simd_sum_time)));
+
+ EXPECT_EQ(c_var, simd_var) << "Output mismatch \n";
+ EXPECT_EQ(sum_c_var, sum_simd_var) << "Output mismatch \n";
+}
using std::make_tuple;
@@ -498,6 +647,12 @@
make_tuple(64, &aom_satd_c, &aom_satd_neon),
make_tuple(256, &aom_satd_c, &aom_satd_neon),
make_tuple(1024, &aom_satd_c, &aom_satd_neon)));
+INSTANTIATE_TEST_SUITE_P(
+ NEON, VectorVarTest,
+ ::testing::Values(make_tuple(2, &aom_vector_var_c, &aom_vector_var_neon),
+ make_tuple(3, &aom_vector_var_c, &aom_vector_var_neon),
+ make_tuple(4, &aom_vector_var_c, &aom_vector_var_neon),
+ make_tuple(5, &aom_vector_var_c, &aom_vector_var_neon)));
#endif
} // namespace