Add NEON version of aom_vector_var function

SpeedUp

width    avg gain
16       1.4
32       1.3
64       1.2
128      1.2

via NEON/VectorVarTest*Speed

Change-Id: I99768ac59ab179bb38f0c853a15684a1aa422f3c
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index ac4a96f..2379b75 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -973,8 +973,9 @@
   specialize qw/aom_int_pro_col sse2 neon/;
 
   add_proto qw/int aom_vector_var/, "const int16_t *ref, const int16_t *src, const int bwl";
+  specialize qw/aom_vector_var neon/;
   # TODO(kyslov@) bring back SSE2 by extending it to 128 block size
-  #specialize qw/aom_vector_var sse2/;
+  #specialize qw/aom_vector_var neon sse2/;
 
   #
   # hamadard transform and satd for implmenting temporal dependency model
diff --git a/aom_dsp/arm/avg_neon.c b/aom_dsp/arm/avg_neon.c
index c2a1dc4..c3d4de2 100644
--- a/aom_dsp/arm/avg_neon.c
+++ b/aom_dsp/arm/avg_neon.c
@@ -152,3 +152,37 @@
   return horizontal_add_s32x4(accum);
 #endif  // __aarch64__
 }
+
+int aom_vector_var_neon(const int16_t *ref, const int16_t *src, const int bwl) {
+  int32x4_t v_mean = vdupq_n_s32(0);
+  int32x4_t v_sse = v_mean;
+  int16x8_t v_ref, v_src;
+  int16x4_t v_low;
+
+  int i, width = 4 << bwl;
+  for (i = 0; i < width; i += 8) {
+    v_ref = vld1q_s16(&ref[i]);
+    v_src = vld1q_s16(&src[i]);
+    const int16x8_t diff = vsubq_s16(v_ref, v_src);
+    // diff: dynamic range [-510, 510], 10 bits.
+    v_mean = vpadalq_s16(v_mean, diff);
+    v_low = vget_low_s16(diff);
+    v_sse = vmlal_s16(v_sse, v_low, v_low);
+#if defined(__aarch64__)
+    v_sse = vmlal_high_s16(v_sse, diff, diff);
+#else
+    const int16x4_t v_high = vget_high_s16(diff);
+    v_sse = vmlal_s16(v_sse, v_high, v_high);
+#endif
+  }
+#if defined(__aarch64__)
+  int mean = vaddvq_s32(v_mean);
+  int sse = (int)vaddvq_s32(v_sse);
+#else
+  int mean = horizontal_add_s32x4(v_mean);
+  int sse = horizontal_add_s32x4(v_sse);
+#endif
+  // (mean * mean): dynamic range 31 bits.
+  int var = sse - ((mean * mean) >> (bwl + 2));
+  return var;
+}
diff --git a/test/avg_test.cc b/test/avg_test.cc
index 643d21a..4b10182 100644
--- a/test/avg_test.cc
+++ b/test/avg_test.cc
@@ -310,6 +310,155 @@
   FillRandom();
   RunSpeedTest();
 }
+class VectorVarTestBase : public ::testing::Test {
+ public:
+  explicit VectorVarTestBase(int bwl) { m_bwl = bwl; }
+  VectorVarTestBase() {}
+  ~VectorVarTestBase() {}
+
+ protected:
+  static const int kDataAlignment = 16;
+
+  virtual void SetUp() {
+    width = 4 << m_bwl;
+
+    ref_vector = static_cast<int16_t *>(
+        aom_memalign(kDataAlignment, width * sizeof(ref_vector[0])));
+    ASSERT_TRUE(ref_vector != NULL);
+    src_vector = static_cast<int16_t *>(
+        aom_memalign(kDataAlignment, width * sizeof(src_vector[0])));
+    ASSERT_TRUE(src_vector != NULL);
+
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+  virtual void TearDown() {
+    aom_free(ref_vector);
+    ref_vector = NULL;
+    aom_free(src_vector);
+    src_vector = NULL;
+    libaom_test::ClearSystemState();
+  }
+
+  void FillConstant(int16_t fill_constant_ref, int16_t fill_constant_src) {
+    for (int i = 0; i < width; ++i) {
+      ref_vector[i] = fill_constant_ref;
+      src_vector[i] = fill_constant_src;
+    }
+  }
+
+  void FillRandom() {
+    for (int i = 0; i < width; ++i) {
+      ref_vector[i] =
+          rnd_.Rand16() % max_range;  // acc. aom_vector_var_c brief.
+      src_vector[i] = rnd_.Rand16() % max_range;
+    }
+  }
+
+  int width;
+  int m_bwl;
+  int16_t *ref_vector;
+  int16_t *src_vector;
+  ACMRandom rnd_;
+
+  static const int max_range = 510;
+  static const int num_random_cmp = 50;
+};
+
+typedef int (*VectorVarFunc)(const int16_t *ref, const int16_t *src,
+                             const int bwl);
+
+typedef std::tuple<int, VectorVarFunc, VectorVarFunc> VecVarFunc;
+
+class VectorVarTest : public VectorVarTestBase,
+                      public ::testing::WithParamInterface<VecVarFunc> {
+ public:
+  VectorVarTest()
+      : VectorVarTestBase(GET_PARAM(0)), c_func(GET_PARAM(1)),
+        simd_func(GET_PARAM(2)) {}
+
+ protected:
+  int calcVarC() { return c_func(ref_vector, src_vector, m_bwl); }
+  int calcVarSIMD() { return simd_func(ref_vector, src_vector, m_bwl); }
+
+  VectorVarFunc c_func;
+  VectorVarFunc simd_func;
+};
+
+TEST_P(VectorVarTest, MaxVar) {
+  FillConstant(0, max_range);
+  int c_var = calcVarC();
+  int simd_var = calcVarSIMD();
+  ASSERT_EQ(c_var, simd_var);
+}
+TEST_P(VectorVarTest, MaxVarRev) {
+  FillConstant(max_range, 0);
+  int c_var = calcVarC();
+  int simd_var = calcVarSIMD();
+  ASSERT_EQ(c_var, simd_var);
+}
+TEST_P(VectorVarTest, ZeroDiff) {
+  FillConstant(0, 0);
+  int c_var = calcVarC();
+  int simd_var = calcVarSIMD();
+  ASSERT_EQ(c_var, simd_var);
+}
+TEST_P(VectorVarTest, ZeroDiff2) {
+  FillConstant(max_range, max_range);
+  int c_var = calcVarC();
+  int simd_var = calcVarSIMD();
+  ASSERT_EQ(c_var, simd_var);
+}
+TEST_P(VectorVarTest, Constant) {
+  FillConstant(30, 90);
+  int c_var = calcVarC();
+  int simd_var = calcVarSIMD();
+  ASSERT_EQ(c_var, simd_var);
+}
+TEST_P(VectorVarTest, Random) {
+  for (size_t i = 0; i < num_random_cmp; i++) {
+    FillRandom();
+    int c_var = calcVarC();
+    int simd_var = calcVarSIMD();
+    ASSERT_EQ(c_var, simd_var);
+  }
+}
+TEST_P(VectorVarTest, DISABLED_Speed) {
+  FillRandom();
+  const int numIter = 50000;
+  printf("Width = %d number of iteration is %d \n", width, numIter);
+
+  int sum_c_var = 0;
+  int c_var = 0;
+
+  aom_usec_timer c_timer_;
+  aom_usec_timer_start(&c_timer_);
+  for (size_t i = 0; i < numIter; i++) {
+    c_var = calcVarC();
+    sum_c_var += c_var;
+  }
+  aom_usec_timer_mark(&c_timer_);
+
+  int simd_var = 0;
+  int sum_simd_var = 0;
+  aom_usec_timer simd_timer_;
+  aom_usec_timer_start(&simd_timer_);
+  for (size_t i = 0; i < numIter; i++) {
+    simd_var = calcVarSIMD();
+    sum_simd_var += simd_var;
+  }
+  aom_usec_timer_mark(&simd_timer_);
+
+  const int c_sum_time = static_cast<int>(aom_usec_timer_elapsed(&c_timer_));
+  const int simd_sum_time =
+      static_cast<int>(aom_usec_timer_elapsed(&simd_timer_));
+
+  printf("c_time = %d \t simd_time = %d \t Gain = %4.2f \n", c_sum_time,
+         simd_sum_time,
+         (static_cast<float>(c_sum_time) / static_cast<float>(simd_sum_time)));
+
+  EXPECT_EQ(c_var, simd_var) << "Output mismatch \n";
+  EXPECT_EQ(sum_c_var, sum_simd_var) << "Output mismatch \n";
+}
 
 using std::make_tuple;
 
@@ -498,6 +647,12 @@
                       make_tuple(64, &aom_satd_c, &aom_satd_neon),
                       make_tuple(256, &aom_satd_c, &aom_satd_neon),
                       make_tuple(1024, &aom_satd_c, &aom_satd_neon)));
+INSTANTIATE_TEST_SUITE_P(
+    NEON, VectorVarTest,
+    ::testing::Values(make_tuple(2, &aom_vector_var_c, &aom_vector_var_neon),
+                      make_tuple(3, &aom_vector_var_c, &aom_vector_var_neon),
+                      make_tuple(4, &aom_vector_var_c, &aom_vector_var_neon),
+                      make_tuple(5, &aom_vector_var_c, &aom_vector_var_neon)));
 #endif
 
 }  // namespace