Add NEON version aom_int_pro_row/col and aom_satd

Speedup
NEON/Satd*Speed
size = 16 Gain = 1.65
size = 64 Gain = 1.04
size =256 Gain = 1.15
size =1024Gain = 0.98

NEON/IntProRowTest.DISABLED_Speed
Height = 16   Gain = 14.73
Height = 32   Gain = 15.55
Height = 64   Gain = 15.64
Height = 128  Gain = 15.82

NEON/IntProColTest.DISABLED_Speed
Width = 16   Gain = 1.9
Width = 32   Gain = 1.4
Width = 64   Gain = 1.3
Width = 128  Gain = 1.2

Change-Id: If60e88aad5be19a198fae55fe30100a3a88ec309
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index d9eb518..7e7c94d 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -967,10 +967,10 @@
   }
 
   add_proto qw/void aom_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height";
-  specialize qw/aom_int_pro_row sse2/;
+  specialize qw/aom_int_pro_row sse2 neon/;
 
   add_proto qw/int16_t aom_int_pro_col/, "const uint8_t *ref, const int width";
-  specialize qw/aom_int_pro_col sse2/;
+  specialize qw/aom_int_pro_col sse2 neon/;
 
   add_proto qw/int aom_vector_var/, "const int16_t *ref, const int16_t *src, const int bwl";
   # TODO(kyslov@) bring back SSE2 by extending it to 128 block size
@@ -1006,7 +1006,7 @@
     specialize qw/aom_highbd_hadamard_32x32 avx2/;
   }
   add_proto qw/int aom_satd/, "const tran_low_t *coeff, int length";
-  specialize qw/aom_satd avx2/;
+  specialize qw/aom_satd neon avx2/;
 
   add_proto qw/int aom_satd_lp/, "const int16_t *coeff, int length";
   specialize qw/aom_satd_lp avx2 neon/;
diff --git a/aom_dsp/arm/avg_neon.c b/aom_dsp/arm/avg_neon.c
index af3769e..c2a1dc4 100644
--- a/aom_dsp/arm/avg_neon.c
+++ b/aom_dsp/arm/avg_neon.c
@@ -72,3 +72,83 @@
     return satd;
   }
 }
+
+void aom_int_pro_row_neon(int16_t hbuf[16], const uint8_t *ref,
+                          const int ref_stride, const int height) {
+  int i;
+  const uint8_t *idx = ref;
+  uint16x8_t vec0 = vdupq_n_u16(0);
+  uint16x8_t vec1 = vec0;
+  uint8x16_t tmp;
+
+  for (i = 0; i < height; ++i) {
+    tmp = vld1q_u8(idx);
+    idx += ref_stride;
+    vec0 = vaddw_u8(vec0, vget_low_u8(tmp));
+    vec1 = vaddw_u8(vec1, vget_high_u8(tmp));
+  }
+
+  if (128 == height) {
+    vec0 = vshrq_n_u16(vec0, 6);
+    vec1 = vshrq_n_u16(vec1, 6);
+  } else if (64 == height) {
+    vec0 = vshrq_n_u16(vec0, 5);
+    vec1 = vshrq_n_u16(vec1, 5);
+  } else if (32 == height) {
+    vec0 = vshrq_n_u16(vec0, 4);
+    vec1 = vshrq_n_u16(vec1, 4);
+  } else if (16 == height) {
+    vec0 = vshrq_n_u16(vec0, 3);
+    vec1 = vshrq_n_u16(vec1, 3);
+  }
+
+  vst1q_s16(hbuf, vreinterpretq_s16_u16(vec0));
+  hbuf += 8;
+  vst1q_s16(hbuf, vreinterpretq_s16_u16(vec1));
+}
+
+int16_t aom_int_pro_col_neon(const uint8_t *ref, const int width) {
+  const uint8_t *idx;
+  uint16x8_t sum = vdupq_n_u16(0);
+
+  for (idx = ref; idx < (ref + width); idx += 16) {
+    uint8x16_t vec = vld1q_u8(idx);
+    sum = vaddq_u16(sum, vpaddlq_u8(vec));
+  }
+
+#if defined(__aarch64__)
+  return (int16_t)vaddvq_u16(sum);
+#else
+  const uint32x4_t a = vpaddlq_u16(sum);
+  const uint64x2_t b = vpaddlq_u32(a);
+  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+                                vreinterpret_u32_u64(vget_high_u64(b)));
+  return (int16_t)vget_lane_u32(c, 0);
+#endif
+}
+
+// coeff: 16 bits, dynamic range [-32640, 32640].
+// length: value range {16, 64, 256, 1024}.
+int aom_satd_neon(const tran_low_t *coeff, int length) {
+  const int32x4_t zero = vdupq_n_s32(0);
+  int32x4_t accum = zero;
+  do {
+    const int32x4_t src0 = vld1q_s32(&coeff[0]);
+    const int32x4_t src8 = vld1q_s32(&coeff[4]);
+    const int32x4_t src16 = vld1q_s32(&coeff[8]);
+    const int32x4_t src24 = vld1q_s32(&coeff[12]);
+    accum = vabaq_s32(accum, src0, zero);
+    accum = vabaq_s32(accum, src8, zero);
+    accum = vabaq_s32(accum, src16, zero);
+    accum = vabaq_s32(accum, src24, zero);
+    length -= 16;
+    coeff += 16;
+  } while (length != 0);
+
+  // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
+#ifdef __aarch64__
+  return vaddvq_s32(accum);
+#else
+  return horizontal_add_s32x4(accum);
+#endif  // __aarch64__
+}
diff --git a/test/avg_test.cc b/test/avg_test.cc
index 1742aec..643d21a 100644
--- a/test/avg_test.cc
+++ b/test/avg_test.cc
@@ -176,7 +176,37 @@
     ASM_REGISTER_STATE_CHECK(c_func_(hbuf_c_, source_data_, 0, height_));
     ASM_REGISTER_STATE_CHECK(asm_func_(hbuf_asm_, source_data_, 0, height_));
     EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * 16))
-        << "Output mismatch";
+        << "Output mismatch\n";
+  }
+
+  void RunSpeedTest() {
+    const int numIter = 5000000;
+    printf("Height = %d number of iteration is %d \n", height_, numIter);
+    aom_usec_timer c_timer_;
+    aom_usec_timer_start(&c_timer_);
+    for (int i = 0; i < numIter; i++) {
+      c_func_(hbuf_c_, source_data_, 0, height_);
+    }
+    aom_usec_timer_mark(&c_timer_);
+
+    aom_usec_timer asm_timer_;
+    aom_usec_timer_start(&asm_timer_);
+
+    for (int i = 0; i < numIter; i++) {
+      asm_func_(hbuf_asm_, source_data_, 0, height_);
+    }
+    aom_usec_timer_mark(&asm_timer_);
+
+    const int c_sum_time = static_cast<int>(aom_usec_timer_elapsed(&c_timer_));
+    const int asm_sum_time =
+        static_cast<int>(aom_usec_timer_elapsed(&asm_timer_));
+
+    printf("c_time = %d \t simd_time = %d \t Gain = %4.2f \n", c_sum_time,
+           asm_sum_time,
+           (static_cast<float>(c_sum_time) / static_cast<float>(asm_sum_time)));
+
+    EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * 16))
+        << "Output mismatch\n";
   }
 
  private:
@@ -205,6 +235,34 @@
     ASM_REGISTER_STATE_CHECK(sum_asm_ = asm_func_(source_data_, width_));
     EXPECT_EQ(sum_c_, sum_asm_) << "Output mismatch";
   }
+  void RunSpeedTest() {
+    const int numIter = 5000000;
+    printf("Width = %d number of iteration is %d \n", width_, numIter);
+    aom_usec_timer c_timer_;
+    aom_usec_timer_start(&c_timer_);
+    for (int i = 0; i < numIter; i++) {
+      sum_c_ = c_func_(source_data_, width_);
+    }
+    aom_usec_timer_mark(&c_timer_);
+
+    aom_usec_timer asm_timer_;
+    aom_usec_timer_start(&asm_timer_);
+
+    for (int i = 0; i < numIter; i++) {
+      sum_asm_ = asm_func_(source_data_, width_);
+    }
+    aom_usec_timer_mark(&asm_timer_);
+
+    const int c_sum_time = static_cast<int>(aom_usec_timer_elapsed(&c_timer_));
+    const int asm_sum_time =
+        static_cast<int>(aom_usec_timer_elapsed(&asm_timer_));
+
+    printf("c_time = %d \t simd_time = %d \t Gain = %4.2f \n", c_sum_time,
+           asm_sum_time,
+           (static_cast<float>(c_sum_time) / static_cast<float>(asm_sum_time)));
+
+    EXPECT_EQ(sum_c_, sum_asm_) << "Output mismatch \n";
+  }
 
  private:
   IntProColFunc asm_func_;
@@ -228,6 +286,11 @@
   RunComparison();
 }
 
+TEST_P(IntProRowTest, DISABLED_Speed) {
+  FillRandom();
+  RunSpeedTest();
+}
+
 TEST_P(IntProColTest, MinValue) {
   FillConstant(0);
   RunComparison();
@@ -243,6 +306,11 @@
   RunComparison();
 }
 
+TEST_P(IntProColTest, DISABLED_Speed) {
+  FillRandom();
+  RunSpeedTest();
+}
+
 using std::make_tuple;
 
 INSTANTIATE_TEST_SUITE_P(
@@ -286,6 +354,150 @@
                       make_tuple(16, 16, 0, 4, &aom_avg_4x4_neon),
                       make_tuple(16, 16, 5, 4, &aom_avg_4x4_neon),
                       make_tuple(32, 32, 15, 4, &aom_avg_4x4_neon)));
+INSTANTIATE_TEST_SUITE_P(
+    NEON, IntProRowTest,
+    ::testing::Values(make_tuple(16, &aom_int_pro_row_neon, &aom_int_pro_row_c),
+                      make_tuple(32, &aom_int_pro_row_neon, &aom_int_pro_row_c),
+                      make_tuple(64, &aom_int_pro_row_neon, &aom_int_pro_row_c),
+                      make_tuple(128, &aom_int_pro_row_neon,
+                                 &aom_int_pro_row_c)));
+
+INSTANTIATE_TEST_SUITE_P(
+    NEON, IntProColTest,
+    ::testing::Values(make_tuple(16, &aom_int_pro_col_neon, &aom_int_pro_col_c),
+                      make_tuple(32, &aom_int_pro_col_neon, &aom_int_pro_col_c),
+                      make_tuple(64, &aom_int_pro_col_neon, &aom_int_pro_col_c),
+                      make_tuple(128, &aom_int_pro_col_neon,
+                                 &aom_int_pro_col_c)));
+#endif
+
+typedef int (*SatdFunc)(const tran_low_t *coeffs, int length);
+typedef ::testing::tuple<int, SatdFunc, SatdFunc> SatdTestParam;
+class SatdTest : public ::testing::Test,
+                 public ::testing::WithParamInterface<SatdTestParam> {
+ protected:
+  virtual void SetUp() {
+    satd_size_ = GET_PARAM(0);
+    satd_func_ref_ = GET_PARAM(1);
+    satd_func_simd_ = GET_PARAM(2);
+
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    src_ = reinterpret_cast<tran_low_t *>(
+        aom_memalign(32, sizeof(*src_) * satd_size_));
+    ASSERT_TRUE(src_ != NULL);
+  }
+  virtual void TearDown() {
+    libaom_test::ClearSystemState();
+    aom_free(src_);
+  }
+  void FillConstant(const tran_low_t val) {
+    for (int i = 0; i < satd_size_; ++i) src_[i] = val;
+  }
+  void FillRandom() {
+    for (int i = 0; i < satd_size_; ++i) {
+      src_[i] = static_cast<int16_t>(rnd_.Rand16());
+    }
+  }
+  void Check(int expected) {
+    int total_ref;
+    ASM_REGISTER_STATE_CHECK(total_ref = satd_func_ref_(src_, satd_size_));
+    EXPECT_EQ(expected, total_ref);
+
+    int total_simd;
+    ASM_REGISTER_STATE_CHECK(total_simd = satd_func_simd_(src_, satd_size_));
+    EXPECT_EQ(expected, total_simd);
+  }
+  void RunComparison() {
+    int total_ref;
+    ASM_REGISTER_STATE_CHECK(total_ref = satd_func_ref_(src_, satd_size_));
+
+    int total_simd;
+    ASM_REGISTER_STATE_CHECK(total_simd = satd_func_simd_(src_, satd_size_));
+
+    EXPECT_EQ(total_ref, total_simd);
+  }
+  void RunSpeedTest() {
+    const int numIter = 500000;
+    printf("size = %d number of iteration is %d \n", satd_size_, numIter);
+
+    int total_ref;
+    aom_usec_timer c_timer_;
+    aom_usec_timer_start(&c_timer_);
+    for (int i = 0; i < numIter; i++) {
+      total_ref = satd_func_ref_(src_, satd_size_);
+    }
+    aom_usec_timer_mark(&c_timer_);
+
+    int total_simd;
+    aom_usec_timer simd_timer_;
+    aom_usec_timer_start(&simd_timer_);
+
+    for (int i = 0; i < numIter; i++) {
+      total_simd = satd_func_simd_(src_, satd_size_);
+    }
+    aom_usec_timer_mark(&simd_timer_);
+
+    const int c_sum_time = static_cast<int>(aom_usec_timer_elapsed(&c_timer_));
+    const int simd_sum_time =
+        static_cast<int>(aom_usec_timer_elapsed(&simd_timer_));
+
+    printf(
+        "c_time = %d \t simd_time = %d \t Gain = %4.2f \n", c_sum_time,
+        simd_sum_time,
+        (static_cast<float>(c_sum_time) / static_cast<float>(simd_sum_time)));
+
+    EXPECT_EQ(total_ref, total_simd) << "Output mismatch \n";
+  }
+  int satd_size_;
+
+ private:
+  tran_low_t *src_;
+  SatdFunc satd_func_ref_;
+  SatdFunc satd_func_simd_;
+  ACMRandom rnd_;
+};
+
+TEST_P(SatdTest, MinValue) {
+  const int kMin = -32640;
+  const int expected = -kMin * satd_size_;
+  FillConstant(kMin);
+  Check(expected);
+}
+TEST_P(SatdTest, MaxValue) {
+  const int kMax = 32640;
+  const int expected = kMax * satd_size_;
+  FillConstant(kMax);
+  Check(expected);
+}
+TEST_P(SatdTest, Random) {
+  int expected;
+  switch (satd_size_) {
+    case 16: expected = 205298; break;
+    case 64: expected = 1113950; break;
+    case 256: expected = 4268415; break;
+    case 1024: expected = 16954082; break;
+    default:
+      FAIL() << "Invalid satd size (" << satd_size_
+             << ") valid: 16/64/256/1024";
+  }
+  FillRandom();
+  Check(expected);
+}
+TEST_P(SatdTest, Match) {
+  FillRandom();
+  RunComparison();
+}
+TEST_P(SatdTest, DISABLED_Speed) {
+  FillRandom();
+  RunSpeedTest();
+}
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, SatdTest,
+    ::testing::Values(make_tuple(16, &aom_satd_c, &aom_satd_neon),
+                      make_tuple(64, &aom_satd_c, &aom_satd_neon),
+                      make_tuple(256, &aom_satd_c, &aom_satd_neon),
+                      make_tuple(1024, &aom_satd_c, &aom_satd_neon)));
 #endif
 
 }  // namespace