Add Neon implementation of HBD estimate_noise_from_single_plane

Add Neon implementation of av1_highbd_estimate_noise_from_single_plane
as well as the corresponding tests.

Change-Id: Iefa204b497f6195241326e56ba74e4cc96ad609c
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index f76edea..b9f228b 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -405,6 +405,7 @@
       specialize qw/av1_highbd_apply_temporal_filter sse2 avx2 neon/;
 
       add_proto qw/double av1_highbd_estimate_noise_from_single_plane/, "const uint16_t *src, int height, int width, int stride, int bit_depth, int edge_thresh";
+      specialize qw/av1_highbd_estimate_noise_from_single_plane neon/;
     }
   }
 
diff --git a/av1/encoder/arm/neon/highbd_temporal_filter_neon.c b/av1/encoder/arm/neon/highbd_temporal_filter_neon.c
index 763eeba..88e176f 100644
--- a/av1/encoder/arm/neon/highbd_temporal_filter_neon.c
+++ b/av1/encoder/arm/neon/highbd_temporal_filter_neon.c
@@ -390,3 +390,173 @@
     plane_offset += plane_h * plane_w;
   }
 }
+
+double av1_highbd_estimate_noise_from_single_plane_neon(const uint16_t *src,
+                                                        int height, int width,
+                                                        int stride,
+                                                        int bitdepth,
+                                                        int edge_thresh) {
+  uint16x8_t thresh = vdupq_n_u16(edge_thresh);
+  uint64x2_t acc = vdupq_n_u64(0);
+  // Count is in theory positive as it counts the number of times we're under
+  // the threshold, but it will be counted negatively in order to make best use
+  // of the vclt instruction, which sets every bit of a lane to 1 when the
+  // condition is true.
+  int32x4_t count = vdupq_n_s32(0);
+  int final_count = 0;
+  uint64_t final_acc = 0;
+  const uint16_t *src_start = src + stride + 1;
+  int h = 1;
+
+  do {
+    int w = 1;
+    const uint16_t *src_ptr = src_start;
+
+    while (w <= (width - 1) - 8) {
+      uint16x8_t mat[3][3];
+      mat[0][0] = vld1q_u16(src_ptr - stride - 1);
+      mat[0][1] = vld1q_u16(src_ptr - stride);
+      mat[0][2] = vld1q_u16(src_ptr - stride + 1);
+      mat[1][0] = vld1q_u16(src_ptr - 1);
+      mat[1][1] = vld1q_u16(src_ptr);
+      mat[1][2] = vld1q_u16(src_ptr + 1);
+      mat[2][0] = vld1q_u16(src_ptr + stride - 1);
+      mat[2][1] = vld1q_u16(src_ptr + stride);
+      mat[2][2] = vld1q_u16(src_ptr + stride + 1);
+
+      // Compute Sobel gradients.
+      uint16x8_t gxa = vaddq_u16(mat[0][0], mat[2][0]);
+      uint16x8_t gxb = vaddq_u16(mat[0][2], mat[2][2]);
+      gxa = vaddq_u16(gxa, vaddq_u16(mat[1][0], mat[1][0]));
+      gxb = vaddq_u16(gxb, vaddq_u16(mat[1][2], mat[1][2]));
+
+      uint16x8_t gya = vaddq_u16(mat[0][0], mat[0][2]);
+      uint16x8_t gyb = vaddq_u16(mat[2][0], mat[2][2]);
+      gya = vaddq_u16(gya, vaddq_u16(mat[0][1], mat[0][1]));
+      gyb = vaddq_u16(gyb, vaddq_u16(mat[2][1], mat[2][1]));
+
+      uint16x8_t ga = vabaq_u16(vabdq_u16(gxa, gxb), gya, gyb);
+      ga = vrshlq_u16(ga, vdupq_n_s16(8 - bitdepth));
+
+      // Check which vector elements are under the threshold. The Laplacian is
+      // then unconditionnally computed and we accumulate zeros if we're not
+      // under the threshold. This is much faster than using an if statement.
+      uint16x8_t thresh_u16 = vcltq_u16(ga, thresh);
+
+      uint16x8_t center = vshlq_n_u16(mat[1][1], 2);
+
+      uint16x8_t adj0 = vaddq_u16(mat[0][1], mat[2][1]);
+      uint16x8_t adj1 = vaddq_u16(mat[1][0], mat[1][2]);
+      uint16x8_t adj = vaddq_u16(adj0, adj1);
+      adj = vaddq_u16(adj, adj);
+
+      uint16x8_t diag0 = vaddq_u16(mat[0][0], mat[0][2]);
+      uint16x8_t diag1 = vaddq_u16(mat[2][0], mat[2][2]);
+      uint16x8_t diag = vaddq_u16(diag0, diag1);
+
+      uint16x8_t v = vabdq_u16(vaddq_u16(center, diag), adj);
+      v = vandq_u16(vrshlq_u16(v, vdupq_n_s16(8 - bitdepth)), thresh_u16);
+      uint32x4_t v_u32 = vpaddlq_u16(v);
+
+      acc = vpadalq_u32(acc, v_u32);
+      // Add -1 for each lane where the gradient is under the threshold.
+      count = vpadalq_s16(count, vreinterpretq_s16_u16(thresh_u16));
+
+      w += 8;
+      src_ptr += 8;
+    }
+
+    if (w <= (width - 1) - 4) {
+      uint16x4_t mat[3][3];
+      mat[0][0] = vld1_u16(src_ptr - stride - 1);
+      mat[0][1] = vld1_u16(src_ptr - stride);
+      mat[0][2] = vld1_u16(src_ptr - stride + 1);
+      mat[1][0] = vld1_u16(src_ptr - 1);
+      mat[1][1] = vld1_u16(src_ptr);
+      mat[1][2] = vld1_u16(src_ptr + 1);
+      mat[2][0] = vld1_u16(src_ptr + stride - 1);
+      mat[2][1] = vld1_u16(src_ptr + stride);
+      mat[2][2] = vld1_u16(src_ptr + stride + 1);
+
+      // Compute Sobel gradients.
+      uint16x4_t gxa = vadd_u16(mat[0][0], mat[2][0]);
+      uint16x4_t gxb = vadd_u16(mat[0][2], mat[2][2]);
+      gxa = vadd_u16(gxa, vadd_u16(mat[1][0], mat[1][0]));
+      gxb = vadd_u16(gxb, vadd_u16(mat[1][2], mat[1][2]));
+
+      uint16x4_t gya = vadd_u16(mat[0][0], mat[0][2]);
+      uint16x4_t gyb = vadd_u16(mat[2][0], mat[2][2]);
+      gya = vadd_u16(gya, vadd_u16(mat[0][1], mat[0][1]));
+      gyb = vadd_u16(gyb, vadd_u16(mat[2][1], mat[2][1]));
+
+      uint16x4_t ga = vaba_u16(vabd_u16(gxa, gxb), gya, gyb);
+      ga = vrshl_u16(ga, vdup_n_s16(8 - bitdepth));
+
+      // Check which vector elements are under the threshold. The Laplacian is
+      // then unconditionnally computed and we accumulate zeros if we're not
+      // under the threshold. This is much faster than using an if statement.
+      uint16x4_t thresh_u16 = vclt_u16(ga, vget_low_u16(thresh));
+
+      uint16x4_t center = vshl_n_u16(mat[1][1], 2);
+
+      uint16x4_t adj0 = vadd_u16(mat[0][1], mat[2][1]);
+      uint16x4_t adj1 = vadd_u16(mat[1][0], mat[1][2]);
+      uint16x4_t adj = vadd_u16(adj0, adj1);
+      adj = vadd_u16(adj, adj);
+
+      uint16x4_t diag0 = vadd_u16(mat[0][0], mat[0][2]);
+      uint16x4_t diag1 = vadd_u16(mat[2][0], mat[2][2]);
+      uint16x4_t diag = vadd_u16(diag0, diag1);
+
+      uint16x4_t v = vabd_u16(vadd_u16(center, diag), adj);
+      v = vand_u16(v, thresh_u16);
+      uint32x4_t v_u32 = vmovl_u16(vrshl_u16(v, vdup_n_s16(8 - bitdepth)));
+
+      acc = vpadalq_u32(acc, v_u32);
+      // Add -1 for each lane where the gradient is under the threshold.
+      count = vaddw_s16(count, vreinterpret_s16_u16(thresh_u16));
+
+      w += 4;
+      src_ptr += 4;
+    }
+
+    while (w < width - 1) {
+      int mat[3][3];
+      mat[0][0] = *(src_ptr - stride - 1);
+      mat[0][1] = *(src_ptr - stride);
+      mat[0][2] = *(src_ptr - stride + 1);
+      mat[1][0] = *(src_ptr - 1);
+      mat[1][1] = *(src_ptr);
+      mat[1][2] = *(src_ptr + 1);
+      mat[2][0] = *(src_ptr + stride - 1);
+      mat[2][1] = *(src_ptr + stride);
+      mat[2][2] = *(src_ptr + stride + 1);
+
+      // Compute Sobel gradients.
+      const int gx = (mat[0][0] - mat[0][2]) + (mat[2][0] - mat[2][2]) +
+                     2 * (mat[1][0] - mat[1][2]);
+      const int gy = (mat[0][0] - mat[2][0]) + (mat[0][2] - mat[2][2]) +
+                     2 * (mat[0][1] - mat[2][1]);
+      const int ga = ROUND_POWER_OF_TWO(abs(gx) + abs(gy), bitdepth - 8);
+
+      // Accumulate Laplacian.
+      const int is_under = ga < edge_thresh;
+      const int v = 4 * mat[1][1] -
+                    2 * (mat[0][1] + mat[2][1] + mat[1][0] + mat[1][2]) +
+                    (mat[0][0] + mat[0][2] + mat[2][0] + mat[2][2]);
+      final_acc += ROUND_POWER_OF_TWO(abs(v), bitdepth - 8) * is_under;
+      final_count += is_under;
+
+      src_ptr++;
+      w++;
+    }
+    src_start += stride;
+  } while (++h < height - 1);
+
+  // We counted negatively, so subtract to get the final value.
+  final_count -= horizontal_add_s32x4(count);
+  final_acc += horizontal_add_u64x2(acc);
+  return (final_count < 16)
+             ? -1.0
+             : (double)final_acc / (6 * final_count) * SQRT_PI_BY_2;
+}
diff --git a/av1/encoder/temporal_filter.c b/av1/encoder/temporal_filter.c
index 9bb1064..d6ae667 100644
--- a/av1/encoder/temporal_filter.c
+++ b/av1/encoder/temporal_filter.c
@@ -1253,11 +1253,11 @@
 }
 
 #if CONFIG_AV1_HIGHBITDEPTH
-double av1_highbd_estimate_noise_from_single_plane(const uint16_t *src16,
-                                                   int height, int width,
-                                                   const int stride,
-                                                   int bit_depth,
-                                                   int edge_thresh) {
+double av1_highbd_estimate_noise_from_single_plane_c(const uint16_t *src16,
+                                                     int height, int width,
+                                                     const int stride,
+                                                     int bit_depth,
+                                                     int edge_thresh) {
   int64_t accum = 0;
   int count = 0;
   for (int i = 1; i < height - 1; ++i) {
diff --git a/test/temporal_filter_test.cc b/test/temporal_filter_test.cc
index f81240c9..85f68b8 100644
--- a/test/temporal_filter_test.cc
+++ b/test/temporal_filter_test.cc
@@ -774,6 +774,15 @@
 
 TEST_P(HBDEstimateNoiseTest, DISABLED_Speed) { SpeedTest(2000); }
 
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, HBDEstimateNoiseTest,
+    ::testing::Combine(
+        ::testing::Values(av1_highbd_estimate_noise_from_single_plane_c),
+        ::testing::Values(av1_highbd_estimate_noise_from_single_plane_neon),
+        ::testing::ValuesIn(kWidths), ::testing::ValuesIn(kHeights),
+        ::testing::ValuesIn({ 8, 10, 12 })));
+#endif  // HAVE_NEON
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 }  // namespace
 #endif