Add Neon implementation of HBD estimate_noise_from_single_plane
Add Neon implementation of av1_highbd_estimate_noise_from_single_plane
as well as the corresponding tests.
Change-Id: Iefa204b497f6195241326e56ba74e4cc96ad609c
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index f76edea..b9f228b 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -405,6 +405,7 @@
specialize qw/av1_highbd_apply_temporal_filter sse2 avx2 neon/;
add_proto qw/double av1_highbd_estimate_noise_from_single_plane/, "const uint16_t *src, int height, int width, int stride, int bit_depth, int edge_thresh";
+ specialize qw/av1_highbd_estimate_noise_from_single_plane neon/;
}
}
diff --git a/av1/encoder/arm/neon/highbd_temporal_filter_neon.c b/av1/encoder/arm/neon/highbd_temporal_filter_neon.c
index 763eeba..88e176f 100644
--- a/av1/encoder/arm/neon/highbd_temporal_filter_neon.c
+++ b/av1/encoder/arm/neon/highbd_temporal_filter_neon.c
@@ -390,3 +390,173 @@
plane_offset += plane_h * plane_w;
}
}
+
+double av1_highbd_estimate_noise_from_single_plane_neon(const uint16_t *src,
+ int height, int width,
+ int stride,
+ int bitdepth,
+ int edge_thresh) {
+ uint16x8_t thresh = vdupq_n_u16(edge_thresh);
+ uint64x2_t acc = vdupq_n_u64(0);
+ // Count is in theory positive as it counts the number of times we're under
+ // the threshold, but it will be counted negatively in order to make best use
+ // of the vclt instruction, which sets every bit of a lane to 1 when the
+ // condition is true.
+ int32x4_t count = vdupq_n_s32(0);
+ int final_count = 0;
+ uint64_t final_acc = 0;
+ const uint16_t *src_start = src + stride + 1;
+ int h = 1;
+
+ do {
+ int w = 1;
+ const uint16_t *src_ptr = src_start;
+
+ while (w <= (width - 1) - 8) {
+ uint16x8_t mat[3][3];
+ mat[0][0] = vld1q_u16(src_ptr - stride - 1);
+ mat[0][1] = vld1q_u16(src_ptr - stride);
+ mat[0][2] = vld1q_u16(src_ptr - stride + 1);
+ mat[1][0] = vld1q_u16(src_ptr - 1);
+ mat[1][1] = vld1q_u16(src_ptr);
+ mat[1][2] = vld1q_u16(src_ptr + 1);
+ mat[2][0] = vld1q_u16(src_ptr + stride - 1);
+ mat[2][1] = vld1q_u16(src_ptr + stride);
+ mat[2][2] = vld1q_u16(src_ptr + stride + 1);
+
+ // Compute Sobel gradients.
+ uint16x8_t gxa = vaddq_u16(mat[0][0], mat[2][0]);
+ uint16x8_t gxb = vaddq_u16(mat[0][2], mat[2][2]);
+ gxa = vaddq_u16(gxa, vaddq_u16(mat[1][0], mat[1][0]));
+ gxb = vaddq_u16(gxb, vaddq_u16(mat[1][2], mat[1][2]));
+
+ uint16x8_t gya = vaddq_u16(mat[0][0], mat[0][2]);
+ uint16x8_t gyb = vaddq_u16(mat[2][0], mat[2][2]);
+ gya = vaddq_u16(gya, vaddq_u16(mat[0][1], mat[0][1]));
+ gyb = vaddq_u16(gyb, vaddq_u16(mat[2][1], mat[2][1]));
+
+ uint16x8_t ga = vabaq_u16(vabdq_u16(gxa, gxb), gya, gyb);
+ ga = vrshlq_u16(ga, vdupq_n_s16(8 - bitdepth));
+
+ // Check which vector elements are under the threshold. The Laplacian is
+ // then unconditionnally computed and we accumulate zeros if we're not
+ // under the threshold. This is much faster than using an if statement.
+ uint16x8_t thresh_u16 = vcltq_u16(ga, thresh);
+
+ uint16x8_t center = vshlq_n_u16(mat[1][1], 2);
+
+ uint16x8_t adj0 = vaddq_u16(mat[0][1], mat[2][1]);
+ uint16x8_t adj1 = vaddq_u16(mat[1][0], mat[1][2]);
+ uint16x8_t adj = vaddq_u16(adj0, adj1);
+ adj = vaddq_u16(adj, adj);
+
+ uint16x8_t diag0 = vaddq_u16(mat[0][0], mat[0][2]);
+ uint16x8_t diag1 = vaddq_u16(mat[2][0], mat[2][2]);
+ uint16x8_t diag = vaddq_u16(diag0, diag1);
+
+ uint16x8_t v = vabdq_u16(vaddq_u16(center, diag), adj);
+ v = vandq_u16(vrshlq_u16(v, vdupq_n_s16(8 - bitdepth)), thresh_u16);
+ uint32x4_t v_u32 = vpaddlq_u16(v);
+
+ acc = vpadalq_u32(acc, v_u32);
+ // Add -1 for each lane where the gradient is under the threshold.
+ count = vpadalq_s16(count, vreinterpretq_s16_u16(thresh_u16));
+
+ w += 8;
+ src_ptr += 8;
+ }
+
+ if (w <= (width - 1) - 4) {
+ uint16x4_t mat[3][3];
+ mat[0][0] = vld1_u16(src_ptr - stride - 1);
+ mat[0][1] = vld1_u16(src_ptr - stride);
+ mat[0][2] = vld1_u16(src_ptr - stride + 1);
+ mat[1][0] = vld1_u16(src_ptr - 1);
+ mat[1][1] = vld1_u16(src_ptr);
+ mat[1][2] = vld1_u16(src_ptr + 1);
+ mat[2][0] = vld1_u16(src_ptr + stride - 1);
+ mat[2][1] = vld1_u16(src_ptr + stride);
+ mat[2][2] = vld1_u16(src_ptr + stride + 1);
+
+ // Compute Sobel gradients.
+ uint16x4_t gxa = vadd_u16(mat[0][0], mat[2][0]);
+ uint16x4_t gxb = vadd_u16(mat[0][2], mat[2][2]);
+ gxa = vadd_u16(gxa, vadd_u16(mat[1][0], mat[1][0]));
+ gxb = vadd_u16(gxb, vadd_u16(mat[1][2], mat[1][2]));
+
+ uint16x4_t gya = vadd_u16(mat[0][0], mat[0][2]);
+ uint16x4_t gyb = vadd_u16(mat[2][0], mat[2][2]);
+ gya = vadd_u16(gya, vadd_u16(mat[0][1], mat[0][1]));
+ gyb = vadd_u16(gyb, vadd_u16(mat[2][1], mat[2][1]));
+
+ uint16x4_t ga = vaba_u16(vabd_u16(gxa, gxb), gya, gyb);
+ ga = vrshl_u16(ga, vdup_n_s16(8 - bitdepth));
+
+ // Check which vector elements are under the threshold. The Laplacian is
+ // then unconditionnally computed and we accumulate zeros if we're not
+ // under the threshold. This is much faster than using an if statement.
+ uint16x4_t thresh_u16 = vclt_u16(ga, vget_low_u16(thresh));
+
+ uint16x4_t center = vshl_n_u16(mat[1][1], 2);
+
+ uint16x4_t adj0 = vadd_u16(mat[0][1], mat[2][1]);
+ uint16x4_t adj1 = vadd_u16(mat[1][0], mat[1][2]);
+ uint16x4_t adj = vadd_u16(adj0, adj1);
+ adj = vadd_u16(adj, adj);
+
+ uint16x4_t diag0 = vadd_u16(mat[0][0], mat[0][2]);
+ uint16x4_t diag1 = vadd_u16(mat[2][0], mat[2][2]);
+ uint16x4_t diag = vadd_u16(diag0, diag1);
+
+ uint16x4_t v = vabd_u16(vadd_u16(center, diag), adj);
+ v = vand_u16(v, thresh_u16);
+ uint32x4_t v_u32 = vmovl_u16(vrshl_u16(v, vdup_n_s16(8 - bitdepth)));
+
+ acc = vpadalq_u32(acc, v_u32);
+ // Add -1 for each lane where the gradient is under the threshold.
+ count = vaddw_s16(count, vreinterpret_s16_u16(thresh_u16));
+
+ w += 4;
+ src_ptr += 4;
+ }
+
+ while (w < width - 1) {
+ int mat[3][3];
+ mat[0][0] = *(src_ptr - stride - 1);
+ mat[0][1] = *(src_ptr - stride);
+ mat[0][2] = *(src_ptr - stride + 1);
+ mat[1][0] = *(src_ptr - 1);
+ mat[1][1] = *(src_ptr);
+ mat[1][2] = *(src_ptr + 1);
+ mat[2][0] = *(src_ptr + stride - 1);
+ mat[2][1] = *(src_ptr + stride);
+ mat[2][2] = *(src_ptr + stride + 1);
+
+ // Compute Sobel gradients.
+ const int gx = (mat[0][0] - mat[0][2]) + (mat[2][0] - mat[2][2]) +
+ 2 * (mat[1][0] - mat[1][2]);
+ const int gy = (mat[0][0] - mat[2][0]) + (mat[0][2] - mat[2][2]) +
+ 2 * (mat[0][1] - mat[2][1]);
+ const int ga = ROUND_POWER_OF_TWO(abs(gx) + abs(gy), bitdepth - 8);
+
+ // Accumulate Laplacian.
+ const int is_under = ga < edge_thresh;
+ const int v = 4 * mat[1][1] -
+ 2 * (mat[0][1] + mat[2][1] + mat[1][0] + mat[1][2]) +
+ (mat[0][0] + mat[0][2] + mat[2][0] + mat[2][2]);
+ final_acc += ROUND_POWER_OF_TWO(abs(v), bitdepth - 8) * is_under;
+ final_count += is_under;
+
+ src_ptr++;
+ w++;
+ }
+ src_start += stride;
+ } while (++h < height - 1);
+
+ // We counted negatively, so subtract to get the final value.
+ final_count -= horizontal_add_s32x4(count);
+ final_acc += horizontal_add_u64x2(acc);
+ return (final_count < 16)
+ ? -1.0
+ : (double)final_acc / (6 * final_count) * SQRT_PI_BY_2;
+}
diff --git a/av1/encoder/temporal_filter.c b/av1/encoder/temporal_filter.c
index 9bb1064..d6ae667 100644
--- a/av1/encoder/temporal_filter.c
+++ b/av1/encoder/temporal_filter.c
@@ -1253,11 +1253,11 @@
}
#if CONFIG_AV1_HIGHBITDEPTH
-double av1_highbd_estimate_noise_from_single_plane(const uint16_t *src16,
- int height, int width,
- const int stride,
- int bit_depth,
- int edge_thresh) {
+double av1_highbd_estimate_noise_from_single_plane_c(const uint16_t *src16,
+ int height, int width,
+ const int stride,
+ int bit_depth,
+ int edge_thresh) {
int64_t accum = 0;
int count = 0;
for (int i = 1; i < height - 1; ++i) {
diff --git a/test/temporal_filter_test.cc b/test/temporal_filter_test.cc
index f81240c9..85f68b8 100644
--- a/test/temporal_filter_test.cc
+++ b/test/temporal_filter_test.cc
@@ -774,6 +774,15 @@
TEST_P(HBDEstimateNoiseTest, DISABLED_Speed) { SpeedTest(2000); }
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, HBDEstimateNoiseTest,
+ ::testing::Combine(
+ ::testing::Values(av1_highbd_estimate_noise_from_single_plane_c),
+ ::testing::Values(av1_highbd_estimate_noise_from_single_plane_neon),
+ ::testing::ValuesIn(kWidths), ::testing::ValuesIn(kHeights),
+ ::testing::ValuesIn({ 8, 10, 12 })));
+#endif // HAVE_NEON
#endif // CONFIG_AV1_HIGHBITDEPTH
} // namespace
#endif