Add Neon implementation of aom_highbd_minmax_8x8
Add Neon implementation of aom_highbd_minmax_8x8 as well as the
corresponding tests.
This is a straightforward port of this libvpx patch:
https://chromium-review.googlesource.com/c/webm/libvpx/+/4334315
Change-Id: I4a4cb59512fa5aeff050c65ca1e79c3497295bc9
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index fb2d934..7321222 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -1193,6 +1193,7 @@
add_proto qw/unsigned int aom_highbd_avg_4x4/, "const uint8_t *, int p";
specialize qw/aom_highbd_avg_4x4 neon/;
add_proto qw/void aom_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
+ specialize qw/aom_highbd_minmax_8x8 neon/;
}
add_proto qw/void aom_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int width, const int height, int norm_factor";
diff --git a/aom_dsp/arm/highbd_avg_neon.c b/aom_dsp/arm/highbd_avg_neon.c
index 41c8903..0483a83 100644
--- a/aom_dsp/arm/highbd_avg_neon.c
+++ b/aom_dsp/arm/highbd_avg_neon.c
@@ -47,3 +47,78 @@
return (horizontal_add_u16x8(sum) + (1 << 5)) >> 6;
}
+
+void aom_highbd_minmax_8x8_neon(const uint8_t *s8, int p, const uint8_t *d8,
+ int dp, int *min, int *max) {
+ const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(s8);
+ const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(d8);
+
+ const uint16x8_t a0 = vld1q_u16(a_ptr + 0 * p);
+ const uint16x8_t a1 = vld1q_u16(a_ptr + 1 * p);
+ const uint16x8_t a2 = vld1q_u16(a_ptr + 2 * p);
+ const uint16x8_t a3 = vld1q_u16(a_ptr + 3 * p);
+ const uint16x8_t a4 = vld1q_u16(a_ptr + 4 * p);
+ const uint16x8_t a5 = vld1q_u16(a_ptr + 5 * p);
+ const uint16x8_t a6 = vld1q_u16(a_ptr + 6 * p);
+ const uint16x8_t a7 = vld1q_u16(a_ptr + 7 * p);
+
+ const uint16x8_t b0 = vld1q_u16(b_ptr + 0 * dp);
+ const uint16x8_t b1 = vld1q_u16(b_ptr + 1 * dp);
+ const uint16x8_t b2 = vld1q_u16(b_ptr + 2 * dp);
+ const uint16x8_t b3 = vld1q_u16(b_ptr + 3 * dp);
+ const uint16x8_t b4 = vld1q_u16(b_ptr + 4 * dp);
+ const uint16x8_t b5 = vld1q_u16(b_ptr + 5 * dp);
+ const uint16x8_t b6 = vld1q_u16(b_ptr + 6 * dp);
+ const uint16x8_t b7 = vld1q_u16(b_ptr + 7 * dp);
+
+ const uint16x8_t abs_diff0 = vabdq_u16(a0, b0);
+ const uint16x8_t abs_diff1 = vabdq_u16(a1, b1);
+ const uint16x8_t abs_diff2 = vabdq_u16(a2, b2);
+ const uint16x8_t abs_diff3 = vabdq_u16(a3, b3);
+ const uint16x8_t abs_diff4 = vabdq_u16(a4, b4);
+ const uint16x8_t abs_diff5 = vabdq_u16(a5, b5);
+ const uint16x8_t abs_diff6 = vabdq_u16(a6, b6);
+ const uint16x8_t abs_diff7 = vabdq_u16(a7, b7);
+
+ const uint16x8_t max01 = vmaxq_u16(abs_diff0, abs_diff1);
+ const uint16x8_t max23 = vmaxq_u16(abs_diff2, abs_diff3);
+ const uint16x8_t max45 = vmaxq_u16(abs_diff4, abs_diff5);
+ const uint16x8_t max67 = vmaxq_u16(abs_diff6, abs_diff7);
+
+ const uint16x8_t max0123 = vmaxq_u16(max01, max23);
+ const uint16x8_t max4567 = vmaxq_u16(max45, max67);
+ const uint16x8_t max07 = vmaxq_u16(max0123, max4567);
+
+ const uint16x8_t min01 = vminq_u16(abs_diff0, abs_diff1);
+ const uint16x8_t min23 = vminq_u16(abs_diff2, abs_diff3);
+ const uint16x8_t min45 = vminq_u16(abs_diff4, abs_diff5);
+ const uint16x8_t min67 = vminq_u16(abs_diff6, abs_diff7);
+
+ const uint16x8_t min0123 = vminq_u16(min01, min23);
+ const uint16x8_t min4567 = vminq_u16(min45, min67);
+ const uint16x8_t min07 = vminq_u16(min0123, min4567);
+
+#if defined(__aarch64__)
+ *max = (int)vmaxvq_u16(max07);
+ *min = (int)vminvq_u16(min07);
+#else
+ // Split into 64-bit vectors and execute pairwise min/max.
+ uint16x4_t ab_max = vmax_u16(vget_high_u16(max07), vget_low_u16(max07));
+ uint16x4_t ab_min = vmin_u16(vget_high_u16(min07), vget_low_u16(min07));
+
+ // Enough runs of vpmax/min propagate the max/min values to every position.
+ ab_max = vpmax_u16(ab_max, ab_max);
+ ab_min = vpmin_u16(ab_min, ab_min);
+
+ ab_max = vpmax_u16(ab_max, ab_max);
+ ab_min = vpmin_u16(ab_min, ab_min);
+
+ ab_max = vpmax_u16(ab_max, ab_max);
+ ab_min = vpmin_u16(ab_min, ab_min);
+
+ *min = *max = 0; // Clear high bits
+ // Store directly to avoid costly neon->gpr transfer.
+ vst1_lane_u16((uint16_t *)max, ab_max, 0);
+ vst1_lane_u16((uint16_t *)min, ab_min, 0);
+#endif
+}
diff --git a/test/minmax_test.cc b/test/minmax_test.cc
index 1ba1f9d..cf67b7b 100644
--- a/test/minmax_test.cc
+++ b/test/minmax_test.cc
@@ -226,6 +226,10 @@
#if CONFIG_AV1_HIGHBITDEPTH
INSTANTIATE_TEST_SUITE_P(C, HBDMinMaxTest,
::testing::Values(&aom_highbd_minmax_8x8_c));
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, HBDMinMaxTest,
+ ::testing::Values(&aom_highbd_minmax_8x8_neon));
+#endif
#endif
#if HAVE_SSE2