Add Neon implementations for remaining v predictors
We already have Neon implementations of the v predictors for all square
block sizes except 64x64, so add the remaining cases and update
tests/speed to match.
On Neoverse V1, these new implementations are worth about a 10%
improvement for the largest and smallest cases on both Clang 15 and GCC
12, and around parity for the remainder with both compilers.
Change-Id: I945f550608e75e978a48b253c2d81e04955c0be5
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index dab303c..0bf1a9c 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -146,24 +146,24 @@
specialize qw/aom_dc_128_predictor_64x64 neon sse2 avx2/;
specialize qw/aom_v_predictor_4x4 neon sse2/;
-specialize qw/aom_v_predictor_4x8 sse2/;
-specialize qw/aom_v_predictor_4x16 sse2/;
-specialize qw/aom_v_predictor_8x4 sse2/;
+specialize qw/aom_v_predictor_4x8 neon sse2/;
+specialize qw/aom_v_predictor_4x16 neon sse2/;
+specialize qw/aom_v_predictor_8x4 neon sse2/;
specialize qw/aom_v_predictor_8x8 neon sse2/;
-specialize qw/aom_v_predictor_8x16 sse2/;
-specialize qw/aom_v_predictor_8x32 sse2/;
-specialize qw/aom_v_predictor_16x4 sse2/;
-specialize qw/aom_v_predictor_16x8 sse2/;
+specialize qw/aom_v_predictor_8x16 neon sse2/;
+specialize qw/aom_v_predictor_8x32 neon sse2/;
+specialize qw/aom_v_predictor_16x4 neon sse2/;
+specialize qw/aom_v_predictor_16x8 neon sse2/;
specialize qw/aom_v_predictor_16x16 neon sse2/;
-specialize qw/aom_v_predictor_16x32 sse2/;
-specialize qw/aom_v_predictor_16x64 sse2/;
-specialize qw/aom_v_predictor_32x8 sse2/;
-specialize qw/aom_v_predictor_32x16 sse2 avx2/;
+specialize qw/aom_v_predictor_16x32 neon sse2/;
+specialize qw/aom_v_predictor_16x64 neon sse2/;
+specialize qw/aom_v_predictor_32x8 neon sse2/;
+specialize qw/aom_v_predictor_32x16 neon sse2 avx2/;
specialize qw/aom_v_predictor_32x32 neon sse2 avx2/;
-specialize qw/aom_v_predictor_32x64 sse2 avx2/;
-specialize qw/aom_v_predictor_64x16 sse2 avx2/;
-specialize qw/aom_v_predictor_64x32 sse2 avx2/;
-specialize qw/aom_v_predictor_64x64 sse2 avx2/;
+specialize qw/aom_v_predictor_32x64 neon sse2 avx2/;
+specialize qw/aom_v_predictor_64x16 neon sse2 avx2/;
+specialize qw/aom_v_predictor_64x32 neon sse2 avx2/;
+specialize qw/aom_v_predictor_64x64 neon sse2 avx2/;
specialize qw/aom_h_predictor_4x4 neon sse2/;
specialize qw/aom_h_predictor_4x8 sse2/;
diff --git a/aom_dsp/arm/intrapred_neon.c b/aom_dsp/arm/intrapred_neon.c
index c724fa5..5aa401b 100644
--- a/aom_dsp/arm/intrapred_neon.c
+++ b/aom_dsp/arm/intrapred_neon.c
@@ -603,52 +603,186 @@
vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
}
+// -----------------------------------------------------------------------------
+
+static INLINE void v_store_4xh(uint8_t *dst, ptrdiff_t stride, int h,
+ uint8x8_t d0) {
+ for (int i = 0; i < h; ++i) {
+ store_u8_4x1(dst + i * stride, d0, 0);
+ }
+}
+
+static INLINE void v_store_8xh(uint8_t *dst, ptrdiff_t stride, int h,
+ uint8x8_t d0) {
+ for (int i = 0; i < h; ++i) {
+ vst1_u8(dst + i * stride, d0);
+ }
+}
+
+static INLINE void v_store_16xh(uint8_t *dst, ptrdiff_t stride, int h,
+ uint8x16_t d0) {
+ for (int i = 0; i < h; ++i) {
+ vst1q_u8(dst + i * stride, d0);
+ }
+}
+
+static INLINE void v_store_32xh(uint8_t *dst, ptrdiff_t stride, int h,
+ uint8x16_t d0, uint8x16_t d1) {
+ for (int i = 0; i < h; ++i) {
+ vst1q_u8(dst + 0, d0);
+ vst1q_u8(dst + 16, d1);
+ dst += stride;
+ }
+}
+
+static INLINE void v_store_64xh(uint8_t *dst, ptrdiff_t stride, int h,
+ uint8x16_t d0, uint8x16_t d1, uint8x16_t d2,
+ uint8x16_t d3) {
+ for (int i = 0; i < h; ++i) {
+ vst1q_u8(dst + 0, d0);
+ vst1q_u8(dst + 16, d1);
+ vst1q_u8(dst + 32, d2);
+ vst1q_u8(dst + 48, d3);
+ dst += stride;
+ }
+}
+
void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- int i;
- uint32x2_t d0u32 = vdup_n_u32(0);
(void)left;
-
- d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0);
- for (i = 0; i < 4; i++, dst += stride)
- vst1_lane_u32((uint32_t *)dst, d0u32, 0);
+ v_store_4xh(dst, stride, 4, load_u8_4x1_lane0(above));
}
void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- int i;
- uint8x8_t d0u8 = vdup_n_u8(0);
(void)left;
-
- d0u8 = vld1_u8(above);
- for (i = 0; i < 8; i++, dst += stride) vst1_u8(dst, d0u8);
+ v_store_8xh(dst, stride, 8, vld1_u8(above));
}
void aom_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- int i;
- uint8x16_t q0u8 = vdupq_n_u8(0);
(void)left;
-
- q0u8 = vld1q_u8(above);
- for (i = 0; i < 16; i++, dst += stride) vst1q_u8(dst, q0u8);
+ v_store_16xh(dst, stride, 16, vld1q_u8(above));
}
void aom_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- int i;
- uint8x16_t q0u8 = vdupq_n_u8(0);
- uint8x16_t q1u8 = vdupq_n_u8(0);
+ const uint8x16_t d0 = vld1q_u8(above);
+ const uint8x16_t d1 = vld1q_u8(above + 16);
(void)left;
-
- q0u8 = vld1q_u8(above);
- q1u8 = vld1q_u8(above + 16);
- for (i = 0; i < 32; i++, dst += stride) {
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q1u8);
- }
+ v_store_32xh(dst, stride, 32, d0, d1);
}
+void aom_v_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_store_4xh(dst, stride, 8, load_u8_4x1_lane0(above));
+}
+
+void aom_v_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_store_4xh(dst, stride, 16, load_u8_4x1_lane0(above));
+}
+
+void aom_v_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_store_8xh(dst, stride, 4, vld1_u8(above));
+}
+
+void aom_v_predictor_8x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_store_8xh(dst, stride, 16, vld1_u8(above));
+}
+
+void aom_v_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_store_8xh(dst, stride, 32, vld1_u8(above));
+}
+
+void aom_v_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_store_16xh(dst, stride, 4, vld1q_u8(above));
+}
+
+void aom_v_predictor_16x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_store_16xh(dst, stride, 8, vld1q_u8(above));
+}
+
+void aom_v_predictor_16x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_store_16xh(dst, stride, 32, vld1q_u8(above));
+}
+
+void aom_v_predictor_16x64_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_store_16xh(dst, stride, 64, vld1q_u8(above));
+}
+
+void aom_v_predictor_32x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vld1q_u8(above);
+ const uint8x16_t d1 = vld1q_u8(above + 16);
+ (void)left;
+ v_store_32xh(dst, stride, 8, d0, d1);
+}
+
+void aom_v_predictor_32x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vld1q_u8(above);
+ const uint8x16_t d1 = vld1q_u8(above + 16);
+ (void)left;
+ v_store_32xh(dst, stride, 16, d0, d1);
+}
+
+void aom_v_predictor_32x64_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vld1q_u8(above);
+ const uint8x16_t d1 = vld1q_u8(above + 16);
+ (void)left;
+ v_store_32xh(dst, stride, 64, d0, d1);
+}
+
+void aom_v_predictor_64x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vld1q_u8(above);
+ const uint8x16_t d1 = vld1q_u8(above + 16);
+ const uint8x16_t d2 = vld1q_u8(above + 32);
+ const uint8x16_t d3 = vld1q_u8(above + 48);
+ (void)left;
+ v_store_64xh(dst, stride, 16, d0, d1, d2, d3);
+}
+
+void aom_v_predictor_64x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vld1q_u8(above);
+ const uint8x16_t d1 = vld1q_u8(above + 16);
+ const uint8x16_t d2 = vld1q_u8(above + 32);
+ const uint8x16_t d3 = vld1q_u8(above + 48);
+ (void)left;
+ v_store_64xh(dst, stride, 32, d0, d1, d2, d3);
+}
+
+void aom_v_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vld1q_u8(above);
+ const uint8x16_t d1 = vld1q_u8(above + 16);
+ const uint8x16_t d2 = vld1q_u8(above + 32);
+ const uint8x16_t d3 = vld1q_u8(above + 48);
+ (void)left;
+ v_store_64xh(dst, stride, 64, d0, d1, d2, d3);
+}
+
+// -----------------------------------------------------------------------------
+
void aom_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
uint8x8_t d0u8 = vdup_n_u8(0);
diff --git a/test/intrapred_test.cc b/test/intrapred_test.cc
index 01a5dc2..47f35f5 100644
--- a/test/intrapred_test.cc
+++ b/test/intrapred_test.cc
@@ -342,9 +342,7 @@
const IntraPredFunc<IntraPred> LowbdIntraPredTestVectorNeon[] = {
lowbd_intrapred(dc, neon), lowbd_intrapred(dc_top, neon),
lowbd_intrapred(dc_left, neon), lowbd_intrapred(dc_128, neon),
-
- lowbd_entry(v, 4, 4, neon), lowbd_entry(v, 8, 8, neon),
- lowbd_entry(v, 16, 16, neon), lowbd_entry(v, 32, 32, neon),
+ lowbd_intrapred(v, neon),
lowbd_entry(h, 4, 4, neon), lowbd_entry(h, 8, 8, neon),
lowbd_entry(h, 16, 16, neon), lowbd_entry(h, 32, 32, neon),
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index 526f678..e3751fb 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -470,14 +470,15 @@
aom_smooth_h_predictor_4x4_neon)
INTRA_PRED_TEST(NEON, TX_4X8, aom_dc_predictor_4x8_neon,
aom_dc_left_predictor_4x8_neon, aom_dc_top_predictor_4x8_neon,
- aom_dc_128_predictor_4x8_neon, nullptr, nullptr,
- aom_paeth_predictor_4x8_neon, aom_smooth_predictor_4x8_neon,
- aom_smooth_v_predictor_4x8_neon,
+ aom_dc_128_predictor_4x8_neon, aom_v_predictor_4x8_neon,
+ nullptr, aom_paeth_predictor_4x8_neon,
+ aom_smooth_predictor_4x8_neon, aom_smooth_v_predictor_4x8_neon,
aom_smooth_h_predictor_4x8_neon)
INTRA_PRED_TEST(NEON, TX_4X16, aom_dc_predictor_4x16_neon,
aom_dc_left_predictor_4x16_neon, aom_dc_top_predictor_4x16_neon,
- aom_dc_128_predictor_4x16_neon, nullptr, nullptr,
- aom_paeth_predictor_4x16_neon, aom_smooth_predictor_4x16_neon,
+ aom_dc_128_predictor_4x16_neon, aom_v_predictor_4x16_neon,
+ nullptr, aom_paeth_predictor_4x16_neon,
+ aom_smooth_predictor_4x16_neon,
aom_smooth_v_predictor_4x16_neon,
aom_smooth_h_predictor_4x16_neon)
#endif // HAVE_NEON
@@ -560,20 +561,22 @@
aom_smooth_h_predictor_8x8_neon)
INTRA_PRED_TEST(NEON, TX_8X4, aom_dc_predictor_8x4_neon,
aom_dc_left_predictor_8x4_neon, aom_dc_top_predictor_8x4_neon,
- aom_dc_128_predictor_8x4_neon, nullptr, nullptr,
- aom_paeth_predictor_8x4_neon, aom_smooth_predictor_8x4_neon,
- aom_smooth_v_predictor_8x4_neon,
+ aom_dc_128_predictor_8x4_neon, aom_v_predictor_8x4_neon,
+ nullptr, aom_paeth_predictor_8x4_neon,
+ aom_smooth_predictor_8x4_neon, aom_smooth_v_predictor_8x4_neon,
aom_smooth_h_predictor_8x4_neon)
INTRA_PRED_TEST(NEON, TX_8X16, aom_dc_predictor_8x16_neon,
aom_dc_left_predictor_8x16_neon, aom_dc_top_predictor_8x16_neon,
- aom_dc_128_predictor_8x16_neon, nullptr, nullptr,
- aom_paeth_predictor_8x16_neon, aom_smooth_predictor_8x16_neon,
+ aom_dc_128_predictor_8x16_neon, aom_v_predictor_8x16_neon,
+ nullptr, aom_paeth_predictor_8x16_neon,
+ aom_smooth_predictor_8x16_neon,
aom_smooth_v_predictor_8x16_neon,
aom_smooth_h_predictor_8x16_neon)
INTRA_PRED_TEST(NEON, TX_8X32, aom_dc_predictor_8x32_neon,
aom_dc_left_predictor_8x32_neon, aom_dc_top_predictor_8x32_neon,
- aom_dc_128_predictor_8x32_neon, nullptr, nullptr,
- aom_paeth_predictor_8x32_neon, aom_smooth_predictor_8x32_neon,
+ aom_dc_128_predictor_8x32_neon, aom_v_predictor_8x32_neon,
+ nullptr, aom_paeth_predictor_8x32_neon,
+ aom_smooth_predictor_8x32_neon,
aom_smooth_v_predictor_8x32_neon,
aom_smooth_h_predictor_8x32_neon)
#endif // HAVE_NEON
@@ -692,28 +695,32 @@
aom_smooth_h_predictor_16x16_neon)
INTRA_PRED_TEST(NEON, TX_16X8, aom_dc_predictor_16x8_neon,
aom_dc_left_predictor_16x8_neon, aom_dc_top_predictor_16x8_neon,
- aom_dc_128_predictor_16x8_neon, nullptr, nullptr,
- aom_paeth_predictor_16x8_neon, aom_smooth_predictor_16x8_neon,
+ aom_dc_128_predictor_16x8_neon, aom_v_predictor_16x8_neon,
+ nullptr, aom_paeth_predictor_16x8_neon,
+ aom_smooth_predictor_16x8_neon,
aom_smooth_v_predictor_16x8_neon,
aom_smooth_h_predictor_16x8_neon)
INTRA_PRED_TEST(NEON, TX_16X32, aom_dc_predictor_16x32_neon,
aom_dc_left_predictor_16x32_neon,
aom_dc_top_predictor_16x32_neon,
- aom_dc_128_predictor_16x32_neon, nullptr, nullptr,
- aom_paeth_predictor_16x32_neon, aom_smooth_predictor_16x32_neon,
+ aom_dc_128_predictor_16x32_neon, aom_v_predictor_16x32_neon,
+ nullptr, aom_paeth_predictor_16x32_neon,
+ aom_smooth_predictor_16x32_neon,
aom_smooth_v_predictor_16x32_neon,
aom_smooth_h_predictor_16x32_neon)
INTRA_PRED_TEST(NEON, TX_16X4, aom_dc_predictor_16x4_neon,
aom_dc_left_predictor_16x4_neon, aom_dc_top_predictor_16x4_neon,
- aom_dc_128_predictor_16x4_neon, nullptr, nullptr,
- aom_paeth_predictor_16x4_neon, aom_smooth_predictor_16x4_neon,
+ aom_dc_128_predictor_16x4_neon, aom_v_predictor_16x4_neon,
+ nullptr, aom_paeth_predictor_16x4_neon,
+ aom_smooth_predictor_16x4_neon,
aom_smooth_v_predictor_16x4_neon,
aom_smooth_h_predictor_16x4_neon)
INTRA_PRED_TEST(NEON, TX_16X64, aom_dc_predictor_16x64_neon,
aom_dc_left_predictor_16x64_neon,
aom_dc_top_predictor_16x64_neon,
- aom_dc_128_predictor_16x64_neon, nullptr, nullptr,
- aom_paeth_predictor_16x64_neon, aom_smooth_predictor_16x64_neon,
+ aom_dc_128_predictor_16x64_neon, aom_v_predictor_16x64_neon,
+ nullptr, aom_paeth_predictor_16x64_neon,
+ aom_smooth_predictor_16x64_neon,
aom_smooth_v_predictor_16x64_neon,
aom_smooth_h_predictor_16x64_neon)
#endif // HAVE_NEON
@@ -824,21 +831,24 @@
INTRA_PRED_TEST(NEON, TX_32X16, aom_dc_predictor_32x16_neon,
aom_dc_left_predictor_32x16_neon,
aom_dc_top_predictor_32x16_neon,
- aom_dc_128_predictor_32x16_neon, nullptr, nullptr,
- aom_paeth_predictor_32x16_neon, aom_smooth_predictor_32x16_neon,
+ aom_dc_128_predictor_32x16_neon, aom_v_predictor_32x16_neon,
+ nullptr, aom_paeth_predictor_32x16_neon,
+ aom_smooth_predictor_32x16_neon,
aom_smooth_v_predictor_32x16_neon,
aom_smooth_h_predictor_32x16_neon)
INTRA_PRED_TEST(NEON, TX_32X64, aom_dc_predictor_32x64_neon,
aom_dc_left_predictor_32x64_neon,
aom_dc_top_predictor_32x64_neon,
- aom_dc_128_predictor_32x64_neon, nullptr, nullptr,
- aom_paeth_predictor_32x64_neon, aom_smooth_predictor_32x64_neon,
+ aom_dc_128_predictor_32x64_neon, aom_v_predictor_32x64_neon,
+ nullptr, aom_paeth_predictor_32x64_neon,
+ aom_smooth_predictor_32x64_neon,
aom_smooth_v_predictor_32x64_neon,
aom_smooth_h_predictor_32x64_neon)
INTRA_PRED_TEST(NEON, TX_32X8, aom_dc_predictor_32x8_neon,
aom_dc_left_predictor_32x8_neon, aom_dc_top_predictor_32x8_neon,
- aom_dc_128_predictor_32x8_neon, nullptr, nullptr,
- aom_paeth_predictor_32x8_neon, aom_smooth_predictor_32x8_neon,
+ aom_dc_128_predictor_32x8_neon, aom_v_predictor_32x8_neon,
+ nullptr, aom_paeth_predictor_32x8_neon,
+ aom_smooth_predictor_32x8_neon,
aom_smooth_v_predictor_32x8_neon,
aom_smooth_h_predictor_32x8_neon)
#endif // HAVE_NEON
@@ -926,22 +936,25 @@
INTRA_PRED_TEST(NEON, TX_64X64, aom_dc_predictor_64x64_neon,
aom_dc_left_predictor_64x64_neon,
aom_dc_top_predictor_64x64_neon,
- aom_dc_128_predictor_64x64_neon, nullptr, nullptr,
- aom_paeth_predictor_64x64_neon, aom_smooth_predictor_64x64_neon,
+ aom_dc_128_predictor_64x64_neon, aom_v_predictor_64x64_neon,
+ nullptr, aom_paeth_predictor_64x64_neon,
+ aom_smooth_predictor_64x64_neon,
aom_smooth_v_predictor_64x64_neon,
aom_smooth_h_predictor_64x64_neon)
INTRA_PRED_TEST(NEON, TX_64X32, aom_dc_predictor_64x32_neon,
aom_dc_left_predictor_64x32_neon,
aom_dc_top_predictor_64x32_neon,
- aom_dc_128_predictor_64x32_neon, nullptr, nullptr,
- aom_paeth_predictor_64x32_neon, aom_smooth_predictor_64x32_neon,
+ aom_dc_128_predictor_64x32_neon, aom_v_predictor_64x32_neon,
+ nullptr, aom_paeth_predictor_64x32_neon,
+ aom_smooth_predictor_64x32_neon,
aom_smooth_v_predictor_64x32_neon,
aom_smooth_h_predictor_64x32_neon)
INTRA_PRED_TEST(NEON, TX_64X16, aom_dc_predictor_64x16_neon,
aom_dc_left_predictor_64x16_neon,
aom_dc_top_predictor_64x16_neon,
- aom_dc_128_predictor_64x16_neon, nullptr, nullptr,
- aom_paeth_predictor_64x16_neon, aom_smooth_predictor_64x16_neon,
+ aom_dc_128_predictor_64x16_neon, aom_v_predictor_64x16_neon,
+ nullptr, aom_paeth_predictor_64x16_neon,
+ aom_smooth_predictor_64x16_neon,
aom_smooth_v_predictor_64x16_neon,
aom_smooth_h_predictor_64x16_neon)
#endif // HAVE_NEON