{,highbd_}intrapred_neon.c: Fix unaligned accesses in z2 preds
The z2 predictors load 32-bit chunks of data at variable offsets,
however on AArch32 hardware with alignment checks enabled this can cause
alignment faults since the data is not guaranteed to be 32-bit aligned.
To work around this we can make use of the load_unaligned_u8_4x1 in some
cases and in others simply memcpy to avoid making any alignment
guarantees for the data being loaded.
This also reverts commit 5a46d2961fb233c8f099a7bc18a7a54c8883813b,
re-enabling the predictors for 32-bit Arm platforms.
Bug: b/349428506
Change-Id: Ib15a7993e50b4bb6dd2eb4de95fba1dc33d3cd3f
diff --git a/aom_dsp/arm/highbd_intrapred_neon.c b/aom_dsp/arm/highbd_intrapred_neon.c
index 5e6118d..71d133e 100644
--- a/aom_dsp/arm/highbd_intrapred_neon.c
+++ b/aom_dsp/arm/highbd_intrapred_neon.c
@@ -16,6 +16,7 @@
#include "config/av1_rtcd.h"
#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/sum_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "aom_dsp/intrapred_common.h"
@@ -1604,8 +1605,6 @@
}
#endif // AOM_ARCH_AARCH64
-// TODO(aomedia:349428506): enable this for armv7 after SIGBUS is fixed.
-#if AOM_ARCH_AARCH64
static AOM_FORCE_INLINE uint16x4x2_t highbd_dr_prediction_z2_gather_left_x4(
const uint16_t *left, const int16x4_t indices, int n) {
assert(n > 0);
@@ -1625,13 +1624,13 @@
// At time of writing both Clang and GCC produced better code with these
// nested if-statements compared to a switch statement with fallthrough.
- ret0_u32 = vld1_lane_u32((const uint32_t *)(left + idx0), ret0_u32, 0);
+ load_unaligned_u32_2x1_lane(ret0_u32, left + idx0, 0);
if (n > 1) {
- ret0_u32 = vld1_lane_u32((const uint32_t *)(left + idx1), ret0_u32, 1);
+ load_unaligned_u32_2x1_lane(ret0_u32, left + idx1, 1);
if (n > 2) {
- ret1_u32 = vld1_lane_u32((const uint32_t *)(left + idx2), ret1_u32, 0);
+ load_unaligned_u32_2x1_lane(ret1_u32, left + idx2, 0);
if (n > 3) {
- ret1_u32 = vld1_lane_u32((const uint32_t *)(left + idx3), ret1_u32, 1);
+ load_unaligned_u32_2x1_lane(ret1_u32, left + idx3, 1);
}
}
}
@@ -1665,25 +1664,21 @@
// At time of writing both Clang and GCC produced better code with these
// nested if-statements compared to a switch statement with fallthrough.
- ret0_u32 = vld1q_lane_u32((const uint32_t *)(left + idx0), ret0_u32, 0);
+ load_unaligned_u32_4x1_lane(ret0_u32, left + idx0, 0);
if (n > 1) {
- ret0_u32 = vld1q_lane_u32((const uint32_t *)(left + idx1), ret0_u32, 1);
+ load_unaligned_u32_4x1_lane(ret0_u32, left + idx1, 1);
if (n > 2) {
- ret0_u32 = vld1q_lane_u32((const uint32_t *)(left + idx2), ret0_u32, 2);
+ load_unaligned_u32_4x1_lane(ret0_u32, left + idx2, 2);
if (n > 3) {
- ret0_u32 = vld1q_lane_u32((const uint32_t *)(left + idx3), ret0_u32, 3);
+ load_unaligned_u32_4x1_lane(ret0_u32, left + idx3, 3);
if (n > 4) {
- ret1_u32 =
- vld1q_lane_u32((const uint32_t *)(left + idx4), ret1_u32, 0);
+ load_unaligned_u32_4x1_lane(ret1_u32, left + idx4, 0);
if (n > 5) {
- ret1_u32 =
- vld1q_lane_u32((const uint32_t *)(left + idx5), ret1_u32, 1);
+ load_unaligned_u32_4x1_lane(ret1_u32, left + idx5, 1);
if (n > 6) {
- ret1_u32 =
- vld1q_lane_u32((const uint32_t *)(left + idx6), ret1_u32, 2);
+ load_unaligned_u32_4x1_lane(ret1_u32, left + idx6, 2);
if (n > 7) {
- ret1_u32 = vld1q_lane_u32((const uint32_t *)(left + idx7),
- ret1_u32, 3);
+ load_unaligned_u32_4x1_lane(ret1_u32, left + idx7, 3);
}
}
}
@@ -2475,7 +2470,6 @@
assert(f != NULL);
f(dst, stride, above, left, upsample_above, upsample_left, dx, dy, bd);
}
-#endif // AOM_ARCH_AARCH64
// -----------------------------------------------------------------------------
// Z3
diff --git a/aom_dsp/arm/intrapred_neon.c b/aom_dsp/arm/intrapred_neon.c
index 561a9f7..7fd82a1 100644
--- a/aom_dsp/arm/intrapred_neon.c
+++ b/aom_dsp/arm/intrapred_neon.c
@@ -1488,8 +1488,6 @@
/* ---------------------P R E D I C T I O N Z 2--------------------------- */
-// TODO(aomedia:349428506): enable this for armv7 after SIGBUS is fixed.
-#if AOM_ARCH_AARCH64
#if !AOM_ARCH_AARCH64
static DECLARE_ALIGNED(16, uint8_t, LoadMaskz2[4][16]) = {
{ 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
@@ -1514,8 +1512,8 @@
*a1_x = vuzp_u8(v_tmp, vdup_n_u8(0)).val[1];
*shift0 = vand_u16(vsub_u16(r6, ydx), vdup_n_u16(0x1f));
} else {
- *a0_x = load_u8_4x1(above + base_x);
- *a1_x = load_u8_4x1(above + base_x + 1);
+ *a0_x = load_unaligned_u8_4x1(above + base_x);
+ *a1_x = load_unaligned_u8_4x1(above + base_x + 1);
*shift0 = vand_u16(vhsub_u16(r6, ydx), vdup_n_u16(0x1f));
}
}
@@ -2040,7 +2038,6 @@
break;
}
}
-#endif // AOM_ARCH_AARCH64
/* ---------------------P R E D I C T I O N Z 3--------------------------- */
diff --git a/aom_dsp/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h
index 41efd03..9734f8b 100644
--- a/aom_dsp/arm/mem_neon.h
+++ b/aom_dsp/arm/mem_neon.h
@@ -949,6 +949,32 @@
*s2 = vld1q_s16(s);
}
+#if AOM_ARCH_AARCH64
+#define load_unaligned_u32_2x1_lane(v, p, lane) \
+ do { \
+ (v) = vld1_lane_u32((const uint32_t *)(p), (v), (lane)); \
+ } while (0)
+
+#define load_unaligned_u32_4x1_lane(v, p, lane) \
+ do { \
+ (v) = vld1q_lane_u32((const uint32_t *)(p), (v), (lane)); \
+ } while (0)
+#else
+#define load_unaligned_u32_2x1_lane(v, p, lane) \
+ do { \
+ uint32_t tmp; \
+ memcpy(&tmp, (p), 4); \
+ (v) = vset_lane_u32(tmp, (v), (lane)); \
+ } while (0)
+
+#define load_unaligned_u32_4x1_lane(v, p, lane) \
+ do { \
+ uint32_t tmp; \
+ memcpy(&tmp, (p), 4); \
+ (v) = vsetq_lane_u32(tmp, (v), (lane)); \
+ } while (0)
+#endif
+
// Load 2 sets of 4 bytes when alignment is not guaranteed.
static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) {
uint32_t a;
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 284f0ef..5233325 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -115,12 +115,7 @@
add_proto qw/void av1_dr_prediction_z1/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy";
specialize qw/av1_dr_prediction_z1 sse4_1 avx2 neon/;
add_proto qw/void av1_dr_prediction_z2/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy";
-# TODO(aomedia:349428506): enable NEON for armv7 after SIGBUS is fixed.
-if (aom_config("AOM_ARCH_ARM") eq "yes" && aom_config("AOM_ARCH_AARCH64") eq "") {
- specialize qw/av1_dr_prediction_z2 sse4_1 avx2/;
-} else {
- specialize qw/av1_dr_prediction_z2 sse4_1 avx2 neon/;
-}
+specialize qw/av1_dr_prediction_z2 sse4_1 avx2 neon/;
add_proto qw/void av1_dr_prediction_z3/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_left, int dx, int dy";
specialize qw/av1_dr_prediction_z3 sse4_1 avx2 neon/;
@@ -230,12 +225,7 @@
add_proto qw/void av1_highbd_dr_prediction_z1/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd";
specialize qw/av1_highbd_dr_prediction_z1 avx2 neon/;
add_proto qw/void av1_highbd_dr_prediction_z2/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy, int bd";
- # TODO(aomedia:349428506): enable NEON for armv7 after SIGBUS is fixed.
- if (aom_config("AOM_ARCH_ARM") eq "yes" && aom_config("AOM_ARCH_AARCH64") eq "") {
- specialize qw/av1_highbd_dr_prediction_z2 avx2/;
- } else {
- specialize qw/av1_highbd_dr_prediction_z2 avx2 neon/;
- }
+ specialize qw/av1_highbd_dr_prediction_z2 avx2 neon/;
add_proto qw/void av1_highbd_dr_prediction_z3/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_left, int dx, int dy, int bd";
specialize qw/av1_highbd_dr_prediction_z3 avx2 neon/;
}
diff --git a/test/dr_prediction_test.cc b/test/dr_prediction_test.cc
index 20cf600..0938a3d 100644
--- a/test/dr_prediction_test.cc
+++ b/test/dr_prediction_test.cc
@@ -484,7 +484,6 @@
#endif // HAVE_AVX2
#if HAVE_NEON
-#if AOM_ARCH_AARCH64
INSTANTIATE_TEST_SUITE_P(
NEON, LowbdDrPredTest,
::testing::Values(DrPredFunc<DrPred>(&z1_wrapper<av1_dr_prediction_z1_c>,
@@ -496,21 +495,8 @@
DrPredFunc<DrPred>(&z3_wrapper<av1_dr_prediction_z3_c>,
&z3_wrapper<av1_dr_prediction_z3_neon>,
AOM_BITS_8, kZ3Start)));
-#else
-// TODO(aomedia:349428506): enable av1_highbd_dr_prediction_z2_neon for armv7
-// after SIGBUS is fixed.
-INSTANTIATE_TEST_SUITE_P(
- NEON, LowbdDrPredTest,
- ::testing::Values(DrPredFunc<DrPred>(&z1_wrapper<av1_dr_prediction_z1_c>,
- &z1_wrapper<av1_dr_prediction_z1_neon>,
- AOM_BITS_8, kZ1Start),
- DrPredFunc<DrPred>(&z3_wrapper<av1_dr_prediction_z3_c>,
- &z3_wrapper<av1_dr_prediction_z3_neon>,
- AOM_BITS_8, kZ3Start)));
-#endif
#if CONFIG_AV1_HIGHBITDEPTH
-#if AOM_ARCH_AARCH64
INSTANTIATE_TEST_SUITE_P(
NEON, HighbdDrPredTest,
::testing::Values(DrPredFunc<DrPred_Hbd>(
@@ -549,36 +535,6 @@
&z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>,
&z3_wrapper_hbd<av1_highbd_dr_prediction_z3_neon>,
AOM_BITS_12, kZ3Start)));
-#else // !AOM_ARCH_AARCH64
-// TODO(aomedia:349428506): enable av1_highbd_dr_prediction_z2_neon for armv7
-// after SIGBUS is fixed.
-INSTANTIATE_TEST_SUITE_P(
- NEON, HighbdDrPredTest,
- ::testing::Values(DrPredFunc<DrPred_Hbd>(
- &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>,
- &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_neon>,
- AOM_BITS_8, kZ1Start),
- DrPredFunc<DrPred_Hbd>(
- &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>,
- &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_neon>,
- AOM_BITS_10, kZ1Start),
- DrPredFunc<DrPred_Hbd>(
- &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>,
- &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_neon>,
- AOM_BITS_12, kZ1Start),
- DrPredFunc<DrPred_Hbd>(
- &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>,
- &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_neon>,
- AOM_BITS_8, kZ3Start),
- DrPredFunc<DrPred_Hbd>(
- &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>,
- &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_neon>,
- AOM_BITS_10, kZ3Start),
- DrPredFunc<DrPred_Hbd>(
- &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>,
- &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_neon>,
- AOM_BITS_12, kZ3Start)));
-#endif // AOM_ARCH_AARCH64
#endif // CONFIG_AV1_HIGHBITDEPTH
#endif // HAVE_NEON