{,highbd_}intrapred_neon.c: Fix unaligned accesses in z2 preds

The z2 predictors load 32-bit chunks of data at variable offsets,
however on AArch32 hardware with alignment checks enabled this can cause
alignment faults since the data is not guaranteed to be 32-bit aligned.

To work around this we can make use of the load_unaligned_u8_4x1 in some
cases and in others simply memcpy to avoid making any alignment
guarantees for the data being loaded.

This also reverts commit 5a46d2961fb233c8f099a7bc18a7a54c8883813b,
re-enabling the predictors for 32-bit Arm platforms.

Bug: b/349428506
Change-Id: Ib15a7993e50b4bb6dd2eb4de95fba1dc33d3cd3f
diff --git a/aom_dsp/arm/highbd_intrapred_neon.c b/aom_dsp/arm/highbd_intrapred_neon.c
index 5e6118d..71d133e 100644
--- a/aom_dsp/arm/highbd_intrapred_neon.c
+++ b/aom_dsp/arm/highbd_intrapred_neon.c
@@ -16,6 +16,7 @@
 #include "config/av1_rtcd.h"
 
 #include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
 #include "aom_dsp/arm/sum_neon.h"
 #include "aom_dsp/arm/transpose_neon.h"
 #include "aom_dsp/intrapred_common.h"
@@ -1604,8 +1605,6 @@
 }
 #endif  // AOM_ARCH_AARCH64
 
-// TODO(aomedia:349428506): enable this for armv7 after SIGBUS is fixed.
-#if AOM_ARCH_AARCH64
 static AOM_FORCE_INLINE uint16x4x2_t highbd_dr_prediction_z2_gather_left_x4(
     const uint16_t *left, const int16x4_t indices, int n) {
   assert(n > 0);
@@ -1625,13 +1624,13 @@
 
   // At time of writing both Clang and GCC produced better code with these
   // nested if-statements compared to a switch statement with fallthrough.
-  ret0_u32 = vld1_lane_u32((const uint32_t *)(left + idx0), ret0_u32, 0);
+  load_unaligned_u32_2x1_lane(ret0_u32, left + idx0, 0);
   if (n > 1) {
-    ret0_u32 = vld1_lane_u32((const uint32_t *)(left + idx1), ret0_u32, 1);
+    load_unaligned_u32_2x1_lane(ret0_u32, left + idx1, 1);
     if (n > 2) {
-      ret1_u32 = vld1_lane_u32((const uint32_t *)(left + idx2), ret1_u32, 0);
+      load_unaligned_u32_2x1_lane(ret1_u32, left + idx2, 0);
       if (n > 3) {
-        ret1_u32 = vld1_lane_u32((const uint32_t *)(left + idx3), ret1_u32, 1);
+        load_unaligned_u32_2x1_lane(ret1_u32, left + idx3, 1);
       }
     }
   }
@@ -1665,25 +1664,21 @@
 
   // At time of writing both Clang and GCC produced better code with these
   // nested if-statements compared to a switch statement with fallthrough.
-  ret0_u32 = vld1q_lane_u32((const uint32_t *)(left + idx0), ret0_u32, 0);
+  load_unaligned_u32_4x1_lane(ret0_u32, left + idx0, 0);
   if (n > 1) {
-    ret0_u32 = vld1q_lane_u32((const uint32_t *)(left + idx1), ret0_u32, 1);
+    load_unaligned_u32_4x1_lane(ret0_u32, left + idx1, 1);
     if (n > 2) {
-      ret0_u32 = vld1q_lane_u32((const uint32_t *)(left + idx2), ret0_u32, 2);
+      load_unaligned_u32_4x1_lane(ret0_u32, left + idx2, 2);
       if (n > 3) {
-        ret0_u32 = vld1q_lane_u32((const uint32_t *)(left + idx3), ret0_u32, 3);
+        load_unaligned_u32_4x1_lane(ret0_u32, left + idx3, 3);
         if (n > 4) {
-          ret1_u32 =
-              vld1q_lane_u32((const uint32_t *)(left + idx4), ret1_u32, 0);
+          load_unaligned_u32_4x1_lane(ret1_u32, left + idx4, 0);
           if (n > 5) {
-            ret1_u32 =
-                vld1q_lane_u32((const uint32_t *)(left + idx5), ret1_u32, 1);
+            load_unaligned_u32_4x1_lane(ret1_u32, left + idx5, 1);
             if (n > 6) {
-              ret1_u32 =
-                  vld1q_lane_u32((const uint32_t *)(left + idx6), ret1_u32, 2);
+              load_unaligned_u32_4x1_lane(ret1_u32, left + idx6, 2);
               if (n > 7) {
-                ret1_u32 = vld1q_lane_u32((const uint32_t *)(left + idx7),
-                                          ret1_u32, 3);
+                load_unaligned_u32_4x1_lane(ret1_u32, left + idx7, 3);
               }
             }
           }
@@ -2475,7 +2470,6 @@
   assert(f != NULL);
   f(dst, stride, above, left, upsample_above, upsample_left, dx, dy, bd);
 }
-#endif  // AOM_ARCH_AARCH64
 
 // -----------------------------------------------------------------------------
 // Z3
diff --git a/aom_dsp/arm/intrapred_neon.c b/aom_dsp/arm/intrapred_neon.c
index 561a9f7..7fd82a1 100644
--- a/aom_dsp/arm/intrapred_neon.c
+++ b/aom_dsp/arm/intrapred_neon.c
@@ -1488,8 +1488,6 @@
 
 /* ---------------------P R E D I C T I O N   Z 2--------------------------- */
 
-// TODO(aomedia:349428506): enable this for armv7 after SIGBUS is fixed.
-#if AOM_ARCH_AARCH64
 #if !AOM_ARCH_AARCH64
 static DECLARE_ALIGNED(16, uint8_t, LoadMaskz2[4][16]) = {
   { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
@@ -1514,8 +1512,8 @@
     *a1_x = vuzp_u8(v_tmp, vdup_n_u8(0)).val[1];
     *shift0 = vand_u16(vsub_u16(r6, ydx), vdup_n_u16(0x1f));
   } else {
-    *a0_x = load_u8_4x1(above + base_x);
-    *a1_x = load_u8_4x1(above + base_x + 1);
+    *a0_x = load_unaligned_u8_4x1(above + base_x);
+    *a1_x = load_unaligned_u8_4x1(above + base_x + 1);
     *shift0 = vand_u16(vhsub_u16(r6, ydx), vdup_n_u16(0x1f));
   }
 }
@@ -2040,7 +2038,6 @@
       break;
   }
 }
-#endif  // AOM_ARCH_AARCH64
 
 /* ---------------------P R E D I C T I O N   Z 3--------------------------- */
 
diff --git a/aom_dsp/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h
index 41efd03..9734f8b 100644
--- a/aom_dsp/arm/mem_neon.h
+++ b/aom_dsp/arm/mem_neon.h
@@ -949,6 +949,32 @@
   *s2 = vld1q_s16(s);
 }
 
+#if AOM_ARCH_AARCH64
+#define load_unaligned_u32_2x1_lane(v, p, lane)              \
+  do {                                                       \
+    (v) = vld1_lane_u32((const uint32_t *)(p), (v), (lane)); \
+  } while (0)
+
+#define load_unaligned_u32_4x1_lane(v, p, lane)               \
+  do {                                                        \
+    (v) = vld1q_lane_u32((const uint32_t *)(p), (v), (lane)); \
+  } while (0)
+#else
+#define load_unaligned_u32_2x1_lane(v, p, lane) \
+  do {                                          \
+    uint32_t tmp;                               \
+    memcpy(&tmp, (p), 4);                       \
+    (v) = vset_lane_u32(tmp, (v), (lane));      \
+  } while (0)
+
+#define load_unaligned_u32_4x1_lane(v, p, lane) \
+  do {                                          \
+    uint32_t tmp;                               \
+    memcpy(&tmp, (p), 4);                       \
+    (v) = vsetq_lane_u32(tmp, (v), (lane));     \
+  } while (0)
+#endif
+
 // Load 2 sets of 4 bytes when alignment is not guaranteed.
 static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) {
   uint32_t a;
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 284f0ef..5233325 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -115,12 +115,7 @@
 add_proto qw/void av1_dr_prediction_z1/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy";
 specialize qw/av1_dr_prediction_z1 sse4_1 avx2 neon/;
 add_proto qw/void av1_dr_prediction_z2/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy";
-# TODO(aomedia:349428506): enable NEON for armv7 after SIGBUS is fixed.
-if (aom_config("AOM_ARCH_ARM") eq "yes" && aom_config("AOM_ARCH_AARCH64") eq "") {
-  specialize qw/av1_dr_prediction_z2 sse4_1 avx2/;
-} else {
-  specialize qw/av1_dr_prediction_z2 sse4_1 avx2 neon/;
-}
+specialize qw/av1_dr_prediction_z2 sse4_1 avx2 neon/;
 add_proto qw/void av1_dr_prediction_z3/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_left, int dx, int dy";
 specialize qw/av1_dr_prediction_z3 sse4_1 avx2 neon/;
 
@@ -230,12 +225,7 @@
   add_proto qw/void av1_highbd_dr_prediction_z1/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd";
   specialize qw/av1_highbd_dr_prediction_z1 avx2 neon/;
   add_proto qw/void av1_highbd_dr_prediction_z2/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy, int bd";
-  # TODO(aomedia:349428506): enable NEON for armv7 after SIGBUS is fixed.
-  if (aom_config("AOM_ARCH_ARM") eq "yes" && aom_config("AOM_ARCH_AARCH64") eq "") {
-    specialize qw/av1_highbd_dr_prediction_z2 avx2/;
-  } else {
-    specialize qw/av1_highbd_dr_prediction_z2 avx2 neon/;
-  }
+  specialize qw/av1_highbd_dr_prediction_z2 avx2 neon/;
   add_proto qw/void av1_highbd_dr_prediction_z3/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_left, int dx, int dy, int bd";
   specialize qw/av1_highbd_dr_prediction_z3 avx2 neon/;
 }
diff --git a/test/dr_prediction_test.cc b/test/dr_prediction_test.cc
index 20cf600..0938a3d 100644
--- a/test/dr_prediction_test.cc
+++ b/test/dr_prediction_test.cc
@@ -484,7 +484,6 @@
 #endif  // HAVE_AVX2
 
 #if HAVE_NEON
-#if AOM_ARCH_AARCH64
 INSTANTIATE_TEST_SUITE_P(
     NEON, LowbdDrPredTest,
     ::testing::Values(DrPredFunc<DrPred>(&z1_wrapper<av1_dr_prediction_z1_c>,
@@ -496,21 +495,8 @@
                       DrPredFunc<DrPred>(&z3_wrapper<av1_dr_prediction_z3_c>,
                                          &z3_wrapper<av1_dr_prediction_z3_neon>,
                                          AOM_BITS_8, kZ3Start)));
-#else
-// TODO(aomedia:349428506): enable av1_highbd_dr_prediction_z2_neon for armv7
-// after SIGBUS is fixed.
-INSTANTIATE_TEST_SUITE_P(
-    NEON, LowbdDrPredTest,
-    ::testing::Values(DrPredFunc<DrPred>(&z1_wrapper<av1_dr_prediction_z1_c>,
-                                         &z1_wrapper<av1_dr_prediction_z1_neon>,
-                                         AOM_BITS_8, kZ1Start),
-                      DrPredFunc<DrPred>(&z3_wrapper<av1_dr_prediction_z3_c>,
-                                         &z3_wrapper<av1_dr_prediction_z3_neon>,
-                                         AOM_BITS_8, kZ3Start)));
-#endif
 
 #if CONFIG_AV1_HIGHBITDEPTH
-#if AOM_ARCH_AARCH64
 INSTANTIATE_TEST_SUITE_P(
     NEON, HighbdDrPredTest,
     ::testing::Values(DrPredFunc<DrPred_Hbd>(
@@ -549,36 +535,6 @@
                           &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>,
                           &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_neon>,
                           AOM_BITS_12, kZ3Start)));
-#else   // !AOM_ARCH_AARCH64
-// TODO(aomedia:349428506): enable av1_highbd_dr_prediction_z2_neon for armv7
-// after SIGBUS is fixed.
-INSTANTIATE_TEST_SUITE_P(
-    NEON, HighbdDrPredTest,
-    ::testing::Values(DrPredFunc<DrPred_Hbd>(
-                          &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>,
-                          &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_neon>,
-                          AOM_BITS_8, kZ1Start),
-                      DrPredFunc<DrPred_Hbd>(
-                          &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>,
-                          &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_neon>,
-                          AOM_BITS_10, kZ1Start),
-                      DrPredFunc<DrPred_Hbd>(
-                          &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>,
-                          &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_neon>,
-                          AOM_BITS_12, kZ1Start),
-                      DrPredFunc<DrPred_Hbd>(
-                          &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>,
-                          &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_neon>,
-                          AOM_BITS_8, kZ3Start),
-                      DrPredFunc<DrPred_Hbd>(
-                          &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>,
-                          &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_neon>,
-                          AOM_BITS_10, kZ3Start),
-                      DrPredFunc<DrPred_Hbd>(
-                          &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>,
-                          &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_neon>,
-                          AOM_BITS_12, kZ3Start)));
-#endif  // AOM_ARCH_AARCH64
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 
 #endif  // HAVE_NEON