blend_a64_*_neon: fix bus error on ARMv7
dst is not guaranteed to be 2- or 4-byte aligned, so avoid casting it to a
larger type; such casts let the compiler emit alignment hints on the lane
store/load instructions. The only change to the generated assembly is the
removal of those hints.
Bug: aomedia:3282
Fixed: aomedia:3282
Change-Id: Iee988b0ee77a28755bf83830d88f87f0c19efad5
diff --git a/aom_dsp/arm/blend_a64_mask_neon.c b/aom_dsp/arm/blend_a64_mask_neon.c
index 8709e38..f11d57e 100644
--- a/aom_dsp/arm/blend_a64_mask_neon.c
+++ b/aom_dsp/arm/blend_a64_mask_neon.c
@@ -117,14 +117,10 @@
uint8x8_t res_0 = vqmovun_s16(src0_0);
uint8x8_t res_1 = vqmovun_s16(src0_1);
- vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride), vreinterpret_u32_u8(res_0),
- 0);
- vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride), vreinterpret_u32_u8(res_0),
- 1);
- vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride), vreinterpret_u32_u8(res_1),
- 0);
- vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride), vreinterpret_u32_u8(res_1),
- 1);
+ store_unaligned_u8_4x1(dst + 0 * dst_stride, res_0, 0);
+ store_unaligned_u8_4x1(dst + 1 * dst_stride, res_0, 1);
+ store_unaligned_u8_4x1(dst + 2 * dst_stride, res_1, 0);
+ store_unaligned_u8_4x1(dst + 3 * dst_stride, res_1, 1);
}
void aom_lowbd_blend_a64_d16_mask_neon(
diff --git a/aom_dsp/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h
index c8236da..81643e9 100644
--- a/aom_dsp/arm/mem_neon.h
+++ b/aom_dsp/arm/mem_neon.h
@@ -413,6 +413,13 @@
memcpy(dst, &a, 4); \
} while (0)
+#define store_unaligned_u8_2x1(dst, src, lane) \
+ do { \
+ uint16_t a; \
+ a = vget_lane_u16(vreinterpret_u16_u8(src), lane); \
+ memcpy(dst, &a, 2); \
+ } while (0)
+
static INLINE void load_unaligned_u8_2x2(const uint8_t *buf, int stride,
uint16x4_t *tu0) {
uint16_t a;
diff --git a/av1/common/arm/blend_a64_hmask_neon.c b/av1/common/arm/blend_a64_hmask_neon.c
index 4639d4c..89252ef 100644
--- a/av1/common/arm/blend_a64_hmask_neon.c
+++ b/av1/common/arm/blend_a64_hmask_neon.c
@@ -83,6 +83,7 @@
dst += dst_stride;
}
} else if (w == 4) {
+ assert(((uintptr_t)mask & 3) == 0);
const uint8x8_t m = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)mask));
const uint8x8_t max_minus_m = vsub_u8(vdup_64, m);
for (int i = 0; i < h; i += 2) {
@@ -96,17 +97,15 @@
tmp1 = vreinterpret_u8_u32(tmp1_32);
res = vmull_u8(m, tmp0);
res = vmlal_u8(res, max_minus_m, tmp1);
- vst1_lane_u32(
- (uint32_t *)(dst + (0 * dst_stride)),
- vreinterpret_u32_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 0);
- vst1_lane_u32(
- (uint32_t *)(dst + (1 * dst_stride)),
- vreinterpret_u32_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 1);
+ const uint8x8_t result = vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS);
+ store_unaligned_u8_4x1(dst + 0 * dst_stride, result, 0);
+ store_unaligned_u8_4x1(dst + 1 * dst_stride, result, 1);
src0 += (2 * src0_stride);
src1 += (2 * src1_stride);
dst += (2 * dst_stride);
}
} else if (w == 2) {
+ assert(((uintptr_t)mask & 1) == 0);
const uint8x8_t m = vreinterpret_u8_u16(vld1_dup_u16((uint16_t *)mask));
const uint8x8_t max_minus_m = vsub_u8(vdup_64, m);
for (int i = 0; i < h; i += 2) {
@@ -120,12 +119,9 @@
tmp1 = vreinterpret_u8_u16(tmp1_16);
res = vmull_u8(m, tmp0);
res = vmlal_u8(res, max_minus_m, tmp1);
- vst1_lane_u16(
- (uint16_t *)(dst + (0 * dst_stride)),
- vreinterpret_u16_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 0);
- vst1_lane_u16(
- (uint16_t *)(dst + (1 * dst_stride)),
- vreinterpret_u16_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 1);
+ const uint8x8_t result = vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS);
+ store_unaligned_u8_2x1(dst + 0 * dst_stride, result, 0);
+ store_unaligned_u8_2x1(dst + 1 * dst_stride, result, 1);
src0 += (2 * src0_stride);
src1 += (2 * src1_stride);
dst += (2 * dst_stride);
diff --git a/av1/common/arm/blend_a64_vmask_neon.c b/av1/common/arm/blend_a64_vmask_neon.c
index 061af74..2132fbd 100644
--- a/av1/common/arm/blend_a64_vmask_neon.c
+++ b/av1/common/arm/blend_a64_vmask_neon.c
@@ -95,12 +95,9 @@
tmp1 = vreinterpret_u8_u32(tmp1_32);
res = vmull_u8(m, tmp0);
res = vmlal_u8(res, max_minus_m, tmp1);
- vst1_lane_u32(
- (uint32_t *)(dst + (0 * dst_stride)),
- vreinterpret_u32_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 0);
- vst1_lane_u32(
- (uint32_t *)(dst + (1 * dst_stride)),
- vreinterpret_u32_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 1);
+ const uint8x8_t result = vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS);
+ store_unaligned_u8_4x1(dst + 0 * dst_stride, result, 0);
+ store_unaligned_u8_4x1(dst + 1 * dst_stride, result, 1);
src0 += (2 * src0_stride);
src1 += (2 * src1_stride);
dst += (2 * dst_stride);
@@ -127,12 +124,9 @@
tmp1 = vreinterpret_u8_u16(tmp1_16);
res = vmull_u8(m, tmp0);
res = vmlal_u8(res, max_minus_m, tmp1);
- vst1_lane_u16(
- (uint16_t *)(dst + (0 * dst_stride)),
- vreinterpret_u16_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 0);
- vst1_lane_u16(
- (uint16_t *)(dst + (1 * dst_stride)),
- vreinterpret_u16_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 1);
+ const uint8x8_t result = vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS);
+ store_unaligned_u8_2x1(dst + 0 * dst_stride, result, 0);
+ store_unaligned_u8_2x1(dst + 1 * dst_stride, result, 1);
src0 += (2 * src0_stride);
src1 += (2 * src1_stride);
dst += (2 * dst_stride);