blend_a64_*_neon: fix bus error in armv7

dst is not guaranteed to be 4- or 2-byte aligned, so avoid casting it to
a larger pointer type; the cast allows the compiler to emit alignment
hints on the lane stores, which cause a bus error on armv7. The only
change to the generated assembly is the removal of those hints.
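
For reference, a minimal sketch of the before/after store for the w == 4
case (dst and res are hypothetical locals, and the exact assembly the old
form produced depends on the compiler):

    /* Before: casting dst to uint32_t * lets the compiler assume 4-byte
     * alignment and emit the lane store with an alignment hint, which
     * raises a bus error on armv7 when dst is only byte-aligned. */
    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(res), 0);

    /* After: store_unaligned_u8_4x1 extracts the lane to a scalar and
     * stores it with memcpy, which makes no alignment assumption. */
    uint32_t a = vget_lane_u32(vreinterpret_u32_u8(res), 0);
    memcpy(dst, &a, 4);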

Bug: aomedia:3282
Fixed: aomedia:3282
Change-Id: Iee988b0ee77a28755bf83830d88f87f0c19efad5
diff --git a/aom_dsp/arm/blend_a64_mask_neon.c b/aom_dsp/arm/blend_a64_mask_neon.c
index 8709e38..f11d57e 100644
--- a/aom_dsp/arm/blend_a64_mask_neon.c
+++ b/aom_dsp/arm/blend_a64_mask_neon.c
@@ -117,14 +117,10 @@
   uint8x8_t res_0 = vqmovun_s16(src0_0);
   uint8x8_t res_1 = vqmovun_s16(src0_1);
 
-  vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride), vreinterpret_u32_u8(res_0),
-                0);
-  vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride), vreinterpret_u32_u8(res_0),
-                1);
-  vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride), vreinterpret_u32_u8(res_1),
-                0);
-  vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride), vreinterpret_u32_u8(res_1),
-                1);
+  store_unaligned_u8_4x1(dst + 0 * dst_stride, res_0, 0);
+  store_unaligned_u8_4x1(dst + 1 * dst_stride, res_0, 1);
+  store_unaligned_u8_4x1(dst + 2 * dst_stride, res_1, 0);
+  store_unaligned_u8_4x1(dst + 3 * dst_stride, res_1, 1);
 }
 
 void aom_lowbd_blend_a64_d16_mask_neon(
diff --git a/aom_dsp/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h
index c8236da..81643e9 100644
--- a/aom_dsp/arm/mem_neon.h
+++ b/aom_dsp/arm/mem_neon.h
@@ -413,6 +413,13 @@
     memcpy(dst, &a, 4);                                \
   } while (0)
 
+#define store_unaligned_u8_2x1(dst, src, lane)         \
+  do {                                                 \
+    uint16_t a;                                        \
+    a = vget_lane_u16(vreinterpret_u16_u8(src), lane); \
+    memcpy(dst, &a, 2);                                \
+  } while (0)
+
 static INLINE void load_unaligned_u8_2x2(const uint8_t *buf, int stride,
                                          uint16x4_t *tu0) {
   uint16_t a;
diff --git a/av1/common/arm/blend_a64_hmask_neon.c b/av1/common/arm/blend_a64_hmask_neon.c
index 4639d4c..89252ef 100644
--- a/av1/common/arm/blend_a64_hmask_neon.c
+++ b/av1/common/arm/blend_a64_hmask_neon.c
@@ -83,6 +83,7 @@
       dst += dst_stride;
     }
   } else if (w == 4) {
+    assert(((uintptr_t)mask & 3) == 0);
     const uint8x8_t m = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)mask));
     const uint8x8_t max_minus_m = vsub_u8(vdup_64, m);
     for (int i = 0; i < h; i += 2) {
@@ -96,17 +97,15 @@
       tmp1 = vreinterpret_u8_u32(tmp1_32);
       res = vmull_u8(m, tmp0);
       res = vmlal_u8(res, max_minus_m, tmp1);
-      vst1_lane_u32(
-          (uint32_t *)(dst + (0 * dst_stride)),
-          vreinterpret_u32_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 0);
-      vst1_lane_u32(
-          (uint32_t *)(dst + (1 * dst_stride)),
-          vreinterpret_u32_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 1);
+      const uint8x8_t result = vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS);
+      store_unaligned_u8_4x1(dst + 0 * dst_stride, result, 0);
+      store_unaligned_u8_4x1(dst + 1 * dst_stride, result, 1);
       src0 += (2 * src0_stride);
       src1 += (2 * src1_stride);
       dst += (2 * dst_stride);
     }
   } else if (w == 2) {
+    assert(((uintptr_t)mask & 1) == 0);
     const uint8x8_t m = vreinterpret_u8_u16(vld1_dup_u16((uint16_t *)mask));
     const uint8x8_t max_minus_m = vsub_u8(vdup_64, m);
     for (int i = 0; i < h; i += 2) {
@@ -120,12 +119,9 @@
       tmp1 = vreinterpret_u8_u16(tmp1_16);
       res = vmull_u8(m, tmp0);
       res = vmlal_u8(res, max_minus_m, tmp1);
-      vst1_lane_u16(
-          (uint16_t *)(dst + (0 * dst_stride)),
-          vreinterpret_u16_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 0);
-      vst1_lane_u16(
-          (uint16_t *)(dst + (1 * dst_stride)),
-          vreinterpret_u16_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 1);
+      const uint8x8_t result = vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS);
+      store_unaligned_u8_2x1(dst + 0 * dst_stride, result, 0);
+      store_unaligned_u8_2x1(dst + 1 * dst_stride, result, 1);
       src0 += (2 * src0_stride);
       src1 += (2 * src1_stride);
       dst += (2 * dst_stride);
diff --git a/av1/common/arm/blend_a64_vmask_neon.c b/av1/common/arm/blend_a64_vmask_neon.c
index 061af74..2132fbd 100644
--- a/av1/common/arm/blend_a64_vmask_neon.c
+++ b/av1/common/arm/blend_a64_vmask_neon.c
@@ -95,12 +95,9 @@
       tmp1 = vreinterpret_u8_u32(tmp1_32);
       res = vmull_u8(m, tmp0);
       res = vmlal_u8(res, max_minus_m, tmp1);
-      vst1_lane_u32(
-          (uint32_t *)(dst + (0 * dst_stride)),
-          vreinterpret_u32_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 0);
-      vst1_lane_u32(
-          (uint32_t *)(dst + (1 * dst_stride)),
-          vreinterpret_u32_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 1);
+      const uint8x8_t result = vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS);
+      store_unaligned_u8_4x1(dst + 0 * dst_stride, result, 0);
+      store_unaligned_u8_4x1(dst + 1 * dst_stride, result, 1);
       src0 += (2 * src0_stride);
       src1 += (2 * src1_stride);
       dst += (2 * dst_stride);
@@ -127,12 +124,9 @@
       tmp1 = vreinterpret_u8_u16(tmp1_16);
       res = vmull_u8(m, tmp0);
       res = vmlal_u8(res, max_minus_m, tmp1);
-      vst1_lane_u16(
-          (uint16_t *)(dst + (0 * dst_stride)),
-          vreinterpret_u16_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 0);
-      vst1_lane_u16(
-          (uint16_t *)(dst + (1 * dst_stride)),
-          vreinterpret_u16_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 1);
+      const uint8x8_t result = vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS);
+      store_unaligned_u8_2x1(dst + 0 * dst_stride, result, 0);
+      store_unaligned_u8_2x1(dst + 1 * dst_stride, result, 1);
       src0 += (2 * src0_stride);
       src1 += (2 * src1_stride);
       dst += (2 * dst_stride);