mem_neon.h: Introduce and use strided store helpers

Several of the z3 predictor implementations use consecutive store-lane
instructions to scatter parts of a vector to strided locations in
memory. To avoid this duplication, add new helpers to mem_neon.h for
handling this strided-scatter operation.
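
For illustration only (dst, stride and v are placeholder names in this
sketch), the scatter goes from a pair of lane stores to a single helper
call:

  // Before: two consecutive store-lane instructions.
  vst1_lane_u32((uint32_t *)(dst + 0 * stride),
                vreinterpret_u32_u8(v), 0);
  vst1_lane_u32((uint32_t *)(dst + 1 * stride),
                vreinterpret_u32_u8(v), 1);

  // After: a single call to the new mem_neon.h helper.
  store_u8x4_strided_x2(dst, stride, v);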

Similar helpers already exist, but their naming scheme clashes with
that of the multi-vector store cases, so rename them to use the new
*_strided_* convention.

Remove the following macros, since they were either never used or can
now be replaced with calls to the strided store helpers:

* store_s16_2x1(s, s0, lane)
* store_u16_2x1(s, s0, lane)
* store_u16q_2x1(s, s0, lane)
* store_unaligned_u8_4x1(dst, src, lane)
* store_unaligned_u8_2x1(dst, src, lane)
* store_unaligned_u16_2x1(dst, src, lane)
* store_unaligned_u16_4x1(dst, src, lane)

There is one remaining (non-strided) use of store_u16_2x1; this can now
be a function rather than a macro since it only ever uses the low lane
of the vector.

With this change the only macros still exposed from mem_neon.h are
store_u8_2x1 and store_u8_4x1. Converting these is left to a later
commit since they have many more existing uses, primarily in the
convolve kernels.

Change-Id: I59843f54e3a443fa8eefb3e6bf78baa68b3698b0
diff --git a/aom_dsp/arm/blend_a64_mask_neon.c b/aom_dsp/arm/blend_a64_mask_neon.c
index 7b1b66a..1bc3b80 100644
--- a/aom_dsp/arm/blend_a64_mask_neon.c
+++ b/aom_dsp/arm/blend_a64_mask_neon.c
@@ -91,7 +91,7 @@
 
         uint8x8_t blend = alpha_blend_a64_d16_u16x8(m0, s0, s1, offset_vec);
 
-        store_unaligned_u8_4x2(dst, dst_stride, blend);
+        store_u8x4_strided_x2(dst, dst_stride, blend);
 
         mask += 2 * mask_stride;
         src0 += 2 * src0_stride;
@@ -139,7 +139,7 @@
         uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3));
         uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
 
-        store_unaligned_u8_4x2(dst, dst_stride, blend);
+        store_u8x4_strided_x2(dst, dst_stride, blend);
 
         mask += 4 * mask_stride;
         src0 += 2 * src0_stride;
@@ -181,7 +181,7 @@
         uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1));
         uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
 
-        store_unaligned_u8_4x2(dst, dst_stride, blend);
+        store_u8x4_strided_x2(dst, dst_stride, blend);
 
         mask += 2 * mask_stride;
         src0 += 2 * src0_stride;
@@ -225,7 +225,7 @@
         uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0_2, m1_3));
         uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
 
-        store_unaligned_u8_4x2(dst, dst_stride, blend);
+        store_u8x4_strided_x2(dst, dst_stride, blend);
 
         mask += 4 * mask_stride;
         src0 += 2 * src0_stride;
@@ -293,7 +293,7 @@
 
         uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1);
 
-        store_unaligned_u8_4x2(dst, dst_stride, blend);
+        store_u8x4_strided_x2(dst, dst_stride, blend);
 
         mask += 2 * mask_stride;
         src0 += 2 * src0_stride;
@@ -358,7 +358,7 @@
         uint8x8_t m_avg = avg_blend_pairwise_u8x8_4(m0, m1, m2, m3);
         uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
 
-        store_unaligned_u8_4x2(dst, dst_stride, blend);
+        store_u8x4_strided_x2(dst, dst_stride, blend);
 
         mask += 4 * mask_stride;
         src0 += 2 * src0_stride;
@@ -418,7 +418,7 @@
         uint8x8_t m_avg = avg_blend_pairwise_u8x8(m0, m1);
         uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
 
-        store_unaligned_u8_4x2(dst, dst_stride, blend);
+        store_u8x4_strided_x2(dst, dst_stride, blend);
 
         mask += 2 * mask_stride;
         src0 += 2 * src0_stride;
@@ -479,7 +479,7 @@
         uint8x8_t m_avg = avg_blend_u8x8(m0_2, m1_3);
         uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
 
-        store_unaligned_u8_4x2(dst, dst_stride, blend);
+        store_u8x4_strided_x2(dst, dst_stride, blend);
 
         mask += 4 * mask_stride;
         src0 += 2 * src0_stride;
diff --git a/aom_dsp/arm/highbd_blend_a64_hmask_neon.c b/aom_dsp/arm/highbd_blend_a64_hmask_neon.c
index bdd2177..8b03e91 100644
--- a/aom_dsp/arm/highbd_blend_a64_hmask_neon.c
+++ b/aom_dsp/arm/highbd_blend_a64_hmask_neon.c
@@ -67,7 +67,7 @@
 
       uint16x8_t blend = alpha_blend_a64_u16x8(m0, s0, s1);
 
-      store_unaligned_u16_4x2(dst, dst_stride, blend);
+      store_u16x4_strided_x2(dst, dst_stride, blend);
 
       src0 += 2 * src0_stride;
       src1 += 2 * src1_stride;
@@ -83,7 +83,7 @@
 
       uint16x4_t blend = alpha_blend_a64_u16x4(m0, s0, s1);
 
-      store_unaligned_u16_2x2(dst, dst_stride, blend);
+      store_u16x2_strided_x2(dst, dst_stride, blend);
 
       src0 += 2 * src0_stride;
       src1 += 2 * src1_stride;
diff --git a/aom_dsp/arm/highbd_blend_a64_mask_neon.c b/aom_dsp/arm/highbd_blend_a64_mask_neon.c
index 36d763a..90b44fc 100644
--- a/aom_dsp/arm/highbd_blend_a64_mask_neon.c
+++ b/aom_dsp/arm/highbd_blend_a64_mask_neon.c
@@ -91,7 +91,7 @@
           uint16x8_t blend =                                                  \
               alpha_##bd##_blend_a64_d16_u16x8(m0, s0, s1, offset);           \
                                                                               \
-          store_unaligned_u16_4x2(dst, dst_stride, blend);                    \
+          store_u16x4_strided_x2(dst, dst_stride, blend);                     \
                                                                               \
           mask += 2 * mask_stride;                                            \
           src0 += 2 * src0_stride;                                            \
@@ -139,7 +139,7 @@
           uint16x8_t blend =                                                  \
               alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset);        \
                                                                               \
-          store_unaligned_u16_4x2(dst, dst_stride, blend);                    \
+          store_u16x4_strided_x2(dst, dst_stride, blend);                     \
                                                                               \
           mask += 4 * mask_stride;                                            \
           src0 += 2 * src0_stride;                                            \
@@ -182,7 +182,7 @@
           uint16x8_t blend =                                                  \
               alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset);        \
                                                                               \
-          store_unaligned_u16_4x2(dst, dst_stride, blend);                    \
+          store_u16x4_strided_x2(dst, dst_stride, blend);                     \
                                                                               \
           mask += 2 * mask_stride;                                            \
           src0 += 2 * src0_stride;                                            \
@@ -227,7 +227,7 @@
           uint16x8_t blend =                                                  \
               alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset);        \
                                                                               \
-          store_unaligned_u16_4x2(dst, dst_stride, blend);                    \
+          store_u16x4_strided_x2(dst, dst_stride, blend);                     \
                                                                               \
           mask += 4 * mask_stride;                                            \
           src0 += 2 * src0_stride;                                            \
@@ -325,7 +325,7 @@
 
         uint16x8_t blend = alpha_blend_a64_u16x8(m0, s0, s1);
 
-        store_unaligned_u16_4x2(dst, dst_stride, blend);
+        store_u16x4_strided_x2(dst, dst_stride, blend);
 
         mask += 2 * mask_stride;
         src0 += 2 * src0_stride;
@@ -373,7 +373,7 @@
         uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3));
         uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
 
-        store_unaligned_u16_4x2(dst, dst_stride, blend);
+        store_u16x4_strided_x2(dst, dst_stride, blend);
 
         mask += 4 * mask_stride;
         src0 += 2 * src0_stride;
@@ -416,7 +416,7 @@
         uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1));
         uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
 
-        store_unaligned_u16_4x2(dst, dst_stride, blend);
+        store_u16x4_strided_x2(dst, dst_stride, blend);
 
         mask += 2 * mask_stride;
         src0 += 2 * src0_stride;
@@ -460,7 +460,7 @@
         uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0_2, m1_3));
         uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
 
-        store_unaligned_u16_4x2(dst, dst_stride, blend);
+        store_u16x4_strided_x2(dst, dst_stride, blend);
 
         mask += 4 * mask_stride;
         src0 += 2 * src0_stride;
diff --git a/aom_dsp/arm/highbd_blend_a64_vmask_neon.c b/aom_dsp/arm/highbd_blend_a64_vmask_neon.c
index ea3d655..1292e20 100644
--- a/aom_dsp/arm/highbd_blend_a64_vmask_neon.c
+++ b/aom_dsp/arm/highbd_blend_a64_vmask_neon.c
@@ -70,7 +70,7 @@
 
       uint16x8_t blend = alpha_blend_a64_u16x8(m, s0, s1);
 
-      store_unaligned_u16_4x2(dst, dst_stride, blend);
+      store_u16x4_strided_x2(dst, dst_stride, blend);
 
       mask += 2;
       src0 += 2 * src0_stride;
@@ -90,7 +90,7 @@
 
       uint16x4_t blend = alpha_blend_a64_u16x4(m0, s0, s1);
 
-      store_unaligned_u16_2x2(dst, dst_stride, blend);
+      store_u16x2_strided_x2(dst, dst_stride, blend);
 
       mask += 2;
       src0 += 2 * src0_stride;
diff --git a/aom_dsp/arm/intrapred_neon.c b/aom_dsp/arm/intrapred_neon.c
index c0ddcf9..2a5ac75 100644
--- a/aom_dsp/arm/intrapred_neon.c
+++ b/aom_dsp/arm/intrapred_neon.c
@@ -2741,14 +2741,10 @@
 
   dr_prediction_z1_HxW_internal_neon_64(4, 4, dstvec, left, upsample_left, dy);
   transpose4x8_8x4_low_neon(dstvec, &dest);
-  vst1_lane_u32((uint32_t *)(dst + stride * 0),
-                vreinterpret_u32_u16(dest.val[0]), 0);
-  vst1_lane_u32((uint32_t *)(dst + stride * 1),
-                vreinterpret_u32_u16(dest.val[0]), 1);
-  vst1_lane_u32((uint32_t *)(dst + stride * 2),
-                vreinterpret_u32_u16(dest.val[1]), 0);
-  vst1_lane_u32((uint32_t *)(dst + stride * 3),
-                vreinterpret_u32_u16(dest.val[1]), 1);
+  store_u8x4_strided_x2(dst + stride * 0, stride,
+                        vreinterpret_u8_u16(dest.val[0]));
+  store_u8x4_strided_x2(dst + stride * 2, stride,
+                        vreinterpret_u8_u16(dest.val[1]));
 }
 
 static void dr_prediction_z3_8x8_neon(uint8_t *dst, ptrdiff_t stride,
@@ -2777,22 +2773,14 @@
 
   dr_prediction_z1_HxW_internal_neon_64(8, 4, dstvec, left, upsample_left, dy);
   transpose4x8_8x4_neon(dstvec, d);
-  vst1_lane_u32((uint32_t *)(dst + stride * 0),
-                vreinterpret_u32_u16(d[0].val[0]), 0);
-  vst1_lane_u32((uint32_t *)(dst + stride * 1),
-                vreinterpret_u32_u16(d[0].val[0]), 1);
-  vst1_lane_u32((uint32_t *)(dst + stride * 2),
-                vreinterpret_u32_u16(d[0].val[1]), 0);
-  vst1_lane_u32((uint32_t *)(dst + stride * 3),
-                vreinterpret_u32_u16(d[0].val[1]), 1);
-  vst1_lane_u32((uint32_t *)(dst + stride * 4),
-                vreinterpret_u32_u16(d[1].val[0]), 0);
-  vst1_lane_u32((uint32_t *)(dst + stride * 5),
-                vreinterpret_u32_u16(d[1].val[0]), 1);
-  vst1_lane_u32((uint32_t *)(dst + stride * 6),
-                vreinterpret_u32_u16(d[1].val[1]), 0);
-  vst1_lane_u32((uint32_t *)(dst + stride * 7),
-                vreinterpret_u32_u16(d[1].val[1]), 1);
+  store_u8x4_strided_x2(dst + stride * 0, stride,
+                        vreinterpret_u8_u16(d[0].val[0]));
+  store_u8x4_strided_x2(dst + stride * 2, stride,
+                        vreinterpret_u8_u16(d[0].val[1]));
+  store_u8x4_strided_x2(dst + stride * 4, stride,
+                        vreinterpret_u8_u16(d[1].val[0]));
+  store_u8x4_strided_x2(dst + stride * 6, stride,
+                        vreinterpret_u8_u16(d[1].val[1]));
 }
 
 static void dr_prediction_z3_8x4_neon(uint8_t *dst, ptrdiff_t stride,
@@ -2844,41 +2832,14 @@
 
   dr_prediction_z1_HxW_internal_neon(16, 4, dstvec, left, upsample_left, dy);
   transpose4x16_neon(dstvec, d);
-  vst1q_lane_u32((uint32_t *)(dst + stride * 0),
-                 vreinterpretq_u32_u16(d[0].val[0]), 0);
-  vst1q_lane_u32((uint32_t *)(dst + stride * 1),
-                 vreinterpretq_u32_u16(d[0].val[0]), 1);
-  vst1q_lane_u32((uint32_t *)(dst + stride * 2),
-                 vreinterpretq_u32_u16(d[0].val[0]), 2);
-  vst1q_lane_u32((uint32_t *)(dst + stride * 3),
-                 vreinterpretq_u32_u16(d[0].val[0]), 3);
-
-  vst1q_lane_u32((uint32_t *)(dst + stride * 4),
-                 vreinterpretq_u32_u16(d[0].val[1]), 0);
-  vst1q_lane_u32((uint32_t *)(dst + stride * 5),
-                 vreinterpretq_u32_u16(d[0].val[1]), 1);
-  vst1q_lane_u32((uint32_t *)(dst + stride * 6),
-                 vreinterpretq_u32_u16(d[0].val[1]), 2);
-  vst1q_lane_u32((uint32_t *)(dst + stride * 7),
-                 vreinterpretq_u32_u16(d[0].val[1]), 3);
-
-  vst1q_lane_u32((uint32_t *)(dst + stride * 8),
-                 vreinterpretq_u32_u16(d[1].val[0]), 0);
-  vst1q_lane_u32((uint32_t *)(dst + stride * 9),
-                 vreinterpretq_u32_u16(d[1].val[0]), 1);
-  vst1q_lane_u32((uint32_t *)(dst + stride * 10),
-                 vreinterpretq_u32_u16(d[1].val[0]), 2);
-  vst1q_lane_u32((uint32_t *)(dst + stride * 11),
-                 vreinterpretq_u32_u16(d[1].val[0]), 3);
-
-  vst1q_lane_u32((uint32_t *)(dst + stride * 12),
-                 vreinterpretq_u32_u16(d[1].val[1]), 0);
-  vst1q_lane_u32((uint32_t *)(dst + stride * 13),
-                 vreinterpretq_u32_u16(d[1].val[1]), 1);
-  vst1q_lane_u32((uint32_t *)(dst + stride * 14),
-                 vreinterpretq_u32_u16(d[1].val[1]), 2);
-  vst1q_lane_u32((uint32_t *)(dst + stride * 15),
-                 vreinterpretq_u32_u16(d[1].val[1]), 3);
+  store_u8x4_strided_x4(dst + stride * 0, stride,
+                        vreinterpretq_u8_u16(d[0].val[0]));
+  store_u8x4_strided_x4(dst + stride * 4, stride,
+                        vreinterpretq_u8_u16(d[0].val[1]));
+  store_u8x4_strided_x4(dst + stride * 8, stride,
+                        vreinterpretq_u8_u16(d[1].val[0]));
+  store_u8x4_strided_x4(dst + stride * 12, stride,
+                        vreinterpretq_u8_u16(d[1].val[1]));
 }
 
 static void dr_prediction_z3_16x4_neon(uint8_t *dst, ptrdiff_t stride,
@@ -3731,7 +3692,7 @@
     result = vbsl_u8(left_or_top_mask, result, top_left);
 
     if (width == 4) {
-      store_unaligned_u8_4x1(dest, result, 0);
+      store_u8_4x1(dest, result, 0);
     } else {  // width == 8
       vst1_u8(dest, result);
     }
diff --git a/aom_dsp/arm/loopfilter_neon.c b/aom_dsp/arm/loopfilter_neon.c
index 4704f05..3bf98cc 100644
--- a/aom_dsp/arm/loopfilter_neon.c
+++ b/aom_dsp/arm/loopfilter_neon.c
@@ -862,10 +862,8 @@
 
   transpose_elems_inplace_u8_4x4(&p1p0, &q0q1);
 
-  store_unaligned_u8_4x1(src - 2, p1p0, 0);
-  store_unaligned_u8_4x1((src - 2) + 1 * stride, q0q1, 0);
-  store_unaligned_u8_4x1((src - 2) + 2 * stride, p1p0, 1);
-  store_unaligned_u8_4x1((src - 2) + 3 * stride, q0q1, 1);
+  store_u8x4_strided_x2(src - 2, 2 * stride, p1p0);
+  store_u8x4_strided_x2(src + stride - 2, 2 * stride, q0q1);
 }
 
 void aom_lpf_vertical_4_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0,
diff --git a/aom_dsp/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h
index 32a54a0..8426a26 100644
--- a/aom_dsp/arm/mem_neon.h
+++ b/aom_dsp/arm/mem_neon.h
@@ -457,18 +457,6 @@
   *s3 = vld1_s16(s);
 }
 
-/* These intrinsics require immediate values, so we must use #defines
-   to enforce that. */
-#define store_u8_2x1(s, s0, lane)                                  \
-  do {                                                             \
-    vst1_lane_u16((uint16_t *)(s), vreinterpret_u16_u8(s0), lane); \
-  } while (0)
-
-#define store_u8_4x1(s, s0, lane)                                  \
-  do {                                                             \
-    vst1_lane_u32((uint32_t *)(s), vreinterpret_u32_u8(s0), lane); \
-  } while (0)
-
 static INLINE void store_u8_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                 const uint8x8_t s1, const uint8x8_t s2,
                                 const uint8x8_t s3, const uint8x8_t s4,
@@ -602,21 +590,6 @@
   vst1_s16(s, s3);
 }
 
-/* These intrinsics require immediate values, so we must use #defines
-   to enforce that. */
-#define store_s16_2x1(s, s0, lane)                                 \
-  do {                                                             \
-    vst1_lane_s32((int32_t *)(s), vreinterpret_s32_s16(s0), lane); \
-  } while (0)
-#define store_u16_2x1(s, s0, lane)                                  \
-  do {                                                              \
-    vst1_lane_u32((uint32_t *)(s), vreinterpret_u32_u16(s0), lane); \
-  } while (0)
-#define store_u16q_2x1(s, s0, lane)                                   \
-  do {                                                                \
-    vst1q_lane_u32((uint32_t *)(s), vreinterpretq_u32_u16(s0), lane); \
-  } while (0)
-
 static INLINE void store_s16_8x4(int16_t *s, ptrdiff_t dst_stride,
                                  const int16x8_t s0, const int16x8_t s1,
                                  const int16x8_t s2, const int16x8_t s3) {
@@ -989,36 +962,6 @@
   load_unaligned_u8_4x4(buf, stride, tu2, tu3);
 }
 
-/* These intrinsics require immediate values, so we must use #defines
-   to enforce that. */
-#define store_unaligned_u8_4x1(dst, src, lane)         \
-  do {                                                 \
-    uint32_t a;                                        \
-    a = vget_lane_u32(vreinterpret_u32_u8(src), lane); \
-    memcpy(dst, &a, 4);                                \
-  } while (0)
-
-#define store_unaligned_u8_2x1(dst, src, lane)         \
-  do {                                                 \
-    uint16_t a;                                        \
-    a = vget_lane_u16(vreinterpret_u16_u8(src), lane); \
-    memcpy(dst, &a, 2);                                \
-  } while (0)
-
-#define store_unaligned_u16_2x1(dst, src, lane)         \
-  do {                                                  \
-    uint32_t a;                                         \
-    a = vget_lane_u32(vreinterpret_u32_u16(src), lane); \
-    memcpy(dst, &a, 4);                                 \
-  } while (0)
-
-#define store_unaligned_u16_4x1(dst, src, lane)           \
-  do {                                                    \
-    uint64_t a;                                           \
-    a = vgetq_lane_u64(vreinterpretq_u64_u16(src), lane); \
-    memcpy(dst, &a, 8);                                   \
-  } while (0)
-
 static INLINE void load_u8_16x8(const uint8_t *s, ptrdiff_t p,
                                 uint8x16_t *const s0, uint8x16_t *const s1,
                                 uint8x16_t *const s2, uint8x16_t *const s3,
@@ -1200,34 +1143,6 @@
   vst1q_s32(buf, v0);
 }
 
-static INLINE void store_unaligned_u8_2x2(uint8_t *dst, uint32_t dst_stride,
-                                          uint8x8_t src) {
-  store_unaligned_u8_2x1(dst, src, 0);
-  dst += dst_stride;
-  store_unaligned_u8_2x1(dst, src, 1);
-}
-
-static INLINE void store_unaligned_u8_4x2(uint8_t *dst, uint32_t dst_stride,
-                                          uint8x8_t src) {
-  store_unaligned_u8_4x1(dst, src, 0);
-  dst += dst_stride;
-  store_unaligned_u8_4x1(dst, src, 1);
-}
-
-static INLINE void store_unaligned_u16_2x2(uint16_t *dst, uint32_t dst_stride,
-                                           uint16x4_t src) {
-  store_unaligned_u16_2x1(dst, src, 0);
-  dst += dst_stride;
-  store_unaligned_u16_2x1(dst, src, 1);
-}
-
-static INLINE void store_unaligned_u16_4x2(uint16_t *dst, uint32_t dst_stride,
-                                           uint16x8_t src) {
-  store_unaligned_u16_4x1(dst, src, 0);
-  dst += dst_stride;
-  store_unaligned_u16_4x1(dst, src, 1);
-}
-
 static INLINE uint8x8_t load_u8_gather_s16_x8(const uint8_t *src,
                                               int16x8_t indices) {
   // Recent Clang and GCC versions correctly identify that this zero-broadcast
@@ -1246,4 +1161,83 @@
   return ret;
 }
 
+// The `lane` parameter here must be an immediate.
+#define store_u8_2x1(dst, src, lane)                            \
+  do {                                                          \
+    uint16_t a = vget_lane_u16(vreinterpret_u16_u8(src), lane); \
+    memcpy(dst, &a, 2);                                         \
+  } while (0)
+
+#define store_u8_4x1(dst, src, lane)                            \
+  do {                                                          \
+    uint32_t a = vget_lane_u32(vreinterpret_u32_u8(src), lane); \
+    memcpy(dst, &a, 4);                                         \
+  } while (0)
+
+#define store_u16_2x1_lane(dst, src, lane)                       \
+  do {                                                           \
+    uint32_t a = vget_lane_u32(vreinterpret_u32_u16(src), lane); \
+    memcpy(dst, &a, 4);                                          \
+  } while (0)
+
+#define store_u16_4x1_lane(dst, src, lane)                         \
+  do {                                                             \
+    uint64_t a = vgetq_lane_u64(vreinterpretq_u64_u16(src), lane); \
+    memcpy(dst, &a, 8);                                            \
+  } while (0)
+
+// Store two blocks of 16-bits from a single vector.
+static INLINE void store_u8x2_strided_x2(uint8_t *dst, uint32_t dst_stride,
+                                         uint8x8_t src) {
+  store_u8_2x1(dst, src, 0);
+  dst += dst_stride;
+  store_u8_2x1(dst, src, 1);
+}
+
+// Store two blocks of 32-bits from a single vector.
+static INLINE void store_u8x4_strided_x2(uint8_t *dst, ptrdiff_t stride,
+                                         uint8x8_t src) {
+  store_u8_4x1(dst, src, 0);
+  dst += stride;
+  store_u8_4x1(dst, src, 1);
+}
+
+// Store four blocks of 32-bits from a single vector.
+static INLINE void store_u8x4_strided_x4(uint8_t *dst, ptrdiff_t stride,
+                                         uint8x16_t src) {
+  store_u8_4x1(dst, vget_low_u8(src), 0);
+  dst += stride;
+  store_u8_4x1(dst, vget_low_u8(src), 1);
+  dst += stride;
+  store_u8_4x1(dst, vget_high_u8(src), 0);
+  dst += stride;
+  store_u8_4x1(dst, vget_high_u8(src), 1);
+}
+
+// Store the low 32-bits from a single vector.
+static INLINE void store_u16_2x1(uint16_t *dst, const uint16x4_t src) {
+  store_u16_2x1_lane(dst, src, 0);
+}
+
+// Store two blocks of 32-bits from a single vector.
+static INLINE void store_u16x2_strided_x2(uint16_t *dst, uint32_t dst_stride,
+                                          uint16x4_t src) {
+  store_u16_2x1_lane(dst, src, 0);
+  dst += dst_stride;
+  store_u16_2x1_lane(dst, src, 1);
+}
+
+// Store two blocks of 64-bits from a single vector.
+static INLINE void store_u16x4_strided_x2(uint16_t *dst, uint32_t dst_stride,
+                                          uint16x8_t src) {
+  store_u16_4x1_lane(dst, src, 0);
+  dst += dst_stride;
+  store_u16_4x1_lane(dst, src, 1);
+}
+
+// The store_u8_2x1 and store_u8_4x1 macros are needed elsewhere so don't
+// #undef them.
+#undef store_u16_2x1_lane
+#undef store_u16_4x1_lane
+
 #endif  // AOM_AOM_DSP_ARM_MEM_NEON_H_
diff --git a/av1/common/arm/blend_a64_hmask_neon.c b/av1/common/arm/blend_a64_hmask_neon.c
index 22d2977..7afb1a9 100644
--- a/av1/common/arm/blend_a64_hmask_neon.c
+++ b/av1/common/arm/blend_a64_hmask_neon.c
@@ -73,7 +73,7 @@
 
       uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1);
 
-      store_unaligned_u8_4x2(dst, dst_stride, blend);
+      store_u8x4_strided_x2(dst, dst_stride, blend);
 
       src0 += 2 * src0_stride;
       src1 += 2 * src1_stride;
@@ -88,7 +88,7 @@
 
       uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1);
 
-      store_unaligned_u8_2x2(dst, dst_stride, blend);
+      store_u8x2_strided_x2(dst, dst_stride, blend);
 
       src0 += 2 * src0_stride;
       src1 += 2 * src1_stride;
diff --git a/av1/common/arm/blend_a64_vmask_neon.c b/av1/common/arm/blend_a64_vmask_neon.c
index d53d363..9aea299 100644
--- a/av1/common/arm/blend_a64_vmask_neon.c
+++ b/av1/common/arm/blend_a64_vmask_neon.c
@@ -78,7 +78,7 @@
 
       uint8x8_t blend = alpha_blend_a64_u8x8(m, s0, s1);
 
-      store_unaligned_u8_4x2(dst, dst_stride, blend);
+      store_u8x4_strided_x2(dst, dst_stride, blend);
 
       mask += 2;
       src0 += 2 * src0_stride;
@@ -97,7 +97,7 @@
 
       uint8x8_t blend = alpha_blend_a64_u8x8(m, s0, s1);
 
-      store_unaligned_u8_2x2(dst, dst_stride, blend);
+      store_u8x2_strided_x2(dst, dst_stride, blend);
 
       mask += 2;
       src0 += 2 * src0_stride;
diff --git a/av1/common/arm/cdef_block_neon.c b/av1/common/arm/cdef_block_neon.c
index 53b61e2..53d3a9f 100644
--- a/av1/common/arm/cdef_block_neon.c
+++ b/av1/common/arm/cdef_block_neon.c
@@ -712,7 +712,7 @@
                           vreinterpretq_s16_u16(max));
 
       const uint8x8_t res_u8 = vqmovun_s16(res_s16);
-      store_unaligned_u8_4x2(dst8, dstride, res_u8);
+      store_u8x4_strided_x2(dst8, dstride, res_u8);
 
       in += 2 * CDEF_BSTRIDE;
       dst8 += 2 * dstride;
@@ -794,7 +794,7 @@
       const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);
 
       const uint8x8_t res_u8 = vqmovun_s16(res_s16);
-      store_unaligned_u8_4x2(dst8, dstride, res_u8);
+      store_u8x4_strided_x2(dst8, dstride, res_u8);
 
       in += 2 * CDEF_BSTRIDE;
       dst8 += 2 * dstride;
@@ -886,7 +886,7 @@
       const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);
 
       const uint8x8_t res_u8 = vqmovun_s16(res_s16);
-      store_unaligned_u8_4x2(dst8, dstride, res_u8);
+      store_u8x4_strided_x2(dst8, dstride, res_u8);
 
       in += 2 * CDEF_BSTRIDE;
       dst8 += 2 * dstride;
@@ -925,7 +925,7 @@
     do {
       const uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);
       const uint8x8_t res = vqmovn_u16(s);
-      store_unaligned_u8_4x2(dst8, dstride, res);
+      store_u8x4_strided_x2(dst8, dstride, res);
 
       in += 2 * CDEF_BSTRIDE;
       dst8 += 2 * dstride;
@@ -1139,7 +1139,7 @@
       res = vminq_s16(vmaxq_s16(res, vreinterpretq_s16_u16(min)),
                       vreinterpretq_s16_u16(max));
 
-      store_unaligned_u16_4x2(dst16, dstride, vreinterpretq_u16_s16(res));
+      store_u16x4_strided_x2(dst16, dstride, vreinterpretq_u16_s16(res));
 
       in += 2 * CDEF_BSTRIDE;
       dst16 += 2 * dstride;
@@ -1218,7 +1218,7 @@
           vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
       const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);
 
-      store_unaligned_u16_4x2(dst16, dstride, vreinterpretq_u16_s16(res));
+      store_u16x4_strided_x2(dst16, dstride, vreinterpretq_u16_s16(res));
 
       in += 2 * CDEF_BSTRIDE;
       dst16 += 2 * dstride;
@@ -1308,7 +1308,7 @@
           vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
       const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);
 
-      store_unaligned_u16_4x2(dst16, dstride, vreinterpretq_u16_s16(res));
+      store_u16x4_strided_x2(dst16, dstride, vreinterpretq_u16_s16(res));
 
       in += 2 * CDEF_BSTRIDE;
       dst16 += 2 * dstride;
@@ -1345,7 +1345,7 @@
     int h = block_height;
     do {
       const uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);
-      store_unaligned_u16_4x2(dst16, dstride, s);
+      store_u16x4_strided_x2(dst16, dstride, s);
 
       in += 2 * CDEF_BSTRIDE;
       dst16 += 2 * dstride;
diff --git a/av1/common/arm/highbd_compound_convolve_neon.c b/av1/common/arm/highbd_compound_convolve_neon.c
index dc3f876..fc03a2e 100644
--- a/av1/common/arm/highbd_compound_convolve_neon.c
+++ b/av1/common/arm/highbd_compound_convolve_neon.c
@@ -1235,7 +1235,7 @@
       uint16x4_t d = vshl_u16(s, round_shift_s16);
       d = vadd_u16(d, offset_u16);
       if (w == 2) {
-        store_u16_2x1(dst_ptr + y * dst_stride, d, 0);
+        store_u16_2x1(dst_ptr + y * dst_stride, d);
       } else {
         vst1_u16(dst_ptr + y * dst_stride, d);
       }
diff --git a/av1/common/arm/highbd_convolve_horiz_rs_neon.c b/av1/common/arm/highbd_convolve_horiz_rs_neon.c
index 51da025..4f1c25d 100644
--- a/av1/common/arm/highbd_convolve_horiz_rs_neon.c
+++ b/av1/common/arm/highbd_convolve_horiz_rs_neon.c
@@ -142,9 +142,9 @@
       d0 = vmin_u16(d0, max);
 
       if (w == 2) {
-        store_u16_2x1(d + 0 * dst_stride, d0, 0);
+        store_u16_2x1(d, d0);
       } else {
-        vst1_u16(d + 0 * dst_stride, d0);
+        vst1_u16(d, d0);
       }
 
       src_ptr += src_stride;
diff --git a/av1/common/arm/highbd_convolve_neon.c b/av1/common/arm/highbd_convolve_neon.c
index 3f5ff9e..3a3e33f 100644
--- a/av1/common/arm/highbd_convolve_neon.c
+++ b/av1/common/arm/highbd_convolve_neon.c
@@ -1927,7 +1927,7 @@
       uint16x4_t d0 = vrhadd_u16(s0, s1);
 
       if (w == 2) {
-        store_u16_2x1(dst, d0, 0);
+        store_u16_2x1(dst, d0);
       } else {
         vst1_u16(dst, d0);
       }
@@ -1978,7 +1978,7 @@
       uint16x4_t d0 = vrhadd_u16(s0, s1);
 
       if (w == 2) {
-        store_u16_2x1(dst, d0, 0);
+        store_u16_2x1(dst, d0);
       } else {
         vst1_u16(dst, d0);
       }
@@ -2086,7 +2086,7 @@
       d0 = vhadd_u16(d0, vget_low_u16(vert_offset));
 
       if (w == 2) {
-        store_u16_2x1(dst, d0, 0);
+        store_u16_2x1(dst, d0);
       } else {
         vst1_u16(dst, d0);
       }
diff --git a/av1/common/arm/highbd_convolve_scale_neon.c b/av1/common/arm/highbd_convolve_scale_neon.c
index eee5a1c..702c651 100644
--- a/av1/common/arm/highbd_convolve_scale_neon.c
+++ b/av1/common/arm/highbd_convolve_scale_neon.c
@@ -51,7 +51,7 @@
       d0_u16 = vmin_u16(d0_u16, vget_low_u16(max));
 
       if (w == 2) {
-        store_u16_2x1(dst_ptr, d0_u16, 0);
+        store_u16_2x1(dst_ptr, d0_u16);
       } else {
         vst1_u16(dst_ptr, d0_u16);
       }
@@ -123,7 +123,7 @@
       d0_u16 = vmin_u16(d0_u16, vget_low_u16(max));
 
       if (w == 2) {
-        store_u16_2x1(dst_ptr, d0_u16, 0);
+        store_u16_2x1(dst_ptr, d0_u16);
       } else {
         vst1_u16(dst_ptr, d0_u16);
       }
@@ -260,9 +260,9 @@
           s0, s1, s2, s3, filters_lo, filters_hi, shift_s32, offset_s32);
 
       if (w == 2) {
-        store_u16_2x1(d + 0 * dst_stride, d0, 0);
+        store_u16_2x1(d, d0);
       } else {
-        vst1_u16(d + 0 * dst_stride, d0);
+        vst1_u16(d, d0);
       }
 
       src_ptr += src_stride;
@@ -398,7 +398,7 @@
           offset_s32, vdupq_n_s32(0));
 
       if (w == 2) {
-        store_u16_2x1(d, d0, 0);
+        store_u16_2x1(d, d0);
       } else {
         vst1_u16(d, d0);
       }
@@ -458,7 +458,7 @@
       uint16x4_t d = vqmovun_s32(d0);
       d = vmin_u16(d, vget_low_u16(max));
       if (w == 2) {
-        store_u16_2x1(dst_ptr + y * dst_stride, d, 0);
+        store_u16_2x1(dst_ptr + y * dst_stride, d);
       } else {
         vst1_u16(dst_ptr + y * dst_stride, d);
       }
diff --git a/av1/encoder/arm/neon/reconinter_enc_neon.c b/av1/encoder/arm/neon/reconinter_enc_neon.c
index 03afa30..3d17723 100644
--- a/av1/encoder/arm/neon/reconinter_enc_neon.c
+++ b/av1/encoder/arm/neon/reconinter_enc_neon.c
@@ -222,8 +222,7 @@
       int i = height / 2;
       do {
         uint16x4_t r = load_u16_2x2(ref, ref_stride);
-        store_u16_2x1(comp_pred + 0 * width, r, 0);
-        store_u16_2x1(comp_pred + 1 * width, r, 1);
+        store_u16x2_strided_x2(comp_pred, width, r);
         ref += 2 * ref_stride;
         comp_pred += 2 * width;
       } while (--i != 0);