mem_neon.h: Introduce and use strided store helpers
Several of the z3 predictor implementations use sequences of consecutive
store-lane instructions to scatter parts of a vector to strided
locations in memory. To avoid duplicating this pattern, add new helpers
to mem_neon.h for the strided-scatter operation.
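For example (shapes taken from the mem_neon.h hunk below), the new u8
helper scatters the two 32-bit halves of a uint8x8_t across consecutive
rows:

  store_u8x4_strided_x2(dst, stride, blend);
  // is equivalent to:
  store_u8_4x1(dst + 0 * stride, blend, 0);  // low four bytes
  store_u8_4x1(dst + 1 * stride, blend, 1);  // high four bytes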
Similar helpers already exist (the store_unaligned_* functions), but
their naming scheme clashes with the multi-vector store helpers, so
rename them to the *_strided_* scheme.
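To illustrate the clash: in the multi-vector scheme a name like
store_u16_4x2 would read as "store two uint16x4_t vectors, one per row",
whereas the old scatter helpers with that shape store pieces of a single
vector (the multi-vector signature below is sketched for illustration
only):

  // Multi-vector scheme: one vector per row.
  store_u16_4x2(dst, stride, s0, s1);
  // Old scatter helper: the two halves of a single uint16x8_t.
  store_unaligned_u16_4x2(dst, stride, s);
  // New naming makes the single-vector scatter explicit.
  store_u16x4_strided_x2(dst, stride, s);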
Remove the following macros, since they were either never used or their
uses can now be replaced with calls to the strided store helpers (an
example follows the list):
* store_s16_2x1(s, s0, lane)
* store_u16_2x1(s, s0, lane)
* store_u16q_2x1(s, s0, lane)
* store_unaligned_u8_4x1(dst, src, lane)
* store_unaligned_u8_2x1(dst, src, lane)
* store_unaligned_u16_2x1(dst, src, lane)
* store_unaligned_u16_4x1(dst, src, lane)
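For example, the pair of lane stores in reconinter_enc_neon.c collapses
into a single strided helper call (taken from the diff below):

  // Before:
  store_u16_2x1(comp_pred + 0 * width, r, 0);
  store_u16_2x1(comp_pred + 1 * width, r, 1);
  // After:
  store_u16x2_strided_x2(comp_pred, width, r);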
The remaining (non-strided) uses of store_u16_2x1 only ever store lane 0
of the vector, so it can now be turned into a function rather than a
macro.
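The resulting function (see the mem_neon.h hunk below) stores the low
32 bits of a uint16x4_t, and call sites simply drop the trailing lane
argument:

  static INLINE void store_u16_2x1(uint16_t *dst, const uint16x4_t src) {
    store_u16_2x1_lane(dst, src, 0);
  }
  // e.g. in the convolve kernels:
  store_u16_2x1(dst_ptr, d0_u16);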
With this change, the only macros still exposed from mem_neon.h are
store_u8_2x1 and store_u8_4x1. Converting these is left for a later
commit since they have many more existing uses, primarily in the
convolve kernels.
Change-Id: I59843f54e3a443fa8eefb3e6bf78baa68b3698b0
diff --git a/aom_dsp/arm/blend_a64_mask_neon.c b/aom_dsp/arm/blend_a64_mask_neon.c
index 7b1b66a..1bc3b80 100644
--- a/aom_dsp/arm/blend_a64_mask_neon.c
+++ b/aom_dsp/arm/blend_a64_mask_neon.c
@@ -91,7 +91,7 @@
uint8x8_t blend = alpha_blend_a64_d16_u16x8(m0, s0, s1, offset_vec);
- store_unaligned_u8_4x2(dst, dst_stride, blend);
+ store_u8x4_strided_x2(dst, dst_stride, blend);
mask += 2 * mask_stride;
src0 += 2 * src0_stride;
@@ -139,7 +139,7 @@
uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3));
uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
- store_unaligned_u8_4x2(dst, dst_stride, blend);
+ store_u8x4_strided_x2(dst, dst_stride, blend);
mask += 4 * mask_stride;
src0 += 2 * src0_stride;
@@ -181,7 +181,7 @@
uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1));
uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
- store_unaligned_u8_4x2(dst, dst_stride, blend);
+ store_u8x4_strided_x2(dst, dst_stride, blend);
mask += 2 * mask_stride;
src0 += 2 * src0_stride;
@@ -225,7 +225,7 @@
uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0_2, m1_3));
uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
- store_unaligned_u8_4x2(dst, dst_stride, blend);
+ store_u8x4_strided_x2(dst, dst_stride, blend);
mask += 4 * mask_stride;
src0 += 2 * src0_stride;
@@ -293,7 +293,7 @@
uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1);
- store_unaligned_u8_4x2(dst, dst_stride, blend);
+ store_u8x4_strided_x2(dst, dst_stride, blend);
mask += 2 * mask_stride;
src0 += 2 * src0_stride;
@@ -358,7 +358,7 @@
uint8x8_t m_avg = avg_blend_pairwise_u8x8_4(m0, m1, m2, m3);
uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
- store_unaligned_u8_4x2(dst, dst_stride, blend);
+ store_u8x4_strided_x2(dst, dst_stride, blend);
mask += 4 * mask_stride;
src0 += 2 * src0_stride;
@@ -418,7 +418,7 @@
uint8x8_t m_avg = avg_blend_pairwise_u8x8(m0, m1);
uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
- store_unaligned_u8_4x2(dst, dst_stride, blend);
+ store_u8x4_strided_x2(dst, dst_stride, blend);
mask += 2 * mask_stride;
src0 += 2 * src0_stride;
@@ -479,7 +479,7 @@
uint8x8_t m_avg = avg_blend_u8x8(m0_2, m1_3);
uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
- store_unaligned_u8_4x2(dst, dst_stride, blend);
+ store_u8x4_strided_x2(dst, dst_stride, blend);
mask += 4 * mask_stride;
src0 += 2 * src0_stride;
diff --git a/aom_dsp/arm/highbd_blend_a64_hmask_neon.c b/aom_dsp/arm/highbd_blend_a64_hmask_neon.c
index bdd2177..8b03e91 100644
--- a/aom_dsp/arm/highbd_blend_a64_hmask_neon.c
+++ b/aom_dsp/arm/highbd_blend_a64_hmask_neon.c
@@ -67,7 +67,7 @@
uint16x8_t blend = alpha_blend_a64_u16x8(m0, s0, s1);
- store_unaligned_u16_4x2(dst, dst_stride, blend);
+ store_u16x4_strided_x2(dst, dst_stride, blend);
src0 += 2 * src0_stride;
src1 += 2 * src1_stride;
@@ -83,7 +83,7 @@
uint16x4_t blend = alpha_blend_a64_u16x4(m0, s0, s1);
- store_unaligned_u16_2x2(dst, dst_stride, blend);
+ store_u16x2_strided_x2(dst, dst_stride, blend);
src0 += 2 * src0_stride;
src1 += 2 * src1_stride;
diff --git a/aom_dsp/arm/highbd_blend_a64_mask_neon.c b/aom_dsp/arm/highbd_blend_a64_mask_neon.c
index 36d763a..90b44fc 100644
--- a/aom_dsp/arm/highbd_blend_a64_mask_neon.c
+++ b/aom_dsp/arm/highbd_blend_a64_mask_neon.c
@@ -91,7 +91,7 @@
uint16x8_t blend = \
alpha_##bd##_blend_a64_d16_u16x8(m0, s0, s1, offset); \
\
- store_unaligned_u16_4x2(dst, dst_stride, blend); \
+ store_u16x4_strided_x2(dst, dst_stride, blend); \
\
mask += 2 * mask_stride; \
src0 += 2 * src0_stride; \
@@ -139,7 +139,7 @@
uint16x8_t blend = \
alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \
\
- store_unaligned_u16_4x2(dst, dst_stride, blend); \
+ store_u16x4_strided_x2(dst, dst_stride, blend); \
\
mask += 4 * mask_stride; \
src0 += 2 * src0_stride; \
@@ -182,7 +182,7 @@
uint16x8_t blend = \
alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \
\
- store_unaligned_u16_4x2(dst, dst_stride, blend); \
+ store_u16x4_strided_x2(dst, dst_stride, blend); \
\
mask += 2 * mask_stride; \
src0 += 2 * src0_stride; \
@@ -227,7 +227,7 @@
uint16x8_t blend = \
alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \
\
- store_unaligned_u16_4x2(dst, dst_stride, blend); \
+ store_u16x4_strided_x2(dst, dst_stride, blend); \
\
mask += 4 * mask_stride; \
src0 += 2 * src0_stride; \
@@ -325,7 +325,7 @@
uint16x8_t blend = alpha_blend_a64_u16x8(m0, s0, s1);
- store_unaligned_u16_4x2(dst, dst_stride, blend);
+ store_u16x4_strided_x2(dst, dst_stride, blend);
mask += 2 * mask_stride;
src0 += 2 * src0_stride;
@@ -373,7 +373,7 @@
uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3));
uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
- store_unaligned_u16_4x2(dst, dst_stride, blend);
+ store_u16x4_strided_x2(dst, dst_stride, blend);
mask += 4 * mask_stride;
src0 += 2 * src0_stride;
@@ -416,7 +416,7 @@
uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1));
uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
- store_unaligned_u16_4x2(dst, dst_stride, blend);
+ store_u16x4_strided_x2(dst, dst_stride, blend);
mask += 2 * mask_stride;
src0 += 2 * src0_stride;
@@ -460,7 +460,7 @@
uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0_2, m1_3));
uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
- store_unaligned_u16_4x2(dst, dst_stride, blend);
+ store_u16x4_strided_x2(dst, dst_stride, blend);
mask += 4 * mask_stride;
src0 += 2 * src0_stride;
diff --git a/aom_dsp/arm/highbd_blend_a64_vmask_neon.c b/aom_dsp/arm/highbd_blend_a64_vmask_neon.c
index ea3d655..1292e20 100644
--- a/aom_dsp/arm/highbd_blend_a64_vmask_neon.c
+++ b/aom_dsp/arm/highbd_blend_a64_vmask_neon.c
@@ -70,7 +70,7 @@
uint16x8_t blend = alpha_blend_a64_u16x8(m, s0, s1);
- store_unaligned_u16_4x2(dst, dst_stride, blend);
+ store_u16x4_strided_x2(dst, dst_stride, blend);
mask += 2;
src0 += 2 * src0_stride;
@@ -90,7 +90,7 @@
uint16x4_t blend = alpha_blend_a64_u16x4(m0, s0, s1);
- store_unaligned_u16_2x2(dst, dst_stride, blend);
+ store_u16x2_strided_x2(dst, dst_stride, blend);
mask += 2;
src0 += 2 * src0_stride;
diff --git a/aom_dsp/arm/intrapred_neon.c b/aom_dsp/arm/intrapred_neon.c
index c0ddcf9..2a5ac75 100644
--- a/aom_dsp/arm/intrapred_neon.c
+++ b/aom_dsp/arm/intrapred_neon.c
@@ -2741,14 +2741,10 @@
dr_prediction_z1_HxW_internal_neon_64(4, 4, dstvec, left, upsample_left, dy);
transpose4x8_8x4_low_neon(dstvec, &dest);
- vst1_lane_u32((uint32_t *)(dst + stride * 0),
- vreinterpret_u32_u16(dest.val[0]), 0);
- vst1_lane_u32((uint32_t *)(dst + stride * 1),
- vreinterpret_u32_u16(dest.val[0]), 1);
- vst1_lane_u32((uint32_t *)(dst + stride * 2),
- vreinterpret_u32_u16(dest.val[1]), 0);
- vst1_lane_u32((uint32_t *)(dst + stride * 3),
- vreinterpret_u32_u16(dest.val[1]), 1);
+ store_u8x4_strided_x2(dst + stride * 0, stride,
+ vreinterpret_u8_u16(dest.val[0]));
+ store_u8x4_strided_x2(dst + stride * 2, stride,
+ vreinterpret_u8_u16(dest.val[1]));
}
static void dr_prediction_z3_8x8_neon(uint8_t *dst, ptrdiff_t stride,
@@ -2777,22 +2773,14 @@
dr_prediction_z1_HxW_internal_neon_64(8, 4, dstvec, left, upsample_left, dy);
transpose4x8_8x4_neon(dstvec, d);
- vst1_lane_u32((uint32_t *)(dst + stride * 0),
- vreinterpret_u32_u16(d[0].val[0]), 0);
- vst1_lane_u32((uint32_t *)(dst + stride * 1),
- vreinterpret_u32_u16(d[0].val[0]), 1);
- vst1_lane_u32((uint32_t *)(dst + stride * 2),
- vreinterpret_u32_u16(d[0].val[1]), 0);
- vst1_lane_u32((uint32_t *)(dst + stride * 3),
- vreinterpret_u32_u16(d[0].val[1]), 1);
- vst1_lane_u32((uint32_t *)(dst + stride * 4),
- vreinterpret_u32_u16(d[1].val[0]), 0);
- vst1_lane_u32((uint32_t *)(dst + stride * 5),
- vreinterpret_u32_u16(d[1].val[0]), 1);
- vst1_lane_u32((uint32_t *)(dst + stride * 6),
- vreinterpret_u32_u16(d[1].val[1]), 0);
- vst1_lane_u32((uint32_t *)(dst + stride * 7),
- vreinterpret_u32_u16(d[1].val[1]), 1);
+ store_u8x4_strided_x2(dst + stride * 0, stride,
+ vreinterpret_u8_u16(d[0].val[0]));
+ store_u8x4_strided_x2(dst + stride * 2, stride,
+ vreinterpret_u8_u16(d[0].val[1]));
+ store_u8x4_strided_x2(dst + stride * 4, stride,
+ vreinterpret_u8_u16(d[1].val[0]));
+ store_u8x4_strided_x2(dst + stride * 6, stride,
+ vreinterpret_u8_u16(d[1].val[1]));
}
static void dr_prediction_z3_8x4_neon(uint8_t *dst, ptrdiff_t stride,
@@ -2844,41 +2832,14 @@
dr_prediction_z1_HxW_internal_neon(16, 4, dstvec, left, upsample_left, dy);
transpose4x16_neon(dstvec, d);
- vst1q_lane_u32((uint32_t *)(dst + stride * 0),
- vreinterpretq_u32_u16(d[0].val[0]), 0);
- vst1q_lane_u32((uint32_t *)(dst + stride * 1),
- vreinterpretq_u32_u16(d[0].val[0]), 1);
- vst1q_lane_u32((uint32_t *)(dst + stride * 2),
- vreinterpretq_u32_u16(d[0].val[0]), 2);
- vst1q_lane_u32((uint32_t *)(dst + stride * 3),
- vreinterpretq_u32_u16(d[0].val[0]), 3);
-
- vst1q_lane_u32((uint32_t *)(dst + stride * 4),
- vreinterpretq_u32_u16(d[0].val[1]), 0);
- vst1q_lane_u32((uint32_t *)(dst + stride * 5),
- vreinterpretq_u32_u16(d[0].val[1]), 1);
- vst1q_lane_u32((uint32_t *)(dst + stride * 6),
- vreinterpretq_u32_u16(d[0].val[1]), 2);
- vst1q_lane_u32((uint32_t *)(dst + stride * 7),
- vreinterpretq_u32_u16(d[0].val[1]), 3);
-
- vst1q_lane_u32((uint32_t *)(dst + stride * 8),
- vreinterpretq_u32_u16(d[1].val[0]), 0);
- vst1q_lane_u32((uint32_t *)(dst + stride * 9),
- vreinterpretq_u32_u16(d[1].val[0]), 1);
- vst1q_lane_u32((uint32_t *)(dst + stride * 10),
- vreinterpretq_u32_u16(d[1].val[0]), 2);
- vst1q_lane_u32((uint32_t *)(dst + stride * 11),
- vreinterpretq_u32_u16(d[1].val[0]), 3);
-
- vst1q_lane_u32((uint32_t *)(dst + stride * 12),
- vreinterpretq_u32_u16(d[1].val[1]), 0);
- vst1q_lane_u32((uint32_t *)(dst + stride * 13),
- vreinterpretq_u32_u16(d[1].val[1]), 1);
- vst1q_lane_u32((uint32_t *)(dst + stride * 14),
- vreinterpretq_u32_u16(d[1].val[1]), 2);
- vst1q_lane_u32((uint32_t *)(dst + stride * 15),
- vreinterpretq_u32_u16(d[1].val[1]), 3);
+ store_u8x4_strided_x4(dst + stride * 0, stride,
+ vreinterpretq_u8_u16(d[0].val[0]));
+ store_u8x4_strided_x4(dst + stride * 4, stride,
+ vreinterpretq_u8_u16(d[0].val[1]));
+ store_u8x4_strided_x4(dst + stride * 8, stride,
+ vreinterpretq_u8_u16(d[1].val[0]));
+ store_u8x4_strided_x4(dst + stride * 12, stride,
+ vreinterpretq_u8_u16(d[1].val[1]));
}
static void dr_prediction_z3_16x4_neon(uint8_t *dst, ptrdiff_t stride,
@@ -3731,7 +3692,7 @@
result = vbsl_u8(left_or_top_mask, result, top_left);
if (width == 4) {
- store_unaligned_u8_4x1(dest, result, 0);
+ store_u8_4x1(dest, result, 0);
} else { // width == 8
vst1_u8(dest, result);
}
diff --git a/aom_dsp/arm/loopfilter_neon.c b/aom_dsp/arm/loopfilter_neon.c
index 4704f05..3bf98cc 100644
--- a/aom_dsp/arm/loopfilter_neon.c
+++ b/aom_dsp/arm/loopfilter_neon.c
@@ -862,10 +862,8 @@
transpose_elems_inplace_u8_4x4(&p1p0, &q0q1);
- store_unaligned_u8_4x1(src - 2, p1p0, 0);
- store_unaligned_u8_4x1((src - 2) + 1 * stride, q0q1, 0);
- store_unaligned_u8_4x1((src - 2) + 2 * stride, p1p0, 1);
- store_unaligned_u8_4x1((src - 2) + 3 * stride, q0q1, 1);
+ store_u8x4_strided_x2(src - 2, 2 * stride, p1p0);
+ store_u8x4_strided_x2(src + stride - 2, 2 * stride, q0q1);
}
void aom_lpf_vertical_4_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0,
diff --git a/aom_dsp/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h
index 32a54a0..8426a26 100644
--- a/aom_dsp/arm/mem_neon.h
+++ b/aom_dsp/arm/mem_neon.h
@@ -457,18 +457,6 @@
*s3 = vld1_s16(s);
}
-/* These intrinsics require immediate values, so we must use #defines
- to enforce that. */
-#define store_u8_2x1(s, s0, lane) \
- do { \
- vst1_lane_u16((uint16_t *)(s), vreinterpret_u16_u8(s0), lane); \
- } while (0)
-
-#define store_u8_4x1(s, s0, lane) \
- do { \
- vst1_lane_u32((uint32_t *)(s), vreinterpret_u32_u8(s0), lane); \
- } while (0)
-
static INLINE void store_u8_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
const uint8x8_t s1, const uint8x8_t s2,
const uint8x8_t s3, const uint8x8_t s4,
@@ -602,21 +590,6 @@
vst1_s16(s, s3);
}
-/* These intrinsics require immediate values, so we must use #defines
- to enforce that. */
-#define store_s16_2x1(s, s0, lane) \
- do { \
- vst1_lane_s32((int32_t *)(s), vreinterpret_s32_s16(s0), lane); \
- } while (0)
-#define store_u16_2x1(s, s0, lane) \
- do { \
- vst1_lane_u32((uint32_t *)(s), vreinterpret_u32_u16(s0), lane); \
- } while (0)
-#define store_u16q_2x1(s, s0, lane) \
- do { \
- vst1q_lane_u32((uint32_t *)(s), vreinterpretq_u32_u16(s0), lane); \
- } while (0)
-
static INLINE void store_s16_8x4(int16_t *s, ptrdiff_t dst_stride,
const int16x8_t s0, const int16x8_t s1,
const int16x8_t s2, const int16x8_t s3) {
@@ -989,36 +962,6 @@
load_unaligned_u8_4x4(buf, stride, tu2, tu3);
}
-/* These intrinsics require immediate values, so we must use #defines
- to enforce that. */
-#define store_unaligned_u8_4x1(dst, src, lane) \
- do { \
- uint32_t a; \
- a = vget_lane_u32(vreinterpret_u32_u8(src), lane); \
- memcpy(dst, &a, 4); \
- } while (0)
-
-#define store_unaligned_u8_2x1(dst, src, lane) \
- do { \
- uint16_t a; \
- a = vget_lane_u16(vreinterpret_u16_u8(src), lane); \
- memcpy(dst, &a, 2); \
- } while (0)
-
-#define store_unaligned_u16_2x1(dst, src, lane) \
- do { \
- uint32_t a; \
- a = vget_lane_u32(vreinterpret_u32_u16(src), lane); \
- memcpy(dst, &a, 4); \
- } while (0)
-
-#define store_unaligned_u16_4x1(dst, src, lane) \
- do { \
- uint64_t a; \
- a = vgetq_lane_u64(vreinterpretq_u64_u16(src), lane); \
- memcpy(dst, &a, 8); \
- } while (0)
-
static INLINE void load_u8_16x8(const uint8_t *s, ptrdiff_t p,
uint8x16_t *const s0, uint8x16_t *const s1,
uint8x16_t *const s2, uint8x16_t *const s3,
@@ -1200,34 +1143,6 @@
vst1q_s32(buf, v0);
}
-static INLINE void store_unaligned_u8_2x2(uint8_t *dst, uint32_t dst_stride,
- uint8x8_t src) {
- store_unaligned_u8_2x1(dst, src, 0);
- dst += dst_stride;
- store_unaligned_u8_2x1(dst, src, 1);
-}
-
-static INLINE void store_unaligned_u8_4x2(uint8_t *dst, uint32_t dst_stride,
- uint8x8_t src) {
- store_unaligned_u8_4x1(dst, src, 0);
- dst += dst_stride;
- store_unaligned_u8_4x1(dst, src, 1);
-}
-
-static INLINE void store_unaligned_u16_2x2(uint16_t *dst, uint32_t dst_stride,
- uint16x4_t src) {
- store_unaligned_u16_2x1(dst, src, 0);
- dst += dst_stride;
- store_unaligned_u16_2x1(dst, src, 1);
-}
-
-static INLINE void store_unaligned_u16_4x2(uint16_t *dst, uint32_t dst_stride,
- uint16x8_t src) {
- store_unaligned_u16_4x1(dst, src, 0);
- dst += dst_stride;
- store_unaligned_u16_4x1(dst, src, 1);
-}
-
static INLINE uint8x8_t load_u8_gather_s16_x8(const uint8_t *src,
int16x8_t indices) {
// Recent Clang and GCC versions correctly identify that this zero-broadcast
@@ -1246,4 +1161,83 @@
return ret;
}
+// The `lane` parameter here must be an immediate.
+#define store_u8_2x1(dst, src, lane) \
+ do { \
+ uint16_t a = vget_lane_u16(vreinterpret_u16_u8(src), lane); \
+ memcpy(dst, &a, 2); \
+ } while (0)
+
+#define store_u8_4x1(dst, src, lane) \
+ do { \
+ uint32_t a = vget_lane_u32(vreinterpret_u32_u8(src), lane); \
+ memcpy(dst, &a, 4); \
+ } while (0)
+
+#define store_u16_2x1_lane(dst, src, lane) \
+ do { \
+ uint32_t a = vget_lane_u32(vreinterpret_u32_u16(src), lane); \
+ memcpy(dst, &a, 4); \
+ } while (0)
+
+#define store_u16_4x1_lane(dst, src, lane) \
+ do { \
+ uint64_t a = vgetq_lane_u64(vreinterpretq_u64_u16(src), lane); \
+ memcpy(dst, &a, 8); \
+ } while (0)
+
+// Store two blocks of 16-bits from a single vector.
+static INLINE void store_u8x2_strided_x2(uint8_t *dst, uint32_t dst_stride,
+ uint8x8_t src) {
+ store_u8_2x1(dst, src, 0);
+ dst += dst_stride;
+ store_u8_2x1(dst, src, 1);
+}
+
+// Store two blocks of 32-bits from a single vector.
+static INLINE void store_u8x4_strided_x2(uint8_t *dst, ptrdiff_t stride,
+ uint8x8_t src) {
+ store_u8_4x1(dst, src, 0);
+ dst += stride;
+ store_u8_4x1(dst, src, 1);
+}
+
+// Store four blocks of 32-bits from a single vector.
+static INLINE void store_u8x4_strided_x4(uint8_t *dst, ptrdiff_t stride,
+ uint8x16_t src) {
+ store_u8_4x1(dst, vget_low_u8(src), 0);
+ dst += stride;
+ store_u8_4x1(dst, vget_low_u8(src), 1);
+ dst += stride;
+ store_u8_4x1(dst, vget_high_u8(src), 0);
+ dst += stride;
+ store_u8_4x1(dst, vget_high_u8(src), 1);
+}
+
+// Store the low 32-bits from a single vector.
+static INLINE void store_u16_2x1(uint16_t *dst, const uint16x4_t src) {
+ store_u16_2x1_lane(dst, src, 0);
+}
+
+// Store two blocks of 32-bits from a single vector.
+static INLINE void store_u16x2_strided_x2(uint16_t *dst, uint32_t dst_stride,
+ uint16x4_t src) {
+ store_u16_2x1_lane(dst, src, 0);
+ dst += dst_stride;
+ store_u16_2x1_lane(dst, src, 1);
+}
+
+// Store two blocks of 64-bits from a single vector.
+static INLINE void store_u16x4_strided_x2(uint16_t *dst, uint32_t dst_stride,
+ uint16x8_t src) {
+ store_u16_4x1_lane(dst, src, 0);
+ dst += dst_stride;
+ store_u16_4x1_lane(dst, src, 1);
+}
+
+// The store_u8_2x1 and store_u8_4x1 macros are needed elsewhere so don't
+// #undef them.
+#undef store_u16_2x1_lane
+#undef store_u16_4x1_lane
+
#endif // AOM_AOM_DSP_ARM_MEM_NEON_H_
diff --git a/av1/common/arm/blend_a64_hmask_neon.c b/av1/common/arm/blend_a64_hmask_neon.c
index 22d2977..7afb1a9 100644
--- a/av1/common/arm/blend_a64_hmask_neon.c
+++ b/av1/common/arm/blend_a64_hmask_neon.c
@@ -73,7 +73,7 @@
uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1);
- store_unaligned_u8_4x2(dst, dst_stride, blend);
+ store_u8x4_strided_x2(dst, dst_stride, blend);
src0 += 2 * src0_stride;
src1 += 2 * src1_stride;
@@ -88,7 +88,7 @@
uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1);
- store_unaligned_u8_2x2(dst, dst_stride, blend);
+ store_u8x2_strided_x2(dst, dst_stride, blend);
src0 += 2 * src0_stride;
src1 += 2 * src1_stride;
diff --git a/av1/common/arm/blend_a64_vmask_neon.c b/av1/common/arm/blend_a64_vmask_neon.c
index d53d363..9aea299 100644
--- a/av1/common/arm/blend_a64_vmask_neon.c
+++ b/av1/common/arm/blend_a64_vmask_neon.c
@@ -78,7 +78,7 @@
uint8x8_t blend = alpha_blend_a64_u8x8(m, s0, s1);
- store_unaligned_u8_4x2(dst, dst_stride, blend);
+ store_u8x4_strided_x2(dst, dst_stride, blend);
mask += 2;
src0 += 2 * src0_stride;
@@ -97,7 +97,7 @@
uint8x8_t blend = alpha_blend_a64_u8x8(m, s0, s1);
- store_unaligned_u8_2x2(dst, dst_stride, blend);
+ store_u8x2_strided_x2(dst, dst_stride, blend);
mask += 2;
src0 += 2 * src0_stride;
diff --git a/av1/common/arm/cdef_block_neon.c b/av1/common/arm/cdef_block_neon.c
index 53b61e2..53d3a9f 100644
--- a/av1/common/arm/cdef_block_neon.c
+++ b/av1/common/arm/cdef_block_neon.c
@@ -712,7 +712,7 @@
vreinterpretq_s16_u16(max));
const uint8x8_t res_u8 = vqmovun_s16(res_s16);
- store_unaligned_u8_4x2(dst8, dstride, res_u8);
+ store_u8x4_strided_x2(dst8, dstride, res_u8);
in += 2 * CDEF_BSTRIDE;
dst8 += 2 * dstride;
@@ -794,7 +794,7 @@
const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);
const uint8x8_t res_u8 = vqmovun_s16(res_s16);
- store_unaligned_u8_4x2(dst8, dstride, res_u8);
+ store_u8x4_strided_x2(dst8, dstride, res_u8);
in += 2 * CDEF_BSTRIDE;
dst8 += 2 * dstride;
@@ -886,7 +886,7 @@
const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);
const uint8x8_t res_u8 = vqmovun_s16(res_s16);
- store_unaligned_u8_4x2(dst8, dstride, res_u8);
+ store_u8x4_strided_x2(dst8, dstride, res_u8);
in += 2 * CDEF_BSTRIDE;
dst8 += 2 * dstride;
@@ -925,7 +925,7 @@
do {
const uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);
const uint8x8_t res = vqmovn_u16(s);
- store_unaligned_u8_4x2(dst8, dstride, res);
+ store_u8x4_strided_x2(dst8, dstride, res);
in += 2 * CDEF_BSTRIDE;
dst8 += 2 * dstride;
@@ -1139,7 +1139,7 @@
res = vminq_s16(vmaxq_s16(res, vreinterpretq_s16_u16(min)),
vreinterpretq_s16_u16(max));
- store_unaligned_u16_4x2(dst16, dstride, vreinterpretq_u16_s16(res));
+ store_u16x4_strided_x2(dst16, dstride, vreinterpretq_u16_s16(res));
in += 2 * CDEF_BSTRIDE;
dst16 += 2 * dstride;
@@ -1218,7 +1218,7 @@
vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);
- store_unaligned_u16_4x2(dst16, dstride, vreinterpretq_u16_s16(res));
+ store_u16x4_strided_x2(dst16, dstride, vreinterpretq_u16_s16(res));
in += 2 * CDEF_BSTRIDE;
dst16 += 2 * dstride;
@@ -1308,7 +1308,7 @@
vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);
- store_unaligned_u16_4x2(dst16, dstride, vreinterpretq_u16_s16(res));
+ store_u16x4_strided_x2(dst16, dstride, vreinterpretq_u16_s16(res));
in += 2 * CDEF_BSTRIDE;
dst16 += 2 * dstride;
@@ -1345,7 +1345,7 @@
int h = block_height;
do {
const uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);
- store_unaligned_u16_4x2(dst16, dstride, s);
+ store_u16x4_strided_x2(dst16, dstride, s);
in += 2 * CDEF_BSTRIDE;
dst16 += 2 * dstride;
diff --git a/av1/common/arm/highbd_compound_convolve_neon.c b/av1/common/arm/highbd_compound_convolve_neon.c
index dc3f876..fc03a2e 100644
--- a/av1/common/arm/highbd_compound_convolve_neon.c
+++ b/av1/common/arm/highbd_compound_convolve_neon.c
@@ -1235,7 +1235,7 @@
uint16x4_t d = vshl_u16(s, round_shift_s16);
d = vadd_u16(d, offset_u16);
if (w == 2) {
- store_u16_2x1(dst_ptr + y * dst_stride, d, 0);
+ store_u16_2x1(dst_ptr + y * dst_stride, d);
} else {
vst1_u16(dst_ptr + y * dst_stride, d);
}
diff --git a/av1/common/arm/highbd_convolve_horiz_rs_neon.c b/av1/common/arm/highbd_convolve_horiz_rs_neon.c
index 51da025..4f1c25d 100644
--- a/av1/common/arm/highbd_convolve_horiz_rs_neon.c
+++ b/av1/common/arm/highbd_convolve_horiz_rs_neon.c
@@ -142,9 +142,9 @@
d0 = vmin_u16(d0, max);
if (w == 2) {
- store_u16_2x1(d + 0 * dst_stride, d0, 0);
+ store_u16_2x1(d, d0);
} else {
- vst1_u16(d + 0 * dst_stride, d0);
+ vst1_u16(d, d0);
}
src_ptr += src_stride;
diff --git a/av1/common/arm/highbd_convolve_neon.c b/av1/common/arm/highbd_convolve_neon.c
index 3f5ff9e..3a3e33f 100644
--- a/av1/common/arm/highbd_convolve_neon.c
+++ b/av1/common/arm/highbd_convolve_neon.c
@@ -1927,7 +1927,7 @@
uint16x4_t d0 = vrhadd_u16(s0, s1);
if (w == 2) {
- store_u16_2x1(dst, d0, 0);
+ store_u16_2x1(dst, d0);
} else {
vst1_u16(dst, d0);
}
@@ -1978,7 +1978,7 @@
uint16x4_t d0 = vrhadd_u16(s0, s1);
if (w == 2) {
- store_u16_2x1(dst, d0, 0);
+ store_u16_2x1(dst, d0);
} else {
vst1_u16(dst, d0);
}
@@ -2086,7 +2086,7 @@
d0 = vhadd_u16(d0, vget_low_u16(vert_offset));
if (w == 2) {
- store_u16_2x1(dst, d0, 0);
+ store_u16_2x1(dst, d0);
} else {
vst1_u16(dst, d0);
}
diff --git a/av1/common/arm/highbd_convolve_scale_neon.c b/av1/common/arm/highbd_convolve_scale_neon.c
index eee5a1c..702c651 100644
--- a/av1/common/arm/highbd_convolve_scale_neon.c
+++ b/av1/common/arm/highbd_convolve_scale_neon.c
@@ -51,7 +51,7 @@
d0_u16 = vmin_u16(d0_u16, vget_low_u16(max));
if (w == 2) {
- store_u16_2x1(dst_ptr, d0_u16, 0);
+ store_u16_2x1(dst_ptr, d0_u16);
} else {
vst1_u16(dst_ptr, d0_u16);
}
@@ -123,7 +123,7 @@
d0_u16 = vmin_u16(d0_u16, vget_low_u16(max));
if (w == 2) {
- store_u16_2x1(dst_ptr, d0_u16, 0);
+ store_u16_2x1(dst_ptr, d0_u16);
} else {
vst1_u16(dst_ptr, d0_u16);
}
@@ -260,9 +260,9 @@
s0, s1, s2, s3, filters_lo, filters_hi, shift_s32, offset_s32);
if (w == 2) {
- store_u16_2x1(d + 0 * dst_stride, d0, 0);
+ store_u16_2x1(d, d0);
} else {
- vst1_u16(d + 0 * dst_stride, d0);
+ vst1_u16(d, d0);
}
src_ptr += src_stride;
@@ -398,7 +398,7 @@
offset_s32, vdupq_n_s32(0));
if (w == 2) {
- store_u16_2x1(d, d0, 0);
+ store_u16_2x1(d, d0);
} else {
vst1_u16(d, d0);
}
@@ -458,7 +458,7 @@
uint16x4_t d = vqmovun_s32(d0);
d = vmin_u16(d, vget_low_u16(max));
if (w == 2) {
- store_u16_2x1(dst_ptr + y * dst_stride, d, 0);
+ store_u16_2x1(dst_ptr + y * dst_stride, d);
} else {
vst1_u16(dst_ptr + y * dst_stride, d);
}
diff --git a/av1/encoder/arm/neon/reconinter_enc_neon.c b/av1/encoder/arm/neon/reconinter_enc_neon.c
index 03afa30..3d17723 100644
--- a/av1/encoder/arm/neon/reconinter_enc_neon.c
+++ b/av1/encoder/arm/neon/reconinter_enc_neon.c
@@ -222,8 +222,7 @@
int i = height / 2;
do {
uint16x4_t r = load_u16_2x2(ref, ref_stride);
- store_u16_2x1(comp_pred + 0 * width, r, 0);
- store_u16_2x1(comp_pred + 1 * width, r, 1);
+ store_u16x2_strided_x2(comp_pred, width, r);
ref += 2 * ref_stride;
comp_pred += 2 * width;
} while (--i != 0);