Refactor unaligned load parameter types in mem_neon.h

Refactor the mem_neon.h unaligned load helpers so that their parameter
types are the types the calling code actually wants. This moves the
casts out of the calling code and into mem_neon.h, which is desirable
since the wider types are an implementation detail of the unaligned
load helpers.
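
For reference, a minimal standalone sketch of the new calling
convention (assuming an ARM/AArch64 toolchain where <arm_neon.h> is
available; the sketch_* names are illustrative only and are not part
of this patch):

  #include <arm_neon.h>
  #include <stdint.h>
  #include <string.h>

  /* Mirrors the new-style load_unaligned_u8_4x2(): the helper itself
   * returns the uint8x8_t that callers want to work with. */
  static inline uint8x8_t sketch_load_unaligned_u8_4x2(const uint8_t *buf,
                                                       int stride) {
    uint32_t a;
    uint32x2_t a_u32;

    /* Gather two unaligned 4-byte rows into the two 32-bit lanes. */
    memcpy(&a, buf, 4);
    buf += stride;
    a_u32 = vdup_n_u32(a);
    memcpy(&a, buf, 4);
    a_u32 = vset_lane_u32(a, a_u32, 1);
    /* The u32 -> u8 reinterpret now lives here, not in the caller. */
    return vreinterpret_u8_u32(a_u32);
  }

  /* Caller side: widen the eight loaded bytes to signed 16-bit without
   * any vreinterpret_u8_u32() cast at the call site. */
  static inline int16x8_t sketch_widen_rows(const uint8_t *src, int stride) {
    const uint8x8_t rows = sketch_load_unaligned_u8_4x2(src, stride);
    return vreinterpretq_s16_u16(vmovl_u8(rows));
  }

Since the reinterpret happens inside the helper, call sites can pass
the result straight into vmovl_u8() and friends, as the diff below
does throughout.
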
Change-Id: I9bf30e0ea23ca6f7cd1e5a14e35e0ed5ff99ad3c
diff --git a/aom_dsp/arm/blend_a64_mask_neon.c b/aom_dsp/arm/blend_a64_mask_neon.c
index f11d57e..c3ee0b7 100644
--- a/aom_dsp/arm/blend_a64_mask_neon.c
+++ b/aom_dsp/arm/blend_a64_mask_neon.c
@@ -86,19 +86,21 @@
const int16x8_t vec_round_bits) {
int16x8_t src0_0, src0_1;
int16x8_t src1_0, src1_1;
- uint64x2_t tu0 = vdupq_n_u64(0), tu1 = vdupq_n_u64(0), tu2 = vdupq_n_u64(0),
- tu3 = vdupq_n_u64(0);
+ uint16x8_t tu0 = vdupq_n_u16(0);
+ uint16x8_t tu1 = vdupq_n_u16(0);
+ uint16x8_t tu2 = vdupq_n_u16(0);
+ uint16x8_t tu3 = vdupq_n_u16(0);
int16x8_t mask0_1, mask2_3;
int16x8_t res0, res1;
load_unaligned_u16_4x4(src0, src0_stride, &tu0, &tu1);
load_unaligned_u16_4x4(src1, src1_stride, &tu2, &tu3);
- src0_0 = vreinterpretq_s16_u64(tu0);
- src0_1 = vreinterpretq_s16_u64(tu1);
+ src0_0 = vreinterpretq_s16_u16(tu0);
+ src0_1 = vreinterpretq_s16_u16(tu1);
- src1_0 = vreinterpretq_s16_u64(tu2);
- src1_1 = vreinterpretq_s16_u64(tu3);
+ src1_0 = vreinterpretq_s16_u16(tu2);
+ src1_1 = vreinterpretq_s16_u16(tu3);
mask0_1 = vcombine_s16(mask0, mask1);
mask2_3 = vcombine_s16(mask2, mask3);
@@ -150,9 +152,10 @@
assert(IS_POWER_OF_TWO(h));
assert(IS_POWER_OF_TWO(w));
- uint8x8_t s0, s1, s2, s3;
- uint32x2_t tu0 = vdup_n_u32(0), tu1 = vdup_n_u32(0), tu2 = vdup_n_u32(0),
- tu3 = vdup_n_u32(0);
+ uint8x8_t s0 = vdup_n_u8(0);
+ uint8x8_t s1 = vdup_n_u8(0);
+ uint8x8_t s2 = vdup_n_u8(0);
+ uint8x8_t s3 = vdup_n_u8(0);
uint8x16_t t0, t1, t2, t3, t4, t5, t6, t7;
int16x8_t mask0, mask1, mask2, mask3;
int16x8_t mask4, mask5, mask6, mask7;
@@ -197,10 +200,10 @@
} while (i < h);
} else {
do {
- load_unaligned_u8_4x4(mask_tmp, mask_stride, &tu0, &tu1);
+ load_unaligned_u8_4x4(mask_tmp, mask_stride, &s0, &s1);
- mask0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0)));
- mask1 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu1)));
+ mask0 = vreinterpretq_s16_u16(vmovl_u8(s0));
+ mask1 = vreinterpretq_s16_u16(vmovl_u8(s1));
mask0_low = vget_low_s16(mask0);
mask1_low = vget_high_s16(mask0);
@@ -412,14 +415,9 @@
} while (i < h);
} else {
do {
- load_unaligned_u8_4x4(mask_tmp, 2 * mask_stride, &tu0, &tu1);
- load_unaligned_u8_4x4(mask_tmp + mask_stride, 2 * mask_stride, &tu2,
- &tu3);
-
- s0 = vreinterpret_u8_u32(tu0);
- s1 = vreinterpret_u8_u32(tu1);
- s2 = vreinterpret_u8_u32(tu2);
- s3 = vreinterpret_u8_u32(tu3);
+ load_unaligned_u8_4x4(mask_tmp, 2 * mask_stride, &s0, &s1);
+ load_unaligned_u8_4x4(mask_tmp + mask_stride, 2 * mask_stride, &s2,
+ &s3);
mask0 = vreinterpretq_s16_u16(vaddl_u8(s0, s2));
mask1 = vreinterpretq_s16_u16(vaddl_u8(s1, s3));
diff --git a/aom_dsp/arm/loopfilter_neon.c b/aom_dsp/arm/loopfilter_neon.c
index 7caa643..8fc7ccb 100644
--- a/aom_dsp/arm/loopfilter_neon.c
+++ b/aom_dsp/arm/loopfilter_neon.c
@@ -841,8 +841,7 @@
// row1: p1 p0 | q0 q1
// row2: p1 p0 | q0 q1
// row3: p1 p0 | q0 q1
- load_unaligned_u8_4x4(src - 2, stride, (uint32x2_t *)&p1p0,
- (uint32x2_t *)&q0q1);
+ load_unaligned_u8_4x4(src - 2, stride, &p1p0, &q0q1);
transpose_u8_4x4(&p1p0, &q0q1);
diff --git a/aom_dsp/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h
index 2386d54..994a636 100644
--- a/aom_dsp/arm/mem_neon.h
+++ b/aom_dsp/arm/mem_neon.h
@@ -613,71 +613,53 @@
return vreinterpretq_u8_u32(a_u32);
}
-static INLINE void load_unaligned_u8_4x8(const uint8_t *buf, int stride,
- uint32x2_t *tu0, uint32x2_t *tu1,
- uint32x2_t *tu2, uint32x2_t *tu3) {
+static INLINE uint8x8_t load_unaligned_u8_2x2(const uint8_t *buf, int stride) {
+ uint16_t a;
+ uint16x4_t a_u16;
+
+ memcpy(&a, buf, 2);
+ buf += stride;
+ a_u16 = vdup_n_u16(a);
+ memcpy(&a, buf, 2);
+ a_u16 = vset_lane_u16(a, a_u16, 1);
+ return vreinterpret_u8_u16(a_u16);
+}
+
+static INLINE uint8x8_t load_unaligned_u8_4x1(const uint8_t *buf) {
uint32_t a;
+ uint32x2_t a_u32;
+
+ memcpy(&a, buf, 4);
+ a_u32 = vdup_n_u32(0);
+ a_u32 = vset_lane_u32(a, a_u32, 0);
+ return vreinterpret_u8_u32(a_u32);
+}
+
+static INLINE uint8x8_t load_unaligned_u8_4x2(const uint8_t *buf, int stride) {
+ uint32_t a;
+ uint32x2_t a_u32;
memcpy(&a, buf, 4);
buf += stride;
- *tu0 = vdup_n_u32(a);
+ a_u32 = vdup_n_u32(a);
memcpy(&a, buf, 4);
- buf += stride;
- *tu0 = vset_lane_u32(a, *tu0, 1);
- memcpy(&a, buf, 4);
- buf += stride;
- *tu1 = vdup_n_u32(a);
- memcpy(&a, buf, 4);
- buf += stride;
- *tu1 = vset_lane_u32(a, *tu1, 1);
- memcpy(&a, buf, 4);
- buf += stride;
- *tu2 = vdup_n_u32(a);
- memcpy(&a, buf, 4);
- buf += stride;
- *tu2 = vset_lane_u32(a, *tu2, 1);
- memcpy(&a, buf, 4);
- buf += stride;
- *tu3 = vdup_n_u32(a);
- memcpy(&a, buf, 4);
- *tu3 = vset_lane_u32(a, *tu3, 1);
+ a_u32 = vset_lane_u32(a, a_u32, 1);
+ return vreinterpret_u8_u32(a_u32);
}
static INLINE void load_unaligned_u8_4x4(const uint8_t *buf, int stride,
- uint32x2_t *tu0, uint32x2_t *tu1) {
- uint32_t a;
-
- memcpy(&a, buf, 4);
- buf += stride;
- *tu0 = vdup_n_u32(a);
- memcpy(&a, buf, 4);
- buf += stride;
- *tu0 = vset_lane_u32(a, *tu0, 1);
- memcpy(&a, buf, 4);
- buf += stride;
- *tu1 = vdup_n_u32(a);
- memcpy(&a, buf, 4);
- *tu1 = vset_lane_u32(a, *tu1, 1);
+ uint8x8_t *tu0, uint8x8_t *tu1) {
+ *tu0 = load_unaligned_u8_4x2(buf, stride);
+ buf += 2 * stride;
+ *tu1 = load_unaligned_u8_4x2(buf, stride);
}
-static INLINE void load_unaligned_u8_4x1(const uint8_t *buf, int stride,
- uint32x2_t *tu0) {
- uint32_t a;
-
- memcpy(&a, buf, 4);
- buf += stride;
- *tu0 = vset_lane_u32(a, *tu0, 0);
-}
-
-static INLINE void load_unaligned_u8_4x2(const uint8_t *buf, int stride,
- uint32x2_t *tu0) {
- uint32_t a;
-
- memcpy(&a, buf, 4);
- buf += stride;
- *tu0 = vdup_n_u32(a);
- memcpy(&a, buf, 4);
- *tu0 = vset_lane_u32(a, *tu0, 1);
+static INLINE void load_unaligned_u8_4x8(const uint8_t *buf, int stride,
+ uint8x8_t *tu0, uint8x8_t *tu1,
+ uint8x8_t *tu2, uint8x8_t *tu3) {
+ load_unaligned_u8_4x4(buf, stride, tu0, tu1);
+ buf += 4 * stride;
+ load_unaligned_u8_4x4(buf, stride, tu2, tu3);
}
/* These intrinsics require immediate values, so we must use #defines
@@ -696,17 +678,6 @@
memcpy(dst, &a, 2); \
} while (0)
-static INLINE void load_unaligned_u8_2x2(const uint8_t *buf, int stride,
- uint16x4_t *tu0) {
- uint16_t a;
-
- memcpy(&a, buf, 2);
- buf += stride;
- *tu0 = vdup_n_u16(a);
- memcpy(&a, buf, 2);
- *tu0 = vset_lane_u16(a, *tu0, 1);
-}
-
static INLINE void load_u8_16x8(const uint8_t *s, ptrdiff_t p,
uint8x16_t *const s0, uint8x16_t *const s1,
uint8x16_t *const s2, uint8x16_t *const s3,
@@ -742,20 +713,24 @@
}
static INLINE void load_unaligned_u16_4x4(const uint16_t *buf, uint32_t stride,
- uint64x2_t *tu0, uint64x2_t *tu1) {
+ uint16x8_t *tu0, uint16x8_t *tu1) {
uint64_t a;
+ uint64x2_t a_u64;
memcpy(&a, buf, 8);
buf += stride;
- *tu0 = vdupq_n_u64(a);
+ a_u64 = vdupq_n_u64(0);
+ a_u64 = vsetq_lane_u64(a, a_u64, 0);
memcpy(&a, buf, 8);
buf += stride;
- *tu0 = vsetq_lane_u64(a, *tu0, 1);
+ a_u64 = vsetq_lane_u64(a, a_u64, 1);
+ *tu0 = vreinterpretq_u16_u64(a_u64);
memcpy(&a, buf, 8);
buf += stride;
- *tu1 = vdupq_n_u64(a);
+ a_u64 = vdupq_n_u64(a);
memcpy(&a, buf, 8);
- *tu1 = vsetq_lane_u64(a, *tu1, 1);
+ a_u64 = vsetq_lane_u64(a, a_u64, 1);
+ *tu1 = vreinterpretq_u16_u64(a_u64);
}
static INLINE void load_s32_4x4(int32_t *s, int32_t p, int32x4_t *s1,
diff --git a/av1/common/arm/blend_a64_hmask_neon.c b/av1/common/arm/blend_a64_hmask_neon.c
index 89252ef..baad328 100644
--- a/av1/common/arm/blend_a64_hmask_neon.c
+++ b/av1/common/arm/blend_a64_hmask_neon.c
@@ -34,8 +34,6 @@
uint8x8_t tmp0, tmp1;
uint8x16_t res_q;
uint16x8_t res, res_low, res_high;
- uint32x2_t tmp0_32 = vdup_n_u32(0), tmp1_32 = vdup_n_u32(0);
- uint16x4_t tmp0_16 = vdup_n_u16(0), tmp1_16 = vdup_n_u16(0);
const uint8x8_t vdup_64 = vdup_n_u8((uint8_t)64);
if (w >= 16) {
@@ -91,10 +89,8 @@
__builtin_prefetch(src0 + 1 * src0_stride);
__builtin_prefetch(src1 + 0 * src1_stride);
__builtin_prefetch(src1 + 1 * src1_stride);
- load_unaligned_u8_4x2(src0, src0_stride, &tmp0_32);
- tmp0 = vreinterpret_u8_u32(tmp0_32);
- load_unaligned_u8_4x2(src1, src1_stride, &tmp1_32);
- tmp1 = vreinterpret_u8_u32(tmp1_32);
+ tmp0 = load_unaligned_u8_4x2(src0, src0_stride);
+ tmp1 = load_unaligned_u8_4x2(src1, src1_stride);
res = vmull_u8(m, tmp0);
res = vmlal_u8(res, max_minus_m, tmp1);
const uint8x8_t result = vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS);
@@ -113,10 +109,8 @@
__builtin_prefetch(src0 + 1 * src0_stride);
__builtin_prefetch(src1 + 0 * src1_stride);
__builtin_prefetch(src1 + 1 * src1_stride);
- load_unaligned_u8_2x2(src0, src0_stride, &tmp0_16);
- tmp0 = vreinterpret_u8_u16(tmp0_16);
- load_unaligned_u8_2x2(src1, src1_stride, &tmp1_16);
- tmp1 = vreinterpret_u8_u16(tmp1_16);
+ tmp0 = load_unaligned_u8_2x2(src0, src0_stride);
+ tmp1 = load_unaligned_u8_2x2(src1, src1_stride);
res = vmull_u8(m, tmp0);
res = vmlal_u8(res, max_minus_m, tmp1);
const uint8x8_t result = vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS);
diff --git a/av1/common/arm/blend_a64_vmask_neon.c b/av1/common/arm/blend_a64_vmask_neon.c
index 2132fbd..c316977 100644
--- a/av1/common/arm/blend_a64_vmask_neon.c
+++ b/av1/common/arm/blend_a64_vmask_neon.c
@@ -27,8 +27,6 @@
uint8x8_t tmp0, tmp1;
uint8x16_t tmp0_q, tmp1_q, res_q;
uint16x8_t res, res_low, res_high;
- uint32x2_t tmp0_32 = vdup_n_u32(0), tmp1_32 = vdup_n_u32(0);
- uint16x4_t tmp0_16 = vdup_n_u16(0), tmp1_16 = vdup_n_u16(0);
assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
@@ -89,10 +87,8 @@
const uint16x4_t max_minus_m2 = vdup_n_u16(64 - (uint16_t)mask[i + 1]);
const uint8x8_t max_minus_m =
vmovn_u16(vcombine_u16(max_minus_m1, max_minus_m2));
- load_unaligned_u8_4x2(src0, src0_stride, &tmp0_32);
- tmp0 = vreinterpret_u8_u32(tmp0_32);
- load_unaligned_u8_4x2(src1, src1_stride, &tmp1_32);
- tmp1 = vreinterpret_u8_u32(tmp1_32);
+ tmp0 = load_unaligned_u8_4x2(src0, src0_stride);
+ tmp1 = load_unaligned_u8_4x2(src1, src1_stride);
res = vmull_u8(m, tmp0);
res = vmlal_u8(res, max_minus_m, tmp1);
const uint8x8_t result = vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS);
@@ -118,10 +114,8 @@
const uint16x4x2_t max_minus_m_trn = vtrn_u16(
vreinterpret_u16_u8(max_minus_m1), vreinterpret_u16_u8(max_minus_m2));
const uint8x8_t max_minus_m = vreinterpret_u8_u16(max_minus_m_trn.val[0]);
- load_unaligned_u8_2x2(src0, src0_stride, &tmp0_16);
- tmp0 = vreinterpret_u8_u16(tmp0_16);
- load_unaligned_u8_2x2(src1, src1_stride, &tmp1_16);
- tmp1 = vreinterpret_u8_u16(tmp1_16);
+ tmp0 = load_unaligned_u8_2x2(src0, src0_stride);
+ tmp1 = load_unaligned_u8_2x2(src1, src1_stride);
res = vmull_u8(m, tmp0);
res = vmlal_u8(res, max_minus_m, tmp1);
const uint8x8_t result = vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS);
diff --git a/av1/common/arm/jnt_convolve_neon.c b/av1/common/arm/jnt_convolve_neon.c
index 6aa2d43..4293443 100644
--- a/av1/common/arm/jnt_convolve_neon.c
+++ b/av1/common/arm/jnt_convolve_neon.c
@@ -1621,7 +1621,6 @@
int16x4_t s8, s9, s10, d1, d2, d3;
int16x8_t tt1, tt2, tt3;
uint16x4_t res5, res6, res7;
- uint32x2_t tu0 = vdup_n_u32(0), tu1 = vdup_n_u32(0);
int16x8_t u0, u1;
#else
int16x4_t temp_0;
@@ -1660,9 +1659,7 @@
__builtin_prefetch(d + 3 * dst_stride);
s += 7;
do {
- load_unaligned_u8_4x4(s, src_stride, &tu0, &tu1);
- t0 = vreinterpret_u8_u32(tu0);
- t1 = vreinterpret_u8_u32(tu1);
+ load_unaligned_u8_4x4(s, src_stride, &t0, &t1);
transpose_u8_4x4(&t0, &t1);
u0 = vreinterpretq_s16_u16(vmovl_u8(t0));
@@ -2066,8 +2063,10 @@
if ((w == 4) || (h == 4)) {
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
uint16x4_t res4;
- uint32x2_t tu0 = vdup_n_u32(0), tu1 = vdup_n_u32(0), tu2 = vdup_n_u32(0),
- tu3 = vdup_n_u32(0);
+ uint8x8_t tu0 = vdup_n_u8(0);
+ uint8x8_t tu1 = vdup_n_u8(0);
+ uint8x8_t tu2 = vdup_n_u8(0);
+ uint8x8_t tu3 = vdup_n_u8(0);
int16x8_t u0, u1, u2, u3;
uint8x8_t t0;
@@ -2092,10 +2091,10 @@
load_unaligned_u8_4x8(s, src_stride, &tu0, &tu1, &tu2, &tu3);
- u0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0)));
- u1 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu1)));
- u2 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu2)));
- u3 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu3)));
+ u0 = vreinterpretq_s16_u16(vmovl_u8(tu0));
+ u1 = vreinterpretq_s16_u16(vmovl_u8(tu1));
+ u2 = vreinterpretq_s16_u16(vmovl_u8(tu2));
+ u3 = vreinterpretq_s16_u16(vmovl_u8(tu3));
s0 = vget_low_s16(u0);
s1 = vget_high_s16(u0);
@@ -2115,8 +2114,8 @@
#if defined(__aarch64__)
load_unaligned_u8_4x4(s, src_stride, &tu0, &tu1);
- u0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0)));
- u1 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu1)));
+ u0 = vreinterpretq_s16_u16(vmovl_u8(tu0));
+ u1 = vreinterpretq_s16_u16(vmovl_u8(tu1));
s7 = vget_low_s16(u0);
s8 = vget_high_s16(u0);
@@ -2177,8 +2176,8 @@
d_u8 += 4 * dst8_stride;
height -= 4;
#else
- load_unaligned_u8_4x1(s, src_stride, &tu0);
- u0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0)));
+ tu0 = load_unaligned_u8_4x1(s);
+ u0 = vreinterpretq_s16_u16(vmovl_u8(tu0));
s7 = vget_low_s16(u0);
d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, zero,