simd,arm: use intrinsic functions where appropriate
Don't assume the NEON types are backed by a vector type; use
intrinsics to initialize and store registers instead. This fixes
compilation errors on Windows Arm64, with no change in the generated
assembly.
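
For illustration (a minimal sketch, not taken from this patch, with
hypothetical helper names): MSVC's arm_neon.h defines the NEON types
as structs rather than compiler vector types, so C-style casts to and
from them do not compile, while the intrinsic initializers and lane
accessors are portable:

  #include <arm_neon.h>
  #include <stdint.h>

  /* Hypothetical helpers, for illustration only. */
  static inline int64x1_t make_s64(uint64_t x) {
    /* (int64x1_t)x works with GCC/Clang vector extensions but not
     * with the struct-based NEON types on Windows Arm64. */
    return vcreate_s64(x); /* intrinsic initializer */
  }

  static inline uint64_t get_u64(int64x1_t x) {
    /* (uint64_t)x fails the same way in the other direction. */
    return (uint64_t)vget_lane_s64(x, 0); /* intrinsic lane extract */
  }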
Bug: b/277255390
Change-Id: I7a23b4e6419a0f40c94eb938ee42c57306c90b6f
diff --git a/aom_dsp/simd/v128_intrinsics_arm.h b/aom_dsp/simd/v128_intrinsics_arm.h
index 2d497f4..5ee35bf 100644
--- a/aom_dsp/simd/v128_intrinsics_arm.h
+++ b/aom_dsp/simd/v128_intrinsics_arm.h
@@ -29,7 +29,7 @@
SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) { return vcombine_s64(b, a); }
SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) {
- return vcombine_s64((int64x1_t)b, (int64x1_t)a);
+ return vcombine_s64(vcreate_s64(b), vcreate_s64(a));
}
SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
@@ -101,7 +101,7 @@
return vaddlvq_s16(t1) + vaddlvq_s16(t2);
#else
int64x2_t t = vpaddlq_s32(vaddq_s32(vpaddlq_s16(t1), vpaddlq_s16(t2)));
- return (int64_t)vget_high_s64(t) + (int64_t)vget_low_s64(t);
+ return vget_lane_s64(vadd_s64(vget_high_s64(t), vget_low_s64(t)), 0);
#endif
}
@@ -113,7 +113,7 @@
SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) {
int64x2_t t = vpaddlq_s32(
vmulq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b)));
- return (int64_t)vget_high_s64(t) + (int64_t)vget_low_s64(t);
+ return vget_lane_s64(vadd_s64(vget_high_s64(t), vget_low_s64(t)), 0);
}
SIMD_INLINE uint64_t v128_hadd_u8(v128 x) {
@@ -159,7 +159,8 @@
return vaddlvq_u16(s.hi) + vaddlvq_u16(s.lo);
#else
uint64x2_t t = vpaddlq_u32(vpaddlq_u16(vaddq_u16(s.hi, s.lo)));
- return (uint32_t)(uint64_t)(vget_high_u64(t) + vget_low_u64(t));
+ return (uint32_t)vget_lane_u64(vadd_u64(vget_high_u64(t), vget_low_u64(t)),
+ 0);
#endif
}
@@ -377,8 +378,8 @@
uint64x2_t m = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(
vandq_u8(vreinterpretq_u8_s64(a),
vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL))))));
- return v64_low_u32(
- v64_ziplo_8(v128_high_v64((v128)m), v128_low_v64((v128)m)));
+ int64x2_t s = vreinterpretq_s64_u64(m);
+ return v64_low_u32(v64_ziplo_8(vget_high_s64(s), vget_low_s64(s)));
#endif
}
@@ -488,12 +489,11 @@
}
SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) {
- return v128_from_v64(vget_low_s64((int64x2_t)a), vget_low_s64((int64x2_t)b));
+ return v128_from_v64(vget_low_s64(a), vget_low_s64(b));
}
SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) {
- return v128_from_v64(vget_high_s64((int64x2_t)a),
- vget_high_s64((int64x2_t)b));
+ return v128_from_v64(vget_high_s64(a), vget_high_s64(b));
}
SIMD_INLINE v128 v128_unziplo_8(v128 x, v128 y) {
@@ -643,10 +643,12 @@
#else
uint8x8x2_t p = { { vget_low_u8(vreinterpretq_u8_s64(x)),
vget_high_u8(vreinterpretq_u8_s64(x)) } };
- return v128_from_64((uint64_t)vreinterpret_s64_u8(vtbl2_u8(
- p, vreinterpret_u8_s64(vget_high_s64(pattern)))),
- (uint64_t)vreinterpret_s64_u8(vtbl2_u8(
- p, vreinterpret_u8_s64(vget_low_s64(pattern)))));
+ uint8x8_t shuffle_hi =
+ vtbl2_u8(p, vreinterpret_u8_s64(vget_high_s64(pattern)));
+ uint8x8_t shuffle_lo =
+ vtbl2_u8(p, vreinterpret_u8_s64(vget_low_s64(pattern)));
+ return v128_from_64(vget_lane_u64(vreinterpret_u64_u8(shuffle_hi), 0),
+ vget_lane_u64(vreinterpret_u64_u8(shuffle_lo), 0));
#endif
}
@@ -949,8 +951,8 @@
SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) {
uint64x2_t t = vpaddlq_u32(s);
- return (uint32_t)(uint64_t)vget_high_u64(t) +
- (uint32_t)(uint64_t)vget_low_u64(t);
+ return (uint32_t)vget_lane_u64(vadd_u64(vget_high_u64(t), vget_low_u64(t)),
+ 0);
}
typedef v128 ssd128_internal_s16;
diff --git a/aom_dsp/simd/v256_intrinsics_v128.h b/aom_dsp/simd/v256_intrinsics_v128.h
index 0d22667..cf44965 100644
--- a/aom_dsp/simd/v256_intrinsics_v128.h
+++ b/aom_dsp/simd/v256_intrinsics_v128.h
@@ -626,15 +626,18 @@
vget_high_u8(vreinterpretq_u8_s64(x.val[0])),
vget_low_u8(vreinterpretq_u8_s64(x.val[1])),
vget_high_u8(vreinterpretq_u8_s64(x.val[1])) } };
- return v256_from_64(
- (uint64_t)vreinterpret_s64_u8(
- vtbl4_u8(p, vreinterpret_u8_s64(vget_high_s64(pattern.val[1])))),
- (uint64_t)vreinterpret_s64_u8(
- vtbl4_u8(p, vreinterpret_u8_s64(vget_low_s64(pattern.val[1])))),
- (uint64_t)vreinterpret_s64_u8(
- vtbl4_u8(p, vreinterpret_u8_s64(vget_high_s64(pattern.val[0])))),
- (uint64_t)vreinterpret_s64_u8(
- vtbl4_u8(p, vreinterpret_u8_s64(vget_low_s64(pattern.val[0])))));
+ uint8x8_t shuffle1_hi =
+ vtbl4_u8(p, vreinterpret_u8_s64(vget_high_s64(pattern.val[1])));
+ uint8x8_t shuffle1_lo =
+ vtbl4_u8(p, vreinterpret_u8_s64(vget_low_s64(pattern.val[1])));
+ uint8x8_t shuffle0_hi =
+ vtbl4_u8(p, vreinterpret_u8_s64(vget_high_s64(pattern.val[0])));
+ uint8x8_t shuffle0_lo =
+ vtbl4_u8(p, vreinterpret_u8_s64(vget_low_s64(pattern.val[0])));
+ return v256_from_64(vget_lane_u64(vreinterpret_u64_u8(shuffle1_hi), 0),
+ vget_lane_u64(vreinterpret_u64_u8(shuffle1_lo), 0),
+ vget_lane_u64(vreinterpret_u64_u8(shuffle0_hi), 0),
+ vget_lane_u64(vreinterpret_u64_u8(shuffle0_lo), 0));
#endif
#else
v128 c16 = v128_dup_8(16);
@@ -672,24 +675,26 @@
vget_high_u8(vreinterpretq_u8_s64(y.val[0])),
vget_low_u8(vreinterpretq_u8_s64(y.val[1])),
vget_high_u8(vreinterpretq_u8_s64(y.val[1])) } };
- v256 r1 =
- v256_from_64((uint64_t)vreinterpret_s64_u8(vtbl4_u8(
- p, vreinterpret_u8_s64(vget_high_s64(p32.val[1])))),
- (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
- p, vreinterpret_u8_s64(vget_low_s64(p32.val[1])))),
- (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
- p, vreinterpret_u8_s64(vget_high_s64(p32.val[0])))),
- (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
- p, vreinterpret_u8_s64(vget_low_s64(p32.val[0])))));
- v256 r2 =
- v256_from_64((uint64_t)vreinterpret_s64_u8(vtbl4_u8(
- q, vreinterpret_u8_s64(vget_high_s64(pattern.val[1])))),
- (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
- q, vreinterpret_u8_s64(vget_low_s64(pattern.val[1])))),
- (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
- q, vreinterpret_u8_s64(vget_high_s64(pattern.val[0])))),
- (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
- q, vreinterpret_u8_s64(vget_low_s64(pattern.val[0])))));
+ uint8x8_t shuffle1_hi =
+ vtbl4_u8(p, vreinterpret_u8_s64(vget_high_s64(p32.val[1])));
+ uint8x8_t shuffle1_lo =
+ vtbl4_u8(p, vreinterpret_u8_s64(vget_low_s64(p32.val[1])));
+ uint8x8_t shuffle0_hi =
+ vtbl4_u8(p, vreinterpret_u8_s64(vget_high_s64(p32.val[0])));
+ uint8x8_t shuffle0_lo =
+ vtbl4_u8(p, vreinterpret_u8_s64(vget_low_s64(p32.val[0])));
+ v256 r1 = v256_from_64(vget_lane_u64(vreinterpret_u64_u8(shuffle1_hi), 0),
+ vget_lane_u64(vreinterpret_u64_u8(shuffle1_lo), 0),
+ vget_lane_u64(vreinterpret_u64_u8(shuffle0_hi), 0),
+ vget_lane_u64(vreinterpret_u64_u8(shuffle0_lo), 0));
+ shuffle1_hi = vtbl4_u8(q, vreinterpret_u8_s64(vget_high_s64(pattern.val[1])));
+ shuffle1_lo = vtbl4_u8(q, vreinterpret_u8_s64(vget_low_s64(pattern.val[1])));
+ shuffle0_hi = vtbl4_u8(q, vreinterpret_u8_s64(vget_high_s64(pattern.val[0])));
+ shuffle0_lo = vtbl4_u8(q, vreinterpret_u8_s64(vget_low_s64(pattern.val[0])));
+ v256 r2 = v256_from_64(vget_lane_u64(vreinterpret_u64_u8(shuffle1_hi), 0),
+ vget_lane_u64(vreinterpret_u64_u8(shuffle1_lo), 0),
+ vget_lane_u64(vreinterpret_u64_u8(shuffle0_hi), 0),
+ vget_lane_u64(vreinterpret_u64_u8(shuffle0_lo), 0));
return v256_blend_8(r1, r2, v256_cmplt_s8(pattern, c32));
#endif
#else
diff --git a/aom_dsp/simd/v64_intrinsics_arm.h b/aom_dsp/simd/v64_intrinsics_arm.h
index a4ecdf4..1576b8f 100644
--- a/aom_dsp/simd/v64_intrinsics_arm.h
+++ b/aom_dsp/simd/v64_intrinsics_arm.h
@@ -13,6 +13,7 @@
#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_ARM_H_
#include <arm_neon.h>
+#include <string.h>
#include "aom_dsp/simd/v64_intrinsics_arm.h"
#include "aom_ports/arm.h"
@@ -50,7 +51,7 @@
SIMD_INLINE v64 v64_from_64(uint64_t x) { return vcreate_s64(x); }
-SIMD_INLINE uint64_t v64_u64(v64 x) { return (uint64_t)x; }
+SIMD_INLINE uint64_t v64_u64(v64 x) { return (uint64_t)vget_lane_s64(x, 0); }
SIMD_INLINE uint32_t u32_load_aligned(const void *p) {
return *((uint32_t *)p);
@@ -77,8 +78,7 @@
} __attribute__((__packed__));
((struct Unaligned32Struct *)p)->value = a;
#else
- vst1_lane_u32((uint32_t *)p, vreinterpret_u32_s64((uint64x1_t)(uint64_t)a),
- 0);
+ memcpy(p, &a, 4);
#endif
}
@@ -106,7 +106,8 @@
vext_s8(vreinterpret_s8_s64(b), vreinterpret_s8_s64(a), c))
: b;
#else
- return c ? v64_from_64(((uint64_t)b >> c * 8) | ((uint64_t)a << (8 - c) * 8))
+ return c ? v64_from_64(((uint64_t)vget_lane_s64(b, 0) >> c * 8) |
+ ((uint64_t)vget_lane_s64(a, 0) << (8 - c) * 8))
: b;
#endif
}
@@ -133,7 +134,7 @@
return vaddlvq_s16(t);
#else
int64x2_t r = vpaddlq_s32(vpaddlq_s16(t));
- return (int64_t)vadd_s64(vget_high_s64(r), vget_low_s64(r));
+ return vget_lane_s64(vadd_s64(vget_high_s64(r), vget_low_s64(r)), 0);
#endif
}
@@ -144,7 +145,7 @@
#else
int64x2_t r =
vpaddlq_s32(vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
- return (int64_t)(vget_high_s64(r) + vget_low_s64(r));
+ return vget_lane_s64(vadd_s64(vget_high_s64(r), vget_low_s64(r)), 0);
#endif
}
@@ -152,12 +153,13 @@
#if defined(__aarch64__)
return vaddlv_u8(vreinterpret_u8_s64(x));
#else
- return (uint64_t)vpaddl_u32(vpaddl_u16(vpaddl_u8(vreinterpret_u8_s64(x))));
+ return vget_lane_u64(
+ vpaddl_u32(vpaddl_u16(vpaddl_u8(vreinterpret_u8_s64(x)))), 0);
#endif
}
SIMD_INLINE int64_t v64_hadd_s16(v64 a) {
- return (int64_t)vpaddl_s32(vpaddl_s16(vreinterpret_s16_s64(a)));
+ return vget_lane_s64(vpaddl_s32(vpaddl_s16(vreinterpret_s16_s64(a))), 0);
}
typedef uint16x8_t sad64_internal;
@@ -175,7 +177,8 @@
return vaddlvq_u16(s);
#else
uint64x2_t r = vpaddlq_u32(vpaddlq_u16(s));
- return (uint32_t)(uint64_t)(vget_high_u64(r) + vget_low_u64(r));
+ return (uint32_t)vget_lane_u64(vadd_u64(vget_high_u64(r), vget_low_u64(r)),
+ 0);
#endif
}