cfl_neon: use intrinsic functions where appropriate

Don't assume the NEON types are backed by a vector type; instead, use
intrinsics to initialize and store registers. This fixes compilation
errors on Windows Arm64; no change in assembly.

Bug: b/277255390
Change-Id: I1510038078f1e0a21dfbffbf717f5f1d73ea2058
diff --git a/av1/common/arm/cfl_neon.c b/av1/common/arm/cfl_neon.c
index 371be5f..8c15345 100644
--- a/av1/common/arm/cfl_neon.c
+++ b/av1/common/arm/cfl_neon.c
@@ -31,12 +31,12 @@
 
 // Store half of a vector.
 static INLINE void vsth_u16(uint16_t *ptr, uint16x4_t val) {
-  *((uint32_t *)ptr) = vreinterpret_u32_u16(val)[0];
+  vst1_lane_u32((uint32_t *)ptr, vreinterpret_u32_u16(val), 0);
 }
 
 // Store half of a vector.
 static INLINE void vsth_u8(uint8_t *ptr, uint8x8_t val) {
-  *((uint32_t *)ptr) = vreinterpret_u32_u8(val)[0];
+  vst1_lane_u32((uint32_t *)ptr, vreinterpret_u32_u8(val), 0);
 }
 
 static void cfl_luma_subsampling_420_lbd_neon(const uint8_t *input,
@@ -269,7 +269,7 @@
   // unsigned integer for the sum, we can do one addition operation inside 16
   // bits (8 lanes) before having to convert to 32 bits (4 lanes).
   const uint16_t *sum_buf = src;
-  uint32x4_t sum_32x4 = { 0, 0, 0, 0 };
+  uint32x4_t sum_32x4 = vdupq_n_u32(0);
   do {
     // For all widths, we load, add and combine the data so it fits in 4 lanes.
     if (width == 4) {