cfl_neon: use intrinsic functions where appropriate

Don't assume the NEON types are backed by a vector type; instead, use
intrinsics to initialize and store registers. This fixes compilation
errors on Windows Arm64; no change in assembly.

Bug: b/277255390
Change-Id: I1510038078f1e0a21dfbffbf717f5f1d73ea2058
diff --git a/av1/common/arm/cfl_neon.c b/av1/common/arm/cfl_neon.c
index 371be5f..8c15345 100644
--- a/av1/common/arm/cfl_neon.c
+++ b/av1/common/arm/cfl_neon.c
@@ -31,12 +31,12 @@
 
 // Store half of a vector.
 static INLINE void vsth_u16(uint16_t *ptr, uint16x4_t val) {
-  *((uint32_t *)ptr) = vreinterpret_u32_u16(val)[0];
+  vst1_lane_u32((uint32_t *)ptr, vreinterpret_u32_u16(val), 0);
 }
 
 // Store half of a vector.
 static INLINE void vsth_u8(uint8_t *ptr, uint8x8_t val) {
-  *((uint32_t *)ptr) = vreinterpret_u32_u8(val)[0];
+  vst1_lane_u32((uint32_t *)ptr, vreinterpret_u32_u8(val), 0);
 }
 
 static void cfl_luma_subsampling_420_lbd_neon(const uint8_t *input,
@@ -269,7 +269,7 @@
   // unsigned integer for the sum, we can do one addition operation inside 16
   // bits (8 lanes) before having to convert to 32 bits (4 lanes).
   const uint16_t *sum_buf = src;
-  uint32x4_t sum_32x4 = { 0, 0, 0, 0 };
+  uint32x4_t sum_32x4 = vdupq_n_u32(0);
   do {
     // For all widths, we load, add and combine the data so it fits in 4 lanes.
     if (width == 4) {