simd,arm: use intrinsic functions where appropriate

Don't assume the NEON types are backed by a compiler vector type that
supports plain casts; use intrinsics to initialize registers and to
extract or store lanes instead. This fixes compilation errors on
Windows Arm64; the generated assembly is unchanged.
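
For example, the horizontal adds below now extract the scalar result
with a lane intrinsic rather than casting the NEON type. A minimal
sketch of the pattern (the hadd_s64 helper name is illustrative and
not part of this change):

  #include <arm_neon.h>

  /* Sum the two 64-bit lanes of t. Builds with GCC, Clang and also
     MSVC on Arm64, which rejects casting NEON types to scalars. */
  static inline int64_t hadd_s64(int64x2_t t) {
    return vget_lane_s64(vadd_s64(vget_high_s64(t), vget_low_s64(t)), 0);
  }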

Bug: b/277255390
Change-Id: I7a23b4e6419a0f40c94eb938ee42c57306c90b6f
diff --git a/aom_dsp/simd/v128_intrinsics_arm.h b/aom_dsp/simd/v128_intrinsics_arm.h
index 2d497f4..5ee35bf 100644
--- a/aom_dsp/simd/v128_intrinsics_arm.h
+++ b/aom_dsp/simd/v128_intrinsics_arm.h
@@ -29,7 +29,7 @@
 SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) { return vcombine_s64(b, a); }
 
 SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) {
-  return vcombine_s64((int64x1_t)b, (int64x1_t)a);
+  return vcombine_s64(vcreate_s64(b), vcreate_s64(a));
 }
 
 SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
@@ -101,7 +101,7 @@
   return vaddlvq_s16(t1) + vaddlvq_s16(t2);
 #else
   int64x2_t t = vpaddlq_s32(vaddq_s32(vpaddlq_s16(t1), vpaddlq_s16(t2)));
-  return (int64_t)vget_high_s64(t) + (int64_t)vget_low_s64(t);
+  return vget_lane_s64(vadd_s64(vget_high_s64(t), vget_low_s64(t)), 0);
 #endif
 }
 
@@ -113,7 +113,7 @@
 SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) {
   int64x2_t t = vpaddlq_s32(
       vmulq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b)));
-  return (int64_t)vget_high_s64(t) + (int64_t)vget_low_s64(t);
+  return vget_lane_s64(vadd_s64(vget_high_s64(t), vget_low_s64(t)), 0);
 }
 
 SIMD_INLINE uint64_t v128_hadd_u8(v128 x) {
@@ -159,7 +159,8 @@
   return vaddlvq_u16(s.hi) + vaddlvq_u16(s.lo);
 #else
   uint64x2_t t = vpaddlq_u32(vpaddlq_u16(vaddq_u16(s.hi, s.lo)));
-  return (uint32_t)(uint64_t)(vget_high_u64(t) + vget_low_u64(t));
+  return (uint32_t)vget_lane_u64(vadd_u64(vget_high_u64(t), vget_low_u64(t)),
+                                 0);
 #endif
 }
 
@@ -377,8 +378,8 @@
   uint64x2_t m = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(
       vandq_u8(vreinterpretq_u8_s64(a),
                vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL))))));
-  return v64_low_u32(
-      v64_ziplo_8(v128_high_v64((v128)m), v128_low_v64((v128)m)));
+  int64x2_t s = vreinterpretq_s64_u64(m);
+  return v64_low_u32(v64_ziplo_8(vget_high_s64(s), vget_low_s64(s)));
 #endif
 }
 
@@ -488,12 +489,11 @@
 }
 
 SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) {
-  return v128_from_v64(vget_low_s64((int64x2_t)a), vget_low_s64((int64x2_t)b));
+  return v128_from_v64(vget_low_s64(a), vget_low_s64(b));
 }
 
 SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) {
-  return v128_from_v64(vget_high_s64((int64x2_t)a),
-                       vget_high_s64((int64x2_t)b));
+  return v128_from_v64(vget_high_s64(a), vget_high_s64(b));
 }
 
 SIMD_INLINE v128 v128_unziplo_8(v128 x, v128 y) {
@@ -643,10 +643,12 @@
 #else
   uint8x8x2_t p = { { vget_low_u8(vreinterpretq_u8_s64(x)),
                       vget_high_u8(vreinterpretq_u8_s64(x)) } };
-  return v128_from_64((uint64_t)vreinterpret_s64_u8(vtbl2_u8(
-                          p, vreinterpret_u8_s64(vget_high_s64(pattern)))),
-                      (uint64_t)vreinterpret_s64_u8(vtbl2_u8(
-                          p, vreinterpret_u8_s64(vget_low_s64(pattern)))));
+  uint8x8_t shuffle_hi =
+      vtbl2_u8(p, vreinterpret_u8_s64(vget_high_s64(pattern)));
+  uint8x8_t shuffle_lo =
+      vtbl2_u8(p, vreinterpret_u8_s64(vget_low_s64(pattern)));
+  return v128_from_64(vget_lane_u64(vreinterpret_u64_u8(shuffle_hi), 0),
+                      vget_lane_u64(vreinterpret_u64_u8(shuffle_lo), 0));
 #endif
 }
 
@@ -949,8 +951,8 @@
 
 SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) {
   uint64x2_t t = vpaddlq_u32(s);
-  return (uint32_t)(uint64_t)vget_high_u64(t) +
-         (uint32_t)(uint64_t)vget_low_u64(t);
+  return (uint32_t)vget_lane_u64(vadd_u64(vget_high_u64(t), vget_low_u64(t)),
+                                 0);
 }
 
 typedef v128 ssd128_internal_s16;
diff --git a/aom_dsp/simd/v256_intrinsics_v128.h b/aom_dsp/simd/v256_intrinsics_v128.h
index 0d22667..cf44965 100644
--- a/aom_dsp/simd/v256_intrinsics_v128.h
+++ b/aom_dsp/simd/v256_intrinsics_v128.h
@@ -626,15 +626,18 @@
                       vget_high_u8(vreinterpretq_u8_s64(x.val[0])),
                       vget_low_u8(vreinterpretq_u8_s64(x.val[1])),
                       vget_high_u8(vreinterpretq_u8_s64(x.val[1])) } };
-  return v256_from_64(
-      (uint64_t)vreinterpret_s64_u8(
-          vtbl4_u8(p, vreinterpret_u8_s64(vget_high_s64(pattern.val[1])))),
-      (uint64_t)vreinterpret_s64_u8(
-          vtbl4_u8(p, vreinterpret_u8_s64(vget_low_s64(pattern.val[1])))),
-      (uint64_t)vreinterpret_s64_u8(
-          vtbl4_u8(p, vreinterpret_u8_s64(vget_high_s64(pattern.val[0])))),
-      (uint64_t)vreinterpret_s64_u8(
-          vtbl4_u8(p, vreinterpret_u8_s64(vget_low_s64(pattern.val[0])))));
+  uint8x8_t shuffle1_hi =
+      vtbl4_u8(p, vreinterpret_u8_s64(vget_high_s64(pattern.val[1])));
+  uint8x8_t shuffle1_lo =
+      vtbl4_u8(p, vreinterpret_u8_s64(vget_low_s64(pattern.val[1])));
+  uint8x8_t shuffle0_hi =
+      vtbl4_u8(p, vreinterpret_u8_s64(vget_high_s64(pattern.val[0])));
+  uint8x8_t shuffle0_lo =
+      vtbl4_u8(p, vreinterpret_u8_s64(vget_low_s64(pattern.val[0])));
+  return v256_from_64(vget_lane_u64(vreinterpret_u64_u8(shuffle1_hi), 0),
+                      vget_lane_u64(vreinterpret_u64_u8(shuffle1_lo), 0),
+                      vget_lane_u64(vreinterpret_u64_u8(shuffle0_hi), 0),
+                      vget_lane_u64(vreinterpret_u64_u8(shuffle0_lo), 0));
 #endif
 #else
   v128 c16 = v128_dup_8(16);
@@ -672,24 +675,26 @@
                       vget_high_u8(vreinterpretq_u8_s64(y.val[0])),
                       vget_low_u8(vreinterpretq_u8_s64(y.val[1])),
                       vget_high_u8(vreinterpretq_u8_s64(y.val[1])) } };
-  v256 r1 =
-      v256_from_64((uint64_t)vreinterpret_s64_u8(vtbl4_u8(
-                       p, vreinterpret_u8_s64(vget_high_s64(p32.val[1])))),
-                   (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
-                       p, vreinterpret_u8_s64(vget_low_s64(p32.val[1])))),
-                   (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
-                       p, vreinterpret_u8_s64(vget_high_s64(p32.val[0])))),
-                   (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
-                       p, vreinterpret_u8_s64(vget_low_s64(p32.val[0])))));
-  v256 r2 =
-      v256_from_64((uint64_t)vreinterpret_s64_u8(vtbl4_u8(
-                       q, vreinterpret_u8_s64(vget_high_s64(pattern.val[1])))),
-                   (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
-                       q, vreinterpret_u8_s64(vget_low_s64(pattern.val[1])))),
-                   (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
-                       q, vreinterpret_u8_s64(vget_high_s64(pattern.val[0])))),
-                   (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
-                       q, vreinterpret_u8_s64(vget_low_s64(pattern.val[0])))));
+  uint8x8_t shuffle1_hi =
+      vtbl4_u8(p, vreinterpret_u8_s64(vget_high_s64(p32.val[1])));
+  uint8x8_t shuffle1_lo =
+      vtbl4_u8(p, vreinterpret_u8_s64(vget_low_s64(p32.val[1])));
+  uint8x8_t shuffle0_hi =
+      vtbl4_u8(p, vreinterpret_u8_s64(vget_high_s64(p32.val[0])));
+  uint8x8_t shuffle0_lo =
+      vtbl4_u8(p, vreinterpret_u8_s64(vget_low_s64(p32.val[0])));
+  v256 r1 = v256_from_64(vget_lane_u64(vreinterpret_u64_u8(shuffle1_hi), 0),
+                         vget_lane_u64(vreinterpret_u64_u8(shuffle1_lo), 0),
+                         vget_lane_u64(vreinterpret_u64_u8(shuffle0_hi), 0),
+                         vget_lane_u64(vreinterpret_u64_u8(shuffle0_lo), 0));
+  shuffle1_hi = vtbl4_u8(q, vreinterpret_u8_s64(vget_high_s64(pattern.val[1])));
+  shuffle1_lo = vtbl4_u8(q, vreinterpret_u8_s64(vget_low_s64(pattern.val[1])));
+  shuffle0_hi = vtbl4_u8(q, vreinterpret_u8_s64(vget_high_s64(pattern.val[0])));
+  shuffle0_lo = vtbl4_u8(q, vreinterpret_u8_s64(vget_low_s64(pattern.val[0])));
+  v256 r2 = v256_from_64(vget_lane_u64(vreinterpret_u64_u8(shuffle1_hi), 0),
+                         vget_lane_u64(vreinterpret_u64_u8(shuffle1_lo), 0),
+                         vget_lane_u64(vreinterpret_u64_u8(shuffle0_hi), 0),
+                         vget_lane_u64(vreinterpret_u64_u8(shuffle0_lo), 0));
   return v256_blend_8(r1, r2, v256_cmplt_s8(pattern, c32));
 #endif
 #else
diff --git a/aom_dsp/simd/v64_intrinsics_arm.h b/aom_dsp/simd/v64_intrinsics_arm.h
index a4ecdf4..1576b8f 100644
--- a/aom_dsp/simd/v64_intrinsics_arm.h
+++ b/aom_dsp/simd/v64_intrinsics_arm.h
@@ -13,6 +13,7 @@
 #define AOM_AOM_DSP_SIMD_V64_INTRINSICS_ARM_H_
 
 #include <arm_neon.h>
+#include <string.h>
 
 #include "aom_dsp/simd/v64_intrinsics_arm.h"
 #include "aom_ports/arm.h"
@@ -50,7 +51,7 @@
 
 SIMD_INLINE v64 v64_from_64(uint64_t x) { return vcreate_s64(x); }
 
-SIMD_INLINE uint64_t v64_u64(v64 x) { return (uint64_t)x; }
+SIMD_INLINE uint64_t v64_u64(v64 x) { return (uint64_t)vget_lane_s64(x, 0); }
 
 SIMD_INLINE uint32_t u32_load_aligned(const void *p) {
   return *((uint32_t *)p);
@@ -77,8 +78,7 @@
   } __attribute__((__packed__));
   ((struct Unaligned32Struct *)p)->value = a;
 #else
-  vst1_lane_u32((uint32_t *)p, vreinterpret_u32_s64((uint64x1_t)(uint64_t)a),
-                0);
+  memcpy(p, &a, 4);
 #endif
 }
 
@@ -106,7 +106,8 @@
                  vext_s8(vreinterpret_s8_s64(b), vreinterpret_s8_s64(a), c))
            : b;
 #else
-  return c ? v64_from_64(((uint64_t)b >> c * 8) | ((uint64_t)a << (8 - c) * 8))
+  return c ? v64_from_64(((uint64_t)vget_lane_s64(b, 0) >> c * 8) |
+                         ((uint64_t)vget_lane_s64(a, 0) << (8 - c) * 8))
            : b;
 #endif
 }
@@ -133,7 +134,7 @@
   return vaddlvq_s16(t);
 #else
   int64x2_t r = vpaddlq_s32(vpaddlq_s16(t));
-  return (int64_t)vadd_s64(vget_high_s64(r), vget_low_s64(r));
+  return vget_lane_s64(vadd_s64(vget_high_s64(r), vget_low_s64(r)), 0);
 #endif
 }
 
@@ -144,7 +145,7 @@
 #else
   int64x2_t r =
       vpaddlq_s32(vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
-  return (int64_t)(vget_high_s64(r) + vget_low_s64(r));
+  return vget_lane_s64(vadd_s64(vget_high_s64(r), vget_low_s64(r)), 0);
 #endif
 }
 
@@ -152,12 +153,13 @@
 #if defined(__aarch64__)
   return vaddlv_u8(vreinterpret_u8_s64(x));
 #else
-  return (uint64_t)vpaddl_u32(vpaddl_u16(vpaddl_u8(vreinterpret_u8_s64(x))));
+  return vget_lane_u64(
+      vpaddl_u32(vpaddl_u16(vpaddl_u8(vreinterpret_u8_s64(x)))), 0);
 #endif
 }
 
 SIMD_INLINE int64_t v64_hadd_s16(v64 a) {
-  return (int64_t)vpaddl_s32(vpaddl_s16(vreinterpret_s16_s64(a)));
+  return vget_lane_s64(vpaddl_s32(vpaddl_s16(vreinterpret_s16_s64(a))), 0);
 }
 
 typedef uint16x8_t sad64_internal;
@@ -175,7 +177,8 @@
   return vaddlvq_u16(s);
 #else
   uint64x2_t r = vpaddlq_u32(vpaddlq_u16(s));
-  return (uint32_t)(uint64_t)(vget_high_u64(r) + vget_low_u64(r));
+  return (uint32_t)vget_lane_u64(vadd_u64(vget_high_u64(r), vget_low_u64(r)),
+                                 0);
 #endif
 }