c_v{128,256}_load_unaligned: quiet -Warray-bounds

Remove the uint8_t * type punning of c_v128 and c_v256 in the unaligned
load/store helpers and access the u8[] member directly. This fixes
warnings like the following when building with
arm-linux-gnueabihf-g++ (Debian 11.2.0-16) 11.2.0:

In file included from aom_dsp/simd/v256_intrinsics_c.h:20,
                 from test/simd_cmp_impl.h:23,
                 from test/simd_cmp_neon.cc:16:
aom_dsp/simd/v128_intrinsics_c.h: In function 'void
simd_test_neon::TestSimd1Arg(uint32_t, uint32_t, uint32_t, const char*) [with
CRet = c_v64; CArg = c_v64]':
aom_dsp/simd/v128_intrinsics_c.h:69:33: warning: array subscript 8 is outside
array bounds of 'c_v64 [1]' [-Warray-bounds]
   69 |   for (c = 0; c < 16; c++) q[c] = pp[c];
      |                            ~~~~~^~~~~~~

Change-Id: I3f5e71d289497e8a05cffba80a9c55a2e35d43a3
diff --git a/aom_dsp/simd/v128_intrinsics_c.h b/aom_dsp/simd/v128_intrinsics_c.h
index 466a41e..f048020 100644
--- a/aom_dsp/simd/v128_intrinsics_c.h
+++ b/aom_dsp/simd/v128_intrinsics_c.h
@@ -64,9 +64,9 @@
 SIMD_INLINE c_v128 c_v128_load_unaligned(const void *p) {
   c_v128 t;
   uint8_t *pp = (uint8_t *)p;
-  uint8_t *q = (uint8_t *)&t;
   int c;
-  for (c = 0; c < 16; c++) q[c] = pp[c];
+  // Note memcpy is avoided due to some versions of gcc issuing -Warray-bounds.
+  for (c = 0; c < 16; c++) t.u8[c] = pp[c];
   return t;
 }
 
@@ -80,9 +80,8 @@
 
 SIMD_INLINE void c_v128_store_unaligned(void *p, c_v128 a) {
   uint8_t *pp = (uint8_t *)p;
-  uint8_t *q = (uint8_t *)&a;
   int c;
-  for (c = 0; c < 16; c++) pp[c] = q[c];
+  for (c = 0; c < 16; c++) pp[c] = a.u8[c];
 }
 
 SIMD_INLINE void c_v128_store_aligned(void *p, c_v128 a) {
diff --git a/aom_dsp/simd/v256_intrinsics_c.h b/aom_dsp/simd/v256_intrinsics_c.h
index 8127ee3..dcfe33d 100644
--- a/aom_dsp/simd/v256_intrinsics_c.h
+++ b/aom_dsp/simd/v256_intrinsics_c.h
@@ -71,9 +71,9 @@
 SIMD_INLINE c_v256 c_v256_load_unaligned(const void *p) {
   c_v256 t;
   uint8_t *pp = (uint8_t *)p;
-  uint8_t *q = (uint8_t *)&t;
   int c;
-  for (c = 0; c < 32; c++) q[c] = pp[c];
+  // Note memcpy is avoided due to some versions of gcc issuing -Warray-bounds.
+  for (c = 0; c < 32; c++) t.u8[c] = pp[c];
   return t;
 }
 
@@ -87,9 +87,8 @@
 
 SIMD_INLINE void c_v256_store_unaligned(void *p, c_v256 a) {
   uint8_t *pp = (uint8_t *)p;
-  uint8_t *q = (uint8_t *)&a;
   int c;
-  for (c = 0; c < 32; c++) pp[c] = q[c];
+  for (c = 0; c < 32; c++) pp[c] = a.u8[c];
 }
 
 SIMD_INLINE void c_v256_store_aligned(void *p, c_v256 a) {