Refactor unaligned load parameter types in mem_neon.h

Refactor the unaligned load helpers in mem_neon.h so that their
parameter and return types are the types the calling code actually
wants. This moves the reinterpret casts out of the callers and into
mem_neon.h, which is desirable since the wider lane types are an
implementation detail of the unaligned load helpers.

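For example, the 4x2 load in the blend_a64_{h,v}mask kernels (sketch
only, to illustrate the intent of the refactor) goes from:

  uint32x2_t tmp0_32 = vdup_n_u32(0);
  load_unaligned_u8_4x2(src0, src0_stride, &tmp0_32);
  uint8x8_t tmp0 = vreinterpret_u8_u32(tmp0_32);

to:

  uint8x8_t tmp0 = load_unaligned_u8_4x2(src0, src0_stride);
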
Change-Id: I9bf30e0ea23ca6f7cd1e5a14e35e0ed5ff99ad3c
diff --git a/aom_dsp/arm/blend_a64_mask_neon.c b/aom_dsp/arm/blend_a64_mask_neon.c
index f11d57e..c3ee0b7 100644
--- a/aom_dsp/arm/blend_a64_mask_neon.c
+++ b/aom_dsp/arm/blend_a64_mask_neon.c
@@ -86,19 +86,21 @@
                              const int16x8_t vec_round_bits) {
   int16x8_t src0_0, src0_1;
   int16x8_t src1_0, src1_1;
-  uint64x2_t tu0 = vdupq_n_u64(0), tu1 = vdupq_n_u64(0), tu2 = vdupq_n_u64(0),
-             tu3 = vdupq_n_u64(0);
+  uint16x8_t tu0 = vdupq_n_u16(0);
+  uint16x8_t tu1 = vdupq_n_u16(0);
+  uint16x8_t tu2 = vdupq_n_u16(0);
+  uint16x8_t tu3 = vdupq_n_u16(0);
   int16x8_t mask0_1, mask2_3;
   int16x8_t res0, res1;
 
   load_unaligned_u16_4x4(src0, src0_stride, &tu0, &tu1);
   load_unaligned_u16_4x4(src1, src1_stride, &tu2, &tu3);
 
-  src0_0 = vreinterpretq_s16_u64(tu0);
-  src0_1 = vreinterpretq_s16_u64(tu1);
+  src0_0 = vreinterpretq_s16_u16(tu0);
+  src0_1 = vreinterpretq_s16_u16(tu1);
 
-  src1_0 = vreinterpretq_s16_u64(tu2);
-  src1_1 = vreinterpretq_s16_u64(tu3);
+  src1_0 = vreinterpretq_s16_u16(tu2);
+  src1_1 = vreinterpretq_s16_u16(tu3);
 
   mask0_1 = vcombine_s16(mask0, mask1);
   mask2_3 = vcombine_s16(mask2, mask3);
@@ -150,9 +152,10 @@
   assert(IS_POWER_OF_TWO(h));
   assert(IS_POWER_OF_TWO(w));
 
-  uint8x8_t s0, s1, s2, s3;
-  uint32x2_t tu0 = vdup_n_u32(0), tu1 = vdup_n_u32(0), tu2 = vdup_n_u32(0),
-             tu3 = vdup_n_u32(0);
+  uint8x8_t s0 = vdup_n_u8(0);
+  uint8x8_t s1 = vdup_n_u8(0);
+  uint8x8_t s2 = vdup_n_u8(0);
+  uint8x8_t s3 = vdup_n_u8(0);
   uint8x16_t t0, t1, t2, t3, t4, t5, t6, t7;
   int16x8_t mask0, mask1, mask2, mask3;
   int16x8_t mask4, mask5, mask6, mask7;
@@ -197,10 +200,10 @@
       } while (i < h);
     } else {
       do {
-        load_unaligned_u8_4x4(mask_tmp, mask_stride, &tu0, &tu1);
+        load_unaligned_u8_4x4(mask_tmp, mask_stride, &s0, &s1);
 
-        mask0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0)));
-        mask1 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu1)));
+        mask0 = vreinterpretq_s16_u16(vmovl_u8(s0));
+        mask1 = vreinterpretq_s16_u16(vmovl_u8(s1));
 
         mask0_low = vget_low_s16(mask0);
         mask1_low = vget_high_s16(mask0);
@@ -412,14 +415,9 @@
       } while (i < h);
     } else {
       do {
-        load_unaligned_u8_4x4(mask_tmp, 2 * mask_stride, &tu0, &tu1);
-        load_unaligned_u8_4x4(mask_tmp + mask_stride, 2 * mask_stride, &tu2,
-                              &tu3);
-
-        s0 = vreinterpret_u8_u32(tu0);
-        s1 = vreinterpret_u8_u32(tu1);
-        s2 = vreinterpret_u8_u32(tu2);
-        s3 = vreinterpret_u8_u32(tu3);
+        load_unaligned_u8_4x4(mask_tmp, 2 * mask_stride, &s0, &s1);
+        load_unaligned_u8_4x4(mask_tmp + mask_stride, 2 * mask_stride, &s2,
+                              &s3);
 
         mask0 = vreinterpretq_s16_u16(vaddl_u8(s0, s2));
         mask1 = vreinterpretq_s16_u16(vaddl_u8(s1, s3));
diff --git a/aom_dsp/arm/loopfilter_neon.c b/aom_dsp/arm/loopfilter_neon.c
index 7caa643..8fc7ccb 100644
--- a/aom_dsp/arm/loopfilter_neon.c
+++ b/aom_dsp/arm/loopfilter_neon.c
@@ -841,8 +841,7 @@
   // row1: p1 p0 | q0 q1
   // row2: p1 p0 | q0 q1
   // row3: p1 p0 | q0 q1
-  load_unaligned_u8_4x4(src - 2, stride, (uint32x2_t *)&p1p0,
-                        (uint32x2_t *)&q0q1);
+  load_unaligned_u8_4x4(src - 2, stride, &p1p0, &q0q1);
 
   transpose_u8_4x4(&p1p0, &q0q1);
 
diff --git a/aom_dsp/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h
index 2386d54..994a636 100644
--- a/aom_dsp/arm/mem_neon.h
+++ b/aom_dsp/arm/mem_neon.h
@@ -613,71 +613,53 @@
   return vreinterpretq_u8_u32(a_u32);
 }
 
-static INLINE void load_unaligned_u8_4x8(const uint8_t *buf, int stride,
-                                         uint32x2_t *tu0, uint32x2_t *tu1,
-                                         uint32x2_t *tu2, uint32x2_t *tu3) {
+static INLINE uint8x8_t load_unaligned_u8_2x2(const uint8_t *buf, int stride) {
+  uint16_t a;
+  uint16x4_t a_u16;
+
+  memcpy(&a, buf, 2);
+  buf += stride;
+  a_u16 = vdup_n_u16(a);
+  memcpy(&a, buf, 2);
+  a_u16 = vset_lane_u16(a, a_u16, 1);
+  return vreinterpret_u8_u16(a_u16);
+}
+
+static INLINE uint8x8_t load_unaligned_u8_4x1(const uint8_t *buf) {
   uint32_t a;
+  uint32x2_t a_u32;
+
+  memcpy(&a, buf, 4);
+  a_u32 = vdup_n_u32(0);
+  a_u32 = vset_lane_u32(a, a_u32, 0);
+  return vreinterpret_u8_u32(a_u32);
+}
+
+static INLINE uint8x8_t load_unaligned_u8_4x2(const uint8_t *buf, int stride) {
+  uint32_t a;
+  uint32x2_t a_u32;
 
   memcpy(&a, buf, 4);
   buf += stride;
-  *tu0 = vdup_n_u32(a);
+  a_u32 = vdup_n_u32(a);
   memcpy(&a, buf, 4);
-  buf += stride;
-  *tu0 = vset_lane_u32(a, *tu0, 1);
-  memcpy(&a, buf, 4);
-  buf += stride;
-  *tu1 = vdup_n_u32(a);
-  memcpy(&a, buf, 4);
-  buf += stride;
-  *tu1 = vset_lane_u32(a, *tu1, 1);
-  memcpy(&a, buf, 4);
-  buf += stride;
-  *tu2 = vdup_n_u32(a);
-  memcpy(&a, buf, 4);
-  buf += stride;
-  *tu2 = vset_lane_u32(a, *tu2, 1);
-  memcpy(&a, buf, 4);
-  buf += stride;
-  *tu3 = vdup_n_u32(a);
-  memcpy(&a, buf, 4);
-  *tu3 = vset_lane_u32(a, *tu3, 1);
+  a_u32 = vset_lane_u32(a, a_u32, 1);
+  return vreinterpret_u8_u32(a_u32);
 }
 
 static INLINE void load_unaligned_u8_4x4(const uint8_t *buf, int stride,
-                                         uint32x2_t *tu0, uint32x2_t *tu1) {
-  uint32_t a;
-
-  memcpy(&a, buf, 4);
-  buf += stride;
-  *tu0 = vdup_n_u32(a);
-  memcpy(&a, buf, 4);
-  buf += stride;
-  *tu0 = vset_lane_u32(a, *tu0, 1);
-  memcpy(&a, buf, 4);
-  buf += stride;
-  *tu1 = vdup_n_u32(a);
-  memcpy(&a, buf, 4);
-  *tu1 = vset_lane_u32(a, *tu1, 1);
+                                         uint8x8_t *tu0, uint8x8_t *tu1) {
+  *tu0 = load_unaligned_u8_4x2(buf, stride);
+  buf += 2 * stride;
+  *tu1 = load_unaligned_u8_4x2(buf, stride);
 }
 
-static INLINE void load_unaligned_u8_4x1(const uint8_t *buf, int stride,
-                                         uint32x2_t *tu0) {
-  uint32_t a;
-
-  memcpy(&a, buf, 4);
-  buf += stride;
-  *tu0 = vset_lane_u32(a, *tu0, 0);
-}
-
-static INLINE void load_unaligned_u8_4x2(const uint8_t *buf, int stride,
-                                         uint32x2_t *tu0) {
-  uint32_t a;
-
-  memcpy(&a, buf, 4);
-  buf += stride;
-  *tu0 = vdup_n_u32(a);
-  memcpy(&a, buf, 4);
-  *tu0 = vset_lane_u32(a, *tu0, 1);
+static INLINE void load_unaligned_u8_4x8(const uint8_t *buf, int stride,
+                                         uint8x8_t *tu0, uint8x8_t *tu1,
+                                         uint8x8_t *tu2, uint8x8_t *tu3) {
+  load_unaligned_u8_4x4(buf, stride, tu0, tu1);
+  buf += 4 * stride;
+  load_unaligned_u8_4x4(buf, stride, tu2, tu3);
 }
 
 /* These intrinsics require immediate values, so we must use #defines
@@ -696,17 +678,6 @@
     memcpy(dst, &a, 2);                                \
   } while (0)
 
-static INLINE void load_unaligned_u8_2x2(const uint8_t *buf, int stride,
-                                         uint16x4_t *tu0) {
-  uint16_t a;
-
-  memcpy(&a, buf, 2);
-  buf += stride;
-  *tu0 = vdup_n_u16(a);
-  memcpy(&a, buf, 2);
-  *tu0 = vset_lane_u16(a, *tu0, 1);
-}
-
 static INLINE void load_u8_16x8(const uint8_t *s, ptrdiff_t p,
                                 uint8x16_t *const s0, uint8x16_t *const s1,
                                 uint8x16_t *const s2, uint8x16_t *const s3,
@@ -742,20 +713,24 @@
 }
 
 static INLINE void load_unaligned_u16_4x4(const uint16_t *buf, uint32_t stride,
-                                          uint64x2_t *tu0, uint64x2_t *tu1) {
+                                          uint16x8_t *tu0, uint16x8_t *tu1) {
   uint64_t a;
+  uint64x2_t a_u64;
 
   memcpy(&a, buf, 8);
   buf += stride;
-  *tu0 = vdupq_n_u64(a);
+  a_u64 = vdupq_n_u64(0);
+  a_u64 = vsetq_lane_u64(a, a_u64, 0);
   memcpy(&a, buf, 8);
   buf += stride;
-  *tu0 = vsetq_lane_u64(a, *tu0, 1);
+  a_u64 = vsetq_lane_u64(a, a_u64, 1);
+  *tu0 = vreinterpretq_u16_u64(a_u64);
   memcpy(&a, buf, 8);
   buf += stride;
-  *tu1 = vdupq_n_u64(a);
+  a_u64 = vdupq_n_u64(a);
   memcpy(&a, buf, 8);
-  *tu1 = vsetq_lane_u64(a, *tu1, 1);
+  a_u64 = vsetq_lane_u64(a, a_u64, 1);
+  *tu1 = vreinterpretq_u16_u64(a_u64);
 }
 
 static INLINE void load_s32_4x4(int32_t *s, int32_t p, int32x4_t *s1,
diff --git a/av1/common/arm/blend_a64_hmask_neon.c b/av1/common/arm/blend_a64_hmask_neon.c
index 89252ef..baad328 100644
--- a/av1/common/arm/blend_a64_hmask_neon.c
+++ b/av1/common/arm/blend_a64_hmask_neon.c
@@ -34,8 +34,6 @@
   uint8x8_t tmp0, tmp1;
   uint8x16_t res_q;
   uint16x8_t res, res_low, res_high;
-  uint32x2_t tmp0_32 = vdup_n_u32(0), tmp1_32 = vdup_n_u32(0);
-  uint16x4_t tmp0_16 = vdup_n_u16(0), tmp1_16 = vdup_n_u16(0);
   const uint8x8_t vdup_64 = vdup_n_u8((uint8_t)64);
 
   if (w >= 16) {
@@ -91,10 +89,8 @@
       __builtin_prefetch(src0 + 1 * src0_stride);
       __builtin_prefetch(src1 + 0 * src1_stride);
       __builtin_prefetch(src1 + 1 * src1_stride);
-      load_unaligned_u8_4x2(src0, src0_stride, &tmp0_32);
-      tmp0 = vreinterpret_u8_u32(tmp0_32);
-      load_unaligned_u8_4x2(src1, src1_stride, &tmp1_32);
-      tmp1 = vreinterpret_u8_u32(tmp1_32);
+      tmp0 = load_unaligned_u8_4x2(src0, src0_stride);
+      tmp1 = load_unaligned_u8_4x2(src1, src1_stride);
       res = vmull_u8(m, tmp0);
       res = vmlal_u8(res, max_minus_m, tmp1);
       const uint8x8_t result = vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS);
@@ -113,10 +109,8 @@
       __builtin_prefetch(src0 + 1 * src0_stride);
       __builtin_prefetch(src1 + 0 * src1_stride);
       __builtin_prefetch(src1 + 1 * src1_stride);
-      load_unaligned_u8_2x2(src0, src0_stride, &tmp0_16);
-      tmp0 = vreinterpret_u8_u16(tmp0_16);
-      load_unaligned_u8_2x2(src1, src1_stride, &tmp1_16);
-      tmp1 = vreinterpret_u8_u16(tmp1_16);
+      tmp0 = load_unaligned_u8_2x2(src0, src0_stride);
+      tmp1 = load_unaligned_u8_2x2(src1, src1_stride);
       res = vmull_u8(m, tmp0);
       res = vmlal_u8(res, max_minus_m, tmp1);
       const uint8x8_t result = vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS);
diff --git a/av1/common/arm/blend_a64_vmask_neon.c b/av1/common/arm/blend_a64_vmask_neon.c
index 2132fbd..c316977 100644
--- a/av1/common/arm/blend_a64_vmask_neon.c
+++ b/av1/common/arm/blend_a64_vmask_neon.c
@@ -27,8 +27,6 @@
   uint8x8_t tmp0, tmp1;
   uint8x16_t tmp0_q, tmp1_q, res_q;
   uint16x8_t res, res_low, res_high;
-  uint32x2_t tmp0_32 = vdup_n_u32(0), tmp1_32 = vdup_n_u32(0);
-  uint16x4_t tmp0_16 = vdup_n_u16(0), tmp1_16 = vdup_n_u16(0);
   assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
   assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
 
@@ -89,10 +87,8 @@
       const uint16x4_t max_minus_m2 = vdup_n_u16(64 - (uint16_t)mask[i + 1]);
       const uint8x8_t max_minus_m =
           vmovn_u16(vcombine_u16(max_minus_m1, max_minus_m2));
-      load_unaligned_u8_4x2(src0, src0_stride, &tmp0_32);
-      tmp0 = vreinterpret_u8_u32(tmp0_32);
-      load_unaligned_u8_4x2(src1, src1_stride, &tmp1_32);
-      tmp1 = vreinterpret_u8_u32(tmp1_32);
+      tmp0 = load_unaligned_u8_4x2(src0, src0_stride);
+      tmp1 = load_unaligned_u8_4x2(src1, src1_stride);
       res = vmull_u8(m, tmp0);
       res = vmlal_u8(res, max_minus_m, tmp1);
       const uint8x8_t result = vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS);
@@ -118,10 +114,8 @@
       const uint16x4x2_t max_minus_m_trn = vtrn_u16(
           vreinterpret_u16_u8(max_minus_m1), vreinterpret_u16_u8(max_minus_m2));
       const uint8x8_t max_minus_m = vreinterpret_u8_u16(max_minus_m_trn.val[0]);
-      load_unaligned_u8_2x2(src0, src0_stride, &tmp0_16);
-      tmp0 = vreinterpret_u8_u16(tmp0_16);
-      load_unaligned_u8_2x2(src1, src1_stride, &tmp1_16);
-      tmp1 = vreinterpret_u8_u16(tmp1_16);
+      tmp0 = load_unaligned_u8_2x2(src0, src0_stride);
+      tmp1 = load_unaligned_u8_2x2(src1, src1_stride);
       res = vmull_u8(m, tmp0);
       res = vmlal_u8(res, max_minus_m, tmp1);
       const uint8x8_t result = vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS);
diff --git a/av1/common/arm/jnt_convolve_neon.c b/av1/common/arm/jnt_convolve_neon.c
index 6aa2d43..4293443 100644
--- a/av1/common/arm/jnt_convolve_neon.c
+++ b/av1/common/arm/jnt_convolve_neon.c
@@ -1621,7 +1621,6 @@
     int16x4_t s8, s9, s10, d1, d2, d3;
     int16x8_t tt1, tt2, tt3;
     uint16x4_t res5, res6, res7;
-    uint32x2_t tu0 = vdup_n_u32(0), tu1 = vdup_n_u32(0);
     int16x8_t u0, u1;
 #else
     int16x4_t temp_0;
@@ -1660,9 +1659,7 @@
       __builtin_prefetch(d + 3 * dst_stride);
       s += 7;
       do {
-        load_unaligned_u8_4x4(s, src_stride, &tu0, &tu1);
-        t0 = vreinterpret_u8_u32(tu0);
-        t1 = vreinterpret_u8_u32(tu1);
+        load_unaligned_u8_4x4(s, src_stride, &t0, &t1);
 
         transpose_u8_4x4(&t0, &t1);
         u0 = vreinterpretq_s16_u16(vmovl_u8(t0));
@@ -2066,8 +2063,10 @@
   if ((w == 4) || (h == 4)) {
     int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
     uint16x4_t res4;
-    uint32x2_t tu0 = vdup_n_u32(0), tu1 = vdup_n_u32(0), tu2 = vdup_n_u32(0),
-               tu3 = vdup_n_u32(0);
+    uint8x8_t tu0 = vdup_n_u8(0);
+    uint8x8_t tu1 = vdup_n_u8(0);
+    uint8x8_t tu2 = vdup_n_u8(0);
+    uint8x8_t tu3 = vdup_n_u8(0);
     int16x8_t u0, u1, u2, u3;
     uint8x8_t t0;
 
@@ -2092,10 +2091,10 @@
 
       load_unaligned_u8_4x8(s, src_stride, &tu0, &tu1, &tu2, &tu3);
 
-      u0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0)));
-      u1 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu1)));
-      u2 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu2)));
-      u3 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu3)));
+      u0 = vreinterpretq_s16_u16(vmovl_u8(tu0));
+      u1 = vreinterpretq_s16_u16(vmovl_u8(tu1));
+      u2 = vreinterpretq_s16_u16(vmovl_u8(tu2));
+      u3 = vreinterpretq_s16_u16(vmovl_u8(tu3));
 
       s0 = vget_low_s16(u0);
       s1 = vget_high_s16(u0);
@@ -2115,8 +2114,8 @@
 #if defined(__aarch64__)
         load_unaligned_u8_4x4(s, src_stride, &tu0, &tu1);
 
-        u0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0)));
-        u1 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu1)));
+        u0 = vreinterpretq_s16_u16(vmovl_u8(tu0));
+        u1 = vreinterpretq_s16_u16(vmovl_u8(tu1));
 
         s7 = vget_low_s16(u0);
         s8 = vget_high_s16(u0);
@@ -2177,8 +2176,8 @@
         d_u8 += 4 * dst8_stride;
         height -= 4;
 #else
-        load_unaligned_u8_4x1(s, src_stride, &tu0);
-        u0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0)));
+        tu0 = load_unaligned_u8_4x1(s);
+        u0 = vreinterpretq_s16_u16(vmovl_u8(tu0));
         s7 = vget_low_s16(u0);
 
         d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, zero,