neon,load_unaligned_*: use dup for lane 0

This produces better assembly with gcc (11.3.0-3); there is no change in
the assembly generated by clang from the Android NDK r24 (Android
(8075178, based on r437112b) clang version 14.0.1
(https://android.googlesource.com/toolchain/llvm-project
8671348b81b95fc603505dfc881b45103bee1731)).
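
For illustration, the change boils down to the following pattern (a
standalone sketch of the before/after intrinsic sequence, not the exact
libaom code; the helper names are hypothetical and buf/stride stand in
for the real arguments):

  #include <arm_neon.h>
  #include <string.h>

  // before: materialize a zero vector, then insert every lane,
  // including lane 0 (per the message above, gcc 11 emits an extra
  // instruction for the zeroing)
  static inline uint32x2_t load_2x32_before(const uint8_t *buf, int stride) {
    uint32_t a;
    uint32x2_t v = vdup_n_u32(0);
    memcpy(&a, buf, 4);
    v = vset_lane_u32(a, v, 0);
    memcpy(&a, buf + stride, 4);
    v = vset_lane_u32(a, v, 1);
    return v;
  }

  // after: dup the first value into all lanes, then overwrite lane 1
  // only; no zero vector is needed
  static inline uint32x2_t load_2x32_after(const uint8_t *buf, int stride) {
    uint32_t a;
    memcpy(&a, buf, 4);
    uint32x2_t v = vdup_n_u32(a);
    memcpy(&a, buf + stride, 4);
    v = vset_lane_u32(a, v, 1);
    return v;
  }

The same rewrite is applied to the u16, u32 and u64 load_unaligned_*
helpers and to set_s16x4_neon below; the dead trailing "buf += stride;"
increments are dropped while at it.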

Change-Id: Ifec252d4f499f23be1cd94aa8516caf6b3fbbc11
diff --git a/aom_dsp/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h
index 81643e9..9f79ec5 100644
--- a/aom_dsp/arm/mem_neon.h
+++ b/aom_dsp/arm/mem_neon.h
@@ -319,11 +319,11 @@
 // Load 4 sets of 4 bytes when alignment is not guaranteed.
 static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) {
   uint32_t a;
-  uint32x4_t a_u32 = vdupq_n_u32(0);
+  uint32x4_t a_u32;
   if (stride == 4) return vld1q_u8(buf);
   memcpy(&a, buf, 4);
   buf += stride;
-  a_u32 = vsetq_lane_u32(a, a_u32, 0);
+  a_u32 = vdupq_n_u32(a);
   memcpy(&a, buf, 4);
   buf += stride;
   a_u32 = vsetq_lane_u32(a, a_u32, 1);
@@ -331,7 +331,6 @@
   buf += stride;
   a_u32 = vsetq_lane_u32(a, a_u32, 2);
   memcpy(&a, buf, 4);
-  buf += stride;
   a_u32 = vsetq_lane_u32(a, a_u32, 3);
   return vreinterpretq_u8_u32(a_u32);
 }
@@ -343,25 +342,25 @@
 
   memcpy(&a, buf, 4);
   buf += stride;
-  *tu0 = vset_lane_u32(a, *tu0, 0);
+  *tu0 = vdup_n_u32(a);
   memcpy(&a, buf, 4);
   buf += stride;
   *tu0 = vset_lane_u32(a, *tu0, 1);
   memcpy(&a, buf, 4);
   buf += stride;
-  *tu1 = vset_lane_u32(a, *tu1, 0);
+  *tu1 = vdup_n_u32(a);
   memcpy(&a, buf, 4);
   buf += stride;
   *tu1 = vset_lane_u32(a, *tu1, 1);
   memcpy(&a, buf, 4);
   buf += stride;
-  *tu2 = vset_lane_u32(a, *tu2, 0);
+  *tu2 = vdup_n_u32(a);
   memcpy(&a, buf, 4);
   buf += stride;
   *tu2 = vset_lane_u32(a, *tu2, 1);
   memcpy(&a, buf, 4);
   buf += stride;
-  *tu3 = vset_lane_u32(a, *tu3, 0);
+  *tu3 = vdup_n_u32(a);
   memcpy(&a, buf, 4);
   *tu3 = vset_lane_u32(a, *tu3, 1);
 }
@@ -372,13 +371,13 @@
 
   memcpy(&a, buf, 4);
   buf += stride;
-  *tu0 = vset_lane_u32(a, *tu0, 0);
+  *tu0 = vdup_n_u32(a);
   memcpy(&a, buf, 4);
   buf += stride;
   *tu0 = vset_lane_u32(a, *tu0, 1);
   memcpy(&a, buf, 4);
   buf += stride;
-  *tu1 = vset_lane_u32(a, *tu1, 0);
+  *tu1 = vdup_n_u32(a);
   memcpy(&a, buf, 4);
   *tu1 = vset_lane_u32(a, *tu1, 1);
 }
@@ -398,9 +397,8 @@
 
   memcpy(&a, buf, 4);
   buf += stride;
-  *tu0 = vset_lane_u32(a, *tu0, 0);
+  *tu0 = vdup_n_u32(a);
   memcpy(&a, buf, 4);
-  buf += stride;
   *tu0 = vset_lane_u32(a, *tu0, 1);
 }
 
@@ -426,9 +424,8 @@
 
   memcpy(&a, buf, 2);
   buf += stride;
-  *tu0 = vset_lane_u16(a, *tu0, 0);
+  *tu0 = vdup_n_u16(a);
   memcpy(&a, buf, 2);
-  buf += stride;
   *tu0 = vset_lane_u16(a, *tu0, 1);
 }
 
@@ -472,13 +469,13 @@
 
   memcpy(&a, buf, 8);
   buf += stride;
-  *tu0 = vsetq_lane_u64(a, *tu0, 0);
+  *tu0 = vdupq_n_u64(a);
   memcpy(&a, buf, 8);
   buf += stride;
   *tu0 = vsetq_lane_u64(a, *tu0, 1);
   memcpy(&a, buf, 8);
   buf += stride;
-  *tu1 = vsetq_lane_u64(a, *tu1, 0);
+  *tu1 = vdupq_n_u64(a);
   memcpy(&a, buf, 8);
   *tu1 = vsetq_lane_u64(a, *tu1, 1);
 }
diff --git a/av1/common/arm/av1_inv_txfm_neon.c b/av1/common/arm/av1_inv_txfm_neon.c
index bee496a..1628cbf 100644
--- a/av1/common/arm/av1_inv_txfm_neon.c
+++ b/av1/common/arm/av1_inv_txfm_neon.c
@@ -250,8 +250,7 @@
 
 static INLINE int16x4_t set_s16x4_neon(const int16_t c0, const int16_t c1,
                                        const int16_t c2, const int16_t c3) {
-  int16x4_t val = vdup_n_s16((int16_t)0);
-  val = vset_lane_s16(c0, val, 0);
+  int16x4_t val = vdup_n_s16(c0);
   val = vset_lane_s16(c1, val, 1);
   val = vset_lane_s16(c2, val, 2);
   val = vset_lane_s16(c3, val, 3);