neon,load_unaligned_*: use dup for lane 0
this produces better assembly with gcc (11.3.0-3); no change in assembly
using clang from the r24 android sdk (Android (8075178, based on
r437112b) clang version 14.0.1
(https://android.googlesource.com/toolchain/llvm-project
8671348b81b95fc603505dfc881b45103bee1731))
Change-Id: Ifec252d4f499f23be1cd94aa8516caf6b3fbbc11
diff --git a/aom_dsp/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h
index 81643e9..9f79ec5 100644
--- a/aom_dsp/arm/mem_neon.h
+++ b/aom_dsp/arm/mem_neon.h
@@ -319,11 +319,11 @@
// Load 4 sets of 4 bytes when alignment is not guaranteed.
static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) {
uint32_t a;
- uint32x4_t a_u32 = vdupq_n_u32(0);
+ uint32x4_t a_u32;
if (stride == 4) return vld1q_u8(buf);
memcpy(&a, buf, 4);
buf += stride;
- a_u32 = vsetq_lane_u32(a, a_u32, 0);
+ a_u32 = vdupq_n_u32(a);
memcpy(&a, buf, 4);
buf += stride;
a_u32 = vsetq_lane_u32(a, a_u32, 1);
@@ -331,7 +331,6 @@
buf += stride;
a_u32 = vsetq_lane_u32(a, a_u32, 2);
memcpy(&a, buf, 4);
- buf += stride;
a_u32 = vsetq_lane_u32(a, a_u32, 3);
return vreinterpretq_u8_u32(a_u32);
}
@@ -343,25 +342,25 @@
memcpy(&a, buf, 4);
buf += stride;
- *tu0 = vset_lane_u32(a, *tu0, 0);
+ *tu0 = vdup_n_u32(a);
memcpy(&a, buf, 4);
buf += stride;
*tu0 = vset_lane_u32(a, *tu0, 1);
memcpy(&a, buf, 4);
buf += stride;
- *tu1 = vset_lane_u32(a, *tu1, 0);
+ *tu1 = vdup_n_u32(a);
memcpy(&a, buf, 4);
buf += stride;
*tu1 = vset_lane_u32(a, *tu1, 1);
memcpy(&a, buf, 4);
buf += stride;
- *tu2 = vset_lane_u32(a, *tu2, 0);
+ *tu2 = vdup_n_u32(a);
memcpy(&a, buf, 4);
buf += stride;
*tu2 = vset_lane_u32(a, *tu2, 1);
memcpy(&a, buf, 4);
buf += stride;
- *tu3 = vset_lane_u32(a, *tu3, 0);
+ *tu3 = vdup_n_u32(a);
memcpy(&a, buf, 4);
*tu3 = vset_lane_u32(a, *tu3, 1);
}
@@ -372,13 +371,13 @@
memcpy(&a, buf, 4);
buf += stride;
- *tu0 = vset_lane_u32(a, *tu0, 0);
+ *tu0 = vdup_n_u32(a);
memcpy(&a, buf, 4);
buf += stride;
*tu0 = vset_lane_u32(a, *tu0, 1);
memcpy(&a, buf, 4);
buf += stride;
- *tu1 = vset_lane_u32(a, *tu1, 0);
+ *tu1 = vdup_n_u32(a);
memcpy(&a, buf, 4);
*tu1 = vset_lane_u32(a, *tu1, 1);
}
@@ -398,9 +397,8 @@
memcpy(&a, buf, 4);
buf += stride;
- *tu0 = vset_lane_u32(a, *tu0, 0);
+ *tu0 = vdup_n_u32(a);
memcpy(&a, buf, 4);
- buf += stride;
*tu0 = vset_lane_u32(a, *tu0, 1);
}
@@ -426,9 +424,8 @@
memcpy(&a, buf, 2);
buf += stride;
- *tu0 = vset_lane_u16(a, *tu0, 0);
+ *tu0 = vdup_n_u16(a);
memcpy(&a, buf, 2);
- buf += stride;
*tu0 = vset_lane_u16(a, *tu0, 1);
}
@@ -472,13 +469,13 @@
memcpy(&a, buf, 8);
buf += stride;
- *tu0 = vsetq_lane_u64(a, *tu0, 0);
+ *tu0 = vdupq_n_u64(a);
memcpy(&a, buf, 8);
buf += stride;
*tu0 = vsetq_lane_u64(a, *tu0, 1);
memcpy(&a, buf, 8);
buf += stride;
- *tu1 = vsetq_lane_u64(a, *tu1, 0);
+ *tu1 = vdupq_n_u64(a);
memcpy(&a, buf, 8);
*tu1 = vsetq_lane_u64(a, *tu1, 1);
}
diff --git a/av1/common/arm/av1_inv_txfm_neon.c b/av1/common/arm/av1_inv_txfm_neon.c
index bee496a..1628cbf 100644
--- a/av1/common/arm/av1_inv_txfm_neon.c
+++ b/av1/common/arm/av1_inv_txfm_neon.c
@@ -250,8 +250,7 @@
static INLINE int16x4_t set_s16x4_neon(const int16_t c0, const int16_t c1,
const int16_t c2, const int16_t c3) {
- int16x4_t val = vdup_n_s16((int16_t)0);
- val = vset_lane_s16(c0, val, 0);
+ int16x4_t val = vdup_n_s16(c0);
val = vset_lane_s16(c1, val, 1);
val = vset_lane_s16(c2, val, 2);
val = vset_lane_s16(c3, val, 3);