av1_fwd_txfm2d_neon: fix -Wmaybe-uninitialized warnings
The warnings appear with some versions of gcc (observed with gcc 12.2.0 and
10.x under some conditions).
Fixes warnings of the form:
av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c:292:14: warning: 'buf0' may
be used uninitialized in this function [-Wmaybe-uninitialized]
out[i] = vreinterpretq_s16_u64(vld1q_lane_u64(
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
(uint64_t *)(in + i * stride), vreinterpretq_u64_s16(out[i]), 0));
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
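This uses vld1q_dup_u64 rather than vld1q_lane_u64(lane=0), so the load no
longer takes the previous (possibly uninitialized) contents of out[i] as an
input. This is safe because the upper lane is unused or further modified after
the load.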
Change-Id: Ia15d4ea54e3a5a92a738bfb7bfe3bf5de1d6c974
diff --git a/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c b/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c
index 5754def..249b3a6 100644
--- a/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c
+++ b/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c
@@ -284,23 +284,32 @@
vst1q_s32((b + 4), b_hi);
}
-static INLINE void load_buffer_16bit_to_16bit_w4(const int16_t *const in,
+static INLINE void load_buffer_16bit_to_16bit_w4(const int16_t *in,
const int stride,
int16x8_t *const out,
const int out_size) {
- for (int i = 0; i < out_size; ++i)
- out[i] = vreinterpretq_s16_u64(vld1q_lane_u64(
- (uint64_t *)(in + i * stride), vreinterpretq_u64_s16(out[i]), 0));
+ for (int i = 0; i < out_size; ++i) {
+ // vld1q_dup_u64 is used rather than vld1q_lane_u64(lane=0) to avoid
+ // -Wmaybe-uninitialized warnings with some versions of gcc. This assumes
+ // the upper lane is unused or further modified after this call. The
+ // latency should be similar between the two.
+ out[i] = vreinterpretq_s16_u64(vld1q_dup_u64((uint64_t *)in));
+ in += stride;
+ }
}
-static INLINE void load_buffer_16bit_to_16bit_w4_flip(const int16_t *const in,
+static INLINE void load_buffer_16bit_to_16bit_w4_flip(const int16_t *in,
const int stride,
int16x8_t *const out,
const int out_size) {
- for (int i = 0; i < out_size; ++i)
- out[out_size - i - 1] = vreinterpretq_s16_u64(
- vld1q_lane_u64((uint64_t *)(in + i * stride),
- vreinterpretq_u64_s16(out[out_size - i - 1]), 0));
+ for (int i = out_size - 1; i >= 0; --i) {
+ // vld1q_dup_u64 is used rather than vld1q_lane_u64(lane=0) to avoid
+ // -Wmaybe-uninitialized warnings with some versions of gcc. This assumes
+ // the upper lane is unused or further modified after this call. The
+ // latency should be similar between the two.
+ out[i] = vreinterpretq_s16_u64(vld1q_dup_u64((uint64_t *)in));
+ in += stride;
+ }
}
static INLINE void load_buffer_16bit_to_16bit(const int16_t *in, int stride,