av1_fwd_txfm2d_neon: fix -Wmaybe-uninitialized

warnings with some versions of gcc (observed with gcc 12.2.0 and 10.x
under some conditions).

fixes warnings of the form:

av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c:292:14: warning: 'buf0' may be used uninitialized in this function [-Wmaybe-uninitialized]
     out[i] = vreinterpretq_s16_u64(vld1q_lane_u64(
              ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
         (uint64_t *)(in + i * stride), vreinterpretq_u64_s16(out[i]), 0));
         ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

This uses vld1q_dup_u64 rather than vld1q_lane_u64. This is safe because
the upper lane is unused after the load.

Change-Id: Ia15d4ea54e3a5a92a738bfb7bfe3bf5de1d6c974

commit: 07f66e9a4df11dca5371d8007234ac6bb70eb76b [log] [tgz]
author: James Zern <jzern@google.com> Fri Dec 16 19:04:32 2022 -0800
committer: James Zern <jzern@google.com> Sat Dec 17 10:16:10 2022 -0800
tree: 87950d618ef161a7779a6fe837652c80cc546e28
parent: af89b290e3234b10e0c73b2b18c4d11b46c25547 [diff]
diff --git a/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c b/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c
index 5754def..249b3a6 100644
--- a/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c
+++ b/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c

@@ -284,23 +284,32 @@
   vst1q_s32((b + 4), b_hi);
 }
 
-static INLINE void load_buffer_16bit_to_16bit_w4(const int16_t *const in,
+static INLINE void load_buffer_16bit_to_16bit_w4(const int16_t *in,
                                                  const int stride,
                                                  int16x8_t *const out,
                                                  const int out_size) {
-  for (int i = 0; i < out_size; ++i)
-    out[i] = vreinterpretq_s16_u64(vld1q_lane_u64(
-        (uint64_t *)(in + i * stride), vreinterpretq_u64_s16(out[i]), 0));
+  for (int i = 0; i < out_size; ++i) {
+    // vld1q_dup_u64 is used rather than vld1q_lane_u64(lane=0) to avoid
+    // -Wmaybe-uninitialized warnings with some versions of gcc. This assumes
+    // the upper lane is unused or further modified after this call. The
+    // latency should be similar between the two.
+    out[i] = vreinterpretq_s16_u64(vld1q_dup_u64((uint64_t *)in));
+    in += stride;
+  }
 }
 
-static INLINE void load_buffer_16bit_to_16bit_w4_flip(const int16_t *const in,
+static INLINE void load_buffer_16bit_to_16bit_w4_flip(const int16_t *in,
                                                       const int stride,
                                                       int16x8_t *const out,
                                                       const int out_size) {
-  for (int i = 0; i < out_size; ++i)
-    out[out_size - i - 1] = vreinterpretq_s16_u64(
-        vld1q_lane_u64((uint64_t *)(in + i * stride),
-                       vreinterpretq_u64_s16(out[out_size - i - 1]), 0));
+  for (int i = out_size - 1; i >= 0; --i) {
+    // vld1q_dup_u64 is used rather than vld1q_lane_u64(lane=0) to avoid
+    // -Wmaybe-uninitialized warnings with some versions of gcc. This assumes
+    // the upper lane is unused or further modified after this call. The
+    // latency should be similar between the two.
+    out[i] = vreinterpretq_s16_u64(vld1q_dup_u64((uint64_t *)in));
+    in += stride;
+  }
 }
 
 static INLINE void load_buffer_16bit_to_16bit(const int16_t *in, int stride,
commit	07f66e9a4df11dca5371d8007234ac6bb70eb76b	[log] [tgz]
author	James Zern <jzern@google.com>	Fri Dec 16 19:04:32 2022 -0800
committer	James Zern <jzern@google.com>	Sat Dec 17 10:16:10 2022 -0800
tree	87950d618ef161a7779a6fe837652c80cc546e28
parent	af89b290e3234b10e0c73b2b18c4d11b46c25547 [diff]