Add missing pre-processor directive comments
Add trailing comments describing the controlling condition on #else and
#endif pre-processor directives in the Neon convolution functions.
Change-Id: I846c42090ffbcc324a4a4bbf36bec5b0a46775cf
diff --git a/av1/common/arm/convolve_neon.c b/av1/common/arm/convolve_neon.c
index e019bda..05a7547 100644
--- a/av1/common/arm/convolve_neon.c
+++ b/av1/common/arm/convolve_neon.c
@@ -1091,7 +1091,7 @@
// FILTER_BITS - ROUND0_BITS.
// The outermost -1 is needed because we halved the filter values.
const int16x8_t horiz_const = vdupq_n_s16(1 << ((ROUND0_BITS - 1) - 1));
-#endif
+#endif // defined(__aarch64__)
// Filter values are even so downshift by 1 to reduce precision requirements.
const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
@@ -1172,7 +1172,7 @@
w -= 4;
} while (w > 0);
} else {
-#endif
+#endif // defined(__aarch64__)
int width;
const uint8_t *s;
int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
@@ -1180,7 +1180,7 @@
#if defined(__aarch64__)
int16x8_t s8, s9, s10;
uint8x8_t t4, t5, t6, t7;
-#endif
+#endif // defined(__aarch64__)
if (w <= 4) {
#if defined(__aarch64__)
@@ -1260,7 +1260,7 @@
dst += 8 * dst_stride;
h -= 8;
} while (h > 0);
-#else
+#else // !defined(__aarch64__)
// This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single
// rounding right shift by FILTER_BITS - instead of a first rounding right
// shift by ROUND0_BITS, followed by second rounding right shift by
@@ -1301,7 +1301,7 @@
}
h -= 1;
} while (h > 0);
-#endif
+#endif // defined(__aarch64__)
} else {
uint8_t *d;
int16x8_t s11;
@@ -1390,7 +1390,7 @@
dst += 8 * dst_stride;
h -= 8;
} while (h > 0);
-#else
+#else // !defined(__aarch64__)
// This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single
// rounding right shift by FILTER_BITS - instead of a first rounding right
// shift by ROUND0_BITS, followed by second rounding right shift by
@@ -1434,11 +1434,11 @@
dst += dst_stride;
h -= 1;
} while (h > 0);
-#endif
+#endif // defined(__aarch64__)
}
#if defined(__aarch64__)
}
-#endif
+#endif // defined(__aarch64__)
}
#endif // defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8)
@@ -1894,7 +1894,7 @@
#if defined(__aarch64__)
uint8x8_t d23;
int16x4_t s8, s9, s10, d1, d2, d3;
-#endif
+#endif // defined(__aarch64__)
s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
src += src_stride;
s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
@@ -1962,7 +1962,7 @@
s6 = s10;
dst += 4 * dst_stride;
h -= 4;
-#else
+#else // !defined(__aarch64__)
__builtin_prefetch(dst + 0 * dst_stride);
__builtin_prefetch(src + 0 * src_stride);
@@ -1984,7 +1984,7 @@
s6 = s7;
dst += dst_stride;
h -= 1;
-#endif
+#endif // defined(__aarch64__)
} while (h > 0);
} else {
int height;
@@ -1995,7 +1995,7 @@
#if defined(__aarch64__)
uint8x8_t t1, t2, t3;
int16x8_t s8, s9, s10;
-#endif
+#endif // defined(__aarch64__)
do {
__builtin_prefetch(src + 0 * src_stride);
__builtin_prefetch(src + 1 * src_stride);
@@ -2060,7 +2060,7 @@
s6 = s10;
d += 4 * dst_stride;
height -= 4;
-#else
+#else // !defined(__aarch64__)
__builtin_prefetch(d);
__builtin_prefetch(s);
@@ -2077,7 +2077,7 @@
s5 = s6;
s6 = s7;
height -= 1;
-#endif
+#endif // defined(__aarch64__)
} while (height > 0);
src += 8;
dst += 8;
diff --git a/av1/common/arm/jnt_convolve_neon.c b/av1/common/arm/jnt_convolve_neon.c
index f970044..459f885 100644
--- a/av1/common/arm/jnt_convolve_neon.c
+++ b/av1/common/arm/jnt_convolve_neon.c
@@ -482,7 +482,7 @@
int16x4_t s8, s9, s10, d1, d2, d3;
int16x8_t tt1, tt2, tt3;
uint8x8_t t1, t2, t3;
-#endif
+#endif // defined(__aarch64__)
do {
s = src;
__builtin_prefetch(s + 0 * src_stride);
@@ -537,7 +537,7 @@
src += 4 * src_stride;
dst_ptr += 4 * dst_stride;
height -= 4;
-#else
+#else // !defined(__aarch64__)
t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7
tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7
s0 = vget_low_s16(tt0); // a0 a1 a2 a3
@@ -564,7 +564,7 @@
src += src_stride;
dst_ptr += dst_stride;
height -= 1;
-#endif
+#endif // defined(__aarch64__)
} while (height > 0);
} else {
int16_t *d_tmp;
@@ -660,7 +660,7 @@
src += 8 * src_stride;
dst_ptr += 8 * dst_stride;
height -= 8;
-#else
+#else // !defined(__aarch64__)
int16x8_t temp_0;
t0 = vld1_u8(src);
s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7
@@ -695,7 +695,7 @@
src += src_stride;
dst_ptr += dst_stride;
height -= 1;
-#endif
+#endif // defined(__aarch64__)
} while (height > 0);
}
}
@@ -732,7 +732,7 @@
int16x4_t s6, s7, s8;
uint16x4_t dd1, dd2, dd3, d1, d2, d3;
uint8x8_t d23_u8;
-#endif
+#endif // defined(__aarch64__)
load_s16_4x5(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4);
src_ptr += 5 * src_stride;
@@ -770,7 +770,7 @@
src_ptr += 4 * src_stride;
dst_ptr += 4 * dst_stride;
h -= 4;
-#else
+#else // !defined(__aarch64__)
s5 = vld1_s16(src_ptr);
d0 = convolve6_4_s32(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
@@ -795,7 +795,7 @@
src_ptr += src_stride;
dst_ptr += dst_stride;
h--;
-#endif
+#endif // defined(__aarch64__)
} while (h > 0);
} else {
@@ -807,7 +807,7 @@
int16x8_t s6, s7, s8;
uint16x8_t dd1, dd2, dd3, d1, d2, d3;
uint8x8_t d1_u8, d2_u8, d3_u8;
-#endif
+#endif // defined(__aarch64__)
do {
int16_t *s = src_ptr;
@@ -849,7 +849,7 @@
s += 4 * src_stride;
d += 4 * dst_stride;
height -= 4;
-#else
+#else // !defined(__aarch64__)
s5 = vld1q_s16(s);
d0 = convolve6_8_s32(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
@@ -875,7 +875,7 @@
s += src_stride;
d += dst_stride;
height--;
-#endif
+#endif // defined(__aarch64__)
} while (height > 0);
src_ptr += 8;
@@ -916,7 +916,7 @@
int16x4_t s8, s9, s10;
uint16x4_t dd1, dd2, dd3, d1, d2, d3;
uint8x8_t d23_u8;
-#endif
+#endif // defined(__aarch64__)
load_s16_4x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
src_ptr += 7 * src_stride;
@@ -960,7 +960,7 @@
src_ptr += 4 * src_stride;
dst_ptr += 4 * dst_stride;
h -= 4;
-#else
+#else // !defined(__aarch64__)
s7 = vld1_s16(src_ptr);
d0 = convolve8_4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
@@ -988,7 +988,7 @@
src_ptr += src_stride;
dst_ptr += dst_stride;
h--;
-#endif
+#endif // defined(__aarch64__)
} while (h > 0);
} else {
@@ -1000,7 +1000,7 @@
int16x8_t s8, s9, s10;
uint16x8_t dd1, dd2, dd3, d1, d2, d3;
uint8x8_t d1_u8, d2_u8, d3_u8;
-#endif
+#endif // defined(__aarch64__)
do {
int16_t *s = src_ptr;
@@ -1048,7 +1048,7 @@
s += 4 * src_stride;
d += 4 * dst_stride;
height -= 4;
-#else
+#else // !defined(__aarch64__)
s7 = vld1q_s16(s);
d0 = convolve8_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
@@ -1076,7 +1076,7 @@
s += src_stride;
d += dst_stride;
height--;
-#endif
+#endif // defined(__aarch64__)
} while (height > 0);
src_ptr += 8;
@@ -1622,7 +1622,7 @@
uint8x8_t t0;
#if defined(__aarch64__)
uint8x8_t t1, t2, t3, t4, t5, t6, t7;
-#endif
+#endif // defined(__aarch64__)
s = src_ptr;
dst_ptr = dst;
dst_u8_ptr = dst8;
@@ -1639,10 +1639,10 @@
int16x8_t tt1, tt2, tt3, t01, t23;
uint16x4_t res5, res6, res7;
int16x8_t u0, u1;
-#else
+#else // !defined(__aarch64__)
const int16x4_t round_offset_vec = vdup_n_s16(round_offset);
int16x4_t temp_0;
-#endif
+#endif // defined(__aarch64__)
// This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
// shifts - which are generally faster than rounding shifts on modern CPUs.
// The outermost -1 is needed because we halved the filter values.
@@ -1760,7 +1760,7 @@
dst_ptr += 4 * dst_stride;
dst_u8_ptr += 4 * dst8_stride;
height -= 4;
-#else
+#else // !defined(__aarch64__)
t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7
tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7
s0 = vget_low_s16(tt0); // a0 a1 a2 a3
@@ -1812,7 +1812,7 @@
dst_ptr += dst_stride;
dst_u8_ptr += dst8_stride;
height--;
-#endif
+#endif // defined(__aarch64__)
} while (height > 0);
} else {
CONV_BUF_TYPE *d_tmp;
@@ -1973,7 +1973,7 @@
dst_ptr += 8 * dst_stride;
dst_u8_ptr += 8 * dst8_stride;
height -= 8;
-#else
+#else // !defined(__aarch64__)
int16x8_t temp_0;
__builtin_prefetch(src_ptr);
t0 = vld1_u8(src_ptr);
@@ -2033,7 +2033,7 @@
dst_ptr += dst_stride;
dst_u8_ptr += dst8_stride;
height--;
-#endif
+#endif // defined(__aarch64__)
} while (height > 0);
}
}
@@ -2279,7 +2279,7 @@
int16x8_t s6, s7, s8, s9, s10, s11, s12, d1, d2, d3, d4, d5, d6, d7;
uint16x8_t d9, d10, d11;
uint8x8_t t5, t6, t7;
-#endif
+#endif // defined(__aarch64__)
int width = w;
do {
@@ -2459,9 +2459,9 @@
uint16x4_t dd1, dd2, dd3;
int16x8_t t01, t23;
uint8x8_t d23;
-#else
+#else // !defined(__aarch64__)
const int16x4_t round_offset64 = vdup_n_s16(round_offset);
-#endif
+#endif // defined(__aarch64__)
// This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
// shifts - which are generally faster than rounding shifts on modern CPUs.
// The outermost -1 is needed because we halved the filter values.
@@ -2576,7 +2576,7 @@
d += 4 * dst_stride;
d_u8 += 4 * dst8_stride;
height -= 4;
-#else
+#else // !defined(__aarch64__)
t0 = load_unaligned_u8_4x1(s);
tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
s7 = vget_low_s16(tt0);
@@ -2612,7 +2612,7 @@
d += dst_stride;
d_u8 += dst8_stride;
height--;
-#endif
+#endif // defined(__aarch64__)
} while (height > 0);
src_ptr += 4;
dst_ptr += 4;
@@ -2634,7 +2634,7 @@
int16x8_t s8, s9, s10, s11, s12, s13, s14, d1, d2, d3, d4, d5, d6, d7;
uint16x8_t dd1, dd2, dd3;
uint8x8_t t7;
-#endif
+#endif // defined(__aarch64__)
int width = w;
do {
@@ -2754,7 +2754,7 @@
s6 = s14;
s += 8 * src_stride;
height -= 8;
-#else
+#else // !defined(__aarch64__)
s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
__builtin_prefetch(dst_ptr);
@@ -2790,7 +2790,7 @@
s += src_stride;
height--;
-#endif
+#endif // defined(__aarch64__)
} while (height > 0);
src_ptr += 8;
dst_ptr += 8;