Add missing pre-processor directive comments
Add trailing comments describing the controlling condition on #else and
#endif pre-processor directives in the Neon convolution functions.
Change-Id: I846c42090ffbcc324a4a4bbf36bec5b0a46775cf
diff --git a/av1/common/arm/convolve_neon.c b/av1/common/arm/convolve_neon.c
index e019bda..05a7547 100644
--- a/av1/common/arm/convolve_neon.c
+++ b/av1/common/arm/convolve_neon.c
@@ -1091,7 +1091,7 @@
// FILTER_BITS - ROUND0_BITS.
// The outermost -1 is needed because we halved the filter values.
const int16x8_t horiz_const = vdupq_n_s16(1 << ((ROUND0_BITS - 1) - 1));
-#endif
+#endif // defined(__aarch64__)
// Filter values are even so downshift by 1 to reduce precision requirements.
const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
@@ -1172,7 +1172,7 @@
w -= 4;
} while (w > 0);
} else {
-#endif
+#endif // defined(__aarch64__)
int width;
const uint8_t *s;
int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
@@ -1180,7 +1180,7 @@
#if defined(__aarch64__)
int16x8_t s8, s9, s10;
uint8x8_t t4, t5, t6, t7;
-#endif
+#endif // defined(__aarch64__)
if (w <= 4) {
#if defined(__aarch64__)
@@ -1260,7 +1260,7 @@
dst += 8 * dst_stride;
h -= 8;
} while (h > 0);
-#else
+#else // !defined(__aarch64__)
// This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single
// rounding right shift by FILTER_BITS - instead of a first rounding right
// shift by ROUND0_BITS, followed by second rounding right shift by
@@ -1301,7 +1301,7 @@
}
h -= 1;
} while (h > 0);
-#endif
+#endif // defined(__aarch64__)
} else {
uint8_t *d;
int16x8_t s11;
@@ -1390,7 +1390,7 @@
dst += 8 * dst_stride;
h -= 8;
} while (h > 0);
-#else
+#else // !defined(__aarch64__)
// This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single
// rounding right shift by FILTER_BITS - instead of a first rounding right
// shift by ROUND0_BITS, followed by second rounding right shift by
@@ -1434,11 +1434,11 @@
dst += dst_stride;
h -= 1;
} while (h > 0);
-#endif
+#endif // defined(__aarch64__)
}
#if defined(__aarch64__)
}
-#endif
+#endif // defined(__aarch64__)
}
#endif // defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8)
@@ -1894,7 +1894,7 @@
#if defined(__aarch64__)
uint8x8_t d23;
int16x4_t s8, s9, s10, d1, d2, d3;
-#endif
+#endif // defined(__aarch64__)
s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
src += src_stride;
s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
@@ -1962,7 +1962,7 @@
s6 = s10;
dst += 4 * dst_stride;
h -= 4;
-#else
+#else // !defined(__aarch64__)
__builtin_prefetch(dst + 0 * dst_stride);
__builtin_prefetch(src + 0 * src_stride);
@@ -1984,7 +1984,7 @@
s6 = s7;
dst += dst_stride;
h -= 1;
-#endif
+#endif // defined(__aarch64__)
} while (h > 0);
} else {
int height;
@@ -1995,7 +1995,7 @@
#if defined(__aarch64__)
uint8x8_t t1, t2, t3;
int16x8_t s8, s9, s10;
-#endif
+#endif // defined(__aarch64__)
do {
__builtin_prefetch(src + 0 * src_stride);
__builtin_prefetch(src + 1 * src_stride);
@@ -2060,7 +2060,7 @@
s6 = s10;
d += 4 * dst_stride;
height -= 4;
-#else
+#else // !defined(__aarch64__)
__builtin_prefetch(d);
__builtin_prefetch(s);
@@ -2077,7 +2077,7 @@
s5 = s6;
s6 = s7;
height -= 1;
-#endif
+#endif // defined(__aarch64__)
} while (height > 0);
src += 8;
dst += 8;
diff --git a/av1/common/arm/jnt_convolve_neon.c b/av1/common/arm/jnt_convolve_neon.c
index f970044..459f885 100644
--- a/av1/common/arm/jnt_convolve_neon.c
+++ b/av1/common/arm/jnt_convolve_neon.c
@@ -482,7 +482,7 @@
int16x4_t s8, s9, s10, d1, d2, d3;
int16x8_t tt1, tt2, tt3;
uint8x8_t t1, t2, t3;
-#endif
+#endif // defined(__aarch64__)
do {
s = src;
__builtin_prefetch(s + 0 * src_stride);
@@ -537,7 +537,7 @@
src += 4 * src_stride;
dst_ptr += 4 * dst_stride;
height -= 4;
-#else
+#else // !defined(__aarch64__)
t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7
tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7
s0 = vget_low_s16(tt0); // a0 a1 a2 a3
@@ -564,7 +564,7 @@
src += src_stride;
dst_ptr += dst_stride;
height -= 1;
-#endif
+#endif // defined(__aarch64__)
} while (height > 0);
} else {
int16_t *d_tmp;
@@ -660,7 +660,7 @@
src += 8 * src_stride;
dst_ptr += 8 * dst_stride;
height -= 8;
-#else
+#else // !defined(__aarch64__)
int16x8_t temp_0;
t0 = vld1_u8(src);
s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7
@@ -695,7 +695,7 @@
src += src_stride;
dst_ptr += dst_stride;
height -= 1;
-#endif
+#endif // defined(__aarch64__)
} while (height > 0);
}
}
@@ -732,7 +732,7 @@
int16x4_t s6, s7, s8;
uint16x4_t dd1, dd2, dd3, d1, d2, d3;
uint8x8_t d23_u8;
-#endif
+#endif // defined(__aarch64__)
load_s16_4x5(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4);
src_ptr += 5 * src_stride;
@@ -770,7 +770,7 @@
src_ptr += 4 * src_stride;
dst_ptr += 4 * dst_stride;
h -= 4;
-#else
+#else // !defined(__aarch64__)
s5 = vld1_s16(src_ptr);
d0 = convolve6_4_s32(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
@@ -795,7 +795,7 @@
src_ptr += src_stride;
dst_ptr += dst_stride;
h--;
-#endif
+#endif // defined(__aarch64__)
} while (h > 0);
} else {
@@ -807,7 +807,7 @@
int16x8_t s6, s7, s8;
uint16x8_t dd1, dd2, dd3, d1, d2, d3;
uint8x8_t d1_u8, d2_u8, d3_u8;
-#endif
+#endif // defined(__aarch64__)
do {
int16_t *s = src_ptr;
@@ -849,7 +849,7 @@
s += 4 * src_stride;
d += 4 * dst_stride;
height -= 4;
-#else
+#else // !defined(__aarch64__)
s5 = vld1q_s16(s);
d0 = convolve6_8_s32(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
@@ -875,7 +875,7 @@
s += src_stride;
d += dst_stride;
height--;
-#endif
+#endif // defined(__aarch64__)
} while (height > 0);
src_ptr += 8;
@@ -916,7 +916,7 @@
int16x4_t s8, s9, s10;
uint16x4_t dd1, dd2, dd3, d1, d2, d3;
uint8x8_t d23_u8;
-#endif
+#endif // defined(__aarch64__)
load_s16_4x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
src_ptr += 7 * src_stride;
@@ -960,7 +960,7 @@
src_ptr += 4 * src_stride;
dst_ptr += 4 * dst_stride;
h -= 4;
-#else
+#else // !defined(__aarch64__)
s7 = vld1_s16(src_ptr);
d0 = convolve8_4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
@@ -988,7 +988,7 @@
src_ptr += src_stride;
dst_ptr += dst_stride;
h--;
-#endif
+#endif // defined(__aarch64__)
} while (h > 0);
} else {
@@ -1000,7 +1000,7 @@
int16x8_t s8, s9, s10;
uint16x8_t dd1, dd2, dd3, d1, d2, d3;
uint8x8_t d1_u8, d2_u8, d3_u8;
-#endif
+#endif // defined(__aarch64__)
do {
int16_t *s = src_ptr;
@@ -1048,7 +1048,7 @@
s += 4 * src_stride;
d += 4 * dst_stride;
height -= 4;
-#else
+#else // !defined(__aarch64__)
s7 = vld1q_s16(s);
d0 = convolve8_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
@@ -1076,7 +1076,7 @@
s += src_stride;
d += dst_stride;
height--;
-#endif
+#endif // defined(__aarch64__)
} while (height > 0);
src_ptr += 8;
@@ -1622,7 +1622,7 @@
uint8x8_t t0;
#if defined(__aarch64__)
uint8x8_t t1, t2, t3, t4, t5, t6, t7;
-#endif
+#endif // defined(__aarch64__)
s = src_ptr;
dst_ptr = dst;
dst_u8_ptr = dst8;
@@ -1639,10 +1639,10 @@
int16x8_t tt1, tt2, tt3, t01, t23;
uint16x4_t res5, res6, res7;
int16x8_t u0, u1;
-#else
+#else // !defined(__aarch64__)
const int16x4_t round_offset_vec = vdup_n_s16(round_offset);
int16x4_t temp_0;
-#endif
+#endif // defined(__aarch64__)
// This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
// shifts - which are generally faster than rounding shifts on modern CPUs.
// The outermost -1 is needed because we halved the filter values.
@@ -1760,7 +1760,7 @@
dst_ptr += 4 * dst_stride;
dst_u8_ptr += 4 * dst8_stride;
height -= 4;
-#else
+#else // !defined(__aarch64__)
t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7
tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7
s0 = vget_low_s16(tt0); // a0 a1 a2 a3
@@ -1812,7 +1812,7 @@
dst_ptr += dst_stride;
dst_u8_ptr += dst8_stride;
height--;
-#endif
+#endif // defined(__aarch64__)
} while (height > 0);
} else {
CONV_BUF_TYPE *d_tmp;
@@ -1973,7 +1973,7 @@
dst_ptr += 8 * dst_stride;
dst_u8_ptr += 8 * dst8_stride;
height -= 8;
-#else
+#else // !defined(__aarch64__)
int16x8_t temp_0;
__builtin_prefetch(src_ptr);
t0 = vld1_u8(src_ptr);
@@ -2033,7 +2033,7 @@
dst_ptr += dst_stride;
dst_u8_ptr += dst8_stride;
height--;
-#endif
+#endif // defined(__aarch64__)
} while (height > 0);
}
}
@@ -2279,7 +2279,7 @@
int16x8_t s6, s7, s8, s9, s10, s11, s12, d1, d2, d3, d4, d5, d6, d7;
uint16x8_t d9, d10, d11;
uint8x8_t t5, t6, t7;
-#endif
+#endif // defined(__aarch64__)
int width = w;
do {
@@ -2459,9 +2459,9 @@
uint16x4_t dd1, dd2, dd3;
int16x8_t t01, t23;
uint8x8_t d23;
-#else
+#else // !defined(__aarch64__)
const int16x4_t round_offset64 = vdup_n_s16(round_offset);
-#endif
+#endif // defined(__aarch64__)
// This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
// shifts - which are generally faster than rounding shifts on modern CPUs.
// The outermost -1 is needed because we halved the filter values.
@@ -2576,7 +2576,7 @@
d += 4 * dst_stride;
d_u8 += 4 * dst8_stride;
height -= 4;
-#else
+#else // !defined(__aarch64__)
t0 = load_unaligned_u8_4x1(s);
tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
s7 = vget_low_s16(tt0);
@@ -2612,7 +2612,7 @@
d += dst_stride;
d_u8 += dst8_stride;
height--;
-#endif
+#endif // defined(__aarch64__)
} while (height > 0);
src_ptr += 4;
dst_ptr += 4;
@@ -2634,7 +2634,7 @@
int16x8_t s8, s9, s10, s11, s12, s13, s14, d1, d2, d3, d4, d5, d6, d7;
uint16x8_t dd1, dd2, dd3;
uint8x8_t t7;
-#endif
+#endif // defined(__aarch64__)
int width = w;
do {
@@ -2754,7 +2754,7 @@
s6 = s14;
s += 8 * src_stride;
height -= 8;
-#else
+#else // !defined(__aarch64__)
s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
__builtin_prefetch(dst_ptr);
@@ -2790,7 +2790,7 @@
s += src_stride;
height--;
-#endif
+#endif // defined(__aarch64__)
} while (height > 0);
src_ptr += 8;
dst_ptr += 8;