s/__aarch64__/AOM_ARCH_AARCH64/
This allows AArch64 to be correctly detected when building with Visual
Studio (cl.exe). There are still test failures, however.
Microsoft's compiler doesn't define __ARM_FEATURE_*. To use those paths
we may need to rely on _M_ARM64_EXTENSION.
This change is similar to the one in libvpx:
57b9afa58 s/__aarch64__/VPX_ARCH_AARCH64/
Bug: b/277255390
Change-Id: Ie3174d59c1bcdab5677b81b0176cc363cf18018f
diff --git a/aom_dsp/arm/avg_neon.c b/aom_dsp/arm/avg_neon.c
index 2da64b4..1bb58e4 100644
--- a/aom_dsp/arm/avg_neon.c
+++ b/aom_dsp/arm/avg_neon.c
@@ -18,7 +18,7 @@
#include "aom_dsp/arm/transpose_neon.h"
#include "aom_ports/mem.h"
-#if !defined(__aarch64__)
+#if !AOM_ARCH_AARCH64
static INLINE uint32x2_t horizontal_add_u16x8_v(const uint16x8_t a) {
const uint32x4_t b = vpaddlq_u16(a);
const uint64x2_t c = vpaddlq_u32(b);
@@ -30,7 +30,7 @@
unsigned int aom_avg_4x4_neon(const uint8_t *a, int a_stride) {
const uint8x16_t b = load_unaligned_u8q(a, a_stride);
const uint16x8_t c = vaddl_u8(vget_low_u8(b), vget_high_u8(b));
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
const uint32_t d = vaddlvq_u16(c);
return (d + 8) >> 4;
#else
@@ -53,7 +53,7 @@
sum = vaddw_u8(sum, e);
}
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
const uint32_t d = vaddlvq_u16(sum);
return (d + 32) >> 6;
#else
@@ -216,7 +216,7 @@
v_mean = vpadalq_s16(v_mean, diff);
v_low = vget_low_s16(diff);
v_sse = vmlal_s16(v_sse, v_low, v_low);
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
v_sse = vmlal_high_s16(v_sse, diff, diff);
#else
const int16x4_t v_high = vget_high_s16(diff);
@@ -259,7 +259,7 @@
const uint8x16_t ab07_max = vmaxq_u8(ab0123_max, ab4567_max);
const uint8x16_t ab07_min = vminq_u8(ab0123_min, ab4567_min);
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
*min = *max = 0; // Clear high bits
*((uint8_t *)max) = vmaxvq_u8(ab07_max);
*((uint8_t *)min) = vminvq_u8(ab07_min);
diff --git a/aom_dsp/arm/highbd_avg_neon.c b/aom_dsp/arm/highbd_avg_neon.c
index 0483a83..13cce03 100644
--- a/aom_dsp/arm/highbd_avg_neon.c
+++ b/aom_dsp/arm/highbd_avg_neon.c
@@ -98,7 +98,7 @@
const uint16x8_t min4567 = vminq_u16(min45, min67);
const uint16x8_t min07 = vminq_u16(min0123, min4567);
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
*max = (int)vmaxvq_u16(max07);
*min = (int)vminvq_u16(min07);
#else
diff --git a/aom_dsp/arm/highbd_loopfilter_neon.c b/aom_dsp/arm/highbd_loopfilter_neon.c
index 0b720ce..2b5128e 100644
--- a/aom_dsp/arm/highbd_loopfilter_neon.c
+++ b/aom_dsp/arm/highbd_loopfilter_neon.c
@@ -247,12 +247,12 @@
filter4_masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask,
&needs_filter4_mask);
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
if (vaddv_u16(needs_filter4_mask) == 0) {
// None of the values will be filtered.
return;
}
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
// Copy the masks to the high bits for packed comparisons later.
const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
@@ -313,12 +313,12 @@
filter4_masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask,
&needs_filter4_mask);
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
if (vaddv_u16(needs_filter4_mask) == 0) {
// None of the values will be filtered.
return;
}
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
// Copy the masks to the high bits for packed comparisons later.
const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
@@ -437,12 +437,12 @@
filter6_masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, bd,
&needs_filter_mask, &is_flat3_mask, &hev_mask);
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
if (vaddv_u16(needs_filter_mask) == 0) {
// None of the values will be filtered.
return;
}
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
// Copy the masks to the high bits for packed comparisons later.
const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
@@ -528,12 +528,12 @@
filter6_masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, bd,
&needs_filter_mask, &is_flat3_mask, &hev_mask);
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
if (vaddv_u16(needs_filter_mask) == 0) {
// None of the values will be filtered.
return;
}
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
// Copy the masks to the high bits for packed comparisons later.
const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
@@ -684,12 +684,12 @@
filter8_masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
bd, &needs_filter_mask, &is_flat4_mask, &hev_mask);
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
if (vaddv_u16(needs_filter_mask) == 0) {
// None of the values will be filtered.
return;
}
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
// Copy the masks to the high bits for packed comparisons later.
const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
@@ -783,12 +783,12 @@
filter8_masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
bd, &needs_filter_mask, &is_flat4_mask, &hev_mask);
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
if (vaddv_u16(needs_filter_mask) == 0) {
// None of the values will be filtered.
return;
}
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
// Copy the masks to the high bits for packed comparisons later.
const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
@@ -976,12 +976,12 @@
filter8_masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
bd, &needs_filter_mask, &is_flat4_mask, &hev_mask);
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
if (vaddv_u16(needs_filter_mask) == 0) {
// None of the values will be filtered.
return;
}
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
const uint16x8_t p4q4 = vcombine_u16(src[2], src[11]);
const uint16x8_t p5q5 = vcombine_u16(src[1], src[12]);
const uint16x8_t p6q6 = vcombine_u16(src[0], src[13]);
@@ -1083,7 +1083,7 @@
static INLINE uint16x8x2_t permute_acdb64(const uint16x8_t ab,
const uint16x8_t cd) {
uint16x8x2_t acdb;
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
// a[b] <- [c]d
acdb.val[0] = vreinterpretq_u16_u64(
vtrn1q_u64(vreinterpretq_u64_u16(ab), vreinterpretq_u64_u16(cd)));
@@ -1099,7 +1099,7 @@
acdb.val[1] = vreinterpretq_u16_u64(
vsetq_lane_u64(vgetq_lane_u64(vreinterpretq_u64_u16(cd), 1),
vreinterpretq_u64_u16(ab), 0));
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
return acdb;
}
@@ -1144,12 +1144,12 @@
filter8_masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
bd, &needs_filter_mask, &is_flat4_mask, &hev_mask);
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
if (vaddv_u16(needs_filter_mask) == 0) {
// None of the values will be filtered.
return;
}
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
const uint16x8_t p4q4 =
vcombine_u16(vget_low_u16(src_p[3]), vget_high_u16(src_q[0]));
const uint16x8_t p5q5 =
diff --git a/aom_dsp/arm/highbd_quantize_neon.c b/aom_dsp/arm/highbd_quantize_neon.c
index 927e13c..1b7e046 100644
--- a/aom_dsp/arm/highbd_quantize_neon.c
+++ b/aom_dsp/arm/highbd_quantize_neon.c
@@ -19,7 +19,7 @@
#include "av1/encoder/av1_quantize.h"
static INLINE uint32_t sum_abs_coeff(const uint32x4_t a) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
return vaddvq_u32(a);
#else
const uint64x2_t b = vpaddlq_u32(a);
@@ -98,7 +98,7 @@
}
static INLINE uint16_t get_max_eob(int16x8_t v_eobmax) {
-#ifdef __aarch64__
+#if AOM_ARCH_AARCH64
return (uint16_t)vmaxvq_s16(v_eobmax);
#else
const int16x4_t v_eobmax_3210 =
@@ -116,7 +116,7 @@
}
static INLINE uint16_t get_min_eob(int16x8_t v_eobmin) {
-#ifdef __aarch64__
+#if AOM_ARCH_AARCH64
return (uint16_t)vminvq_s16(v_eobmin);
#else
const int16x4_t v_eobmin_3210 =
diff --git a/aom_dsp/arm/intrapred_neon.c b/aom_dsp/arm/intrapred_neon.c
index 58b31a3..2161378 100644
--- a/aom_dsp/arm/intrapred_neon.c
+++ b/aom_dsp/arm/intrapred_neon.c
@@ -84,7 +84,7 @@
}
static INLINE uint16x8_t horizontal_add_and_broadcast_u16x8(uint16x8_t a) {
-#ifdef __aarch64__
+#if AOM_ARCH_AARCH64
// On AArch64 we could also use vdupq_n_u16(vaddvq_u16(a)) here to save an
// instruction, however the addv instruction is usually slightly more
// expensive than a pairwise addition, so the need for immediately
@@ -1528,13 +1528,13 @@
int16x4_t v_frac_bits_y = vdup_n_s16(-frac_bits_y);
int16x4_t min_base_y64 = vdup_n_s16(min_base_y);
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
// Use ext rather than loading left + 14 directly to avoid over-read.
const uint8x16_t left_m2 = vld1q_u8(left - 2);
const uint8x16_t left_0 = vld1q_u8(left);
const uint8x16_t left_14 = vextq_u8(left_0, left_0, 14);
const uint8x16x2_t left_vals = { { left_m2, left_14 } };
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
for (int r = 0; r < N; r++) {
uint16x8_t res, shift;
@@ -1593,13 +1593,13 @@
// Values in base_y_c64 range from -2 through 14 inclusive.
base_y_c64 = vbic_s16(base_y_c64, vreinterpret_s16_u16(mask64));
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
uint8x8_t left_idx0 = vreinterpret_u8_s16(base_y_c64 + 2); // [0, 16]
uint8x8_t left_idx1 = vreinterpret_u8_s16(base_y_c64 + 3); // [1, 17]
uint8x8_t a0_y = vtrn1_u8(vqtbl2_u8(left_vals, left_idx0), v_zero_u8);
uint8x8_t a1_y = vtrn1_u8(vqtbl2_u8(left_vals, left_idx1), v_zero_u8);
-#else // !defined(__aarch64__)
+#else // !AOM_ARCH_AARCH64
DECLARE_ALIGNED(32, int16_t, base_y_c[4]);
vst1_s16(base_y_c, base_y_c64);
@@ -1616,7 +1616,7 @@
a1_y = vld1_lane_u8(left + base_y_c[1], a1_y, 2);
a1_y = vld1_lane_u8(left + base_y_c[2], a1_y, 4);
a1_y = vld1_lane_u8(left + base_y_c[3], a1_y, 6);
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
if (upsample_left) {
v_shift.val[1] = vshr_n_u16(
@@ -1696,7 +1696,7 @@
uint16x8_t c1234 = vcombine_u16(vcreate_u16(0x0004000300020001),
vcreate_u16(0x0008000700060005));
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
// Use ext rather than loading left + 30 directly to avoid over-read.
const uint8x16_t left_m2 = vld1q_u8(left - 2);
const uint8x16_t left_0 = vld1q_u8(left + 0);
@@ -1704,7 +1704,7 @@
const uint8x16_t left_14 = vextq_u8(left_0, left_16, 14);
const uint8x16_t left_30 = vextq_u8(left_16, left_16, 14);
const uint8x16x3_t left_vals = { { left_m2, left_14, left_30 } };
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
for (int r = 0; r < N; r++) {
uint8x8_t resx, resy, resxy;
@@ -1776,7 +1776,7 @@
// Values in base_y_c128 range from -2 through 31 inclusive.
base_y_c128 = vbicq_s16(base_y_c128, vreinterpretq_s16_u16(mask128));
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
uint8x16_t left_idx0 = vreinterpretq_u8_s16(base_y_c128 + 2); // [0, 33]
uint8x16_t left_idx1 = vreinterpretq_u8_s16(base_y_c128 + 3); // [1, 34]
uint8x16_t left_idx01 = vuzp1q_u8(left_idx0, left_idx1);
@@ -1784,7 +1784,7 @@
uint8x16_t a01_x = vqtbl3q_u8(left_vals, left_idx01);
uint8x8_t a0_x1 = vget_low_u8(a01_x);
uint8x8_t a1_x1 = vget_high_u8(a01_x);
-#else // !defined(__aarch64__)
+#else // !AOM_ARCH_AARCH64
DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
vst1q_s16(base_y_c, base_y_c128);
@@ -1809,7 +1809,7 @@
a1_x1 = vld1_lane_u8(left + base_y_c[5], a1_x1, 5);
a1_x1 = vld1_lane_u8(left + base_y_c[6], a1_x1, 6);
a1_x1 = vld1_lane_u8(left + base_y_c[7], a1_x1, 7);
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
if (upsample_left) {
shift.val[1] = vshrq_n_u16(
@@ -1867,7 +1867,7 @@
c1234.val[0] = vaddq_u16(c0123.val[0], c1);
c1234.val[1] = vaddq_u16(c0123.val[1], c1);
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
const uint8x16_t left_m1 = vld1q_u8(left - 1);
const uint8x16_t left_0 = vld1q_u8(left + 0);
const uint8x16_t left_16 = vld1q_u8(left + 16);
@@ -1878,7 +1878,7 @@
const uint8x16_t left_47 = vextq_u8(left_32, left_48, 15);
const uint8x16x4_t left_vals0 = { { left_m1, left_15, left_31, left_47 } };
const uint8x16x4_t left_vals1 = { { left_0, left_16, left_32, left_48 } };
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
for (int r = 0; r < H; r++) {
uint16x8x2_t res, r6, shift;
@@ -1991,7 +1991,7 @@
a0_y128 = vandq_u8(a0_y128, v_loadmaskz2);
a1_y128 = vld1q_u8(left + min_y + 1);
a1_y128 = vandq_u8(a1_y128, v_loadmaskz2);
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
a0_y128 = vqtbl1q_u8(a0_y128, vreinterpretq_u8_s8(base_y_offset128));
a1_y128 = vqtbl1q_u8(a1_y128, vreinterpretq_u8_s8(base_y_offset128));
#else
@@ -2023,7 +2023,7 @@
base_y_c256.val[1] = vbicq_s16(base_y_c256.val[1],
vreinterpretq_s16_u16(mask256.val[1]));
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
// Values in left_idx{0,1} range from 0 through 63 inclusive.
uint8x16_t left_idx0 = vreinterpretq_u8_s16(base_y_c256.val[0] + 1);
uint8x16_t left_idx1 = vreinterpretq_u8_s16(base_y_c256.val[1] + 1);
@@ -2037,7 +2037,7 @@
a0_y1 = vget_high_u8(a0_y01);
a1_y0 = vget_low_u8(a1_y01);
a1_y1 = vget_high_u8(a1_y01);
-#else // !defined(__aarch64__)
+#else // !AOM_ARCH_AARCH64
DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
vst1q_s16(base_y_c, base_y_c256.val[0]);
@@ -2086,7 +2086,7 @@
a1_y1 = vld1_lane_u8(left + base_y_c[13], a1_y1, 5);
a1_y1 = vld1_lane_u8(left + base_y_c[14], a1_y1, 6);
a1_y1 = vld1_lane_u8(left + base_y_c[15], a1_y1, 7);
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
}
shifty.val[0] = vshrq_n_u16(
@@ -2318,7 +2318,7 @@
w11 = vzipq_u32(vreinterpretq_u32_u16(w6.val[1]),
vreinterpretq_u32_u16(w7.val[1]));
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
d[0] = vzip1q_u64(vreinterpretq_u64_u32(w8.val[0]),
vreinterpretq_u64_u32(w9.val[0]));
d[1] = vzip2q_u64(vreinterpretq_u64_u32(w8.val[0]),
@@ -2388,7 +2388,7 @@
w15 = vzipq_u32(vreinterpretq_u32_u16(w10.val[1]),
vreinterpretq_u32_u16(w11.val[1]));
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
d[0] = vzip1q_u64(vreinterpretq_u64_u32(w12.val[0]),
vreinterpretq_u64_u32(w13.val[0]));
d[1] = vzip2q_u64(vreinterpretq_u64_u32(w12.val[0]),
@@ -2443,7 +2443,7 @@
w15 = vzipq_u32(vreinterpretq_u32_u16(w10.val[1]),
vreinterpretq_u32_u16(w11.val[1]));
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
d[8] = vzip1q_u64(vreinterpretq_u64_u32(w12.val[0]),
vreinterpretq_u64_u32(w13.val[0]));
d[9] = vzip2q_u64(vreinterpretq_u64_u32(w12.val[0]),
@@ -2516,7 +2516,7 @@
// Store first 4-line result
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
d[0].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[0]),
vreinterpretq_u64_u32(w14.val[0]));
d[0].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[0]),
@@ -2572,7 +2572,7 @@
// Store second 4-line result
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
d[4].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[0]),
vreinterpretq_u64_u32(w14.val[0]));
d[4].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[0]),
@@ -2639,7 +2639,7 @@
// Store first 4-line result
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
d[8].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[0]),
vreinterpretq_u64_u32(w14.val[0]));
d[8].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[0]),
@@ -2695,7 +2695,7 @@
// Store second 4-line result
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
d[12].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[0]),
vreinterpretq_u64_u32(w14.val[0]));
d[12].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[0]),
diff --git a/aom_dsp/arm/obmc_sad_neon.c b/aom_dsp/arm/obmc_sad_neon.c
index 60656e5..a692cbb 100644
--- a/aom_dsp/arm/obmc_sad_neon.c
+++ b/aom_dsp/arm/obmc_sad_neon.c
@@ -37,7 +37,7 @@
*sum = vrsraq_n_u32(*sum, abs_hi, 12);
}
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
// Use tbl for doing a double-width zero extension from 8->32 bits since we can
// do this in one instruction rather than two (indices out of range (255 here)
@@ -110,7 +110,7 @@
return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
}
-#else // !defined(__aarch64__)
+#else // !AOM_ARCH_AARCH64
static INLINE unsigned int obmc_sad_large_neon(const uint8_t *ref,
int ref_stride,
@@ -144,7 +144,7 @@
return horizontal_add_u32x4(sum);
}
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
static INLINE unsigned int obmc_sad_128xh_neon(const uint8_t *ref,
int ref_stride,
diff --git a/aom_dsp/arm/obmc_variance_neon.c b/aom_dsp/arm/obmc_variance_neon.c
index 8702ba6..50cd5f3 100644
--- a/aom_dsp/arm/obmc_variance_neon.c
+++ b/aom_dsp/arm/obmc_variance_neon.c
@@ -55,7 +55,7 @@
*ssev = vmlaq_s32(*ssev, round_s32_hi, round_s32_hi);
}
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
// Use tbl for doing a double-width zero extension from 8->32 bits since we can
// do this in one instruction rather than two (indices out of range (255 here)
@@ -140,7 +140,7 @@
*sum = horizontal_add_s32x4(sumv);
}
-#else // !defined(__aarch64__)
+#else // !AOM_ARCH_AARCH64
static INLINE void obmc_variance_large_neon(const uint8_t *pre, int pre_stride,
const int32_t *wsrc,
@@ -180,7 +180,7 @@
*sum = horizontal_add_s32x4(sumv);
}
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
static INLINE void obmc_variance_neon_128xh(const uint8_t *pre, int pre_stride,
const int32_t *wsrc,
diff --git a/aom_dsp/arm/sum_neon.h b/aom_dsp/arm/sum_neon.h
index 7d2f18b..ff68c12 100644
--- a/aom_dsp/arm/sum_neon.h
+++ b/aom_dsp/arm/sum_neon.h
@@ -15,7 +15,7 @@
#include "aom_ports/mem.h"
static INLINE int horizontal_add_s16x8(const int16x8_t a) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
return vaddlvq_s16(a);
#else
const int32x4_t b = vpaddlq_s16(a);
@@ -27,7 +27,7 @@
}
static INLINE int horizontal_add_s32x4(const int32x4_t a) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
return vaddvq_s32(a);
#else
const int64x2_t b = vpaddlq_s32(a);
@@ -38,7 +38,7 @@
}
static INLINE int64_t horizontal_add_s64x2(const int64x2_t a) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
return vaddvq_s64(a);
#else
return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1);
@@ -46,7 +46,7 @@
}
static INLINE uint64_t horizontal_add_u64x2(const uint64x2_t a) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
return vaddvq_u64(a);
#else
return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1);
@@ -54,7 +54,7 @@
}
static INLINE uint64_t horizontal_long_add_u32x4(const uint32x4_t a) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
return vaddlvq_u32(a);
#else
const uint64x2_t b = vpaddlq_u32(a);
@@ -63,7 +63,7 @@
}
static INLINE unsigned int horizontal_add_u32x4(const uint32x4_t a) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
return vaddvq_u32(a);
#else
const uint64x2_t b = vpaddlq_u32(a);
@@ -74,7 +74,7 @@
}
static INLINE uint32x4_t horizontal_add_4d_u32x4(const uint32x4_t sum[4]) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
uint32x4_t res01 = vpaddq_u32(sum[0], sum[1]);
uint32x4_t res23 = vpaddq_u32(sum[2], sum[3]);
return vpaddq_u32(res01, res23);
@@ -90,7 +90,7 @@
static INLINE uint32_t horizontal_long_add_u16x8(const uint16x8_t vec_lo,
const uint16x8_t vec_hi) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
return vaddlvq_u16(vec_lo) + vaddlvq_u16(vec_hi);
#else
const uint32x4_t vec_l_lo =
@@ -115,7 +115,7 @@
const uint32x4_t b1 = vpadalq_u16(a1, sum_hi[1]);
const uint32x4_t b2 = vpadalq_u16(a2, sum_hi[2]);
const uint32x4_t b3 = vpadalq_u16(a3, sum_hi[3]);
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
const uint32x4_t c0 = vpaddq_u32(b0, b1);
const uint32x4_t c1 = vpaddq_u32(b2, b3);
return vpaddq_u32(c0, c1);
@@ -131,7 +131,7 @@
}
static INLINE uint32_t horizontal_add_u16x8(const uint16x8_t a) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
return vaddlvq_u16(a);
#else
const uint32x4_t b = vpaddlq_u16(a);
@@ -143,7 +143,7 @@
}
static INLINE uint32x4_t horizontal_add_4d_u16x8(const uint16x8_t sum[4]) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]);
const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]);
const uint16x8_t b0 = vpaddq_u16(a0, a1);
@@ -160,7 +160,7 @@
}
static INLINE uint32_t horizontal_add_u32x2(const uint32x2_t a) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
return vaddv_u32(a);
#else
const uint64x1_t b = vpaddl_u32(a);
@@ -169,7 +169,7 @@
}
static INLINE uint64_t horizontal_long_add_u32x2(const uint32x2_t a) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
return vaddlv_u32(a);
#else
const uint64x1_t b = vpaddl_u32(a);
@@ -178,7 +178,7 @@
}
static INLINE uint32_t horizontal_add_u16x4(const uint16x4_t a) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
return vaddlv_u16(a);
#else
const uint32x2_t b = vpaddl_u16(a);
diff --git a/aom_dsp/arm/transpose_neon.h b/aom_dsp/arm/transpose_neon.h
index d151c58..2dbb414 100644
--- a/aom_dsp/arm/transpose_neon.h
+++ b/aom_dsp/arm/transpose_neon.h
@@ -260,7 +260,7 @@
static INLINE uint16x8x2_t aom_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) {
uint16x8x2_t b0;
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
b0.val[0] = vreinterpretq_u16_u64(
vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
b0.val[1] = vreinterpretq_u16_u64(
@@ -521,7 +521,7 @@
static INLINE int16x8x2_t aom_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
int16x8x2_t b0;
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
b0.val[0] = vreinterpretq_s16_s64(
vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
b0.val[1] = vreinterpretq_s16_s64(
@@ -744,7 +744,7 @@
static INLINE int32x4x2_t aom_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {
int32x4x2_t b0;
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
b0.val[0] = vreinterpretq_s32_s64(
vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
b0.val[1] = vreinterpretq_s32_s64(
diff --git a/aom_dsp/simd/v128_intrinsics_arm.h b/aom_dsp/simd/v128_intrinsics_arm.h
index fb89d60..eee150a 100644
--- a/aom_dsp/simd/v128_intrinsics_arm.h
+++ b/aom_dsp/simd/v128_intrinsics_arm.h
@@ -97,7 +97,7 @@
int16x8_t t2 = vmulq_s16(
vmovl_s8(vreinterpret_s8_s64(vget_high_s64(a))),
vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(b)))));
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
return vaddlvq_s16(t1) + vaddlvq_s16(t2);
#else
int64x2_t t = vpaddlq_s32(vaddq_s32(vpaddlq_s16(t1), vpaddlq_s16(t2)));
@@ -117,7 +117,7 @@
}
SIMD_INLINE uint64_t v128_hadd_u8(v128 x) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
return vaddlvq_u8(vreinterpretq_u8_s64(x));
#else
uint64x2_t t = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_s64(x))));
@@ -155,7 +155,7 @@
}
SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
return vaddlvq_u16(s.hi) + vaddlvq_u16(s.lo);
#else
uint64x2_t t = vpaddlq_u32(vpaddlq_u16(vaddq_u16(s.hi, s.lo)));
@@ -286,7 +286,7 @@
}
SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
return vreinterpretq_s64_s16(vuzp2q_s16(
vreinterpretq_s16_s32(vmull_s16(vreinterpret_s16_s64(vget_low_s64(a)),
vreinterpret_s16_s64(vget_low_s64(b)))),
@@ -304,7 +304,7 @@
}
SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
int32x4_t t1 = vmull_s16(vreinterpret_s16_s64(vget_low_s64(a)),
vreinterpret_s16_s64(vget_low_s64(b)));
int32x4_t t2 =
@@ -317,7 +317,7 @@
}
SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
int16x8_t t1 = vmulq_s16(
vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(a)))),
vmovl_s8(vreinterpret_s8_s64(vget_low_s64(b))));
@@ -369,7 +369,7 @@
SIMD_INLINE uint32_t v128_movemask_8(v128 a) {
a = vreinterpretq_s64_u8(vcltq_s8(vreinterpretq_s8_s64(a), vdupq_n_s8(0)));
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
uint8x16_t m =
vandq_u8(vreinterpretq_u8_s64(a),
vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL)));
@@ -414,7 +414,7 @@
}
SIMD_INLINE v128 v128_ziplo_8(v128 x, v128 y) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
return vreinterpretq_s64_u8(
vzip1q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)));
#else
@@ -424,7 +424,7 @@
}
SIMD_INLINE v128 v128_ziphi_8(v128 x, v128 y) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
return vreinterpretq_s64_u8(
vzip2q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)));
#else
@@ -439,7 +439,7 @@
}
SIMD_INLINE v128 v128_ziplo_16(v128 x, v128 y) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
return vreinterpretq_s64_u16(
vzip1q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)));
#else
@@ -449,7 +449,7 @@
}
SIMD_INLINE v128 v128_ziphi_16(v128 x, v128 y) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
return vreinterpretq_s64_u16(
vzip2q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)));
#else
@@ -464,7 +464,7 @@
}
SIMD_INLINE v128 v128_ziplo_32(v128 x, v128 y) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
return vreinterpretq_s64_u32(
vzip1q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)));
#else
@@ -474,7 +474,7 @@
}
SIMD_INLINE v128 v128_ziphi_32(v128 x, v128 y) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
return vreinterpretq_s64_u32(
vzip2q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)));
#else
@@ -497,7 +497,7 @@
}
SIMD_INLINE v128 v128_unziplo_8(v128 x, v128 y) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
return vreinterpretq_s64_u8(
vuzp1q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)));
#else
@@ -507,7 +507,7 @@
}
SIMD_INLINE v128 v128_unziphi_8(v128 x, v128 y) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
return vreinterpretq_s64_u8(
vuzp2q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)));
#else
@@ -517,7 +517,7 @@
}
SIMD_INLINE v128 v128_unziplo_16(v128 x, v128 y) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
return vreinterpretq_s64_u16(
vuzp1q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)));
#else
@@ -528,7 +528,7 @@
}
SIMD_INLINE v128 v128_unziphi_16(v128 x, v128 y) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
return vreinterpretq_s64_u16(
vuzp2q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)));
#else
@@ -539,7 +539,7 @@
}
SIMD_INLINE v128 v128_unziplo_32(v128 x, v128 y) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
return vreinterpretq_s64_u32(
vuzp1q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)));
#else
@@ -550,7 +550,7 @@
}
SIMD_INLINE v128 v128_unziphi_32(v128 x, v128 y) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
return vreinterpretq_s64_u32(
vuzp2q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)));
#else
@@ -637,7 +637,7 @@
}
SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
return vreinterpretq_s64_u8(
vqtbl1q_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(pattern)));
#else
diff --git a/aom_dsp/simd/v256_intrinsics_v128.h b/aom_dsp/simd/v256_intrinsics_v128.h
index cf44965..3dfb325 100644
--- a/aom_dsp/simd/v256_intrinsics_v128.h
+++ b/aom_dsp/simd/v256_intrinsics_v128.h
@@ -614,7 +614,7 @@
SIMD_INLINE v256 v256_shuffle_8(v256 x, v256 pattern) {
#if HAVE_NEON
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
uint8x16x2_t p = { { vreinterpretq_u8_s64(x.val[0]),
vreinterpretq_u8_s64(x.val[1]) } };
return v256_from_v128(
@@ -653,7 +653,7 @@
SIMD_INLINE v256 v256_wideshuffle_8(v256 x, v256 y, v256 pattern) {
#if HAVE_NEON
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
uint8x16x4_t p = { {
vreinterpretq_u8_s64(y.val[0]),
vreinterpretq_u8_s64(y.val[1]),
diff --git a/aom_dsp/simd/v64_intrinsics_arm.h b/aom_dsp/simd/v64_intrinsics_arm.h
index 265ebed..35d88e2 100644
--- a/aom_dsp/simd/v64_intrinsics_arm.h
+++ b/aom_dsp/simd/v64_intrinsics_arm.h
@@ -130,7 +130,7 @@
int16x8_t t =
vmulq_s16(vmovl_s8(vreinterpret_s8_s64(x)),
vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(y))));
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
return vaddlvq_s16(t);
#else
int64x2_t r = vpaddlq_s32(vpaddlq_s16(t));
@@ -139,7 +139,7 @@
}
SIMD_INLINE int64_t v64_dotp_s16(v64 x, v64 y) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
return vaddlvq_s32(
vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
#else
@@ -150,7 +150,7 @@
}
SIMD_INLINE uint64_t v64_hadd_u8(v64 x) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
return vaddlv_u8(vreinterpret_u8_s64(x));
#else
return vget_lane_u64(
@@ -173,7 +173,7 @@
}
SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
return vaddlvq_u16(s);
#else
uint64x2_t r = vpaddlq_u32(vpaddlq_u16(s));
@@ -194,7 +194,7 @@
}
SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
return vaddvq_u32(s);
#else
uint64x2_t t = vpaddlq_u32(s);
@@ -290,7 +290,7 @@
}
SIMD_INLINE v64 v64_mulhi_s16(v64 x, v64 y) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
int16x8_t t = vreinterpretq_s16_s32(
vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
return vget_low_s64(vreinterpretq_s64_s16(vuzp2q_s16(t, t)));
@@ -370,7 +370,7 @@
}
SIMD_INLINE v64 v64_ziplo_8(v64 x, v64 y) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
return vreinterpret_s64_u8(
vzip1_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)));
#else
@@ -380,7 +380,7 @@
}
SIMD_INLINE v64 v64_ziphi_8(v64 x, v64 y) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
return vreinterpret_s64_u8(
vzip2_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)));
#else
@@ -390,7 +390,7 @@
}
SIMD_INLINE v64 v64_ziplo_16(v64 x, v64 y) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
return vreinterpret_s64_u16(
vzip1_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)));
#else
@@ -400,7 +400,7 @@
}
SIMD_INLINE v64 v64_ziphi_16(v64 x, v64 y) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
return vreinterpret_s64_u16(
vzip2_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)));
#else
@@ -410,7 +410,7 @@
}
SIMD_INLINE v64 v64_ziplo_32(v64 x, v64 y) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
return vreinterpret_s64_u32(
vzip1_u32(vreinterpret_u32_s64(y), vreinterpret_u32_s64(x)));
#else
@@ -420,7 +420,7 @@
}
SIMD_INLINE v64 v64_ziphi_32(v64 x, v64 y) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
return vreinterpret_s64_u32(
vzip2_u32(vreinterpret_u32_s64(y), vreinterpret_u32_s64(x)));
#else
@@ -466,7 +466,7 @@
}
SIMD_INLINE v64 v64_unziplo_8(v64 x, v64 y) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
return vreinterpret_s64_u8(
vuzp1_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)));
#else
@@ -476,7 +476,7 @@
}
SIMD_INLINE v64 v64_unziphi_8(v64 x, v64 y) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
return vreinterpret_s64_u8(
vuzp2_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)));
#else
@@ -486,7 +486,7 @@
}
SIMD_INLINE v64 v64_unziplo_16(v64 x, v64 y) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
return vreinterpret_s64_u16(
vuzp1_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)));
#else
@@ -496,7 +496,7 @@
}
SIMD_INLINE v64 v64_unziphi_16(v64 x, v64 y) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
return vreinterpret_s64_u16(
vuzp2_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)));
#else
diff --git a/aom_ports/arm_cpudetect.c b/aom_ports/arm_cpudetect.c
index e1981c4..276ef61 100644
--- a/aom_ports/arm_cpudetect.c
+++ b/aom_ports/arm_cpudetect.c
@@ -57,16 +57,14 @@
}
#elif defined(_MSC_VER) /* end !CONFIG_RUNTIME_CPU_DETECT || __APPLE__ */
-#if HAVE_NEON && \
- !(defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC))
+#if HAVE_NEON && !AOM_ARCH_AARCH64
/*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/
#undef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#undef WIN32_EXTRA_LEAN
#define WIN32_EXTRA_LEAN
#include <windows.h>
-#endif // HAVE_NEON &&
- // !(defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC))
+#endif // HAVE_NEON && !AOM_ARCH_AARCH64
int aom_arm_cpu_caps(void) {
int flags;
@@ -75,7 +73,7 @@
return flags;
}
mask = arm_cpu_env_mask();
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if AOM_ARCH_AARCH64
return HAS_NEON & mask;
#else
/* MSVC has no inline __asm support for ARM, but it does let you __emit
@@ -94,7 +92,7 @@
}
#endif /* HAVE_NEON */
return flags & mask;
-#endif // defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#endif // AOM_ARCH_AARCH64
}
#elif defined(__ANDROID__) /* end _MSC_VER */
diff --git a/av1/common/arm/cfl_neon.c b/av1/common/arm/cfl_neon.c
index 8c15345..59c1b92 100644
--- a/av1/common/arm/cfl_neon.c
+++ b/av1/common/arm/cfl_neon.c
@@ -132,7 +132,7 @@
}
#if CONFIG_AV1_HIGHBITDEPTH
-#ifndef __aarch64__
+#if !AOM_ARCH_AARCH64
uint16x8_t vpaddq_u16(uint16x8_t a, uint16x8_t b) {
return vcombine_u16(vpadd_u16(vget_low_u16(a), vget_high_u16(a)),
vpadd_u16(vget_low_u16(b), vget_high_u16(b)));
@@ -313,7 +313,7 @@
// Permute and add in such a way that each lane contains the block sum.
// [A+C+B+D, B+D+A+C, C+A+D+B, D+B+C+A]
-#ifdef __aarch64__
+#if AOM_ARCH_AARCH64
sum_32x4 = vpaddq_u32(sum_32x4, sum_32x4);
sum_32x4 = vpaddq_u32(sum_32x4, sum_32x4);
#else
diff --git a/av1/common/arm/convolve_neon.c b/av1/common/arm/convolve_neon.c
index ec9008e..ed47b17 100644
--- a/av1/common/arm/convolve_neon.c
+++ b/av1/common/arm/convolve_neon.c
@@ -44,7 +44,7 @@
return sum;
}
-#if !defined(__aarch64__)
+#if !AOM_ARCH_AARCH64
static INLINE uint8x8_t convolve8_x_4x1(const int16x4_t s0, const int16x4_t s1,
const int16x4_t s2, const int16x4_t s3,
const int16x4_t s4, const int16x4_t s5,
@@ -67,9 +67,9 @@
// We halved the convolution filter values so - 1 from the right shift.
return vqrshrun_n_s16(vcombine_s16(sum, vdup_n_s16(0)), FILTER_BITS - 1);
}
-#endif // !defined(__arch64__)
+#endif // !AOM_ARCH_AARCH64
-#if defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8)
+#if AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
static INLINE int32x4_t convolve12_4_usdot(uint8x16_t samples,
const int8x16_t filters,
@@ -126,7 +126,7 @@
vqrshrn_n_s32(sum[1], FILTER_BITS));
}
-#elif defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+#elif AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
static INLINE int16x4_t convolve12_horiz_4_sdot(
uint8x16_t samples, const int8x16_t filters, const int32x4_t correction,
@@ -257,7 +257,7 @@
vqrshrn_n_s32(sum[1], FILTER_BITS));
}
-#endif // defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8)
+#endif // AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
static INLINE uint8x8_t convolve8_vert_8x4(
const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
@@ -334,7 +334,7 @@
return vqmovun_s16(res);
}
-#if defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8)
+#if AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
void convolve_x_sr_12tap_neon(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
@@ -572,7 +572,7 @@
}
}
-#elif defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+#elif AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
void convolve_x_sr_12tap_neon(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
@@ -830,7 +830,7 @@
}
}
-#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
+#else // !(AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD))
static INLINE uint8x8_t
convolve8_horiz_8x8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
@@ -951,7 +951,7 @@
const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr);
const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8);
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
// This shim of 1 << (ROUND0_BITS - 1) enables us to use a single
// rounding right shift by FILTER_BITS - instead of a first rounding right
// shift by ROUND0_BITS, followed by second rounding right shift by
@@ -1058,10 +1058,10 @@
x_filter_12tap_w4_single_row(src_ptr, src_stride, dst_ptr, dst_stride, w, h,
x_filter_0_7, x_filter_8_11);
}
-#else // !defined(__aarch64__)
+#else // !AOM_ARCH_AARCH64
x_filter_12tap_w4_single_row(src_ptr, src_stride, dst_ptr, dst_stride, w, h,
x_filter_0_7, x_filter_8_11);
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
}
void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
@@ -1083,7 +1083,7 @@
}
uint8x8_t t0;
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
uint8x8_t t1, t2, t3;
// This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single
// rounding right shift by FILTER_BITS - instead of a first rounding right
@@ -1091,11 +1091,11 @@
// FILTER_BITS - ROUND0_BITS.
// The outermost -1 is needed because we halved the filter values.
const int16x8_t horiz_const = vdupq_n_s16(1 << ((ROUND0_BITS - 1) - 1));
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
// Filter values are even so downshift by 1 to reduce precision requirements.
const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
if (h == 4) {
uint8x8_t d01, d23;
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
@@ -1172,18 +1172,18 @@
w -= 4;
} while (w > 0);
} else {
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
int width;
const uint8_t *s;
int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
int16x8_t s8, s9, s10;
uint8x8_t t4, t5, t6, t7;
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
if (w <= 4) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
do {
load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
@@ -1260,7 +1260,7 @@
dst += 8 * dst_stride;
h -= 8;
} while (h > 0);
-#else // !defined(__aarch64__)
+#else // !AOM_ARCH_AARCH64
// This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single
// rounding right shift by FILTER_BITS - instead of a first rounding right
// shift by ROUND0_BITS, followed by second rounding right shift by
@@ -1301,11 +1301,11 @@
}
h -= 1;
} while (h > 0);
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
} else {
uint8_t *d;
int16x8_t s11;
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
int16x8_t s12, s13, s14;
do {
__builtin_prefetch(src + 0 * src_stride);
@@ -1390,7 +1390,7 @@
dst += 8 * dst_stride;
h -= 8;
} while (h > 0);
-#else // !defined(__aarch64__)
+#else // !AOM_ARCH_AARCH64
// This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single
// rounding right shift by FILTER_BITS - instead of a first rounding right
// shift by ROUND0_BITS, followed by second rounding right shift by
@@ -1434,14 +1434,14 @@
dst += dst_stride;
h -= 1;
} while (h > 0);
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
}
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
}
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
}
-#endif // defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8)
+#endif // AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
static INLINE void convolve_y_sr_6tap_neon(const uint8_t *src_ptr,
int src_stride, uint8_t *dst_ptr,
@@ -1452,11 +1452,11 @@
int16x4_t s0, s1, s2, s3, s4, s5, d0;
uint8x8_t d01;
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
uint8x8_t t6, t7, t8;
int16x4_t s6, s7, s8, d1, d2, d3;
uint8x8_t d23;
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
const uint8_t *s = src_ptr + src_stride;
uint8_t *d = dst_ptr;
@@ -1470,7 +1470,7 @@
s += 5 * src_stride;
do {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
load_u8_8x4(s, src_stride, &t5, &t6, &t7, &t8);
s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5)));
s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6)));
@@ -1509,7 +1509,7 @@
s += 4 * src_stride;
d += 4 * dst_stride;
h -= 4;
-#else // !defined(__aarch64__)
+#else // !AOM_ARCH_AARCH64
t5 = vld1_u8(s);
s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5)));
@@ -1530,18 +1530,18 @@
s += src_stride;
d += dst_stride;
h--;
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
} while (h > 0);
} else {
// if width is a multiple of 8 & height is a multiple of 4
uint8x8_t t0, t1, t2, t3, t4, t5;
int16x8_t s0, s1, s2, s3, s4, s5, dd0;
uint8x8_t d0;
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
uint8x8_t t6, t7, t8;
int16x8_t s6, s7, s8, dd1, dd2, dd3;
uint8x8_t d1, d2, d3;
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
do {
int height = h;
@@ -1557,7 +1557,7 @@
s += 5 * src_stride;
do {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
load_u8_8x4(s, src_stride, &t5, &t6, &t7, &t8);
s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
@@ -1588,7 +1588,7 @@
s += 4 * src_stride;
d += 4 * dst_stride;
height -= 4;
-#else // !defined(__aarch64__)
+#else // !AOM_ARCH_AARCH64
t5 = vld1_u8(s);
s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
@@ -1605,7 +1605,7 @@
s += src_stride;
d += dst_stride;
height--;
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
} while (height > 0);
src_ptr += 8;
@@ -1891,10 +1891,10 @@
if (w <= 4) {
uint8x8_t d01;
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
uint8x8_t d23;
int16x4_t s8, s9, s10, d1, d2, d3;
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
src += src_stride;
s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
@@ -1913,7 +1913,7 @@
do {
s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
src += src_stride;
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
src += src_stride;
s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
@@ -1962,7 +1962,7 @@
s6 = s10;
dst += 4 * dst_stride;
h -= 4;
-#else // !defined(__aarch64__)
+#else // !AOM_ARCH_AARCH64
__builtin_prefetch(dst + 0 * dst_stride);
__builtin_prefetch(src + 0 * src_stride);
@@ -1984,7 +1984,7 @@
s6 = s7;
dst += dst_stride;
h -= 1;
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
} while (h > 0);
} else {
int height;
@@ -1992,10 +1992,10 @@
uint8_t *d;
uint8x8_t t0;
int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
uint8x8_t t1, t2, t3;
int16x8_t s8, s9, s10;
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
do {
__builtin_prefetch(src + 0 * src_stride);
__builtin_prefetch(src + 1 * src_stride);
@@ -2025,7 +2025,7 @@
do {
s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
s += src_stride;
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
s += src_stride;
s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
@@ -2060,7 +2060,7 @@
s6 = s10;
d += 4 * dst_stride;
height -= 4;
-#else // !defined(__aarch64__)
+#else // !AOM_ARCH_AARCH64
__builtin_prefetch(d);
__builtin_prefetch(s);
@@ -2077,7 +2077,7 @@
s5 = s6;
s6 = s7;
height -= 1;
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
} while (height > 0);
src += 8;
dst += 8;
@@ -2086,7 +2086,7 @@
}
}
-#if defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8)
+#if AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
static INLINE int16x4_t convolve12_horiz_4_usdot(uint8x16_t samples,
const int8x16_t filters,
@@ -2312,7 +2312,7 @@
}
}
-#elif defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+#elif AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
static INLINE void convolve_2d_sr_horiz_12tap_neon(
const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
@@ -2495,7 +2495,7 @@
}
}
-#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
+#else // !(AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD))
static INLINE int16x4_t convolve12_horiz_4x4_s16(
const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
@@ -2591,7 +2591,7 @@
const int32x4_t horiz_const =
vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
do {
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
uint8x8_t t0, t1, t2, t3;
@@ -2683,15 +2683,15 @@
w, h, x_filter_0_7, x_filter_8_11,
horiz_const);
}
-#else // !defined(__aarch64__)
+#else // !AOM_ARCH_AARCH64
horiz_filter_12tap_w4_single_row(src_ptr, src_stride, dst_ptr, dst_stride, w,
h, x_filter_0_7, x_filter_8_11, horiz_const);
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
}
-#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+#endif // AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
-#if defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8)
+#if AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
static INLINE void convolve_2d_sr_horiz_8tap_neon(
const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w,
@@ -2827,7 +2827,7 @@
}
}
-#elif defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+#elif AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
static INLINE void convolve_2d_sr_horiz_8tap_neon(
const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w,
@@ -2975,7 +2975,7 @@
}
}
-#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
+#else // !(AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD))
// Horizontal filtering for convolve_2d_sr for width multiple of 8
// Processes one row at a time
@@ -3094,7 +3094,7 @@
const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)) +
(1 << ((ROUND0_BITS - 1) - 1)));
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
do {
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
uint8x8_t t0, t1, t2, t3;
@@ -3154,10 +3154,10 @@
height, x_filter, horiz_const);
}
-#else // !defined(__aarch64__)
+#else // !AOM_ARCH_AARCH64
horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, dst_stride, w,
height, x_filter, horiz_const);
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
} else {
// This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
@@ -3166,7 +3166,7 @@
const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) +
(1 << ((ROUND0_BITS - 1) - 1)));
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
for (; height >= 8; height -= 8) {
int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
@@ -3325,14 +3325,14 @@
height, x_filter, horiz_const);
}
-#else // !defined(__aarch64__)
+#else // !AOM_ARCH_AARCH64
horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, dst_stride, w,
height, x_filter, horiz_const);
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
}
}
-#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+#endif // AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
static INLINE int32x4_t convolve12_vert_4_s32(
const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
@@ -3545,10 +3545,10 @@
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
uint8x8_t d01;
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
int16x4_t s8, s9, s10, d1, d2, d3;
uint8x8_t d23;
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
int16_t *s = src_ptr;
uint8_t *d = dst_ptr;
@@ -3557,7 +3557,7 @@
s += 7 * src_stride;
do {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
d0 = convolve8_vert_4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
@@ -3594,7 +3594,7 @@
s += 4 * src_stride;
d += 4 * dst_stride;
h -= 4;
-#else // !defined(__aarch64__)
+#else // !AOM_ARCH_AARCH64
s7 = vld1_s16(s);
s += src_stride;
@@ -3617,16 +3617,16 @@
s6 = s7;
d += dst_stride;
h--;
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
} while (h > 0);
} else {
// if width is a multiple of 8 & height is a multiple of 4
int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
uint8x8_t d0;
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
int16x8_t s8, s9, s10;
uint8x8_t d1, d2, d3;
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
do {
int height = h;
@@ -3637,7 +3637,7 @@
s += 7 * src_stride;
do {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
d0 = convolve8_vert_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
@@ -3665,7 +3665,7 @@
s += 4 * src_stride;
d += 4 * dst_stride;
height -= 4;
-#else // !defined(__aarch64__)
+#else // !AOM_ARCH_AARCH64
s7 = vld1q_s16(s);
d0 = convolve8_vert_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
@@ -3683,7 +3683,7 @@
s += src_stride;
d += dst_stride;
height--;
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
} while (height > 0);
src_ptr += 8;
@@ -3753,10 +3753,10 @@
int16x4_t s0, s1, s2, s3, s4, s5, d0;
uint8x8_t d01;
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
int16x4_t s6, s7, s8, d1, d2, d3;
uint8x8_t d23;
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
int16_t *s = src_ptr;
uint8_t *d = dst_ptr;
@@ -3765,7 +3765,7 @@
s += 5 * src_stride;
do {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8);
d0 = convolve6_vert_4_s32(s0, s1, s2, s3, s4, s5, y_filter);
@@ -3800,7 +3800,7 @@
s += 4 * src_stride;
d += 4 * dst_stride;
h -= 4;
-#else // !defined(__aarch64__)
+#else // !AOM_ARCH_AARCH64
s5 = vld1_s16(s);
d0 = convolve6_vert_4_s32(s0, s1, s2, s3, s4, s5, y_filter);
@@ -3820,16 +3820,16 @@
s += src_stride;
d += dst_stride;
h--;
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
} while (h > 0);
} else {
// if width is a multiple of 8 & height is a multiple of 4
int16x8_t s0, s1, s2, s3, s4, s5;
uint8x8_t d0;
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
int16x8_t s6, s7, s8;
uint8x8_t d1, d2, d3;
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
do {
int height = h;
@@ -3840,7 +3840,7 @@
s += 5 * src_stride;
do {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);
d0 = convolve6_vert_8_s32(s0, s1, s2, s3, s4, s5, y_filter, sub_const);
@@ -3862,7 +3862,7 @@
s += 4 * src_stride;
d += 4 * dst_stride;
height -= 4;
-#else // !defined(__aarch64__)
+#else // !AOM_ARCH_AARCH64
s5 = vld1q_s16(s);
d0 = convolve6_vert_8_s32(s0, s1, s2, s3, s4, s5, y_filter, sub_const);
@@ -3877,7 +3877,7 @@
s += src_stride;
d += dst_stride;
height--;
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
} while (height > 0);
src_ptr += 8;
diff --git a/av1/common/arm/convolve_neon.h b/av1/common/arm/convolve_neon.h
index cbca376..528712c 100644
--- a/av1/common/arm/convolve_neon.h
+++ b/av1/common/arm/convolve_neon.h
@@ -232,7 +232,7 @@
// clang versions < 16 did not include the dotprod feature for Arm architecture
// versions that should have it by default, e.g., armv8.6-a.
-#if defined(__aarch64__) && \
+#if AOM_ARCH_AARCH64 && \
(defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8))
DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
@@ -241,9 +241,9 @@
8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
};
-#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+#endif  // AOM_ARCH_AARCH64 &&
+        // (defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8))
-#if defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8)
+#if AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
static INLINE int16x8_t convolve8_x_8_usdot(uint8x16_t samples,
const int8x8_t filters,
@@ -319,7 +319,7 @@
return sum;
}
-#elif defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+#elif AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
static INLINE int16x8_t convolve8_horiz_8_sdot(uint8x16_t samples,
const int8x8_t filters,
@@ -444,7 +444,7 @@
return vcombine_s16(vmovn_s32(sum[0]), vmovn_s32(sum[1]));
}
-#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+#endif // AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
static INLINE int16x4_t convolve8_4x4_s16(
const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
@@ -508,7 +508,7 @@
return sum;
}
-#if !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
+#if !(AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD))
static INLINE int16x4_t convolve8_horiz_4x4_s16(
const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
@@ -556,6 +556,6 @@
return vshrq_n_s16(sum, ROUND0_BITS - 1);
}
-#endif // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
+#endif // !(AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD))
#endif // AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_
diff --git a/av1/common/arm/highbd_convolve_neon.c b/av1/common/arm/highbd_convolve_neon.c
index d7def2b..979aff4 100644
--- a/av1/common/arm/highbd_convolve_neon.c
+++ b/av1/common/arm/highbd_convolve_neon.c
@@ -1156,7 +1156,7 @@
// 2
const uint32x4_t src_idx_u32 =
vshlq_n_u32(vshrq_n_u32(xqn_idx, SCALE_SUBPEL_BITS), 1);
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
uint64x2_t src4[2];
src4[0] = vaddw_u32(vdupq_n_u64((const uint64_t)src_ptr),
vget_low_u32(src_idx_u32));
@@ -1172,7 +1172,7 @@
int16_t *src4_ptr[4];
uint32_t *tmp_ptr = (uint32_t *)&src4_ptr;
vst1q_u32(tmp_ptr, src4);
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
// Same for the filter vectors
const int32x4_t filter_idx_s32 = vreinterpretq_s32_u32(
vshrq_n_u32(vandq_u32(xqn_idx, subpel_mask), SCALE_EXTRA_BITS));
@@ -1253,7 +1253,7 @@
// = 2
const uint32x4_t src_idx_u32 =
vshlq_n_u32(vshrq_n_u32(xqn_idx, SCALE_SUBPEL_BITS), 1);
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
uint64x2_t src4[2];
src4[0] = vaddw_u32(vdupq_n_u64((const uint64_t)s),
vget_low_u32(src_idx_u32));
@@ -1269,7 +1269,7 @@
int16_t *src4_ptr[4];
uint32_t *tmp_ptr = (uint32_t *)&src4_ptr;
vst1q_u32(tmp_ptr, src4);
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
// Same for the filter vectors
const int32x4_t filter_idx_s32 = vreinterpretq_s32_u32(
vshrq_n_u32(vandq_u32(xqn_idx, subpel_mask), SCALE_EXTRA_BITS));
@@ -2178,7 +2178,7 @@
// negative offsets. Argon test
// profile0_core/streams/test10573_11003.obu was failing because of
// this.
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
uint64x2_t tmp4[2];
tmp4[0] = vreinterpretq_u64_s64(vaddw_s32(
vdupq_n_s64((const int64_t)src_ptr), vget_low_s32(src_idx)));
@@ -2216,7 +2216,7 @@
const int16_t *x_filter4_ptr[4];
tmp_ptr = (uint32_t *)&x_filter4_ptr;
vst1q_u32(tmp_ptr, tmp4);
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
// Load source
s0 = vld1q_s16(src4_ptr[0]);
s1 = vld1q_s16(src4_ptr[1]);
@@ -2296,7 +2296,7 @@
// negative offsets. Argon test
// profile0_core/streams/test10573_11003.obu was failing because of
// this.
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
uint64x2_t tmp4[2];
tmp4[0] = vreinterpretq_u64_s64(
vaddw_s32(vdupq_n_s64((const int64_t)s), vget_low_s32(src_idx)));
@@ -2334,7 +2334,7 @@
const int16_t *x_filter4_ptr[4];
tmp_ptr = (uint32_t *)&x_filter4_ptr;
vst1q_u32(tmp_ptr, tmp4);
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
// Load source
s0 = vld1q_s16(src4_ptr[0]);
diff --git a/av1/common/arm/highbd_inv_txfm_neon.c b/av1/common/arm/highbd_inv_txfm_neon.c
index 96b9738..d197fca 100644
--- a/av1/common/arm/highbd_inv_txfm_neon.c
+++ b/av1/common/arm/highbd_inv_txfm_neon.c
@@ -17,7 +17,7 @@
#include "config/aom_config.h"
#include "config/av1_rtcd.h"
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
#define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3) \
do { \
int32x4x2_t swap_low = vtrnq_s32(x0, x1); \
@@ -49,7 +49,7 @@
y3 = vextq_s32(swap_low.val[1], \
vextq_s32(swap_high.val[1], swap_high.val[1], 2), 2); \
} while (0)
-#endif // (__aarch64__)
+#endif // AOM_ARCH_AARCH64
static INLINE void transpose_4x4(const int32x4_t *in, int32x4_t *out) {
TRANSPOSE_4X4(in[0], in[1], in[2], in[3], out[0], out[1], out[2], out[3]);
@@ -644,12 +644,12 @@
vreinterpretq_s16_s32(u0x.val[1]), vreinterpretq_s16_s32(zero), 1));
u0x = vzipq_s32(u0x.val[0], u0x.val[1]);
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
u0 = vreinterpretq_s32_s64(vzip1q_s64(vreinterpretq_s64_s32(u0x.val[0]),
vreinterpretq_s64_s32(u0x.val[1])));
#else
u0 = vcombine_s32(vget_low_s32(u0x.val[0]), vget_low_s32(u0x.val[1]));
-#endif // (__aarch64__)
+#endif // AOM_ARCH_AARCH64
// u1
int32x4x2_t u1x;
u1x.val[0] = vreinterpretq_s32_s64(
@@ -669,12 +669,12 @@
vreinterpretq_s16_s32(u1x.val[1]), vreinterpretq_s16_s32(zero), 1));
u1x = vzipq_s32(u1x.val[0], u1x.val[1]);
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
u1 = vreinterpretq_s32_s64(vzip1q_s64(vreinterpretq_s64_s32(u1x.val[0]),
vreinterpretq_s64_s32(u1x.val[1])));
#else
u1 = vcombine_s32(vget_low_s32(u1x.val[0]), vget_low_s32(u1x.val[1]));
-#endif // (__aarch64__)
+#endif // AOM_ARCH_AARCH64
// u2
int32x4x2_t u2x;
@@ -695,12 +695,12 @@
vreinterpretq_s16_s32(u2x.val[1]), vreinterpretq_s16_s32(zero), 1));
u2x = vzipq_s32(u2x.val[0], u2x.val[1]);
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
u2 = vreinterpretq_s32_s64(vzip1q_s64(vreinterpretq_s64_s32(u2x.val[0]),
vreinterpretq_s64_s32(u2x.val[1])));
#else
u2 = vcombine_s32(vget_low_s32(u2x.val[0]), vget_low_s32(u2x.val[1]));
-#endif // (__aarch64__)
+#endif // AOM_ARCH_AARCH64
// u3
int32x4x2_t u3x;
@@ -721,12 +721,12 @@
vreinterpretq_s16_s32(u3x.val[1]), vreinterpretq_s16_s32(zero), 1));
u3x = vzipq_s32(u3x.val[0], u3x.val[1]);
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
u3 = vreinterpretq_s32_s64(vzip1q_s64(vreinterpretq_s64_s32(u3x.val[0]),
vreinterpretq_s64_s32(u3x.val[1])));
#else
u3 = vcombine_s32(vget_low_s32(u3x.val[0]), vget_low_s32(u3x.val[1]));
-#endif // (__aarch64__)
+#endif // AOM_ARCH_AARCH64
out[0] = u0;
out[1] = u1;
@@ -809,7 +809,7 @@
vshrq_n_s64(vreinterpretq_s64_s32(a0.val[1]), NewSqrt2Bits));
a0 = vzipq_s32(a0.val[0], a0.val[1]);
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
out[i] = vreinterpretq_s32_s64(vzip1q_s64(
vreinterpretq_s64_s32(a0.val[0]), vreinterpretq_s64_s32(a0.val[1])));
#else
@@ -2824,7 +2824,7 @@
a0.val[1] = vreinterpretq_s32_s64(
vshrq_n_s64(vreinterpretq_s64_s32(a0.val[1]), NewSqrt2Bits));
a0 = vzipq_s32(a0.val[0], a0.val[1]);
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
out[i] = vreinterpretq_s32_s64(vzip1q_s64(
vreinterpretq_s64_s32(a0.val[0]), vreinterpretq_s64_s32(a0.val[1])));
#else
diff --git a/av1/common/arm/jnt_convolve_neon.c b/av1/common/arm/jnt_convolve_neon.c
index 3cb2567..018b2ce 100644
--- a/av1/common/arm/jnt_convolve_neon.c
+++ b/av1/common/arm/jnt_convolve_neon.c
@@ -22,7 +22,7 @@
#include "av1/common/common.h"
#include "av1/common/arm/convolve_neon.h"
-#if !defined(__aarch64__)
+#if !AOM_ARCH_AARCH64
static INLINE void compute_avg_4x1(uint16x4_t dd0, uint16x4_t d0,
const uint16_t fwd_offset,
const uint16_t bck_offset,
@@ -74,7 +74,7 @@
*d0_u8 = vqrshrun_n_s16(dst0, FILTER_BITS - ROUND0_BITS);
}
-#endif // !defined(__arch64__)
+#endif // !AOM_ARCH_AARCH64
static INLINE void compute_avg_4x4(uint16x4_t dd0, uint16x4_t dd1,
uint16x4_t dd2, uint16x4_t dd3,
@@ -177,7 +177,7 @@
*d3_u8 = vqrshrun_n_s16(dst3, FILTER_BITS - ROUND0_BITS);
}
-#if defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8)
+#if AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
static INLINE int16x4_t convolve8_4_2d_h(uint8x16_t samples,
const int8x8_t x_filter,
@@ -295,7 +295,7 @@
}
}
-#elif defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+#elif AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
static INLINE int16x4_t convolve8_4_2d_h(uint8x16_t samples,
const int8x8_t x_filter,
@@ -431,7 +431,7 @@
}
}
-#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
+#else // !(AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD))
static INLINE int16x4_t convolve8_4_2d_h(const int16x4_t s0, const int16x4_t s1,
const int16x4_t s2, const int16x4_t s3,
@@ -492,10 +492,10 @@
if (w == 4) {
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
uint8x8_t t0;
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
int16x4_t s8, s9, s10, d1, d2, d3;
uint8x8_t t1, t2, t3;
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
// A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
// shifts - which are generally faster than rounding shifts on modern CPUs.
@@ -504,7 +504,7 @@
(1 << ((ROUND0_BITS - 1) - 1)));
do {
__builtin_prefetch(src_ptr + 0 * src_stride);
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
__builtin_prefetch(src_ptr + 1 * src_stride);
__builtin_prefetch(src_ptr + 2 * src_stride);
__builtin_prefetch(src_ptr + 3 * src_stride);
@@ -548,7 +548,7 @@
src_ptr += 4 * src_stride;
dst_ptr += 4 * dst_stride;
height -= 4;
-#else // !defined(__aarch64__)
+#else // !AOM_ARCH_AARCH64
t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7
s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); // a0 a1 a2 a3
s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); // a4 a5 a6 a7
@@ -572,16 +572,16 @@
src_ptr += src_stride;
dst_ptr += dst_stride;
height--;
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
} while (height > 0);
} else {
int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, d0;
uint8x8_t t0;
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
int16x8_t s9, s10, s11, s12, s13, s14;
int16x8_t d1, d2, d3, d4, d5, d6, d7;
uint8x8_t t1, t2, t3, t4, t5, t6, t7;
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
// A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
// shifts - which are generally faster than rounding shifts on modern CPUs.
@@ -593,7 +593,7 @@
int16_t *d = dst_ptr;
int width = w;
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
__builtin_prefetch(src_ptr + 0 * src_stride);
__builtin_prefetch(src_ptr + 1 * src_stride);
__builtin_prefetch(src_ptr + 2 * src_stride);
@@ -672,7 +672,7 @@
src_ptr += 8 * src_stride;
dst_ptr += 8 * dst_stride;
height -= 8;
-#else // !defined(__aarch64__)
+#else // !AOM_ARCH_AARCH64
t0 = vld1_u8(src_ptr);
s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7
@@ -703,12 +703,12 @@
src_ptr += src_stride;
dst_ptr += dst_stride;
height--;
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
} while (height > 0);
}
}
-#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+#endif // AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
static INLINE uint16x4_t
convolve6_4_2d_v(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
@@ -779,17 +779,17 @@
int16x4_t s0, s1, s2, s3, s4, s5;
uint16x4_t dd0, d0;
uint8x8_t d01_u8;
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
int16x4_t s6, s7, s8;
uint16x4_t dd1, dd2, dd3, d1, d2, d3;
uint8x8_t d23_u8;
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
load_s16_4x5(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4);
src_ptr += 5 * src_stride;
do {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
load_s16_4x4(src_ptr, src_stride, &s5, &s6, &s7, &s8);
d0 = convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
@@ -821,7 +821,7 @@
src_ptr += 4 * src_stride;
dst_ptr += 4 * dst_stride;
h -= 4;
-#else // !defined(__aarch64__)
+#else // !AOM_ARCH_AARCH64
s5 = vld1_s16(src_ptr);
d0 = convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
@@ -847,17 +847,17 @@
src_ptr += src_stride;
dst_ptr += dst_stride;
h--;
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
} while (h != 0);
} else {
int16x8_t s0, s1, s2, s3, s4, s5;
uint16x8_t dd0, d0;
uint8x8_t d0_u8;
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
int16x8_t s6, s7, s8;
uint16x8_t dd1, dd2, dd3, d1, d2, d3;
uint8x8_t d1_u8, d2_u8, d3_u8;
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
do {
int16_t *s = src_ptr;
@@ -869,7 +869,7 @@
s += 5 * src_stride;
do {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);
d0 = convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
@@ -898,7 +898,7 @@
s += 4 * src_stride;
d += 4 * dst_stride;
height -= 4;
-#else // !defined(__aarch64__)
+#else // !AOM_ARCH_AARCH64
s5 = vld1q_s16(s);
d0 = convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
@@ -923,7 +923,7 @@
s += src_stride;
d += dst_stride;
height--;
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
} while (height != 0);
src_ptr += 8;
dst_ptr += 8;
@@ -1008,17 +1008,17 @@
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
uint16x4_t dd0, d0;
uint8x8_t d01_u8;
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
int16x4_t s8, s9, s10;
uint16x4_t dd1, dd2, dd3, d1, d2, d3;
uint8x8_t d23_u8;
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
load_s16_4x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
src_ptr += 7 * src_stride;
do {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
load_s16_4x4(src_ptr, src_stride, &s7, &s8, &s9, &s10);
d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
@@ -1056,7 +1056,7 @@
src_ptr += 4 * src_stride;
dst_ptr += 4 * dst_stride;
h -= 4;
-#else // !defined(__aarch64__)
+#else // !AOM_ARCH_AARCH64
s7 = vld1_s16(src_ptr);
d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
@@ -1085,17 +1085,17 @@
src_ptr += src_stride;
dst_ptr += dst_stride;
h--;
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
} while (h != 0);
} else {
int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
uint16x8_t dd0, d0;
uint8x8_t d0_u8;
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
int16x8_t s8, s9, s10;
uint16x8_t dd1, dd2, dd3, d1, d2, d3;
uint8x8_t d1_u8, d2_u8, d3_u8;
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
do {
int16_t *s = src_ptr;
@@ -1107,7 +1107,7 @@
s += 7 * src_stride;
do {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
@@ -1142,7 +1142,7 @@
s += 4 * src_stride;
d += 4 * dst_stride;
height -= 4;
-#else // !defined(__aarch64__)
+#else // !AOM_ARCH_AARCH64
s7 = vld1q_s16(s);
d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
@@ -1170,7 +1170,7 @@
s += src_stride;
d += dst_stride;
height--;
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
} while (height != 0);
src_ptr += 8;
dst_ptr += 8;
@@ -1321,7 +1321,7 @@
}
}
-#if defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8)
+#if AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
static INLINE uint16x4_t convolve8_4_x(uint8x16_t samples,
const int8x8_t x_filter,
@@ -1492,7 +1492,7 @@
}
}
-#elif defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+#elif AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
static INLINE uint16x4_t convolve8_4_x(uint8x16_t samples,
const int8x8_t x_filter,
@@ -1678,7 +1678,7 @@
}
}
-#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
+#else // !(AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD))
static INLINE uint16x4_t convolve8_4_x(const int16x4_t s0, const int16x4_t s1,
const int16x4_t s2, const int16x4_t s3,
@@ -1764,19 +1764,19 @@
int height = h;
uint8x8_t t0;
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
uint8x8_t t1, t2, t3, t4, t5, t6, t7;
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
if ((w == 4) || (h == 4)) {
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
uint16x4_t d0, dd0;
uint8x8_t d01;
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
int16x4_t s9, s10;
uint16x4_t d1, d2, d3, dd1, dd2, dd3;
uint8x8_t d23;
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
do {
d = dst_ptr;
@@ -1784,7 +1784,7 @@
width = w;
__builtin_prefetch(src_ptr + 0 * src_stride);
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
__builtin_prefetch(src_ptr + 1 * src_stride);
__builtin_prefetch(src_ptr + 2 * src_stride);
__builtin_prefetch(src_ptr + 3 * src_stride);
@@ -1868,7 +1868,7 @@
dst_ptr += 4 * dst_stride;
dst8_ptr += 4 * dst8_stride;
height -= 4;
-#else // !defined(__aarch64__)
+#else // !AOM_ARCH_AARCH64
t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7
s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
@@ -1917,7 +1917,7 @@
dst_ptr += dst_stride;
dst8_ptr += dst8_stride;
height--;
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
} while (height != 0);
} else {
int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
@@ -1929,7 +1929,7 @@
d_u8 = dst8_ptr;
width = w;
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
int16x8_t s9, s10, s11, s12, s13, s14;
uint16x8_t d1, d2, d3, d4, d5, d6, d7, dd1, dd2, dd3, dd4, dd5, dd6, dd7;
uint8x8_t d1_u8, d2_u8, d3_u8, d4_u8, d5_u8, d6_u8, d7_u8;
@@ -2034,7 +2034,7 @@
dst_ptr += 8 * dst_stride;
dst8_ptr += 8 * dst8_stride;
height -= 8;
-#else // !defined(__aarch64__)
+#else // !AOM_ARCH_AARCH64
__builtin_prefetch(src_ptr);
t0 = vld1_u8(src_ptr);
@@ -2080,12 +2080,12 @@
dst_ptr += dst_stride;
dst8_ptr += dst8_stride;
height--;
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
} while (height != 0);
}
}
-#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+#endif // AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
static INLINE uint16x4_t convolve6_4_y(const int16x4_t s0, const int16x4_t s1,
const int16x4_t s2, const int16x4_t s3,
@@ -2152,11 +2152,11 @@
int16x4_t s0, s1, s2, s3, s4, s5;
uint16x4_t d0, dd0;
uint8x8_t t0, t1, t2, t3, t4, d01;
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
int16x4_t s6, s7, s8;
uint16x4_t d1, d2, d3, dd1, dd2, dd3;
uint8x8_t d23;
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
do {
const uint8_t *s = src_ptr;
@@ -2179,7 +2179,7 @@
s += 5 * src_stride;
do {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
@@ -2223,7 +2223,7 @@
d += 4 * dst_stride;
d_u8 += 4 * dst8_stride;
height -= 4;
-#else // !defined(__aarch64__)
+#else // !AOM_ARCH_AARCH64
t0 = load_unaligned_u8_4x1(s);
s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
@@ -2251,7 +2251,7 @@
d += dst_stride;
d_u8 += dst8_stride;
height--;
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
} while (height != 0);
src_ptr += 4;
dst_ptr += 4;
@@ -2262,11 +2262,11 @@
int16x8_t s0, s1, s2, s3, s4, s5;
uint16x8_t d0, dd0;
uint8x8_t d0_u8, t0, t1, t2, t3, t4;
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
int16x8_t s6, s7, s8, s9, s10, s11, s12;
uint16x8_t d1, d2, d3, d4, d5, d6, d7, dd1, dd2, dd3, dd4, dd5, dd6, dd7;
uint8x8_t d1_u8, d2_u8, d3_u8, d4_u8, d5_u8, d6_u8, d7_u8, t5, t6, t7;
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
do {
const uint8_t *s = src_ptr + (5 * src_stride);
@@ -2283,7 +2283,7 @@
s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
do {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
s5 = vreinterpretq_s16_u16(vmovl_u8(t0));
@@ -2336,7 +2336,7 @@
s += 8 * src_stride;
d += 8 * dst_stride;
height -= 8;
-#else // !defined(__aarch64__)
+#else // !AOM_ARCH_AARCH64
s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
d0 = convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
@@ -2362,7 +2362,7 @@
s += src_stride;
d += dst_stride;
height--;
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
} while (height != 0);
src_ptr += 8;
dst_ptr += 8;
@@ -2441,11 +2441,11 @@
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
uint16x4_t d0, dd0;
uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01;
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
int16x4_t s8, s9, s10;
uint16x4_t d1, d2, d3, dd1, dd2, dd3;
uint8x8_t d23;
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
do {
const uint8_t *s = src_ptr;
@@ -2482,7 +2482,7 @@
s += 7 * src_stride;
do {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
@@ -2538,7 +2538,7 @@
d += 4 * dst_stride;
d_u8 += 4 * dst8_stride;
height -= 4;
-#else // !defined(__aarch64__)
+#else // !AOM_ARCH_AARCH64
t0 = load_unaligned_u8_4x1(s);
s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
@@ -2570,7 +2570,7 @@
d += dst_stride;
d_u8 += dst8_stride;
height--;
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
} while (height != 0);
src_ptr += 4;
dst_ptr += 4;
@@ -2581,11 +2581,11 @@
int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
uint16x8_t d0, dd0;
uint8x8_t d0_u8, t0, t1, t2, t3, t4, t5, t6;
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
int16x8_t s8, s9, s10, s11, s12, s13, s14;
uint16x8_t d1, d2, d3, d4, d5, d6, d7, dd1, dd2, dd3, dd4, dd5, dd6, dd7;
uint8x8_t d1_u8, d2_u8, d3_u8, d4_u8, d5_u8, d6_u8, d7_u8, t7;
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
do {
const uint8_t *s = src_ptr;
@@ -2614,7 +2614,7 @@
s += 7 * src_stride;
do {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
@@ -2685,7 +2685,7 @@
s += 8 * src_stride;
d += 8 * dst_stride;
height -= 8;
-#else // !defined(__aarch64__)
+#else // !AOM_ARCH_AARCH64
s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
__builtin_prefetch(dst_ptr);
@@ -2718,7 +2718,7 @@
s += src_stride;
d += dst_stride;
height--;
-#endif // defined(__aarch64__)
+#endif // AOM_ARCH_AARCH64
} while (height != 0);
src_ptr += 8;
dst_ptr += 8;
diff --git a/av1/common/arm/reconintra_neon.c b/av1/common/arm/reconintra_neon.c
index 43c470f..07e6b0b 100644
--- a/av1/common/arm/reconintra_neon.c
+++ b/av1/common/arm/reconintra_neon.c
@@ -126,7 +126,7 @@
out_45 = vmlaq_s16(out_45, vreinterpretq_s16_u16(p_b_hi), f5f4_hi);
int16x8_t out_67 = vmulq_s16(vreinterpretq_s16_u16(p_b_lo), f7f6_lo);
out_67 = vmlaq_s16(out_67, vreinterpretq_s16_u16(p_b_hi), f7f6_hi);
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
const int16x8_t out_0123 = vpaddq_s16(out_01, out_23);
const int16x8_t out_4567 = vpaddq_s16(out_45, out_67);
const int16x8_t out_01234567 = vpaddq_s16(out_0123, out_4567);
@@ -137,7 +137,7 @@
vqmovn_s32(vpaddlq_s16(out_67)));
const int16x8_t out_01234567 = vcombine_s16(
vqmovn_s32(vpaddlq_s16(out_0123)), vqmovn_s32(vpaddlq_s16(out_4567)));
-#endif // (__aarch64__)
+#endif // AOM_ARCH_AARCH64
const uint32x2_t out_r =
vreinterpret_u32_u8(vqmovun_s16(vrshrq_n_s16(out_01234567, 4)));
// Storing
diff --git a/av1/common/arm/wiener_convolve_neon.c b/av1/common/arm/wiener_convolve_neon.c
index 1d36f68..d7f511d 100644
--- a/av1/common/arm/wiener_convolve_neon.c
+++ b/av1/common/arm/wiener_convolve_neon.c
@@ -153,7 +153,7 @@
height = intermediate_height;
// For aarch_64.
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
int processed_height = 0;
uint16_t *d_tmp;
int width, remaining_height;
diff --git a/av1/encoder/arm/crc32/hash_crc32.c b/av1/encoder/arm/crc32/hash_crc32.c
index dd8685d..5c22e7d 100644
--- a/av1/encoder/arm/crc32/hash_crc32.c
+++ b/av1/encoder/arm/crc32/hash_crc32.c
@@ -37,7 +37,7 @@
const uint8_t *buf = p;
uint32_t crc = 0xFFFFFFFF;
-#if !defined(__aarch64__)
+#if !AOM_ARCH_AARCH64
// Align input to 8-byte boundary (only necessary for 32-bit builds.)
while (len && ((uintptr_t)buf & 7)) {
crc = __crc32cb(crc, *buf++);
diff --git a/av1/encoder/arm/neon/av1_error_neon.c b/av1/encoder/arm/neon/av1_error_neon.c
index 124c1fd..cd7b434 100644
--- a/av1/encoder/arm/neon/av1_error_neon.c
+++ b/av1/encoder/arm/neon/av1_error_neon.c
@@ -48,7 +48,7 @@
block_size -= 8;
} while (block_size != 0);
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
*ssz = vaddvq_s64(sqcoeff);
return vaddvq_s64(error);
#else
diff --git a/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c b/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c
index 3640cf1..ee8b115 100644
--- a/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c
+++ b/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c
@@ -24,7 +24,7 @@
static INLINE void transpose_16bit_4x4(const int16x8_t *const in,
int16x8_t *const out) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
const int16x8_t a0 = vzip1q_s16(in[0], in[1]);
const int16x8_t a1 = vzip1q_s16(in[2], in[3]);
#else
@@ -45,7 +45,7 @@
static INLINE void transpose_16bit_4x8(const int16x8_t *const in,
int16x8_t *const out) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
const int16x8_t a0 = vzip1q_s16(in[0], in[1]);
const int16x8_t a1 = vzip1q_s16(in[2], in[3]);
const int16x8_t a2 = vzip1q_s16(in[4], in[5]);
@@ -67,7 +67,7 @@
const int32x4x2_t b13 =
vzipq_s32(vreinterpretq_s32_s16(a2), vreinterpretq_s32_s16(a3));
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
out[0] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b02.val[0]),
vreinterpretq_s64_s32(b13.val[0])));
out[1] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b02.val[0]),
@@ -100,7 +100,7 @@
const int32x4_t zeros = vdupq_n_s32(0);
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
out[0] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b01.val[0]),
vreinterpretq_s64_s32(zeros)));
out[1] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b01.val[0]),
@@ -149,7 +149,7 @@
const int32x4x2_t b37 = vzipq_s32(vreinterpretq_s32_s16(a26.val[1]),
vreinterpretq_s32_s16(a37.val[1]));
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
out[0] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b04.val[0]),
vreinterpretq_s64_s32(b15.val[0])));
out[1] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b04.val[0]),
diff --git a/av1/encoder/arm/neon/av1_highbd_quantize_neon.c b/av1/encoder/arm/neon/av1_highbd_quantize_neon.c
index 197eae0..5481928 100644
--- a/av1/encoder/arm/neon/av1_highbd_quantize_neon.c
+++ b/av1/encoder/arm/neon/av1_highbd_quantize_neon.c
@@ -65,7 +65,7 @@
}
static INLINE uint16_t get_max_eob(int16x8_t v_eobmax) {
-#ifdef __aarch64__
+#if AOM_ARCH_AARCH64
return (uint16_t)vmaxvq_s16(v_eobmax);
#else
const int16x4_t v_eobmax_3210 =
diff --git a/av1/encoder/arm/neon/av1_k_means_neon.c b/av1/encoder/arm/neon/av1_k_means_neon.c
index d421f76..dd9ab9d 100644
--- a/av1/encoder/arm/neon/av1_k_means_neon.c
+++ b/av1/encoder/arm/neon/av1_k_means_neon.c
@@ -16,7 +16,7 @@
static int32x4_t k_means_multiply_add_neon(const int16x8_t a) {
const int32x4_t l = vmull_s16(vget_low_s16(a), vget_low_s16(a));
const int32x4_t h = vmull_s16(vget_high_s16(a), vget_high_s16(a));
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
return vpaddq_s32(l, h);
#else
const int32x2_t dl = vpadd_s32(vget_low_s32(l), vget_high_s32(l));
diff --git a/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c b/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c
index 3528105..18cd0ce 100644
--- a/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c
+++ b/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c
@@ -24,7 +24,7 @@
// Compute the sum of all pixel differences of this MB.
static INLINE int horizontal_add_s8x16(const int8x16_t v_sum_diff_total) {
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
return vaddlvq_s8(v_sum_diff_total);
#else
const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff_total);
diff --git a/av1/encoder/arm/neon/encodetxb_neon.c b/av1/encoder/arm/neon/encodetxb_neon.c
index 0af2521..e312863 100644
--- a/av1/encoder/arm/neon/encodetxb_neon.c
+++ b/av1/encoder/arm/neon/encodetxb_neon.c
@@ -37,7 +37,7 @@
vcombine_s16(vqmovn_s32(coeffA), vqmovn_s32(coeffB));
const int16x8_t absAB = vqabsq_s16(coeffAB);
const int8x8_t absABs = vqmovn_s16(absAB);
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
const int8x16_t absAB8 =
vcombine_s8(absABs, vreinterpret_s8_s32(vget_low_s32(zeros)));
const uint8x16_t lsAB =
@@ -188,7 +188,7 @@
static INLINE uint8x16_t load_8bit_4x4_to_1_reg(const uint8_t *const src,
const int byte_stride) {
-#ifdef __aarch64__
+#if AOM_ARCH_AARCH64
uint32x4_t v_data = vld1q_u32((uint32_t *)src);
v_data = vld1q_lane_u32((uint32_t *)(src + 1 * byte_stride), v_data, 1);
v_data = vld1q_lane_u32((uint32_t *)(src + 2 * byte_stride), v_data, 2);
@@ -202,7 +202,7 @@
static INLINE uint8x16_t load_8bit_8x2_to_1_reg(const uint8_t *const src,
const int byte_stride) {
-#ifdef __aarch64__
+#if AOM_ARCH_AARCH64
uint64x2_t v_data = vld1q_u64((uint64_t *)src);
v_data = vld1q_lane_u64((uint64_t *)(src + 1 * byte_stride), v_data, 1);
diff --git a/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c b/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c
index 64e4dae..15d375a 100644
--- a/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c
+++ b/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c
@@ -47,7 +47,7 @@
return x;
}
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
#define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3) \
do { \
int32x4x2_t swap_low = vtrnq_s32(x0, x1); \
@@ -79,7 +79,7 @@
y3 = vextq_s32(swap_low.val[1], \
vextq_s32(swap_high.val[1], swap_high.val[1], 2), 2); \
} while (0)
-#endif // (__aarch64__)
+#endif // AOM_ARCH_AARCH64
static INLINE void transpose_4x4(const int32x4_t *in, int32x4_t *out) {
TRANSPOSE_4X4(in[0], in[1], in[2], in[3], out[0], out[1], out[2], out[3]);
diff --git a/av1/encoder/arm/neon/ml_neon.c b/av1/encoder/arm/neon/ml_neon.c
index fcff3a9..f456f55 100644
--- a/av1/encoder/arm/neon/ml_neon.c
+++ b/av1/encoder/arm/neon/ml_neon.c
@@ -46,7 +46,7 @@
vadd = vmlaq_f32(vadd, inputs_h, weights_h);
vadd = vmlaq_f32(vadd, inputs_l, weights_l);
}
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
total += vaddvq_f32(vadd);
#else
float32x2_t vadd_lo = vadd_f32(vget_low_f32(vadd), vget_high_f32(vadd));
@@ -80,7 +80,7 @@
j -= 8;
}
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
total += vaddvq_f32(vadd);
#else
@@ -98,7 +98,7 @@
const float *layer_bias,
float *const output_nodes) {
float total = *layer_bias;
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
const float32x4_t v_inputs = vld1q_f32(inputs);
const float32x4_t v_weights = vld1q_f32(weights);
const float32x4_t vadd = vmulq_f32(v_inputs, v_weights);
@@ -126,7 +126,7 @@
vadd = vmlaq_f32(vadd, v_inputs, v_weights);
}
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
total += vaddvq_f32(vadd);
#else
float32x2_t vadd_lo = vadd_f32(vget_low_f32(vadd), vget_high_f32(vadd));
@@ -159,7 +159,7 @@
}
}
for (int i = 0; i < 2; i++)
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
mul0[i] = vpaddq_f32(mul0[i], mul1[i]);
const float32x4_t hh = vpaddq_f32(mul0[0], mul0[1]);
#else
@@ -197,7 +197,7 @@
}
}
for (int i = 0; i < 4; i++)
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
mul0[i] = vpaddq_f32(mul0[i], mul1[i]);
const float32x4_t hh0 = vpaddq_f32(mul0[0], mul0[1]);
const float32x4_t hh1 = vpaddq_f32(mul0[2], mul0[3]);
@@ -239,7 +239,7 @@
add[i] = vmlaq_f32(add[i], inputs_h, weight_h);
}
}
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
const float32x4_t hadd_h = vpaddq_f32(add[2], add[3]);
const float32x4_t hadd_l = vpaddq_f32(add[0], add[1]);
const float32x4_t haddhadd = vpaddq_f32(hadd_l, hadd_h);
diff --git a/av1/encoder/arm/neon/picksrt_neon.c b/av1/encoder/arm/neon/picksrt_neon.c
index a1e7765..1346d6b 100644
--- a/av1/encoder/arm/neon/picksrt_neon.c
+++ b/av1/encoder/arm/neon/picksrt_neon.c
@@ -141,10 +141,10 @@
}
sum64 = vpaddlq_u32(err0);
}
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
err += vaddvq_u64(sum64);
#else
err += vget_lane_u64(vadd_u64(vget_low_u64(sum64), vget_high_u64(sum64)), 0);
-#endif // __aarch64__
+#endif // AOM_ARCH_AARCH64
return err;
}
diff --git a/av1/encoder/arm/neon/quantize_neon.c b/av1/encoder/arm/neon/quantize_neon.c
index dbfbeef..c590702 100644
--- a/av1/encoder/arm/neon/quantize_neon.c
+++ b/av1/encoder/arm/neon/quantize_neon.c
@@ -26,7 +26,7 @@
#include "av1/encoder/rd.h"
static INLINE uint16_t get_max_eob(int16x8_t v_eobmax) {
-#ifdef __aarch64__
+#if AOM_ARCH_AARCH64
return (uint16_t)vmaxvq_s16(v_eobmax);
#else
const int16x4_t v_eobmax_3210 =
diff --git a/av1/encoder/arm/neon/rdopt_neon.c b/av1/encoder/arm/neon/rdopt_neon.c
index 25df6b4..cf443fd 100644
--- a/av1/encoder/arm/neon/rdopt_neon.c
+++ b/av1/encoder/arm/neon/rdopt_neon.c
@@ -97,7 +97,7 @@
v_x_sum = vpadalq_s32(v_x_sum, x_sum_32);
v_x2_sum = vpadalq_s32(v_x2_sum, x2_sum_32);
}
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
xy_sum = vaddvq_s64(v_xy_sum);
xz_sum = vaddvq_s64(v_xz_sum);
x2_sum = vaddvq_s64(v_x2_sum);
@@ -160,7 +160,7 @@
v_y2_sum = vmlal_s16(v_y2_sum, v_y_hi, v_y_hi);
const int32x4_t v_y_sum_a = vpadalq_s16(v_y_sum, v_y);
const int64x2_t v_xy_sum2 = vpaddlq_s32(v_xy_sum_a);
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
const int64x2_t v_y2_sum_a = vpaddlq_s32(v_y2_sum);
xy_sum += vaddvq_s64(v_xy_sum2);
const int32_t y = vaddvq_s32(v_y_sum_a);
@@ -278,7 +278,7 @@
v_x_sum_a = vpadalq_s16(v_x_sum_a, v_y);
v_x_sum_a = vpadalq_s16(v_x_sum_a, v_w);
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
xy_sum += vaddvq_s64(vpaddlq_s32(v_xy_sum_a));
xz_sum += vaddvq_s64(vpaddlq_s32(v_xz_sum_a));
x_sum += vaddvq_s32(v_x_sum_a);
@@ -398,7 +398,7 @@
v_x2_firstrow = vmlal_s16(v_x2_firstrow, v_diff_lo, v_diff_lo);
v_x2_firstrow = vmlal_s16(v_x2_firstrow, v_diff_hi, v_diff_hi);
}
-#if defined(__aarch64__)
+#if AOM_ARCH_AARCH64
x_firstrow += vaddvq_s32(v_x_firstrow);
x2_firstrow += vaddvq_s32(v_x2_firstrow);
#else
diff --git a/av1/encoder/arm/neon/temporal_filter_neon.c b/av1/encoder/arm/neon/temporal_filter_neon.c
index c89e83e..4d14845 100644
--- a/av1/encoder/arm/neon/temporal_filter_neon.c
+++ b/av1/encoder/arm/neon/temporal_filter_neon.c
@@ -21,7 +21,7 @@
// For the squared error buffer, add padding for 4 samples.
#define SSE_STRIDE (BW + 4)
-#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+#if AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
// clang-format off
@@ -192,7 +192,7 @@
}
}
-#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
+#else // !(AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD))
// When using vld1q_u16_x4 compilers may insert an alignment hint of 256 bits.
DECLARE_ALIGNED(32, static const uint16_t, kSlidingWindowMask[]) = {
@@ -350,7 +350,7 @@
}
}
-#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+#endif // AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
void av1_apply_temporal_filter_neon(
const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
@@ -392,11 +392,11 @@
double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
s_decay = CLIP(s_decay, 1e-5, 1);
double d_factor[4] = { 0 };
-#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+#if AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
uint8_t frame_abs_diff[SSE_STRIDE * BH] = { 0 };
-#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
+#else // !(AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD))
uint16_t frame_sse[SSE_STRIDE * BH] = { 0 };
-#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+#endif // AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
uint32_t luma_sse_sum[BW * BH] = { 0 };
for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
@@ -435,7 +435,7 @@
// search is only done on Y-plane, so the information from Y-plane
// will be more accurate. The luma sse sum is reused in both chroma
// planes.
-#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+#if AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
if (plane == AOM_PLANE_U) {
for (unsigned int i = 0; i < plane_h; i++) {
for (unsigned int j = 0; j < plane_w; j++) {
@@ -460,7 +460,7 @@
count + plane_offset, frame_abs_diff, luma_sse_sum,
inv_num_ref_pixels, decay_factor, inv_factor,
weight_factor, d_factor, tf_wgt_calc_lvl);
-#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
+#else // !(AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD))
if (plane == AOM_PLANE_U) {
for (unsigned int i = 0; i < plane_h; i++) {
for (unsigned int j = 0; j < plane_w; j++) {
@@ -483,7 +483,7 @@
count + plane_offset, frame_sse, luma_sse_sum,
inv_num_ref_pixels, decay_factor, inv_factor,
weight_factor, d_factor, tf_wgt_calc_lvl);
-#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+#endif // AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
plane_offset += plane_h * plane_w;
}