highbd_loopfilter_neon: remove armv7 zero-vector check
The check is slower on armv7 devices (s3/Nexus S/Nexus 7) due to the
transition to and from NEON.
Bug: b/217462944
Change-Id: I7ba09ccfb791b1ad88ebc1648c1ceb7fd86c91c1
diff --git a/aom_dsp/arm/highbd_loopfilter_neon.c b/aom_dsp/arm/highbd_loopfilter_neon.c
index 4537513..7d62515 100644
--- a/aom_dsp/arm/highbd_loopfilter_neon.c
+++ b/aom_dsp/arm/highbd_loopfilter_neon.c
@@ -252,13 +252,6 @@
// None of the values will be filtered.
return;
}
-#else // !defined(__aarch64__)
- const uint64x1_t needs_filter4_mask64 =
- vreinterpret_u64_u16(needs_filter4_mask);
- if (vget_lane_u64(needs_filter4_mask64, 0) == 0) {
- // None of the values will be filtered.
- return;
- }
#endif // defined(__aarch64__)
// Copy the masks to the high bits for packed comparisons later.
@@ -325,13 +318,6 @@
// None of the values will be filtered.
return;
}
-#else // !defined(__aarch64__)
- const uint64x1_t needs_filter4_mask64 =
- vreinterpret_u64_u16(needs_filter4_mask);
- if (vget_lane_u64(needs_filter4_mask64, 0) == 0) {
- // None of the values will be filtered.
- return;
- }
#endif // defined(__aarch64__)
// Copy the masks to the high bits for packed comparisons later.
@@ -456,15 +442,6 @@
// None of the values will be filtered.
return;
}
-#else // !defined(__aarch64__)
- // This might be faster than vaddv (latency 3) because mov to general register
- // has latency 2.
- const uint64x1_t needs_filter_mask64 =
- vreinterpret_u64_u16(needs_filter_mask);
- if (vget_lane_u64(needs_filter_mask64, 0) == 0) {
- // None of the values will be filtered.
- return;
- }
#endif // defined(__aarch64__)
// Copy the masks to the high bits for packed comparisons later.
@@ -556,15 +533,6 @@
// None of the values will be filtered.
return;
}
-#else // !defined(__aarch64__)
- // This might be faster than vaddv (latency 3) because mov to general register
- // has latency 2.
- const uint64x1_t needs_filter_mask64 =
- vreinterpret_u64_u16(needs_filter_mask);
- if (vget_lane_u64(needs_filter_mask64, 0) == 0) {
- // None of the values will be filtered.
- return;
- }
#endif // defined(__aarch64__)
// Copy the masks to the high bits for packed comparisons later.
@@ -721,15 +689,6 @@
// None of the values will be filtered.
return;
}
-#else // !defined(__aarch64__)
- // This might be faster than vaddv (latency 3) because mov to general register
- // has latency 2.
- const uint64x1_t needs_filter_mask64 =
- vreinterpret_u64_u16(needs_filter_mask);
- if (vget_lane_u64(needs_filter_mask64, 0) == 0) {
- // None of the values will be filtered.
- return;
- }
#endif // defined(__aarch64__)
// Copy the masks to the high bits for packed comparisons later.
@@ -829,15 +788,6 @@
// None of the values will be filtered.
return;
}
-#else // !defined(__aarch64__)
- // This might be faster than vaddv (latency 3) because mov to general register
- // has latency 2.
- const uint64x1_t needs_filter_mask64 =
- vreinterpret_u64_u16(needs_filter_mask);
- if (vget_lane_u64(needs_filter_mask64, 0) == 0) {
- // None of the values will be filtered.
- return;
- }
#endif // defined(__aarch64__)
// Copy the masks to the high bits for packed comparisons later.
@@ -1031,15 +981,6 @@
// None of the values will be filtered.
return;
}
-#else // !defined(__aarch64__)
- // This might be faster than vaddv (latency 3) because mov to general register
- // has latency 2.
- const uint64x1_t needs_filter_mask64 =
- vreinterpret_u64_u16(needs_filter_mask);
- if (vget_lane_u64(needs_filter_mask64, 0) == 0) {
- // None of the values will be filtered.
- return;
- }
#endif // defined(__aarch64__)
const uint16x8_t p4q4 = vcombine_u16(src[2], src[11]);
const uint16x8_t p5q5 = vcombine_u16(src[1], src[12]);
@@ -1208,15 +1149,6 @@
// None of the values will be filtered.
return;
}
-#else // !defined(__aarch64__)
- // This might be faster than vaddv (latency 3) because mov to general register
- // has latency 2.
- const uint64x1_t needs_filter_mask64 =
- vreinterpret_u64_u16(needs_filter_mask);
- if (vget_lane_u64(needs_filter_mask64, 0) == 0) {
- // None of the values will be filtered.
- return;
- }
#endif // defined(__aarch64__)
const uint16x8_t p4q4 =
vcombine_u16(vget_low_u16(src_p[3]), vget_high_u16(src_q[0]));