highbd_loopfilter_neon: rm armv7 zero-vector check

This check is slower on armv7 devices (S3 / Nexus S / Nexus 7) due to
the transition to and from NEON.
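
With the check removed, armv7 simply runs the filter unconditionally,
while aarch64 keeps its early exit. A rough illustrative sketch of the
two patterns (mask_is_zero is a hypothetical helper, not code from this
file; the aarch64 branch assumes a vaddv-style horizontal add, as the
removed comments suggest):

  #include <arm_neon.h>

  // Sketch only: early-exit test for an all-zero 4-lane mask.
  static inline int mask_is_zero(uint16x4_t needs_filter4_mask) {
  #if defined(__aarch64__)
    // Kept path: the horizontal add stays inside the NEON unit.
    return vaddv_u16(needs_filter4_mask) == 0;
  #else
    // Removed path: vget_lane_u64 moves the mask into a core register,
    // which is the costly NEON transition on these armv7 cores.
    return vget_lane_u64(vreinterpret_u64_u16(needs_filter4_mask), 0) == 0;
  #endif
  }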

Bug: b/217462944
Change-Id: I7ba09ccfb791b1ad88ebc1648c1ceb7fd86c91c1
diff --git a/aom_dsp/arm/highbd_loopfilter_neon.c b/aom_dsp/arm/highbd_loopfilter_neon.c
index 4537513..7d62515 100644
--- a/aom_dsp/arm/highbd_loopfilter_neon.c
+++ b/aom_dsp/arm/highbd_loopfilter_neon.c
@@ -252,13 +252,6 @@
     // None of the values will be filtered.
     return;
   }
-#else   // !defined(__aarch64__)
-  const uint64x1_t needs_filter4_mask64 =
-      vreinterpret_u64_u16(needs_filter4_mask);
-  if (vget_lane_u64(needs_filter4_mask64, 0) == 0) {
-    // None of the values will be filtered.
-    return;
-  }
 #endif  // defined(__aarch64__)
 
   // Copy the masks to the high bits for packed comparisons later.
@@ -325,13 +318,6 @@
     // None of the values will be filtered.
     return;
   }
-#else   // !defined(__aarch64__)
-  const uint64x1_t needs_filter4_mask64 =
-      vreinterpret_u64_u16(needs_filter4_mask);
-  if (vget_lane_u64(needs_filter4_mask64, 0) == 0) {
-    // None of the values will be filtered.
-    return;
-  }
 #endif  // defined(__aarch64__)
 
   // Copy the masks to the high bits for packed comparisons later.
@@ -456,15 +442,6 @@
     // None of the values will be filtered.
     return;
   }
-#else   // !defined(__aarch64__)
-  // This might be faster than vaddv (latency 3) because mov to general register
-  // has latency 2.
-  const uint64x1_t needs_filter_mask64 =
-      vreinterpret_u64_u16(needs_filter_mask);
-  if (vget_lane_u64(needs_filter_mask64, 0) == 0) {
-    // None of the values will be filtered.
-    return;
-  }
 #endif  // defined(__aarch64__)
 
   // Copy the masks to the high bits for packed comparisons later.
@@ -556,15 +533,6 @@
     // None of the values will be filtered.
     return;
   }
-#else   // !defined(__aarch64__)
-  // This might be faster than vaddv (latency 3) because mov to general register
-  // has latency 2.
-  const uint64x1_t needs_filter_mask64 =
-      vreinterpret_u64_u16(needs_filter_mask);
-  if (vget_lane_u64(needs_filter_mask64, 0) == 0) {
-    // None of the values will be filtered.
-    return;
-  }
 #endif  // defined(__aarch64__)
 
   // Copy the masks to the high bits for packed comparisons later.
@@ -721,15 +689,6 @@
     // None of the values will be filtered.
     return;
   }
-#else   // !defined(__aarch64__)
-  // This might be faster than vaddv (latency 3) because mov to general register
-  // has latency 2.
-  const uint64x1_t needs_filter_mask64 =
-      vreinterpret_u64_u16(needs_filter_mask);
-  if (vget_lane_u64(needs_filter_mask64, 0) == 0) {
-    // None of the values will be filtered.
-    return;
-  }
 #endif  // defined(__aarch64__)
 
   // Copy the masks to the high bits for packed comparisons later.
@@ -829,15 +788,6 @@
     // None of the values will be filtered.
     return;
   }
-#else   // !defined(__aarch64__)
-  // This might be faster than vaddv (latency 3) because mov to general register
-  // has latency 2.
-  const uint64x1_t needs_filter_mask64 =
-      vreinterpret_u64_u16(needs_filter_mask);
-  if (vget_lane_u64(needs_filter_mask64, 0) == 0) {
-    // None of the values will be filtered.
-    return;
-  }
 #endif  // defined(__aarch64__)
 
   // Copy the masks to the high bits for packed comparisons later.
@@ -1031,15 +981,6 @@
     // None of the values will be filtered.
     return;
   }
-#else   // !defined(__aarch64__)
-  // This might be faster than vaddv (latency 3) because mov to general register
-  // has latency 2.
-  const uint64x1_t needs_filter_mask64 =
-      vreinterpret_u64_u16(needs_filter_mask);
-  if (vget_lane_u64(needs_filter_mask64, 0) == 0) {
-    // None of the values will be filtered.
-    return;
-  }
 #endif  // defined(__aarch64__)
   const uint16x8_t p4q4 = vcombine_u16(src[2], src[11]);
   const uint16x8_t p5q5 = vcombine_u16(src[1], src[12]);
@@ -1208,15 +1149,6 @@
     // None of the values will be filtered.
     return;
   }
-#else   // !defined(__aarch64__)
-  // This might be faster than vaddv (latency 3) because mov to general register
-  // has latency 2.
-  const uint64x1_t needs_filter_mask64 =
-      vreinterpret_u64_u16(needs_filter_mask);
-  if (vget_lane_u64(needs_filter_mask64, 0) == 0) {
-    // None of the values will be filtered.
-    return;
-  }
 #endif  // defined(__aarch64__)
   const uint16x8_t p4q4 =
       vcombine_u16(vget_low_u16(src_p[3]), vget_high_u16(src_q[0]));