highbd_loopfilter_neon: rm armv7 zero-vector check

This check is slower on armv7 devices (S3 / Nexus S / Nexus 7) due to
the transition to and from NEON.
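
With the check removed, armv7 simply runs the filter unconditionally,
while aarch64 keeps its early exit. A rough illustrative sketch of the
two patterns (mask_is_zero is a hypothetical helper, not code from this
file; the aarch64 branch assumes a vaddv-style horizontal add, as the
removed comments suggest):

  #include <arm_neon.h>

  // Sketch only: early-exit test for an all-zero 4-lane mask.
  static inline int mask_is_zero(uint16x4_t needs_filter4_mask) {
  #if defined(__aarch64__)
    // Kept path: the horizontal add stays inside the NEON unit.
    return vaddv_u16(needs_filter4_mask) == 0;
  #else
    // Removed path: vget_lane_u64 moves the mask into a core register,
    // which is the costly NEON transition on these armv7 cores.
    return vget_lane_u64(vreinterpret_u64_u16(needs_filter4_mask), 0) == 0;
  #endif
  }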

Bug: b/217462944
Change-Id: I7ba09ccfb791b1ad88ebc1648c1ceb7fd86c91c1
diff --git a/aom_dsp/arm/highbd_loopfilter_neon.c b/aom_dsp/arm/highbd_loopfilter_neon.c
index 4537513..7d62515 100644
--- a/aom_dsp/arm/highbd_loopfilter_neon.c
+++ b/aom_dsp/arm/highbd_loopfilter_neon.c
@@ -252,13 +252,6 @@
     // None of the values will be filtered.
     return;
   }
-#else   // !defined(__aarch64__)
-  const uint64x1_t needs_filter4_mask64 =
-      vreinterpret_u64_u16(needs_filter4_mask);
-  if (vget_lane_u64(needs_filter4_mask64, 0) == 0) {
-    // None of the values will be filtered.
-    return;
-  }
 #endif  // defined(__aarch64__)
 
   // Copy the masks to the high bits for packed comparisons later.
@@ -325,13 +318,6 @@
     // None of the values will be filtered.
     return;
   }
-#else   // !defined(__aarch64__)
-  const uint64x1_t needs_filter4_mask64 =
-      vreinterpret_u64_u16(needs_filter4_mask);
-  if (vget_lane_u64(needs_filter4_mask64, 0) == 0) {
-    // None of the values will be filtered.
-    return;
-  }
 #endif  // defined(__aarch64__)
 
   // Copy the masks to the high bits for packed comparisons later.
@@ -456,15 +442,6 @@
     // None of the values will be filtered.
     return;
   }
-#else   // !defined(__aarch64__)
-  // This might be faster than vaddv (latency 3) because mov to general register
-  // has latency 2.
-  const uint64x1_t needs_filter_mask64 =
-      vreinterpret_u64_u16(needs_filter_mask);
-  if (vget_lane_u64(needs_filter_mask64, 0) == 0) {
-    // None of the values will be filtered.
-    return;
-  }
 #endif  // defined(__aarch64__)
 
   // Copy the masks to the high bits for packed comparisons later.
@@ -556,15 +533,6 @@
     // None of the values will be filtered.
     return;
   }
-#else   // !defined(__aarch64__)
-  // This might be faster than vaddv (latency 3) because mov to general register
-  // has latency 2.
-  const uint64x1_t needs_filter_mask64 =
-      vreinterpret_u64_u16(needs_filter_mask);
-  if (vget_lane_u64(needs_filter_mask64, 0) == 0) {
-    // None of the values will be filtered.
-    return;
-  }
 #endif  // defined(__aarch64__)
 
   // Copy the masks to the high bits for packed comparisons later.
@@ -721,15 +689,6 @@
     // None of the values will be filtered.
     return;
   }
-#else   // !defined(__aarch64__)
-  // This might be faster than vaddv (latency 3) because mov to general register
-  // has latency 2.
-  const uint64x1_t needs_filter_mask64 =
-      vreinterpret_u64_u16(needs_filter_mask);
-  if (vget_lane_u64(needs_filter_mask64, 0) == 0) {
-    // None of the values will be filtered.
-    return;
-  }
 #endif  // defined(__aarch64__)
 
   // Copy the masks to the high bits for packed comparisons later.
@@ -829,15 +788,6 @@
     // None of the values will be filtered.
     return;
   }
-#else   // !defined(__aarch64__)
-  // This might be faster than vaddv (latency 3) because mov to general register
-  // has latency 2.
-  const uint64x1_t needs_filter_mask64 =
-      vreinterpret_u64_u16(needs_filter_mask);
-  if (vget_lane_u64(needs_filter_mask64, 0) == 0) {
-    // None of the values will be filtered.
-    return;
-  }
 #endif  // defined(__aarch64__)
 
   // Copy the masks to the high bits for packed comparisons later.
@@ -1031,15 +981,6 @@
     // None of the values will be filtered.
     return;
   }
-#else   // !defined(__aarch64__)
-  // This might be faster than vaddv (latency 3) because mov to general register
-  // has latency 2.
-  const uint64x1_t needs_filter_mask64 =
-      vreinterpret_u64_u16(needs_filter_mask);
-  if (vget_lane_u64(needs_filter_mask64, 0) == 0) {
-    // None of the values will be filtered.
-    return;
-  }
 #endif  // defined(__aarch64__)
   const uint16x8_t p4q4 = vcombine_u16(src[2], src[11]);
   const uint16x8_t p5q5 = vcombine_u16(src[1], src[12]);
@@ -1208,15 +1149,6 @@
     // None of the values will be filtered.
     return;
   }
-#else   // !defined(__aarch64__)
-  // This might be faster than vaddv (latency 3) because mov to general register
-  // has latency 2.
-  const uint64x1_t needs_filter_mask64 =
-      vreinterpret_u64_u16(needs_filter_mask);
-  if (vget_lane_u64(needs_filter_mask64, 0) == 0) {
-    // None of the values will be filtered.
-    return;
-  }
 #endif  // defined(__aarch64__)
   const uint16x8_t p4q4 =
       vcombine_u16(vget_low_u16(src_p[3]), vget_high_u16(src_q[0]));