Optimize vertical filter kernel in av1_warp_affine

Some of the filter kernels used by the vertical_filter_4x1_f1 and
vertical_filter_8x1_f1 functions contain zero-padded coefficients.
Skip the unnecessary multiplications by zero in the Neon (Armv8.0),
Neon I8MM, and SVE implementations to reduce the instruction count,
yielding up to ~5% performance improvement.
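
The pattern, shown here as a minimal scalar sketch (a hypothetical
helper, not the libaom code): testing an outer tap costs a scalar
compare-and-branch per filter phase, but in the vector paths it lets
whole multiply-accumulate instructions covering a row of pixels be
skipped.

  #include <stdint.h>

  // Illustration only: apply an 8-tap filter, skipping the outer taps
  // when their coefficients are zero-padded. The middle taps are
  // applied unconditionally, mirroring the Neon code in this change.
  static int32_t apply_8tap_skip_zeros(const int16_t s[8],
                                       const int16_t f[8]) {
    int32_t acc = 0;
    if (f[0] != 0) acc += s[0] * f[0];
    if (f[1] != 0) acc += s[1] * f[1];
    acc += s[2] * f[2];
    acc += s[3] * f[3];
    acc += s[4] * f[4];
    acc += s[5] * f[5];
    if (f[6] != 0) acc += s[6] * f[6];
    if (f[7] != 0) acc += s[7] * f[7];
    return acc;
  }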

Move the vertical_filter_4x1_f1 and vertical_filter_8x1_f1 functions
into the header file, as their implementations are identical across
all extensions.

Change-Id: I23e8cd8e2516edd3323addd956549ce6e2a6d494
diff --git a/av1/common/arm/warp_plane_neon.c b/av1/common/arm/warp_plane_neon.c
index f4c1377..f7b393a 100644
--- a/av1/common/arm/warp_plane_neon.c
+++ b/av1/common/arm/warp_plane_neon.c
@@ -143,31 +143,6 @@
   return horizontal_filter_8x1_f1_beta0(in, f_s16);
 }
 
-static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src,
-                                                    int32x4_t *res, int sy) {
-  int16x4_t s0 = vget_low_s16(src[0]);
-  int16x4_t s1 = vget_low_s16(src[1]);
-  int16x4_t s2 = vget_low_s16(src[2]);
-  int16x4_t s3 = vget_low_s16(src[3]);
-  int16x4_t s4 = vget_low_s16(src[4]);
-  int16x4_t s5 = vget_low_s16(src[5]);
-  int16x4_t s6 = vget_low_s16(src[6]);
-  int16x4_t s7 = vget_low_s16(src[7]);
-
-  int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]);
-
-  int32x4_t m0123 = vmull_lane_s16(s0, vget_low_s16(f), 0);
-  m0123 = vmlal_lane_s16(m0123, s1, vget_low_s16(f), 1);
-  m0123 = vmlal_lane_s16(m0123, s2, vget_low_s16(f), 2);
-  m0123 = vmlal_lane_s16(m0123, s3, vget_low_s16(f), 3);
-  m0123 = vmlal_lane_s16(m0123, s4, vget_high_s16(f), 0);
-  m0123 = vmlal_lane_s16(m0123, s5, vget_high_s16(f), 1);
-  m0123 = vmlal_lane_s16(m0123, s6, vget_high_s16(f), 2);
-  m0123 = vmlal_lane_s16(m0123, s7, vget_high_s16(f), 3);
-
-  *res = m0123;
-}
-
 static AOM_FORCE_INLINE void vertical_filter_4x1_f4(const int16x8_t *src,
                                                     int32x4_t *res, int sy,
                                                     int gamma) {
@@ -194,43 +169,6 @@
   *res = horizontal_add_4d_s32x4(m0123_pairs);
 }
 
-static AOM_FORCE_INLINE void vertical_filter_8x1_f1(const int16x8_t *src,
-                                                    int32x4_t *res_low,
-                                                    int32x4_t *res_high,
-                                                    int sy) {
-  int16x8_t s0 = src[0];
-  int16x8_t s1 = src[1];
-  int16x8_t s2 = src[2];
-  int16x8_t s3 = src[3];
-  int16x8_t s4 = src[4];
-  int16x8_t s5 = src[5];
-  int16x8_t s6 = src[6];
-  int16x8_t s7 = src[7];
-
-  int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]);
-
-  int32x4_t m0123 = vmull_lane_s16(vget_low_s16(s0), vget_low_s16(f), 0);
-  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s1), vget_low_s16(f), 1);
-  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s2), vget_low_s16(f), 2);
-  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s3), vget_low_s16(f), 3);
-  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s4), vget_high_s16(f), 0);
-  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s5), vget_high_s16(f), 1);
-  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s6), vget_high_s16(f), 2);
-  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s7), vget_high_s16(f), 3);
-
-  int32x4_t m4567 = vmull_lane_s16(vget_high_s16(s0), vget_low_s16(f), 0);
-  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s1), vget_low_s16(f), 1);
-  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s2), vget_low_s16(f), 2);
-  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s3), vget_low_s16(f), 3);
-  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s4), vget_high_s16(f), 0);
-  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s5), vget_high_s16(f), 1);
-  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s6), vget_high_s16(f), 2);
-  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s7), vget_high_s16(f), 3);
-
-  *res_low = m0123;
-  *res_high = m4567;
-}
-
 static AOM_FORCE_INLINE void vertical_filter_8x1_f8(const int16x8_t *src,
                                                     int32x4_t *res_low,
                                                     int32x4_t *res_high, int sy,
diff --git a/av1/common/arm/warp_plane_neon.h b/av1/common/arm/warp_plane_neon.h
index 6c50c41..0bf5c03 100644
--- a/av1/common/arm/warp_plane_neon.h
+++ b/av1/common/arm/warp_plane_neon.h
@@ -24,23 +24,91 @@
 #include "av1/common/warped_motion.h"
 #include "av1/common/scale.h"
 
-static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src,
-                                                    int32x4_t *res, int sy);
-
 static AOM_FORCE_INLINE void vertical_filter_4x1_f4(const int16x8_t *src,
                                                     int32x4_t *res, int sy,
                                                     int gamma);
 
-static AOM_FORCE_INLINE void vertical_filter_8x1_f1(const int16x8_t *src,
-                                                    int32x4_t *res_low,
-                                                    int32x4_t *res_high,
-                                                    int sy);
-
 static AOM_FORCE_INLINE void vertical_filter_8x1_f8(const int16x8_t *src,
                                                     int32x4_t *res_low,
                                                     int32x4_t *res_high, int sy,
                                                     int gamma);
 
+static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src,
+                                                    int32x4_t *res, int sy) {
+  int16_t *f_ptr =
+      (int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS));
+  int16x8_t f = vld1q_s16(f_ptr);
+
+  int32x4_t m0123;
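+  // Some of the filter kernels are zero-padded at the outer taps, so only
+  // accumulate those taps when their coefficients are non-zero.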
+  if (f_ptr[0] != 0) {
+    m0123 = vmull_lane_s16(vget_low_s16(src[0]), vget_low_s16(f), 0);
+  } else {
+    m0123 = vdupq_n_s32(0);
+  }
+  if (f_ptr[1] != 0) {
+    m0123 = vmlal_lane_s16(m0123, vget_low_s16(src[1]), vget_low_s16(f), 1);
+  }
+
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(src[2]), vget_low_s16(f), 2);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(src[3]), vget_low_s16(f), 3);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(src[4]), vget_high_s16(f), 0);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(src[5]), vget_high_s16(f), 1);
+
+  if (f_ptr[6] != 0) {
+    m0123 = vmlal_lane_s16(m0123, vget_low_s16(src[6]), vget_high_s16(f), 2);
+  }
+  if (f_ptr[7] != 0) {
+    m0123 = vmlal_lane_s16(m0123, vget_low_s16(src[7]), vget_high_s16(f), 3);
+  }
+
+  *res = m0123;
+}
+
+static AOM_FORCE_INLINE void vertical_filter_8x1_f1(const int16x8_t *s,
+                                                    int32x4_t *res_low,
+                                                    int32x4_t *res_high,
+                                                    int sy) {
+  int16_t *f_ptr =
+      (int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS));
+  int16x8_t f = vld1q_s16(f_ptr);
+
+  int32x4_t m0123, m4567;
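+  // As in vertical_filter_4x1_f1, only accumulate the outer taps when
+  // their coefficients are non-zero.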
+  if (f_ptr[0] != 0) {
+    m0123 = vmull_lane_s16(vget_low_s16(s[0]), vget_low_s16(f), 0);
+    m4567 = vmull_lane_s16(vget_high_s16(s[0]), vget_low_s16(f), 0);
+  } else {
+    m0123 = vdupq_n_s32(0);
+    m4567 = vdupq_n_s32(0);
+  }
+  if (f_ptr[1] != 0) {
+    m0123 = vmlal_lane_s16(m0123, vget_low_s16(s[1]), vget_low_s16(f), 1);
+    m4567 = vmlal_lane_s16(m4567, vget_high_s16(s[1]), vget_low_s16(f), 1);
+  }
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s[2]), vget_low_s16(f), 2);
+  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s[2]), vget_low_s16(f), 2);
+
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s[3]), vget_low_s16(f), 3);
+  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s[3]), vget_low_s16(f), 3);
+
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s[4]), vget_high_s16(f), 0);
+  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s[4]), vget_high_s16(f), 0);
+
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s[5]), vget_high_s16(f), 1);
+  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s[5]), vget_high_s16(f), 1);
+
+  if (f_ptr[6] != 0) {
+    m0123 = vmlal_lane_s16(m0123, vget_low_s16(s[6]), vget_high_s16(f), 2);
+    m4567 = vmlal_lane_s16(m4567, vget_high_s16(s[6]), vget_high_s16(f), 2);
+  }
+  if (f_ptr[7] != 0) {
+    m0123 = vmlal_lane_s16(m0123, vget_low_s16(s[7]), vget_high_s16(f), 3);
+    m4567 = vmlal_lane_s16(m4567, vget_high_s16(s[7]), vget_high_s16(f), 3);
+  }
+
+  *res_low = m0123;
+  *res_high = m4567;
+}
+
 static AOM_FORCE_INLINE void load_filters_4(int16x8_t out[], int offset,
                                             int stride) {
   out[0] = vld1q_s16(
diff --git a/av1/common/arm/warp_plane_neon_i8mm.c b/av1/common/arm/warp_plane_neon_i8mm.c
index 44689e9..3f16ddd 100644
--- a/av1/common/arm/warp_plane_neon_i8mm.c
+++ b/av1/common/arm/warp_plane_neon_i8mm.c
@@ -244,31 +244,6 @@
   return vreinterpretq_s16_u16(res);
 }
 
-static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src,
-                                                    int32x4_t *res, int sy) {
-  int16x4_t s0 = vget_low_s16(src[0]);
-  int16x4_t s1 = vget_low_s16(src[1]);
-  int16x4_t s2 = vget_low_s16(src[2]);
-  int16x4_t s3 = vget_low_s16(src[3]);
-  int16x4_t s4 = vget_low_s16(src[4]);
-  int16x4_t s5 = vget_low_s16(src[5]);
-  int16x4_t s6 = vget_low_s16(src[6]);
-  int16x4_t s7 = vget_low_s16(src[7]);
-
-  int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]);
-
-  int32x4_t m0123 = vmull_lane_s16(s0, vget_low_s16(f), 0);
-  m0123 = vmlal_lane_s16(m0123, s1, vget_low_s16(f), 1);
-  m0123 = vmlal_lane_s16(m0123, s2, vget_low_s16(f), 2);
-  m0123 = vmlal_lane_s16(m0123, s3, vget_low_s16(f), 3);
-  m0123 = vmlal_lane_s16(m0123, s4, vget_high_s16(f), 0);
-  m0123 = vmlal_lane_s16(m0123, s5, vget_high_s16(f), 1);
-  m0123 = vmlal_lane_s16(m0123, s6, vget_high_s16(f), 2);
-  m0123 = vmlal_lane_s16(m0123, s7, vget_high_s16(f), 3);
-
-  *res = m0123;
-}
-
 static AOM_FORCE_INLINE void vertical_filter_4x1_f4(const int16x8_t *src,
                                                     int32x4_t *res, int sy,
                                                     int gamma) {
@@ -295,43 +270,6 @@
   *res = horizontal_add_4d_s32x4(m0123_pairs);
 }
 
-static AOM_FORCE_INLINE void vertical_filter_8x1_f1(const int16x8_t *src,
-                                                    int32x4_t *res_low,
-                                                    int32x4_t *res_high,
-                                                    int sy) {
-  int16x8_t s0 = src[0];
-  int16x8_t s1 = src[1];
-  int16x8_t s2 = src[2];
-  int16x8_t s3 = src[3];
-  int16x8_t s4 = src[4];
-  int16x8_t s5 = src[5];
-  int16x8_t s6 = src[6];
-  int16x8_t s7 = src[7];
-
-  int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]);
-
-  int32x4_t m0123 = vmull_lane_s16(vget_low_s16(s0), vget_low_s16(f), 0);
-  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s1), vget_low_s16(f), 1);
-  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s2), vget_low_s16(f), 2);
-  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s3), vget_low_s16(f), 3);
-  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s4), vget_high_s16(f), 0);
-  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s5), vget_high_s16(f), 1);
-  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s6), vget_high_s16(f), 2);
-  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s7), vget_high_s16(f), 3);
-
-  int32x4_t m4567 = vmull_lane_s16(vget_high_s16(s0), vget_low_s16(f), 0);
-  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s1), vget_low_s16(f), 1);
-  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s2), vget_low_s16(f), 2);
-  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s3), vget_low_s16(f), 3);
-  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s4), vget_high_s16(f), 0);
-  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s5), vget_high_s16(f), 1);
-  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s6), vget_high_s16(f), 2);
-  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s7), vget_high_s16(f), 3);
-
-  *res_low = m0123;
-  *res_high = m4567;
-}
-
 static AOM_FORCE_INLINE void vertical_filter_8x1_f8(const int16x8_t *src,
                                                     int32x4_t *res_low,
                                                     int32x4_t *res_high, int sy,
diff --git a/av1/common/arm/warp_plane_sve.c b/av1/common/arm/warp_plane_sve.c
index 885ffe8..01a3875 100644
--- a/av1/common/arm/warp_plane_sve.c
+++ b/av1/common/arm/warp_plane_sve.c
@@ -247,31 +247,6 @@
   return vreinterpretq_s16_u16(res);
 }
 
-static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src,
-                                                    int32x4_t *res, int sy) {
-  int16x4_t s0 = vget_low_s16(src[0]);
-  int16x4_t s1 = vget_low_s16(src[1]);
-  int16x4_t s2 = vget_low_s16(src[2]);
-  int16x4_t s3 = vget_low_s16(src[3]);
-  int16x4_t s4 = vget_low_s16(src[4]);
-  int16x4_t s5 = vget_low_s16(src[5]);
-  int16x4_t s6 = vget_low_s16(src[6]);
-  int16x4_t s7 = vget_low_s16(src[7]);
-
-  int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]);
-
-  int32x4_t m0123 = vmull_lane_s16(s0, vget_low_s16(f), 0);
-  m0123 = vmlal_lane_s16(m0123, s1, vget_low_s16(f), 1);
-  m0123 = vmlal_lane_s16(m0123, s2, vget_low_s16(f), 2);
-  m0123 = vmlal_lane_s16(m0123, s3, vget_low_s16(f), 3);
-  m0123 = vmlal_lane_s16(m0123, s4, vget_high_s16(f), 0);
-  m0123 = vmlal_lane_s16(m0123, s5, vget_high_s16(f), 1);
-  m0123 = vmlal_lane_s16(m0123, s6, vget_high_s16(f), 2);
-  m0123 = vmlal_lane_s16(m0123, s7, vget_high_s16(f), 3);
-
-  *res = m0123;
-}
-
 static AOM_FORCE_INLINE void vertical_filter_4x1_f4(const int16x8_t *src,
                                                     int32x4_t *res, int sy,
                                                     int gamma) {
@@ -295,43 +270,6 @@
   *res = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23));
 }
 
-static AOM_FORCE_INLINE void vertical_filter_8x1_f1(const int16x8_t *src,
-                                                    int32x4_t *res_low,
-                                                    int32x4_t *res_high,
-                                                    int sy) {
-  int16x8_t s0 = src[0];
-  int16x8_t s1 = src[1];
-  int16x8_t s2 = src[2];
-  int16x8_t s3 = src[3];
-  int16x8_t s4 = src[4];
-  int16x8_t s5 = src[5];
-  int16x8_t s6 = src[6];
-  int16x8_t s7 = src[7];
-
-  int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]);
-
-  int32x4_t m0123 = vmull_lane_s16(vget_low_s16(s0), vget_low_s16(f), 0);
-  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s1), vget_low_s16(f), 1);
-  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s2), vget_low_s16(f), 2);
-  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s3), vget_low_s16(f), 3);
-  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s4), vget_high_s16(f), 0);
-  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s5), vget_high_s16(f), 1);
-  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s6), vget_high_s16(f), 2);
-  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s7), vget_high_s16(f), 3);
-
-  int32x4_t m4567 = vmull_lane_s16(vget_high_s16(s0), vget_low_s16(f), 0);
-  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s1), vget_low_s16(f), 1);
-  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s2), vget_low_s16(f), 2);
-  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s3), vget_low_s16(f), 3);
-  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s4), vget_high_s16(f), 0);
-  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s5), vget_high_s16(f), 1);
-  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s6), vget_high_s16(f), 2);
-  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s7), vget_high_s16(f), 3);
-
-  *res_low = m0123;
-  *res_high = m4567;
-}
-
 static AOM_FORCE_INLINE void vertical_filter_8x1_f8(const int16x8_t *src,
                                                     int32x4_t *res_low,
                                                     int32x4_t *res_high, int sy,