Optimize narrowing sequences in Neon convolution kernels

Use saturating narrowing Neon instructions only where they are actually
needed, and drop redundant max(x, 0) clamps where vqmovun (saturating
narrow from signed to unsigned) already clamps negative values to zero.
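
For illustration, a minimal standalone sketch (not part of the applied
diff) contrasting the old and new narrowing tails for two int32x4_t
accumulators; the function names are made up for this example, and the
non-saturating vmovn_s32 is only valid under the assumption that the
rounded sums fit in 16 bits, as they do in the touched helpers:

  /* Sketch only: both helpers narrow two int32x4_t accumulators to
   * uint8x8_t. The max(x, 0) clamp in the old sequence is redundant
   * because the final saturating narrow already maps negative values
   * to 0; the plain vmovn_s32 in the new sequence assumes the sums
   * fit in 16 bits. */
  #include <arm_neon.h>

  uint8x8_t narrow_old(int32x4_t sum0, int32x4_t sum1) {
    const int32x4_t zero = vdupq_n_s32(0);
    uint16x8_t tmp;
    sum0 = vmaxq_s32(sum0, zero);  /* clamp negatives to 0 */
    sum1 = vmaxq_s32(sum1, zero);
    tmp = vcombine_u16(vqmovn_u32(vreinterpretq_u32_s32(sum0)),
                       vqmovn_u32(vreinterpretq_u32_s32(sum1)));
    return vqmovn_u16(tmp);        /* saturating narrow u16 -> u8 */
  }

  uint8x8_t narrow_new(int32x4_t sum0, int32x4_t sum1) {
    /* Plain narrow s32 -> s16, then saturate s16 -> u8 in one step;
     * vqmovun_s16 clamps negative values to 0, so no explicit max. */
    int16x8_t tmp = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
    return vqmovun_s16(tmp);
  }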

Change-Id: I1bcb3378da81eef06a23b7f83bbb1035f9556a5b
diff --git a/av1/common/arm/convolve_neon.c b/av1/common/arm/convolve_neon.c
index 537d764..28009d8 100644
--- a/av1/common/arm/convolve_neon.c
+++ b/av1/common/arm/convolve_neon.c
@@ -114,7 +114,7 @@
   return vqrshrun_n_s16(sum, FILTER_BITS - 1);
 }
 
-static INLINE uint16x4_t convolve8_vert_4x4_s32(
+static INLINE int16x4_t convolve8_vert_4x4_s32(
     const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
     const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
     const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter,
@@ -122,27 +122,22 @@
     const int32x4_t sub_const_vec) {
   const int16x4_t y_filter_lo = vget_low_s16(y_filter);
   const int16x4_t y_filter_hi = vget_high_s16(y_filter);
-  int32x4_t sum0;
-  uint16x4_t res;
-  const int32x4_t zero = vdupq_n_s32(0);
+  int32x4_t sum;
 
-  sum0 = vmull_lane_s16(s0, y_filter_lo, 0);
-  sum0 = vmlal_lane_s16(sum0, s1, y_filter_lo, 1);
-  sum0 = vmlal_lane_s16(sum0, s2, y_filter_lo, 2);
-  sum0 = vmlal_lane_s16(sum0, s3, y_filter_lo, 3);
-  sum0 = vmlal_lane_s16(sum0, s4, y_filter_hi, 0);
-  sum0 = vmlal_lane_s16(sum0, s5, y_filter_hi, 1);
-  sum0 = vmlal_lane_s16(sum0, s6, y_filter_hi, 2);
-  sum0 = vmlal_lane_s16(sum0, s7, y_filter_hi, 3);
+  sum = vmull_lane_s16(s0, y_filter_lo, 0);
+  sum = vmlal_lane_s16(sum, s1, y_filter_lo, 1);
+  sum = vmlal_lane_s16(sum, s2, y_filter_lo, 2);
+  sum = vmlal_lane_s16(sum, s3, y_filter_lo, 3);
+  sum = vmlal_lane_s16(sum, s4, y_filter_hi, 0);
+  sum = vmlal_lane_s16(sum, s5, y_filter_hi, 1);
+  sum = vmlal_lane_s16(sum, s6, y_filter_hi, 2);
+  sum = vmlal_lane_s16(sum, s7, y_filter_hi, 3);
 
-  sum0 = vaddq_s32(sum0, offset_const);
-  sum0 = vqrshlq_s32(sum0, round_shift_vec);
-  sum0 = vsubq_s32(sum0, sub_const_vec);
-  sum0 = vmaxq_s32(sum0, zero);
+  sum = vaddq_s32(sum, offset_const);
+  sum = vqrshlq_s32(sum, round_shift_vec);
+  sum = vsubq_s32(sum, sub_const_vec);
 
-  res = vmovn_u32(vreinterpretq_u32_s32(sum0));
-
-  return res;
+  return vmovn_s32(sum);
 }
 
 static INLINE uint8x8_t convolve8_vert_8x4_s32(
@@ -154,8 +149,7 @@
   const int16x4_t y_filter_lo = vget_low_s16(y_filter);
   const int16x4_t y_filter_hi = vget_high_s16(y_filter);
   int32x4_t sum0, sum1;
-  uint16x8_t res;
-  const int32x4_t zero = vdupq_n_s32(0);
+  int16x8_t res;
 
   sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_lo, 0);
   sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_lo, 1);
@@ -181,14 +175,11 @@
   sum1 = vqrshlq_s32(sum1, round_shift_vec);
   sum0 = vsubq_s32(sum0, sub_const_vec);
   sum1 = vsubq_s32(sum1, sub_const_vec);
-  sum0 = vmaxq_s32(sum0, zero);
-  sum1 = vmaxq_s32(sum1, zero);
-  res = vcombine_u16(vqmovn_u32(vreinterpretq_u32_s32(sum0)),
-                     vqmovn_u32(vreinterpretq_u32_s32(sum1)));
 
-  res = vqrshlq_u16(res, vec_round_bits);
+  res = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
+  res = vqrshlq_s16(res, vec_round_bits);
 
-  return vqmovn_u16(res);
+  return vqmovun_s16(res);
 }
 
 void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
@@ -1309,15 +1300,13 @@
     width = w;
 
     if (width <= 4) {
-      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
-      uint16x4_t d0;
-      uint16x8_t dd0;
+      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
+      int16x8_t dd0;
       uint8x8_t d01;
 
 #if defined(__aarch64__)
-      int16x4_t s8, s9, s10;
-      uint16x4_t d1, d2, d3;
-      uint16x8_t dd1;
+      int16x4_t s8, s9, s10, d1, d2, d3;
+      int16x8_t dd1;
       uint8x8_t d23;
 #endif
 
@@ -1359,11 +1348,11 @@
                                     round_shift_vec, offset_const,
                                     sub_const_vec);
 
-        dd0 = vqrshlq_u16(vcombine_u16(d0, d1), vec_round_bits);
-        dd1 = vqrshlq_u16(vcombine_u16(d2, d3), vec_round_bits);
+        dd0 = vqrshlq_s16(vcombine_s16(d0, d1), vec_round_bits);
+        dd1 = vqrshlq_s16(vcombine_s16(d2, d3), vec_round_bits);
 
-        d01 = vqmovn_u16(dd0);
-        d23 = vqmovn_u16(dd1);
+        d01 = vqmovun_s16(dd0);
+        d23 = vqmovun_s16(dd1);
 
         if ((w == 4) && (h != 2)) {
           vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
@@ -1425,8 +1414,8 @@
                                     round_shift_vec, offset_const,
                                     sub_const_vec);
 
-        dd0 = vqrshlq_u16(vcombine_u16(d0, d0), vec_round_bits);
-        d01 = vqmovn_u16(dd0);
+        dd0 = vqrshlq_s16(vcombine_s16(d0, d0), vec_round_bits);
+        d01 = vqmovun_s16(dd0);
 
         if (w == 4) {
           vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
diff --git a/av1/common/arm/convolve_neon.h b/av1/common/arm/convolve_neon.h
index 3459ebe..05d781e 100644
--- a/av1/common/arm/convolve_neon.h
+++ b/av1/common/arm/convolve_neon.h
@@ -79,13 +79,11 @@
     const int round1_bits) {
   int16x8_t ss0, ss1, ss2;
   int32x4_t sum0, sum1;
-  uint16x4_t tmp0, tmp1;
-  uint16x8_t tmp;
+  int16x8_t tmp;
   uint8x8_t res;
 
   const int32_t round_const = (1 << (bd + round1_bits - 1));
   const int32x4_t round_bits = vdupq_n_s32(-round1_bits);
-  const int32x4_t zero = vdupq_n_s32(0);
   const int32x4_t round_vec = vdupq_n_s32(round_const);
   const int16x4_t filter = vld1_s16(filter_y);
 
@@ -110,14 +108,9 @@
   sum0 = vrshlq_s32(sum0, round_bits);
   sum1 = vrshlq_s32(sum1, round_bits);
 
-  sum0 = vmaxq_s32(sum0, zero);
-  sum1 = vmaxq_s32(sum1, zero);
-
   /* from int32x4_t to uint8x8_t */
-  tmp0 = vqmovn_u32(vreinterpretq_u32_s32(sum0));
-  tmp1 = vqmovn_u32(vreinterpretq_u32_s32(sum1));
-  tmp = vcombine_u16(tmp0, tmp1);
-  res = vqmovn_u16(tmp);
+  tmp = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
+  res = vqmovun_s16(tmp);
 
   return res;
 }
@@ -185,7 +178,6 @@
   const int32_t round_const_0 = (1 << (bd + FILTER_BITS - 1));
   const int32_t round_const_1 = (1 << (bd + 1 + FILTER_BITS - round0_bits)) - 1;
   const int32x4_t round_bits = vdupq_n_s32(-round0_bits);
-  const int32x4_t zero = vdupq_n_s32(0);
   const int32x4_t round_vec_0 = vdupq_n_s32(round_const_0);
   const int32x4_t round_vec_1 = vdupq_n_s32(round_const_1);
   const int16x4_t filter = vld1_s16(filter_x);
@@ -209,7 +201,6 @@
   sum_0 = vaddq_s32(sum_0, round_vec_0);
   sum_0 = vrshlq_s32(sum_0, round_bits);
 
-  sum_0 = vmaxq_s32(sum_0, zero);
   sum_0 = vminq_s32(sum_0, round_vec_1);
   res = vqmovun_s32(sum_0);
   return res;
@@ -270,25 +261,21 @@
     const int32x4_t round_shift_vec, const int32x4_t offset_const) {
   const int16x4_t y_filter_lo = vget_low_s16(y_filter);
   const int16x4_t y_filter_hi = vget_high_s16(y_filter);
-  int32x4_t sum0;
-  uint16x4_t res;
-  const int32x4_t zero = vdupq_n_s32(0);
+  int32x4_t sum;
 
-  sum0 = vmull_lane_s16(s0, y_filter_lo, 0);
-  sum0 = vmlal_lane_s16(sum0, s1, y_filter_lo, 1);
-  sum0 = vmlal_lane_s16(sum0, s2, y_filter_lo, 2);
-  sum0 = vmlal_lane_s16(sum0, s3, y_filter_lo, 3);
-  sum0 = vmlal_lane_s16(sum0, s4, y_filter_hi, 0);
-  sum0 = vmlal_lane_s16(sum0, s5, y_filter_hi, 1);
-  sum0 = vmlal_lane_s16(sum0, s6, y_filter_hi, 2);
-  sum0 = vmlal_lane_s16(sum0, s7, y_filter_hi, 3);
+  sum = vmull_lane_s16(s0, y_filter_lo, 0);
+  sum = vmlal_lane_s16(sum, s1, y_filter_lo, 1);
+  sum = vmlal_lane_s16(sum, s2, y_filter_lo, 2);
+  sum = vmlal_lane_s16(sum, s3, y_filter_lo, 3);
+  sum = vmlal_lane_s16(sum, s4, y_filter_hi, 0);
+  sum = vmlal_lane_s16(sum, s5, y_filter_hi, 1);
+  sum = vmlal_lane_s16(sum, s6, y_filter_hi, 2);
+  sum = vmlal_lane_s16(sum, s7, y_filter_hi, 3);
 
-  sum0 = vaddq_s32(sum0, offset_const);
-  sum0 = vqrshlq_s32(sum0, round_shift_vec);
-  sum0 = vmaxq_s32(sum0, zero);
-  res = vmovn_u32(vreinterpretq_u32_s32(sum0));
+  sum = vaddq_s32(sum, offset_const);
+  sum = vqrshlq_s32(sum, round_shift_vec);
 
-  return res;
+  return vqmovun_s32(sum);
 }
 
 #endif  // AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_