Optimize Neon I8MM aom_convolve8* implementations

The convolution filters used with the aom_convolve8* functions all have
even coefficient values. Halving these filter coefficients reduces the
intermediate precision requirements inside the convolution kernels and
allows us to avoid some slower saturating narrowing instructions in the
Neon I8MM aom_convolve8* functions.

This patch applies this optimization in the following cases:
- 8-tap horizontal
- 8-tap vertical
- 6-tap vertical (covered by the 8-tap vertical code path)
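
As an illustration only (not part of the patch), a minimal scalar sketch
of the transformation, assuming all filter taps are even and
FILTER_BITS == 7; the function name is hypothetical:

  #include <stdint.h>

  #define FILTER_BITS 7

  // Scalar model of one output pixel. filter[] holds the original 16-bit
  // taps; since they are all even, (filter[k] >> 1) loses no information.
  static uint8_t convolve8_scalar_halved(const uint8_t *src,
                                         const int16_t *filter) {
    int32_t sum = 0;
    for (int k = 0; k < 8; ++k) {
      // Halved taps keep the accumulated sum within 16-bit range, which is
      // why the vector code can use vmovn_s32 instead of vqmovn_s32.
      sum += src[k] * (filter[k] >> 1);
    }
    // Compensate for the halved taps by rounding and shifting by one bit
    // less: (sum/2 + 32) >> 6 == (sum + 64) >> 7 when all taps are even.
    int32_t res = (sum + (1 << (FILTER_BITS - 2))) >> (FILTER_BITS - 1);
    // Clamp to the unsigned 8-bit output range (vqrshrun in the Neon code).
    if (res < 0) res = 0;
    if (res > 255) res = 255;
    return (uint8_t)res;
  }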

Change-Id: I55072e3be3371cc40f9ffe0f9b1fee308064a8fd
diff --git a/aom_dsp/arm/aom_convolve8_neon_i8mm.c b/aom_dsp/arm/aom_convolve8_neon_i8mm.c
index 121e892..6d1ab96 100644
--- a/aom_dsp/arm/aom_convolve8_neon_i8mm.c
+++ b/aom_dsp/arm/aom_convolve8_neon_i8mm.c
@@ -61,7 +61,7 @@
   sum = vusdotq_lane_s32(sum, permuted_samples[1], filters, 1);
 
   // Further narrowing and packing is performed by the caller.
-  return vqmovn_s32(sum);
+  return vmovn_s32(sum);
 }
 
 static inline uint8x8_t convolve8_8_h(const uint8x16_t samples,
@@ -85,14 +85,16 @@
   sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filters, 1);
 
   // Narrow and re-pack.
-  int16x8_t sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
-  return vqrshrun_n_s16(sum, FILTER_BITS);
+  int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
+  // We halved the filter values so -1 from right shift.
+  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
 }
 
 static inline void convolve8_horiz_8tap_neon_i8mm(
     const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
     ptrdiff_t dst_stride, const int16_t *filter_x, int w, int h) {
-  const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x));
+  // Filter values are even, so halve to reduce intermediate precision reqs.
+  const int8x8_t filter = vshrn_n_s16(vld1q_s16(filter_x), 1);
 
   if (w == 4) {
     const uint8x16x2_t perm_tbl = vld1q_u8_x2(kDotProdPermuteTbl);
@@ -104,8 +106,9 @@
       int16x4_t d1 = convolve8_4_h(s1, filter, perm_tbl);
       int16x4_t d2 = convolve8_4_h(s2, filter, perm_tbl);
       int16x4_t d3 = convolve8_4_h(s3, filter, perm_tbl);
-      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
-      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+      // We halved the filter values so -1 from right shift.
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
 
       store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
       store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
@@ -326,7 +329,7 @@
   sum = vusdotq_lane_s32(sum, samples_hi, filters, 1);
 
   // Further narrowing and packing is performed by the caller.
-  return vqmovn_s32(sum);
+  return vmovn_s32(sum);
 }
 
 static inline uint8x8_t convolve8_8_v(const uint8x16_t samples0_lo,
@@ -344,14 +347,16 @@
   sum1 = vusdotq_lane_s32(sum1, samples1_hi, filters, 1);
 
   // Narrow and re-pack.
-  int16x8_t sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
-  return vqrshrun_n_s16(sum, FILTER_BITS);
+  int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
+  // We halved the filter values so -1 from right shift.
+  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
 }
 
 static inline void convolve8_vert_8tap_neon_i8mm(
     const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
     ptrdiff_t dst_stride, const int16_t *filter_y, int w, int h) {
-  const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y));
+  // Filter values are even, so halve to reduce intermediate precision reqs.
+  const int8x8_t filter = vshrn_n_s16(vld1q_s16(filter_y), 1);
   const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl);
   uint8x16x2_t samples_LUT;
 
@@ -386,8 +391,9 @@
       int16x4_t d1 = convolve8_4_v(s1234, s5678, filter);
       int16x4_t d2 = convolve8_4_v(s2345, s6789, filter);
       int16x4_t d3 = convolve8_4_v(s3456, s78910, filter);
-      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
-      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+      // We halved the filter values so -1 from right shift.
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
 
       store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
       store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);