Optimize 8-tap Neon DotProd path for av1_convolve_y_sr

The 8-tap convolution filters used with av1_convolve_y_sr all have
even coefficient values. Halving these filter coefficients reduces the
intermediate precision requirements inside the convolution kernels and
allows us to avoid some slower saturating narrow instructions.
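
As a rough illustration of the precision argument, consider the scalar
sketch below (not part of the patch; the tap values are hypothetical,
but, like the real filters, even-valued and summing to 1 << FILTER_BITS):

  #include <assert.h>
  #include <stdint.h>

  #define FILTER_BITS 7

  // Round-to-nearest right shift with a clamp to [0, 255], mirroring
  // vqrshrun_n_s16.
  static uint8_t round_shift_u8(int32_t x, int bits) {
    int32_t r = (x + (1 << (bits - 1))) >> bits;
    return (uint8_t)(r < 0 ? 0 : (r > 255 ? 255 : r));
  }

  int main(void) {
    const int16_t taps[8] = { -2, 10, -24, 100, 60, -20, 10, -6 };
    // Worst case: 255 under every positive tap, 0 under every negative.
    const uint8_t src[8] = { 0, 255, 0, 255, 255, 0, 255, 0 };

    int32_t full = 0, halved = 0;
    for (int i = 0; i < 8; ++i) {
      full += taps[i] * src[i];
      halved += (taps[i] >> 1) * src[i];  // Taps are even, so exact.
    }
    // full == 45900 overflows int16, so narrowing from 32 bits needed
    // the saturating vqmovn_s32; halved == 22950 fits in int16, so the
    // non-saturating vmovn_s32 is safe.
    assert(full > INT16_MAX && halved <= INT16_MAX);
    // Halving the taps halves the sum, so dropping one bit from the
    // final rounding shift yields an identical result.
    assert(round_shift_u8(full, FILTER_BITS) ==
           round_shift_u8(halved, FILTER_BITS - 1));
    return 0;
  }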

Also update some variable naming to be consistent with the rest of the
file.

Change-Id: Iab48d8ddc2b620da6d35d47c99bc37d8ca49e00d
diff --git a/av1/common/arm/convolve_neon_dotprod.c b/av1/common/arm/convolve_neon_dotprod.c
index ab4b878..d917e6a 100644
--- a/av1/common/arm/convolve_neon_dotprod.c
+++ b/av1/common/arm/convolve_neon_dotprod.c
@@ -623,12 +623,13 @@
                                       const int8x8_t filters) {
   // The sample range transform and permutation are performed by the caller.
   // Accumulate into 128 << FILTER_BITS to account for range transform.
-  const int32x4_t acc = vdupq_n_s32(128 << FILTER_BITS);
+  // (- 1 since we halved the filter values.)
+  const int32x4_t acc = vdupq_n_s32(128 << (FILTER_BITS - 1));
   int32x4_t sum = vdotq_lane_s32(acc, s0, filters, 0);
   sum = vdotq_lane_s32(sum, s1, filters, 1);
 
   // Further narrowing and packing is performed by the caller.
-  return vqmovn_s32(sum);
+  return vmovn_s32(sum);
 }
 
 static inline uint8x8_t convolve8_8_y(const int8x16_t s0_lo,
@@ -638,7 +639,8 @@
                                       const int8x8_t filters) {
   // The sample range transform and permutation are performed by the caller.
   // Accumulate into 128 << FILTER_BITS to account for range transform.
-  const int32x4_t acc = vdupq_n_s32(128 << FILTER_BITS);
+  // (- 1 since we halved the filter values.)
+  const int32x4_t acc = vdupq_n_s32(128 << (FILTER_BITS - 1));
 
   int32x4_t sum0123 = vdotq_lane_s32(acc, s0_lo, filters, 0);
   sum0123 = vdotq_lane_s32(sum0123, s1_lo, filters, 1);
@@ -647,14 +649,16 @@
   sum4567 = vdotq_lane_s32(sum4567, s1_hi, filters, 1);
 
   // Narrow and re-pack.
-  int16x8_t sum = vcombine_s16(vqmovn_s32(sum0123), vqmovn_s32(sum4567));
-  return vqrshrun_n_s16(sum, FILTER_BITS);
+  int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567));
+  // We halved the filter values so -1 from the right shift.
+  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
 }
 
 static inline void convolve_y_sr_8tap_neon_dotprod(
     const uint8_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride,
     int w, int h, const int16_t *y_filter_ptr) {
-  const int8x8_t filter = vmovn_s16(vld1q_s16(y_filter_ptr));
+  // Filter values are even, so halve to reduce intermediate precision reqs.
+  const int8x8_t filter = vshrn_n_s16(vld1q_s16(y_filter_ptr), 1);
 
   const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl);
 
@@ -679,19 +683,19 @@
     transpose_concat_elems_s8_4x4(s3, s4, s5, s6, &s3456);
 
     do {
-      uint8x8_t t7, t8, t9, t10;
-      load_u8_8x4(src_ptr, src_stride, &t7, &t8, &t9, &t10);
+      uint8x8_t t7, t8, t9, tA;
+      load_u8_8x4(src_ptr, src_stride, &t7, &t8, &t9, &tA);
 
       int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));
       int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128)));
       int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128)));
-      int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128)));
+      int8x8_t sA = vreinterpret_s8_u8(vsub_u8(tA, vdup_n_u8(128)));
 
-      int8x16_t s4567, s5678, s6789, s78910;
-      transpose_concat_elems_s8_4x4(s7, s8, s9, s10, &s78910);
+      int8x16_t s4567, s5678, s6789, s789A;
+      transpose_concat_elems_s8_4x4(s7, s8, s9, sA, &s789A);
 
       // Merge new data into block from previous iteration.
-      int8x16x2_t samples_LUT = { { s3456, s78910 } };
+      int8x16x2_t samples_LUT = { { s3456, s789A } };
       s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
       s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
       s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
@@ -699,9 +703,10 @@
       int16x4_t d0 = convolve8_4_y(s0123, s4567, filter);
       int16x4_t d1 = convolve8_4_y(s1234, s5678, filter);
       int16x4_t d2 = convolve8_4_y(s2345, s6789, filter);
-      int16x4_t d3 = convolve8_4_y(s3456, s78910, filter);
-      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
-      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+      int16x4_t d3 = convolve8_4_y(s3456, s789A, filter);
+      // We halved the filter values so -1 from the right shift.
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
 
       store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01);
       store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);
@@ -711,7 +716,7 @@
       s0123 = s4567;
       s1234 = s5678;
       s2345 = s6789;
-      s3456 = s78910;
+      s3456 = s789A;
 
       src_ptr += 4 * src_stride;
       dst_ptr += 4 * dst_stride;
@@ -747,25 +752,25 @@
       transpose_concat_elems_s8_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
 
       do {
-        uint8x8_t t7, t8, t9, t10;
-        load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
+        uint8x8_t t7, t8, t9, tA;
+        load_u8_8x4(s, src_stride, &t7, &t8, &t9, &tA);
 
         int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));
         int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128)));
         int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128)));
-        int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128)));
+        int8x8_t sA = vreinterpret_s8_u8(vsub_u8(tA, vdup_n_u8(128)));
 
         int8x16_t s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi,
-            s78910_lo, s78910_hi;
-        transpose_concat_elems_s8_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi);
+            s789A_lo, s789A_hi;
+        transpose_concat_elems_s8_8x4(s7, s8, s9, sA, &s789A_lo, &s789A_hi);
 
         // Merge new data into block from previous iteration.
-        int8x16x2_t samples_LUT_lo = { { s3456_lo, s78910_lo } };
+        int8x16x2_t samples_LUT_lo = { { s3456_lo, s789A_lo } };
         s4567_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[0]);
         s5678_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[1]);
         s6789_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[2]);
 
-        int8x16x2_t samples_LUT_hi = { { s3456_hi, s78910_hi } };
+        int8x16x2_t samples_LUT_hi = { { s3456_hi, s789A_hi } };
         s4567_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[0]);
         s5678_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[1]);
         s6789_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[2]);
@@ -777,7 +782,7 @@
         uint8x8_t d2 =
             convolve8_8_y(s2345_lo, s2345_hi, s6789_lo, s6789_hi, filter);
         uint8x8_t d3 =
-            convolve8_8_y(s3456_lo, s3456_hi, s78910_lo, s78910_hi, filter);
+            convolve8_8_y(s3456_lo, s3456_hi, s789A_lo, s789A_hi, filter);
 
         store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
 
@@ -789,8 +794,8 @@
         s1234_hi = s5678_hi;
         s2345_lo = s6789_lo;
         s2345_hi = s6789_hi;
-        s3456_lo = s78910_lo;
-        s3456_hi = s78910_hi;
+        s3456_lo = s789A_lo;
+        s3456_hi = s789A_hi;
 
         s += 4 * src_stride;
         d += 4 * dst_stride;
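
For context on the 128 << (FILTER_BITS - 1) accumulator bias above: the
sdot instruction consumes signed 8-bit inputs, so each unsigned sample x
is range-transformed to x - 128 before the dot product. Since the halved
taps sum to 1 << (FILTER_BITS - 1), starting the accumulator at
128 << (FILTER_BITS - 1) adds back exactly 128 * sum(taps) and restores
the unsigned-domain result. A scalar model (hypothetical tap values, not
part of the patch):

  #include <assert.h>
  #include <stdint.h>

  #define FILTER_BITS 7

  int main(void) {
    // Hypothetical halved 8-tap filter; sums to 1 << (FILTER_BITS - 1).
    const int16_t taps[8] = { -1, 5, -12, 50, 30, -10, 5, -3 };
    const uint8_t src[8] = { 17, 200, 3, 99, 250, 0, 128, 64 };

    // Reference: convolve directly in the unsigned sample domain.
    int32_t ref = 0;
    for (int i = 0; i < 8; ++i) ref += taps[i] * src[i];

    // Dotprod path: range-transform each sample to s8 (the vsub_u8 +
    // vreinterpret_s8_u8 pair in the kernels above), then start the
    // accumulator at 128 << (FILTER_BITS - 1) == 128 * sum(taps).
    int32_t sum = 128 << (FILTER_BITS - 1);
    for (int i = 0; i < 8; ++i) {
      int8_t s = (int8_t)(src[i] - 128);
      sum += taps[i] * s;
    }
    assert(sum == ref);
    return 0;
  }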