Fold 'round_offset' constant in Neon horizontal convolutions

Fold the 'round_offset' constants into the dot-product accumulator
constants in the Neon horizontal convolution functions. This removes a
few add instructions from the inner loops of the horizontal convolution
paths implemented with Neon dot-product (SDOT or USDOT) instructions.

Change-Id: Ifaeb3d916507d8b221ce6602a19be2d6ce9679ea
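
For context (not part of the patch): the fold works because adding
round_offset after the narrowing right shift is equivalent to adding
round_offset pre-shifted left by the same amount before the shift, so the
offset can be merged into the constant that already seeds the dot-product
accumulator. A minimal, self-contained sketch of that identity follows; the
ROUND0_BITS and round_offset values are illustrative assumptions rather
than values taken from the patch, and it assumes '>>' on signed operands is
an arithmetic shift (as the Neon vshrn_n_s32 intrinsic performs).

/*
 * Sketch (assumed values, not from the patch): an arithmetic right shift
 * is a floor division by a power of two, so shifting then adding an offset
 * equals adding the pre-shifted offset and then shifting.
 */
#include <assert.h>
#include <stdint.h>

#define ROUND0_BITS 3  /* hypothetical first-stage rounding bits */

int main(void) {
  const int shift = ROUND0_BITS - 1;     /* -1 because filter values are halved */
  const int32_t round_offset = 1 << 10;  /* hypothetical compound offset */

  for (int32_t acc = -(1 << 15); acc < (1 << 15); ++acc) {
    /* Old scheme: shift the accumulator, then add the offset separately. */
    const int32_t separate = (acc >> shift) + round_offset;
    /* New scheme: fold the pre-shifted offset into the accumulator constant. */
    const int32_t folded = (acc + (round_offset << shift)) >> shift;
    assert(separate == folded);
  }
  return 0;
}
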
diff --git a/av1/common/arm/convolve_neon.c b/av1/common/arm/convolve_neon.c
index 05a7547..ec9008e 100644
--- a/av1/common/arm/convolve_neon.c
+++ b/av1/common/arm/convolve_neon.c
@@ -2934,12 +2934,6 @@
         d3 = convolve8_horiz_8_sdot(s3, x_filter, correction, range_limit,
                                     permute_tbl);
 
-        // We halved the convolution filter values so -1 from the right shift.
-        d0 = vshrq_n_s16(d0, ROUND0_BITS - 1);
-        d1 = vshrq_n_s16(d1, ROUND0_BITS - 1);
-        d2 = vshrq_n_s16(d2, ROUND0_BITS - 1);
-        d3 = vshrq_n_s16(d3, ROUND0_BITS - 1);
-
         store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
 
         s += 8;
diff --git a/av1/common/arm/convolve_neon.h b/av1/common/arm/convolve_neon.h
index 557a08a..520a393 100644
--- a/av1/common/arm/convolve_neon.h
+++ b/av1/common/arm/convolve_neon.h
@@ -349,7 +349,9 @@
   sum[1] = vdotq_lane_s32(sum[1], permuted_samples[2], filters, 1);
 
   /* Narrow and re-pack. */
-  return vcombine_s16(vmovn_s32(sum[0]), vmovn_s32(sum[1]));
+  /* We halved the convolution filter values so -1 from the right shift. */
+  return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1),
+                      vshrn_n_s32(sum[1], ROUND0_BITS - 1));
 }
 
 static INLINE int32x4_t convolve8_4_sdot(uint8x16_t samples,
diff --git a/av1/common/arm/jnt_convolve_neon.c b/av1/common/arm/jnt_convolve_neon.c
index 4129c43..dfeab05 100644
--- a/av1/common/arm/jnt_convolve_neon.c
+++ b/av1/common/arm/jnt_convolve_neon.c
@@ -327,12 +327,6 @@
         d3 = convolve8_horiz_8_sdot(s3, x_filter, correction, range_limit,
                                     permute_tbl);
 
-        // We halved the convolution filter values so -1 from the right shift.
-        d0 = vshrq_n_s16(d0, ROUND0_BITS - 1);
-        d1 = vshrq_n_s16(d1, ROUND0_BITS - 1);
-        d2 = vshrq_n_s16(d2, ROUND0_BITS - 1);
-        d3 = vshrq_n_s16(d3, ROUND0_BITS - 1);
-
         store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
 
         s += 8;
@@ -1135,7 +1129,8 @@
   // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
   // shifts - which are generally faster than rounding shifts on modern CPUs.
   // The outermost -1 is needed because we halved the filter values.
-  const int32x4_t horiz_const = vdupq_n_s32(1 << ((ROUND0_BITS - 1) - 1));
+  const int32x4_t horiz_const = vdupq_n_s32(
+      (round_offset << (ROUND0_BITS - 1)) + (1 << ((ROUND0_BITS - 1) - 1)));
 
   const uint8_t *src_ptr = src - horiz_offset;
   CONV_BUF_TYPE *dst = conv_params->dst;
@@ -1162,15 +1157,11 @@
       d2 = convolve8_4_usdot(s2, x_filter, permute_tbl, horiz_const);
       d3 = convolve8_4_usdot(s3, x_filter, permute_tbl, horiz_const);
 
-      d01 = vcombine_s16(vmovn_s32(d0), vmovn_s32(d1));
-      d23 = vcombine_s16(vmovn_s32(d2), vmovn_s32(d3));
-
       // We halved the convolution filter values so -1 from the right shift.
-      d01 = vshrq_n_s16(d01, ROUND0_BITS - 1);
-      d23 = vshrq_n_s16(d23, ROUND0_BITS - 1);
-
-      d01 = vaddq_s16(d01, round_offset_vec);
-      d23 = vaddq_s16(d23, round_offset_vec);
+      d01 = vcombine_s16(vshrn_n_s32(d0, ROUND0_BITS - 1),
+                         vshrn_n_s32(d1, ROUND0_BITS - 1));
+      d23 = vcombine_s16(vshrn_n_s32(d2, ROUND0_BITS - 1),
+                         vshrn_n_s32(d3, ROUND0_BITS - 1));
 
       if (conv_params->do_average) {
         load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);
@@ -1221,11 +1212,6 @@
         d2 = convolve8_horiz_8_usdot(s2, x_filter, permute_tbl, horiz_const);
         d3 = convolve8_horiz_8_usdot(s3, x_filter, permute_tbl, horiz_const);
 
-        d0 = vaddq_s16(d0, round_offset_vec);
-        d1 = vaddq_s16(d1, round_offset_vec);
-        d2 = vaddq_s16(d2, round_offset_vec);
-        d3 = vaddq_s16(d3, round_offset_vec);
-
         if (conv_params->do_average) {
           load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
 
@@ -1289,7 +1275,8 @@
   // shifts - which are generally faster than rounding shifts on modern CPUs.
   // The outermost -1 is needed because we halved the filter values.
   int32x4_t correction =
-      vdupq_n_s32(correction_s32 + (1 << ((ROUND0_BITS - 1) - 1)));
+      vdupq_n_s32(correction_s32 + (round_offset << (ROUND0_BITS - 1)) +
+                  (1 << ((ROUND0_BITS - 1) - 1)));
 
   const uint8_t *src_ptr = src - horiz_offset;
   CONV_BUF_TYPE *dst = conv_params->dst;
@@ -1316,15 +1303,11 @@
       d2 = convolve8_4_sdot(s2, x_filter, correction, range_limit, permute_tbl);
       d3 = convolve8_4_sdot(s3, x_filter, correction, range_limit, permute_tbl);
 
-      d01 = vcombine_s16(vmovn_s32(d0), vmovn_s32(d1));
-      d23 = vcombine_s16(vmovn_s32(d2), vmovn_s32(d3));
-
       // We halved the convolution filter values so -1 from the right shift.
-      d01 = vshrq_n_s16(d01, ROUND0_BITS - 1);
-      d23 = vshrq_n_s16(d23, ROUND0_BITS - 1);
-
-      d01 = vaddq_s16(d01, round_offset_vec);
-      d23 = vaddq_s16(d23, round_offset_vec);
+      d01 = vcombine_s16(vshrn_n_s32(d0, ROUND0_BITS - 1),
+                         vshrn_n_s32(d1, ROUND0_BITS - 1));
+      d23 = vcombine_s16(vshrn_n_s32(d2, ROUND0_BITS - 1),
+                         vshrn_n_s32(d3, ROUND0_BITS - 1));
 
       if (conv_params->do_average) {
         load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);
@@ -1379,17 +1362,6 @@
         d3 = convolve8_horiz_8_sdot(s3, x_filter, correction, range_limit,
                                     permute_tbl);
 
-        // We halved the convolution filter values so -1 from the right shift.
-        d0 = vshrq_n_s16(d0, ROUND0_BITS - 1);
-        d1 = vshrq_n_s16(d1, ROUND0_BITS - 1);
-        d2 = vshrq_n_s16(d2, ROUND0_BITS - 1);
-        d3 = vshrq_n_s16(d3, ROUND0_BITS - 1);
-
-        d0 = vaddq_s16(d0, round_offset_vec);
-        d1 = vaddq_s16(d1, round_offset_vec);
-        d2 = vaddq_s16(d2, round_offset_vec);
-        d3 = vaddq_s16(d3, round_offset_vec);
-
         if (conv_params->do_average) {
           load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);