Fold 'round_offset' constant in Neon horizontal convolutions
Fold the 'round_offset' constants into the dot-product accumulator
initialization constant in the Neon horizontal convolution functions. This removes a few
add instructions from the inner loop of horizontal convolution paths
implemented using Neon dot-product (SDOT or USDOT) instructions.
Change-Id: Ifaeb3d916507d8b221ce6602a19be2d6ce9679ea
diff --git a/av1/common/arm/convolve_neon.c b/av1/common/arm/convolve_neon.c
index 05a7547..ec9008e 100644
--- a/av1/common/arm/convolve_neon.c
+++ b/av1/common/arm/convolve_neon.c
@@ -2934,12 +2934,6 @@
d3 = convolve8_horiz_8_sdot(s3, x_filter, correction, range_limit,
permute_tbl);
- // We halved the convolution filter values so -1 from the right shift.
- d0 = vshrq_n_s16(d0, ROUND0_BITS - 1);
- d1 = vshrq_n_s16(d1, ROUND0_BITS - 1);
- d2 = vshrq_n_s16(d2, ROUND0_BITS - 1);
- d3 = vshrq_n_s16(d3, ROUND0_BITS - 1);
-
store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
s += 8;
diff --git a/av1/common/arm/convolve_neon.h b/av1/common/arm/convolve_neon.h
index 557a08a..520a393 100644
--- a/av1/common/arm/convolve_neon.h
+++ b/av1/common/arm/convolve_neon.h
@@ -349,7 +349,9 @@
sum[1] = vdotq_lane_s32(sum[1], permuted_samples[2], filters, 1);
/* Narrow and re-pack. */
- return vcombine_s16(vmovn_s32(sum[0]), vmovn_s32(sum[1]));
+ /* We halved the convolution filter values so -1 from the right shift. */
+ return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1),
+ vshrn_n_s32(sum[1], ROUND0_BITS - 1));
}
static INLINE int32x4_t convolve8_4_sdot(uint8x16_t samples,
diff --git a/av1/common/arm/jnt_convolve_neon.c b/av1/common/arm/jnt_convolve_neon.c
index 4129c43..dfeab05 100644
--- a/av1/common/arm/jnt_convolve_neon.c
+++ b/av1/common/arm/jnt_convolve_neon.c
@@ -327,12 +327,6 @@
d3 = convolve8_horiz_8_sdot(s3, x_filter, correction, range_limit,
permute_tbl);
- // We halved the convolution filter values so -1 from the right shift.
- d0 = vshrq_n_s16(d0, ROUND0_BITS - 1);
- d1 = vshrq_n_s16(d1, ROUND0_BITS - 1);
- d2 = vshrq_n_s16(d2, ROUND0_BITS - 1);
- d3 = vshrq_n_s16(d3, ROUND0_BITS - 1);
-
store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
s += 8;
@@ -1135,7 +1129,8 @@
// This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
// shifts - which are generally faster than rounding shifts on modern CPUs.
// The outermost -1 is needed because we halved the filter values.
- const int32x4_t horiz_const = vdupq_n_s32(1 << ((ROUND0_BITS - 1) - 1));
+ const int32x4_t horiz_const = vdupq_n_s32(
+ (round_offset << (ROUND0_BITS - 1)) + (1 << ((ROUND0_BITS - 1) - 1)));
const uint8_t *src_ptr = src - horiz_offset;
CONV_BUF_TYPE *dst = conv_params->dst;
@@ -1162,15 +1157,11 @@
d2 = convolve8_4_usdot(s2, x_filter, permute_tbl, horiz_const);
d3 = convolve8_4_usdot(s3, x_filter, permute_tbl, horiz_const);
- d01 = vcombine_s16(vmovn_s32(d0), vmovn_s32(d1));
- d23 = vcombine_s16(vmovn_s32(d2), vmovn_s32(d3));
-
// We halved the convolution filter values so -1 from the right shift.
- d01 = vshrq_n_s16(d01, ROUND0_BITS - 1);
- d23 = vshrq_n_s16(d23, ROUND0_BITS - 1);
-
- d01 = vaddq_s16(d01, round_offset_vec);
- d23 = vaddq_s16(d23, round_offset_vec);
+ d01 = vcombine_s16(vshrn_n_s32(d0, ROUND0_BITS - 1),
+ vshrn_n_s32(d1, ROUND0_BITS - 1));
+ d23 = vcombine_s16(vshrn_n_s32(d2, ROUND0_BITS - 1),
+ vshrn_n_s32(d3, ROUND0_BITS - 1));
if (conv_params->do_average) {
load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);
@@ -1221,11 +1212,6 @@
d2 = convolve8_horiz_8_usdot(s2, x_filter, permute_tbl, horiz_const);
d3 = convolve8_horiz_8_usdot(s3, x_filter, permute_tbl, horiz_const);
- d0 = vaddq_s16(d0, round_offset_vec);
- d1 = vaddq_s16(d1, round_offset_vec);
- d2 = vaddq_s16(d2, round_offset_vec);
- d3 = vaddq_s16(d3, round_offset_vec);
-
if (conv_params->do_average) {
load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
@@ -1289,7 +1275,8 @@
// shifts - which are generally faster than rounding shifts on modern CPUs.
// The outermost -1 is needed because we halved the filter values.
int32x4_t correction =
- vdupq_n_s32(correction_s32 + (1 << ((ROUND0_BITS - 1) - 1)));
+ vdupq_n_s32(correction_s32 + (round_offset << (ROUND0_BITS - 1)) +
+ (1 << ((ROUND0_BITS - 1) - 1)));
const uint8_t *src_ptr = src - horiz_offset;
CONV_BUF_TYPE *dst = conv_params->dst;
@@ -1316,15 +1303,11 @@
d2 = convolve8_4_sdot(s2, x_filter, correction, range_limit, permute_tbl);
d3 = convolve8_4_sdot(s3, x_filter, correction, range_limit, permute_tbl);
- d01 = vcombine_s16(vmovn_s32(d0), vmovn_s32(d1));
- d23 = vcombine_s16(vmovn_s32(d2), vmovn_s32(d3));
-
// We halved the convolution filter values so -1 from the right shift.
- d01 = vshrq_n_s16(d01, ROUND0_BITS - 1);
- d23 = vshrq_n_s16(d23, ROUND0_BITS - 1);
-
- d01 = vaddq_s16(d01, round_offset_vec);
- d23 = vaddq_s16(d23, round_offset_vec);
+ d01 = vcombine_s16(vshrn_n_s32(d0, ROUND0_BITS - 1),
+ vshrn_n_s32(d1, ROUND0_BITS - 1));
+ d23 = vcombine_s16(vshrn_n_s32(d2, ROUND0_BITS - 1),
+ vshrn_n_s32(d3, ROUND0_BITS - 1));
if (conv_params->do_average) {
load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);
@@ -1379,17 +1362,6 @@
d3 = convolve8_horiz_8_sdot(s3, x_filter, correction, range_limit,
permute_tbl);
- // We halved the convolution filter values so -1 from the right shift.
- d0 = vshrq_n_s16(d0, ROUND0_BITS - 1);
- d1 = vshrq_n_s16(d1, ROUND0_BITS - 1);
- d2 = vshrq_n_s16(d2, ROUND0_BITS - 1);
- d3 = vshrq_n_s16(d3, ROUND0_BITS - 1);
-
- d0 = vaddq_s16(d0, round_offset_vec);
- d1 = vaddq_s16(d1, round_offset_vec);
- d2 = vaddq_s16(d2, round_offset_vec);
- d3 = vaddq_s16(d3, round_offset_vec);
-
if (conv_params->do_average) {
load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);