Remove magic numbers in av1_convolve_2d_sr_horiz Neon paths
Refactor the av1_convolve_2d_sr_horiz Neon paths to use expressions
instead of magic numbers for rounding shim values. Also add comments
explaining the right shift values.
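
For reference, the shim turns a plain (non-rounding) right shift into a
rounding one. A minimal sketch of the identity (the helper name below is
illustrative, not part of this patch):

  #include <stdint.h>

  // (x + (1 << (n - 1))) >> n is equivalent to ROUND_POWER_OF_TWO(x, n).
  // With ROUND0_BITS == 3 the shim is 1 << 2 == 4, the old magic number;
  // e.g. (12 + 4) >> 3 == 2 == round(12 / 8.0), whereas 12 >> 3 == 1.
  static inline int32_t round_shift_via_shim(int32_t x, int n) {
    return (x + (1 << (n - 1))) >> n;
  }
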
Co-authored-by: Jonathan Wright <jonathan.wright@arm.com>
Change-Id: Idcd7dad8e204ece9684b41deb29c9ec8e1e499e2
diff --git a/av1/common/arm/convolve_neon.c b/av1/common/arm/convolve_neon.c
index 7377531..8ee3203 100644
--- a/av1/common/arm/convolve_neon.c
+++ b/av1/common/arm/convolve_neon.c
@@ -2311,10 +2311,10 @@
};
const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]),
vmovn_s16(x_filter_s16.val[1]));
- // This shim of +4 enables us to use non-rounding shifts - which are
- // generally faster than rounding shifts on modern CPUs.
+ // This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts
+ // - which are generally faster than rounding shifts on modern CPUs.
const int32x4_t horiz_const =
- vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + 4);
+ vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
if (w <= 4) {
@@ -2482,9 +2482,10 @@
const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]),
vmovn_s16(x_filter_s16.val[1]));
- // This shim of +4 enables us to use non-rounding shifts - which are
- // generally faster than rounding shifts on modern CPUs.
- const int32_t horiz_const = ((1 << (bd + FILTER_BITS - 1)) + 4);
+ // This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts
+ // - which are generally faster than rounding shifts on modern CPUs.
+ const int32_t horiz_const =
+ ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
// Dot product constants.
const int32x4_t correct_tmp =
vaddq_s32(vpaddlq_s16(vshlq_n_s16(x_filter_s16.val[0], 7)),
@@ -2716,9 +2717,10 @@
const int dst_stride, int w, int h, const int16x8_t x_filter_0_7,
const int16x4_t x_filter_8_11) {
const int bd = 8;
- // This shim of +4 enables us to use non-rounding shifts - which are
- // generally faster than rounding shifts on modern CPUs.
- const int32x4_t horiz_const = vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + 4);
+ // This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts -
+ // which are generally faster than rounding shifts on modern CPUs.
+ const int32x4_t horiz_const =
+ vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
#if defined(__aarch64__)
do {
@@ -2845,6 +2847,7 @@
sum[1] = vusdotq_lane_s32(sum[1], permuted_samples[2], filters, 1);
/* Narrow and re-pack. */
+ // We halved the convolution filter values so subtract 1 from the right shift.
return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1),
vshrn_n_s32(sum[1], ROUND0_BITS - 1));
}
@@ -2863,9 +2866,11 @@
// Filter values are even, so downshift by 1 to reduce intermediate precision
// requirements.
const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1);
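+ // Halving the (even) filter taps halves the convolution sum exactly, and
+ // (2 * y) >> ROUND0_BITS == y >> (ROUND0_BITS - 1), so the final right
+ // shift can be reduced by 1 without changing the result.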
- // This shim of +2 enables us to use non-rounding shifts - which are
- // generally faster than rounding shifts on modern CPUs.
- const int32x4_t horiz_const = vdupq_n_s32((1 << (bd + FILTER_BITS - 2)) + 2);
+ // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
+ // shifts - which are generally faster than rounding shifts on modern CPUs.
+ // The outermost -1 is needed because we halved the filter values.
+ const int32x4_t horiz_const = vdupq_n_s32((1 << (bd + FILTER_BITS - 2)) +
+ (1 << ((ROUND0_BITS - 1) - 1)));
if (w <= 4) {
const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
@@ -2883,6 +2888,7 @@
t2 = convolve8_4_usdot(s2, x_filter, permute_tbl, horiz_const);
t3 = convolve8_4_usdot(s3, x_filter, permute_tbl, horiz_const);
+ // We halved the convolution filter values so subtract 1 from the right shift.
d0 = vshrn_n_s32(t0, ROUND0_BITS - 1);
d1 = vshrn_n_s32(t1, ROUND0_BITS - 1);
d2 = vshrn_n_s32(t2, ROUND0_BITS - 1);
@@ -2908,6 +2914,7 @@
do {
s0 = vld1q_u8(src_ptr);
t0 = convolve8_4_usdot(s0, x_filter, permute_tbl, horiz_const);
+ // We halved the convolution filter values so subtract 1 from the right shift.
d0 = vshrn_n_s32(t0, ROUND0_BITS - 1);
if (w == 2) {
@@ -2995,9 +3002,11 @@
// Filter values are even, so downshift by 1 to reduce intermediate precision
// requirements.
const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1);
- // This shim of +2 enables us to use non-rounding shifts - which are
- // generally faster than rounding shifts on modern CPUs.
- const int32_t horiz_const = ((1 << (bd + FILTER_BITS - 2)) + 2);
+ // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
+ // shifts - which are generally faster than rounding shifts on modern CPUs.
+ // The outermost -1 is needed because we halved the filter values.
+ const int32_t horiz_const =
+ ((1 << (bd + FILTER_BITS - 2)) + (1 << ((ROUND0_BITS - 1) - 1)));
// Dot product constants.
const int16x8_t correct_tmp = vshlq_n_s16(x_filter_s16, 6);
int32x4_t correction = vdupq_n_s32(vaddlvq_s16(correct_tmp) + horiz_const);
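+ // The sdot path converts the unsigned source pixels to signed by
+ // subtracting range_limit (128), so add back 128 * sum(halved taps),
+ // computed here as 64 * sum(x_filter_s16).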
@@ -3019,6 +3028,7 @@
t2 = convolve8_4_sdot(s2, x_filter, correction, range_limit, permute_tbl);
t3 = convolve8_4_sdot(s3, x_filter, correction, range_limit, permute_tbl);
+ // We halved the convolution filter values so subtract 1 from the right shift.
d0 = vshrn_n_s32(t0, ROUND0_BITS - 1);
d1 = vshrn_n_s32(t1, ROUND0_BITS - 1);
d2 = vshrn_n_s32(t2, ROUND0_BITS - 1);
@@ -3045,6 +3055,7 @@
s0 = vld1q_u8(src_ptr);
t0 = convolve8_4_sdot(s0, x_filter, correction, range_limit,
permute_tbl);
+ // We halved the convolution filter values so subtract 1 from the right shift.
d0 = vshrn_n_s32(t0, ROUND0_BITS - 1);
if (w == 2) {
@@ -3082,6 +3093,7 @@
d3 = convolve8_horiz_8_sdot(s3, x_filter, correction, range_limit,
permute_tbl);
+ // We halved the convolution filter values so subtract 1 from the right shift.
d0 = vshrq_n_s16(d0, ROUND0_BITS - 1);
d1 = vshrq_n_s16(d1, ROUND0_BITS - 1);
d2 = vshrq_n_s16(d2, ROUND0_BITS - 1);
@@ -3111,6 +3123,7 @@
s0 = vld1q_u8(s);
d0 = convolve8_8_sdot(s0, x_filter, correction, range_limit,
permute_tbl, vdupq_n_s16(0));
+ // We halved the convolution filter values so subtract 1 from the right shift.
d0 = vshrq_n_s16(d0, ROUND0_BITS - 1);
vst1q_s16(d, d0);
@@ -3148,6 +3161,7 @@
sum = vmla_lane_s16(sum, s6, filter_hi, 2);
sum = vmla_lane_s16(sum, s7, filter_hi, 3);
+ // We halved the convolution filter values so subtract 1 from the right shift.
return vshr_n_s16(sum, ROUND0_BITS - 1);
}
@@ -3170,6 +3184,7 @@
sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
+ // We halved the convolution filter values so subtract 1 from the right shift.
return vshrq_n_s16(sum, ROUND0_BITS - 1);
}
@@ -3284,9 +3299,11 @@
const int16x8_t x_filter = vshrq_n_s16(x_filter_s16, 1);
if (w <= 4) {
- // This shim of +2 enables us to use non-rounding shifts - which are
- // generally faster than rounding shifts on modern CPUs.
- const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)) + 2);
+ // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
+ // shifts - which are generally faster than rounding shifts on modern CPUs.
+ // The outermost -1 is needed because we halved the filter values.
+ const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)) +
+ (1 << ((ROUND0_BITS - 1) - 1)));
#if defined(__aarch64__)
do {
@@ -3354,10 +3371,11 @@
#endif // defined(__aarch64__)
} else {
- // This shim of +2 enables us to use non-rounding shifts - which are
- // generally faster than rounding shifts on modern CPUs.
- const int16x8_t horiz_const =
- vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) + 2);
+ // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
+ // shifts - which are generally faster than rounding shifts on modern CPUs.
+ // The outermost -1 is needed because we halved the filter values.
+ const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) +
+ (1 << ((ROUND0_BITS - 1) - 1)));
#if defined(__aarch64__)
@@ -3488,6 +3506,7 @@
d2 = vaddq_s16(d2, horiz_const);
d3 = vaddq_s16(d3, horiz_const);
+ // We halved the convolution filter values so subtract 1 from the right shift.
d0 = vshrq_n_s16(d0, ROUND0_BITS - 1);
d1 = vshrq_n_s16(d1, ROUND0_BITS - 1);
d2 = vshrq_n_s16(d2, ROUND0_BITS - 1);