Use lane-referencing intrinsics in Neon convolution kernels
The Neon convolution functions take a pointer to a filter and load
each of the (usually 8) values into separate Neon registers before
executing the series of multiply-accumulate instructions.
This patch modifies these helper functions to load all of the filter
values into a single Neon register and then access them via the lane-
referencing versions of the various multiply and multiply-accumulate
Neon instructions. This reduces register pressure and also the number
of load instructions.
Change-Id: I2c13251449113b26500517d9c6d774e189cd622e
diff --git a/av1/common/arm/convolve_neon.c b/av1/common/arm/convolve_neon.c
index f0e4bed..61c8760 100644
--- a/av1/common/arm/convolve_neon.c
+++ b/av1/common/arm/convolve_neon.c
@@ -27,20 +27,22 @@
const int16x4_t s2, const int16x4_t s3,
const int16x4_t s4, const int16x4_t s5,
const int16x4_t s6, const int16x4_t s7,
- const int16_t *filter) {
+ const int16x8_t filter) {
+ const int16x4_t filter_lo = vget_low_s16(filter);
+ const int16x4_t filter_hi = vget_high_s16(filter);
int16x4_t sum;
- sum = vmul_n_s16(s0, filter[0]);
- sum = vmla_n_s16(sum, s1, filter[1]);
- sum = vmla_n_s16(sum, s2, filter[2]);
- sum = vmla_n_s16(sum, s5, filter[5]);
- sum = vmla_n_s16(sum, s6, filter[6]);
- sum = vmla_n_s16(sum, s7, filter[7]);
+ sum = vmul_lane_s16(s0, filter_lo, 0);
+ sum = vmla_lane_s16(sum, s1, filter_lo, 1);
+ sum = vmla_lane_s16(sum, s2, filter_lo, 2);
+ sum = vmla_lane_s16(sum, s5, filter_hi, 1);
+ sum = vmla_lane_s16(sum, s6, filter_hi, 2);
+ sum = vmla_lane_s16(sum, s7, filter_hi, 3);
/* filter[3] can take a max value of 128. So the max value of the result :
* 128*255 + sum > 16 bits
*/
- sum = vqadd_s16(sum, vmul_n_s16(s3, filter[3]));
- sum = vqadd_s16(sum, vmul_n_s16(s4, filter[4]));
+ sum = vqadd_s16(sum, vmul_lane_s16(s3, filter_lo, 3));
+ sum = vqadd_s16(sum, vmul_lane_s16(s4, filter_hi, 0));
return sum;
}
@@ -48,21 +50,23 @@
static INLINE uint8x8_t convolve8_horiz_8x8(
const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
- const int16x8_t s6, const int16x8_t s7, const int16_t *filter,
+ const int16x8_t s6, const int16x8_t s7, const int16x8_t filter,
const int16x8_t shift_round_0, const int16x8_t shift_by_bits) {
+ const int16x4_t filter_lo = vget_low_s16(filter);
+ const int16x4_t filter_hi = vget_high_s16(filter);
int16x8_t sum;
- sum = vmulq_n_s16(s0, filter[0]);
- sum = vmlaq_n_s16(sum, s1, filter[1]);
- sum = vmlaq_n_s16(sum, s2, filter[2]);
- sum = vmlaq_n_s16(sum, s5, filter[5]);
- sum = vmlaq_n_s16(sum, s6, filter[6]);
- sum = vmlaq_n_s16(sum, s7, filter[7]);
+ sum = vmulq_lane_s16(s0, filter_lo, 0);
+ sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
+ sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
+ sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
+ sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
+ sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
/* filter[3] can take a max value of 128. So the max value of the result :
* 128*255 + sum > 16 bits
*/
- sum = vqaddq_s16(sum, vmulq_n_s16(s3, filter[3]));
- sum = vqaddq_s16(sum, vmulq_n_s16(s4, filter[4]));
+ sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filter_lo, 3));
+ sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filter_hi, 0));
sum = vqrshlq_s16(sum, shift_round_0);
sum = vqrshlq_s16(sum, shift_by_bits);
@@ -74,21 +78,23 @@
static INLINE uint8x8_t convolve8_horiz_4x1(
const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
- const int16x4_t s6, const int16x4_t s7, const int16_t *filter,
+ const int16x4_t s6, const int16x4_t s7, const int16x8_t filter,
const int16x4_t shift_round_0, const int16x4_t shift_by_bits) {
+ const int16x4_t filter_lo = vget_low_s16(filter);
+ const int16x4_t filter_hi = vget_high_s16(filter);
int16x4_t sum;
- sum = vmul_n_s16(s0, filter[0]);
- sum = vmla_n_s16(sum, s1, filter[1]);
- sum = vmla_n_s16(sum, s2, filter[2]);
- sum = vmla_n_s16(sum, s5, filter[5]);
- sum = vmla_n_s16(sum, s6, filter[6]);
- sum = vmla_n_s16(sum, s7, filter[7]);
+ sum = vmul_lane_s16(s0, filter_lo, 0);
+ sum = vmla_lane_s16(sum, s1, filter_lo, 1);
+ sum = vmla_lane_s16(sum, s2, filter_lo, 2);
+ sum = vmla_lane_s16(sum, s5, filter_hi, 1);
+ sum = vmla_lane_s16(sum, s6, filter_hi, 2);
+ sum = vmla_lane_s16(sum, s7, filter_hi, 3);
/* filter[3] can take a max value of 128. So the max value of the result :
* 128*255 + sum > 16 bits
*/
- sum = vqadd_s16(sum, vmul_n_s16(s3, filter[3]));
- sum = vqadd_s16(sum, vmul_n_s16(s4, filter[4]));
+ sum = vqadd_s16(sum, vmul_lane_s16(s3, filter_lo, 3));
+ sum = vqadd_s16(sum, vmul_lane_s16(s4, filter_hi, 0));
sum = vqrshl_s16(sum, shift_round_0);
sum = vqrshl_s16(sum, shift_by_bits);
@@ -100,20 +106,22 @@
static INLINE uint8x8_t convolve8_vert_8x4(
const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
- const int16x8_t s6, const int16x8_t s7, const int16_t *filter) {
+ const int16x8_t s6, const int16x8_t s7, const int16x8_t filter) {
+ const int16x4_t filter_lo = vget_low_s16(filter);
+ const int16x4_t filter_hi = vget_high_s16(filter);
int16x8_t sum;
- sum = vmulq_n_s16(s0, filter[0]);
- sum = vmlaq_n_s16(sum, s1, filter[1]);
- sum = vmlaq_n_s16(sum, s2, filter[2]);
- sum = vmlaq_n_s16(sum, s5, filter[5]);
- sum = vmlaq_n_s16(sum, s6, filter[6]);
- sum = vmlaq_n_s16(sum, s7, filter[7]);
+ sum = vmulq_lane_s16(s0, filter_lo, 0);
+ sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
+ sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
+ sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
+ sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
+ sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
/* filter[3] can take a max value of 128. So the max value of the result :
* 128*255 + sum > 16 bits
*/
- sum = vqaddq_s16(sum, vmulq_n_s16(s3, filter[3]));
- sum = vqaddq_s16(sum, vmulq_n_s16(s4, filter[4]));
+ sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filter_lo, 3));
+ sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filter_hi, 0));
return vqrshrun_n_s16(sum, FILTER_BITS);
}
@@ -121,21 +129,23 @@
static INLINE uint16x4_t convolve8_vert_4x4_s32(
const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
- const int16x4_t s6, const int16x4_t s7, const int16_t *y_filter,
+ const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter,
const int32x4_t round_shift_vec, const int32x4_t offset_const,
const int32x4_t sub_const_vec) {
+ const int16x4_t y_filter_lo = vget_low_s16(y_filter);
+ const int16x4_t y_filter_hi = vget_high_s16(y_filter);
int32x4_t sum0;
uint16x4_t res;
const int32x4_t zero = vdupq_n_s32(0);
- sum0 = vmull_n_s16(s0, y_filter[0]);
- sum0 = vmlal_n_s16(sum0, s1, y_filter[1]);
- sum0 = vmlal_n_s16(sum0, s2, y_filter[2]);
- sum0 = vmlal_n_s16(sum0, s3, y_filter[3]);
- sum0 = vmlal_n_s16(sum0, s4, y_filter[4]);
- sum0 = vmlal_n_s16(sum0, s5, y_filter[5]);
- sum0 = vmlal_n_s16(sum0, s6, y_filter[6]);
- sum0 = vmlal_n_s16(sum0, s7, y_filter[7]);
+ sum0 = vmull_lane_s16(s0, y_filter_lo, 0);
+ sum0 = vmlal_lane_s16(sum0, s1, y_filter_lo, 1);
+ sum0 = vmlal_lane_s16(sum0, s2, y_filter_lo, 2);
+ sum0 = vmlal_lane_s16(sum0, s3, y_filter_lo, 3);
+ sum0 = vmlal_lane_s16(sum0, s4, y_filter_hi, 0);
+ sum0 = vmlal_lane_s16(sum0, s5, y_filter_hi, 1);
+ sum0 = vmlal_lane_s16(sum0, s6, y_filter_hi, 2);
+ sum0 = vmlal_lane_s16(sum0, s7, y_filter_hi, 3);
sum0 = vaddq_s32(sum0, offset_const);
sum0 = vqrshlq_s32(sum0, round_shift_vec);
@@ -150,30 +160,32 @@
static INLINE uint8x8_t convolve8_vert_8x4_s32(
const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
- const int16x8_t s6, const int16x8_t s7, const int16_t *y_filter,
+ const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter,
const int32x4_t round_shift_vec, const int32x4_t offset_const,
const int32x4_t sub_const_vec, const int16x8_t vec_round_bits) {
+ const int16x4_t y_filter_lo = vget_low_s16(y_filter);
+ const int16x4_t y_filter_hi = vget_high_s16(y_filter);
int32x4_t sum0, sum1;
uint16x8_t res;
const int32x4_t zero = vdupq_n_s32(0);
- sum0 = vmull_n_s16(vget_low_s16(s0), y_filter[0]);
- sum0 = vmlal_n_s16(sum0, vget_low_s16(s1), y_filter[1]);
- sum0 = vmlal_n_s16(sum0, vget_low_s16(s2), y_filter[2]);
- sum0 = vmlal_n_s16(sum0, vget_low_s16(s3), y_filter[3]);
- sum0 = vmlal_n_s16(sum0, vget_low_s16(s4), y_filter[4]);
- sum0 = vmlal_n_s16(sum0, vget_low_s16(s5), y_filter[5]);
- sum0 = vmlal_n_s16(sum0, vget_low_s16(s6), y_filter[6]);
- sum0 = vmlal_n_s16(sum0, vget_low_s16(s7), y_filter[7]);
+ sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_lo, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_lo, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_lo, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_lo, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_hi, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_hi, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_hi, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_hi, 3);
- sum1 = vmull_n_s16(vget_high_s16(s0), y_filter[0]);
- sum1 = vmlal_n_s16(sum1, vget_high_s16(s1), y_filter[1]);
- sum1 = vmlal_n_s16(sum1, vget_high_s16(s2), y_filter[2]);
- sum1 = vmlal_n_s16(sum1, vget_high_s16(s3), y_filter[3]);
- sum1 = vmlal_n_s16(sum1, vget_high_s16(s4), y_filter[4]);
- sum1 = vmlal_n_s16(sum1, vget_high_s16(s5), y_filter[5]);
- sum1 = vmlal_n_s16(sum1, vget_high_s16(s6), y_filter[6]);
- sum1 = vmlal_n_s16(sum1, vget_high_s16(s7), y_filter[7]);
+ sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_lo, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_lo, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_lo, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_lo, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_hi, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_hi, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_hi, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_hi, 3);
sum0 = vaddq_s32(sum0, offset_const);
sum1 = vaddq_s32(sum1, offset_const);
@@ -213,8 +225,9 @@
assert((FILTER_BITS - conv_params->round_1) >= 0 ||
((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
- const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
filter_params_x, subpel_x_qn & SUBPEL_MASK);
+ const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
const int16x8_t shift_round_0 = vdupq_n_s16(-conv_params->round_0);
const int16x8_t shift_by_bits = vdupq_n_s16(-bits);
@@ -622,8 +635,9 @@
src -= vert_offset * src_stride;
- const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
filter_params_y, subpel_y_qn & SUBPEL_MASK);
+ const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
if (w <= 4) {
uint8x8_t d01;
@@ -853,7 +867,7 @@
// Processes one row at a time
static INLINE void horiz_filter_w8_single_row(
const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
- const int dst_stride, int width, int height, const int16_t *x_filter,
+ const int dst_stride, int width, int height, const int16x8_t x_filter,
const int16x8_t horiz_const, const int16x8_t shift_round_0) {
int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
do {
@@ -899,7 +913,7 @@
// Processes one row at a time
static INLINE void horiz_filter_w4_single_row(
const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
- const int dst_stride, int width, int height, const int16_t *x_filter,
+ const int dst_stride, int width, int height, const int16x8_t x_filter,
const int16x4_t horiz_const, const int16x4_t shift_round_0) {
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
do {
@@ -983,16 +997,12 @@
FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits);
const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
- const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
filter_params_x, subpel_x_qn & SUBPEL_MASK);
- int16_t x_filter_tmp[8];
- int16x8_t filter_x_coef = vld1q_s16(x_filter);
-
- // filter coeffs are even, so downshifting by 1 to reduce intermediate
- // precision requirements.
- filter_x_coef = vshrq_n_s16(filter_x_coef, 1);
- vst1q_s16(&x_filter_tmp[0], filter_x_coef);
+ // Filter values are even, so downshift by 1 to reduce intermediate precision
+ // requirements.
+ const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
assert(conv_params->round_0 > 0);
@@ -1035,13 +1045,13 @@
s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
- d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+ d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
horiz_const, shift_round_0);
- d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
+ d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
horiz_const, shift_round_0);
- d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
+ d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
horiz_const, shift_round_0);
- d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
+ d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
horiz_const, shift_round_0);
transpose_s16_4x4d(&d0, &d1, &d2, &d3);
@@ -1068,13 +1078,11 @@
if (height) {
assert(height < 4);
horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, im_dst_stride, w,
- height, x_filter_tmp, horiz_const,
- shift_round_0);
+ height, x_filter, horiz_const, shift_round_0);
}
#else
horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, im_dst_stride, w,
- height, x_filter_tmp, horiz_const,
- shift_round_0);
+ height, x_filter, horiz_const, shift_round_0);
#endif
} else {
@@ -1135,22 +1143,22 @@
s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
- res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+ res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
horiz_const, shift_round_0);
- res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
+ res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
horiz_const, shift_round_0);
- res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
+ res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
horiz_const, shift_round_0);
- res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
+ res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
horiz_const, shift_round_0);
- res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter_tmp,
+ res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
horiz_const, shift_round_0);
- res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12,
- x_filter_tmp, horiz_const, shift_round_0);
- res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13,
- x_filter_tmp, horiz_const, shift_round_0);
- res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14,
- x_filter_tmp, horiz_const, shift_round_0);
+ res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12, x_filter,
+ horiz_const, shift_round_0);
+ res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13, x_filter,
+ horiz_const, shift_round_0);
+ res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14, x_filter,
+ horiz_const, shift_round_0);
transpose_s16_8x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6,
&res7);
@@ -1220,28 +1228,28 @@
reg14 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
d0 = convolve8_4x4(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7,
- x_filter_tmp);
+ x_filter);
d1 = convolve8_4x4(reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8,
- x_filter_tmp);
+ x_filter);
d2 = convolve8_4x4(reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9,
- x_filter_tmp);
+ x_filter);
d3 = convolve8_4x4(reg3, reg4, reg5, reg6, reg7, reg8, reg9, reg10,
- x_filter_tmp);
+ x_filter);
d4 = convolve8_4x4(reg4, reg5, reg6, reg7, reg8, reg9, reg10, reg11,
- x_filter_tmp);
+ x_filter);
d5 = convolve8_4x4(reg5, reg6, reg7, reg8, reg9, reg10, reg11, reg12,
- x_filter_tmp);
+ x_filter);
d6 = convolve8_4x4(reg6, reg7, reg8, reg9, reg10, reg11, reg12, reg13,
- x_filter_tmp);
+ x_filter);
d7 = convolve8_4x4(reg7, reg8, reg9, reg10, reg11, reg12, reg13, reg14,
- x_filter_tmp);
+ x_filter);
transpose_s16_4x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7, &out0, &out1,
&out2, &out3);
@@ -1279,14 +1287,12 @@
if (height) {
assert(height < 4);
horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, im_stride, w,
- height, x_filter_tmp, horiz_const,
- shift_round_0);
+ height, x_filter, horiz_const, shift_round_0);
}
#else
horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, im_stride, w,
- height, x_filter_tmp, horiz_const,
- shift_round_0);
+ height, x_filter, horiz_const, shift_round_0);
#endif
}
@@ -1297,8 +1303,9 @@
const int32_t sub_const = (1 << (offset_bits - conv_params->round_1)) +
(1 << (offset_bits - conv_params->round_1 - 1));
- const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
filter_params_y, subpel_y_qn & SUBPEL_MASK);
+ const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
const int32x4_t round_shift_vec = vdupq_n_s32(-(conv_params->round_1));
const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
@@ -1574,8 +1581,6 @@
const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
if (x_q4 & SUBPEL_MASK) {
const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
- const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
- const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
uint8x8_t s[8], d;
int16x8_t ss[4];
int16x4_t t[8], tt;
@@ -1597,7 +1602,7 @@
t[7] = vget_high_s16(ss[3]);
tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7],
- filters, filter3, filter4);
+ filters);
d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
vst1_lane_u32((uint32_t *)&temp[4 * z], vreinterpret_u32_u8(d), 0);
} else {
@@ -1703,8 +1708,6 @@
if (y_q4 & SUBPEL_MASK) {
const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
- const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
- const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
uint8x8_t s[8], d;
int16x4_t t[8], tt;
@@ -1719,8 +1722,7 @@
t[6] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[6])));
t[7] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[7])));
- tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters,
- filter3, filter4);
+ tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters);
d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0);
} else {
diff --git a/av1/common/arm/convolve_neon.h b/av1/common/arm/convolve_neon.h
index 27a996c..3459ebe 100644
--- a/av1/common/arm/convolve_neon.h
+++ b/av1/common/arm/convolve_neon.h
@@ -19,21 +19,19 @@
const int16x4_t s2, const int16x4_t s3,
const int16x4_t s4, const int16x4_t s5,
const int16x4_t s6, const int16x4_t s7,
- const int16x8_t filters,
- const int16x4_t filter3,
- const int16x4_t filter4) {
- const int16x4_t filters_lo = vget_low_s16(filters);
- const int16x4_t filters_hi = vget_high_s16(filters);
+ const int16x8_t filter) {
+ const int16x4_t filter_lo = vget_low_s16(filter);
+ const int16x4_t filter_hi = vget_high_s16(filter);
int16x4_t sum;
- sum = vmul_lane_s16(s0, filters_lo, 0);
- sum = vmla_lane_s16(sum, s1, filters_lo, 1);
- sum = vmla_lane_s16(sum, s2, filters_lo, 2);
- sum = vmla_lane_s16(sum, s5, filters_hi, 1);
- sum = vmla_lane_s16(sum, s6, filters_hi, 2);
- sum = vmla_lane_s16(sum, s7, filters_hi, 3);
- sum = vqadd_s16(sum, vmul_s16(s3, filter3));
- sum = vqadd_s16(sum, vmul_s16(s4, filter4));
+ sum = vmul_lane_s16(s0, filter_lo, 0);
+ sum = vmla_lane_s16(sum, s1, filter_lo, 1);
+ sum = vmla_lane_s16(sum, s2, filter_lo, 2);
+ sum = vmla_lane_s16(sum, s5, filter_hi, 1);
+ sum = vmla_lane_s16(sum, s6, filter_hi, 2);
+ sum = vmla_lane_s16(sum, s7, filter_hi, 3);
+ sum = vqadd_s16(sum, vmul_lane_s16(s3, filter_lo, 3));
+ sum = vqadd_s16(sum, vmul_lane_s16(s4, filter_hi, 0));
return sum;
}
@@ -41,28 +39,24 @@
const int16x8_t s2, const int16x8_t s3,
const int16x8_t s4, const int16x8_t s5,
const int16x8_t s6, const int16x8_t s7,
- const int16x8_t filters,
- const int16x8_t filter3,
- const int16x8_t filter4) {
- const int16x4_t filters_lo = vget_low_s16(filters);
- const int16x4_t filters_hi = vget_high_s16(filters);
+ const int16x8_t filter) {
+ const int16x4_t filter_lo = vget_low_s16(filter);
+ const int16x4_t filter_hi = vget_high_s16(filter);
int16x8_t sum;
- sum = vmulq_lane_s16(s0, filters_lo, 0);
- sum = vmlaq_lane_s16(sum, s1, filters_lo, 1);
- sum = vmlaq_lane_s16(sum, s2, filters_lo, 2);
- sum = vmlaq_lane_s16(sum, s5, filters_hi, 1);
- sum = vmlaq_lane_s16(sum, s6, filters_hi, 2);
- sum = vmlaq_lane_s16(sum, s7, filters_hi, 3);
- sum = vqaddq_s16(sum, vmulq_s16(s3, filter3));
- sum = vqaddq_s16(sum, vmulq_s16(s4, filter4));
+ sum = vmulq_lane_s16(s0, filter_lo, 0);
+ sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
+ sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
+ sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
+ sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
+ sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
+ sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filter_lo, 3));
+ sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filter_hi, 0));
return vqrshrun_n_s16(sum, 7);
}
static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s,
- const int16x8_t filters) {
- const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
- const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
+ const int16x8_t filter) {
int16x8_t ss[8];
ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0]));
@@ -75,7 +69,7 @@
ss[7] = vreinterpretq_s16_u16(vmovl_u8(s[7]));
return convolve8_8(ss[0], ss[1], ss[2], ss[3], ss[4], ss[5], ss[6], ss[7],
- filters, filter3, filter4);
+ filter);
}
static INLINE uint8x8_t wiener_convolve8_vert_4x8(
@@ -93,20 +87,21 @@
const int32x4_t round_bits = vdupq_n_s32(-round1_bits);
const int32x4_t zero = vdupq_n_s32(0);
const int32x4_t round_vec = vdupq_n_s32(round_const);
+ const int16x4_t filter = vld1_s16(filter_y);
ss0 = vaddq_s16(s0, s6);
ss1 = vaddq_s16(s1, s5);
ss2 = vaddq_s16(s2, s4);
- sum0 = vmull_n_s16(vget_low_s16(ss0), filter_y[0]);
- sum0 = vmlal_n_s16(sum0, vget_low_s16(ss1), filter_y[1]);
- sum0 = vmlal_n_s16(sum0, vget_low_s16(ss2), filter_y[2]);
- sum0 = vmlal_n_s16(sum0, vget_low_s16(s3), filter_y[3]);
+ sum0 = vmull_lane_s16(vget_low_s16(ss0), filter, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(ss1), filter, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(ss2), filter, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter, 3);
- sum1 = vmull_n_s16(vget_high_s16(ss0), filter_y[0]);
- sum1 = vmlal_n_s16(sum1, vget_high_s16(ss1), filter_y[1]);
- sum1 = vmlal_n_s16(sum1, vget_high_s16(ss2), filter_y[2]);
- sum1 = vmlal_n_s16(sum1, vget_high_s16(s3), filter_y[3]);
+ sum1 = vmull_lane_s16(vget_high_s16(ss0), filter, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(ss1), filter, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(ss2), filter, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter, 3);
sum0 = vsubq_s32(sum0, round_vec);
sum1 = vsubq_s32(sum1, round_vec);
@@ -143,10 +138,11 @@
const int32x4_t round_vec_0 = vdupq_n_s32(round_const_0);
const int32x4_t round_vec_1 = vdupq_n_s32(round_const_1);
+ const int16x4_t filter = vld1_s16(filter_x);
- sum = vmulq_n_s16(s0, filter_x[0]);
- sum = vmlaq_n_s16(sum, s1, filter_x[1]);
- sum = vmlaq_n_s16(sum, s2, filter_x[2]);
+ sum = vmulq_lane_s16(s0, filter, 0);
+ sum = vmlaq_lane_s16(sum, s1, filter, 1);
+ sum = vmlaq_lane_s16(sum, s2, filter, 2);
/* sum from 16x8 to 2 32x4 registers */
sum_0 = vmovl_s16(vget_low_s16(sum));
@@ -156,8 +152,8 @@
* then max value possible = 128*128*255 exceeding 16 bit
*/
- s3_0 = vmull_n_s16(vget_low_s16(s3), filter_x[3]);
- s3_1 = vmull_n_s16(vget_high_s16(s3), filter_x[3]);
+ s3_0 = vmull_lane_s16(vget_low_s16(s3), filter, 3);
+ s3_1 = vmull_lane_s16(vget_high_s16(s3), filter, 3);
sum_0 = vaddq_s32(sum_0, s3_0);
sum_1 = vaddq_s32(sum_1, s3_1);
@@ -192,21 +188,22 @@
const int32x4_t zero = vdupq_n_s32(0);
const int32x4_t round_vec_0 = vdupq_n_s32(round_const_0);
const int32x4_t round_vec_1 = vdupq_n_s32(round_const_1);
+ const int16x4_t filter = vld1_s16(filter_x);
temp0 = vadd_s16(s0, s6);
temp1 = vadd_s16(s1, s5);
temp2 = vadd_s16(s2, s4);
- sum = vmul_n_s16(temp0, filter_x[0]);
- sum = vmla_n_s16(sum, temp1, filter_x[1]);
- sum = vmla_n_s16(sum, temp2, filter_x[2]);
+ sum = vmul_lane_s16(temp0, filter, 0);
+ sum = vmla_lane_s16(sum, temp1, filter, 1);
+ sum = vmla_lane_s16(sum, temp2, filter, 2);
sum_0 = vmovl_s16(sum);
/* s[3]*128 -- and filter coff max can be 128.
* then max value possible = 128*128*255 Therefore, 32 bits are required to
* hold the result.
*/
- s3_0 = vmull_n_s16(s3, filter_x[3]);
+ s3_0 = vmull_lane_s16(s3, filter, 3);
sum_0 = vaddq_s32(sum_0, s3_0);
sum_0 = vaddq_s32(sum_0, round_vec_0);
@@ -218,44 +215,48 @@
return res;
}
-static INLINE int16x8_t
-convolve8_8x8_s16(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
- const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
- const int16x8_t s6, const int16x8_t s7, const int16_t *filter,
- const int16x8_t horiz_const, const int16x8_t shift_round_0) {
+static INLINE int16x8_t convolve8_8x8_s16(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7, const int16x8_t filter,
+ const int16x8_t horiz_const, const int16x8_t shift_round_0) {
+ const int16x4_t filter_lo = vget_low_s16(filter);
+ const int16x4_t filter_hi = vget_high_s16(filter);
int16x8_t sum;
- int16x8_t res;
sum = horiz_const;
- sum = vmlaq_n_s16(sum, s0, filter[0]);
- sum = vmlaq_n_s16(sum, s1, filter[1]);
- sum = vmlaq_n_s16(sum, s2, filter[2]);
- sum = vmlaq_n_s16(sum, s3, filter[3]);
- sum = vmlaq_n_s16(sum, s4, filter[4]);
- sum = vmlaq_n_s16(sum, s5, filter[5]);
- sum = vmlaq_n_s16(sum, s6, filter[6]);
- sum = vmlaq_n_s16(sum, s7, filter[7]);
+ sum = vmlaq_lane_s16(sum, s0, filter_lo, 0);
+ sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
+ sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
+ sum = vmlaq_lane_s16(sum, s3, filter_lo, 3);
+ sum = vmlaq_lane_s16(sum, s4, filter_hi, 0);
+ sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
+ sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
+ sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
- res = vqrshlq_s16(sum, shift_round_0);
+ sum = vqrshlq_s16(sum, shift_round_0);
- return res;
+ return sum;
}
-static INLINE int16x4_t
-convolve8_4x4_s16(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
- const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
- const int16x4_t s6, const int16x4_t s7, const int16_t *filter,
- const int16x4_t horiz_const, const int16x4_t shift_round_0) {
+static INLINE int16x4_t convolve8_4x4_s16(
+ const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7, const int16x8_t filter,
+ const int16x4_t horiz_const, const int16x4_t shift_round_0) {
+ const int16x4_t filter_lo = vget_low_s16(filter);
+ const int16x4_t filter_hi = vget_high_s16(filter);
int16x4_t sum;
+
sum = horiz_const;
- sum = vmla_n_s16(sum, s0, filter[0]);
- sum = vmla_n_s16(sum, s1, filter[1]);
- sum = vmla_n_s16(sum, s2, filter[2]);
- sum = vmla_n_s16(sum, s3, filter[3]);
- sum = vmla_n_s16(sum, s4, filter[4]);
- sum = vmla_n_s16(sum, s5, filter[5]);
- sum = vmla_n_s16(sum, s6, filter[6]);
- sum = vmla_n_s16(sum, s7, filter[7]);
+ sum = vmla_lane_s16(sum, s0, filter_lo, 0);
+ sum = vmla_lane_s16(sum, s1, filter_lo, 1);
+ sum = vmla_lane_s16(sum, s2, filter_lo, 2);
+ sum = vmla_lane_s16(sum, s3, filter_lo, 3);
+ sum = vmla_lane_s16(sum, s4, filter_hi, 0);
+ sum = vmla_lane_s16(sum, s5, filter_hi, 1);
+ sum = vmla_lane_s16(sum, s6, filter_hi, 2);
+ sum = vmla_lane_s16(sum, s7, filter_hi, 3);
sum = vqrshl_s16(sum, shift_round_0);
@@ -265,20 +266,22 @@
static INLINE uint16x4_t convolve8_4x4_s32(
const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
- const int16x4_t s6, const int16x4_t s7, const int16_t *y_filter,
+ const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter,
const int32x4_t round_shift_vec, const int32x4_t offset_const) {
+ const int16x4_t y_filter_lo = vget_low_s16(y_filter);
+ const int16x4_t y_filter_hi = vget_high_s16(y_filter);
int32x4_t sum0;
uint16x4_t res;
const int32x4_t zero = vdupq_n_s32(0);
- sum0 = vmull_n_s16(s0, y_filter[0]);
- sum0 = vmlal_n_s16(sum0, s1, y_filter[1]);
- sum0 = vmlal_n_s16(sum0, s2, y_filter[2]);
- sum0 = vmlal_n_s16(sum0, s3, y_filter[3]);
- sum0 = vmlal_n_s16(sum0, s4, y_filter[4]);
- sum0 = vmlal_n_s16(sum0, s5, y_filter[5]);
- sum0 = vmlal_n_s16(sum0, s6, y_filter[6]);
- sum0 = vmlal_n_s16(sum0, s7, y_filter[7]);
+ sum0 = vmull_lane_s16(s0, y_filter_lo, 0);
+ sum0 = vmlal_lane_s16(sum0, s1, y_filter_lo, 1);
+ sum0 = vmlal_lane_s16(sum0, s2, y_filter_lo, 2);
+ sum0 = vmlal_lane_s16(sum0, s3, y_filter_lo, 3);
+ sum0 = vmlal_lane_s16(sum0, s4, y_filter_hi, 0);
+ sum0 = vmlal_lane_s16(sum0, s5, y_filter_hi, 1);
+ sum0 = vmlal_lane_s16(sum0, s6, y_filter_hi, 2);
+ sum0 = vmlal_lane_s16(sum0, s7, y_filter_hi, 3);
sum0 = vaddq_s32(sum0, offset_const);
sum0 = vqrshlq_s32(sum0, round_shift_vec);
diff --git a/av1/common/arm/jnt_convolve_neon.c b/av1/common/arm/jnt_convolve_neon.c
index e0b76a8..22caf83 100644
--- a/av1/common/arm/jnt_convolve_neon.c
+++ b/av1/common/arm/jnt_convolve_neon.c
@@ -317,7 +317,7 @@
static INLINE void dist_wtd_convolve_2d_horiz_neon(
const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride,
- int16_t *x_filter_tmp, const int im_h, int w, const int round_0) {
+ const int16x8_t x_filter, const int im_h, int w, const int round_0) {
const int bd = 8;
const uint8_t *s;
int16_t *dst_ptr;
@@ -380,13 +380,13 @@
s9 = vget_low_s16(tt2);
s10 = vget_low_s16(tt3);
- d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+ d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
horiz_const, shift_round_0);
- d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
+ d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
horiz_const, shift_round_0);
- d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
+ d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
horiz_const, shift_round_0);
- d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
+ d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
horiz_const, shift_round_0);
transpose_s16_4x4d(&d0, &d1, &d2, &d3);
@@ -418,7 +418,7 @@
s6 = vext_s16(s4, s7, 2); // a6 a7 a8 a9
s7 = vext_s16(s4, s7, 3); // a7 a8 a9 a10
- d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+ d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
horiz_const, shift_round_0);
vst1_s16(dst_ptr, d0);
@@ -483,22 +483,22 @@
s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
- res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+ res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
horiz_const, shift_round_0);
- res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
+ res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
horiz_const, shift_round_0);
- res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
+ res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
horiz_const, shift_round_0);
- res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
+ res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
horiz_const, shift_round_0);
- res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter_tmp,
+ res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
horiz_const, shift_round_0);
- res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12,
- x_filter_tmp, horiz_const, shift_round_0);
- res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13,
- x_filter_tmp, horiz_const, shift_round_0);
- res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14,
- x_filter_tmp, horiz_const, shift_round_0);
+ res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12, x_filter,
+ horiz_const, shift_round_0);
+ res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13, x_filter,
+ horiz_const, shift_round_0);
+ res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14, x_filter,
+ horiz_const, shift_round_0);
transpose_s16_8x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6,
&res7);
@@ -543,8 +543,8 @@
s6 = vextq_s16(temp_0, s7, 6); // a6 a7 a8 a9 a10 a11 a12 a13
s7 = vextq_s16(temp_0, s7, 7); // a7 a8 a9 a10 a11 a12 a13 a14
- res0 = convolve8_8x8_s16(temp_0, s1, s2, s3, s4, s5, s6, s7,
- x_filter_tmp, horiz_const, shift_round_0);
+ res0 = convolve8_8x8_s16(temp_0, s1, s2, s3, s4, s5, s6, s7, x_filter,
+ horiz_const, shift_round_0);
vst1q_s16(d_tmp, res0);
s += 8;
@@ -561,7 +561,7 @@
static INLINE void dist_wtd_convolve_2d_vert_neon(
int16_t *im_block, const int im_stride, uint8_t *dst8, int dst8_stride,
- ConvolveParams *conv_params, const int16_t *y_filter, int h, int w) {
+ ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) {
uint8_t *dst_u8_ptr, *d_u8;
CONV_BUF_TYPE *dst_ptr, *dst;
int16_t *src_ptr, *s;
@@ -731,21 +731,18 @@
const int horiz_offset = filter_params_x->taps / 2 - 1;
const int round_0 = conv_params->round_0 - 1;
const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
- const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
filter_params_x, subpel_x_qn & SUBPEL_MASK);
- const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
filter_params_y, subpel_y_qn & SUBPEL_MASK);
- int16_t x_filter_tmp[8];
- int16x8_t filter_x_coef = vld1q_s16(x_filter);
-
- // filter coeffs are even, so downshifting by 1 to reduce intermediate
- // precision requirements.
- filter_x_coef = vshrq_n_s16(filter_x_coef, 1);
- vst1q_s16(&x_filter_tmp[0], filter_x_coef);
+ // Filter values are even, so downshift by 1 to reduce intermediate precision
+ // requirements.
+ const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
+ const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
dist_wtd_convolve_2d_horiz_neon(src_ptr, src_stride, im_block, im_stride,
- x_filter_tmp, im_h, w, round_0);
+ x_filter, im_h, w, round_0);
dist_wtd_convolve_2d_vert_neon(im_block, im_stride, dst8, dst8_stride,
conv_params, y_filter, h, w);
@@ -892,18 +889,14 @@
const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
// horizontal filter
- const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
filter_params_x, subpel_x_qn & SUBPEL_MASK);
const uint8_t *src_ptr = src - horiz_offset;
- int16_t x_filter_tmp[8];
- int16x8_t filter_x_coef = vld1q_s16(x_filter);
-
- // filter coeffs are even, so downshifting by 1 to reduce intermediate
- // precision requirements.
- filter_x_coef = vshrq_n_s16(filter_x_coef, 1);
- vst1q_s16(&x_filter_tmp[0], filter_x_coef);
+ // Filter values are even, so downshift by 1 to reduce intermediate precision
+ // requirements.
+ const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
const uint8_t *s;
uint8_t *d_u8;
@@ -980,20 +973,20 @@
s9 = vget_high_s16(u0);
s10 = vget_high_s16(u1);
- d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
- zero, shift_round_0);
+ d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, zero,
+ shift_round_0);
d0 = vrshl_s16(d0, horiz_const);
d0 = vadd_s16(d0, round_offset_vec);
- d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
- zero, shift_round_0);
+ d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter, zero,
+ shift_round_0);
d1 = vrshl_s16(d1, horiz_const);
d1 = vadd_s16(d1, round_offset_vec);
- d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
- zero, shift_round_0);
+ d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter, zero,
+ shift_round_0);
d2 = vrshl_s16(d2, horiz_const);
d2 = vadd_s16(d2, round_offset_vec);
- d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
- zero, shift_round_0);
+ d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter, zero,
+ shift_round_0);
d3 = vrshl_s16(d3, horiz_const);
d3 = vadd_s16(d3, round_offset_vec);
@@ -1073,8 +1066,8 @@
s6 = vext_s16(s4, s7, 2); // a6 a7 a8 a9
s7 = vext_s16(s4, s7, 3); // a7 a8 a9 a10
- d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
- zero, shift_round_0);
+ d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, zero,
+ shift_round_0);
d0 = vrshl_s16(d0, horiz_const);
d0 = vadd_s16(d0, round_offset_vec);
s0 = s4;
@@ -1173,38 +1166,38 @@
s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
- res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
- zero, shift_round_0);
+ res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, zero,
+ shift_round_0);
res0 = vrshlq_s16(res0, horiz_const);
res0 = vaddq_s16(res0, round_offset128);
- res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
- zero, shift_round_0);
+ res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter, zero,
+ shift_round_0);
res1 = vrshlq_s16(res1, horiz_const);
res1 = vaddq_s16(res1, round_offset128);
- res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
- zero, shift_round_0);
+ res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter, zero,
+ shift_round_0);
res2 = vrshlq_s16(res2, horiz_const);
res2 = vaddq_s16(res2, round_offset128);
- res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
+ res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
zero, shift_round_0);
res3 = vrshlq_s16(res3, horiz_const);
res3 = vaddq_s16(res3, round_offset128);
- res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter_tmp,
+ res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
zero, shift_round_0);
res4 = vrshlq_s16(res4, horiz_const);
res4 = vaddq_s16(res4, round_offset128);
- res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12,
- x_filter_tmp, zero, shift_round_0);
+ res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12, x_filter,
+ zero, shift_round_0);
res5 = vrshlq_s16(res5, horiz_const);
res5 = vaddq_s16(res5, round_offset128);
- res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13,
- x_filter_tmp, zero, shift_round_0);
+ res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13, x_filter,
+ zero, shift_round_0);
res6 = vrshlq_s16(res6, horiz_const);
res6 = vaddq_s16(res6, round_offset128);
- res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14,
- x_filter_tmp, zero, shift_round_0);
+ res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14, x_filter,
+ zero, shift_round_0);
res7 = vrshlq_s16(res7, horiz_const);
res7 = vaddq_s16(res7, round_offset128);
@@ -1293,8 +1286,8 @@
s6 = vextq_s16(temp_0, s7, 6); // a6 a7 a8 a9 a10 a11 a12 a13
s7 = vextq_s16(temp_0, s7, 7); // a7 a8 a9 a10 a11 a12 a13 a14
- res0 = convolve8_8x8_s16(temp_0, s1, s2, s3, s4, s5, s6, s7,
- x_filter_tmp, zero, shift_round_0);
+ res0 = convolve8_8x8_s16(temp_0, s1, s2, s3, s4, s5, s6, s7, x_filter,
+ zero, shift_round_0);
res0 = vrshlq_s16(res0, horiz_const);
res0 = vaddq_s16(res0, round_offset128);
@@ -1352,18 +1345,14 @@
const int shift_value = (conv_params->round_1 - 1 - bits);
// vertical filter
- const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
filter_params_y, subpel_y_qn & SUBPEL_MASK);
const uint8_t *src_ptr = src - (vert_offset * src_stride);
- int16_t y_filter_tmp[8];
- int16x8_t filter_y_coef = vld1q_s16(y_filter);
-
- // filter coeffs are even, so downshifting by 1 to reduce intermediate
- // precision requirements.
- filter_y_coef = vshrq_n_s16(filter_y_coef, 1);
- vst1q_s16(&y_filter_tmp[0], filter_y_coef);
+ // Filter values are even, so downshift by 1 to reduce intermediate precision
+ // requirements.
+ const int16x8_t y_filter = vshrq_n_s16(vld1q_s16(y_filter_ptr), 1);
const uint8_t *s;
uint8_t *d_u8;
@@ -1441,17 +1430,17 @@
s9 = vget_low_s16(u1);
s10 = vget_high_s16(u1);
- d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_tmp,
- zero, shift_vec);
+ d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, zero,
+ shift_vec);
d0 = vadd_s16(d0, round_offset64);
- d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, y_filter_tmp,
- zero, shift_vec);
+ d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, zero,
+ shift_vec);
d1 = vadd_s16(d1, round_offset64);
- d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, y_filter_tmp,
- zero, shift_vec);
+ d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, zero,
+ shift_vec);
d2 = vadd_s16(d2, round_offset64);
- d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, y_filter_tmp,
- zero, shift_vec);
+ d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, zero,
+ shift_vec);
d3 = vadd_s16(d3, round_offset64);
if (conv_params->do_average) {
@@ -1504,8 +1493,8 @@
u0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0)));
s7 = vget_low_s16(u0);
- d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_tmp,
- zero, shift_vec);
+ d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, zero,
+ shift_vec);
d0 = vadd_s16(d0, round_offset64);
@@ -1602,29 +1591,29 @@
__builtin_prefetch(dst_ptr + 2 * dst_stride);
__builtin_prefetch(dst_ptr + 3 * dst_stride);
- res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_tmp,
- zero, shift_vec);
+ res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, zero,
+ shift_vec);
res0 = vaddq_s16(res0, round_offset128);
- res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, y_filter_tmp,
- zero, shift_vec);
+ res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, zero,
+ shift_vec);
res1 = vaddq_s16(res1, round_offset128);
- res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, y_filter_tmp,
- zero, shift_vec);
+ res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, zero,
+ shift_vec);
res2 = vaddq_s16(res2, round_offset128);
- res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, y_filter_tmp,
+ res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
zero, shift_vec);
res3 = vaddq_s16(res3, round_offset128);
- res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, y_filter_tmp,
+ res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, y_filter,
zero, shift_vec);
res4 = vaddq_s16(res4, round_offset128);
- res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12,
- y_filter_tmp, zero, shift_vec);
+ res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12, y_filter,
+ zero, shift_vec);
res5 = vaddq_s16(res5, round_offset128);
- res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13,
- y_filter_tmp, zero, shift_vec);
+ res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13, y_filter,
+ zero, shift_vec);
res6 = vaddq_s16(res6, round_offset128);
- res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14,
- y_filter_tmp, zero, shift_vec);
+ res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14, y_filter,
+ zero, shift_vec);
res7 = vaddq_s16(res7, round_offset128);
if (conv_params->do_average) {
@@ -1682,8 +1671,8 @@
__builtin_prefetch(dst_ptr);
- res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_tmp,
- zero, shift_vec);
+ res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, zero,
+ shift_vec);
res0 = vaddq_s16(res0, round_offset128);
s0 = s1;