Specialize HBD Neon Wiener convolution vert. pass by bitdepth The narrowing shift values used in the vertical pass of Wiener convolution differ depending on the bitdepth. Since we can eliminate 2 relatively expensive instructions from the inner loop of the convolution kernel if we supply compile-time constants, specialize the path by bitdepth. (Bitdepths 8 and 10 use the same shift values so we only actually need one extra path.) Change-Id: Iad02a42821bdb1324a10e8c0e7b41af280ccdecf
diff --git a/av1/common/arm/highbd_wiener_convolve_neon.c b/av1/common/arm/highbd_wiener_convolve_neon.c index aeb4cda..da1af97 100644 --- a/av1/common/arm/highbd_wiener_convolve_neon.c +++ b/av1/common/arm/highbd_wiener_convolve_neon.c
@@ -78,89 +78,94 @@ #undef HBD_WIENER_7TAP_HORIZ -static INLINE uint16x8_t highbd_wiener_convolve7_8_2d_v( - const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, - const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, - const int16x8_t s6, const int16x4_t y_filter, const int32x4_t round_vec, - const int32x4_t shift, const uint16x8_t res_max_val) { - const int32x2_t y_filter_lo = vget_low_s32(vmovl_s16(y_filter)); - const int32x2_t y_filter_hi = vget_high_s32(vmovl_s16(y_filter)); - // Since the Wiener filter is symmetric about the middle tap (tap 3) add - // mirrored source elements before multiplying by filter coefficients. - int32x4_t s06_lo = vaddl_s16(vget_low_s16(s0), vget_low_s16(s6)); - int32x4_t s15_lo = vaddl_s16(vget_low_s16(s1), vget_low_s16(s5)); - int32x4_t s24_lo = vaddl_s16(vget_low_s16(s2), vget_low_s16(s4)); +#define HBD_WIENER_7TAP_VERT(name, shift) \ + static INLINE uint16x8_t name##_wiener_convolve7_8_2d_v( \ + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, \ + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, \ + const int16x8_t s6, const int16x4_t y_filter, const int32x4_t round_vec, \ + const uint16x8_t res_max_val) { \ + const int32x2_t y_filter_lo = vget_low_s32(vmovl_s16(y_filter)); \ + const int32x2_t y_filter_hi = vget_high_s32(vmovl_s16(y_filter)); \ + /* Wiener filter is symmetric so add mirrored source elements. 
*/ \ + int32x4_t s06_lo = vaddl_s16(vget_low_s16(s0), vget_low_s16(s6)); \ + int32x4_t s15_lo = vaddl_s16(vget_low_s16(s1), vget_low_s16(s5)); \ + int32x4_t s24_lo = vaddl_s16(vget_low_s16(s2), vget_low_s16(s4)); \ + \ + int32x4_t sum_lo = vmlaq_lane_s32(round_vec, s06_lo, y_filter_lo, 0); \ + sum_lo = vmlaq_lane_s32(sum_lo, s15_lo, y_filter_lo, 1); \ + sum_lo = vmlaq_lane_s32(sum_lo, s24_lo, y_filter_hi, 0); \ + sum_lo = \ + vmlaq_lane_s32(sum_lo, vmovl_s16(vget_low_s16(s3)), y_filter_hi, 1); \ + \ + int32x4_t s06_hi = vaddl_s16(vget_high_s16(s0), vget_high_s16(s6)); \ + int32x4_t s15_hi = vaddl_s16(vget_high_s16(s1), vget_high_s16(s5)); \ + int32x4_t s24_hi = vaddl_s16(vget_high_s16(s2), vget_high_s16(s4)); \ + \ + int32x4_t sum_hi = vmlaq_lane_s32(round_vec, s06_hi, y_filter_lo, 0); \ + sum_hi = vmlaq_lane_s32(sum_hi, s15_hi, y_filter_lo, 1); \ + sum_hi = vmlaq_lane_s32(sum_hi, s24_hi, y_filter_hi, 0); \ + sum_hi = \ + vmlaq_lane_s32(sum_hi, vmovl_s16(vget_high_s16(s3)), y_filter_hi, 1); \ + \ + uint16x4_t res_lo = vqrshrun_n_s32(sum_lo, shift); \ + uint16x4_t res_hi = vqrshrun_n_s32(sum_hi, shift); \ + \ + return vminq_u16(vcombine_u16(res_lo, res_hi), res_max_val); \ + } \ + \ + static INLINE void name##_convolve_add_src_vert_hip( \ + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, \ + ptrdiff_t dst_stride, int w, int h, const int16x4_t y_filter, \ + const int32x4_t round_vec, const uint16x8_t res_max_val) { \ + do { \ + const int16_t *s = (int16_t *)src_ptr; \ + uint16_t *d = dst_ptr; \ + int height = h; \ + \ + while (height > 3) { \ + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9; \ + load_s16_8x10(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, \ + &s8, &s9); \ + \ + uint16x8_t d0 = name##_wiener_convolve7_8_2d_v( \ + s0, s1, s2, s3, s4, s5, s6, y_filter, round_vec, res_max_val); \ + uint16x8_t d1 = name##_wiener_convolve7_8_2d_v( \ + s1, s2, s3, s4, s5, s6, s7, y_filter, round_vec, res_max_val); \ + uint16x8_t d2 = 
name##_wiener_convolve7_8_2d_v( \ + s2, s3, s4, s5, s6, s7, s8, y_filter, round_vec, res_max_val); \ + uint16x8_t d3 = name##_wiener_convolve7_8_2d_v( \ + s3, s4, s5, s6, s7, s8, s9, y_filter, round_vec, res_max_val); \ + \ + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); \ + \ + s += 4 * src_stride; \ + d += 4 * dst_stride; \ + height -= 4; \ + } \ + \ + while (height-- != 0) { \ + int16x8_t s0, s1, s2, s3, s4, s5, s6; \ + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); \ + \ + uint16x8_t d0 = name##_wiener_convolve7_8_2d_v( \ + s0, s1, s2, s3, s4, s5, s6, y_filter, round_vec, res_max_val); \ + \ + vst1q_u16(d, d0); \ + \ + s += src_stride; \ + d += dst_stride; \ + } \ + \ + src_ptr += 8; \ + dst_ptr += 8; \ + w -= 8; \ + } while (w != 0); \ + } - int32x4_t sum_lo = vmlaq_lane_s32(round_vec, s06_lo, y_filter_lo, 0); - sum_lo = vmlaq_lane_s32(sum_lo, s15_lo, y_filter_lo, 1); - sum_lo = vmlaq_lane_s32(sum_lo, s24_lo, y_filter_hi, 0); - sum_lo = vmlaq_lane_s32(sum_lo, vmovl_s16(vget_low_s16(s3)), y_filter_hi, 1); +HBD_WIENER_7TAP_VERT(highbd, 2 * FILTER_BITS - WIENER_ROUND0_BITS) +HBD_WIENER_7TAP_VERT(highbd_12, 2 * FILTER_BITS - WIENER_ROUND0_BITS - 2) - int32x4_t s06_hi = vaddl_s16(vget_high_s16(s0), vget_high_s16(s6)); - int32x4_t s15_hi = vaddl_s16(vget_high_s16(s1), vget_high_s16(s5)); - int32x4_t s24_hi = vaddl_s16(vget_high_s16(s2), vget_high_s16(s4)); - - int32x4_t sum_hi = vmlaq_lane_s32(round_vec, s06_hi, y_filter_lo, 0); - sum_hi = vmlaq_lane_s32(sum_hi, s15_hi, y_filter_lo, 1); - sum_hi = vmlaq_lane_s32(sum_hi, s24_hi, y_filter_hi, 0); - sum_hi = vmlaq_lane_s32(sum_hi, vmovl_s16(vget_high_s16(s3)), y_filter_hi, 1); - - sum_lo = vqrshlq_s32(sum_lo, shift); - sum_hi = vqrshlq_s32(sum_hi, shift); - - uint16x8_t res = vcombine_u16(vqmovun_s32(sum_lo), vqmovun_s32(sum_hi)); - return vminq_u16(res, res_max_val); -} - -static INLINE void highbd_convolve_add_src_vert_hip( - const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, - 
ptrdiff_t dst_stride, int w, int h, const int16x4_t y_filter, - const int32x4_t round_vec, const int32x4_t shift, - const uint16x8_t res_max_val) { - do { - const int16_t *s = (int16_t *)src_ptr; - uint16_t *d = dst_ptr; - int height = h; - - while (height > 3) { - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9; - load_s16_8x10(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, - &s9); - - uint16x8_t d0 = highbd_wiener_convolve7_8_2d_v( - s0, s1, s2, s3, s4, s5, s6, y_filter, round_vec, shift, res_max_val); - uint16x8_t d1 = highbd_wiener_convolve7_8_2d_v( - s1, s2, s3, s4, s5, s6, s7, y_filter, round_vec, shift, res_max_val); - uint16x8_t d2 = highbd_wiener_convolve7_8_2d_v( - s2, s3, s4, s5, s6, s7, s8, y_filter, round_vec, shift, res_max_val); - uint16x8_t d3 = highbd_wiener_convolve7_8_2d_v( - s3, s4, s5, s6, s7, s8, s9, y_filter, round_vec, shift, res_max_val); - - store_u16_8x4(d, dst_stride, d0, d1, d2, d3); - - s += 4 * src_stride; - d += 4 * dst_stride; - height -= 4; - } - - while (height-- != 0) { - int16x8_t s0, s1, s2, s3, s4, s5, s6; - load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); - - uint16x8_t d0 = highbd_wiener_convolve7_8_2d_v( - s0, s1, s2, s3, s4, s5, s6, y_filter, round_vec, shift, res_max_val); - - vst1q_u16(d, d0); - - s += src_stride; - d += dst_stride; - } - - src_ptr += 8; - dst_ptr += 8; - w -= 8; - } while (w != 0); -} +#undef HBD_WIENER_7TAP_VERT void av1_highbd_wiener_convolve_add_src_neon( const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, @@ -195,7 +200,6 @@ const int32x4_t horiz_round_vec = vdupq_n_s32(1 << (bd + FILTER_BITS - 1)); const uint16x8_t res_max_val = vdupq_n_u16((1 << bd) - 1); - const int32x4_t vert_shift = vdupq_n_s32(-conv_params->round_1); const int32x4_t vert_round_vec = vdupq_n_s32(-(1 << (bd + conv_params->round_1 - 1))); @@ -206,13 +210,14 @@ highbd_12_convolve_add_src_horiz_hip( src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w, im_h, x_filter_s16, 
horiz_round_vec, im_max_val); + highbd_12_convolve_add_src_vert_hip(im_block, im_stride, dst, dst_stride, w, + h, y_filter_s16, vert_round_vec, + res_max_val); } else { highbd_convolve_add_src_horiz_hip( src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w, im_h, x_filter_s16, horiz_round_vec, im_max_val); + highbd_convolve_add_src_vert_hip(im_block, im_stride, dst, dst_stride, w, h, + y_filter_s16, vert_round_vec, res_max_val); } - - highbd_convolve_add_src_vert_hip(im_block, im_stride, dst, dst_stride, w, h, - y_filter_s16, vert_round_vec, vert_shift, - res_max_val); }