Specialize HBD Neon Wiener convolution horiz. pass by bitdepth The narrowing shift values used in the horizontal pass of Wiener convolution differ depending on the bitdepth. Since we can eliminate 2 relatively expensive instructions from the inner loop of the convolution kernel if we supply compile-time constants, specialize the path by bitdepth. (Bitdpeths 8 and 10 use the same shift values so we only actually need one extra path.) Change-Id: Ie36e028e62ff78d3901b7d13138dc5a61ab3534b
diff --git a/av1/common/arm/highbd_wiener_convolve_neon.c b/av1/common/arm/highbd_wiener_convolve_neon.c index 4cec1b2..aeb4cda 100644 --- a/av1/common/arm/highbd_wiener_convolve_neon.c +++ b/av1/common/arm/highbd_wiener_convolve_neon.c
@@ -17,61 +17,66 @@ #include "config/aom_config.h" #include "config/av1_rtcd.h" -static INLINE uint16x8_t highbd_wiener_convolve7_8_2d_h( - const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, - const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, - const int16x8_t s6, const int16x4_t x_filter, const int32x4_t round_vec, - const int32x4_t shift, const uint16x8_t im_max_val) { - // Since the Wiener filter is symmetric about the middle tap (tap 3) add - // mirrored source elements before multiplying by filter coefficients. - int16x8_t s06 = vaddq_s16(s0, s6); - int16x8_t s15 = vaddq_s16(s1, s5); - int16x8_t s24 = vaddq_s16(s2, s4); +#define HBD_WIENER_7TAP_HORIZ(name, shift) \ + static INLINE uint16x8_t name##_wiener_convolve7_8_2d_h( \ + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, \ + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, \ + const int16x8_t s6, const int16x4_t x_filter, const int32x4_t round_vec, \ + const uint16x8_t im_max_val) { \ + /* Wiener filter is symmetric so add mirrored source elements. */ \ + int16x8_t s06 = vaddq_s16(s0, s6); \ + int16x8_t s15 = vaddq_s16(s1, s5); \ + int16x8_t s24 = vaddq_s16(s2, s4); \ + \ + int32x4_t sum_lo = \ + vmlal_lane_s16(round_vec, vget_low_s16(s06), x_filter, 0); \ + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s15), x_filter, 1); \ + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s24), x_filter, 2); \ + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s3), x_filter, 3); \ + \ + int32x4_t sum_hi = \ + vmlal_lane_s16(round_vec, vget_high_s16(s06), x_filter, 0); \ + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s15), x_filter, 1); \ + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s24), x_filter, 2); \ + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s3), x_filter, 3); \ + \ + uint16x4_t res_lo = vqrshrun_n_s32(sum_lo, shift); \ + uint16x4_t res_hi = vqrshrun_n_s32(sum_hi, shift); \ + \ + return vminq_u16(vcombine_u16(res_lo, res_hi), im_max_val); \ + } \ + \ + static INLINE void name##_convolve_add_src_horiz_hip( \ + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, \ + ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter, \ + const int32x4_t round_vec, const uint16x8_t im_max_val) { \ + do { \ + const int16_t *s = (int16_t *)src_ptr; \ + uint16_t *d = dst_ptr; \ + int width = w; \ + \ + do { \ + int16x8_t s0, s1, s2, s3, s4, s5, s6; \ + load_s16_8x7(s, 1, &s0, &s1, &s2, &s3, &s4, &s5, &s6); \ + \ + uint16x8_t d0 = name##_wiener_convolve7_8_2d_h( \ + s0, s1, s2, s3, s4, s5, s6, x_filter, round_vec, im_max_val); \ + \ + vst1q_u16(d, d0); \ + \ + s += 8; \ + d += 8; \ + width -= 8; \ + } while (width != 0); \ + src_ptr += src_stride; \ + dst_ptr += dst_stride; \ + } while (--h != 0); \ + } - int32x4_t sum_lo = vmlal_lane_s16(round_vec, vget_low_s16(s06), x_filter, 0); - sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s15), x_filter, 1); - sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s24), x_filter, 2); - sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s3), x_filter, 3); +HBD_WIENER_7TAP_HORIZ(highbd, WIENER_ROUND0_BITS) +HBD_WIENER_7TAP_HORIZ(highbd_12, WIENER_ROUND0_BITS + 2) - int32x4_t sum_hi = vmlal_lane_s16(round_vec, vget_high_s16(s06), x_filter, 0); - sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s15), x_filter, 1); - sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s24), x_filter, 2); - sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s3), x_filter, 3); - - sum_lo = vqrshlq_s32(sum_lo, shift); - sum_hi = vqrshlq_s32(sum_hi, shift); - - uint16x8_t res = vcombine_u16(vqmovun_s32(sum_lo), vqmovun_s32(sum_hi)); - return vminq_u16(res, im_max_val); -} - -static INLINE void highbd_convolve_add_src_horiz_hip( - const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, - ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter, - const int32x4_t round_vec, const int32x4_t shift, - const uint16x8_t im_max_val) { - do { - const int16_t *s = (int16_t *)src_ptr; - uint16_t *d = dst_ptr; - int width = w; - - do { - int16x8_t s0, s1, s2, s3, s4, s5, s6; - load_s16_8x7(s, 1, &s0, &s1, &s2, &s3, &s4, &s5, &s6); - - uint16x8_t d0 = highbd_wiener_convolve7_8_2d_h( - s0, s1, s2, s3, s4, s5, s6, x_filter, round_vec, shift, im_max_val); - - vst1q_u16(d, d0); - - s += 8; - d += 8; - width -= 8; - } while (width != 0); - src_ptr += src_stride; - dst_ptr += dst_stride; - } while (--h != 0); -} +#undef HBD_WIENER_7TAP_HORIZ static INLINE uint16x8_t highbd_wiener_convolve7_8_2d_v( const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, @@ -187,7 +192,6 @@ const int extraprec_clamp_limit = WIENER_CLAMP_LIMIT(conv_params->round_0, bd); const uint16x8_t im_max_val = vdupq_n_u16(extraprec_clamp_limit - 1); - const int32x4_t horiz_shift = vdupq_n_s32(-conv_params->round_0); const int32x4_t horiz_round_vec = vdupq_n_s32(1 << (bd + FILTER_BITS - 1)); const uint16x8_t res_max_val = vdupq_n_u16((1 << bd) - 1); @@ -198,9 +202,16 @@ uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - highbd_convolve_add_src_horiz_hip( - src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w, - im_h, x_filter_s16, horiz_round_vec, horiz_shift, im_max_val); + if (bd == 12) { + highbd_12_convolve_add_src_horiz_hip( + src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w, + im_h, x_filter_s16, horiz_round_vec, im_max_val); + } else { + highbd_convolve_add_src_horiz_hip( + src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w, + im_h, x_filter_s16, horiz_round_vec, im_max_val); + } + highbd_convolve_add_src_vert_hip(im_block, im_stride, dst, dst_stride, w, h, y_filter_s16, vert_round_vec, vert_shift, res_max_val);