Specialize HBD Neon Wiener convolution horiz. pass by bitdepth
The narrowing shift values used in the horizontal pass of Wiener
convolution differ depending on the bitdepth. Since we can eliminate
2 relatively expensive instructions from the inner loop of the
convolution kernel if we supply compile-time constants, specialize
the path by bitdepth. (Bitdepths 8 and 10 use the same shift values
so we only actually need one extra path.)
Change-Id: Ie36e028e62ff78d3901b7d13138dc5a61ab3534b
diff --git a/av1/common/arm/highbd_wiener_convolve_neon.c b/av1/common/arm/highbd_wiener_convolve_neon.c
index 4cec1b2..aeb4cda 100644
--- a/av1/common/arm/highbd_wiener_convolve_neon.c
+++ b/av1/common/arm/highbd_wiener_convolve_neon.c
@@ -17,61 +17,66 @@
#include "config/aom_config.h"
#include "config/av1_rtcd.h"
-static INLINE uint16x8_t highbd_wiener_convolve7_8_2d_h(
- const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
- const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
- const int16x8_t s6, const int16x4_t x_filter, const int32x4_t round_vec,
- const int32x4_t shift, const uint16x8_t im_max_val) {
- // Since the Wiener filter is symmetric about the middle tap (tap 3) add
- // mirrored source elements before multiplying by filter coefficients.
- int16x8_t s06 = vaddq_s16(s0, s6);
- int16x8_t s15 = vaddq_s16(s1, s5);
- int16x8_t s24 = vaddq_s16(s2, s4);
+#define HBD_WIENER_7TAP_HORIZ(name, shift) \
+ static INLINE uint16x8_t name##_wiener_convolve7_8_2d_h( \
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, \
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, \
+ const int16x8_t s6, const int16x4_t x_filter, const int32x4_t round_vec, \
+ const uint16x8_t im_max_val) { \
+ /* Wiener filter is symmetric so add mirrored source elements. */ \
+ int16x8_t s06 = vaddq_s16(s0, s6); \
+ int16x8_t s15 = vaddq_s16(s1, s5); \
+ int16x8_t s24 = vaddq_s16(s2, s4); \
+ \
+ int32x4_t sum_lo = \
+ vmlal_lane_s16(round_vec, vget_low_s16(s06), x_filter, 0); \
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s15), x_filter, 1); \
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s24), x_filter, 2); \
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s3), x_filter, 3); \
+ \
+ int32x4_t sum_hi = \
+ vmlal_lane_s16(round_vec, vget_high_s16(s06), x_filter, 0); \
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s15), x_filter, 1); \
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s24), x_filter, 2); \
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s3), x_filter, 3); \
+ \
+ uint16x4_t res_lo = vqrshrun_n_s32(sum_lo, shift); \
+ uint16x4_t res_hi = vqrshrun_n_s32(sum_hi, shift); \
+ \
+ return vminq_u16(vcombine_u16(res_lo, res_hi), im_max_val); \
+ } \
+ \
+ static INLINE void name##_convolve_add_src_horiz_hip( \
+ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, \
+ ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter, \
+ const int32x4_t round_vec, const uint16x8_t im_max_val) { \
+ do { \
+ const int16_t *s = (int16_t *)src_ptr; \
+ uint16_t *d = dst_ptr; \
+ int width = w; \
+ \
+ do { \
+ int16x8_t s0, s1, s2, s3, s4, s5, s6; \
+ load_s16_8x7(s, 1, &s0, &s1, &s2, &s3, &s4, &s5, &s6); \
+ \
+ uint16x8_t d0 = name##_wiener_convolve7_8_2d_h( \
+ s0, s1, s2, s3, s4, s5, s6, x_filter, round_vec, im_max_val); \
+ \
+ vst1q_u16(d, d0); \
+ \
+ s += 8; \
+ d += 8; \
+ width -= 8; \
+ } while (width != 0); \
+ src_ptr += src_stride; \
+ dst_ptr += dst_stride; \
+ } while (--h != 0); \
+ }
- int32x4_t sum_lo = vmlal_lane_s16(round_vec, vget_low_s16(s06), x_filter, 0);
- sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s15), x_filter, 1);
- sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s24), x_filter, 2);
- sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s3), x_filter, 3);
+HBD_WIENER_7TAP_HORIZ(highbd, WIENER_ROUND0_BITS)
+HBD_WIENER_7TAP_HORIZ(highbd_12, WIENER_ROUND0_BITS + 2)
- int32x4_t sum_hi = vmlal_lane_s16(round_vec, vget_high_s16(s06), x_filter, 0);
- sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s15), x_filter, 1);
- sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s24), x_filter, 2);
- sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s3), x_filter, 3);
-
- sum_lo = vqrshlq_s32(sum_lo, shift);
- sum_hi = vqrshlq_s32(sum_hi, shift);
-
- uint16x8_t res = vcombine_u16(vqmovun_s32(sum_lo), vqmovun_s32(sum_hi));
- return vminq_u16(res, im_max_val);
-}
-
-static INLINE void highbd_convolve_add_src_horiz_hip(
- const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
- ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter,
- const int32x4_t round_vec, const int32x4_t shift,
- const uint16x8_t im_max_val) {
- do {
- const int16_t *s = (int16_t *)src_ptr;
- uint16_t *d = dst_ptr;
- int width = w;
-
- do {
- int16x8_t s0, s1, s2, s3, s4, s5, s6;
- load_s16_8x7(s, 1, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
-
- uint16x8_t d0 = highbd_wiener_convolve7_8_2d_h(
- s0, s1, s2, s3, s4, s5, s6, x_filter, round_vec, shift, im_max_val);
-
- vst1q_u16(d, d0);
-
- s += 8;
- d += 8;
- width -= 8;
- } while (width != 0);
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- } while (--h != 0);
-}
+#undef HBD_WIENER_7TAP_HORIZ
static INLINE uint16x8_t highbd_wiener_convolve7_8_2d_v(
const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
@@ -187,7 +192,6 @@
const int extraprec_clamp_limit =
WIENER_CLAMP_LIMIT(conv_params->round_0, bd);
const uint16x8_t im_max_val = vdupq_n_u16(extraprec_clamp_limit - 1);
- const int32x4_t horiz_shift = vdupq_n_s32(-conv_params->round_0);
const int32x4_t horiz_round_vec = vdupq_n_s32(1 << (bd + FILTER_BITS - 1));
const uint16x8_t res_max_val = vdupq_n_u16((1 << bd) - 1);
@@ -198,9 +202,16 @@
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
- highbd_convolve_add_src_horiz_hip(
- src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w,
- im_h, x_filter_s16, horiz_round_vec, horiz_shift, im_max_val);
+ if (bd == 12) {
+ highbd_12_convolve_add_src_horiz_hip(
+ src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w,
+ im_h, x_filter_s16, horiz_round_vec, im_max_val);
+ } else {
+ highbd_convolve_add_src_horiz_hip(
+ src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w,
+ im_h, x_filter_s16, horiz_round_vec, im_max_val);
+ }
+
highbd_convolve_add_src_vert_hip(im_block, im_stride, dst, dst_stride, w, h,
y_filter_s16, vert_round_vec, vert_shift,
res_max_val);