Optimize HBD Neon horizontal Wiener convolve for 5-tap filters
Wiener convolution filters have either 5 or 7 taps. The current Neon
implementation zero-pads 5-tap filters and runs them through the 7-tap
path, wasting multiply-accumulate work on the zero outer taps. This
patch adds a Neon path specialised for 5-tap horizontal Wiener filters
and dispatches to it whenever both outer taps of the x filter are zero.
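
For reference, each output sample of the new 5-tap path reduces to the
following scalar model (a minimal sketch: the function name and the
explicit offset/shift/clamp parameters are illustrative stand-ins for
the round_vec, vqrshrun_n_s32 and vminq_u16 steps in the Neon kernel):

  #include <stdint.h>

  // Scalar model of one 5-tap horizontal Wiener output sample.
  // `filter` uses the zero-padded 7-tap layout, so only taps 1..3
  // contribute and the kernel is symmetric about filter[3].
  static uint16_t wiener5_h_ref(const int16_t *s, const int16_t *filter,
                                int32_t offset, int shift,
                                uint16_t max_val) {
    // Symmetry: mirrored samples share a tap, halving the multiplies.
    // This is the s04/s13 pairing in the Neon kernel.
    int32_t sum = offset;
    sum += (s[0] + s[4]) * filter[1];
    sum += (s[1] + s[3]) * filter[2];
    sum += s[2] * filter[3];
    // Rounding right shift, then clamp to the intermediate range.
    sum = (sum + (1 << (shift - 1))) >> shift;
    if (sum < 0) sum = 0;
    if (sum > max_val) sum = max_val;
    return (uint16_t)sum;
  }
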
Change-Id: I3bfd27f3a63956ccd9558095cb829736f31ccff4
diff --git a/av1/common/arm/highbd_wiener_convolve_neon.c b/av1/common/arm/highbd_wiener_convolve_neon.c
index da1af97..588b4f8 100644
--- a/av1/common/arm/highbd_wiener_convolve_neon.c
+++ b/av1/common/arm/highbd_wiener_convolve_neon.c
@@ -17,6 +17,64 @@
#include "config/aom_config.h"
#include "config/av1_rtcd.h"
+#define HBD_WIENER_5TAP_HORIZ(name, shift) \
+ static INLINE uint16x8_t name##_wiener_convolve5_8_2d_h( \
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, \
+ const int16x8_t s3, const int16x8_t s4, const int16x4_t x_filter, \
+ const int32x4_t round_vec, const uint16x8_t im_max_val) { \
+ /* Wiener filter is symmetric so add mirrored source elements. */ \
+ int16x8_t s04 = vaddq_s16(s0, s4); \
+ int16x8_t s13 = vaddq_s16(s1, s3); \
+ \
+ /* x_filter[0] = 0. (5-tap filters are 0-padded to 7 taps.) */ \
+ int32x4_t sum_lo = \
+ vmlal_lane_s16(round_vec, vget_low_s16(s04), x_filter, 1); \
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s13), x_filter, 2); \
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s2), x_filter, 3); \
+ \
+ int32x4_t sum_hi = \
+ vmlal_lane_s16(round_vec, vget_high_s16(s04), x_filter, 1); \
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s13), x_filter, 2); \
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s2), x_filter, 3); \
+ \
+ uint16x4_t res_lo = vqrshrun_n_s32(sum_lo, shift); \
+ uint16x4_t res_hi = vqrshrun_n_s32(sum_hi, shift); \
+ \
+ return vminq_u16(vcombine_u16(res_lo, res_hi), im_max_val); \
+ } \
+ \
+ static INLINE void name##_convolve_add_src_5tap_horiz( \
+ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, \
+ ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter, \
+ const int32x4_t round_vec, const uint16x8_t im_max_val) { \
+ do { \
+ const int16_t *s = (int16_t *)src_ptr; \
+ uint16_t *d = dst_ptr; \
+ int width = w; \
+ \
+ do { \
+ int16x8_t s0, s1, s2, s3, s4; \
+ load_s16_8x5(s, 1, &s0, &s1, &s2, &s3, &s4); \
+ \
+ uint16x8_t d0 = name##_wiener_convolve5_8_2d_h( \
+ s0, s1, s2, s3, s4, x_filter, round_vec, im_max_val); \
+ \
+ vst1q_u16(d, d0); \
+ \
+ s += 8; \
+ d += 8; \
+ width -= 8; \
+ } while (width != 0); \
+ src_ptr += src_stride; \
+ dst_ptr += dst_stride; \
+ } while (--h != 0); \
+ }
+
+HBD_WIENER_5TAP_HORIZ(highbd, WIENER_ROUND0_BITS)
+HBD_WIENER_5TAP_HORIZ(highbd_12, WIENER_ROUND0_BITS + 2)
+
+#undef HBD_WIENER_5TAP_HORIZ
+
#define HBD_WIENER_7TAP_HORIZ(name, shift) \
static INLINE uint16x8_t name##_wiener_convolve7_8_2d_h( \
const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, \
@@ -46,7 +104,7 @@
return vminq_u16(vcombine_u16(res_lo, res_hi), im_max_val); \
} \
\
- static INLINE void name##_convolve_add_src_horiz_hip( \
+ static INLINE void name##_convolve_add_src_7tap_horiz( \
const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, \
ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter, \
const int32x4_t round_vec, const uint16x8_t im_max_val) { \
@@ -167,6 +225,16 @@
#undef HBD_WIENER_7TAP_VERT
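+// Wiener filter kernels are padded to 8 elements with a zero final tap.
+// Reduced 5-tap kernels additionally zero the two outer taps.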
+static AOM_INLINE int get_wiener_filter_taps(const int16_t *filter) {
+ assert(filter[7] == 0);
+ if (filter[0] == 0 && filter[6] == 0) {
+ return WIENER_WIN_REDUCED;
+ }
+ return WIENER_WIN;
+}
+
void av1_highbd_wiener_convolve_add_src_neon(
const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8,
ptrdiff_t dst_stride, const int16_t *x_filter, int x_step_q4,
@@ -183,6 +251,7 @@
DECLARE_ALIGNED(16, uint16_t,
im_block[(MAX_SB_SIZE + WIENER_WIN - 1) * MAX_SB_SIZE]);
+ const int x_filter_taps = get_wiener_filter_taps(x_filter);
int16x4_t x_filter_s16 = vld1_s16(x_filter);
int16x4_t y_filter_s16 = vld1_s16(y_filter);
// Add 128 to tap 3. (Needed for rounding.)
@@ -191,7 +260,7 @@
const int im_stride = MAX_SB_SIZE;
const int im_h = h + WIENER_WIN - 1;
- const int horiz_offset = WIENER_HALFWIN;
+ const int horiz_offset = x_filter_taps / 2;
const int vert_offset = WIENER_HALFWIN * (int)src_stride;
const int extraprec_clamp_limit =
@@ -207,16 +276,30 @@
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
if (bd == 12) {
- highbd_12_convolve_add_src_horiz_hip(
- src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w,
- im_h, x_filter_s16, horiz_round_vec, im_max_val);
+ if (x_filter_taps == WIENER_WIN_REDUCED) {
+ highbd_12_convolve_add_src_5tap_horiz(
+ src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w,
+ im_h, x_filter_s16, horiz_round_vec, im_max_val);
+ } else {
+ highbd_12_convolve_add_src_7tap_horiz(
+ src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w,
+ im_h, x_filter_s16, horiz_round_vec, im_max_val);
+ }
+
highbd_12_convolve_add_src_vert_hip(im_block, im_stride, dst, dst_stride, w,
h, y_filter_s16, vert_round_vec,
res_max_val);
} else {
- highbd_convolve_add_src_horiz_hip(
- src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w,
- im_h, x_filter_s16, horiz_round_vec, im_max_val);
+ if (x_filter_taps == WIENER_WIN_REDUCED) {
+ highbd_convolve_add_src_5tap_horiz(
+ src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w,
+ im_h, x_filter_s16, horiz_round_vec, im_max_val);
+ } else {
+ highbd_convolve_add_src_7tap_horiz(
+ src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w,
+ im_h, x_filter_s16, horiz_round_vec, im_max_val);
+ }
+
highbd_convolve_add_src_vert_hip(im_block, im_stride, dst, dst_stride, w, h,
y_filter_s16, vert_round_vec, res_max_val);
}