Refactor and optimize HBD Neon Wiener convolution
Refactor the high bitdepth Neon path for Wiener convolution. The
biggest change is removing a needless gather-load and subsequent
transpose. Additionally make use of the fact that Wiener filters are
symmetrical, adding mirrored source elements to reduce the number of
multiply-accumulate instructions.
Change-Id: Ifb48f14baca2dd31d2b8bab602980e0f2329f1c5
diff --git a/av1/common/arm/highbd_convolve_neon.h b/av1/common/arm/highbd_convolve_neon.h
index b534358..08b2bda 100644
--- a/av1/common/arm/highbd_convolve_neon.h
+++ b/av1/common/arm/highbd_convolve_neon.h
@@ -145,40 +145,4 @@
return vqmovun_s32(sum);
}
-static INLINE int32x4_t highbd_convolve8_horiz4x8_s32(
- const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
- const int16x8_t s3, const int16x8_t x_filter_0_7, const int32x4_t offset) {
- int16x4_t s_lo[] = { vget_low_s16(s0), vget_low_s16(s1), vget_low_s16(s2),
- vget_low_s16(s3) };
- int16x4_t s_hi[] = { vget_high_s16(s0), vget_high_s16(s1), vget_high_s16(s2),
- vget_high_s16(s3) };
-
- transpose_array_inplace_u16_4x4((uint16x4_t *)s_lo);
- transpose_array_inplace_u16_4x4((uint16x4_t *)s_hi);
- const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7);
- const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7);
-
- int32x4_t sum = vmlal_lane_s16(offset, s_lo[0], x_filter_0_3, 0);
- sum = vmlal_lane_s16(sum, s_lo[1], x_filter_0_3, 1);
- sum = vmlal_lane_s16(sum, s_lo[2], x_filter_0_3, 2);
- sum = vmlal_lane_s16(sum, s_lo[3], x_filter_0_3, 3);
- sum = vmlal_lane_s16(sum, s_hi[0], x_filter_4_7, 0);
- sum = vmlal_lane_s16(sum, s_hi[1], x_filter_4_7, 1);
- sum = vmlal_lane_s16(sum, s_hi[2], x_filter_4_7, 2);
- sum = vmlal_lane_s16(sum, s_hi[3], x_filter_4_7, 3);
-
- return sum;
-}
-
-static INLINE uint16x4_t highbd_convolve8_horiz4x8_s32_s16(
- const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
- const int16x8_t s3, const int16x8_t x_filters_0_7,
- const int32x4_t shift_s32, const int32x4_t offset) {
- int32x4_t sum =
- highbd_convolve8_horiz4x8_s32(s0, s1, s2, s3, x_filters_0_7, offset);
-
- sum = vqrshlq_s32(sum, shift_s32);
- return vqmovun_s32(sum);
-}
-
#endif // AOM_AV1_COMMON_ARM_HIGHBD_CONVOLVE_NEON_H_
diff --git a/av1/common/arm/highbd_wiener_convolve_neon.c b/av1/common/arm/highbd_wiener_convolve_neon.c
index 7ceaffb..4cec1b2 100644
--- a/av1/common/arm/highbd_wiener_convolve_neon.c
+++ b/av1/common/arm/highbd_wiener_convolve_neon.c
@@ -10,198 +10,198 @@
*/
#include <arm_neon.h>
+#include <assert.h>
+#include "aom_dsp/arm/mem_neon.h"
+#include "av1/common/convolve.h"
#include "config/aom_config.h"
#include "config/av1_rtcd.h"
-#include "aom_dsp/arm/mem_neon.h"
-#include "aom_dsp/arm/transpose_neon.h"
-#include "av1/common/convolve.h"
-#include "av1/common/arm/highbd_convolve_neon.h"
+static INLINE uint16x8_t highbd_wiener_convolve7_8_2d_h(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x4_t x_filter, const int32x4_t round_vec,
+ const int32x4_t shift, const uint16x8_t im_max_val) {
+ // Since the Wiener filter is symmetric about the middle tap (tap 3) add
+ // mirrored source elements before multiplying by filter coefficients.
+ int16x8_t s06 = vaddq_s16(s0, s6);
+ int16x8_t s15 = vaddq_s16(s1, s5);
+ int16x8_t s24 = vaddq_s16(s2, s4);
-static void highbd_convolve_add_src_horiz_hip(
- const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
- ptrdiff_t dst_stride, const int16_t *x_filter_ptr, int x_step_q4, int w,
- int h, int round0_bits, int bd) {
- const int extraprec_clamp_limit = WIENER_CLAMP_LIMIT(round0_bits, bd);
+ int32x4_t sum_lo = vmlal_lane_s16(round_vec, vget_low_s16(s06), x_filter, 0);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s15), x_filter, 1);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s24), x_filter, 2);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s3), x_filter, 3);
- static const int32_t kIdx[4] = { 0, 1, 2, 3 };
- const int32x4_t idx = vld1q_s32(kIdx);
- const int32x4_t shift_s32 = vdupq_n_s32(-round0_bits);
- const uint16x4_t max = vdup_n_u16(extraprec_clamp_limit - 1);
- const int32x4_t rounding0 = vdupq_n_s32(1 << (bd + FILTER_BITS - 1));
+ int32x4_t sum_hi = vmlal_lane_s16(round_vec, vget_high_s16(s06), x_filter, 0);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s15), x_filter, 1);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s24), x_filter, 2);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s3), x_filter, 3);
- int height = h;
- do {
- int width = w;
- int x_q4 = 0;
- uint16_t *d = dst_ptr;
- const uint16_t *s = src_ptr;
+ sum_lo = vqrshlq_s32(sum_lo, shift);
+ sum_hi = vqrshlq_s32(sum_hi, shift);
- do {
- // Load 4 src vectors at a time, they might be the same, but we have to
- // calculate the indices anyway. Doing it in SIMD and then storing the
- // indices is faster than having to calculate the expression
- // &src_ptr[((x_q4 + i*x_step_q4) >> SUBPEL_BITS)] 4 times
- // Ideally this should be a gather using the indices, but NEON does not
- // have that, so have to emulate
- const int32x4_t xq4_idx = vmlaq_n_s32(vdupq_n_s32(x_q4), idx, x_step_q4);
- // We have to multiply x2 to get the actual pointer as sizeof(uint16_t)
- // = 2
- const int32x4_t src_idx =
- vshlq_n_s32(vshrq_n_s32(xq4_idx, SUBPEL_BITS), 1);
-
-#if AOM_ARCH_AARCH64
- uint64x2_t tmp4[2];
- tmp4[0] = vreinterpretq_u64_s64(
- vaddw_s32(vdupq_n_s64((const int64_t)s), vget_low_s32(src_idx)));
- tmp4[1] = vreinterpretq_u64_s64(
- vaddw_s32(vdupq_n_s64((const int64_t)s), vget_high_s32(src_idx)));
- int16_t *src4_ptr[4];
- uint64_t *tmp_ptr = (uint64_t *)&src4_ptr;
- vst1q_u64(tmp_ptr, tmp4[0]);
- vst1q_u64(tmp_ptr + 2, tmp4[1]);
-#else
- uint32x4_t tmp4;
- tmp4 = vreinterpretq_u32_s32(
- vaddq_s32(vdupq_n_s32((const int32_t)s), src_idx));
- int16_t *src4_ptr[4];
- uint32_t *tmp_ptr = (uint32_t *)&src4_ptr;
- vst1q_u32(tmp_ptr, tmp4);
-#endif // AOM_ARCH_AARCH64
- // Load source
- int16x8_t s0 = vld1q_s16(src4_ptr[0]);
- int16x8_t s1 = vld1q_s16(src4_ptr[1]);
- int16x8_t s2 = vld1q_s16(src4_ptr[2]);
- int16x8_t s3 = vld1q_s16(src4_ptr[3]);
-
- // Actually load the filters
- const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
-
- const int16_t *rounding_ptr = (const int16_t *)src4_ptr[0];
- int16x4_t rounding_s16 = vld1_s16(&rounding_ptr[SUBPEL_TAPS / 2 - 1]);
- int32x4_t rounding = vshlq_n_s32(vmovl_s16(rounding_s16), FILTER_BITS);
- rounding = vaddq_s32(rounding, rounding0);
-
- uint16x4_t d0 = highbd_convolve8_horiz4x8_s32_s16(
- s0, s1, s2, s3, x_filter, shift_s32, rounding);
- d0 = vmin_u16(d0, max);
- vst1_u16(d, d0);
-
- x_q4 += 4 * x_step_q4;
- d += 4;
- width -= 4;
- } while (width > 0);
-
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- height--;
- } while (height > 0);
+ uint16x8_t res = vcombine_u16(vqmovun_s32(sum_lo), vqmovun_s32(sum_hi));
+ return vminq_u16(res, im_max_val);
}
-static void highbd_convolve_add_src_vert_hip(
+static INLINE void highbd_convolve_add_src_horiz_hip(
const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
- ptrdiff_t dst_stride, const int16_t *y_filter_ptr, int y_step_q4, int w,
- int h, int round1_bits, int bd) {
- static const int32_t kIdx[4] = { 0, 1, 2, 3 };
- const int32x4_t idx = vld1q_s32(kIdx);
- const int32x4_t shift_s32 = vdupq_n_s32(-round1_bits);
- const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
- const int32x4_t rounding0 = vdupq_n_s32(1 << (bd + round1_bits - 1));
-
- int width = w;
+ ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter,
+ const int32x4_t round_vec, const int32x4_t shift,
+ const uint16x8_t im_max_val) {
do {
- int height = h;
- int y_q4 = 0;
+ const int16_t *s = (int16_t *)src_ptr;
uint16_t *d = dst_ptr;
- const uint16_t *s = src_ptr;
+ int width = w;
do {
- // Load 4 src vectors at a time, they might be the same, but we have to
- // calculate the indices anyway. Doing it in SIMD and then storing the
- // indices is faster than having to calculate the expression
- // &src_ptr[((x_q4 + i*x_step_q4) >> SUBPEL_BITS)] 4 times
- // Ideally this should be a gather using the indices, but NEON does not
- // have that, so have to emulate
- const int32x4_t yq4_idx = vmlaq_n_s32(vdupq_n_s32(y_q4), idx, y_step_q4);
- // We have to multiply x2 to get the actual pointer as sizeof(uint16_t)
- // = 2
- const int32x4_t src_idx =
- vshlq_n_s32(vshrq_n_s32(yq4_idx, SUBPEL_BITS), 1);
-#if AOM_ARCH_AARCH64
- uint64x2_t tmp4[2];
- tmp4[0] = vreinterpretq_u64_s64(
- vaddw_s32(vdupq_n_s64((const int64_t)s), vget_low_s32(src_idx)));
- tmp4[1] = vreinterpretq_u64_s64(
- vaddw_s32(vdupq_n_s64((const int64_t)s), vget_high_s32(src_idx)));
- const int16_t *src4_ptr[4];
- uint64_t *tmp_ptr = (uint64_t *)&src4_ptr;
- vst1q_u64(tmp_ptr, tmp4[0]);
- vst1q_u64(tmp_ptr + 2, tmp4[1]);
-#else
- uint32x4_t tmp4;
- tmp4 = vreinterpretq_u32_s32(
- vaddq_s32(vdupq_n_s32((const int32_t)s), src_idx));
- int16_t *src4_ptr[4];
- uint32_t *tmp_ptr = (uint32_t *)&src4_ptr;
- vst1q_u32(tmp_ptr, tmp4);
-#endif // AOM_ARCH_AARCH64
+ int16x8_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_8x7(s, 1, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
- // Load source
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
- load_s16_4x8(src4_ptr[0], src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6,
- &s7);
+ uint16x8_t d0 = highbd_wiener_convolve7_8_2d_h(
+ s0, s1, s2, s3, s4, s5, s6, x_filter, round_vec, shift, im_max_val);
- // Actually load the filters
- const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+ vst1q_u16(d, d0);
- const int16_t *rounding_ptr = (const int16_t *)src4_ptr[0];
- int16x4_t rounding_s16 =
- vld1_s16(&rounding_ptr[(SUBPEL_TAPS / 2 - 1) * src_stride]);
- int32x4_t rounding = vshlq_n_s32(vmovl_s16(rounding_s16), FILTER_BITS);
- rounding = vsubq_s32(rounding, rounding0);
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--h != 0);
+}
- // Run the convolution
- uint16x4_t d0 = highbd_convolve8_4_sr_s32_s16(
- s0, s1, s2, s3, s4, s5, s6, s7, y_filter, shift_s32, rounding);
- d0 = vmin_u16(d0, max);
- vst1_u16(d, d0);
+static INLINE uint16x8_t highbd_wiener_convolve7_8_2d_v(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x4_t y_filter, const int32x4_t round_vec,
+ const int32x4_t shift, const uint16x8_t res_max_val) {
+ const int32x2_t y_filter_lo = vget_low_s32(vmovl_s16(y_filter));
+ const int32x2_t y_filter_hi = vget_high_s32(vmovl_s16(y_filter));
+ // Since the Wiener filter is symmetric about the middle tap (tap 3) add
+ // mirrored source elements before multiplying by filter coefficients.
+ int32x4_t s06_lo = vaddl_s16(vget_low_s16(s0), vget_low_s16(s6));
+ int32x4_t s15_lo = vaddl_s16(vget_low_s16(s1), vget_low_s16(s5));
+ int32x4_t s24_lo = vaddl_s16(vget_low_s16(s2), vget_low_s16(s4));
+
+ int32x4_t sum_lo = vmlaq_lane_s32(round_vec, s06_lo, y_filter_lo, 0);
+ sum_lo = vmlaq_lane_s32(sum_lo, s15_lo, y_filter_lo, 1);
+ sum_lo = vmlaq_lane_s32(sum_lo, s24_lo, y_filter_hi, 0);
+ sum_lo = vmlaq_lane_s32(sum_lo, vmovl_s16(vget_low_s16(s3)), y_filter_hi, 1);
+
+ int32x4_t s06_hi = vaddl_s16(vget_high_s16(s0), vget_high_s16(s6));
+ int32x4_t s15_hi = vaddl_s16(vget_high_s16(s1), vget_high_s16(s5));
+ int32x4_t s24_hi = vaddl_s16(vget_high_s16(s2), vget_high_s16(s4));
+
+ int32x4_t sum_hi = vmlaq_lane_s32(round_vec, s06_hi, y_filter_lo, 0);
+ sum_hi = vmlaq_lane_s32(sum_hi, s15_hi, y_filter_lo, 1);
+ sum_hi = vmlaq_lane_s32(sum_hi, s24_hi, y_filter_hi, 0);
+ sum_hi = vmlaq_lane_s32(sum_hi, vmovl_s16(vget_high_s16(s3)), y_filter_hi, 1);
+
+ sum_lo = vqrshlq_s32(sum_lo, shift);
+ sum_hi = vqrshlq_s32(sum_hi, shift);
+
+ uint16x8_t res = vcombine_u16(vqmovun_s32(sum_lo), vqmovun_s32(sum_hi));
+ return vminq_u16(res, res_max_val);
+}
+
+static INLINE void highbd_convolve_add_src_vert_hip(
+ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+ ptrdiff_t dst_stride, int w, int h, const int16x4_t y_filter,
+ const int32x4_t round_vec, const int32x4_t shift,
+ const uint16x8_t res_max_val) {
+ do {
+ const int16_t *s = (int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+ int height = h;
+
+ while (height > 3) {
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9;
+ load_s16_8x10(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
+ &s9);
+
+ uint16x8_t d0 = highbd_wiener_convolve7_8_2d_v(
+ s0, s1, s2, s3, s4, s5, s6, y_filter, round_vec, shift, res_max_val);
+ uint16x8_t d1 = highbd_wiener_convolve7_8_2d_v(
+ s1, s2, s3, s4, s5, s6, s7, y_filter, round_vec, shift, res_max_val);
+ uint16x8_t d2 = highbd_wiener_convolve7_8_2d_v(
+ s2, s3, s4, s5, s6, s7, s8, y_filter, round_vec, shift, res_max_val);
+ uint16x8_t d3 = highbd_wiener_convolve7_8_2d_v(
+ s3, s4, s5, s6, s7, s8, s9, y_filter, round_vec, shift, res_max_val);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ }
+
+ while (height-- != 0) {
+ int16x8_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+
+ uint16x8_t d0 = highbd_wiener_convolve7_8_2d_v(
+ s0, s1, s2, s3, s4, s5, s6, y_filter, round_vec, shift, res_max_val);
+
+ vst1q_u16(d, d0);
s += src_stride;
d += dst_stride;
- height--;
- } while (height > 0);
+ }
- y_q4 += 4 * y_step_q4;
- src_ptr += 4;
- dst_ptr += 4;
- width -= 4;
- } while (width > 0);
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w != 0);
}
-#define WIENER_MAX_EXT_SIZE 263
-
void av1_highbd_wiener_convolve_add_src_neon(
const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8,
- ptrdiff_t dst_stride, const int16_t *x_filter_ptr, int x_step_q4,
- const int16_t *y_filter_ptr, int y_step_q4, int w, int h,
+ ptrdiff_t dst_stride, const int16_t *x_filter, int x_step_q4,
+ const int16_t *y_filter, int y_step_q4, int w, int h,
const WienerConvolveParams *conv_params, int bd) {
- assert(x_step_q4 == 16 && y_step_q4 == 16);
+ (void)x_step_q4;
+ (void)y_step_q4;
- DECLARE_ALIGNED(16, uint16_t, im_block[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE]);
- const int im_h = (((h - 1) * y_step_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+ assert(w % 8 == 0);
+ assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
+ assert(x_step_q4 == 16 && y_step_q4 == 16);
+ assert(x_filter[7] == 0 && y_filter[7] == 0);
+
+ DECLARE_ALIGNED(16, uint16_t,
+ im_block[(MAX_SB_SIZE + WIENER_WIN - 1) * MAX_SB_SIZE]);
+
+ int16x4_t x_filter_s16 = vld1_s16(x_filter);
+ int16x4_t y_filter_s16 = vld1_s16(y_filter);
+ // Add 128 to tap 3. (Needed for rounding.)
+ x_filter_s16 = vadd_s16(x_filter_s16, vcreate_s16(128ULL << 48));
+ y_filter_s16 = vadd_s16(y_filter_s16, vcreate_s16(128ULL << 48));
+
const int im_stride = MAX_SB_SIZE;
- const int vert_offset = SUBPEL_TAPS / 2 - 1;
- const int horiz_offset = SUBPEL_TAPS / 2 - 1;
+ const int im_h = h + WIENER_WIN - 1;
+ const int horiz_offset = WIENER_HALFWIN;
+ const int vert_offset = WIENER_HALFWIN * (int)src_stride;
+
+ const int extraprec_clamp_limit =
+ WIENER_CLAMP_LIMIT(conv_params->round_0, bd);
+ const uint16x8_t im_max_val = vdupq_n_u16(extraprec_clamp_limit - 1);
+ const int32x4_t horiz_shift = vdupq_n_s32(-conv_params->round_0);
+ const int32x4_t horiz_round_vec = vdupq_n_s32(1 << (bd + FILTER_BITS - 1));
+
+ const uint16x8_t res_max_val = vdupq_n_u16((1 << bd) - 1);
+ const int32x4_t vert_shift = vdupq_n_s32(-conv_params->round_1);
+ const int32x4_t vert_round_vec =
+ vdupq_n_s32(-(1 << (bd + conv_params->round_1 - 1)));
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
- const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
- highbd_convolve_add_src_horiz_hip(src_ptr, src_stride, im_block, im_stride,
- x_filter_ptr, x_step_q4, w, im_h,
- conv_params->round_0, bd);
- highbd_convolve_add_src_vert_hip(im_block, im_stride, dst, dst_stride,
- y_filter_ptr, y_step_q4, w, h,
- conv_params->round_1, bd);
+ highbd_convolve_add_src_horiz_hip(
+ src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w,
+ im_h, x_filter_s16, horiz_round_vec, horiz_shift, im_max_val);
+ highbd_convolve_add_src_vert_hip(im_block, im_stride, dst, dst_stride, w, h,
+ y_filter_s16, vert_round_vec, vert_shift,
+ res_max_val);
}