Move av1_warp_affine_common impl from warp_plane_neon.h

The av1_warp_affine_common implementation is only used by
av1_warp_affine_neon so move the implementation to that function.
Delete warp_affine_horizontal as well, as it is not a common function
anymore.

Delete horizontal_filter_4x1_f1_beta0 and
horizontal_filter_8x1_f1_beta0 in warp_plane_neon_i8mm.c and
warp_plane_sve.c as they are not required by the header file anymore.

Change-Id: I200f6fa15e6babacff976e079938e318529425ae
diff --git a/av1/common/arm/warp_plane_neon.c b/av1/common/arm/warp_plane_neon.c index 497273b..f4c1377 100644 --- a/av1/common/arm/warp_plane_neon.c +++ b/av1/common/arm/warp_plane_neon.c
@@ -272,13 +272,106 @@ *res_high = horizontal_add_4d_s32x4(m4567_pairs); } +static AOM_FORCE_INLINE void warp_affine_horizontal_neon( + const uint8_t *ref, int width, int height, int stride, int p_width, + int p_height, int16_t alpha, int16_t beta, const int64_t x4, + const int64_t y4, const int i, int16x8_t tmp[]) { + const int height_limit = AOMMIN(8, p_height - i) + 7; + + int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS); + int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS); + + int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + + if (warp_affine_special_case(ref, ix4, iy4, width, height, stride, + height_limit, tmp)) { + return; + } + + static const uint8_t kIotaArr[] = { 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15 }; + const uint8x16_t indx = vld1q_u8(kIotaArr); + + const int out_of_boundary_left = -(ix4 - 6); + const int out_of_boundary_right = (ix4 + 8) - width; + + if (p_width == 4) { + if (beta == 0) { + if (alpha == 0) { + int16x8_t f_s16 = + vld1q_s16(av1_warped_filter[sx4 >> WARPEDDIFF_PREC_BITS]); + APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f1_beta0, f_s16); + } else { + APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f4, sx4, alpha); + } + } else { + if (alpha == 0) { + APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f1, + (sx4 + beta * (k - 3))); + } else { + APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f4, (sx4 + beta * (k - 3)), + alpha); + } + } + } else { + if (beta == 0) { + if (alpha == 0) { + int16x8_t f_s16 = + vld1q_s16(av1_warped_filter[sx4 >> WARPEDDIFF_PREC_BITS]); + APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f1_beta0, f_s16); + } else { + APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f8, sx4, alpha); + } + } else { + if (alpha == 0) { + APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f1, + (sx4 + beta * (k - 3))); + } else { + 
APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f8, (sx4 + beta * (k - 3)), + alpha); + } + } + } +} + void av1_warp_affine_neon(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta) { - av1_warp_affine_common(mat, ref, width, height, stride, pred, p_col, p_row, - p_width, p_height, p_stride, subsampling_x, - subsampling_y, conv_params, alpha, beta, gamma, delta); + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const int is_compound = conv_params->is_compound; + uint16_t *const dst = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + + assert(IMPLIES(is_compound, dst != NULL)); + assert(IMPLIES(do_average, is_compound)); + + for (int i = 0; i < p_height; i += 8) { + for (int j = 0; j < p_width; j += 8) { + const int32_t src_x = (p_col + j + 4) << subsampling_x; + const int32_t src_y = (p_row + i + 4) << subsampling_y; + const int64_t dst_x = + (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0]; + const int64_t dst_y = + (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1]; + + const int64_t x4 = dst_x >> subsampling_x; + const int64_t y4 = dst_y >> subsampling_y; + + int16x8_t tmp[15]; + warp_affine_horizontal_neon(ref, width, height, stride, p_width, p_height, + alpha, beta, x4, y4, i, tmp); + warp_affine_vertical(pred, p_width, p_height, p_stride, is_compound, dst, + dst_stride, do_average, use_dist_wtd_comp_avg, gamma, + delta, y4, i, j, tmp, w0, w1); + } + } }
diff --git a/av1/common/arm/warp_plane_neon.h b/av1/common/arm/warp_plane_neon.h index 2909df7b..6c50c41 100644 --- a/av1/common/arm/warp_plane_neon.h +++ b/av1/common/arm/warp_plane_neon.h
@@ -24,24 +24,6 @@ #include "av1/common/warped_motion.h" #include "av1/common/scale.h" -static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in, - int sx, int alpha); - -static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in, - int sx, int alpha); - -static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, - int sx); - -static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, - int sx); - -static AOM_FORCE_INLINE int16x8_t -horizontal_filter_4x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16); - -static AOM_FORCE_INLINE int16x8_t -horizontal_filter_8x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16); - static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src, int32x4_t *res, int sy); @@ -95,21 +77,12 @@ return clamp(iy, 0, height - 1); } -static AOM_FORCE_INLINE void warp_affine_horizontal( - const uint8_t *ref, int width, int height, int stride, int p_width, - int p_height, int16_t alpha, int16_t beta, const int64_t x4, - const int64_t y4, const int i, int16x8_t tmp[]) { +static inline bool warp_affine_special_case(const uint8_t *ref, int32_t ix4, + int32_t iy4, int width, int height, + int stride, const int height_limit, + int16x8_t tmp[]) { const int bd = 8; const int reduce_bits_horiz = ROUND0_BITS; - const int height_limit = AOMMIN(8, p_height - i) + 7; - - int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS); - int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS); - - int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); - sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + - (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); - sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); if (ix4 <= -7) { for (int k = 0; k < height_limit; ++k) { @@ -119,7 +92,7 @@ ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz)); tmp[k] = vdupq_n_s16(dup_val); } - return; + return true; } else if (ix4 >= width + 6) { for (int k = 0; k < height_limit; ++k) 
{ int iy = clamp_iy(iy4 + k - 7, height); @@ -128,15 +101,11 @@ (1 << (FILTER_BITS - reduce_bits_horiz)); tmp[k] = vdupq_n_s16(dup_val); } - return; + return true; } - static const uint8_t kIotaArr[] = { 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15 }; - const uint8x16_t indx = vld1q_u8(kIotaArr); - - const int out_of_boundary_left = -(ix4 - 6); - const int out_of_boundary_right = (ix4 + 8) - width; + return false; +} #define APPLY_HORIZONTAL_SHIFT(fn, ...) \ do { \ @@ -172,45 +141,6 @@ } \ } while (0) - if (p_width == 4) { - if (beta == 0) { - if (alpha == 0) { - int16x8_t f_s16 = - vld1q_s16(av1_warped_filter[sx4 >> WARPEDDIFF_PREC_BITS]); - APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f1_beta0, f_s16); - } else { - APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f4, sx4, alpha); - } - } else { - if (alpha == 0) { - APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f1, - (sx4 + beta * (k - 3))); - } else { - APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f4, (sx4 + beta * (k - 3)), - alpha); - } - } - } else { - if (beta == 0) { - if (alpha == 0) { - int16x8_t f_s16 = - vld1q_s16(av1_warped_filter[sx4 >> WARPEDDIFF_PREC_BITS]); - APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f1_beta0, f_s16); - } else { - APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f8, sx4, alpha); - } - } else { - if (alpha == 0) { - APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f1, - (sx4 + beta * (k - 3))); - } else { - APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f8, (sx4 + beta * (k - 3)), - alpha); - } - } - } -} - static AOM_FORCE_INLINE void warp_affine_vertical( uint8_t *pred, int p_width, int p_height, int p_stride, int is_compound, uint16_t *dst, int dst_stride, int do_average, int use_dist_wtd_comp_avg, @@ -339,43 +269,4 @@ } } -static AOM_FORCE_INLINE void av1_warp_affine_common( - const int32_t *mat, const uint8_t *ref, int width, int height, int stride, - uint8_t *pred, int p_col, int p_row, int p_width, int p_height, - int p_stride, int subsampling_x, int subsampling_y, - 
ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, - int16_t delta) { - const int w0 = conv_params->fwd_offset; - const int w1 = conv_params->bck_offset; - const int is_compound = conv_params->is_compound; - uint16_t *const dst = conv_params->dst; - const int dst_stride = conv_params->dst_stride; - const int do_average = conv_params->do_average; - const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; - - assert(IMPLIES(is_compound, dst != NULL)); - assert(IMPLIES(do_average, is_compound)); - - for (int i = 0; i < p_height; i += 8) { - for (int j = 0; j < p_width; j += 8) { - const int32_t src_x = (p_col + j + 4) << subsampling_x; - const int32_t src_y = (p_row + i + 4) << subsampling_y; - const int64_t dst_x = - (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0]; - const int64_t dst_y = - (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1]; - - const int64_t x4 = dst_x >> subsampling_x; - const int64_t y4 = dst_y >> subsampling_y; - - int16x8_t tmp[15]; - warp_affine_horizontal(ref, width, height, stride, p_width, p_height, - alpha, beta, x4, y4, i, tmp); - warp_affine_vertical(pred, p_width, p_height, p_stride, is_compound, dst, - dst_stride, do_average, use_dist_wtd_comp_avg, gamma, - delta, y4, i, j, tmp, w0, w1); - } - } -} - #endif // AOM_AV1_COMMON_ARM_WARP_PLANE_NEON_H_
diff --git a/av1/common/arm/warp_plane_neon_i8mm.c b/av1/common/arm/warp_plane_neon_i8mm.c index 2d02974..44689e9 100644 --- a/av1/common/arm/warp_plane_neon_i8mm.c +++ b/av1/common/arm/warp_plane_neon_i8mm.c
@@ -143,10 +143,11 @@ return vreinterpretq_s16_u16(res); } -static AOM_FORCE_INLINE int16x8_t -horizontal_filter_4x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) { +static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, + int sx) { const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); + int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]); int8x16_t f_s8 = vcombine_s8(vmovn_s16(f_s16), vmovn_s16(f_s16)); uint8x16_t perm0 = vld1q_u8(&usdot_permute_idx[0]); @@ -166,12 +167,6 @@ return vreinterpretq_s16_u16(res); } -static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, - int sx) { - int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]); - return horizontal_filter_4x1_f1_beta0(in, f_s16); -} - static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1_6tap_beta0( const uint8x16_t in, const int8x16_t filter, const uint8x16x2_t perm_tbl) { const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); @@ -219,10 +214,11 @@ return vreinterpretq_s16_u16(res); } -static AOM_FORCE_INLINE int16x8_t -horizontal_filter_8x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) { +static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, + int sx) { const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); + int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]); int8x16_t f_s8 = vcombine_s8(vmovn_s16(f_s16), vmovn_s16(f_s16)); uint8x16_t perm0 = vld1q_u8(&usdot_permute_idx[0]); @@ -248,12 +244,6 @@ return vreinterpretq_s16_u16(res); } -static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, - int sx) { - int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]); - return horizontal_filter_8x1_f1_beta0(in, f_s16); -} - static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src, int32x4_t *res, int sy) { int16x4_t s0 = vget_low_s16(src[0]); @@ -387,8 +377,6 @@ const 
uint8_t *ref, int width, int height, int stride, int p_width, int p_height, int16_t alpha, int16_t beta, const int64_t x4, const int64_t y4, const int i, int16x8_t tmp[]) { - const int bd = 8; - const int reduce_bits_horiz = ROUND0_BITS; const int height_limit = AOMMIN(8, p_height - i) + 7; int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS); @@ -399,23 +387,8 @@ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); - if (ix4 <= -7) { - for (int k = 0; k < height_limit; ++k) { - int iy = clamp_iy(iy4 + k - 7, height); - int16_t dup_val = - (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + - ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz)); - tmp[k] = vdupq_n_s16(dup_val); - } - return; - } else if (ix4 >= width + 6) { - for (int k = 0; k < height_limit; ++k) { - int iy = clamp_iy(iy4 + k - 7, height); - int16_t dup_val = (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + - ref[iy * stride + (width - 1)] * - (1 << (FILTER_BITS - reduce_bits_horiz)); - tmp[k] = vdupq_n_s16(dup_val); - } + if (warp_affine_special_case(ref, ix4, iy4, width, height, stride, + height_limit, tmp)) { return; }
diff --git a/av1/common/arm/warp_plane_sve.c b/av1/common/arm/warp_plane_sve.c index 455e29d..885ffe8 100644 --- a/av1/common/arm/warp_plane_sve.c +++ b/av1/common/arm/warp_plane_sve.c
@@ -146,10 +146,11 @@ return vreinterpretq_s16_u16(res); } -static AOM_FORCE_INLINE int16x8_t -horizontal_filter_4x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) { +static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, + int sx) { const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); + int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]); int8x16_t f_s8 = vcombine_s8(vmovn_s16(f_s16), vmovn_s16(f_s16)); uint8x16_t perm0 = vld1q_u8(&usdot_permute_idx[0]); @@ -169,12 +170,6 @@ return vreinterpretq_s16_u16(res); } -static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, - int sx) { - int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]); - return horizontal_filter_4x1_f1_beta0(in, f_s16); -} - static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1_6tap_beta0( const uint8x16_t in, const int8x16_t filter, const uint8x16x2_t perm_tbl) { const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); @@ -222,10 +217,11 @@ return vreinterpretq_s16_u16(res); } -static AOM_FORCE_INLINE int16x8_t -horizontal_filter_8x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) { +static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, + int sx) { const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); + int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]); int8x16_t f_s8 = vcombine_s8(vmovn_s16(f_s16), vmovn_s16(f_s16)); uint8x16_t perm0 = vld1q_u8(&usdot_permute_idx[0]); @@ -251,12 +247,6 @@ return vreinterpretq_s16_u16(res); } -static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, - int sx) { - int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]); - return horizontal_filter_8x1_f1_beta0(in, f_s16); -} - static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src, int32x4_t *res, int sy) { int16x4_t s0 = vget_low_s16(src[0]); @@ -381,8 +371,6 @@ const 
uint8_t *ref, int width, int height, int stride, int p_width, int p_height, int16_t alpha, int16_t beta, const int64_t x4, const int64_t y4, const int i, int16x8_t tmp[]) { - const int bd = 8; - const int reduce_bits_horiz = ROUND0_BITS; const int height_limit = AOMMIN(8, p_height - i) + 7; int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS); @@ -393,23 +381,8 @@ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); - if (ix4 <= -7) { - for (int k = 0; k < height_limit; ++k) { - int iy = clamp_iy(iy4 + k - 7, height); - int16_t dup_val = - (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + - ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz)); - tmp[k] = vdupq_n_s16(dup_val); - } - return; - } else if (ix4 >= width + 6) { - for (int k = 0; k < height_limit; ++k) { - int iy = clamp_iy(iy4 + k - 7, height); - int16_t dup_val = (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + - ref[iy * stride + (width - 1)] * - (1 << (FILTER_BITS - reduce_bits_horiz)); - tmp[k] = vdupq_n_s16(dup_val); - } + if (warp_affine_special_case(ref, ix4, iy4, width, height, stride, + height_limit, tmp)) { return; }