Add Neon Dotprod 2/1 scale spec. for av1_resize_and_extend_frame Add an Armv8.4 DotProd implementation for av1_resize_and_extend_frame that specialises in 2 to 1 scaling. When the BILINEAR filter is used or the phase value is equal to 0, the Armv8.0 Neon implementation is called. Change-Id: Id06fbf78e4d6b29f3dfbaecda73919a092ccb96e
diff --git a/av1/av1.cmake b/av1/av1.cmake index 5630c84..8882b92 100644 --- a/av1/av1.cmake +++ b/av1/av1.cmake
@@ -423,7 +423,8 @@ list(APPEND AOM_AV1_COMMON_INTRIN_NEON_DOTPROD "${AOM_ROOT}/av1/common/arm/av1_convolve_scale_neon_dotprod.c" "${AOM_ROOT}/av1/common/arm/compound_convolve_neon_dotprod.c" - "${AOM_ROOT}/av1/common/arm/convolve_neon_dotprod.c") + "${AOM_ROOT}/av1/common/arm/convolve_neon_dotprod.c" + "${AOM_ROOT}/av1/common/arm/resize_neon_dotprod.c") list(APPEND AOM_AV1_COMMON_INTRIN_NEON_I8MM "${AOM_ROOT}/av1/common/arm/av1_convolve_scale_neon_i8mm.c"
diff --git a/av1/common/arm/resize_neon.c b/av1/common/arm/resize_neon.c index d29dc0a..ef28503 100644 --- a/av1/common/arm/resize_neon.c +++ b/av1/common/arm/resize_neon.c
@@ -15,6 +15,7 @@ #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/transpose_neon.h" +#include "av1/common/arm/resize_neon.h" #include "av1/common/resize.h" #include "config/aom_scale_rtcd.h" #include "config/av1_rtcd.h" @@ -169,25 +170,6 @@ } while (--h != 0); } -static inline uint8x8_t scale_filter6_8(const int16x8_t s0, const int16x8_t s1, - const int16x8_t s2, const int16x8_t s3, - const int16x8_t s4, const int16x8_t s5, - const int16x8_t filter) { - const int16x4_t filter_lo = vget_low_s16(filter); - const int16x4_t filter_hi = vget_high_s16(filter); - - // Filter values at indices 0 and 7 are 0. - int16x8_t sum = vmulq_lane_s16(s0, filter_lo, 1); - sum = vmlaq_lane_s16(sum, s1, filter_lo, 2); - sum = vmlaq_lane_s16(sum, s2, filter_lo, 3); - sum = vmlaq_lane_s16(sum, s3, filter_hi, 0); - sum = vmlaq_lane_s16(sum, s4, filter_hi, 1); - sum = vmlaq_lane_s16(sum, s5, filter_hi, 2); - - // We halved the convolution filter values so -1 from the right shift. - return vqrshrun_n_s16(sum, FILTER_BITS - 1); -} - static inline void scale_2_to_1_horiz_6tap(const uint8_t *src, const int src_stride, int w, int h, uint8_t *dst, const int dst_stride, @@ -255,59 +237,6 @@ } while (h > 0); } -static inline void scale_2_to_1_vert_6tap(const uint8_t *src, - const int src_stride, int w, int h, - uint8_t *dst, const int dst_stride, - const int16x8_t filters) { - do { - uint8x8_t t0, t1, t2, t3; - load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); - - int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - - const uint8_t *s = src + 4 * src_stride; - uint8_t *d = dst; - int height = h; - - do { - uint8x8_t t4, t5, t6, t7, t8, t9, t10, t11; - load_u8_8x8(s, src_stride, &t4, &t5, &t6, &t7, &t8, &t9, &t10, &t11); - - int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); - int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); - 
int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); - int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); - int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8)); - int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9)); - int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10)); - int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t11)); - - uint8x8_t d0 = scale_filter6_8(s0, s1, s2, s3, s4, s5, filters); - uint8x8_t d1 = scale_filter6_8(s2, s3, s4, s5, s6, s7, filters); - uint8x8_t d2 = scale_filter6_8(s4, s5, s6, s7, s8, s9, filters); - uint8x8_t d3 = scale_filter6_8(s6, s7, s8, s9, s10, s11, filters); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - s0 = s8; - s1 = s9; - s2 = s10; - s3 = s11; - - d += 4 * dst_stride; - s += 8 * src_stride; - height -= 4; - } while (height > 0); - - dst += 8; - src += 8; - w -= 8; - } while (w > 0); -} - static inline void scale_plane_2_to_1_6tap(const uint8_t *src, const int src_stride, uint8_t *dst, const int dst_stride, const int w,
diff --git a/av1/common/arm/resize_neon.h b/av1/common/arm/resize_neon.h new file mode 100644 index 0000000..785c57a --- /dev/null +++ b/av1/common/arm/resize_neon.h
@@ -0,0 +1,93 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved. + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_ARM_RESIZE_NEON_H_ +#define AOM_AV1_COMMON_ARM_RESIZE_NEON_H_ + +#include <arm_neon.h> + +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/transpose_neon.h" + +static inline uint8x8_t scale_filter6_8(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + int16x8_t filter) { + const int16x4_t filter_lo = vget_low_s16(filter); + const int16x4_t filter_hi = vget_high_s16(filter); + + // Filter values at indices 0 and 7 are 0. + int16x8_t sum = vmulq_lane_s16(s0, filter_lo, 1); + sum = vmlaq_lane_s16(sum, s1, filter_lo, 2); + sum = vmlaq_lane_s16(sum, s2, filter_lo, 3); + sum = vmlaq_lane_s16(sum, s3, filter_hi, 0); + sum = vmlaq_lane_s16(sum, s4, filter_hi, 1); + sum = vmlaq_lane_s16(sum, s5, filter_hi, 2); + + // We halved the convolution filter values so -1 from the right shift. 
+ return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + +static inline void scale_2_to_1_vert_6tap(const uint8_t *src, + const int src_stride, int w, int h, + uint8_t *dst, const int dst_stride, + const int16x8_t filters) { + do { + uint8x8_t t0, t1, t2, t3; + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + + const uint8_t *s = src + 4 * src_stride; + uint8_t *d = dst; + int height = h; + + do { + uint8x8_t t4, t5, t6, t7, t8, t9, t10, t11; + load_u8_8x8(s, src_stride, &t4, &t5, &t6, &t7, &t8, &t9, &t10, &t11); + + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10)); + int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t11)); + + uint8x8_t d0 = scale_filter6_8(s0, s1, s2, s3, s4, s5, filters); + uint8x8_t d1 = scale_filter6_8(s2, s3, s4, s5, s6, s7, filters); + uint8x8_t d2 = scale_filter6_8(s4, s5, s6, s7, s8, s9, filters); + uint8x8_t d3 = scale_filter6_8(s6, s7, s8, s9, s10, s11, filters); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + + d += 4 * dst_stride; + s += 8 * src_stride; + height -= 4; + } while (height > 0); + + dst += 8; + src += 8; + w -= 8; + } while (w > 0); +} + +#endif // AOM_AV1_COMMON_ARM_RESIZE_NEON_H_
diff --git a/av1/common/arm/resize_neon_dotprod.c b/av1/common/arm/resize_neon_dotprod.c new file mode 100644 index 0000000..a8ecb98 --- /dev/null +++ b/av1/common/arm/resize_neon_dotprod.c
@@ -0,0 +1,197 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved. + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/transpose_neon.h" +#include "av1/common/arm/resize_neon.h" +#include "av1/common/resize.h" +#include "config/aom_scale_rtcd.h" +#include "config/av1_rtcd.h" + +// clang-format off +DECLARE_ALIGNED(16, static const uint8_t, kScale2DotProdPermuteTbl[32]) = { + 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, + 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 +}; +// clang-format on + +static inline uint8x8_t scale_2_to_1_filter8_8(const uint8x16_t s0, + const uint8x16_t s1, + const uint8x16x2_t permute_tbl, + const int8x8_t filter) { + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t s0_128 = vreinterpretq_s8_u8(vsubq_u8(s0, vdupq_n_u8(128))); + int8x16_t s1_128 = vreinterpretq_s8_u8(vsubq_u8(s1, vdupq_n_u8(128))); + + // Permute samples ready for dot product. + int8x16_t perm_samples[4] = { vqtbl1q_s8(s0_128, permute_tbl.val[0]), + vqtbl1q_s8(s0_128, permute_tbl.val[1]), + vqtbl1q_s8(s1_128, permute_tbl.val[0]), + vqtbl1q_s8(s1_128, permute_tbl.val[1]) }; + + // Dot product constant: + // The shim of 128 << FILTER_BITS is needed because we are subtracting 128 + // from every source value. The additional right shift by one is needed + // because we halve the filter values. + const int32x4_t acc = vdupq_n_s32((128 << FILTER_BITS) >> 1); + + // First 4 output values. 
+ int32x4_t sum0123 = vdotq_lane_s32(acc, perm_samples[0], filter, 0); + sum0123 = vdotq_lane_s32(sum0123, perm_samples[1], filter, 1); + // Second 4 output values. + int32x4_t sum4567 = vdotq_lane_s32(acc, perm_samples[2], filter, 0); + sum4567 = vdotq_lane_s32(sum4567, perm_samples[3], filter, 1); + + int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567)); + + // We halved the filter values so -1 from right shift. + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + +static inline void scale_2_to_1_horiz_8tap(const uint8_t *src, + const int src_stride, int w, int h, + uint8_t *dst, const int dst_stride, + const int16x8_t filters) { + const int8x8_t filter = vmovn_s16(filters); + const uint8x16x2_t permute_tbl = vld1q_u8_x2(kScale2DotProdPermuteTbl); + + do { + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + do { + uint8x16_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2]; + load_u8_16x8(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0], &s4[0], + &s5[0], &s6[0], &s7[0]); + load_u8_16x8(s + 8, src_stride, &s0[1], &s1[1], &s2[1], &s3[1], &s4[1], + &s5[1], &s6[1], &s7[1]); + + uint8x8_t d0 = scale_2_to_1_filter8_8(s0[0], s0[1], permute_tbl, filter); + uint8x8_t d1 = scale_2_to_1_filter8_8(s1[0], s1[1], permute_tbl, filter); + uint8x8_t d2 = scale_2_to_1_filter8_8(s2[0], s2[1], permute_tbl, filter); + uint8x8_t d3 = scale_2_to_1_filter8_8(s3[0], s3[1], permute_tbl, filter); + + uint8x8_t d4 = scale_2_to_1_filter8_8(s4[0], s4[1], permute_tbl, filter); + uint8x8_t d5 = scale_2_to_1_filter8_8(s5[0], s5[1], permute_tbl, filter); + uint8x8_t d6 = scale_2_to_1_filter8_8(s6[0], s6[1], permute_tbl, filter); + uint8x8_t d7 = scale_2_to_1_filter8_8(s7[0], s7[1], permute_tbl, filter); + + store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); + + d += 8; + s += 16; + width -= 8; + } while (width > 0); + + dst += 8 * dst_stride; + src += 8 * src_stride; + h -= 8; + } while (h > 0); +} + +static inline void scale_plane_2_to_1_8tap(const 
uint8_t *src, + const int src_stride, uint8_t *dst, + const int dst_stride, const int w, + const int h, + const int16_t *const filter_ptr, + uint8_t *const im_block) { + assert(w > 0 && h > 0); + + const int im_h = 2 * h + SUBPEL_TAPS - 3; + const int im_stride = (w + 7) & ~7; + // All filter values are even, halve them to fit in int8_t when applying + // horizontal filter and stay in 16-bit elements when applying vertical + // filter. + const int16x8_t filters = vshrq_n_s16(vld1q_s16(filter_ptr), 1); + + const ptrdiff_t horiz_offset = SUBPEL_TAPS / 2 - 1; + const ptrdiff_t vert_offset = (SUBPEL_TAPS / 2 - 1) * src_stride; + + scale_2_to_1_horiz_8tap(src - horiz_offset - vert_offset, src_stride, w, im_h, + im_block, im_stride, filters); + + // We can specialise the vertical filtering for 6-tap filters given that the + // EIGHTTAP_SMOOTH and EIGHTTAP_REGULAR filters are 0-padded. + scale_2_to_1_vert_6tap(im_block + im_stride, im_stride, w, h, dst, dst_stride, + filters); +} + +static inline bool has_normative_scaler_neon_dotprod(const int src_width, + const int src_height, + const int dst_width, + const int dst_height) { + return (2 * dst_width == src_width && 2 * dst_height == src_height); +} + +void av1_resize_and_extend_frame_neon_dotprod(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, + const InterpFilter filter, + const int phase, + const int num_planes) { + assert(filter == BILINEAR || filter == EIGHTTAP_SMOOTH || + filter == EIGHTTAP_REGULAR); + + bool has_normative_scaler = + has_normative_scaler_neon_dotprod(src->y_crop_width, src->y_crop_height, + dst->y_crop_width, dst->y_crop_height); + + if (num_planes > 1) { + has_normative_scaler = + has_normative_scaler && has_normative_scaler_neon_dotprod( + src->uv_crop_width, src->uv_crop_height, + dst->uv_crop_width, dst->uv_crop_height); + } + + if (!has_normative_scaler || filter == BILINEAR || phase == 0) { + av1_resize_and_extend_frame_neon(src, dst, filter, phase, num_planes); + return; + } + + // 
We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet + // the static analysis warnings. + int malloc_failed = 0; + for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) { + const int is_uv = i > 0; + const int src_w = src->crop_widths[is_uv]; + const int src_h = src->crop_heights[is_uv]; + const int dst_w = dst->crop_widths[is_uv]; + const int dst_h = dst->crop_heights[is_uv]; + const int dst_y_w = (dst->crop_widths[0] + 1) & ~1; + const int dst_y_h = (dst->crop_heights[0] + 1) & ~1; + + if (2 * dst_w == src_w && 2 * dst_h == src_h) { + const int buffer_stride = (dst_y_w + 7) & ~7; + const int buffer_height = (2 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7; + uint8_t *const temp_buffer = + (uint8_t *)malloc(buffer_stride * buffer_height); + if (!temp_buffer) { + malloc_failed = 1; + break; + } + const InterpKernel *interp_kernel = + (const InterpKernel *)av1_interp_filter_params_list[filter] + .filter_ptr; + scale_plane_2_to_1_8tap(src->buffers[i], src->strides[is_uv], + dst->buffers[i], dst->strides[is_uv], dst_w, + dst_h, interp_kernel[phase], temp_buffer); + free(temp_buffer); + } + } + + if (malloc_failed) { + av1_resize_and_extend_frame_c(src, dst, filter, phase, num_planes); + } else { + aom_extend_frame_borders(dst, num_planes); + } +}
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index 889f952..f4f0bb5 100644 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl
@@ -253,7 +253,7 @@ # Resize functions. add_proto qw/void av1_resize_and_extend_frame/, "const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes"; -specialize qw/av1_resize_and_extend_frame ssse3 neon/; +specialize qw/av1_resize_and_extend_frame ssse3 neon neon_dotprod/; # # Encoder functions below this point.
diff --git a/test/av1_scale_test.cc b/test/av1_scale_test.cc index 9d5a10a..2865b05 100644 --- a/test/av1_scale_test.cc +++ b/test/av1_scale_test.cc
@@ -286,4 +286,11 @@ ::testing::Values(av1_resize_and_extend_frame_neon)); #endif // HAVE_NEON +#if HAVE_NEON_DOTPROD +INSTANTIATE_TEST_SUITE_P( + NEON_DOTPROD, ResizeAndExtendTest, + ::testing::Values(av1_resize_and_extend_frame_neon_dotprod)); + +#endif // HAVE_NEON_DOTPROD + } // namespace